[RLlib] Benchmark and regression test yaml cleanup and restructuring. (#8414)

Sven Mika 2020-05-26 11:10:27 +02:00 committed by GitHub
parent ae2e1f0883
commit baa053496a
89 changed files with 614 additions and 584 deletions

View file

@ -37,70 +37,120 @@
# Tag: learning_tests
#
# This will test all yaml files (via `rllib train`)
# inside rllib/tuned_examples/regression_tests for actual learning success.
# inside rllib/tuned_examples/[algo-name] for actual learning success.
# --------------------------------------------------------------------
# A2C/A3C
py_test(
name = "run_regression_tests_cartpole_pg_a3c_tf",
name = "regression_test_a2c_cartpole_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-pg-tf.yaml",
"tuned_examples/regression_tests/cartpole-a3c-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/a3c/cartpole-a2c.yaml"],
args = ["--yaml-dir=tuned_examples/a3c"]
)
py_test(
name = "regression_test_a2c_cartpole_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/a3c/cartpole-a2c.yaml"],
args = ["--yaml-dir=tuned_examples/a3c", "--torch"]
)
py_test(
name = "regression_test_a3c_cartpole_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/a3c/cartpole-a3c.yaml"],
args = ["--yaml-dir=tuned_examples/a3c"]
)
py_test(
name = "regression_test_a3c_cartpole_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/a3c/cartpole-a3c.yaml"],
args = ["--yaml-dir=tuned_examples/a3c", "--torch"]
)
# APPO
py_test(
name = "run_regression_tests_cartpole_appo_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-appo-tf.yaml",
"tuned_examples/ppo/cartpole-appo.yaml",
"tuned_examples/ppo/cartpole-appo-vtrace.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
args = ["--yaml-dir=tuned_examples/ppo"]
)
py_test(
name = "run_regression_tests_cartpole_appo_vtrace_tf",
name = "run_regression_tests_cartpole_appo_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-appo-vtrace-tf.yaml",
"tuned_examples/ppo/cartpole-appo.yaml",
"tuned_examples/ppo/cartpole-appo-vtrace.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_es_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-es-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
args = ["--yaml-dir=tuned_examples/ppo", "--torch"]
)
# ARS
py_test(
name = "run_regression_tests_cartpole_ars_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-ars-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/ars/cartpole-ars.yaml"],
args = ["--yaml-dir=tuned_examples/ars"]
)
py_test(
name = "run_regression_tests_cartpole_ars_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ars/cartpole-ars.yaml"],
args = ["--yaml-dir=tuned_examples/ars", "--torch"]
)
# DDPG
py_test(
name = "run_regression_tests_pendulum_ddpg_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = glob(["tuned_examples/ddpg/pendulum-ddpg.yaml"]),
args = ["--yaml-dir=tuned_examples/ddpg"]
)
py_test(
name = "run_regression_tests_pendulum_ddpg_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = glob(["tuned_examples/ddpg/pendulum-ddpg.yaml"]),
args = ["--torch", "--yaml-dir=tuned_examples/ddpg"]
)
# DQN/Simple-Q
py_test(
name = "run_regression_tests_cartpole_dqn_tf",
main = "tests/run_regression_tests.py",
@ -108,95 +158,11 @@ py_test(
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-simpleq-tf.yaml",
"tuned_examples/regression_tests/cartpole-dqn-tf.yaml",
"tuned_examples/regression_tests/cartpole-dqn-param-noise-tf.yaml",
"tuned_examples/dqn/cartpole-simpleq.yaml",
"tuned_examples/dqn/cartpole-dqn.yaml",
"tuned_examples/dqn/cartpole-dqn-param-noise.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_impala_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-impala-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_sac_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-sac-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_ppo_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-ppo-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_a2c_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-a2c-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_appo_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-appo-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_appo_vtrace_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-appo-vtrace-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_ars_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-ars-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
args = ["--yaml-dir=tuned_examples/dqn"]
)
py_test(
@ -206,91 +172,177 @@ py_test(
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-dqn-param-noise-torch.yaml"
"tuned_examples/dqn/cartpole-simpleq.yaml",
"tuned_examples/dqn/cartpole-dqn.yaml",
"tuned_examples/dqn/cartpole-dqn-param-noise.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
args = ["--yaml-dir=tuned_examples/dqn", "--torch"]
)
# ES
py_test(
name = "run_regression_tests_cartpole_es_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/es/cartpole-es.yaml"],
args = ["--yaml-dir=tuned_examples/es"]
)
py_test(
name = "run_regression_tests_cartpole_es_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-es-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/es/cartpole-es.yaml"],
args = ["--yaml-dir=tuned_examples/es", "--torch"]
)
# IMPALA
py_test(
name = "run_regression_tests_cartpole_impala_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/impala/cartpole-impala.yaml"],
args = ["--yaml-dir=tuned_examples/impala"]
)
py_test(
name = "run_regression_tests_cartpole_impala_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-impala-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/impala/cartpole-impala.yaml"],
args = ["--yaml-dir=tuned_examples/impala", "--torch"]
)
# PG
py_test(
name = "run_regression_tests_cartpole_pg_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/pg/cartpole-pg.yaml"],
args = ["--yaml-dir=tuned_examples/pg"]
)
py_test(
name = "run_regression_tests_cartpole_pg_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-pg-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/pg/cartpole-pg.yaml"],
args = ["--yaml-dir=tuned_examples/pg", "--torch"]
)
# PPO
py_test(
name = "run_regression_tests_cartpole_ppo_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ppo/cartpole-ppo.yaml"],
args = ["--yaml-dir=tuned_examples/ppo"]
)
py_test(
name = "run_regression_tests_cartpole_ppo_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ppo/cartpole-ppo.yaml"],
args = ["--yaml-dir=tuned_examples/ppo", "--torch"]
)
py_test(
name = "run_regression_tests_pendulum_ppo_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-ppo-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/ppo/pendulum-ppo.yaml"],
args = ["--yaml-dir=tuned_examples/ppo"]
)
py_test(
name = "run_regression_tests_pendulum_ppo_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ppo/pendulum-ppo.yaml"],
args = ["--torch", "--yaml-dir=tuned_examples/ppo"]
)
# SAC
py_test(
name = "run_regression_tests_cartpole_sac_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/sac/cartpole-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac"]
)
py_test(
name = "run_regression_tests_cartpole_sac_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-sac-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/sac/cartpole-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac", "--torch"]
)
py_test(
name = "run_regression_tests_pendulum_tf",
name = "run_regression_tests_pendulum_sac_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_pendulum"],
size = "enormous", # = 60min timeout
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = glob(["tuned_examples/regression_tests/pendulum-*-tf.yaml"]),
# Pass `BAZEL` option and the path to look for yaml regression files.
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/sac/pendulum-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac"]
)
py_test(
name = "run_regression_tests_pendulum_torch",
name = "run_regression_tests_pendulum_sac_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_pendulum"],
size = "enormous", # = 60min timeout
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = glob(["tuned_examples/regression_tests/pendulum-*-torch.yaml"]),
# Pass `BAZEL` option and the path to look for yaml regression files.
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/sac/pendulum-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac", "--torch"]
)
# TD3
py_test(
name = "run_regression_tests_pendulum_td3_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ddpg/pendulum-td3.yaml"],
args = ["--yaml-dir=tuned_examples/ddpg"]
)
py_test(
name = "run_regression_tests_pendulum_td3_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ddpg/pendulum-td3.yaml"],
args = ["--yaml-dir=tuned_examples/ddpg", "--torch"]
)
# --------------------------------------------------------------------

View file

@ -200,7 +200,8 @@ def build_ddpg_stats(policy, batch):
"mean_q": torch.mean(policy.q_t),
"max_q": torch.max(policy.q_t),
"min_q": torch.min(policy.q_t),
"td_error": policy.td_error
"mean_td_error": torch.mean(policy.td_error),
"td_error": policy.td_error,
}
return stats

View file

@ -23,7 +23,9 @@ class TestDDPG(unittest.TestCase):
"""Test whether a DDPGTrainer can be built with both frameworks."""
config = ddpg.DEFAULT_CONFIG.copy()
config["num_workers"] = 0 # Run locally.
config["num_envs_per_worker"] = 2 # Run locally.
config["num_envs_per_worker"] = 2
config["learning_starts"] = 0
config["exploration_config"]["random_timesteps"] = 100
num_iterations = 2

View file

@ -9,13 +9,15 @@
# name = "run_regression_tests",
# main = "tests/run_regression_tests.py",
# tags = ["learning_tests"],
# size = "enormous", # = 60min timeout
# size = "medium", # 5min timeout
# srcs = ["tests/run_regression_tests.py"],
# data = glob(["tuned_examples/regression_tests/*.yaml"]),
# Pass `BAZEL` option and the path to look for yaml regression files.
# # Pass `BAZEL` option and the path to look for yaml regression files.
# args = ["BAZEL", "tuned_examples/regression_tests"]
# )
import argparse
import os
from pathlib import Path
import sys
import yaml
@ -24,30 +26,51 @@ import ray
from ray.tune import run_experiments
from ray.rllib import _register_all
if __name__ == "__main__":
# Bazel regression test mode: Get path to look for yaml files from argv[2].
if sys.argv[1] == "BAZEL":
# Get the path to use.
rllib_dir = Path(__file__).parent.parent
print("rllib dir={}".format(rllib_dir))
yaml_files = rllib_dir.rglob(sys.argv[2] + "/*.yaml")
yaml_files = sorted(
map(lambda path: str(path.absolute()), yaml_files), reverse=True)
# Normal mode: Get yaml files to run from command line.
else:
yaml_files = sys.argv[1:]
parser = argparse.ArgumentParser()
parser.add_argument(
"--torch",
action="store_true",
help="Runs all tests with PyTorch enabled.")
parser.add_argument(
"--yaml-dir",
type=str,
help="The directory in which to find all yamls to test.")
print("Will run the following regression files:")
if __name__ == "__main__":
args = parser.parse_args()
# Bazel regression test mode: Get path to look for yaml files from argv[2].
# Get the path or single file to use.
rllib_dir = Path(__file__).parent.parent
print("rllib dir={}".format(rllib_dir))
if not os.path.isdir(os.path.join(rllib_dir, args.yaml_dir)):
raise ValueError("yaml-dir ({}) not found!".format(args.yaml_dir))
yaml_files = rllib_dir.rglob(args.yaml_dir + "/*.yaml")
yaml_files = sorted(
map(lambda path: str(path.absolute()), yaml_files), reverse=True)
print("Will run the following regression tests:")
for yaml_file in yaml_files:
print("->", yaml_file)
# Loop through all collected files.
for yaml_file in yaml_files:
experiments = yaml.load(open(yaml_file).read())
assert len(experiments) == 1,\
"Error, can only run a single experiment per yaml file!"
print("== Test config ==")
print(yaml.dump(experiments))
# Add torch option to exp configs.
for exp in experiments.values():
if args.torch:
exp["config"]["use_pytorch"] = True
# Try running each test 3 times and make sure it reaches the given
# reward.
passed = False
for i in range(3):
try:

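The hunk above ends inside the retry loop. As a minimal sketch only (reusing the names already defined above: `experiments`, `run_experiments`, `_register_all`, `passed`, and assuming each returned trial exposes `last_result["episode_reward_mean"]`; not necessarily the verbatim committed code), the 3-attempt pass/fail check can be completed like this:

    # Illustration only: complete the retry loop sketched above.
    stop_reward = list(experiments.values())[0]["stop"]["episode_reward_mean"]
    for i in range(3):
        try:
            trials = run_experiments(experiments)
        finally:
            _register_all()  # Re-register algos/envs between attempts.
        # Pass as soon as any trial reaches the yaml's stopping reward.
        if any(t.last_result["episode_reward_mean"] >= stop_reward
               for t in trials):
            passed = True
            break
    if not passed:
        raise ValueError("Regression test did not reach the target reward in 3 runs!")
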
View file

@ -9,6 +9,7 @@ atari-a2c:
- SpaceInvadersNoFrameskip-v4
run: A2C
config:
use_pytorch: false # <- switch on/off torch
rollout_fragment_length: 20
clip_rewards: True
num_workers: 5

View file

@ -1,10 +1,11 @@
cartpole-a2c-microbatch-tf:
cartpole-a2c-microbatch:
env: CartPole-v0
run: A2C
stop:
episode_reward_mean: 100
episode_reward_mean: 150
timesteps_total: 100000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 1
gamma: 0.95

View file

@ -0,0 +1,11 @@
cartpole-a2c:
env: CartPole-v0
run: A2C
stop:
episode_reward_mean: 150
timesteps_total: 500000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 0
lr: 0.001

View file

@ -1,10 +1,11 @@
cartpole-a3c-tf:
cartpole-a3c:
env: CartPole-v0
run: A3C
stop:
episode_reward_mean: 100
timesteps_total: 100000
episode_reward_mean: 150
timesteps_total: 200000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 1
gamma: 0.95

View file

@ -4,9 +4,10 @@ pong-a3c:
env: PongDeterministic-v4
run: A3C
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 16
rollout_fragment_length: 20
use_pytorch: false
vf_loss_coeff: 0.5
entropy_coeff: 0.01
gamma: 0.99

View file

@ -1,11 +1,12 @@
cartpole-ars-torch:
cartpole-ars:
env: CartPole-v0
run: ARS
stop:
episode_reward_mean: 150
timesteps_total: 500000
config:
use_pytorch: true
# Works for both torch and tf.
use_pytorch: false
noise_stdev: 0.02
num_rollouts: 50
rollouts_used: 25
@ -13,5 +14,3 @@ cartpole-ars-torch:
sgd_stepsize: 0.01
noise_size: 25000000
eval_prob: 0.5
model:
fcnet_hiddens: [64, 64]

View file

@ -3,6 +3,8 @@ swimmer-ars:
env: Swimmer-v2
run: ARS
config:
# Works for both torch and tf.
use_pytorch: false
noise_stdev: 0.01
num_rollouts: 1
rollouts_used: 1

View file

@ -1,13 +0,0 @@
# To generate training data, first run:
# $ ./train.py --run=PPO --env=CartPole-v0 \
# --stop='{"timesteps_total": 50000}' \
# --config='{"use_pytorch": true, "output": "/tmp/out", "batch_mode": "complete_episodes"}'
cartpole-marwil-torch:
env: CartPole-v0
run: MARWIL
stop:
timesteps_total: 500000
config:
beta:
grid_search: [0, 1] # compare IL (beta=0) vs MARWIL
input: /tmp/out

View file

@ -0,0 +1,169 @@
"""
This script automates cleaning up a benchmark/experiment run of some algo
against some config (with possibly more than one tune trial,
e.g. torch=grid_search([True, False])).
Run `python cleanup_experiment.py --help` for more information.
Use on an input directory with trial contents e.g.:
..
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_10-17-54topr3h9k
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_13-59-35dqaetxnf
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_17-21-28tbhedw72
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_10-17-54lv20cgn_
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_13-59-35kwzhax_y
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_17-21-28a5j0s7za
Then run:
>> python cleanup_experiment.py --experiment-dir [parent dir w/ trial sub-dirs]
>> --output-dir [your out dir] --results-filter dumb_col_2,superfluous_col3
>> --results-max-size [max results file size in kb before(!) zipping]
The script will create one output sub-dir for each trial and only copy
the configuration and the csv results (filtered and every nth row removed
based on the given args).
"""
import argparse
import json
import os
import re
import shutil
import yaml
parser = argparse.ArgumentParser()
parser.add_argument(
"--experiment-dir",
type=str,
help="Experiment dir in which all sub-runs (seeds) are "
"located (as sub-dirs). Each sub0-run dir must contain the files: "
"params.json and progress.csv.")
parser.add_argument(
"--output-dir",
type=str,
help="The output dir, in which the cleaned up output will be placed.")
parser.add_argument(
"--results-filter",
type=str,
help="comma-separated list of csv fields to exclude.",
default="experiment_id,pid,hostname,node_ip,trial_id,hist_stats/episode_"
"reward,hist_stats/episode_lengths,experiment_tag")
parser.add_argument(
"--results-max-size",
type=int,
help="the max. size of the final results.csv file (in kb). Will erase "
"every nth line in the original input to reach that goal. "
"Use 0 for no limit (default=100).",
default=100)
def process_single_run(in_dir, out_dir):
exp_dir = os.listdir(in_dir)
# Make sure trials dir is ok.
assert "params.json" in exp_dir and "progress.csv" in exp_dir, \
"params.json or progress.csv not found in {}!".format(in_dir)
os.makedirs(out_dir, exist_ok=True)
for file in exp_dir:
absfile = os.path.join(in_dir, file)
# Config file -> Convert to yaml and move to output dir.
if file == "params.json":
assert os.path.isfile(absfile), "{} not a file!".format(file)
with open(absfile) as fp:
contents = json.load(fp)
with open(os.path.join(out_dir, "config.yaml"), "w") as fp:
yaml.dump(contents, fp)
# Progress csv file -> Filter out some columns, cut, and write to
# output_dir.
elif file == "progress.csv":
assert os.path.isfile(absfile), "{} not a file!".format(file)
col_idx_to_filter = []
with open(absfile) as fp:
# Get column names.
col_names_orig = fp.readline().strip().split(",")
# Split by comma (abiding to quotes), filter out
# unwanted columns, then write to disk.
cols_to_filter = args.results_filter.split(",")
for i, c in enumerate(col_names_orig):
if c in cols_to_filter:
col_idx_to_filter.insert(0, i)
col_names = col_names_orig.copy()
for idx in col_idx_to_filter:
col_names.pop(idx)
absfile_out = os.path.join(out_dir, "progress.csv")
with open(absfile_out, "w") as out_fp:
print(",".join(col_names), file=out_fp)
while True:
line = fp.readline().strip()
if not line:
break
line = re.sub(
"(,{2,})",
lambda m: ",None" * (len(m.group()) - 1) + ",",
line)
cols = re.findall('".+?"|[^,]+', line)
if len(cols) != len(col_names_orig):
continue
for idx in col_idx_to_filter:
cols.pop(idx)
print(",".join(cols), file=out_fp)
# Reduce the size of the output file if necessary.
out_size = os.path.getsize(absfile_out)
max_size = args.results_max_size * 1024
if 0 < max_size < out_size:
# Figure out roughly every which line we have to drop.
ratio = out_size / max_size
# If ratio > 2.0, we'll have to keep only every nth line.
if ratio > 2.0:
nth = out_size // max_size
os.system("awk 'NR==1||NR%{}==0' {} > {}.new".format(
nth, absfile_out, absfile_out))
# If ratio < 2.0 (>1.0), we'll have to drop every nth line.
else:
nth = out_size // (out_size - max_size)
os.system("awk 'NR==1||NR%{}!=0' {} > {}.new".format(
nth, absfile_out, absfile_out))
os.remove(absfile_out)
os.rename(absfile_out + ".new", absfile_out)
# Zip progress.csv into results.zip.
zip_file = os.path.join(out_dir, "results.zip")
try:
os.remove(zip_file)
except FileNotFoundError:
pass
os.system("zip -j {} {}".format(
zip_file, os.path.join(out_dir, "progress.csv")))
os.remove(os.path.join(out_dir, "progress.csv"))
# TBX events file -> Move as is.
elif re.search("^(events\\.out\\.|params\\.pkl)", file):
assert os.path.isfile(absfile), "{} not a file!".format(file)
shutil.copyfile(absfile, os.path.join(out_dir, file))
if __name__ == "__main__":
args = parser.parse_args()
exp_dir = os.listdir(args.experiment_dir)
# Loop through all sub-directories.
for i, sub_run in enumerate(sorted(exp_dir)):
abspath = os.path.join(args.experiment_dir, sub_run)
# This is a seed run.
if os.path.isdir(abspath) and \
re.search("^(\\w+?)_(\\w+?-v\\d+)(_\\d+)", sub_run):
# Create meaningful output dir name:
# [algo]_[env]_[trial #]_[trial-config]_[date YYYY-MM-DD].
cleaned_up_out = re.sub(
"^(\\w+?)_(\\w+?-v\\d+)(_\\d+)(_.+)?(_\\d{4}-\\d{2}-\\d{2})"
"_\\d{2}-\\d{2}-\\w+", "{:02}_\\1_\\2\\4\\5".format(i),
sub_run)
            # Remove superfluous `env=` specifier (env is always included in the name).
cleaned_up_out = re.sub("^(.+)env=\\w+?-v\\d+,?(.+)", "\\1\\2",
cleaned_up_out)
out_path = os.path.join(args.output_dir, cleaned_up_out)
process_single_run(abspath, out_path)
# Done.
print("done")

View file

@ -0,0 +1,5 @@
# TODO(sven):
# Add a simple script that takes n csv input files and generates plot(s)
# from these with: x-axis=ts OR wall-time; y-axis=any metric(s) (up to 2).
# ability to merge any m csv files (e.g. tf vs torch; or n seeds) together
# in one plot.
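
The file above is only a TODO placeholder. Purely as an illustration of what it describes (not part of this commit; all flags and column names below are assumptions), a minimal version of such a plotting helper could look like:

    # Hypothetical sketch of the CSV plotting helper described in the TODO above.
    import argparse
    import matplotlib.pyplot as plt
    import pandas as pd

    parser = argparse.ArgumentParser()
    parser.add_argument("--input-files", nargs="+",
                        help="progress.csv files to merge into one plot.")
    parser.add_argument("--x", default="timesteps_total",
                        help="x-axis column (timesteps or wall-time).")
    parser.add_argument("--y", nargs="+", default=["episode_reward_mean"],
                        help="metric column(s) to plot (up to 2).")

    if __name__ == "__main__":
        args = parser.parse_args()
        # One curve per (input file, metric) combination, all in a single figure.
        for csv_file in args.input_files:
            df = pd.read_csv(csv_file)
            for metric in args.y:
                plt.plot(df[args.x], df[metric],
                         label="{}: {}".format(csv_file, metric))
        plt.xlabel(args.x)
        plt.legend()
        plt.show()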

View file

@ -6,6 +6,7 @@ halfcheetah-ddpg:
episode_reward_mean: 2000
time_total_s: 5400 # 90 minutes
config:
use_pytorch: false # <- switch on/off torch
# === Model ===
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]

View file

@ -9,6 +9,8 @@ invertedpendulum-td3:
time_total_s: 900 # 15 minutes
timesteps_total: 1000000
config:
# Works for both torch and tf.
use_pytorch: false
# === Model ===
actor_hiddens: [32, 32]
critic_hiddens: [32, 32]

View file

@ -5,6 +5,8 @@ mountaincarcontinuous-apex-ddpg:
stop:
episode_reward_mean: 90
config:
# Works for both torch and tf.
use_pytorch: false
clip_rewards: False
num_workers: 16
exploration_config:

View file

@ -6,6 +6,8 @@ mountaincarcontinuous-ddpg:
episode_reward_mean: 90
time_total_s: 600 # 10 minutes
config:
# Works for both torch and tf.
use_pytorch: false
# === Model ===
actor_hiddens: [32, 64]
critic_hiddens: [64, 64]

View file

@ -15,6 +15,8 @@ mujoco-td3:
stop:
timesteps_total: 1000000
config:
# Works for both torch and tf.
use_pytorch: false
# === Exploration ===
learning_starts: 10000
exploration_config:

View file

@ -5,6 +5,8 @@ pendulum-apex-ddpg:
stop:
episode_reward_mean: -160
config:
# Works for both torch and tf.
use_pytorch: false
use_huber: True
clip_rewards: False
num_workers: 16

View file

@ -1,11 +1,13 @@
# This configuration can expect to reach -160 reward in 10k-20k timesteps
# This configuration can expect to reach -160 reward in 10k-20k timesteps.
pendulum-ddpg:
env: Pendulum-v0
run: DDPG
stop:
episode_reward_mean: -160
timesteps_total: 100000
episode_reward_mean: -900
timesteps_total: 20000
config:
# Works for both torch and tf.
use_pytorch: false
# === Model ===
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]
@ -18,7 +20,7 @@ pendulum-ddpg:
exploration_config:
type: "OrnsteinUhlenbeckNoise"
scale_timesteps: 10000
initial_scale: 1.0,
initial_scale: 1.0
final_scale: 0.02
ou_base_scale: 0.1
ou_theta: 0.15

View file

@ -1,20 +1,20 @@
# This configuration can expect to reach -160 reward in 10k-20k timesteps
pendulum-ddpg:
pendulum-td3:
env: Pendulum-v0
run: TD3
stop:
episode_reward_mean: -130
time_total_s: 900 # 10 minutes
episode_reward_mean: -900
timesteps_total: 100000
config:
# Works for both torch and tf.
use_pytorch: false
# === Model ===
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]
# === Exploration ===
learning_starts: 5000
exploration_config:
random_timesteps: 5000
# === Evaluation ===
evaluation_interval: 1
evaluation_num_episodes: 5

View file

@ -8,6 +8,7 @@ apex:
- SpaceInvadersNoFrameskip-v4
run: APEX
config:
use_pytorch: false # <- switch on/off torch
double_q: false
dueling: false
num_atoms: 1

View file

@ -9,6 +9,7 @@ atari-basic-dqn:
- SpaceInvadersNoFrameskip-v4
run: DQN
config:
use_pytorch: false # <- switch on/off torch
double_q: false
dueling: false
num_atoms: 1

View file

@ -9,6 +9,7 @@ dueling-ddqn:
- SpaceInvadersNoFrameskip-v4
run: DQN
config:
use_pytorch: false # <- switch on/off torch
double_q: true
dueling: true
num_atoms: 1

View file

@ -1,10 +1,11 @@
cartpole-dqn-tf-w-param-noise:
cartpole-dqn-w-param-noise:
env: CartPole-v0
run: DQN
stop:
episode_reward_mean: 150
timesteps_total: 300000
config:
# Works for both torch and tf.
use_pytorch: false
exploration_config:
type: ParameterNoise

View file

@ -1,10 +1,11 @@
cartpole-dqn-tf:
cartpole-dqn:
env: CartPole-v0
run: DQN
stop:
episode_reward_mean: 150
timesteps_total: 50000
config:
# Works for both torch and tf.
use_pytorch: false
n_step: 3
gamma: 0.95

View file

@ -1,8 +1,9 @@
cartpole-dqn-tf:
cartpole-dqn:
env: CartPole-v0
run: SimpleQ
stop:
episode_reward_mean: 150
timesteps_total: 50000
config:
# Works for both torch and tf.
use_pytorch: false

View file

@ -6,6 +6,7 @@ pong-apex:
env: PongNoFrameskip-v4
run: APEX
config:
use_pytorch: false
target_network_update_freq: 20000
num_workers: 4
num_envs_per_worker: 8

View file

@ -6,6 +6,8 @@ pong-deterministic-dqn:
episode_reward_mean: 20
time_total_s: 7200
config:
# Works for both torch and tf.
use_pytorch: false
num_gpus: 1
gamma: 0.99
lr: .0001

View file

@ -1,10 +1,11 @@
cartpole-es-tf:
cartpole-es:
env: CartPole-v0
run: ES
stop:
episode_reward_mean: 150
timesteps_total: 500000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 2
noise_size: 25000000

View file

@ -0,0 +1,9 @@
humanoid-v2-es:
env: Humanoid-v2
run: ES
stop:
episode_reward_mean: 6000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 100

View file

@ -1,7 +0,0 @@
humanoid-es:
env: Humanoid-v1
run: ES
stop:
episode_reward_mean: 6000
config:
num_workers: 100

View file

@ -5,5 +5,6 @@ cartpole-impala-tf:
episode_reward_mean: 150
timesteps_total: 500000
config:
# Works for both torch and tf.
use_pytorch: false
num_gpus: 0

View file

@ -0,0 +1,6 @@
pendulum-impala-tf:
env: Pendulum-v0
run: IMPALA
stop:
episode_reward_mean: -700
timesteps_total: 500000

View file

@ -2,12 +2,13 @@
# $ ./train.py --run=PPO --env=CartPole-v0 \
# --stop='{"timesteps_total": 50000}' \
# --config='{"output": "/tmp/out", "batch_mode": "complete_episodes"}'
cartpole-marwil-tf:
cartpole-marwil:
env: CartPole-v0
run: MARWIL
stop:
timesteps_total: 500000
config:
use_pytorch: false # <- switch on/off torch
beta:
grid_search: [0, 1] # compare IL (beta=0) vs MARWIL
input: /tmp/out

View file

@ -1,21 +0,0 @@
pendulum-appo-vtrace-torch:
env: Pendulum-v0
run: APPO
stop:
episode_reward_mean: -1000 # just check it learns a bit
timesteps_total: 500000
config:
use_pytorch: true
vtrace: true
num_gpus: 0
num_workers: 1
lambda: 0.1
gamma: 0.95
lr: 0.0003
train_batch_size: 100
minibatch_buffer_size: 16
num_sgd_iter: 10
model:
fcnet_hiddens: [256, 256]
batch_mode: truncate_episodes
observation_filter: MeanStdFilter

View file

@ -1,18 +0,0 @@
# can expect improvement to -140 reward in ~300-500k timesteps
pendulum-ppo:
env: Pendulum-v0
run: PPO
config:
train_batch_size: 2048
vf_clip_param: 10.0
num_workers: 0
num_envs_per_worker: 10
lambda: 0.1
gamma: 0.95
lr: 0.0003
sgd_minibatch_size: 64
num_sgd_iter: 10
model:
fcnet_hiddens: [64, 64]
batch_mode: complete_episodes
observation_filter: MeanStdFilter

View file

@ -1,9 +1,10 @@
cartpole-pg-torch:
cartpole-pg:
env: CartPole-v0
run: PG
stop:
episode_reward_mean: 150
timesteps_total: 100000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 0
use_pytorch: true

View file

@ -1,21 +0,0 @@
pong-a3c-pytorch-cnn:
env: PongDeterministic-v4
run: A3C
config:
num_workers: 16
rollout_fragment_length: 20
use_pytorch: true
vf_loss_coeff: 0.5
entropy_coeff: 0.01
gamma: 0.99
grad_clip: 40.0
lambda: 1.0
lr: 0.0001
observation_filter: NoFilter
model:
use_lstm: false
dim: 84
grayscale: true
zero_mean: false
optimizer:
grads_per_step: 1000

View file

@ -7,6 +7,7 @@ atari-ddppo:
- BreakoutNoFrameskip-v4
run: DDPPO
config:
use_pytorch: true # DDPPO only supports PyTorch so far
# Worker config: 10 workers, each of which requires a GPU.
num_workers: 10
num_gpus_per_worker: 1

View file

@ -9,6 +9,7 @@ atari-ppo:
- SpaceInvadersNoFrameskip-v4
run: PPO
config:
use_pytorch: false # <- switch on/off torch
lambda: 0.95
kl_coeff: 0.5
clip_rewards: True

View file

@ -1,10 +1,11 @@
cartpole-appo-vtrace-tf:
cartpole-appo-vtrace:
env: CartPole-v0
run: APPO
stop:
episode_reward_mean: 150
timesteps_total: 200000
config:
# Works for both torch and tf.
use_pytorch: false
rollout_fragment_length: 10
train_batch_size: 10

View file

@ -1,10 +1,11 @@
cartpole-appo-tf:
cartpole-appo:
env: CartPole-v0
run: APPO
stop:
episode_reward_mean: 150
timesteps_total: 200000
config:
# Works for both torch and tf.
use_pytorch: false
rollout_fragment_length: 10
train_batch_size: 10

View file

@ -1,9 +1,8 @@
cartpole-ddppo-torch:
cartpole-ddppo:
env: CartPole-v0
run: DDPPO
stop:
episode_reward_mean: 100
episode_reward_mean: 150
timesteps_total: 100000
config:
use_pytorch: true
num_gpus_per_worker: 0

View file

@ -5,6 +5,7 @@ cartpole-ppo:
episode_reward_mean: 200
time_total_s: 180
config:
use_pytorch: false # <- switch on/off torch
num_workers: 2
num_sgd_iter:
grid_search: [1, 4]

View file

@ -6,6 +6,8 @@ cartpole-ppo:
episode_reward_mean: 200
time_total_s: 180
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 1
num_sgd_iter:
grid_search: [1, 4]

View file

@ -1,11 +1,12 @@
cartpole-ppo-torch:
cartpole-ppo:
env: CartPole-v0
run: PPO
stop:
episode_reward_mean: 150
timesteps_total: 100000
config:
use_pytorch: true
# Works for both torch and tf.
use_pytorch: false
gamma: 0.99
lr: 0.0003
num_workers: 1

View file

@ -1,11 +1,12 @@
# This can reach 9k reward in 2 hours on a Titan XP GPU
# This can reach 9k reward in 2 hours on a Titan XP GPU
# with 16 workers and 8 envs per worker.
halfcheetah-appo:
env: HalfCheetah-v2
run: APPO
stop:
time_total_s: 10800
time_total_s: 10800
config:
use_pytorch: false # <- switch on/off torch
vtrace: True
gamma: 0.99
lambda: 0.95
@ -30,6 +31,6 @@ halfcheetah-appo:
batch_mode: truncate_episodes
use_kl_loss: True
kl_coeff: 1.0
kl_target: 0.04
kl_target: 0.04
observation_filter: MeanStdFilter

View file

@ -1,23 +1,24 @@
halfcheetah-ppo:
env: HalfCheetah-v2
run: PPO
stop:
episode_reward_mean: 9800
time_total_s: 10800
config:
gamma: 0.99
lambda: 0.95
kl_coeff: 1.0
num_sgd_iter: 32
lr: .0003
vf_loss_coeff: 0.5
clip_param: 0.2
sgd_minibatch_size: 4096
train_batch_size: 65536
num_workers: 16
num_gpus: 1
grad_clip: 0.5
num_envs_per_worker:
grid_search: [16, 32]
batch_mode: truncate_episodes
observation_filter: MeanStdFilter
halfcheetah-ppo:
env: HalfCheetah-v2
run: PPO
stop:
episode_reward_mean: 9800
time_total_s: 10800
config:
use_pytorch: false # <- switch on/off torch
gamma: 0.99
lambda: 0.95
kl_coeff: 1.0
num_sgd_iter: 32
lr: .0003
vf_loss_coeff: 0.5
clip_param: 0.2
sgd_minibatch_size: 4096
train_batch_size: 65536
num_workers: 16
num_gpus: 1
grad_clip: 0.5
num_envs_per_worker:
grid_search: [16, 32]
batch_mode: truncate_episodes
observation_filter: MeanStdFilter

View file

@ -2,6 +2,8 @@ hopper-ppo:
env: Hopper-v1
run: PPO
config:
# Works for both torch and tf.
use_pytorch: false
gamma: 0.995
kl_coeff: 1.0
num_sgd_iter: 20

View file

@ -4,6 +4,8 @@ humanoid-ppo-gae:
stop:
episode_reward_mean: 6000
config:
# Works for both torch and tf.
use_pytorch: false
gamma: 0.995
lambda: 0.95
clip_param: 0.2

View file

@ -4,6 +4,8 @@ humanoid-ppo:
stop:
episode_reward_mean: 6000
config:
# Works for both torch and tf.
use_pytorch: false
gamma: 0.995
kl_coeff: 1.0
num_sgd_iter: 20

View file

@ -1,10 +1,11 @@
pendulum-appo-vtrace-tf:
pendulum-appo-vtrace:
env: Pendulum-v0
run: APPO
stop:
episode_reward_mean: -1000 # just check it learns a bit
timesteps_total: 500000
config:
# Works for both torch and tf.
use_pytorch: false
vtrace: true
num_gpus: 0

View file

@ -1,10 +1,12 @@
pendulum-ppo-tf:
# Can expect improvement to -140 reward in ~300-500k timesteps.
pendulum-ppo:
env: Pendulum-v0
run: PPO
stop:
episode_reward_mean: -500
timesteps_total: 400000
config:
# Works for both torch and tf.
use_pytorch: false
train_batch_size: 2048
vf_clip_param: 10.0

View file

@ -1,29 +1,31 @@
# This can reach 18-19 reward in ~5-7 minutes on a Titan XP GPU
# with 32 workers and 8 envs per worker. IMPALA, when run with
# similar configurations, solved Pong in 10-12 minutes.
# APPO can also solve Pong in 2.5 million timesteps, which is
# 2x more efficient than that of IMPALA.
pong-appo:
env: PongNoFrameskip-v4
run: APPO
stop:
episode_reward_mean: 18.0
timesteps_total: 5000000
config:
vtrace: True
use_kl_loss: False
rollout_fragment_length: 50
train_batch_size: 750
num_workers: 32
broadcast_interval: 1
max_sample_requests_in_flight_per_worker: 1
num_data_loader_buffers: 1
num_envs_per_worker: 8
minibatch_buffer_size: 4
num_sgd_iter: 2
vf_loss_coeff: 1.0
clip_param: 0.3
num_gpus: 1
grad_clip: 10
model:
dim: 42
# This can reach 18-19 reward in ~5-7 minutes on a Titan XP GPU
# with 32 workers and 8 envs per worker. IMPALA, when run with
# similar configurations, solved Pong in 10-12 minutes.
# APPO can also solve Pong in 2.5 million timesteps, which is
# 2x more efficient than that of IMPALA.
pong-appo:
env: PongNoFrameskip-v4
run: APPO
stop:
episode_reward_mean: 18.0
timesteps_total: 5000000
config:
# Works for both torch and tf.
use_pytorch: false
vtrace: True
use_kl_loss: False
rollout_fragment_length: 50
train_batch_size: 750
num_workers: 32
broadcast_interval: 1
max_sample_requests_in_flight_per_worker: 1
num_data_loader_buffers: 1
num_envs_per_worker: 8
minibatch_buffer_size: 4
num_sgd_iter: 2
vf_loss_coeff: 1.0
clip_param: 0.3
num_gpus: 1
grad_clip: 10
model:
dim: 42

View file

@ -1,11 +1,13 @@
# On a single GPU, this achieves maximum reward in ~15-20 minutes.
#
# $ python train.py -f tuned_examples/pong-ppo.yaml
# $ python train.py -f tuned_configs/pong-ppo.yaml
#
pong-ppo:
env: PongNoFrameskip-v4
run: PPO
config:
# Works for both torch and tf.
use_pytorch: false
lambda: 0.95
kl_coeff: 0.5
clip_rewards: True

View file

@ -2,6 +2,8 @@ walker2d-v1-ppo:
env: Walker2d-v1
run: PPO
config:
# Works for both torch and tf.
use_pytorch: false
kl_coeff: 1.0
num_sgd_iter: 20
lr: .0001

View file

@ -1,9 +0,0 @@
cartpole-a2c-torch:
env: CartPole-v0
run: A2C
stop:
episode_reward_mean: 100
timesteps_total: 100000
config:
num_workers: 0
use_pytorch: true

View file

@ -1,14 +0,0 @@
cartpole-appo-torch:
env: CartPole-v0
run: APPO
stop:
episode_reward_mean: 150
timesteps_total: 200000
config:
use_pytorch: true
rollout_fragment_length: 10
train_batch_size: 10
num_envs_per_worker: 5
num_workers: 1
num_gpus: 0
vtrace: false

View file

@ -1,14 +0,0 @@
cartpole-appo-vtrace-torch:
env: CartPole-v0
run: APPO
stop:
episode_reward_mean: 150
timesteps_total: 200000
config:
use_pytorch: true
rollout_fragment_length: 10
train_batch_size: 10
num_envs_per_worker: 5
num_workers: 1
num_gpus: 0
vtrace: true

View file

@ -1,17 +0,0 @@
cartpole-ars-tf:
env: CartPole-v0
run: ARS
stop:
episode_reward_mean: 50
timesteps_total: 500000
config:
use_pytorch: false
noise_stdev: 0.02
num_rollouts: 50
rollouts_used: 25
num_workers: 2
sgd_stepsize: 0.01
noise_size: 25000000
eval_prob: 0.5
model:
fcnet_hiddens: [] # a linear policy

View file

@ -1,18 +0,0 @@
cartpole-dqn-torch-w-param-noise:
env: CartPole-v0
run: DQN
stop:
episode_reward_mean: 150
timesteps_total: 300000
config:
use_pytorch: true
exploration_config:
type: ParameterNoise
random_timesteps: 10000
initial_stddev: 1.0
batch_mode: complete_episodes
lr: 0.0008
num_workers: 0
model:
fcnet_hiddens: [32, 32]
fcnet_activation: tanh

View file

@ -1,10 +0,0 @@
cartpole-dqn-torch:
env: CartPole-v0
run: DQN
stop:
episode_reward_mean: 150
timesteps_total: 50000
config:
use_pytorch: true
n_step: 3
gamma: 0.95

View file

@ -1,11 +0,0 @@
cartpole-es-torch:
env: CartPole-v0
run: ES
stop:
episode_reward_mean: 150
timesteps_total: 500000
config:
use_pytorch: true
num_workers: 2
noise_size: 25000000
episodes_per_batch: 50

View file

@ -1,9 +0,0 @@
cartpole-impala-torch:
env: CartPole-v0
run: IMPALA
stop:
episode_reward_mean: 150
timesteps_total: 500000
config:
use_pytorch: true
num_gpus: 0

View file

@ -1,8 +0,0 @@
cartpole-pg-tf:
env: CartPole-v0
run: PG
stop:
episode_reward_mean: 100
timesteps_total: 100000
config:
num_workers: 0

View file

@ -1,17 +0,0 @@
cartpole-ppo-tf:
env: CartPole-v0
run: PPO
stop:
episode_reward_mean: 150
timesteps_total: 100000
config:
gamma: 0.99
lr: 0.0003
num_workers: 1
observation_filter: MeanStdFilter
num_sgd_iter: 6
vf_share_layers: true
vf_loss_coeff: 0.01
model:
fcnet_hiddens: [32]
fcnet_activation: linear

View file

@ -1,17 +0,0 @@
cartpole-sac-torch:
env: CartPole-v0
run: SAC
stop:
episode_reward_mean: 150
timesteps_total: 50000
config:
use_pytorch: true
gamma: 0.95
no_done_at_end: false
target_network_update_freq: 32
tau: 1.0
train_batch_size: 32
optimization:
actor_learning_rate: 0.005
critic_learning_rate: 0.005
entropy_learning_rate: 0.0001

View file

@ -1,8 +0,0 @@
cartpole-dqn-torch:
env: CartPole-v0
run: SimpleQ
stop:
episode_reward_mean: 150
timesteps_total: 50000
config:
use_pytorch: true

View file

@ -1,10 +0,0 @@
pendulum-ddpg-tf:
env: Pendulum-v0
run: DDPG
stop:
episode_reward_mean: -700
timesteps_total: 100000
config:
use_pytorch: false
use_huber: true
clip_rewards: false

View file

@ -1,10 +0,0 @@
pendulum-ddpg-torch:
env: Pendulum-v0
run: DDPG
stop:
episode_reward_mean: -700
timesteps_total: 100000
config:
use_pytorch: true
use_huber: true
clip_rewards: false

View file

@ -1,21 +0,0 @@
pendulum-ppo-torch:
env: Pendulum-v0
run: PPO
stop:
episode_reward_mean: -500
timesteps_total: 400000
config:
use_pytorch: true
train_batch_size: 2048
vf_clip_param: 10.0
num_workers: 0
num_envs_per_worker: 10
lambda: 0.1
gamma: 0.95
lr: 0.0003
sgd_minibatch_size: 64
num_sgd_iter: 10
model:
fcnet_hiddens: [64, 64]
batch_mode: complete_episodes
observation_filter: MeanStdFilter

View file

@ -1,13 +0,0 @@
pendulum-sac-tf:
env: Pendulum-v0
run: SAC
stop:
episode_reward_mean: -300 # note that evaluation perf is higher
timesteps_total: 10000
config:
use_pytorch: false
soft_horizon: true
clip_actions: false
normalize_actions: true
metrics_smoothing_episodes: 5
no_done_at_end: true

View file

@ -1,13 +0,0 @@
pendulum-sac-torch:
env: Pendulum-v0
run: SAC
stop:
episode_reward_mean: -300 # note that evaluation perf is higher
timesteps_total: 10000
config:
use_pytorch: true
soft_horizon: true
clip_actions: false
normalize_actions: true
metrics_smoothing_episodes: 5
no_done_at_end: true

View file

@ -1,8 +0,0 @@
pendulum-td3-tf:
env: Pendulum-v0
run: TD3
config:
use_pytorch: false
stop:
episode_reward_mean: -900
timesteps_total: 100000

View file

@ -10,9 +10,7 @@ atari-sac-tf-and-torch:
stop:
timesteps_total: 20000000
config:
# Works for both torch and tf.
use_pytorch:
grid_search: [false, true]
use_pytorch: false # <- switch on/off torch
gamma: 0.99
# state-preprocessor=Our default Atari Conv2D-net.
use_state_preprocessor: true

View file

@ -1,10 +1,11 @@
cartpole-sac-tf:
cartpole-sac:
env: CartPole-v0
run: SAC
stop:
episode_reward_mean: 150
timesteps_total: 50000
timesteps_total: 100000
config:
# Works for both torch and tf.
use_pytorch: false
gamma: 0.95
no_done_at_end: false

View file

@ -5,6 +5,7 @@ halfcheetah_sac:
stop:
episode_reward_mean: 9000
config:
use_pytorch: false # <- switch on/off torch
horizon: 1000
soft_horizon: false
Q_model:

View file

@ -8,6 +8,7 @@ mspacman-sac-tf:
episode_reward_mean: 800
timesteps_total: 100000
config:
# Works for both torch and tf.
use_pytorch: false
gamma: 0.99
# state-preprocessor=Our default Atari Conv2D-net.

View file

@ -1,13 +1,16 @@
# Pendulum SAC can attain -150+ reward in 6-7k
# Configurations are similar to the original softlearning/sac codebase.
pendulum_sac:
pendulum-sac:
env: Pendulum-v0
run: SAC
stop:
episode_reward_mean: -150
episode_reward_mean: -300
timesteps_total: 10000
config:
# Works for both torch and tf.
use_pytorch: false
horizon: 200
soft_horizon: False
soft_horizon: true
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
@ -16,10 +19,10 @@ pendulum_sac:
fcnet_hiddens: [256, 256]
tau: 0.005
target_entropy: auto
no_done_at_end: True
no_done_at_end: true
n_step: 1
rollout_fragment_length: 1
prioritized_replay: False
prioritized_replay: true
train_batch_size: 256
target_network_update_freq: 1
timesteps_per_iteration: 1000
@ -31,6 +34,6 @@ pendulum_sac:
num_workers: 0
num_gpus: 0
clip_actions: False
normalize_actions: True
normalize_actions: true
evaluation_interval: 1
metrics_smoothing_episodes: 5

View file

@ -43,7 +43,7 @@ class PiecewiseSchedule(Schedule):
assert idxes == sorted(idxes)
self.interpolation = interpolation
self.outside_value = outside_value
self.endpoints = endpoints
self.endpoints = [(int(e[0]), float(e[1])) for e in endpoints]
@override(Schedule)
def _value(self, t):
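
For context, the new list comprehension normalizes whatever endpoint pairs are passed in (e.g. lists parsed from a yaml config or numpy scalars) into plain `(int, float)` tuples before they are stored; a small illustration of the effect (example values assumed):

    endpoints = [[0, 1.0], [20000, 0.1]]   # e.g. as read from a tuned_examples yaml
    endpoints = [(int(t), float(v)) for t, v in endpoints]
    # -> [(0, 1.0), (20000, 0.1)]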