From 8acb469b047cd9b327c9477a13b030eb7357860e Mon Sep 17 00:00:00 2001
From: Sven Mika <sven@anyscale.io>
Date: Thu, 26 Aug 2021 14:09:20 +0200
Subject: [PATCH] [RLlib; Testing] Green all RLlib nightly tests. (#18073)

---
 release/rllib_tests/app_config.yaml           |  2 +
 .../learning_tests/hard_learning_tests.yaml   | 93 +++++++++----------
 .../debug_learning_failure_git_bisect.py      | 63 ++++++++-----
 .../tuned_examples/ddpg/halfcheetah-ddpg.yaml |  4 +-
 .../ddpg/halfcheetah-pybullet-ddpg.yaml       | 42 +++++++++
 .../ddpg/hopper-pybullet-ddpg.yaml            | 44 +++++++++
 6 files changed, 176 insertions(+), 72 deletions(-)
 create mode 100644 rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml
 create mode 100644 rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml

diff --git a/release/rllib_tests/app_config.yaml b/release/rllib_tests/app_config.yaml
index aa574172a..93ae92d01 100755
--- a/release/rllib_tests/app_config.yaml
+++ b/release/rllib_tests/app_config.yaml
@@ -13,6 +13,8 @@ python:
     - gym[atari]
     - atari_py
     - pybullet
+    # Pin this to 2.4.3 so it'll work with CUDA=11.0.
+    - tensorflow==2.4.3
   conda_packages: []
 
 post_build_cmds:
diff --git a/release/rllib_tests/learning_tests/hard_learning_tests.yaml b/release/rllib_tests/learning_tests/hard_learning_tests.yaml
index 7efc08225..95c31ac1f 100644
--- a/release/rllib_tests/learning_tests/hard_learning_tests.yaml
+++ b/release/rllib_tests/learning_tests/hard_learning_tests.yaml
@@ -52,6 +52,51 @@ apex-breakoutnoframeskip-v4:
         target_network_update_freq: 50000
         timesteps_per_iteration: 25000
 
+ddpg-hopperbulletenv-v0:
+    env: HopperBulletEnv-v0
+    run: DDPG
+    # Minimum reward and total ts (in given time_total_s) to pass this test.
+    pass_criteria:
+        episode_reward_mean: 120.0
+        timesteps_total: 50000
+    stop:
+        time_total_s: 3600
+    config:
+        actor_hiddens: [256, 256]
+        critic_hiddens: [256, 256]
+        n_step: 3
+        model: {}
+        gamma: 0.99
+        env_config: {}
+        exploration_config:
+            initial_scale: 1.0
+            final_scale: 0.02
+            scale_timesteps: 10000
+            ou_base_scale: 0.1
+            ou_theta: 0.15
+            ou_sigma: 0.2
+        timesteps_per_iteration: 1000
+        target_network_update_freq: 0
+        tau: 0.001
+        buffer_size: 10000
+        prioritized_replay: True
+        prioritized_replay_alpha: 0.6
+        prioritized_replay_beta: 0.4
+        prioritized_replay_eps: 0.000001
+        clip_rewards: false
+        actor_lr: 0.001
+        critic_lr: 0.001
+        use_huber: true
+        huber_threshold: 1.0
+        l2_reg: 0.000001
+        learning_starts: 500
+        rollout_fragment_length: 1
+        train_batch_size: 48
+        num_gpus: 1
+        num_workers: 0
+        num_gpus_per_worker: 0
+        worker_side_prioritization: false
+
 dqn-breakoutnoframeskip-v4:
     env: BreakoutNoFrameskip-v4
     run: DQN
@@ -173,51 +218,3 @@ sac-halfcheetahbulletenv-v0:
         normalize_actions: true
         evaluation_interval: 1
         metrics_smoothing_episodes: 5
-
-# Expect roughly 1000 reward after 1h on 1GPU
-# TODO: (sven) this seems to be somewhat broken on tf AND torch (?)
-#  try to find older version that still works.
-ddpg-halfcheetahbulletenv-v0:
-    env: HalfCheetahBulletEnv-v0
-    run: DDPG
-    # Minimum reward and total ts (in given time_total_s) to pass this test.
-    pass_criteria:
-        episode_reward_mean: -100.0
-        timesteps_total: 400000
-    stop:
-        time_total_s: 7200
-    config:
-        actor_hiddens: [64, 64]
-        critic_hiddens: [64, 64]
-        n_step: 1
-        model: {}
-        gamma: 0.99
-        env_config: {}
-        exploration_config:
-            initial_scale: 1.0
-            final_scale: 0.02
-            scale_timesteps: 10000
-            ou_base_scale: 0.1
-            ou_theta: 0.15
-            ou_sigma: 0.2
-        timesteps_per_iteration: 1000
-        target_network_update_freq: 0
-        tau: 0.001
-        buffer_size: 10000
-        prioritized_replay: True
-        prioritized_replay_alpha: 0.6
-        prioritized_replay_beta: 0.4
-        prioritized_replay_eps: 0.000001
-        clip_rewards: False
-        actor_lr: 0.001
-        critic_lr: 0.001
-        use_huber: False
-        huber_threshold: 1.0
-        l2_reg: 0.000001
-        learning_starts: 500
-        rollout_fragment_length: 1
-        train_batch_size: 64
-        num_workers: 0
-        num_gpus: 1
-        num_gpus_per_worker: 0
-        worker_side_prioritization: False
diff --git a/rllib/tests/git_bisect/debug_learning_failure_git_bisect.py b/rllib/tests/git_bisect/debug_learning_failure_git_bisect.py
index 13ebdded5..238063c93 100644
--- a/rllib/tests/git_bisect/debug_learning_failure_git_bisect.py
+++ b/rllib/tests/git_bisect/debug_learning_failure_git_bisect.py
@@ -27,6 +27,7 @@ $ python debug_learning_failure_git_bisect.py -f [yaml file] --stop-reward=180
 import argparse
 import importlib
 import json
+import numpy as np
 import os
 import subprocess
 import yaml
@@ -47,6 +48,11 @@ parser.add_argument(
     "--skip-install-ray",
     action="store_true",
     help="If set, do not attempt to re-build ray from source.")
+parser.add_argument(
+    "--num-samples",
+    type=int,
+    default=1,
+    help="The number of samples to run for the given experiment.")
 parser.add_argument(
     "--stop-iters",
     type=int,
@@ -122,8 +128,9 @@ if __name__ == "__main__":
     if args.framework:
         config["framework"] = args.framework
 
-    # Define stopping criteria.
-    stop = {}
+    # Define stopping criteria. From the yaml file ..
+    stop = experiment_config.get("stop", {})
+    # .. but override with command line provided ones.
     if args.stop_iters:
         stop["training_iteration"] = args.stop_iters
     if args.stop_timesteps:
@@ -133,15 +140,24 @@ if __name__ == "__main__":
     if args.stop_time:
         stop["time_total_s"] = args.stop_time
 
+    # Invalid pass criteria.
+    if stop.get("episode_reward_mean") is None and \
+            (stop.get("timesteps_total") is None or
+             stop.get("time_total_s") is None):
+        raise ValueError("Invalid pass criterium! Must use either "
+                         "(--stop-reward + optionally any other) OR "
+                         "(--stop-timesteps + --stop-time).")
+
     # - Stop ray.
-    # - Uninstall and re-install ray (from source) if required.
-    # - Start ray.
+    # Do this twice to make sure all processes are stopped (older versions of
+    # ray used to not kill everything the first time around).
     try:
         subprocess.run("ray stop".split(" "))
         subprocess.run("ray stop".split(" "))
     except Exception:
         pass
 
+    # - Uninstall and re-install ray (from source) if required.
     # Install ray from the checked out repo.
     if not args.skip_install_ray:
         subprocess.run("sudo apt-get update".split(" "))
@@ -158,10 +174,15 @@ if __name__ == "__main__":
         subprocess.run("pip install -e . --verbose".split(" "))
         os.chdir("../")
 
+    # - Start ray.
     try:
         subprocess.run("ray start --head".split(" "))
     except Exception:
-        subprocess.run("ray stop".split(" "))
+        try:
+            subprocess.run("ray stop".split(" "))
+            subprocess.run("ray stop".split(" "))
+        except Exception:
+            pass
         try:
             subprocess.run("ray start --head".split(" "))
         except Exception as e:
@@ -175,31 +196,29 @@ if __name__ == "__main__":
     ray.init()
 
     results = tune.run(run, stop=stop, config=config)
+    last_results = [t.last_result for t in results.trials]
 
-    # Criterium is to have reached some min reward.
-    if args.stop_reward:
-        last_result = results.trials[0].last_result
-        avg_reward = last_result["episode_reward_mean"]
-        if avg_reward < args.stop_reward:
+    # Criterion is to have reached some min reward within given
+    # wall time, iters, or timesteps.
+    if stop.get("episode_reward_mean") is not None:
+        max_avg_reward = np.max(
+            [r["episode_reward_mean"] for r in last_results])
+        if max_avg_reward < stop["episode_reward_mean"]:
             raise ValueError("`stop-reward` of {} not reached!".format(
-                args.stop_reward))
-    # Criterium is to have run through n env timesteps in some wall time m.
-    elif args.stop_timesteps and args.stop_time:
-        last_result = results.trials[0].last_result
-        total_timesteps = last_result["timesteps_total"]
-        total_time = last_result["time_total_s"]
-        desired_speed = args.stop_timesteps / args.stop_time
+                stop["episode_reward_mean"]))
+    # Criterion is to have run through n env timesteps in some wall time m
+    # (minimum throughput).
+    else:
+        total_timesteps = np.sum([r["timesteps_total"] for r in last_results])
+        total_time = np.sum([r["time_total_s"] for r in last_results])
+        desired_speed = stop["timesteps_total"] / stop["time_total_s"]
         actual_speed = total_timesteps / total_time
         # We stopped because we reached the time limit ->
         # Means throughput is too slow (time steps not reached).
         if actual_speed < desired_speed:
             raise ValueError(
                 "`stop-timesteps` of {} not reached in {}sec!".format(
-                    args.stop_timesteps, args.stop_time))
-    else:
-        raise ValueError("Invalid pass criterium! Must use either "
-                         "(--stop-reward + optionally any other) OR "
-                         "(--stop-timesteps + --stop-time).")
+                    stop["timesteps_total"], stop["time_total_s"]))
 
     print("ok")
     ray.shutdown()
diff --git a/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml b/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml
index 047a92da1..c9a5a607e 100644
--- a/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml
+++ b/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml
@@ -40,7 +40,7 @@ halfcheetah-ddpg:
         # === Optimization ===
         actor_lr: 0.001
         critic_lr: 0.001
-        use_huber: False
+        use_huber: false
         huber_threshold: 1.0
         l2_reg: 0.000001
         learning_starts: 500
@@ -50,7 +50,7 @@ halfcheetah-ddpg:
         # === Parallelism ===
         num_workers: 0
         num_gpus_per_worker: 0
-        worker_side_prioritization: False
+        worker_side_prioritization: false
 
         # === Evaluation ===
         evaluation_interval: 5
diff --git a/rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml b/rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml
new file mode 100644
index 000000000..3d60c4e96
--- /dev/null
+++ b/rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml
@@ -0,0 +1,42 @@
+# Note: HalfCheetahBulletEnv-v0 is not the same as MuJoCo's HalfCheetah-v0.
+ddpg-halfcheetahbulletenv-v0:
+    env: HalfCheetahBulletEnv-v0
+    run: DDPG
+    stop:
+        episode_reward_mean: -300.0
+        timesteps_total: 200000
+    config:
+        actor_hiddens: [256, 256]
+        critic_hiddens: [256, 256]
+        n_step: 3
+        model: {}
+        gamma: 0.99
+        env_config: {}
+        exploration_config:
+            initial_scale: 1.0
+            final_scale: 0.02
+            scale_timesteps: 10000
+            ou_base_scale: 0.1
+            ou_theta: 0.15
+            ou_sigma: 0.2
+        timesteps_per_iteration: 1000
+        target_network_update_freq: 0
+        tau: 0.001
+        buffer_size: 15000
+        prioritized_replay: true
+        prioritized_replay_alpha: 0.6
+        prioritized_replay_beta: 0.4
+        prioritized_replay_eps: 0.000001
+        clip_rewards: false
+        actor_lr: 0.001
+        critic_lr: 0.001
+        use_huber: true
+        huber_threshold: 1.0
+        l2_reg: 0.000001
+        learning_starts: 500
+        rollout_fragment_length: 1
+        train_batch_size: 48
+        num_workers: 0
+        num_gpus: 1
+        num_gpus_per_worker: 0
+        worker_side_prioritization: false
diff --git a/rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml b/rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml
new file mode 100644
index 000000000..73200df7e
--- /dev/null
+++ b/rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml
@@ -0,0 +1,44 @@
+# Note: HopperBulletEnv-v0 is not the same as MuJoCo's Hopper-v0.
+ddpg-hopperbulletenv-v0:
+    env: HopperBulletEnv-v0
+    run: DDPG
+    # Minimum reward and total ts (in given time_total_s) to pass this test.
+    pass_criteria:
+        episode_reward_mean: 120.0
+        timesteps_total: 50000
+    stop:
+        time_total_s: 2000
+    config:
+        actor_hiddens: [256, 256]
+        critic_hiddens: [256, 256]
+        n_step: 3
+        model: {}
+        gamma: 0.99
+        env_config: {}
+        exploration_config:
+            initial_scale: 1.0
+            final_scale: 0.02
+            scale_timesteps: 10000
+            ou_base_scale: 0.1
+            ou_theta: 0.15
+            ou_sigma: 0.2
+        timesteps_per_iteration: 1000
+        target_network_update_freq: 0
+        tau: 0.001
+        buffer_size: 10000
+        prioritized_replay: True
+        prioritized_replay_alpha: 0.6
+        prioritized_replay_beta: 0.4
+        prioritized_replay_eps: 0.000001
+        clip_rewards: False
+        actor_lr: 0.001
+        critic_lr: 0.001
+        use_huber: False
+        huber_threshold: 1.0
+        l2_reg: 0.000001
+        learning_starts: 500
+        rollout_fragment_length: 1
+        train_batch_size: 48
+        num_workers: 0
+        num_gpus_per_worker: 0
+        worker_side_prioritization: False