[RLlib; Testing] Green all RLlib nightly tests. (#18073)
parent 089dd9b949
commit 8acb469b04
6 changed files with 176 additions and 72 deletions
@@ -13,6 +13,8 @@ python:
    - gym[atari]
    - atari_py
    - pybullet
    # Pin this to 2.4.3 so it'll work with CUDA=11.0.
    - tensorflow==2.4.3
  conda_packages: []

post_build_cmds:
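For reference, a quick sanity check that the TensorFlow pin above matches the cluster's CUDA toolkit. This is a sketch under the assumption that the pinned 2.4.3 wheel is a CUDA 11.0 build; it is not part of the commit:

    import tensorflow as tf

    # Expect 2.4.3 per the pin above.
    print(tf.__version__)
    # TF >= 2.3 exposes its build configuration; expect a cuda_version of "11.0"
    # on the GPU wheel (the key is absent on CPU-only builds).
    print(tf.sysconfig.get_build_info().get("cuda_version"))
    # Should list at least one GPU on the GPU nightly-test instances.
    print(tf.config.list_physical_devices("GPU"))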
@@ -52,6 +52,51 @@ apex-breakoutnoframeskip-v4:
        target_network_update_freq: 50000
        timesteps_per_iteration: 25000

ddpg-hopperbulletenv-v0:
    env: HopperBulletEnv-v0
    run: DDPG
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 120.0
        timesteps_total: 50000
    stop:
        time_total_s: 3600
    config:
        actor_hiddens: [256, 256]
        critic_hiddens: [256, 256]
        n_step: 3
        model: {}
        gamma: 0.99
        env_config: {}
        exploration_config:
            initial_scale: 1.0
            final_scale: 0.02
            scale_timesteps: 10000
            ou_base_scale: 0.1
            ou_theta: 0.15
            ou_sigma: 0.2
        timesteps_per_iteration: 1000
        target_network_update_freq: 0
        tau: 0.001
        buffer_size: 10000
        prioritized_replay: True
        prioritized_replay_alpha: 0.6
        prioritized_replay_beta: 0.4
        prioritized_replay_eps: 0.000001
        clip_rewards: false
        actor_lr: 0.001
        critic_lr: 0.001
        use_huber: true
        huber_threshold: 1.0
        l2_reg: 0.000001
        learning_starts: 500
        rollout_fragment_length: 1
        train_batch_size: 48
        num_gpus: 1
        num_workers: 0
        num_gpus_per_worker: 0
        worker_side_prioritization: false

dqn-breakoutnoframeskip-v4:
    env: BreakoutNoFrameskip-v4
    run: DQN
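The pass_criteria block above encodes the nightly-test contract spelled out in the comment: within the allotted stop.time_total_s, the run must both reach the minimum episode_reward_mean and collect the minimum number of environment timesteps. A minimal illustrative check of that semantics (the helper name and exact aggregation are assumptions, not the release-test harness itself):

    def passes(pass_criteria, trial_results):
        # Best smoothed reward over all trials must reach the threshold ...
        best_reward = max(r["episode_reward_mean"] for r in trial_results)
        # ... and enough env timesteps must have been sampled overall, which is
        # effectively a minimum-throughput requirement given the fixed
        # time_total_s stopping condition.
        total_ts = sum(r["timesteps_total"] for r in trial_results)
        return (best_reward >= pass_criteria["episode_reward_mean"]
                and total_ts >= pass_criteria["timesteps_total"])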
@@ -173,51 +218,3 @@ sac-halfcheetahbulletenv-v0:
        normalize_actions: true
        evaluation_interval: 1
        metrics_smoothing_episodes: 5

# Expect roughly 1000 reward after 1h on 1GPU
# TODO: (sven) this seems to be somewhat broken on tf AND torch (?)
# try to find older version that still works.
ddpg-halfcheetahbulletenv-v0:
    env: HalfCheetahBulletEnv-v0
    run: DDPG
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: -100.0
        timesteps_total: 400000
    stop:
        time_total_s: 7200
    config:
        actor_hiddens: [64, 64]
        critic_hiddens: [64, 64]
        n_step: 1
        model: {}
        gamma: 0.99
        env_config: {}
        exploration_config:
            initial_scale: 1.0
            final_scale: 0.02
            scale_timesteps: 10000
            ou_base_scale: 0.1
            ou_theta: 0.15
            ou_sigma: 0.2
        timesteps_per_iteration: 1000
        target_network_update_freq: 0
        tau: 0.001
        buffer_size: 10000
        prioritized_replay: True
        prioritized_replay_alpha: 0.6
        prioritized_replay_beta: 0.4
        prioritized_replay_eps: 0.000001
        clip_rewards: False
        actor_lr: 0.001
        critic_lr: 0.001
        use_huber: False
        huber_threshold: 1.0
        l2_reg: 0.000001
        learning_starts: 500
        rollout_fragment_length: 1
        train_batch_size: 64
        num_workers: 0
        num_gpus: 1
        num_gpus_per_worker: 0
        worker_side_prioritization: False
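The exploration_config entries used throughout these DDPG tests (ou_theta, ou_sigma, ou_base_scale, plus the initial_scale/final_scale/scale_timesteps annealing) parameterize Ornstein-Uhlenbeck action noise. A minimal sketch of the idea, assuming a zero-mean OU process and linear scale annealing; this is an illustration, not a copy of RLlib's OrnsteinUhlenbeckNoise class:

    import numpy as np

    def ou_step(x, theta=0.15, sigma=0.2):
        # One Euler step of a zero-mean Ornstein-Uhlenbeck process:
        # x_{t+1} = x_t + theta * (0 - x_t) + sigma * N(0, 1)
        return x + theta * (0.0 - x) + sigma * np.random.normal(size=x.shape)

    def noise_scale(t, initial_scale=1.0, final_scale=0.02, scale_timesteps=10000):
        # Linearly anneal the overall noise scale over scale_timesteps.
        frac = min(t / scale_timesteps, 1.0)
        return initial_scale + frac * (final_scale - initial_scale)

    # Noisy action at timestep t (sketch): the deterministic DDPG action plus
    # the OU state, scaled by ou_base_scale and the annealed schedule:
    # action_noisy = action + 0.1 * noise_scale(t) * x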
@@ -27,6 +27,7 @@ $ python debug_learning_failure_git_bisect.py -f [yaml file] --stop-reward=180
import argparse
import importlib
import json
import numpy as np
import os
import subprocess
import yaml
@@ -47,6 +48,11 @@ parser.add_argument(
    "--skip-install-ray",
    action="store_true",
    help="If set, do not attempt to re-build ray from source.")
parser.add_argument(
    "--num-samples",
    type=int,
    default=1,
    help="The number of samples to run for the given experiment.")
parser.add_argument(
    "--stop-iters",
    type=int,
@@ -122,8 +128,9 @@ if __name__ == "__main__":
    if args.framework:
        config["framework"] = args.framework

    # Define stopping criteria.
    stop = {}
    # Define stopping criteria. From the yaml file ..
    stop = experiment_config.get("stop", {})
    # .. but override with command line provided ones.
    if args.stop_iters:
        stop["training_iteration"] = args.stop_iters
    if args.stop_timesteps:
@@ -133,15 +140,24 @@ if __name__ == "__main__":
    if args.stop_time:
        stop["time_total_s"] = args.stop_time

    # Invalid pass criteria.
    if stop.get("episode_reward_mean") is None and \
            (stop.get("timesteps_total") is None or
             stop.get("time_total_s") is None):
        raise ValueError("Invalid pass criterium! Must use either "
                         "(--stop-reward + optionally any other) OR "
                         "(--stop-timesteps + --stop-time).")

    # - Stop ray.
    # - Uninstall and re-install ray (from source) if required.
    # - Start ray.
    # Do this twice to make sure all processes are stopped (older versions of
    # ray used to not kill everything the first time around).
    try:
        subprocess.run("ray stop".split(" "))
        subprocess.run("ray stop".split(" "))
    except Exception:
        pass

    # - Uninstall and re-install ray (from source) if required.
    # Install ray from the checked out repo.
    if not args.skip_install_ray:
        subprocess.run("sudo apt-get update".split(" "))
@@ -158,10 +174,15 @@ if __name__ == "__main__":
        subprocess.run("pip install -e . --verbose".split(" "))
        os.chdir("../")

    # - Start ray.
    try:
        subprocess.run("ray start --head".split(" "))
    except Exception:
        try:
            subprocess.run("ray stop".split(" "))
            subprocess.run("ray stop".split(" "))
        except Exception:
            pass
        try:
            subprocess.run("ray start --head".split(" "))
        except Exception as e:
@@ -175,31 +196,29 @@ if __name__ == "__main__":
    ray.init()

    results = tune.run(run, stop=stop, config=config)
    last_results = [t.last_result for t in results.trials]

    # Criterium is to have reached some min reward.
    if args.stop_reward:
        last_result = results.trials[0].last_result
        avg_reward = last_result["episode_reward_mean"]
        if avg_reward < args.stop_reward:
    # Criterion is to have reached some min reward within given
    # wall time, iters, or timesteps.
    if stop.get("episode_reward_mean") is not None:
        max_avg_reward = np.max(
            [r["episode_reward_mean"] for r in last_results])
        if max_avg_reward < stop["episode_reward_mean"]:
            raise ValueError("`stop-reward` of {} not reached!".format(
                args.stop_reward))
    # Criterium is to have run through n env timesteps in some wall time m.
    elif args.stop_timesteps and args.stop_time:
        last_result = results.trials[0].last_result
        total_timesteps = last_result["timesteps_total"]
        total_time = last_result["time_total_s"]
        desired_speed = args.stop_timesteps / args.stop_time
                stop["episode_reward_mean"]))
    # Criterion is to have run through n env timesteps in some wall time m
    # (minimum throughput).
    else:
        total_timesteps = np.sum([r["timesteps_total"] for r in last_results])
        total_time = np.sum([r["time_total_s"] for r in last_results])
        desired_speed = stop["timesteps_total"] / stop["time_total_s"]
        actual_speed = total_timesteps / total_time
        # We stopped because we reached the time limit ->
        # Means throughput is too slow (time steps not reached).
        if actual_speed < desired_speed:
            raise ValueError(
                "`stop-timesteps` of {} not reached in {}sec!".format(
                    args.stop_timesteps, args.stop_time))
    else:
        raise ValueError("Invalid pass criterium! Must use either "
                         "(--stop-reward + optionally any other) OR "
                         "(--stop-timesteps + --stop-time).")
                    stop["timesteps_total"], stop["time_total_s"]))

    print("ok")
    ray.shutdown()
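The throughput criterion in the new code above is plain arithmetic: the run must sample environment timesteps at least as fast as the configured budget implies. Using the hopper nightly test's numbers from this same commit as an illustration (timesteps_total: 50000 within time_total_s: 3600); the "too slow" figure is an invented example:

    # Required sampling speed implied by the pass criteria (sketch):
    desired_speed = 50000 / 3600   # ~13.9 env timesteps per second
    # Example of a too-slow run that used the whole time budget:
    actual_speed = 30000 / 3600    # ~8.3 timesteps per second
    assert actual_speed < desired_speed  # -> such a run would be marked failed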
@@ -40,7 +40,7 @@ halfcheetah-ddpg:
        # === Optimization ===
        actor_lr: 0.001
        critic_lr: 0.001
        use_huber: False
        use_huber: false
        huber_threshold: 1.0
        l2_reg: 0.000001
        learning_starts: 500
@@ -50,7 +50,7 @@ halfcheetah-ddpg:
        # === Parallelism ===
        num_workers: 0
        num_gpus_per_worker: 0
        worker_side_prioritization: False
        worker_side_prioritization: false

        # === Evaluation ===
        evaluation_interval: 5
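These two hunks only change the casing of the boolean values. PyYAML follows YAML 1.1 and resolves both spellings to the same Python bool, so the semantics are unchanged and the edit is a consistency fix, for example:

    import yaml

    # Both spellings load to the identical Python value.
    assert yaml.safe_load("use_huber: False") == {"use_huber": False}
    assert yaml.safe_load("use_huber: false") == {"use_huber": False}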
rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml (new file, 42 lines)
@@ -0,0 +1,42 @@
# Note: HalfCheetahBulletEnv-v0 is not the same as MuJoCo's HalfCheetah-v0.
ddpg-halfcheetahbulletenv-v0:
    env: HalfCheetahBulletEnv-v0
    run: DDPG
    stop:
        episode_reward_mean: -300.0
        timesteps_total: 200000
    config:
        actor_hiddens: [256, 256]
        critic_hiddens: [256, 256]
        n_step: 3
        model: {}
        gamma: 0.99
        env_config: {}
        exploration_config:
            initial_scale: 1.0
            final_scale: 0.02
            scale_timesteps: 10000
            ou_base_scale: 0.1
            ou_theta: 0.15
            ou_sigma: 0.2
        timesteps_per_iteration: 1000
        target_network_update_freq: 0
        tau: 0.001
        buffer_size: 15000
        prioritized_replay: true
        prioritized_replay_alpha: 0.6
        prioritized_replay_beta: 0.4
        prioritized_replay_eps: 0.000001
        clip_rewards: false
        actor_lr: 0.001
        critic_lr: 0.001
        use_huber: true
        huber_threshold: 1.0
        l2_reg: 0.000001
        learning_starts: 500
        rollout_fragment_length: 1
        train_batch_size: 48
        num_workers: 0
        num_gpus: 1
        num_gpus_per_worker: 0
        worker_side_prioritization: false
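As the note at the top of this new file points out, HalfCheetahBulletEnv-v0 comes from pybullet rather than MuJoCo. A minimal sketch of how that environment id becomes available to gym, assuming the pybullet and gym versions contemporary with this commit:

    import gym
    import pybullet_envs  # noqa: F401  # importing this module registers the *BulletEnv-v0 ids

    # The Bullet variant targeted by the tuned example above:
    env = gym.make("HalfCheetahBulletEnv-v0")
    print(env.observation_space, env.action_space)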
rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml (new file, 44 lines)
@@ -0,0 +1,44 @@
# Note: HopperBulletEnv-v0 is not the same as MuJoCo's Hopper-v0.
ddpg-hopperbulletenv-v0:
    env: HopperBulletEnv-v0
    run: DDPG
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 120.0
        timesteps_total: 50000
    stop:
        time_total_s: 2000
    config:
        actor_hiddens: [256, 256]
        critic_hiddens: [256, 256]
        n_step: 3
        model: {}
        gamma: 0.99
        env_config: {}
        exploration_config:
            initial_scale: 1.0
            final_scale: 0.02
            scale_timesteps: 10000
            ou_base_scale: 0.1
            ou_theta: 0.15
            ou_sigma: 0.2
        timesteps_per_iteration: 1000
        target_network_update_freq: 0
        tau: 0.001
        buffer_size: 10000
        prioritized_replay: True
        prioritized_replay_alpha: 0.6
        prioritized_replay_beta: 0.4
        prioritized_replay_eps: 0.000001
        clip_rewards: False
        actor_lr: 0.001
        critic_lr: 0.001
        use_huber: False
        huber_threshold: 1.0
        l2_reg: 0.000001
        learning_starts: 500
        rollout_fragment_length: 1
        train_batch_size: 48
        num_workers: 0
        num_gpus_per_worker: 0
        worker_side_prioritization: False
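Either of the two new tuned-example files can also be launched programmatically. A minimal sketch follows; the file path is relative to the repo root, and the manual folding of "env" into the config dict is illustrative of roughly what the `rllib train -f <yaml>` command does:

    import yaml
    import ray
    from ray import tune

    with open("rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml") as f:
        experiments = yaml.safe_load(f)

    ray.init()
    for name, spec in experiments.items():
        # RLlib trainers read the env id from the config dict.
        config = dict(spec.get("config", {}), env=spec["env"])
        # Note: the pass_criteria key above is only consumed by the release
        # test harness and is ignored here.
        tune.run(spec["run"], name=name, stop=spec.get("stop", {}), config=config)
    ray.shutdown()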