From 8acb469b047cd9b327c9477a13b030eb7357860e Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Thu, 26 Aug 2021 14:09:20 +0200 Subject: [PATCH] [RLlib; Testing] Green all RLlib nightly tests. (#18073) --- release/rllib_tests/app_config.yaml | 2 + .../learning_tests/hard_learning_tests.yaml | 93 +++++++++---------- .../debug_learning_failure_git_bisect.py | 63 ++++++++----- .../tuned_examples/ddpg/halfcheetah-ddpg.yaml | 4 +- .../ddpg/halfcheetah-pybullet-ddpg.yaml | 42 +++++++++ .../ddpg/hopper-pybullet-ddpg.yaml | 44 +++++++++ 6 files changed, 176 insertions(+), 72 deletions(-) create mode 100644 rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml create mode 100644 rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml diff --git a/release/rllib_tests/app_config.yaml b/release/rllib_tests/app_config.yaml index aa574172a..93ae92d01 100755 --- a/release/rllib_tests/app_config.yaml +++ b/release/rllib_tests/app_config.yaml @@ -13,6 +13,8 @@ python: - gym[atari] - atari_py - pybullet + # Pin this to 2.4.3 so it'll work with CUDA=11.0. + - tensorflow==2.4.3 conda_packages: [] post_build_cmds: diff --git a/release/rllib_tests/learning_tests/hard_learning_tests.yaml b/release/rllib_tests/learning_tests/hard_learning_tests.yaml index 7efc08225..95c31ac1f 100644 --- a/release/rllib_tests/learning_tests/hard_learning_tests.yaml +++ b/release/rllib_tests/learning_tests/hard_learning_tests.yaml @@ -52,6 +52,51 @@ apex-breakoutnoframeskip-v4: target_network_update_freq: 50000 timesteps_per_iteration: 25000 +ddpg-hopperbulletenv-v0: + env: HopperBulletEnv-v0 + run: DDPG + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 120.0 + timesteps_total: 50000 + stop: + time_total_s: 3600 + config: + actor_hiddens: [256, 256] + critic_hiddens: [256, 256] + n_step: 3 + model: {} + gamma: 0.99 + env_config: {} + exploration_config: + initial_scale: 1.0 + final_scale: 0.02 + scale_timesteps: 10000 + ou_base_scale: 0.1 + ou_theta: 0.15 + ou_sigma: 0.2 + timesteps_per_iteration: 1000 + target_network_update_freq: 0 + tau: 0.001 + buffer_size: 10000 + prioritized_replay: True + prioritized_replay_alpha: 0.6 + prioritized_replay_beta: 0.4 + prioritized_replay_eps: 0.000001 + clip_rewards: false + actor_lr: 0.001 + critic_lr: 0.001 + use_huber: true + huber_threshold: 1.0 + l2_reg: 0.000001 + learning_starts: 500 + rollout_fragment_length: 1 + train_batch_size: 48 + num_gpus: 1 + num_workers: 0 + num_gpus_per_worker: 0 + worker_side_prioritization: false + dqn-breakoutnoframeskip-v4: env: BreakoutNoFrameskip-v4 run: DQN @@ -173,51 +218,3 @@ sac-halfcheetahbulletenv-v0: normalize_actions: true evaluation_interval: 1 metrics_smoothing_episodes: 5 - -# Expect roughly 1000 reward after 1h on 1GPU -# TODO: (sven) this seems to be somewhat broken on tf AND torch (?) -# try to find older version that still works. -ddpg-halfcheetahbulletenv-v0: - env: HalfCheetahBulletEnv-v0 - run: DDPG - # Minimum reward and total ts (in given time_total_s) to pass this test. - pass_criteria: - episode_reward_mean: -100.0 - timesteps_total: 400000 - stop: - time_total_s: 7200 - config: - actor_hiddens: [64, 64] - critic_hiddens: [64, 64] - n_step: 1 - model: {} - gamma: 0.99 - env_config: {} - exploration_config: - initial_scale: 1.0 - final_scale: 0.02 - scale_timesteps: 10000 - ou_base_scale: 0.1 - ou_theta: 0.15 - ou_sigma: 0.2 - timesteps_per_iteration: 1000 - target_network_update_freq: 0 - tau: 0.001 - buffer_size: 10000 - prioritized_replay: True - prioritized_replay_alpha: 0.6 - prioritized_replay_beta: 0.4 - prioritized_replay_eps: 0.000001 - clip_rewards: False - actor_lr: 0.001 - critic_lr: 0.001 - use_huber: False - huber_threshold: 1.0 - l2_reg: 0.000001 - learning_starts: 500 - rollout_fragment_length: 1 - train_batch_size: 64 - num_workers: 0 - num_gpus: 1 - num_gpus_per_worker: 0 - worker_side_prioritization: False diff --git a/rllib/tests/git_bisect/debug_learning_failure_git_bisect.py b/rllib/tests/git_bisect/debug_learning_failure_git_bisect.py index 13ebdded5..238063c93 100644 --- a/rllib/tests/git_bisect/debug_learning_failure_git_bisect.py +++ b/rllib/tests/git_bisect/debug_learning_failure_git_bisect.py @@ -27,6 +27,7 @@ $ python debug_learning_failure_git_bisect.py -f [yaml file] --stop-reward=180 import argparse import importlib import json +import numpy as np import os import subprocess import yaml @@ -47,6 +48,11 @@ parser.add_argument( "--skip-install-ray", action="store_true", help="If set, do not attempt to re-build ray from source.") +parser.add_argument( + "--num-samples", + type=int, + default=1, + help="The number of samples to run for the given experiment.") parser.add_argument( "--stop-iters", type=int, @@ -122,8 +128,9 @@ if __name__ == "__main__": if args.framework: config["framework"] = args.framework - # Define stopping criteria. - stop = {} + # Define stopping criteria. From the yaml file .. + stop = experiment_config.get("stop", {}) + # .. but override with command line provided ones. if args.stop_iters: stop["training_iteration"] = args.stop_iters if args.stop_timesteps: @@ -133,15 +140,24 @@ if __name__ == "__main__": if args.stop_time: stop["time_total_s"] = args.stop_time + # Invalid pass criteria. + if stop.get("episode_reward_mean") is None and \ + (stop.get("timesteps_total") is None or + stop.get("time_total_s") is None): + raise ValueError("Invalid pass criterium! Must use either " + "(--stop-reward + optionally any other) OR " + "(--stop-timesteps + --stop-time).") + # - Stop ray. - # - Uninstall and re-install ray (from source) if required. - # - Start ray. + # Do this twice to make sure all processes are stopped (older versions of + # ray used to not kill everything the first time around). try: subprocess.run("ray stop".split(" ")) subprocess.run("ray stop".split(" ")) except Exception: pass + # - Uninstall and re-install ray (from source) if required. # Install ray from the checked out repo. if not args.skip_install_ray: subprocess.run("sudo apt-get update".split(" ")) @@ -158,10 +174,15 @@ if __name__ == "__main__": subprocess.run("pip install -e . --verbose".split(" ")) os.chdir("../") + # - Start ray. try: subprocess.run("ray start --head".split(" ")) except Exception: - subprocess.run("ray stop".split(" ")) + try: + subprocess.run("ray stop".split(" ")) + subprocess.run("ray stop".split(" ")) + except Exception: + pass try: subprocess.run("ray start --head".split(" ")) except Exception as e: @@ -175,31 +196,29 @@ if __name__ == "__main__": ray.init() results = tune.run(run, stop=stop, config=config) + last_results = [t.last_result for t in results.trials] - # Criterium is to have reached some min reward. - if args.stop_reward: - last_result = results.trials[0].last_result - avg_reward = last_result["episode_reward_mean"] - if avg_reward < args.stop_reward: + # Criterion is to have reached some min reward within given + # wall time, iters, or timesteps. + if stop.get("episode_reward_mean") is not None: + max_avg_reward = np.max( + [r["episode_reward_mean"] for r in last_results]) + if max_avg_reward < stop["episode_reward_mean"]: raise ValueError("`stop-reward` of {} not reached!".format( - args.stop_reward)) - # Criterium is to have run through n env timesteps in some wall time m. - elif args.stop_timesteps and args.stop_time: - last_result = results.trials[0].last_result - total_timesteps = last_result["timesteps_total"] - total_time = last_result["time_total_s"] - desired_speed = args.stop_timesteps / args.stop_time + stop["episode_reward_mean"])) + # Criterion is to have run through n env timesteps in some wall time m + # (minimum throughput). + else: + total_timesteps = np.sum([r["timesteps_total"] for r in last_results]) + total_time = np.sum([r["time_total_s"] for r in last_results]) + desired_speed = stop["timesteps_total"] / stop["time_total_s"] actual_speed = total_timesteps / total_time # We stopped because we reached the time limit -> # Means throughput is too slow (time steps not reached). if actual_speed < desired_speed: raise ValueError( "`stop-timesteps` of {} not reached in {}sec!".format( - args.stop_timesteps, args.stop_time)) - else: - raise ValueError("Invalid pass criterium! Must use either " - "(--stop-reward + optionally any other) OR " - "(--stop-timesteps + --stop-time).") + stop["timesteps_total"], stop["time_total_s"])) print("ok") ray.shutdown() diff --git a/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml b/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml index 047a92da1..c9a5a607e 100644 --- a/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml +++ b/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml @@ -40,7 +40,7 @@ halfcheetah-ddpg: # === Optimization === actor_lr: 0.001 critic_lr: 0.001 - use_huber: False + use_huber: false huber_threshold: 1.0 l2_reg: 0.000001 learning_starts: 500 @@ -50,7 +50,7 @@ halfcheetah-ddpg: # === Parallelism === num_workers: 0 num_gpus_per_worker: 0 - worker_side_prioritization: False + worker_side_prioritization: false # === Evaluation === evaluation_interval: 5 diff --git a/rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml b/rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml new file mode 100644 index 000000000..3d60c4e96 --- /dev/null +++ b/rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml @@ -0,0 +1,42 @@ +# Note: HalfCheetahBulletEnv-v0 is not the same as MuJoCo's HalfCheetah-v0. +ddpg-halfcheetahbulletenv-v0: + env: HalfCheetahBulletEnv-v0 + run: DDPG + stop: + episode_reward_mean: -300.0 + timesteps_total: 200000 + config: + actor_hiddens: [256, 256] + critic_hiddens: [256, 256] + n_step: 3 + model: {} + gamma: 0.99 + env_config: {} + exploration_config: + initial_scale: 1.0 + final_scale: 0.02 + scale_timesteps: 10000 + ou_base_scale: 0.1 + ou_theta: 0.15 + ou_sigma: 0.2 + timesteps_per_iteration: 1000 + target_network_update_freq: 0 + tau: 0.001 + buffer_size: 15000 + prioritized_replay: true + prioritized_replay_alpha: 0.6 + prioritized_replay_beta: 0.4 + prioritized_replay_eps: 0.000001 + clip_rewards: false + actor_lr: 0.001 + critic_lr: 0.001 + use_huber: true + huber_threshold: 1.0 + l2_reg: 0.000001 + learning_starts: 500 + rollout_fragment_length: 1 + train_batch_size: 48 + num_workers: 0 + num_gpus: 1 + num_gpus_per_worker: 0 + worker_side_prioritization: false diff --git a/rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml b/rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml new file mode 100644 index 000000000..73200df7e --- /dev/null +++ b/rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml @@ -0,0 +1,44 @@ +# Note: HopperBulletEnv-v0 is not the same as MuJoCo's Hopper-v0. +ddpg-hopperbulletenv-v0: + env: HopperBulletEnv-v0 + run: DDPG + # Minimum reward and total ts (in given time_total_s) to pass this test. + pass_criteria: + episode_reward_mean: 120.0 + timesteps_total: 50000 + stop: + time_total_s: 2000 + config: + actor_hiddens: [256, 256] + critic_hiddens: [256, 256] + n_step: 3 + model: {} + gamma: 0.99 + env_config: {} + exploration_config: + initial_scale: 1.0 + final_scale: 0.02 + scale_timesteps: 10000 + ou_base_scale: 0.1 + ou_theta: 0.15 + ou_sigma: 0.2 + timesteps_per_iteration: 1000 + target_network_update_freq: 0 + tau: 0.001 + buffer_size: 10000 + prioritized_replay: True + prioritized_replay_alpha: 0.6 + prioritized_replay_beta: 0.4 + prioritized_replay_eps: 0.000001 + clip_rewards: False + actor_lr: 0.001 + critic_lr: 0.001 + use_huber: False + huber_threshold: 1.0 + l2_reg: 0.000001 + learning_starts: 500 + rollout_fragment_length: 1 + train_batch_size: 48 + num_workers: 0 + num_gpus_per_worker: 0 + worker_side_prioritization: False