[RLlib; Testing] Green all RLlib nightly tests. (#18073)
parent 089dd9b949
commit 8acb469b04
6 changed files with 176 additions and 72 deletions
@@ -13,6 +13,8 @@ python:
    - gym[atari]
    - atari_py
    - pybullet
    # Pin this to 2.4.3 so it'll work with CUDA=11.0.
    - tensorflow==2.4.3
  conda_packages: []

post_build_cmds:
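For reference, a quick sanity check that the TensorFlow pin above matches the cluster's CUDA toolkit. This is a sketch under the assumption that the pinned 2.4.3 wheel is a CUDA 11.0 build; it is not part of the commit:

    import tensorflow as tf

    # Expect 2.4.3 per the pin above.
    print(tf.__version__)
    # TF >= 2.3 exposes its build configuration; expect a cuda_version of "11.0"
    # on the GPU wheel (the key is absent on CPU-only builds).
    print(tf.sysconfig.get_build_info().get("cuda_version"))
    # Should list at least one GPU on the GPU nightly-test instances.
    print(tf.config.list_physical_devices("GPU"))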
@@ -52,6 +52,51 @@ apex-breakoutnoframeskip-v4:
        target_network_update_freq: 50000
        timesteps_per_iteration: 25000

ddpg-hopperbulletenv-v0:
    env: HopperBulletEnv-v0
    run: DDPG
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 120.0
        timesteps_total: 50000
    stop:
        time_total_s: 3600
    config:
        actor_hiddens: [256, 256]
        critic_hiddens: [256, 256]
        n_step: 3
        model: {}
        gamma: 0.99
        env_config: {}
        exploration_config:
            initial_scale: 1.0
            final_scale: 0.02
            scale_timesteps: 10000
            ou_base_scale: 0.1
            ou_theta: 0.15
            ou_sigma: 0.2
        timesteps_per_iteration: 1000
        target_network_update_freq: 0
        tau: 0.001
        buffer_size: 10000
        prioritized_replay: True
        prioritized_replay_alpha: 0.6
        prioritized_replay_beta: 0.4
        prioritized_replay_eps: 0.000001
        clip_rewards: false
        actor_lr: 0.001
        critic_lr: 0.001
        use_huber: true
        huber_threshold: 1.0
        l2_reg: 0.000001
        learning_starts: 500
        rollout_fragment_length: 1
        train_batch_size: 48
        num_gpus: 1
        num_workers: 0
        num_gpus_per_worker: 0
        worker_side_prioritization: false

dqn-breakoutnoframeskip-v4:
    env: BreakoutNoFrameskip-v4
    run: DQN
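The pass_criteria block above encodes the nightly-test contract spelled out in the comment: within the allotted stop.time_total_s, the run must both reach the minimum episode_reward_mean and collect the minimum number of environment timesteps. A minimal illustrative check of that semantics (the helper name and exact aggregation are assumptions, not the release-test harness itself):

    def passes(pass_criteria, trial_results):
        # Best smoothed reward over all trials must reach the threshold ...
        best_reward = max(r["episode_reward_mean"] for r in trial_results)
        # ... and enough env timesteps must have been sampled overall, which is
        # effectively a minimum-throughput requirement given the fixed
        # time_total_s stopping condition.
        total_ts = sum(r["timesteps_total"] for r in trial_results)
        return (best_reward >= pass_criteria["episode_reward_mean"]
                and total_ts >= pass_criteria["timesteps_total"])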
@@ -173,51 +218,3 @@ sac-halfcheetahbulletenv-v0:
        normalize_actions: true
        evaluation_interval: 1
        metrics_smoothing_episodes: 5

# Expect roughly 1000 reward after 1h on 1GPU
# TODO: (sven) this seems to be somewhat broken on tf AND torch (?)
# try to find older version that still works.
ddpg-halfcheetahbulletenv-v0:
    env: HalfCheetahBulletEnv-v0
    run: DDPG
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: -100.0
        timesteps_total: 400000
    stop:
        time_total_s: 7200
    config:
        actor_hiddens: [64, 64]
        critic_hiddens: [64, 64]
        n_step: 1
        model: {}
        gamma: 0.99
        env_config: {}
        exploration_config:
            initial_scale: 1.0
            final_scale: 0.02
            scale_timesteps: 10000
            ou_base_scale: 0.1
            ou_theta: 0.15
            ou_sigma: 0.2
        timesteps_per_iteration: 1000
        target_network_update_freq: 0
        tau: 0.001
        buffer_size: 10000
        prioritized_replay: True
        prioritized_replay_alpha: 0.6
        prioritized_replay_beta: 0.4
        prioritized_replay_eps: 0.000001
        clip_rewards: False
        actor_lr: 0.001
        critic_lr: 0.001
        use_huber: False
        huber_threshold: 1.0
        l2_reg: 0.000001
        learning_starts: 500
        rollout_fragment_length: 1
        train_batch_size: 64
        num_workers: 0
        num_gpus: 1
        num_gpus_per_worker: 0
        worker_side_prioritization: False
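The exploration_config entries used throughout these DDPG tests (ou_theta, ou_sigma, ou_base_scale, plus the initial_scale/final_scale/scale_timesteps annealing) parameterize Ornstein-Uhlenbeck action noise. A minimal sketch of the idea, assuming a zero-mean OU process and linear scale annealing; this is an illustration, not a copy of RLlib's OrnsteinUhlenbeckNoise class:

    import numpy as np

    def ou_step(x, theta=0.15, sigma=0.2):
        # One Euler step of a zero-mean Ornstein-Uhlenbeck process:
        # x_{t+1} = x_t + theta * (0 - x_t) + sigma * N(0, 1)
        return x + theta * (0.0 - x) + sigma * np.random.normal(size=x.shape)

    def noise_scale(t, initial_scale=1.0, final_scale=0.02, scale_timesteps=10000):
        # Linearly anneal the overall noise scale over scale_timesteps.
        frac = min(t / scale_timesteps, 1.0)
        return initial_scale + frac * (final_scale - initial_scale)

    # Noisy action at timestep t (sketch): the deterministic DDPG action plus
    # the OU state, scaled by ou_base_scale and the annealed schedule:
    # action_noisy = action + 0.1 * noise_scale(t) * x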
@@ -27,6 +27,7 @@ $ python debug_learning_failure_git_bisect.py -f [yaml file] --stop-reward=180
import argparse
import importlib
import json
import numpy as np
import os
import subprocess
import yaml
@@ -47,6 +48,11 @@ parser.add_argument(
    "--skip-install-ray",
    action="store_true",
    help="If set, do not attempt to re-build ray from source.")
parser.add_argument(
    "--num-samples",
    type=int,
    default=1,
    help="The number of samples to run for the given experiment.")
parser.add_argument(
    "--stop-iters",
    type=int,
@@ -122,8 +128,9 @@ if __name__ == "__main__":
    if args.framework:
        config["framework"] = args.framework

    # Define stopping criteria.
    stop = {}
    # Define stopping criteria. From the yaml file ..
    stop = experiment_config.get("stop", {})
    # .. but override with command line provided ones.
    if args.stop_iters:
        stop["training_iteration"] = args.stop_iters
    if args.stop_timesteps:
@@ -133,15 +140,24 @@ if __name__ == "__main__":
    if args.stop_time:
        stop["time_total_s"] = args.stop_time

    # Invalid pass criteria.
    if stop.get("episode_reward_mean") is None and \
            (stop.get("timesteps_total") is None or
             stop.get("time_total_s") is None):
        raise ValueError("Invalid pass criterium! Must use either "
                         "(--stop-reward + optionally any other) OR "
                         "(--stop-timesteps + --stop-time).")

    # - Stop ray.
    # - Uninstall and re-install ray (from source) if required.
    # - Start ray.
    # Do this twice to make sure all processes are stopped (older versions of
    # ray used to not kill everything the first time around).
    try:
        subprocess.run("ray stop".split(" "))
        subprocess.run("ray stop".split(" "))
    except Exception:
        pass

    # - Uninstall and re-install ray (from source) if required.
    # Install ray from the checked out repo.
    if not args.skip_install_ray:
        subprocess.run("sudo apt-get update".split(" "))
@@ -158,10 +174,15 @@ if __name__ == "__main__":
        subprocess.run("pip install -e . --verbose".split(" "))
        os.chdir("../")

    # - Start ray.
    try:
        subprocess.run("ray start --head".split(" "))
    except Exception:
        try:
            subprocess.run("ray stop".split(" "))
            subprocess.run("ray stop".split(" "))
        except Exception:
            pass
        try:
            subprocess.run("ray start --head".split(" "))
        except Exception as e:
@@ -175,31 +196,29 @@ if __name__ == "__main__":
    ray.init()

    results = tune.run(run, stop=stop, config=config)
    last_results = [t.last_result for t in results.trials]

    # Criterium is to have reached some min reward.
    if args.stop_reward:
        last_result = results.trials[0].last_result
        avg_reward = last_result["episode_reward_mean"]
        if avg_reward < args.stop_reward:
    # Criterion is to have reached some min reward within given
    # wall time, iters, or timesteps.
    if stop.get("episode_reward_mean") is not None:
        max_avg_reward = np.max(
            [r["episode_reward_mean"] for r in last_results])
        if max_avg_reward < stop["episode_reward_mean"]:
            raise ValueError("`stop-reward` of {} not reached!".format(
                args.stop_reward))
    # Criterium is to have run through n env timesteps in some wall time m.
    elif args.stop_timesteps and args.stop_time:
        last_result = results.trials[0].last_result
        total_timesteps = last_result["timesteps_total"]
        total_time = last_result["time_total_s"]
        desired_speed = args.stop_timesteps / args.stop_time
                stop["episode_reward_mean"]))
    # Criterion is to have run through n env timesteps in some wall time m
    # (minimum throughput).
    else:
        total_timesteps = np.sum([r["timesteps_total"] for r in last_results])
        total_time = np.sum([r["time_total_s"] for r in last_results])
        desired_speed = stop["timesteps_total"] / stop["time_total_s"]
        actual_speed = total_timesteps / total_time
        # We stopped because we reached the time limit ->
        # Means throughput is too slow (time steps not reached).
        if actual_speed < desired_speed:
            raise ValueError(
                "`stop-timesteps` of {} not reached in {}sec!".format(
                    args.stop_timesteps, args.stop_time))
    else:
        raise ValueError("Invalid pass criterium! Must use either "
                         "(--stop-reward + optionally any other) OR "
                         "(--stop-timesteps + --stop-time).")
                    stop["timesteps_total"], stop["time_total_s"]))

    print("ok")
    ray.shutdown()
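The throughput criterion in the new code above is plain arithmetic: the run must sample environment timesteps at least as fast as the configured budget implies. Using the hopper nightly test's numbers from this same commit as an illustration (timesteps_total: 50000 within time_total_s: 3600); the "too slow" figure is an invented example:

    # Required sampling speed implied by the pass criteria (sketch):
    desired_speed = 50000 / 3600   # ~13.9 env timesteps per second
    # Example of a too-slow run that used the whole time budget:
    actual_speed = 30000 / 3600    # ~8.3 timesteps per second
    assert actual_speed < desired_speed  # -> such a run would be marked failed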
@@ -40,7 +40,7 @@ halfcheetah-ddpg:
        # === Optimization ===
        actor_lr: 0.001
        critic_lr: 0.001
        use_huber: False
        use_huber: false
        huber_threshold: 1.0
        l2_reg: 0.000001
        learning_starts: 500
@@ -50,7 +50,7 @@ halfcheetah-ddpg:
        # === Parallelism ===
        num_workers: 0
        num_gpus_per_worker: 0
        worker_side_prioritization: False
        worker_side_prioritization: false

        # === Evaluation ===
        evaluation_interval: 5
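These two hunks only change the casing of the boolean values. PyYAML follows YAML 1.1 and resolves both spellings to the same Python bool, so the semantics are unchanged and the edit is a consistency fix, for example:

    import yaml

    # Both spellings load to the identical Python value.
    assert yaml.safe_load("use_huber: False") == {"use_huber": False}
    assert yaml.safe_load("use_huber: false") == {"use_huber": False}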
rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml (new file, 42 lines)
@@ -0,0 +1,42 @@
# Note: HalfCheetahBulletEnv-v0 is not the same as MuJoCo's HalfCheetah-v0.
ddpg-halfcheetahbulletenv-v0:
    env: HalfCheetahBulletEnv-v0
    run: DDPG
    stop:
        episode_reward_mean: -300.0
        timesteps_total: 200000
    config:
        actor_hiddens: [256, 256]
        critic_hiddens: [256, 256]
        n_step: 3
        model: {}
        gamma: 0.99
        env_config: {}
        exploration_config:
            initial_scale: 1.0
            final_scale: 0.02
            scale_timesteps: 10000
            ou_base_scale: 0.1
            ou_theta: 0.15
            ou_sigma: 0.2
        timesteps_per_iteration: 1000
        target_network_update_freq: 0
        tau: 0.001
        buffer_size: 15000
        prioritized_replay: true
        prioritized_replay_alpha: 0.6
        prioritized_replay_beta: 0.4
        prioritized_replay_eps: 0.000001
        clip_rewards: false
        actor_lr: 0.001
        critic_lr: 0.001
        use_huber: true
        huber_threshold: 1.0
        l2_reg: 0.000001
        learning_starts: 500
        rollout_fragment_length: 1
        train_batch_size: 48
        num_workers: 0
        num_gpus: 1
        num_gpus_per_worker: 0
        worker_side_prioritization: false
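As the note at the top of this new file points out, HalfCheetahBulletEnv-v0 comes from pybullet rather than MuJoCo. A minimal sketch of how that environment id becomes available to gym, assuming the pybullet and gym versions contemporary with this commit:

    import gym
    import pybullet_envs  # noqa: F401  # importing this module registers the *BulletEnv-v0 ids

    # The Bullet variant targeted by the tuned example above:
    env = gym.make("HalfCheetahBulletEnv-v0")
    print(env.observation_space, env.action_space)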
rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml (new file, 44 lines)
@@ -0,0 +1,44 @@
# Note: HopperBulletEnv-v0 is not the same as MuJoCo's Hopper-v0.
ddpg-hopperbulletenv-v0:
    env: HopperBulletEnv-v0
    run: DDPG
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 120.0
        timesteps_total: 50000
    stop:
        time_total_s: 2000
    config:
        actor_hiddens: [256, 256]
        critic_hiddens: [256, 256]
        n_step: 3
        model: {}
        gamma: 0.99
        env_config: {}
        exploration_config:
            initial_scale: 1.0
            final_scale: 0.02
            scale_timesteps: 10000
            ou_base_scale: 0.1
            ou_theta: 0.15
            ou_sigma: 0.2
        timesteps_per_iteration: 1000
        target_network_update_freq: 0
        tau: 0.001
        buffer_size: 10000
        prioritized_replay: True
        prioritized_replay_alpha: 0.6
        prioritized_replay_beta: 0.4
        prioritized_replay_eps: 0.000001
        clip_rewards: False
        actor_lr: 0.001
        critic_lr: 0.001
        use_huber: False
        huber_threshold: 1.0
        l2_reg: 0.000001
        learning_starts: 500
        rollout_fragment_length: 1
        train_batch_size: 48
        num_workers: 0
        num_gpus_per_worker: 0
        worker_side_prioritization: False
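Either of the two new tuned-example files can also be launched programmatically. A minimal sketch follows; the file path is relative to the repo root, and the manual folding of "env" into the config dict is illustrative of roughly what the `rllib train -f <yaml>` command does:

    import yaml
    import ray
    from ray import tune

    with open("rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml") as f:
        experiments = yaml.safe_load(f)

    ray.init()
    for name, spec in experiments.items():
        # RLlib trainers read the env id from the config dict.
        config = dict(spec.get("config", {}), env=spec["env"])
        # Note: the pass_criteria key above is only consumed by the release
        # test harness and is ignored here.
        tune.run(spec["run"], name=name, stop=spec.get("stop", {}), config=config)
    ray.shutdown()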