[RLlib; testing] Fix bug in stress tests not handling >1 trials per experiment (due to grid-search in IMPALA stress tests). (#18705)
parent 8d6ddcee53
commit e6aae61487
5 changed files with 167 additions and 114 deletions
@@ -19,41 +19,41 @@ a2c-breakoutnoframeskip-v4:
             [20000000, 0.000000000001],
         ]
 
-a3c-pongdeterministic-v4:
-    env: PongDeterministic-v4
-    run: A3C
-    # Minimum reward and total ts (in given time_total_s) to pass this test.
-    pass_criteria:
-        episode_reward_mean: 18.0
-        timesteps_total: 5000000
-    stop:
-        time_total_s: 3600
-    config:
-        ignore_worker_failures: true
-        num_gpus: 0
-        num_workers: 16
-        rollout_fragment_length: 20
-        vf_loss_coeff: 0.5
-        entropy_coeff: 0.01
-        gamma: 0.99
-        grad_clip: 40.0
-        lambda: 1.0
-        lr: 0.0001
-        observation_filter: NoFilter
-        preprocessor_pref: rllib
-        model:
-            use_lstm: true
-            conv_activation: elu
-            dim: 42
-            grayscale: true
-            zero_mean: false
-            # Reduced channel depth and kernel size from default.
-            conv_filters: [
-                [32, [3, 3], 2],
-                [32, [3, 3], 2],
-                [32, [3, 3], 2],
-                [32, [3, 3], 2],
-            ]
+# a3c-pongdeterministic-v4:
+#     env: PongDeterministic-v4
+#     run: A3C
+#     # Minimum reward and total ts (in given time_total_s) to pass this test.
+#     pass_criteria:
+#         episode_reward_mean: 18.0
+#         timesteps_total: 5000000
+#     stop:
+#         time_total_s: 3600
+#     config:
+#         ignore_worker_failures: true
+#         num_gpus: 0
+#         num_workers: 16
+#         rollout_fragment_length: 20
+#         vf_loss_coeff: 0.5
+#         entropy_coeff: 0.01
+#         gamma: 0.99
+#         grad_clip: 40.0
+#         lambda: 1.0
+#         lr: 0.0001
+#         observation_filter: NoFilter
+#         preprocessor_pref: rllib
+#         model:
+#             use_lstm: true
+#             conv_activation: elu
+#             dim: 42
+#             grayscale: true
+#             zero_mean: false
+#             # Reduced channel depth and kernel size from default.
+#             conv_filters: [
+#                 [32, [3, 3], 2],
+#                 [32, [3, 3], 2],
+#                 [32, [3, 3], 2],
+#                 [32, [3, 3], 2],
+#             ]
 
 apex-breakoutnoframeskip-v4:
     env: BreakoutNoFrameskip-v4

@@ -61,7 +61,7 @@ apex-breakoutnoframeskip-v4:
     # Minimum reward and total ts (in given time_total_s) to pass this test.
     pass_criteria:
         episode_reward_mean: 20.0
-        timesteps_total: 10000000
+        timesteps_total: 7000000
     stop:
         time_total_s: 7200
     config:

@@ -115,26 +115,27 @@ appo-pongnoframeskip-v4:
         model:
             dim: 42
 
-ars-hopperbulletenv-v0:
-    env: HopperBulletEnv-v0
-    run: ARS
-    # Minimum reward and total ts (in given time_total_s) to pass this test.
-    pass_criteria:
-        episode_reward_mean: 100.0
-        timesteps_total: 2000000
-    stop:
-        time_total_s: 2000
-    config:
-        noise_stdev: 0.01
-        num_rollouts: 1
-        rollouts_used: 1
-        num_workers: 1
-        sgd_stepsize: 0.02
-        noise_size: 250000000
-        eval_prob: 0.2
-        offset: 0
-        observation_filter: NoFilter
-        report_length: 3
+# ARS was never tested/tuned on Hopper. Maybe change to ReacherBulletEnv-v0?
+# ars-hopperbulletenv-v0:
+#     env: HopperBulletEnv-v0
+#     run: ARS
+#     # Minimum reward and total ts (in given time_total_s) to pass this test.
+#     pass_criteria:
+#         episode_reward_mean: 100.0
+#         timesteps_total: 2000000
+#     stop:
+#         time_total_s: 2000
+#     config:
+#         noise_stdev: 0.01
+#         num_rollouts: 1
+#         rollouts_used: 1
+#         num_workers: 1
+#         sgd_stepsize: 0.02
+#         noise_size: 250000000
+#         eval_prob: 0.2
+#         offset: 0
+#         observation_filter: NoFilter
+#         report_length: 3
 
 # TODO: (sven) Fix all BC-dependent learning tests for cont. actions.
 # These seem quite hard to learn from the SAC-recorded HalfCheetahBulletEnv.

@@ -218,7 +219,7 @@ ddpg-hopperbulletenv-v0:
     run: DDPG
     # Minimum reward and total ts (in given time_total_s) to pass this test.
     pass_criteria:
-        episode_reward_mean: 120.0
+        episode_reward_mean: 110.0
         timesteps_total: 50000
     stop:
         time_total_s: 3600

@@ -261,40 +262,40 @@ ddpg-hopperbulletenv-v0:
 # Basically the same as atari-ppo, but adapted for DDPPO. Note that DDPPO
 # isn't actually any more efficient on Atari, since the network size is
 # relatively small and the env doesn't require a GPU.
-ddppo-breakoutnoframeskip-v4:
-    env: BreakoutNoFrameskip-v4
-    run: DDPPO
-    # Minimum reward and total ts (in given time_total_s) to pass this test.
-    pass_criteria:
-        episode_reward_mean: 50.0
-        timesteps_total: 10000000
-    stop:
-        time_total_s: 3600
-    config:
-        # DDPPO only supports PyTorch so far.
-        framework: torch
-        # Worker config: 10 workers, each of which requires a GPU.
-        num_workers: 16
-        # Workers require GPUs, but share 1 GPU amongst 2 workers.
-        num_gpus_per_worker: 0.25
-        # Each worker will sample 100 * 5 envs per worker steps = 500 steps
-        # per optimization round. This is 5000 steps summed across workers.
-        rollout_fragment_length: 100
-        num_envs_per_worker: 5
-        # Each worker will take a minibatch of 50. There are 10 workers total,
-        # so the effective minibatch size will be 500.
-        sgd_minibatch_size: 50
-        num_sgd_iter: 30
-        # Params from standard PPO Atari config:
-        lambda: 0.95
-        kl_coeff: 0.5
-        clip_rewards: true
-        clip_param: 0.1
-        vf_loss_coeff: 0.1
-        vf_clip_param: 10.0
-        entropy_coeff: 0.01
-        batch_mode: truncate_episodes
-        observation_filter: NoFilter
+# ddppo-breakoutnoframeskip-v4:
+#     env: BreakoutNoFrameskip-v4
+#     run: DDPPO
+#     # Minimum reward and total ts (in given time_total_s) to pass this test.
+#     pass_criteria:
+#         episode_reward_mean: 50.0
+#         timesteps_total: 10000000
+#     stop:
+#         time_total_s: 3600
+#     config:
+#         # DDPPO only supports PyTorch so far.
+#         framework: torch
+#         # Worker config: 10 workers, each of which requires a GPU.
+#         num_workers: 16
+#         # Workers require GPUs, but share 1 GPU amongst 2 workers.
+#         num_gpus_per_worker: 0.25
+#         # Each worker will sample 100 * 5 envs per worker steps = 500 steps
+#         # per optimization round. This is 5000 steps summed across workers.
+#         rollout_fragment_length: 100
+#         num_envs_per_worker: 5
+#         # Each worker will take a minibatch of 50. There are 10 workers total,
+#         # so the effective minibatch size will be 500.
+#         sgd_minibatch_size: 50
+#         num_sgd_iter: 30
+#         # Params from standard PPO Atari config:
+#         lambda: 0.95
+#         kl_coeff: 0.5
+#         clip_rewards: true
+#         clip_param: 0.1
+#         vf_loss_coeff: 0.1
+#         vf_clip_param: 10.0
+#         entropy_coeff: 0.01
+#         batch_mode: truncate_episodes
+#         observation_filter: NoFilter
 
 dqn-breakoutnoframeskip-v4:
     env: BreakoutNoFrameskip-v4

@@ -302,7 +303,7 @@ dqn-breakoutnoframeskip-v4:
     # Minimum reward and total ts (in given time_total_s) to pass this test.
     pass_criteria:
         episode_reward_mean: 30.0
-        timesteps_total: 450000
+        timesteps_total: 400000
     stop:
         time_total_s: 7200
     config:

@@ -394,7 +395,7 @@ ppo-breakoutnoframeskip-v4:
     # Minimum reward and total ts (in given time_total_s) to pass this test.
     pass_criteria:
         episode_reward_mean: 50.0
-        timesteps_total: 10000000
+        timesteps_total: 7000000
     stop:
         time_total_s: 7200
     config:

@@ -11,11 +11,11 @@ atari-impala:
     run: IMPALA
     # Minimum reward and total ts (in given time_total_s) to pass this test.
     pass_criteria:
         episode_reward_mean: 40.0
         timesteps_total: 45000000
     stop:
         time_total_s: 3600
     config:
         framework: tf
         num_gpus: 1
         num_cpus_for_driver: 0
         rollout_fragment_length: 50

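The hunk above is the IMPALA stress test referenced in the commit title. The underlying issue is that a grid_search inside an experiment spec makes Ray Tune expand the single experiment into one trial per grid value, so the result checker can no longer assume exactly one trial per experiment. A minimal sketch of that expansion (the environment names here are hypothetical, not taken from this diff):

from ray import tune

# A single experiment spec whose config contains a grid_search ...
config = {
    "env": tune.grid_search(  # hypothetical values for illustration
        ["BreakoutNoFrameskip-v4", "QbertNoFrameskip-v4"]),
    "num_gpus": 1,
}
# ... is expanded by Tune into one trial per grid value, i.e. two trials for
# this one "atari-impala"-style experiment. The per-experiment aggregation
# added in the last file of this diff accounts for that.
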
@@ -351,7 +351,7 @@ def run(args, parser):
             target_episodes=num_episodes,
             save_info=args.save_info) as saver:
         rollout(agent, args.env, num_steps, num_episodes, saver,
-                args.no_render, video_dir)
+                not args.render, video_dir)
     agent.stop()
 
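The one-line change above flips the sign of the render argument passed to rollout(): presumably the CLI moved from an opt-out --no-render flag to an opt-in --render flag (the corresponding parser change is not part of this excerpt). A hedged sketch of that flip with a plain argparse parser:

import argparse

parser = argparse.ArgumentParser()
# Assumed new-style flag: rendering is off unless explicitly requested.
parser.add_argument("--render", action="store_true",
                    help="Render the environment during the rollout.")
args = parser.parse_args([])

# rollout() still expects a "no_render" boolean, so the call site negates the
# new flag: `not args.render` is True by default (no rendering).
no_render = not args.render
print(no_render)  # -> True
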
@@ -54,10 +54,16 @@ def create_parser(parser_creator=None):
         type=str,
         help="Connect to an existing Ray cluster at this address instead "
         "of starting a new one.")
+    parser.add_argument(
+        "--ray-ui",
+        action="store_true",
+        help="Whether to enable the Ray web UI.")
+    # Deprecated: Use --ray-ui, instead.
     parser.add_argument(
         "--no-ray-ui",
         action="store_true",
-        help="Whether to disable the Ray web ui.")
+        help="Deprecated! Ray UI is disabled by default now. "
+        "Use `--ray-ui` to enable.")
     parser.add_argument(
         "--local-mode",
         action="store_true",

@@ -171,6 +177,11 @@ def run(args, parser):
             }
         }
 
+    # Ray UI.
+    if args.no_ray_ui:
+        deprecation_warning(old="--no-ray-ui", new="--ray-ui", error=False)
+        args.ray_ui = False
+
     verbose = 1
     for exp in experiments.values():
         # Bazel makes it hard to find files specified in `args` (and `data`).

@@ -234,7 +245,7 @@ def run(args, parser):
         ray.init(address=cluster.address)
     else:
         ray.init(
-            include_dashboard=not args.no_ray_ui,
+            include_dashboard=args.ray_ui,
             address=args.ray_address,
             object_store_memory=args.ray_object_store_memory,
             num_cpus=args.ray_num_cpus,

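The train.py hunks above follow a common flag-deprecation pattern: introduce the new opt-in flag, keep the old flag as a warn-only no-op, and map it onto the new attribute before it is used (here, ray.init(include_dashboard=args.ray_ui, ...)). A self-contained sketch of the same pattern, using the standard warnings module instead of RLlib's deprecation_warning helper:

import argparse
import warnings

parser = argparse.ArgumentParser()
# New opt-in flag: the Ray dashboard stays off unless explicitly enabled.
parser.add_argument("--ray-ui", action="store_true",
                    help="Whether to enable the Ray web UI.")
# Deprecated flag, kept only for backwards compatibility.
parser.add_argument("--no-ray-ui", action="store_true",
                    help="Deprecated! Ray UI is disabled by default now.")

args = parser.parse_args(["--no-ray-ui"])
if args.no_ray_ui:
    warnings.warn("--no-ray-ui is deprecated; use --ray-ui to enable the UI.",
                  DeprecationWarning)
    args.ray_ui = False  # the old flag now simply maps to the new default

print(args.ray_ui)  # -> False; would be passed as include_dashboard=args.ray_ui
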
@@ -416,6 +416,9 @@ def run_learning_tests_from_yaml(
             frameworks = ["tf", "torch"]
         e["config"]["framework"] = "tf"
 
+        e["stop"] = e["stop"] or {}
+        e["pass_criteria"] = e["pass_criteria"] or {}
+
         # For smoke-tests, we just run for n min.
         if smoke_test:
             # 0sec for each(!) experiment/trial.

@@ -425,8 +428,10 @@ def run_learning_tests_from_yaml(
             e["stop"]["time_total_s"] = 0
         else:
             # We also stop early, once we reach the desired reward.
-            e["stop"]["episode_reward_mean"] = \
-                e["pass_criteria"]["episode_reward_mean"]
+            min_reward = e.get("pass_criteria",
+                               {}).get("episode_reward_mean")
+            if min_reward is not None:
+                e["stop"]["episode_reward_mean"] = min_reward
 
         keys = []
         # Generate the torch copy of the experiment.

@@ -450,9 +455,12 @@ def run_learning_tests_from_yaml(
     for k_ in keys:
         e = experiments[k_]
         checks[k_] = {
-            "min_reward": e["pass_criteria"]["episode_reward_mean"],
-            "min_timesteps": e["pass_criteria"]["timesteps_total"],
-            "time_total_s": e["stop"]["time_total_s"],
+            "min_reward": e["pass_criteria"].get(
+                "episode_reward_mean"),
+            "min_throughput": e["pass_criteria"].get(
+                "timesteps_total", 0.0) /
+            (e["stop"].get("time_total_s", 1.0) or 1.0),
+            "time_total_s": e["stop"].get("time_total_s"),
             "failures": 0,
             "passed": False,
         }

@@ -484,11 +492,19 @@ def run_learning_tests_from_yaml(
         trials = run_experiments(experiments_to_run, resume=False, verbose=2)
         all_trials.extend(trials)
 
-        # Check each trial for whether we passed.
+        # Check each experiment for whether it passed.
+        # Criteria is to a) reach reward AND b) to have reached the throughput
+        # defined by `timesteps_total` / `time_total_s`.
-        for t in trials:
-            experiment = re.sub(".+/([^/]+)$", "\\1", t.local_dir)
+        for experiment in experiments_to_run.copy():
+            print(f"Analyzing experiment {experiment} ...")
+            # Collect all trials within this experiment (some experiments may
+            # have num_samples or grid_searches defined).
+            trials_for_experiment = []
+            for t in trials:
+                trial_exp = re.sub(".+/([^/]+)$", "\\1", t.local_dir)
+                if trial_exp == experiment:
+                    trials_for_experiment.append(t)
+            print(f" ... Trials: {trials_for_experiment}.")
 
             # If we have evaluation workers, use their rewards.
             # This is useful for offline learning tests, where

@@ -497,33 +513,58 @@ def run_learning_tests_from_yaml(
                 "evaluation_interval", None) is not None
 
             # Error: Increase failure count and repeat.
-            if t.status == "ERROR":
+            if any(t.status == "ERROR" for t in trials_for_experiment):
+                print(" ... ERROR.")
                 checks[experiment]["failures"] += 1
             # Smoke-tests always succeed.
             elif smoke_test:
+                print(" ... SMOKE TEST (mark ok).")
                 checks[experiment]["passed"] = True
                 del experiments_to_run[experiment]
             # Experiment finished: Check reward achieved and timesteps done
             # (throughput).
             else:
-                reward_mean = \
-                    t.last_result["evaluation"]["episode_reward_mean"] if \
-                    check_eval else t.last_result["episode_reward_mean"]
+                if check_eval:
+                    episode_reward_mean = np.mean([
+                        t.last_result["evaluation"]["episode_reward_mean"]
+                        for t in trials_for_experiment
+                    ])
+                else:
+                    episode_reward_mean = np.mean([
+                        t.last_result["episode_reward_mean"]
+                        for t in trials_for_experiment
+                    ])
                 desired_reward = checks[experiment]["min_reward"]
 
-                throughput = t.last_result["timesteps_total"] / \
-                    t.last_result["time_total_s"]
-                desired_timesteps = checks[experiment]["min_timesteps"]
-                desired_throughput = \
-                    desired_timesteps / t.stopping_criterion["time_total_s"]
+                timesteps_total = np.mean([
+                    t.last_result["timesteps_total"]
+                    for t in trials_for_experiment
+                ])
+                total_time_s = np.mean([
+                    t.last_result["time_total_s"]
+                    for t in trials_for_experiment
+                ])
+
+                throughput = timesteps_total / (total_time_s or 1.0)
+                desired_throughput = None
+                # TODO(Jun): Stop checking throughput for now.
+                # desired_throughput = checks[experiment]["min_throughput"]
+
+                print(f" ... Desired reward={desired_reward}; "
+                      f"desired throughput={desired_throughput}")
 
                 # We failed to reach desired reward or the desired throughput.
-                if reward_mean < desired_reward or \
-                        throughput < desired_throughput:
+                if (desired_reward and
+                        episode_reward_mean < desired_reward) or \
+                        (desired_throughput and
+                         throughput < desired_throughput):
+                    print(" ... Not successful: Actual "
+                          f"reward={episode_reward_mean}; "
+                          f"actual throughput={throughput}")
                     checks[experiment]["failures"] += 1
                 # We succeeded!
                 else:
+                    print(" ... Successful: (mark ok).")
                     checks[experiment]["passed"] = True
                     del experiments_to_run[experiment]
 
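The last two hunks carry the actual fix: pass criteria become optional, a minimum throughput is derived as the pass_criteria timesteps_total divided by the stop time_total_s (for example, the apex entry above works out to 7000000 / 7200, roughly 972 timesteps/s), and results are averaged over all trials of an experiment instead of being read from a single trial. A standalone sketch of that per-experiment aggregation, using made-up stand-ins rather than real Tune Trial objects:

import re
from types import SimpleNamespace

import numpy as np

# Hypothetical stand-ins for Tune trials; real trials expose local_dir,
# status and last_result in the same way.
trials = [
    SimpleNamespace(local_dir="/tmp/atari-impala", status="TERMINATED",
                    last_result={"episode_reward_mean": 35.0,
                                 "timesteps_total": 40000000,
                                 "time_total_s": 3600.0}),
    SimpleNamespace(local_dir="/tmp/atari-impala", status="TERMINATED",
                    last_result={"episode_reward_mean": 45.0,
                                 "timesteps_total": 50000000,
                                 "time_total_s": 3600.0}),
]

experiment = "atari-impala"
# Collect all trials belonging to this experiment (grid_search or num_samples
# can create more than one).
trials_for_experiment = [
    t for t in trials
    if re.sub(".+/([^/]+)$", "\\1", t.local_dir) == experiment
]

# Average the metrics across those trials, as the new check loop does.
episode_reward_mean = np.mean(
    [t.last_result["episode_reward_mean"] for t in trials_for_experiment])
timesteps_total = np.mean(
    [t.last_result["timesteps_total"] for t in trials_for_experiment])
total_time_s = np.mean(
    [t.last_result["time_total_s"] for t in trials_for_experiment])
throughput = timesteps_total / (total_time_s or 1.0)

print(episode_reward_mean, throughput)  # -> 40.0 12500.0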