mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
[RLlib] Fix the 2 failing RLlib release tests. (#25603)
parent: d5541cccb1
commit: c026374acb
20 changed files with 117 additions and 40 deletions
@@ -2104,7 +2104,7 @@
 # RLlib tests
 ########################

-- name: rllib_learning_tests
+- name: rllib_learning_tests_a_to_e
   group: RLlib tests
   working_dir: rllib_tests

@@ -2117,11 +2117,34 @@
   cluster:
     cluster_env: app_config.yaml
-    cluster_compute: 8gpus_64cpus.yaml
+    cluster_compute: 12gpus_192cpus.yaml

   run:
-    timeout: 14400
-    script: python learning_tests/run.py
+    timeout: 18000
+    script: python learning_tests/run.py --yaml-sub-dir=a-e
     type: sdk_command
     file_manager: job

   alert: default
+
+- name: rllib_learning_tests_f_to_z
+  group: RLlib tests
+  working_dir: rllib_tests
+
+  legacy:
+    test_name: learning_tests
+    test_suite: rllib_tests
+
+  frequency: nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: 8gpus_96cpus.yaml
+
+  run:
+    timeout: 18000
+    script: python learning_tests/run.py --yaml-sub-dir=f-z
+    type: sdk_command
+    file_manager: job
+
+  alert: default
release/rllib_tests/learning_tests/README.md (new file, 21 lines)

@@ -0,0 +1,21 @@
+# RLlib Hard Learning Test
+
+Test the most important RLlib algorithms on tasks that are hard enough to catch performance regressions.
+
+Algorithms in this suite are split into multiple tests so that groups of tests can run in parallel and the total runtime stays reasonable.
+
+All learning tests have ``stop`` and ``pass_criteria`` configured, where ``stop`` specifies a fixed test duration and ``pass_criteria`` specifies performance goals such as minimum reward and minimum throughput.
+
+Unlike normal tuned examples, these learning tests always run for the full specified test duration and do NOT stop early when the ``pass_criteria`` are met.
+
+This is so they can serve better as performance regression tests:
+
+* By giving these tests more time, we get a better idea of where they actually peak (instead of simply stopping at a pre-specified reward), so we can spot minor regressions in peak performance when they happen.
+* By decoupling peak performance from ``pass_criteria``, we can specify relatively conservative ``pass_criteria`` and avoid flaky tests that alternate between passing and failing due to random fluctuations.
+* These conservative passing thresholds still alert us when an algorithm is badly broken.
+* Peak reward and throughput numbers get saved in the DB, so we can hopefully see step-function trends over time as we improve things.
+
+TODO: The time-series chart currently shows no progress when an algorithm learns faster but only reaches the same peak performance.
+For that, we would need to plot multiple lines at different percentage-of-time marks.
+
+If you have any questions about these tests, ping jungong@.
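To make the ``stop`` / ``pass_criteria`` split concrete: each test runs for the fixed ``stop`` duration, and only afterwards are the achieved numbers compared against the configured minima. Below is a minimal standalone sketch of that judgement step, not the actual harness; the ``passes()`` helper and the achieved numbers are made up for illustration, while the pass-criteria values are the IMPALA ones from this commit.

```python
# Minimal sketch of how `pass_criteria` is applied AFTER a fixed-duration run.
# Not RLlib's actual harness -- `passes()` and the achieved numbers are
# illustrative only.

def passes(pass_criteria: dict, achieved: dict) -> bool:
    """True if every configured minimum (reward, timesteps, ...) was reached."""
    return all(
        achieved.get(metric, float("-inf")) >= minimum
        for metric, minimum in pass_criteria.items()
    )


# Values from the impala-breakoutnoframeskip-v4 entry in this commit.
pass_criteria = {"episode_reward_mean": 200.0, "timesteps_total": 6000000}

# Hypothetical peak numbers measured over the full fixed-duration run.
achieved = {"episode_reward_mean": 231.0, "timesteps_total": 6500000}

print(passes(pass_criteria, achieved))  # True -> test is marked as passed
```

Because the run length is fixed, the conservative ``pass_criteria`` only decide pass/fail, while the peak numbers themselves are what get tracked over time.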
@@ -19,10 +19,20 @@ if __name__ == "__main__":
         default=False,
         help="Finish quickly for training.",
     )
+    parser.add_argument(
+        "--yaml-sub-dir",
+        type=str,
+        default="",
+        help="Sub directory under yaml_files/ to look for test files.",
+    )
     args = parser.parse_args()

+    assert args.yaml_sub_dir, "--yaml-sub-dir can't be empty."
+
     # Get path of this very script to look for yaml files.
-    abs_yaml_path = os.path.join(str(Path(__file__).parent), "yaml_files")
+    abs_yaml_path = os.path.join(
+        str(Path(__file__).parent), "yaml_files", args.yaml_sub_dir
+    )
     print("abs_yaml_path={}".format(abs_yaml_path))

     yaml_files = Path(abs_yaml_path).rglob("*.yaml")
@@ -33,12 +43,14 @@ if __name__ == "__main__":
     # Run all tests in the found yaml files.
     results = run_learning_tests_from_yaml(
         yaml_files=yaml_files,
+        # Note(jungong) : run learning tests to full desired duration
+        # for performance regression purpose.
+        # Talk to jungong@ if you have questions about why we do this.
+        use_pass_criteria_as_stop=False,
         smoke_test=args.smoke_test,
     )

-    test_output_json = os.environ.get(
-        "TEST_OUTPUT_JSON", "/tmp/rllib_learning_tests.json"
-    )
+    test_output_json = os.environ.get("TEST_OUTPUT_JSON", "/tmp/learning_test.json")
     with open(test_output_json, "wt") as f:
         json.dump(results, f)
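The new ``--yaml-sub-dir`` flag is what lets the two release jobs above share one ``run.py``. The discovery logic it feeds boils down to the following standalone sketch; the helper name is mine, while the ``a-e`` / ``f-z`` directory names come from the ``release_tests.yaml`` scripts in this commit.

```python
from pathlib import Path


def find_learning_test_yamls(sub_dir: str) -> list:
    """Collect the yaml test files for one release-test group.

    Mirrors the run.py logic above: look under yaml_files/<sub_dir>/ next to
    this script and pick up every *.yaml file recursively.
    """
    assert sub_dir, "--yaml-sub-dir can't be empty."
    abs_yaml_path = Path(__file__).parent / "yaml_files" / sub_dir
    return sorted(str(p) for p in abs_yaml_path.rglob("*.yaml"))


if __name__ == "__main__":
    # The two jobs defined in release_tests.yaml call run.py with
    # --yaml-sub-dir=a-e and --yaml-sub-dir=f-z respectively.
    for group in ("a-e", "f-z"):
        print(group, find_learning_test_yamls(group))
```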
@@ -6,7 +6,7 @@ appo-pongnoframeskip-v4:
         episode_reward_mean: 18.0
         timesteps_total: 5000000
     stop:
-        time_total_s: 3600
+        time_total_s: 1800
     config:
         vtrace: True
         use_kl_loss: False
@@ -3,9 +3,10 @@ bc-halfcheetahbulletenv-v0:
     run: BC
     pass_criteria:
         evaluation/episode_reward_mean: 400.0
-        timesteps_total: 10000000
+        # Can not check throughput for offline methods.
+        # timesteps_total: 10000000
     stop:
-        time_total_s: 3600
+        time_total_s: 7200
     config:
         # Use input produced by expert SAC algo.
         input: ["~/halfcheetah_expert_sac.zip"]
@@ -3,7 +3,8 @@ cql-halfcheetahbulletenv-v0:
     run: CQL
     pass_criteria:
         evaluation/episode_reward_mean: 400.0
-        timesteps_total: 10000000
+        # Can not check throughput for offline methods.
+        # timesteps_total: 10000000
     stop:
         time_total_s: 3600
     config:
@@ -6,7 +6,7 @@ ddpg-hopperbulletenv-v0:
         episode_reward_mean: 110.0
         timesteps_total: 50000
     stop:
-        time_total_s: 3600
+        time_total_s: 1800
     config:
         actor_hiddens: [256, 256]
         critic_hiddens: [256, 256]
@@ -31,6 +31,7 @@ ddpg-hopperbulletenv-v0:
             prioritized_replay_beta: 0.4
             prioritized_replay_eps: 0.000001
             learning_starts: 500
+            worker_side_prioritization: false
         clip_rewards: false
         actor_lr: 0.001
         critic_lr: 0.001
@@ -42,4 +43,3 @@ ddpg-hopperbulletenv-v0:
         num_gpus: 1
         num_workers: 0
         num_gpus_per_worker: 0
-        worker_side_prioritization: false
@@ -12,13 +12,13 @@ dqn-breakoutnoframeskip-v4:
         dueling: false
         num_atoms: 1
         noisy: false
-        prioritized_replay: false
         n_step: 1
         target_network_update_freq: 8000
         lr: .0000625
         adam_epsilon: .00015
         hiddens: [512]
         replay_buffer_config:
+            type: MultiAgentReplayBuffer
             capacity: 1000000
             learning_starts: 20000
             prioritized_replay_alpha: 0.5
@@ -6,7 +6,7 @@ impala-breakoutnoframeskip-v4:
         episode_reward_mean: 200.0
         timesteps_total: 6000000
     stop:
-        time_total_s: 3600
+        time_total_s: 1800
     config:
         rollout_fragment_length: 50
         train_batch_size: 500
@@ -3,7 +3,8 @@ marwil-halfcheetahbulletenv-v0:
     run: MARWIL
     pass_criteria:
         evaluation/episode_reward_mean: 400.0
-        timesteps_total: 10000000
+        # Can not check throughput for offline methods.
+        # timesteps_total: 10000000
     stop:
         time_total_s: 3600
     config:
@@ -6,7 +6,7 @@ sac-halfcheetahbulletenv-v0:
         episode_reward_mean: 400.0
         timesteps_total: 200000
     stop:
-        time_total_s: 7200
+        time_total_s: 1800
     config:
         horizon: 1000
         soft_horizon: false
@@ -2,7 +2,7 @@ slateq-interest-evolution-recsim-env:
     env: ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv
     run: SlateQ
     pass_criteria:
-        episode_reward_mean: 162.0
+        episode_reward_mean: 160.0
         timesteps_total: 300000
     stop:
         time_total_s: 7200
@@ -6,7 +6,7 @@ td3-halfcheetahbulletenv-v0:
         episode_reward_mean: 400.0
         timesteps_total: 1000000
     stop:
-        time_total_s: 7200
+        time_total_s: 3600
     config:
         num_gpus: 1
         replay_buffer_config:
@@ -11,6 +11,9 @@ a2c-stateless-cartpole:
     config:
         num_gpus: 2
         num_workers: 23
+        # Use a large train batch size to make sure mini batches work
+        # after split to 2 GPU towers.
+        train_batch_size: 200
         lr: 0.001
         # Test w/ LSTMs.
         model:
@@ -577,6 +577,7 @@ def run_learning_tests_from_yaml(
     yaml_files: List[str],
     *,
     max_num_repeats: int = 2,
+    use_pass_criteria_as_stop: bool = True,
     smoke_test: bool = False,
 ) -> Dict[str, Any]:
     """Runs the given experiments in yaml_files and returns results dict.
@@ -585,6 +586,8 @@ def run_learning_tests_from_yaml(
         yaml_files: List of yaml file names.
         max_num_repeats: How many times should we repeat a failed
             experiment?
+        use_pass_criteria_as_stop: Configure the Trial so that it stops
+            as soon as pass criterias are met.
         smoke_test: Whether this is just a smoke-test. If True,
             set time_total_s to 5min and don't early out due to rewards
             or timesteps reached.
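For callers, the new keyword-only argument toggles between the old early-stopping behavior and the full-duration regression mode. A hedged usage sketch follows; the import path is an assumption about where this helper lives, and the yaml file path is illustrative.

```python
# Usage sketch for the new keyword-only flag. The import path is an
# assumption (the helper appears to live in RLlib's test utilities);
# the yaml file path is illustrative.
from ray.rllib.utils.test_utils import run_learning_tests_from_yaml

yaml_files = ["yaml_files/a-e/appo-pongnoframeskip-v4.yaml"]

# Release regression mode (what learning_tests/run.py now uses):
# run each test for its full `stop` duration, never stop early on reward.
results = run_learning_tests_from_yaml(
    yaml_files=yaml_files,
    use_pass_criteria_as_stop=False,
)

# Default behavior (unchanged for other callers): also stop a trial as
# soon as the pass-criteria reward has been reached.
results_early_stop = run_learning_tests_from_yaml(
    yaml_files=yaml_files,
    use_pass_criteria_as_stop=True,
)
```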
@@ -635,6 +638,13 @@ def run_learning_tests_from_yaml(
         e["stop"] = e["stop"] if "stop" in e else {}
         e["pass_criteria"] = e["pass_criteria"] if "pass_criteria" in e else {}

+        check_eval = should_check_eval(e)
+        episode_reward_key = (
+            "episode_reward_mean"
+            if not check_eval
+            else "evaluation/episode_reward_mean"
+        )
+
         # For smoke-tests, we just run for n min.
         if smoke_test:
             # 0sec for each(!) experiment/trial.
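The reward key chosen here decides which metric both the early-stop criterion and the later checks use. ``should_check_eval`` is the existing helper in this file; the stand-in below only illustrates the idea and may differ from the real implementation: offline tests such as BC, CQL and MARWIL evaluate against a live environment, so their meaningful reward lives under ``evaluation/``.

```python
# Illustrative stand-in for should_check_eval() -- NOT the actual helper.
# Assumption: an experiment that configures evaluation workers should be
# judged by its evaluation reward rather than its sampling reward.

def should_check_eval_sketch(experiment: dict) -> bool:
    return experiment.get("config", {}).get("evaluation_interval") is not None


def episode_reward_key_for(experiment: dict) -> str:
    return (
        "evaluation/episode_reward_mean"
        if should_check_eval_sketch(experiment)
        else "episode_reward_mean"
    )


# Hypothetical experiment configs modeled on the yaml files in this commit.
offline_like = {"config": {"evaluation_interval": 1, "input": ["~/halfcheetah_expert_sac.zip"]}}
online_like = {"config": {"vtrace": True}}
print(episode_reward_key_for(offline_like))  # evaluation/episode_reward_mean
print(episode_reward_key_for(online_like))   # episode_reward_mean
```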
@@ -643,16 +653,11 @@ def run_learning_tests_from_yaml(
             # create its Algorithm and run a first iteration.
             e["stop"]["time_total_s"] = 0
         else:
-            check_eval = should_check_eval(e)
-            episode_reward_key = (
-                "episode_reward_mean"
-                if not check_eval
-                else "evaluation/episode_reward_mean"
-            )
-            # We also stop early, once we reach the desired reward.
-            min_reward = e.get("pass_criteria", {}).get(episode_reward_key)
-            if min_reward is not None:
-                e["stop"][episode_reward_key] = min_reward
+            if use_pass_criteria_as_stop:
+                # We also stop early, once we reach the desired reward.
+                min_reward = e.get("pass_criteria", {}).get(episode_reward_key)
+                if min_reward is not None:
+                    e["stop"][episode_reward_key] = min_reward

         # Generate `checks` dict for all experiments
         # (tf, tf2 and/or torch).
@@ -664,7 +669,7 @@ def run_learning_tests_from_yaml(
                 ec["config"]["eager_tracing"] = True

             checks[k_] = {
-                "min_reward": ec["pass_criteria"].get("episode_reward_mean", 0.0),
+                "min_reward": ec["pass_criteria"].get(episode_reward_key, 0.0),
                 "min_throughput": ec["pass_criteria"].get("timesteps_total", 0.0)
                 / (ec["stop"].get("time_total_s", 1.0) or 1.0),
                 "time_total_s": ec["stop"].get("time_total_s"),
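The ``min_throughput`` entry above is derived directly from the yaml values: required ``timesteps_total`` divided by the fixed ``time_total_s``. A quick worked check using numbers from this commit (plain Python, not the harness itself):

```python
# Worked example of the min_throughput formula from the checks[k_] dict above.

def min_throughput(pass_criteria: dict, stop: dict) -> float:
    # Missing pass-criteria timesteps or a missing stop time degrade to
    # "no effective throughput requirement".
    return pass_criteria.get("timesteps_total", 0.0) / (
        stop.get("time_total_s", 1.0) or 1.0
    )


# impala-breakoutnoframeskip-v4 after this commit: 6M timesteps in 1800s.
print(min_throughput({"timesteps_total": 6000000}, {"time_total_s": 1800}))
# -> ~3333 required timesteps/s

# Offline tests (BC/CQL/MARWIL) comment out timesteps_total, so their
# required throughput collapses to 0.0, i.e. it is effectively not checked.
print(min_throughput({}, {"time_total_s": 7200}))  # -> 0.0
```

Note that a later hunk in this commit disables the throughput comparison entirely (``desired_throughput = None``), so ``min_throughput`` is currently recorded but not enforced.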
@@ -677,10 +682,6 @@ def run_learning_tests_from_yaml(
             # One experiment to run.
             experiments[k_] = ec

-    # Print out the actual config.
-    print("== Test config ==")
-    print(yaml.dump(experiments))
-
     # Keep track of those experiments we still have to run.
     # If an experiment passes, we'll remove it from this dict.
     experiments_to_run = experiments.copy()
|
|||
|
||||
print(f"Starting learning test iteration {i}...")
|
||||
|
||||
# Print out the actual config.
|
||||
print("== Test config ==")
|
||||
print(yaml.dump(experiments_to_run))
|
||||
|
||||
# Run remaining experiments.
|
||||
trials = run_experiments(
|
||||
experiments_to_run,
|
||||
|
@ -713,6 +718,7 @@ def run_learning_tests_from_yaml(
|
|||
"episode_reward_mean": "reward_mean",
|
||||
"evaluation/episode_reward_mean": "eval_reward_mean",
|
||||
},
|
||||
parameter_columns=["framework"],
|
||||
sort_by_metric=True,
|
||||
max_report_frequency=30,
|
||||
),
|
||||
|
@ -748,22 +754,24 @@ def run_learning_tests_from_yaml(
|
|||
# Experiment finished: Check reward achieved and timesteps done
|
||||
# (throughput).
|
||||
else:
|
||||
# Use best_result's reward to check min_reward.
|
||||
if check_eval:
|
||||
episode_reward_mean = np.mean(
|
||||
[
|
||||
t.last_result["evaluation"]["episode_reward_mean"]
|
||||
t.metric_analysis["evaluation/episode_reward_mean"]["max"]
|
||||
for t in trials_for_experiment
|
||||
]
|
||||
)
|
||||
else:
|
||||
episode_reward_mean = np.mean(
|
||||
[
|
||||
t.last_result["episode_reward_mean"]
|
||||
t.metric_analysis["episode_reward_mean"]["max"]
|
||||
for t in trials_for_experiment
|
||||
]
|
||||
)
|
||||
desired_reward = checks[experiment]["min_reward"]
|
||||
|
||||
# Use last_result["timesteps_total"] to check throughput.
|
||||
timesteps_total = np.mean(
|
||||
[t.last_result["timesteps_total"] for t in trials_for_experiment]
|
||||
)
|
||||
|
@ -773,8 +781,11 @@ def run_learning_tests_from_yaml(
|
|||
|
||||
# TODO(jungong) : track training- and env throughput separately.
|
||||
throughput = timesteps_total / (total_time_s or 1.0)
|
||||
# TODO(jungong) : enable throughput check again after
|
||||
# TD3_HalfCheetahBulletEnv is fixed and verified.
|
||||
# Throughput verification is not working. Many algorithm, e.g. TD3,
|
||||
# achieves the learning goal, but fails the throughput check
|
||||
# miserably.
|
||||
# TODO(jungong): Figure out why.
|
||||
#
|
||||
# desired_throughput = checks[experiment]["min_throughput"]
|
||||
desired_throughput = None
|
||||
|
||||
|
@ -803,7 +814,11 @@ def run_learning_tests_from_yaml(
|
|||
checks[experiment]["failures"] += 1
|
||||
# We succeeded!
|
||||
else:
|
||||
print(" ... Successful: (mark ok).")
|
||||
print(
|
||||
" ... Successful: (mark ok). Actual "
|
||||
f"reward={episode_reward_mean}; "
|
||||
f"actual throughput={throughput}"
|
||||
)
|
||||
checks[experiment]["passed"] = True
|
||||
del experiments_to_run[experiment]
|
||||
|
||||
|
|