[RLlib] Bring back BC and Marwil learning tests. (#21574)

commit 7517aefe05
parent ded4128ebf

5 changed files with 70 additions and 58 deletions
@@ -136,37 +136,35 @@ appo-pongnoframeskip-v4:
         # observation_filter: NoFilter
         # report_length: 3

-# TODO: (sven) Fix all BC-dependent learning tests for cont. actions.
-# These seem quite hard to learn from the SAC-recorded HalfCheetahBulletEnv.
-# bc-halfcheetahbulletenv-v0:
-#     env: HalfCheetahBulletEnv-v0
-#     run: BC
-#     pass_criteria:
-#         episode_reward_mean: 400.0
-#         timesteps_total: 10000000
-#     stop:
-#         time_total_s: 3600
-#     config:
-#         # Use input produced by expert SAC algo.
-#         input: ["~/halfcheetah_expert_sac.zip"]
-#         actions_in_input_normalized: true
+bc-halfcheetahbulletenv-v0:
+    env: HalfCheetahBulletEnv-v0
+    run: BC
+    pass_criteria:
+        evaluation/episode_reward_mean: 400.0
+        timesteps_total: 10000000
+    stop:
+        time_total_s: 3600
+    config:
+        # Use input produced by expert SAC algo.
+        input: ["~/halfcheetah_expert_sac.zip"]
+        actions_in_input_normalized: true

-#         num_gpus: 1
+        num_gpus: 1

-#         model:
-#             fcnet_activation: relu
-#             fcnet_hiddens: [256, 256, 256]
+        model:
+            fcnet_activation: relu
+            fcnet_hiddens: [256, 256, 256]

-#         evaluation_num_workers: 1
-#         evaluation_interval: 3
-#         evaluation_config:
-#             input: sampler
+        evaluation_num_workers: 1
+        evaluation_interval: 3
+        evaluation_config:
+            input: sampler

 cql-halfcheetahbulletenv-v0:
     env: HalfCheetahBulletEnv-v0
     run: CQL
     pass_criteria:
-        episode_reward_mean: 400.0
+        evaluation/episode_reward_mean: 400.0
         timesteps_total: 10000000
     stop:
         time_total_s: 3600
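Note on the reactivated BC test: its pass criterion now keys on the evaluation workers' reward (`evaluation/episode_reward_mean`) rather than the offline training reward, and the CQL test is switched to the same key. A minimal sketch of how such a slash-separated criterion can be resolved against a nested, Tune-style result dict (the `get_nested` helper and the sample numbers are illustrative, not part of this patch):

def get_nested(result, key, sep="/"):
    # Walk an "evaluation/episode_reward_mean"-style key through a nested
    # result dict; return nan when any level is missing.
    node = result
    for part in key.split(sep):
        if not isinstance(node, dict) or part not in node:
            return float("nan")
        node = node[part]
    return node

result = {"evaluation": {"episode_reward_mean": 412.5},
          "timesteps_total": 1200000}
assert get_nested(result, "evaluation/episode_reward_mean") >= 400.0
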
@@ -363,31 +361,31 @@ impala-breakoutnoframeskip-v4:
            ]
        num_gpus: 1

-# marwil-halfcheetahbulletenv-v0:
-#     env: HalfCheetahBulletEnv-v0
-#     run: MARWIL
-#     pass_criteria:
-#         episode_reward_mean: 400.0
-#         timesteps_total: 10000000
-#     stop:
-#         time_total_s: 3600
-#     config:
-#         # Use input produced by expert SAC algo.
-#         input: ["~/halfcheetah_expert_sac.zip"]
-#         actions_in_input_normalized: true
-#         # Switch off input evaluation (data does not contain action probs).
-#         input_evaluation: []
+marwil-halfcheetahbulletenv-v0:
+    env: HalfCheetahBulletEnv-v0
+    run: MARWIL
+    pass_criteria:
+        evaluation/episode_reward_mean: 400.0
+        timesteps_total: 10000000
+    stop:
+        time_total_s: 3600
+    config:
+        # Use input produced by expert SAC algo.
+        input: ["~/halfcheetah_expert_sac.zip"]
+        actions_in_input_normalized: true
+        # Switch off input evaluation (data does not contain action probs).
+        input_evaluation: []

-#         num_gpus: 1
+        num_gpus: 1

-#         model:
-#             fcnet_activation: relu
-#             fcnet_hiddens: [256, 256, 256]
+        model:
+            fcnet_activation: relu
+            fcnet_hiddens: [256, 256, 256]

-#         evaluation_num_workers: 1
-#         evaluation_interval: 1
-#         evaluation_config:
-#             input: sampler
+        evaluation_num_workers: 1
+        evaluation_interval: 1
+        evaluation_config:
+            input: sampler

 ppo-breakoutnoframeskip-v4:
     env: BreakoutNoFrameskip-v4
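The reactivated MARWIL test, like the BC one above, trains purely from the recorded SAC data but is judged against the live environment: `evaluation_config: {input: sampler}` overrides the offline `input` on the evaluation workers only. A rough sketch of that override idea, with a plain dict merge standing in for RLlib's internal config handling (the merge shown here is an assumption, not the actual implementation):

import copy

# Training config: learn from recorded expert data only.
train_config = {
    "input": ["~/halfcheetah_expert_sac.zip"],
    "actions_in_input_normalized": True,
    "input_evaluation": [],
}
# Evaluation-only override: sample fresh episodes from the real env.
evaluation_config = {"input": "sampler"}

eval_worker_config = copy.deepcopy(train_config)
eval_worker_config.update(evaluation_config)

assert eval_worker_config["input"] == "sampler"  # live rollouts for eval
assert train_config["input"] == ["~/halfcheetah_expert_sac.zip"]  # unchanged
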
@@ -189,7 +189,7 @@ class TestTrainer(unittest.TestCase):
         # Eval results are not available at step 0.
         # But step 3 should still have it, even though no eval was
         # run during that step.
-        self.assertFalse("evaluation" in r0)
+        self.assertTrue("evaluation" in r0)
         self.assertTrue("evaluation" in r1)
         self.assertTrue("evaluation" in r2)
         self.assertTrue("evaluation" in r3)
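The updated assertion relies on the trainer now always exposing an `evaluation` entry: a nan placeholder at step 0, and the most recent eval results at later steps even when no eval ran during that step. A simplified, stand-alone illustration of that carry-forward pattern (the `train_step` helper is invented for this sketch and is not RLlib code):

import math

last_eval = {"episode_reward_mean": float("nan")}  # placeholder before any eval

def train_step(step, eval_interval=2):
    # Pretend an evaluation only runs every `eval_interval` steps; the other
    # steps just re-report whatever was measured last.
    global last_eval
    if step > 0 and step % eval_interval == 0:
        last_eval = {"episode_reward_mean": 123.4}
    return {"training_iteration": step, "evaluation": dict(last_eval)}

results = [train_step(i) for i in range(4)]
assert all("evaluation" in r for r in results)  # key present at every step
assert math.isnan(results[0]["evaluation"]["episode_reward_mean"])
assert results[3]["evaluation"]["episode_reward_mean"] == 123.4
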
@@ -724,10 +724,19 @@ class Trainer(Trainable):
         self._episode_history = []
         self._episodes_to_be_collected = []

-        # Evaluation WorkerSet.
+        # Evaluation WorkerSet and metrics last returned by `self.evaluate()`.
         self.evaluation_workers: Optional[WorkerSet] = None
-        # Metrics most recently returned by `self.evaluate()`.
-        self.evaluation_metrics = {}
+        # Initialize common evaluation_metrics to nan, before they become
+        # available. We want to make sure the metrics are always present
+        # (although their values may be nan), so that Tune does not complain
+        # when we use these as stopping criteria.
+        self.evaluation_metrics = {
+            "evaluation": {
+                "episode_reward_max": np.nan,
+                "episode_reward_min": np.nan,
+                "episode_reward_mean": np.nan,
+            }
+        }

         super().__init__(config, logger_creator, remote_checkpoint_dir,
                          sync_function_tpl)
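The nan pre-fill works because any numeric comparison with nan is False: a stop criterion such as `evaluation/episode_reward_mean >= 400` cannot fire before real evaluation results exist, yet the key is already present so Tune has something to read. A tiny sketch of that behavior (the inline stop check is illustrative, not Tune's actual implementation):

import numpy as np

evaluation_metrics = {
    "evaluation": {
        "episode_reward_max": np.nan,
        "episode_reward_min": np.nan,
        "episode_reward_mean": np.nan,
    }
}

target = 400.0
current = evaluation_metrics["evaluation"]["episode_reward_mean"]
# nan >= 400.0 evaluates to False, so the run keeps going instead of
# stopping early, and there is no KeyError from a missing metric either.
assert not (current >= target)
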
@@ -74,7 +74,7 @@ class AssertEvalCallback(DefaultCallbacks):
         # Make sure we always run exactly the given evaluation duration,
         # no matter what the other settings are (such as
         # `evaluation_num_workers` or `evaluation_parallel_to_training`).
-        if "evaluation" in result:
+        if "evaluation" in result and "hist_stats" in result["evaluation"]:
             hist_stats = result["evaluation"]["hist_stats"]
             # We count in episodes.
             if trainer.config["evaluation_duration_unit"] == "episodes":
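With the nan placeholder in place, a result can now contain an `evaluation` entry that has no `hist_stats` yet, hence the extra guard. Once real eval episodes exist, the callback can count them from the histogram data; a small sketch of that counting, using an assumed `hist_stats` layout with one `episode_lengths` entry per evaluation episode:

result = {
    "evaluation": {
        # Assumed shape: one entry per completed evaluation episode.
        "hist_stats": {"episode_lengths": [173, 200, 188]},
    }
}

evaluation_duration = 3  # e.g. evaluation_duration: 3 with unit "episodes"
if "evaluation" in result and "hist_stats" in result["evaluation"]:
    num_eval_episodes = len(result["evaluation"]["hist_stats"]["episode_lengths"])
    assert num_eval_episodes == evaluation_duration
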
@@ -607,6 +607,13 @@ def run_learning_tests_from_yaml(

     start_time = time.monotonic()

+    def should_check_eval(experiment):
+        # If we have evaluation workers, use their rewards.
+        # This is useful for offline learning tests, where
+        # we evaluate against an actual environment.
+        return experiment["config"].get("evaluation_interval",
+                                        None) is not None
+
     # Loop through all collected files and gather experiments.
     # Augment all by `torch` framework.
     for yaml_file in yaml_files:
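Usage-wise, the helper simply flags experiments that define an `evaluation_interval` (the offline BC/MARWIL tests above) so their rewards are read from the evaluation workers. A self-contained sketch with two hypothetical experiment specs (the helper is restated here so the snippet runs on its own):

def should_check_eval(experiment):
    # Same test as the helper added above: offline learning tests configure
    # evaluation workers and are judged on their evaluation rewards.
    return experiment["config"].get("evaluation_interval", None) is not None

offline_exp = {"config": {"input": ["~/halfcheetah_expert_sac.zip"],
                          "evaluation_interval": 3}}
online_exp = {"config": {"num_workers": 4}}

assert should_check_eval(offline_exp)
assert not should_check_eval(online_exp)
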
@@ -637,11 +644,13 @@ def run_learning_tests_from_yaml(
                 # create its trainer and run a first iteration.
                 e["stop"]["time_total_s"] = 0
             else:
+                check_eval = should_check_eval(e)
+                episode_reward_key = ("episode_reward_mean" if not check_eval
+                                      else "evaluation/episode_reward_mean")
                 # We also stop early, once we reach the desired reward.
-                min_reward = e.get("pass_criteria",
-                                   {}).get("episode_reward_mean")
+                min_reward = e.get("pass_criteria", {}).get(episode_reward_key)
                 if min_reward is not None:
-                    e["stop"]["episode_reward_mean"] = min_reward
+                    e["stop"][episode_reward_key] = min_reward

             # Generate `checks` dict for all experiments
             # (tf, tf2 and/or torch).
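The effect of the new key selection on an experiment's early-stopping config, shown on a hypothetical offline experiment dict (values mirror the YAML above but are only an example):

check_eval = True  # e.g. a BC/MARWIL test with evaluation workers
episode_reward_key = ("episode_reward_mean" if not check_eval
                      else "evaluation/episode_reward_mean")

e = {"pass_criteria": {"evaluation/episode_reward_mean": 400.0,
                       "timesteps_total": 10000000},
     "stop": {"time_total_s": 3600}}

# Same logic as the patched code: stop early once the eval reward is reached.
min_reward = e.get("pass_criteria", {}).get(episode_reward_key)
if min_reward is not None:
    e["stop"][episode_reward_key] = min_reward

assert e["stop"]["evaluation/episode_reward_mean"] == 400.0
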
@@ -723,11 +732,7 @@ def run_learning_tests_from_yaml(
                     trials_for_experiment.append(t)
            print(f" ... Trials: {trials_for_experiment}.")

-            # If we have evaluation workers, use their rewards.
-            # This is useful for offline learning tests, where
-            # we evaluate against an actual environment.
-            check_eval = experiments[experiment]["config"].get(
-                "evaluation_interval", None) is not None
+            check_eval = should_check_eval(experiments[experiment])

             # Error: Increase failure count and repeat.
             if any(t.status == "ERROR" for t in trials_for_experiment):