Mirror of https://github.com/vale981/ray, synced 2025-03-06 18:41:40 -05:00

* Create a core set of algorithm tests to run nightly.
* Run release tests under the tf, tf2, and torch frameworks.
* Fix
* Add eager_tracing option for the tf2 framework.
* Make sure core tests can run in parallel.
* cql
* Report progress while running nightly/weekly tests.
* Include SAC in the nightly lineup.
* Revert changes to learning_tests.
* Rebrand to performance test.
* Update build_pipeline.py with the new performance_tests name.
* Record stats.
* Bug fix: need to populate the experiments dict.
* Alphabetize yaml files.
* Allow specifying frameworks, and do not run tf2 by default.
* Remove some debugging code.
* fix
* Undo testing changes.
* Do not run the CQL regression for now.
* LINT.

Co-authored-by: sven1977 <svenmika1977@gmail.com>
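As a rough, hypothetical illustration (not part of this commit), a single entry of the YAML file below could be launched locally with Ray Tune along the following lines. The file path and the chosen experiment name are placeholders, and `pass_criteria` is assumed to be consumed by the release-test harness rather than by Tune, so it is stripped before running:

import yaml

import ray
from ray import tune

# Load the performance-test YAML (placeholder path).
with open("performance_tests.yaml") as f:
    experiments = yaml.safe_load(f)

# Pick one experiment out of the suite.
name = "a2c-breakoutnoframeskip-v4"
spec = experiments[name]

# `pass_criteria` is assumed to be read by the nightly harness, not by Tune.
spec.pop("pass_criteria", None)
# RLlib expects the environment name inside the trainer config.
spec["config"]["env"] = spec.pop("env")

ray.init()
tune.run_experiments({name: spec})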
470 lines
13 KiB
YAML
a2c-breakoutnoframeskip-v4:
    env: BreakoutNoFrameskip-v4
    run: A2C
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 50.0
        timesteps_total: 5000000
    stop:
        time_total_s: 7200
    config:
        rollout_fragment_length: 20
        clip_rewards: True
        num_workers: 5
        num_envs_per_worker: 5
        num_gpus: 1
        lr_schedule: [
            [0, 0.0007],
            [20000000, 0.000000000001],
        ]

# a3c-pongdeterministic-v4:
#     env: PongDeterministic-v4
#     run: A3C
#     # Minimum reward and total ts (in given time_total_s) to pass this test.
#     pass_criteria:
#         episode_reward_mean: 18.0
#         timesteps_total: 5000000
#     stop:
#         time_total_s: 3600
#     config:
#         ignore_worker_failures: true
#         num_gpus: 0
#         num_workers: 16
#         rollout_fragment_length: 20
#         vf_loss_coeff: 0.5
#         entropy_coeff: 0.01
#         gamma: 0.99
#         grad_clip: 40.0
#         lambda: 1.0
#         lr: 0.0001
#         observation_filter: NoFilter
#         preprocessor_pref: rllib
#         model:
#             use_lstm: true
#             conv_activation: elu
#             dim: 42
#             grayscale: true
#             zero_mean: false
#             # Reduced channel depth and kernel size from default.
#             conv_filters: [
#                 [32, [3, 3], 2],
#                 [32, [3, 3], 2],
#                 [32, [3, 3], 2],
#                 [32, [3, 3], 2],
#             ]

apex-breakoutnoframeskip-v4:
    env: BreakoutNoFrameskip-v4
    run: APEX
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 20.0
        timesteps_total: 7000000
    stop:
        time_total_s: 7200
    config:
        double_q: false
        dueling: false
        num_atoms: 1
        noisy: false
        n_step: 3
        lr: .0001
        adam_epsilon: .00015
        hiddens: [512]
        buffer_size: 1000000
        exploration_config:
            epsilon_timesteps: 200000
            final_epsilon: 0.01
        prioritized_replay_alpha: 0.5
        final_prioritized_replay_beta: 1.0
        prioritized_replay_beta_annealing_timesteps: 2000000
        num_gpus: 1
        num_workers: 8
        num_envs_per_worker: 8
        rollout_fragment_length: 20
        train_batch_size: 512
        target_network_update_freq: 50000
        timesteps_per_iteration: 25000

appo-pongnoframeskip-v4:
    env: PongNoFrameskip-v4
    run: APPO
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 18.0
        timesteps_total: 5000000
    stop:
        time_total_s: 2000
    config:
        vtrace: True
        use_kl_loss: False
        rollout_fragment_length: 50
        train_batch_size: 750
        num_workers: 31
        broadcast_interval: 1
        max_sample_requests_in_flight_per_worker: 1
        num_multi_gpu_tower_stacks: 1
        num_envs_per_worker: 8
        num_sgd_iter: 2
        vf_loss_coeff: 1.0
        clip_param: 0.3
        num_gpus: 1
        grad_clip: 10
        model:
            dim: 42

# ARS was never tested/tuned on Hopper. Maybe change to ReacherBulletEnv-v0?
# ars-hopperbulletenv-v0:
#     env: HopperBulletEnv-v0
#     run: ARS
#     # Minimum reward and total ts (in given time_total_s) to pass this test.
#     pass_criteria:
#         episode_reward_mean: 100.0
#         timesteps_total: 2000000
#     stop:
#         time_total_s: 2000
#     config:
#         noise_stdev: 0.01
#         num_rollouts: 1
#         rollouts_used: 1
#         num_workers: 1
#         sgd_stepsize: 0.02
#         noise_size: 250000000
#         eval_prob: 0.2
#         offset: 0
#         observation_filter: NoFilter
#         report_length: 3

# TODO: (sven) Fix all BC-dependent learning tests for cont. actions.
# These seem quite hard to learn from the SAC-recorded HalfCheetahBulletEnv.
# bc-halfcheetahbulletenv-v0:
#     env: HalfCheetahBulletEnv-v0
#     run: BC
#     pass_criteria:
#         episode_reward_mean: 400.0
#         timesteps_total: 10000000
#     stop:
#         time_total_s: 3600
#     config:
#         # Use input produced by expert SAC algo.
#         input: ["~/halfcheetah_expert_sac.zip"]
#         actions_in_input_normalized: true

#         num_gpus: 1

#         model:
#             fcnet_activation: relu
#             fcnet_hiddens: [256, 256, 256]

#         evaluation_num_workers: 1
#         evaluation_interval: 3
#         evaluation_config:
#             input: sampler

# cql-halfcheetahbulletenv-v0:
#     env: HalfCheetahBulletEnv-v0
#     run: CQL
#     pass_criteria:
#         episode_reward_mean: 400.0
#         timesteps_total: 10000000
#     stop:
#         time_total_s: 3600
#     config:
#         # Use input produced by expert SAC algo.
#         input: ["~/halfcheetah_expert_sac.zip"]
#         actions_in_input_normalized: true
#
#         soft_horizon: False
#         horizon: 1000
#         Q_model:
#             fcnet_activation: relu
#             fcnet_hiddens: [256, 256, 256]
#         policy_model:
#             fcnet_activation: relu
#             fcnet_hiddens: [256, 256, 256]
#         tau: 0.005
#         target_entropy: auto
#         no_done_at_end: false
#         n_step: 3
#         rollout_fragment_length: 1
#         prioritized_replay: false
#         train_batch_size: 256
#         target_network_update_freq: 0
#         timesteps_per_iteration: 1000
#         learning_starts: 256
#         optimization:
#             actor_learning_rate: 0.0001
#             critic_learning_rate: 0.0003
#             entropy_learning_rate: 0.0001
#         num_workers: 0
#         num_gpus: 1
#         metrics_smoothing_episodes: 5
#
#         # CQL Configs
#         min_q_weight: 5.0
#         bc_iters: 20000
#         temperature: 1.0
#         num_actions: 10
#         lagrangian: False
#
#         # Switch on online evaluation.
#         evaluation_interval: 3
#         evaluation_config:
#             input: sampler

ddpg-hopperbulletenv-v0:
    env: HopperBulletEnv-v0
    run: DDPG
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 110.0
        timesteps_total: 50000
    stop:
        time_total_s: 3600
    config:
        actor_hiddens: [256, 256]
        critic_hiddens: [256, 256]
        n_step: 3
        model: {}
        gamma: 0.99
        env_config: {}
        exploration_config:
            initial_scale: 1.0
            final_scale: 0.02
            scale_timesteps: 10000
            ou_base_scale: 0.1
            ou_theta: 0.15
            ou_sigma: 0.2
        timesteps_per_iteration: 1000
        target_network_update_freq: 0
        tau: 0.001
        buffer_size: 10000
        prioritized_replay: True
        prioritized_replay_alpha: 0.6
        prioritized_replay_beta: 0.4
        prioritized_replay_eps: 0.000001
        clip_rewards: false
        actor_lr: 0.001
        critic_lr: 0.001
        use_huber: true
        huber_threshold: 1.0
        l2_reg: 0.000001
        learning_starts: 500
        rollout_fragment_length: 1
        train_batch_size: 48
        num_gpus: 1
        num_workers: 0
        num_gpus_per_worker: 0
        worker_side_prioritization: false

# Basically the same as atari-ppo, but adapted for DDPPO. Note that DDPPO
# isn't actually any more efficient on Atari, since the network size is
# relatively small and the env doesn't require a GPU.
# ddppo-breakoutnoframeskip-v4:
#     env: BreakoutNoFrameskip-v4
#     run: DDPPO
#     # Minimum reward and total ts (in given time_total_s) to pass this test.
#     pass_criteria:
#         episode_reward_mean: 50.0
#         timesteps_total: 10000000
#     stop:
#         time_total_s: 3600
#     config:
#         # DDPPO only supports PyTorch so far.
#         frameworks: [ "torch" ]
#         # Worker config: 16 workers, each of which requires a (fractional) GPU.
#         num_workers: 16
#         # Workers require GPUs, but 4 workers share 1 GPU.
#         num_gpus_per_worker: 0.25
#         # Each worker will sample 100 * 5 envs per worker steps = 500 steps
#         # per optimization round. This is 8000 steps summed across the 16 workers.
#         rollout_fragment_length: 100
#         num_envs_per_worker: 5
#         # Each worker will take a minibatch of 50. There are 16 workers total,
#         # so the effective minibatch size will be 800.
#         sgd_minibatch_size: 50
#         num_sgd_iter: 30
#         # Params from standard PPO Atari config:
#         lambda: 0.95
#         kl_coeff: 0.5
#         clip_rewards: true
#         clip_param: 0.1
#         vf_loss_coeff: 0.1
#         vf_clip_param: 10.0
#         entropy_coeff: 0.01
#         batch_mode: truncate_episodes
#         observation_filter: NoFilter

dqn-breakoutnoframeskip-v4:
    env: BreakoutNoFrameskip-v4
    run: DQN
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 20.0
        timesteps_total: 400000
    stop:
        time_total_s: 7200
    config:
        double_q: false
        dueling: false
        num_atoms: 1
        noisy: false
        prioritized_replay: false
        n_step: 1
        target_network_update_freq: 8000
        lr: .0000625
        adam_epsilon: .00015
        hiddens: [512]
        learning_starts: 20000
        buffer_size: 1000000
        rollout_fragment_length: 4
        train_batch_size: 32
        exploration_config:
            epsilon_timesteps: 200000
            final_epsilon: 0.01
        prioritized_replay_alpha: 0.5
        final_prioritized_replay_beta: 1.0
        prioritized_replay_beta_annealing_timesteps: 2000000
        num_gpus: 0.5
        timesteps_per_iteration: 10000

es-humanoidbulletenv-v0:
    env: HumanoidBulletEnv-v0
    run: ES
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 100.0
        timesteps_total: 5000000
    stop:
        time_total_s: 3600
    config:
        num_workers: 50

impala-breakoutnoframeskip-v4:
    env: BreakoutNoFrameskip-v4
    run: IMPALA
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 200.0
        timesteps_total: 6000000
    stop:
        time_total_s: 3600
    config:
        rollout_fragment_length: 50
        train_batch_size: 500
        num_workers: 10
        num_envs_per_worker: 5
        clip_rewards: True
        lr_schedule: [
            [0, 0.0005],
            [20000000, 0.000000000001],
        ]
        num_gpus: 1

# marwil-halfcheetahbulletenv-v0:
#     env: HalfCheetahBulletEnv-v0
#     run: MARWIL
#     pass_criteria:
#         episode_reward_mean: 400.0
#         timesteps_total: 10000000
#     stop:
#         time_total_s: 3600
#     config:
#         # Use input produced by expert SAC algo.
#         input: ["~/halfcheetah_expert_sac.zip"]
#         actions_in_input_normalized: true
#         # Switch off input evaluation (data does not contain action probs).
#         input_evaluation: []

#         num_gpus: 1

#         model:
#             fcnet_activation: relu
#             fcnet_hiddens: [256, 256, 256]

#         evaluation_num_workers: 1
#         evaluation_interval: 1
#         evaluation_config:
#             input: sampler

ppo-breakoutnoframeskip-v4:
    env: BreakoutNoFrameskip-v4
    run: PPO
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 50.0
        timesteps_total: 7000000
    stop:
        time_total_s: 7200
    config:
        lambda: 0.95
        kl_coeff: 0.5
        clip_rewards: True
        clip_param: 0.1
        vf_clip_param: 10.0
        entropy_coeff: 0.01
        train_batch_size: 5000
        rollout_fragment_length: 100
        sgd_minibatch_size: 500
        num_sgd_iter: 10
        num_workers: 10
        num_envs_per_worker: 5
        batch_mode: truncate_episodes
        observation_filter: NoFilter
        model:
            vf_share_layers: true
        num_gpus: 1

# Expect roughly 1000 reward after 1h on 1 GPU.
sac-halfcheetahbulletenv-v0:
    env: HalfCheetahBulletEnv-v0
    run: SAC
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 500.0
        timesteps_total: 100000
    stop:
        time_total_s: 7200
    config:
        horizon: 1000
        soft_horizon: false
        Q_model:
            fcnet_activation: relu
            fcnet_hiddens: [256, 256]
        policy_model:
            fcnet_activation: relu
            fcnet_hiddens: [256, 256]
        tau: 0.005
        target_entropy: auto
        no_done_at_end: false
        n_step: 3
        rollout_fragment_length: 1
        prioritized_replay: true
        train_batch_size: 256
        target_network_update_freq: 1
        timesteps_per_iteration: 1000
        learning_starts: 10000
        optimization:
            actor_learning_rate: 0.0003
            critic_learning_rate: 0.0003
            entropy_learning_rate: 0.0003
        num_workers: 0
        num_gpus: 1
        metrics_smoothing_episodes: 5

td3-halfcheetahbulletenv-v0:
    env: HalfCheetahBulletEnv-v0
    run: TD3
    # Minimum reward and total ts (in given time_total_s) to pass this test.
    pass_criteria:
        episode_reward_mean: 400.0
        timesteps_total: 1000000
    stop:
        time_total_s: 7200
    config:
        num_gpus: 1
        learning_starts: 10000
        exploration_config:
            random_timesteps: 10000