# Time-limited RLlib experiments. Each top-level key names one experiment:
#   env        - environment ID to train on
#   run        - algorithm to run
#   frameworks - deep-learning frameworks the experiment is run with
#   stop       - stopping criteria (here always a `time_total_s` budget)
#   config     - algorithm configuration overrides

# A3C on PongDeterministic-v4 (TensorFlow frameworks only, see TODO).
a3c-pongdeterministic-v4:
  env: PongDeterministic-v4
  run: A3C
  # TODO(sven, jungong): fix A3C on torch and re-enable.
  frameworks: ["tf", "tf2"]
  stop:
    time_total_s: 3600
  config:
    num_gpus: 0
    num_workers: 16
    rollout_fragment_length: 20
    vf_loss_coeff: 0.5
    entropy_coeff: 0.01
    gamma: 0.99
    grad_clip: 40.0
    lambda: 1.0
    lr: 0.0001
    observation_filter: NoFilter
    preprocessor_pref: rllib
    model:
      use_lstm: true
      conv_activation: elu
      dim: 42
      grayscale: true
      zero_mean: false
      # Reduced channel depth and kernel size from default.
      # NOTE(review): each row presumably reads
      # [num_filters, [kernel_h, kernel_w], stride] - confirm against the
      # RLlib model config docs.
      conv_filters:
        - [32, [3, 3], 2]
        - [32, [3, 3], 2]
        - [32, [3, 3], 2]
        - [32, [3, 3], 2]
    # TODO(jungong): flip to true after we have collected some data.
    _disable_execution_plan_api: false

# APEX on BreakoutNoFrameskip-v4 with double-Q, dueling, distributional
# (num_atoms: 1) and noisy-net options all switched off.
apex-breakoutnoframeskip-v4:
  env: BreakoutNoFrameskip-v4
  run: APEX
  frameworks: ["tf", "tf2", "torch"]
  stop:
    time_total_s: 3600
  config:
    double_q: false
    dueling: false
    num_atoms: 1
    noisy: false
    n_step: 3
    lr: 0.0001
    adam_epsilon: 0.00015
    hiddens: [512]
    buffer_size: 1000000
    exploration_config:
      epsilon_timesteps: 200000
      final_epsilon: 0.01
    prioritized_replay_alpha: 0.5
    num_gpus: 1
    num_workers: 8
    num_envs_per_worker: 8
    rollout_fragment_length: 20
    train_batch_size: 512
    target_network_update_freq: 50000
    timesteps_per_iteration: 25000

# APPO on PongNoFrameskip-v4 with V-trace enabled and the KL loss disabled.
# Note the shorter time budget (2000) compared to the other experiments.
appo-pongnoframeskip-v4:
  env: PongNoFrameskip-v4
  run: APPO
  frameworks: ["tf", "tf2", "torch"]
  stop:
    time_total_s: 2000
  config:
    vtrace: true
    use_kl_loss: false
    rollout_fragment_length: 50
    train_batch_size: 750
    num_workers: 31
    broadcast_interval: 1
    max_sample_requests_in_flight_per_worker: 1
    num_multi_gpu_tower_stacks: 1
    num_envs_per_worker: 8
    num_sgd_iter: 2
    vf_loss_coeff: 1.0
    clip_param: 0.3
    num_gpus: 1
    grad_clip: 10
    model:
      dim: 42

# CQL on HalfCheetahBulletEnv-v0, trained from a recorded input file and
# evaluated online via the sampler (see evaluation_config).
cql-halfcheetahbulletenv-v0:
  env: HalfCheetahBulletEnv-v0
  run: CQL
  frameworks: ["tf", "tf2", "torch"]
  stop:
    time_total_s: 3600
  config:
    # Use input produced by expert SAC algo.
    input: ["~/halfcheetah_expert_sac.zip"]
    actions_in_input_normalized: true

    soft_horizon: false
    horizon: 1000
    Q_model:
      fcnet_activation: relu
      fcnet_hiddens: [256, 256, 256]
    policy_model:
      fcnet_activation: relu
      fcnet_hiddens: [256, 256, 256]
    tau: 0.005
    target_entropy: auto
    no_done_at_end: false
    n_step: 3
    rollout_fragment_length: 1
    prioritized_replay: false
    train_batch_size: 256
    target_network_update_freq: 0
    timesteps_per_iteration: 1000
    learning_starts: 256
    optimization:
      actor_learning_rate: 0.0001
      critic_learning_rate: 0.0003
      entropy_learning_rate: 0.0001
    num_workers: 0
    num_gpus: 1
    metrics_smoothing_episodes: 5

    # CQL Configs
    min_q_weight: 5.0
    bc_iters: 20000
    temperature: 1.0
    num_actions: 10
    lagrangian: false

    # Switch on online evaluation.
    evaluation_interval: 3
    evaluation_config:
      input: sampler
    # NOTE(review): assumed to be a top-level config key rather than part of
    # evaluation_config - confirm against the RLlib trainer config.
    always_attach_evaluation_results: true

# SAC on HalfCheetahBulletEnv-v0. Differs from the CQL entry above in using
# live sampling (no `input`), two-layer networks, and prioritized replay.
sac-halfcheetahbulletenv-v0:
  env: HalfCheetahBulletEnv-v0
  run: SAC
  frameworks: ["tf", "tf2", "torch"]
  stop:
    time_total_s: 3600
  config:
    horizon: 1000
    soft_horizon: false
    Q_model:
      fcnet_activation: relu
      fcnet_hiddens: [256, 256]
    policy_model:
      fcnet_activation: relu
      fcnet_hiddens: [256, 256]
    tau: 0.005
    target_entropy: auto
    no_done_at_end: false
    n_step: 3
    rollout_fragment_length: 1
    prioritized_replay: true
    train_batch_size: 256
    target_network_update_freq: 1
    timesteps_per_iteration: 1000
    learning_starts: 10000
    optimization:
      actor_learning_rate: 0.0003
      critic_learning_rate: 0.0003
      entropy_learning_rate: 0.0003
    num_workers: 0
    num_gpus: 1
    metrics_smoothing_episodes: 5