ray/rllib/tuned_examples/ppo/pendulum-ppo.yaml


# Can expect improvement to -140 reward in ~300-500k timesteps.
pendulum-ppo:
    env: Pendulum-v0
    run: PPO
    stop:
        episode_reward_mean: -500
        timesteps_total: 400000
    config:
        # Works for both torch and tf.
        framework: tf
        train_batch_size: 512
        vf_clip_param: 10.0
        num_workers: 0
        num_envs_per_worker: 20
        lambda: 0.1
        gamma: 0.95
        lr: 0.0003
        sgd_minibatch_size: 64
        num_sgd_iter: 6
        model:
            fcnet_hiddens: [256, 256]
        observation_filter: MeanStdFilter
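
Tuned-example YAMLs like this one are normally launched with the RLlib CLI (rllib train -f <path-to-yaml>), but the same experiment can be expressed directly against the ray.tune API. The sketch below is a minimal equivalent, assuming an older Ray/RLlib release whose tune.run signature and PPO config keys match this file (one that still ships Pendulum-v0 and the num_workers / sgd_minibatch_size names); it is an illustration of the mapping, not the canonical launcher.

# Minimal sketch: running the same tuned experiment via ray.tune instead of
# the `rllib train -f` CLI. Assumes an older Ray release whose PPO config
# keys match this YAML (Pendulum-v0, num_workers, sgd_minibatch_size, etc.).
import ray
from ray import tune

ray.init()

tune.run(
    "PPO",                                  # maps to the `run:` field above
    name="pendulum-ppo",
    stop={                                  # same stopping criteria as `stop:`
        "episode_reward_mean": -500,
        "timesteps_total": 400000,
    },
    config={
        "env": "Pendulum-v0",
        "framework": "tf",                  # works for both torch and tf
        "train_batch_size": 512,
        "vf_clip_param": 10.0,
        "num_workers": 0,                   # sample on the driver process
        "num_envs_per_worker": 20,          # 20 vectorized Pendulum envs
        "lambda": 0.1,
        "gamma": 0.95,
        "lr": 0.0003,
        "sgd_minibatch_size": 64,
        "num_sgd_iter": 6,
        "model": {"fcnet_hiddens": [256, 256]},
        "observation_filter": "MeanStdFilter",
    },
)

Note that num_workers: 0 keeps all sampling on the local (driver) worker, while num_envs_per_worker: 20 still batches twenty Pendulum environments there, so the modest train_batch_size of 512 fills quickly without any remote rollout workers.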