# Source path: ray/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml
# (file-viewer metadata: 28 lines, 805 B, YAML)

# PPO experiment on a Pendulum environment with a transformed action space.
# Can expect improvement to -140 reward in ~300-500k timesteps.
# NOTE(review): the stop threshold below (-500) is much looser than the -140
# mentioned above — presumably a pass/fail bar rather than the expected final
# reward; confirm.
pendulum-ppo:
  env: ray.rllib.examples.env.transformed_action_space_env.TransformedActionPendulum
  run: PPO
  # Stopping criteria for the run.
  stop:
    episode_reward_mean: -500
    timesteps_total: 400000
  # Algorithm configuration passed to the PPO run.
  config:
    # Works for both torch and tf.
    framework: tf
    # Test whether PPO is able to learn in "distorted" action spaces.
    env_config:
      config:
        # Presumably the bounds of the transformed action space; verify
        # against the environment class.
        low: 300.0
        high: 500.0
    # NOTE(review): the keys below are placed at the algorithm-config level
    # (siblings of `env_config`), not inside the env's own config — confirm.
    normalize_actions: true
    clip_actions: false
    vf_clip_param: 10.0
    num_envs_per_worker: 20
    lambda: 0.1
    gamma: 0.95
    lr: 0.0003
    train_batch_size: 512
    sgd_minibatch_size: 64
    num_sgd_iter: 6
    observation_filter: MeanStdFilter