# Can expect improvement to -140 reward in ~300-500k timesteps.
pendulum-ppo:
    env: ray.rllib.examples.env.transformed_action_space_env.TransformedActionPendulum
    run: PPO
    stop:
        episode_reward_mean: -500
        timesteps_total: 400000
    config:
        # Works for both torch and tf.
        framework: tf
        # Test whether PPO is able to learn in "distorted" action spaces.
        env_config:
            config:
                low: 300.0
                high: 500.0
        normalize_actions: true
        clip_actions: false
        vf_clip_param: 10.0
        num_envs_per_worker: 20
        lambda: 0.1
        gamma: 0.95
        lr: 0.0003
        train_batch_size: 512
        sgd_minibatch_size: 64
        num_sgd_iter: 6
        observation_filter: MeanStdFilter
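
# A minimal usage sketch, kept as comments so the file stays valid YAML:
# assuming this file is saved as pendulum-transformed-actions-ppo.yaml (the
# filename is hypothetical), it can be launched through the RLlib CLI; the
# exact flag syntax may vary across Ray versions:
#
#   rllib train -f pendulum-transformed-actions-ppo.yaml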