# On a single GPU, this achieves maximum reward in ~15-20 minutes.
#
# $ python train.py -f tuned_examples/pong-ppo.yaml
#
# Single experiment definition: the top-level key is the experiment name,
# with the environment, the algorithm, and the algorithm config beneath it.
pong-ppo:
  env: PongNoFrameskip-v4
  run: PPO
  config:
    # Loss / optimization hyperparameters.
    lambda: 0.95
    kl_coeff: 0.5
    clip_rewards: true
    clip_param: 0.1
    vf_clip_param: 10.0
    entropy_coeff: 0.01

    # Batch sizing and SGD schedule.
    train_batch_size: 5000
    rollout_fragment_length: 20
    sgd_minibatch_size: 500
    num_sgd_iter: 10

    # Sampling parallelism.
    num_workers: 32
    num_envs_per_worker: 5
    batch_mode: truncate_episodes
    observation_filter: NoFilter

    # NOTE(review): some RLlib releases expect `vf_share_layers` under
    # `model` rather than at this level -- confirm against the installed
    # version before moving it.
    vf_share_layers: true
    num_gpus: 1

    model:
      # presumably the square side length observations are resized to --
      # TODO confirm against the model catalog of the RLlib version in use.
      dim: 42