---
# Experiment spec "pong-apex": runs APEX on PongNoFrameskip-v4.
#
# This reaches ~19 reward in < 40 minutes (3M env steps) on a p3.8xlarge
# AWS instance.
# See https://app.wandb.ai/zplizzi/test/runs/ayuuhixr?workspace=user-zplizzi
# for training curves.
#
# NOTE(review): the original line breaks and indentation were lost, so the
# nesting below is reconstructed. Confirm that compress_observations, gamma
# and training_intensity belong directly under `config` rather than under
# `replay_buffer_config`.
pong-apex:
  env: PongNoFrameskip-v4
  run: APEX

  # Stopping criteria for the run.
  stop:
    episode_reward_mean: 19.0
    timesteps_total: 4000000

  config:
    # Works for both torch and tf.
    framework: tf
    target_network_update_freq: 20000
    num_workers: 4
    num_envs_per_worker: 8
    lr: 0.00005
    train_batch_size: 64
    replay_buffer_config:
      type: MultiAgentPrioritizedReplayBuffer
      capacity: 1000000
    # We should set compress_observations to true because few machines
    # would be able to contain the replay buffers in memory otherwise.
    compress_observations: true
    gamma: 0.99
    training_intensity: 16