# This reaches ~19 reward in < 40 minutes (3M env steps) on a p3.8xlarge AWS instance.
# See https://app.wandb.ai/zplizzi/test/runs/ayuuhixr?workspace=user-zplizzi
# for training curves.
pong-apex:
    env: PongNoFrameskip-v4
    run: APEX
    # Termination criteria for the experiment.
    # NOTE(review): presumably the run ends as soon as either threshold is
    # reached (Ray Tune `stop` semantics) - confirm against the Tune docs.
    stop:
        episode_reward_mean: 19.0
        timesteps_total: 4000000
    config:
        # Works for both torch and tf.
        framework: tf
        target_network_update_freq: 20000
        num_workers: 4
        num_envs_per_worker: 8
        lr: 0.00005
        train_batch_size: 64
        replay_buffer_config:
            type: MultiAgentPrioritizedReplayBuffer
            capacity: 1000000
        # Keep compress_observations enabled: with a replay buffer of this
        # capacity, few machines would be able to hold it in memory otherwise.
        compress_observations: true
        gamma: 0.99
        training_intensity: 16