# Reaches ~19-20 reward in ~30 minutes (~4M timesteps) on an m4.10xl instance.
# TODO(rliaw): this has regressed in performance
pong-a3c:
    env: PongDeterministic-v4
    run: A3C
    config:
        # Works for both torch and tf.
        framework: tf
        num_workers: 16
        rollout_fragment_length: 20
        vf_loss_coeff: 0.5
        entropy_coeff: 0.01
        gamma: 0.99
        grad_clip: 40.0
        lambda: 1.0
        lr: 0.0001
        observation_filter: NoFilter
        preprocessor_pref: rllib
        model:
            use_lstm: true
            conv_activation: elu
            dim: 42
            grayscale: true
            zero_mean: false
            # Reduced channel depth and kernel size from default.
            # Each entry is [out_channels, kernel, stride].
            conv_filters: [
                [32, [3, 3], 2],
                [32, [3, 3], 2],
                [32, [3, 3], 2],
                [32, [3, 3], 2],
            ]
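
# Usage sketch, assuming this file lives at tuned_examples/pong-a3c.yaml
# (the path is illustrative). Run it through the RLlib CLI:
#
#   rllib train -f tuned_examples/pong-a3c.yaml
#
# or load it programmatically with Tune, which accepts the same
# name -> experiment-spec mapping that this file defines:
#
#   import yaml
#   import ray
#   from ray import tune
#
#   with open("tuned_examples/pong-a3c.yaml") as f:
#       experiments = yaml.safe_load(f)  # {"pong-a3c": {"env": ..., "run": "A3C", "config": {...}}}
#   ray.init()
#   tune.run_experiments(experiments)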