# Our implementation of SAC discrete can reach up to ~750 reward
# in 40k timesteps. Run e.g. on a g3.4xlarge with `num_gpus=1`.
# Uses the hyperparameters published in [2] (see rllib/agents/sac/README.md).
mspacman-sac-tf:
    env: MsPacmanNoFrameskip-v4
    run: SAC
    stop:
        episode_reward_mean: 800
        timesteps_total: 100000
    config:
        use_pytorch: false
        gamma: 0.99
        # State preprocessor: our default Atari Conv2D net.
        use_state_preprocessor: true
        Q_model:
            hidden_activation: relu
            hidden_layer_sizes: [512]
        policy_model:
            hidden_activation: relu
            hidden_layer_sizes: [512]
        # Do hard syncs.
        # Soft-syncs seem to work less reliably for discrete action spaces.
        tau: 1.0
        target_network_update_freq: 8000
        # Paper uses: 0.98 * -log(1/|A|).
        target_entropy: 1.755
        clip_rewards: 1.0
        no_done_at_end: false
        n_step: 1
        rollout_fragment_length: 1
        prioritized_replay: true
        train_batch_size: 64
        timesteps_per_iteration: 4
        # Paper uses 20k random timesteps, which is not exactly the same, but
        # seems to work nevertheless.
        learning_starts: 20000
        optimization:
            actor_learning_rate: 0.0003
            critic_learning_rate: 0.0003
            entropy_learning_rate: 0.0003
        num_workers: 0
        num_gpus: 0
        metrics_smoothing_episodes: 5
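
# Usage (a minimal sketch, kept as comments so this file stays valid YAML):
# assuming this file is saved as mspacman-sac.yaml and an RLlib version that
# still accepts the `use_pytorch` flag, it can be launched with the RLlib CLI:
#
#   rllib train -f mspacman-sac.yaml
#
# The same experiment dict can also be passed to tune.run_experiments() from
# Python after loading this file with a YAML parser.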