# ray/rllib/tuned_examples/sac/mspacman-sac.yaml
# Our implementation of SAC discrete can reach up
# to ~750 reward in 40k timesteps. Run e.g. on a g3.4xlarge with `num_gpus=1`.
# Uses the hyperparameters published in [2] (see rllib/agents/sac/README.md).
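# A typical way to launch this tuned example is via the RLlib CLI (a sketch;
# the path below assumes you run from the `ray/` repo root, and you would set
# `num_gpus: 1` in the config when a GPU is available):
#   rllib train -f rllib/tuned_examples/sac/mspacman-sac.yaml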
mspacman-sac-tf:
    env: MsPacmanNoFrameskip-v4
    run: SAC
    stop:
        episode_reward_mean: 800
        timesteps_total: 100000
    config:
        # Works for both torch and tf.
        framework: tf
        gamma: 0.99
        Q_model:
            fcnet_hiddens: [512]
            fcnet_activation: relu
        policy_model:
            fcnet_hiddens: [512]
            fcnet_activation: relu
        # Do hard syncs.
        # Soft-syncs seem to work less reliably for discrete action spaces.
        tau: 1.0
        target_network_update_freq: 8000
        # Paper uses: 0.98 * -log(1/|A|).
        target_entropy: 1.755
        clip_rewards: 1.0
        no_done_at_end: false
        n_step: 1
        rollout_fragment_length: 1
        prioritized_replay: true
        train_batch_size: 64
        timesteps_per_iteration: 4
        # The paper uses 20k purely random timesteps before training, which is
        # not exactly what `learning_starts` does, but it seems to work
        # nevertheless.
        learning_starts: 20000
        optimization:
            actor_learning_rate: 0.0003
            critic_learning_rate: 0.0003
            entropy_learning_rate: 0.0003
        num_workers: 0
        num_gpus: 0
        metrics_smoothing_episodes: 5