# ray/rllib/agents/sac/sac.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
from ray.rllib.agents.sac.sac_policy import SACTFPolicy
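
# Config keys from DEFAULT_CONFIG that are forwarded to the replay optimizer
# (an assumption: this mirrors the list consumed by the DQN-style optimizer
# in dqn.py, which GenericOffPolicyTrainer builds on).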
OPTIMIZER_SHARED_CONFIGS = [
    "buffer_size", "prioritized_replay", "prioritized_replay_alpha",
    "prioritized_replay_beta", "prioritized_replay_eps", "sample_batch_size",
    "train_batch_size", "learning_starts"
]

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # === Model ===
    "twin_q": True,
    "use_state_preprocessor": False,
    "policy": "GaussianLatentSpacePolicy",
    # RLlib model options for the Q function(s).
    "Q_model": {
        "hidden_activation": "relu",
        "hidden_layer_sizes": (256, 256),
    },
    # RLlib model options for the policy function.
    "policy_model": {
        "hidden_activation": "relu",
        "hidden_layer_sizes": (256, 256),
    },
    # === Learning ===
    # Update the target by \tau * policy + (1 - \tau) * target_policy.
    "tau": 5e-3,
    # Target entropy lower bound (this acts as an inverse reward scale);
    # the entropy coefficient is optimized automatically to match it.
    "target_entropy": "auto",
    # Don't set done=True at the end of the episode (e.g. on time-limit
    # truncation), so bootstrapping continues past artificial boundaries.
    "no_done_at_end": True,
    # N-step targets (1 recovers the standard one-step SAC update).
    "n_step": 1,
    # === Evaluation ===
    # The evaluation stats will be reported under the "evaluation" metric key.
    "evaluation_interval": 1,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 1,
    # Extra configuration that disables exploration.
    "evaluation_config": {
        "exploration_enabled": False,
    },
    # === Exploration ===
    # Number of env steps to optimize for before returning.
    "timesteps_per_iteration": 1000,
    "exploration_enabled": True,
    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": int(1e6),
    # If True, a prioritized replay buffer will be used.
    # TODO(hartikainen): Make sure this works or remove the option.
    "prioritized_replay": False,
    "prioritized_replay_alpha": 0.6,
    "prioritized_replay_beta": 0.4,
    "prioritized_replay_eps": 1e-6,
    "beta_annealing_fraction": 0.2,
    "final_prioritized_replay_beta": 0.4,
    "compress_observations": False,
    # === Optimization ===
    "optimization": {
        "actor_learning_rate": 3e-4,
        "critic_learning_rate": 3e-4,
        "entropy_learning_rate": 3e-4,
    },
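    # (Assumption: each rate above feeds its own optimizer, for the policy,
    # the Q function(s), and the entropy coefficient respectively.)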
    # If not None, clip gradients during optimization at this value.
    "grad_norm_clipping": None,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1500,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
    "sample_batch_size": 1,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 256,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,
    # === Parallelism ===
    # Whether to use a GPU for local optimization.
    "num_gpus": 0,
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span.
    "min_iter_time_s": 1,
    # TODO(ekl): these are unused; remove them from the SAC config.
    "per_worker_exploration": False,
    "exploration_fraction": 0.1,
    "schedule_max_timesteps": 100000,
    "exploration_final_eps": 0.02,
})
# __sphinx_doc_end__
# yapf: enable

SACTrainer = GenericOffPolicyTrainer.with_updates(
    name="SAC", default_config=DEFAULT_CONFIG, default_policy=SACTFPolicy)