# ray/rllib/agents/sac/sac.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
from ray.rllib.agents.sac.sac_policy import SACTFPolicy
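
# Config keys from DEFAULT_CONFIG that are forwarded to the replay optimizer
# (an assumption: this mirrors the list consumed by the DQN-style optimizer
# in dqn.py, which GenericOffPolicyTrainer builds on).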
OPTIMIZER_SHARED_CONFIGS = [
    "buffer_size", "prioritized_replay", "prioritized_replay_alpha",
    "prioritized_replay_beta", "prioritized_replay_eps", "sample_batch_size",
    "train_batch_size", "learning_starts"
]

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # === Model ===
    "twin_q": True,
    "use_state_preprocessor": False,
    "policy": "GaussianLatentSpacePolicy",
    # RLlib model options for the Q function(s).
    "Q_model": {
        "hidden_activation": "relu",
        "hidden_layer_sizes": (256, 256),
    },
    # RLlib model options for the policy function.
    "policy_model": {
        "hidden_activation": "relu",
        "hidden_layer_sizes": (256, 256),
    },
    # === Learning ===
    # Update the target by \tau * policy + (1 - \tau) * target_policy.
    "tau": 5e-3,
    # Target entropy lower bound (this acts as an inverse reward scale);
    # the entropy coefficient is optimized automatically to match it.
    "target_entropy": "auto",
    # Don't set done=True at the end of the episode (e.g. on time-limit
    # truncation), so bootstrapping continues past artificial boundaries.
    "no_done_at_end": True,
    # N-step targets (1 recovers the standard one-step SAC update).
    "n_step": 1,
    # === Evaluation ===
    # The evaluation stats will be reported under the "evaluation" metric key.
    "evaluation_interval": 1,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 1,
    # Extra configuration that disables exploration.
    "evaluation_config": {
        "exploration_enabled": False,
    },
    # === Exploration ===
    # Number of env steps to optimize for before returning.
    "timesteps_per_iteration": 1000,
    "exploration_enabled": True,
    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": int(1e6),
    # If True, a prioritized replay buffer will be used.
    # TODO(hartikainen): Make sure this works or remove the option.
    "prioritized_replay": False,
    "prioritized_replay_alpha": 0.6,
    "prioritized_replay_beta": 0.4,
    "prioritized_replay_eps": 1e-6,
    "beta_annealing_fraction": 0.2,
    "final_prioritized_replay_beta": 0.4,
    "compress_observations": False,
    # === Optimization ===
    "optimization": {
        "actor_learning_rate": 3e-4,
        "critic_learning_rate": 3e-4,
        "entropy_learning_rate": 3e-4,
    },
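    # (Assumption: each rate above feeds its own optimizer, for the policy,
    # the Q function(s), and the entropy coefficient respectively.)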
    # If not None, clip gradients during optimization at this value.
    "grad_norm_clipping": None,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1500,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
    "sample_batch_size": 1,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 256,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,
    # === Parallelism ===
    # Whether to use a GPU for local optimization.
    "num_gpus": 0,
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span.
    "min_iter_time_s": 1,
    # TODO(ekl): these are unused; remove them from the SAC config.
    "per_worker_exploration": False,
    "exploration_fraction": 0.1,
    "schedule_max_timesteps": 100000,
    "exploration_final_eps": 0.02,
})
# __sphinx_doc_end__
# yapf: enable

SACTrainer = GenericOffPolicyTrainer.with_updates(
    name="SAC", default_config=DEFAULT_CONFIG, default_policy=SACTFPolicy)