ray/rllib/algorithms/td3/td3.py


"""A more stable successor to TD3.
By default, this uses a near-identical configuration to that reported in the
TD3 paper.
"""
from ray.rllib.algorithms.ddpg.ddpg import DDPG, DDPGConfig
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import DEPRECATED_VALUE, Deprecated
from ray.rllib.utils.typing import AlgorithmConfigDict


class TD3Config(DDPGConfig):
"""Defines a configuration class from which a TD3 Algorithm can be built.
2022-05-23 10:07:13 +02:00
    Example:
        >>> from ray.rllib.algorithms.td3 import TD3Config
        >>> config = TD3Config().training(lr=0.01).resources(num_gpus=1)
        >>> print(config.to_dict())
        >>> # Build an Algorithm object from the config and run one training iteration.
        >>> algo = config.build(env="Pendulum-v1")
        >>> algo.train()

    Example:
        >>> from ray.rllib.algorithms.td3 import TD3Config
        >>> from ray import tune
        >>> config = TD3Config()
        >>> # Print out some default values.
        >>> print(config.lr)
        0.0004
        >>> # Update the config object.
        >>> config.training(lr=tune.grid_search([0.001, 0.0001]))
        >>> # Set the config object's env.
        >>> config.environment(env="Pendulum-v1")
        >>> # Use to_dict() to get the old-style python config dict
        >>> # when running with tune.
        >>> tune.run(
        ...     "TD3",
        ...     stop={"episode_reward_mean": 200},
        ...     config=config.to_dict(),
        ... )
    """

    def __init__(self, algo_class=None):
        """Initializes a TD3Config instance."""
        super().__init__(algo_class=algo_class or TD3)

        # fmt: off
        # __sphinx_doc_begin__
        # Override some of DDPG/SimpleQ/Algorithm's default values with
        # TD3-specific values.

        # .training()
        # Largest changes: twin Q functions, delayed policy updates, target
        # smoothing, no l2-regularization.
        self.twin_q = True
        self.policy_delay = 2
        self.smooth_target_policy = True
        self.l2_reg = 0.0
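        # (In TD3, "twin Q" takes the min over two Q-networks when forming the
        # TD target, policy_delay=2 updates the actor only every 2nd critic
        # update, and target smoothing adds clipped Gaussian noise to the
        # target policy's actions.)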
        # Different tau (affecting target network update).
        self.tau = 5e-3
        # Different batch size.
        self.train_batch_size = 100
        # No prioritized replay by default (we may want to change this at some
        # point).
        self.replay_buffer_config = {
            "type": "MultiAgentReplayBuffer",
            # Specify prioritized replay by supplying a buffer type that supports
            # prioritization, for example: MultiAgentPrioritizedReplayBuffer.
            "prioritized_replay": DEPRECATED_VALUE,
            "capacity": 1000000,
            "worker_side_prioritization": False,
        }
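        # A sketch (not the default) of how prioritized replay could be
        # enabled via the buffer type named above; the exact keys depend on
        # this Ray version's MultiAgentPrioritizedReplayBuffer:
        # self.replay_buffer_config = {
        #     "type": "MultiAgentPrioritizedReplayBuffer",
        #     "capacity": 1000000,
        #     "prioritized_replay_alpha": 0.6,
        #     "prioritized_replay_beta": 0.4,
        #     "prioritized_replay_eps": 1e-6,
        # }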
        # Number of timesteps to collect from rollout workers before we start
        # sampling from replay buffers for learning. Whether we count this in
        # agent steps or environment steps depends on
        # config["multiagent"]["count_steps_by"].
        self.num_steps_sampled_before_learning_starts = 10000

        # .exploration()
        # TD3 uses Gaussian Noise by default.
        self.exploration_config = {
            # TD3 uses simple Gaussian noise on top of deterministic NN-output
            # actions (after a possible pure random phase of n timesteps).
            "type": "GaussianNoise",
            # For how many timesteps should we return completely random
            # actions, before we start adding (scaled) noise?
            "random_timesteps": 10000,
            # Gaussian stddev of action noise for exploration.
            "stddev": 0.1,
            # Scaling settings by which the Gaussian noise is scaled before
            # being added to the actions. NOTE: The scale timesteps start only
            # after(!) any random steps have been finished.
            # By default, do not anneal over time (fixed 1.0).
            "initial_scale": 1.0,
            "final_scale": 1.0,
            "scale_timesteps": 1,
        }
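        # To anneal the exploration noise instead of keeping it fixed (a
        # sketch using the GaussianNoise keys above), one could set e.g.:
        # self.exploration_config.update(
        #     {"initial_scale": 1.0, "final_scale": 0.1, "scale_timesteps": 100000}
        # )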
        # __sphinx_doc_end__
        # fmt: on


class TD3(DDPG):
    @classmethod
    @override(DDPG)
    def get_default_config(cls) -> AlgorithmConfigDict:
        return TD3Config().to_dict()


# Deprecated: Use ray.rllib.algorithms.td3.td3.TD3Config instead!
class _deprecated_default_config(dict):
    def __init__(self):
        super().__init__(TD3Config().to_dict())

    @Deprecated(
        old="ray.rllib.algorithms.ddpg.td3::TD3_DEFAULT_CONFIG",
        new="ray.rllib.algorithms.td3.td3::TD3Config(...)",
        error=False,
    )
    def __getitem__(self, item):
        return super().__getitem__(item)


TD3_DEFAULT_CONFIG = _deprecated_default_config()
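

# A minimal usage sketch: build a TD3 Algorithm from TD3Config and run one
# training iteration. Assumes a Ray install where this config API and the
# Gym env "Pendulum-v1" are available.
if __name__ == "__main__":
    config = TD3Config().environment(env="Pendulum-v1")
    algo = config.build()
    print(algo.train())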