import logging
from typing import Type

from ray.rllib.algorithms.dqn import DQNTrainer, DEFAULT_CONFIG as DQN_DEFAULT_CONFIG
from ray.rllib.agents.dqn.r2d2_tf_policy import R2D2TFPolicy
from ray.rllib.agents.dqn.r2d2_torch_policy import R2D2TorchPolicy
from ray.rllib.agents.trainer import Trainer
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import TrainerConfigDict
from ray.rllib.utils.deprecation import DEPRECATED_VALUE

logger = logging.getLogger(__name__)

# fmt: off
# __sphinx_doc_begin__
R2D2_DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    DQN_DEFAULT_CONFIG,  # See keys in dqn.py, which are also supported.
    {
        # Learning rate for adam optimizer.
        "lr": 1e-4,
        # Discount factor.
        "gamma": 0.997,
        # Train batch size (in number of single timesteps).
        "train_batch_size": 64,
        # Adam epsilon hyperparameter.
        "adam_epsilon": 1e-3,
        # Run in parallel by default.
        "num_workers": 2,
        # Batch mode must be complete_episodes.
        "batch_mode": "complete_episodes",

        # === Replay buffer ===
        "replay_buffer_config": {
            "type": "MultiAgentReplayBuffer",
            # Specify prioritized replay by supplying a buffer type that supports
            # prioritization, for example: MultiAgentPrioritizedReplayBuffer.
            "prioritized_replay": DEPRECATED_VALUE,
            # Size of the replay buffer (in sequences, not timesteps).
            "capacity": 100000,
            "storage_unit": "sequences",
            # Set automatically: The number of contiguous environment steps to
            # replay at once. Will be calculated as
            # model->max_seq_len + burn_in.
            # Do not set this manually (leave it at the -1 sentinel)!
            "replay_sequence_length": -1,
            # If > 0, use the `replay_burn_in` first steps of each replay-sampled
            # sequence (starting either from all 0.0-values if
            # `zero_init_states=True` or from the already stored values) to
            # calculate an even more accurate initial state for the actual
            # sequence (starting after this burn-in window). In the burn-in
            # case, the actual length of the sequence used for loss
            # calculation is `n - replay_burn_in` time steps
            # (n=LSTM's/attention net's max_seq_len).
            "replay_burn_in": 0,
        },
        # If True, assume a zero-initialized state input (no matter where in
        # the episode the sequence is located).
        # If False, store the initial states along with each SampleBatch and
        # use them as initial states when running through the network for
        # training; update those initial states during training from the
        # internal state outputs of the immediately preceding sequence.
        "zero_init_states": True,

        # Whether to use the h-function from the paper [1] to scale target
        # values in the R2D2 loss function:
        # h(x) = sign(x) * (sqrt(|x| + 1) - 1) + epsilon * x
        "use_h_function": True,
        # The epsilon parameter from the R2D2 loss function (only used
        # if `use_h_function`=True).
        "h_function_epsilon": 1e-3,

        # Update the target network every `target_network_update_freq` sample steps.
        "target_network_update_freq": 2500,

        # Deprecated keys:
        # Use config["replay_buffer_config"]["replay_burn_in"] instead.
        "burn_in": DEPRECATED_VALUE,
    },
    _allow_unknown_configs=True,
)
# __sphinx_doc_end__
# fmt: on
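

# Illustration only (a sketch, not part of RLlib's public API; the helper
# names below are ours): the invertible value-rescaling function h from the
# R2D2 paper [1] and its closed-form inverse, as selected via
# `use_h_function` and `h_function_epsilon` above. The implementation
# actually used in the loss lives in `r2d2_[tf|torch]_policy.py`.
def _example_h(x: float, epsilon: float = 1e-3) -> float:
    """h(x) = sign(x) * (sqrt(|x| + 1) - 1) + epsilon * x."""
    import math  # Local import to keep this illustration self-contained.

    return math.copysign(1.0, x) * (math.sqrt(abs(x) + 1.0) - 1.0) + epsilon * x


def _example_h_inverse(x: float, epsilon: float = 1e-3) -> float:
    """Inverse of h: applied to the target net's Q output before it is
    combined with the reward (h is then re-applied to the full target)."""
    import math

    return math.copysign(1.0, x) * (
        ((math.sqrt(1.0 + 4.0 * epsilon * (abs(x) + 1.0 + epsilon)) - 1.0)
         / (2.0 * epsilon)) ** 2
        - 1.0
    )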


# Build an R2D2 trainer, which uses the framework-specific Policy
# determined in `get_default_policy_class()` below.
class R2D2Trainer(DQNTrainer):
    """Recurrent Experience Replay in Distrib. Reinforcement Learning (R2D2).

    Trainer defining the distributed R2D2 algorithm.
    See `r2d2_[tf|torch]_policy.py` for the definition of the policies.

    [1] Recurrent Experience Replay in Distributed Reinforcement Learning -
    S Kapturowski, G Ostrovski, J Quan, R Munos, W Dabney - 2019, DeepMind

    Detailed documentation:
    https://docs.ray.io/en/master/rllib-algorithms.html#\
recurrent-replay-distributed-dqn-r2d2
    """

    @classmethod
    @override(DQNTrainer)
    def get_default_config(cls) -> TrainerConfigDict:
        return R2D2_DEFAULT_CONFIG

    @override(DQNTrainer)
    def get_default_policy_class(self, config: TrainerConfigDict) -> Type[Policy]:
        if config["framework"] == "torch":
            return R2D2TorchPolicy
        else:
            return R2D2TFPolicy

    @override(DQNTrainer)
    def validate_config(self, config: TrainerConfigDict) -> None:
        """Checks and updates the config based on settings.

        Rewrites the replay buffer's `replay_sequence_length` to take
        burn-in and the model's `max_seq_len` into account.
        """
        # Call super's validation method.
        super().validate_config(config)

        if config["replay_buffer_config"]["replay_sequence_length"] != -1:
            raise ValueError(
                "`replay_sequence_length` is calculated automatically to be "
                "model->max_seq_len + burn_in!"
            )
        # Set the replay sequence length to the model's max_seq_len plus
        # the burn-in length.
        config["replay_buffer_config"]["replay_sequence_length"] = (
            config["replay_buffer_config"]["replay_burn_in"]
            + config["model"]["max_seq_len"]
        )
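        # E.g. (hypothetical values): `replay_burn_in=4` and
        # `model.max_seq_len=20` result in `replay_sequence_length=24`.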

        if config.get("batch_mode") != "complete_episodes":
            raise ValueError("`batch_mode` must be 'complete_episodes'!")
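

# A minimal usage sketch, not part of the original module. Assumptions: Ray
# is installed, Gym's "CartPole-v0" is available, and a recurrent model is
# required by R2D2 (hence `use_lstm=True`); `example_config` is our name.
if __name__ == "__main__":
    import ray

    ray.init()
    example_config = {
        "env": "CartPole-v0",
        "framework": "torch",
        "num_workers": 0,  # Keep the sketch single-process.
        "model": {"use_lstm": True, "max_seq_len": 20},
    }
    trainer = R2D2Trainer(config=example_config)
    print(trainer.train())  # Run a single training iteration.
    ray.shutdown()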