import logging
from typing import Optional, Type

from ray.rllib.algorithms.dqn import DQN, DQNConfig
from ray.rllib.algorithms.r2d2.r2d2_tf_policy import R2D2TFPolicy
from ray.rllib.algorithms.r2d2.r2d2_torch_policy import R2D2TorchPolicy
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import DEPRECATED_VALUE, Deprecated
from ray.rllib.utils.typing import AlgorithmConfigDict

logger = logging.getLogger(__name__)


class R2D2Config(DQNConfig):
    """Defines a configuration class from which an R2D2 Algorithm can be built.

    Example:
        >>> from ray.rllib.algorithms.r2d2.r2d2 import R2D2Config
        >>> config = R2D2Config()
        >>> print(config.h_function_epsilon)
        >>> replay_config = config.replay_buffer_config
        >>> replay_config.update(
        >>>     {
        >>>         "capacity": 1000000,
        >>>         "replay_burn_in": 20,
        >>>     }
        >>> )
        >>> config.training(replay_buffer_config=replay_config)\
        >>>     .resources(num_gpus=1)\
        >>>     .rollouts(num_rollout_workers=30)\
        >>>     .environment("CartPole-v1")
        >>> algo = R2D2(config=config)
        >>> while True:
        >>>     algo.train()

    Example:
        >>> from ray.rllib.algorithms.r2d2.r2d2 import R2D2Config
        >>> from ray import tune
        >>> config = R2D2Config()
        >>> config.training(train_batch_size=tune.grid_search([256, 64]))
        >>> config.environment(env="CartPole-v1")
        >>> tune.run(
        >>>     "R2D2",
        >>>     stop={"episode_reward_mean": 200},
        >>>     config=config.to_dict(),
        >>> )

    Example:
        >>> from ray.rllib.algorithms.r2d2.r2d2 import R2D2Config
        >>> config = R2D2Config()
        >>> print(config.exploration_config)
        >>> explore_config = config.exploration_config
        >>> explore_config.update(
        >>>     {
        >>>         "initial_epsilon": 1.0,
        >>>         "final_epsilon": 0.1,
        >>>         "epsilon_timesteps": 200000,
        >>>     }
        >>> )
        >>> config.training(lr_schedule=[[1, 1e-3], [500, 5e-3]])\
        >>>     .exploration(exploration_config=explore_config)

    Example:
        >>> from ray.rllib.algorithms.r2d2.r2d2 import R2D2Config
        >>> config = R2D2Config()
        >>> print(config.exploration_config)
        >>> explore_config = config.exploration_config
        >>> explore_config.update(
        >>>     {
        >>>         "type": "SoftQ",
        >>>         "temperature": [1.0],
        >>>     }
        >>> )
        >>> config.training(lr_schedule=[[1, 1e-3], [500, 5e-3]])\
        >>>     .exploration(exploration_config=explore_config)
    """

    def __init__(self, algo_class=None):
        """Initializes an R2D2Config instance."""
        super().__init__(algo_class=algo_class or R2D2)

        # fmt: off
        # __sphinx_doc_begin__
        # R2D2-specific settings:
        self.zero_init_states = True
        self.use_h_function = True
        self.h_function_epsilon = 1e-3

        # R2D2 settings overriding DQN ones:
        # .training()
        self.adam_epsilon = 1e-3
        self.lr = 1e-4
        self.gamma = 0.997
        self.train_batch_size = 1000
        self.target_network_update_freq = 1000
        self.training_intensity = 150
        # R2D2 uses a buffer that stores sequences.
        self.replay_buffer_config = {
            "type": "MultiAgentReplayBuffer",
            # Specify prioritized replay by supplying a buffer type that supports
            # prioritization, for example: MultiAgentPrioritizedReplayBuffer.
            "prioritized_replay": DEPRECATED_VALUE,
            # Size of the replay buffer (in sequences, not timesteps).
            "capacity": 100000,
            # This algorithm learns on sequences. We therefore require the replay
            # buffer to slice sampled batches into sequences before replay. How
            # sequences are sliced depends on the parameters
            # `replay_sequence_length`, `replay_burn_in`, and
            # `replay_zero_init_states`.
            "storage_unit": "sequences",
            # Set automatically: The number of contiguous environment steps to
            # replay at once. Will be calculated as
            # `model->max_seq_len + replay_burn_in`. Do not set this manually.
            "replay_sequence_length": -1,
            # If > 0, use the `replay_burn_in` first steps of each replay-sampled
            # sequence (starting either from all 0.0-values if `zero_init_state=True`
            # or from the already stored values) to calculate more accurate
            # initial states for the actual sequence (starting after this burn-in
            # window). In the burn-in case, the actual length of the sequence
            # used for loss calculation is `n - replay_burn_in` time steps
            # (n=LSTM's/attention net's max_seq_len).
            "replay_burn_in": 0,
        }
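        # Editor's note (illustrative, not in the original source): with
        # `model["max_seq_len"] = 20` and `replay_burn_in = 10`,
        # `validate_config()` below sets `replay_sequence_length` to 30; the
        # first 10 steps of each replayed sequence only serve to produce an
        # initial recurrent state and do not contribute to the loss.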

        # .rollouts()
        self.num_workers = 2
        self.batch_mode = "complete_episodes"

        # fmt: on
        # __sphinx_doc_end__

        self.burn_in = DEPRECATED_VALUE

    def training(
        self,
        *,
        zero_init_states: Optional[bool] = None,
        use_h_function: Optional[bool] = None,
        h_function_epsilon: Optional[float] = None,
        **kwargs,
    ) -> "R2D2Config":
        """Sets the training related configuration.

        Args:
            zero_init_states: If True, assume a zero-initialized state input (no
                matter where in the episode the sequence is located).
                If False, store the initial states along with each SampleBatch,
                use them (as initial state when running through the network for
                training), and update those initial states during training (from
                the internal state outputs of the immediately preceding sequence).
            use_h_function: Whether to use the h-function from the paper [1] to
                scale target values in the R2D2 loss function:
                h(x) = sign(x)(√(|x| + 1) − 1) + εx
            h_function_epsilon: The epsilon parameter of the R2D2 loss function
                (only used if `use_h_function=True`).

        Returns:
            This updated AlgorithmConfig object.
        """
        # Pass kwargs onto super's `training()` method.
        super().training(**kwargs)

        if zero_init_states is not None:
            self.zero_init_states = zero_init_states
        if use_h_function is not None:
            self.use_h_function = use_h_function
        if h_function_epsilon is not None:
            self.h_function_epsilon = h_function_epsilon

        return self


class R2D2(DQN):
    """Recurrent Experience Replay in Distributed Reinforcement Learning (R2D2).

    Algorithm defining the distributed R2D2 algorithm.
    See `r2d2_[tf|torch]_policy.py` for the definition of the policies.

    [1] Recurrent Experience Replay in Distributed Reinforcement Learning -
    S Kapturowski, G Ostrovski, J Quan, R Munos, W Dabney - 2019, DeepMind

    Detailed documentation:
    https://docs.ray.io/en/master/rllib-algorithms.html#\
    recurrent-replay-distributed-dqn-r2d2
    """

    @classmethod
    @override(DQN)
    def get_default_config(cls) -> AlgorithmConfigDict:
        return R2D2Config().to_dict()

    @override(DQN)
    def get_default_policy_class(self, config: AlgorithmConfigDict) -> Type[Policy]:
        if config["framework"] == "torch":
            return R2D2TorchPolicy
        else:
            return R2D2TFPolicy

    @override(DQN)
    def validate_config(self, config: AlgorithmConfigDict) -> None:
        """Checks and updates the config based on settings.

        Rewrites the replay buffer's `replay_sequence_length` to take
        burn-in and the model's `max_seq_len` into account.
        """
        # Call super's validation method.
        super().validate_config(config)

        if config["replay_buffer_config"].get("replay_sequence_length", -1) != -1:
            raise ValueError(
                "`replay_sequence_length` is calculated automatically to be "
                "model->max_seq_len + burn_in!"
            )
        # Set the replay sequence length to the model's max_seq_len plus the
        # burn-in window.
        config["replay_buffer_config"]["replay_sequence_length"] = (
            config["replay_buffer_config"]["replay_burn_in"]
            + config["model"]["max_seq_len"]
        )

        if config.get("batch_mode") != "complete_episodes":
            raise ValueError("`batch_mode` must be 'complete_episodes'!")
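
        # Editor's note (illustrative, not in the original source): complete
        # episodes are required so that the buffer can slice whole episodes
        # into sequences whose stored initial RNN states line up with the
        # sequence starts; truncated rollouts could otherwise split an episode
        # across sample batches.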


# Deprecated: Use ray.rllib.algorithms.r2d2.r2d2.R2D2Config instead!
class _deprecated_default_config(dict):
    def __init__(self):
        super().__init__(R2D2Config().to_dict())

    @Deprecated(
        old="ray.rllib.agents.dqn.r2d2::R2D2_DEFAULT_CONFIG",
        new="ray.rllib.algorithms.r2d2.r2d2::R2D2Config(...)",
        error=False,
    )
    def __getitem__(self, item):
        return super().__getitem__(item)


R2D2_DEFAULT_CONFIG = _deprecated_default_config()
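

# Editor's note: the following is an illustrative usage sketch, not part of the
# original RLlib module. It assumes a local Ray installation and the Gym
# environment "CartPole-v1", mirroring the examples in the R2D2Config docstring.
if __name__ == "__main__":
    # Build a small R2D2 config, construct the algorithm, and run one
    # training iteration.
    config = (
        R2D2Config()
        .environment(env="CartPole-v1")
        .rollouts(num_rollout_workers=1)
        .training(use_h_function=True, h_function_epsilon=1e-3)
    )
    algo = R2D2(config=config)
    print(algo.train())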