# ray/rllib/algorithms/maddpg/maddpg.py
"""Contributed port of MADDPG from OpenAI baselines.
The implementation makes a couple of assumptions:
- The number of agents is fixed and known upfront.
- Each agent is bound to a policy of the same name.
- Discrete actions are sent as logits (pre-softmax).
For a minimal example, see rllib/examples/two_step_game.py,
and the README for how to run with the multi-agent particle envs.
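
As an illustrative sketch only (the env name is a placeholder and `PolicySpec`
comes from ray.rllib.policy.policy; neither is provided by this module), a
two-agent setup that satisfies these assumptions could look like:

    config = (
        MADDPGConfig()
        .environment("my_two_agent_env")
        .multi_agent(
            # One policy per agent, named after the agent it controls; each
            # MADDPG policy is also told its own agent index.
            policies={
                "agent_0": PolicySpec(config={"agent_id": 0}),
                "agent_1": PolicySpec(config={"agent_id": 1}),
            },
            policy_mapping_fn=lambda agent_id, *args, **kwargs: agent_id,
        )
    )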
"""
import logging
from typing import List, Optional, Type
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from ray.rllib.algorithms.dqn.dqn import DQN
from ray.rllib.algorithms.maddpg.maddpg_tf_policy import MADDPGTFPolicy
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch
from ray.rllib.utils.annotations import Deprecated, override
from ray.rllib.utils.typing import AlgorithmConfigDict
from ray.rllib.utils.deprecation import DEPRECATED_VALUE
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class MADDPGConfig(AlgorithmConfig):
"""Defines a configuration class from which a MADDPG Algorithm can be built.
Example:
        >>> from ray.rllib.algorithms.maddpg.maddpg import MADDPGConfig
        >>> config = MADDPGConfig()
        >>> print(config.replay_buffer_config)
        >>> # Note: dict.update() mutates in place and returns None, so keep
        >>> # a reference to the dict itself, not to update()'s return value.
        >>> replay_config = config.replay_buffer_config
        >>> replay_config.update(
        ...     {
        ...         "capacity": 100000,
        ...         "prioritized_replay_alpha": 0.8,
        ...         "prioritized_replay_beta": 0.45,
        ...         "prioritized_replay_eps": 2e-6,
        ...     }
        ... )
        >>> config.training(replay_buffer_config=replay_config)\
        ...     .resources(num_gpus=0)\
        ...     .rollouts(num_rollout_workers=4)\
        ...     .environment("CartPole-v1")
        >>> algo = config.build()
        >>> while True:
        ...     algo.train()
Example:
        >>> from ray.rllib.algorithms.maddpg.maddpg import MADDPGConfig
        >>> from ray import tune
        >>> config = MADDPGConfig()
        >>> config.training(n_step=tune.grid_search([3, 5]))
        >>> config.environment(env="CartPole-v1")
        >>> tune.run(
        ...     "MADDPG",
        ...     stop={"episode_reward_mean": 200},
        ...     config=config.to_dict(),
        ... )
"""
def __init__(self, algo_class=None):
"""Initializes a DQNConfig instance."""
super().__init__(algo_class=algo_class or MADDPG)
# fmt: off
# __sphinx_doc_begin__
# MADDPG specific config settings:
self.agent_id = None
self.use_local_critic = False
self.use_state_preprocessor = False
self.actor_hiddens = [64, 64]
self.actor_hidden_activation = "relu"
self.critic_hiddens = [64, 64]
self.critic_hidden_activation = "relu"
self.n_step = 1
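        # Algorithm to use for "good" policies and for adversary policies,
        # respectively (see the good/adversary agent roles in the multi-agent
        # particle envs).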
self.good_policy = "maddpg"
self.adv_policy = "maddpg"
self.replay_buffer_config = {
"type": "MultiAgentReplayBuffer",
# Specify prioritized replay by supplying a buffer type that supports
# prioritization, for example: MultiAgentPrioritizedReplayBuffer.
"prioritized_replay": DEPRECATED_VALUE,
"capacity": int(1e6),
# Force lockstep replay mode for MADDPG.
"replay_mode": "lockstep",
}
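        # If set, fixes the ratio of timesteps replayed and trained on to
        # timesteps sampled from the environment; None means the native ratio
        # `train_batch_size / rollout_fragment_length`.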
self.training_intensity = None
# Number of timesteps to collect from rollout workers before we start
# sampling from replay buffers for learning. Whether we count this in agent
# steps or environment steps depends on config["multiagent"]["count_steps_by"].
self.num_steps_sampled_before_learning_starts = 1024 * 25
        # Learning rate for the critic (Q-function) optimizer.
        self.critic_lr = 1e-2
        # Learning rate for the actor (policy) optimizer.
        self.actor_lr = 1e-2
        # Update the target network every `target_network_update_freq` sample steps.
        self.target_network_update_freq = 0
        # Polyak coefficient: target = tau * policy + (1 - tau) * target.
        self.tau = 0.01
        # Weight for the actor's feature regularization term.
        self.actor_feature_reg = 0.001
        # Clip gradients to this norm during optimization (None = no clipping).
        self.grad_norm_clipping = 0.5
# Changes to Algorithm's default:
self.rollout_fragment_length = 100
self.train_batch_size = 1024
self.num_workers = 1
self.min_time_s_per_iteration = 0
# fmt: on
# __sphinx_doc_end__
@override(AlgorithmConfig)
def training(
self,
*,
agent_id: Optional[str] = None,
use_local_critic: Optional[bool] = None,
use_state_preprocessor: Optional[bool] = None,
actor_hiddens: Optional[List[int]] = None,
actor_hidden_activation: Optional[str] = None,
critic_hiddens: Optional[List[int]] = None,
critic_hidden_activation: Optional[str] = None,
n_step: Optional[int] = None,
good_policy: Optional[str] = None,
adv_policy: Optional[str] = None,
replay_buffer_config: Optional[dict] = None,
training_intensity: Optional[float] = None,
critic_lr: Optional[float] = None,
actor_lr: Optional[float] = None,
target_network_update_freq: Optional[int] = None,
tau: Optional[float] = None,
actor_feature_reg: Optional[float] = None,
grad_norm_clipping: Optional[float] = None,
**kwargs,
) -> "MADDPGConfig":
"""Sets the training related configuration.
Args:
agent_id: ID of the agent controlled by this policy.
use_local_critic: Use a local critic for this policy.
use_state_preprocessor: Apply a state preprocessor with spec given by the
"model" config option (like other RL algorithms). This is mostly useful
if you have a weird observation shape, like an image. Disabled by
default.
actor_hiddens: Postprocess the policy network model output with these hidden
layers. If `use_state_preprocessor` is False, then these will be the
*only* hidden layers in the network.
actor_hidden_activation: Hidden layers activation of the postprocessing
stage of the policy network.
critic_hiddens: Postprocess the critic network model output with these
hidden layers; again, if use_state_preprocessor is True, then the state
will be preprocessed by the model specified with the "model" config
option first.
            critic_hidden_activation: Hidden layers activation of the
                postprocessing stage of the critic.
n_step: N-step for Q-learning.
good_policy: Algorithm for good policies.
adv_policy: Algorithm for adversary policies.
            replay_buffer_config: Replay buffer config.
                Examples:
                {
                    "_enable_replay_buffer_api": True,
                    "type": "MultiAgentReplayBuffer",
                    "capacity": 50000,
                    "replay_sequence_length": 1,
                }
                - OR -
                {
                    "_enable_replay_buffer_api": True,
                    "type": "MultiAgentPrioritizedReplayBuffer",
                    "capacity": 50000,
                    "prioritized_replay_alpha": 0.6,
                    "prioritized_replay_beta": 0.4,
                    "prioritized_replay_eps": 1e-6,
                    "replay_sequence_length": 1,
                }
                - Where -
                prioritized_replay_alpha: Controls the degree of prioritization
                in the buffer. In other words, when a buffer sample has a
                higher temporal-difference error, how much more likely should
                it be drawn for updating the parameterized Q-network. 0.0
                corresponds to uniform sampling. Setting this much above 1.0
                may quickly make the sampling distribution very peaked (low
                entropy).
                prioritized_replay_beta: Controls the degree of
                importance-sampling correction, which suppresses the influence
                of gradient updates from samples that are drawn with higher
                probability (due to the alpha parameter and their
                temporal-difference error).
                prioritized_replay_eps: Sets a baseline sampling probability,
                so that a sample whose temporal-difference error is zero still
                has some chance of being drawn.
            training_intensity: If set, this fixes the ratio of timesteps that
                are replayed from the buffer and trained on to timesteps that
                are sampled from the environment and stored in the replay
                buffer. Otherwise, replay proceeds at the native ratio
                determined by `train_batch_size / rollout_fragment_length`.
critic_lr: Learning rate for the critic (Q-function) optimizer.
actor_lr: Learning rate for the actor (policy) optimizer.
target_network_update_freq: Update the target network every
`target_network_update_freq` sample steps.
            tau: Polyak averaging coefficient for the target network update:
                target = tau * policy + (1 - tau) * target_policy.
actor_feature_reg: Weights for feature regularization for the actor.
grad_norm_clipping: If not None, clip gradients during optimization at this
value.
Returns:
This updated AlgorithmConfig object.
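
        Example:
            A minimal sketch only; the values below are illustrative, not
            tuned recommendations.

            >>> config = MADDPGConfig()
            >>> config = config.training(
            ...     critic_lr=1e-3,
            ...     actor_lr=1e-4,
            ...     n_step=3,
            ...     tau=0.01,
            ...     grad_norm_clipping=0.5,
            ... )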
"""
# Pass kwargs onto super's `training()` method.
super().training(**kwargs)
if agent_id is not None:
self.agent_id = agent_id
if use_local_critic is not None:
self.use_local_critic = use_local_critic
if use_state_preprocessor is not None:
self.use_state_preprocessor = use_state_preprocessor
if actor_hiddens is not None:
self.actor_hiddens = actor_hiddens
if actor_hidden_activation is not None:
self.actor_hidden_activation = actor_hidden_activation
if critic_hiddens is not None:
self.critic_hiddens = critic_hiddens
if critic_hidden_activation is not None:
self.critic_hidden_activation = critic_hidden_activation
if n_step is not None:
self.n_step = n_step
if good_policy is not None:
self.good_policy = good_policy
if adv_policy is not None:
self.adv_policy = adv_policy
if replay_buffer_config is not None:
self.replay_buffer_config = replay_buffer_config
if training_intensity is not None:
self.training_intensity = training_intensity
if critic_lr is not None:
self.critic_lr = critic_lr
if actor_lr is not None:
self.actor_lr = actor_lr
if target_network_update_freq is not None:
self.target_network_update_freq = target_network_update_freq
if tau is not None:
self.tau = tau
if actor_feature_reg is not None:
self.actor_feature_reg = actor_feature_reg
if grad_norm_clipping is not None:
self.grad_norm_clipping = grad_norm_clipping
return self
def before_learn_on_batch(multi_agent_batch, policies, train_batch_size):
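    """Builds a joint (all-agents) training batch for MADDPG.

    Each policy's sample batch keys are suffixed with that policy's agent
    index (e.g. "obs_0", "new_obs_1"), every policy's target action sampler
    is run on its "new_obs" to produce "new_actions_<i>", and the combined
    sample dict is then handed to all policies as one MultiAgentBatch.
    """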
samples = {}
# Modify keys.
for pid, p in policies.items():
i = p.config["agent_id"]
keys = multi_agent_batch.policy_batches[pid].keys()
keys = ["_".join([k, str(i)]) for k in keys]
samples.update(dict(zip(keys, multi_agent_batch.policy_batches[pid].values())))
# Make ops and feed_dict to get "new_obs" from target action sampler.
new_obs_ph_n = [p.new_obs_ph for p in policies.values()]
new_obs_n = list()
for k, v in samples.items():
if "new_obs" in k:
new_obs_n.append(v)
for i, p in enumerate(policies.values()):
feed_dict = {new_obs_ph_n[i]: new_obs_n[i]}
new_act = p.get_session().run(p.target_act_sampler, feed_dict)
samples.update({"new_actions_%d" % i: new_act})
# Share samples among agents.
policy_batches = {pid: SampleBatch(samples) for pid in policies.keys()}
return MultiAgentBatch(policy_batches, train_batch_size)
class MADDPG(DQN):
@classmethod
@override(DQN)
def get_default_config(cls) -> AlgorithmConfigDict:
return MADDPGConfig().to_dict()
@override(DQN)
def validate_config(self, config: AlgorithmConfigDict) -> None:
"""Adds the `before_learn_on_batch` hook to the config.
This hook is called explicitly prior to TrainOneStep() in the execution
setups for DQN and APEX.
"""
# Call super's validation method.
super().validate_config(config)
def f(batch, workers, config):
policies = dict(
workers.local_worker().foreach_policy_to_train(lambda p, i: (i, p))
)
return before_learn_on_batch(batch, policies, config["train_batch_size"])
config["before_learn_on_batch"] = f
@override(DQN)
def get_default_policy_class(self, config: AlgorithmConfigDict) -> Type[Policy]:
return MADDPGTFPolicy
# Deprecated: Use ray.rllib.algorithms.maddpg.MADDPG instead!
class _deprecated_default_config(dict):
def __init__(self):
super().__init__(MADDPGConfig().to_dict())
@Deprecated(
old="ray.rllib.algorithms.maddpg.maddpg.DEFAULT_CONFIG",
new="ray.rllib.algorithms.maddpg.maddpg.MADDPGConfig(...)",
error=False,
)
def __getitem__(self, item):
return super().__getitem__(item)
DEFAULT_CONFIG = _deprecated_default_config()
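
# Migration sketch (names below are placeholders): instead of copying and
# mutating DEFAULT_CONFIG, build a config object, e.g.
#   config = MADDPGConfig().training(critic_lr=1e-3)
#   algo = config.build(env="my_multi_agent_env")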