ray/rllib/algorithms/a3c/a3c_torch_policy.py

from typing import Dict, List, Optional, Type, Union

import ray
from ray.rllib.evaluation.episode import Episode
from ray.rllib.evaluation.postprocessing import (
    compute_gae_for_sample_batch,
    Postprocessing,
)
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_mixins import (
    EntropyCoeffSchedule,
    LearningRateSchedule,
    ValueNetworkMixin,
)
from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import Deprecated
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.numpy import convert_to_numpy
from ray.rllib.utils.torch_utils import apply_grad_clipping, sequence_mask
from ray.rllib.utils.typing import AgentID, TensorType

torch, nn = try_import_torch()


class A3CTorchPolicy(
    ValueNetworkMixin, LearningRateSchedule, EntropyCoeffSchedule, TorchPolicyV2
):
    """PyTorch Policy class used with A3C."""

    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.algorithms.a3c.a3c.A3CConfig().to_dict(), **config)

        TorchPolicyV2.__init__(
            self,
            observation_space,
            action_space,
            config,
            max_seq_len=config["model"]["max_seq_len"],
        )

        ValueNetworkMixin.__init__(self, config)
        LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"])
        EntropyCoeffSchedule.__init__(
            self, config["entropy_coeff"], config["entropy_coeff_schedule"]
        )

        # TODO: Don't require users to call this manually.
        self._initialize_loss_from_dummy_batch()

    @override(TorchPolicyV2)
    def loss(
        self,
        model: ModelV2,
        dist_class: Type[TorchDistributionWrapper],
        train_batch: SampleBatch,
    ) -> Union[TensorType, List[TensorType]]:
        """Constructs the loss function.

        Args:
            model: The Model to calculate the loss for.
            dist_class: The action distribution class.
            train_batch: The training data.

        Returns:
            The A3C loss tensor given the input batch.
        """
        logits, _ = model(train_batch)
        values = model.value_function()

        if self.is_recurrent():
            B = len(train_batch[SampleBatch.SEQ_LENS])
            max_seq_len = logits.shape[0] // B
            mask_orig = sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len)
            valid_mask = torch.reshape(mask_orig, [-1])
        else:
            valid_mask = torch.ones_like(values, dtype=torch.bool)

        dist = dist_class(logits, model)
        log_probs = dist.logp(train_batch[SampleBatch.ACTIONS]).reshape(-1)
        pi_err = -torch.sum(
            torch.masked_select(
                log_probs * train_batch[Postprocessing.ADVANTAGES], valid_mask
            )
        )

        # Compute a value function loss.
        if self.config["use_critic"]:
            value_err = 0.5 * torch.sum(
                torch.pow(
                    torch.masked_select(
                        values.reshape(-1) - train_batch[Postprocessing.VALUE_TARGETS],
                        valid_mask,
                    ),
                    2.0,
                )
            )
        # Ignore the value function.
        else:
            value_err = 0.0

        entropy = torch.sum(torch.masked_select(dist.entropy(), valid_mask))

        total_loss = (
            pi_err
            + value_err * self.config["vf_loss_coeff"]
            - entropy * self.entropy_coeff
        )

        # Store values for stats function in model (tower), such that for
        # multi-GPU, we do not override them during the parallel loss phase.
        model.tower_stats["entropy"] = entropy
        model.tower_stats["pi_err"] = pi_err
        model.tower_stats["value_err"] = value_err

        return total_loss
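
    # For reference: the total loss returned by `loss()` above corresponds to
    #     L = -sum_t log pi(a_t | s_t) * A_t
    #         + vf_loss_coeff * 0.5 * sum_t (V(s_t) - V_target_t)^2
    #         - entropy_coeff * sum_t H[pi(. | s_t)],
    # where every sum runs only over the valid (non-padded) timesteps selected
    # by `valid_mask`.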

    @override(TorchPolicyV2)
    def optimizer(
        self,
    ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]:
        """Returns a torch optimizer (Adam) for A3C."""
        return torch.optim.Adam(self.model.parameters(), lr=self.config["lr"])

    @override(TorchPolicyV2)
    def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
        return convert_to_numpy(
            {
                "cur_lr": self.cur_lr,
                "entropy_coeff": self.entropy_coeff,
                "policy_entropy": torch.mean(
                    torch.stack(self.get_tower_stats("entropy"))
                ),
                "policy_loss": torch.mean(torch.stack(self.get_tower_stats("pi_err"))),
                "vf_loss": torch.mean(torch.stack(self.get_tower_stats("value_err"))),
            }
        )

    @override(TorchPolicyV2)
    def postprocess_trajectory(
        self,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[Episode] = None,
    ):
        sample_batch = super().postprocess_trajectory(sample_batch)
        return compute_gae_for_sample_batch(
            self, sample_batch, other_agent_batches, episode
        )

    @override(TorchPolicyV2)
    def extra_grad_process(
        self, optimizer: "torch.optim.Optimizer", loss: TensorType
    ) -> Dict[str, TensorType]:
        return apply_grad_clipping(self, optimizer, loss)


@Deprecated(
    old="rllib.algorithms.a3c.a3c_torch_policy.add_advantages",
    new="rllib.evaluation.postprocessing.compute_gae_for_sample_batch",
    error=True,
)
def add_advantages(*args, **kwargs):
    pass
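

# Minimal usage sketch (illustrative only, not part of the RLlib source):
# A3CTorchPolicy is normally constructed for you by the A3C algorithm, but it
# can be built by hand from an env's spaces and an A3C config dict, roughly:
#
#     import gymnasium as gym
#     from ray.rllib.algorithms.a3c import A3CConfig
#
#     env = gym.make("CartPole-v1")
#     config = A3CConfig().framework("torch").to_dict()
#     policy = A3CTorchPolicy(env.observation_space, env.action_space, config)
#     obs, _ = env.reset()
#     actions, _, _ = policy.compute_actions([obs])
#
# The exact imports (gym vs. gymnasium, A3CConfig location) depend on the Ray
# version in use.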