ray/rllib/agents/ppo/ppo_torch_policy.py

import logging
from typing import Dict, List, Type, Union

import ray
from ray.rllib.agents.ppo.ppo_tf_policy import validate_config
from ray.rllib.evaluation.postprocessing import (
    compute_gae_for_sample_batch,
    Postprocessing,
)
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.action_dist import ActionDistribution
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_mixins import (
    EntropyCoeffSchedule,
    KLCoeffMixin,
    LearningRateSchedule,
    ValueNetworkMixin,
)
from ray.rllib.policy.torch_policy import TorchPolicy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.numpy import convert_to_numpy
from ray.rllib.utils.torch_utils import (
    apply_grad_clipping,
    explained_variance,
    sequence_mask,
)
from ray.rllib.utils.typing import TensorType

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Bind the two objects handed back by RLlib's torch import helper.
torch, nn = try_import_torch()


class PPOTorchPolicy(
    ValueNetworkMixin,
    LearningRateSchedule,
    EntropyCoeffSchedule,
    KLCoeffMixin,
    TorchPolicy,
):
    """PyTorch policy class used with PPOTrainer.

    Combines `TorchPolicy` with the value-network, learning-rate schedule,
    entropy-coefficient schedule and KL-coefficient mixins, and defines the
    PPO loss (clipped surrogate objective, optional clipped value-function
    loss, entropy term and optional KL penalty).
    """

    def __init__(self, observation_space, action_space, config):
        """Builds the policy and initializes its loss.

        Args:
            observation_space: Observation space, forwarded to
                `validate_config` and `TorchPolicy.__init__`.
            action_space: Action space, forwarded to `validate_config` and
                `TorchPolicy.__init__`.
            config: Partial config dict. Its entries override the values
                in PPO's `DEFAULT_CONFIG`.
        """
        # Layer the given config on top of PPO's defaults.
        config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
        validate_config(self, observation_space, action_space, config)

        TorchPolicy.__init__(
            self,
            observation_space,
            action_space,
            config,
            max_seq_len=config["model"]["max_seq_len"],
        )

        ValueNetworkMixin.__init__(self, config)
        EntropyCoeffSchedule.__init__(
            self, config["entropy_coeff"], config["entropy_coeff_schedule"]
        )
        LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"])

        # NOTE: `KLCoeffMixin.__init__` is not invoked here; the two KL
        # attributes are assigned directly instead.
        # The current KL value (as python float).
        self.kl_coeff = self.config["kl_coeff"]
        # Constant target value.
        self.kl_target = self.config["kl_target"]

        # TODO: Don't require users to call this manually.
        self._initialize_loss_from_dummy_batch()

    @override(TorchPolicy)
    def postprocess_trajectory(
        self, sample_batch, other_agent_batches=None, episode=None
    ):
        """Post-processes a sampled batch via `compute_gae_for_sample_batch`.

        Args:
            sample_batch: The batch to post-process.
            other_agent_batches: Passed through unchanged to the helper.
            episode: Passed through unchanged to the helper.

        Returns:
            Whatever `compute_gae_for_sample_batch` returns for the inputs.
        """
        # Do all post-processing always with no_grad().
        # Not using this here will introduce a memory leak
        # in torch (issue #6962).
        # TODO: no_grad still necessary?
        with torch.no_grad():
            return compute_gae_for_sample_batch(
                self, sample_batch, other_agent_batches, episode
            )

    # TODO: Add method to Policy base class (as the new way of defining loss
    #  functions (instead of passing 'loss` to the super's constructor)).
    @override(TorchPolicy)
    def loss(
        self,
        model: ModelV2,
        dist_class: Type[ActionDistribution],
        train_batch: SampleBatch,
    ) -> Union[TensorType, List[TensorType]]:
        """Constructs the loss for Proximal Policy Objective.

        Besides returning the total loss, this stores the individual loss
        terms in `model.tower_stats` so that `extra_grad_info` can report
        them later.

        Args:
            model: The Model to calculate the loss for.
            dist_class: The action distr. class.
            train_batch: The training data.

        Returns:
            The PPO loss tensor given the input batch.
        """

        logits, state = model(train_batch)
        curr_action_dist = dist_class(logits, model)

        # RNN case: Mask away 0-padded chunks at end of time axis.
        if state:
            B = len(train_batch[SampleBatch.SEQ_LENS])
            max_seq_len = logits.shape[0] // B
            mask = sequence_mask(
                train_batch[SampleBatch.SEQ_LENS],
                max_seq_len,
                time_major=model.is_time_major(),
            )
            mask = torch.reshape(mask, [-1])
            num_valid = torch.sum(mask)

            def reduce_mean_valid(t):
                # Mean over the non-padded entries only.
                return torch.sum(t[mask]) / num_valid

        # non-RNN case: No masking.
        else:
            reduce_mean_valid = torch.mean

        # Distribution that produced the sampled actions.
        prev_action_dist = dist_class(
            train_batch[SampleBatch.ACTION_DIST_INPUTS], model
        )

        # Probability ratio between the current and the sampling policy,
        # computed in log-space.
        logp_ratio = torch.exp(
            curr_action_dist.logp(train_batch[SampleBatch.ACTIONS])
            - train_batch[SampleBatch.ACTION_LOGP]
        )

        # Only calculate kl loss if necessary (kl-coeff > 0.0).
        if self.config["kl_coeff"] > 0.0:
            action_kl = prev_action_dist.kl(curr_action_dist)
            mean_kl_loss = reduce_mean_valid(action_kl)
        else:
            mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device)

        curr_entropy = curr_action_dist.entropy()
        mean_entropy = reduce_mean_valid(curr_entropy)

        # Clipped surrogate objective: elementwise minimum of the unclipped
        # and the ratio-clipped advantage terms.
        surrogate_loss = torch.min(
            train_batch[Postprocessing.ADVANTAGES] * logp_ratio,
            train_batch[Postprocessing.ADVANTAGES]
            * torch.clamp(
                logp_ratio, 1 - self.config["clip_param"], 1 + self.config["clip_param"]
            ),
        )
        mean_policy_loss = reduce_mean_valid(-surrogate_loss)

        # Compute a value function loss.
        if self.config["use_critic"]:
            value_fn_out = model.value_function()
            vf_loss = torch.pow(
                value_fn_out - train_batch[Postprocessing.VALUE_TARGETS], 2.0
            )
            vf_loss_clipped = torch.clamp(vf_loss, 0, self.config["vf_clip_param"])
            mean_vf_loss = reduce_mean_valid(vf_loss_clipped)
        # Ignore the value function.
        else:
            # Use zero *tensors* rather than plain Python numbers: the values
            # end up in `tower_stats` and are later passed to `torch.stack`
            # in `extra_grad_info`, which only accepts tensors.
            value_fn_out = torch.tensor(0.0).to(mean_policy_loss.device)
            vf_loss_clipped = mean_vf_loss = torch.tensor(0.0).to(
                mean_policy_loss.device
            )

        total_loss = reduce_mean_valid(
            -surrogate_loss
            + self.config["vf_loss_coeff"] * vf_loss_clipped
            - self.entropy_coeff * curr_entropy
        )

        # Add mean_kl_loss (already processed through `reduce_mean_valid`),
        # if necessary.
        if self.config["kl_coeff"] > 0.0:
            total_loss += self.kl_coeff * mean_kl_loss

        # Store values for stats function in model (tower), such that for
        # multi-GPU, we do not override them during the parallel loss phase.
        model.tower_stats["total_loss"] = total_loss
        model.tower_stats["mean_policy_loss"] = mean_policy_loss
        model.tower_stats["mean_vf_loss"] = mean_vf_loss
        model.tower_stats["vf_explained_var"] = explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS], value_fn_out
        )
        model.tower_stats["mean_entropy"] = mean_entropy
        model.tower_stats["mean_kl_loss"] = mean_kl_loss

        return total_loss

    # TODO: Make this an event-style subscription (e.g.:
    #  "after_gradients_computed").
    @override(TorchPolicy)
    def extra_grad_process(self, local_optimizer, loss):
        """Delegates gradient post-processing to `apply_grad_clipping`.

        Args:
            local_optimizer: The optimizer, forwarded to the helper.
            loss: The loss, forwarded to the helper.

        Returns:
            The return value of `apply_grad_clipping`.
        """
        return apply_grad_clipping(self, local_optimizer, loss)

    # TODO: Make this an event-style subscription (e.g.:
    #  "after_losses_computed").
    @override(TorchPolicy)
    def extra_grad_info(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
        """Reports the loss statistics recorded by `loss`.

        Each per-tower stat is stacked and averaged over all towers; the
        current KL coefficient, learning rate and entropy coefficient are
        reported as-is. The whole dict is passed through `convert_to_numpy`.

        Args:
            train_batch: The training batch (not read by this method).

        Returns:
            Dict mapping stat names to their converted values.
        """
        return convert_to_numpy(
            {
                "cur_kl_coeff": self.kl_coeff,
                "cur_lr": self.cur_lr,
                "total_loss": torch.mean(
                    torch.stack(self.get_tower_stats("total_loss"))
                ),
                "policy_loss": torch.mean(
                    torch.stack(self.get_tower_stats("mean_policy_loss"))
                ),
                "vf_loss": torch.mean(
                    torch.stack(self.get_tower_stats("mean_vf_loss"))
                ),
                "vf_explained_var": torch.mean(
                    torch.stack(self.get_tower_stats("vf_explained_var"))
                ),
                "kl": torch.mean(torch.stack(self.get_tower_stats("mean_kl_loss"))),
                "entropy": torch.mean(
                    torch.stack(self.get_tower_stats("mean_entropy"))
                ),
                "entropy_coeff": self.entropy_coeff,
            }
        )
[rllib] Guard against PPO value function not training with RNN models (#4037) * better lstm settings * 1.0 * docs * warn on truncate * clarify * Update ppo_policy_graph.py * Update ppo_policy_graph.py * Update ppo_policy_graph.py 2019-02-22 11:18:51 -08:00			`import logging`
[RLlib] PPO, APPO, and DD-PPO code cleanup. (#10420) 2020-09-02 14:03:01 +02:00			`from typing import Dict, List, Type, Union`
[rllib] PPO onto new RLlib APIs (#2270) 2018-06-28 09:49:08 -07:00
[rllib] Better support and add two-trainer example for multiagent (#2443) This adds a simple DQN+PPO example for multi-agent. We don't do anything fancy here, just syncing weights between two separate trainers. This potentially is wasting some compute, but is very simple to set up. It might be nice to share experience collection between the top-level trainers in the future. 2018-07-22 05:09:25 -07:00			`import ray`
[RLlib] Migrate MAML, MB-MPO, MARWIL, and BC to use Policy sub-classing implementation. (#24914) 2022-05-20 05:10:59 -07:00			`from ray.rllib.agents.ppo.ppo_tf_policy import validate_config`
[RLlib] Issue 9071 A3C w/ RNN not working due to VF assuming no RNN. (#13238) 2021-01-19 14:22:36 +01:00			`from ray.rllib.evaluation.postprocessing import (`
			`compute_gae_for_sample_batch,`
			`Postprocessing,`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`)`
[RLlib] PPO, APPO, and DD-PPO code cleanup. (#10420) 2020-09-02 14:03:01 +02:00			`from ray.rllib.models.modelv2 import ModelV2`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`from ray.rllib.models.action_dist import ActionDistribution`
[rllib] Rename PolicyGraph => Policy, move from evaluation/ to policy/ (#4819) This implements some of the renames proposed in #4813 We leave behind backwards-compatibility aliases for *PolicyGraph and SampleBatch. 2019-05-20 16:46:05 -07:00			`from ray.rllib.policy.sample_batch import SampleBatch`
[RLlib] Clean up Policy mixins. (#24746) 2022-05-17 08:16:08 -07:00			`from ray.rllib.policy.torch_mixins import (`
[RLlib] Implement PPO torch version. (#6826) 2020-01-21 08:06:50 +01:00			`EntropyCoeffSchedule,`
[RLlib] Clean up Policy mixins. (#24746) 2022-05-17 08:16:08 -07:00			`KLCoeffMixin,`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`LearningRateSchedule,`
[RLlib] Clean up Policy mixins. (#24746) 2022-05-17 08:16:08 -07:00			`ValueNetworkMixin,`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`)`
[RLlib] Clean up Policy mixins. (#24746) 2022-05-17 08:16:08 -07:00			`from ray.rllib.policy.torch_policy import TorchPolicy`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`from ray.rllib.utils.annotations import override`
[RLlib] Minor `rllib.utils` cleanup. (#8932) 2020-06-16 08:52:20 +02:00			`from ray.rllib.utils.framework import try_import_torch`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`from ray.rllib.utils.numpy import convert_to_numpy`
[RLlib] Fix deprecated warning for torch_ops.py (soft-replaced by torch_utils.py). (#19982) 2021-11-03 10:00:46 +01:00			`from ray.rllib.utils.torch_utils import (`
			`apply_grad_clipping,`
[RLlib] Remove all non-trajectory view API code. (#14860) 2021-03-23 17:50:18 +01:00			`explained_variance,`
			`sequence_mask,`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`)`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`from ray.rllib.utils.typing import TensorType`
[rllib] Remove dependency on TensorFlow (#4764) * remove hard tf dep * add test * comment fix * fix test 2019-05-10 20:36:18 -07:00
[RLlib] Implement PPO torch version. (#6826) 2020-01-21 08:06:50 +01:00			`torch, nn = try_import_torch()`
[rllib] PPO onto new RLlib APIs (#2270) 2018-06-28 09:49:08 -07:00
[rllib] Guard against PPO value function not training with RNN models (#4037) * better lstm settings * 1.0 * docs * warn on truncate * clarify * Update ppo_policy_graph.py * Update ppo_policy_graph.py * Update ppo_policy_graph.py 2019-02-22 11:18:51 -08:00			`logger = logging.getLogger(__name__)`

[rllib] PPO onto new RLlib APIs (#2270) 2018-06-28 09:49:08 -07:00
[RLlib] Clean up Policy mixins. (#24746) 2022-05-17 08:16:08 -07:00			`class PPOTorchPolicy(`
			`ValueNetworkMixin,`
			`LearningRateSchedule,`
			`EntropyCoeffSchedule,`
			`KLCoeffMixin,`
			`TorchPolicy,`
			`):`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`"""PyTorch policy class used with PPOTrainer."""`
[RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) 2021-11-15 10:41:54 +01:00
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`def __init__(self, observation_space, action_space, config):`
			`config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)`
[RLlib] Migrate MAML, MB-MPO, MARWIL, and BC to use Policy sub-classing implementation. (#24914) 2022-05-20 05:10:59 -07:00			`validate_config(self, observation_space, action_space, config)`
[rllib] Adds eager support with a generic `TFEagerPolicy` class (#5436) 2019-08-23 02:21:11 -04:00
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`TorchPolicy.__init__(`
			`self,`
			`observation_space,`
			`action_space,`
			`config,`
			`max_seq_len=config["model"]["max_seq_len"],`
			`)`
Revert "[RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061)" (#20399) This reverts commit 5b1c8e46e16b04b46463eaef523e661f4e31e8ff. 2021-11-15 16:11:35 -08:00
[RLlib] Clean up Policy mixins. (#24746) 2022-05-17 08:16:08 -07:00			`ValueNetworkMixin.__init__(self, config)`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`EntropyCoeffSchedule.__init__(`
			`self, config["entropy_coeff"], config["entropy_coeff_schedule"]`
			`)`
			`LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"])`
Revert "[RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061)" (#20399) This reverts commit 5b1c8e46e16b04b46463eaef523e661f4e31e8ff. 2021-11-15 16:11:35 -08:00
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`# The current KL value (as python float).`
			`self.kl_coeff = self.config["kl_coeff"]`
			`# Constant target value.`
			`self.kl_target = self.config["kl_target"]`

			`# TODO: Don't require users to call this manually.`
			`self._initialize_loss_from_dummy_batch()`

			`@override(TorchPolicy)`
			`def postprocess_trajectory(`
			`self, sample_batch, other_agent_batches=None, episode=None`
			`):`
			`# Do all post-processing always with no_grad().`
			`# Not using this here will introduce a memory leak`
			`# in torch (issue #6962).`
			`# TODO: no_grad still necessary?`
			`with torch.no_grad():`
			`return compute_gae_for_sample_batch(`
			`self, sample_batch, other_agent_batches, episode`
			`)`

			`# TODO: Add method to Policy base class (as the new way of defining loss`
			# functions (instead of passing 'loss` to the super's constructor)).
			`@override(TorchPolicy)`
			`def loss(`
			`self,`
			`model: ModelV2,`
			`dist_class: Type[ActionDistribution],`
			`train_batch: SampleBatch,`
			`) -> Union[TensorType, List[TensorType]]:`
			`"""Constructs the loss for Proximal Policy Objective.`

			`Args:`
			`model: The Model to calculate the loss for.`
			`dist_class: The action distr. class.`
			`train_batch: The training data.`

			`Returns:`
			`The PPO loss tensor given the input batch.`
			`"""`

			`logits, state = model(train_batch)`
			`curr_action_dist = dist_class(logits, model)`

			`# RNN case: Mask away 0-padded chunks at end of time axis.`
			`if state:`
			`B = len(train_batch[SampleBatch.SEQ_LENS])`
			`max_seq_len = logits.shape[0] // B`
			`mask = sequence_mask(`
			`train_batch[SampleBatch.SEQ_LENS],`
			`max_seq_len,`
			`time_major=model.is_time_major(),`
			`)`
			`mask = torch.reshape(mask, [-1])`
			`num_valid = torch.sum(mask)`

			`def reduce_mean_valid(t):`
			`return torch.sum(t[mask]) / num_valid`

			`# non-RNN case: No masking.`
			`else:`
			`mask = None`
			`reduce_mean_valid = torch.mean`

			`prev_action_dist = dist_class(`
			`train_batch[SampleBatch.ACTION_DIST_INPUTS], model`
			`)`

			`logp_ratio = torch.exp(`
			`curr_action_dist.logp(train_batch[SampleBatch.ACTIONS])`
			`- train_batch[SampleBatch.ACTION_LOGP]`
			`)`
[RLlib] Issue 21297: Ignore PPO KL-loss term completely if kl-coeff == 0.0 to avoid NaN values due to some discrete action probs==0.0 (#21456) 2022-01-10 11:22:40 +01:00
			`# Only calculate kl loss if necessary (kl-coeff > 0.0).`
			`if self.config["kl_coeff"] > 0.0:`
			`action_kl = prev_action_dist.kl(curr_action_dist)`
			`mean_kl_loss = reduce_mean_valid(action_kl)`
			`else:`
			`mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device)`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00
			`curr_entropy = curr_action_dist.entropy()`
			`mean_entropy = reduce_mean_valid(curr_entropy)`

			`surrogate_loss = torch.min(`
			`train_batch[Postprocessing.ADVANTAGES] * logp_ratio,`
			`train_batch[Postprocessing.ADVANTAGES]`
			`* torch.clamp(`
			`logp_ratio, 1 - self.config["clip_param"], 1 + self.config["clip_param"]`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`),`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`)`
			`mean_policy_loss = reduce_mean_valid(-surrogate_loss)`

			`# Compute a value function loss.`
			`if self.config["use_critic"]:`
			`value_fn_out = model.value_function()`
[RLlib] Fix zero gradients for ppo-clipped vf (#22171) 2022-02-15 07:57:18 +00:00			`vf_loss = torch.pow(`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`value_fn_out - train_batch[Postprocessing.VALUE_TARGETS], 2.0`
			`)`
[RLlib] Fix zero gradients for ppo-clipped vf (#22171) 2022-02-15 07:57:18 +00:00			`vf_loss_clipped = torch.clamp(vf_loss, 0, self.config["vf_clip_param"])`
			`mean_vf_loss = reduce_mean_valid(vf_loss_clipped)`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`# Ignore the value function.`
			`else:`
[RLlib] PPOTorchPolicy: Remove extra call to `model.value_function` (#23671) 2022-04-05 07:40:29 +01:00			`value_fn_out = 0`
[RLlib] Fix zero gradients for ppo-clipped vf (#22171) 2022-02-15 07:57:18 +00:00			`vf_loss_clipped = mean_vf_loss = 0.0`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00
			`total_loss = reduce_mean_valid(`
			`-surrogate_loss`
[RLlib] Fix zero gradients for ppo-clipped vf (#22171) 2022-02-15 07:57:18 +00:00			`+ self.config["vf_loss_coeff"] * vf_loss_clipped`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`- self.entropy_coeff * curr_entropy`
			`)`

[RLlib] Issue 21297: Ignore PPO KL-loss term completely if kl-coeff == 0.0 to avoid NaN values due to some discrete action probs==0.0 (#21456) 2022-01-10 11:22:40 +01:00			# Add mean_kl_loss (already processed through `reduce_mean_valid`),
			`# if necessary.`
			`if self.config["kl_coeff"] > 0.0:`
			`total_loss += self.kl_coeff * mean_kl_loss`

Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`# Store values for stats function in model (tower), such that for`
			`# multi-GPU, we do not override them during the parallel loss phase.`
			`model.tower_stats["total_loss"] = total_loss`
			`model.tower_stats["mean_policy_loss"] = mean_policy_loss`
			`model.tower_stats["mean_vf_loss"] = mean_vf_loss`
			`model.tower_stats["vf_explained_var"] = explained_variance(`
[RLlib] PPOTorchPolicy: Remove extra call to `model.value_function` (#23671) 2022-04-05 07:40:29 +01:00			`train_batch[Postprocessing.VALUE_TARGETS], value_fn_out`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`)`
Revert "Revert [RLlib] POC: Deprecate `build_policy` (policy template) for torch only; PPOTorchPolicy (#20061) (#20399)" (#20417) This reverts commit 90dc5460d414df1f646a1be3a2b3bb42fe6a8777. 2021-11-16 14:49:41 +01:00			`model.tower_stats["mean_entropy"] = mean_entropy`
			`model.tower_stats["mean_kl_loss"] = mean_kl_loss`

			`return total_loss`

			`# TODO: Make this an event-style subscription (e.g.:`
			`# "after_gradients_computed").`
			`@override(TorchPolicy)`
			`def extra_grad_process(self, local_optimizer, loss):`
			`return apply_grad_clipping(self, local_optimizer, loss)`

			`# TODO: Make this an event-style subscription (e.g.:`
			`# "after_losses_computed").`
			`@override(TorchPolicy)`
			`def extra_grad_info(self, train_batch: SampleBatch) -> Dict[str, TensorType]:`
			`return convert_to_numpy(`
			`{`
			`"cur_kl_coeff": self.kl_coeff,`
			`"cur_lr": self.cur_lr,`
			`"total_loss": torch.mean(`
			`torch.stack(self.get_tower_stats("total_loss"))`
			`),`
			`"policy_loss": torch.mean(`
			`torch.stack(self.get_tower_stats("mean_policy_loss"))`
			`),`
			`"vf_loss": torch.mean(`
			`torch.stack(self.get_tower_stats("mean_vf_loss"))`
			`),`
			`"vf_explained_var": torch.mean(`
			`torch.stack(self.get_tower_stats("vf_explained_var"))`
			`),`
			`"kl": torch.mean(torch.stack(self.get_tower_stats("mean_kl_loss"))),`
			`"entropy": torch.mean(`
			`torch.stack(self.get_tower_stats("mean_entropy"))`
			`),`
			`"entropy_coeff": self.entropy_coeff,`
			`}`
			`)`