"""
TensorFlow policy class used for PPO.
"""
import logging
from typing import Dict, List, Type, Union
import ray
from ray.rllib.evaluation.postprocessing import (
    Postprocessing,
    compute_gae_for_sample_batch,
)
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_action_dist import TFActionDistribution
from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2
from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_mixins import (
    EntropyCoeffSchedule,
    LearningRateSchedule,
    KLCoeffMixin,
    ValueNetworkMixin,
    compute_gradients,
)
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.tf_utils import explained_variance
from ray.rllib.utils.typing import (
    LocalOptimizer,
    ModelGradients,
    TensorType,
    TrainerConfigDict,
)

tf1, tf, tfv = try_import_tf()

logger = logging.getLogger(__name__)


def validate_config(config: TrainerConfigDict) -> None:
"""Executed before Policy is "initialized" (at beginning of constructor).
Args:
config (TrainerConfigDict): The Policy's config.
"""
# If vf_share_layers is True, inform about the need to tune vf_loss_coeff.
if config.get("model", {}).get("vf_share_layers") is True:
logger.info(
"`vf_share_layers=True` in your model. "
"Therefore, remember to tune the value of `vf_loss_coeff`!"
)
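

# A hypothetical example of a config that would trigger the hint above (the
# `model.vf_share_layers` and `vf_loss_coeff` keys are real PPO options; the
# values here are made up for illustration):
#
#   config = {"model": {"vf_share_layers": True}, "vf_loss_coeff": 0.5}
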
# We need this builder function because we want to share the same
# custom logic between TF1 dynamic and TF2 eager policies.
def get_ppo_tf_policy(base: type) -> type:
    """Construct a PPOTFPolicy inheriting either dynamic or eager base policies.

    Args:
        base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2.

    Returns:
        A TF Policy to be used with PPOTrainer.
    """

    class PPOTFPolicy(
        EntropyCoeffSchedule,
        LearningRateSchedule,
        KLCoeffMixin,
        ValueNetworkMixin,
        base,
    ):
        def __init__(
            self,
            obs_space,
            action_space,
            config,
            existing_model=None,
            existing_inputs=None,
        ):
            # First things first: enable eager execution, if necessary.
            base.enable_eager_execution_if_necessary()

            config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
            validate_config(config)

            # Initialize base class.
            base.__init__(
                self,
                obs_space,
                action_space,
                config,
                existing_inputs=existing_inputs,
                existing_model=existing_model,
            )

            # Initialize MixIns.
            ValueNetworkMixin.__init__(self, config)
            KLCoeffMixin.__init__(self, config)
            EntropyCoeffSchedule.__init__(
                self, config["entropy_coeff"], config["entropy_coeff_schedule"]
            )
            LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"])

            # Note: this is a bit ugly, but loss and optimizer initialization
            # must happen after all the MixIns are initialized.
            self.maybe_initialize_optimizer_and_loss()

        @override(base)
        def loss(
            self,
            model: Union[ModelV2, "tf.keras.Model"],
            dist_class: Type[TFActionDistribution],
            train_batch: SampleBatch,
        ) -> Union[TensorType, List[TensorType]]:
            """Compute the loss for the Proximal Policy Objective.

            Args:
                model: The Model to calculate the loss for.
                dist_class: The action distribution class.
                train_batch: The training data.

            Returns:
                The PPO loss tensor given the input batch.
            """
            if isinstance(model, tf.keras.Model):
                logits, state, extra_outs = model(train_batch)
                value_fn_out = extra_outs[SampleBatch.VF_PREDS]
            else:
                logits, state = model(train_batch)
                value_fn_out = model.value_function()

            curr_action_dist = dist_class(logits, model)

            # RNN case: Mask away 0-padded chunks at the end of the time axis.
            if state:
                # Derive max_seq_len from the data itself, not from the
                # seq_lens tensor. This is in case e.g. seq_lens=[2, 3], but
                # the data is still 0-padded up to T=5 (as is the case for
                # attention nets).
                B = tf.shape(train_batch[SampleBatch.SEQ_LENS])[0]
                max_seq_len = tf.shape(logits)[0] // B
                mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len)
                mask = tf.reshape(mask, [-1])

                def reduce_mean_valid(t):
                    return tf.reduce_mean(tf.boolean_mask(t, mask))

            # Non-RNN case: No masking.
            else:
                mask = None
                reduce_mean_valid = tf.reduce_mean

            prev_action_dist = dist_class(
                train_batch[SampleBatch.ACTION_DIST_INPUTS], model
            )
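            # Importance sampling ratio r = pi_new(a|s) / pi_old(a|s), computed
            # in log space for numerical stability. The behavior policy's
            # log-prob was recorded at sampling time under ACTION_LOGP.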
            logp_ratio = tf.exp(
                curr_action_dist.logp(train_batch[SampleBatch.ACTIONS])
                - train_batch[SampleBatch.ACTION_LOGP]
            )

            # Only calculate kl loss if necessary (kl-coeff > 0.0).
            if self.config["kl_coeff"] > 0.0:
                action_kl = prev_action_dist.kl(curr_action_dist)
                mean_kl_loss = reduce_mean_valid(action_kl)
            else:
                mean_kl_loss = tf.constant(0.0)

            curr_entropy = curr_action_dist.entropy()
            mean_entropy = reduce_mean_valid(curr_entropy)
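
            # PPO's clipped surrogate objective (the pessimistic bound from
            # Schulman et al., 2017):
            #   L_clip = min(r * A, clip(r, 1 - clip_param, 1 + clip_param) * A)
            # where r is `logp_ratio` and A are the GAE advantages.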
            surrogate_loss = tf.minimum(
                train_batch[Postprocessing.ADVANTAGES] * logp_ratio,
                train_batch[Postprocessing.ADVANTAGES]
                * tf.clip_by_value(
                    logp_ratio,
                    1 - self.config["clip_param"],
                    1 + self.config["clip_param"],
                ),
            )
            mean_policy_loss = reduce_mean_valid(-surrogate_loss)

            # Compute a value function loss.
            if self.config["use_critic"]:
                vf_loss = tf.math.square(
                    value_fn_out - train_batch[Postprocessing.VALUE_TARGETS]
                )
                vf_loss_clipped = tf.clip_by_value(
                    vf_loss,
                    0,
                    self.config["vf_clip_param"],
                )
                mean_vf_loss = reduce_mean_valid(vf_loss_clipped)
            # Ignore the value function.
            else:
                vf_loss_clipped = mean_vf_loss = tf.constant(0.0)
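
            # Combine into the total PPO objective: policy surrogate loss,
            # weighted value function loss, and an entropy bonus (subtracted,
            # since we minimize). The KL penalty term is added below.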
            total_loss = reduce_mean_valid(
                -surrogate_loss
                + self.config["vf_loss_coeff"] * vf_loss_clipped
                - self.entropy_coeff * curr_entropy
            )
            # Add mean_kl_loss (already processed through `reduce_mean_valid`),
            # if necessary.
            if self.config["kl_coeff"] > 0.0:
                total_loss += self.kl_coeff * mean_kl_loss

            # Store stats in policy for stats_fn.
            self._total_loss = total_loss
            self._mean_policy_loss = mean_policy_loss
            self._mean_vf_loss = mean_vf_loss
            self._mean_entropy = mean_entropy
            # Backward compatibility: Deprecate self._mean_kl.
            self._mean_kl_loss = self._mean_kl = mean_kl_loss
            self._value_fn_out = value_fn_out

            return total_loss

        @override(base)
        def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
            return {
                "cur_kl_coeff": tf.cast(self.kl_coeff, tf.float64),
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "total_loss": self._total_loss,
                "policy_loss": self._mean_policy_loss,
                "vf_loss": self._mean_vf_loss,
                "vf_explained_var": explained_variance(
                    train_batch[Postprocessing.VALUE_TARGETS], self._value_fn_out
                ),
                "kl": self._mean_kl_loss,
                "entropy": self._mean_entropy,
                "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),
            }

        @override(base)
        def postprocess_trajectory(
            self, sample_batch, other_agent_batches=None, episode=None
        ):
            sample_batch = super().postprocess_trajectory(sample_batch)
            return compute_gae_for_sample_batch(
                self, sample_batch, other_agent_batches, episode
            )

        @override(base)
        def compute_gradients_fn(
            self, optimizer: LocalOptimizer, loss: TensorType
        ) -> ModelGradients:
            return compute_gradients(self, optimizer, loss)

    return PPOTFPolicy


PPOStaticGraphTFPolicy = get_ppo_tf_policy(DynamicTFPolicyV2)
PPOEagerTFPolicy = get_ppo_tf_policy(EagerTFPolicyV2)
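

# A minimal usage sketch (for illustration; not part of this module). In the
# agents-era RLlib API, these policy classes are normally not instantiated
# directly: PPOTrainer picks one based on the configured `framework`
# ("tf" -> static graph, "tf2"/"tfe" -> eager):
#
#   import ray
#   from ray.rllib.agents.ppo import PPOTrainer
#
#   ray.init()
#   trainer = PPOTrainer(env="CartPole-v1", config={"framework": "tf2"})
#   print(trainer.train()["episode_reward_mean"])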