ray/rllib/algorithms/a3c/a3c_tf_policy.py

"""Note: Keep in sync with changes to VTraceTFPolicy."""
from typing import Dict, List, Optional, Type, Union

import ray
from ray.rllib.evaluation.episode import Episode
from ray.rllib.evaluation.postprocessing import (
    compute_gae_for_sample_batch,
    Postprocessing,
)
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_action_dist import TFActionDistribution
from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2
from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_mixins import (
    compute_gradients,
    EntropyCoeffSchedule,
    LearningRateSchedule,
    ValueNetworkMixin,
)
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import Deprecated
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.tf_utils import explained_variance
from ray.rllib.utils.typing import (
    AgentID,
    LocalOptimizer,
    ModelGradients,
    TensorType,
    TFPolicyV2Type,
)

tf1, tf, tfv = try_import_tf()


# We need this builder function because we want to share the same
# custom logics between TF1 dynamic and TF2 eager policies.
def get_a3c_tf_policy(name: str, base: TFPolicyV2Type) -> TFPolicyV2Type:
    """Construct a A3CTFPolicy inheriting either dynamic or eager base policies.

    Args:
        base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2.

    Returns:
        A TF Policy to be used with MAML.
    """

    class A3CTFPolicy(
        ValueNetworkMixin, LearningRateSchedule, EntropyCoeffSchedule, base
    ):
        def __init__(
            self,
            obs_space,
            action_space,
            config,
            existing_model=None,
            existing_inputs=None,
        ):
            # First thing first, enable eager execution if necessary.
            base.enable_eager_execution_if_necessary()

            config = dict(ray.rllib.algorithms.a3c.a3c.A3CConfig().to_dict(), **config)

            # Initialize base class.
            base.__init__(
                self,
                obs_space,
                action_space,
                config,
                existing_inputs=existing_inputs,
                existing_model=existing_model,
            )

            ValueNetworkMixin.__init__(self, self.config)
            LearningRateSchedule.__init__(
                self, self.config["lr"], self.config["lr_schedule"]
            )
            EntropyCoeffSchedule.__init__(
                self, config["entropy_coeff"], config["entropy_coeff_schedule"]
            )

            # Note: this is a bit ugly, but loss and optimizer initialization must
            # happen after all the MixIns are initialized.
            self.maybe_initialize_optimizer_and_loss()

        @override(base)
        def loss(
            self,
            model: Union[ModelV2, "tf.keras.Model"],
            dist_class: Type[TFActionDistribution],
            train_batch: SampleBatch,
        ) -> Union[TensorType, List[TensorType]]:

            model_out, _ = model(train_batch)
            action_dist = dist_class(model_out, model)
            if self.is_recurrent():
                max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS])
                valid_mask = tf.sequence_mask(
                    train_batch[SampleBatch.SEQ_LENS], max_seq_len
                )
                valid_mask = tf.reshape(valid_mask, [-1])
            else:
                valid_mask = tf.ones_like(train_batch[SampleBatch.REWARDS])

            log_prob = action_dist.logp(train_batch[SampleBatch.ACTIONS])
            vf = model.value_function()

            # The "policy gradients" loss
            self.pi_loss = -tf.reduce_sum(
                tf.boolean_mask(
                    log_prob * train_batch[Postprocessing.ADVANTAGES], valid_mask
                )
            )

            delta = tf.boolean_mask(
                vf - train_batch[Postprocessing.VALUE_TARGETS], valid_mask
            )

            # Compute a value function loss.
            if self.config.get("use_critic", True):
                self.vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta))
            # Ignore the value function.
            else:
                self.vf_loss = tf.constant(0.0)

            self.entropy_loss = tf.reduce_sum(
                tf.boolean_mask(action_dist.entropy(), valid_mask)
            )

            self.total_loss = (
                self.pi_loss
                + self.vf_loss * self.config["vf_loss_coeff"]
                - self.entropy_loss * self.entropy_coeff
            )

            return self.total_loss

        @override(base)
        def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
            return {
                "cur_lr": tf.cast(self.cur_lr, tf.float64),
                "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),
                "policy_loss": self.pi_loss,
                "policy_entropy": self.entropy_loss,
                "var_gnorm": tf.linalg.global_norm(
                    list(self.model.trainable_variables())
                ),
                "vf_loss": self.vf_loss,
            }

        @override(base)
        def grad_stats_fn(
            self, train_batch: SampleBatch, grads: ModelGradients
        ) -> Dict[str, TensorType]:
            return {
                "grad_gnorm": tf.linalg.global_norm(grads),
                "vf_explained_var": explained_variance(
                    train_batch[Postprocessing.VALUE_TARGETS],
                    self.model.value_function(),
                ),
            }

        @override(base)
        def postprocess_trajectory(
            self,
            sample_batch: SampleBatch,
            other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
            episode: Optional[Episode] = None,
        ):
            sample_batch = super().postprocess_trajectory(sample_batch)
            return compute_gae_for_sample_batch(
                self, sample_batch, other_agent_batches, episode
            )

        @override(base)
        def compute_gradients_fn(
            self, optimizer: LocalOptimizer, loss: TensorType
        ) -> ModelGradients:
            return compute_gradients(self, optimizer, loss)

    A3CTFPolicy.__name__ = name
    A3CTFPolicy.__qualname__ = name

    return A3CTFPolicy


A3CTF1Policy = get_a3c_tf_policy("A3CTF1Policy", DynamicTFPolicyV2)
A3CTF2Policy = get_a3c_tf_policy("A3CTF2Policy", EagerTFPolicyV2)


@Deprecated(
    old="rllib.algorithms.a3c.a3c_tf_policy.postprocess_advantages",
    new="rllib.evaluation.postprocessing.compute_gae_for_sample_batch",
    error=True,
)
def postprocess_advantages(*args, **kwargs):
    pass
[rllib] Rename PolicyGraph => Policy, move from evaluation/ to policy/ (#4819) This implements some of the renames proposed in #4813 We leave behind backwards-compatibility aliases for *PolicyGraph and SampleBatch. 2019-05-20 16:46:05 -07:00			`"""Note: Keep in sync with changes to VTraceTFPolicy."""`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`from typing import Dict, List, Optional, Type, Union`
[rllib] Basic IMPALA implementation (using deepmind's reference vtrace.py) (#2504) Rename AsyncSamplesOptimizer -> AsyncReplayOptimizer Add AsyncSamplesOptimizer that implements the IMPALA architecture integrate V-trace with a3c policy graph audit V-trace integration benchmark compare vs A3C and with V-trace on/off PongNoFrameskip-v4 on IMPALA scaling from 16 to 128 workers, solving Pong in <10 min. For reference, solving this env takes ~40 minutes for Ape-X and several hours for A3C. 2018-08-01 20:53:53 -07:00
[rllib] Part 2 of multiagent support (#2286) * wip * cls * re * wip * wip * a3c working * torch support * pg works * lint * rm v2 * consumer id * clean up pg * clean up more * fix python 2.7 * tf session management * docs * dqn wip * fix compile * dqn * apex runs * up * impotrs * ddpg * quotes * fix tests * fix last r * fix tests * lint * pass checkpoint restore * kwar * nits * policy graph * fix yapf * com * class * pyt * vectorization * update * test cpe * unit test * fix ddpg2 * changes * wip * args * faster test * common * fix * add alg option * batch mode and policy serving * multi serving test * todo * wip * serving test * doc async env * num envs * comments * thread * remove init hook * update * fix ppo * comments1 * fix * updates * add jenkins tests * fix * fix pytorch * fix * fixes * fix a3c policy * fix squeeze * fix trunc on apex * fix squeezing for real * update * remove horizon test for now * multiagent wip * update * fix race condition * fix ma * t * doc * st * wip * example * wip * working * cartpole * wip * batch wip * fix bug * make other_batches None default * working * debug * nit * warn * comments * fix ppo * fix obs filter * update * fix obs filter * pass thru worker index * fix * fix log action * debug name * fix sphinx 2018-06-25 22:33:57 -07:00			`import ray`
[RLlib; Docs overhaul] Docstring cleanup: Evaluation (#19783) 2021-10-29 12:03:56 +02:00			`from ray.rllib.evaluation.episode import Episode`
[RLlib] Issue 9071 A3C w/ RNN not working due to VF assuming no RNN. (#13238) 2021-01-19 14:22:36 +01:00			`from ray.rllib.evaluation.postprocessing import (`
			`compute_gae_for_sample_batch,`
[rllib] Minor cleanups to TFPolicyGraph: add init args, constants for loss inputs (#4478) 2019-03-29 12:44:23 -07:00			`Postprocessing,`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`)`
[RLlib; Docs overhaul] Docstring cleanup: Evaluation (#19783) 2021-10-29 12:03:56 +02:00			`from ray.rllib.models.modelv2 import ModelV2`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`from ray.rllib.models.tf.tf_action_dist import TFActionDistribution`
			`from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2`
			`from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2`
			`from ray.rllib.policy.sample_batch import SampleBatch`
[RLlib] Clean up Policy mixins. (#24746) 2022-05-17 08:16:08 -07:00			`from ray.rllib.policy.tf_mixins import (`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`compute_gradients,`
[RLlib] Clean up Policy mixins. (#24746) 2022-05-17 08:16:08 -07:00			`EntropyCoeffSchedule,`
			`LearningRateSchedule,`
			`ValueNetworkMixin,`
			`)`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`from ray.rllib.utils.annotations import override`
[RLlib; Docs overhaul] Docstring cleanup: rllib/utils (#19829) 2021-11-01 21:46:02 +01:00			`from ray.rllib.utils.deprecation import Deprecated`
[RLlib] Minor `rllib.utils` cleanup. (#8932) 2020-06-16 08:52:20 +02:00			`from ray.rllib.utils.framework import try_import_tf`
[RLlib; Docs overhaul] Docstring cleanup: rllib/utils (#19829) 2021-11-01 21:46:02 +01:00			`from ray.rllib.utils.tf_utils import explained_variance`
[RLlib] DDPG/TD3 + A3C/A2C + MARWIL/BC Annotation/Comments/Code Cleanup (#14707) 2021-05-19 07:32:29 -07:00			`from ray.rllib.utils.typing import (`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`AgentID,`
[RLlib] DDPG/TD3 + A3C/A2C + MARWIL/BC Annotation/Comments/Code Cleanup (#14707) 2021-05-19 07:32:29 -07:00			`LocalOptimizer,`
			`ModelGradients,`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`TensorType,`
			`TFPolicyV2Type,`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`)`
[rllib] Remove dependency on TensorFlow (#4764) * remove hard tf dep * add test * comment fix * fix test 2019-05-10 20:36:18 -07:00
[RLlib] Tf2x preparation; part 2 (upgrading `try_import_tf()`). (#9136) * WIP. * Fixes. * LINT. * WIP. * WIP. * Fixes. * Fixes. * Fixes. * Fixes. * WIP. * Fixes. * Test * Fix. * Fixes and LINT. * Fixes and LINT. * LINT. 2020-06-30 10:13:20 +02:00			`tf1, tf, tfv = try_import_tf()`
[rllib] Refactor rllib to have a common sample collection pathway (#2149) 2018-06-09 00:21:35 -07:00

[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`# We need this builder function because we want to share the same`
			`# custom logics between TF1 dynamic and TF2 eager policies.`
[RLlib] Fix a bunch of issues related to connectors. (#26510) 2022-07-13 09:55:20 -07:00			`def get_a3c_tf_policy(name: str, base: TFPolicyV2Type) -> TFPolicyV2Type:`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`"""Construct a A3CTFPolicy inheriting either dynamic or eager base policies.`

			`Args:`
			`base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2.`

			`Returns:`
[RLlib] Move all remaining algos into `algorithms` directory. (#25366) 2022-06-04 07:35:24 +02:00			`A TF Policy to be used with MAML.`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`"""`

			`class A3CTFPolicy(`
			`ValueNetworkMixin, LearningRateSchedule, EntropyCoeffSchedule, base`
			`):`
			`def __init__(`
			`self,`
			`obs_space,`
			`action_space,`
			`config,`
			`existing_model=None,`
			`existing_inputs=None,`
			`):`
			`# First thing first, enable eager execution if necessary.`
			`base.enable_eager_execution_if_necessary()`

[RLlib] A2C + A3C move to `algorithms` folder and re-name into A2C/A3C (from ...Trainer). (#25314) 2022-06-01 09:29:16 +02:00			`config = dict(ray.rllib.algorithms.a3c.a3c.A3CConfig().to_dict(), **config)`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00
			`# Initialize base class.`
			`base.__init__(`
			`self,`
			`obs_space,`
			`action_space,`
			`config,`
			`existing_inputs=existing_inputs,`
			`existing_model=existing_model,`
			`)`

			`ValueNetworkMixin.__init__(self, self.config)`
			`LearningRateSchedule.__init__(`
			`self, self.config["lr"], self.config["lr_schedule"]`
			`)`
			`EntropyCoeffSchedule.__init__(`
			`self, config["entropy_coeff"], config["entropy_coeff_schedule"]`
			`)`

			`# Note: this is a bit ugly, but loss and optimizer initialization must`
			`# happen after all the MixIns are initialized.`
			`self.maybe_initialize_optimizer_and_loss()`

			`@override(base)`
			`def loss(`
			`self,`
			`model: Union[ModelV2, "tf.keras.Model"],`
			`dist_class: Type[TFActionDistribution],`
			`train_batch: SampleBatch,`
			`) -> Union[TensorType, List[TensorType]]:`

			`model_out, _ = model(train_batch)`
			`action_dist = dist_class(model_out, model)`
			`if self.is_recurrent():`
			`max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS])`
			`valid_mask = tf.sequence_mask(`
			`train_batch[SampleBatch.SEQ_LENS], max_seq_len`
			`)`
			`valid_mask = tf.reshape(valid_mask, [-1])`
			`else:`
			`valid_mask = tf.ones_like(train_batch[SampleBatch.REWARDS])`

			`log_prob = action_dist.logp(train_batch[SampleBatch.ACTIONS])`
			`vf = model.value_function()`

			`# The "policy gradients" loss`
			`self.pi_loss = -tf.reduce_sum(`
			`tf.boolean_mask(`
			`log_prob * train_batch[Postprocessing.ADVANTAGES], valid_mask`
			`)`
			`)`

			`delta = tf.boolean_mask(`
			`vf - train_batch[Postprocessing.VALUE_TARGETS], valid_mask`
			`)`

			`# Compute a value function loss.`
			`if self.config.get("use_critic", True):`
			`self.vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta))`
			`# Ignore the value function.`
			`else:`
			`self.vf_loss = tf.constant(0.0)`

			`self.entropy_loss = tf.reduce_sum(`
			`tf.boolean_mask(action_dist.entropy(), valid_mask)`
			`)`

			`self.total_loss = (`
			`self.pi_loss`
			`+ self.vf_loss * self.config["vf_loss_coeff"]`
			`- self.entropy_loss * self.entropy_coeff`
			`)`

			`return self.total_loss`

			`@override(base)`
			`def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:`
			`return {`
			`"cur_lr": tf.cast(self.cur_lr, tf.float64),`
			`"entropy_coeff": tf.cast(self.entropy_coeff, tf.float64),`
			`"policy_loss": self.pi_loss,`
			`"policy_entropy": self.entropy_loss,`
			`"var_gnorm": tf.linalg.global_norm(`
			`list(self.model.trainable_variables())`
			`),`
			`"vf_loss": self.vf_loss,`
			`}`

			`@override(base)`
			`def grad_stats_fn(`
			`self, train_batch: SampleBatch, grads: ModelGradients`
			`) -> Dict[str, TensorType]:`
			`return {`
			`"grad_gnorm": tf.linalg.global_norm(grads),`
			`"vf_explained_var": explained_variance(`
			`train_batch[Postprocessing.VALUE_TARGETS],`
			`self.model.value_function(),`
			`),`
			`}`

			`@override(base)`
			`def postprocess_trajectory(`
			`self,`
			`sample_batch: SampleBatch,`
			`other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,`
			`episode: Optional[Episode] = None,`
			`):`
			`sample_batch = super().postprocess_trajectory(sample_batch)`
			`return compute_gae_for_sample_batch(`
			`self, sample_batch, other_agent_batches, episode`
			`)`

			`@override(base)`
			`def compute_gradients_fn(`
			`self, optimizer: LocalOptimizer, loss: TensorType`
			`) -> ModelGradients:`
			`return compute_gradients(self, optimizer, loss)`

[RLlib] Fix a bunch of issues related to connectors. (#26510) 2022-07-13 09:55:20 -07:00			`A3CTFPolicy.__name__ = name`
			`A3CTFPolicy.__qualname__ = name`

[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`return A3CTFPolicy`


[RLlib] Fix a bunch of issues related to connectors. (#26510) 2022-07-13 09:55:20 -07:00			`A3CTF1Policy = get_a3c_tf_policy("A3CTF1Policy", DynamicTFPolicyV2)`
			`A3CTF2Policy = get_a3c_tf_policy("A3CTF2Policy", EagerTFPolicyV2)`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00

[RLlib] Add @Deprecated decorator to simplify/unify deprecation of classes, methods, functions. (#17530) 2021-08-03 18:30:02 -04:00			`@Deprecated(`
[RLlib] A2C + A3C move to `algorithms` folder and re-name into A2C/A3C (from ...Trainer). (#25314) 2022-06-01 09:29:16 +02:00			`old="rllib.algorithms.a3c.a3c_tf_policy.postprocess_advantages",`
[RLlib] Add @Deprecated decorator to simplify/unify deprecation of classes, methods, functions. (#17530) 2021-08-03 18:30:02 -04:00			`new="rllib.evaluation.postprocessing.compute_gae_for_sample_batch",`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`error=True,`
[RLlib] Added LearningRateSchedule and EntropyCoeffSchedule to TF and Torch versions of A3C and PPO (#19276) 2021-10-25 10:39:35 +03:00			`)`
[RLlib] A2/3C policy sub-classing schema. (#25078) 2022-05-28 09:54:47 +02:00			`def postprocess_advantages(args, *kwargs):`
			`pass`