"""Note: Keep in sync with changes to VTraceTFPolicy."""

from typing import Optional, Dict

import gym

import ray
from ray.rllib.agents.ppo.ppo_tf_policy import ValueNetworkMixin
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.evaluation.postprocessing import compute_gae_for_sample_batch, \
    Postprocessing
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.policy.tf_policy import LearningRateSchedule
from ray.rllib.utils.annotations import Deprecated
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.tf_ops import explained_variance
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.typing import TrainerConfigDict, TensorType, \
    PolicyID, LocalOptimizer, ModelGradients
from ray.rllib.models.action_dist import ActionDistribution
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.evaluation import MultiAgentEpisode

tf1, tf, tfv = try_import_tf()
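# `try_import_tf()` above returns a 3-tuple: `tf1` (the `tf.compat.v1` module),
# `tf` (the installed TensorFlow module, if any), and `tfv` (the major version
# as an int, 1 or 2), so this policy can be built under both TF1 and TF2.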


@Deprecated(
    old="rllib.agents.a3c.a3c_tf_policy.postprocess_advantages",
    new="rllib.evaluation.postprocessing.compute_gae_for_sample_batch",
    error=False)
def postprocess_advantages(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[PolicyID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:
    """Deprecated; use `compute_gae_for_sample_batch()` instead."""
    return compute_gae_for_sample_batch(policy, sample_batch,
                                        other_agent_batches, episode)


class A3CLoss:
    """Holds the A3C policy-, value-, and entropy-loss terms."""

    def __init__(self,
                 action_dist: ActionDistribution,
                 actions: TensorType,
                 advantages: TensorType,
                 v_target: TensorType,
                 vf: TensorType,
                 valid_mask: TensorType,
                 vf_loss_coeff: float = 0.5,
                 entropy_coeff: float = 0.01,
                 use_critic: bool = True):
        log_prob = action_dist.logp(actions)

        # The "policy gradients" loss.
        self.pi_loss = -tf.reduce_sum(
            tf.boolean_mask(log_prob * advantages, valid_mask))

        delta = tf.boolean_mask(vf - v_target, valid_mask)

        # Compute a value function loss.
        if use_critic:
            self.vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta))
        # Ignore the value function.
        else:
            self.vf_loss = tf.constant(0.0)

        self.entropy = tf.reduce_sum(
            tf.boolean_mask(action_dist.entropy(), valid_mask))

        # Combined objective: policy-gradient term plus weighted value-function
        # loss, minus the weighted entropy bonus.
        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
                           self.entropy * entropy_coeff)


def actor_critic_loss(policy: Policy, model: ModelV2,
                      dist_class: ActionDistribution,
                      train_batch: SampleBatch) -> TensorType:
    """Builds the A3C actor-critic loss, masking padded RNN timesteps."""
    model_out, _ = model.from_batch(train_batch)
    action_dist = dist_class(model_out, model)
    if policy.is_recurrent():
        # Build a mask that is True only for valid (non-padded) timesteps.
        max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS])
        mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(train_batch[SampleBatch.REWARDS])
    policy.loss = A3CLoss(action_dist, train_batch[SampleBatch.ACTIONS],
                          train_batch[Postprocessing.ADVANTAGES],
                          train_batch[Postprocessing.VALUE_TARGETS],
                          model.value_function(), mask,
                          policy.config["vf_loss_coeff"],
                          policy.config["entropy_coeff"],
                          policy.config.get("use_critic", True))
    return policy.loss.total_loss
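# NOTE: The loss object is stored on `policy.loss` (not just returned) so that
# `stats()` and `grad_stats()` below can report its individual terms.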


def add_value_function_fetch(policy: Policy) -> Dict[str, TensorType]:
    """Adds the value-function prediction to the policy's action fetches."""
    return {SampleBatch.VF_PREDS: policy.model.value_function()}


def stats(policy: Policy, train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Returns learning statistics based on the policy's loss object."""
    return {
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "policy_loss": policy.loss.pi_loss,
        "policy_entropy": policy.loss.entropy,
        "var_gnorm": tf.linalg.global_norm(
            list(policy.model.trainable_variables())),
        "vf_loss": policy.loss.vf_loss,
    }


def grad_stats(policy: Policy, train_batch: SampleBatch,
               grads: ModelGradients) -> Dict[str, TensorType]:
    """Returns gradient-norm and value-explained-variance statistics."""
    return {
        "grad_gnorm": tf.linalg.global_norm(grads),
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function()),
    }


def clip_gradients(policy: Policy, optimizer: LocalOptimizer,
                   loss: TensorType) -> ModelGradients:
    """Computes gradients and clips them by global norm (config "grad_clip")."""
    grads_and_vars = optimizer.compute_gradients(
        loss, policy.model.trainable_variables())
    grads = [g for (g, v) in grads_and_vars]
    grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
    clipped_grads = list(zip(grads, policy.model.trainable_variables()))
    return clipped_grads


def setup_mixins(policy: Policy, obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space,
                 config: TrainerConfigDict) -> None:
    """Initializes the mixin classes before the loss is constructed."""
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


A3CTFPolicy = build_tf_policy(
    name="A3CTFPolicy",
    get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
    loss_fn=actor_critic_loss,
    stats_fn=stats,
    grad_stats_fn=grad_stats,
    compute_gradients_fn=clip_gradients,
    postprocess_fn=compute_gae_for_sample_batch,
    extra_action_out_fn=add_value_function_fetch,
    before_loss_init=setup_mixins,
    mixins=[ValueNetworkMixin, LearningRateSchedule])
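# Usage sketch (illustrative only, not part of this module): A3CTFPolicy is
# normally used indirectly through the A3C trainer, which picks this policy
# class when a TF framework is configured, e.g.:
#
#     import ray
#     from ray.rllib.agents.a3c import A3CTrainer
#
#     ray.init()
#     trainer = A3CTrainer(env="CartPole-v0", config={"framework": "tf"})
#     print(trainer.train())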