Mirror of https://github.com/vale981/ray, synced 2025-03-09 12:56:46 -04:00
31 lines | 1.1 KiB | Python

```python
import ray
from ray.rllib.evaluation.postprocessing import Postprocessing, \
    compute_advantages
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils import try_import_tf

tf = try_import_tf()
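
# NOTE (added comment, not in the original file): try_import_tf() returns
# the TensorFlow module when TF is installed and None otherwise, so this
# policy module can be imported without a hard TensorFlow dependency.
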

def post_process_advantages(policy, sample_batch, other_agent_batches=None,
                            episode=None):
    """This adds the "advantages" column to the sample train_batch."""
    return compute_advantages(sample_batch, 0.0, policy.config["gamma"],
                              use_gae=False)
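
# NOTE (added comment, not in the original file): with use_gae=False and a
# last-state value estimate of 0.0, compute_advantages fills
# Postprocessing.ADVANTAGES with the plain discounted return
# R_t = sum_k gamma^k * r_{t+k}, so the loss below is the classic
# REINFORCE objective -E[log pi(a_t | s_t) * R_t].
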

def pg_tf_loss(policy, model, dist_class, train_batch):
    """The basic policy gradients loss."""
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    return -tf.reduce_mean(action_dist.logp(train_batch[SampleBatch.ACTIONS])
                           * train_batch[Postprocessing.ADVANTAGES])
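
# NOTE (added comment, not in the original file): build_tf_policy assembles
# a complete Policy class from the functions above; anything not supplied
# here (action sampling, optimizer, gradient computation, etc.) falls back
# to the template's defaults.
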

PGTFPolicy = build_tf_policy(
    name="PGTFPolicy",
    get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
    postprocess_fn=post_process_advantages,
    loss_fn=pg_tf_loss)
```
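
For context, below is a minimal sketch of how a policy built this way is typically turned into a trainable algorithm. It is not part of the original file: the `MyPG` trainer name and the CartPole config are illustrative assumptions, and `build_trainer` is taken from the RLlib trainer template of the same generation as `build_tf_policy`.

```python
# Hypothetical usage sketch (not from the original file): pair PGTFPolicy
# with RLlib's trainer template to get a runnable algorithm.
import ray
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.pg.pg import DEFAULT_CONFIG

# Assumption: build_trainer from the same RLlib generation as this policy.
MyPGTrainer = build_trainer(
    name="MyPG",
    default_config=DEFAULT_CONFIG,
    default_policy=PGTFPolicy)

if __name__ == "__main__":
    ray.init()
    # Single-process config to keep the sketch lightweight.
    trainer = MyPGTrainer(env="CartPole-v0", config={"num_workers": 0})
    for i in range(3):
        result = trainer.train()
        print(i, result["episode_reward_mean"])
```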