ray/rllib/algorithms/pg/pg.py

from typing import Type

from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.deprecation import Deprecated
from ray.rllib.utils.typing import AlgorithmConfigDict


class PGConfig(AlgorithmConfig):
    """Defines a configuration class from which a PG Algorithm can be built.

    Example:
        >>> from ray.rllib.algorithms.pg import PGConfig
        >>> config = PGConfig().training(lr=0.01).resources(num_gpus=1)
        >>> print(config.to_dict())
        >>> # Build an Algorithm object from the config and run 1 training iteration.
        >>> trainer = config.build(env="CartPole-v1")
        >>> trainer.train()

    Example:
        >>> from ray.rllib.algorithms.pg import PGConfig
        >>> from ray import tune
        >>> config = PGConfig()
        >>> # Print out some default values.
        >>> print(config.lr)  # doctest: +SKIP
        0.0004
        >>> # Update the config object.
        >>> config.training(lr=tune.grid_search([0.001, 0.0001]))
        >>> # Set the config object's env.
        >>> config.environment(env="CartPole-v1")
        >>> # Use to_dict() to get the old-style python config dict
        >>> # when running with tune.
        >>> tune.run(
        ...     "PG",
        ...     stop={"episode_reward_mean": 200},
        ...     config=config.to_dict(),
        ... )
    """

    def __init__(self):
        """Initializes a PGConfig instance."""
        super().__init__(algo_class=PG)

        # fmt: off
        # __sphinx_doc_begin__
        # Override some of AlgorithmConfig's default values with PG-specific values.
        self.num_workers = 0
        self.lr = 0.0004
        self._disable_preprocessor_api = True
        # __sphinx_doc_end__
        # fmt: on


class PG(Algorithm):
    """Policy Gradient (PG) Trainer.

    Defines the distributed Trainer class for policy gradients.
    See `pg_[tf|torch]_policy.py` for the definition of the policy losses for
    TensorFlow and PyTorch.

    Detailed documentation:
    https://docs.ray.io/en/master/rllib-algorithms.html#pg

    Only overrides the default config- and policy selectors
    (`get_default_config` and `get_default_policy_class`). Utilizes
    the default `training_step()` method of `Algorithm`.
    """

    @classmethod
    @override(Algorithm)
    def get_default_config(cls) -> AlgorithmConfigDict:
        return PGConfig().to_dict()

    @override(Algorithm)
    def get_default_policy_class(self, config) -> Type[Policy]:
        # Pick the policy implementation matching the configured framework.
        if config["framework"] == "torch":
            from ray.rllib.algorithms.pg.pg_torch_policy import PGTorchPolicy

            return PGTorchPolicy
        elif config["framework"] == "tf":
            from ray.rllib.algorithms.pg.pg_tf_policy import PGStaticGraphTFPolicy

            return PGStaticGraphTFPolicy
        else:
            # Any other framework value (e.g. "tf2"/"tfe") -> eager-mode TF policy.
            from ray.rllib.algorithms.pg.pg_tf_policy import PGEagerTFPolicy

            return PGEagerTFPolicy
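
    # NOTE: `training_step()` is deliberately not overridden here. The default
    # loop inherited from `Algorithm` (sample a train batch from the rollout
    # workers, run one update on it, then sync the updated weights back to the
    # workers) is already the plain vanilla policy-gradient algorithm.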


# Deprecated: Use ray.rllib.algorithms.pg.PGConfig instead!
class _deprecated_default_config(dict):
    def __init__(self):
        super().__init__(PGConfig().to_dict())

    @Deprecated(
        old="ray.rllib.algorithms.pg.default_config::DEFAULT_CONFIG",
        new="ray.rllib.algorithms.pg.pg::PGConfig(...)",
        error=False,
    )
    def __getitem__(self, item):
        return super().__getitem__(item)


DEFAULT_CONFIG = _deprecated_default_config()
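

# Minimal smoke test: an illustrative sketch mirroring the usage shown in
# `PGConfig`'s docstring above. Assumes Ray RLlib and the gym "CartPole-v1"
# environment are installed; the lr value is purely illustrative.
if __name__ == "__main__":
    import ray

    ray.init()
    config = PGConfig().environment(env="CartPole-v1").training(lr=0.001)
    algo = config.build()
    # Run a single training iteration and print the result dict.
    print(algo.train())
    algo.stop()
    ray.shutdown()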