# ray/rllib/examples/random_parametric_agent.py


from abc import ABC

import numpy as np

import ray
from ray.rllib import Policy
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.examples.env.parametric_actions_cartpole import (
    ParametricActionsCartPole,
)
from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
from ray.rllib.models.modelv2 import restore_original_dimensions
from ray.rllib.utils import override
from ray.rllib.utils.typing import ResultDict
from ray.tune.registry import register_env


class RandomParametricPolicy(Policy, ABC):
    """Policy that always picks a random legal action.

    The observation returned by the environment must be a dict with an
    "action_mask" key containing a mask of the currently legal actions
    for the agent.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()

    @override(Policy)
    def compute_actions(
        self,
        obs_batch,
        state_batches=None,
        prev_action_batch=None,
        prev_reward_batch=None,
        info_batch=None,
        episodes=None,
        **kwargs
    ):
        # Unpack the flat observation batch back into the env's original
        # (dict) observation space so the "action_mask" key can be accessed.
        obs_batch = restore_original_dimensions(
            np.array(obs_batch, dtype=np.float32), self.observation_space, tensorlib=np
        )

        def pick_legal_action(legal_action):
            # Sample uniformly among the legal actions; e.g. a mask of
            # [1., 0., 1.] yields sampling probabilities [0.5, 0.0, 0.5].
            return np.random.choice(
                len(legal_action), 1, p=(legal_action / legal_action.sum())
            )[0]

        return [pick_legal_action(x) for x in obs_batch["action_mask"]], [], {}

    def learn_on_batch(self, samples):
        # This policy does not learn; it stays random.
        pass

    def get_weights(self):
        pass

    def set_weights(self, weights):
        pass


class RandomParametricAlgorithm(Algorithm):
    """Algorithm using the random parametric policy defined above.

    Overrides the `training_step` method, which only runs a (dummy)
    rollout and performs no learning.
    """

    def get_default_policy_class(self, config):
        return RandomParametricPolicy

    @override(Algorithm)
    def training_step(self) -> ResultDict:
        # Perform rollouts (only for collecting metrics later).
        synchronous_parallel_sample(worker_set=self.workers)
        # Return (empty) training metrics.
        return {}
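
# A hedged sketch (not part of the original script) of building the algorithm
# from an explicit AlgorithmConfig instead of the bare `env=` kwarg used in
# main() below:
#
#   from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
#
#   config = AlgorithmConfig().environment("pa_cartpole")
#   algo = RandomParametricAlgorithm(config=config)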

def main():
    # Register the parametric-actions CartPole env (max. 10 available
    # actions) under a fixed name, so it can be referenced by string.
    register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10))
    algo = RandomParametricAlgorithm(env="pa_cartpole")
    result = algo.train()
    # Even a random policy should clear a mean episode reward of 10 here.
    assert result["episode_reward_mean"] > 10, result
    print("Test: OK")

if __name__ == "__main__":
    ray.init()
    main()
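
# Usage: run this file directly, e.g. `python random_parametric_agent.py`.
# No learning happens; the script only checks that sampling random legal
# actions in ParametricActionsCartPole yields an episode_reward_mean > 10.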