# ray/rllib/examples/policy/random_policy.py

from gym.spaces import Box
import numpy as np
import random
import tree  # pip install dm_tree

from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import ModelWeights


class RandomPolicy(Policy):
    """Hand-coded policy that returns random actions."""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Whether the bounds given in the action_space should be ignored when
        # sampling actions in `compute_actions` (default: False). This is
        # useful for testing action clipping and an Env's reaction to
        # out-of-bounds actions.
        if self.config.get("ignore_action_bounds", False) and isinstance(
            self.action_space, Box
        ):
            # Sample from an unbounded Box with the same shape and dtype
            # instead of the actual (bounded) action space.
            self.action_space_for_sampling = Box(
                -float("inf"),
                float("inf"),
                shape=self.action_space.shape,
                dtype=self.action_space.dtype,
            )
        else:
            self.action_space_for_sampling = self.action_space

    @override(Policy)
    def init_view_requirements(self):
        super().init_view_requirements()
        # Disable the `used_for_training` and `used_for_compute_actions` flags
        # on the SampleBatch.INFOS column, since info dicts cannot be properly
        # batched.
        vr = self.view_requirements[SampleBatch.INFOS]
        vr.used_for_training = False
        vr.used_for_compute_actions = False

    @override(Policy)
    def compute_actions(
        self,
        obs_batch,
        state_batches=None,
        prev_action_batch=None,
        prev_reward_batch=None,
        **kwargs
    ):
        # Return one randomly sampled action per observation in the batch,
        # plus empty state-outs and an empty extra-fetches dict.
        # Alternatively, a numpy array would work here as well,
        # e.g. np.array([random.choice([0, 1])] * len(obs_batch)).
        return [self.action_space_for_sampling.sample() for _ in obs_batch], [], {}

    @override(Policy)
    def learn_on_batch(self, samples):
        """No learning."""
        return {}

    @override(Policy)
    def compute_log_likelihoods(
        self,
        actions,
        obs_batch,
        state_batches=None,
        prev_action_batch=None,
        prev_reward_batch=None,
    ):
        # Return a (dummy) random log-likelihood for each observation.
        return np.array([random.random()] * len(obs_batch))

    @override(Policy)
    def get_weights(self) -> ModelWeights:
        """No weights to save."""
        return {}

    @override(Policy)
    def set_weights(self, weights: ModelWeights) -> None:
        """No weights to set."""
        pass

    @override(Policy)
    def _get_dummy_batch_from_view_requirements(self, batch_size: int = 1):
        # Build a dummy batch that only contains a single (batched)
        # observation sampled from the observation space.
        return SampleBatch(
            {
                SampleBatch.OBS: tree.map_structure(
                    lambda s: s[None], self.observation_space.sample()
                ),
            }
        )
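

if __name__ == "__main__":
    # Minimal usage sketch (added for illustration; not part of the original
    # example): build a RandomPolicy over simple Box spaces and sample a batch
    # of actions. An empty config dict is assumed to suffice here, since
    # RandomPolicy itself only reads the optional "ignore_action_bounds" key;
    # the exact config keys the Policy base class expects may vary across
    # RLlib versions.
    obs_space = Box(-1.0, 1.0, shape=(4,), dtype=np.float32)
    act_space = Box(-2.0, 2.0, shape=(2,), dtype=np.float32)
    policy = RandomPolicy(obs_space, act_space, {})

    # Sample 3 random actions, one per observation in the batch.
    obs_batch = np.stack([obs_space.sample() for _ in range(3)])
    actions, state_outs, extra_fetches = policy.compute_actions(obs_batch)
    print(actions)  # -> list of 3 actions, each drawn uniformly from act_space

    # In practice, this policy is typically plugged into a multi-agent setup,
    # e.g. via `PolicySpec(policy_class=RandomPolicy)` in the `policies` dict
    # of a multi-agent config.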