ray/rllib/examples/policy/episode_env_aware_policy.py

from gym.spaces import Box
import numpy as np

from ray.rllib.examples.policy.random_policy import RandomPolicy
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.view_requirement import ViewRequirement
from ray.rllib.utils.annotations import override


class EpisodeEnvAwareLSTMPolicy(RandomPolicy):
    """A recurrent dummy Policy that echoes bookkeeping IDs as its actions.

    For every row of the input batch, the returned "action" is
    ``[agent_index, episode_id, env_id]`` and each of the two state outputs
    is the row's timestep ``t``. The policy carries a stand-in ``model``
    object that only holds ``time_major`` and ``view_requirements``.
    """

    def __init__(self, *args, **kwargs):
        """Build the fake model and the view requirements of this policy.

        All positional and keyword arguments are forwarded unchanged to the
        parent class constructor.
        """
        super().__init__(*args, **kwargs)
        self.state_space = Box(-1.0, 1.0, (1,))

        class _fake_model:
            """Bare attribute container standing in for a model object."""

        self.model = _fake_model()
        self.model.time_major = True

        model_view_reqs = {
            SampleBatch.AGENT_INDEX: ViewRequirement(),
            SampleBatch.EPS_ID: ViewRequirement(),
            "env_id": ViewRequirement(),
            "t": ViewRequirement(),
            SampleBatch.OBS: ViewRequirement(),
            SampleBatch.PREV_ACTIONS: ViewRequirement(
                SampleBatch.ACTIONS, space=self.action_space, shift=-1
            ),
            SampleBatch.PREV_REWARDS: ViewRequirement(SampleBatch.REWARDS, shift=-1),
        }
        # Two state slots; each "state_in_i" is declared as the matching
        # "state_out_i" column shifted by -1.
        for idx in range(2):
            state_in_key = "state_in_{}".format(idx)
            state_out_key = "state_out_{}".format(idx)
            model_view_reqs[state_in_key] = ViewRequirement(
                state_out_key, shift=-1, space=self.state_space
            )
            model_view_reqs[state_out_key] = ViewRequirement(space=self.state_space)
        self.model.view_requirements = model_view_reqs

        # Policy-level requirements: the entries below first, followed by
        # everything the fake model declares.
        self.view_requirements = {
            SampleBatch.NEXT_OBS: ViewRequirement(SampleBatch.OBS, shift=1),
            SampleBatch.ACTIONS: ViewRequirement(space=self.action_space),
            SampleBatch.REWARDS: ViewRequirement(),
            SampleBatch.DONES: ViewRequirement(),
            SampleBatch.UNROLL_ID: ViewRequirement(),
            **model_view_reqs,
        }

    @override(Policy)
    def is_recurrent(self):
        """Return True: this policy always reports itself as recurrent."""
        return True

    @override(Policy)
    def compute_actions_from_input_dict(
        self, input_dict, explore=None, timestep=None, **kwargs
    ):
        """Return ID-based actions and timestep-based states for a batch.

        Args:
            input_dict: Mapping that provides the columns "t", "obs",
                "env_id", ``SampleBatch.AGENT_INDEX`` and
                ``SampleBatch.EPS_ID``. The number of rows is taken from
                ``len(input_dict["obs"])``.
            explore: Unused.
            timestep: Unused.
            **kwargs: Unused.

        Returns:
            Tuple ``(actions, states, extra)``: ``actions`` holds one
            ``[agent_index, episode_id, env_id]`` row per input row,
            ``states`` is a list of two arrays with one ``[t]`` row per
            input row, and ``extra`` is an empty dict.
        """
        ts = input_dict["t"]
        num_rows = len(input_dict["obs"])
        # Look the columns up once instead of once per row.
        agent_indices = input_dict[SampleBatch.AGENT_INDEX]
        episode_ids = input_dict[SampleBatch.EPS_ID]
        env_ids = input_dict["env_id"]

        # Always return [agentIndex, episodeID, envID] as actions.
        actions = np.array(
            [[agent_indices[i], episode_ids[i], env_ids[i]] for i in range(num_rows)]
        )
        # Both state outputs carry the timestep of each row.
        states = [np.array([[ts[i]] for i in range(num_rows)]) for _ in range(2)]
        return actions, states, {}

    @override(Policy)
    def postprocess_trajectory(
        self, sample_batch, other_agent_batches=None, episode=None
    ):
        """Add a "2xobs" column (observations times 2.0) and return the batch.

        The batch is modified in place; ``other_agent_batches`` and
        ``episode`` are not used.
        """
        sample_batch["2xobs"] = sample_batch["obs"] * 2.0
        return sample_batch


class EpisodeEnvAwareAttentionPolicy(RandomPolicy):
    """An attention-style dummy Policy that echoes bookkeeping IDs as actions.

    For every row of the input batch, the returned "action" is
    ``[agent_index, episode_id, env_id]`` and the single state output is the
    row's timestep ``t``. The "state_in_0" view is declared as a window over
    the previous ``max_seq_len`` "state_out_0" values.
    """

    def __init__(self, *args, **kwargs):
        """Build the fake model and the view requirements of this policy.

        All positional and keyword arguments are forwarded unchanged to the
        parent class constructor. The "model" entry of ``self.config`` is
        replaced by ``{"max_seq_len": 50}``.
        """
        super().__init__(*args, **kwargs)
        self.state_space = Box(-1.0, 1.0, (1,))
        self.config["model"] = {"max_seq_len": 50}
        # Single source of truth for both the shift window and the repeat
        # value below, so the two cannot drift apart.
        max_seq_len = self.config["model"]["max_seq_len"]

        class _fake_model:
            """Bare attribute container standing in for a model object."""

        self.model = _fake_model()
        self.model.view_requirements = {
            SampleBatch.AGENT_INDEX: ViewRequirement(),
            SampleBatch.EPS_ID: ViewRequirement(),
            "env_id": ViewRequirement(),
            "t": ViewRequirement(),
            SampleBatch.OBS: ViewRequirement(),
            "state_in_0": ViewRequirement(
                "state_out_0",
                # Provide state outs -max_seq_len to -1 as "state-in".
                shift="-{}:-1".format(max_seq_len),
                # Repeat the incoming state every n time steps (usually max
                # seq len).
                batch_repeat_value=max_seq_len,
                space=self.state_space,
            ),
            # Flagged as not used for computing actions.
            "state_out_0": ViewRequirement(
                space=self.state_space, used_for_compute_actions=False
            ),
        }

        # Start from the parent's default requirements and let the fake
        # model's entries override/extend them.
        self.view_requirements = dict(
            super()._get_default_view_requirements(), **self.model.view_requirements
        )

    @override(Policy)
    def is_recurrent(self):
        """Return True: this policy always reports itself as recurrent."""
        return True

    @override(Policy)
    def compute_actions_from_input_dict(
        self, input_dict, explore=None, timestep=None, **kwargs
    ):
        """Return ID-based actions and a timestep-based state for a batch.

        Also increments ``self.global_timestep`` by one per call.

        Args:
            input_dict: Mapping that provides the columns "t", "obs",
                "env_id", ``SampleBatch.AGENT_INDEX`` and
                ``SampleBatch.EPS_ID``. The number of rows is taken from
                ``len(input_dict["obs"])``.
            explore: Unused.
            timestep: Unused.
            **kwargs: Unused.

        Returns:
            Tuple ``(actions, states, extra)``: ``actions`` holds one
            ``[agent_index, episode_id, env_id]`` row per input row,
            ``states`` is a one-element list with one ``[t]`` row per input
            row, and ``extra`` is an empty dict.
        """
        ts = input_dict["t"]
        num_rows = len(input_dict["obs"])
        # Look the columns up once instead of once per row.
        agent_indices = input_dict[SampleBatch.AGENT_INDEX]
        episode_ids = input_dict[SampleBatch.EPS_ID]
        env_ids = input_dict["env_id"]

        # Always return [agentIndex, episodeID, envID] as actions.
        actions = np.array(
            [[agent_indices[i], episode_ids[i], env_ids[i]] for i in range(num_rows)]
        )
        # The single state output carries the timestep of each row.
        states = [np.array([[ts[i]] for i in range(num_rows)])]
        self.global_timestep += 1
        return actions, states, {}

    @override(Policy)
    def postprocess_trajectory(
        self, sample_batch, other_agent_batches=None, episode=None
    ):
        """Add a "3xobs" column (observations times 3.0) and return the batch.

        The batch is modified in place; ``other_agent_batches`` and
        ``episode`` are not used.
        """
        sample_batch["3xobs"] = sample_batch["obs"] * 3.0
        return sample_batch
[RLlib] Trajectory view API: Simple List Collector (on by default for PPO); LSTM-agnostic (#11056) 2020-10-01 16:57:10 +02:00			`from gym.spaces import Box`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00			`import numpy as np`

			`from ray.rllib.examples.policy.random_policy import RandomPolicy`
			`from ray.rllib.policy.policy import Policy`
			`from ray.rllib.policy.sample_batch import SampleBatch`
			`from ray.rllib.policy.view_requirement import ViewRequirement`
			`from ray.rllib.utils.annotations import override`


[RLlib] Attention Net prep PR #1: Smaller cleanups. (#12447) * WIP. * Fix. * Fix. * Fix. 2020-11-28 01:25:47 +01:00			`class EpisodeEnvAwareLSTMPolicy(RandomPolicy):`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00			`"""A Policy that always knows the current EpisodeID and EnvID and`
			`returns these in its actions."""`

			`def __init__(self, args, *kwargs):`
			`super().__init__(args, *kwargs)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`self.state_space = Box(-1.0, 1.0, (1,))`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00
			`class _fake_model:`
			`pass`

			`self.model = _fake_model()`
			`self.model.time_major = True`
[RLlib] Trajectory view API docs. (#12718) 2020-12-30 20:32:21 -05:00			`self.model.view_requirements = {`
[RLlib] Trajectory view API: Simple List Collector (on by default for PPO); LSTM-agnostic (#11056) 2020-10-01 16:57:10 +02:00			`SampleBatch.AGENT_INDEX: ViewRequirement(),`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00			`SampleBatch.EPS_ID: ViewRequirement(),`
			`"env_id": ViewRequirement(),`
[RLlib] Trajectory view API: Simple List Collector (on by default for PPO); LSTM-agnostic (#11056) 2020-10-01 16:57:10 +02:00			`"t": ViewRequirement(),`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00			`SampleBatch.OBS: ViewRequirement(),`
			`SampleBatch.PREV_ACTIONS: ViewRequirement(`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`SampleBatch.ACTIONS, space=self.action_space, shift=-1`
			`),`
			`SampleBatch.PREV_REWARDS: ViewRequirement(SampleBatch.REWARDS, shift=-1),`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00			`}`
[RLlib] Trajectory view API: Simple List Collector (on by default for PPO); LSTM-agnostic (#11056) 2020-10-01 16:57:10 +02:00			`for i in range(2):`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`self.model.view_requirements["state_in_{}".format(i)] = ViewRequirement(`
			`"state_out_{}".format(i), shift=-1, space=self.state_space`
			`)`
			`self.model.view_requirements["state_out_{}".format(i)] = ViewRequirement(`
			`space=self.state_space`
			`)`
[RLlib] Trajectory view API: Simple List Collector (on by default for PPO); LSTM-agnostic (#11056) 2020-10-01 16:57:10 +02:00
			`self.view_requirements = dict(`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00			`**{`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`SampleBatch.NEXT_OBS: ViewRequirement(SampleBatch.OBS, shift=1),`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00			`SampleBatch.ACTIONS: ViewRequirement(space=self.action_space),`
			`SampleBatch.REWARDS: ViewRequirement(),`
			`SampleBatch.DONES: ViewRequirement(),`
[RLlib] No Preprocessors (part 2). (#18468) 2021-09-23 12:56:45 +02:00			`SampleBatch.UNROLL_ID: ViewRequirement(),`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00			`},`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`**self.model.view_requirements`
			`)`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00
			`@override(Policy)`
			`def is_recurrent(self):`
			`return True`

			`@override(Policy)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`def compute_actions_from_input_dict(`
			`self, input_dict, explore=None, timestep=None, **kwargs`
			`):`
[RLlib] Trajectory view API: Simple List Collector (on by default for PPO); LSTM-agnostic (#11056) 2020-10-01 16:57:10 +02:00			`ts = input_dict["t"]`
			`print(ts)`
			`# Always return [episodeID, envID] as actions.`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`actions = np.array(`
			`[`
			`[`
			`input_dict[SampleBatch.AGENT_INDEX][i],`
			`input_dict[SampleBatch.EPS_ID][i],`
			`input_dict["env_id"][i],`
			`]`
			`for i, _ in enumerate(input_dict["obs"])`
			`]`
			`)`
[RLlib] Trajectory view API: Simple List Collector (on by default for PPO); LSTM-agnostic (#11056) 2020-10-01 16:57:10 +02:00			`states = [`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`np.array([[ts[i]] for i in range(len(input_dict["obs"]))]) for _ in range(2)`
[RLlib] Trajectory view API: Simple List Collector (on by default for PPO); LSTM-agnostic (#11056) 2020-10-01 16:57:10 +02:00			`]`
			`return actions, states, {}`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00
			`@override(Policy)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`def postprocess_trajectory(`
			`self, sample_batch, other_agent_batches=None, episode=None`
			`):`
[RLlib] Attention Net prep PR #1: Smaller cleanups. (#12447) * WIP. * Fix. * Fix. * Fix. 2020-11-28 01:25:47 +01:00			`sample_batch["2xobs"] = sample_batch["obs"] * 2.0`
			`return sample_batch`


			`class EpisodeEnvAwareAttentionPolicy(RandomPolicy):`
			`"""A Policy that always knows the current EpisodeID and EnvID and`
			`returns these in its actions."""`

			`def __init__(self, args, *kwargs):`
			`super().__init__(args, *kwargs)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`self.state_space = Box(-1.0, 1.0, (1,))`
[RLlib] Attention Net prep PR #1: Smaller cleanups. (#12447) * WIP. * Fix. * Fix. * Fix. 2020-11-28 01:25:47 +01:00			`self.config["model"] = {"max_seq_len": 50}`

			`class _fake_model:`
			`pass`

			`self.model = _fake_model()`
[RLlib] Trajectory view API docs. (#12718) 2020-12-30 20:32:21 -05:00			`self.model.view_requirements = {`
[RLlib] Attention Net prep PR #1: Smaller cleanups. (#12447) * WIP. * Fix. * Fix. * Fix. 2020-11-28 01:25:47 +01:00			`SampleBatch.AGENT_INDEX: ViewRequirement(),`
			`SampleBatch.EPS_ID: ViewRequirement(),`
			`"env_id": ViewRequirement(),`
			`"t": ViewRequirement(),`
			`SampleBatch.OBS: ViewRequirement(),`
			`"state_in_0": ViewRequirement(`
			`"state_out_0",`
			`# Provide state outs -50 to -1 as "state-in".`
[RLlib] Attention Net prep PR #3. (#12450) 2020-12-07 13:08:17 +01:00			`shift="-50:-1",`
[RLlib] Attention Net prep PR #1: Smaller cleanups. (#12447) * WIP. * Fix. * Fix. * Fix. 2020-11-28 01:25:47 +01:00			`# Repeat the incoming state every n time steps (usually max seq`
			`# len).`
			`batch_repeat_value=self.config["model"]["max_seq_len"],`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`space=self.state_space,`
			`),`
[RLlib] Reinstate trajectory view API tests. (#18809) 2021-09-23 08:31:51 +02:00			`"state_out_0": ViewRequirement(`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`space=self.state_space, used_for_compute_actions=False`
			`),`
[RLlib] Attention Net prep PR #1: Smaller cleanups. (#12447) * WIP. * Fix. * Fix. * Fix. 2020-11-28 01:25:47 +01:00			`}`

[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`self.view_requirements = dict(`
			`super()._get_default_view_requirements(), **self.model.view_requirements`
			`)`
[RLlib] Attention Net prep PR #1: Smaller cleanups. (#12447) * WIP. * Fix. * Fix. * Fix. 2020-11-28 01:25:47 +01:00
			`@override(Policy)`
			`def is_recurrent(self):`
			`return True`

			`@override(Policy)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`def compute_actions_from_input_dict(`
			`self, input_dict, explore=None, timestep=None, **kwargs`
			`):`
[RLlib] Attention Net prep PR #1: Smaller cleanups. (#12447) * WIP. * Fix. * Fix. * Fix. 2020-11-28 01:25:47 +01:00			`ts = input_dict["t"]`
			`print(ts)`
			`# Always return [episodeID, envID] as actions.`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`actions = np.array(`
			`[`
			`[`
			`input_dict[SampleBatch.AGENT_INDEX][i],`
			`input_dict[SampleBatch.EPS_ID][i],`
			`input_dict["env_id"][i],`
			`]`
			`for i, _ in enumerate(input_dict["obs"])`
			`]`
			`)`
[RLlib] Attention Net prep PR #1: Smaller cleanups. (#12447) * WIP. * Fix. * Fix. * Fix. 2020-11-28 01:25:47 +01:00			`states = [np.array([[ts[i]] for i in range(len(input_dict["obs"]))])]`
			`self.global_timestep += 1`
			`return actions, states, {}`

			`@override(Policy)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`def postprocess_trajectory(`
			`self, sample_batch, other_agent_batches=None, episode=None`
			`):`
[RLlib] Attention Net prep PR #1: Smaller cleanups. (#12447) * WIP. * Fix. * Fix. * Fix. 2020-11-28 01:25:47 +01:00			`sample_batch["3xobs"] = sample_batch["obs"] * 3.0`
[RLlib] Trajectory view API - 03 Fast LSTM + prev actions/rewards (#9950) 2020-08-21 12:35:16 +02:00			`return sample_batch`