ray/rllib/examples/parametric_action_cartpole.py

"""Example of handling variable length and/or parametric action spaces.

This is a toy example of the action-embedding based approach for handling large
discrete action spaces (potentially infinite in size), similar to this:

    https://neuro.cs.ut.ee/the-use-of-embeddings-in-openai-five/

This currently works with RLlib's policy gradient style algorithms
(e.g., PG, PPO, IMPALA, A2C) and also DQN.

Note that since the model outputs now include "-inf" tf.float32.min
values, not all algorithm options are supported at the moment. For example,
algorithms might crash if they don't properly ignore the -inf action scores.
Working configurations are given below.
"""

import argparse
import random
import numpy as np
import gym
from gym.spaces import Box, Discrete, Dict

import ray
from ray import tune
from ray.rllib.agents.dqn.distributional_q_tf_model import \
    DistributionalQTFModel
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.tune.registry import register_env
from ray.rllib.utils import try_import_tf

tf = try_import_tf()

parser = argparse.ArgumentParser()
parser.add_argument("--stop", type=int, default=200)
parser.add_argument("--run", type=str, default="PPO")


class ParametricActionCartpole(gym.Env):
    """Parametric action version of CartPole.

    In this env there are only ever two valid actions, but we pretend there are
    actually up to `max_avail_actions` actions that can be taken, and the two
    valid actions are randomly hidden among this set.

    At each step, we emit a dict of:
        - the actual cart observation
        - a mask of valid actions (e.g., [0, 0, 1, 0, 0, 1] for 6 max avail)
        - the list of action embeddings (w/ zeroes for invalid actions) (e.g.,
            [[0, 0],
             [0, 0],
             [-0.2322, -0.2569],
             [0, 0],
             [0, 0],
             [0.7878, 1.2297]] for max_avail_actions=6)

    In a real environment, the actions embeddings would be larger than two
    units of course, and also there would be a variable number of valid actions
    per step instead of always [LEFT, RIGHT].
    """

    def __init__(self, max_avail_actions):
        # Use simple random 2-unit action embeddings for [LEFT, RIGHT]
        self.left_action_embed = np.random.randn(2)
        self.right_action_embed = np.random.randn(2)
        self.action_space = Discrete(max_avail_actions)
        self.wrapped = gym.make("CartPole-v0")
        self.observation_space = Dict({
            "action_mask": Box(0, 1, shape=(max_avail_actions, )),
            "avail_actions": Box(-10, 10, shape=(max_avail_actions, 2)),
            "cart": self.wrapped.observation_space,
        })

    def update_avail_actions(self):
        self.action_assignments = np.array([[0., 0.]] * self.action_space.n)
        self.action_mask = np.array([0.] * self.action_space.n)
        self.left_idx, self.right_idx = random.sample(
            range(self.action_space.n), 2)
        self.action_assignments[self.left_idx] = self.left_action_embed
        self.action_assignments[self.right_idx] = self.right_action_embed
        self.action_mask[self.left_idx] = 1
        self.action_mask[self.right_idx] = 1

    def reset(self):
        self.update_avail_actions()
        return {
            "action_mask": self.action_mask,
            "avail_actions": self.action_assignments,
            "cart": self.wrapped.reset(),
        }

    def step(self, action):
        if action == self.left_idx:
            actual_action = 0
        elif action == self.right_idx:
            actual_action = 1
        else:
            raise ValueError(
                "Chosen action was not one of the non-zero action embeddings",
                action, self.action_assignments, self.action_mask,
                self.left_idx, self.right_idx)
        orig_obs, rew, done, info = self.wrapped.step(actual_action)
        self.update_avail_actions()
        obs = {
            "action_mask": self.action_mask,
            "avail_actions": self.action_assignments,
            "cart": orig_obs,
        }
        return obs, rew, done, info


class ParametricActionsModel(DistributionalQTFModel, TFModelV2):
    """Parametric action model that handles the dot product and masking.

    This assumes the outputs are logits for a single Categorical action dist.
    Getting this to work with a more complex output (e.g., if the action space
    is a tuple of several distributions) is also possible but left as an
    exercise to the reader.
    """

    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 true_obs_shape=(4, ),
                 action_embed_size=2,
                 **kw):
        super(ParametricActionsModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name, **kw)
        self.action_embed_model = FullyConnectedNetwork(
            Box(-1, 1, shape=true_obs_shape), action_space, action_embed_size,
            model_config, name + "_action_embed")
        self.register_variables(self.action_embed_model.variables())

    def forward(self, input_dict, state, seq_lens):
        # Extract the available actions tensor from the observation.
        avail_actions = input_dict["obs"]["avail_actions"]
        action_mask = input_dict["obs"]["action_mask"]

        # Compute the predicted action embedding
        action_embed, _ = self.action_embed_model({
            "obs": input_dict["obs"]["cart"]
        })

        # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the
        # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE].
        intent_vector = tf.expand_dims(action_embed, 1)

        # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
        action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)

        # Mask out invalid actions (use tf.float32.min for stability)
        inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
        return action_logits + inf_mask, state

    def value_function(self):
        return self.action_embed_model.value_function()


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env("pa_cartpole", lambda _: ParametricActionCartpole(10))
    if args.run == "DQN":
        cfg = {
            # TODO(ekl) we need to set these to prevent the masked values
            # from being further processed in DistributionalQModel, which
            # would mess up the masking. It is possible to support these if we
            # defined a a custom DistributionalQModel that is aware of masking.
            "hiddens": [],
            "dueling": False,
        }
    else:
        cfg = {}
    tune.run(
        args.run,
        stop={
            "episode_reward_mean": args.stop,
        },
        config=dict({
            "env": "pa_cartpole",
            "model": {
                "custom_model": "pa_model",
            },
            "num_workers": 0,
        }, **cfg),
    )
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00			`"""Example of handling variable length and/or parametric action spaces.`

			`This is a toy example of the action-embedding based approach for handling large`
[rllib] Custom supervised loss API (#4083) 2019-02-24 15:36:13 -08:00			`discrete action spaces (potentially infinite in size), similar to this:`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00
			`https://neuro.cs.ut.ee/the-use-of-embeddings-in-openai-five/`

			`This currently works with RLlib's policy gradient style algorithms`
			`(e.g., PG, PPO, IMPALA, A2C) and also DQN.`

			`Note that since the model outputs now include "-inf" tf.float32.min`
			`values, not all algorithm options are supported at the moment. For example,`
			`algorithms might crash if they don't properly ignore the -inf action scores.`
			`Working configurations are given below.`
			`"""`

			`import argparse`
			`import random`
			`import numpy as np`
			`import gym`
			`from gym.spaces import Box, Discrete, Dict`

			`import ray`
[rllib] Switch to tune.run() instead of run_experiments() (#4515) 2019-03-30 14:07:50 -07:00			`from ray import tune`
[RLlib] DQN torch version. (#7597) * Fix. * Rollback. * WIP. * WIP. * WIP. * WIP. * WIP. * WIP. * WIP. * WIP. * Fix. * Fix. * Fix. * Fix. * Fix. * WIP. * WIP. * Fix. * Test case fixes. * Test case fixes and LINT. * Test case fixes and LINT. * Rollback. * WIP. * WIP. * Test case fixes. * Fix. * Fix. * Fix. * Add regression test for DQN w/ param noise. * Fixes and LINT. * Fixes and LINT. * Fixes and LINT. * Fixes and LINT. * Fixes and LINT. * Comment * Regression test case. * WIP. * WIP. * LINT. * LINT. * WIP. * Fix. * Fix. * Fix. * LINT. * Fix (SAC does currently not support eager). * Fix. * WIP. * LINT. * Update rllib/evaluation/sampler.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Update rllib/evaluation/sampler.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Update rllib/utils/exploration/exploration.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Update rllib/utils/exploration/exploration.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * WIP. * WIP. * Fix. * LINT. * LINT. * Fix and LINT. * WIP. * WIP. * WIP. * WIP. * Fix. * LINT. * Fix. * Fix and LINT. * Update rllib/utils/exploration/exploration.py * Update rllib/policy/dynamic_tf_policy.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Update rllib/policy/dynamic_tf_policy.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Update rllib/policy/dynamic_tf_policy.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Fixes. * WIP. * LINT. * Fixes and LINT. * LINT and fixes. * LINT. * Move action_dist back into torch extra_action_out_fn and LINT. * Working SimpleQ learning cartpole on both torch AND tf. * Working Rainbow learning cartpole on tf. * Working Rainbow learning cartpole on tf. * WIP. * LINT. * LINT. * Update docs and add torch to APEX test. * LINT. * Fix. * LINT. * Fix. * Fix. * Fix and docstrings. * Fix broken RLlib tests in master. * Split BAZEL learning tests into cartpole and pendulum (reached the 60min barrier). * Fix error_outputs option in BAZEL for RLlib regression tests. * Fix. * Tune param-noise tests. * LINT. * Fix. * Fix. * test * test * test * Fix. * Fix. * WIP. * WIP. * WIP. * WIP. * LINT. * WIP. Co-authored-by: Eric Liang <ekhliang@gmail.com> 2020-04-06 20:56:16 +02:00			`from ray.rllib.agents.dqn.distributional_q_tf_model import \`
			`DistributionalQTFModel`
[rllib] Document ModelV2 and clean up the models/ directory (#5277) 2019-07-27 02:08:16 -07:00			`from ray.rllib.models import ModelCatalog`
			`from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork`
			`from ray.rllib.models.tf.tf_modelv2 import TFModelV2`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00			`from ray.tune.registry import register_env`
[rllib] TensorFlow 2 compatibility (#4802) 2019-05-16 22:12:07 -07:00			`from ray.rllib.utils import try_import_tf`

			`tf = try_import_tf()`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("--stop", type=int, default=200)`
			`parser.add_argument("--run", type=str, default="PPO")`


			`class ParametricActionCartpole(gym.Env):`
			`"""Parametric action version of CartPole.`

			`In this env there are only ever two valid actions, but we pretend there are`
			actually up to `max_avail_actions` actions that can be taken, and the two
			`valid actions are randomly hidden among this set.`

			`At each step, we emit a dict of:`
			`- the actual cart observation`
			`- a mask of valid actions (e.g., [0, 0, 1, 0, 0, 1] for 6 max avail)`
			`- the list of action embeddings (w/ zeroes for invalid actions) (e.g.,`
			`[[0, 0],`
			`[0, 0],`
			`[-0.2322, -0.2569],`
			`[0, 0],`
			`[0, 0],`
			`[0.7878, 1.2297]] for max_avail_actions=6)`

			`In a real environment, the actions embeddings would be larger than two`
			`units of course, and also there would be a variable number of valid actions`
			`per step instead of always [LEFT, RIGHT].`
			`"""`

			`def __init__(self, max_avail_actions):`
			`# Use simple random 2-unit action embeddings for [LEFT, RIGHT]`
			`self.left_action_embed = np.random.randn(2)`
			`self.right_action_embed = np.random.randn(2)`
			`self.action_space = Discrete(max_avail_actions)`
			`self.wrapped = gym.make("CartPole-v0")`
			`self.observation_space = Dict({`
			`"action_mask": Box(0, 1, shape=(max_avail_actions, )),`
[rllib] validate observation in NoPreprocessor (#4546) 2019-04-07 16:11:50 -07:00			`"avail_actions": Box(-10, 10, shape=(max_avail_actions, 2)),`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00			`"cart": self.wrapped.observation_space,`
			`})`

			`def update_avail_actions(self):`
[rllib] validate observation in NoPreprocessor (#4546) 2019-04-07 16:11:50 -07:00			`self.action_assignments = np.array([[0., 0.]] * self.action_space.n)`
			`self.action_mask = np.array([0.] * self.action_space.n)`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00			`self.left_idx, self.right_idx = random.sample(`
			`range(self.action_space.n), 2)`
			`self.action_assignments[self.left_idx] = self.left_action_embed`
			`self.action_assignments[self.right_idx] = self.right_action_embed`
			`self.action_mask[self.left_idx] = 1`
			`self.action_mask[self.right_idx] = 1`

			`def reset(self):`
			`self.update_avail_actions()`
			`return {`
			`"action_mask": self.action_mask,`
			`"avail_actions": self.action_assignments,`
			`"cart": self.wrapped.reset(),`
			`}`

			`def step(self, action):`
			`if action == self.left_idx:`
			`actual_action = 0`
			`elif action == self.right_idx:`
			`actual_action = 1`
			`else:`
			`raise ValueError(`
			`"Chosen action was not one of the non-zero action embeddings",`
			`action, self.action_assignments, self.action_mask,`
			`self.left_idx, self.right_idx)`
			`orig_obs, rew, done, info = self.wrapped.step(actual_action)`
			`self.update_avail_actions()`
			`obs = {`
			`"action_mask": self.action_mask,`
			`"avail_actions": self.action_assignments,`
			`"cart": orig_obs,`
			`}`
			`return obs, rew, done, info`


[RLlib] DQN torch version. (#7597) * Fix. * Rollback. * WIP. * WIP. * WIP. * WIP. * WIP. * WIP. * WIP. * WIP. * Fix. * Fix. * Fix. * Fix. * Fix. * WIP. * WIP. * Fix. * Test case fixes. * Test case fixes and LINT. * Test case fixes and LINT. * Rollback. * WIP. * WIP. * Test case fixes. * Fix. * Fix. * Fix. * Add regression test for DQN w/ param noise. * Fixes and LINT. * Fixes and LINT. * Fixes and LINT. * Fixes and LINT. * Fixes and LINT. * Comment * Regression test case. * WIP. * WIP. * LINT. * LINT. * WIP. * Fix. * Fix. * Fix. * LINT. * Fix (SAC does currently not support eager). * Fix. * WIP. * LINT. * Update rllib/evaluation/sampler.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Update rllib/evaluation/sampler.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Update rllib/utils/exploration/exploration.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Update rllib/utils/exploration/exploration.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * WIP. * WIP. * Fix. * LINT. * LINT. * Fix and LINT. * WIP. * WIP. * WIP. * WIP. * Fix. * LINT. * Fix. * Fix and LINT. * Update rllib/utils/exploration/exploration.py * Update rllib/policy/dynamic_tf_policy.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Update rllib/policy/dynamic_tf_policy.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Update rllib/policy/dynamic_tf_policy.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Fixes. * WIP. * LINT. * Fixes and LINT. * LINT and fixes. * LINT. * Move action_dist back into torch extra_action_out_fn and LINT. * Working SimpleQ learning cartpole on both torch AND tf. * Working Rainbow learning cartpole on tf. * Working Rainbow learning cartpole on tf. * WIP. * LINT. * LINT. * Update docs and add torch to APEX test. * LINT. * Fix. * LINT. * Fix. * Fix. * Fix and docstrings. * Fix broken RLlib tests in master. * Split BAZEL learning tests into cartpole and pendulum (reached the 60min barrier). * Fix error_outputs option in BAZEL for RLlib regression tests. * Fix. * Tune param-noise tests. * LINT. * Fix. * Fix. * test * test * test * Fix. * Fix. * WIP. * WIP. * WIP. * WIP. * LINT. * WIP. Co-authored-by: Eric Liang <ekhliang@gmail.com> 2020-04-06 20:56:16 +02:00			`class ParametricActionsModel(DistributionalQTFModel, TFModelV2):`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00			`"""Parametric action model that handles the dot product and masking.`

			`This assumes the outputs are logits for a single Categorical action dist.`
			`Getting this to work with a more complex output (e.g., if the action space`
			`is a tuple of several distributions) is also possible but left as an`
			`exercise to the reader.`
			`"""`

[rllib] Document ModelV2 and clean up the models/ directory (#5277) 2019-07-27 02:08:16 -07:00			`def __init__(self,`
			`obs_space,`
			`action_space,`
			`num_outputs,`
			`model_config,`
			`name,`
			`true_obs_shape=(4, ),`
			`action_embed_size=2,`
			`**kw):`
			`super(ParametricActionsModel, self).__init__(`
			`obs_space, action_space, num_outputs, model_config, name, **kw)`
			`self.action_embed_model = FullyConnectedNetwork(`
			`Box(-1, 1, shape=true_obs_shape), action_space, action_embed_size,`
			`model_config, name + "_action_embed")`
			`self.register_variables(self.action_embed_model.variables())`

			`def forward(self, input_dict, state, seq_lens):`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00			`# Extract the available actions tensor from the observation.`
			`avail_actions = input_dict["obs"]["avail_actions"]`
			`action_mask = input_dict["obs"]["action_mask"]`
[rllib] Document ModelV2 and clean up the models/ directory (#5277) 2019-07-27 02:08:16 -07:00
			`# Compute the predicted action embedding`
			`action_embed, _ = self.action_embed_model({`
			`"obs": input_dict["obs"]["cart"]`
			`})`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00
			`# Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the`
			`# avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE].`
[rllib] Document ModelV2 and clean up the models/ directory (#5277) 2019-07-27 02:08:16 -07:00			`intent_vector = tf.expand_dims(action_embed, 1)`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00
			`# Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].`
			`action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)`

			`# Mask out invalid actions (use tf.float32.min for stability)`
			`inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)`
[rllib] Document ModelV2 and clean up the models/ directory (#5277) 2019-07-27 02:08:16 -07:00			`return action_logits + inf_mask, state`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00
[rllib] Document ModelV2 and clean up the models/ directory (#5277) 2019-07-27 02:08:16 -07:00			`def value_function(self):`
			`return self.action_embed_model.value_function()`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00

			`if __name__ == "__main__":`
			`args = parser.parse_args()`
			`ray.init()`

			`ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)`
			`register_env("pa_cartpole", lambda _: ParametricActionCartpole(10))`
[rllib] Document ModelV2 and clean up the models/ directory (#5277) 2019-07-27 02:08:16 -07:00			`if args.run == "DQN":`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00			`cfg = {`
[rllib] Document ModelV2 and clean up the models/ directory (#5277) 2019-07-27 02:08:16 -07:00			`# TODO(ekl) we need to set these to prevent the masked values`
			`# from being further processed in DistributionalQModel, which`
			`# would mess up the masking. It is possible to support these if we`
			`# defined a a custom DistributionalQModel that is aware of masking.`
			`"hiddens": [],`
[rllib] ModelV2 API (#4926) 2019-07-03 15:59:47 -07:00			`"dueling": False,`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00			`}`
			`else:`
[rllib] Document ModelV2 and clean up the models/ directory (#5277) 2019-07-27 02:08:16 -07:00			`cfg = {}`
[rllib] Switch to tune.run() instead of run_experiments() (#4515) 2019-03-30 14:07:50 -07:00			`tune.run(`
			`args.run,`
			`stop={`
			`"episode_reward_mean": args.stop,`
			`},`
			`config=dict({`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00			`"env": "pa_cartpole",`
[rllib] Switch to tune.run() instead of run_experiments() (#4515) 2019-03-30 14:07:50 -07:00			`"model": {`
			`"custom_model": "pa_model",`
[rllib] example and docs on how to use parametric actions with DQN / PG algorithms (#3384) 2018-11-27 23:35:19 -08:00			`},`
[rllib] Switch to tune.run() instead of run_experiments() (#4515) 2019-03-30 14:07:50 -07:00			`"num_workers": 0,`
			`}, **cfg),`
			`)`