ray/rllib/examples/models/action_mask_model.py

from gym.spaces import Dict

from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.torch_utils import FLOAT_MIN

tf1, tf, tfv = try_import_tf()
torch, nn = try_import_torch()


class ActionMaskModel(TFModelV2):
    """Model that handles simple discrete action masking.

    This assumes the outputs are logits for a single Categorical action dist.
    Getting this to work with a more complex output (e.g., if the action space
    is a tuple of several distributions) is also possible but left as an
    exercise to the reader.

    The (original) observation space must be a ``gym.spaces.Dict`` holding
    the keys ``"action_mask"`` and ``"observations"``. The ``"observations"``
    part is fed through an internal fully connected network; the
    ``"action_mask"`` part is turned into an additive mask on that network's
    logits.
    """

    def __init__(
        self, obs_space, action_space, num_outputs, model_config, name, **kwargs
    ):
        """Build the wrapped fully connected network and read the config.

        Args:
            obs_space: Observation space. Either it or its ``original_space``
                attribute must be a ``Dict`` space with the keys
                ``"action_mask"`` and ``"observations"``.
            action_space: Action space, passed on to the internal network.
            num_outputs: Number of logits, passed on to the internal network.
            model_config: Model config dict. The optional entry
                ``model_config["custom_model_config"]["no_masking"]``
                (default ``False``) turns masking off.
            name: Model name; the internal network is named
                ``name + "_internal"``.
            **kwargs: Accepted but not used by this class.

        Raises:
            AssertionError: If the observation space does not have the
                required structure.
        """

        orig_space = getattr(obs_space, "original_space", obs_space)
        assert (
            isinstance(orig_space, Dict)
            and "action_mask" in orig_space.spaces
            and "observations" in orig_space.spaces
        ), (
            "ActionMaskModel requires a gym.spaces.Dict observation space "
            "with the keys 'action_mask' and 'observations', but got: "
            f"{orig_space}"
        )

        super().__init__(obs_space, action_space, num_outputs, model_config, name)

        # Only the "observations" sub-space is seen by the wrapped network.
        self.internal_model = FullyConnectedNetwork(
            orig_space["observations"],
            action_space,
            num_outputs,
            model_config,
            name + "_internal",
        )

        # disable action masking --> will likely lead to invalid actions
        # Tolerate a missing (or None) "custom_model_config" entry so the
        # model can also be built from a minimal config dict.
        custom_config = model_config.get("custom_model_config") or {}
        self.no_masking = custom_config.get("no_masking", False)

    def forward(self, input_dict, state, seq_lens):
        """Compute (optionally masked) action logits.

        Args:
            input_dict: Must provide ``input_dict["obs"]["action_mask"]`` and
                ``input_dict["obs"]["observations"]``.
            state: Returned unchanged.
            seq_lens: Not used by this model.

        Returns:
            Tuple ``(logits, state)``. Unless ``no_masking`` is set, the
            element-wise log of the action mask (bounded from below by
            ``tf.float32.min``) is added to the logits.
        """
        # Extract the available actions tensor from the observation.
        action_mask = input_dict["obs"]["action_mask"]

        # Compute the unmasked logits.
        logits, _ = self.internal_model({"obs": input_dict["obs"]["observations"]})

        # If action masking is disabled, directly return unmasked logits
        if self.no_masking:
            return logits, state

        # Convert action_mask into a [0.0 || -inf]-type mask.
        # log(1) = 0 leaves a logit untouched; log(0) = -inf is clipped to the
        # most negative finite float32 so the sum below stays finite.
        # (Assumes mask entries are 0 or 1 -- confirm against the env.)
        inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
        masked_logits = logits + inf_mask

        # Return masked logits.
        return masked_logits, state

    def value_function(self):
        """Return the value function output of the internal network."""
        return self.internal_model.value_function()


class TorchActionMaskModel(TorchModelV2, nn.Module):
    """PyTorch version of ``ActionMaskModel``.

    The (original) observation space must be a ``gym.spaces.Dict`` holding
    the keys ``"action_mask"`` and ``"observations"``. The ``"observations"``
    part is fed through an internal fully connected network; the
    ``"action_mask"`` part is turned into an additive mask on that network's
    logits.
    """

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        **kwargs,
    ):
        """Build the wrapped fully connected network and read the config.

        Args:
            obs_space: Observation space. Either it or its ``original_space``
                attribute must be a ``Dict`` space with the keys
                ``"action_mask"`` and ``"observations"``.
            action_space: Action space, passed on to the internal network.
            num_outputs: Number of logits, passed on to the internal network.
            model_config: Model config dict. The optional entry
                ``model_config["custom_model_config"]["no_masking"]``
                (default ``False``) turns masking off.
            name: Model name; the internal network is named
                ``name + "_internal"``.
            **kwargs: Accepted but not used by this class.

        Raises:
            AssertionError: If the observation space does not have the
                required structure.
        """
        orig_space = getattr(obs_space, "original_space", obs_space)
        assert (
            isinstance(orig_space, Dict)
            and "action_mask" in orig_space.spaces
            and "observations" in orig_space.spaces
        ), (
            "TorchActionMaskModel requires a gym.spaces.Dict observation "
            "space with the keys 'action_mask' and 'observations', but got: "
            f"{orig_space}"
        )

        # NOTE: ``**kwargs`` is deliberately not forwarded, mirroring the TF
        # ``ActionMaskModel``. Per the RLlib API docs the base constructor
        # takes only these five arguments, so passing extra keyword
        # arguments on would raise a TypeError.
        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name
        )
        nn.Module.__init__(self)

        # Only the "observations" sub-space is seen by the wrapped network.
        self.internal_model = TorchFC(
            orig_space["observations"],
            action_space,
            num_outputs,
            model_config,
            name + "_internal",
        )

        # disable action masking --> will likely lead to invalid actions
        # Tolerate a missing (or None) "custom_model_config" entry so the
        # model can also be built from a minimal config dict.
        custom_config = model_config.get("custom_model_config") or {}
        self.no_masking = custom_config.get("no_masking", False)

    def forward(self, input_dict, state, seq_lens):
        """Compute (optionally masked) action logits.

        Args:
            input_dict: Must provide ``input_dict["obs"]["action_mask"]`` and
                ``input_dict["obs"]["observations"]``.
            state: Returned unchanged.
            seq_lens: Not used by this model.

        Returns:
            Tuple ``(logits, state)``. Unless ``no_masking`` is set, the
            element-wise log of the action mask (clamped from below to
            ``FLOAT_MIN``) is added to the logits.
        """
        # Extract the available actions tensor from the observation.
        action_mask = input_dict["obs"]["action_mask"]

        # Compute the unmasked logits.
        logits, _ = self.internal_model({"obs": input_dict["obs"]["observations"]})

        # If action masking is disabled, directly return unmasked logits
        if self.no_masking:
            return logits, state

        # Convert action_mask into a [0.0 || -inf]-type mask.
        # log(1) = 0 leaves a logit untouched; log(0) = -inf is clamped to
        # FLOAT_MIN so the sum below stays finite.
        # (Assumes mask entries are 0 or 1 -- confirm against the env.)
        inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)
        masked_logits = logits + inf_mask

        # Return masked logits.
        return masked_logits, state

    def value_function(self):
        """Return the value function output of the internal network."""
        return self.internal_model.value_function()
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00			`from gym.spaces import Dict`

			`from ray.rllib.models.tf.fcnet import FullyConnectedNetwork`
			`from ray.rllib.models.tf.tf_modelv2 import TFModelV2`
			`from ray.rllib.models.torch.torch_modelv2 import TorchModelV2`
			`from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC`
			`from ray.rllib.utils.framework import try_import_tf, try_import_torch`
[RLlib] Fix deprecated warning for torch_ops.py (soft-replaced by torch_utils.py). (#19982) 2021-11-03 10:00:46 +01:00			`from ray.rllib.utils.torch_utils import FLOAT_MIN`
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00
			`tf1, tf, tfv = try_import_tf()`
			`torch, nn = try_import_torch()`


			`class ActionMaskModel(TFModelV2):`
			`"""Model that handles simple discrete action masking.`

			`This assumes the outputs are logits for a single Categorical action dist.`
			`Getting this to work with a more complex output (e.g., if the action space`
			`is a tuple of several distributions) is also possible but left as an`
			`exercise to the reader.`
			`"""`

[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`def __init__(`
			`self, obs_space, action_space, num_outputs, model_config, name, **kwargs`
			`):`
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00
			`orig_space = getattr(obs_space, "original_space", obs_space)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`assert (`
			`isinstance(orig_space, Dict)`
			`and "action_mask" in orig_space.spaces`
			`and "observations" in orig_space.spaces`
			`)`
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`super().__init__(obs_space, action_space, num_outputs, model_config, name)`
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00
			`self.internal_model = FullyConnectedNetwork(`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`orig_space["observations"],`
			`action_space,`
			`num_outputs,`
			`model_config,`
			`name + "_internal",`
			`)`
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00
[RLlib] Document and extend action mask example. (#20390) Co-authored-by: Richard Liaw <rliaw@berkeley.edu> Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-11-16 13:20:41 +01:00			`# disable action masking --> will likely lead to invalid actions`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`self.no_masking = model_config["custom_model_config"].get("no_masking", False)`
[RLlib] Document and extend action mask example. (#20390) Co-authored-by: Richard Liaw <rliaw@berkeley.edu> Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-11-16 13:20:41 +01:00
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00			`def forward(self, input_dict, state, seq_lens):`
			`# Extract the available actions tensor from the observation.`
			`action_mask = input_dict["obs"]["action_mask"]`

			`# Compute the unmasked logits.`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`logits, _ = self.internal_model({"obs": input_dict["obs"]["observations"]})`
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00
[RLlib] Document and extend action mask example. (#20390) Co-authored-by: Richard Liaw <rliaw@berkeley.edu> Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-11-16 13:20:41 +01:00			`# If action masking is disabled, directly return unmasked logits`
			`if self.no_masking:`
			`return logits, state`

[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00			`# Convert action_mask into a [0.0 \|\| -inf]-type mask.`
			`inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)`
			`masked_logits = logits + inf_mask`

			`# Return masked logits.`
			`return masked_logits, state`

			`def value_function(self):`
			`return self.internal_model.value_function()`


			`class TorchActionMaskModel(TorchModelV2, nn.Module):`
			`"""PyTorch version of above ActionMaskingModel."""`

			`def __init__(`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`self,`
			`obs_space,`
			`action_space,`
			`num_outputs,`
			`model_config,`
			`name,`
			`**kwargs,`
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00			`):`
			`orig_space = getattr(obs_space, "original_space", obs_space)`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`assert (`
			`isinstance(orig_space, Dict)`
			`and "action_mask" in orig_space.spaces`
			`and "observations" in orig_space.spaces`
			`)`

			`TorchModelV2.__init__(`
			`self, obs_space, action_space, num_outputs, model_config, name, **kwargs`
			`)`
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00			`nn.Module.__init__(self)`

[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`self.internal_model = TorchFC(`
			`orig_space["observations"],`
			`action_space,`
			`num_outputs,`
			`model_config,`
			`name + "_internal",`
			`)`
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00
[RLlib] Document and extend action mask example. (#20390) Co-authored-by: Richard Liaw <rliaw@berkeley.edu> Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-11-16 13:20:41 +01:00			`# disable action masking --> will likely lead to invalid actions`
			`self.no_masking = False`
			`if "no_masking" in model_config["custom_model_config"]:`
			`self.no_masking = model_config["custom_model_config"]["no_masking"]`

[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00			`def forward(self, input_dict, state, seq_lens):`
			`# Extract the available actions tensor from the observation.`
			`action_mask = input_dict["obs"]["action_mask"]`

			`# Compute the unmasked logits.`
[CI] Format Python code with Black (#21975) See #21316 and #21311 for the motivation behind these changes. 2022-01-29 18:41:57 -08:00			`logits, _ = self.internal_model({"obs": input_dict["obs"]["observations"]})`
[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00
[RLlib] Document and extend action mask example. (#20390) Co-authored-by: Richard Liaw <rliaw@berkeley.edu> Co-authored-by: Sven Mika <sven@anyscale.io> Co-authored-by: sven1977 <svenmika1977@gmail.com> 2021-11-16 13:20:41 +01:00			`# If action masking is disabled, directly return unmasked logits`
			`if self.no_masking:`
			`return logits, state`

[RLlib] Add simple action-masking example script/env/model (tf and torch). (#18494) 2021-09-11 23:08:09 +02:00			`# Convert action_mask into a [0.0 \|\| -inf]-type mask.`
			`inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)`
			`masked_logits = logits + inf_mask`

			`# Return masked logits.`
			`return masked_logits, state`

			`def value_function(self):`
			`return self.internal_model.value_function()`