import gym
import numpy as np

from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.utils.annotations import DeveloperAPI
from ray.rllib.utils.typing import TensorType, List, Union, ModelConfigDict


@DeveloperAPI
class ActionDistribution:
    """The policy action distribution of an agent.

    Attributes:
        inputs: Input vector to compute samples from.
        model: Reference to the ModelV2 that produced the inputs.
    """

    @DeveloperAPI
    def __init__(self, inputs: List[TensorType], model: ModelV2):
        """Initializes an ActionDistribution instance.

        Args:
            inputs: Input vector to compute samples from.
            model: Reference to the ModelV2 that produced the inputs. This
                is mainly useful if you want to use model variables to
                compute action outputs (e.g., for auto-regressive action
                distributions; see examples/autoregressive_action_dist.py).
        """
        self.inputs = inputs
        self.model = model

    @DeveloperAPI
    def sample(self) -> TensorType:
        """Draws a sample from the action distribution."""
        raise NotImplementedError

    @DeveloperAPI
    def deterministic_sample(self) -> TensorType:
        """Returns the deterministic "sampling" output from the distribution.

        This is usually the maximum-likelihood output, i.e. the mean for a
        Normal distribution or the argmax for a Categorical distribution.
        """
        raise NotImplementedError
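
    # E.g., a Categorical implementation would typically return the argmax
    # over self.inputs here, and a (diagonal) Gaussian its mean; see the
    # illustrative sketch at the bottom of this file.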

    @DeveloperAPI
    def sampled_action_logp(self) -> TensorType:
        """Returns the log probability of the last sampled action."""
        raise NotImplementedError

    @DeveloperAPI
    def logp(self, x: TensorType) -> TensorType:
        """The log-likelihood of the action distribution, evaluated at x."""
        raise NotImplementedError

    @DeveloperAPI
    def kl(self, other: "ActionDistribution") -> TensorType:
        """The KL-divergence between this and another action distribution."""
        raise NotImplementedError
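
    # For two Categorical distributions with probabilities p and q, this is
    # sum_i p_i * (log p_i - log q_i); for diagonal Gaussians it has the
    # usual closed form in terms of the means and standard deviations.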

    @DeveloperAPI
    def entropy(self) -> TensorType:
        """The entropy of the action distribution."""
        raise NotImplementedError

    def multi_kl(self, other: "ActionDistribution") -> TensorType:
        """The KL-divergence between two action distributions.

        This differs from kl() in that it can return an array for
        MultiDiscrete. TODO(ekl): consider removing this.
        """
        return self.kl(other)

    def multi_entropy(self) -> TensorType:
        """The entropy of the action distribution.

        This differs from entropy() in that it can return an array for
        MultiDiscrete. TODO(ekl): consider removing this.
        """
        return self.entropy()
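
    # For a MultiDiscrete action space with k sub-actions, multi_entropy()
    # and multi_kl() may return one value per sub-action (e.g. shape [B, k])
    # instead of a single reduced value per batch row.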

    @staticmethod
    @DeveloperAPI
    def required_model_output_shape(
        action_space: gym.Space, model_config: ModelConfigDict
    ) -> Union[int, np.ndarray]:
        """Returns the required shape of an input parameter tensor for a
        particular action space and an optional dict of distribution-specific
        options.

        Args:
            action_space: The action space this distribution will be used
                for, whose shape attributes will be used to determine the
                required shape of the input parameter tensor.
            model_config: Model's config dict (as defined in catalog.py).

        Returns:
            model_output_shape (int or np.ndarray of ints): size of the
                required input vector (minus leading batch dimension).
        """
        raise NotImplementedError
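

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the RLlib API): a minimal concrete
# subclass for a Discrete(n) action space, using plain numpy instead of
# framework tensors. The name SimpleCategorical and all numpy-only math
# below are assumptions for demonstration; real RLlib distributions are
# implemented per framework (tf/torch) in rllib/models.
# ---------------------------------------------------------------------------
class SimpleCategorical(ActionDistribution):
    """Categorical distribution that interprets `inputs` as logits."""

    def _probs(self) -> np.ndarray:
        # Numerically stable softmax over the last axis.
        logits = np.asarray(self.inputs, dtype=np.float64)
        z = np.exp(logits - logits.max(axis=-1, keepdims=True))
        return z / z.sum(axis=-1, keepdims=True)

    def sample(self) -> TensorType:
        # Draw one action index per batch row according to the softmax probs.
        probs = np.atleast_2d(self._probs())
        self.last_sample = np.array(
            [np.random.choice(len(p), p=p) for p in probs]
        )
        return self.last_sample

    def deterministic_sample(self) -> TensorType:
        # Max-likelihood "sample": the argmax over the logits.
        self.last_sample = np.argmax(np.atleast_2d(self.inputs), axis=-1)
        return self.last_sample

    def sampled_action_logp(self) -> TensorType:
        # Log-prob of whatever sample() / deterministic_sample() returned.
        return self.logp(self.last_sample)

    def logp(self, x: TensorType) -> TensorType:
        probs = np.atleast_2d(self._probs())
        return np.log(probs[np.arange(probs.shape[0]), np.asarray(x)])

    def entropy(self) -> TensorType:
        probs = np.atleast_2d(self._probs())
        return -np.sum(probs * np.log(probs), axis=-1)

    def kl(self, other: "ActionDistribution") -> TensorType:
        # Assumes `other` is also a SimpleCategorical over the same space.
        p, q = np.atleast_2d(self._probs()), np.atleast_2d(other._probs())
        return np.sum(p * (np.log(p) - np.log(q)), axis=-1)

    @staticmethod
    def required_model_output_shape(
        action_space: gym.Space, model_config: ModelConfigDict
    ) -> Union[int, np.ndarray]:
        # One logit per discrete action: Discrete(n) -> n model outputs.
        return action_space.n


# Usage sketch: a single batch row of 3 logits for a Discrete(3) space.
# `model` is unused by this toy class, so None is passed in; a real
# distribution would receive the ModelV2 that produced the logits.
#
#     dist = SimpleCategorical(np.array([[1.0, 2.0, 0.5]]), model=None)
#     a = dist.sample()                    # stochastic action index
#     a_det = dist.deterministic_sample()  # argmax action index
#     lp = dist.sampled_action_logp()      # log-prob of the last sample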