mirror of
https://github.com/vale981/ray
synced 2025-03-07 02:51:39 -05:00

* Avoid warning about swap being unlimited. Currently we get the following message on Jenkins: "Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap." Since we're not limiting swap anyway, we might as well avoid trying to. https://docs.docker.com/config/containers/resource_constraints/#--memory-swap-details
* Fix escaping in re.search()
* Fix escaping in _noisy_layer()
* Raise a more descriptive error when dashboard data isn't found
* Don't error on dashboard files not being found when webui isn't required
* Change dashboard error to a warning instead
274 lines
11 KiB
Python
import numpy as np

from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils import try_import_tf

tf = try_import_tf()


class DistributionalQModel(TFModelV2):
    """Extension of standard TFModel to provide distributional Q values.

    It also supports options for noisy nets and parameter space noise.

    Data flow:
        obs -> forward() -> model_out
        model_out -> get_q_value_distributions() -> Q(s, a) atoms
        model_out -> get_state_value() -> V(s)

    Note that this class by itself is not a valid model unless you
    implement forward() in a subclass."""
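
    # A minimal subclassing sketch (an illustrative assumption, not part of
    # the original file; `MyQModel` and its layers are made up). forward()
    # must map observations to a `model_out` embedding of size num_outputs:
    #
    #     class MyQModel(DistributionalQModel):
    #         def __init__(self, *args, **kwargs):
    #             super(MyQModel, self).__init__(*args, **kwargs)
    #             self.hidden = tf.keras.layers.Dense(
    #                 256, activation=tf.nn.relu)
    #             self.out = tf.keras.layers.Dense(self.num_outputs)
    #
    #         def forward(self, input_dict, state, seq_lens):
    #             x = self.hidden(input_dict["obs"])
    #             return self.out(x), state
    #
    # Variables created by such forward-path layers would also need to be
    # registered via self.register_variables(), as the Q heads below are.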

    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 q_hiddens=(256, ),
                 dueling=False,
                 num_atoms=1,
                 use_noisy=False,
                 v_min=-10.0,
                 v_max=10.0,
                 sigma0=0.5,
                 parameter_noise=False):
        """Initialize variables of this model.

        Extra model kwargs:
            q_hiddens (list): defines size of hidden layers for the q head.
                These will be used to postprocess the model output for the
                purposes of computing Q values.
            dueling (bool): whether to build the state value head for DDQN
            num_atoms (int): if >1, enables distributional DQN
            use_noisy (bool): use noisy nets
            v_min (float): min value support for distributional DQN
            v_max (float): max value support for distributional DQN
            sigma0 (float): initial value of noisy nets
            parameter_noise (bool): enable layer norm for param noise

        Note that the core layers for forward() are not defined here; this
        only defines the layers for the Q head. Those layers for forward()
        should be defined in subclasses of DistributionalQModel.
        """

        super(DistributionalQModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name)

        # setup the Q head output (i.e., model for get_q_values)
        self.model_out = tf.keras.layers.Input(
            shape=(num_outputs, ), name="model_out")

        def build_action_value(model_out):
            if q_hiddens:
                action_out = model_out
                for i in range(len(q_hiddens)):
                    if use_noisy:
                        action_out = self._noisy_layer(
                            "hidden_%d" % i, action_out, q_hiddens[i], sigma0)
                    elif parameter_noise:
                        # Keras Dense takes no activation_fn/normalizer_fn
                        # kwargs, so apply layer norm as a separate layer.
                        action_out = tf.keras.layers.Dense(
                            units=q_hiddens[i],
                            activation=tf.nn.relu,
                            name="hidden_%d" % i)(action_out)
                        action_out = tf.keras.layers.LayerNormalization()(
                            action_out)
                    else:
                        action_out = tf.keras.layers.Dense(
                            units=q_hiddens[i],
                            activation=tf.nn.relu,
                            name="hidden_%d" % i)(action_out)
            else:
                # Avoid postprocessing the outputs. This enables custom models
                # to be used for parametric action DQN.
                action_out = model_out
            if use_noisy:
                action_scores = self._noisy_layer(
                    "output",
                    action_out,
                    self.action_space.n * num_atoms,
                    sigma0,
                    non_linear=False)
            elif q_hiddens:
                action_scores = tf.keras.layers.Dense(
                    units=self.action_space.n * num_atoms,
                    activation=None)(action_out)
            else:
                action_scores = model_out
            if num_atoms > 1:
                # Distributional Q-learning uses a discrete support z
                # to represent the action value distribution
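                # Worked example of the support (plain arithmetic): with
                # num_atoms=5, v_min=-10.0, v_max=10.0, the code below yields
                # z = [-10.0, -5.0, 0.0, 5.0, 10.0]; each Q value is then the
                # expectation of z under the per-action softmax distribution.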
                z = tf.range(num_atoms, dtype=tf.float32)
                z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
                support_logits_per_action = tf.reshape(
                    tensor=action_scores,
                    shape=(-1, self.action_space.n, num_atoms))
                support_prob_per_action = tf.nn.softmax(
                    logits=support_logits_per_action)
                action_scores = tf.reduce_sum(
                    input_tensor=z * support_prob_per_action, axis=-1)
                logits = support_logits_per_action
                dist = support_prob_per_action
                return [
                    action_scores, z, support_logits_per_action, logits, dist
                ]
            else:
                logits = tf.expand_dims(tf.ones_like(action_scores), -1)
                dist = tf.expand_dims(tf.ones_like(action_scores), -1)
                return [action_scores, logits, dist]
        def build_state_score(model_out):
            state_out = model_out
            for i in range(len(q_hiddens)):
                if use_noisy:
                    state_out = self._noisy_layer("dueling_hidden_%d" % i,
                                                  state_out, q_hiddens[i],
                                                  sigma0)
                elif parameter_noise:
                    # Same fix as in build_action_value: Dense followed by
                    # an explicit layer-norm layer (tf.contrib is unavailable
                    # and Keras Dense takes no normalizer_fn kwarg).
                    state_out = tf.keras.layers.Dense(
                        units=q_hiddens[i],
                        activation=tf.nn.relu)(state_out)
                    state_out = tf.keras.layers.LayerNormalization()(
                        state_out)
                else:
                    state_out = tf.keras.layers.Dense(
                        units=q_hiddens[i], activation=tf.nn.relu)(state_out)
            if use_noisy:
                state_score = self._noisy_layer(
                    "dueling_output",
                    state_out,
                    num_atoms,
                    sigma0,
                    non_linear=False)
            else:
                state_score = tf.keras.layers.Dense(
                    units=num_atoms, activation=None)(state_out)
            return state_score
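
        # Note: only the V(s) head is built here. Combining it with the
        # action-value stream (e.g., the dueling aggregation
        # Q = V + A - mean(A)) is expected to happen in the policy that
        # consumes these heads, not in this model.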

        if tf.executing_eagerly():
            from tensorflow.python.ops import variable_scope

            # Have to use a variable store to reuse variables in eager mode
            store = variable_scope.EagerVariableStore()

            # Save the scope objects, since in eager we will execute this
            # path repeatedly and there is no guarantee it will always be run
            # in the same original scope.
            with tf.variable_scope(name + "/action_value") as action_scope:
                pass
            with tf.variable_scope(name + "/state_value") as state_scope:
                pass

            def build_action_value_in_scope(model_out):
                with store.as_default():
                    with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE):
                        return build_action_value(model_out)

            def build_state_score_in_scope(model_out):
                with store.as_default():
                    with tf.variable_scope(state_scope, reuse=tf.AUTO_REUSE):
                        return build_state_score(model_out)
        else:

            def build_action_value_in_scope(model_out):
                with tf.variable_scope(
                        name + "/action_value", reuse=tf.AUTO_REUSE):
                    return build_action_value(model_out)

            def build_state_score_in_scope(model_out):
                with tf.variable_scope(
                        name + "/state_value", reuse=tf.AUTO_REUSE):
                    return build_state_score(model_out)

        q_out = build_action_value_in_scope(self.model_out)
        self.q_value_head = tf.keras.Model(self.model_out, q_out)
        self.register_variables(self.q_value_head.variables)

        if dueling:
            state_out = build_state_score_in_scope(self.model_out)
            self.state_value_head = tf.keras.Model(self.model_out, state_out)
            self.register_variables(self.state_value_head.variables)

    def get_q_value_distributions(self, model_out):
        """Returns distributional values for Q(s, a) given a state embedding.

        Override this in your custom model to customize the Q output head.

        Arguments:
            model_out (Tensor): embedding from the model layers

        Returns:
            (action_scores, logits, dist) if num_atoms == 1, otherwise
            (action_scores, z, support_logits_per_action, logits, dist)
        """
        return self.q_value_head(model_out)

    def get_state_value(self, model_out):
        """Returns the state value prediction for the given state embedding."""
        return self.state_value_head(model_out)

    def _noisy_layer(self, prefix, action_in, out_size, sigma0,
                     non_linear=True):
        """A noisy dense layer.

        A common dense layer computes y = w^T x + b. A noisy layer instead
        computes

            y = (w + epsilon_w * sigma_w)^T x + (b + epsilon_b * sigma_b),

        where the epsilon terms are random variables sampled from factorized
        normal distributions and the sigma terms are trainable variables
        that are expected to vanish over the course of training.
        """
        in_size = int(action_in.shape[1])

        epsilon_in = tf.random_normal(shape=[in_size])
        epsilon_out = tf.random_normal(shape=[out_size])
        epsilon_in = self._f_epsilon(epsilon_in)
        epsilon_out = self._f_epsilon(epsilon_out)
        epsilon_w = tf.matmul(
            a=tf.expand_dims(epsilon_in, -1),
            b=tf.expand_dims(epsilon_out, 0))
        epsilon_b = epsilon_out

        sigma_w = tf.get_variable(
            name=prefix + "_sigma_w",
            shape=[in_size, out_size],
            dtype=tf.float32,
            initializer=tf.random_uniform_initializer(
                minval=-1.0 / np.sqrt(float(in_size)),
                maxval=1.0 / np.sqrt(float(in_size))))
        # TF noise generation can be unreliable on GPU.
        # If generating the noise on the CPU,
        # lowering sigma0 to 0.1 may be helpful.
        sigma_b = tf.get_variable(
            name=prefix + "_sigma_b",
            shape=[out_size],
            dtype=tf.float32,  # 0.5 ~ GPU, 0.1 ~ CPU
            initializer=tf.constant_initializer(
                sigma0 / np.sqrt(float(in_size))))

        w = tf.get_variable(
            name=prefix + "_fc_w",
            shape=[in_size, out_size],
            dtype=tf.float32,
            initializer=tf.glorot_uniform_initializer())
        b = tf.get_variable(
            name=prefix + "_fc_b",
            shape=[out_size],
            dtype=tf.float32,
            initializer=tf.zeros_initializer())

        action_activation = tf.nn.xw_plus_b(
            action_in, w + sigma_w * epsilon_w, b + sigma_b * epsilon_b)

        if not non_linear:
            return action_activation
        return tf.nn.relu(action_activation)

    def _f_epsilon(self, x):
        # Noise-scaling transform used for factorized noise:
        # f(x) = sign(x) * sqrt(|x|).
        return tf.sign(x) * tf.sqrt(tf.abs(x))
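

# Example of the _f_epsilon transform above (plain arithmetic, not an API
# claim): for x = [-4.0, 0.25, 9.0], sign(x) * sqrt(|x|) yields
# [-2.0, 0.5, 3.0]; the sign of each sample is preserved while its
# magnitude is pulled toward 1.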