from functools import partial
import logging
import numpy as np
import gym
from typing import Dict, Tuple, List, Type, Union, Optional, Any

import ray
import ray.experimental.tf_utils
from ray.rllib.algorithms.ddpg.utils import make_ddpg_models, validate_spaces
from ray.rllib.algorithms.dqn.dqn_tf_policy import (
    postprocess_nstep_and_prio,
    PRIO_WEIGHTS,
)
from ray.rllib.evaluation import Episode
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_action_dist import (
    Deterministic,
    Dirichlet,
    TFActionDistribution,
)
from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2
from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2
from ray.rllib.utils.annotations import override
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy import TFPolicy
from ray.rllib.utils.framework import get_variable, try_import_tf
from ray.rllib.utils.spaces.simplex import Simplex
from ray.rllib.utils.tf_utils import huber_loss, make_tf_callable
from ray.rllib.utils.typing import (
    AlgorithmConfigDict,
    TensorType,
    LocalOptimizer,
    ModelGradients,
)
from ray.util.debug import log_once

tf1, tf, tfv = try_import_tf()

logger = logging.getLogger(__name__)


class ComputeTDErrorMixin:
    def __init__(self: Union[DynamicTFPolicyV2, EagerTFPolicyV2]):
        @make_tf_callable(self.get_session(), dynamic_shape=True)
        def compute_td_error(
            obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights
        ):
            input_dict = SampleBatch(
                {
                    SampleBatch.CUR_OBS: tf.convert_to_tensor(obs_t),
                    SampleBatch.ACTIONS: tf.convert_to_tensor(act_t),
                    SampleBatch.REWARDS: tf.convert_to_tensor(rew_t),
                    SampleBatch.NEXT_OBS: tf.convert_to_tensor(obs_tp1),
                    SampleBatch.DONES: tf.convert_to_tensor(done_mask),
                    PRIO_WEIGHTS: tf.convert_to_tensor(importance_weights),
                }
            )
            # Do forward pass on loss to update the TD-error attribute
            # (one TD-error value per item in batch to update PR weights).
            self.loss(self.model, None, input_dict)

            # `self.td_error` is set in the loss function.
            return self.td_error

        self.compute_td_error = compute_td_error


class TargetNetworkMixin:
    def __init__(self: Union[DynamicTFPolicyV2, EagerTFPolicyV2]):
        @make_tf_callable(self.get_session())
        def update_target_fn(tau):
            tau = tf.convert_to_tensor(tau, dtype=tf.float32)
            update_target_expr = []
            model_vars = self.model.trainable_variables()
            target_model_vars = self.target_model.trainable_variables()
            assert len(model_vars) == len(target_model_vars), (
                model_vars,
                target_model_vars,
            )
            for var, var_target in zip(model_vars, target_model_vars):
                update_target_expr.append(
                    var_target.assign(tau * var + (1.0 - tau) * var_target)
                )
                logger.debug("Update target op {}".format(var_target))
            return tf.group(*update_target_expr)

        self._do_update = update_target_fn
        # Hard initial update.
        self.update_target(tau=1.0)

    # Support both hard and soft sync.
    def update_target(
        self: Union[DynamicTFPolicyV2, EagerTFPolicyV2], tau: Optional[float] = None
    ) -> None:
        self._do_update(np.float32(tau or self.config.get("tau")))

    @override(TFPolicy)
    def variables(self: Union[DynamicTFPolicyV2, EagerTFPolicyV2]) -> List[TensorType]:
        return self.model.variables() + self.target_model.variables()


# We need this builder function because we want to share the same
# custom logic between TF1 dynamic and TF2 eager policies.
def get_ddpg_tf_policy(
    name: str, base: Type[Union[DynamicTFPolicyV2, EagerTFPolicyV2]]
) -> Type:
    """Construct a DDPGTFPolicy inheriting either dynamic or eager base policies.

    Args:
        name: The name to assign to the resulting policy class.
        base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2.

    Returns:
        A TF Policy to be used with DDPG.
    """

    class DDPGTFPolicy(TargetNetworkMixin, ComputeTDErrorMixin, base):
        def __init__(
            self,
            observation_space: gym.spaces.Space,
            action_space: gym.spaces.Space,
            config: AlgorithmConfigDict,
            *,
            existing_inputs: Optional[Dict[str, "tf1.placeholder"]] = None,
            existing_model: Optional[ModelV2] = None,
        ):
            # First things first: enable eager execution if necessary.
            base.enable_eager_execution_if_necessary()

            config = dict(
                ray.rllib.algorithms.ddpg.ddpg.DDPGConfig().to_dict(), **config
            )

            # Validate action space for DDPG.
            validate_spaces(self, observation_space, action_space)

            base.__init__(
                self,
                observation_space,
                action_space,
                config,
                existing_inputs=existing_inputs,
                existing_model=existing_model,
            )

            ComputeTDErrorMixin.__init__(self)

            self.maybe_initialize_optimizer_and_loss()

            TargetNetworkMixin.__init__(self)

        @override(base)
        def make_model(self) -> ModelV2:
            return make_ddpg_models(self)

        @override(base)
        def optimizer(
            self,
        ) -> List["tf.keras.optimizers.Optimizer"]:
            """Create separate optimizers for actor & critic losses."""
            if self.config["framework"] in ["tf2", "tfe"]:
                self.global_step = get_variable(0, tf_name="global_step")
                self._actor_optimizer = tf.keras.optimizers.Adam(
                    learning_rate=self.config["actor_lr"]
                )
                self._critic_optimizer = tf.keras.optimizers.Adam(
                    learning_rate=self.config["critic_lr"]
                )
            # Static graph mode.
            else:
                self.global_step = tf1.train.get_or_create_global_step()
                self._actor_optimizer = tf1.train.AdamOptimizer(
                    learning_rate=self.config["actor_lr"]
                )
                self._critic_optimizer = tf1.train.AdamOptimizer(
                    learning_rate=self.config["critic_lr"]
                )
            return [self._actor_optimizer, self._critic_optimizer]

        @override(base)
        def compute_gradients_fn(
            self, optimizer: LocalOptimizer, loss: TensorType
        ) -> ModelGradients:
            if self.config["framework"] in ["tf2", "tfe"]:
                tape = optimizer.tape
                pol_weights = self.model.policy_variables()
                actor_grads_and_vars = list(
                    zip(tape.gradient(self.actor_loss, pol_weights), pol_weights)
                )
                q_weights = self.model.q_variables()
                critic_grads_and_vars = list(
                    zip(tape.gradient(self.critic_loss, q_weights), q_weights)
                )
            else:
                actor_grads_and_vars = self._actor_optimizer.compute_gradients(
                    self.actor_loss, var_list=self.model.policy_variables()
                )
                critic_grads_and_vars = self._critic_optimizer.compute_gradients(
                    self.critic_loss, var_list=self.model.q_variables()
                )

            # Clip if necessary.
            if self.config["grad_clip"]:
                clip_func = partial(
                    tf.clip_by_norm, clip_norm=self.config["grad_clip"]
                )
            else:
                clip_func = tf.identity

            # Save grads and vars for later use in `build_apply_op`.
            self._actor_grads_and_vars = [
                (clip_func(g), v) for (g, v) in actor_grads_and_vars if g is not None
            ]
            self._critic_grads_and_vars = [
                (clip_func(g), v) for (g, v) in critic_grads_and_vars if g is not None
            ]

            grads_and_vars = self._actor_grads_and_vars + self._critic_grads_and_vars

            return grads_and_vars

        @override(base)
        def apply_gradients_fn(
            self,
            optimizer: "tf.keras.optimizers.Optimizer",
            grads: ModelGradients,
        ) -> "tf.Operation":
            # Delayed policy update: apply the actor optimizer only once
            # every `policy_delay` critic updates.
            should_apply_actor_opt = tf.equal(
                tf.math.floormod(self.global_step, self.config["policy_delay"]), 0
            )

            def make_apply_op():
                return self._actor_optimizer.apply_gradients(
                    self._actor_grads_and_vars
                )

            actor_op = tf.cond(
                should_apply_actor_opt,
                true_fn=make_apply_op,
                false_fn=lambda: tf.no_op(),
            )
            critic_op = self._critic_optimizer.apply_gradients(
                self._critic_grads_and_vars
            )

            # Increment global step & apply ops.
if self.config["framework"] in ["tf2", "tfe"]: self.global_step.assign_add(1) return tf.no_op() else: with tf1.control_dependencies([tf1.assign_add(self.global_step, 1)]): return tf.group(actor_op, critic_op) @override(base) def action_distribution_fn( self, model: ModelV2, *, obs_batch: TensorType, state_batches: TensorType, is_training: bool = False, **kwargs, ) -> Tuple[TensorType, type, List[TensorType]]: model_out, _ = model(SampleBatch(obs=obs_batch, _is_training=is_training)) dist_inputs = model.get_policy_output(model_out) if isinstance(self.action_space, Simplex): distr_class = Dirichlet else: distr_class = Deterministic return dist_inputs, distr_class, [] # []=state out @override(base) def postprocess_trajectory( self, sample_batch: SampleBatch, other_agent_batches: Optional[Dict[Any, SampleBatch]] = None, episode: Optional[Episode] = None, ) -> SampleBatch: return postprocess_nstep_and_prio( self, sample_batch, other_agent_batches, episode ) @override(base) def loss( self, model: Union[ModelV2, "tf.keras.Model"], dist_class: Type[TFActionDistribution], train_batch: SampleBatch, ) -> TensorType: twin_q = self.config["twin_q"] gamma = self.config["gamma"] n_step = self.config["n_step"] use_huber = self.config["use_huber"] huber_threshold = self.config["huber_threshold"] l2_reg = self.config["l2_reg"] input_dict = SampleBatch( obs=train_batch[SampleBatch.CUR_OBS], _is_training=True ) input_dict_next = SampleBatch( obs=train_batch[SampleBatch.NEXT_OBS], _is_training=True ) model_out_t, _ = model(input_dict, [], None) model_out_tp1, _ = model(input_dict_next, [], None) target_model_out_tp1, _ = self.target_model(input_dict_next, [], None) self.target_q_func_vars = self.target_model.variables() # Policy network evaluation. policy_t = model.get_policy_output(model_out_t) policy_tp1 = self.target_model.get_policy_output(target_model_out_tp1) # Action outputs. if self.config["smooth_target_policy"]: target_noise_clip = self.config["target_noise_clip"] clipped_normal_sample = tf.clip_by_value( tf.random.normal( tf.shape(policy_tp1), stddev=self.config["target_noise"] ), -target_noise_clip, target_noise_clip, ) policy_tp1_smoothed = tf.clip_by_value( policy_tp1 + clipped_normal_sample, self.action_space.low * tf.ones_like(policy_tp1), self.action_space.high * tf.ones_like(policy_tp1), ) else: # No smoothing, just use deterministic actions. policy_tp1_smoothed = policy_tp1 # Q-net(s) evaluation. # prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) # Q-values for given actions & observations in given current q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS]) # Q-values for current policy (no noise) in given current state q_t_det_policy = model.get_q_values(model_out_t, policy_t) if twin_q: twin_q_t = model.get_twin_q_values( model_out_t, train_batch[SampleBatch.ACTIONS] ) # Target q-net(s) evaluation. q_tp1 = self.target_model.get_q_values( target_model_out_tp1, policy_tp1_smoothed ) if twin_q: twin_q_tp1 = self.target_model.get_twin_q_values( target_model_out_tp1, policy_tp1_smoothed ) q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1) if twin_q: twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1) q_tp1 = tf.minimum(q_tp1, twin_q_tp1) q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1) q_tp1_best_masked = ( 1.0 - tf.cast(train_batch[SampleBatch.DONES], tf.float32) ) * q_tp1_best # Compute RHS of bellman equation. 
            q_t_selected_target = tf.stop_gradient(
                tf.cast(train_batch[SampleBatch.REWARDS], tf.float32)
                + gamma ** n_step * q_tp1_best_masked
            )

            # Compute the error (potentially clipped).
            if twin_q:
                td_error = q_t_selected - q_t_selected_target
                twin_td_error = twin_q_t_selected - q_t_selected_target
                if use_huber:
                    errors = huber_loss(td_error, huber_threshold) + huber_loss(
                        twin_td_error, huber_threshold
                    )
                else:
                    errors = 0.5 * tf.math.square(td_error) + 0.5 * tf.math.square(
                        twin_td_error
                    )
            else:
                td_error = q_t_selected - q_t_selected_target
                if use_huber:
                    errors = huber_loss(td_error, huber_threshold)
                else:
                    errors = 0.5 * tf.math.square(td_error)

            critic_loss = tf.reduce_mean(
                tf.cast(train_batch[PRIO_WEIGHTS], tf.float32) * errors
            )
            actor_loss = -tf.reduce_mean(q_t_det_policy)

            # Add l2-regularization if required.
            if l2_reg is not None:
                for var in self.model.policy_variables():
                    if "bias" not in var.name:
                        actor_loss += l2_reg * tf.nn.l2_loss(var)
                for var in self.model.q_variables():
                    if "bias" not in var.name:
                        critic_loss += l2_reg * tf.nn.l2_loss(var)

            # Model self-supervised losses.
            if self.config["use_state_preprocessor"]:
                # Expand input_dict in case `custom_loss` needs the extra fields.
                input_dict[SampleBatch.ACTIONS] = train_batch[SampleBatch.ACTIONS]
                input_dict[SampleBatch.REWARDS] = train_batch[SampleBatch.REWARDS]
                input_dict[SampleBatch.DONES] = train_batch[SampleBatch.DONES]
                input_dict[SampleBatch.NEXT_OBS] = train_batch[SampleBatch.NEXT_OBS]
                if log_once("ddpg_custom_loss"):
                    logger.warning(
                        "You are using a state-preprocessor with DDPG and "
                        "therefore, `custom_loss` will be called on your Model! "
                        "Please be aware that DDPG now uses the ModelV2 API, which "
                        "merges all previously separate sub-models (policy_model, "
                        "q_model, and twin_q_model) into one ModelV2, on which "
                        "`custom_loss` is called, passing it "
                        "[actor_loss, critic_loss] as 1st argument. "
                        "You may have to change your custom loss function to handle "
                        "this."
                    )
                [actor_loss, critic_loss] = model.custom_loss(
                    [actor_loss, critic_loss], input_dict
                )

            # Store values for stats function.
            self.actor_loss = actor_loss
            self.critic_loss = critic_loss
            self.td_error = td_error
            self.q_t = q_t

            # Return one loss value (even though we treat them separately in our
            # 2 optimizers: actor and critic).
            return self.critic_loss + self.actor_loss

        @override(base)
        def extra_learn_fetches_fn(self) -> Dict[str, Any]:
            return {"td_error": self.td_error}

        @override(base)
        def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
            stats = {
                "mean_q": tf.reduce_mean(self.q_t),
                "max_q": tf.reduce_max(self.q_t),
                "min_q": tf.reduce_min(self.q_t),
            }
            return stats

    DDPGTFPolicy.__name__ = name
    DDPGTFPolicy.__qualname__ = name

    return DDPGTFPolicy


DDPGTF1Policy = get_ddpg_tf_policy("DDPGTF1Policy", DynamicTFPolicyV2)
DDPGTF2Policy = get_ddpg_tf_policy("DDPGTF2Policy", EagerTFPolicyV2)
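

# The guarded block below is an illustrative usage sketch only (not part of the
# original module): it shows how these policies are usually reached indirectly,
# by building a DDPG Algorithm through RLlib's public `DDPGConfig` API. It
# assumes a working Ray/RLlib install with TensorFlow and a continuous-action
# env such as "Pendulum-v1"; the specific env and config values are examples,
# not requirements of this file.
if __name__ == "__main__":
    from ray.rllib.algorithms.ddpg import DDPGConfig

    config = (
        DDPGConfig()
        .environment("Pendulum-v1")
        # "tf2" selects DDPGTF2Policy (eager); "tf" would select DDPGTF1Policy.
        .framework("tf2")
    )
    algo = config.build()
    # The local worker's default policy should be the class produced by
    # `get_ddpg_tf_policy()` above.
    print(type(algo.get_policy()).__name__)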