import logging

import numpy as np

import ray
from ray.rllib.agents.a3c.a3c_torch_policy import apply_grad_clipping
from ray.rllib.agents.ppo.ppo_tf_policy import postprocess_ppo_gae, \
    setup_config
from ray.rllib.evaluation.postprocessing import Postprocessing
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_policy import EntropyCoeffSchedule, \
    LearningRateSchedule
from ray.rllib.policy.torch_policy_template import build_torch_policy
from ray.rllib.policy.view_requirement import ViewRequirement
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.torch_ops import convert_to_torch_tensor, \
    explained_variance, sequence_mask

torch, nn = try_import_torch()

logger = logging.getLogger(__name__)


class PPOLoss:
    def __init__(self,
                 dist_class,
                 model,
                 value_targets,
                 advantages,
                 actions,
                 prev_logits,
                 prev_actions_logp,
                 vf_preds,
                 curr_action_dist,
                 value_fn,
                 cur_kl_coeff,
                 valid_mask,
                 entropy_coeff=0,
                 clip_param=0.1,
                 vf_clip_param=0.1,
                 vf_loss_coeff=1.0,
                 use_gae=True):
        """Constructs the loss for Proximal Policy Objective.

        Arguments:
            dist_class: Action distribution class for the policy's logits.
            model: The policy's model (passed through to the action
                distributions).
            value_targets (Tensor): Target values for the value function
                loss (computed during GAE postprocessing).
            advantages (Tensor): Advantages calculated from the previous
                model evaluation.
            actions (Tensor): Actions taken during the previous model
                evaluation (rollout).
            prev_logits (Tensor): Distribution inputs (logits) output by the
                previous model evaluation.
            prev_actions_logp (Tensor): Log-probs of the taken actions under
                the previous model evaluation.
            vf_preds (Tensor): Value function output from the previous model
                evaluation.
            curr_action_dist (ActionDistribution): ActionDistribution
                of the current model.
            value_fn (Tensor): Current value function output Tensor.
            cur_kl_coeff (float): Current value of the adaptive PPO KL
                coefficient.
            valid_mask (Tensor): A bool mask of valid input elements (#2992).
            entropy_coeff (float): Coefficient of the entropy regularizer.
            clip_param (float): Clip parameter for the surrogate objective.
            vf_clip_param (float): Clip parameter for the value function.
            vf_loss_coeff (float): Coefficient of the value function loss.
            use_gae (bool): If True, use the Generalized Advantage Estimator.
        """
        # If a valid mask is given (RNN zero-padding case), average only
        # over the valid (non-padded) timesteps.
        if valid_mask is not None:
            num_valid = torch.sum(valid_mask)

            def reduce_mean_valid(t):
                return torch.sum(t[valid_mask]) / num_valid

        else:
            reduce_mean_valid = torch.mean

        # Distribution under which the actions were originally sampled.
        prev_dist = dist_class(prev_logits, model)
        # Make loss functions.
        logp_ratio = torch.exp(
            curr_action_dist.logp(actions) - prev_actions_logp)
        action_kl = prev_dist.kl(curr_action_dist)
        self.mean_kl = reduce_mean_valid(action_kl)

        curr_entropy = curr_action_dist.entropy()
        self.mean_entropy = reduce_mean_valid(curr_entropy)

        surrogate_loss = torch.min(
            advantages * logp_ratio,
            advantages * torch.clamp(logp_ratio, 1 - clip_param,
                                     1 + clip_param))
        self.mean_policy_loss = reduce_mean_valid(-surrogate_loss)

        if use_gae:
            # Clipped value function loss: also compute the loss with the
            # value prediction clamped to within vf_clip_param of the
            # rollout-time prediction and take the worse (max) of the two.
            vf_loss1 = torch.pow(value_fn - value_targets, 2.0)
            vf_clipped = vf_preds + torch.clamp(value_fn - vf_preds,
                                                -vf_clip_param, vf_clip_param)
            vf_loss2 = torch.pow(vf_clipped - value_targets, 2.0)
            vf_loss = torch.max(vf_loss1, vf_loss2)
            self.mean_vf_loss = reduce_mean_valid(vf_loss)
            loss = reduce_mean_valid(
                -surrogate_loss + cur_kl_coeff * action_kl +
                vf_loss_coeff * vf_loss - entropy_coeff * curr_entropy)
        else:
            self.mean_vf_loss = 0.0
            loss = reduce_mean_valid(-surrogate_loss +
                                     cur_kl_coeff * action_kl -
                                     entropy_coeff * curr_entropy)
        self.loss = loss


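# Note on PPOLoss above: it implements the standard PPO clipped surrogate
# objective (Schulman et al., 2017). Roughly (notation illustrative, not
# copied from this file):
#     L = E[-min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)
#           + kl_coeff * KL(pi_old || pi)
#           - entropy_coeff * H(pi)]   (+ vf_loss_coeff * L_vf if use_gae),
# with r_t = pi(a_t|s_t) / pi_old(a_t|s_t) (logp_ratio above) and E[.] the
# (optionally masked) mean over the train batch.
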
def ppo_surrogate_loss(policy, model, dist_class, train_batch):
    logits, state = model.from_batch(train_batch, is_training=True)
    action_dist = dist_class(logits, model)

    mask = None
    if state:
        max_seq_len = torch.max(train_batch["seq_lens"])
        mask = sequence_mask(
            train_batch["seq_lens"],
            max_seq_len,
            time_major=model.is_time_major())
        mask = torch.reshape(mask, [-1])

    policy.loss_obj = PPOLoss(
        dist_class,
        model,
        train_batch[Postprocessing.VALUE_TARGETS],
        train_batch[Postprocessing.ADVANTAGES],
        train_batch[SampleBatch.ACTIONS],
        train_batch[SampleBatch.ACTION_DIST_INPUTS],
        train_batch[SampleBatch.ACTION_LOGP],
        train_batch[SampleBatch.VF_PREDS],
        action_dist,
        model.value_function(),
        policy.kl_coeff,
        mask,
        entropy_coeff=policy.entropy_coeff,
        clip_param=policy.config["clip_param"],
        vf_clip_param=policy.config["vf_clip_param"],
        vf_loss_coeff=policy.config["vf_loss_coeff"],
        use_gae=policy.config["use_gae"],
    )

    return policy.loss_obj.loss


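# Note on the mask used in ppo_surrogate_loss() above: for recurrent models,
# train batches are zero-padded to the longest sequence, so a boolean mask is
# built from "seq_lens" and PPOLoss only averages over valid timesteps.
# Illustrative sketch (assuming tf.sequence_mask-like semantics of
# ray.rllib.utils.torch_ops.sequence_mask):
#     sequence_mask(torch.tensor([2, 3]), maxlen=3)
#     # -> [[True, True, False],
#     #     [True, True, True]]
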
def kl_and_loss_stats(policy, train_batch):
    return {
        "cur_kl_coeff": policy.kl_coeff,
        "cur_lr": policy.cur_lr,
        "total_loss": policy.loss_obj.loss,
        "policy_loss": policy.loss_obj.mean_policy_loss,
        "vf_loss": policy.loss_obj.mean_vf_loss,
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function()),
        "kl": policy.loss_obj.mean_kl,
        "entropy": policy.loss_obj.mean_entropy,
        "entropy_coeff": policy.entropy_coeff,
    }


def vf_preds_fetches(policy, input_dict, state_batches, model, action_dist):
    """Adds value function outputs to experience train_batches."""
    return {
        SampleBatch.VF_PREDS: policy.model.value_function(),
    }


class KLCoeffMixin:
    def __init__(self, config):
        # KL Coefficient.
        self.kl_coeff = config["kl_coeff"]
        self.kl_target = config["kl_target"]

    def update_kl(self, sampled_kl):
        if sampled_kl > 2.0 * self.kl_target:
            self.kl_coeff *= 1.5
        elif sampled_kl < 0.5 * self.kl_target:
            self.kl_coeff *= 0.5
        return self.kl_coeff


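# Note on KLCoeffMixin above: the coefficient of the KL penalty term is
# adapted between training iterations, based on the mean KL measured on the
# last train batch (reported as "kl" in kl_and_loss_stats()).
# Illustrative trace (assuming kl_target=0.01):
#     update_kl(0.030)  # > 2.0 * kl_target  -> kl_coeff *= 1.5
#     update_kl(0.002)  # < 0.5 * kl_target  -> kl_coeff *= 0.5
#     update_kl(0.010)  # otherwise          -> kl_coeff unchanged
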
class ValueNetworkMixin:
    def __init__(self, obs_space, action_space, config):
        if config["use_gae"]:
            # When doing GAE, we need the value function estimate on the
            # final observation of a rollout for bootstrapping
            # (see postprocess_ppo_gae).

            def value(ob, prev_action, prev_reward, *state):
                model_out, _ = self.model({
                    SampleBatch.CUR_OBS: convert_to_torch_tensor(
                        np.asarray([ob]), self.device),
                    SampleBatch.PREV_ACTIONS: convert_to_torch_tensor(
                        np.asarray([prev_action]), self.device),
                    SampleBatch.PREV_REWARDS: convert_to_torch_tensor(
                        np.asarray([prev_reward]), self.device),
                    "is_training": False,
                }, [
                    convert_to_torch_tensor(np.asarray([s]), self.device)
                    for s in state
                ], convert_to_torch_tensor(np.asarray([1]), self.device))
                return self.model.value_function()[0]

        else:
            # When not using GAE, the value estimate is not required;
            # return a dummy 0.0.

            def value(ob, prev_action, prev_reward, *state):
                return 0.0

        self._value = value


def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    KLCoeffMixin.__init__(policy, config)
    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                  config["entropy_coeff_schedule"])
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


def training_view_requirements_fn(policy):
    return {
        # Next obs are needed for PPO postprocessing.
        SampleBatch.NEXT_OBS: ViewRequirement(SampleBatch.OBS, shift=1),
        # VF preds are needed for the loss.
        SampleBatch.VF_PREDS: ViewRequirement(shift=0),
        # Needed for postprocessing.
        SampleBatch.ACTION_DIST_INPUTS: ViewRequirement(shift=0),
        SampleBatch.ACTION_LOGP: ViewRequirement(shift=0),
        # Created during postprocessing.
        Postprocessing.ADVANTAGES: ViewRequirement(shift=0),
        Postprocessing.VALUE_TARGETS: ViewRequirement(shift=0),
    }


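# Note on training_view_requirements_fn() above: a ViewRequirement declares
# which (possibly time-shifted) batch columns the policy needs at training
# time. shift=0 refers to the current timestep, while NEXT_OBS is declared as
# the OBS column shifted by +1 (the next timestep's observation).
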
PPOTorchPolicy = build_torch_policy(
    name="PPOTorchPolicy",
    get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
    loss_fn=ppo_surrogate_loss,
    stats_fn=kl_and_loss_stats,
    extra_action_out_fn=vf_preds_fetches,
    postprocess_fn=postprocess_ppo_gae,
    extra_grad_process_fn=apply_grad_clipping,
    before_init=setup_config,
    after_init=setup_mixins,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin
    ],
    training_view_requirements_fn=training_view_requirements_fn,
)
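
# Illustrative usage (a sketch, not part of this module): PPOTorchPolicy is
# normally not instantiated directly; it is selected by the PPO trainer when
# the torch framework is configured, e.g.:
#
#     from ray.rllib.agents.ppo import PPOTrainer
#     trainer = PPOTrainer(env="CartPole-v0", config={"framework": "torch"})
#     results = trainer.train()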