# ray/rllib/examples/compute_adapted_gae_on_postprocess_trajectory.py

"""
Adapted (time-dependent) GAE for PPO algorithm can be activated by setting
use_adapted_gae=True in the policy config. Additionally, it is required that
"callbacks" include the custom callback class in the Trainer's config.
Furthermore, the env must return in its info dictionary a key-value pair of
the form "d_ts": ... where the value is the length (time) of recent agent step.
This adapted, time-dependent computation of advantages may be useful in cases
where agent's actions take various times and thus time steps are not
equidistant (https://docdro.id/400TvlR)
"""
import numpy as np

from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.evaluation.postprocessing import Postprocessing
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.annotations import override


class MyCallbacks(DefaultCallbacks):
    @override(DefaultCallbacks)
    def on_postprocess_trajectory(
        self,
        *,
        worker,
        episode,
        agent_id,
        policy_id,
        policies,
        postprocessed_batch,
        original_batches,
        **kwargs
    ):
        super().on_postprocess_trajectory(
            worker=worker,
            episode=episode,
            agent_id=agent_id,
            policy_id=policy_id,
            policies=policies,
            postprocessed_batch=postprocessed_batch,
            original_batches=original_batches,
            **kwargs
        )

        if policies[policy_id].config.get("use_adapted_gae", False):
            policy = policies[policy_id]
            assert policy.config[
                "use_gae"
            ], "Can't use adapted GAE without use_gae=True!"

            # Each info dict must carry the length of its time step under the
            # key "d_ts" (= ts[i+1] - ts[i]).
            info_dicts = postprocessed_batch[SampleBatch.INFOS]
            assert np.all(["d_ts" in info_dict for info_dict in info_dicts]), (
                "Info dicts in sample batch must contain data 'd_ts' "
                "(=ts[i+1]-ts[i], length of time steps)!"
            )
            d_ts = np.array(
                [float(info_dict.get("d_ts")) for info_dict in info_dicts]
            )
            assert np.all(
                [e.is_integer() for e in d_ts]
            ), "Elements of 'd_ts' (length of time steps) must be integer!"

            # Trajectory is actually complete -> last r=0.0.
            if postprocessed_batch[SampleBatch.DONES][-1]:
                last_r = 0.0
            # Trajectory has been truncated -> last r=VF estimate of last obs.
            else:
                # Input dict is provided to us automatically via the Model's
                # requirements. It's a single-timestep (last one in trajectory)
                # input_dict.
                # Create an input dict according to the Model's requirements.
                input_dict = postprocessed_batch.get_single_step_input_dict(
                    policy.model.view_requirements, index="last"
                )
                last_r = policy._value(**input_dict)

            gamma = policy.config["gamma"]
            lambda_ = policy.config["lambda"]

            # One-step TD residuals, with the bootstrap term discounted by
            # gamma raised to the length of the respective time step.
            vpred_t = np.concatenate(
                [postprocessed_batch[SampleBatch.VF_PREDS], np.array([last_r])]
            )
            delta_t = (
                postprocessed_batch[SampleBatch.REWARDS]
                + gamma ** d_ts * vpred_t[1:]
                - vpred_t[:-1]
            )
            # This formula for the advantage is an adaptation of
            # "Generalized Advantage Estimation"
            # (https://arxiv.org/abs/1506.02438) which accounts for time steps
            # of irregular length (see proposal: https://docdro.id/400TvlR).
            # NOTE: The last time step's delta is not required.
            postprocessed_batch[
                Postprocessing.ADVANTAGES
            ] = generalized_discount_cumsum(delta_t, d_ts[:-1], gamma * lambda_)

            postprocessed_batch[Postprocessing.VALUE_TARGETS] = (
                postprocessed_batch[Postprocessing.ADVANTAGES]
                + postprocessed_batch[SampleBatch.VF_PREDS]
            ).astype(np.float32)
            postprocessed_batch[Postprocessing.ADVANTAGES] = postprocessed_batch[
                Postprocessing.ADVANTAGES
            ].astype(np.float32)
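

# A minimal, hypothetical sketch of the config entries that the module
# docstring and `MyCallbacks` above rely on. The env id "TimeAwareEnv-v0" is
# made up and stands for any env whose info dicts contain a "d_ts" entry; how
# the extra "use_adapted_gae" key reaches the policy config depends on your
# PPO setup. Nothing is built or trained here.
EXAMPLE_PPO_CONFIG = {
    "env": "TimeAwareEnv-v0",  # Hypothetical env emitting info["d_ts"].
    "callbacks": MyCallbacks,  # Hooks the adapted-GAE postprocessing above.
    "use_gae": True,  # Adapted GAE requires regular GAE to be enabled.
    "use_adapted_gae": True,  # Custom flag checked by MyCallbacks.
    "gamma": 0.99,  # Read by MyCallbacks as policy.config["gamma"].
    "lambda": 0.95,  # Read by MyCallbacks as policy.config["lambda"].
}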


def generalized_discount_cumsum(
    x: np.ndarray, deltas: np.ndarray, gamma: float
) -> np.ndarray:
    """Calculates the 'time-dependent' discounted cumulative sum over a
    (reward) sequence `x`.

    Recursive equations:
        y[t] - gamma**deltas[t]*y[t+1] = x[t]
        reversed(y)[t] - gamma**reversed(deltas)[t-1]*reversed(y)[t-1] =
            reversed(x)[t]

    Args:
        x (np.ndarray): A sequence of rewards or one-step TD residuals.
        deltas (np.ndarray): A sequence of time step deltas (lengths of the
            time steps); must contain at least `len(x) - 1` elements.
        gamma (float): The discount factor gamma.

    Returns:
        np.ndarray: The sequence containing the 'time-dependent' discounted
            cumulative sums for each individual element in `x` till the end
            of the trajectory.

    Examples:
        >>> x = np.array([0.0, 1.0, 2.0, 3.0])
        >>> deltas = np.array([1.0, 4.0, 15.0])
        >>> gamma = 0.9
        >>> generalized_discount_cumsum(x, deltas, gamma)
        ... array([0.0 + 0.9^1.0*1.0 + 0.9^(1.0+4.0)*2.0 + 0.9^(1.0+4.0+15.0)*3.0,
        ...        1.0 + 0.9^4.0*2.0 + 0.9^(4.0+15.0)*3.0,
        ...        2.0 + 0.9^15.0*3.0,
        ...        3.0])
    """
    reversed_x = x[::-1]
    reversed_deltas = deltas[::-1]
    reversed_y = np.empty_like(x)
    # Walk the trajectory back to front, discounting the already accumulated
    # tail by gamma raised to the length of the connecting time step.
    reversed_y[0] = reversed_x[0]
    for i in range(1, x.size):
        reversed_y[i] = (
            reversed_x[i] + gamma ** reversed_deltas[i - 1] * reversed_y[i - 1]
        )
    return reversed_y[::-1]
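

if __name__ == "__main__":
    # Self-contained sanity check (no RLlib components involved): compare the
    # reverse-scan implementation of `generalized_discount_cumsum` against a
    # brute-force double loop, using the example data from its docstring.
    x = np.array([0.0, 1.0, 2.0, 3.0])
    deltas = np.array([1.0, 4.0, 15.0])
    gamma = 0.9

    # Brute force: y[t] = sum_k gamma**(deltas[t] + ... + deltas[k-1]) * x[k].
    expected = np.array(
        [
            sum(gamma ** np.sum(deltas[t:k]) * x[k] for k in range(t, x.size))
            for t in range(x.size)
        ]
    )
    result = generalized_discount_cumsum(x, deltas, gamma)
    print("generalized_discount_cumsum:", result)
    print("brute-force reference:      ", expected)
    assert np.allclose(result, expected)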