[RLlib] Fix ope v_gain (#28136)
parent 3590a86db0
commit 5779ee764d
6 changed files with 100 additions and 89 deletions
@@ -80,8 +80,8 @@ RLlib's OPE estimators output six metrics:
 - ``v_behavior_std``: The standard deviation corresponding to v_behavior.
 - ``v_target``: The OPE's estimated discounted return for the target policy, averaged over episodes in the batch.
 - ``v_target_std``: The standard deviation corresponding to v_target.
-- ``v_gain``: ``v_target / max(v_behavior, 1e-8)``, averaged over episodes in the batch. ``v_gain > 1.0`` indicates that the policy is better than the policy that generated the behavior data.
-- ``v_gain_std``: The standard deviation corresponding to v_gain.
+- ``v_gain``: ``v_target / max(v_behavior, 1e-8)``. ``v_gain > 1.0`` indicates that the policy is better than the policy that generated the behavior data. In case ``v_behavior <= 0``, ``v_delta`` should be used instead for comparison.
+- ``v_delta``: The difference between v_target and v_behavior.

 As an example, we generate an evaluation dataset for off-policy estimation:
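To make the relationship between these metrics concrete, here is a minimal sketch in plain NumPy with made-up per-episode returns; it does not use RLlib's estimator classes. The means and standard deviations come from per-episode values, v_gain is the ratio of the two batch means, and v_delta is the absolute difference that stays usable when v_behavior is zero or negative.

import numpy as np

# Hypothetical per-episode discounted returns; RLlib's estimators collect
# these while looping over batch.split_by_episode().
v_behavior_per_episode = [1.2, 0.8, 1.5]  # returns of the behavior policy
v_target_per_episode = [1.6, 1.1, 1.4]    # estimated returns of the target policy

estimates = {
    "v_behavior": np.mean(v_behavior_per_episode),
    "v_behavior_std": np.std(v_behavior_per_episode),
    "v_target": np.mean(v_target_per_episode),
    "v_target_std": np.std(v_target_per_episode),
}
# One ratio of batch means rather than a mean of per-episode ratios.
estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8)
# The absolute gap; the safer comparison when v_behavior <= 0.
estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"]

if estimates["v_behavior"] > 0:
    target_is_better = estimates["v_gain"] > 1.0
else:
    target_is_better = estimates["v_delta"] > 0.0
print(estimates, target_is_better)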
@@ -170,7 +170,7 @@ We can now train a DQN algorithm offline and evaluate it using OPE:
     batch = reader.next()
     print(estimator.estimate(batch))
     # {'v_behavior': ..., 'v_target': ..., 'v_gain': ...,
-    # 'v_behavior_std': ..., 'v_target_std': ..., 'v_gain_std': ...}
+    # 'v_behavior_std': ..., 'v_target_std': ..., 'v_delta': ...}

 Example: Converting external experiences to batch format
 --------------------------------------------------------

@@ -75,12 +75,12 @@ class DirectMethod(OffPolicyEstimator):
             - v_target: The estimated discounted return for `self.policy`,
               averaged over episodes in the batch
             - v_target_std: The standard deviation corresponding to v_target
-            - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes
-            - v_gain_std: The standard deviation corresponding to v_gain
+            - v_gain: v_target / max(v_behavior, 1e-8)
+            - v_delta: The difference between v_target and v_behavior.
         """
         batch = self.convert_ma_batch_to_sample_batch(batch)
         self.check_action_prob_in_batch(batch)
-        estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
+        estimates_per_epsiode = {"v_behavior": [], "v_target": []}
         # Calculate Direct Method OPE estimates
         for episode in batch.split_by_episode():
             rewards = episode["rewards"]
@@ -93,15 +93,18 @@ class DirectMethod(OffPolicyEstimator):
             v_target = self.model.estimate_v(init_step)
             v_target = convert_to_numpy(v_target).item()

-            estimates["v_behavior"].append(v_behavior)
-            estimates["v_target"].append(v_target)
-            estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
-        estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
-        estimates["v_behavior"] = np.mean(estimates["v_behavior"])
-        estimates["v_target_std"] = np.std(estimates["v_target"])
-        estimates["v_target"] = np.mean(estimates["v_target"])
-        estimates["v_gain_std"] = np.std(estimates["v_gain"])
-        estimates["v_gain"] = np.mean(estimates["v_gain"])
+            estimates_per_epsiode["v_behavior"].append(v_behavior)
+            estimates_per_epsiode["v_target"].append(v_target)
+
+        estimates = {
+            "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]),
+            "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]),
+            "v_target": np.mean(estimates_per_epsiode["v_target"]),
+            "v_target_std": np.std(estimates_per_epsiode["v_target"]),
+        }
+        estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8)
+        estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"]

         return estimates

     @override(OffPolicyEstimator)
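The same rewrite of the aggregation step recurs in the DoublyRobust, ImportanceSampling, and WeightedImportanceSampling hunks that follow. A self-contained sketch with made-up per-episode numbers (plain NumPy, not RLlib code) shows what the change does to the summary: previously v_gain was a mean of per-episode ratios, which a single episode with a near-zero behavior return can dominate; now it is one ratio of batch means, reported alongside the absolute gap v_delta.

import numpy as np

# Hypothetical per-episode returns, chosen to exaggerate the effect.
v_behavior = np.array([0.01, 2.0, 2.0])
v_target = np.array([1.0, 2.0, 2.0])

# Old aggregation: average the per-episode ratios.
v_gain_old = np.mean(v_target / np.maximum(v_behavior, 1e-8))    # ~34.0

# New aggregation: one ratio of the batch means, plus an absolute difference.
v_gain_new = np.mean(v_target) / max(np.mean(v_behavior), 1e-8)  # ~1.25
v_delta = np.mean(v_target) - np.mean(v_behavior)                # ~0.33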
@@ -90,12 +90,12 @@ class DoublyRobust(OffPolicyEstimator):
             - v_target: The estimated discounted return for `self.policy`,
               averaged over episodes in the batch
             - v_target_std: The standard deviation corresponding to v_target
-            - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes
-            - v_gain_std: The standard deviation corresponding to v_gain
+            - v_gain: v_target / max(v_behavior, 1e-8)
+            - v_delta: The difference between v_target and v_behavior.
         """
         batch = self.convert_ma_batch_to_sample_batch(batch)
         self.check_action_prob_in_batch(batch)
-        estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
+        estimates_per_epsiode = {"v_behavior": [], "v_target": []}
         # Calculate doubly robust OPE estimates
         for episode in batch.split_by_episode():
             rewards, old_prob = episode["rewards"], episode["action_prob"]
@@ -119,15 +119,18 @@ class DoublyRobust(OffPolicyEstimator):
             )
             v_target = v_target.item()

-            estimates["v_behavior"].append(v_behavior)
-            estimates["v_target"].append(v_target)
-            estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
-        estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
-        estimates["v_behavior"] = np.mean(estimates["v_behavior"])
-        estimates["v_target_std"] = np.std(estimates["v_target"])
-        estimates["v_target"] = np.mean(estimates["v_target"])
-        estimates["v_gain_std"] = np.std(estimates["v_gain"])
-        estimates["v_gain"] = np.mean(estimates["v_gain"])
+            estimates_per_epsiode["v_behavior"].append(v_behavior)
+            estimates_per_epsiode["v_target"].append(v_target)
+
+        estimates = {
+            "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]),
+            "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]),
+            "v_target": np.mean(estimates_per_epsiode["v_target"]),
+            "v_target_std": np.std(estimates_per_epsiode["v_target"]),
+        }
+        estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8)
+        estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"]

         return estimates

     @override(OffPolicyEstimator)
@@ -37,12 +37,12 @@ class ImportanceSampling(OffPolicyEstimator):
             - v_target: The estimated discounted return for `self.policy`,
               averaged over episodes in the batch
             - v_target_std: The standard deviation corresponding to v_target
-            - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes
-            - v_gain_std: The standard deviation corresponding to v_gain
+            - v_gain: v_target / max(v_behavior, 1e-8)
+            - v_delta: The difference between v_target and v_behavior.
         """
         batch = self.convert_ma_batch_to_sample_batch(batch)
         self.check_action_prob_in_batch(batch)
-        estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
+        estimates_per_epsiode = {"v_behavior": [], "v_target": []}
         for episode in batch.split_by_episode():
             rewards, old_prob = episode["rewards"], episode["action_prob"]
             log_likelihoods = compute_log_likelihoods_from_input_dict(
@@ -66,13 +66,16 @@ class ImportanceSampling(OffPolicyEstimator):
                 v_behavior += rewards[t] * self.gamma ** t
                 v_target += p[t] * rewards[t] * self.gamma ** t

-            estimates["v_behavior"].append(v_behavior)
-            estimates["v_target"].append(v_target)
-            estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
-        estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
-        estimates["v_behavior"] = np.mean(estimates["v_behavior"])
-        estimates["v_target_std"] = np.std(estimates["v_target"])
-        estimates["v_target"] = np.mean(estimates["v_target"])
-        estimates["v_gain_std"] = np.std(estimates["v_gain"])
-        estimates["v_gain"] = np.mean(estimates["v_gain"])
+            estimates_per_epsiode["v_behavior"].append(v_behavior)
+            estimates_per_epsiode["v_target"].append(v_target)
+
+        estimates = {
+            "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]),
+            "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]),
+            "v_target": np.mean(estimates_per_epsiode["v_target"]),
+            "v_target_std": np.std(estimates_per_epsiode["v_target"]),
+        }
+        estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8)
+        estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"]

         return estimates
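For context on the two accumulation lines kept above, here is a hedged, self-contained sketch of one episode's ordinary importance-sampling estimate. The helper name and toy probabilities are invented, and p[t] is assumed to be the running product of new-to-old action-probability ratios, since its construction lies outside this hunk.

import numpy as np

def per_episode_is_estimate(rewards, old_prob, new_prob, gamma=0.99):
    """Sketch of one episode's ordinary importance-sampling estimate.

    Assumes p[t] is the cumulative product of per-step likelihood ratios
    new_prob / old_prob, which is how the loop above uses it.
    """
    p = np.cumprod(np.asarray(new_prob) / np.asarray(old_prob))
    v_behavior, v_target = 0.0, 0.0
    for t, r in enumerate(rewards):
        v_behavior += r * gamma ** t          # plain discounted return
        v_target += p[t] * r * gamma ** t     # importance-weighted return
    return v_behavior, v_target

# Toy episode: 3 steps, target policy slightly more likely to pick the logged actions.
print(per_episode_is_estimate([1.0, 0.0, 1.0], [0.5, 0.5, 0.5], [0.6, 0.55, 0.6]))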
@@ -25,6 +25,15 @@ import ray

 torch, _ = try_import_torch()

+ESTIMATOR_OUTPUTS = {
+    "v_behavior",
+    "v_behavior_std",
+    "v_target",
+    "v_target_std",
+    "v_gain",
+    "v_delta",
+}
+

 class TestOPE(unittest.TestCase):
     """Compilation tests for using OPE both standalone and in an RLlib Algorithm"""
@@ -75,49 +84,38 @@ class TestOPE(unittest.TestCase):
     def tearDownClass(cls):
         ray.shutdown()

-    def test_ope_standalone(self):
-        # Test all OPE methods standalone
-        estimator_outputs = {
-            "v_behavior",
-            "v_behavior_std",
-            "v_target",
-            "v_target_std",
-            "v_gain",
-            "v_gain_std",
-        }
-        estimator = ImportanceSampling(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-        )
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+    def test_is_and_wis_standalone(self):
+        ope_classes = [
+            ImportanceSampling,
+            WeightedImportanceSampling,
+        ]

-        estimator = WeightedImportanceSampling(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-        )
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+        for class_module in ope_classes:
+            estimator = class_module(
+                policy=self.algo.get_policy(),
+                gamma=self.gamma,
+            )
+            estimates = estimator.estimate(self.batch)
+            self.assertEqual(set(estimates.keys()), ESTIMATOR_OUTPUTS)
+            check(estimates["v_gain"], estimates["v_target"] / estimates["v_behavior"])

-        estimator = DirectMethod(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-            q_model_config=self.q_model_config,
-        )
-        losses = estimator.train(self.batch)
-        assert losses, "DM estimator did not return mean loss"
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+    def test_dm_and_dr_standalone(self):
+        ope_classes = [
+            DirectMethod,
+            DoublyRobust,
+        ]

-        estimator = DoublyRobust(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-            q_model_config=self.q_model_config,
-        )
-        losses = estimator.train(self.batch)
-        assert losses, "DM estimator did not return mean loss"
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+        for class_module in ope_classes:
+            estimator = class_module(
+                policy=self.algo.get_policy(),
+                gamma=self.gamma,
+                q_model_config=self.q_model_config,
+            )
+            losses = estimator.train(self.batch)
+            assert losses, f"{class_module.__name__} estimator did not return mean loss"
+            estimates = estimator.estimate(self.batch)
+            self.assertEqual(set(estimates.keys()), ESTIMATOR_OUTPUTS)
+            check(estimates["v_gain"], estimates["v_target"] / estimates["v_behavior"])

     def test_ope_in_algo(self):
         # Test OPE in DQN, during training as well as by calling evaluate()
@@ -49,10 +49,11 @@ class WeightedImportanceSampling(OffPolicyEstimator):
             - v_target_std: The standard deviation corresponding to v_target
             - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes
             - v_gain_std: The standard deviation corresponding to v_gain
+            - v_delta: The difference between v_target and v_behavior.
         """
         batch = self.convert_ma_batch_to_sample_batch(batch)
         self.check_action_prob_in_batch(batch)
-        estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
+        estimates_per_epsiode = {"v_behavior": [], "v_target": []}
         for episode in batch.split_by_episode():
             rewards, old_prob = episode["rewards"], episode["action_prob"]
             log_likelihoods = compute_log_likelihoods_from_input_dict(
@@ -84,13 +84,16 @@ class WeightedImportanceSampling(OffPolicyEstimator):
                 w_t = self.filter_values[t] / self.filter_counts[t]
                 v_target += p[t] / w_t * rewards[t] * self.gamma ** t

-            estimates["v_behavior"].append(v_behavior)
-            estimates["v_target"].append(v_target)
-            estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
-        estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
-        estimates["v_behavior"] = np.mean(estimates["v_behavior"])
-        estimates["v_target_std"] = np.std(estimates["v_target"])
-        estimates["v_target"] = np.mean(estimates["v_target"])
-        estimates["v_gain_std"] = np.std(estimates["v_gain"])
-        estimates["v_gain"] = np.mean(estimates["v_gain"])
+            estimates_per_epsiode["v_behavior"].append(v_behavior)
+            estimates_per_epsiode["v_target"].append(v_target)
+
+        estimates = {
+            "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]),
+            "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]),
+            "v_target": np.mean(estimates_per_epsiode["v_target"]),
+            "v_target_std": np.std(estimates_per_epsiode["v_target"]),
+        }
+        estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8)
+        estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"]

         return estimates
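To close, a hedged sketch of what the weighted variant's normalization is doing in the loop above. The helper name and toy numbers are invented, and the reading of filter_values and filter_counts as a per-timestep running sum and count of importance ratios is an assumption, since their maintenance lies outside this hunk.

import numpy as np

def weighted_is_v_target(rewards, p, filter_values, filter_counts, gamma=0.99):
    """Sketch of the weighted importance-sampling target estimate above.

    Assumes filter_values[t] is a running sum of the cumulative importance
    ratio p[t] over episodes processed so far and filter_counts[t] is the
    number of such episodes, so w_t is the average ratio at step t.
    """
    v_target = 0.0
    for t, r in enumerate(rewards):
        w_t = filter_values[t] / filter_counts[t]   # average importance weight at step t
        v_target += p[t] / w_t * r * gamma ** t     # self-normalized weighted return
    return v_target

# Toy values: two prior episodes already folded into the running filters.
print(weighted_is_v_target(
    rewards=[1.0, 1.0],
    p=[1.2, 1.1],
    filter_values=[2.3, 2.0],
    filter_counts=[2, 2],
))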