diff --git a/doc/source/rllib/rllib-offline.rst b/doc/source/rllib/rllib-offline.rst
index 8c2ea44a5..1e4a676c5 100644
--- a/doc/source/rllib/rllib-offline.rst
+++ b/doc/source/rllib/rllib-offline.rst
@@ -80,8 +80,8 @@ RLlib's OPE estimators output six metrics:
 - ``v_behavior_std``: The standard deviation corresponding to v_behavior.
 - ``v_target``: The OPE's estimated discounted return for the target policy, averaged over episodes in the batch.
 - ``v_target_std``: The standard deviation corresponding to v_target.
-- ``v_gain``: ``v_target / max(v_behavior, 1e-8)``, averaged over episodes in the batch. ``v_gain > 1.0`` indicates that the policy is better than the policy that generated the behavior data.
-- ``v_gain_std``: The standard deviation corresponding to v_gain.
+- ``v_gain``: ``v_target / max(v_behavior, 1e-8)``. ``v_gain > 1.0`` indicates that the target policy is better than the policy that generated the behavior data. If ``v_behavior <= 0``, use ``v_delta`` instead for comparison.
+- ``v_delta``: The difference between v_target and v_behavior.

 As an example, we generate an evaluation dataset for off-policy estimation:

@@ -170,7 +170,7 @@ We can now train a DQN algorithm offline and evaluate it using OPE:
     batch = reader.next()
     print(estimator.estimate(batch))
     # {'v_behavior': ..., 'v_target': ..., 'v_gain': ...,
-    #  'v_behavior_std': ..., 'v_target_std': ..., 'v_gain_std': ...}
+    #  'v_behavior_std': ..., 'v_target_std': ..., 'v_delta': ...}

 Example: Converting external experiences to batch format
 --------------------------------------------------------
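Note: a quick illustration of why the updated docs point to ``v_delta`` when ``v_behavior <= 0``. The numbers below are made up purely to show the two formulas side by side; this snippet is not part of the diff:

    # Hypothetical returns for an environment with negative rewards (Pendulum-like).
    v_behavior = -20.0
    v_target = -10.0   # the target policy actually performs better

    v_gain = v_target / max(v_behavior, 1e-8)   # -10.0 / 1e-8 -> huge negative, not interpretable as a gain
    v_delta = v_target - v_behavior             # 10.0 -> correctly signals improvement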
""" batch = self.convert_ma_batch_to_sample_batch(batch) self.check_action_prob_in_batch(batch) - estimates = {"v_behavior": [], "v_target": [], "v_gain": []} + estimates_per_epsiode = {"v_behavior": [], "v_target": []} # Calculate Direct Method OPE estimates for episode in batch.split_by_episode(): rewards = episode["rewards"] @@ -93,15 +93,18 @@ class DirectMethod(OffPolicyEstimator): v_target = self.model.estimate_v(init_step) v_target = convert_to_numpy(v_target).item() - estimates["v_behavior"].append(v_behavior) - estimates["v_target"].append(v_target) - estimates["v_gain"].append(v_target / max(v_behavior, 1e-8)) - estimates["v_behavior_std"] = np.std(estimates["v_behavior"]) - estimates["v_behavior"] = np.mean(estimates["v_behavior"]) - estimates["v_target_std"] = np.std(estimates["v_target"]) - estimates["v_target"] = np.mean(estimates["v_target"]) - estimates["v_gain_std"] = np.std(estimates["v_gain"]) - estimates["v_gain"] = np.mean(estimates["v_gain"]) + estimates_per_epsiode["v_behavior"].append(v_behavior) + estimates_per_epsiode["v_target"].append(v_target) + + estimates = { + "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]), + "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]), + "v_target": np.mean(estimates_per_epsiode["v_target"]), + "v_target_std": np.std(estimates_per_epsiode["v_target"]), + } + estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8) + estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"] + return estimates @override(OffPolicyEstimator) diff --git a/rllib/offline/estimators/doubly_robust.py b/rllib/offline/estimators/doubly_robust.py index 7ee4a052b..8f1b90c7d 100644 --- a/rllib/offline/estimators/doubly_robust.py +++ b/rllib/offline/estimators/doubly_robust.py @@ -90,12 +90,12 @@ class DoublyRobust(OffPolicyEstimator): - v_target: The estimated discounted return for `self.policy`, averaged over episodes in the batch - v_target_std: The standard deviation corresponding to v_target - - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes - - v_gain_std: The standard deviation corresponding to v_gain + - v_gain: v_target / max(v_behavior, 1e-8)' + - v_delta: The difference between v_target and v_behavior. 
""" batch = self.convert_ma_batch_to_sample_batch(batch) self.check_action_prob_in_batch(batch) - estimates = {"v_behavior": [], "v_target": [], "v_gain": []} + estimates_per_epsiode = {"v_behavior": [], "v_target": []} # Calculate doubly robust OPE estimates for episode in batch.split_by_episode(): rewards, old_prob = episode["rewards"], episode["action_prob"] @@ -119,15 +119,18 @@ class DoublyRobust(OffPolicyEstimator): ) v_target = v_target.item() - estimates["v_behavior"].append(v_behavior) - estimates["v_target"].append(v_target) - estimates["v_gain"].append(v_target / max(v_behavior, 1e-8)) - estimates["v_behavior_std"] = np.std(estimates["v_behavior"]) - estimates["v_behavior"] = np.mean(estimates["v_behavior"]) - estimates["v_target_std"] = np.std(estimates["v_target"]) - estimates["v_target"] = np.mean(estimates["v_target"]) - estimates["v_gain_std"] = np.std(estimates["v_gain"]) - estimates["v_gain"] = np.mean(estimates["v_gain"]) + estimates_per_epsiode["v_behavior"].append(v_behavior) + estimates_per_epsiode["v_target"].append(v_target) + + estimates = { + "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]), + "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]), + "v_target": np.mean(estimates_per_epsiode["v_target"]), + "v_target_std": np.std(estimates_per_epsiode["v_target"]), + } + estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8) + estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"] + return estimates @override(OffPolicyEstimator) diff --git a/rllib/offline/estimators/importance_sampling.py b/rllib/offline/estimators/importance_sampling.py index 9a0d40d4a..256ca8102 100644 --- a/rllib/offline/estimators/importance_sampling.py +++ b/rllib/offline/estimators/importance_sampling.py @@ -37,12 +37,12 @@ class ImportanceSampling(OffPolicyEstimator): - v_target: The estimated discounted return for `self.policy`, averaged over episodes in the batch - v_target_std: The standard deviation corresponding to v_target - - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes - - v_gain_std: The standard deviation corresponding to v_gain + - v_gain: v_target / max(v_behavior, 1e-8) + - v_delta: The difference between v_target and v_behavior. 
""" batch = self.convert_ma_batch_to_sample_batch(batch) self.check_action_prob_in_batch(batch) - estimates = {"v_behavior": [], "v_target": [], "v_gain": []} + estimates_per_epsiode = {"v_behavior": [], "v_target": []} for episode in batch.split_by_episode(): rewards, old_prob = episode["rewards"], episode["action_prob"] log_likelihoods = compute_log_likelihoods_from_input_dict( @@ -66,13 +66,16 @@ class ImportanceSampling(OffPolicyEstimator): v_behavior += rewards[t] * self.gamma ** t v_target += p[t] * rewards[t] * self.gamma ** t - estimates["v_behavior"].append(v_behavior) - estimates["v_target"].append(v_target) - estimates["v_gain"].append(v_target / max(v_behavior, 1e-8)) - estimates["v_behavior_std"] = np.std(estimates["v_behavior"]) - estimates["v_behavior"] = np.mean(estimates["v_behavior"]) - estimates["v_target_std"] = np.std(estimates["v_target"]) - estimates["v_target"] = np.mean(estimates["v_target"]) - estimates["v_gain_std"] = np.std(estimates["v_gain"]) - estimates["v_gain"] = np.mean(estimates["v_gain"]) + estimates_per_epsiode["v_behavior"].append(v_behavior) + estimates_per_epsiode["v_target"].append(v_target) + + estimates = { + "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]), + "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]), + "v_target": np.mean(estimates_per_epsiode["v_target"]), + "v_target_std": np.std(estimates_per_epsiode["v_target"]), + } + estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8) + estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"] + return estimates diff --git a/rllib/offline/estimators/tests/test_ope.py b/rllib/offline/estimators/tests/test_ope.py index 0535df5db..53c43b27e 100644 --- a/rllib/offline/estimators/tests/test_ope.py +++ b/rllib/offline/estimators/tests/test_ope.py @@ -25,6 +25,15 @@ import ray torch, _ = try_import_torch() +ESTIMATOR_OUTPUTS = { + "v_behavior", + "v_behavior_std", + "v_target", + "v_target_std", + "v_gain", + "v_delta", +} + class TestOPE(unittest.TestCase): """Compilation tests for using OPE both standalone and in an RLlib Algorithm""" @@ -75,49 +84,38 @@ class TestOPE(unittest.TestCase): def tearDownClass(cls): ray.shutdown() - def test_ope_standalone(self): - # Test all OPE methods standalone - estimator_outputs = { - "v_behavior", - "v_behavior_std", - "v_target", - "v_target_std", - "v_gain", - "v_gain_std", - } - estimator = ImportanceSampling( - policy=self.algo.get_policy(), - gamma=self.gamma, - ) - estimates = estimator.estimate(self.batch) - self.assertEqual(estimates.keys(), estimator_outputs) + def test_is_and_wis_standalone(self): + ope_classes = [ + ImportanceSampling, + WeightedImportanceSampling, + ] - estimator = WeightedImportanceSampling( - policy=self.algo.get_policy(), - gamma=self.gamma, - ) - estimates = estimator.estimate(self.batch) - self.assertEqual(estimates.keys(), estimator_outputs) + for class_module in ope_classes: + estimator = class_module( + policy=self.algo.get_policy(), + gamma=self.gamma, + ) + estimates = estimator.estimate(self.batch) + self.assertEqual(set(estimates.keys()), ESTIMATOR_OUTPUTS) + check(estimates["v_gain"], estimates["v_target"] / estimates["v_behavior"]) - estimator = DirectMethod( - policy=self.algo.get_policy(), - gamma=self.gamma, - q_model_config=self.q_model_config, - ) - losses = estimator.train(self.batch) - assert losses, "DM estimator did not return mean loss" - estimates = estimator.estimate(self.batch) - self.assertEqual(estimates.keys(), estimator_outputs) + def 
diff --git a/rllib/offline/estimators/tests/test_ope.py b/rllib/offline/estimators/tests/test_ope.py
index 0535df5db..53c43b27e 100644
--- a/rllib/offline/estimators/tests/test_ope.py
+++ b/rllib/offline/estimators/tests/test_ope.py
@@ -25,6 +25,15 @@ import ray

 torch, _ = try_import_torch()

+ESTIMATOR_OUTPUTS = {
+    "v_behavior",
+    "v_behavior_std",
+    "v_target",
+    "v_target_std",
+    "v_gain",
+    "v_delta",
+}
+

 class TestOPE(unittest.TestCase):
     """Compilation tests for using OPE both standalone and in an RLlib Algorithm"""
@@ -75,49 +84,38 @@
     def tearDownClass(cls):
         ray.shutdown()

-    def test_ope_standalone(self):
-        # Test all OPE methods standalone
-        estimator_outputs = {
-            "v_behavior",
-            "v_behavior_std",
-            "v_target",
-            "v_target_std",
-            "v_gain",
-            "v_gain_std",
-        }
-        estimator = ImportanceSampling(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-        )
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+    def test_is_and_wis_standalone(self):
+        ope_classes = [
+            ImportanceSampling,
+            WeightedImportanceSampling,
+        ]
-        estimator = WeightedImportanceSampling(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-        )
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+        for class_module in ope_classes:
+            estimator = class_module(
+                policy=self.algo.get_policy(),
+                gamma=self.gamma,
+            )
+            estimates = estimator.estimate(self.batch)
+            self.assertEqual(set(estimates.keys()), ESTIMATOR_OUTPUTS)
+            check(estimates["v_gain"], estimates["v_target"] / estimates["v_behavior"])
-        estimator = DirectMethod(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-            q_model_config=self.q_model_config,
-        )
-        losses = estimator.train(self.batch)
-        assert losses, "DM estimator did not return mean loss"
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+    def test_dm_and_dr_standalone(self):
+        ope_classes = [
+            DirectMethod,
+            DoublyRobust,
+        ]
-        estimator = DoublyRobust(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-            q_model_config=self.q_model_config,
-        )
-        losses = estimator.train(self.batch)
-        assert losses, "DM estimator did not return mean loss"
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+        for class_module in ope_classes:
+            estimator = class_module(
+                policy=self.algo.get_policy(),
+                gamma=self.gamma,
+                q_model_config=self.q_model_config,
+            )
+            losses = estimator.train(self.batch)
+            assert losses, f"{class_module.__name__} estimator did not return mean loss"
+            estimates = estimator.estimate(self.batch)
+            self.assertEqual(set(estimates.keys()), ESTIMATOR_OUTPUTS)
+            check(estimates["v_gain"], estimates["v_target"] / estimates["v_behavior"])

     def test_ope_in_algo(self):
         # Test OPE in DQN, during training as well as by calling evaluate()
diff --git a/rllib/offline/estimators/weighted_importance_sampling.py b/rllib/offline/estimators/weighted_importance_sampling.py
index 1354d1301..6b12cc718 100644
--- a/rllib/offline/estimators/weighted_importance_sampling.py
+++ b/rllib/offline/estimators/weighted_importance_sampling.py
@@ -49,10 +49,10 @@ class WeightedImportanceSampling(OffPolicyEstimator):
             - v_target_std: The standard deviation corresponding to v_target
-            - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes
-            - v_gain_std: The standard deviation corresponding to v_gain
+            - v_gain: v_target / max(v_behavior, 1e-8)
+            - v_delta: The difference between v_target and v_behavior.
         """
         batch = self.convert_ma_batch_to_sample_batch(batch)
         self.check_action_prob_in_batch(batch)
-        estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
+        estimates_per_episode = {"v_behavior": [], "v_target": []}
         for episode in batch.split_by_episode():
             rewards, old_prob = episode["rewards"], episode["action_prob"]
             log_likelihoods = compute_log_likelihoods_from_input_dict(
@@ -84,13 +84,16 @@ class WeightedImportanceSampling(OffPolicyEstimator):
                 w_t = self.filter_values[t] / self.filter_counts[t]
                 v_target += p[t] / w_t * rewards[t] * self.gamma ** t

-            estimates["v_behavior"].append(v_behavior)
-            estimates["v_target"].append(v_target)
-            estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
-        estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
-        estimates["v_behavior"] = np.mean(estimates["v_behavior"])
-        estimates["v_target_std"] = np.std(estimates["v_target"])
-        estimates["v_target"] = np.mean(estimates["v_target"])
-        estimates["v_gain_std"] = np.std(estimates["v_gain"])
-        estimates["v_gain"] = np.mean(estimates["v_gain"])
+            estimates_per_episode["v_behavior"].append(v_behavior)
+            estimates_per_episode["v_target"].append(v_target)
+
+        estimates = {
+            "v_behavior": np.mean(estimates_per_episode["v_behavior"]),
+            "v_behavior_std": np.std(estimates_per_episode["v_behavior"]),
+            "v_target": np.mean(estimates_per_episode["v_target"]),
+            "v_target_std": np.std(estimates_per_episode["v_target"]),
+        }
+        estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8)
+        estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"]
+
         return estimates
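Downstream, the new keys are read the same way the updated docs and tests do. A rough usage sketch; ``policy``, ``batch``, and ``gamma`` are placeholders for an already-built policy, an evaluation SampleBatch, and the discount factor:

    from ray.rllib.offline.estimators import ImportanceSampling

    estimator = ImportanceSampling(policy=policy, gamma=gamma)
    estimates = estimator.estimate(batch)

    # v_gain_std is gone; v_delta is reported instead.
    print(estimates["v_gain"])   # v_target / max(v_behavior, 1e-8)
    print(estimates["v_delta"])  # v_target - v_behavior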