diff --git a/doc/source/rllib/rllib-offline.rst b/doc/source/rllib/rllib-offline.rst
index 8c2ea44a5..1e4a676c5 100644
--- a/doc/source/rllib/rllib-offline.rst
+++ b/doc/source/rllib/rllib-offline.rst
@@ -80,8 +80,8 @@ RLlib's OPE estimators output six metrics:
 - ``v_behavior_std``: The standard deviation corresponding to v_behavior.
 - ``v_target``: The OPE's estimated discounted return for the target policy, averaged over episodes in the batch.
 - ``v_target_std``: The standard deviation corresponding to v_target.
-- ``v_gain``: ``v_target / max(v_behavior, 1e-8)``, averaged over episodes in the batch. ``v_gain > 1.0`` indicates that the policy is better than the policy that generated the behavior data.
-- ``v_gain_std``: The standard deviation corresponding to v_gain.
+- ``v_gain``: ``v_target / max(v_behavior, 1e-8)``. ``v_gain > 1.0`` indicates that the target policy is better than the policy that generated the behavior data. If ``v_behavior <= 0``, use ``v_delta`` instead for comparison.
+- ``v_delta``: The difference between v_target and v_behavior.

 As an example, we generate an evaluation dataset for off-policy estimation:

@@ -170,7 +170,7 @@ We can now train a DQN algorithm offline and evaluate it using OPE:
     batch = reader.next()
     print(estimator.estimate(batch))
     # {'v_behavior': ..., 'v_target': ..., 'v_gain': ...,
-    #  'v_behavior_std': ..., 'v_target_std': ..., 'v_gain_std': ...}
+    #  'v_behavior_std': ..., 'v_target_std': ..., 'v_delta': ...}

 Example: Converting external experiences to batch format
 --------------------------------------------------------
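Note: a quick illustration of why the updated docs point to ``v_delta`` when ``v_behavior <= 0``. The numbers below are made up purely to show the two formulas side by side; this snippet is not part of the diff:

    # Hypothetical returns for an environment with negative rewards (Pendulum-like).
    v_behavior = -20.0
    v_target = -10.0   # the target policy actually performs better

    v_gain = v_target / max(v_behavior, 1e-8)   # -10.0 / 1e-8 -> huge negative, not interpretable as a gain
    v_delta = v_target - v_behavior             # 10.0 -> correctly signals improvement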
""" batch = self.convert_ma_batch_to_sample_batch(batch) self.check_action_prob_in_batch(batch) - estimates = {"v_behavior": [], "v_target": [], "v_gain": []} + estimates_per_epsiode = {"v_behavior": [], "v_target": []} # Calculate Direct Method OPE estimates for episode in batch.split_by_episode(): rewards = episode["rewards"] @@ -93,15 +93,18 @@ class DirectMethod(OffPolicyEstimator): v_target = self.model.estimate_v(init_step) v_target = convert_to_numpy(v_target).item() - estimates["v_behavior"].append(v_behavior) - estimates["v_target"].append(v_target) - estimates["v_gain"].append(v_target / max(v_behavior, 1e-8)) - estimates["v_behavior_std"] = np.std(estimates["v_behavior"]) - estimates["v_behavior"] = np.mean(estimates["v_behavior"]) - estimates["v_target_std"] = np.std(estimates["v_target"]) - estimates["v_target"] = np.mean(estimates["v_target"]) - estimates["v_gain_std"] = np.std(estimates["v_gain"]) - estimates["v_gain"] = np.mean(estimates["v_gain"]) + estimates_per_epsiode["v_behavior"].append(v_behavior) + estimates_per_epsiode["v_target"].append(v_target) + + estimates = { + "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]), + "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]), + "v_target": np.mean(estimates_per_epsiode["v_target"]), + "v_target_std": np.std(estimates_per_epsiode["v_target"]), + } + estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8) + estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"] + return estimates @override(OffPolicyEstimator) diff --git a/rllib/offline/estimators/doubly_robust.py b/rllib/offline/estimators/doubly_robust.py index 7ee4a052b..8f1b90c7d 100644 --- a/rllib/offline/estimators/doubly_robust.py +++ b/rllib/offline/estimators/doubly_robust.py @@ -90,12 +90,12 @@ class DoublyRobust(OffPolicyEstimator): - v_target: The estimated discounted return for `self.policy`, averaged over episodes in the batch - v_target_std: The standard deviation corresponding to v_target - - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes - - v_gain_std: The standard deviation corresponding to v_gain + - v_gain: v_target / max(v_behavior, 1e-8)' + - v_delta: The difference between v_target and v_behavior. 
""" batch = self.convert_ma_batch_to_sample_batch(batch) self.check_action_prob_in_batch(batch) - estimates = {"v_behavior": [], "v_target": [], "v_gain": []} + estimates_per_epsiode = {"v_behavior": [], "v_target": []} # Calculate doubly robust OPE estimates for episode in batch.split_by_episode(): rewards, old_prob = episode["rewards"], episode["action_prob"] @@ -119,15 +119,18 @@ class DoublyRobust(OffPolicyEstimator): ) v_target = v_target.item() - estimates["v_behavior"].append(v_behavior) - estimates["v_target"].append(v_target) - estimates["v_gain"].append(v_target / max(v_behavior, 1e-8)) - estimates["v_behavior_std"] = np.std(estimates["v_behavior"]) - estimates["v_behavior"] = np.mean(estimates["v_behavior"]) - estimates["v_target_std"] = np.std(estimates["v_target"]) - estimates["v_target"] = np.mean(estimates["v_target"]) - estimates["v_gain_std"] = np.std(estimates["v_gain"]) - estimates["v_gain"] = np.mean(estimates["v_gain"]) + estimates_per_epsiode["v_behavior"].append(v_behavior) + estimates_per_epsiode["v_target"].append(v_target) + + estimates = { + "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]), + "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]), + "v_target": np.mean(estimates_per_epsiode["v_target"]), + "v_target_std": np.std(estimates_per_epsiode["v_target"]), + } + estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8) + estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"] + return estimates @override(OffPolicyEstimator) diff --git a/rllib/offline/estimators/importance_sampling.py b/rllib/offline/estimators/importance_sampling.py index 9a0d40d4a..256ca8102 100644 --- a/rllib/offline/estimators/importance_sampling.py +++ b/rllib/offline/estimators/importance_sampling.py @@ -37,12 +37,12 @@ class ImportanceSampling(OffPolicyEstimator): - v_target: The estimated discounted return for `self.policy`, averaged over episodes in the batch - v_target_std: The standard deviation corresponding to v_target - - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes - - v_gain_std: The standard deviation corresponding to v_gain + - v_gain: v_target / max(v_behavior, 1e-8) + - v_delta: The difference between v_target and v_behavior. 
""" batch = self.convert_ma_batch_to_sample_batch(batch) self.check_action_prob_in_batch(batch) - estimates = {"v_behavior": [], "v_target": [], "v_gain": []} + estimates_per_epsiode = {"v_behavior": [], "v_target": []} for episode in batch.split_by_episode(): rewards, old_prob = episode["rewards"], episode["action_prob"] log_likelihoods = compute_log_likelihoods_from_input_dict( @@ -66,13 +66,16 @@ class ImportanceSampling(OffPolicyEstimator): v_behavior += rewards[t] * self.gamma ** t v_target += p[t] * rewards[t] * self.gamma ** t - estimates["v_behavior"].append(v_behavior) - estimates["v_target"].append(v_target) - estimates["v_gain"].append(v_target / max(v_behavior, 1e-8)) - estimates["v_behavior_std"] = np.std(estimates["v_behavior"]) - estimates["v_behavior"] = np.mean(estimates["v_behavior"]) - estimates["v_target_std"] = np.std(estimates["v_target"]) - estimates["v_target"] = np.mean(estimates["v_target"]) - estimates["v_gain_std"] = np.std(estimates["v_gain"]) - estimates["v_gain"] = np.mean(estimates["v_gain"]) + estimates_per_epsiode["v_behavior"].append(v_behavior) + estimates_per_epsiode["v_target"].append(v_target) + + estimates = { + "v_behavior": np.mean(estimates_per_epsiode["v_behavior"]), + "v_behavior_std": np.std(estimates_per_epsiode["v_behavior"]), + "v_target": np.mean(estimates_per_epsiode["v_target"]), + "v_target_std": np.std(estimates_per_epsiode["v_target"]), + } + estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8) + estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"] + return estimates diff --git a/rllib/offline/estimators/tests/test_ope.py b/rllib/offline/estimators/tests/test_ope.py index 0535df5db..53c43b27e 100644 --- a/rllib/offline/estimators/tests/test_ope.py +++ b/rllib/offline/estimators/tests/test_ope.py @@ -25,6 +25,15 @@ import ray torch, _ = try_import_torch() +ESTIMATOR_OUTPUTS = { + "v_behavior", + "v_behavior_std", + "v_target", + "v_target_std", + "v_gain", + "v_delta", +} + class TestOPE(unittest.TestCase): """Compilation tests for using OPE both standalone and in an RLlib Algorithm""" @@ -75,49 +84,38 @@ class TestOPE(unittest.TestCase): def tearDownClass(cls): ray.shutdown() - def test_ope_standalone(self): - # Test all OPE methods standalone - estimator_outputs = { - "v_behavior", - "v_behavior_std", - "v_target", - "v_target_std", - "v_gain", - "v_gain_std", - } - estimator = ImportanceSampling( - policy=self.algo.get_policy(), - gamma=self.gamma, - ) - estimates = estimator.estimate(self.batch) - self.assertEqual(estimates.keys(), estimator_outputs) + def test_is_and_wis_standalone(self): + ope_classes = [ + ImportanceSampling, + WeightedImportanceSampling, + ] - estimator = WeightedImportanceSampling( - policy=self.algo.get_policy(), - gamma=self.gamma, - ) - estimates = estimator.estimate(self.batch) - self.assertEqual(estimates.keys(), estimator_outputs) + for class_module in ope_classes: + estimator = class_module( + policy=self.algo.get_policy(), + gamma=self.gamma, + ) + estimates = estimator.estimate(self.batch) + self.assertEqual(set(estimates.keys()), ESTIMATOR_OUTPUTS) + check(estimates["v_gain"], estimates["v_target"] / estimates["v_behavior"]) - estimator = DirectMethod( - policy=self.algo.get_policy(), - gamma=self.gamma, - q_model_config=self.q_model_config, - ) - losses = estimator.train(self.batch) - assert losses, "DM estimator did not return mean loss" - estimates = estimator.estimate(self.batch) - self.assertEqual(estimates.keys(), estimator_outputs) + def 
diff --git a/rllib/offline/estimators/tests/test_ope.py b/rllib/offline/estimators/tests/test_ope.py
index 0535df5db..53c43b27e 100644
--- a/rllib/offline/estimators/tests/test_ope.py
+++ b/rllib/offline/estimators/tests/test_ope.py
@@ -25,6 +25,15 @@ import ray

 torch, _ = try_import_torch()

+ESTIMATOR_OUTPUTS = {
+    "v_behavior",
+    "v_behavior_std",
+    "v_target",
+    "v_target_std",
+    "v_gain",
+    "v_delta",
+}
+

 class TestOPE(unittest.TestCase):
     """Compilation tests for using OPE both standalone and in an RLlib Algorithm"""
@@ -75,49 +84,38 @@
     def tearDownClass(cls):
         ray.shutdown()

-    def test_ope_standalone(self):
-        # Test all OPE methods standalone
-        estimator_outputs = {
-            "v_behavior",
-            "v_behavior_std",
-            "v_target",
-            "v_target_std",
-            "v_gain",
-            "v_gain_std",
-        }
-        estimator = ImportanceSampling(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-        )
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+    def test_is_and_wis_standalone(self):
+        ope_classes = [
+            ImportanceSampling,
+            WeightedImportanceSampling,
+        ]
-        estimator = WeightedImportanceSampling(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-        )
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+        for class_module in ope_classes:
+            estimator = class_module(
+                policy=self.algo.get_policy(),
+                gamma=self.gamma,
+            )
+            estimates = estimator.estimate(self.batch)
+            self.assertEqual(set(estimates.keys()), ESTIMATOR_OUTPUTS)
+            check(estimates["v_gain"], estimates["v_target"] / estimates["v_behavior"])
-        estimator = DirectMethod(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-            q_model_config=self.q_model_config,
-        )
-        losses = estimator.train(self.batch)
-        assert losses, "DM estimator did not return mean loss"
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+    def test_dm_and_dr_standalone(self):
+        ope_classes = [
+            DirectMethod,
+            DoublyRobust,
+        ]
-        estimator = DoublyRobust(
-            policy=self.algo.get_policy(),
-            gamma=self.gamma,
-            q_model_config=self.q_model_config,
-        )
-        losses = estimator.train(self.batch)
-        assert losses, "DM estimator did not return mean loss"
-        estimates = estimator.estimate(self.batch)
-        self.assertEqual(estimates.keys(), estimator_outputs)
+        for class_module in ope_classes:
+            estimator = class_module(
+                policy=self.algo.get_policy(),
+                gamma=self.gamma,
+                q_model_config=self.q_model_config,
+            )
+            losses = estimator.train(self.batch)
+            assert losses, f"{class_module.__name__} estimator did not return mean loss"
+            estimates = estimator.estimate(self.batch)
+            self.assertEqual(set(estimates.keys()), ESTIMATOR_OUTPUTS)
+            check(estimates["v_gain"], estimates["v_target"] / estimates["v_behavior"])

     def test_ope_in_algo(self):
         # Test OPE in DQN, during training as well as by calling evaluate()
diff --git a/rllib/offline/estimators/weighted_importance_sampling.py b/rllib/offline/estimators/weighted_importance_sampling.py
index 1354d1301..6b12cc718 100644
--- a/rllib/offline/estimators/weighted_importance_sampling.py
+++ b/rllib/offline/estimators/weighted_importance_sampling.py
@@ -49,10 +49,10 @@ class WeightedImportanceSampling(OffPolicyEstimator):
             - v_target_std: The standard deviation corresponding to v_target
-            - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes
-            - v_gain_std: The standard deviation corresponding to v_gain
+            - v_gain: v_target / max(v_behavior, 1e-8)
+            - v_delta: The difference between v_target and v_behavior.
         """
         batch = self.convert_ma_batch_to_sample_batch(batch)
         self.check_action_prob_in_batch(batch)
-        estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
+        estimates_per_episode = {"v_behavior": [], "v_target": []}
         for episode in batch.split_by_episode():
             rewards, old_prob = episode["rewards"], episode["action_prob"]
             log_likelihoods = compute_log_likelihoods_from_input_dict(
@@ -84,13 +84,16 @@ class WeightedImportanceSampling(OffPolicyEstimator):
                 w_t = self.filter_values[t] / self.filter_counts[t]
                 v_target += p[t] / w_t * rewards[t] * self.gamma ** t

-            estimates["v_behavior"].append(v_behavior)
-            estimates["v_target"].append(v_target)
-            estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
-        estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
-        estimates["v_behavior"] = np.mean(estimates["v_behavior"])
-        estimates["v_target_std"] = np.std(estimates["v_target"])
-        estimates["v_target"] = np.mean(estimates["v_target"])
-        estimates["v_gain_std"] = np.std(estimates["v_gain"])
-        estimates["v_gain"] = np.mean(estimates["v_gain"])
+            estimates_per_episode["v_behavior"].append(v_behavior)
+            estimates_per_episode["v_target"].append(v_target)
+
+        estimates = {
+            "v_behavior": np.mean(estimates_per_episode["v_behavior"]),
+            "v_behavior_std": np.std(estimates_per_episode["v_behavior"]),
+            "v_target": np.mean(estimates_per_episode["v_target"]),
+            "v_target_std": np.std(estimates_per_episode["v_target"]),
+        }
+        estimates["v_gain"] = estimates["v_target"] / max(estimates["v_behavior"], 1e-8)
+        estimates["v_delta"] = estimates["v_target"] - estimates["v_behavior"]
+
         return estimates
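Downstream, the new keys are read the same way the updated docs and tests do. A rough usage sketch; ``policy``, ``batch``, and ``gamma`` are placeholders for an already-built policy, an evaluation SampleBatch, and the discount factor:

    from ray.rllib.offline.estimators import ImportanceSampling

    estimator = ImportanceSampling(policy=policy, gamma=gamma)
    estimates = estimator.estimate(batch)

    # v_gain_std is gone; v_delta is reported instead.
    print(estimates["v_gain"])   # v_target / max(v_behavior, 1e-8)
    print(estimates["v_delta"])  # v_target - v_behavior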