From 14f2729b3a12711fabb2daeaa06176992870c154 Mon Sep 17 00:00:00 2001
From: Clark Zinzow <clarkzinzow@gmail.com>
Date: Thu, 28 Apr 2022 19:40:03 -0700
Subject: [PATCH] [Datasets] Rename _experimental_lazy() -->
 experimental_lazy(). (#24321)

The experimental_ prefix should suffice, we shouldn't also need to make it a private method.
---
 python/ray/data/dataset.py             | 15 +++++++++++----
 python/ray/data/tests/test_dataset.py  |  2 +-
 python/ray/data/tests/test_optimize.py | 12 ++++++------
 python/ray/tune/impl/test_utils.py     |  8 ++++----
 4 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py
index d3f1dbdf7..76d9596d9 100644
--- a/python/ray/data/dataset.py
+++ b/python/ray/data/dataset.py
@@ -2884,10 +2884,17 @@ List[str]]]): The names of the columns to use as the features. Can be a list of
         """
         return self._plan.execute().get_blocks()
 
-    def _experimental_lazy(self) -> "Dataset[T]":
-        """Enable lazy evaluation (experimental)."""
-        self._lazy = True
-        return self
+    def experimental_lazy(self) -> "Dataset[T]":
+        """EXPERIMENTAL: Enable lazy evaluation.
+
+        The returned dataset is a lazy dataset, where all subsequent operations on the
+        dataset won't be executed until the dataset is consumed (e.g. ``.take()``,
+        ``.iter_batches()``, ``.to_torch()``, ``.to_tf()``, etc.) or execution is
+        manually triggered via ``.fully_executed()``.
+        """
+        ds = Dataset(self._plan, self._epoch, lazy=True)
+        ds._set_uuid(self._get_uuid())
+        return ds
 
     def has_serializable_lineage(self) -> bool:
         """Whether this dataset's lineage is able to be serialized for storage and
diff --git a/python/ray/data/tests/test_dataset.py b/python/ray/data/tests/test_dataset.py
index 741719f6e..e7579a56d 100644
--- a/python/ray/data/tests/test_dataset.py
+++ b/python/ray/data/tests/test_dataset.py
@@ -41,7 +41,7 @@ def maybe_pipeline(ds, enabled):
 
 def maybe_lazy(ds, enabled):
     if enabled:
-        return ds._experimental_lazy()
+        return ds.experimental_lazy()
     else:
         return ds
 
diff --git a/python/ray/data/tests/test_optimize.py b/python/ray/data/tests/test_optimize.py
index cda281a40..b233f9ece 100644
--- a/python/ray/data/tests/test_optimize.py
+++ b/python/ray/data/tests/test_optimize.py
@@ -50,7 +50,7 @@ def test_memory_release_lazy(shutdown_only):
     ds = ray.data.range(10)
 
     # Should get fused into single stage.
-    ds = ds._experimental_lazy()
+    ds = ds.experimental_lazy()
     ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
     ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
     ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
@@ -69,7 +69,7 @@ def test_memory_release_lazy_shuffle(shutdown_only):
             ds = ray.data.range(10)
 
             # Should get fused into single stage.
-            ds = ds._experimental_lazy()
+            ds = ds.experimental_lazy()
             ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
             ds.random_shuffle().fully_executed()
             meminfo = memory_summary(info.address_info["address"], stats_only=True)
@@ -84,7 +84,7 @@ def test_memory_release_lazy_shuffle(shutdown_only):
 
 
 def test_spread_hint_inherit(ray_start_regular_shared):
-    ds = ray.data.range(10)._experimental_lazy()
+    ds = ray.data.range(10).experimental_lazy()
     ds = ds.map(lambda x: x + 1)
     ds = ds.random_shuffle()
     for s in ds._plan._stages_before_snapshot:
@@ -97,7 +97,7 @@ def test_spread_hint_inherit(ray_start_regular_shared):
 
 
 def test_execution_preserves_original(ray_start_regular_shared):
-    ds = ray.data.range(10).map(lambda x: x + 1)._experimental_lazy()
+    ds = ray.data.range(10).map(lambda x: x + 1).experimental_lazy()
     ds1 = ds.map(lambda x: x + 1)
     assert ds1._plan._snapshot_blocks is not None
     assert len(ds1._plan._stages_after_snapshot) == 1
@@ -136,7 +136,7 @@ def test_stage_linking(ray_start_regular_shared):
     _assert_has_stages(ds._plan._last_optimized_stages, ["read->map"])
 
     # Test lazy dataset.
-    ds = ray.data.range(10)._experimental_lazy()
+    ds = ray.data.range(10).experimental_lazy()
     assert len(ds._plan._stages_before_snapshot) == 0
     assert len(ds._plan._stages_after_snapshot) == 0
     assert len(ds._plan._last_optimized_stages) == 0
@@ -328,7 +328,7 @@ def test_optimize_lazy_reuse_base_data(
     ds = ray.data.read_datasource(source, parallelism=4, paths=paths)
     num_reads = ray.get(counter.get.remote())
     assert num_reads == 1, num_reads
-    ds = ds._experimental_lazy()
+    ds = ds.experimental_lazy()
     ds = ds.map(lambda x: x)
     if with_shuffle:
         ds = ds.random_shuffle()
diff --git a/python/ray/tune/impl/test_utils.py b/python/ray/tune/impl/test_utils.py
index 0cd9b85c6..62dbfb28f 100644
--- a/python/ray/tune/impl/test_utils.py
+++ b/python/ray/tune/impl/test_utils.py
@@ -33,8 +33,8 @@ def gen_dataset_func() -> Dataset:
 
 
 def test_grid_search():
-    ds1 = gen_dataset_func()._experimental_lazy().map(lambda x: x)
-    ds2 = gen_dataset_func()._experimental_lazy().map(lambda x: x)
+    ds1 = gen_dataset_func().experimental_lazy().map(lambda x: x)
+    ds2 = gen_dataset_func().experimental_lazy().map(lambda x: x)
     assert not ds1._plan._has_final_stage_snapshot()
     assert not ds2._plan._has_final_stage_snapshot()
     param_space = {"train_dataset": tune.grid_search([ds1, ds2])}
@@ -46,8 +46,8 @@ def test_grid_search():
 
 
 def test_choice():
-    ds1 = gen_dataset_func()._experimental_lazy().map(lambda x: x)
-    ds2 = gen_dataset_func()._experimental_lazy().map(lambda x: x)
+    ds1 = gen_dataset_func().experimental_lazy().map(lambda x: x)
+    ds2 = gen_dataset_func().experimental_lazy().map(lambda x: x)
     assert not ds1._plan._has_final_stage_snapshot()
     assert not ds2._plan._has_final_stage_snapshot()
     param_space = {"train_dataset": tune.choice([ds1, ds2])}