From 14f2729b3a12711fabb2daeaa06176992870c154 Mon Sep 17 00:00:00 2001 From: Clark Zinzow Date: Thu, 28 Apr 2022 19:40:03 -0700 Subject: [PATCH] [Datasets] Rename _experimental_lazy() --> experimental_lazy(). (#24321) The experimental_ prefix should suffice, we shouldn't also need to make it a private method. --- python/ray/data/dataset.py | 15 +++++++++++---- python/ray/data/tests/test_dataset.py | 2 +- python/ray/data/tests/test_optimize.py | 12 ++++++------ python/ray/tune/impl/test_utils.py | 8 ++++---- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py index d3f1dbdf7..76d9596d9 100644 --- a/python/ray/data/dataset.py +++ b/python/ray/data/dataset.py @@ -2884,10 +2884,17 @@ List[str]]]): The names of the columns to use as the features. Can be a list of """ return self._plan.execute().get_blocks() - def _experimental_lazy(self) -> "Dataset[T]": - """Enable lazy evaluation (experimental).""" - self._lazy = True - return self + def experimental_lazy(self) -> "Dataset[T]": + """EXPERIMENTAL: Enable lazy evaluation. + + The returned dataset is a lazy dataset, where all subsequent operations on the + dataset won't be executed until the dataset is consumed (e.g. ``.take()``, + ``.iter_batches()``, ``.to_torch()``, ``.to_tf()``, etc.) or execution is + manually triggered via ``.fully_executed()``. + """ + ds = Dataset(self._plan, self._epoch, lazy=True) + ds._set_uuid(self._get_uuid()) + return ds def has_serializable_lineage(self) -> bool: """Whether this dataset's lineage is able to be serialized for storage and diff --git a/python/ray/data/tests/test_dataset.py b/python/ray/data/tests/test_dataset.py index 741719f6e..e7579a56d 100644 --- a/python/ray/data/tests/test_dataset.py +++ b/python/ray/data/tests/test_dataset.py @@ -41,7 +41,7 @@ def maybe_pipeline(ds, enabled): def maybe_lazy(ds, enabled): if enabled: - return ds._experimental_lazy() + return ds.experimental_lazy() else: return ds diff --git a/python/ray/data/tests/test_optimize.py b/python/ray/data/tests/test_optimize.py index cda281a40..b233f9ece 100644 --- a/python/ray/data/tests/test_optimize.py +++ b/python/ray/data/tests/test_optimize.py @@ -50,7 +50,7 @@ def test_memory_release_lazy(shutdown_only): ds = ray.data.range(10) # Should get fused into single stage. - ds = ds._experimental_lazy() + ds = ds.experimental_lazy() ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8)) ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8)) ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8)) @@ -69,7 +69,7 @@ def test_memory_release_lazy_shuffle(shutdown_only): ds = ray.data.range(10) # Should get fused into single stage. - ds = ds._experimental_lazy() + ds = ds.experimental_lazy() ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8)) ds.random_shuffle().fully_executed() meminfo = memory_summary(info.address_info["address"], stats_only=True) @@ -84,7 +84,7 @@ def test_memory_release_lazy_shuffle(shutdown_only): def test_spread_hint_inherit(ray_start_regular_shared): - ds = ray.data.range(10)._experimental_lazy() + ds = ray.data.range(10).experimental_lazy() ds = ds.map(lambda x: x + 1) ds = ds.random_shuffle() for s in ds._plan._stages_before_snapshot: @@ -97,7 +97,7 @@ def test_spread_hint_inherit(ray_start_regular_shared): def test_execution_preserves_original(ray_start_regular_shared): - ds = ray.data.range(10).map(lambda x: x + 1)._experimental_lazy() + ds = ray.data.range(10).map(lambda x: x + 1).experimental_lazy() ds1 = ds.map(lambda x: x + 1) assert ds1._plan._snapshot_blocks is not None assert len(ds1._plan._stages_after_snapshot) == 1 @@ -136,7 +136,7 @@ def test_stage_linking(ray_start_regular_shared): _assert_has_stages(ds._plan._last_optimized_stages, ["read->map"]) # Test lazy dataset. - ds = ray.data.range(10)._experimental_lazy() + ds = ray.data.range(10).experimental_lazy() assert len(ds._plan._stages_before_snapshot) == 0 assert len(ds._plan._stages_after_snapshot) == 0 assert len(ds._plan._last_optimized_stages) == 0 @@ -328,7 +328,7 @@ def test_optimize_lazy_reuse_base_data( ds = ray.data.read_datasource(source, parallelism=4, paths=paths) num_reads = ray.get(counter.get.remote()) assert num_reads == 1, num_reads - ds = ds._experimental_lazy() + ds = ds.experimental_lazy() ds = ds.map(lambda x: x) if with_shuffle: ds = ds.random_shuffle() diff --git a/python/ray/tune/impl/test_utils.py b/python/ray/tune/impl/test_utils.py index 0cd9b85c6..62dbfb28f 100644 --- a/python/ray/tune/impl/test_utils.py +++ b/python/ray/tune/impl/test_utils.py @@ -33,8 +33,8 @@ def gen_dataset_func() -> Dataset: def test_grid_search(): - ds1 = gen_dataset_func()._experimental_lazy().map(lambda x: x) - ds2 = gen_dataset_func()._experimental_lazy().map(lambda x: x) + ds1 = gen_dataset_func().experimental_lazy().map(lambda x: x) + ds2 = gen_dataset_func().experimental_lazy().map(lambda x: x) assert not ds1._plan._has_final_stage_snapshot() assert not ds2._plan._has_final_stage_snapshot() param_space = {"train_dataset": tune.grid_search([ds1, ds2])} @@ -46,8 +46,8 @@ def test_grid_search(): def test_choice(): - ds1 = gen_dataset_func()._experimental_lazy().map(lambda x: x) - ds2 = gen_dataset_func()._experimental_lazy().map(lambda x: x) + ds1 = gen_dataset_func().experimental_lazy().map(lambda x: x) + ds2 = gen_dataset_func().experimental_lazy().map(lambda x: x) assert not ds1._plan._has_final_stage_snapshot() assert not ds2._plan._has_final_stage_snapshot() param_space = {"train_dataset": tune.choice([ds1, ds2])}