[Dask] Dask Example Tests (#16346)

* add examples * update dask docs * add build file * formatting * fix ci command * fix * Update python/ray/util/dask/BUILD * newline * fix pytest fixtures * fixes * formatting * fix shuffle example
2025-03-06 02:21:39 -05:00 · 2021-06-12 20:25:45 -07:00 · 2021-06-12 20:25:45 -07:00 · f9936c4252
commit f9936c4252
parent acb439e8f2
12 changed files with 245 additions and 102 deletions
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@ -150,6 +150,8 @@
      --test_tag_filters=-kubernetes,-jenkins_only,client_tests,-flaky
      --test_env=RAY_CLIENT_MODE=1 --test_env=RAY_PROFILING=1
      python/ray/tests/...
    # Dask tests and examples.
    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-client python/ray/util/dask/...
 - label: ":python: (Medium A-J)"
  conditions: ["RAY_CI_PYTHON_AFFECTED"]
  commands:
@ -296,6 +298,7 @@
    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
    - TUNE_TESTING=1 PYTHON=3.7 INSTALL_HOROVOD=1 ./ci/travis/install-dependencies.sh
    - rm -rf ./python/ray/thirdparty_files; ./ci/travis/ci.sh build
    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=client --test_env=RAY_CLIENT_MODE=1 python/ray/util/dask/...
    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=client,-flaky python/ray/tune/...
    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=client,-client_unit_tests python/ray/util/sgd/...
    - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=client,-flaky python/ray/util/xgboost/...
--- a/doc/source/dask-on-ray.rst
+++ b/doc/source/dask-on-ray.rst
@ -31,33 +31,8 @@ any Dask `.compute() <https://docs.dask.org/en/latest/api.html#dask.compute>`__
 call.
 Here's an example:
-.. code-block:: python
+.. literalinclude:: ../../python/ray/util/dask/examples/dask_ray_scheduler_example.py
-
+    :language: python
   import ray
   from ray.util.dask import ray_dask_get
   import dask
   import dask.array as da
   import dask.dataframe as dd
   import numpy as np
   import pandas as pd
   # Start Ray.
   # Tip: If you're connecting to an existing cluster, use ray.init(address="auto").
   ray.init()
   d_arr = da.from_array(np.random.randint(0, 1000, size=(256, 256)))
   # The Dask scheduler submits the underlying task graph to Ray.
   d_arr.mean().compute(scheduler=ray_dask_get)
   # Set the scheduler to ray_dask_get in your config so you don't have to specify it on
   # each compute call.
   dask.config.set(scheduler=ray_dask_get)
   df = dd.from_pandas(pd.DataFrame(
       np.random.randint(0, 100, size=(1024, 2)),
       columns=["age", "grade"]), npartitions=2)
   df.groupby(["age"]).mean().compute()
 .. note::
  For execution on a Ray cluster, you should *not* use the
@ -121,42 +96,8 @@ aggregations): those downstream computations will be faster since that base coll
 computation was kicked off early and referenced by all downstream computations, often
 via shared memory.
-.. code-block:: python
+.. literalinclude:: ../../python/ray/util/dask/examples/dask_ray_persist_example.py
-
+    :language: python
   import ray
   from ray.util.dask import ray_dask_get
   import dask
   import dask.array as da
   import numpy as np
   # Start Ray.
   # Tip: If you're connecting to an existing cluster, use ray.init(address="auto").
   ray.init()
   # Set the scheduler to ray_dask_get in your config so you don't have to specify it on
   # each compute call.
   dask.config.set(scheduler=ray_dask_get)
   d_arr = da.ones(100)
   print(dask.base.collections_to_dsk([d_arr]))
   # {('ones-c345e6f8436ff9bcd68ddf25287d27f3',
   #   0): (functools.partial(<function _broadcast_trick_inner at 0x7f27f1a71f80>,
   #   dtype=dtype('float64')), (5,))}
   # This submits all underlying Ray tasks to the cluster and returns a Dask array with
   # the Ray futures inlined.
   d_arr_p = d_arr.persist()
   # Notice that the Ray ObjectRef is inlined. The dask.ones() task has been submitted
   # to and is running on the Ray cluster.
   dask.base.collections_to_dsk([d_arr_p])
   # {('ones-c345e6f8436ff9bcd68ddf25287d27f3',
   #   0): ObjectRef(8b4e50dc1ddac855ffffffffffffffffffffffff0100000001000000)}
   # Future computations on this persisted Dask Array will be fast since we already
   # started computing d_arr_p in the background.
   d_arr_p.sum().compute()
   d_arr_p.min().compute()
   d_arr_p.max().compute()
 Custom optimization for Dask DataFrame shuffling
@ -168,31 +109,8 @@ Dask on Ray provides a Dask DataFrame optimizer that leverages Ray's ability to
 execute multiple-return tasks in order to speed up shuffling by as much as 4x on Ray.
 Simply set the `dataframe_optimize` configuration option to our optimizer function, similar to how you specify the Dask-on-Ray scheduler:
-.. code-block:: python
+.. literalinclude:: ../../python/ray/util/dask/examples/dask_ray_shuffle_optimization.py
-
+    :language: python
   import ray
   from ray.util.dask import ray_dask_get, dataframe_optimize
   import dask
   import dask.dataframe as dd
   import numpy as np
   import pandas as pd
   # Start Ray.
   # Tip: If you're connecting to an existing cluster, use ray.init(address="auto").
   ray.init()
   # Set the scheduler to ray_dask_get, and set the Dask DataFrame optimizer to our
   # custom optimization function, this time using the config setter as a context manager.
   with dask.config.set(scheduler=ray_dask_get, dataframe_optimize=dataframe_optimize):
       npartitions = 100
       df = dd.from_pandas(pd.DataFrame(
           np.random.randint(0, 100, size=(10000, 2)),
           columns=["age", "grade"]), npartitions=npartitions)
       # We set max_branch to infinity in order to ensure that the task-based shuffle
       # happens in a single stage, which is required in order for our optimization to
       # work.
       df.set_index(
           ["age"], shuffle="tasks", max_branch=float("inf")).head(10, npartitions=-1)
 Callbacks
 ---------
--- a/python/ray/tests/BUILD
+++ b/python/ray/tests/BUILD
@ -125,8 +125,6 @@ py_test_module_list(
    "test_command_runner.py",
    "test_component_failures.py",
    "test_coordinator_server.py",
    "test_dask_callback.py",
    "test_dask_scheduler.py",
    "test_debug_tools.py",
    "test_job.py",
    "test_k8s_operator_unit_tests.py",
@ -239,9 +237,6 @@ py_test_module_list(
    "test_basic_2.py",
    "test_basic_3.py",
    "test_asyncio.py",
    "test_dask_callback.py",
    "test_dask_scheduler.py",
    "test_dask_optimization.py",
  ],
  size = "medium",
  extra_srcs = SRCS,
--- a/python/ray/util/dask/BUILD
+++ b/python/ray/util/dask/BUILD
@ -0,0 +1,118 @@
 # --------------------------------------------------------------------
 # Tests from the python/ray/util/dask/tests directory.
 # Please keep these sorted alphabetically.
 # --------------------------------------------------------------------
 py_test(
    name = "test_dask_callback",
    size = "small",
    srcs = ["tests/test_dask_callback.py"],
    tags = ["exclusive"],
    deps = [":dask_lib"]
 )
 py_test(
    name = "test_dask_callback_client_mode",
    size = "small",
    main = "test_dask_callback.py",
    srcs = ["tests/test_dask_callback.py"],
    tags = ["exclusive", "client"],
    deps = [":dask_lib"]
 )
 py_test(
    name = "test_dask_optimization",
    size = "small",
    srcs = ["tests/test_dask_optimization.py"],
    tags = ["exclusive"],
    deps = [":dask_lib"]
 )
 py_test(
    name = "test_dask_optimization_client_mode",
    size = "small",
    main = "test_dask_optimization.py",
    srcs = ["tests/test_dask_optimization.py"],
    tags = ["exclusive", "client"],
    deps = [":dask_lib"]
 )
 py_test(
    name = "test_dask_scheduler",
    size = "small",
    srcs = ["tests/test_dask_scheduler.py"],
    tags = ["exclusive"],
    deps = [":dask_lib"]
 )
 py_test(
    name = "test_dask_scheduler_client_mode",
    size = "small",
    main = "test_dask_scheduler.py",
    srcs = ["tests/test_dask_scheduler.py"],
    tags = ["exclusive", "client"],
    deps = [":dask_lib"]
 )
 # --------------------------------------------------------------------
 # Tests from the python/ray/util/dask/examples directory.
 # Please keep these sorted alphabetically.
 # --------------------------------------------------------------------
 py_test(
    name = "dask_ray_persist_example",
    size = "medium",
    srcs = ["examples/dask_ray_persist_example.py"],
    tags = ["exclusive"],
    deps = [":dask_lib"],
 )
 py_test(
    name = "dask_ray_persist_example_client_mode",
    size = "medium",
    main = "dask_ray_persist_example.py",
    srcs = ["examples/dask_ray_persist_example.py"],
    tags = ["exclusive", "client"],
    deps = [":dask_lib"],
 )
 py_test(
    name = "dask_ray_scheduler_example",
    size = "medium",
    srcs = ["examples/dask_ray_scheduler_example.py"],
    tags = ["exclusive"],
    deps = [":dask_lib"],
 )
 py_test(
    name = "dask_ray_scheduler_example_client_mode",
    size = "medium",
    main = "dask_ray_scheduler_example.py",
    srcs = ["examples/dask_ray_scheduler_example.py"],
    tags = ["exclusive", "client"],
    deps = [":dask_lib"],
 )
 py_test(
    name = "dask_ray_shuffle_optimization",
    size = "medium",
    srcs = ["examples/dask_ray_shuffle_optimization.py"],
    tags = ["exclusive"],
    deps = [":dask_lib"],
 )
 py_test(
    name = "dask_ray_shuffle_optimization_client_mode",
    size = "medium",
    main = "dask_ray_shuffle_optimization.py",
    srcs = ["examples/dask_ray_shuffle_optimization.py"],
    tags = ["exclusive", "client"],
    deps = [":dask_lib"],
 )
 # This is a dummy test dependency that causes the above tests to be
 # re-run if any of these files changes.
 py_library(
    name = "dask_lib",
    srcs = glob(["**/*.py"], exclude=["tests/*.py"]),
 )
--- a/python/ray/util/dask/examples/init.py
+++ b/python/ray/util/dask/examples/init.py
--- a/python/ray/util/dask/examples/dask_ray_persist_example.py
+++ b/python/ray/util/dask/examples/dask_ray_persist_example.py
@ -0,0 +1,36 @@
 import ray
 from ray.util.dask import ray_dask_get
 import dask
 import dask.array as da
 # Start Ray.
 # Tip: If connecting to an existing cluster, use ray.init(address="auto").
 ray.init()
 # Set the scheduler to ray_dask_get in your config so you don't
 # have to specify it on each compute call.
 dask.config.set(scheduler=ray_dask_get)
 d_arr = da.ones(100)
 print(dask.base.collections_to_dsk([d_arr]))
 # {('ones-c345e6f8436ff9bcd68ddf25287d27f3',
 #   0): (functools.partial(<function _broadcast_trick_inner at 0x7f27f1a71f80>,
 #   dtype=dtype('float64')), (5,))}
 # This submits all underlying Ray tasks to the cluster and returns
 # a Dask array with the Ray futures inlined.
 d_arr_p = d_arr.persist()
 # Notice that the Ray ObjectRef is inlined. The dask.ones() task has
 # been submitted to and is running on the Ray cluster.
 dask.base.collections_to_dsk([d_arr_p])
 # {('ones-c345e6f8436ff9bcd68ddf25287d27f3',
 #   0): ObjectRef(8b4e50dc1ddac855ffffffffffffffffffffffff0100000001000000)}
 # Future computations on this persisted Dask Array will be fast since we
 # already started computing d_arr_p in the background.
 d_arr_p.sum().compute()
 d_arr_p.min().compute()
 d_arr_p.max().compute()
 ray.shutdown()
--- a/python/ray/util/dask/examples/dask_ray_scheduler_example.py
+++ b/python/ray/util/dask/examples/dask_ray_scheduler_example.py
@ -0,0 +1,28 @@
 import ray
 from ray.util.dask import ray_dask_get
 import dask
 import dask.array as da
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 # Start Ray.
 # Tip: If connecting to an existing cluster, use ray.init(address="auto").
 ray.init()
 d_arr = da.from_array(np.random.randint(0, 1000, size=(256, 256)))
 # The Dask scheduler submits the underlying task graph to Ray.
 d_arr.mean().compute(scheduler=ray_dask_get)
 # Set the scheduler to ray_dask_get in your config so you don't have to
 # specify it on each compute call.
 dask.config.set(scheduler=ray_dask_get)
 df = dd.from_pandas(
    pd.DataFrame(
        np.random.randint(0, 100, size=(1024, 2)), columns=["age", "grade"]),
    npartitions=2)
 df.groupby(["age"]).mean().compute()
 ray.shutdown()
--- a/python/ray/util/dask/examples/dask_ray_shuffle_optimization.py
+++ b/python/ray/util/dask/examples/dask_ray_shuffle_optimization.py
@ -0,0 +1,29 @@
 import ray
 from ray.util.dask import dataframe_optimize
 import dask
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 # Start Ray.
 # Tip: If connecting to an existing cluster, use ray.init(address="auto").
 ray.init()
 # Set the Dask DataFrame optimizer to
 # our custom optimization function, this time using the config setter as a
 # context manager.
 with dask.config.set(dataframe_optimize=dataframe_optimize):
    npartitions = 100
    df = dd.from_pandas(
        pd.DataFrame(
            np.random.randint(0, 100, size=(10000, 2)),
            columns=["age", "grade"]),
        npartitions=npartitions)
    # We set max_branch to infinity in order to ensure that the task-based
    # shuffle happens in a single stage, which is required in order for our
    # optimization to work.
    df.set_index(
        ["age"], shuffle="tasks", max_branch=float("inf")).head(
            10, npartitions=-1)
 ray.shutdown()
--- a/python/ray/util/dask/tests/init.py
+++ b/python/ray/util/dask/tests/init.py
--- a/python/ray/util/dask/tests/test_dask_callback.py
+++ b/python/ray/util/dask/tests/test_dask_callback.py
@ -5,6 +5,14 @@ import ray
 from ray.util.dask import ray_dask_get, RayDaskCallback
@pytest.fixture
 def ray_start_1_cpu():
    address_info = ray.init(num_cpus=2)
    yield address_info
    # The code after the yield will run as teardown code.
    ray.shutdown()
@dask.delayed
 def add(x, y):
    return x + y
@ -20,7 +28,7 @@ def test_callback_active():
    assert not RayDaskCallback.ray_active
-def test_presubmit_shortcircuit(ray_start_regular_shared):
+def test_presubmit_shortcircuit(ray_start_1_cpu):
    """
    Test that presubmit return short-circuits task submission, and that task's
    result is set to the presubmit return value.
@ -41,7 +49,7 @@ def test_presubmit_shortcircuit(ray_start_regular_shared):
    assert result == 0
-def test_pretask_posttask_shared_state(ray_start_regular_shared):
+def test_pretask_posttask_shared_state(ray_start_1_cpu):
    """
    Test that pretask return value is passed to corresponding posttask
    callback.
@ -61,7 +69,7 @@ def test_pretask_posttask_shared_state(ray_start_regular_shared):
    assert result == 5
-def test_postsubmit(ray_start_regular_shared):
+def test_postsubmit(ray_start_1_cpu):
    """
    Test that postsubmit is called after each task.
    """
@ -94,7 +102,7 @@ def test_postsubmit(ray_start_regular_shared):
    assert result == 5
-def test_postsubmit_all(ray_start_regular_shared):
+def test_postsubmit_all(ray_start_1_cpu):
    """
    Test that postsubmit_all is called once.
    """
@ -126,7 +134,7 @@ def test_postsubmit_all(ray_start_regular_shared):
    assert result == 5
-def test_finish(ray_start_regular_shared):
+def test_finish(ray_start_1_cpu):
    """
    Test that finish callback is called once.
    """
@ -158,7 +166,7 @@ def test_finish(ray_start_regular_shared):
    assert result == 5
-def test_multiple_callbacks(ray_start_regular_shared):
+def test_multiple_callbacks(ray_start_1_cpu):
    """
    Test that multiple callbacks are supported.
    """
@ -194,7 +202,7 @@ def test_multiple_callbacks(ray_start_regular_shared):
    assert result == 5
-def test_pretask_posttask_shared_state_multi(ray_start_regular_shared):
+def test_pretask_posttask_shared_state_multi(ray_start_1_cpu):
    """
    Test that pretask return values are passed to the correct corresponding
    posttask callbacks when multiple callbacks are given.
--- a/python/ray/util/dask/tests/test_dask_optimization.py
+++ b/python/ray/util/dask/tests/test_dask_optimization.py
--- a/python/ray/util/dask/tests/test_dask_scheduler.py
+++ b/python/ray/util/dask/tests/test_dask_scheduler.py
@ -7,8 +7,16 @@ import ray
 from ray.util.client.common import ClientObjectRef
@pytest.fixture
 def ray_start_1_cpu():
    address_info = ray.init(num_cpus=2)
    yield address_info
    # The code after the yield will run as teardown code.
    ray.shutdown()
@unittest.skipIf(sys.platform == "win32", "Failing on Windows.")
-def test_ray_dask_basic(ray_start_regular_shared):
+def test_ray_dask_basic(ray_start_1_cpu):
    from ray.util.dask import ray_dask_get
    @ray.remote
@ -36,7 +44,7 @@ def test_ray_dask_basic(ray_start_regular_shared):
@unittest.skipIf(sys.platform == "win32", "Failing on Windows.")
-def test_ray_dask_persist(ray_start_regular_shared):
+def test_ray_dask_persist(ray_start_1_cpu):
    from ray.util.dask import ray_dask_get
    arr = da.ones(5) + 2
    result = arr.persist(scheduler=ray_dask_get)