Increase dataset read parallelism by default (#18420)

This commit is contained in:
Eric Liang 2021-09-09 15:07:49 -07:00 committed by GitHub
parent ccc16a46bb
commit 4d2065352b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 14 additions and 7 deletions

View file

@@ -1,7 +1,7 @@
.. _datasets_tensor_support: .. _datasets_tensor_support:
Datasets Tensor Support Dataset Tensor Support
======================= ======================
Tensor-typed values Tensor-typed values
------------------- -------------------

View file

@@ -62,7 +62,7 @@ Datasource Compatibility Matrices
- ✅ - ✅
* - Spark Dataframe * - Spark Dataframe
- ``ray.data.from_spark()`` - ``ray.data.from_spark()``
- (todo) -
* - Dask Dataframe * - Dask Dataframe
- ``ray.data.from_dask()`` - ``ray.data.from_dask()``
- ✅ - ✅
@@ -106,7 +106,7 @@ Datasource Compatibility Matrices
- ✅ - ✅
* - Spark Dataframe * - Spark Dataframe
- ``ds.to_spark()`` - ``ds.to_spark()``
- (todo) -
* - Dask Dataframe * - Dask Dataframe
- ``ds.to_dask()`` - ``ds.to_dask()``
- ✅ - ✅

View file

@@ -216,9 +216,10 @@ class DatasetPipeline(Generic[T]):
time.sleep(self.wait_delay_s) time.sleep(self.wait_delay_s)
tries += 1 tries += 1
if tries > self.warn_threshold: if tries > self.warn_threshold:
print("Warning: shard {} of the pipeline has been " print("Warning: reader on shard {} of the pipeline "
"stalled more than {}s waiting for other shards " "has been blocked more than {}s waiting for "
"to catch up.".format( "other readers to catch up. All pipeline shards "
"must be read from concurrently.".format(
self.split_index, self.split_index,
self.wait_delay_s * self.warn_threshold)) self.wait_delay_s * self.warn_threshold))
self.warn_threshold *= 2 self.warn_threshold *= 2

View file

@@ -155,6 +155,12 @@ def read_datasource(datasource: Datasource[T],
if ray_remote_args is None: if ray_remote_args is None:
ray_remote_args = {} ray_remote_args = {}
# Increase the read parallelism by default to maximize IO throughput. This
# is particularly important when reading from e.g., remote storage.
if "num_cpus" not in ray_remote_args:
# Note that the too many workers warning triggers at 4x subscription,
# so we go at 0.5 to avoid the warning message.
ray_remote_args["num_cpus"] = 0.5
remote_read = cached_remote_fn(remote_read, **ray_remote_args) remote_read = cached_remote_fn(remote_read, **ray_remote_args)
calls: List[Callable[[], ObjectRef[Block]]] = [] calls: List[Callable[[], ObjectRef[Block]]] = []