mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00
Increase dataset read parallelism by default (#18420)
This commit is contained in:
parent
ccc16a46bb
commit
4d2065352b
4 changed files with 14 additions and 7 deletions
|
@ -1,7 +1,7 @@
|
|||
.. _datasets_tensor_support:
|
||||
|
||||
Datasets Tensor Support
|
||||
=======================
|
||||
Dataset Tensor Support
|
||||
======================
|
||||
|
||||
Tensor-typed values
|
||||
-------------------
|
||||
|
|
|
@ -62,7 +62,7 @@ Datasource Compatibility Matrices
|
|||
- ✅
|
||||
* - Spark Dataframe
|
||||
- ``ray.data.from_spark()``
|
||||
- (todo)
|
||||
- ✅
|
||||
* - Dask Dataframe
|
||||
- ``ray.data.from_dask()``
|
||||
- ✅
|
||||
|
@ -106,7 +106,7 @@ Datasource Compatibility Matrices
|
|||
- ✅
|
||||
* - Spark Dataframe
|
||||
- ``ds.to_spark()``
|
||||
- (todo)
|
||||
- ✅
|
||||
* - Dask Dataframe
|
||||
- ``ds.to_dask()``
|
||||
- ✅
|
||||
|
|
|
@ -216,9 +216,10 @@ class DatasetPipeline(Generic[T]):
|
|||
time.sleep(self.wait_delay_s)
|
||||
tries += 1
|
||||
if tries > self.warn_threshold:
|
||||
print("Warning: shard {} of the pipeline has been "
|
||||
"stalled more than {}s waiting for other shards "
|
||||
"to catch up.".format(
|
||||
print("Warning: reader on shard {} of the pipeline "
|
||||
"has been blocked more than {}s waiting for "
|
||||
"other readers to catch up. All pipeline shards "
|
||||
"must be read from concurrently.".format(
|
||||
self.split_index,
|
||||
self.wait_delay_s * self.warn_threshold))
|
||||
self.warn_threshold *= 2
|
||||
|
|
|
@ -155,6 +155,12 @@ def read_datasource(datasource: Datasource[T],
|
|||
|
||||
if ray_remote_args is None:
|
||||
ray_remote_args = {}
|
||||
# Increase the read parallelism by default to maximize IO throughput. This
|
||||
# is particularly important when reading from, e.g., remote storage.
|
||||
if "num_cpus" not in ray_remote_args:
|
||||
# Note that the "too many workers" warning triggers at 4x
|
||||
# oversubscription, so we request 0.5 CPUs to avoid the warning.
|
||||
ray_remote_args["num_cpus"] = 0.5
|
||||
remote_read = cached_remote_fn(remote_read, **ray_remote_args)
|
||||
|
||||
calls: List[Callable[[], ObjectRef[Block]]] = []
|
||||
|
|
Loading…
Add table
Reference in a new issue