Dataset speed up read (#17435)

Alex Wu, 2021-08-01 18:03:46 -07:00 (committed by GitHub)
parent 6703091cdc
commit d9cd3800c7
7 changed files with 64 additions and 21 deletions


@@ -23,3 +23,6 @@ class BinaryDatasource(FileBasedDatasource):
             return path, data
         else:
             return data
+
+    def _rows_per_file(self):
+        return 1


@@ -62,22 +62,35 @@ class FileBasedDatasource(Datasource[Union[ArrowRow, Any]]):
                         builder.add(data)
             return builder.build()
 
-        read_tasks = [
-            ReadTask(
+        read_tasks = []
+        for read_paths, file_sizes in zip(
+                np.array_split(paths, parallelism),
+                np.array_split(file_sizes, parallelism)):
+            if len(read_paths) <= 0:
+                continue
+
+            if self._rows_per_file() is None:
+                num_rows = None
+            else:
+                num_rows = len(read_paths) * self._rows_per_file()
+            read_task = ReadTask(
                 lambda read_paths=read_paths: read_files(
                     read_paths, filesystem),
                 BlockMetadata(
-                    num_rows=None,
+                    num_rows=num_rows,
                     size_bytes=sum(file_sizes),
                     schema=schema,
-                    input_files=read_paths)) for read_paths, file_sizes in zip(
-                        np.array_split(paths, parallelism),
-                        np.array_split(file_sizes, parallelism))
-            if len(read_paths) > 0
-        ]
+                    input_files=read_paths)
+            )
+            read_tasks.append(read_task)
 
         return read_tasks
 
+    def _rows_per_file(self):
+        """Returns the number of rows per file, or None if unknown.
+        """
+        return None
+
     def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
         """Reads a single file, passing all kwargs to the reader.


@@ -1,6 +1,6 @@
 import logging
-from typing import List, Any, Union, Optional, Tuple, Callable, TypeVar, \
-    TYPE_CHECKING
+from typing import List, Any, Dict, Union, Optional, Tuple, Callable, \
+    TypeVar, TYPE_CHECKING
 
 if TYPE_CHECKING:
     import pyarrow
@@ -104,6 +104,7 @@ def range_arrow(n: int, *, parallelism: int = 200) -> Dataset[ArrowRow]:
 def read_datasource(datasource: Datasource[T],
                     *,
                     parallelism: int = 200,
+                    ray_remote_args: Dict[str, Any] = None,
                     **read_args) -> Dataset[T]:
     """Read a dataset from a custom data source.
@@ -111,6 +112,7 @@ def read_datasource(datasource: Datasource[T],
         datasource: The datasource to read data from.
         parallelism: The requested parallelism of the read.
         read_args: Additional kwargs to pass to the datasource impl.
+        ray_remote_args: kwargs passed to ray.remote in the read tasks.
 
     Returns:
         Dataset holding the data read from the datasource.
@@ -118,10 +120,14 @@ def read_datasource(datasource: Datasource[T],
     read_tasks = datasource.prepare_read(parallelism, **read_args)
 
-    @ray.remote
     def remote_read(task: ReadTask) -> Block:
         return task()
 
+    if ray_remote_args:
+        remote_read = ray.remote(**ray_remote_args)(remote_read)
+    else:
+        remote_read = ray.remote(remote_read)
+
     calls: List[Callable[[], ObjectRef[Block]]] = []
     metadata: List[BlockMetadata] = []
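
A note on the wrapping above (an editorial sketch, not code from the diff): ray.remote(**kwargs) returns a decorator, so the two branches amount to choosing between @ray.remote(**ray_remote_args) and a bare @ray.remote. A self-contained illustration with hypothetical names:

import ray

def make_remote(fn, ray_remote_args=None):
    # ray.remote(**kwargs) yields a decorator; ray.remote(fn) wraps directly.
    if ray_remote_args:
        return ray.remote(**ray_remote_args)(fn)
    return ray.remote(fn)

def read_one_block():
    return b"some block"

ray.init()
read = make_remote(read_one_block, {"num_cpus": 0.5})
print(ray.get(read.remote()))  # this task reserves only half a CPU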
@@ -157,6 +163,7 @@ def read_parquet(paths: Union[str, List[str]],
                  filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                  columns: Optional[List[str]] = None,
                  parallelism: int = 200,
+                 ray_remote_args: Dict[str, Any] = None,
                  **arrow_parquet_args) -> Dataset[ArrowRow]:
     """Create an Arrow dataset from parquet files.
@@ -172,6 +179,7 @@ def read_parquet(paths: Union[str, List[str]],
         filesystem: The filesystem implementation to read from.
         columns: A list of column names to read.
         parallelism: The amount of parallelism to use for the dataset.
+        ray_remote_args: kwargs passed to ray.remote in the read tasks.
         arrow_parquet_args: Other parquet read options to pass to pyarrow.
 
     Returns:
@@ -183,6 +191,7 @@ def read_parquet(paths: Union[str, List[str]],
         paths=paths,
         filesystem=filesystem,
         columns=columns,
+        ray_remote_args=ray_remote_args,
         **arrow_parquet_args)
@@ -191,6 +200,7 @@ def read_json(paths: Union[str, List[str]],
               *,
               filesystem: Optional["pyarrow.fs.FileSystem"] = None,
               parallelism: int = 200,
+              ray_remote_args: Dict[str, Any] = None,
               **arrow_json_args) -> Dataset[ArrowRow]:
     """Create an Arrow dataset from json files.
@@ -209,6 +219,7 @@ def read_json(paths: Union[str, List[str]],
             A list of paths can contain both files and directories.
         filesystem: The filesystem implementation to read from.
         parallelism: The amount of parallelism to use for the dataset.
+        ray_remote_args: kwargs passed to ray.remote in the read tasks.
         arrow_json_args: Other json read options to pass to pyarrow.
 
     Returns:
@@ -219,6 +230,7 @@ def read_json(paths: Union[str, List[str]],
         parallelism=parallelism,
         paths=paths,
         filesystem=filesystem,
+        ray_remote_args=ray_remote_args,
         **arrow_json_args)
@@ -227,6 +239,7 @@ def read_csv(paths: Union[str, List[str]],
              *,
              filesystem: Optional["pyarrow.fs.FileSystem"] = None,
              parallelism: int = 200,
+             ray_remote_args: Dict[str, Any] = None,
              **arrow_csv_args) -> Dataset[ArrowRow]:
     """Create an Arrow dataset from csv files.
@@ -245,6 +258,7 @@ def read_csv(paths: Union[str, List[str]],
             A list of paths can contain both files and directories.
         filesystem: The filesystem implementation to read from.
         parallelism: The amount of parallelism to use for the dataset.
+        ray_remote_args: kwargs passed to ray.remote in the read tasks.
         arrow_csv_args: Other csv read options to pass to pyarrow.
 
     Returns:
@@ -255,6 +269,7 @@ def read_csv(paths: Union[str, List[str]],
         parallelism=parallelism,
         paths=paths,
         filesystem=filesystem,
+        ray_remote_args=ray_remote_args,
         **arrow_csv_args)
@@ -264,7 +279,9 @@ def read_binary_files(
         *,
         include_paths: bool = False,
         filesystem: Optional["pyarrow.fs.FileSystem"] = None,
-        parallelism: int = 200) -> Dataset[Union[Tuple[str, bytes], bytes]]:
+        parallelism: int = 200,
+        ray_remote_args: Dict[str, Any] = None,
+) -> Dataset[Union[Tuple[str, bytes], bytes]]:
     """Create a dataset from binary files of arbitrary contents.
 
     Examples:
@@ -280,6 +297,7 @@ def read_binary_files(
             dataset records. When specified, the dataset records will be a
             tuple of the file path and the file contents.
         filesystem: The filesystem implementation to read from.
+        ray_remote_args: kwargs passed to ray.remote in the read tasks.
         parallelism: The amount of parallelism to use for the dataset.
 
     Returns:
@@ -291,6 +309,7 @@ def read_binary_files(
         paths=paths,
         include_paths=include_paths,
         filesystem=filesystem,
+        ray_remote_args=ray_remote_args,
         schema=bytes)
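
Taken together, every reader now forwards ray_remote_args to its read tasks. A minimal usage sketch; the bucket, parallelism, and num_cpus values are the ones used by the release test further down, everything else is assumed:

import ray
import ray.experimental.data

ray.init()

# 1000 read tasks, each reserving only half a CPU, so roughly twice as many
# downloads can run concurrently on each node.
ds = ray.experimental.data.read_binary_files(
    "s3://anyscale-data/small-images/",
    parallelism=1000,
    ray_remote_args={"num_cpus": 0.5})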


@@ -1782,6 +1782,7 @@ if __name__ == "__main__":
     if args.ray_wheels:
         os.environ["RAY_WHEELS"] = str(args.ray_wheels)
+        url = str(args.ray_wheels)
     elif not args.check:
         url = find_ray_wheels(
             GLOBAL_CONFIG["RAY_REPO"],


@@ -9,6 +9,6 @@
 run:
     timeout: 600
-    prepare: sleep 0
+    prepare: python wait_cluster.py
     script: python inference.py


@@ -75,17 +75,15 @@ def infer(batch):
 ray.init()
 
-while ray.cluster_resources().get("GPU", 0) != 2:
-    print("Waiting for GPUs {}/2".format(ray.cluster_resources().get(
-        "GPU", 400)))
-    time.sleep(5)
-
 start_time = time.time()
 
 print("Downloading...")
 ds = ray.experimental.data.read_binary_files(
-    "s3://anyscale-data/small-images/", parallelism=400)
-ds = ds.limit(100 * 1000)
+    "s3://anyscale-data/small-images/",
+    parallelism=1000,
+    ray_remote_args={"num_cpus": 0.5})
+
+# Do a blocking map so that we can measure the download time.
+ds = ds.map(lambda x: x)
 
 end_download_time = time.time()
 print("Preprocessing...")


@@ -0,0 +1,9 @@
+import ray
+import time
+
+ray.init()
+
+while ray.cluster_resources().get("GPU", 0) != 2:
+    print("Waiting for GPUs {}/2".format(ray.cluster_resources().get(
+        "GPU", 400)))
+    time.sleep(5)
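
This new helper is what the prepare: python wait_cluster.py line in the release-test config above invokes, so inference.py only starts once both GPUs have joined the cluster. A hypothetical parameterized variant (not part of the commit) could look like:

import argparse
import time

import ray

# Hypothetical generalization: block until the cluster reports --gpus GPUs.
parser = argparse.ArgumentParser()
parser.add_argument("--gpus", type=float, default=2)
args = parser.parse_args()

ray.init()
while ray.cluster_resources().get("GPU", 0) < args.gpus:
    print("Waiting for GPUs: {}/{}".format(
        ray.cluster_resources().get("GPU", 0), args.gpus))
    time.sleep(5)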