[dataset] Fix conversion to pyarrow tables in several transforms (#16916)

Eric Liang 2021-07-06 20:40:57 -07:00 committed by GitHub
parent 23088bd7ea
commit ca083e16d4
6 changed files with 109 additions and 35 deletions

View file

@ -12,6 +12,8 @@ if TYPE_CHECKING:
import dask
import pyspark
import ray.util.sgd
import torch
import tensorflow as tf
import collections
import itertools
@ -57,7 +59,9 @@ class Dataset(Generic[T]):
self._blocks: BlockList[T] = blocks
assert isinstance(self._blocks, BlockList), self._blocks
def map(self, fn: Callable[[T], U], compute="tasks",
def map(self,
fn: Callable[[T], U],
compute: Optional[str] = None,
**ray_remote_args) -> "Dataset[U]":
"""Apply the given function to each record of this dataset.
@ -75,8 +79,8 @@ class Dataset(Generic[T]):
Args:
fn: The function to apply to each record.
compute: The compute strategy, either "tasks" to use Ray tasks,
or "actors" to use an autoscaling Ray actor pool.
compute: The compute strategy, either "tasks" (default) to use Ray
tasks, or "actors" to use an autoscaling Ray actor pool.
ray_remote_args: Additional resource requirements to request from
ray (e.g., num_gpus=1 to request GPUs for the map tasks).
"""
@ -94,7 +98,7 @@ class Dataset(Generic[T]):
def map_batches(self,
fn: Callable[[BatchType], BatchType],
batch_size: int = None,
compute: str = "tasks",
compute: Optional[str] = None,
batch_format: str = "pandas",
**ray_remote_args) -> "Dataset[Any]":
"""Apply the given function to batches of records of this dataset.
@ -126,12 +130,12 @@ class Dataset(Generic[T]):
fn: The function to apply to each record batch.
batch_size: Request a specific batch size, or leave unspecified
to use entire blocks as batches.
compute: The compute strategy, either "tasks" to use Ray tasks,
or "actors" to use an autoscaling Ray actor pool. When using
actors, state can be preserved across function invocations
in Python global variables. This can be useful for one-time
setups, e.g., initializing a model once and re-using it across
many function applications.
compute: The compute strategy, either "tasks" (default) to use Ray
tasks, or "actors" to use an autoscaling Ray actor pool. When
using actors, state can be preserved across function
invocations in Python global variables. This can be useful for
one-time setups, e.g., initializing a model once and re-using
it across many function applications.
batch_format: Specify "pandas" to select ``pandas.DataFrame`` as
the batch format, or "pyarrow" to select ``pyarrow.Table``.
ray_remote_args: Additional resource requirements to request from
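
Sketch of the two batch formats (illustrative; assumes range_arrow() produces Arrow-backed blocks as in the tests at the end of this diff):

import ray.experimental.data

ds = ray.experimental.data.range_arrow(8)
# Batches arrive as pandas.DataFrame objects (the default format).
ds.map_batches(lambda df: df + 1, batch_format="pandas", batch_size=4)
# Batches arrive as pyarrow.Table objects; this path now goes through
# to_arrow_table() rather than the private _table attribute.
ds.map_batches(lambda table: table, batch_format="pyarrow", batch_size=4)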
@ -157,7 +161,7 @@ class Dataset(Generic[T]):
if batch_format == "pandas":
view = view.to_pandas()
elif batch_format == "pyarrow":
view = view._table
view = view.to_arrow_table()
else:
raise ValueError(
f"The given batch format: {batch_format} "
@ -185,7 +189,7 @@ class Dataset(Generic[T]):
def flat_map(self,
fn: Callable[[T], Iterable[U]],
compute="tasks",
compute: Optional[str] = None,
**ray_remote_args) -> "Dataset[U]":
"""Apply the given function to each record and then flatten results.
@ -199,8 +203,8 @@ class Dataset(Generic[T]):
Args:
fn: The function to apply to each record.
compute: The compute strategy, either "tasks" to use Ray tasks,
or "actors" to use an autoscaling Ray actor pool.
compute: The compute strategy, either "tasks" (default) to use Ray
tasks, or "actors" to use an autoscaling Ray actor pool.
ray_remote_args: Additional resource requirements to request from
ray (e.g., num_gpus=1 to request GPUs for the map tasks).
"""
@ -218,7 +222,7 @@ class Dataset(Generic[T]):
def filter(self,
fn: Callable[[T], bool],
compute="tasks",
compute: Optional[str] = None,
**ray_remote_args) -> "Dataset[T]":
"""Filter out records that do not satisfy the given predicate.
@ -232,8 +236,8 @@ class Dataset(Generic[T]):
Args:
fn: The predicate function to apply to each record.
compute: The compute strategy, either "tasks" to use Ray tasks,
or "actors" to use an autoscaling Ray actor pool.
compute: The compute strategy, either "tasks" (default) to use Ray
tasks, or "actors" to use an autoscaling Ray actor pool.
ray_remote_args: Additional resource requirements to request from
ray (e.g., num_gpus=1 to request GPUs for the map tasks).
"""
@ -654,8 +658,9 @@ class Dataset(Generic[T]):
def parquet_write(write_path, block):
logger.debug(
f"Writing {block.num_rows()} records to {write_path}.")
with pq.ParquetWriter(write_path, block._table.schema) as writer:
writer.write_table(block._table)
table = block.to_arrow_table()
with pq.ParquetWriter(write_path, table.schema) as writer:
writer.write_table(table)
refs = [
parquet_write.remote(
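
The same writer pattern outside of a block, as a minimal standalone sketch (the path and column name are made up for illustration):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"value": [1, 2, 3]})
with pq.ParquetWriter("/tmp/example.parquet", table.schema) as writer:
    writer.write_table(table)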
@ -808,23 +813,29 @@ class Dataset(Generic[T]):
raise NotImplementedError # P1
def to_torch(self, **todo) -> "ray.util.sgd.torch.TorchMLDataset":
"""Return a dataset that can be used for Torch distributed training.
def to_torch(self, **todo) -> "torch.utils.data.IterableDataset":
"""Return a Torch data iterator over this dataset.
Note that you probably want to call ``.split()`` on this dataset if
there are to be multiple Torch workers consuming the data.
Time complexity: O(1)
Returns:
A TorchMLDataset.
A torch IterableDataset.
"""
raise NotImplementedError # P1
def to_tf(self, **todo) -> "ray.util.sgd.tf.TFMLDataset":
"""Return a dataset that can be used for TF distributed training.
def to_tf(self, **todo) -> "tf.data.Dataset":
"""Return a TF data iterator over this dataset.
Note that you probably want to call ``.split()`` on this dataset if
there are to be multiple TensorFlow workers consuming the data.
Time complexity: O(1)
Returns:
A TFMLDataset.
A tf.data.Dataset.
"""
raise NotImplementedError # P1
@ -855,7 +866,7 @@ class Dataset(Generic[T]):
"Dataset.to_dask() must be used with Dask-on-Ray, please "
"set the Dask scheduler to ray_dask_get (located in "
"ray.util.dask).")
return block._table.to_pandas()
return block.to_pandas()
# TODO(Clark): Give Dask a Pandas-esque schema via the Pyarrow schema,
# once that's implemented.
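
Sketch of the Dask-on-Ray round trip (illustrative; assumes dask is installed and mirrors the assertion in the tests below):

import dask
import ray.experimental.data
from ray.util.dask import ray_dask_get

dask.config.set(scheduler=ray_dask_get)
ds = ray.experimental.data.range(100)
ddf = ds.to_dask()
assert ddf.sum().compute()[0] == 4950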
@ -895,7 +906,7 @@ class Dataset(Generic[T]):
@ray.remote
def block_to_df(block: ArrowBlock):
return block._table.to_pandas()
return block.to_pandas()
return [block_to_df.remote(block) for block in self._blocks]

View file

@ -148,6 +148,9 @@ class ArrowBlock(Block):
def to_pandas(self) -> "pandas.DataFrame":
return self._table.to_pandas()
def to_arrow_table(self) -> "pyarrow.Table":
return self._table
def num_rows(self) -> int:
return self._table.num_rows

View file

@ -12,14 +12,32 @@ T = TypeVar("T")
class BlockBuilder(Generic[T]):
"""A builder class for blocks."""
def add(self, item: T) -> None:
"""Append a single row to the block being built."""
raise NotImplementedError
def add_block(self, block: "Block[T]") -> None:
"""Append an entire block to the block being built."""
raise NotImplementedError
def build(self) -> "Block[T]":
"""Build the block."""
raise NotImplementedError
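
A hypothetical sketch of driving this interface; the builder() entry point on a concrete block class is assumed here, not shown in this diff:

from ray.experimental.data.impl.block import SimpleBlock

builder = SimpleBlock.builder()   # assumed: SimpleBlock provides a builder()
for row in [1, 2, 3]:
    builder.add(row)
block = builder.build()
assert block.num_rows() == 3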
class BlockMetadata:
"""Metadata about the block.
Attributes:
num_rows: The number of rows contained in this block, or None.
size_bytes: The approximate size in bytes of this block, or None.
schema: The pyarrow schema or types of the block elements, or None.
input_files: The list of file paths used to generate this block, or
the empty list if indeterminate.
"""
def __init__(self, *, num_rows: Optional[int], size_bytes: Optional[int],
schema: Union[type, "pyarrow.lib.Schema"],
input_files: List[str]):
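
Construction sketch based on the keyword-only signature above (values are illustrative):

from ray.experimental.data.impl.block import BlockMetadata

meta = BlockMetadata(
    num_rows=1000,
    size_bytes=8000,   # approximate size in bytes
    schema=int,        # Python type for simple blocks, or a pyarrow.lib.Schema
    input_files=[])    # empty list when indeterminate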
@ -32,25 +50,51 @@ class BlockMetadata:
class Block(Generic[T]):
"""Represents a batch of rows to be stored in the Ray object store.
There are two types of blocks: ``SimpleBlock``, which is backed by a plain
Python list, and ``ArrowBlock``, which is backed by a ``pyarrow.Table``.
"""
def num_rows(self) -> int:
"""Return the number of rows contained in this block."""
raise NotImplementedError
def iter_rows(self) -> Iterator[T]:
"""Iterate over the rows of this block."""
raise NotImplementedError
def slice(self, start: int, end: int, copy: bool) -> "Block[T]":
"""Return a slice of this block.
Args:
start: The starting index of the slice.
end: The ending index of the slice.
copy: Whether to perform a data copy for the slice.
Returns:
The sliced block result.
"""
raise NotImplementedError
def to_pandas(self) -> "pandas.DataFrame":
"""Convert this block into a Pandas dataframe."""
raise NotImplementedError
def to_arrow_table(self) -> "pyarrow.Table":
"""Convert this block into an Arrow table."""
raise NotImplementedError
def size_bytes(self) -> int:
"""Return the approximate size in bytes of this block."""
raise NotImplementedError
def schema(self) -> Any:
def schema(self) -> Union[type, "pyarrow.lib.Schema"]:
"""Return the Python type or pyarrow schema of this block."""
raise NotImplementedError
def get_metadata(self, input_files: List[str]) -> BlockMetadata:
"""Create a metadata object from this block."""
return BlockMetadata(
num_rows=self.num_rows(),
size_bytes=self.size_bytes(),
@ -59,6 +103,7 @@ class Block(Generic[T]):
@staticmethod
def builder() -> BlockBuilder[T]:
"""Create a builder for this block type."""
raise NotImplementedError
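
Illustration of the two block flavors through the public API (hedged; row access follows the tests at the end of this diff):

import ray.experimental.data

# SimpleBlock-backed dataset: rows are plain Python objects (here, ints).
ray.experimental.data.range(3).take()
# ArrowBlock-backed dataset: rows are column-keyed records, accessed like
# x["value"] in the tests below.
ray.experimental.data.range_arrow(3).take()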
@ -96,6 +141,10 @@ class SimpleBlock(Block):
import pandas
return pandas.DataFrame(self._items)
def to_arrow_table(self) -> "pyarrow.Table":
import pyarrow
return pyarrow.Table.from_pandas(self.to_pandas())
def size_bytes(self) -> int:
return sys.getsizeof(self._items)
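
The pandas-to-Arrow conversion used above, shown standalone as a sketch:

import pandas as pd
import pyarrow

df = pd.DataFrame({"value": [0, 1, 2]})
table = pyarrow.Table.from_pandas(df)
assert table.num_rows == 3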

View file

@ -1,4 +1,4 @@
from typing import TypeVar, Iterable, Any
from typing import TypeVar, Iterable, Any, Union
import ray
from ray.experimental.data.impl.block import Block, BlockMetadata, ObjectRef
@ -9,13 +9,13 @@ T = TypeVar("T")
U = TypeVar("U")
class ComputePool:
class ComputeStrategy:
def apply(self, fn: Any,
blocks: Iterable[Block[T]]) -> Iterable[ObjectRef[Block]]:
raise NotImplementedError
class TaskPool(ComputePool):
class TaskPool(ComputeStrategy):
def apply(self, fn: Any, remote_args: dict,
blocks: BlockList[Any]) -> BlockList[Any]:
map_bar = ProgressBar("Map Progress", total=len(blocks))
@ -44,7 +44,7 @@ class TaskPool(ComputePool):
return BlockList(list(new_blocks), list(new_metadata))
class ActorPool(ComputePool):
class ActorPool(ComputeStrategy):
def apply(self, fn: Any, remote_args: dict,
blocks: Iterable[Block[T]]) -> Iterable[ObjectRef[Block]]:
@ -113,10 +113,12 @@ class ActorPool(ComputePool):
return BlockList(blocks_out, new_metadata)
def get_compute(compute_spec: str) -> ComputePool:
if compute_spec == "tasks":
def get_compute(compute_spec: Union[str, ComputeStrategy]) -> ComputeStrategy:
if not compute_spec or compute_spec == "tasks":
return TaskPool()
elif compute_spec == "actors":
return ActorPool()
elif isinstance(compute_spec, ComputeStrategy):
return compute_spec
else:
raise ValueError("compute must be one of [`tasks`, `actors`]")
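
Sketch of the dispatch above, shown only to illustrate the new passthrough (the module path in the import is assumed):

from ray.experimental.data.impl.compute import (  # assumed module path
    get_compute, TaskPool, ActorPool)

assert isinstance(get_compute(None), TaskPool)     # default when compute is unset
assert isinstance(get_compute("tasks"), TaskPool)
assert isinstance(get_compute("actors"), ActorPool)
strategy = ActorPool()
assert get_compute(strategy) is strategy           # ComputeStrategy passthrough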

View file

@ -308,7 +308,7 @@ def read_parquet(paths: Union[str, List[str]],
@ray.remote
def gen_read(pieces: List["pyarrow._dataset.ParquetFileFragment"]):
import pyarrow
print("Reading {} parquet pieces".format(len(pieces)))
logger.debug("Reading {} parquet pieces".format(len(pieces)))
tables = [piece.to_table() for piece in pieces]
if len(tables) > 1:
table = pyarrow.concat_tables(tables)
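
Minimal sketch of the concat step, assuming the pieces share a schema:

import pyarrow

t1 = pyarrow.table({"value": [0, 1]})
t2 = pyarrow.table({"value": [2, 3]})
combined = pyarrow.concat_tables([t1, t2])
assert combined.num_rows == 4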

View file

@ -229,6 +229,15 @@ def test_parquet_write(ray_start_regular_shared, tmp_path):
assert df.equals(dfds)
def test_convert_to_pyarrow(ray_start_regular_shared, tmp_path):
ds = ray.experimental.data.range(100)
assert ds.to_dask().sum().compute()[0] == 4950
path = os.path.join(tmp_path, "test_parquet_dir")
os.mkdir(path)
ds.write_parquet(path)
assert ray.experimental.data.read_parquet(path).count() == 100
def test_pyarrow(ray_start_regular_shared):
ds = ray.experimental.data.range_arrow(5)
assert ds.map(lambda x: {"b": x["value"] + 2}).take() == \