Move ray.data out of experimental (#17560)

Eric Liang 2021-08-04 13:31:10 -07:00 committed by GitHub
parent 63708468df
commit d4f9d3620e
44 changed files with 221 additions and 230 deletions
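
For context, a minimal sketch of the user-facing effect of this change, using only calls that appear in the diff below; the Dataset API itself is unchanged, only the import path moves from ``ray.experimental.data`` to ``ray.data``:

import ray

ray.init()

# Previously: ds = ray.experimental.data.range(100)
# Now the same call goes through the top-level module:
ds = ray.data.range(100)
ds = ds.map(lambda x: x * 2)   # distributed transformation, as in the docs below
print(ds.take(5))              # -> [0, 2, 4, 6, 8]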


@ -353,7 +353,7 @@
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
- DATA_PROCESSING_TESTING=1 ./ci/travis/install-dependencies.sh
- bazel test --config=ci $(./scripts/bazel_export_options) python/ray/experimental/workflow/... python/ray/experimental/data/...
- bazel test --config=ci $(./scripts/bazel_export_options) python/ray/experimental/workflow/... python/ray/data/...
- label: ":book: Doc tests and examples"
conditions:


@ -5,7 +5,7 @@ Datasets: Distributed Arrow on Ray
.. tip::
Ray Datasets is available in early preview at ``ray.experimental.data``.
Ray Datasets is available in early preview at ``ray.data``.
Ray Datasets are the standard way to load and exchange data in Ray libraries and applications. Datasets provide basic distributed data transformations such as ``map``, ``filter``, and ``repartition``, and are compatible with a variety of file formats, datasources, and distributed frameworks.
@ -57,6 +57,9 @@ Datasource Compatibility Matrices
* - Binary Files
- ``ray.data.read_binary_files()``
- ✅
* - Python Objects
- ``ray.data.from_items()``
- ✅
* - Spark Dataframe
- ``ray.data.from_spark()``
- (todo)
@ -219,17 +222,17 @@ Datasets can be transformed in parallel using ``.map()``. Transformations are ex
ds = ray.data.range(10000)
ds = ds.map(lambda x: x * 2)
# -> Map Progress: 100%|█████████████████████████| 200/200 [00:00<00:00, 1123.54it/s]
# -> Map Progress: 100%|████████████████████| 200/200 [00:00<00:00, 1123.54it/s]
# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)
ds.take(5)
# -> [0, 2, 4, 6, 8]
ds.filter(lambda x: x > 5).take(5)
# -> Map Progress: 100%|█████████████████████████| 200/200 [00:00<00:00, 1859.63it/s]
# -> Map Progress: 100%|████████████████████| 200/200 [00:00<00:00, 1859.63it/s]
# -> [6, 8, 10, 12, 14]
ds.flat_map(lambda x: [x, -x]).take(5)
# -> Map Progress: 100%|█████████████████████████| 200/200 [00:00<00:00, 1568.10it/s]
# -> Map Progress: 100%|████████████████████| 200/200 [00:00<00:00, 1568.10it/s]
# -> [0, 0, 2, -2, 4]
To take advantage of vectorized functions, use ``.map_batches()``. Note that you can also implement ``filter`` and ``flat_map`` using ``.map_batches()``, since your map function can return an output batch of any size.
@ -237,8 +240,9 @@ To take advantage of vectorized functions, use ``.map_batches()``. Note that you
.. code-block:: python
ds = ray.data.range_arrow(10000)
ds = ds.map_batches(lambda df: df.applymap(lambda x: x * 2), batch_format="pandas")
# -> Map Progress: 100%|█████████████████████████| 200/200 [00:00<00:00, 1927.62it/s]
ds = ds.map_batches(
lambda df: df.applymap(lambda x: x * 2), batch_format="pandas")
# -> Map Progress: 100%|████████████████████| 200/200 [00:00<00:00, 1927.62it/s]
ds.take(5)
# -> [ArrowRow({'value': 0}), ArrowRow({'value': 2}), ...]
@ -260,12 +264,12 @@ By default, transformations are executed using Ray tasks. For transformations th
# Preprocess the data.
ds = ds.map(preprocess)
# -> Map Progress: 100%|█████████████████████████| 200/200 [00:00<00:00, 1123.54it/s]
# -> Map Progress: 100%|████████████████████| 200/200 [00:00<00:00, 1123.54it/s]
# Apply GPU batch inference with actors, and assign each actor a GPU using
# ``num_gpus=1`` (any Ray remote decorator argument can be used here).
ds = ds.map_batches(BatchInferModel, compute="actors", batch_size=256, num_gpus=1)
# -> Map Progress (16 actors 4 pending): 100%|█████| 200/200 [00:07<00:00, 27.60it/s]
# -> Map Progress (16 actors 4 pending): 100%|█████| 200/200 [00:07, 27.60it/s]
# Save the results.
ds.repartition(1).write_json("s3://bucket/inference-results")
@ -324,7 +328,8 @@ Datasets support tensor-typed values, which are represented in-memory as Arrow t
# Create a Dataset of tensor-typed values.
ds = ray.data.range_tensor(10000, shape=(3, 5))
# -> Dataset(num_blocks=200, num_rows=10000, schema=<Tensor: shape=(None, 3, 5), dtype=int64>)
# -> Dataset(num_blocks=200, num_rows=10000,
# schema=<Tensor: shape=(None, 3, 5), dtype=int64>)
ds.map_batches(lambda t: t + 2).show(2)
# -> [[2 2 2 2 2]
@ -339,7 +344,8 @@ Datasets support tensor-typed values, which are represented in-memory as Arrow t
# Read from storage.
ray.data.read_numpy("/tmp/tensor_out")
# -> Dataset(num_blocks=200, num_rows=?, schema=<Tensor: shape=(None, 3, 5), dtype=int64>)
# -> Dataset(num_blocks=200, num_rows=?,
# schema=<Tensor: shape=(None, 3, 5), dtype=int64>)
Tensor datasets are also created whenever an array type is returned from a map function:
@ -351,7 +357,8 @@ Tensor datasets are also created whenever an array type is returned from a map f
# It is now converted into a Tensor dataset.
ds = ds.map_batches(lambda x: np.array(x))
# -> Dataset(num_blocks=10, num_rows=10, schema=<Tensor: shape=(None,), dtype=int64>)
# -> Dataset(num_blocks=10, num_rows=10,
# schema=<Tensor: shape=(None,), dtype=int64>)
Limitations: currently tensor-typed values cannot be nested in tabular records (e.g., as in TFRecord / Petastorm format). This is planned for development.


@ -13,6 +13,7 @@ Creating a Dataset
.. autofunction:: ray.data.read_text
.. autofunction:: ray.data.read_binary_files
.. autofunction:: ray.data.read_datasource
.. autofunction:: ray.data.from_items
.. autofunction:: ray.data.from_arrow
.. autofunction:: ray.data.from_spark
.. autofunction:: ray.data.from_dask
@ -29,7 +30,7 @@ Dataset API
DatasetPipeline API
-------------------
.. autoclass:: ray.experimental.data.dataset_pipeline.DatasetPipeline
.. autoclass:: ray.data.dataset_pipeline.DatasetPipeline
:members:
Custom Datasource API


@ -98,6 +98,7 @@ import ray.actor # noqa: E402,F401
from ray.actor import method # noqa: E402
from ray.cross_language import java_function, java_actor_class # noqa: E402
from ray.runtime_context import get_runtime_context # noqa: E402
from ray import data # noqa: E402,F401
from ray import util # noqa: E402
# We import ClientBuilder so that modules can inherit from `ray.ClientBuilder`.
from ray.client_builder import client, ClientBuilder # noqa: E402
@ -112,6 +113,7 @@ __all__ = [
"client",
"ClientBuilder",
"cluster_resources",
"data",
"get",
"get_actor",
"get_gpu_ids",
@ -151,11 +153,6 @@ __all__ += [
"PlacementGroupID",
]
# Add an alias so we can point to the final location in docs.
# TODO(ekl) remove this once datasets is out of alpha.
from ray.experimental import data # noqa
__all__.append(data)
# Add an alias so we can point to the final location in docs.
# TODO(yic) remove this once workflow is out of alpha.
from ray.experimental import workflow # noqa
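
With the alias removed and ``from ray import data`` added above, ``ray.data`` is available directly after ``import ray``. A minimal sketch, mirroring calls from the tests further down in this diff (assumes a Ray build that includes this commit):

import ray
from ray import data   # the submodule can also be imported explicitly

ray.init()

ds = ray.data.from_items(["hello", "world"])
assert ds.take() == ["hello", "world"]

assert data.range(5).count() == 5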


@ -1,5 +1,5 @@
# --------------------------------------------------------------------
# Tests from the python/ray/experimental/data/tests directory.
# Tests from the python/ray/data/tests directory.
# Covers all tests starting with `test_`.
# Please keep these sorted alphabetically.
# --------------------------------------------------------------------


@ -1,10 +1,10 @@
from ray.experimental.data.read_api import from_items, range, range_arrow, \
from ray.data.read_api import from_items, range, range_arrow, \
range_tensor, read_parquet, read_json, read_csv, read_binary_files, \
from_dask, from_modin, from_mars, from_pandas, from_arrow, from_spark, \
read_datasource, read_numpy, read_text
from ray.experimental.data.datasource import Datasource, ReadTask, WriteTask
from ray.experimental.data.dataset import Dataset
from ray.experimental.data.impl.progress_bar import set_progress_bars
from ray.data.datasource import Datasource, ReadTask, WriteTask
from ray.data.dataset import Dataset
from ray.data.impl.progress_bar import set_progress_bars
# Module-level cached global functions (for impl/compute). It cannot be defined
# in impl/compute since it has to be process-global across cloudpickled funcs.


@ -6,7 +6,7 @@ import numpy as np
if TYPE_CHECKING:
import pandas
import pyarrow
from ray.experimental.data.impl.block_builder import BlockBuilder
from ray.data.impl.block_builder import BlockBuilder
from ray.util.annotations import DeveloperAPI
@ -115,15 +115,15 @@ class BlockAccessor(Generic[T]):
import pyarrow
if isinstance(block, pyarrow.Table):
from ray.experimental.data.impl.arrow_block import \
from ray.data.impl.arrow_block import \
ArrowBlockAccessor
return ArrowBlockAccessor(block)
elif isinstance(block, list):
from ray.experimental.data.impl.simple_block import \
from ray.data.impl.simple_block import \
SimpleBlockAccessor
return SimpleBlockAccessor(block)
elif isinstance(block, np.ndarray):
from ray.experimental.data.impl.tensor_block import \
from ray.data.impl.tensor_block import \
TensorBlockAccessor
return TensorBlockAccessor(block)
else:


@ -14,7 +14,7 @@ if TYPE_CHECKING:
import ray.util.sgd
import torch
import tensorflow as tf
from ray.experimental.data.dataset_pipeline import DatasetPipeline
from ray.data.dataset_pipeline import DatasetPipeline
import collections
import itertools
@ -23,17 +23,17 @@ import numpy as np
import ray
from ray.types import ObjectRef
from ray.util.annotations import DeveloperAPI, PublicAPI
from ray.experimental.data.block import Block, BlockAccessor, BlockMetadata
from ray.experimental.data.datasource import Datasource, WriteTask
from ray.experimental.data.impl.remote_fn import cached_remote_fn
from ray.experimental.data.impl.batcher import Batcher
from ray.experimental.data.impl.compute import get_compute, cache_wrapper, \
from ray.data.block import Block, BlockAccessor, BlockMetadata
from ray.data.datasource import Datasource, WriteTask
from ray.data.impl.remote_fn import cached_remote_fn
from ray.data.impl.batcher import Batcher
from ray.data.impl.compute import get_compute, cache_wrapper, \
CallableClass
from ray.experimental.data.impl.progress_bar import ProgressBar
from ray.experimental.data.impl.shuffle import simple_shuffle
from ray.experimental.data.impl.sort import sort_impl
from ray.experimental.data.impl.block_list import BlockList
from ray.experimental.data.impl.arrow_block import DelegatingArrowBlockBuilder
from ray.data.impl.progress_bar import ProgressBar
from ray.data.impl.shuffle import simple_shuffle
from ray.data.impl.sort import sort_impl
from ray.data.impl.block_list import BlockList
from ray.data.impl.arrow_block import DelegatingArrowBlockBuilder
T = TypeVar("T")
U = TypeVar("U")
@ -1073,7 +1073,7 @@ class Dataset(Generic[T]):
"""
import torch
from ray.experimental.data.impl.torch_iterable_dataset import \
from ray.data.impl.torch_iterable_dataset import \
TorchIterableDataset
if feature_columns and feature_column_dtypes:
@ -1303,7 +1303,7 @@ class Dataset(Generic[T]):
times: The number of times to loop over this dataset, or None
to repeat indefinitely.
"""
from ray.experimental.data.dataset_pipeline import DatasetPipeline
from ray.data.dataset_pipeline import DatasetPipeline
if times is not None and times < 1:
raise ValueError("`times` must be >= 1, got {}".format(times))
@ -1375,7 +1375,7 @@ class Dataset(Generic[T]):
length of the pipeline. Setting this to infinity effectively
disables pipelining.
"""
from ray.experimental.data.dataset_pipeline import DatasetPipeline
from ray.data.dataset_pipeline import DatasetPipeline
class Iterator:
def __init__(self, splits):


@ -4,10 +4,10 @@ from typing import Any, Callable, List, Iterator, Iterable, Generic, Union, \
TYPE_CHECKING
import ray
from ray.experimental.data.dataset import Dataset, T, U, BatchType
from ray.experimental.data.impl.pipeline_executor import PipelineExecutor, \
from ray.data.dataset import Dataset, T, U, BatchType
from ray.data.impl.pipeline_executor import PipelineExecutor, \
PipelineSplitExecutorCoordinator
from ray.experimental.data.impl import progress_bar
from ray.data.impl import progress_bar
from ray.util.annotations import PublicAPI, DeveloperAPI
if TYPE_CHECKING:


@ -0,0 +1,24 @@
from ray.data.datasource.datasource import (
Datasource, RangeDatasource, DummyOutputDatasource, ReadTask, WriteTask)
from ray.data.datasource.json_datasource import JSONDatasource
from ray.data.datasource.csv_datasource import CSVDatasource
from ray.data.datasource.numpy_datasource import NumpyDatasource
from ray.data.datasource.parquet_datasource import (ParquetDatasource)
from ray.data.datasource.binary_datasource import BinaryDatasource
from ray.data.datasource.file_based_datasource import (FileBasedDatasource,
_S3FileSystemWrapper)
__all__ = [
"JSONDatasource",
"CSVDatasource",
"NumpyDatasource",
"ParquetDatasource",
"BinaryDatasource",
"FileBasedDatasource",
"_S3FileSystemWrapper",
"Datasource",
"RangeDatasource",
"DummyOutputDatasource",
"ReadTask",
"WriteTask",
]
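
The datasource classes keep their names but now live under ``ray.data.datasource``. A minimal sketch of a write through the new path, mirroring ``test_write_datasource`` later in this diff; ``DummyOutputDatasource`` is a test-oriented sink that records writes instead of persisting them:

import ray
from ray.data.datasource import DummyOutputDatasource

ray.init()

output = DummyOutputDatasource()           # in-memory test sink from the new package
ds = ray.data.range(10, parallelism=2)     # small dataset split into two blocks
ds.write_datasource(output)                # each block is written via the datasource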


@ -3,8 +3,7 @@ from typing import TYPE_CHECKING
if TYPE_CHECKING:
import pyarrow
from ray.experimental.data.datasource.file_based_datasource import (
FileBasedDatasource)
from ray.data.datasource.file_based_datasource import (FileBasedDatasource)
class BinaryDatasource(FileBasedDatasource):


@ -3,8 +3,7 @@ from typing import TYPE_CHECKING
if TYPE_CHECKING:
import pyarrow
from ray.experimental.data.datasource.file_based_datasource import (
FileBasedDatasource)
from ray.data.datasource.file_based_datasource import (FileBasedDatasource)
class CSVDatasource(FileBasedDatasource):


@ -5,9 +5,9 @@ import numpy as np
import ray
from ray.types import ObjectRef
from ray.experimental.data.block import Block, BlockAccessor, \
from ray.data.block import Block, BlockAccessor, \
BlockMetadata, T
from ray.experimental.data.impl.arrow_block import ArrowRow
from ray.data.impl.arrow_block import ArrowRow
from ray.util.annotations import PublicAPI
WriteResult = Any


@ -5,10 +5,9 @@ from urllib.parse import urlparse
if TYPE_CHECKING:
import pyarrow
from ray.experimental.data.impl.arrow_block import (
ArrowRow, DelegatingArrowBlockBuilder)
from ray.experimental.data.impl.block_list import BlockMetadata
from ray.experimental.data.datasource.datasource import Datasource, ReadTask
from ray.data.impl.arrow_block import (ArrowRow, DelegatingArrowBlockBuilder)
from ray.data.impl.block_list import BlockMetadata
from ray.data.datasource.datasource import Datasource, ReadTask
from ray.util.annotations import DeveloperAPI
logger = logging.getLogger(__name__)


@ -3,8 +3,7 @@ from typing import TYPE_CHECKING
if TYPE_CHECKING:
import pyarrow
from ray.experimental.data.datasource.file_based_datasource import (
FileBasedDatasource)
from ray.data.datasource.file_based_datasource import (FileBasedDatasource)
class JSONDatasource(FileBasedDatasource):


@ -6,8 +6,7 @@ import numpy as np
if TYPE_CHECKING:
import pyarrow
from ray.experimental.data.datasource.file_based_datasource import (
FileBasedDatasource)
from ray.data.datasource.file_based_datasource import (FileBasedDatasource)
class NumpyDatasource(FileBasedDatasource):


@ -4,10 +4,10 @@ from typing import Optional, List, Union, TYPE_CHECKING
if TYPE_CHECKING:
import pyarrow
from ray.experimental.data.impl.arrow_block import ArrowRow
from ray.experimental.data.impl.block_list import BlockMetadata
from ray.experimental.data.datasource.datasource import Datasource, ReadTask
from ray.experimental.data.datasource.file_based_datasource import (
from ray.data.impl.arrow_block import ArrowRow
from ray.data.impl.block_list import BlockMetadata
from ray.data.datasource.datasource import Datasource, ReadTask
from ray.data.datasource.file_based_datasource import (
_resolve_paths_and_filesystem)
logger = logging.getLogger(__name__)


@ -3,7 +3,7 @@ import time
ray.init(num_gpus=2)
ds = ray.experimental.data.range(100)
ds = ray.data.range(100)
def preprocess(x):


@ -3,7 +3,7 @@ import ray
ray.init()
ds = ray.experimental.data.from_items(range(200))
ds = ray.data.from_items(range(200))
def slow(x):


@ -2,5 +2,5 @@ import ray
ray.init()
ds = ray.experimental.data.range(100000000)
ds = ray.data.range(100000000)
ds.repartition(1000)


@ -10,10 +10,10 @@ try:
except ImportError:
pyarrow = None
from ray.experimental.data.block import Block, BlockAccessor, BlockMetadata
from ray.experimental.data.impl.block_builder import BlockBuilder
from ray.experimental.data.impl.simple_block import SimpleBlockBuilder
from ray.experimental.data.impl.tensor_block import TensorBlockBuilder
from ray.data.block import Block, BlockAccessor, BlockMetadata
from ray.data.impl.block_builder import BlockBuilder
from ray.data.impl.simple_block import SimpleBlockBuilder
from ray.data.impl.tensor_block import TensorBlockBuilder
if TYPE_CHECKING:
import pandas


@ -1,7 +1,7 @@
from typing import Optional
from ray.experimental.data.block import Block, BlockAccessor
from ray.experimental.data.impl.arrow_block import DelegatingArrowBlockBuilder
from ray.data.block import Block, BlockAccessor
from ray.data.impl.arrow_block import DelegatingArrowBlockBuilder
class Batcher:


@ -1,6 +1,6 @@
from typing import Generic
from ray.experimental.data.block import Block, T
from ray.data.block import Block, T
class BlockBuilder(Generic[T]):


@ -4,7 +4,7 @@ from typing import Iterable, List
import numpy as np
from ray.types import ObjectRef
from ray.experimental.data.block import Block, BlockMetadata
from ray.data.block import Block, BlockMetadata
class BlockList(Iterable[ObjectRef[Block]]):


@ -2,10 +2,10 @@ from typing import TypeVar, Iterable, Any, Union, Callable
import ray
from ray.types import ObjectRef
from ray.experimental.data.block import Block, BlockAccessor, BlockMetadata
from ray.experimental.data.impl.block_list import BlockList
from ray.experimental.data.impl.progress_bar import ProgressBar
from ray.experimental.data.impl.remote_fn import cached_remote_fn
from ray.data.block import Block, BlockAccessor, BlockMetadata
from ray.data.impl.block_list import BlockList
from ray.data.impl.progress_bar import ProgressBar
from ray.data.impl.remote_fn import cached_remote_fn
T = TypeVar("T")
U = TypeVar("U")


@ -4,8 +4,8 @@ from typing import Callable, List
import numpy as np
from ray.types import ObjectRef
from ray.experimental.data.block import Block, BlockMetadata, T
from ray.experimental.data.impl.block_list import BlockList
from ray.data.block import Block, BlockMetadata, T
from ray.data.impl.block_list import BlockList
class LazyBlockList(BlockList[T]):


@ -1,13 +1,13 @@
from typing import Any, Callable, List, Optional, TYPE_CHECKING
import ray
from ray.experimental.data.dataset import Dataset, T
from ray.experimental.data.impl.progress_bar import ProgressBar, \
from ray.data.dataset import Dataset, T
from ray.data.impl.progress_bar import ProgressBar, \
set_progress_bars
from ray.types import ObjectRef
if TYPE_CHECKING:
from ray.experimental.data.dataset_pipeline import DatasetPipeline
from ray.data.dataset_pipeline import DatasetPipeline
@ray.remote


@ -4,11 +4,11 @@ from typing import TypeVar, List, Optional
import numpy as np
import ray
from ray.experimental.data.block import Block, BlockAccessor, BlockMetadata
from ray.experimental.data.impl.progress_bar import ProgressBar
from ray.experimental.data.impl.block_list import BlockList
from ray.experimental.data.impl.arrow_block import DelegatingArrowBlockBuilder
from ray.experimental.data.impl.remote_fn import cached_remote_fn
from ray.data.block import Block, BlockAccessor, BlockMetadata
from ray.data.impl.progress_bar import ProgressBar
from ray.data.impl.block_list import BlockList
from ray.data.impl.arrow_block import DelegatingArrowBlockBuilder
from ray.data.impl.remote_fn import cached_remote_fn
T = TypeVar("T")


@ -9,8 +9,8 @@ if TYPE_CHECKING:
import pandas
import pyarrow
from ray.experimental.data.impl.block_builder import BlockBuilder
from ray.experimental.data.block import Block, BlockAccessor, BlockMetadata, T
from ray.data.impl.block_builder import BlockBuilder
from ray.data.block import Block, BlockAccessor, BlockMetadata, T
# A simple block can be sorted by value (None) or a lambda function (Callable).
SortKeyT = Union[None, Callable[[T], Any]]


@ -20,10 +20,10 @@ from typing import List, Any, Callable, TypeVar, Tuple, Union
import numpy as np
import ray
from ray.experimental.data.block import Block, BlockAccessor
from ray.experimental.data.impl.block_list import BlockList
from ray.experimental.data.impl.progress_bar import ProgressBar
from ray.experimental.data.impl.remote_fn import cached_remote_fn
from ray.data.block import Block, BlockAccessor
from ray.data.impl.block_list import BlockList
from ray.data.impl.progress_bar import ProgressBar
from ray.data.impl.remote_fn import cached_remote_fn
T = TypeVar("T")


@ -6,8 +6,8 @@ if TYPE_CHECKING:
import pandas
import pyarrow
from ray.experimental.data.block import Block, BlockAccessor
from ray.experimental.data.impl.block_builder import BlockBuilder
from ray.data.block import Block, BlockAccessor
from ray.data.impl.block_builder import BlockBuilder
T = TypeVar("T")


@ -14,16 +14,16 @@ if TYPE_CHECKING:
import ray
from ray.types import ObjectRef
from ray.util.annotations import PublicAPI
from ray.experimental.data.block import Block, BlockAccessor, BlockMetadata
from ray.experimental.data.dataset import Dataset
from ray.experimental.data.datasource import Datasource, RangeDatasource, \
from ray.data.block import Block, BlockAccessor, BlockMetadata
from ray.data.dataset import Dataset
from ray.data.datasource import Datasource, RangeDatasource, \
JSONDatasource, CSVDatasource, ParquetDatasource, BinaryDatasource, \
NumpyDatasource, ReadTask
from ray.experimental.data.impl.arrow_block import ArrowRow, \
from ray.data.impl.arrow_block import ArrowRow, \
DelegatingArrowBlockBuilder
from ray.experimental.data.impl.block_list import BlockList
from ray.experimental.data.impl.lazy_block_list import LazyBlockList
from ray.experimental.data.impl.remote_fn import cached_remote_fn
from ray.data.impl.block_list import BlockList
from ray.data.impl.lazy_block_list import LazyBlockList
from ray.data.impl.remote_fn import cached_remote_fn
T = TypeVar("T")


@ -16,9 +16,9 @@ from fsspec.implementations.local import LocalFileSystem
import ray
from ray.tests.conftest import * # noqa
from ray.experimental.data.datasource import DummyOutputDatasource
from ray.experimental.data.block import BlockAccessor
import ray.experimental.data.tests.util as util
from ray.data.datasource import DummyOutputDatasource
from ray.data.block import BlockAccessor
import ray.data.tests.util as util
def maybe_pipeline(ds, enabled):
@ -31,7 +31,7 @@ def maybe_pipeline(ds, enabled):
@pytest.mark.parametrize("pipelined", [False, True])
def test_basic_actors(shutdown_only, pipelined):
ray.init(num_cpus=2)
ds = ray.experimental.data.range(5)
ds = ray.data.range(5)
ds = maybe_pipeline(ds, pipelined)
assert sorted(ds.map(lambda x: x + 1,
compute="actors").take()) == [1, 2, 3, 4, 5]
@ -64,7 +64,7 @@ def test_equal_split(shutdown_only, pipelined):
def test_callable_classes(shutdown_only):
ray.init(num_cpus=1)
ds = ray.experimental.data.range(10)
ds = ray.data.range(10)
class StatefulFn:
def __init__(self):
@ -120,7 +120,7 @@ def test_callable_classes(shutdown_only):
@pytest.mark.parametrize("pipelined", [False, True])
def test_basic(ray_start_regular_shared, pipelined):
ds = ray.experimental.data.range(5)
ds = ray.data.range(5)
ds = maybe_pipeline(ds, pipelined)
assert sorted(ds.map(lambda x: x + 1).take()) == [1, 2, 3, 4, 5]
assert ds.count() == 5
@ -132,7 +132,7 @@ def test_basic(ray_start_regular_shared, pipelined):
# def test_avoid_placement_group_capture(ray_start_regular_shared, pipelined):
# @ray.remote
# def run():
# ds = ray.experimental.data.range(5)
# ds = ray.data.range(5)
# ds = maybe_pipeline(ds, pipelined)
# assert sorted(ds.map(lambda x: x + 1).take()) == [1, 2, 3, 4, 5]
# assert ds.count() == 5
@ -144,8 +144,7 @@ def test_basic(ray_start_regular_shared, pipelined):
def test_batch_tensors(ray_start_regular_shared):
import torch
ds = ray.experimental.data.from_items(
[torch.tensor([0, 0]) for _ in range(40)])
ds = ray.data.from_items([torch.tensor([0, 0]) for _ in range(40)])
res = "Dataset(num_blocks=40, num_rows=40, schema=<class 'torch.Tensor'>)"
assert str(ds) == res, str(ds)
with pytest.raises(pa.lib.ArrowInvalid):
@ -156,7 +155,7 @@ def test_batch_tensors(ray_start_regular_shared):
def test_tensors(ray_start_regular_shared):
# Create directly.
ds = ray.experimental.data.range_tensor(5, shape=(3, 5))
ds = ray.data.range_tensor(5, shape=(3, 5))
assert str(ds) == ("Dataset(num_blocks=5, num_rows=5, "
"schema=<Tensor: shape=(None, 3, 5), dtype=int64>)")
@ -228,7 +227,7 @@ def test_read_text(ray_start_regular_shared, tmp_path):
@pytest.mark.parametrize("pipelined", [False, True])
def test_write_datasource(ray_start_regular_shared, pipelined):
output = DummyOutputDatasource()
ds = ray.experimental.data.range(10, parallelism=2)
ds = ray.data.range(10, parallelism=2)
ds = maybe_pipeline(ds, pipelined)
ds.write_datasource(output)
if pipelined:
@ -250,20 +249,20 @@ def test_write_datasource(ray_start_regular_shared, pipelined):
def test_empty_dataset(ray_start_regular_shared):
ds = ray.experimental.data.range(0)
ds = ray.data.range(0)
assert ds.count() == 0
assert ds.size_bytes() is None
assert ds.schema() is None
ds = ray.experimental.data.range(1)
ds = ray.data.range(1)
ds = ds.filter(lambda x: x > 1)
assert str(ds) == \
"Dataset(num_blocks=1, num_rows=0, schema=Unknown schema)"
def test_schema(ray_start_regular_shared):
ds = ray.experimental.data.range(10)
ds2 = ray.experimental.data.range_arrow(10)
ds = ray.data.range(10)
ds2 = ray.data.range_arrow(10)
ds3 = ds2.repartition(5)
ds4 = ds3.map(lambda x: {"a": "hi", "b": 1.0}).limit(5).repartition(1)
assert str(ds) == \
@ -277,7 +276,7 @@ def test_schema(ray_start_regular_shared):
def test_lazy_loading_exponential_rampup(ray_start_regular_shared):
ds = ray.experimental.data.range(100, parallelism=20)
ds = ray.data.range(100, parallelism=20)
assert len(ds._blocks._blocks) == 1
assert ds.take(10) == list(range(10))
assert len(ds._blocks._blocks) == 2
@ -292,18 +291,18 @@ def test_lazy_loading_exponential_rampup(ray_start_regular_shared):
def test_limit(ray_start_regular_shared):
ds = ray.experimental.data.range(100, parallelism=20)
ds = ray.data.range(100, parallelism=20)
for i in range(100):
assert ds.limit(i).take(200) == list(range(i))
def test_convert_types(ray_start_regular_shared):
plain_ds = ray.experimental.data.range(1)
plain_ds = ray.data.range(1)
arrow_ds = plain_ds.map(lambda x: {"a": x})
assert arrow_ds.take() == [{"a": 0}]
assert "ArrowRow" in arrow_ds.map(lambda x: str(x)).take()[0]
arrow_ds = ray.experimental.data.range_arrow(1)
arrow_ds = ray.data.range_arrow(1)
assert arrow_ds.map(lambda x: "plain_{}".format(x["value"])).take() \
== ["plain_0"]
assert arrow_ds.map(lambda x: {"a": (x["value"],)}).take() == \
@ -311,12 +310,12 @@ def test_convert_types(ray_start_regular_shared):
def test_from_items(ray_start_regular_shared):
ds = ray.experimental.data.from_items(["hello", "world"])
ds = ray.data.from_items(["hello", "world"])
assert ds.take() == ["hello", "world"]
def test_repartition(ray_start_regular_shared):
ds = ray.experimental.data.range(20, parallelism=10)
ds = ray.data.range(20, parallelism=10)
assert ds.num_blocks() == 10
assert ds.sum() == 190
assert ds._block_sizes() == [2] * 10
@ -332,13 +331,13 @@ def test_repartition(ray_start_regular_shared):
assert ds3.sum() == 190
ds2._block_sizes() == [2] * 10 + [0] * 10
large = ray.experimental.data.range(10000, parallelism=10)
large = ray.data.range(10000, parallelism=10)
large = large.repartition(20)
assert large._block_sizes() == [500] * 20
def test_repartition_arrow(ray_start_regular_shared):
ds = ray.experimental.data.range_arrow(20, parallelism=10)
ds = ray.data.range_arrow(20, parallelism=10)
assert ds.num_blocks() == 10
assert ds.count() == 20
assert ds._block_sizes() == [2] * 10
@ -353,7 +352,7 @@ def test_repartition_arrow(ray_start_regular_shared):
assert ds3.count() == 20
ds2._block_sizes() == [2] * 10 + [0] * 10
large = ray.experimental.data.range_arrow(10000, parallelism=10)
large = ray.data.range_arrow(10000, parallelism=10)
large = large.repartition(20)
assert large._block_sizes() == [500] * 20
@ -361,7 +360,7 @@ def test_repartition_arrow(ray_start_regular_shared):
def test_from_pandas(ray_start_regular_shared):
df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
ds = ray.experimental.data.from_pandas([ray.put(df1), ray.put(df2)])
ds = ray.data.from_pandas([ray.put(df1), ray.put(df2)])
values = [(r["one"], r["two"]) for r in ds.take(6)]
rows = [(r.one, r.two) for _, r in pd.concat([df1, df2]).iterrows()]
assert values == rows
@ -370,7 +369,7 @@ def test_from_pandas(ray_start_regular_shared):
def test_from_arrow(ray_start_regular_shared):
df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
ds = ray.experimental.data.from_arrow([
ds = ray.data.from_arrow([
ray.put(pa.Table.from_pandas(df1)),
ray.put(pa.Table.from_pandas(df2))
])
@ -382,7 +381,7 @@ def test_from_arrow(ray_start_regular_shared):
def test_to_pandas(ray_start_regular_shared):
n = 5
df = pd.DataFrame({"value": list(range(n))})
ds = ray.experimental.data.range_arrow(n)
ds = ray.data.range_arrow(n)
dfds = pd.concat(ray.get(ds.to_pandas()), ignore_index=True)
assert df.equals(dfds)
@ -392,21 +391,21 @@ def test_to_arrow(ray_start_regular_shared):
# Zero-copy.
df = pd.DataFrame({"value": list(range(n))})
ds = ray.experimental.data.range_arrow(n)
ds = ray.data.range_arrow(n)
dfds = pd.concat(
[t.to_pandas() for t in ray.get(ds.to_arrow())], ignore_index=True)
assert df.equals(dfds)
# Conversion.
df = pd.DataFrame({0: list(range(n))})
ds = ray.experimental.data.range(n)
ds = ray.data.range(n)
dfds = pd.concat(
[t.to_pandas() for t in ray.get(ds.to_arrow())], ignore_index=True)
assert df.equals(dfds)
def test_get_blocks(ray_start_regular_shared):
blocks = ray.experimental.data.range(10).get_blocks()
blocks = ray.data.range(10).get_blocks()
assert len(blocks) == 10
out = []
for b in ray.get(blocks):
@ -418,7 +417,7 @@ def test_get_blocks(ray_start_regular_shared):
def test_pandas_roundtrip(ray_start_regular_shared, tmp_path):
df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
ds = ray.experimental.data.from_pandas([ray.put(df1), ray.put(df2)])
ds = ray.data.from_pandas([ray.put(df1), ray.put(df2)])
dfds = pd.concat(ray.get(ds.to_pandas()))
assert pd.concat([df1, df2]).equals(dfds)
@ -441,7 +440,7 @@ def test_fsspec_filesystem(ray_start_regular_shared, tmp_path):
fs = LocalFileSystem()
ds = ray.experimental.data.read_parquet([path1, path2], filesystem=fs)
ds = ray.data.read_parquet([path1, path2], filesystem=fs)
# Test metadata-only parquet ops.
assert len(ds._blocks._blocks) == 1
@ -456,7 +455,7 @@ def test_parquet_read(ray_start_regular_shared, tmp_path):
table = pa.Table.from_pandas(df2)
pq.write_table(table, os.path.join(str(tmp_path), "test2.parquet"))
ds = ray.experimental.data.read_parquet(str(tmp_path))
ds = ray.data.read_parquet(str(tmp_path))
# Test metadata-only parquet ops.
assert len(ds._blocks._blocks) == 1
@ -482,7 +481,7 @@ def test_parquet_read(ray_start_regular_shared, tmp_path):
[6, "g"]]
# Test column selection.
ds = ray.experimental.data.read_parquet(str(tmp_path), columns=["one"])
ds = ray.data.read_parquet(str(tmp_path), columns=["one"])
values = [s["one"] for s in ds.take()]
assert sorted(values) == [1, 2, 3, 4, 5, 6]
@ -491,7 +490,7 @@ def test_parquet_write(ray_start_regular_shared, tmp_path):
df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
df = pd.concat([df1, df2])
ds = ray.experimental.data.from_pandas([ray.put(df1), ray.put(df2)])
ds = ray.data.from_pandas([ray.put(df1), ray.put(df2)])
path = os.path.join(tmp_path, "test_parquet_dir")
os.mkdir(path)
ds._set_uuid("data")
@ -503,16 +502,16 @@ def test_parquet_write(ray_start_regular_shared, tmp_path):
def test_convert_to_pyarrow(ray_start_regular_shared, tmp_path):
ds = ray.experimental.data.range(100)
ds = ray.data.range(100)
assert ds.to_dask().sum().compute()[0] == 4950
path = os.path.join(tmp_path, "test_parquet_dir")
os.mkdir(path)
ds.write_parquet(path)
assert ray.experimental.data.read_parquet(path).count() == 100
assert ray.data.read_parquet(path).count() == 100
def test_pyarrow(ray_start_regular_shared):
ds = ray.experimental.data.range_arrow(5)
ds = ray.data.range_arrow(5)
assert ds.map(lambda x: {"b": x["value"] + 2}).take() == \
[{"b": 2}, {"b": 3}, {"b": 4}, {"b": 5}, {"b": 6}]
assert ds.map(lambda x: {"b": x["value"] + 2}) \
@ -525,7 +524,7 @@ def test_pyarrow(ray_start_regular_shared):
def test_read_binary_files(ray_start_regular_shared):
with util.gen_bin_files(10) as (_, paths):
ds = ray.experimental.data.read_binary_files(paths, parallelism=10)
ds = ray.data.read_binary_files(paths, parallelism=10)
for i, item in enumerate(ds.iter_rows()):
expected = open(paths[i], "rb").read()
assert expected == item
@ -539,8 +538,7 @@ def test_read_binary_files_with_fs(ray_start_regular_shared):
with util.gen_bin_files(10) as (tempdir, paths):
# All the paths are absolute, so we want the root file system.
fs, _ = pa.fs.FileSystem.from_uri("/")
ds = ray.experimental.data.read_binary_files(
paths, filesystem=fs, parallelism=10)
ds = ray.data.read_binary_files(paths, filesystem=fs, parallelism=10)
for i, item in enumerate(ds.iter_rows()):
expected = open(paths[i], "rb").read()
assert expected == item
@ -548,7 +546,7 @@ def test_read_binary_files_with_fs(ray_start_regular_shared):
def test_read_binary_files_with_paths(ray_start_regular_shared):
with util.gen_bin_files(10) as (_, paths):
ds = ray.experimental.data.read_binary_files(
ds = ray.data.read_binary_files(
paths, include_paths=True, parallelism=10)
for i, (path, item) in enumerate(ds.iter_rows()):
assert path == paths[i]
@ -560,8 +558,7 @@ def test_read_binary_files_with_paths(ray_start_regular_shared):
# credentials issue, unskip this test once that's fixed or once ported to moto.
@pytest.mark.skip(reason="Shouldn't hit S3 in CI")
def test_read_binary_files_s3(ray_start_regular_shared):
ds = ray.experimental.data.read_binary_files(
["s3://anyscale-data/small-files/0.dat"])
ds = ray.data.read_binary_files(["s3://anyscale-data/small-files/0.dat"])
item = ds.take(1).pop()
expected = requests.get(
"https://anyscale-data.s3.us-west-2.amazonaws.com/small-files/0.dat"
@ -575,7 +572,7 @@ def test_iter_batches_basic(ray_start_regular_shared):
df3 = pd.DataFrame({"one": [7, 8, 9], "two": [8, 9, 10]})
df4 = pd.DataFrame({"one": [10, 11, 12], "two": [11, 12, 13]})
dfs = [df1, df2, df3, df4]
ds = ray.experimental.data.from_pandas(
ds = ray.data.from_pandas(
[ray.put(df1), ray.put(df2),
ray.put(df3), ray.put(df4)])
@ -677,7 +674,7 @@ def test_iter_batches_grid(ray_start_regular_shared):
}))
running_size += block_size
num_rows = running_size
ds = ray.experimental.data.from_pandas([ray.put(df) for df in dfs])
ds = ray.data.from_pandas([ray.put(df) for df in dfs])
for batch_size in np.random.randint(
1, num_rows + 1, size=batch_size_samples):
for drop_last in (False, True):
@ -720,7 +717,7 @@ def test_iter_batches_grid(ray_start_regular_shared):
def test_lazy_loading_iter_batches_exponential_rampup(
ray_start_regular_shared):
ds = ray.experimental.data.range(32, parallelism=8)
ds = ray.data.range(32, parallelism=8)
expected_num_blocks = [1, 2, 4, 4, 8, 8, 8, 8]
for _, expected in zip(ds.iter_batches(), expected_num_blocks):
assert len(ds._blocks._blocks) == expected
@ -728,7 +725,7 @@ def test_lazy_loading_iter_batches_exponential_rampup(
def test_map_batch(ray_start_regular_shared, tmp_path):
# Test input validation
ds = ray.experimental.data.range(5)
ds = ray.data.range(5)
with pytest.raises(ValueError):
ds.map_batches(
lambda x: x + 1, batch_format="pyarrow", batch_size=-1).take()
@ -737,7 +734,7 @@ def test_map_batch(ray_start_regular_shared, tmp_path):
df = pd.DataFrame({"one": [1, 2, 3], "two": [2, 3, 4]})
table = pa.Table.from_pandas(df)
pq.write_table(table, os.path.join(tmp_path, "test1.parquet"))
ds = ray.experimental.data.read_parquet(str(tmp_path))
ds = ray.data.read_parquet(str(tmp_path))
ds_list = ds.map_batches(
lambda df: df + 1, batch_size=1, batch_format="pandas").take()
values = [s["one"] for s in ds_list]
@ -746,7 +743,7 @@ def test_map_batch(ray_start_regular_shared, tmp_path):
assert values == [3, 4, 5]
# Test Pyarrow
ds = ray.experimental.data.read_parquet(str(tmp_path))
ds = ray.data.read_parquet(str(tmp_path))
ds_list = ds.map_batches(
lambda pa: pa, batch_size=1, batch_format="pyarrow").take()
values = [s["one"] for s in ds_list]
@ -756,7 +753,7 @@ def test_map_batch(ray_start_regular_shared, tmp_path):
# Test batch
size = 300
ds = ray.experimental.data.range(size)
ds = ray.data.range(size)
ds_list = ds.map_batches(
lambda df: df + 1, batch_size=17,
batch_format="pandas").take(limit=size)
@ -769,27 +766,27 @@ def test_map_batch(ray_start_regular_shared, tmp_path):
# Test the lambda returns different types than the batch_format
# pandas => list block
ds = ray.experimental.data.read_parquet(str(tmp_path))
ds = ray.data.read_parquet(str(tmp_path))
ds_list = ds.map_batches(lambda df: [1], batch_size=1).take()
assert ds_list == [1, 1, 1]
assert ds.count() == 3
# pyarrow => list block
ds = ray.experimental.data.read_parquet(str(tmp_path))
ds = ray.data.read_parquet(str(tmp_path))
ds_list = ds.map_batches(
lambda df: [1], batch_size=1, batch_format="pyarrow").take()
assert ds_list == [1, 1, 1]
assert ds.count() == 3
# Test the wrong return value raises an exception.
ds = ray.experimental.data.read_parquet(str(tmp_path))
ds = ray.data.read_parquet(str(tmp_path))
with pytest.raises(ValueError):
ds_list = ds.map_batches(
lambda df: 1, batch_size=2, batch_format="pyarrow").take()
def test_split(ray_start_regular_shared):
ds = ray.experimental.data.range(20, parallelism=10)
ds = ray.data.range(20, parallelism=10)
assert ds.num_blocks() == 10
assert ds.sum() == 190
assert ds._block_sizes() == [2] * 10
@ -839,7 +836,7 @@ def test_split_hints(ray_start_regular_shared):
datasets[1] contains block 2.
"""
num_blocks = len(block_node_ids)
ds = ray.experimental.data.range(num_blocks, parallelism=num_blocks)
ds = ray.data.range(num_blocks, parallelism=num_blocks)
blocks = list(ds._blocks)
assert len(block_node_ids) == len(blocks)
actors = [Actor.remote() for i in range(len(actor_node_ids))]
@ -921,7 +918,7 @@ def test_from_dask(ray_start_regular_shared):
import dask.dataframe as dd
df = pd.DataFrame({"one": list(range(100)), "two": list(range(100))})
ddf = dd.from_pandas(df, npartitions=10)
ds = ray.experimental.data.from_dask(ddf)
ds = ray.data.from_dask(ddf)
dfds = pd.concat(ray.get(ds.to_pandas()))
assert df.equals(dfds)
@ -931,7 +928,7 @@ def test_to_dask(ray_start_regular_shared):
df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
df = pd.concat([df1, df2])
ds = ray.experimental.data.from_pandas([ray.put(df1), ray.put(df2)])
ds = ray.data.from_pandas([ray.put(df1), ray.put(df2)])
ddf = ds.to_dask()
# Explicit Dask-on-Ray
assert df.equals(ddf.compute(scheduler=ray_dask_get))
@ -954,8 +951,7 @@ def test_to_tf(ray_start_regular_shared, pipelined):
})
df3 = pd.DataFrame({"one": [7, 8], "two": [7.0, 8.0], "label": [7.0, 8.0]})
df = pd.concat([df1, df2, df3])
ds = ray.experimental.data.from_pandas(
[ray.put(df1), ray.put(df2), ray.put(df3)])
ds = ray.data.from_pandas([ray.put(df1), ray.put(df2), ray.put(df3)])
ds = maybe_pipeline(ds, pipelined)
tfd = ds.to_tf(
label_column="label",
@ -983,8 +979,7 @@ def test_to_tf_feature_columns(ray_start_regular_shared):
})
df3 = pd.DataFrame({"one": [7, 8], "two": [7.0, 8.0], "label": [7.0, 8.0]})
df = pd.concat([df1, df2, df3]).drop("two", axis=1)
ds = ray.experimental.data.from_pandas(
[ray.put(df1), ray.put(df2), ray.put(df3)])
ds = ray.data.from_pandas([ray.put(df1), ray.put(df2), ray.put(df3)])
tfd = ds.to_tf(
label_column="label",
feature_columns=["one"],
@ -1013,8 +1008,7 @@ def test_to_torch(ray_start_regular_shared, pipelined):
})
df3 = pd.DataFrame({"one": [7, 8], "two": [7.0, 8.0], "label": [7.0, 8.0]})
df = pd.concat([df1, df2, df3])
ds = ray.experimental.data.from_pandas(
[ray.put(df1), ray.put(df2), ray.put(df3)])
ds = ray.data.from_pandas([ray.put(df1), ray.put(df2), ray.put(df3)])
ds = maybe_pipeline(ds, pipelined)
torchd = ds.to_torch(label_column="label", batch_size=3)
@ -1041,8 +1035,7 @@ def test_to_torch_feature_columns(ray_start_regular_shared):
})
df3 = pd.DataFrame({"one": [7, 8], "two": [7.0, 8.0], "label": [7.0, 8.0]})
df = pd.concat([df1, df2, df3]).drop("two", axis=1)
ds = ray.experimental.data.from_pandas(
[ray.put(df1), ray.put(df2), ray.put(df3)])
ds = ray.data.from_pandas([ray.put(df1), ray.put(df2), ray.put(df3)])
torchd = ds.to_torch(
label_column="label", feature_columns=["one"], batch_size=3)
iterations = []
@ -1058,7 +1051,7 @@ def test_json_read(ray_start_regular_shared, tmp_path):
df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
path1 = os.path.join(tmp_path, "test1.json")
df1.to_json(path1, orient="records", lines=True)
ds = ray.experimental.data.read_json(path1)
ds = ray.data.read_json(path1)
assert df1.equals(ray.get(ds.to_pandas())[0])
# Test metadata ops.
assert ds.count() == 3
@ -1069,7 +1062,7 @@ def test_json_read(ray_start_regular_shared, tmp_path):
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
path2 = os.path.join(tmp_path, "test2.json")
df2.to_json(path2, orient="records", lines=True)
ds = ray.experimental.data.read_json([path1, path2], parallelism=2)
ds = ray.data.read_json([path1, path2], parallelism=2)
dsdf = pd.concat(ray.get(ds.to_pandas()))
assert pd.concat([df1, df2]).equals(dsdf)
# Test metadata ops.
@ -1081,7 +1074,7 @@ def test_json_read(ray_start_regular_shared, tmp_path):
path3 = os.path.join(tmp_path, "test3.json")
df3.to_json(path3, orient="records", lines=True)
df = pd.concat([df1, df2, df3], ignore_index=True)
ds = ray.experimental.data.read_json([path1, path2, path3], parallelism=2)
ds = ray.data.read_json([path1, path2, path3], parallelism=2)
dsdf = pd.concat(ray.get(ds.to_pandas()), ignore_index=True)
assert df.equals(dsdf)
@ -1094,7 +1087,7 @@ def test_json_read(ray_start_regular_shared, tmp_path):
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
path2 = os.path.join(path, "data1.json")
df2.to_json(path2, orient="records", lines=True)
ds = ray.experimental.data.read_json(path)
ds = ray.data.read_json(path)
df = pd.concat([df1, df2])
dsdf = pd.concat(ray.get(ds.to_pandas()))
assert df.equals(dsdf)
@ -1114,7 +1107,7 @@ def test_json_read(ray_start_regular_shared, tmp_path):
df3 = pd.DataFrame({"one": [7, 8, 9], "two": ["h", "i", "j"]})
file_path3 = os.path.join(path2, "data2.json")
df3.to_json(file_path3, orient="records", lines=True)
ds = ray.experimental.data.read_json([path1, path2])
ds = ray.data.read_json([path1, path2])
df = pd.concat([df1, df2, df3])
dsdf = pd.concat(ray.get(ds.to_pandas()))
assert df.equals(dsdf)
@ -1130,7 +1123,7 @@ def test_json_read(ray_start_regular_shared, tmp_path):
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
path2 = os.path.join(tmp_path, "data1.json")
df2.to_json(path2, orient="records", lines=True)
ds = ray.experimental.data.read_json([dir_path, path2])
ds = ray.data.read_json([dir_path, path2])
df = pd.concat([df1, df2])
dsdf = pd.concat(ray.get(ds.to_pandas()))
assert df.equals(dsdf)
@ -1142,7 +1135,7 @@ def test_zipped_json_read(ray_start_regular_shared, tmp_path):
df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
path1 = os.path.join(tmp_path, "test1.json.gz")
df1.to_json(path1, compression="gzip", orient="records", lines=True)
ds = ray.experimental.data.read_json(path1)
ds = ray.data.read_json(path1)
assert df1.equals(ray.get(ds.to_pandas())[0])
# Test metadata ops.
assert ds.count() == 3
@ -1152,7 +1145,7 @@ def test_zipped_json_read(ray_start_regular_shared, tmp_path):
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
path2 = os.path.join(tmp_path, "test2.json.gz")
df2.to_json(path2, compression="gzip", orient="records", lines=True)
ds = ray.experimental.data.read_json([path1, path2], parallelism=2)
ds = ray.data.read_json([path1, path2], parallelism=2)
dsdf = pd.concat(ray.get(ds.to_pandas()))
assert pd.concat([df1, df2]).equals(dsdf)
# Test metadata ops.
@ -1168,7 +1161,7 @@ def test_zipped_json_read(ray_start_regular_shared, tmp_path):
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
path2 = os.path.join(tmp_path, "data1.json.gz")
df2.to_json(path2, compression="gzip", orient="records", lines=True)
ds = ray.experimental.data.read_json([dir_path, path2])
ds = ray.data.read_json([dir_path, path2])
df = pd.concat([df1, df2])
dsdf = pd.concat(ray.get(ds.to_pandas()))
assert df.equals(dsdf)
@ -1181,7 +1174,7 @@ def test_json_write(ray_start_regular_shared, tmp_path):
# Single block.
os.mkdir(path)
df = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
ds = ray.experimental.data.from_pandas([ray.put(df)])
ds = ray.data.from_pandas([ray.put(df)])
ds._set_uuid("data")
ds.write_json(path)
file_path = os.path.join(path, "data_000000.json")
@ -1191,7 +1184,7 @@ def test_json_write(ray_start_regular_shared, tmp_path):
# Two blocks.
os.mkdir(path)
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
ds = ray.experimental.data.from_pandas([ray.put(df), ray.put(df2)])
ds = ray.data.from_pandas([ray.put(df), ray.put(df2)])
ds._set_uuid("data")
ds.write_json(path)
file_path2 = os.path.join(path, "data_000001.json")
@ -1206,7 +1199,7 @@ def test_csv_read(ray_start_regular_shared, tmp_path):
df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
path1 = os.path.join(tmp_path, "test1.csv")
df1.to_csv(path1, index=False)
ds = ray.experimental.data.read_csv(path1)
ds = ray.data.read_csv(path1)
dsdf = ray.get(ds.to_pandas())[0]
assert df1.equals(dsdf)
# Test metadata ops.
@ -1218,7 +1211,7 @@ def test_csv_read(ray_start_regular_shared, tmp_path):
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
path2 = os.path.join(tmp_path, "test2.csv")
df2.to_csv(path2, index=False)
ds = ray.experimental.data.read_csv([path1, path2], parallelism=2)
ds = ray.data.read_csv([path1, path2], parallelism=2)
dsdf = pd.concat(ray.get(ds.to_pandas()))
df = pd.concat([df1, df2])
assert df.equals(dsdf)
@ -1230,7 +1223,7 @@ def test_csv_read(ray_start_regular_shared, tmp_path):
df3 = pd.DataFrame({"one": [7, 8, 9], "two": ["h", "i", "j"]})
path3 = os.path.join(tmp_path, "test3.csv")
df3.to_csv(path3, index=False)
ds = ray.experimental.data.read_csv([path1, path2, path3], parallelism=2)
ds = ray.data.read_csv([path1, path2, path3], parallelism=2)
df = pd.concat([df1, df2, df3], ignore_index=True)
dsdf = pd.concat(ray.get(ds.to_pandas()), ignore_index=True)
assert df.equals(dsdf)
@ -1244,7 +1237,7 @@ def test_csv_read(ray_start_regular_shared, tmp_path):
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
path2 = os.path.join(path, "data1.csv")
df2.to_csv(path2, index=False)
ds = ray.experimental.data.read_csv(path)
ds = ray.data.read_csv(path)
df = pd.concat([df1, df2])
dsdf = pd.concat(ray.get(ds.to_pandas()))
assert df.equals(dsdf)
@ -1264,7 +1257,7 @@ def test_csv_read(ray_start_regular_shared, tmp_path):
df3 = pd.DataFrame({"one": [7, 8, 9], "two": ["h", "i", "j"]})
file_path3 = os.path.join(path2, "data2.csv")
df3.to_csv(file_path3, index=False)
ds = ray.experimental.data.read_csv([path1, path2])
ds = ray.data.read_csv([path1, path2])
df = pd.concat([df1, df2, df3])
dsdf = pd.concat(ray.get(ds.to_pandas()))
assert df.equals(dsdf)
@ -1280,7 +1273,7 @@ def test_csv_read(ray_start_regular_shared, tmp_path):
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
path2 = os.path.join(tmp_path, "data1.csv")
df2.to_csv(path2, index=False)
ds = ray.experimental.data.read_csv([dir_path, path2])
ds = ray.data.read_csv([dir_path, path2])
df = pd.concat([df1, df2])
dsdf = pd.concat(ray.get(ds.to_pandas()))
assert df.equals(dsdf)
@ -1293,7 +1286,7 @@ def test_csv_write(ray_start_regular_shared, tmp_path):
# Single block.
os.mkdir(path)
df = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
ds = ray.experimental.data.from_pandas([ray.put(df)])
ds = ray.data.from_pandas([ray.put(df)])
ds._set_uuid("data")
ds.write_csv(path)
file_path = os.path.join(path, "data_000000.csv")
@ -1303,7 +1296,7 @@ def test_csv_write(ray_start_regular_shared, tmp_path):
# Two blocks.
os.mkdir(path)
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
ds = ray.experimental.data.from_pandas([ray.put(df), ray.put(df2)])
ds = ray.data.from_pandas([ray.put(df), ray.put(df2)])
ds._set_uuid("data")
ds.write_csv(path)
file_path2 = os.path.join(path, "data_000001.csv")
@ -1318,7 +1311,7 @@ def test_sort_simple(ray_start_regular_shared):
parallelism = 4
xs = list(range(num_items))
random.shuffle(xs)
ds = ray.experimental.data.from_items(xs, parallelism=parallelism)
ds = ray.data.from_items(xs, parallelism=parallelism)
assert ds.sort().take(num_items) == list(range(num_items))
assert ds.sort(descending=True).take(num_items) == list(
reversed(range(num_items)))
@ -1380,7 +1373,7 @@ def test_sort_arrow(ray_start_regular_shared, num_items, parallelism):
offset += shard
if offset < num_items:
dfs.append(pd.DataFrame({"a": a[offset:], "b": b[offset:]}))
ds = ray.experimental.data.from_pandas([ray.put(df) for df in dfs])
ds = ray.data.from_pandas([ray.put(df) for df in dfs])
def assert_sorted(sorted_ds, expected_rows):
assert [tuple(row.values())


@ -5,14 +5,14 @@ import pytest
import pandas as pd
import ray
from ray.experimental.data.dataset_pipeline import DatasetPipeline
from ray.data.dataset_pipeline import DatasetPipeline
from ray.tests.conftest import * # noqa
def test_pipeline_actors(shutdown_only):
ray.init(num_cpus=2, num_gpus=1)
pipe = ray.experimental.data.range(3) \
pipe = ray.data.range(3) \
.repeat(10) \
.map(lambda x: x + 1) \
.map(lambda x: x + 1, compute="actors", num_gpus=1)
@ -29,13 +29,13 @@ def test_incremental_take(shutdown_only):
time.sleep(999999)
return x
pipe = ray.experimental.data.range(2).pipeline(parallelism=1)
pipe = ray.data.range(2).pipeline(parallelism=1)
pipe = pipe.map(block_on_ones)
assert pipe.take(1) == [0]
def test_basic_pipeline(ray_start_regular_shared):
ds = ray.experimental.data.range(10)
ds = ray.data.range(10)
pipe = ds.pipeline(parallelism=1)
assert str(pipe) == "DatasetPipeline(length=10, num_stages=1)"
@ -64,7 +64,7 @@ def test_from_iterable(ray_start_regular_shared):
def test_repeat_forever(ray_start_regular_shared):
ds = ray.experimental.data.range(10)
ds = ray.data.range(10)
pipe = ds.repeat()
assert str(pipe) == "DatasetPipeline(length=None, num_stages=1)"
for i, v in enumerate(pipe.iter_rows()):
@ -74,42 +74,42 @@ def test_repeat_forever(ray_start_regular_shared):
def test_repartition(ray_start_regular_shared):
pipe = ray.experimental.data.range(10).repeat(10)
pipe = ray.data.range(10).repeat(10)
assert pipe.repartition(1).sum() == 450
assert pipe.repartition(10).sum() == 450
assert pipe.repartition(100).sum() == 450
def test_iter_batches(ray_start_regular_shared):
pipe = ray.experimental.data.range(10).pipeline(parallelism=2)
pipe = ray.data.range(10).pipeline(parallelism=2)
batches = list(pipe.iter_batches())
assert len(batches) == 10
assert all(len(e) == 1 for e in batches)
def test_iter_datasets(ray_start_regular_shared):
pipe = ray.experimental.data.range(10).pipeline(parallelism=2)
pipe = ray.data.range(10).pipeline(parallelism=2)
ds = list(pipe.iter_datasets())
assert len(ds) == 5
pipe = ray.experimental.data.range(10).pipeline(parallelism=5)
pipe = ray.data.range(10).pipeline(parallelism=5)
ds = list(pipe.iter_datasets())
assert len(ds) == 2
def test_foreach_dataset(ray_start_regular_shared):
pipe = ray.experimental.data.range(5).pipeline(parallelism=2)
pipe = ray.data.range(5).pipeline(parallelism=2)
pipe = pipe.foreach_dataset(lambda ds: ds.map(lambda x: x * 2))
assert pipe.take() == [0, 2, 4, 6, 8]
def test_schema(ray_start_regular_shared):
pipe = ray.experimental.data.range(5).pipeline(parallelism=2)
pipe = ray.data.range(5).pipeline(parallelism=2)
assert pipe.schema() == int
def test_split(ray_start_regular_shared):
pipe = ray.experimental.data.range(3) \
pipe = ray.data.range(3) \
.map(lambda x: x + 1) \
.repeat(10)
@ -130,7 +130,7 @@ def test_parquet_write(ray_start_regular_shared, tmp_path):
df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
df = pd.concat([df1, df2])
ds = ray.experimental.data.from_pandas([ray.put(df1), ray.put(df2)])
ds = ray.data.from_pandas([ray.put(df1), ray.put(df2)])
ds = ds.pipeline(parallelism=1)
path = os.path.join(tmp_path, "test_parquet_dir")
os.mkdir(path)


@ -1,5 +1,4 @@
from ray.experimental.dynamic_resources import set_resource
from ray.experimental.packaging.load_package import load_package
from ray.experimental.locations import get_object_locations
from ray.experimental import data
__all__ = ["get_object_locations", "set_resource", "load_package", "data"]
__all__ = ["get_object_locations", "set_resource", "load_package"]


@ -1,25 +0,0 @@
from ray.experimental.data.datasource.datasource import (
Datasource, RangeDatasource, DummyOutputDatasource, ReadTask, WriteTask)
from ray.experimental.data.datasource.json_datasource import JSONDatasource
from ray.experimental.data.datasource.csv_datasource import CSVDatasource
from ray.experimental.data.datasource.numpy_datasource import NumpyDatasource
from ray.experimental.data.datasource.parquet_datasource import (
ParquetDatasource)
from ray.experimental.data.datasource.binary_datasource import BinaryDatasource
from ray.experimental.data.datasource.file_based_datasource import (
FileBasedDatasource, _S3FileSystemWrapper)
__all__ = [
"JSONDatasource",
"CSVDatasource",
"NumpyDatasource",
"ParquetDatasource",
"BinaryDatasource",
"FileBasedDatasource",
"_S3FileSystemWrapper",
"Datasource",
"RangeDatasource",
"DummyOutputDatasource",
"ReadTask",
"WriteTask",
]


@ -78,7 +78,7 @@ ray.init()
start_time = time.time()
print("Downloading...")
ds = ray.experimental.data.read_binary_files(
ds = ray.data.read_binary_files(
"s3://anyscale-data/small-images/",
parallelism=1000,
ray_remote_args={"num_cpus": 0.5})