mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[data] Move Block to public API so that datasource API doesn't reference a private interface (#17098)
This commit is contained in:
parent
673d6822c7
commit
85725f2018
13 changed files with 104 additions and 91 deletions
|
@ -1,32 +1,16 @@
|
|||
import sys
|
||||
from typing import TypeVar, List, Generic, Iterator, Any, Union, Optional, \
|
||||
TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import pandas
|
||||
import pyarrow
|
||||
from ray.experimental.data.impl.block_builder import BlockBuilder
|
||||
|
||||
# TODO(ekl) shouldn't Ray provide an ObjectRef type natively?
|
||||
ObjectRef = List
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
class BlockBuilder(Generic[T]):
|
||||
"""A builder class for blocks."""
|
||||
|
||||
def add(self, item: T) -> None:
|
||||
"""Append a single row to the block being built."""
|
||||
raise NotImplementedError
|
||||
|
||||
def add_block(self, block: "Block[T]") -> None:
|
||||
"""Append an entire block to the block being built."""
|
||||
raise NotImplementedError
|
||||
|
||||
def build(self) -> "Block[T]":
|
||||
"""Build the block."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class BlockMetadata:
|
||||
"""Metadata about the block.
|
||||
|
||||
|
@ -102,58 +86,6 @@ class Block(Generic[T]):
|
|||
input_files=input_files)
|
||||
|
||||
@staticmethod
|
||||
def builder() -> BlockBuilder[T]:
|
||||
def builder() -> "BlockBuilder[T]":
|
||||
"""Create a builder for this block type."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class SimpleBlockBuilder(BlockBuilder[T]):
|
||||
def __init__(self):
|
||||
self._items = []
|
||||
|
||||
def add(self, item: T) -> None:
|
||||
self._items.append(item)
|
||||
|
||||
def add_block(self, block: "SimpleBlock[T]") -> None:
|
||||
self._items.extend(block._items)
|
||||
|
||||
def build(self) -> "SimpleBlock[T]":
|
||||
return SimpleBlock(self._items)
|
||||
|
||||
|
||||
class SimpleBlock(Block):
|
||||
def __init__(self, items):
|
||||
self._items = items
|
||||
|
||||
def num_rows(self) -> int:
|
||||
return len(self._items)
|
||||
|
||||
def iter_rows(self) -> Iterator[T]:
|
||||
return iter(self._items)
|
||||
|
||||
def slice(self, start: int, end: int, copy: bool) -> "SimpleBlock[T]":
|
||||
view = self._items[start:end]
|
||||
if copy:
|
||||
view = view.copy()
|
||||
return SimpleBlock(view)
|
||||
|
||||
def to_pandas(self) -> "pandas.DataFrame":
|
||||
import pandas
|
||||
return pandas.DataFrame(self._items)
|
||||
|
||||
def to_arrow_table(self) -> "pyarrow.Table":
|
||||
import pyarrow
|
||||
return pyarrow.Table.from_pandas(self.to_pandas())
|
||||
|
||||
def size_bytes(self) -> int:
|
||||
return sys.getsizeof(self._items)
|
||||
|
||||
def schema(self) -> Any:
|
||||
if self._items:
|
||||
return type(self._items[0])
|
||||
else:
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def builder() -> SimpleBlockBuilder[T]:
|
||||
return SimpleBlockBuilder()
|
|
@ -20,13 +20,13 @@ import itertools
|
|||
import numpy as np
|
||||
|
||||
import ray
|
||||
from ray.experimental.data.block import ObjectRef, Block, BlockMetadata
|
||||
from ray.experimental.data.datasource import Datasource, WriteTask
|
||||
from ray.experimental.data.impl.batcher import Batcher
|
||||
from ray.experimental.data.impl.compute import get_compute
|
||||
from ray.experimental.data.impl.progress_bar import ProgressBar
|
||||
from ray.experimental.data.impl.shuffle import simple_shuffle
|
||||
from ray.experimental.data.impl.block import ObjectRef, Block, SimpleBlock, \
|
||||
BlockMetadata
|
||||
from ray.experimental.data.impl.block_builder import SimpleBlock
|
||||
from ray.experimental.data.impl.block_list import BlockList
|
||||
from ray.experimental.data.impl.arrow_block import (
|
||||
DelegatingArrowBlockBuilder, ArrowBlock)
|
||||
|
@ -747,7 +747,9 @@ class Dataset(Generic[T]):
|
|||
write_args: Additional write args to pass to the datasource.
|
||||
"""
|
||||
|
||||
write_tasks = datasource.prepare_write(self._blocks, **write_args)
|
||||
write_tasks = datasource.prepare_write(self._blocks,
|
||||
self._blocks.get_metadata(),
|
||||
**write_args)
|
||||
progress = ProgressBar("Write Progress", len(write_tasks))
|
||||
|
||||
@ray.remote
|
||||
|
|
|
@ -1,14 +1,13 @@
|
|||
import builtins
|
||||
from typing import Any, Generic, List, Callable, Union, TypeVar
|
||||
from typing import Any, Generic, List, Callable, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
import ray
|
||||
from ray.experimental.data.block import Block, BlockMetadata, ObjectRef, T
|
||||
from ray.experimental.data.impl.block_builder import SimpleBlock
|
||||
from ray.experimental.data.impl.arrow_block import ArrowRow, ArrowBlock
|
||||
from ray.experimental.data.impl.block import Block, SimpleBlock
|
||||
from ray.experimental.data.impl.block_list import BlockList, BlockMetadata
|
||||
|
||||
T = TypeVar("T")
|
||||
WriteResult = Any
|
||||
|
||||
|
||||
|
@ -37,13 +36,15 @@ class Datasource(Generic[T]):
|
|||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def prepare_write(self, blocks: BlockList,
|
||||
def prepare_write(self, blocks: List[ObjectRef[Block[T]]],
|
||||
metadata: List[BlockMetadata],
|
||||
**write_args) -> List["WriteTask[T]"]:
|
||||
"""Return the list of tasks needed to perform a write.
|
||||
|
||||
Args:
|
||||
blocks: List of data block references and block metadata. It is
|
||||
recommended that one write task be generated per block.
|
||||
blocks: List of data block references. It is recommended that one
|
||||
write task be generated per block.
|
||||
metadata: List of block metadata.
|
||||
write_args: Additional kwargs to pass to the datasource impl.
|
||||
|
||||
Returns:
|
||||
|
@ -200,7 +201,8 @@ class DummyOutputDatasource(Datasource[Union[ArrowRow, int]]):
|
|||
self.num_ok = 0
|
||||
self.num_failed = 0
|
||||
|
||||
def prepare_write(self, blocks: BlockList,
|
||||
def prepare_write(self, blocks: List[ObjectRef[Block[T]]],
|
||||
metadata: List[BlockMetadata],
|
||||
**write_args) -> List["WriteTask[T]"]:
|
||||
tasks = []
|
||||
for b in blocks:
|
||||
|
|
|
@ -6,7 +6,8 @@ try:
|
|||
except ImportError:
|
||||
pyarrow = None
|
||||
|
||||
from ray.experimental.data.impl.block import Block, BlockBuilder, \
|
||||
from ray.experimental.data.block import Block
|
||||
from ray.experimental.data.impl.block_builder import BlockBuilder, \
|
||||
SimpleBlockBuilder
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from typing import Optional
|
||||
|
||||
from ray.experimental.data.impl.block import Block
|
||||
from ray.experimental.data.block import Block
|
||||
from ray.experimental.data.impl.arrow_block import DelegatingArrowBlockBuilder
|
||||
|
||||
|
||||
|
|
76
python/ray/experimental/data/impl/block_builder.py
Normal file
76
python/ray/experimental/data/impl/block_builder.py
Normal file
|
@ -0,0 +1,76 @@
|
|||
import sys
|
||||
from typing import Iterator, Generic, Any, TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import pandas
|
||||
import pyarrow
|
||||
|
||||
from ray.experimental.data.block import Block, T
|
||||
|
||||
|
||||
class BlockBuilder(Generic[T]):
|
||||
"""A builder class for blocks."""
|
||||
|
||||
def add(self, item: T) -> None:
|
||||
"""Append a single row to the block being built."""
|
||||
raise NotImplementedError
|
||||
|
||||
def add_block(self, block: "Block[T]") -> None:
|
||||
"""Append an entire block to the block being built."""
|
||||
raise NotImplementedError
|
||||
|
||||
def build(self) -> "Block[T]":
|
||||
"""Build the block."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class SimpleBlockBuilder(BlockBuilder[T]):
|
||||
def __init__(self):
|
||||
self._items = []
|
||||
|
||||
def add(self, item: T) -> None:
|
||||
self._items.append(item)
|
||||
|
||||
def add_block(self, block: "SimpleBlock[T]") -> None:
|
||||
self._items.extend(block._items)
|
||||
|
||||
def build(self) -> "SimpleBlock[T]":
|
||||
return SimpleBlock(self._items)
|
||||
|
||||
|
||||
class SimpleBlock(Block):
|
||||
def __init__(self, items):
|
||||
self._items = items
|
||||
|
||||
def num_rows(self) -> int:
|
||||
return len(self._items)
|
||||
|
||||
def iter_rows(self) -> Iterator[T]:
|
||||
return iter(self._items)
|
||||
|
||||
def slice(self, start: int, end: int, copy: bool) -> "SimpleBlock[T]":
|
||||
view = self._items[start:end]
|
||||
if copy:
|
||||
view = view.copy()
|
||||
return SimpleBlock(view)
|
||||
|
||||
def to_pandas(self) -> "pandas.DataFrame":
|
||||
import pandas
|
||||
return pandas.DataFrame(self._items)
|
||||
|
||||
def to_arrow_table(self) -> "pyarrow.Table":
|
||||
import pyarrow
|
||||
return pyarrow.Table.from_pandas(self.to_pandas())
|
||||
|
||||
def size_bytes(self) -> int:
|
||||
return sys.getsizeof(self._items)
|
||||
|
||||
def schema(self) -> Any:
|
||||
if self._items:
|
||||
return type(self._items[0])
|
||||
else:
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def builder() -> SimpleBlockBuilder[T]:
|
||||
return SimpleBlockBuilder()
|
|
@ -1,6 +1,6 @@
|
|||
from typing import Iterable, List
|
||||
|
||||
from ray.experimental.data.impl.block import Block, BlockMetadata, ObjectRef, T
|
||||
from ray.experimental.data.block import Block, BlockMetadata, ObjectRef, T
|
||||
|
||||
|
||||
class BlockList(Iterable[ObjectRef[Block[T]]]):
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from typing import TypeVar, Iterable, Any, Union
|
||||
|
||||
import ray
|
||||
from ray.experimental.data.impl.block import Block, BlockMetadata, ObjectRef
|
||||
from ray.experimental.data.block import Block, BlockMetadata, ObjectRef
|
||||
from ray.experimental.data.impl.block_list import BlockList
|
||||
from ray.experimental.data.impl.progress_bar import ProgressBar
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from typing import Callable, List
|
||||
|
||||
from ray.experimental.data.impl.block import Block, BlockMetadata, ObjectRef, T
|
||||
from ray.experimental.data.block import Block, BlockMetadata, ObjectRef, T
|
||||
from ray.experimental.data.impl.block_list import BlockList
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from typing import List
|
||||
|
||||
import ray
|
||||
from ray.experimental.data.impl.block import ObjectRef
|
||||
from ray.experimental.data.block import ObjectRef
|
||||
|
||||
try:
|
||||
import tqdm
|
||||
|
|
|
@ -2,7 +2,7 @@ import math
|
|||
from typing import TypeVar, List
|
||||
|
||||
import ray
|
||||
from ray.experimental.data.impl.block import Block, BlockMetadata
|
||||
from ray.experimental.data.block import Block, BlockMetadata
|
||||
from ray.experimental.data.impl.progress_bar import ProgressBar
|
||||
from ray.experimental.data.impl.block_list import BlockList
|
||||
from ray.experimental.data.impl.arrow_block import DelegatingArrowBlockBuilder
|
||||
|
|
|
@ -13,13 +13,13 @@ if TYPE_CHECKING:
|
|||
import pyspark
|
||||
|
||||
import ray
|
||||
from ray.experimental.data.block import ObjectRef, Block, BlockMetadata
|
||||
from ray.experimental.data.dataset import Dataset
|
||||
from ray.experimental.data.datasource import Datasource, RangeDatasource, \
|
||||
JSONDatasource, CSVDatasource, ReadTask
|
||||
from ray.experimental.data.impl import reader as _reader
|
||||
from ray.experimental.data.impl.arrow_block import ArrowBlock, ArrowRow
|
||||
from ray.experimental.data.impl.block import ObjectRef, SimpleBlock, Block, \
|
||||
BlockMetadata
|
||||
from ray.experimental.data.impl.block_builder import SimpleBlock
|
||||
from ray.experimental.data.impl.block_list import BlockList
|
||||
from ray.experimental.data.impl.lazy_block_list import LazyBlockList
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@ import ray
|
|||
from ray.util.dask import ray_dask_get
|
||||
from ray.tests.conftest import * # noqa
|
||||
from ray.experimental.data.datasource import DummyOutputDatasource
|
||||
from ray.experimental.data.impl.block import Block
|
||||
from ray.experimental.data.block import Block
|
||||
import ray.experimental.data.tests.util as util
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue