[data] Move Block to public API so that datasource API doesn't reference a private interface (#17098)

This commit is contained in:
Eric Liang 2021-07-14 18:59:13 -07:00 committed by GitHub
parent 673d6822c7
commit 85725f2018
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 104 additions and 91 deletions

View file

@ -1,32 +1,16 @@
import sys
from typing import TypeVar, List, Generic, Iterator, Any, Union, Optional, \
TYPE_CHECKING
if TYPE_CHECKING:
import pandas
import pyarrow
from ray.experimental.data.impl.block_builder import BlockBuilder
# TODO(ekl) shouldn't Ray provide an ObjectRef type natively?
ObjectRef = List
T = TypeVar("T")
class BlockBuilder(Generic[T]):
"""A builder class for blocks."""
def add(self, item: T) -> None:
"""Append a single row to the block being built."""
raise NotImplementedError
def add_block(self, block: "Block[T]") -> None:
"""Append an entire block to the block being built."""
raise NotImplementedError
def build(self) -> "Block[T]":
"""Build the block."""
raise NotImplementedError
class BlockMetadata:
"""Metadata about the block.
@ -102,58 +86,6 @@ class Block(Generic[T]):
input_files=input_files)
@staticmethod
def builder() -> BlockBuilder[T]:
def builder() -> "BlockBuilder[T]":
"""Create a builder for this block type."""
raise NotImplementedError
class SimpleBlockBuilder(BlockBuilder[T]):
def __init__(self):
self._items = []
def add(self, item: T) -> None:
self._items.append(item)
def add_block(self, block: "SimpleBlock[T]") -> None:
self._items.extend(block._items)
def build(self) -> "SimpleBlock[T]":
return SimpleBlock(self._items)
class SimpleBlock(Block):
def __init__(self, items):
self._items = items
def num_rows(self) -> int:
return len(self._items)
def iter_rows(self) -> Iterator[T]:
return iter(self._items)
def slice(self, start: int, end: int, copy: bool) -> "SimpleBlock[T]":
view = self._items[start:end]
if copy:
view = view.copy()
return SimpleBlock(view)
def to_pandas(self) -> "pandas.DataFrame":
import pandas
return pandas.DataFrame(self._items)
def to_arrow_table(self) -> "pyarrow.Table":
import pyarrow
return pyarrow.Table.from_pandas(self.to_pandas())
def size_bytes(self) -> int:
return sys.getsizeof(self._items)
def schema(self) -> Any:
if self._items:
return type(self._items[0])
else:
return None
@staticmethod
def builder() -> SimpleBlockBuilder[T]:
return SimpleBlockBuilder()

View file

@ -20,13 +20,13 @@ import itertools
import numpy as np
import ray
from ray.experimental.data.block import ObjectRef, Block, BlockMetadata
from ray.experimental.data.datasource import Datasource, WriteTask
from ray.experimental.data.impl.batcher import Batcher
from ray.experimental.data.impl.compute import get_compute
from ray.experimental.data.impl.progress_bar import ProgressBar
from ray.experimental.data.impl.shuffle import simple_shuffle
from ray.experimental.data.impl.block import ObjectRef, Block, SimpleBlock, \
BlockMetadata
from ray.experimental.data.impl.block_builder import SimpleBlock
from ray.experimental.data.impl.block_list import BlockList
from ray.experimental.data.impl.arrow_block import (
DelegatingArrowBlockBuilder, ArrowBlock)
@ -747,7 +747,9 @@ class Dataset(Generic[T]):
write_args: Additional write args to pass to the datasource.
"""
write_tasks = datasource.prepare_write(self._blocks, **write_args)
write_tasks = datasource.prepare_write(self._blocks,
self._blocks.get_metadata(),
**write_args)
progress = ProgressBar("Write Progress", len(write_tasks))
@ray.remote

View file

@ -1,14 +1,13 @@
import builtins
from typing import Any, Generic, List, Callable, Union, TypeVar
from typing import Any, Generic, List, Callable, Union
import numpy as np
import ray
from ray.experimental.data.block import Block, BlockMetadata, ObjectRef, T
from ray.experimental.data.impl.block_builder import SimpleBlock
from ray.experimental.data.impl.arrow_block import ArrowRow, ArrowBlock
from ray.experimental.data.impl.block import Block, SimpleBlock
from ray.experimental.data.impl.block_list import BlockList, BlockMetadata
T = TypeVar("T")
WriteResult = Any
@ -37,13 +36,15 @@ class Datasource(Generic[T]):
"""
raise NotImplementedError
def prepare_write(self, blocks: BlockList,
def prepare_write(self, blocks: List[ObjectRef[Block[T]]],
metadata: List[BlockMetadata],
**write_args) -> List["WriteTask[T]"]:
"""Return the list of tasks needed to perform a write.
Args:
blocks: List of data block references and block metadata. It is
recommended that one write task be generated per block.
blocks: List of data block references. It is recommended that one
write task be generated per block.
metadata: List of block metadata.
write_args: Additional kwargs to pass to the datasource impl.
Returns:
@ -200,7 +201,8 @@ class DummyOutputDatasource(Datasource[Union[ArrowRow, int]]):
self.num_ok = 0
self.num_failed = 0
def prepare_write(self, blocks: BlockList,
def prepare_write(self, blocks: List[ObjectRef[Block[T]]],
metadata: List[BlockMetadata],
**write_args) -> List["WriteTask[T]"]:
tasks = []
for b in blocks:

View file

@ -6,7 +6,8 @@ try:
except ImportError:
pyarrow = None
from ray.experimental.data.impl.block import Block, BlockBuilder, \
from ray.experimental.data.block import Block
from ray.experimental.data.impl.block_builder import BlockBuilder, \
SimpleBlockBuilder
if TYPE_CHECKING:

View file

@ -1,6 +1,6 @@
from typing import Optional
from ray.experimental.data.impl.block import Block
from ray.experimental.data.block import Block
from ray.experimental.data.impl.arrow_block import DelegatingArrowBlockBuilder

View file

@ -0,0 +1,76 @@
import sys
from typing import Iterator, Generic, Any, TYPE_CHECKING
if TYPE_CHECKING:
import pandas
import pyarrow
from ray.experimental.data.block import Block, T
class BlockBuilder(Generic[T]):
"""A builder class for blocks."""
def add(self, item: T) -> None:
"""Append a single row to the block being built."""
raise NotImplementedError
def add_block(self, block: "Block[T]") -> None:
"""Append an entire block to the block being built."""
raise NotImplementedError
def build(self) -> "Block[T]":
"""Build the block."""
raise NotImplementedError
class SimpleBlockBuilder(BlockBuilder[T]):
def __init__(self):
self._items = []
def add(self, item: T) -> None:
self._items.append(item)
def add_block(self, block: "SimpleBlock[T]") -> None:
self._items.extend(block._items)
def build(self) -> "SimpleBlock[T]":
return SimpleBlock(self._items)
class SimpleBlock(Block):
def __init__(self, items):
self._items = items
def num_rows(self) -> int:
return len(self._items)
def iter_rows(self) -> Iterator[T]:
return iter(self._items)
def slice(self, start: int, end: int, copy: bool) -> "SimpleBlock[T]":
view = self._items[start:end]
if copy:
view = view.copy()
return SimpleBlock(view)
def to_pandas(self) -> "pandas.DataFrame":
import pandas
return pandas.DataFrame(self._items)
def to_arrow_table(self) -> "pyarrow.Table":
import pyarrow
return pyarrow.Table.from_pandas(self.to_pandas())
def size_bytes(self) -> int:
return sys.getsizeof(self._items)
def schema(self) -> Any:
if self._items:
return type(self._items[0])
else:
return None
@staticmethod
def builder() -> SimpleBlockBuilder[T]:
return SimpleBlockBuilder()

View file

@ -1,6 +1,6 @@
from typing import Iterable, List
from ray.experimental.data.impl.block import Block, BlockMetadata, ObjectRef, T
from ray.experimental.data.block import Block, BlockMetadata, ObjectRef, T
class BlockList(Iterable[ObjectRef[Block[T]]]):

View file

@ -1,7 +1,7 @@
from typing import TypeVar, Iterable, Any, Union
import ray
from ray.experimental.data.impl.block import Block, BlockMetadata, ObjectRef
from ray.experimental.data.block import Block, BlockMetadata, ObjectRef
from ray.experimental.data.impl.block_list import BlockList
from ray.experimental.data.impl.progress_bar import ProgressBar

View file

@ -1,6 +1,6 @@
from typing import Callable, List
from ray.experimental.data.impl.block import Block, BlockMetadata, ObjectRef, T
from ray.experimental.data.block import Block, BlockMetadata, ObjectRef, T
from ray.experimental.data.impl.block_list import BlockList

View file

@ -1,7 +1,7 @@
from typing import List
import ray
from ray.experimental.data.impl.block import ObjectRef
from ray.experimental.data.block import ObjectRef
try:
import tqdm

View file

@ -2,7 +2,7 @@ import math
from typing import TypeVar, List
import ray
from ray.experimental.data.impl.block import Block, BlockMetadata
from ray.experimental.data.block import Block, BlockMetadata
from ray.experimental.data.impl.progress_bar import ProgressBar
from ray.experimental.data.impl.block_list import BlockList
from ray.experimental.data.impl.arrow_block import DelegatingArrowBlockBuilder

View file

@ -13,13 +13,13 @@ if TYPE_CHECKING:
import pyspark
import ray
from ray.experimental.data.block import ObjectRef, Block, BlockMetadata
from ray.experimental.data.dataset import Dataset
from ray.experimental.data.datasource import Datasource, RangeDatasource, \
JSONDatasource, CSVDatasource, ReadTask
from ray.experimental.data.impl import reader as _reader
from ray.experimental.data.impl.arrow_block import ArrowBlock, ArrowRow
from ray.experimental.data.impl.block import ObjectRef, SimpleBlock, Block, \
BlockMetadata
from ray.experimental.data.impl.block_builder import SimpleBlock
from ray.experimental.data.impl.block_list import BlockList
from ray.experimental.data.impl.lazy_block_list import LazyBlockList

View file

@ -16,7 +16,7 @@ import ray
from ray.util.dask import ray_dask_get
from ray.tests.conftest import * # noqa
from ray.experimental.data.datasource import DummyOutputDatasource
from ray.experimental.data.impl.block import Block
from ray.experimental.data.block import Block
import ray.experimental.data.tests.util as util