# flake8: noqa
# fmt: off
# __creating_datasets_import_begin__
import ray
# __creating_datasets_import_end__
# fmt: on
# fmt: off
# __gen_synth_int_range_begin__
# Create a Dataset of Python objects.
ds = ray.data.range(10000)
# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)
ds.take(5)
# -> [0, 1, 2, 3, 4]
# __gen_synth_int_range_end__
# fmt: on
# fmt: off
# __gen_synth_arrow_range_begin__
# Create a Dataset of Arrow records.
ds = ray.data.range_arrow(10000)
# -> Dataset(num_blocks=200, num_rows=10000, schema={value: int64})
ds.take(5)
# -> [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}, {'value': 4}]
# __gen_synth_arrow_range_end__
# fmt: on
# fmt: off
# __gen_synth_tensor_range_begin__
# Create a Dataset of tensors.
ds = ray.data.range_tensor(100 * 64 * 64, shape=(64, 64))
# -> Dataset(
# num_blocks=200,
# num_rows=409600,
# schema={value: <ArrowTensorType: shape=(64, 64), dtype=int64>}
# )
ds.take(2)
# -> [array([[0, 0, 0, ..., 0, 0, 0],
# [0, 0, 0, ..., 0, 0, 0],
# [0, 0, 0, ..., 0, 0, 0],
# ...,
# [0, 0, 0, ..., 0, 0, 0],
# [0, 0, 0, ..., 0, 0, 0],
# [0, 0, 0, ..., 0, 0, 0]]),
# array([[1, 1, 1, ..., 1, 1, 1],
# [1, 1, 1, ..., 1, 1, 1],
# [1, 1, 1, ..., 1, 1, 1],
# ...,
# [1, 1, 1, ..., 1, 1, 1],
# [1, 1, 1, ..., 1, 1, 1],
# [1, 1, 1, ..., 1, 1, 1]])]
# __gen_synth_tensor_range_end__
# fmt: on
# fmt: off
# __from_items_begin__
# Create a Dataset of tabular (Arrow) records.
ds = ray.data.from_items([{"col1": i, "col2": str(i)} for i in range(10000)])
# -> Dataset(num_blocks=200, num_rows=10000, schema={col1: int64, col2: string})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_items_end__
# fmt: on
# fmt: off
# __from_pandas_begin__
import pandas as pd
# Create a tabular Dataset from a Pandas DataFrame.
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_pandas(df)
# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: object})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_pandas_end__
# fmt: on
# fmt: off
# __from_pandas_mult_begin__
import pandas as pd
data = list(range(10000))
num_chunks = 10
chunk_size = len(data) // num_chunks
chunks = [data[i : i + chunk_size] for i in range(0, len(data), chunk_size)]
dfs = [
pd.DataFrame({"col1": list(chunk), "col2": list(map(str, chunk))})
for chunk in chunks
]
# Create a tabular Dataset from multiple Pandas DataFrames.
ds = ray.data.from_pandas(dfs)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_pandas_mult_end__
# fmt: on
# fmt: off
# __from_numpy_begin__
import numpy as np
# Create a tensor Dataset from a 1D NumPy ndarray.
arr = np.arange(100)
ds = ray.data.from_numpy(arr)
# -> Dataset(
# num_blocks=1,
# num_rows=100,
# schema={value: <ArrowTensorType: shape=(), dtype=int64>},
# )
# Each element is a scalar ndarray.
ds.show(3)
# -> {'value': array(0)}
# -> {'value': array(1)}
# -> {'value': array(2)}
# Create a tensor Dataset from a 3D NumPy ndarray.
arr = np.ones((3, 4, 4))
# The outer dimension is treated as the row dimension.
ds = ray.data.from_numpy(arr)
# -> Dataset(
#     num_blocks=1,
#     num_rows=3,
#     schema={value: <ArrowTensorType: shape=(4, 4), dtype=double>},
# )
ds.show(2)
# -> {'value': array([[1., 1., 1., 1.],
# [1., 1., 1., 1.],
# [1., 1., 1., 1.],
# [1., 1., 1., 1.]])}
# -> {'value': array([[1., 1., 1., 1.],
# [1., 1., 1., 1.],
# [1., 1., 1., 1.],
# [1., 1., 1., 1.]])}
# __from_numpy_end__
# fmt: on
# fmt: off
# __from_numpy_mult_begin__
import numpy as np
# Create a tensor Dataset from multiple 3D NumPy ndarrays.
arrs = [np.random.rand(2, 4, 4) for _ in range(4)]
# The outer dimension is treated as the row dimension.
ds = ray.data.from_numpy(arrs)
# -> Dataset(
#     num_blocks=4,
#     num_rows=8,
#     schema={value: <ArrowTensorType: shape=(4, 4), dtype=double>},
# )
ds.show(2)
# -> {'value': array([[0.06587483, 0.67808656, 0.76461924, 0.83428549],
# [0.04932103, 0.25112165, 0.26476714, 0.24599738],
# [0.67624391, 0.58689537, 0.12594709, 0.94663371],
# [0.32435665, 0.97719096, 0.03234169, 0.71563231]])}
# -> {'value': array([[0.98570318, 0.65956399, 0.82168898, 0.09798336],
# [0.22426704, 0.34209978, 0.02605247, 0.48200137],
# [0.17312096, 0.38789983, 0.42663678, 0.92652456],
# [0.80787394, 0.92437162, 0.11185822, 0.3319638 ]])}
# __from_numpy_mult_end__
# fmt: on
# fmt: off
# __from_arrow_begin__
import pyarrow as pa
# Create a tabular Dataset from an Arrow Table.
t = pa.table({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_arrow(t)
# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: string})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_arrow_end__
# fmt: on
# fmt: off
# __from_arrow_mult_begin__
import pyarrow as pa
data = list(range(10000))
num_chunks = 10
chunk_size = len(data) // num_chunks
chunks = [data[i : i + chunk_size] for i in range(0, len(data), chunk_size)]
ts = [
pa.table({"col1": list(chunk), "col2": list(map(str, chunk))})
for chunk in chunks
]
# Create a tabular Dataset from multiple Arrow Tables.
ds = ray.data.from_arrow(ts)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_arrow_mult_end__
# fmt: on
# fmt: off
# __from_dask_begin__
import pandas as pd
import dask.dataframe as dd
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ddf = dd.from_pandas(df, npartitions=4)
# Create a tabular Dataset from a Dask DataFrame.
ds = ray.data.from_dask(ddf)
# -> Dataset(num_blocks=4, num_rows=10000, schema={col1: int64, col2: object})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_dask_end__
# fmt: on
# fmt: off
# __from_spark_begin__
import raydp
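# Start a Spark cluster on Ray via RayDP (Spark on Ray).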
spark = raydp.init_spark(
    app_name="Spark -> Datasets Example",
    num_executors=2,
    executor_cores=2,
    executor_memory="500MB",
)
df = spark.createDataFrame([(i, str(i)) for i in range(10000)], ["col1", "col2"])
# Create a tabular Dataset from a Spark DataFrame.
ds = ray.data.from_spark(df)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_spark_end__
# fmt: on
# fmt: off
# __from_modin_begin__
import modin.pandas as md
import pandas as pd
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
mdf = md.DataFrame(df)
# Create a tabular Dataset from a Modin DataFrame.
ds = ray.data.from_modin(mdf)
# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_modin_end__
# fmt: on
# fmt: off
# __from_mars_begin__
import mars
import mars.dataframe as md
import pandas as pd
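# Start a Mars cluster on Ray with two workers, each with one CPU.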
cluster = mars.new_cluster_in_ray(worker_num=2, worker_cpu=1)
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
mdf = md.DataFrame(df, num_partitions=8)
# Create a tabular Dataset from a Mars DataFrame.
ds = ray.data.from_mars(mdf)
# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_mars_end__
# fmt: on
# fmt: off
# __read_parquet_begin__
# Create a tabular Dataset by reading a Parquet file.
ds = ray.data.read_parquet("example://iris.parquet")
# -> Dataset(
# num_blocks=1,
# num_rows=150,
# schema={
# sepal.length: double,
# sepal.width: double,
# petal.length: double,
# petal.width: double,
# variety: string,
# }
# )
ds.show(2)
# -> {
# 'sepal.length': 5.1,
# 'sepal.width': 3.5,
# 'petal.length': 1.4,
# 'petal.width': 0.2,
# 'variety': 'Setosa',
# }
# -> {
# 'sepal.length': 4.9,
# 'sepal.width': 3.0,
# 'petal.length': 1.4,
# 'petal.width': 0.2,
# 'variety': 'Setosa',
# }
# __read_parquet_end__
# fmt: on
# fmt: off
# __read_parquet_pushdown_begin__
import pyarrow as pa
import pyarrow.dataset
# Create a tabular Dataset by reading a Parquet file, pushing column selection and row
# filtering down to the file scan.
ds = ray.data.read_parquet(
"example://iris.parquet",
columns=["sepal.length", "variety"],
filter=pa.dataset.field("sepal.length") > 5.0,
).fully_executed() # Force a full read of the file.
# -> Dataset(num_blocks=1, num_rows=118, schema={sepal.length: double, variety: string})
ds.show(2)
# -> {'sepal.length': 5.1, 'variety': 'Setosa'}
# {'sepal.length': 5.4, 'variety': 'Setosa'}
# __read_parquet_pushdown_end__
# fmt: on
# fmt: off
# __read_csv_begin__
# Create a tabular Dataset by reading a CSV file.
ds = ray.data.read_csv("example://iris.csv")
# -> Dataset(
# num_blocks=1,
# num_rows=150,
# schema={
# sepal.length: double,
# sepal.width: double,
# petal.length: double,
# petal.width: double,
# variety: string,
# }
# )
ds.show(2)
# -> {
# 'sepal.length': 5.1,
# 'sepal.width': 3.5,
# 'petal.length': 1.4,
# 'petal.width': 0.2,
# 'variety': 'Setosa',
# }
# -> {
# 'sepal.length': 4.9,
# 'sepal.width': 3.0,
# 'petal.length': 1.4,
# 'petal.width': 0.2,
# 'variety': 'Setosa',
# }
# __read_csv_end__
# fmt: on
# fmt: off
# __read_json_begin__
# Create a tabular Dataset by reading a JSON file.
ds = ray.data.read_json("example://iris.json")
# -> Dataset(
# num_blocks=1,
# num_rows=150,
# schema={
# sepal.length: double,
# sepal.width: double,
# petal.length: double,
# petal.width: double,
# variety: string,
# }
# )
ds.show(2)
# -> {
# 'sepal.length': 5.1,
# 'sepal.width': 3.5,
# 'petal.length': 1.4,
# 'petal.width': 0.2,
# 'variety': 'Setosa',
# }
# -> {
# 'sepal.length': 4.9,
# 'sepal.width': 3.0,
# 'petal.length': 1.4,
# 'petal.width': 0.2,
# 'variety': 'Setosa',
# }
# __read_json_end__
# fmt: on
# fmt: off
# __read_numpy_begin__
# Create a tensor Dataset by reading a NumPy file.
ds = ray.data.read_numpy("example://mnist_subset.npy")
# -> Dataset(
# num_blocks=1,
# num_rows=3,
# schema={__RAY_TC__: <ArrowTensorType: shape=(28, 28), dtype=uint8>},
# )
ds.show(2)
# [array([[0, ...]]), array([[0, ...]])]
# __read_numpy_end__
# fmt: on
# fmt: off
# __read_text_begin__
# Create a Dataset of text lines by reading a text file.
ds = ray.data.read_text("example://sms_spam_collection_subset.txt")
# -> Dataset(num_blocks=1, num_rows=10, schema=<class 'str'>)
ds.show(3)
# -> ham Go until jurong point, crazy.. Available only in bugis n great world la e
# buffet... Cine there got amore wat...
# ham Ok lar... Joking wif u oni...
# spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA
# to 87121 to receive entry question(std txt rate)T&C's apply
# 08452810075over18's
# __read_text_end__
# fmt: on
# fmt: off
# __read_binary_begin__
from io import BytesIO
import numpy as np
import PIL.Image
# Create a Dataset of raw bytes by reading a binary (image) file.
ds = ray.data.read_binary_files("example://mnist_subset_partitioned/0/1.png")
# -> Dataset(num_blocks=1, num_rows=1, schema=<class 'bytes'>)
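# Decode the PNG bytes into a 28x28 uint8 grayscale image ndarray.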
ds = ds.map(lambda bytes_: np.asarray(PIL.Image.open(BytesIO(bytes_)).convert("L")))
# -> Dataset(
# num_blocks=1,
# num_rows=1,
# schema={__RAY_TC__: <ArrowTensorType: shape=(28, 28), dtype=uint8>},
# )
ds.show()
# -> {'__RAY_TC__': array([[0, 0, 0, ..., 0, 0, 0],
#                          [0, 0, 0, ..., 0, 0, 0],
#                          ...,
#                          [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}
# __read_binary_end__
# fmt: on
# fmt: off
# __read_parquet_s3_begin__
# Create a tabular Dataset by reading a Parquet file from S3.
ds = ray.data.read_parquet("s3://ursa-labs-taxi-data/2009/01/data.parquet")
# -> Dataset(
# num_blocks=1,
# num_rows=14092413,
# schema={
# vendor_id: string,
# pickup_at: timestamp[us],
# dropoff_at: timestamp[us],
# passenger_count: int8,
# trip_distance: float,
# pickup_longitude: float,
# pickup_latitude: float,
# ...,
# },
# )
ds.show(2)
# -> {
# 'vendor_id': 'VTS',
# 'pickup_at': datetime.datetime(2009, 1, 4, 2, 52),
# 'dropoff_at': datetime.datetime(2009, 1, 4, 3, 2),
# 'passenger_count': 1,
# 'trip_distance': 2.630000114440918,
# 'pickup_longitude': -73.99195861816406,
# 'pickup_latitude': 40.72156524658203,
# ...,
# }
# {
# 'vendor_id': 'VTS',
# 'pickup_at': datetime.datetime(2009, 1, 4, 3, 31),
# 'dropoff_at': datetime.datetime(2009, 1, 4, 3, 38),
# 'passenger_count': 3,
# 'trip_distance': 4.550000190734863,
# 'pickup_longitude': -73.98210144042969,
# 'pickup_latitude': 40.736289978027344,
# ...,
# }
# __read_parquet_s3_end__
# fmt: on
# fmt: off
# __read_parquet_s3_with_fs_begin__
import pyarrow as pa
import pyarrow.fs
# Create a tabular Dataset by reading a Parquet file from a private S3 bucket.
# NOTE: This example is not runnable as-is; add in a path to your private bucket and the
# required S3 credentials!
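# If you omit the explicit keys, S3FileSystem falls back to the standard AWS
# credential chain (environment variables, config files, or instance metadata).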
ds = ray.data.read_parquet(
"s3://some/private/bucket",
filesystem=pa.fs.S3FileSystem(
region="us-west-2",
access_key="XXXX",
secret_key="XXXX",
),
)
# __read_parquet_s3_with_fs_end__
# fmt: on
# fmt: off
# __read_parquet_hdfs_begin__
# Create a tabular Dataset by reading a Parquet file from HDFS using HDFS connection
# automatically constructed based on the URI.
# NOTE: This example is not runnable as-is; you'll need to point it at your HDFS
# cluster/data.
ds = ray.data.read_parquet("hdfs://<host:port>/path/to/file.parquet")
# __read_parquet_hdfs_end__
# fmt: on
# TODO(Clark): Find clean way to start local HDFS cluster in the below example (that
# works in CI).
# fmt: off
# __read_parquet_hdfs_with_fs_begin__
import pyarrow as pa
import pyarrow.fs
# Create a tabular Dataset by reading a Parquet file from HDFS, manually specifying a
# configured HDFS connection via a PyArrow HadoopFileSystem instance.
# NOTE: This example is not runnable as-is; you'll need to point it at your HDFS
# cluster/data.
ds = ray.data.read_parquet(
    "hdfs://path/to/file.parquet",
    filesystem=pa.fs.HadoopFileSystem(host="localhost", port=9000, user="bob"),
)
# __read_parquet_hdfs_with_fs_end__
# fmt: on
# TODO(Clark): Find open data for below GCS example.
# fmt: off
# __read_parquet_gcs_begin__
import gcsfs
# Create a tabular Dataset by reading a Parquet file from GCS, passing the configured
# GCSFileSystem.
# NOTE: This example is not runnable as-is; you'll need to point it at your GCS bucket
# and configure your GCP project and credentials.
ds = ray.data.read_parquet(
"gs://path/to/file.parquet",
filesystem=gcsfs.GCSFileSystem(project="my-google-project"),
)
# __read_parquet_gcs_end__
# fmt: on
# fmt: off
# __read_parquet_az_begin__
import adlfs
# Create a tabular Dataset by reading a Parquet file from Azure Blob Storage, passing
# the configured AzureBlobFileSystem.
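# The "azureopendatastorage" account hosts the public Azure Open Datasets; this path
# points at the NYC Taxi & Limousine Commission (nyctlc) data.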
path = (
"az://nyctlc/yellow/puYear=2009/puMonth=1/"
"part-00019-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426333-4"
".c000.snappy.parquet"
)
ds = ray.data.read_parquet(
path,
filesystem=adlfs.AzureBlobFileSystem(account_name="azureopendatastorage")
)
# __read_parquet_az_end__
# fmt: on