# flake8: noqa
# fmt: off
# __creating_datasets_import_begin__
import ray
# __creating_datasets_import_end__
# fmt: on
# fmt: off
# __gen_synth_int_range_begin__
# Create a Dataset of Python objects.
ds = ray.data.range(10000)
# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)
ds.take(5)
# -> [0, 1, 2, 3, 4]
# __gen_synth_int_range_end__
# fmt: on
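# As an aside (not part of the snippet above): the block count can usually be tuned at
# creation time via the parallelism argument, assuming the installed Ray Data version
# exposes it on ray.data.range.
ds = ray.data.range(10000, parallelism=50)
# -> Dataset(num_blocks=50, num_rows=10000, schema=<class 'int'>)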
# fmt: off
# __gen_synth_tabular_range_begin__
# Create a Dataset of Arrow records.
ds = ray.data.range_table(10000)
# -> Dataset(num_blocks=200, num_rows=10000, schema={value: int64})
ds.take(5)
# -> [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}, {'value': 4}]
# __gen_synth_tabular_range_end__
# fmt: on
# fmt: off
# __gen_synth_tensor_range_begin__
# Create a Dataset of tensors.
ds = ray.data.range_tensor(100 * 64 * 64, shape=(64, 64))
# -> Dataset(
#        num_blocks=200,
#        num_rows=409600,
#        schema={value: <ArrowTensorType: shape=(64, 64), dtype=int64>}
#    )
ds.take(2)
# -> [array([[0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            ...,
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0]]),
#     array([[1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            ...,
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1]])]
# __gen_synth_tensor_range_end__
# fmt: on
# fmt: off
# __from_items_begin__
# Create a Dataset of tabular (Arrow) records.
ds = ray.data.from_items([{"col1": i, "col2": str(i)} for i in range(10000)])
# -> Dataset(num_blocks=200, num_rows=10000, schema={col1: int64, col2: string})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_items_end__
# fmt: on
# fmt: off
# __from_pandas_begin__
import pandas as pd
# Create a tabular Dataset from a Pandas DataFrame.
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_pandas(df)
# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: object})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_pandas_end__
# fmt: on
# fmt: off
# __from_pandas_mult_begin__
import pandas as pd
data = list(range(10000))
num_chunks = 10
chunk_size = len(data) // num_chunks
chunks = [data[i : i + chunk_size] for i in range(0, len(data), chunk_size)]
dfs = [
    pd.DataFrame({"col1": list(chunk), "col2": list(map(str, chunk))})
    for chunk in chunks
]
# Create a tabular Dataset from multiple Pandas DataFrames.
ds = ray.data.from_pandas(dfs)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_pandas_mult_end__
# fmt: on
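# Sketch of the reverse conversion, assuming the full Dataset fits in driver memory:
# Dataset.to_pandas() gathers the blocks back into a single Pandas DataFrame.
df_out = ds.to_pandas()
assert len(df_out) == 10000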
# fmt: off
# __from_numpy_begin__
import numpy as np
# Create a tensor Dataset from a 1D NumPy ndarray.
arr = np.arange(100)
ds = ray.data.from_numpy(arr)
# -> Dataset(
#        num_blocks=1,
#        num_rows=100,
#        schema={value: <ArrowTensorType: shape=(), dtype=int64>},
#    )
# Each element is a scalar ndarray.
ds.show(3)
# -> {'value': array(0)}
# -> {'value': array(1)}
# -> {'value': array(2)}
# Create a tensor Dataset from a 3D NumPy ndarray.
arr = np.ones((3, 4, 4))
# The outer dimension is treated as the row dimension.
ds = ray.data.from_numpy(arr)
# -> Dataset(
#        num_blocks=1,
#        num_rows=3,
#        schema={value: <ArrowTensorType: shape=(4, 4), dtype=double>},
#    )
ds.show(2)
# -> {'value': array([[1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.]])}
# -> {'value': array([[1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.]])}
# __from_numpy_end__
# fmt: on
# fmt: off
# __from_numpy_mult_begin__
import numpy as np
# Create a tensor Dataset from multiple 3D NumPy ndarrays.
arrs = [np.random.rand(2, 4, 4) for _ in range(4)]
# The outer dimension is treated as the row dimension.
ds = ray.data.from_numpy(arrs)
# -> Dataset(
#        num_blocks=4,
#        num_rows=8,
#        schema={value: <ArrowTensorType: shape=(4, 4), dtype=double>},
#    )
ds.show(2)
# -> {'value': array([[0.06587483, 0.67808656, 0.76461924, 0.83428549],
#                     [0.04932103, 0.25112165, 0.26476714, 0.24599738],
#                     [0.67624391, 0.58689537, 0.12594709, 0.94663371],
#                     [0.32435665, 0.97719096, 0.03234169, 0.71563231]])}
# -> {'value': array([[0.98570318, 0.65956399, 0.82168898, 0.09798336],
#                     [0.22426704, 0.34209978, 0.02605247, 0.48200137],
#                     [0.17312096, 0.38789983, 0.42663678, 0.92652456],
#                     [0.80787394, 0.92437162, 0.11185822, 0.3319638 ]])}
# __from_numpy_mult_end__
# fmt: on
# fmt: off
# __from_arrow_begin__
import pyarrow as pa
# Create a tabular Dataset from an Arrow Table.
t = pa.table({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_arrow(t)
# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: string})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_arrow_end__
# fmt: on
# fmt: off
# __from_arrow_mult_begin__
import pyarrow as pa
data = list(range(10000))
num_chunks = 10
chunk_size = len(data) // num_chunks
chunks = [data[i : i + chunk_size] for i in range(0, len(data), chunk_size)]
ts = [
    pa.table({"col1": list(chunk), "col2": list(map(str, chunk))})
    for chunk in chunks
]
# Create a tabular Dataset from multiple Arrow Tables.
ds = ray.data.from_arrow(ts)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_arrow_mult_end__
# fmt: on
# fmt: off
# __from_dask_begin__
import pandas as pd
import dask.dataframe as dd
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ddf = dd.from_pandas(df, npartitions=4)
# Create a tabular Dataset from a Dask DataFrame.
ds = ray.data.from_dask(ddf)
# -> Dataset(num_blocks=4, num_rows=10000, schema={col1: int64, col2: object})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_dask_end__
# fmt: on
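# The conversion also works in the other direction; as a sketch, Dataset.to_dask()
# returns a lazy Dask DataFrame backed by the Dataset's blocks.
ddf_out = ds.to_dask()
# -> a Dask DataFrame holding the same 10000 rows.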
# fmt: off
# __from_spark_begin__
import raydp
spark = raydp.init_spark(
    app_name="Spark -> Datasets Example",
    num_executors=2,
    executor_cores=2,
    executor_memory="500MB",
)
df = spark.createDataFrame([(i, str(i)) for i in range(10000)], ["col1", "col2"])
# Create a tabular Dataset from a Spark DataFrame.
ds = ray.data.from_spark(df)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_spark_end__
# fmt: on
# fmt: off
# __from_modin_begin__
import pandas as pd
import modin.pandas as md
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
mdf = md.DataFrame(df)
# Create a tabular Dataset from a Modin DataFrame.
ds = ray.data.from_modin(mdf)
# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_modin_end__
# fmt: on
# fmt: off
# __from_mars_begin__
import mars
import mars.dataframe as md
import pandas as pd
cluster = mars.new_cluster_in_ray(worker_num=2, worker_cpu=1)
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
mdf = md.DataFrame(df, num_partitions=8)
# Create a tabular Dataset from a Mars DataFrame.
ds = ray.data.from_mars(mdf)
# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})
ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_mars_end__
# fmt: on
# fmt: off
# __read_parquet_begin__
# Create a tabular Dataset by reading a Parquet file.
ds = ray.data.read_parquet("example://iris.parquet")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        }
#    )
ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_parquet_end__
# fmt: on
# fmt: off
# __read_parquet_pushdown_begin__
import pyarrow as pa
# Create a tabular Dataset by reading a Parquet file, pushing column selection and row
# filtering down to the file scan.
ds = ray.data.read_parquet(
    "example://iris.parquet",
    columns=["sepal.length", "variety"],
    filter=pa.dataset.field("sepal.length") > 5.0,
).fully_executed()  # Force a full read of the file.
# -> Dataset(num_blocks=1, num_rows=118, schema={sepal.length: double, variety: string})
ds.show(2)
# -> {'sepal.length': 5.1, 'variety': 'Setosa'}
# -> {'sepal.length': 5.4, 'variety': 'Setosa'}
# __read_parquet_pushdown_end__
# fmt: on
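# For contrast, a sketch of the same selection applied after the read with
# Dataset.filter(): this scans the whole file first instead of pushing the predicate
# down into the Parquet scan.
ds = ray.data.read_parquet("example://iris.parquet")
ds = ds.filter(lambda row: row["sepal.length"] > 5.0)
# -> same 118 matching rows, but without scan-level pushdown.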
# fmt: off
# __read_csv_begin__
# Create a tabular Dataset by reading a CSV file.
ds = ray.data.read_csv("example://iris.csv")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        }
#    )
ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_csv_end__
# fmt: on
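# As a sketch (reusing the bundled example file twice), the file-based readers such as
# read_csv also accept a list of paths and produce a single Dataset over all of them.
ds = ray.data.read_csv(["example://iris.csv", "example://iris.csv"])
# -> 300 rows: the same 150-row file read twice, typically one block per file.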
# fmt: off
# __read_json_begin__
# Create a tabular Dataset by reading a JSON file.
ds = ray.data.read_json("example://iris.json")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        }
#    )
ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_json_end__
# fmt: on
# fmt: off
# __read_numpy_begin__
# Create a tensor Dataset by reading a NumPy file.
ds = ray.data.read_numpy("example://mnist_subset.npy")
# -> Dataset(
#        num_blocks=1,
#        num_rows=3,
#        schema={__RAY_TC__: <ArrowTensorType: shape=(28, 28), dtype=uint8>},
#    )
ds.show(2)
# -> [array([[0, ...]]), array([[0, ...]])]
# __read_numpy_end__
# fmt: on
# fmt: off
# __read_text_begin__
# Create a Dataset of strings by reading a text file, one row per line.
ds = ray.data.read_text("example://sms_spam_collection_subset.txt")
# -> Dataset(num_blocks=1, num_rows=10, schema=<class 'str'>)
ds.show(3)
# -> ham Go until jurong point, crazy.. Available only in bugis n great world la e
#        buffet... Cine there got amore wat...
#    ham Ok lar... Joking wif u oni...
#    spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA
#        to 87121 to receive entry question(std txt rate)T&C's apply
#        08452810075over18's
# __read_text_end__
# fmt: on
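# Each row is a plain string, so further parsing is an ordinary Dataset transform; as a
# sketch, split each SMS line into a [label, message] pair.
ds = ds.map(lambda line: line.split(maxsplit=1))
# -> each row is now a ['<label>', '<message>'] list.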
# fmt: off
# __read_binary_begin__
from io import BytesIO
import numpy as np
from PIL import Image
# Create a Dataset of raw bytes by reading a binary (PNG) file.
ds = ray.data.read_binary_files("example://mnist_subset_partitioned/0/1.png")
# -> Dataset(num_blocks=1, num_rows=1, schema=<class 'bytes'>)
# Decode the bytes into a 28x28 grayscale image tensor.
ds = ds.map(lambda bytes_: np.asarray(Image.open(BytesIO(bytes_)).convert("L")))
# -> Dataset(
#        num_blocks=1,
#        num_rows=1,
#        schema={__RAY_TC__: <ArrowTensorType: shape=(28, 28), dtype=uint8>},
#    )
ds.show(1)
# -> array([[0, 0, 0, ..., 0, 0, 0],
#           ...,
#           [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)
# __read_binary_end__
# fmt: on
# fmt: off
# __read_parquet_s3_begin__
# Create a tabular Dataset by reading a Parquet file from S3.
ds = ray.data.read_parquet("s3://ursa-labs-taxi-data/2009/01/data.parquet")
# -> Dataset(
#        num_blocks=1,
#        num_rows=14092413,
#        schema={
#            vendor_id: string,
#            pickup_at: timestamp[us],
#            dropoff_at: timestamp[us],
#            passenger_count: int8,
#            trip_distance: float,
#            pickup_longitude: float,
#            pickup_latitude: float,
#            ...,
#        },
#    )
ds.show(2)
# -> {
#        'vendor_id': 'VTS',
#        'pickup_at': datetime.datetime(2009, 1, 4, 2, 52),
#        'dropoff_at': datetime.datetime(2009, 1, 4, 3, 2),
#        'passenger_count': 1,
#        'trip_distance': 2.630000114440918,
#        'pickup_longitude': -73.99195861816406,
#        'pickup_latitude': 40.72156524658203,
#        ...,
#    }
# -> {
#        'vendor_id': 'VTS',
#        'pickup_at': datetime.datetime(2009, 1, 4, 3, 31),
#        'dropoff_at': datetime.datetime(2009, 1, 4, 3, 38),
#        'passenger_count': 3,
#        'trip_distance': 4.550000190734863,
#        'pickup_longitude': -73.98210144042969,
#        'pickup_latitude': 40.736289978027344,
#        ...,
#    }
# __read_parquet_s3_end__
# fmt: on
# fmt: off
# __read_parquet_s3_with_fs_begin__
import pyarrow as pa
# Create a tabular Dataset by reading a Parquet file from a private S3 bucket.
# NOTE: This example is not runnable as-is; add in a path to your private bucket and the
# required S3 credentials!
ds = ray.data.read_parquet(
    "s3://some/private/bucket",
    filesystem=pa.fs.S3FileSystem(
        region="us-west-2",
        access_key="XXXX",
        secret_key="XXXX",
    ),
)
# __read_parquet_s3_with_fs_end__
# fmt: on
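# A sketch of the anonymous-access variant (bucket path is a placeholder; not runnable
# as-is): PyArrow's S3FileSystem accepts anonymous=True in place of credentials for
# publicly readable buckets.
ds = ray.data.read_parquet(
    "s3://some/public/bucket",
    filesystem=pa.fs.S3FileSystem(region="us-west-2", anonymous=True),
)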
# fmt: off
# __read_parquet_hdfs_begin__
# Create a tabular Dataset by reading a Parquet file from HDFS using HDFS connection
# automatically constructed based on the URI.
# NOTE: This example is not runnable as-is; you'll need to point it at your HDFS
# cluster/data.
ds = ray.data.read_parquet("hdfs://<host:port>/path/to/file.parquet")
# __read_parquet_hdfs_end__
# fmt: on
# TODO(Clark): Find clean way to start local HDFS cluster in the below example (that
# works in CI).
# fmt: off
# __read_parquet_hdfs_with_fs_begin__
import pyarrow as pa
# Create a tabular Dataset by reading a Parquet file from HDFS, manually specifying a
# configured HDFS connection via a PyArrow HadoopFileSystem instance.
# NOTE: This example is not runnable as-is; you'll need to point it at your HDFS
# cluster/data.
ds = ray.data.read_parquet(
    "hdfs://path/to/file.parquet",
    filesystem=pa.fs.HadoopFileSystem(host="localhost", port=9000, user="bob"),
)
# __read_parquet_hdfs_with_fs_end__
# fmt: on
# TODO(Clark): Find open data for below GCS example.
# fmt: off
# __read_parquet_gcs_begin__
import gcsfs
# Create a tabular Dataset by reading a Parquet file from GCS, passing the configured
# GCSFileSystem.
# NOTE: This example is not runnable as-is; you'll need to point it at your GCS bucket
# and configure your GCP project and credentials.
ds = ray.data.read_parquet(
    "gs://path/to/file.parquet",
    filesystem=gcsfs.GCSFileSystem(project="my-google-project"),
)
# __read_parquet_gcs_end__
# fmt: on
# fmt: off
# __read_parquet_az_begin__
import adlfs
# Create a tabular Dataset by reading a Parquet file from Azure Blob Storage, passing
# the configured AzureBlobFileSystem.
path = (
    "az://nyctlc/yellow/puYear=2009/puMonth=1/"
    "part-00019-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426333-4"
    ".c000.snappy.parquet"
)
ds = ray.data.read_parquet(
    path,
    filesystem=adlfs.AzureBlobFileSystem(account_name="azureopendatastorage"),
)
# __read_parquet_az_end__
# fmt: on