# flake8: noqa

# fmt: off
# __creating_datasets_import_begin__
import ray
# __creating_datasets_import_end__
# fmt: on


# fmt: off
# __gen_synth_int_range_begin__
# Create a Dataset of Python objects.
ds = ray.data.range(10000)
# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)

ds.take(5)
# -> [0, 1, 2, 3, 4]
# __gen_synth_int_range_end__
# fmt: on


# fmt: off
# __gen_synth_tabular_range_begin__
# Create a Dataset of Arrow records.
ds = ray.data.range_table(10000)
# -> Dataset(num_blocks=200, num_rows=10000, schema={value: int64})

ds.take(5)
# -> [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}, {'value': 4}]
# __gen_synth_tabular_range_end__
# fmt: on


# fmt: off
# __gen_synth_tensor_range_begin__
# Create a Dataset of tensors.
ds = ray.data.range_tensor(100 * 64 * 64, shape=(64, 64))
# -> Dataset(
#        num_blocks=200,
#        num_rows=409600,
#        schema={value: <ArrowTensorType: shape=(64, 64), dtype=int64>}
#    )

ds.take(2)
# -> [array([[0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            ...,
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0]]),
#     array([[1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            ...,
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1]])]
# __gen_synth_tensor_range_end__
# fmt: on


# fmt: off
# __from_items_begin__
# Create a Dataset of tabular (Arrow) records.
ds = ray.data.from_items([{"col1": i, "col2": str(i)} for i in range(10000)])
# -> Dataset(num_blocks=200, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_items_end__
# fmt: on


# fmt: off
# __from_pandas_begin__
import pandas as pd

# Create a tabular Dataset from a Pandas DataFrame.
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_pandas(df)
# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_pandas_end__
# fmt: on


# fmt: off
# __from_pandas_mult_begin__
import pandas as pd

data = list(range(10000))
num_chunks = 10
chunk_size = len(data) // num_chunks
chunks = [data[i : i + chunk_size] for i in range(0, len(data), chunk_size)]
dfs = [
    pd.DataFrame({"col1": list(chunk), "col2": list(map(str, chunk))})
    for chunk in chunks
]
# Create a tabular Dataset from multiple Pandas DataFrames.
ds = ray.data.from_pandas(dfs)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_pandas_mult_end__
# fmt: on


# fmt: off
# __from_numpy_begin__
import numpy as np

# Create a tensor Dataset from a 1D NumPy ndarray.
arr = np.arange(100)
ds = ray.data.from_numpy(arr)
# -> Dataset(
#        num_blocks=1,
#        num_rows=100,
#        schema={value: <ArrowTensorType: shape=(), dtype=int64>},
#    )

# Each element is a scalar ndarray.
ds.show(3)
# -> {'value': array(0)}
# -> {'value': array(1)}
# -> {'value': array(2)}

# Create a tensor Dataset from a 3D NumPy ndarray.
arr = np.ones((3, 4, 4))
# The outer dimension is treated as the row dimension.
ds = ray.data.from_numpy(arr)
# -> Dataset(
#        num_blocks=1,
#        num_rows=3,
#        schema={value: <ArrowTensorType: shape=(4, 4), dtype=double>},
#    )

ds.show(2)
# -> {'value': array([[1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.]])}
# -> {'value': array([[1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.]])}
# __from_numpy_end__
# fmt: on


# fmt: off
# __from_numpy_mult_begin__
import numpy as np

# Create a tensor Dataset from multiple 3D NumPy ndarrays.
arrs = [np.random.rand(2, 4, 4) for _ in range(4)]
# The outer dimension is treated as the row dimension.
ds = ray.data.from_numpy(arrs)
# -> Dataset(
#        num_blocks=4,
#        num_rows=8,
#        schema={value: <ArrowTensorType: shape=(4, 4), dtype=double>},
#    )

ds.show(2)
# -> {'value': array([[0.06587483, 0.67808656, 0.76461924, 0.83428549],
#                     [0.04932103, 0.25112165, 0.26476714, 0.24599738],
#                     [0.67624391, 0.58689537, 0.12594709, 0.94663371],
#                     [0.32435665, 0.97719096, 0.03234169, 0.71563231]])}
# -> {'value': array([[0.98570318, 0.65956399, 0.82168898, 0.09798336],
#                     [0.22426704, 0.34209978, 0.02605247, 0.48200137],
#                     [0.17312096, 0.38789983, 0.42663678, 0.92652456],
#                     [0.80787394, 0.92437162, 0.11185822, 0.3319638 ]])}
# __from_numpy_mult_end__
# fmt: on


# fmt: off
# __from_arrow_begin__
import pyarrow as pa

# Create a tabular Dataset from an Arrow Table.
t = pa.table({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_arrow(t)
# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_arrow_end__
# fmt: on


# fmt: off
# __from_arrow_mult_begin__
import pyarrow as pa

data = list(range(10000))
num_chunks = 10
chunk_size = len(data) // num_chunks
chunks = [data[i : i + chunk_size] for i in range(0, len(data), chunk_size)]
ts = [
    pa.table({"col1": list(chunk), "col2": list(map(str, chunk))})
    for chunk in chunks
]
# Create a tabular Dataset from multiple Arrow Tables.
ds = ray.data.from_arrow(ts)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_arrow_mult_end__
# fmt: on


# fmt: off
# __from_dask_begin__
import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ddf = dd.from_pandas(df, npartitions=4)
# Create a tabular Dataset from a Dask DataFrame, with one block per Dask partition.
ds = ray.data.from_dask(ddf)
# -> Dataset(num_blocks=4, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_dask_end__
# fmt: on


# fmt: off
# __from_spark_begin__
import raydp

spark = raydp.init_spark(app_name="Spark -> Datasets Example",
                         num_executors=2,
                         executor_cores=2,
                         executor_memory="500MB")
df = spark.createDataFrame([(i, str(i)) for i in range(10000)], ["col1", "col2"])
# Create a tabular Dataset from a Spark DataFrame.
ds = ray.data.from_spark(df)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_spark_end__
# fmt: on


# fmt: off
# __from_modin_begin__
import modin.pandas as md

df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
mdf = md.DataFrame(df)
# Create a tabular Dataset from a Modin DataFrame.
ds = ray.data.from_modin(mdf)
# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_modin_end__
# fmt: on


# fmt: off
# __from_mars_begin__
import mars
import mars.dataframe as md

cluster = mars.new_cluster_in_ray(worker_num=2, worker_cpu=1)

df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
mdf = md.DataFrame(df, num_partitions=8)
# Create a tabular Dataset from a Mars DataFrame.
ds = ray.data.from_mars(mdf)
# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_mars_end__
# fmt: on


# fmt: off
# __read_parquet_begin__
# Create a tabular Dataset by reading a Parquet file.
ds = ray.data.read_parquet("example://iris.parquet")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        }
#    )

ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_parquet_end__
# fmt: on


# fmt: off
# __read_parquet_pushdown_begin__
import pyarrow as pa
import pyarrow.dataset  # Ensure the dataset submodule is loaded for pa.dataset.field().

# Create a tabular Dataset by reading a Parquet file, pushing column selection and row
# filtering down to the file scan.
ds = ray.data.read_parquet(
    "example://iris.parquet",
    columns=["sepal.length", "variety"],
    filter=pa.dataset.field("sepal.length") > 5.0,
).fully_executed()  # Force a full read of the file.
# -> Dataset(num_blocks=1, num_rows=118, schema={sepal.length: double, variety: string})

ds.show(2)
# -> {'sepal.length': 5.1, 'variety': 'Setosa'}
# -> {'sepal.length': 5.4, 'variety': 'Setosa'}
# __read_parquet_pushdown_end__
# fmt: on


# fmt: off
# __read_csv_begin__
# Create a tabular Dataset by reading a CSV file.
ds = ray.data.read_csv("example://iris.csv")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        }
#    )

ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_csv_end__
# fmt: on


# fmt: off
# __read_json_begin__
# Create a tabular Dataset by reading a JSON file.
ds = ray.data.read_json("example://iris.json")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        }
#    )

ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_json_end__
# fmt: on


# fmt: off
# __read_numpy_begin__
# Create a tensor Dataset by reading a NumPy file.
ds = ray.data.read_numpy("example://mnist_subset.npy")
# -> Dataset(
#        num_blocks=1,
#        num_rows=3,
#        schema={__RAY_TC__: <ArrowTensorType: shape=(28, 28), dtype=uint8>},
#    )

ds.show(2)
# -> [array([[0, ...]]), array([[0, ...]])]
# __read_numpy_end__
# fmt: on


# fmt: off
# __read_text_begin__
# Create a Dataset of text lines by reading a text file.
ds = ray.data.read_text("example://sms_spam_collection_subset.txt")
# -> Dataset(num_blocks=1, num_rows=10, schema=<class 'str'>)

ds.show(3)
# -> ham Go until jurong point, crazy.. Available only in bugis n great world la e
#        buffet... Cine there got amore wat...
#    ham Ok lar... Joking wif u oni...
#    spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA
#        to 87121 to receive entry question(std txt rate)T&C's apply
#        08452810075over18's
# __read_text_end__
# fmt: on


# fmt: off
# __read_binary_begin__
from io import BytesIO

import numpy as np
import PIL.Image

# Create a Dataset of raw bytes by reading a binary file.
ds = ray.data.read_binary_files("example://mnist_subset_partitioned/0/1.png")
# -> Dataset(num_blocks=1, num_rows=1, schema=<class 'bytes'>)

# Decode each record's bytes into a grayscale image ndarray.
ds = ds.map(lambda bytes_: np.asarray(PIL.Image.open(BytesIO(bytes_)).convert("L")))
# -> Dataset(
#        num_blocks=1,
#        num_rows=1,
#        schema={__RAY_TC__: <ArrowTensorType: shape=(28, 28), dtype=uint8>},
#    )

ds.show(3)
# -> [array([[0, ...]], dtype=uint8)]
# __read_binary_end__
# fmt: on


# fmt: off
# __read_parquet_s3_begin__
# Create a tabular Dataset by reading a Parquet file from S3.
ds = ray.data.read_parquet("s3://ursa-labs-taxi-data/2009/01/data.parquet")
# -> Dataset(
#        num_blocks=1,
#        num_rows=14092413,
#        schema={
#            vendor_id: string,
#            pickup_at: timestamp[us],
#            dropoff_at: timestamp[us],
#            passenger_count: int8,
#            trip_distance: float,
#            pickup_longitude: float,
#            pickup_latitude: float,
#            ...,
#        },
#    )

ds.show(2)
# -> {
#        'vendor_id': 'VTS',
#        'pickup_at': datetime.datetime(2009, 1, 4, 2, 52),
#        'dropoff_at': datetime.datetime(2009, 1, 4, 3, 2),
#        'passenger_count': 1,
#        'trip_distance': 2.630000114440918,
#        'pickup_longitude': -73.99195861816406,
#        'pickup_latitude': 40.72156524658203,
#        ...,
#    }
#    {
#        'vendor_id': 'VTS',
#        'pickup_at': datetime.datetime(2009, 1, 4, 3, 31),
#        'dropoff_at': datetime.datetime(2009, 1, 4, 3, 38),
#        'passenger_count': 3,
#        'trip_distance': 4.550000190734863,
#        'pickup_longitude': -73.98210144042969,
#        'pickup_latitude': 40.736289978027344,
#        ...,
#    }
# __read_parquet_s3_end__
# fmt: on


# fmt: off
# __read_parquet_s3_with_fs_begin__
import pyarrow as pa

# Create a tabular Dataset by reading a Parquet file from a private S3 bucket.
# NOTE: This example is not runnable as-is; add in a path to your private bucket and the
# required S3 credentials!
ds = ray.data.read_parquet(
    "s3://some/private/bucket",
    filesystem=pa.fs.S3FileSystem(
        region="us-west-2",
        access_key="XXXX",
        secret_key="XXXX",
    ),
)
# __read_parquet_s3_with_fs_end__
# fmt: on


# fmt: off
# __read_parquet_hdfs_begin__
# Create a tabular Dataset by reading a Parquet file from HDFS, using an HDFS connection
# automatically constructed based on the URI.
# NOTE: This example is not runnable as-is; you'll need to point it at your HDFS
# cluster/data.
ds = ray.data.read_parquet("hdfs://<host:port>/path/to/file.parquet")
# __read_parquet_hdfs_end__
# fmt: on


# TODO(Clark): Find clean way to start local HDFS cluster in the below example (that
# works in CI).


# fmt: off
# __read_parquet_hdfs_with_fs_begin__
import pyarrow as pa

# Create a tabular Dataset by reading a Parquet file from HDFS, manually specifying a
# configured HDFS connection via a PyArrow HadoopFileSystem instance.
# NOTE: This example is not runnable as-is; you'll need to point it at your HDFS
# cluster/data.
ds = ray.data.read_parquet(
    "hdfs://path/to/file.parquet",
    filesystem=pa.fs.HadoopFileSystem(host="localhost", port=9000, user="bob"),
)
# __read_parquet_hdfs_with_fs_end__
# fmt: on


# TODO(Clark): Find open data for below GCS example.


# fmt: off
# __read_parquet_gcs_begin__
import gcsfs

# Create a tabular Dataset by reading a Parquet file from GCS, passing the configured
# GCSFileSystem.
# NOTE: This example is not runnable as-is; you'll need to point it at your GCS bucket
# and configure your GCP project and credentials.
ds = ray.data.read_parquet(
    "gs://path/to/file.parquet",
    filesystem=gcsfs.GCSFileSystem(project="my-google-project"),
)
# __read_parquet_gcs_end__
# fmt: on


# fmt: off
# __read_parquet_az_begin__
import adlfs

# Create a tabular Dataset by reading a Parquet file from Azure Blob Storage, passing
# the configured AzureBlobFileSystem.
path = (
    "az://nyctlc/yellow/puYear=2009/puMonth=1/"
    "part-00019-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426333-4"
    ".c000.snappy.parquet"
)
ds = ray.data.read_parquet(
    path,
    filesystem=adlfs.AzureBlobFileSystem(account_name="azureopendatastorage"),
)
# __read_parquet_az_end__
# fmt: on