This PR is a general overhaul of the "Creating Datasets" feature guide, providing complete coverage of all (public) dataset creation APIs and highlighting features and quirks of the individual APIs, data modalities, storage backends, etc. To keep the page from getting too long and to keep it easy to navigate, tabbed views are used heavily.

# flake8: noqa

# fmt: off
# __creating_datasets_import_begin__
import ray
# __creating_datasets_import_end__
# fmt: on

# fmt: off
# __gen_synth_int_range_begin__
# Create a Dataset of Python objects.
ds = ray.data.range(10000)
# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)

ds.take(5)
# -> [0, 1, 2, 3, 4]
# __gen_synth_int_range_end__
# fmt: on
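
# The number of blocks (and hence the read/transform parallelism) can be controlled
# via the parallelism argument that these synthetic-data creators accept.
ds = ray.data.range(10000, parallelism=50)
# -> Dataset(num_blocks=50, num_rows=10000, schema=<class 'int'>)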

# fmt: off
# __gen_synth_arrow_range_begin__
# Create a Dataset of Arrow records.
ds = ray.data.range_arrow(10000)
# -> Dataset(num_blocks=200, num_rows=10000, schema={value: int64})

ds.take(5)
# -> [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}, {'value': 4}]
# __gen_synth_arrow_range_end__
# fmt: on

# fmt: off
# __gen_synth_tensor_range_begin__
# Create a Dataset of tensors.
ds = ray.data.range_tensor(100 * 64 * 64, shape=(64, 64))
# -> Dataset(
#        num_blocks=200,
#        num_rows=409600,
#        schema={value: <ArrowTensorType: shape=(64, 64), dtype=int64>}
#    )

ds.take(2)
# -> [array([[0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            ...,
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0]]),
#     array([[1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            ...,
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1]])]
# __gen_synth_tensor_range_end__
# fmt: on

# fmt: off
# __from_items_begin__
# Create a Dataset of tabular (Arrow) records.
ds = ray.data.from_items([{"col1": i, "col2": str(i)} for i in range(10000)])
# -> Dataset(num_blocks=200, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_items_end__
# fmt: on
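
# from_items() also accepts plain (non-dict) Python objects, in which case the
# resulting Dataset holds simple Python records rather than Arrow rows.
ds = ray.data.from_items([1, 2, 3, 4, 5])
# -> Dataset(num_blocks=..., num_rows=5, schema=<class 'int'>)

ds.take(3)
# -> [1, 2, 3]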

# fmt: off
# __from_pandas_begin__
import pandas as pd

# Create a tabular Dataset from a Pandas DataFrame.
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_pandas(df)
# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_pandas_end__
# fmt: on

# fmt: off
# __from_pandas_mult_begin__
import pandas as pd

data = list(range(10000))
num_chunks = 10
chunk_size = len(data) // num_chunks
chunks = [data[i : i + chunk_size] for i in range(0, len(data), chunk_size)]
dfs = [
    pd.DataFrame({"col1": list(chunk), "col2": list(map(str, chunk))})
    for chunk in chunks
]
# Create a tabular Dataset from multiple Pandas DataFrames.
ds = ray.data.from_pandas(dfs)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_pandas_mult_end__
# fmt: on

# fmt: off
# __from_numpy_begin__
import numpy as np

# Create a tensor Dataset from a 1D NumPy ndarray.
arr = np.arange(100)
ds = ray.data.from_numpy(arr)
# -> Dataset(
#        num_blocks=1,
#        num_rows=100,
#        schema={value: <ArrowTensorType: shape=(), dtype=int64>},
#    )

# Each element is a scalar ndarray.
ds.show(3)
# -> {'value': array(0)}
# -> {'value': array(1)}
# -> {'value': array(2)}

# Create a tensor Dataset from a 3D NumPy ndarray.
arr = np.ones((3, 4, 4))
# The outer dimension is treated as the row dimension.
ds = ray.data.from_numpy(arr)
# -> Dataset(
#        num_blocks=1,
#        num_rows=3,
#        schema={value: <ArrowTensorType: shape=(4, 4), dtype=double>},
#    )

ds.show(2)
# -> {'value': array([[1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.]])}
# -> {'value': array([[1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.]])}
# __from_numpy_end__
# fmt: on

# fmt: off
# __from_numpy_mult_begin__
import numpy as np

# Create a tensor Dataset from multiple 3D NumPy ndarrays.
arrs = [np.random.rand(2, 4, 4) for _ in range(4)]
# The outer dimension is treated as the row dimension.
ds = ray.data.from_numpy(arrs)
# -> Dataset(
#        num_blocks=4,
#        num_rows=8,
#        schema={value: <ArrowTensorType: shape=(4, 4), dtype=double>},
#    )

ds.show(2)
# -> {'value': array([[0.06587483, 0.67808656, 0.76461924, 0.83428549],
#                     [0.04932103, 0.25112165, 0.26476714, 0.24599738],
#                     [0.67624391, 0.58689537, 0.12594709, 0.94663371],
#                     [0.32435665, 0.97719096, 0.03234169, 0.71563231]])}
# -> {'value': array([[0.98570318, 0.65956399, 0.82168898, 0.09798336],
#                     [0.22426704, 0.34209978, 0.02605247, 0.48200137],
#                     [0.17312096, 0.38789983, 0.42663678, 0.92652456],
#                     [0.80787394, 0.92437162, 0.11185822, 0.3319638 ]])}
# __from_numpy_mult_end__
# fmt: on

# fmt: off
# __from_arrow_begin__
import pyarrow as pa

# Create a tabular Dataset from an Arrow Table.
t = pa.table({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_arrow(t)
# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_arrow_end__
# fmt: on

# fmt: off
# __from_arrow_mult_begin__
import pyarrow as pa

data = list(range(10000))
num_chunks = 10
chunk_size = len(data) // num_chunks
chunks = [data[i : i + chunk_size] for i in range(0, len(data), chunk_size)]
ts = [
    pa.table({"col1": list(chunk), "col2": list(map(str, chunk))})
    for chunk in chunks
]
# Create a tabular Dataset from multiple Arrow Tables.
ds = ray.data.from_arrow(ts)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_arrow_mult_end__
# fmt: on

# fmt: off
# __from_dask_begin__
import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ddf = dd.from_pandas(df, npartitions=4)
# Create a tabular Dataset from a Dask DataFrame (one block per Dask partition).
ds = ray.data.from_dask(ddf)
# -> Dataset(num_blocks=4, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_dask_end__
# fmt: on

# fmt: off
# __from_spark_begin__
import raydp

spark = raydp.init_spark(app_name="Spark -> Datasets Example",
                         num_executors=2,
                         executor_cores=2,
                         executor_memory="500MB")
df = spark.createDataFrame([(i, str(i)) for i in range(10000)], ["col1", "col2"])
# Create a tabular Dataset from a Spark DataFrame.
ds = ray.data.from_spark(df)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_spark_end__
# fmt: on

# fmt: off
# __from_modin_begin__
import pandas as pd
import modin.pandas as md

df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
mdf = md.DataFrame(df)
# Create a tabular Dataset from a Modin DataFrame.
ds = ray.data.from_modin(mdf)
# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_modin_end__
# fmt: on

# fmt: off
# __from_mars_begin__
import pandas as pd
import mars
import mars.dataframe as md

cluster = mars.new_cluster_in_ray(worker_num=2, worker_cpu=1)

df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
mdf = md.DataFrame(df, num_partitions=8)
# Create a tabular Dataset from a Mars DataFrame.
ds = ray.data.from_mars(mdf)
# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_mars_end__
# fmt: on

# fmt: off
# __read_parquet_begin__
# Create a tabular Dataset by reading a Parquet file.
ds = ray.data.read_parquet("example://iris.parquet")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        }
#    )

ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_parquet_end__
# fmt: on
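
# read_parquet() also accepts a list of files (or directories), reading them in
# parallel into a single Dataset; here the same example file is simply read twice.
ds = ray.data.read_parquet(["example://iris.parquet", "example://iris.parquet"])
# -> A Dataset with the rows of both files (num_rows=300) and the same schema as above.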

# fmt: off
# __read_parquet_pushdown_begin__
import pyarrow as pa
import pyarrow.dataset  # needed for pa.dataset.field() below

# Create a tabular Dataset by reading a Parquet file, pushing column selection and row
# filtering down to the file scan.
ds = ray.data.read_parquet(
    "example://iris.parquet",
    columns=["sepal.length", "variety"],
    filter=pa.dataset.field("sepal.length") > 5.0,
).fully_executed()  # Force a full read of the file.
# -> Dataset(num_blocks=1, num_rows=118, schema={sepal.length: double, variety: string})

ds.show(2)
# -> {'sepal.length': 5.1, 'variety': 'Setosa'}
# -> {'sepal.length': 5.4, 'variety': 'Setosa'}
# __read_parquet_pushdown_end__
# fmt: on

# fmt: off
# __read_csv_begin__
# Create a tabular Dataset by reading a CSV file.
ds = ray.data.read_csv("example://iris.csv")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        }
#    )

ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_csv_end__
# fmt: on
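
# read_csv() forwards extra keyword arguments to the underlying Arrow CSV reader, so
# parsing can be customized; for example, an explicit delimiter (comma here, "\t" for
# TSV files) can be passed via Arrow's ParseOptions.
from pyarrow import csv

ds = ray.data.read_csv(
    "example://iris.csv",
    parse_options=csv.ParseOptions(delimiter=","),
)
# -> Same Dataset as above.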

# fmt: off
# __read_json_begin__
# Create a tabular Dataset by reading a JSON file.
ds = ray.data.read_json("example://iris.json")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        }
#    )

ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_json_end__
# fmt: on

# fmt: off
# __read_numpy_begin__
# Create a tensor Dataset by reading a NumPy file.
ds = ray.data.read_numpy("example://mnist_subset.npy")
# -> Dataset(
#        num_blocks=1,
#        num_rows=3,
#        schema={__RAY_TC__: <ArrowTensorType: shape=(28, 28), dtype=uint8>},
#    )

ds.show(2)
# -> [array([[0, ...]]), array([[0, ...]])]
# __read_numpy_end__
# fmt: on

# fmt: off
# __read_text_begin__
# Create a Dataset of text lines by reading a text file.
ds = ray.data.read_text("example://sms_spam_collection_subset.txt")
# -> Dataset(num_blocks=1, num_rows=10, schema=<class 'str'>)

ds.show(3)
# -> ham  Go until jurong point, crazy.. Available only in bugis n great world la e
#         buffet... Cine there got amore wat...
# -> ham  Ok lar... Joking wif u oni...
# -> spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA
#         to 87121 to receive entry question(std txt rate)T&C's apply
#         08452810075over18's
# __read_text_end__
# fmt: on
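
# Each text row is a plain string, so any structure has to be parsed out downstream.
# Assuming the label and message in this file are tab-separated, a minimal sketch:
ds.map(lambda line: line.split("\t", 1)).show(2)
# -> ['ham', 'Go until jurong point, crazy.. ...']
# -> ['ham', 'Ok lar... Joking wif u oni...']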

# fmt: off
# __read_binary_begin__
from io import BytesIO

import numpy as np
import PIL.Image

# Create a Dataset by reading a binary file.
ds = ray.data.read_binary_files("example://mnist_subset_partitioned/0/1.png")
# -> Dataset(num_blocks=1, num_rows=1, schema=<class 'bytes'>)

ds = ds.map(lambda bytes_: np.asarray(PIL.Image.open(BytesIO(bytes_)).convert("L")))
# -> Dataset(
#        num_blocks=1,
#        num_rows=1,
#        schema={__RAY_TC__: <ArrowTensorType: shape=(28, 28), dtype=uint8>},
#    )

ds.show(1)
# -> The single 28x28 uint8 image as an ndarray, e.g.
#    array([[0, 0, 0, ..., 0, 0, 0],
#           ...,
#           [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)
# __read_binary_end__
# fmt: on
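
# read_binary_files() also accepts a directory, producing one bytes record per file;
# include_paths=True keeps each file's path alongside its contents. (The directory
# path below assumes the same example data as above.)
ds = ray.data.read_binary_files(
    "example://mnist_subset_partitioned/", include_paths=True
)
# -> A Dataset of (path, bytes) records, one per image file under the directory.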

# fmt: off
# __read_parquet_s3_begin__
# Create a tabular Dataset by reading a Parquet file from S3.
ds = ray.data.read_parquet("s3://ursa-labs-taxi-data/2009/01/data.parquet")
# -> Dataset(
#        num_blocks=1,
#        num_rows=14092413,
#        schema={
#            vendor_id: string,
#            pickup_at: timestamp[us],
#            dropoff_at: timestamp[us],
#            passenger_count: int8,
#            trip_distance: float,
#            pickup_longitude: float,
#            pickup_latitude: float,
#            ...,
#        },
#    )

ds.show(2)
# -> {
#        'vendor_id': 'VTS',
#        'pickup_at': datetime.datetime(2009, 1, 4, 2, 52),
#        'dropoff_at': datetime.datetime(2009, 1, 4, 3, 2),
#        'passenger_count': 1,
#        'trip_distance': 2.630000114440918,
#        'pickup_longitude': -73.99195861816406,
#        'pickup_latitude': 40.72156524658203,
#        ...,
#    }
# -> {
#        'vendor_id': 'VTS',
#        'pickup_at': datetime.datetime(2009, 1, 4, 3, 31),
#        'dropoff_at': datetime.datetime(2009, 1, 4, 3, 38),
#        'passenger_count': 3,
#        'trip_distance': 4.550000190734863,
#        'pickup_longitude': -73.98210144042969,
#        'pickup_latitude': 40.736289978027344,
#        ...,
#    }
# __read_parquet_s3_end__
# fmt: on

# fmt: off
# __read_parquet_s3_with_fs_begin__
import pyarrow as pa
import pyarrow.fs  # needed for pa.fs below

# Create a tabular Dataset by reading a Parquet file from a private S3 bucket.
# NOTE: This example is not runnable as-is; add in a path to your private bucket and the
# required S3 credentials!
ds = ray.data.read_parquet(
    "s3://some/private/bucket",
    filesystem=pa.fs.S3FileSystem(
        region="us-west-2",
        access_key="XXXX",
        secret_key="XXXX",
    ),
)
# __read_parquet_s3_with_fs_end__
# fmt: on
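
# For a public bucket read without any local AWS credentials, an anonymous S3
# filesystem can be passed instead of access keys; the region below is an assumption
# for this public bucket.
ds = ray.data.read_parquet(
    "s3://ursa-labs-taxi-data/2009/01/data.parquet",
    filesystem=pa.fs.S3FileSystem(region="us-east-2", anonymous=True),
)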

# fmt: off
# __read_parquet_hdfs_begin__
# Create a tabular Dataset by reading a Parquet file from HDFS, with the HDFS
# connection automatically constructed from the URI.
# NOTE: This example is not runnable as-is; you'll need to point it at your HDFS
# cluster/data.
ds = ray.data.read_parquet("hdfs://<host:port>/path/to/file.parquet")
# __read_parquet_hdfs_end__
# fmt: on

# TODO(Clark): Find clean way to start local HDFS cluster in the below example (that
# works in CI).

# fmt: off
# __read_parquet_hdfs_with_fs_begin__
import pyarrow as pa
import pyarrow.fs  # needed for pa.fs below

# Create a tabular Dataset by reading a Parquet file from HDFS, manually specifying a
# configured HDFS connection via a PyArrow HadoopFileSystem instance.
# NOTE: This example is not runnable as-is; you'll need to point it at your HDFS
# cluster/data.
ds = ray.data.read_parquet(
    "hdfs://path/to/file.parquet",
    filesystem=pa.fs.HadoopFileSystem(host="localhost", port=9000, user="bob"),
)
# __read_parquet_hdfs_with_fs_end__
# fmt: on

# TODO(Clark): Find open data for below GCS example.

# fmt: off
# __read_parquet_gcs_begin__
import gcsfs

# Create a tabular Dataset by reading a Parquet file from GCS, passing the configured
# GCSFileSystem.
# NOTE: This example is not runnable as-is; you'll need to point it at your GCS bucket
# and configure your GCP project and credentials.
ds = ray.data.read_parquet(
    "gs://path/to/file.parquet",
    filesystem=gcsfs.GCSFileSystem(project="my-google-project"),
)
# __read_parquet_gcs_end__
# fmt: on

# fmt: off
# __read_parquet_az_begin__
import adlfs

# Create a tabular Dataset by reading a Parquet file from Azure Blob Storage, passing
# the configured AzureBlobFileSystem.
path = (
    "az://nyctlc/yellow/puYear=2009/puMonth=1/"
    "part-00019-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426333-4"
    ".c000.snappy.parquet"
)
ds = ray.data.read_parquet(
    path,
    filesystem=adlfs.AzureBlobFileSystem(account_name="azureopendatastorage"),
)
# __read_parquet_az_end__
# fmt: on