# flake8: noqa
# fmt: off
# __creating_datasets_import_begin__
import ray
# __creating_datasets_import_end__
# fmt: on

# fmt: off
# __gen_synth_int_range_begin__
# Create a Dataset of Python objects.
ds = ray.data.range(10000)
# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)

ds.take(5)
# -> [0, 1, 2, 3, 4]
# __gen_synth_int_range_end__
# fmt: on

# fmt: off
# __gen_synth_tabular_range_begin__
# Create a Dataset of Arrow records.
ds = ray.data.range_table(10000)
# -> Dataset(num_blocks=200, num_rows=10000, schema={value: int64})

ds.take(5)
# -> [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}, {'value': 4}]
# __gen_synth_tabular_range_end__
# fmt: on

# fmt: off
# __gen_synth_tensor_range_begin__
# Create a Dataset of tensors.
ds = ray.data.range_tensor(100 * 64 * 64, shape=(64, 64))
# -> Dataset(
#        num_blocks=200,
#        num_rows=409600,
#        schema={value: <ArrowTensorType: shape=(64, 64), dtype=int64>},
#    )

ds.take(2)
# -> [array([[0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            ...,
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0],
#            [0, 0, 0, ..., 0, 0, 0]]),
#     array([[1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            ...,
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1],
#            [1, 1, 1, ..., 1, 1, 1]])]
# __gen_synth_tensor_range_end__
# fmt: on

# fmt: off
# __from_items_begin__
# Create a Dataset of tabular (Arrow) records.
ds = ray.data.from_items([{"col1": i, "col2": str(i)} for i in range(10000)])
# -> Dataset(num_blocks=200, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_items_end__
# fmt: on

# fmt: off
# __from_pandas_begin__
import pandas as pd

# Create a tabular Dataset from a Pandas DataFrame.
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_pandas(df)
# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_pandas_end__
# fmt: on

# fmt: off
# __from_pandas_mult_begin__
import pandas as pd

data = list(range(10000))
num_chunks = 10
chunk_size = len(data) // num_chunks
chunks = [data[i : i + chunk_size] for i in range(0, len(data), chunk_size)]
dfs = [
    pd.DataFrame({"col1": list(chunk), "col2": list(map(str, chunk))})
    for chunk in chunks
]

# Create a tabular Dataset from multiple Pandas DataFrames.
ds = ray.data.from_pandas(dfs)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_pandas_mult_end__
# fmt: on

# fmt: off
# __from_numpy_begin__
import numpy as np

# Create a tensor Dataset from a 1D NumPy ndarray.
arr = np.arange(100)
ds = ray.data.from_numpy(arr)
# -> Dataset(
#        num_blocks=1,
#        num_rows=100,
#        schema={value: <ArrowTensorType: shape=(), dtype=int64>},
#    )

# Each element is a scalar ndarray.
ds.show(3)
# -> {'value': array(0)}
# -> {'value': array(1)}
# -> {'value': array(2)}

# Create a tensor Dataset from a 3D NumPy ndarray.
arr = np.ones((3, 4, 4))
# The outer dimension is treated as the row dimension.
ds = ray.data.from_numpy(arr)
# -> Dataset(
#        num_blocks=1,
#        num_rows=3,
#        schema={value: <ArrowTensorType: shape=(4, 4), dtype=double>},
#    )

ds.show(2)
# -> {'value': array([[1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.]])}
# -> {'value': array([[1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.],
#                     [1., 1., 1., 1.]])}
# __from_numpy_end__
# fmt: on
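# NOTE: Not part of the original snippets above. A minimal sketch showing that the
# synthetic-data creators accept a ``parallelism`` argument controlling how many blocks
# the Dataset is split into; this assumes a Ray Datasets version that exposes the
# ``parallelism`` keyword on ray.data.range().
ds = ray.data.range(10000, parallelism=10)
# Expect roughly 10 blocks instead of the default 200.
print(ds.num_blocks())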
# fmt: off
# __from_numpy_mult_begin__
import numpy as np

# Create a tensor Dataset from multiple 3D NumPy ndarrays.
arrs = [np.random.rand(2, 4, 4) for _ in range(4)]
# The outer dimension is treated as the row dimension.
ds = ray.data.from_numpy(arrs)
# -> Dataset(
#        num_blocks=4,
#        num_rows=8,
#        schema={value: <ArrowTensorType: shape=(4, 4), dtype=double>},
#    )

ds.show(2)
# -> {'value': array([[0.06587483, 0.67808656, 0.76461924, 0.83428549],
#                     [0.04932103, 0.25112165, 0.26476714, 0.24599738],
#                     [0.67624391, 0.58689537, 0.12594709, 0.94663371],
#                     [0.32435665, 0.97719096, 0.03234169, 0.71563231]])}
# -> {'value': array([[0.98570318, 0.65956399, 0.82168898, 0.09798336],
#                     [0.22426704, 0.34209978, 0.02605247, 0.48200137],
#                     [0.17312096, 0.38789983, 0.42663678, 0.92652456],
#                     [0.80787394, 0.92437162, 0.11185822, 0.3319638 ]])}
# __from_numpy_mult_end__
# fmt: on

# fmt: off
# __from_arrow_begin__
import pyarrow as pa

# Create a tabular Dataset from an Arrow Table.
t = pa.table({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ds = ray.data.from_arrow(t)
# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_arrow_end__
# fmt: on

# fmt: off
# __from_arrow_mult_begin__
import pyarrow as pa

data = list(range(10000))
num_chunks = 10
chunk_size = len(data) // num_chunks
chunks = [data[i : i + chunk_size] for i in range(0, len(data), chunk_size)]
ts = [
    pa.table({"col1": list(chunk), "col2": list(map(str, chunk))})
    for chunk in chunks
]

# Create a tabular Dataset from multiple Arrow Tables.
ds = ray.data.from_arrow(ts)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_arrow_mult_end__
# fmt: on

# fmt: off
# __from_dask_begin__
import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
ddf = dd.from_pandas(df, npartitions=4)
# Create a tabular Dataset from a Dask DataFrame.
ds = ray.data.from_dask(ddf)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_dask_end__
# fmt: on

# fmt: off
# __from_spark_begin__
import raydp

spark = raydp.init_spark(
    app_name="Spark -> Datasets Example",
    num_executors=2,
    executor_cores=2,
    executor_memory="500MB",
)
df = spark.createDataFrame([(i, str(i)) for i in range(10000)], ["col1", "col2"])
# Create a tabular Dataset from a Spark DataFrame.
ds = ray.data.from_spark(df)
# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_spark_end__
# fmt: on
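# NOTE: Not part of the original snippets. A minimal sketch of the reverse conversions:
# a Dataset created from a DataFrame library can be converted back with the matching
# ``to_*`` method (Dataset.to_pandas(), Dataset.to_dask()); here we reuse the small
# Dataset created from the Spark DataFrame above.
pdf = ds.to_pandas()  # Collect the (small) Dataset into a single pandas DataFrame.
ddf_back = ds.to_dask()  # Lazily expose the Dataset as a Dask DataFrame.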
# fmt: off
# __from_modin_begin__
import pandas as pd
import modin.pandas as md

df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
mdf = md.DataFrame(df)
# Create a tabular Dataset from a Modin DataFrame.
ds = ray.data.from_modin(mdf)
# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_modin_end__
# fmt: on

# fmt: off
# __from_mars_begin__
import pandas as pd
import mars
import mars.dataframe as md

cluster = mars.new_cluster_in_ray(worker_num=2, worker_cpu=1)

df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))})
mdf = md.DataFrame(df, num_partitions=8)
# Create a tabular Dataset from a Mars DataFrame.
ds = ray.data.from_mars(mdf)
# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object})

ds.show(3)
# -> {'col1': 0, 'col2': '0'}
# -> {'col1': 1, 'col2': '1'}
# -> {'col1': 2, 'col2': '2'}
# __from_mars_end__
# fmt: on

# fmt: off
# __read_parquet_begin__
# Create a tabular Dataset by reading a Parquet file.
ds = ray.data.read_parquet("example://iris.parquet")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        },
#    )

ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_parquet_end__
# fmt: on

# fmt: off
# __read_parquet_pushdown_begin__
import pyarrow as pa

# Create a tabular Dataset by reading a Parquet file, pushing column selection and row
# filtering down to the file scan.
ds = ray.data.read_parquet(
    "example://iris.parquet",
    columns=["sepal.length", "variety"],
    filter=pa.dataset.field("sepal.length") > 5.0,
).fully_executed()  # Force a full read of the file.
# -> Dataset(num_blocks=1, num_rows=118, schema={sepal.length: double, variety: string})

ds.show(2)
# -> {'sepal.length': 5.1, 'variety': 'Setosa'}
#    {'sepal.length': 5.4, 'variety': 'Setosa'}
# __read_parquet_pushdown_end__
# fmt: on

# fmt: off
# __read_csv_begin__
# Create a tabular Dataset by reading a CSV file.
ds = ray.data.read_csv("example://iris.csv")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        },
#    )

ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_csv_end__
# fmt: on

# fmt: off
# __read_json_begin__
# Create a tabular Dataset by reading a JSON file.
ds = ray.data.read_json("example://iris.json")
# -> Dataset(
#        num_blocks=1,
#        num_rows=150,
#        schema={
#            sepal.length: double,
#            sepal.width: double,
#            petal.length: double,
#            petal.width: double,
#            variety: string,
#        },
#    )

ds.show(2)
# -> {
#        'sepal.length': 5.1,
#        'sepal.width': 3.5,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# -> {
#        'sepal.length': 4.9,
#        'sepal.width': 3.0,
#        'petal.length': 1.4,
#        'petal.width': 0.2,
#        'variety': 'Setosa',
#    }
# __read_json_end__
# fmt: on
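# NOTE: Not part of the original snippets. A minimal sketch showing that the read APIs
# also accept a list of paths (or a directory), producing a single Dataset over all of
# the files; here the bundled example CSV is simply read twice.
ds = ray.data.read_csv(["example://iris.csv", "example://iris.csv"])
# Expect twice as many rows as the single-file read above (300 in total).
print(ds.count())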
# fmt: off
# __read_numpy_begin__
# Create a tensor Dataset by reading a NumPy file.
ds = ray.data.read_numpy("example://mnist_subset.npy")
# -> Dataset(
#        num_blocks=1,
#        num_rows=3,
#        schema={__RAY_TC__: <ArrowTensorType: shape=(28, 28), dtype=uint8>},
#    )

ds.show(2)
# [array([[0, ...]]), array([[0, ...]])]
# __read_numpy_end__
# fmt: on

# fmt: off
# __read_text_begin__
# Create a Dataset of text lines by reading a text file.
ds = ray.data.read_text("example://sms_spam_collection_subset.txt")
# -> Dataset(num_blocks=1, num_rows=10, schema=<class 'str'>)

ds.show(3)
# -> ham  Go until jurong point, crazy.. Available only in bugis n great world la e
#         buffet... Cine there got amore wat...
#    ham  Ok lar... Joking wif u oni...
#    spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA
#         to 87121 to receive entry question(std txt rate)T&C's apply
#         08452810075over18's
# __read_text_end__
# fmt: on

# fmt: off
# __read_binary_begin__
from io import BytesIO

import numpy as np
import PIL.Image

# Create a Dataset of binary blobs by reading a binary (PNG) file.
ds = ray.data.read_binary_files("example://mnist_subset_partitioned/0/1.png")
# -> Dataset(num_blocks=1, num_rows=1, schema=<class 'bytes'>)

# Decode the PNG bytes into a grayscale image tensor.
ds = ds.map(lambda bytes_: np.asarray(PIL.Image.open(BytesIO(bytes_)).convert("L")))
# -> Dataset(
#        num_blocks=1,
#        num_rows=1,
#        schema={__RAY_TC__: <ArrowTensorType: shape=(28, 28), dtype=uint8>},
#    )

ds.show(1)
# __read_binary_end__
# fmt: on

# fmt: off
# __read_parquet_s3_begin__
# Create a tabular Dataset by reading a Parquet file from S3.
ds = ray.data.read_parquet("s3://ursa-labs-taxi-data/2009/01/data.parquet")
# -> Dataset(
#        num_blocks=1,
#        num_rows=14092413,
#        schema={
#            vendor_id: string,
#            pickup_at: timestamp[us],
#            dropoff_at: timestamp[us],
#            passenger_count: int8,
#            trip_distance: float,
#            pickup_longitude: float,
#            pickup_latitude: float,
#            ...,
#        },
#    )

ds.show(2)
# -> {
#        'vendor_id': 'VTS',
#        'pickup_at': datetime.datetime(2009, 1, 4, 2, 52),
#        'dropoff_at': datetime.datetime(2009, 1, 4, 3, 2),
#        'passenger_count': 1,
#        'trip_distance': 2.630000114440918,
#        'pickup_longitude': -73.99195861816406,
#        'pickup_latitude': 40.72156524658203,
#        ...,
#    }
#    {
#        'vendor_id': 'VTS',
#        'pickup_at': datetime.datetime(2009, 1, 4, 3, 31),
#        'dropoff_at': datetime.datetime(2009, 1, 4, 3, 38),
#        'passenger_count': 3,
#        'trip_distance': 4.550000190734863,
#        'pickup_longitude': -73.98210144042969,
#        'pickup_latitude': 40.736289978027344,
#        ...,
#    }
# __read_parquet_s3_end__
# fmt: on

# fmt: off
# __read_parquet_s3_with_fs_begin__
import pyarrow as pa

# Create a tabular Dataset by reading a Parquet file from a private S3 bucket.
# NOTE: This example is not runnable as-is; add in a path to your private bucket and the
# required S3 credentials!
ds = ray.data.read_parquet(
    "s3://some/private/bucket",
    filesystem=pa.fs.S3FileSystem(
        region="us-west-2",
        access_key="XXXX",
        secret_key="XXXX",
    ),
)
# __read_parquet_s3_with_fs_end__
# fmt: on

# fmt: off
# __read_parquet_hdfs_begin__
# Create a tabular Dataset by reading a Parquet file from HDFS, using an HDFS connection
# automatically constructed based on the URI.
# NOTE: This example is not runnable as-is; you'll need to point it at your HDFS
# cluster/data.
ds = ray.data.read_parquet("hdfs:///path/to/file.parquet")
# __read_parquet_hdfs_end__
# fmt: on

# TODO(Clark): Find clean way to start local HDFS cluster in the below example (that
# works in CI).
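# NOTE: Not part of the original snippets. A minimal sketch of the write side of the
# API: Datasets can be written back out to files with Dataset.write_parquet() (and the
# analogous write_csv()/write_json()). The output directory below is a throwaway
# temporary path chosen only for illustration.
import os
import tempfile

out_dir = os.path.join(tempfile.gettempdir(), "ray_datasets_write_example")
small_ds = ray.data.range_table(1000)
small_ds.write_parquet(out_dir)  # Writes one Parquet file per block into out_dir.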
# fmt: off
# __read_parquet_hdfs_with_fs_begin__
import pyarrow as pa

# Create a tabular Dataset by reading a Parquet file from HDFS, manually specifying a
# configured HDFS connection via a PyArrow HadoopFileSystem instance.
# NOTE: This example is not runnable as-is; you'll need to point it at your HDFS
# cluster/data.
ds = ray.data.read_parquet(
    "hdfs://path/to/file.parquet",
    filesystem=pa.fs.HadoopFileSystem(host="localhost", port=9000, user="bob"),
)
# __read_parquet_hdfs_with_fs_end__
# fmt: on

# TODO(Clark): Find open data for below GCS example.

# fmt: off
# __read_parquet_gcs_begin__
import gcsfs

# Create a tabular Dataset by reading a Parquet file from GCS, passing the configured
# GCSFileSystem.
# NOTE: This example is not runnable as-is; you'll need to point it at your GCS bucket
# and configure your GCP project and credentials.
ds = ray.data.read_parquet(
    "gs://path/to/file.parquet",
    filesystem=gcsfs.GCSFileSystem(project="my-google-project"),
)
# __read_parquet_gcs_end__
# fmt: on

# fmt: off
# __read_parquet_az_begin__
import adlfs

# Create a tabular Dataset by reading a Parquet file from Azure Blob Storage, passing
# the configured AzureBlobFileSystem.
path = (
    "az://nyctlc/yellow/puYear=2009/puMonth=1/"
    "part-00019-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426333-4"
    ".c000.snappy.parquet"
)
ds = ray.data.read_parquet(
    path,
    filesystem=adlfs.AzureBlobFileSystem(account_name="azureopendatastorage"),
)
# __read_parquet_az_end__
# fmt: on
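# NOTE: Not part of the original snippets. Regardless of the source, a newly created
# Dataset can be inspected with the standard metadata accessors; a small sketch using
# the Dataset read from Azure Blob Storage above.
print(ds.schema())  # Column names and types.
print(ds.num_blocks())  # Number of underlying blocks.
print(ds.take(1))  # Fetch the first record.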