[Datasets] Update docs for drop_columns and fix typos (#26317)
We added a drop_columns() API to Datasets in #26200, so this updates the documentation in doc/source/data/examples/nyc_taxi_basic_processing.ipynb to use the new API. It also fixes some minor typos found while proofreading the Datasets documentation.
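In practice the API change looks like this (a minimal sketch with toy column names, not the actual taxi schema from the notebook):

import pandas as pd
import ray

# Before: dropping columns required a pandas map_batches round-trip.
ds = ray.data.from_pandas(pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}))
ds_old = ds.map_batches(
    lambda df: df.drop(columns=["b", "c"]),
    batch_format="pandas",
)

# After: the new drop_columns() API does the same thing in one call.
ds_new = ds.drop_columns(["b", "c"])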
This commit is contained in:
parent ea94cda1f3
commit 4e674b6ad3

6 changed files with 13 additions and 13 deletions
@@ -390,7 +390,7 @@ Supported File Formats
 treating the outermost ndarray dimension as the row dimension. See our
 :ref:`tensor data guide <datasets_tensor_support>` for more information on working
 with tensors in Datasets. Although this simple example demonstrates reading a single
-file, note that Datasets can also read directories of JSON files, with one tensor
+file, note that Datasets can also read directories of NumPy files, with one tensor
 block created per file.

 .. literalinclude:: ./doc_code/creating_datasets.py
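The corrected sentence describes reading directories of NumPy files, presumably via ray.data.read_numpy(); a minimal sketch under that assumption (illustrative scratch paths):

import os
import numpy as np
import ray

# Write two small .npy files into a scratch directory.
os.makedirs("/tmp/npy_dir", exist_ok=True)
np.save("/tmp/npy_dir/arr0.npy", np.arange(12).reshape(3, 4))
np.save("/tmp/npy_dir/arr1.npy", np.arange(12).reshape(3, 4))

# Datasets reads the whole directory, creating one tensor block per file.
ds = ray.data.read_numpy("/tmp/npy_dir")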
@@ -24,7 +24,7 @@ ML pipeline completely within Ray without requiring data to be materialized to e
    :width: 650px
    :align: center

-See the :ref:`ML preprocessing docs <datasets-ml-preprocessing>` for information on how to use Datasets as the
+See below for information on how to use Datasets as the
 last-mile bridge to model training and inference, and see :ref:`the Talks section <data-talks>` for more
 Datasets ML use cases and benchmarks.
@@ -32,13 +32,13 @@ ds.take(5)

 # Write out just one file.
 ds.repartition(1).write_csv("/tmp/one_csv")
-# -> /tmp/one_parquet/d757569dfb2845589b0ccbcb263e8cc3_000000.csv
+# -> /tmp/one_csv/d757569dfb2845589b0ccbcb263e8cc3_000000.csv

 # Write out multiple files.
 ds.repartition(3).write_csv("/tmp/multi_csv")
-# -> /tmp/multi_parquet/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000000.csv
-# -> /tmp/multi_parquet/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000001.csv
-# -> /tmp/multi_parquet/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000002.csv
+# -> /tmp/multi_csv/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000000.csv
+# -> /tmp/multi_csv/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000001.csv
+# -> /tmp/multi_csv/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000002.csv
 # __write_csv_end__
 # fmt: on
@@ -53,13 +53,13 @@ ds.take(5)

 # Write out just one file.
 ds.repartition(1).write_json("/tmp/one_json")
-# -> /tmp/one_parquet/ab693fde13634f4c8cdaef1db9595ac1_000000.json
+# -> /tmp/one_json/ab693fde13634f4c8cdaef1db9595ac1_000000.json

 # Write out multiple files.
 ds.repartition(3).write_json("/tmp/multi_json")
-# -> /tmp/multi_parquet/f467636b3c41420bb109505ab56c6eae_000000.json
-# -> /tmp/multi_parquet/f467636b3c41420bb109505ab56c6eae_000001.json
-# -> /tmp/multi_parquet/f467636b3c41420bb109505ab56c6eae_000002.json
+# -> /tmp/multi_json/f467636b3c41420bb109505ab56c6eae_000000.json
+# -> /tmp/multi_json/f467636b3c41420bb109505ab56c6eae_000001.json
+# -> /tmp/multi_json/f467636b3c41420bb109505ab56c6eae_000002.json
 # __write_json_end__
 # fmt: on
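Both snippets above follow the same pattern; a hedged sketch of the corrected behavior (toy data; the hash-named output files will differ from the ones shown):

import pandas as pd
import ray

ds = ray.data.from_pandas(pd.DataFrame({"value": range(100)}))

# repartition(n) sets the number of blocks, and each block is written as one
# file, so the output directory name now matches the format being written.
ds.repartition(1).write_csv("/tmp/one_csv")      # one .csv file under /tmp/one_csv
ds.repartition(3).write_json("/tmp/multi_json")  # three .json files under /tmp/multi_json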
@@ -569,7 +569,7 @@
    ],
    "source": [
     "# Drop some columns.\n",
-    "ds = ds.map_batches(lambda df: df.drop(columns=[\"store_and_fwd_flag\", \"mta_tax\"]))"
+    "ds = ds.drop_columns([\"store_and_fwd_flag\", \"mta_tax\"])"
    ]
   },
   {
@@ -99,7 +99,7 @@ just two of the five columns of Iris dataset.
 Parquet Row Pruning
 ~~~~~~~~~~~~~~~~~~~

-Similarly, you can pass in a filter to ``ray.data.read_parquet()`` (selection pushdown)
+Similarly, you can pass in a filter to ``ray.data.read_parquet()`` (filter pushdown)
 which will be applied at the file scan so only rows that match the filter predicate
 will be returned.
 For example, use ``ray.data.read_parquet("example://iris.parquet", filter=pa.dataset.field("sepal.length") > 5.0)``
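The example from the last context line, written out as a runnable sketch:

import pyarrow.dataset as pds
import ray

# The filter is pushed down to the Parquet scan, so only rows with
# sepal.length > 5.0 are materialized.
ds = ray.data.read_parquet(
    "example://iris.parquet",
    filter=pds.field("sepal.length") > 5.0,
)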
@@ -561,7 +561,7 @@ class Dataset(Generic[T]):
            compute: The compute strategy, either "tasks" (default) to use Ray
                tasks, or ActorPoolStrategy(min, max) to use an autoscaling actor pool.
            ray_remote_args: Additional resource requirements to request from
-                ray (e.g., num_gpus=1 to request GPUs for the map tasks
+                ray (e.g., num_gpus=1 to request GPUs for the map tasks).
        """

        return self.map_batches(
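A hedged sketch of the two documented parameters (placeholder transform; num_gpus=1 assumes GPUs are available in the cluster):

import ray
from ray.data import ActorPoolStrategy

class Identity:
    # Actor-based compute strategies require a callable class rather than a plain function.
    def __call__(self, batch):
        return batch

ds = ray.data.range(1000)
ds = ds.map_batches(
    Identity,
    compute=ActorPoolStrategy(2, 4),  # autoscaling actor pool of 2-4 actors
    ray_remote_args={"num_gpus": 1},  # one GPU per map worker
)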