mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[Datasets] [Docs] Add a warning about from_huggingface (#24608)
Adds a warning to docs about the intended use of from_huggingface.
This commit is contained in:
parent
c87c50b156
commit
04e16f70a3
2 changed files with 29 additions and 1 deletions
|
@ -68,7 +68,7 @@ Finally, you can create a ``Dataset`` from existing data in the Ray object store
|
||||||
ds = ray.data.from_dask(dask_df)
|
ds = ray.data.from_dask(dask_df)
|
||||||
|
|
||||||
From Torch/TensorFlow
|
From Torch/TensorFlow
|
||||||
---------------------------------------
|
---------------------
|
||||||
|
|
||||||
.. tabbed:: PyTorch
|
.. tabbed:: PyTorch
|
||||||
|
|
||||||
|
@ -119,3 +119,27 @@ From Torch/TensorFlow
|
||||||
features, label = dataset.take(1)[0]
|
features, label = dataset.take(1)[0]
|
||||||
features.shape # TensorShape([32, 32, 3])
|
features.shape # TensorShape([32, 32, 3])
|
||||||
label # <tf.Tensor: shape=(), dtype=int64, numpy=7>
|
label # <tf.Tensor: shape=(), dtype=int64, numpy=7>
|
||||||
|
|
||||||
|
|
||||||
|
From 🤗 (Hugging Face) Datasets
|
||||||
|
-------------------------------
|
||||||
|
|
||||||
|
You can convert 🤗 Datasets into Ray Datasets by using
|
||||||
|
:py:func:`~ray.data.from_huggingface`. This function accesses the underlying Arrow table and
|
||||||
|
converts it into a Ray Dataset directly.
|
||||||
|
|
||||||
|
.. warning::
|
||||||
|
:py:func:`~ray.data.from_huggingface` doesn't support parallel
|
||||||
|
reads. This will not usually be an issue with in-memory 🤗 Datasets,
|
||||||
|
but may fail with large memory-mapped 🤗 Datasets. 🤗 ``IterableDataset``
|
||||||
|
objects are not supported.
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
import ray.data
|
||||||
|
from datasets import load_dataset
|
||||||
|
|
||||||
|
hf_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
|
||||||
|
ray_datasets = ray.data.from_huggingface(hf_datasets)
|
||||||
|
ray_datasets["train"].take(2)
|
||||||
|
# [{'text': ''}, {'text': ' = Valkyria Chronicles III = \n'}]
|
|
@ -969,6 +969,10 @@ def from_huggingface(
|
||||||
) -> Union[Dataset[ArrowRow], Dict[str, Dataset[ArrowRow]]]:
|
) -> Union[Dataset[ArrowRow], Dict[str, Dataset[ArrowRow]]]:
|
||||||
"""Create a dataset from a Hugging Face Datasets Dataset.
|
"""Create a dataset from a Hugging Face Datasets Dataset.
|
||||||
|
|
||||||
|
This function is not parallelized, and is intended to be used
|
||||||
|
with Hugging Face Datasets that are loaded into memory (as opposed
|
||||||
|
to memory-mapped).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dataset: A Hugging Face ``Dataset``, or ``DatasetDict``.
|
dataset: A Hugging Face ``Dataset``, or ``DatasetDict``.
|
||||||
``IterableDataset`` is not supported.
|
``IterableDataset`` is not supported.
|
||||||
|
|
Loading…
Add table
Reference in a new issue