diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml index ee7a9cdeb..ceda116bb 100644 --- a/.buildkite/pipeline.ml.yml +++ b/.buildkite/pipeline.ml.yml @@ -5,6 +5,7 @@ - DATA_PROCESSING_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-gpu,-needs_credentials python/ray/air/... - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_air,-gpu_only,-gpu,-needs_credentials python/ray/train/... + - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_air python/ray/data/... - label: ":brain: RLlib: Learning discr. actions TF2-static-graph" conditions: ["RAY_CI_RLLIB_AFFECTED"] @@ -352,7 +353,7 @@ commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - DATA_PROCESSING_TESTING=1 PYTHON=3.7 ./ci/env/install-dependencies.sh - - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only python/ray/data/... + - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-ray_air python/ray/data/... - label: ":potable_water: Workflow tests (Python 3.7)" conditions: ["RAY_CI_PYTHON_AFFECTED"] diff --git a/doc/source/ray-air/doc_code/air_ingest.py b/doc/source/ray-air/doc_code/air_ingest.py index 960aaaefc..706c7e60e 100644 --- a/doc/source/ray-air/doc_code/air_ingest.py +++ b/doc/source/ray-air/doc_code/air_ingest.py @@ -60,7 +60,7 @@ tuner.fit() # __check_ingest_1__ import ray -from ray.air.preprocessors import Chain, BatchMapper +from ray.data.preprocessors import Chain, BatchMapper from ray.air.util.check_ingest import DummyTrainer # Generate a synthetic dataset of ~10GiB of float64 data. The dataset is sharded diff --git a/doc/source/ray-air/doc_code/air_key_concepts.py b/doc/source/ray-air/doc_code/air_key_concepts.py index 2c06d503c..2a2bf68c4 100644 --- a/doc/source/ray-air/doc_code/air_key_concepts.py +++ b/doc/source/ray-air/doc_code/air_key_concepts.py @@ -6,7 +6,7 @@ import pandas as pd from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split -from ray.air.preprocessors import * +from ray.data.preprocessors import * data_raw = load_breast_cancer() dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"]) diff --git a/doc/source/ray-air/doc_code/preprocessors.py b/doc/source/ray-air/doc_code/preprocessors.py index b277c7c22..3b5ec9c82 100644 --- a/doc/source/ray-air/doc_code/preprocessors.py +++ b/doc/source/ray-air/doc_code/preprocessors.py @@ -4,7 +4,7 @@ # __preprocessor_setup_start__ import pandas as pd import ray -from ray.air.preprocessors import MinMaxScaler +from ray.data.preprocessors import MinMaxScaler # Generate two simple datasets. dataset = ray.data.range_table(8) @@ -47,8 +47,8 @@ print(batch_transformed) # __trainer_start__ import ray +from ray.data.preprocessors import MinMaxScaler from ray.train.xgboost import XGBoostTrainer -from ray.air.preprocessors import MinMaxScaler train_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(0, 32, 3)]) valid_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(1, 32, 3)]) @@ -106,7 +106,7 @@ print(predicted_labels.to_pandas()) # __chain_start__ import ray -from ray.air.preprocessors import Chain, MinMaxScaler, SimpleImputer +from ray.data.preprocessors import Chain, MinMaxScaler, SimpleImputer # Generate one simple dataset. dataset = ray.data.from_items( @@ -125,7 +125,7 @@ print(dataset_transformed.take()) # __custom_stateless_start__ import ray -from ray.air.preprocessors import BatchMapper +from ray.data.preprocessors import BatchMapper # Generate a simple dataset. dataset = ray.data.range_table(4) @@ -144,7 +144,7 @@ print(dataset_transformed.take()) from typing import Dict import ray from pandas import DataFrame -from ray.air.preprocessors import CustomStatefulPreprocessor +from ray.data.preprocessors import CustomStatefulPreprocessor from ray.data import Dataset from ray.data.aggregate import Max diff --git a/doc/source/ray-air/doc_code/xgboost_starter.py b/doc/source/ray-air/doc_code/xgboost_starter.py index 7b38ef3d6..051f046ab 100644 --- a/doc/source/ray-air/doc_code/xgboost_starter.py +++ b/doc/source/ray-air/doc_code/xgboost_starter.py @@ -2,7 +2,7 @@ # __air_xgb_preprocess_start__ import ray -from ray.air.preprocessors import StandardScaler +from ray.data.preprocessors import StandardScaler import pandas as pd diff --git a/doc/source/ray-air/examples/lightgbm_example.ipynb b/doc/source/ray-air/examples/lightgbm_example.ipynb index c46644431..25625893e 100644 --- a/doc/source/ray-air/examples/lightgbm_example.ipynb +++ b/doc/source/ray-air/examples/lightgbm_example.ipynb @@ -51,12 +51,12 @@ "import ray\n", "from ray.air.batch_predictor import BatchPredictor\n", "from ray.air.predictors.integrations.lightgbm import LightGBMPredictor\n", - "from ray.air.preprocessors.chain import Chain\n", - "from ray.air.preprocessors.encoder import Categorizer\n", + "from ray.data.preprocessors.chain import Chain\n", + "from ray.data.preprocessors.encoder import Categorizer\n", "from ray.train.lightgbm import LightGBMTrainer\n", "from ray.data.dataset import Dataset\n", "from ray.air.result import Result\n", - "from ray.air.preprocessors import StandardScaler\n", + "from ray.data.preprocessors import StandardScaler\n", "from sklearn.datasets import load_breast_cancer\n", "from sklearn.model_selection import train_test_split" ] diff --git a/doc/source/ray-air/examples/serving_guide.ipynb b/doc/source/ray-air/examples/serving_guide.ipynb index 7a4b387d6..c87fb502e 100644 --- a/doc/source/ray-air/examples/serving_guide.ipynb +++ b/doc/source/ray-air/examples/serving_guide.ipynb @@ -165,7 +165,7 @@ "from sklearn.model_selection import train_test_split\n", "\n", "from ray.air.train.integrations.xgboost import XGBoostTrainer\n", - "from ray.air.preprocessors import StandardScaler\n", + "from ray.data.preprocessors import StandardScaler\n", "\n", "data_raw = load_breast_cancer()\n", "dataset_df = pd.DataFrame(data_raw[\"data\"], columns=data_raw[\"feature_names\"])\n", diff --git a/doc/source/ray-air/examples/sklearn_example.ipynb b/doc/source/ray-air/examples/sklearn_example.ipynb index d17d7ed72..fe724ecea 100644 --- a/doc/source/ray-air/examples/sklearn_example.ipynb +++ b/doc/source/ray-air/examples/sklearn_example.ipynb @@ -56,7 +56,7 @@ "from ray.data.dataset import Dataset\n", "from ray.air.batch_predictor import BatchPredictor\n", "from ray.air.predictors.integrations.sklearn import SklearnPredictor\n", - "from ray.air.preprocessors import Chain, OrdinalEncoder, StandardScaler\n", + "from ray.data.preprocessors import Chain, OrdinalEncoder, StandardScaler\n", "from ray.air.result import Result\n", "from ray.train.sklearn import SklearnTrainer\n", "\n", diff --git a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb index d9d3b7f41..bb108efd8 100644 --- a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb +++ b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb @@ -472,7 +472,7 @@ }, "outputs": [], "source": [ - "from ray.air.preprocessors import (\n", + "from ray.data.preprocessors import (\n", " BatchMapper,\n", " Chain,\n", " OneHotEncoder,\n", diff --git a/doc/source/ray-air/examples/torch_incremental_learning.ipynb b/doc/source/ray-air/examples/torch_incremental_learning.ipynb index 350673be0..7be2cbdfa 100644 --- a/doc/source/ray-air/examples/torch_incremental_learning.ipynb +++ b/doc/source/ray-air/examples/torch_incremental_learning.ipynb @@ -590,7 +590,7 @@ }, "outputs": [], "source": [ - "from ray.air.preprocessors import BatchMapper\n", + "from ray.data.preprocessors import BatchMapper\n", "\n", "from torchvision import transforms\n", "\n", diff --git a/doc/source/ray-air/examples/xgboost_example.ipynb b/doc/source/ray-air/examples/xgboost_example.ipynb index d51d8f092..acfeeda38 100644 --- a/doc/source/ray-air/examples/xgboost_example.ipynb +++ b/doc/source/ray-air/examples/xgboost_example.ipynb @@ -69,7 +69,7 @@ "from ray.train.xgboost import XGBoostTrainer\n", "from ray.data.dataset import Dataset\n", "from ray.air.result import Result\n", - "from ray.air.preprocessors import StandardScaler\n", + "from ray.data.preprocessors import StandardScaler\n", "from sklearn.datasets import load_breast_cancer\n", "from sklearn.model_selection import train_test_split" ] diff --git a/doc/source/ray-air/package-ref.rst b/doc/source/ray-air/package-ref.rst index 30418dae4..56a97a74c 100644 --- a/doc/source/ray-air/package-ref.rst +++ b/doc/source/ray-air/package-ref.rst @@ -14,10 +14,10 @@ Components Preprocessors ~~~~~~~~~~~~~ -.. autoclass:: ray.air.preprocessor.Preprocessor +.. autoclass:: ray.data.preprocessor.Preprocessor :members: -.. automodule:: ray.air.preprocessors +.. automodule:: ray.data.preprocessors :members: :show-inheritance: diff --git a/doc/source/ray-air/preprocessors.rst b/doc/source/ray-air/preprocessors.rst index 852941324..c765cf2ab 100644 --- a/doc/source/ray-air/preprocessors.rst +++ b/doc/source/ray-air/preprocessors.rst @@ -127,32 +127,32 @@ Ray AIR provides a handful of ``Preprocessor``\s that you can use out of the box .. tabbed:: Common APIs - #. :class:`Preprocessor ` - #. :class:`Chain ` - #. :class:`BatchMapper ` - #. :class:`CustomStatefulPreprocessor ` + #. :class:`Preprocessor ` + #. :class:`BatchMapper ` + #. :class:`Chain ` + #. :class:`CustomStatefulPreprocessor ` .. tabbed:: Tabular - #. :class:`Categorizer ` - #. :class:`FeatureHasher ` - #. :class:`LabelEncoder ` - #. :class:`MaxAbsScaler ` - #. :class:`MinMaxScaler ` - #. :class:`Normalizer ` - #. :class:`OneHotEncoder ` - #. :class:`OrdinalEncoder ` - #. :class:`PowerTransformer ` - #. :class:`RobustScaler ` - #. :class:`SimpleImputer ` - #. :class:`StandardScaler ` - #. :class:`SimpleImputer ` + #. :class:`Categorizer ` + #. :class:`FeatureHasher ` + #. :class:`LabelEncoder ` + #. :class:`MaxAbsScaler ` + #. :class:`MinMaxScaler ` + #. :class:`Normalizer ` + #. :class:`OneHotEncoder ` + #. :class:`OrdinalEncoder ` + #. :class:`PowerTransformer ` + #. :class:`RobustScaler ` + #. :class:`SimpleImputer ` + #. :class:`StandardScaler ` + #. :class:`SimpleImputer ` .. tabbed:: Text - #. :class:`CountVectorizer ` - #. :class:`HashingVectorizer ` - #. :class:`Tokenizer ` + #. :class:`CountVectorizer ` + #. :class:`HashingVectorizer ` + #. :class:`Tokenizer ` .. tabbed:: Image diff --git a/python/ray/air/BUILD b/python/ray/air/BUILD index 770e6872d..4cddf5f7b 100644 --- a/python/ray/air/BUILD +++ b/python/ray/air/BUILD @@ -233,14 +233,6 @@ py_test( deps = [":ml_lib"] ) -py_test( - name = "test_preprocessors", - size = "small", - srcs = ["tests/test_preprocessors.py"], - tags = ["team:ml", "exclusive"], - deps = [":ml_lib"] -) - py_test( name = "test_remote_storage", size = "small", diff --git a/python/ray/air/__init__.py b/python/ray/air/__init__.py index 9c8e25980..df7bda2c4 100644 --- a/python/ray/air/__init__.py +++ b/python/ray/air/__init__.py @@ -1,7 +1,7 @@ from ray.air.checkpoint import Checkpoint from ray.air.data_batch_type import DataBatchType from ray.air.config import RunConfig, ScalingConfig, DatasetConfig -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from ray.air.predictor import Predictor from ray.air.result import Result from ray.air.batch_predictor import BatchPredictor diff --git a/python/ray/air/_internal/checkpointing.py b/python/ray/air/_internal/checkpointing.py index 5f1c0e14f..254c71e0a 100644 --- a/python/ray/air/_internal/checkpointing.py +++ b/python/ray/air/_internal/checkpointing.py @@ -6,7 +6,7 @@ import ray.cloudpickle as cpickle from ray.air.constants import PREPROCESSOR_KEY if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor def save_preprocessor_to_dir( diff --git a/python/ray/air/predictors/integrations/huggingface/huggingface_predictor.py b/python/ray/air/predictors/integrations/huggingface/huggingface_predictor.py index 0b5d356a9..606a21583 100644 --- a/python/ray/air/predictors/integrations/huggingface/huggingface_predictor.py +++ b/python/ray/air/predictors/integrations/huggingface/huggingface_predictor.py @@ -13,7 +13,7 @@ from ray.air.checkpoint import Checkpoint from ray.air._internal.checkpointing import load_preprocessor_from_dir if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor class HuggingFacePredictor(Predictor): diff --git a/python/ray/air/predictors/integrations/lightgbm/lightgbm_predictor.py b/python/ray/air/predictors/integrations/lightgbm/lightgbm_predictor.py index 844580a26..f6dd4257f 100644 --- a/python/ray/air/predictors/integrations/lightgbm/lightgbm_predictor.py +++ b/python/ray/air/predictors/integrations/lightgbm/lightgbm_predictor.py @@ -9,7 +9,7 @@ from ray.air.predictor import Predictor, DataBatchType from ray.train.lightgbm import load_checkpoint if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor class LightGBMPredictor(Predictor): diff --git a/python/ray/air/predictors/integrations/lightgbm/utils.py b/python/ray/air/predictors/integrations/lightgbm/utils.py index 90fb89a85..193c5d367 100644 --- a/python/ray/air/predictors/integrations/lightgbm/utils.py +++ b/python/ray/air/predictors/integrations/lightgbm/utils.py @@ -10,7 +10,7 @@ from ray.air._internal.checkpointing import ( ) if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor def to_air_checkpoint( diff --git a/python/ray/air/predictors/integrations/rl/rl_predictor.py b/python/ray/air/predictors/integrations/rl/rl_predictor.py index 2671c455d..74d74f705 100644 --- a/python/ray/air/predictors/integrations/rl/rl_predictor.py +++ b/python/ray/air/predictors/integrations/rl/rl_predictor.py @@ -9,7 +9,7 @@ from ray.rllib.policy.policy import Policy from ray.rllib.utils.typing import EnvType if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor class RLPredictor(Predictor): diff --git a/python/ray/air/predictors/integrations/sklearn/sklearn_predictor.py b/python/ray/air/predictors/integrations/sklearn/sklearn_predictor.py index b72d81473..b4d986525 100644 --- a/python/ray/air/predictors/integrations/sklearn/sklearn_predictor.py +++ b/python/ray/air/predictors/integrations/sklearn/sklearn_predictor.py @@ -13,7 +13,7 @@ from ray.util.joblib import register_ray from sklearn.base import BaseEstimator if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor class SklearnPredictor(Predictor): diff --git a/python/ray/air/predictors/integrations/sklearn/utils.py b/python/ray/air/predictors/integrations/sklearn/utils.py index 6e5ff21d7..b3dc30c6f 100644 --- a/python/ray/air/predictors/integrations/sklearn/utils.py +++ b/python/ray/air/predictors/integrations/sklearn/utils.py @@ -11,7 +11,7 @@ from ray.air._internal.checkpointing import ( import ray.cloudpickle as cpickle if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor def to_air_checkpoint( diff --git a/python/ray/air/predictors/integrations/tensorflow/tensorflow_predictor.py b/python/ray/air/predictors/integrations/tensorflow/tensorflow_predictor.py index 4c54569b9..4dd7cd102 100644 --- a/python/ray/air/predictors/integrations/tensorflow/tensorflow_predictor.py +++ b/python/ray/air/predictors/integrations/tensorflow/tensorflow_predictor.py @@ -9,7 +9,7 @@ from ray.train.data_parallel_trainer import _load_checkpoint from ray.air._internal.tensorflow_utils import convert_pandas_to_tf_tensor if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor class TensorflowPredictor(Predictor): diff --git a/python/ray/air/predictors/integrations/tensorflow/utils.py b/python/ray/air/predictors/integrations/tensorflow/utils.py index f12e42a17..16a1c5732 100644 --- a/python/ray/air/predictors/integrations/tensorflow/utils.py +++ b/python/ray/air/predictors/integrations/tensorflow/utils.py @@ -6,7 +6,7 @@ from ray.air.checkpoint import Checkpoint from ray.air.constants import MODEL_KEY, PREPROCESSOR_KEY if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor def to_air_checkpoint( diff --git a/python/ray/air/predictors/integrations/torch/torch_predictor.py b/python/ray/air/predictors/integrations/torch/torch_predictor.py index 0afd62557..a90cba483 100644 --- a/python/ray/air/predictors/integrations/torch/torch_predictor.py +++ b/python/ray/air/predictors/integrations/torch/torch_predictor.py @@ -10,7 +10,7 @@ from ray.train.torch import load_checkpoint from ray.air._internal.torch_utils import convert_pandas_to_torch_tensor if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor class TorchPredictor(Predictor): diff --git a/python/ray/air/predictors/integrations/torch/utils.py b/python/ray/air/predictors/integrations/torch/utils.py index d79d51ae0..35854d2f2 100644 --- a/python/ray/air/predictors/integrations/torch/utils.py +++ b/python/ray/air/predictors/integrations/torch/utils.py @@ -6,7 +6,7 @@ from ray.air.checkpoint import Checkpoint from ray.air.constants import MODEL_KEY, PREPROCESSOR_KEY if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor def to_air_checkpoint( diff --git a/python/ray/air/predictors/integrations/xgboost/utils.py b/python/ray/air/predictors/integrations/xgboost/utils.py index 7dce13f98..5627e1464 100644 --- a/python/ray/air/predictors/integrations/xgboost/utils.py +++ b/python/ray/air/predictors/integrations/xgboost/utils.py @@ -10,7 +10,7 @@ from ray.air._internal.checkpointing import ( ) if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor def to_air_checkpoint( diff --git a/python/ray/air/predictors/integrations/xgboost/xgboost_predictor.py b/python/ray/air/predictors/integrations/xgboost/xgboost_predictor.py index 60c447f0f..88e7fe7b7 100644 --- a/python/ray/air/predictors/integrations/xgboost/xgboost_predictor.py +++ b/python/ray/air/predictors/integrations/xgboost/xgboost_predictor.py @@ -9,7 +9,7 @@ from ray.air.predictor import Predictor, DataBatchType from ray.train.xgboost import load_checkpoint if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor class XGBoostPredictor(Predictor): diff --git a/python/ray/air/preprocessors/__init__.py b/python/ray/air/preprocessors/__init__.py deleted file mode 100644 index 28084bdb0..000000000 --- a/python/ray/air/preprocessors/__init__.py +++ /dev/null @@ -1,44 +0,0 @@ -from ray.air.preprocessors.batch_mapper import BatchMapper -from ray.air.preprocessors.chain import Chain -from ray.air.preprocessors.encoder import ( - Categorizer, - LabelEncoder, - MultiHotEncoder, - OneHotEncoder, - OrdinalEncoder, -) -from ray.air.preprocessors.hasher import FeatureHasher -from ray.air.preprocessors.imputer import SimpleImputer -from ray.air.preprocessors.normalizer import Normalizer -from ray.air.preprocessors.scaler import ( - StandardScaler, - MinMaxScaler, - MaxAbsScaler, - RobustScaler, -) -from ray.air.preprocessors.custom_stateful import CustomStatefulPreprocessor -from ray.air.preprocessors.tokenizer import Tokenizer -from ray.air.preprocessors.transformer import PowerTransformer -from ray.air.preprocessors.vectorizer import CountVectorizer, HashingVectorizer - -__all__ = [ - "BatchMapper", - "Categorizer", - "Chain", - "CountVectorizer", - "CustomStatefulPreprocessor", - "FeatureHasher", - "HashingVectorizer", - "LabelEncoder", - "MaxAbsScaler", - "MinMaxScaler", - "MultiHotEncoder", - "Normalizer", - "OneHotEncoder", - "OrdinalEncoder", - "PowerTransformer", - "RobustScaler", - "SimpleImputer", - "StandardScaler", - "Tokenizer", -] diff --git a/python/ray/air/tests/test_api.py b/python/ray/air/tests/test_api.py index d545a4f0d..4b156c762 100644 --- a/python/ray/air/tests/test_api.py +++ b/python/ray/air/tests/test_api.py @@ -4,7 +4,7 @@ import ray from ray.air import Checkpoint from ray.air.config import ScalingConfigDataClass from ray.train import BaseTrainer -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated diff --git a/python/ray/air/tests/test_dataset_config.py b/python/ray/air/tests/test_dataset_config.py index 2b2c5f895..4195cf3a2 100644 --- a/python/ray/air/tests/test_dataset_config.py +++ b/python/ray/air/tests/test_dataset_config.py @@ -8,7 +8,7 @@ from ray.air.config import DatasetConfig from ray import train from ray.train.data_parallel_trainer import DataParallelTrainer -from ray.air.preprocessors import BatchMapper +from ray.data.preprocessors import BatchMapper @pytest.fixture diff --git a/python/ray/air/tests/test_huggingface_predictor.py b/python/ray/air/tests/test_huggingface_predictor.py index 1c1f91b77..ef29e581c 100644 --- a/python/ray/air/tests/test_huggingface_predictor.py +++ b/python/ray/air/tests/test_huggingface_predictor.py @@ -10,7 +10,7 @@ from transformers import ( from transformers.pipelines import pipeline import ray -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from ray.air.predictors.integrations.huggingface import HuggingFacePredictor prompts = pd.DataFrame( diff --git a/python/ray/air/tests/test_lightgbm_predictor.py b/python/ray/air/tests/test_lightgbm_predictor.py index 6b0d1ec0f..e614ab397 100644 --- a/python/ray/air/tests/test_lightgbm_predictor.py +++ b/python/ray/air/tests/test_lightgbm_predictor.py @@ -5,7 +5,7 @@ from ray.air.predictors.integrations.lightgbm import ( LightGBMPredictor, to_air_checkpoint, ) -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from ray.air.checkpoint import Checkpoint from ray.air.constants import MODEL_KEY from ray.air._internal.checkpointing import save_preprocessor_to_dir diff --git a/python/ray/air/tests/test_rl_predictor.py b/python/ray/air/tests/test_rl_predictor.py index 3169b9354..41e8dd559 100644 --- a/python/ray/air/tests/test_rl_predictor.py +++ b/python/ray/air/tests/test_rl_predictor.py @@ -7,7 +7,7 @@ import pytest import tempfile from ray.air.predictors.integrations.rl.rl_predictor import RLPredictor -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from ray.air.checkpoint import Checkpoint from ray.train.rl import RLTrainer diff --git a/python/ray/air/tests/test_sklearn_predictor.py b/python/ray/air/tests/test_sklearn_predictor.py index d0a897099..f23b9ab41 100644 --- a/python/ray/air/tests/test_sklearn_predictor.py +++ b/python/ray/air/tests/test_sklearn_predictor.py @@ -9,7 +9,7 @@ from sklearn.ensemble import RandomForestClassifier import ray import ray.cloudpickle as cpickle from ray.air.predictors.integrations.sklearn import SklearnPredictor, to_air_checkpoint -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from ray.air.checkpoint import Checkpoint from ray.air.constants import MODEL_KEY from ray.air.batch_predictor import BatchPredictor diff --git a/python/ray/air/tests/test_tensorflow_predictor.py b/python/ray/air/tests/test_tensorflow_predictor.py index bde4ee227..c342d5c85 100644 --- a/python/ray/air/tests/test_tensorflow_predictor.py +++ b/python/ray/air/tests/test_tensorflow_predictor.py @@ -6,7 +6,7 @@ from ray.air.predictors.integrations.tensorflow import ( TensorflowPredictor, to_air_checkpoint, ) -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor import numpy as np import pandas as pd diff --git a/python/ray/air/tests/test_torch_predictor.py b/python/ray/air/tests/test_torch_predictor.py index 9eab278bc..8e6654234 100644 --- a/python/ray/air/tests/test_torch_predictor.py +++ b/python/ray/air/tests/test_torch_predictor.py @@ -5,7 +5,7 @@ import pandas as pd import torch from ray.air.predictors.integrations.torch import TorchPredictor, to_air_checkpoint -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from ray.air.checkpoint import Checkpoint from ray.air.constants import PREPROCESSOR_KEY, MODEL_KEY diff --git a/python/ray/air/tests/test_xgboost_predictor.py b/python/ray/air/tests/test_xgboost_predictor.py index 7c4dcec17..a71ab63e7 100644 --- a/python/ray/air/tests/test_xgboost_predictor.py +++ b/python/ray/air/tests/test_xgboost_predictor.py @@ -1,6 +1,6 @@ import os from ray.air.predictors.integrations.xgboost import XGBoostPredictor, to_air_checkpoint -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from ray.air.checkpoint import Checkpoint from ray.air.constants import MODEL_KEY import json diff --git a/python/ray/air/util/check_ingest.py b/python/ray/air/util/check_ingest.py index cbbea8447..df70d1617 100755 --- a/python/ray/air/util/check_ingest.py +++ b/python/ray/air/util/check_ingest.py @@ -7,7 +7,7 @@ import sys import ray from ray import train -from ray.air.preprocessors import Chain, BatchMapper +from ray.data.preprocessors import Chain, BatchMapper from ray.air.config import DatasetConfig from ray.train.data_parallel_trainer import DataParallelTrainer from ray.util.annotations import DeveloperAPI diff --git a/python/ray/data/BUILD b/python/ray/data/BUILD index d694c7bb6..907551896 100644 --- a/python/ray/data/BUILD +++ b/python/ray/data/BUILD @@ -12,8 +12,19 @@ SRCS = [] + select({ "//conditions:default": [], }) +py_test( + name = "test_preprocessors", + size = "small", + srcs = ["tests/test_preprocessors.py"], + tags = ["team:ml", "exclusive", "ray_air"], + deps = ["//:ray_lib"], +) + py_test_module_list( - files = glob(["tests/test_*.py"]), + files = glob( + include=["tests/test_*.py"], + exclude=["tests/test_preprocessors.py"] + ), size = "large", extra_srcs = SRCS, tags = ["team:core", "exclusive"], diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index 76d15f4dd..3347f25e3 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -29,6 +29,7 @@ from ray.data.dataset import Dataset from ray.data.dataset_pipeline import DatasetPipeline from ray.data._internal.progress_bar import set_progress_bars from ray.data._internal.compute import ActorPoolStrategy +from ray.data.preprocessor import Preprocessor # Module-level cached global functions (for impl/compute). It cannot be defined # in impl/compute since it has to be process-global across cloudpickled funcs. @@ -65,4 +66,5 @@ __all__ = [ "read_parquet", "read_parquet_bulk", "set_progress_bars", + "Preprocessor", ] diff --git a/python/ray/air/preprocessor.py b/python/ray/data/preprocessor.py similarity index 97% rename from python/ray/air/preprocessor.py rename to python/ray/data/preprocessor.py index 355cf5de8..a13be3a47 100644 --- a/python/ray/air/preprocessor.py +++ b/python/ray/data/preprocessor.py @@ -6,9 +6,9 @@ from ray.util.annotations import PublicAPI if TYPE_CHECKING: import pandas as pd + from ray.air.data_batch_type import DataBatchType from ray.data import Dataset -from ray.air.data_batch_type import DataBatchType @PublicAPI(stability="alpha") @@ -134,7 +134,7 @@ class Preprocessor(abc.ABC): self._transform_stats = transformed_ds.stats() return transformed_ds - def transform_batch(self, df: DataBatchType) -> DataBatchType: + def transform_batch(self, df: "DataBatchType") -> "DataBatchType": """Transform a single batch of data. Args: @@ -171,7 +171,7 @@ class Preprocessor(abc.ABC): # The default may be too small for some datasets and too large for others. return dataset.map_batches(self._transform_pandas, batch_format="pandas") - def _transform_batch(self, df: DataBatchType) -> DataBatchType: + def _transform_batch(self, df: "DataBatchType") -> "DataBatchType": import pandas as pd # TODO(matt): Add `_transform_arrow` to use based on input type. diff --git a/python/ray/data/preprocessors/__init__.py b/python/ray/data/preprocessors/__init__.py new file mode 100644 index 000000000..a80ea04ae --- /dev/null +++ b/python/ray/data/preprocessors/__init__.py @@ -0,0 +1,44 @@ +from ray.data.preprocessors.batch_mapper import BatchMapper +from ray.data.preprocessors.chain import Chain +from ray.data.preprocessors.custom_stateful import CustomStatefulPreprocessor +from ray.data.preprocessors.encoder import ( + Categorizer, + LabelEncoder, + MultiHotEncoder, + OneHotEncoder, + OrdinalEncoder, +) +from ray.data.preprocessors.hasher import FeatureHasher +from ray.data.preprocessors.imputer import SimpleImputer +from ray.data.preprocessors.normalizer import Normalizer +from ray.data.preprocessors.scaler import ( + StandardScaler, + MinMaxScaler, + MaxAbsScaler, + RobustScaler, +) +from ray.data.preprocessors.tokenizer import Tokenizer +from ray.data.preprocessors.transformer import PowerTransformer +from ray.data.preprocessors.vectorizer import CountVectorizer, HashingVectorizer + +__all__ = [ + "BatchMapper", + "Categorizer", + "CountVectorizer", + "Chain", + "CustomStatefulPreprocessor", + "FeatureHasher", + "HashingVectorizer", + "LabelEncoder", + "MaxAbsScaler", + "MinMaxScaler", + "MultiHotEncoder", + "Normalizer", + "OneHotEncoder", + "OrdinalEncoder", + "PowerTransformer", + "RobustScaler", + "SimpleImputer", + "StandardScaler", + "Tokenizer", +] diff --git a/python/ray/air/preprocessors/batch_mapper.py b/python/ray/data/preprocessors/batch_mapper.py similarity index 94% rename from python/ray/air/preprocessors/batch_mapper.py rename to python/ray/data/preprocessors/batch_mapper.py index 9d0232de8..eaca7726e 100644 --- a/python/ray/air/preprocessors/batch_mapper.py +++ b/python/ray/data/preprocessors/batch_mapper.py @@ -1,6 +1,6 @@ from typing import Callable, TYPE_CHECKING -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor if TYPE_CHECKING: import pandas diff --git a/python/ray/air/preprocessors/chain.py b/python/ray/data/preprocessors/chain.py similarity index 91% rename from python/ray/air/preprocessors/chain.py rename to python/ray/data/preprocessors/chain.py index 9ad9b1c83..d11852815 100644 --- a/python/ray/air/preprocessors/chain.py +++ b/python/ray/data/preprocessors/chain.py @@ -1,5 +1,9 @@ +from typing import TYPE_CHECKING from ray.data import Dataset -from ray.air.preprocessor import Preprocessor, DataBatchType +from ray.data.preprocessor import Preprocessor + +if TYPE_CHECKING: + from ray.air.data_batch_type import DataBatchType class Chain(Preprocessor): @@ -61,7 +65,7 @@ class Chain(Preprocessor): self._transform_stats = preprocessor.transform_stats() return ds - def _transform_batch(self, df: DataBatchType) -> DataBatchType: + def _transform_batch(self, df: "DataBatchType") -> "DataBatchType": for preprocessor in self.preprocessors: df = preprocessor.transform_batch(df) return df diff --git a/python/ray/air/preprocessors/custom_stateful.py b/python/ray/data/preprocessors/custom_stateful.py similarity index 96% rename from python/ray/air/preprocessors/custom_stateful.py rename to python/ray/data/preprocessors/custom_stateful.py index 1c58e9e44..801f4d6f0 100644 --- a/python/ray/air/preprocessors/custom_stateful.py +++ b/python/ray/data/preprocessors/custom_stateful.py @@ -1,6 +1,6 @@ from typing import Callable, TYPE_CHECKING, Dict -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from ray.data import Dataset if TYPE_CHECKING: @@ -22,7 +22,7 @@ class CustomStatefulPreprocessor(Preprocessor): import pandas as pd import ray.data from pandas import DataFrame - from ray.air.preprocessors import CustomStatefulPreprocessor + from ray.data.preprocessors import CustomStatefulPreprocessor from ray.data import Dataset from ray.data.aggregate import Max diff --git a/python/ray/air/preprocessors/encoder.py b/python/ray/data/preprocessors/encoder.py similarity index 98% rename from python/ray/air/preprocessors/encoder.py rename to python/ray/data/preprocessors/encoder.py index 5c290811c..5e4d368b2 100644 --- a/python/ray/air/preprocessors/encoder.py +++ b/python/ray/data/preprocessors/encoder.py @@ -6,7 +6,7 @@ import pandas as pd import pandas.api.types from ray.data import Dataset -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor class OrdinalEncoder(Preprocessor): @@ -25,7 +25,7 @@ class OrdinalEncoder(Preprocessor): .. code-block:: python import ray.data - from ray.air.preprocessors import OrdinalEncoder + from ray.data.preprocessors import OrdinalEncoder import pandas as pd batch = pd.DataFrame( { @@ -202,7 +202,7 @@ class MultiHotEncoder(Preprocessor): .. code-block:: python import ray.data - from ray.air.preprocessors import MultiHotEncoder + from ray.data.preprocessors import MultiHotEncoder import pandas as pd mhe = MultiHotEncoder(columns=["A", "B"]) batch = pd.DataFrame( diff --git a/python/ray/air/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py similarity index 95% rename from python/ray/air/preprocessors/hasher.py rename to python/ray/data/preprocessors/hasher.py index 30095a018..1687c68c2 100644 --- a/python/ray/air/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -3,9 +3,9 @@ from typing import List import pandas as pd -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor -from ray.air.preprocessors.utils import simple_hash +from ray.data.preprocessors.utils import simple_hash class FeatureHasher(Preprocessor): diff --git a/python/ray/air/preprocessors/imputer.py b/python/ray/data/preprocessors/imputer.py similarity index 98% rename from python/ray/air/preprocessors/imputer.py rename to python/ray/data/preprocessors/imputer.py index e38207cc5..996dc1961 100644 --- a/python/ray/air/preprocessors/imputer.py +++ b/python/ray/data/preprocessors/imputer.py @@ -6,7 +6,7 @@ import pandas as pd from ray.data import Dataset from ray.data.aggregate import Mean -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor class SimpleImputer(Preprocessor): diff --git a/python/ray/air/preprocessors/normalizer.py b/python/ray/data/preprocessors/normalizer.py similarity index 96% rename from python/ray/air/preprocessors/normalizer.py rename to python/ray/data/preprocessors/normalizer.py index 8e4baabeb..ac765a8b1 100644 --- a/python/ray/air/preprocessors/normalizer.py +++ b/python/ray/data/preprocessors/normalizer.py @@ -3,7 +3,7 @@ from typing import List import numpy as np import pandas as pd -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor class Normalizer(Preprocessor): diff --git a/python/ray/air/preprocessors/scaler.py b/python/ray/data/preprocessors/scaler.py similarity index 99% rename from python/ray/air/preprocessors/scaler.py rename to python/ray/data/preprocessors/scaler.py index 4cad10d89..b5b527147 100644 --- a/python/ray/air/preprocessors/scaler.py +++ b/python/ray/data/preprocessors/scaler.py @@ -5,7 +5,7 @@ import pandas as pd from ray.data import Dataset from ray.data.aggregate import Mean, Std, Min, Max, AbsMax -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor class StandardScaler(Preprocessor): diff --git a/python/ray/air/preprocessors/tokenizer.py b/python/ray/data/preprocessors/tokenizer.py similarity index 91% rename from python/ray/air/preprocessors/tokenizer.py rename to python/ray/data/preprocessors/tokenizer.py index e7add283b..12f7d8ac8 100644 --- a/python/ray/air/preprocessors/tokenizer.py +++ b/python/ray/data/preprocessors/tokenizer.py @@ -2,8 +2,8 @@ from typing import List, Callable, Optional import pandas as pd -from ray.air.preprocessor import Preprocessor -from ray.air.preprocessors.utils import simple_split_tokenizer +from ray.data.preprocessor import Preprocessor +from ray.data.preprocessors.utils import simple_split_tokenizer class Tokenizer(Preprocessor): diff --git a/python/ray/air/preprocessors/transformer.py b/python/ray/data/preprocessors/transformer.py similarity index 98% rename from python/ray/air/preprocessors/transformer.py rename to python/ray/data/preprocessors/transformer.py index 78fd852a9..20373a9e6 100644 --- a/python/ray/air/preprocessors/transformer.py +++ b/python/ray/data/preprocessors/transformer.py @@ -3,7 +3,7 @@ from typing import List import numpy as np import pandas as pd -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor class PowerTransformer(Preprocessor): diff --git a/python/ray/air/preprocessors/utils.py b/python/ray/data/preprocessors/utils.py similarity index 100% rename from python/ray/air/preprocessors/utils.py rename to python/ray/data/preprocessors/utils.py diff --git a/python/ray/air/preprocessors/vectorizer.py b/python/ray/data/preprocessors/vectorizer.py similarity index 97% rename from python/ray/air/preprocessors/vectorizer.py rename to python/ray/data/preprocessors/vectorizer.py index 2d2e93697..7485dd6d1 100644 --- a/python/ray/air/preprocessors/vectorizer.py +++ b/python/ray/data/preprocessors/vectorizer.py @@ -4,8 +4,8 @@ from typing import List, Callable, Optional import pandas as pd from ray.data import Dataset -from ray.air.preprocessor import Preprocessor -from ray.air.preprocessors.utils import simple_split_tokenizer, simple_hash +from ray.data.preprocessor import Preprocessor +from ray.data.preprocessors.utils import simple_split_tokenizer, simple_hash class HashingVectorizer(Preprocessor): diff --git a/python/ray/air/tests/test_preprocessors.py b/python/ray/data/tests/test_preprocessors.py similarity index 98% rename from python/ray/air/tests/test_preprocessors.py rename to python/ray/data/tests/test_preprocessors.py index 48511a670..7e7cc6948 100644 --- a/python/ray/air/tests/test_preprocessors.py +++ b/python/ray/data/tests/test_preprocessors.py @@ -8,8 +8,8 @@ import pandas as pd import pytest import ray from pandas import DataFrame -from ray.air.preprocessor import PreprocessorNotFittedException -from ray.air.preprocessors import ( +from ray.data.preprocessor import PreprocessorNotFittedException +from ray.data.preprocessors import ( BatchMapper, StandardScaler, MinMaxScaler, @@ -20,14 +20,14 @@ from ray.air.preprocessors import ( Chain, CustomStatefulPreprocessor, ) -from ray.air.preprocessors.encoder import Categorizer, MultiHotEncoder -from ray.air.preprocessors.hasher import FeatureHasher -from ray.air.preprocessors.normalizer import Normalizer -from ray.air.preprocessors.scaler import MaxAbsScaler, RobustScaler -from ray.air.preprocessors.tokenizer import Tokenizer -from ray.air.preprocessors.transformer import PowerTransformer -from ray.air.preprocessors.utils import simple_split_tokenizer, simple_hash -from ray.air.preprocessors.vectorizer import CountVectorizer, HashingVectorizer +from ray.data.preprocessors.encoder import Categorizer, MultiHotEncoder +from ray.data.preprocessors.hasher import FeatureHasher +from ray.data.preprocessors.normalizer import Normalizer +from ray.data.preprocessors.scaler import MaxAbsScaler, RobustScaler +from ray.data.preprocessors.tokenizer import Tokenizer +from ray.data.preprocessors.transformer import PowerTransformer +from ray.data.preprocessors.utils import simple_split_tokenizer, simple_hash +from ray.data.preprocessors.vectorizer import CountVectorizer, HashingVectorizer from ray.data import Dataset from ray.data.aggregate import Max diff --git a/python/ray/train/_internal/dataset_spec.py b/python/ray/train/_internal/dataset_spec.py index 1dd8441cc..3175824db 100644 --- a/python/ray/train/_internal/dataset_spec.py +++ b/python/ray/train/_internal/dataset_spec.py @@ -7,7 +7,7 @@ from ray.air.config import DatasetConfig if TYPE_CHECKING: from ray.data import Dataset, DatasetPipeline - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor RayDataset = Union["Dataset", "DatasetPipeline"] diff --git a/python/ray/train/base_trainer.py b/python/ray/train/base_trainer.py index d2029c565..aada0a220 100644 --- a/python/ray/train/base_trainer.py +++ b/python/ray/train/base_trainer.py @@ -22,7 +22,7 @@ from ray.util.ml_utils.dict import merge_dicts if TYPE_CHECKING: from ray.data import Dataset - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor # A type representing either a ray.data.Dataset or a function that returns a # ray.data.Dataset and accepts no arguments. @@ -57,7 +57,7 @@ class BaseTrainer(abc.ABC): specified here. - ``trainer.preprocess_datasets()``: The provided ray.data.Dataset are preprocessed with the provided - ray.air.preprocessor. + ray.data.Preprocessor. - ``trainer.train_loop()``: Executes the main training logic. - Calling ``trainer.fit()`` will return a ``ray.result.Result`` object where you can access metrics from your training run, as well @@ -200,10 +200,10 @@ class BaseTrainer(abc.ABC): ) # Preprocessor if self.preprocessor is not None and not isinstance( - self.preprocessor, ray.air.preprocessor.Preprocessor + self.preprocessor, ray.data.Preprocessor ): raise ValueError( - f"`preprocessor` should be an instance of `ray.air.Preprocessor`, " + f"`preprocessor` should be an instance of `ray.data.Preprocessor`, " f"found {type(self.preprocessor)} with value `{self.preprocessor}`." ) diff --git a/python/ray/train/data_parallel_trainer.py b/python/ray/train/data_parallel_trainer.py index b85554087..63a6390dd 100644 --- a/python/ray/train/data_parallel_trainer.py +++ b/python/ray/train/data_parallel_trainer.py @@ -32,7 +32,7 @@ from ray.util.annotations import DeveloperAPI from ray.util.ml_utils.checkpoint_manager import CheckpointStrategy, _TrackedCheckpoint if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor logger = logging.getLogger(__name__) @@ -217,7 +217,7 @@ class DataParallelTrainer(BaseTrainer): dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the ``preprocessor`` if one is provided. - preprocessor: A ray.air.preprocessor.Preprocessor to preprocess the + preprocessor: A ray.data.Preprocessor to preprocess the provided datasets. resume_from_checkpoint: A checkpoint to resume training from. """ diff --git a/python/ray/train/gbdt_trainer.py b/python/ray/train/gbdt_trainer.py index a2f6709da..ea70387f2 100644 --- a/python/ray/train/gbdt_trainer.py +++ b/python/ray/train/gbdt_trainer.py @@ -13,7 +13,7 @@ from ray.train.constants import MODEL_KEY, TRAIN_DATASET_KEY if TYPE_CHECKING: import xgboost_ray - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor def _convert_scaling_config_to_ray_params( @@ -58,7 +58,7 @@ class GBDTTrainer(BaseTrainer): :class:`xgboost_ray.RayDMatrix` initializations. scaling_config: Configuration for how to scale data parallel training. run_config: Configuration for the execution of the training run. - preprocessor: A ray.air.preprocessor.Preprocessor to preprocess the + preprocessor: A ray.data.Preprocessor to preprocess the provided datasets. resume_from_checkpoint: A checkpoint to resume training from. **train_kwargs: Additional kwargs passed to framework ``train()`` function. diff --git a/python/ray/train/horovod/horovod_trainer.py b/python/ray/train/horovod/horovod_trainer.py index 6dbc172db..3b5e6e76c 100644 --- a/python/ray/train/horovod/horovod_trainer.py +++ b/python/ray/train/horovod/horovod_trainer.py @@ -9,7 +9,7 @@ from ray.train.data_parallel_trainer import DataParallelTrainer from ray.train.horovod.config import HorovodConfig if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor class HorovodTrainer(DataParallelTrainer): @@ -160,7 +160,7 @@ class HorovodTrainer(DataParallelTrainer): dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the ``preprocessor`` if one is provided. - preprocessor: A ray.air.preprocessor.Preprocessor to preprocess the + preprocessor: A ray.data.Preprocessor to preprocess the provided datasets. resume_from_checkpoint: A checkpoint to resume training from. """ diff --git a/python/ray/train/huggingface/huggingface_trainer.py b/python/ray/train/huggingface/huggingface_trainer.py index a704b180a..2b400de3d 100644 --- a/python/ray/train/huggingface/huggingface_trainer.py +++ b/python/ray/train/huggingface/huggingface_trainer.py @@ -46,7 +46,7 @@ from ray.tune.trainable import Trainable from ray.tune.utils.file_transfer import delete_on_node, sync_dir_between_nodes if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor # This trainer uses a special checkpoint syncing logic. # Because HF checkpoints are very large dirs (at least several GBs), @@ -254,7 +254,7 @@ class HuggingFaceTrainer(TorchTrainer): scaling_config: Configuration for how to scale data parallel training. dataset_config: Configuration for dataset ingest. run_config: Configuration for the execution of the training run. - preprocessor: A ray.air.preprocessor.Preprocessor to preprocess the + preprocessor: A ray.data.Preprocessor to preprocess the provided datasets. resume_from_checkpoint: A checkpoint to resume training from. """ diff --git a/python/ray/train/lightgbm/lightgbm_trainer.py b/python/ray/train/lightgbm/lightgbm_trainer.py index fc017b1ff..afbbed8a3 100644 --- a/python/ray/train/lightgbm/lightgbm_trainer.py +++ b/python/ray/train/lightgbm/lightgbm_trainer.py @@ -12,7 +12,7 @@ import lightgbm_ray from lightgbm_ray.tune import TuneReportCheckpointCallback if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor @PublicAPI(stability="alpha") @@ -61,7 +61,7 @@ class LightGBMTrainer(GBDTTrainer): can be used to add sample weights with the ``weights`` parameter. scaling_config: Configuration for how to scale data parallel training. run_config: Configuration for the execution of the training run. - preprocessor: A ray.air.preprocessor.Preprocessor to preprocess the + preprocessor: A ray.data.Preprocessor to preprocess the provided datasets. resume_from_checkpoint: A checkpoint to resume training from. **train_kwargs: Additional kwargs passed to ``lightgbm.train()`` function. diff --git a/python/ray/train/rl/rl_trainer.py b/python/ray/train/rl/rl_trainer.py index b7380507f..808a3d140 100644 --- a/python/ray/train/rl/rl_trainer.py +++ b/python/ray/train/rl/rl_trainer.py @@ -21,7 +21,7 @@ from ray.util.annotations import PublicAPI from ray.util.ml_utils.dict import merge_dicts if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor RL_TRAINER_CLASS_FILE = "trainer_class.pkl" RL_CONFIG_FILE = "config.pkl" diff --git a/python/ray/train/sklearn/sklearn_trainer.py b/python/ray/train/sklearn/sklearn_trainer.py index d0c339aa0..9023abcce 100644 --- a/python/ray/train/sklearn/sklearn_trainer.py +++ b/python/ray/train/sklearn/sklearn_trainer.py @@ -32,7 +32,7 @@ from sklearn.model_selection import BaseCrossValidator, cross_validate from sklearn.model_selection._validation import _check_multimetric_scoring, _score if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor logger = logging.getLogger(__name__) @@ -152,7 +152,7 @@ class SklearnTrainer(BaseTrainer): as the training is not distributed. dataset_config: Configuration for dataset ingest. run_config: Configuration for the execution of the training run. - preprocessor: A ray.air.preprocessor.Preprocessor to preprocess the + preprocessor: A ray.data.Preprocessor to preprocess the provided datasets. **fit_params: Additional kwargs passed to ``estimator.fit()`` method. diff --git a/python/ray/train/tensorflow/tensorflow_trainer.py b/python/ray/train/tensorflow/tensorflow_trainer.py index d23e11a13..c66084ea9 100644 --- a/python/ray/train/tensorflow/tensorflow_trainer.py +++ b/python/ray/train/tensorflow/tensorflow_trainer.py @@ -9,7 +9,7 @@ from ray.air.checkpoint import Checkpoint from ray.util import PublicAPI if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor @PublicAPI(stability="alpha") @@ -153,7 +153,7 @@ class TensorflowTrainer(DataParallelTrainer): dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the ``preprocessor`` if one is provided. - preprocessor: A ray.air.preprocessor.Preprocessor to preprocess the + preprocessor: A ray.data.Preprocessor to preprocess the provided datasets. resume_from_checkpoint: A checkpoint to resume training from. """ diff --git a/python/ray/train/tests/test_base_trainer.py b/python/ray/train/tests/test_base_trainer.py index 76dc917e4..565e7b33f 100644 --- a/python/ray/train/tests/test_base_trainer.py +++ b/python/ray/train/tests/test_base_trainer.py @@ -3,7 +3,7 @@ import pytest import ray from ray import tune -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from ray.train.trainer import BaseTrainer from ray.util.placement_group import get_current_placement_group diff --git a/python/ray/train/tests/test_data_parallel_trainer.py b/python/ray/train/tests/test_data_parallel_trainer.py index 37a8aa2ff..331ccad35 100644 --- a/python/ray/train/tests/test_data_parallel_trainer.py +++ b/python/ray/train/tests/test_data_parallel_trainer.py @@ -5,8 +5,8 @@ from ray import train, tune from ray.air.checkpoint import Checkpoint from ray.train.constants import PREPROCESSOR_KEY +from ray.data.preprocessor import Preprocessor from ray.train.data_parallel_trainer import DataParallelTrainer -from ray.air.preprocessor import Preprocessor from ray.tune.tune_config import TuneConfig from ray.tune.tuner import Tuner diff --git a/python/ray/train/tests/test_lightgbm_trainer.py b/python/ray/train/tests/test_lightgbm_trainer.py index 906479960..66db92fe4 100644 --- a/python/ray/train/tests/test_lightgbm_trainer.py +++ b/python/ray/train/tests/test_lightgbm_trainer.py @@ -9,8 +9,8 @@ from ray import tune from ray.air.checkpoint import Checkpoint from ray.train.constants import TRAIN_DATASET_KEY +from ray.data.preprocessor import Preprocessor from ray.train.lightgbm import LightGBMTrainer, load_checkpoint -from ray.air.preprocessor import Preprocessor from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split diff --git a/python/ray/train/tests/test_sklearn_trainer.py b/python/ray/train/tests/test_sklearn_trainer.py index f273cdc23..39fc39f35 100644 --- a/python/ray/train/tests/test_sklearn_trainer.py +++ b/python/ray/train/tests/test_sklearn_trainer.py @@ -7,7 +7,7 @@ from ray.air.checkpoint import Checkpoint from ray.train.constants import TRAIN_DATASET_KEY from ray.train.sklearn import SklearnTrainer, load_checkpoint -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split diff --git a/python/ray/train/tests/test_xgboost_trainer.py b/python/ray/train/tests/test_xgboost_trainer.py index d9ed7788b..bd9528481 100644 --- a/python/ray/train/tests/test_xgboost_trainer.py +++ b/python/ray/train/tests/test_xgboost_trainer.py @@ -10,7 +10,7 @@ from ray.air.checkpoint import Checkpoint from ray.train.constants import TRAIN_DATASET_KEY from ray.train.xgboost import XGBoostTrainer, load_checkpoint -from ray.air.preprocessor import Preprocessor +from ray.data.preprocessor import Preprocessor from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split diff --git a/python/ray/train/torch/torch_trainer.py b/python/ray/train/torch/torch_trainer.py index 453a8982a..dc00cc748 100644 --- a/python/ray/train/torch/torch_trainer.py +++ b/python/ray/train/torch/torch_trainer.py @@ -10,7 +10,7 @@ from ray.air._internal.torch_utils import load_torch_model from ray.util import PublicAPI if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor @PublicAPI(stability="alpha") @@ -163,7 +163,7 @@ class TorchTrainer(DataParallelTrainer): dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the ``preprocessor`` if one is provided. - preprocessor: A ``ray.air.preprocessor.Preprocessor`` to preprocess the + preprocessor: A ``ray.data.Preprocessor`` to preprocess the provided datasets. resume_from_checkpoint: A checkpoint to resume training from. """ diff --git a/python/ray/train/xgboost/xgboost_trainer.py b/python/ray/train/xgboost/xgboost_trainer.py index f99f3b3d1..a3a89fb29 100644 --- a/python/ray/train/xgboost/xgboost_trainer.py +++ b/python/ray/train/xgboost/xgboost_trainer.py @@ -12,7 +12,7 @@ import xgboost_ray from xgboost_ray.tune import TuneReportCheckpointCallback if TYPE_CHECKING: - from ray.air.preprocessor import Preprocessor + from ray.data.preprocessor import Preprocessor @PublicAPI(stability="alpha") @@ -57,7 +57,7 @@ class XGBoostTrainer(GBDTTrainer): be used to add sample weights with the ``weights`` parameter. scaling_config: Configuration for how to scale data parallel training. run_config: Configuration for the execution of the training run. - preprocessor: A ray.air.preprocessor.Preprocessor to preprocess the + preprocessor: A ray.data.Preprocessor to preprocess the provided datasets. resume_from_checkpoint: A checkpoint to resume training from. **train_kwargs: Additional kwargs passed to ``xgboost.train()`` function.