From 2e05b6223689a6215c49adfa00b5013ef8ec30dd Mon Sep 17 00:00:00 2001 From: matthewdeng Date: Fri, 3 Jun 2022 11:43:51 -0700 Subject: [PATCH] [AIR] Preprocessors feature guide (#25302) --- doc/source/_toc.yml | 1 + doc/source/ray-air/doc_code/preprocessors.py | 136 +++++++++++++ .../ray-air/images/air-preprocessor.svg | 4 + doc/source/ray-air/key-concepts.rst | 1 + doc/source/ray-air/package-ref.rst | 2 + doc/source/ray-air/preprocessors.rst | 192 ++++++++++++++++++ 6 files changed, 336 insertions(+) create mode 100644 doc/source/ray-air/doc_code/preprocessors.py create mode 100644 doc/source/ray-air/images/air-preprocessor.svg create mode 100644 doc/source/ray-air/preprocessors.rst diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml index a156771ba..d7572d1a9 100644 --- a/doc/source/_toc.yml +++ b/doc/source/_toc.yml @@ -180,6 +180,7 @@ parts: - file: ray-air/key-concepts - file: ray-air/deployment - file: ray-air/check-ingest + - file: ray-air/preprocessors - file: ray-air/examples/index sections: - file: ray-air/examples/analyze_tuning_results diff --git a/doc/source/ray-air/doc_code/preprocessors.py b/doc/source/ray-air/doc_code/preprocessors.py new file mode 100644 index 000000000..4a1601a63 --- /dev/null +++ b/doc/source/ray-air/doc_code/preprocessors.py @@ -0,0 +1,136 @@ +# flake8: noqa + + +# __preprocessor_setup_start__ +import pandas as pd +import ray +from ray.ml.preprocessors import MinMaxScaler + +# Generate two simple datasets. +dataset = ray.data.range_table(8) +dataset1, dataset2 = dataset.split(2) + +print(dataset1.take()) +# [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}] + +print(dataset2.take()) +# [{'value': 4}, {'value': 5}, {'value': 6}, {'value': 7}] +# __preprocessor_setup_end__ + + +# __preprocessor_fit_transform_start__ +# Fit the preprocessor on dataset1, and transform both dataset1 and dataset2. 
+preprocessor = MinMaxScaler(["value"])
+
+dataset1_transformed = preprocessor.fit_transform(dataset1)
+print(dataset1_transformed.take())
+# [{'value': 0.0}, {'value': 0.3333333333333333}, {'value': 0.6666666666666666}, {'value': 1.0}]
+
+dataset2_transformed = preprocessor.transform(dataset2)
+print(dataset2_transformed.take())
+# [{'value': 1.3333333333333333}, {'value': 1.6666666666666667}, {'value': 2.0}, {'value': 2.3333333333333335}]
+# __preprocessor_fit_transform_end__
+
+
+# __preprocessor_transform_batch_start__
+batch = pd.DataFrame({"value": list(range(8, 12))})
+batch_transformed = preprocessor.transform_batch(batch)
+print(batch_transformed)
+#       value
+# 0  2.666667
+# 1  3.000000
+# 2  3.333333
+# 3  3.666667
+# __preprocessor_transform_batch_end__
+
+
+# __trainer_start__
+import ray
+
+from ray.ml.train.integrations.xgboost import XGBoostTrainer
+from ray.ml.preprocessors import MinMaxScaler
+
+train_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(0, 32, 3)])
+valid_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(1, 32, 3)])
+
+preprocessor = MinMaxScaler(["x"])
+
+trainer = XGBoostTrainer(
+    label_column="y",
+    params={"objective": "reg:squarederror"},
+    scaling_config={"num_workers": 2},
+    datasets={"train": train_dataset, "valid": valid_dataset},
+    preprocessor=preprocessor,
+)
+result = trainer.fit()
+# __trainer_end__
+
+
+# __checkpoint_start__
+from ray.ml.utils.checkpointing import load_preprocessor_from_dir
+
+checkpoint = result.checkpoint
+with checkpoint.as_directory() as checkpoint_path:
+    preprocessor = load_preprocessor_from_dir(checkpoint_path)
+    print(preprocessor)
+# MinMaxScaler(columns=['x'], stats={'min(x)': 0, 'max(x)': 30})
+# __checkpoint_end__
+
+
+# __predictor_start__
+from ray.ml.batch_predictor import BatchPredictor
+from ray.ml.predictors.integrations.xgboost import XGBoostPredictor
+
+test_dataset = ray.data.from_items([{"x": x} for x in range(2, 32, 3)])
+
+batch_predictor = 
BatchPredictor.from_checkpoint(checkpoint, XGBoostPredictor) +predicted_labels = batch_predictor.predict(test_dataset) +print(predicted_labels.to_pandas()) +# predictions +# 0 0.098437 +# 1 5.604667 +# 2 11.405312 +# 3 15.684700 +# 4 23.990948 +# 5 29.900211 +# 6 34.599442 +# 7 40.696899 +# 8 45.681076 +# 9 50.290031 +# __predictor_end__ + + +# __chain_start__ +import ray +from ray.ml.preprocessors import Chain, MinMaxScaler, SimpleImputer + +# Generate one simple dataset. +dataset = ray.data.from_items( + [{"value": 0}, {"value": 1}, {"value": 2}, {"value": 3}, {"value": None}] +) +print(dataset.take()) +# [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}, {'value': None}] + +preprocessor = Chain(SimpleImputer(["value"]), MinMaxScaler(["value"])) + +dataset_transformed = preprocessor.fit_transform(dataset) +print(dataset_transformed.take()) +# [{'value': 0.0}, {'value': 0.3333333333333333}, {'value': 0.6666666666666666}, {'value': 1.0}, {'value': 0.5}] +# __chain_end__ + + +# __custom_stateless_start__ +import ray +from ray.ml.preprocessors import BatchMapper + +# Generate a simple dataset. +dataset = ray.data.range_table(4) +print(dataset.take()) +# [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}] + +# Create a stateless preprocess that multiplies values by 2. +preprocessor = BatchMapper(lambda df: df * 2) +dataset_transformed = preprocessor.transform(dataset) +print(dataset_transformed.take()) +# [{'value': 0}, {'value': 2}, {'value': 4}, {'value': 6}] +# __custom_stateless_end__ diff --git a/doc/source/ray-air/images/air-preprocessor.svg b/doc/source/ray-air/images/air-preprocessor.svg new file mode 100644 index 000000000..9425ef20f --- /dev/null +++ b/doc/source/ray-air/images/air-preprocessor.svg @@ -0,0 +1,4 @@ + + + +
Predictor.predict()
Predictor.predict()
Trainer.fit()
Trainer.fit()
train_dataset
train_dataset
other_datasets
other_datasets
1. fit_transform()
1. fit_transform()
2. transform()
2. transform()
3. save fitted preprocessor
3. save fitted preprocessor
Preprocessor
Preprocessor
4. Predictor.from_checkpoint()
4. Predictor.from_checkpoint()
Checkpoint
Checkpoint
inference_batch
inference_batch
5. transform_batch()
5. transform_batch()
Preprocessor
Preprocessor
Text is not SVG - cannot display
\ No newline at end of file diff --git a/doc/source/ray-air/key-concepts.rst b/doc/source/ray-air/key-concepts.rst index 72b537f9a..83edeee93 100644 --- a/doc/source/ray-air/key-concepts.rst +++ b/doc/source/ray-air/key-concepts.rst @@ -71,6 +71,7 @@ You can take a trained model and do batch inference using the BatchPredictor obj :start-after: __air_batch_predictor_start__ :end-before: __air_batch_predictor_end__ +.. _air-key-concepts-online-inference: Online Inference ---------------- diff --git a/doc/source/ray-air/package-ref.rst b/doc/source/ray-air/package-ref.rst index e2333fc59..30adf60b0 100644 --- a/doc/source/ray-air/package-ref.rst +++ b/doc/source/ray-air/package-ref.rst @@ -81,6 +81,8 @@ Predictors .. autoclass:: ray.ml.predictor.Predictor :members: +.. autoclass:: ray.ml.predictor.DataBatchType + .. autoclass:: ray.ml.batch_predictor.BatchPredictor :members: diff --git a/doc/source/ray-air/preprocessors.rst b/doc/source/ray-air/preprocessors.rst new file mode 100644 index 000000000..47be6ea41 --- /dev/null +++ b/doc/source/ray-air/preprocessors.rst @@ -0,0 +1,192 @@ +.. _air-preprocessors: + +Preprocessing Data +================== + +This page describes how to perform data preprocessing in Ray AIR. + +Data preprocessing is a common technique for transforming raw data into features that will be input to a machine learning model. +In general, you may want to apply the same preprocessing logic to your offline training data and online inference data. +Ray AIR provides several common preprocessors out of the box as well as interfaces that enable you to define your own custom logic. + +Overview +-------- + +Ray AIR exposes a ``Preprocessor`` class for preprocessing. The ``Preprocessor`` has four methods that make up its core interface. + +#. ``fit()``: Compute state information about a :class:`Dataset ` (e.g. the mean or standard deviation of a column) + and save it to the ``Preprocessor``. This information should then be used to perform ``transform()``. 
+   *This is typically called on the training dataset.*
+#. ``transform()``: Apply a transformation to a ``Dataset``.
+   If the ``Preprocessor`` is stateful, then ``fit()`` must be called first.
+   *This is typically called on the training, validation, test datasets.*
+#. ``transform_batch()``: Apply a transformation to a single :class:`batch <ray.ml.predictor.DataBatchType>` of data.
+   *This is typically called on online or offline inference data.*
+#. ``fit_transform()``: Syntactic sugar for calling both ``fit()`` and ``transform()`` on a ``Dataset``.
+
+To show these in action, let's walk through a basic example. First we'll set up two simple Ray ``Dataset``\s.
+
+.. literalinclude:: doc_code/preprocessors.py
+    :language: python
+    :start-after: __preprocessor_setup_start__
+    :end-before: __preprocessor_setup_end__
+
+Next, ``fit`` the ``Preprocessor`` on one ``Dataset``, and ``transform`` both ``Dataset``\s with this fitted information.
+
+.. literalinclude:: doc_code/preprocessors.py
+    :language: python
+    :start-after: __preprocessor_fit_transform_start__
+    :end-before: __preprocessor_fit_transform_end__
+
+Finally, call ``transform_batch`` on a single batch of data.
+
+.. literalinclude:: doc_code/preprocessors.py
+    :language: python
+    :start-after: __preprocessor_transform_batch_start__
+    :end-before: __preprocessor_transform_batch_end__
+
+Life of an AIR Preprocessor
+---------------------------
+
+Now that we've gone over the basics, let's dive into how ``Preprocessor``\s fit into an end-to-end application built with AIR.
+The diagram below depicts an overview of the main steps of a ``Preprocessor``:
+
+#. Passed into a ``Trainer`` to ``fit`` and ``transform`` input ``Dataset``\s.
+#. Saved as a ``Checkpoint``.
+#. Reconstructed in a ``Predictor`` to ``transform_batch`` on batches of data.
+
+.. figure:: images/air-preprocessor.svg
+
+Throughout this section we'll go through this workflow in more detail, with code examples using XGBoost.
+The same logic is applicable to other integrations as well.
+ +Trainer +~~~~~~~ + +The journey of the ``Preprocessor`` starts with the :class:`Trainer `. +If the ``Trainer`` is instantiated with a ``Preprocessor``, then the following logic will be executed when ``Trainer.fit()`` is called: + +#. If a ``"train"`` ``Dataset`` is passed in, then the ``Preprocessor`` will call ``fit()`` on it. +#. The ``Preprocessor`` will then call ``transform()`` on *all* ``Dataset``\s, including the ``"train"`` ``Dataset``. +#. The ``Trainer`` will then perform training on the preprocessed ``Dataset``\s. + +.. literalinclude:: doc_code/preprocessors.py + :language: python + :start-after: __trainer_start__ + :end-before: __trainer_end__ + +.. note:: + + If you're passing a ``Preprocessor`` that is already fitted, it will be refitted on the ``"train"`` ``Dataset``. + Adding the functionality to support passing in a fitted Preprocessor is being tracked + `here `__. + +.. TODO: Remove the note above once the issue is resolved. + +Tune +~~~~ + +If you're using ``Ray Tune`` for hyperparameter optimization, be aware that each ``Trial`` will instantiate its own copy of +the ``Preprocessor`` and the fitting and transformation logic will occur once per ``Trial``. + +Checkpoint +~~~~~~~~~~ + +``Trainer.fit()`` returns a ``Results`` object which contains a ``Checkpoint``. +If a ``Preprocessor`` was passed into the ``Trainer``, then it will be saved in the ``Checkpoint`` along with any fitted state. + +As a sanity check, let's confirm the ``Preprocessor`` is available in the ``Checkpoint``. In practice you should not need to do this. + +.. literalinclude:: doc_code/preprocessors.py + :language: python + :start-after: __checkpoint_start__ + :end-before: __checkpoint_end__ + + +Predictor +~~~~~~~~~ + +A ``Predictor`` can be constructed from a saved ``Checkpoint``. If the ``Checkpoint`` contains a ``Preprocessor``, +then the ``Preprocessor`` will be used to call ``transform_batch`` on input batches prior to performing inference. 
+ +In the following example, we show the Batch Predictor flow. The same logic applies to the :ref:`Online Inference flow `. + +.. literalinclude:: doc_code/preprocessors.py + :language: python + :start-after: __predictor_start__ + :end-before: __predictor_end__ + +Types of Preprocessors +---------------------- + +Basic Preprocessors +~~~~~~~~~~~~~~~~~~~ + +Ray AIR provides a handful of ``Preprocessor``\s that you can use out of the box, and more will be added over time. +`Contributions `__ are welcome! + +.. tabbed:: Common APIs + + #. :class:`Preprocessor ` + #. :class:`BatchMapper ` + #. :class:`Chain ` + +.. tabbed:: Tabular + + #. :class:`Categorizer ` + #. :class:`FeatureHasher ` + #. :class:`LabelEncoder ` + #. :class:`MaxAbsScaler ` + #. :class:`MinMaxScaler ` + #. :class:`Normalizer ` + #. :class:`OneHotEncoder ` + #. :class:`OrdinalEncoder ` + #. :class:`PowerTransformer ` + #. :class:`RobustScaler ` + #. :class:`SimpleImputer ` + #. :class:`StandardScaler ` + #. :class:`SimpleImputer ` + +.. tabbed:: Text + + #. :class:`CountVectorizer ` + #. :class:`HashingVectorizer ` + #. :class:`Tokenizer ` + +.. tabbed:: Image + + Coming soon! + +.. tabbed:: Utilities + + #. :func:`train_test_split ` + +Chaining Preprocessors +~~~~~~~~~~~~~~~~~~~~~~ + +More often than not, your preprocessing logic will contain multiple logical steps or apply different transformations to each column. +A simple ``Chain`` ``Preprocessor`` is provided which can be used to apply individual ``Preprocessor`` operations sequentially. + +.. literalinclude:: doc_code/preprocessors.py + :language: python + :start-after: __chain_start__ + :end-before: __chain_end__ + +.. tip:: + + Keep in mind that the operations are sequential. For example, if you define a ``Preprocessor`` + ``Chain([preprocessorA, preprocessorB])``, then ``preprocessorB.transform()`` will be applied + to the result of ``preprocessorA.transform()``. 
+ +Custom Preprocessors +~~~~~~~~~~~~~~~~~~~~ + +**Stateless Preprocessors:** Stateless preprocessors can be implemented with the ``BatchMapper``. + +.. literalinclude:: doc_code/preprocessors.py + :language: python + :start-after: __custom_stateless_start__ + :end-before: __custom_stateless_end__ + +**Stateful Preprocessors:** Coming soon! +