mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[air] Add initial benchmark section (#26608)
This commit is contained in:
parent
964bc90e09
commit
5ad4e75831
4 changed files with 122 additions and 14 deletions
|
@ -23,6 +23,7 @@ parts:
|
|||
- file: ray-air/examples/serving_guide
|
||||
- file: ray-air/deployment
|
||||
- file: ray-air/use-pretrained-model
|
||||
- file: ray-air/benchmarks
|
||||
- file: ray-air/examples/index
|
||||
sections:
|
||||
- file: ray-air/examples/torch_image_example
|
||||
|
|
81
doc/source/ray-air/benchmarks.rst
Normal file
81
doc/source/ray-air/benchmarks.rst
Normal file
|
@ -0,0 +1,81 @@
|
|||
AIR Benchmarks
|
||||
==============
|
||||
|
||||
Below we document key performance benchmarks for common AIR tasks and workflows.
|
||||
|
||||
XGBoost Batch Prediction
|
||||
------------------------
|
||||
|
||||
This task uses the BatchPredictor module to process different amounts of data
|
||||
using an XGBoost model.
|
||||
|
||||
We test out the performance across different cluster sizes and data sizes.
|
||||
|
||||
- `XGBoost Prediction Script`_
|
||||
- `XGBoost Cluster configuration`_
|
||||
|
||||
.. TODO: Add script for generating data and running the benchmark.
|
||||
|
||||
.. list-table::
|
||||
|
||||
* - **Cluster Setup**
|
||||
- **# workers**
|
||||
- **Data Size**
|
||||
- **# of rows**
|
||||
- **Time taken**
|
||||
- **Throughput**
|
||||
- **Command**
|
||||
* - 1 m5.4xlarge
|
||||
- 1 actor
|
||||
- 10 GB
|
||||
- 26M rows
|
||||
- 275 s
|
||||
- 94.5k rows/sec
|
||||
- `python xgboost_benchmark.py --size 10GB`
|
||||
* - 10 m5.4xlarge nodes
|
||||
- 10 actors (12 CPUs each)
|
||||
- 100 GB
|
||||
- 260M rows
|
||||
- 331 s
|
||||
- 786k rows/sec
|
||||
- `python xgboost_benchmark.py --size 100GB`
|
||||
|
||||
|
||||
XGBoost Training
|
||||
----------------
|
||||
|
||||
This task uses the XGBoostTrainer module to train on different sizes of data
|
||||
with different amounts of parallelism.
|
||||
|
||||
XGBoost parameters were kept as the defaults for xgboost==1.6.1 for this task.
|
||||
|
||||
|
||||
- `XGBoost Training Script`_
|
||||
- `XGBoost Cluster configuration`_
|
||||
|
||||
.. list-table::
|
||||
|
||||
* - **Cluster Setup**
|
||||
- **# workers**
|
||||
- **Data Size**
|
||||
- **# of rows**
|
||||
- **Time taken**
|
||||
- **Command**
|
||||
* - 1 m5.4xlarge
|
||||
- 1 actor
|
||||
- 10 GB
|
||||
- 26M rows
|
||||
- 692 s
|
||||
- `python xgboost_benchmark.py --size 10GB`
|
||||
* - 10 m5.4xlarge nodes
|
||||
- 10 actors (12 CPUs each)
|
||||
- 100 GB
|
||||
- 260M rows
|
||||
- 693 s
|
||||
- `python xgboost_benchmark.py --size 100GB`
|
||||
|
||||
|
||||
|
||||
.. _`XGBoost Training Script`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py#L40-L58
|
||||
.. _`XGBoost Prediction Script`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py#L63-L71
|
||||
.. _`XGBoost Cluster configuration`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/xgboost_compute_tpl.yaml#L6-L24
|
|
@ -75,4 +75,13 @@ AIR Feature Guides
|
|||
.. link-button:: use-pretrained-model-ref
|
||||
:type: ref
|
||||
:text: How to use a pretrained model for inference?
|
||||
:classes: btn-link btn-block stretched-link
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /ray-overview/images/ray_svg_logo.svg
|
||||
|
||||
+++
|
||||
.. link-button:: benchmarks
|
||||
:type: ref
|
||||
:text: Ray AIR benchmarks
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
|
|
@ -19,6 +19,17 @@ _XGB_MODEL_PATH = "model.json"
|
|||
_TRAINING_TIME_THRESHOLD = 1000
|
||||
_PREDICTION_TIME_THRESHOLD = 450
|
||||
|
||||
_EXPERIMENT_PARAMS = {
|
||||
"10G": {
|
||||
"data": "s3://air-example-data-2/10G-xgboost-data.parquet/",
|
||||
"num_workers": 1,
|
||||
},
|
||||
"100G": {
|
||||
"data": "s3://air-example-data-2/100G-xgboost-data.parquet/",
|
||||
"num_workers": 10,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def run_and_time_it(f):
|
||||
"""Runs f in a separate process and time it."""
|
||||
|
@ -37,17 +48,18 @@ def run_and_time_it(f):
|
|||
|
||||
|
||||
@run_and_time_it
|
||||
def run_xgboost_training():
|
||||
ds = data.read_parquet(
|
||||
"s3://air-example-data-2/100G-xgboost-data.parquet/"
|
||||
) # silver tier
|
||||
def run_xgboost_training(data_path: str, num_workers: int):
|
||||
ds = data.read_parquet(data_path)
|
||||
params = {
|
||||
"objective": "binary:logistic",
|
||||
"eval_metric": ["logloss", "error"],
|
||||
}
|
||||
|
||||
trainer = XGBoostTrainer(
|
||||
scaling_config={"num_workers": 10, "resources_per_worker": {"CPU": 12}},
|
||||
scaling_config={
|
||||
"num_workers": num_workers,
|
||||
"resources_per_worker": {"CPU": 12},
|
||||
},
|
||||
label_column="labels",
|
||||
params=params,
|
||||
datasets={"train": ds},
|
||||
|
@ -59,23 +71,23 @@ def run_xgboost_training():
|
|||
|
||||
|
||||
@run_and_time_it
|
||||
def run_xgboost_prediction(model_path: str):
|
||||
def run_xgboost_prediction(model_path: str, data_path: str):
|
||||
model = xgb.Booster()
|
||||
model.load_model(model_path)
|
||||
ds = data.read_parquet(
|
||||
"s3://air-example-data-2/100G-xgboost-data.parquet/"
|
||||
) # silver tier
|
||||
ds = data.read_parquet(data_path)
|
||||
ckpt = to_air_checkpoint(".", model)
|
||||
batch_predictor = BatchPredictor.from_checkpoint(ckpt, XGBoostPredictor)
|
||||
result = batch_predictor.predict(ds.drop_columns(["labels"]))
|
||||
return result
|
||||
|
||||
|
||||
def main():
|
||||
def main(args):
|
||||
experiment_params = _EXPERIMENT_PARAMS[args.size]
|
||||
data_path, num_workers = experiment_params["data"], experiment_params["num_workers"]
|
||||
print("Running xgboost training benchmark...")
|
||||
training_time = run_xgboost_training()
|
||||
training_time = run_xgboost_training(data_path, num_workers)
|
||||
print("Running xgboost prediction benchmark...")
|
||||
prediction_time = run_xgboost_prediction(_XGB_MODEL_PATH)
|
||||
prediction_time = run_xgboost_prediction(_XGB_MODEL_PATH, data_path)
|
||||
result = {
|
||||
"training_time": training_time,
|
||||
"prediction_time": prediction_time,
|
||||
|
@ -99,4 +111,9 @@ def main():
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--size", type=str, choices=["10G", "100G"], default="100G")
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
|
Loading…
Add table
Reference in a new issue