[air] Add xgboost release test for silver tier (10-node case). (#26460)

Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
This commit is contained in:
xwjiang2010 2022-07-15 13:21:10 -07:00 committed by GitHub
parent 0ecc7dad74
commit a241e6a0f5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 175 additions and 0 deletions

View file

@ -0,0 +1,102 @@
from functools import wraps
import json
from multiprocessing import Process
import os
import time
import xgboost as xgb
import ray
from ray import data
from ray.train.xgboost import (
XGBoostTrainer,
load_checkpoint,
to_air_checkpoint,
XGBoostPredictor,
)
from ray.train.batch_predictor import BatchPredictor
# Path where the training run saves the trained booster and where the
# prediction benchmark loads it back from.
_XGB_MODEL_PATH = "model.json"
# Benchmark budgets in seconds; the run fails if either phase exceeds its
# threshold (checked in main()).
_TRAINING_TIME_THRESHOLD = 1000
_PREDICTION_TIME_THRESHOLD = 450
def run_and_time_it(f):
    """Decorator: runs ``f`` in a separate process and times it.

    The wrapped callable returns the wall-clock duration of the run in
    seconds; ``f``'s own return value is discarded (it lives in the child
    process). Running in a subprocess keeps each benchmark phase isolated
    from the parent driver.

    Raises:
        RuntimeError: if the subprocess exits with a non-zero exit code.
    """

    @wraps(f)
    def wrapper(*args, **kwargs):
        # Forward keyword arguments too — previously they were dropped,
        # so kwargs never reached the benchmarked function.
        p = Process(target=f, args=args, kwargs=kwargs)
        start = time.monotonic()
        p.start()
        p.join()
        time_taken = time.monotonic() - start
        # Surface subprocess failures instead of silently reporting a
        # timing for a crashed run.
        if p.exitcode != 0:
            raise RuntimeError(
                f"{f.__name__} failed with exit code {p.exitcode}."
            )
        print(f"{f.__name__} takes {time_taken} seconds.")
        return time_taken

    return wrapper
@run_and_time_it
def run_xgboost_training():
    """Train an XGBoost model on the 100 GB benchmark dataset.

    Runs a 10-worker distributed training job, writes the resulting
    booster to ``_XGB_MODEL_PATH``, and shuts Ray down. The returned
    value is the timing supplied by the ``run_and_time_it`` decorator.
    """
    # silver-tier dataset (~100 GB of parquet on S3)
    train_ds = data.read_parquet(
        "s3://air-example-data-2/100G-xgboost-data.parquet/"
    )
    trainer = XGBoostTrainer(
        scaling_config={"num_workers": 10, "resources_per_worker": {"CPU": 12}},
        label_column="labels",
        params={
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        },
        datasets={"train": train_ds},
    )
    fit_result = trainer.fit()
    # load_checkpoint returns a tuple; the booster is the first element.
    booster = load_checkpoint(fit_result.checkpoint)[0]
    booster.save_model(_XGB_MODEL_PATH)
    ray.shutdown()
@run_and_time_it
def run_xgboost_prediction(model_path: str):
    """Run distributed batch prediction with the booster saved at *model_path*.

    Loads the model, wraps it in an AIR checkpoint, and predicts over the
    benchmark dataset (label column removed). The decorator discards this
    function's return value and yields the timing instead.
    """
    booster = xgb.Booster()
    booster.load_model(model_path)
    # silver-tier dataset (~100 GB of parquet on S3)
    dataset = data.read_parquet(
        "s3://air-example-data-2/100G-xgboost-data.parquet/"
    )
    checkpoint = to_air_checkpoint(".", booster)
    predictor = BatchPredictor.from_checkpoint(checkpoint, XGBoostPredictor)
    return predictor.predict(dataset.drop_columns(["labels"]))
def main():
    """Drive both benchmark phases and record their timings.

    Writes a JSON report to the path in the ``TEST_OUTPUT_JSON``
    environment variable (default ``/tmp/result.json``).

    Raises:
        RuntimeError: if either phase exceeds its time budget.
    """
    print("Running xgboost training benchmark...")
    training_time = run_xgboost_training()
    print("Running xgboost prediction benchmark...")
    prediction_time = run_xgboost_prediction(_XGB_MODEL_PATH)

    timings = {
        "training_time": training_time,
        "prediction_time": prediction_time,
    }
    print("Results:", timings)

    # Persist results where the release-test harness expects them.
    output_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/result.json")
    with open(output_path, "wt") as out_file:
        json.dump(timings, out_file)

    # Enforce the benchmark budgets only after the report is written, so a
    # slow-but-successful run still leaves its timings on disk.
    if training_time > _TRAINING_TIME_THRESHOLD:
        raise RuntimeError(
            f"Training on XGBoost is taking {training_time} seconds, "
            f"which is longer than expected ({_TRAINING_TIME_THRESHOLD} seconds)."
        )
    if prediction_time > _PREDICTION_TIME_THRESHOLD:
        raise RuntimeError(
            f"Batch prediction on XGBoost is taking {prediction_time} seconds, "
            f"which is longer than expected ({_PREDICTION_TIME_THRESHOLD} seconds)."
        )


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,14 @@
# Cluster environment (base container image plus extra packages) for the
# AIR XGBoost release benchmark.
base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
env_vars: {}
debian_packages:
  - curl

python:
  pip_packages:
    - pytest
  conda_packages: []

post_build_cmds:
  # Replace the image's preinstalled Ray with the wheels under test.
  - pip3 uninstall ray -y || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
  - pip3 install -U --force-reinstall --no-deps xgboost_ray # Avoid caching
  # Optional sanity check that the installed wheels match the release build.
  - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}

View file

@ -0,0 +1,28 @@
# Compute template for the 10-node (1 head + 9 workers) XGBoost benchmark.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2
max_workers: 9

head_node_type:
  name: head_node
  instance_type: m5.4xlarge

worker_node_types:
  - name: worker_node
    instance_type: m5.4xlarge
    # Fixed-size cluster: min == max so the benchmark always runs on
    # exactly 9 workers, with on-demand (non-spot) instances.
    max_workers: 9
    min_workers: 9
    use_spot: false

aws:
  BlockDeviceMappings:
    - DeviceName: /dev/sda1
      Ebs:
        Iops: 5000
        Throughput: 1000
        VolumeSize: 1000
        VolumeType: gp3
  IamInstanceProfile:
    # Arn: arn:aws:iam::959243851260:instance-profile/ray-autoscaler-v1
    Arn: arn:aws:iam::188439194153:instance-profile/ray-autoscaler-v1

View file

@ -153,6 +153,37 @@
alert: default
# AIR benchmarks for XGBoost CUJ
- name: air_benchmark_xgboost_cpu_10
  group: AIR tests
  working_dir: air_tests/air_benchmarks

  frequency: nightly
  team: ml
  env: staging

  cluster:
    cluster_env: xgboost_app_config.yaml
    cluster_compute: xgboost_compute_tpl.yaml

  run:
    # 10 hours — covers both the training and batch-prediction phases.
    timeout: 36000
    script: python workloads/xgboost_benchmark.py
    # Wait for the full 10-node cluster before starting the workload.
    wait_for_nodes:
      num_nodes: 10

    type: sdk_command
    file_manager: job

  smoke_test:
    frequency: disabled
    run:
      timeout: 1800

  alert: default
# Ray AIR distributed Torch benchmarks
- name: air_benchmark_torch_mnist_cpu_4x1
group: AIR tests