mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00
[air] Add xgboost release test for silver tier(10-node case). (#26460)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com> Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
This commit is contained in:
parent
0ecc7dad74
commit
a241e6a0f5
4 changed files with 175 additions and 0 deletions
102
release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py
Normal file
102
release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py
Normal file
|
@ -0,0 +1,102 @@
|
|||
from functools import wraps
|
||||
import json
|
||||
from multiprocessing import Process
|
||||
import os
|
||||
import time
|
||||
import xgboost as xgb
|
||||
|
||||
import ray
|
||||
from ray import data
|
||||
from ray.train.xgboost import (
|
||||
XGBoostTrainer,
|
||||
load_checkpoint,
|
||||
to_air_checkpoint,
|
||||
XGBoostPredictor,
|
||||
)
|
||||
from ray.train.batch_predictor import BatchPredictor
|
||||
|
||||
_XGB_MODEL_PATH = "model.json"
|
||||
_TRAINING_TIME_THRESHOLD = 1000
|
||||
_PREDICTION_TIME_THRESHOLD = 450
|
||||
|
||||
|
||||
def run_and_time_it(f):
    """Decorate ``f`` so that each call runs it in a child process and times it.

    The wrapped callable starts ``f`` in a ``multiprocessing.Process``, waits
    for it to finish, prints the elapsed time (measured with
    ``time.monotonic``) and returns that time in seconds. The return value of
    ``f`` itself is not propagated to the caller.

    Raises:
        RuntimeError: If the child process exits with a non-zero exit code,
            so that a crashed run is not reported as a fast, successful one.
    """

    @wraps(f)
    def wrapper(*args, **kwargs):
        # Forward keyword arguments too, so wrapper(x=1) reaches f as f(x=1).
        p = Process(target=f, args=args, kwargs=kwargs)
        start = time.monotonic()
        p.start()
        p.join()
        time_taken = time.monotonic() - start
        # An uncaught exception (or a signal) in the child shows up only as a
        # non-zero exit code; surface it instead of returning a timing.
        if p.exitcode != 0:
            raise RuntimeError(
                f"{f.__name__} failed in its subprocess "
                f"(exit code {p.exitcode})."
            )
        print(f"{f.__name__} takes {time_taken} seconds.")
        return time_taken

    return wrapper
|
||||
|
||||
|
||||
@run_and_time_it
def run_xgboost_training():
    """Train XGBoost on the silver-tier parquet data and save the model.

    Reads the dataset from S3, fits an ``XGBoostTrainer`` configured with
    10 workers of 12 CPUs each, writes the model loaded from the resulting
    checkpoint to ``_XGB_MODEL_PATH`` and then shuts Ray down. Because of the
    ``run_and_time_it`` decorator, callers receive the elapsed time.
    """
    train_dataset = data.read_parquet(
        "s3://air-example-data-2/100G-xgboost-data.parquet/"
    )  # silver tier
    xgb_params = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }
    scaling = {"num_workers": 10, "resources_per_worker": {"CPU": 12}}

    fit_result = XGBoostTrainer(
        scaling_config=scaling,
        label_column="labels",
        params=xgb_params,
        datasets={"train": train_dataset},
    ).fit()

    # load_checkpoint returns a sequence; its first item is the model.
    loaded = load_checkpoint(fit_result.checkpoint)
    booster = loaded[0]
    booster.save_model(_XGB_MODEL_PATH)
    ray.shutdown()
|
||||
|
||||
|
||||
@run_and_time_it
def run_xgboost_prediction(model_path: str):
    """Load a saved XGBoost model and batch-predict over the silver-tier data.

    Args:
        model_path: Path of the model file to load into an ``xgb.Booster``.

    Returns:
        The output of ``BatchPredictor.predict``. Note that this function is
        wrapped by ``run_and_time_it``, so callers receive the elapsed time
        rather than this value.
    """
    booster = xgb.Booster()
    booster.load_model(model_path)

    dataset = data.read_parquet(
        "s3://air-example-data-2/100G-xgboost-data.parquet/"
    )  # silver tier

    checkpoint = to_air_checkpoint(".", booster)
    predictor = BatchPredictor.from_checkpoint(checkpoint, XGBoostPredictor)

    # Predict on the features only; the label column is dropped first.
    features = dataset.drop_columns(["labels"])
    return predictor.predict(features)
|
||||
|
||||
|
||||
def main():
    """Run both benchmarks, record their timings and enforce the thresholds.

    The timings are printed and written as JSON to the path given by the
    ``TEST_OUTPUT_JSON`` environment variable (default ``/tmp/result.json``).

    Raises:
        RuntimeError: If training or prediction took longer than its
            threshold. The training threshold is checked first.
    """
    print("Running xgboost training benchmark...")
    training_time = run_xgboost_training()
    print("Running xgboost prediction benchmark...")
    prediction_time = run_xgboost_prediction(_XGB_MODEL_PATH)

    result = {}
    result["training_time"] = training_time
    result["prediction_time"] = prediction_time
    print("Results:", result)

    # Persist the timings before the threshold checks so they are recorded
    # even when the benchmark fails.
    output_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/result.json")
    with open(output_path, "wt") as out:
        out.write(json.dumps(result))

    training_too_slow = training_time > _TRAINING_TIME_THRESHOLD
    if training_too_slow:
        message = (
            f"Training on XGBoost is taking {training_time} seconds, "
            f"which is longer than expected ({_TRAINING_TIME_THRESHOLD} seconds)."
        )
        raise RuntimeError(message)

    prediction_too_slow = prediction_time > _PREDICTION_TIME_THRESHOLD
    if prediction_too_slow:
        message = (
            f"Batch prediction on XGBoost is taking {prediction_time} seconds, "
            f"which is longer than expected ({_PREDICTION_TIME_THRESHOLD} seconds)."
        )
        raise RuntimeError(message)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
14
release/air_tests/air_benchmarks/xgboost_app_config.yaml
Normal file
14
release/air_tests/air_benchmarks/xgboost_app_config.yaml
Normal file
|
@ -0,0 +1,14 @@
|
|||
base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
|
||||
env_vars: {}
|
||||
debian_packages:
|
||||
- curl
|
||||
|
||||
python:
|
||||
pip_packages:
|
||||
- pytest
|
||||
conda_packages: []
|
||||
|
||||
post_build_cmds:
|
||||
- pip3 uninstall ray -y || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
|
||||
- pip3 install -U --force-reinstall --no-deps xgboost_ray # Avoid caching
|
||||
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
|
28
release/air_tests/air_benchmarks/xgboost_compute_tpl.yaml
Normal file
28
release/air_tests/air_benchmarks/xgboost_compute_tpl.yaml
Normal file
|
@ -0,0 +1,28 @@
|
|||
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
|
||||
region: us-west-2
|
||||
|
||||
max_workers: 9
|
||||
|
||||
head_node_type:
|
||||
name: head_node
|
||||
instance_type: m5.4xlarge
|
||||
|
||||
worker_node_types:
|
||||
- name: worker_node
|
||||
instance_type: m5.4xlarge
|
||||
max_workers: 9
|
||||
min_workers: 9
|
||||
use_spot: false
|
||||
|
||||
aws:
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
Iops: 5000
|
||||
Throughput: 1000
|
||||
VolumeSize: 1000
|
||||
VolumeType: gp3
|
||||
|
||||
IamInstanceProfile:
|
||||
# Arn: arn:aws:iam::959243851260:instance-profile/ray-autoscaler-v1
|
||||
Arn: arn:aws:iam::188439194153:instance-profile/ray-autoscaler-v1
|
|
@ -153,6 +153,37 @@
|
|||
|
||||
alert: default
|
||||
|
||||
# AIR benchmarks for XGBoost CUJ
|
||||
- name: air_benchmark_xgboost_cpu_10
|
||||
group: AIR tests
|
||||
working_dir: air_tests/air_benchmarks
|
||||
|
||||
frequency: nightly
|
||||
team: ml
|
||||
env: staging
|
||||
|
||||
cluster:
|
||||
cluster_env: xgboost_app_config.yaml
|
||||
cluster_compute: xgboost_compute_tpl.yaml
|
||||
|
||||
run:
|
||||
timeout: 36000
|
||||
script: python workloads/xgboost_benchmark.py
|
||||
|
||||
wait_for_nodes:
|
||||
num_nodes: 10
|
||||
|
||||
type: sdk_command
|
||||
file_manager: job
|
||||
|
||||
smoke_test:
|
||||
frequency: disabled
|
||||
|
||||
run:
|
||||
timeout: 1800
|
||||
|
||||
alert: default
|
||||
|
||||
# Ray AIR distributed Torch benchmarks
|
||||
- name: air_benchmark_torch_mnist_cpu_4x1
|
||||
group: AIR tests
|
||||
|
|
Loading…
Add table
Reference in a new issue