[air] Add xgboost release test for silver tier (10-node case). (#26460)

Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
This commit is contained in:
xwjiang2010 2022-07-15 13:21:10 -07:00 committed by GitHub
parent 0ecc7dad74
commit a241e6a0f5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 175 additions and 0 deletions

View file

@ -0,0 +1,102 @@
from functools import wraps
import json
from multiprocessing import Process
import os
import time
import xgboost as xgb
import ray
from ray import data
from ray.train.xgboost import (
XGBoostTrainer,
load_checkpoint,
to_air_checkpoint,
XGBoostPredictor,
)
from ray.train.batch_predictor import BatchPredictor
# Path where the training run saves the trained booster and where the
# prediction benchmark loads it back from.
_XGB_MODEL_PATH = "model.json"
# Benchmark budgets in seconds; the run fails if either phase exceeds its
# threshold (checked in main()).
_TRAINING_TIME_THRESHOLD = 1000
_PREDICTION_TIME_THRESHOLD = 450
def run_and_time_it(f):
    """Decorator: runs ``f`` in a separate process and times it.

    The wrapped callable returns the wall-clock duration of the run in
    seconds; ``f``'s own return value is discarded (it lives in the child
    process). Running in a subprocess keeps each benchmark phase isolated
    from the parent driver.

    Raises:
        RuntimeError: if the subprocess exits with a non-zero exit code.
    """

    @wraps(f)
    def wrapper(*args, **kwargs):
        # Forward keyword arguments too — previously they were dropped,
        # so kwargs never reached the benchmarked function.
        p = Process(target=f, args=args, kwargs=kwargs)
        start = time.monotonic()
        p.start()
        p.join()
        time_taken = time.monotonic() - start
        # Surface subprocess failures instead of silently reporting a
        # timing for a crashed run.
        if p.exitcode != 0:
            raise RuntimeError(
                f"{f.__name__} failed with exit code {p.exitcode}."
            )
        print(f"{f.__name__} takes {time_taken} seconds.")
        return time_taken

    return wrapper
@run_and_time_it
def run_xgboost_training():
    """Train an XGBoost model on the 100 GB benchmark dataset.

    Runs a 10-worker distributed training job, writes the resulting
    booster to ``_XGB_MODEL_PATH``, and shuts Ray down. The returned
    value is the timing supplied by the ``run_and_time_it`` decorator.
    """
    # silver-tier dataset (~100 GB of parquet on S3)
    train_ds = data.read_parquet(
        "s3://air-example-data-2/100G-xgboost-data.parquet/"
    )
    trainer = XGBoostTrainer(
        scaling_config={"num_workers": 10, "resources_per_worker": {"CPU": 12}},
        label_column="labels",
        params={
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        },
        datasets={"train": train_ds},
    )
    fit_result = trainer.fit()
    # load_checkpoint returns a tuple; the booster is the first element.
    booster = load_checkpoint(fit_result.checkpoint)[0]
    booster.save_model(_XGB_MODEL_PATH)
    ray.shutdown()
@run_and_time_it
def run_xgboost_prediction(model_path: str):
    """Run distributed batch prediction with the booster saved at *model_path*.

    Loads the model, wraps it in an AIR checkpoint, and predicts over the
    benchmark dataset (label column removed). The decorator discards this
    function's return value and yields the timing instead.
    """
    booster = xgb.Booster()
    booster.load_model(model_path)
    # silver-tier dataset (~100 GB of parquet on S3)
    dataset = data.read_parquet(
        "s3://air-example-data-2/100G-xgboost-data.parquet/"
    )
    checkpoint = to_air_checkpoint(".", booster)
    predictor = BatchPredictor.from_checkpoint(checkpoint, XGBoostPredictor)
    return predictor.predict(dataset.drop_columns(["labels"]))
def main():
    """Drive both benchmark phases and record their timings.

    Writes a JSON report to the path in the ``TEST_OUTPUT_JSON``
    environment variable (default ``/tmp/result.json``).

    Raises:
        RuntimeError: if either phase exceeds its time budget.
    """
    print("Running xgboost training benchmark...")
    training_time = run_xgboost_training()
    print("Running xgboost prediction benchmark...")
    prediction_time = run_xgboost_prediction(_XGB_MODEL_PATH)

    timings = {
        "training_time": training_time,
        "prediction_time": prediction_time,
    }
    print("Results:", timings)

    # Persist results where the release-test harness expects them.
    output_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/result.json")
    with open(output_path, "wt") as out_file:
        json.dump(timings, out_file)

    # Enforce the benchmark budgets only after the report is written, so a
    # slow-but-successful run still leaves its timings on disk.
    if training_time > _TRAINING_TIME_THRESHOLD:
        raise RuntimeError(
            f"Training on XGBoost is taking {training_time} seconds, "
            f"which is longer than expected ({_TRAINING_TIME_THRESHOLD} seconds)."
        )
    if prediction_time > _PREDICTION_TIME_THRESHOLD:
        raise RuntimeError(
            f"Batch prediction on XGBoost is taking {prediction_time} seconds, "
            f"which is longer than expected ({_PREDICTION_TIME_THRESHOLD} seconds)."
        )


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,14 @@
# Cluster environment (base container image plus extra packages) for the
# AIR XGBoost release benchmark.
base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
env_vars: {}
debian_packages:
  - curl

python:
  pip_packages:
    - pytest
  conda_packages: []

post_build_cmds:
  # Replace the image's preinstalled Ray with the wheels under test.
  - pip3 uninstall ray -y || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
  - pip3 install -U --force-reinstall --no-deps xgboost_ray # Avoid caching
  # Optional sanity check that the installed wheels match the release build.
  - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}

View file

@ -0,0 +1,28 @@
# Compute template for the 10-node (1 head + 9 workers) XGBoost benchmark.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2
max_workers: 9

head_node_type:
  name: head_node
  instance_type: m5.4xlarge

worker_node_types:
  - name: worker_node
    instance_type: m5.4xlarge
    # Fixed-size cluster: min == max so the benchmark always runs on
    # exactly 9 workers, with on-demand (non-spot) instances.
    max_workers: 9
    min_workers: 9
    use_spot: false

aws:
  BlockDeviceMappings:
    - DeviceName: /dev/sda1
      Ebs:
        Iops: 5000
        Throughput: 1000
        VolumeSize: 1000
        VolumeType: gp3
  IamInstanceProfile:
    # Arn: arn:aws:iam::959243851260:instance-profile/ray-autoscaler-v1
    Arn: arn:aws:iam::188439194153:instance-profile/ray-autoscaler-v1

View file

@ -153,6 +153,37 @@
alert: default
# AIR benchmarks for XGBoost CUJ
- name: air_benchmark_xgboost_cpu_10
  group: AIR tests
  working_dir: air_tests/air_benchmarks

  frequency: nightly
  team: ml
  env: staging

  cluster:
    cluster_env: xgboost_app_config.yaml
    cluster_compute: xgboost_compute_tpl.yaml

  run:
    # 10 hours — covers both the training and batch-prediction phases.
    timeout: 36000
    script: python workloads/xgboost_benchmark.py
    # Wait for the full 10-node cluster before starting the workload.
    wait_for_nodes:
      num_nodes: 10

    type: sdk_command
    file_manager: job

  smoke_test:
    frequency: disabled
    run:
      timeout: 1800

  alert: default
# Ray AIR distributed Torch benchmarks
- name: air_benchmark_torch_mnist_cpu_4x1
group: AIR tests