mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[air] Add initial benchmark section (#26608)
This commit is contained in:
parent
964bc90e09
commit
5ad4e75831
4 changed files with 122 additions and 14 deletions
|
@ -23,6 +23,7 @@ parts:
|
|||
- file: ray-air/examples/serving_guide
|
||||
- file: ray-air/deployment
|
||||
- file: ray-air/use-pretrained-model
|
||||
- file: ray-air/benchmarks
|
||||
- file: ray-air/examples/index
|
||||
sections:
|
||||
- file: ray-air/examples/torch_image_example
|
||||
|
|
81
doc/source/ray-air/benchmarks.rst
Normal file
81
doc/source/ray-air/benchmarks.rst
Normal file
|
@ -0,0 +1,81 @@
|
|||
AIR Benchmarks
|
||||
==============
|
||||
|
||||
Below we document key performance benchmarks for common AIR tasks and workflows.
|
||||
|
||||
XGBoost Batch Prediction
|
||||
------------------------
|
||||
|
||||
This task uses the BatchPredictor module to process different amounts of data
|
||||
using an XGBoost model.
|
||||
|
||||
We test out the performance across different cluster sizes and data sizes.
|
||||
|
||||
- `XGBoost Prediction Script`_
|
||||
- `XGBoost Cluster configuration`_
|
||||
|
||||
.. TODO: Add script for generating data and running the benchmark.
|
||||
|
||||
.. list-table::
|
||||
|
||||
* - **Cluster Setup**
|
||||
- **# workers**
|
||||
- **Data Size**
|
||||
- **# of rows**
|
||||
- **Time taken**
|
||||
- **Throughput**
|
||||
- **Command**
|
||||
* - 1 m5.4xlarge
|
||||
- 1 actor
|
||||
- 10 GB
|
||||
- 26M rows
|
||||
- 275 s
|
||||
- 94.5k rows/sec
|
||||
- `python xgboost_benchmark.py --size 10GB`
|
||||
* - 10 m5.4xlarge nodes
|
||||
- 10 actors (12 CPUs each)
|
||||
- 100 GB
|
||||
- 260M rows
|
||||
- 331 s
|
||||
- 786k rows/sec
|
||||
- `python xgboost_benchmark.py --size 100GB`
|
||||
|
||||
|
||||
XGBoost Training
|
||||
----------------
|
||||
|
||||
This task uses the XGBoostTrainer module to train on different sizes of data
|
||||
with different amounts of parallelism.
|
||||
|
||||
XGBoost parameters were kept as the defaults for xgboost==1.6.1 for this task.
|
||||
|
||||
|
||||
- `XGBoost Training Script`_
|
||||
- `XGBoost Cluster configuration`_
|
||||
|
||||
.. list-table::
|
||||
|
||||
* - **Cluster Setup**
|
||||
- **# workers**
|
||||
- **Data Size**
|
||||
- **# of rows**
|
||||
- **Time taken**
|
||||
- **Command**
|
||||
* - 1 m5.4xlarge
|
||||
- 1 actor
|
||||
- 10 GB
|
||||
- 26M rows
|
||||
- 692 s
|
||||
- `python xgboost_benchmark.py --size 10GB`
|
||||
* - 10 m5.4xlarge nodes
|
||||
- 10 actors (12 CPUs each)
|
||||
- 100 GB
|
||||
- 260M rows
|
||||
- 693 s
|
||||
- `python xgboost_benchmark.py --size 100GB`
|
||||
|
||||
|
||||
|
||||
.. _`XGBoost Training Script`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py#L40-L58
|
||||
.. _`XGBoost Prediction Script`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py#L63-L71
|
||||
.. _`XGBoost Cluster configuration`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/xgboost_compute_tpl.yaml#L6-L24
|
|
@ -75,4 +75,13 @@ AIR Feature Guides
|
|||
.. link-button:: use-pretrained-model-ref
|
||||
:type: ref
|
||||
:text: How to use a pretrained model for inference?
|
||||
:classes: btn-link btn-block stretched-link
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /ray-overview/images/ray_svg_logo.svg
|
||||
|
||||
+++
|
||||
.. link-button:: benchmarks
|
||||
:type: ref
|
||||
:text: Ray AIR benchmarks
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
|
|
@ -19,6 +19,17 @@ _XGB_MODEL_PATH = "model.json"
|
|||
_TRAINING_TIME_THRESHOLD = 1000
|
||||
_PREDICTION_TIME_THRESHOLD = 450
|
||||
|
||||
_EXPERIMENT_PARAMS = {
|
||||
"10G": {
|
||||
"data": "s3://air-example-data-2/10G-xgboost-data.parquet/",
|
||||
"num_workers": 1,
|
||||
},
|
||||
"100G": {
|
||||
"data": "s3://air-example-data-2/100G-xgboost-data.parquet/",
|
||||
"num_workers": 10,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def run_and_time_it(f):
|
||||
"""Runs f in a separate process and time it."""
|
||||
|
@ -37,17 +48,18 @@ def run_and_time_it(f):
|
|||
|
||||
|
||||
@run_and_time_it
|
||||
def run_xgboost_training():
|
||||
ds = data.read_parquet(
|
||||
"s3://air-example-data-2/100G-xgboost-data.parquet/"
|
||||
) # silver tier
|
||||
def run_xgboost_training(data_path: str, num_workers: int):
|
||||
ds = data.read_parquet(data_path)
|
||||
params = {
|
||||
"objective": "binary:logistic",
|
||||
"eval_metric": ["logloss", "error"],
|
||||
}
|
||||
|
||||
trainer = XGBoostTrainer(
|
||||
scaling_config={"num_workers": 10, "resources_per_worker": {"CPU": 12}},
|
||||
scaling_config={
|
||||
"num_workers": num_workers,
|
||||
"resources_per_worker": {"CPU": 12},
|
||||
},
|
||||
label_column="labels",
|
||||
params=params,
|
||||
datasets={"train": ds},
|
||||
|
@ -59,23 +71,23 @@ def run_xgboost_training():
|
|||
|
||||
|
||||
@run_and_time_it
|
||||
def run_xgboost_prediction(model_path: str):
|
||||
def run_xgboost_prediction(model_path: str, data_path: str):
|
||||
model = xgb.Booster()
|
||||
model.load_model(model_path)
|
||||
ds = data.read_parquet(
|
||||
"s3://air-example-data-2/100G-xgboost-data.parquet/"
|
||||
) # silver tier
|
||||
ds = data.read_parquet(data_path)
|
||||
ckpt = to_air_checkpoint(".", model)
|
||||
batch_predictor = BatchPredictor.from_checkpoint(ckpt, XGBoostPredictor)
|
||||
result = batch_predictor.predict(ds.drop_columns(["labels"]))
|
||||
return result
|
||||
|
||||
|
||||
def main():
|
||||
def main(args):
|
||||
experiment_params = _EXPERIMENT_PARAMS[args.size]
|
||||
data_path, num_workers = experiment_params["data"], experiment_params["num_workers"]
|
||||
print("Running xgboost training benchmark...")
|
||||
training_time = run_xgboost_training()
|
||||
training_time = run_xgboost_training(data_path, num_workers)
|
||||
print("Running xgboost prediction benchmark...")
|
||||
prediction_time = run_xgboost_prediction(_XGB_MODEL_PATH)
|
||||
prediction_time = run_xgboost_prediction(_XGB_MODEL_PATH, data_path)
|
||||
result = {
|
||||
"training_time": training_time,
|
||||
"prediction_time": prediction_time,
|
||||
|
@ -99,4 +111,9 @@ def main():
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--size", type=str, choices=["10G", "100G"], default="100G")
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
|
Loading…
Add table
Reference in a new issue