[air] Use SPREAD strategy by default and don't special case it in benchmarks (#26633)

Eric Liang 2022-07-16 17:37:06 -07:00 committed by GitHub
parent 7c32993c15
commit 0855bcb77e
4 changed files with 31 additions and 63 deletions


@@ -29,29 +29,21 @@ For this benchmark, we configured the nodes to have reasonable disk size and thr
 .. list-table::

 * - **Cluster Setup**
-  - **# workers**
-  - **Time taken**
-  - **Throughput**
-  - **Data Spilled**
+  - **Performance**
+  - **Disk Spill**
   - **Command**
-* - 1 m5.4xlarge
-  - 1 actor
-  - 390 s
-  - 0.51 GB/s
+* - 1 m5.4xlarge node (1 actor)
+  - 390 s (0.51 GiB/s)
   - 205 GiB
-  - `python data_benchmark.py --dataset-size-gib=200 --num-workers=1 --placement-strategy=SPREAD`
-* - 5 m5.4xlarge
-  - 5 actors
-  - 70 s
-  - 2.85 GiB/s
+  - `python data_benchmark.py --dataset-size-gb=200 --num-workers=1`
+* - 5 m5.4xlarge nodes (5 actors)
+  - 70 s (2.85 GiB/s)
   - 206 GiB
-  - `python data_benchmark.py --dataset-size-gib=200 --num-workers=5 --placement-strategy=SPREAD`
-* - 20 m5.4xlarge nodes
-  - 20 actors
-  - 3.8 s
-  - 52.6 GiB/s
+  - `python data_benchmark.py --dataset-size-gb=200 --num-workers=5`
+* - 20 m5.4xlarge nodes (20 actors)
+  - 3.8 s (52.6 GiB/s)
   - 0 GB
-  - `python data_benchmark.py --dataset-size-gib=200 --num-workers=20 --placement-strategy=SPREAD`
+  - `python data_benchmark.py --dataset-size-gb=200 --num-workers=20`

 XGBoost Batch Prediction
@@ -70,25 +62,16 @@ We test out the performance across different cluster sizes and data sizes.
 .. list-table::

 * - **Cluster Setup**
-  - **# workers**
   - **Data Size**
-  - **# of rows**
-  - **Time taken**
-  - **Throughput**
+  - **Performance**
   - **Command**
-* - 1 m5.4xlarge
-  - 1 actor
-  - 10 GB
-  - 26M rows
-  - 275 s
-  - 94.5k rows/sec
+* - 1 m5.4xlarge node (1 actor)
+  - 10 GB (26M rows)
+  - 275 s (94.5k rows/s)
   - `python xgboost_benchmark.py --size 10GB`
-* - 10 m5.4xlarge nodes
-  - 10 actors (12 CPUs each)
-  - 100 GB
-  - 260M rows
-  - 331 s
-  - 786k rows/sec
+* - 10 m5.4xlarge nodes (10 actors)
+  - 100 GB (260M rows)
+  - 331 s (786k rows/s)
   - `python xgboost_benchmark.py --size 100GB`
@@ -107,21 +90,15 @@ XGBoost parameters were kept as defaults for xgboost==1.6.1 for this task.
 .. list-table::

 * - **Cluster Setup**
-  - **# workers**
   - **Data Size**
-  - **# of rows**
-  - **Time taken**
+  - **Performance**
   - **Command**
-* - 1 m5.4xlarge
-  - 1 actor
-  - 10 GB
-  - 26M rows
+* - 1 m5.4xlarge node (1 actor)
+  - 10 GB (26M rows)
   - 692 s
   - `python xgboost_benchmark.py --size 10GB`
-* - 10 m5.4xlarge nodes
-  - 10 actors (12 CPUs each)
-  - 100 GB
-  - 260M rows
+* - 10 m5.4xlarge nodes (10 actors)
+  - 100 GB (260M rows)
   - 693 s
   - `python xgboost_benchmark.py --size 100GB`
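A quick sanity check of the new **Performance** column in the bulk-ingest table above: the throughput is just the dataset size divided by the wall-clock time. This is plain Python, no Ray required; the 200 GiB size comes from the benchmark commands in the table:

    # Recompute throughput = dataset size / time taken for each row.
    dataset_size_gib = 200
    for nodes, seconds in [(1, 390), (5, 70), (20, 3.8)]:
        print(f"{nodes} node(s): {dataset_size_gib / seconds:.2f} GiB/s")
    # ~0.51, 2.86, and 52.63 GiB/s, matching the table to within rounding
    # of the measured times.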


@@ -49,7 +49,7 @@ class ScalingConfigDataClass:
     num_workers: Optional[int] = None
     use_gpu: bool = False
     resources_per_worker: Optional[Dict] = None
-    placement_strategy: str = "PACK"
+    placement_strategy: str = "SPREAD"

     def __post_init__(self):
         if self.resources_per_worker:
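With this one-line default change, `ScalingConfigDataClass` now spreads training workers across nodes instead of packing them, unless a caller overrides it. A minimal sketch of what that looks like, reusing the dict-style `scaling_config` from the benchmark script later in this diff (the worker and CPU counts here are illustrative, not taken from any benchmark run):

    # placement_strategy is omitted, so the new "SPREAD" default applies and
    # Ray places the training workers across different nodes where possible.
    scaling_config = {
        "num_workers": 5,                    # illustrative
        "trainer_resources": {"CPU": 0},
        "resources_per_worker": {"CPU": 1},  # illustrative
        # To keep the previous packing behavior, opt back in explicitly:
        # "placement_strategy": "PACK",
    }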


@@ -11,9 +11,9 @@ from ray.data.preprocessors import BatchMapper
 GiB = 1024 * 1024 * 1024


-def make_ds(size_gib: int):
+def make_ds(size_gb: int):
     # Dataset of 10KiB tensor records.
-    total_size = GiB * size_gib
+    total_size = GiB * size_gb
     record_dim = 1280
     record_size = record_dim * 8
     num_records = int(total_size / record_size)
@@ -22,14 +22,13 @@ def make_ds(size_gib: int):
     return dataset


-def run_ingest_bulk(dataset, num_workers, num_cpus_per_worker, placement_strategy):
+def run_ingest_bulk(dataset, num_workers, num_cpus_per_worker):
     dummy_prep = BatchMapper(lambda df: df * 2)
     trainer = DummyTrainer(
         scaling_config={
             "num_workers": num_workers,
             "trainer_resources": {"CPU": 0},
             "resources_per_worker": {"CPU": num_cpus_per_worker},
-            "placement_strategy": placement_strategy,
         },
         datasets={"train": dataset},
         preprocessor=dummy_prep,
@@ -49,20 +48,12 @@ if __name__ == "__main__":
         default=1,
         help="Number of CPUs for each training worker.",
     )
-    parser.add_argument(
-        "--placement-strategy",
-        type=str,
-        default="PACK",
-        help="Worker placement strategy.",
-    )
-    parser.add_argument("--dataset-size-gib", type=int, default=200)
+    parser.add_argument("--dataset-size-gb", type=int, default=200)
     args = parser.parse_args()

-    ds = make_ds(args.dataset_size_gib)
+    ds = make_ds(args.dataset_size_gb)

     start = time.time()
-    run_ingest_bulk(
-        ds, args.num_workers, args.num_cpus_per_worker, args.placement_strategy
-    )
+    run_ingest_bulk(ds, args.num_workers, args.num_cpus_per_worker)
     end = time.time()
     time_taken = end - start
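One note on the `make_ds` hunk above: only the flag and variable names change from `gib` to `gb`; the sizing math still multiplies by the binary `GiB` constant. A worked check of what the default size yields, using the constants from the script (each record is 1280 elements of 8 bytes, roughly 10 KiB):

    GiB = 1024 * 1024 * 1024
    record_dim = 1280
    record_size = record_dim * 8                 # 10,240 bytes per record
    num_records = int(200 * GiB / record_size)   # default --dataset-size-gb=200
    print(num_records)                           # 20,971,520 records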


@@ -168,7 +168,7 @@
 run:
   timeout: 3600
-  script: python workloads/data_benchmark.py --dataset-size-gib=200 --num-workers=20 --placement-strategy=SPREAD
+  script: python workloads/data_benchmark.py --dataset-size-gb=200 --num-workers=20

   wait_for_nodes:
     num_nodes: 20