Mirror of https://github.com/vale981/ray (synced 2025-03-05 18:11:42 -05:00)
[air] Use SPREAD strategy by default and don't special case it in benchmarks (#26633)
parent 7c32993c15
commit 0855bcb77e
4 changed files with 31 additions and 63 deletions
@@ -29,29 +29,21 @@ For this benchmark, we configured the nodes to have reasonable disk size and thr

 .. list-table::

     * - **Cluster Setup**
-      - **# workers**
-      - **Time taken**
-      - **Throughput**
-      - **Data Spilled**
+      - **Performance**
+      - **Disk Spill**
       - **Command**
-    * - 1 m5.4xlarge
-      - 1 actor
-      - 390 s
-      - 0.51 GB/s
+    * - 1 m5.4xlarge node (1 actor)
+      - 390 s (0.51 GiB/s)
       - 205 GiB
-      - `python data_benchmark.py --dataset-size-gib=200 --num-workers=1 --placement-strategy=SPREAD`
-    * - 5 m5.4xlarge
-      - 5 actors
-      - 70 s
-      - 2.85 GiB/s
+      - `python data_benchmark.py --dataset-size-gb=200 --num-workers=1`
+    * - 5 m5.4xlarge nodes (5 actors)
+      - 70 s (2.85 GiB/s)
       - 206 GiB
-      - `python data_benchmark.py --dataset-size-gib=200 --num-workers=5 --placement-strategy=SPREAD`
-    * - 20 m5.4xlarge nodes
-      - 20 actors
-      - 3.8 s
-      - 52.6 GiB/s
+      - `python data_benchmark.py --dataset-size-gb=200 --num-workers=5`
+    * - 20 m5.4xlarge nodes (20 actors)
+      - 3.8 s (52.6 GiB/s)
       - 0 GB
-      - `python data_benchmark.py --dataset-size-gib=200 --num-workers=20 --placement-strategy=SPREAD`
+      - `python data_benchmark.py --dataset-size-gb=200 --num-workers=20`

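The consolidated **Performance** column is simply the 200 GiB dataset from the command divided by the wall-clock time. A quick standalone check, with the values hard-coded from the table above:

# Sanity check of the bulk-ingest throughput figures, assuming the 200 GiB
# dataset configured by `--dataset-size-gb=200`.
DATASET_GIB = 200

for nodes, seconds in [(1, 390), (5, 70), (20, 3.8)]:
    print(f"{nodes} node(s): {DATASET_GIB / seconds:.2f} GiB/s")
# -> 0.51, 2.86, 52.63 GiB/s; the table's 0.51 / 2.85 / 52.6 figures agree once
#    the rounding of the reported times is taken into account.
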
XGBoost Batch Prediction

@@ -70,25 +62,16 @@ We test out the performance across different cluster sizes and data sizes.

 .. list-table::

     * - **Cluster Setup**
-      - **# workers**
       - **Data Size**
-      - **# of rows**
-      - **Time taken**
-      - **Throughput**
+      - **Performance**
       - **Command**
-    * - 1 m5.4xlarge
-      - 1 actor
-      - 10 GB
-      - 26M rows
-      - 275 s
-      - 94.5k rows/sec
+    * - 1 m5.4xlarge node (1 actor)
+      - 10 GB (26M rows)
+      - 275 s (94.5k rows/s)
       - `python xgboost_benchmark.py --size 10GB`
-    * - 10 m5.4xlarge nodes
-      - 10 actors (12 CPUs each)
-      - 100 GB
-      - 260M rows
-      - 331 s
-      - 786k rows/sec
+    * - 10 m5.4xlarge nodes (10 actors)
+      - 100 GB (260M rows)
+      - 331 s (786k rows/s)
       - `python xgboost_benchmark.py --size 100GB`

@@ -107,21 +90,15 @@ XGBoost parameters were kept as defaults for xgboost==1.6.1 this task.

 .. list-table::

     * - **Cluster Setup**
-      - **# workers**
       - **Data Size**
-      - **# of rows**
-      - **Time taken**
+      - **Performance**
       - **Command**
-    * - 1 m5.4xlarge
-      - 1 actor
-      - 10 GB
-      - 26M rows
+    * - 1 m5.4xlarge node (1 actor)
+      - 10 GB (26M rows)
       - 692 s
       - `python xgboost_benchmark.py --size 10GB`
-    * - 10 m5.4xlarge nodes
-      - 10 actors (12 CPUs each)
-      - 100 GB
-      - 260M rows
+    * - 10 m5.4xlarge nodes (10 actors)
+      - 100 GB (260M rows)
       - 693 s
       - `python xgboost_benchmark.py --size 100GB`

@@ -49,7 +49,7 @@ class ScalingConfigDataClass:
     num_workers: Optional[int] = None
     use_gpu: bool = False
     resources_per_worker: Optional[Dict] = None
-    placement_strategy: str = "PACK"
+    placement_strategy: str = "SPREAD"

     def __post_init__(self):
         if self.resources_per_worker:
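
The effect of this one-line default flip: any scaling config that never sets placement_strategy explicitly now schedules its workers with SPREAD instead of PACK. A standalone sketch of the behavior change, using a stand-in dataclass rather than the real ScalingConfigDataClass:

# Hypothetical stand-in mirroring only the changed default; not the real Ray class.
from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class ScalingConfigSketch:
    num_workers: Optional[int] = None
    use_gpu: bool = False
    resources_per_worker: Optional[Dict] = None
    placement_strategy: str = "SPREAD"  # previously "PACK"


if __name__ == "__main__":
    # Configs that never mentioned placement_strategy now get SPREAD...
    assert ScalingConfigSketch(num_workers=4).placement_strategy == "SPREAD"
    # ...and callers that still want co-located workers must opt in explicitly.
    assert ScalingConfigSketch(num_workers=4, placement_strategy="PACK").placement_strategy == "PACK"
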
@@ -11,9 +11,9 @@ from ray.data.preprocessors import BatchMapper
 GiB = 1024 * 1024 * 1024


-def make_ds(size_gib: int):
+def make_ds(size_gb: int):
     # Dataset of 10KiB tensor records.
-    total_size = GiB * size_gib
+    total_size = GiB * size_gb
     record_dim = 1280
     record_size = record_dim * 8
     num_records = int(total_size / record_size)
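
Only the parameter name changes here; the size is still multiplied by the GiB constant, so the renamed --dataset-size-gb flag continues to mean binary gigabytes. For reference, a standalone sketch of the record-count arithmetic (each record is 1280 float64 values, i.e. 10 KiB):

# Standalone sketch of make_ds's sizing arithmetic; names are illustrative.
GiB = 1024 * 1024 * 1024


def num_records_for(size_gb: int, record_dim: int = 1280) -> int:
    record_size = record_dim * 8  # bytes per record (8 bytes per float64)
    return int(GiB * size_gb / record_size)


if __name__ == "__main__":
    print(num_records_for(200))  # -> 20971520 ten-KiB records for the default 200 "GB"
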
@@ -22,14 +22,13 @@ def make_ds(size_gib: int):
     return dataset


-def run_ingest_bulk(dataset, num_workers, num_cpus_per_worker, placement_strategy):
+def run_ingest_bulk(dataset, num_workers, num_cpus_per_worker):
     dummy_prep = BatchMapper(lambda df: df * 2)
     trainer = DummyTrainer(
         scaling_config={
             "num_workers": num_workers,
             "trainer_resources": {"CPU": 0},
             "resources_per_worker": {"CPU": num_cpus_per_worker},
-            "placement_strategy": placement_strategy,
         },
         datasets={"train": dataset},
         preprocessor=dummy_prep,
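
With the explicit "placement_strategy" entry removed from scaling_config, the benchmark now inherits the new SPREAD default from the scaling config itself. As an aside for intuition (not code from this commit), the difference between the two strategies can be seen with Ray's public placement-group API: PACK tries to co-locate the worker bundles on as few nodes as possible, while SPREAD distributes them across nodes on a best-effort basis.

# Illustrative aside, not part of this PR: PACK vs SPREAD at the placement-group
# level. ray.init(num_cpus=8) starts a local cluster with enough logical CPUs for
# both groups; on a single node the two strategies place bundles identically.
import ray
from ray.util.placement_group import placement_group

ray.init(num_cpus=8)

bundles = [{"CPU": 1}] * 4  # one bundle per training worker

packed = placement_group(bundles, strategy="PACK")    # co-locate bundles if possible
spread = placement_group(bundles, strategy="SPREAD")  # spread bundles across nodes

ray.get(packed.ready())
ray.get(spread.ready())
print("both placement groups scheduled")
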
@@ -49,20 +48,12 @@ if __name__ == "__main__":
         default=1,
         help="Number of CPUs for each training worker.",
     )
-    parser.add_argument(
-        "--placement-strategy",
-        type=str,
-        default="PACK",
-        help="Worker placement strategy.",
-    )
-    parser.add_argument("--dataset-size-gib", type=int, default=200)
+    parser.add_argument("--dataset-size-gb", type=int, default=200)
     args = parser.parse_args()
-    ds = make_ds(args.dataset_size_gib)
+    ds = make_ds(args.dataset_size_gb)

     start = time.time()
-    run_ingest_bulk(
-        ds, args.num_workers, args.num_cpus_per_worker, args.placement_strategy
-    )
+    run_ingest_bulk(ds, args.num_workers, args.num_cpus_per_worker)
     end = time.time()
     time_taken = end - start

@@ -168,7 +168,7 @@

   run:
     timeout: 3600
-    script: python workloads/data_benchmark.py --dataset-size-gib=200 --num-workers=20 --placement-strategy=SPREAD
+    script: python workloads/data_benchmark.py --dataset-size-gb=200 --num-workers=20

     wait_for_nodes:
       num_nodes: 20