# Mirror of https://github.com/vale981/ray (synced 2025-03-06 10:31:39 -05:00).
# 379 lines, 12 KiB, YAML.
#
# Single node shuffle
#

# Test basic single node 10GB shuffle with a small number of partitions.
# This doesn't require object spilling.
- name: shuffle_10gb
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml

  run:
    timeout: 3000
    script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6

# Test single node 50GB shuffle with a small number of large partitions
# (--num-partitions=50, --partition-size=1e9).
- name: shuffle_50gb
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml

  run:
    timeout: 3000
    script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9

# Test single node 50GB shuffle with a large number of partitions
# (--num-partitions=500, --partition-size=100e6).
- name: shuffle_50gb_large_partition
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml

  run:
    timeout: 3000
    script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6

# Test non-streaming shuffle on a single node with a small number of partitions.
- name: non_streaming_shuffle_50gb
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml

  run:
    timeout: 3000
    script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming

# Test non-streaming shuffle on a single node with a large number of partitions.
- name: non_streaming_shuffle_50gb_large_partition
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml

  run:
    timeout: 3000
    script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming

# Dask-on-Ray sort run with --nbytes 10_000_000_000, --npartitions 50 and
# --num-nodes 1.
- name: dask_on_ray_10gb_sort
  cluster:
    app_config: dask_on_ray/dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml

  run:
    timeout: 7200
    script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray

# Dask-on-Ray sort run with --nbytes 100_000_000_000, --npartitions 200 and
# --num-nodes 1.
- name: dask_on_ray_100gb_sort
  cluster:
    app_config: dask_on_ray/dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml

  run:
    timeout: 7200
    script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray

#
# Multi node shuffle
#

# Test multi-node 100GB shuffle with a small number of partitions.
- name: shuffle_100gb
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_multi.yaml

  run:
    timeout: 3000
    # `prepare` runs before `script`.
    prepare: python wait_cluster.py 4 600
    script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6

# Test non-streaming multi-node 100GB shuffle with a small number of partitions.
- name: non_streaming_shuffle_100gb
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_multi.yaml

  run:
    timeout: 3000
    prepare: python wait_cluster.py 4 600
    script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming

# Test autoscaling 1TB shuffle with a large number of partitions.
# NOTE(review): this entry was described as a *streaming* shuffle and its name
# has no non_streaming_ prefix, yet the script passes --no-streaming. Confirm
# which one is intended; the script is left unchanged here.
- name: autoscaling_shuffle_1tb_1000_partitions
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_autoscaling.yaml

  run:
    timeout: 4000
    script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming

# Test multi-node 1TB streaming shuffle with a large number of partitions.
- name: shuffle_1tb_1000_partition
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_large_scale.yaml

  run:
    timeout: 3000
    prepare: python wait_cluster.py 20 900
    script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9

# Test multi-node 1TB non-streaming shuffle with a large number of partitions.
- name: non_streaming_shuffle_1tb_1000_partition
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_large_scale.yaml

  run:
    timeout: 3000
    prepare: python wait_cluster.py 20 900
    script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming

# Stress test for 1TB multi-node streaming shuffle
# (--num-partitions=5000, --partition-size=200e6).
- name: shuffle_1tb_5000_partitions
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_large_scale.yaml

  run:
    timeout: 9000
    prepare: python wait_cluster.py 20 900
    script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6

# Stress test for 1TB multi-node non-streaming shuffle.
# - name: non_streaming_shuffle_1tb_5000_partitions
#   stable: False
#   cluster:
#     app_config: shuffle/shuffle_app_config.yaml
#     compute_template: shuffle/shuffle_compute_large_scale.yaml
#
#   run:
#     timeout: 7200
#     prepare: python wait_cluster.py 20 900
#     script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming

# Large-scale Dask-on-Ray test without spilling.
- name: dask_on_ray_large_scale_test_no_spilling
  cluster:
    app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml

  run:
    timeout: 7200
    prepare: python wait_cluster.py 21 600
    script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray

  # Smoke-test variant: carries its own cluster/run settings, so both are
  # nested under smoke_test (as siblings they would duplicate the keys above).
  smoke_test:
    cluster:
      app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
      compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml

    run:
      timeout: 7200
      prepare: python wait_cluster.py 5 600
      script: python dask_on_ray/large_scale_test.py --num_workers 4 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray

# Large-scale Dask-on-Ray test with spilling.
- name: dask_on_ray_large_scale_test_spilling
  cluster:
    app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml

  run:
    timeout: 7200
    prepare: python wait_cluster.py 21 600
    script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray

  # Smoke-test variant: carries its own cluster/run settings, so both are
  # nested under smoke_test (as siblings they would duplicate the keys above).
  smoke_test:
    cluster:
      app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
      compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml

    run:
      timeout: 7200
      prepare: python wait_cluster.py 5 600
      script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray

# Stress tests with many tasks.
- name: stress_test_many_tasks
  cluster:
    app_config: stress_tests/stress_tests_app_config.yaml
    compute_template: stress_tests/stress_tests_compute.yaml

  run:
    timeout: 7200
    script: python stress_tests/test_many_tasks.py

  # Smoke-test variant: uses smoke_test_compute.yaml, a 3600 timeout and the
  # --num-nodes=4 --smoke-test flags.
  smoke_test:
    cluster:
      app_config: stress_tests/stress_tests_app_config.yaml
      compute_template: stress_tests/smoke_test_compute.yaml

    run:
      timeout: 3600
      script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test

# Stress tests with dead actors.
- name: stress_test_dead_actors
  cluster:
    app_config: stress_tests/stress_tests_app_config.yaml
    compute_template: stress_tests/stress_tests_compute.yaml

  run:
    timeout: 7200
    script: python stress_tests/test_dead_actors.py

  # Smoke-test variant: uses smoke_test_compute.yaml, a 3600 timeout and
  # explicit --num-nodes/--num-parents/--num-children values.
  smoke_test:
    cluster:
      app_config: stress_tests/stress_tests_app_config.yaml
      compute_template: stress_tests/smoke_test_compute.yaml

    run:
      timeout: 3600
      script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3 --num-children=3

# Stress tests with placement groups.
- name: stress_test_placement_group
  cluster:
    app_config: stress_tests/stress_tests_app_config.yaml
    compute_template: stress_tests/placement_group_tests_compute.yaml

  run:
    timeout: 7200
    script: python stress_tests/test_placement_group.py

# Stress tests with many threaded actors.
- name: threaded_actors_stress_test
  cluster:
    app_config: stress_tests/stress_tests_app_config.yaml
    compute_template: stress_tests/stress_test_threaded_actor_compute.yaml

  run:
    timeout: 7200
    prepare: python wait_cluster.py 201 600
    script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60

  # Smoke-test variant: uses smoke_test_compute.yaml with a shorter
  # --test-runtime and --kill-interval_s.
  smoke_test:
    cluster:
      app_config: stress_tests/stress_tests_app_config.yaml
      compute_template: stress_tests/smoke_test_compute.yaml

    run:
      timeout: 3600
      prepare: python wait_cluster.py 5 600
      script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30

  # NOTE(review): nesting was reconstructed; `stable` is placed at test level
  # (the commented-out entry in this file lists it right after `name`) — confirm.
  stable: false

# Test decision tree on autoscaling compute cluster.
- name: decision_tree_autoscaling
  cluster:
    app_config: decision_tree/decision_tree_app_config.yaml
    compute_template: decision_tree/autoscaling_compute.yaml

  run:
    timeout: 3000
    script: python decision_tree/cart_with_tree.py

# Test 20 concurrent decision tree runs on autoscaling compute cluster.
- name: decision_tree_autoscaling_20_runs
  cluster:
    app_config: decision_tree/decision_tree_app_config.yaml
    compute_template: decision_tree/autoscaling_compute.yaml

  run:
    timeout: 9600
    script: python decision_tree/cart_with_tree.py --concurrency=20

# Stress test shuffle_data_loader.
- name: shuffle_data_loader
  cluster:
    app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
    compute_template: shuffle_data_loader/shuffle_data_loader_compute.yaml

  run:
    timeout: 7200
    # Folded scalar (`>`): the lines below are joined with single spaces into
    # one command line.
    script: >
      python shuffle_data_loader/benchmark.py
      --num-rows 100_000_000
      --num-files 50
      --num-row-groups-per-file 5
      --num-reducers 32 --num-trainers 16
      --batch-size 250000
      --cluster
      --num-trials 1
      --num-epochs 10
      --max-concurrent-epochs 2
      --data-dir s3://core-nightly-test/shuffle-data/
      --no-stats

# Stress test shuffle_data_loader, 4-node variant
# (shuffle_data_loader_compute_4_nodes.yaml, --num-rows 400_000_000).
- name: shuffle_data_loader_4_nodes
  cluster:
    app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
    compute_template: shuffle_data_loader/shuffle_data_loader_compute_4_nodes.yaml

  run:
    timeout: 7200
    prepare: python wait_cluster.py 4 600
    # Folded scalar (`>`): the lines below are joined with single spaces into
    # one command line.
    script: >
      python shuffle_data_loader/benchmark.py
      --num-rows 400_000_000
      --num-files 50
      --num-row-groups-per-file 5
      --num-reducers 32 --num-trainers 16
      --batch-size 250000
      --cluster
      --num-trials 1
      --num-epochs 10
      --max-concurrent-epochs 2
      --data-dir s3://core-nightly-test/shuffle-data/
      --no-stats

# Dask-on-Ray sort run with --nbytes 1_000_000_000_000, --npartitions 1000 and
# --num-nodes 31; uses --s3-bucket instead of a local --file-path.
- name: dask_on_ray_1tb_sort
  cluster:
    app_config: dask_on_ray/dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/1tb_sort_compute.yaml

  run:
    timeout: 7200
    prepare: python wait_cluster.py 32 1000
    script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test

# Many-nodes actor test; `prepare` runs wait_cluster.py 251 5400 before the
# script.
- name: many_nodes_actor_test
  cluster:
    app_config: many_nodes_tests/app_config.yaml
    compute_template: many_nodes_tests/compute_config.yaml

  run:
    timeout: 7200
    prepare: python wait_cluster.py 251 5400
    script: python many_nodes_tests/actor_test.py

# Placement group autoscaling regression test (runs pg_run.py).
- name: pg_autoscaling_regression_test
  cluster:
    app_config: placement_group_tests/app_config.yaml
    compute_template: placement_group_tests/compute.yaml

  run:
    timeout: 1200
    script: python placement_group_tests/pg_run.py

# Placement group long-running performance test (--num-stages 2000).
- name: pg_long_running_performance_test
  cluster:
    app_config: placement_group_tests/app_config.yaml
    compute_template: placement_group_tests/long_running_test_compute.yaml

  run:
    timeout: 3600
    prepare: python wait_cluster.py 2 600
    script: python placement_group_tests/long_running_performance_test.py --num-stages 2000

  # NOTE(review): nesting was reconstructed; `stable` is placed at test level
  # (the commented-out entry in this file lists it right after `name`) — confirm.
  stable: false

# Placement group performance test.
- name: placement_group_performance_test
  cluster:
    app_config: placement_group_tests/app_config.yaml
    compute_template: placement_group_tests/pg_perf_test_compute.yaml

  run:
    timeout: 1200
    prepare: python wait_cluster.py 5 600
    script: python placement_group_tests/placement_group_performance_test.py