ray/release/nightly_tests/nightly_tests.yaml
SangBin Cho 5ec63ccc5f
[Regresion test] Placement group long running test (#20251)
Why are these changes needed?
In the past, there was a regression the placement group creation time gets slower as time goes. I believe the issue is fixed in the master, but this PR verifies if that's actually fixed.

This PR adds a long running test for the placement group. There are 2 purposes of the test.

Make sure the placement group creation / removal doesn't get slower as time goes. The test basically measure the first 20 iteration P50 creation time and run very long iteration. After all iteration, it checks if the p50 creation time is not too slow compared to the initial round.
Make sure placement group removal / creation works consistently for a long time without an issue.
Q: Should we make it a real long running test? (that runs for a day?)
2021-11-16 04:21:18 -08:00

379 lines
12 KiB
YAML

#
# Single node shuffle
#
# Test basic single node 10GB shuffle with a small number of partitions.
# This doesn't require object spilling.
- name: shuffle_10gb
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6
# Test single node 50GB shuffle with a large number of partitions.
- name: shuffle_50gb
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9
# Test single node 50GB shuffle with a large number of partitions.
- name: shuffle_50gb_large_partition
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6
# Test non streaming shuffle in a single node with a small number of partition.
- name: non_streaming_shuffle_50gb
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming
# Test non streaming shuffle in a single node with a large number of partition.
- name: non_streaming_shuffle_50gb_large_partition
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming
- name: dask_on_ray_10gb_sort
cluster:
app_config: dask_on_ray/dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
run:
timeout: 7200
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
- name: dask_on_ray_100gb_sort
cluster:
app_config: dask_on_ray/dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
run:
timeout: 7200
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
#
# Multi node shuffle
#
# Test multi nodes 100GB shuffle with a small number of partitions.
- name: shuffle_100gb
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_multi.yaml
run:
timeout: 3000
prepare: python wait_cluster.py 4 600
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
# Test non streaming multi nodes 100GB shuffle with a small number of partitions.
- name: non_streaming_shuffle_100gb
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_multi.yaml
run:
timeout: 3000
prepare: python wait_cluster.py 4 600
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
# Test autoscaling 1TB streaming shuffle with a large number of partitions.
- name: autoscaling_shuffle_1tb_1000_partitions
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_autoscaling.yaml
run:
timeout: 4000
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
# Test multi nodes 1TB streaming shuffle with a large number of partitions.
- name: shuffle_1tb_1000_partition
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_large_scale.yaml
run:
timeout: 3000
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
# Test multi nodes 1TB non streaming shuffle with a large number of partitions.
- name: non_streaming_shuffle_1tb_1000_partition
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_large_scale.yaml
run:
timeout: 3000
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
# Stress test for 1TB multi node streaming shuffle.
- name: shuffle_1tb_5000_partitions
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_large_scale.yaml
run:
timeout: 9000
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
# Stress test for 1TB multi node non-streaming shuffle.
# - name: non_streaming_shuffle_1tb_5000_partitions
# stable: False
# cluster:
# app_config: shuffle/shuffle_app_config.yaml
# compute_template: shuffle/shuffle_compute_large_scale.yaml
# run:
# timeout: 7200
# prepare: python wait_cluster.py 20 900
# script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming
# Test large scale dask on ray test without spilling.
- name: dask_on_ray_large_scale_test_no_spilling
cluster:
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 21 600
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
smoke_test:
cluster:
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 5 600
script: python dask_on_ray/large_scale_test.py --num_workers 4 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
# Test large scale dask on ray test with spilling.
- name: dask_on_ray_large_scale_test_spilling
cluster:
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 21 600
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
smoke_test:
cluster:
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 5 600
script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
# Stress tests with many tasks
- name: stress_test_many_tasks
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/stress_tests_compute.yaml
run:
timeout: 7200
script: python stress_tests/test_many_tasks.py
smoke_test:
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/smoke_test_compute.yaml
run:
timeout: 3600
script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
# Stress tests with dead actors
- name: stress_test_dead_actors
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/stress_tests_compute.yaml
run:
timeout: 7200
script: python stress_tests/test_dead_actors.py
smoke_test:
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/smoke_test_compute.yaml
run:
timeout: 3600
script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3 --num-children=3
# Stress tests with placement groups
- name: stress_test_placement_group
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/placement_group_tests_compute.yaml
run:
timeout: 7200
script: python stress_tests/test_placement_group.py
# Stress tests with many threaded actors.
- name: threaded_actors_stress_test
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/stress_test_threaded_actor_compute.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 201 600
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
smoke_test:
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/smoke_test_compute.yaml
run:
timeout: 3600
prepare: python wait_cluster.py 5 600
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
stable: false
# Test decision tree on autoscaling compute cluster.
- name: decision_tree_autoscaling
cluster:
app_config: decision_tree/decision_tree_app_config.yaml
compute_template: decision_tree/autoscaling_compute.yaml
run:
timeout: 3000
script: python decision_tree/cart_with_tree.py
# Test 20 concurrent decision tree runs on autoscaling compute cluster.
- name: decision_tree_autoscaling_20_runs
cluster:
app_config: decision_tree/decision_tree_app_config.yaml
compute_template: decision_tree/autoscaling_compute.yaml
run:
timeout: 9600
script: python decision_tree/cart_with_tree.py --concurrency=20
# Stress test shuffle_data_loader.
- name: shuffle_data_loader
cluster:
app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
compute_template: shuffle_data_loader/shuffle_data_loader_compute.yaml
run:
timeout: 7200
script: >
python shuffle_data_loader/benchmark.py
--num-rows 100_000_000
--num-files 50
--num-row-groups-per-file 5
--num-reducers 32 --num-trainers 16
--batch-size 250000
--cluster
--num-trials 1
--num-epochs 10
--max-concurrent-epochs 2
--data-dir s3://core-nightly-test/shuffle-data/
--no-stats
# Stress test shuffle_data_loader.
- name: shuffle_data_loader_4_nodes
cluster:
app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
compute_template: shuffle_data_loader/shuffle_data_loader_compute_4_nodes.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 4 600
script: >
python shuffle_data_loader/benchmark.py
--num-rows 400_000_000
--num-files 50
--num-row-groups-per-file 5
--num-reducers 32 --num-trainers 16
--batch-size 250000
--cluster
--num-trials 1
--num-epochs 10
--max-concurrent-epochs 2
--data-dir s3://core-nightly-test/shuffle-data/
--no-stats
- name: dask_on_ray_1tb_sort
cluster:
app_config: dask_on_ray/dask_on_ray_app_config.yaml
compute_template: dask_on_ray/1tb_sort_compute.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 32 1000
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
- name: many_nodes_actor_test
cluster:
app_config: many_nodes_tests/app_config.yaml
compute_template: many_nodes_tests/compute_config.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 251 5400
script: python many_nodes_tests/actor_test.py
- name: pg_autoscaling_regression_test
cluster:
app_config: placement_group_tests/app_config.yaml
compute_template: placement_group_tests/compute.yaml
run:
timeout: 1200
script: python placement_group_tests/pg_run.py
- name: pg_long_running_performance_test
cluster:
app_config: placement_group_tests/app_config.yaml
compute_template: placement_group_tests/long_running_test_compute.yaml
run:
timeout: 3600
prepare: python wait_cluster.py 2 600
script: python placement_group_tests/long_running_performance_test.py --num-stages 5000
stable: false
- name: placement_group_performance_test
cluster:
app_config: placement_group_tests/app_config.yaml
compute_template: placement_group_tests/pg_perf_test_compute.yaml
run:
timeout: 1200
prepare: python wait_cluster.py 5 600
script: python placement_group_tests/placement_group_performance_test.py