mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00

The first migration of a test into k8s. We are adopting a conservative approach (migrating slowly while keeping the existing test suites). Once things are confirmed to be stable, we will migrate more quickly.
418 lines
13 KiB
YAML
418 lines
13 KiB
YAML
#
|
|
# Single node shuffle
|
|
#
|
|
# Test basic single node 10GB shuffle with a small number of partitions.
|
|
# This doesn't require object spilling.
|
|
- name: shuffle_10gb
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle/shuffle_app_config.yaml
|
|
compute_template: shuffle/shuffle_compute_single.yaml
|
|
|
|
run:
|
|
timeout: 3000
|
|
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6
|
|
|
|
# Test single node 50GB shuffle with a small number of partitions.
|
|
- name: shuffle_50gb
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle/shuffle_app_config.yaml
|
|
compute_template: shuffle/shuffle_compute_single.yaml
|
|
|
|
run:
|
|
timeout: 3000
|
|
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9
|
|
|
|
# Test single node 50GB shuffle with a large number of partitions.
|
|
- name: shuffle_50gb_large_partition
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle/shuffle_app_config.yaml
|
|
compute_template: shuffle/shuffle_compute_single.yaml
|
|
|
|
run:
|
|
timeout: 3000
|
|
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6
|
|
|
|
# Test non-streaming shuffle in a single node with a small number of partitions.
|
|
- name: non_streaming_shuffle_50gb
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle/shuffle_app_config.yaml
|
|
compute_template: shuffle/shuffle_compute_single.yaml
|
|
|
|
run:
|
|
timeout: 3000
|
|
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming
|
|
|
|
# Test non-streaming shuffle in a single node with a large number of partitions.
|
|
- name: non_streaming_shuffle_50gb_large_partition
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle/shuffle_app_config.yaml
|
|
compute_template: shuffle/shuffle_compute_single.yaml
|
|
|
|
run:
|
|
timeout: 3000
|
|
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming
|
|
|
|
- name: dask_on_ray_10gb_sort
|
|
team: core
|
|
cluster:
|
|
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
|
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
|
|
|
|
- name: dask_on_ray_100gb_sort
|
|
team: core
|
|
cluster:
|
|
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
|
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
|
|
|
|
#
|
|
# Multi node shuffle
|
|
#
|
|
|
|
# Test multi-node 100GB shuffle with a small number of partitions.
|
|
- name: shuffle_100gb
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle/shuffle_app_config.yaml
|
|
compute_template: shuffle/shuffle_compute_multi.yaml
|
|
|
|
run:
|
|
timeout: 3000
|
|
prepare: python wait_cluster.py 4 600
|
|
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
|
|
|
|
# Test non-streaming multi-node 100GB shuffle with a small number of partitions.
|
|
- name: non_streaming_shuffle_100gb
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle/shuffle_app_config.yaml
|
|
compute_template: shuffle/shuffle_compute_multi.yaml
|
|
|
|
run:
|
|
timeout: 3000
|
|
prepare: python wait_cluster.py 4 600
|
|
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
|
|
|
|
# Test autoscaling 1TB non-streaming shuffle with a large number of partitions.
|
|
- name: autoscaling_shuffle_1tb_1000_partitions
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle/shuffle_app_config.yaml
|
|
compute_template: shuffle/shuffle_compute_autoscaling.yaml
|
|
|
|
run:
|
|
timeout: 4000
|
|
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
|
|
|
|
# Test multi-node 1TB streaming shuffle with a large number of partitions.
|
|
- name: shuffle_1tb_1000_partition
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle/shuffle_app_config.yaml
|
|
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
|
|
run:
|
|
timeout: 3000
|
|
prepare: python wait_cluster.py 20 900
|
|
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
|
|
|
|
# Test multi-node 1TB non-streaming shuffle with a large number of partitions.
|
|
- name: non_streaming_shuffle_1tb_1000_partition
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle/shuffle_app_config.yaml
|
|
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
|
|
run:
|
|
timeout: 3000
|
|
prepare: python wait_cluster.py 20 900
|
|
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
|
|
|
|
# Stress test for 1TB multi node streaming shuffle.
|
|
- name: shuffle_1tb_5000_partitions
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle/shuffle_app_config.yaml
|
|
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
|
|
run:
|
|
timeout: 9000
|
|
prepare: python wait_cluster.py 20 900
|
|
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
|
|
|
|
# Stress test for 1TB multi node non-streaming shuffle.
|
|
# - name: non_streaming_shuffle_1tb_5000_partitions
|
|
# team: core
|
|
# stable: False
|
|
# cluster:
|
|
# app_config: shuffle/shuffle_app_config.yaml
|
|
# compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
|
|
# run:
|
|
# timeout: 7200
|
|
# prepare: python wait_cluster.py 20 900
|
|
# script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming
|
|
|
|
- name: k8s_dask_on_ray_large_scale_test_no_spilling
|
|
cluster:
|
|
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
compute_template: dask_on_ray/dask_on_ray_stress_compute_k8s.yaml
|
|
compute_on_k8s: True
|
|
|
|
run:
|
|
timeout: 7200
|
|
prepare: python wait_cluster.py 21 600
|
|
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
|
stable: false
|
|
|
|
# Large-scale Dask-on-Ray test without spilling.
|
|
- name: dask_on_ray_large_scale_test_no_spilling
|
|
team: core
|
|
cluster:
|
|
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
prepare: python wait_cluster.py 21 600
|
|
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
|
|
|
smoke_test:
|
|
cluster:
|
|
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
prepare: python wait_cluster.py 5 600
|
|
script: python dask_on_ray/large_scale_test.py --num_workers 4 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
|
|
|
# Large-scale Dask-on-Ray test with spilling.
|
|
- name: dask_on_ray_large_scale_test_spilling
|
|
team: core
|
|
cluster:
|
|
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
prepare: python wait_cluster.py 21 600
|
|
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
|
|
|
smoke_test:
|
|
cluster:
|
|
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
prepare: python wait_cluster.py 5 600
|
|
script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
|
|
|
# Stress tests with many tasks
|
|
- name: stress_test_many_tasks
|
|
team: core
|
|
cluster:
|
|
app_config: stress_tests/stress_tests_app_config.yaml
|
|
compute_template: stress_tests/stress_tests_compute.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
script: python stress_tests/test_many_tasks.py
|
|
|
|
smoke_test:
|
|
cluster:
|
|
app_config: stress_tests/stress_tests_app_config.yaml
|
|
compute_template: stress_tests/smoke_test_compute.yaml
|
|
|
|
run:
|
|
timeout: 3600
|
|
script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
|
|
|
|
# Stress tests with dead actors
|
|
- name: stress_test_dead_actors
|
|
team: core
|
|
cluster:
|
|
app_config: stress_tests/stress_tests_app_config.yaml
|
|
compute_template: stress_tests/stress_tests_compute.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
script: python stress_tests/test_dead_actors.py
|
|
|
|
smoke_test:
|
|
cluster:
|
|
app_config: stress_tests/stress_tests_app_config.yaml
|
|
compute_template: stress_tests/smoke_test_compute.yaml
|
|
|
|
run:
|
|
timeout: 3600
|
|
script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3 --num-children=3
|
|
|
|
# Stress tests with placement groups
|
|
- name: stress_test_placement_group
|
|
team: core
|
|
cluster:
|
|
app_config: stress_tests/stress_tests_app_config.yaml
|
|
compute_template: stress_tests/placement_group_tests_compute.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
script: python stress_tests/test_placement_group.py
|
|
|
|
# Stress tests with many threaded actors.
|
|
- name: threaded_actors_stress_test
|
|
team: core
|
|
cluster:
|
|
app_config: stress_tests/stress_tests_app_config.yaml
|
|
compute_template: stress_tests/stress_test_threaded_actor_compute.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
prepare: python wait_cluster.py 201 600
|
|
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
|
|
|
|
smoke_test:
|
|
cluster:
|
|
app_config: stress_tests/stress_tests_app_config.yaml
|
|
compute_template: stress_tests/smoke_test_compute.yaml
|
|
|
|
run:
|
|
timeout: 3600
|
|
prepare: python wait_cluster.py 5 600
|
|
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
|
|
stable: false
|
|
|
|
# Test decision tree on autoscaling compute cluster.
|
|
- name: decision_tree_autoscaling
|
|
team: core
|
|
cluster:
|
|
app_config: decision_tree/decision_tree_app_config.yaml
|
|
compute_template: decision_tree/autoscaling_compute.yaml
|
|
|
|
run:
|
|
timeout: 3000
|
|
script: python decision_tree/cart_with_tree.py
|
|
|
|
# Test 20 concurrent decision tree runs on autoscaling compute cluster.
|
|
- name: decision_tree_autoscaling_20_runs
|
|
team: core
|
|
cluster:
|
|
app_config: decision_tree/decision_tree_app_config.yaml
|
|
compute_template: decision_tree/autoscaling_compute.yaml
|
|
run:
|
|
timeout: 9600
|
|
script: python decision_tree/cart_with_tree.py --concurrency=20
|
|
|
|
# Stress test shuffle_data_loader.
|
|
- name: shuffle_data_loader
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
|
|
compute_template: shuffle_data_loader/shuffle_data_loader_compute.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
script: >
|
|
python shuffle_data_loader/benchmark.py
|
|
--num-rows 100_000_000
|
|
--num-files 50
|
|
--num-row-groups-per-file 5
|
|
--num-reducers 32 --num-trainers 16
|
|
--batch-size 250000
|
|
--cluster
|
|
--num-trials 1
|
|
--num-epochs 10
|
|
--max-concurrent-epochs 2
|
|
--data-dir s3://core-nightly-test/shuffle-data/
|
|
--no-stats
|
|
|
|
# Stress test shuffle_data_loader on 4 nodes.
|
|
- name: shuffle_data_loader_4_nodes
|
|
team: core
|
|
cluster:
|
|
app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
|
|
compute_template: shuffle_data_loader/shuffle_data_loader_compute_4_nodes.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
prepare: python wait_cluster.py 4 600
|
|
script: >
|
|
python shuffle_data_loader/benchmark.py
|
|
--num-rows 400_000_000
|
|
--num-files 50
|
|
--num-row-groups-per-file 5
|
|
--num-reducers 32 --num-trainers 16
|
|
--batch-size 250000
|
|
--cluster
|
|
--num-trials 1
|
|
--num-epochs 10
|
|
--max-concurrent-epochs 2
|
|
--data-dir s3://core-nightly-test/shuffle-data/
|
|
--no-stats
|
|
|
|
- name: dask_on_ray_1tb_sort
|
|
team: core
|
|
cluster:
|
|
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
|
compute_template: dask_on_ray/1tb_sort_compute.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
prepare: python wait_cluster.py 32 1000
|
|
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
|
|
|
|
- name: many_nodes_actor_test
|
|
team: core
|
|
cluster:
|
|
app_config: many_nodes_tests/app_config.yaml
|
|
compute_template: many_nodes_tests/compute_config.yaml
|
|
|
|
run:
|
|
timeout: 7200
|
|
prepare: python wait_cluster.py 251 5400
|
|
script: python many_nodes_tests/actor_test.py
|
|
|
|
- name: pg_autoscaling_regression_test
|
|
team: core
|
|
cluster:
|
|
app_config: placement_group_tests/app_config.yaml
|
|
compute_template: placement_group_tests/compute.yaml
|
|
|
|
run:
|
|
timeout: 1200
|
|
script: python placement_group_tests/pg_run.py
|
|
|
|
- name: pg_long_running_performance_test
|
|
team: core
|
|
cluster:
|
|
app_config: placement_group_tests/app_config.yaml
|
|
compute_template: placement_group_tests/long_running_test_compute.yaml
|
|
|
|
run:
|
|
timeout: 3600
|
|
prepare: python wait_cluster.py 2 600
|
|
script: python placement_group_tests/long_running_performance_test.py --num-stages 2000
|
|
|
|
- name: placement_group_performance_test
|
|
team: core
|
|
cluster:
|
|
app_config: placement_group_tests/app_config.yaml
|
|
compute_template: placement_group_tests/pg_perf_test_compute.yaml
|
|
|
|
run:
|
|
timeout: 1200
|
|
prepare: python wait_cluster.py 5 600
|
|
script: python placement_group_tests/placement_group_performance_test.py
|