#
# Single node shuffle
#

# Test basic single node 10GB shuffle with a small number of partitions.
# This doesn't require object spilling.
- name: shuffle_10gb
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml
  run:
    timeout: 3000
    script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6

# Test single node 50GB shuffle with a small number of large (1GB) partitions.
- name: shuffle_50gb
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml
  run:
    timeout: 3000
    script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9

# Test single node 50GB shuffle with a large number of partitions.
- name: shuffle_50gb_large_partition
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml
  run:
    timeout: 3000
    script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6

# Test non streaming shuffle in a single node with a small number of partitions.
- name: non_streaming_shuffle_50gb
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml
  run:
    timeout: 3000
    script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming

# Test non streaming shuffle in a single node with a large number of partitions.
- name: non_streaming_shuffle_50gb_large_partition
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml
  run:
    timeout: 3000
    script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming

# Test Dask-on-Ray 10GB sort on a single node.
- name: dask_on_ray_10gb_sort
  cluster:
    app_config: dask_on_ray/dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
  run:
    timeout: 7200
    # Folded scalar: the lines below are joined with single spaces into one
    # command line.
    script: >-
      python dask_on_ray/dask_on_ray_sort.py
      --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1
      --ray --data-dir /tmp/ray --file-path /tmp/ray

# Test Dask-on-Ray 100GB sort on a single node.
- name: dask_on_ray_100gb_sort
  cluster:
    app_config: dask_on_ray/dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
  run:
    timeout: 7200
    # Folded scalar: the lines below are joined with single spaces into one
    # command line.
    script: >-
      python dask_on_ray/dask_on_ray_sort.py
      --nbytes 100_000_000_000 --npartitions 200 --num-nodes 1
      --ray --data-dir /tmp/ray --file-path /tmp/ray

#
# Multi node shuffle
#

# Test multi node 100GB shuffle with a small number of partitions.
- name: shuffle_100gb
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_multi.yaml
  run:
    timeout: 3000
    prepare: python wait_cluster.py 4 600
    script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6

# Test non streaming multi node 100GB shuffle with a small number of partitions.
- name: non_streaming_shuffle_100gb
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_multi.yaml
  run:
    timeout: 3000
    prepare: python wait_cluster.py 4 600
    script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming

# Test multi node 1TB streaming shuffle with a large partition size.
- name: streaming_shuffle_1tb_100_partitions
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_multi.yaml
  run:
    timeout: 3000
    # NOTE(review): the other tests using shuffle_compute_multi.yaml run
    # `prepare: python wait_cluster.py 4 600` first; confirm that skipping the
    # wait step here is intentional.
    script: python shuffle/shuffle_test.py --num-partitions=100 --partition-size=10e9

# Test multi node 1TB non streaming shuffle with a large partition size.
- name: non_streaming_shuffle_1tb_100_partitions
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_multi.yaml
  run:
    timeout: 3000
    # NOTE(review): the 100GB tests using shuffle_compute_multi.yaml run
    # `prepare: python wait_cluster.py 4 600` first; confirm that skipping the
    # wait step here is intentional.
    script: python shuffle/shuffle_test.py --num-partitions=100 --partition-size=10e9 --no-streaming

# Test multi node 1TB streaming shuffle with a large number of partitions.
- name: shuffle_1tb_1000_partition
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_large_scale.yaml
  run:
    timeout: 3000
    prepare: python wait_cluster.py 20 600
    script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9

# Test multi node 1TB non streaming shuffle with a large number of partitions.
- name: non_streaming_shuffle_1tb_1000_partition
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_large_scale.yaml
  run:
    timeout: 3000
    prepare: python wait_cluster.py 20 600
    script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming

# Stress test for 1TB multi node streaming shuffle.
- name: shuffle_1tb_5000_partitions
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_large_scale.yaml
  run:
    timeout: 3000
    prepare: python wait_cluster.py 20 600
    script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6

# Stress test for 1TB multi node non-streaming shuffle.
- name: non_streaming_shuffle_1tb_5000_partitions
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_large_scale.yaml
  run:
    timeout: 3000
    prepare: python wait_cluster.py 20 600
    script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming

# Test large scale dask on ray test without spilling.
- name: dask_on_ray_large_scale_test_no_spilling
  cluster:
    app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
  run:
    timeout: 7200
    prepare: python wait_cluster.py 5 600
    # Folded scalar: the lines below are joined with single spaces into one
    # command line.
    script: >-
      python dask_on_ray/large_scale_test.py
      --num_workers 4 --worker_obj_store_size_in_gb 20
      --error_rate 0 --data_save_path /tmp/ray

# Test large scale dask on ray test with spilling.
- name: dask_on_ray_large_scale_test_spilling
  cluster:
    app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
  run:
    timeout: 7200
    prepare: python wait_cluster.py 5 600
    # Folded scalar: the lines below are joined with single spaces into one
    # command line.
    script: >-
      python dask_on_ray/large_scale_test.py
      --num_workers 32 --worker_obj_store_size_in_gb 70
      --error_rate 0 --data_save_path /tmp/ray

# Stress tests with many tasks
- name: stress_test_many_tasks
  cluster:
    app_config: stress_tests/stress_tests_app_config.yaml
    compute_template: stress_tests/stress_tests_compute.yaml
  run:
    timeout: 7200
    script: python stress_tests/test_many_tasks.py
  # Reduced variant of the test above: its own cluster config, a shorter
  # timeout, and smoke-test script flags.
  smoke_test:
    cluster:
      app_config: stress_tests/stress_tests_app_config.yaml
      compute_template: stress_tests/smoke_test_compute.yaml
    run:
      timeout: 3600
      script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test

# Stress tests with dead actors
- name: stress_test_dead_actors
  cluster:
    app_config: stress_tests/stress_tests_app_config.yaml
    compute_template: stress_tests/stress_tests_compute.yaml
  run:
    timeout: 7200
    script: python stress_tests/test_dead_actors.py
  # Reduced variant of the test above: its own cluster config, a shorter
  # timeout, and smaller node/actor counts.
  smoke_test:
    cluster:
      app_config: stress_tests/stress_tests_app_config.yaml
      compute_template: stress_tests/smoke_test_compute.yaml
    run:
      timeout: 3600
      script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3 --num-children=3

# Stress tests with placement groups
- name: stress_test_placement_group
  cluster:
    app_config: stress_tests/stress_tests_app_config.yaml
    compute_template: stress_tests/placement_group_tests_compute.yaml
  run:
    timeout: 7200
    script: python stress_tests/test_placement_group.py

# Test decision tree on autoscaling compute cluster.
- name: decision_tree_autoscaling
  cluster:
    app_config: decision_tree/decision_tree_app_config.yaml
    compute_template: decision_tree/autoscaling_compute.yaml
  run:
    timeout: 3000
    script: python decision_tree/cart_with_tree.py