# Global release test configuration file.
# All your release test configuration should go here. Adding release tests here
# will automatically enable them in the Buildkite release testing schedules
# (unless they have frequency: disabled).
# Here is an example configuration for reference:
#- name: example_test
#  # Tests with the same group will be grouped in the Buildkite UI
#  group: Example group
#  # Provide the working directory which will be uploaded to the cluster
#  working_dir: example_dir
#
#  # For release test infra migration, we provide these fields that are populated
#  # in the database
#  legacy:
#    test_name: example_test
#    test_suite: examples
#
#  # How often to run the tests.
#  # One of [disabled, any, multi, nightly, weekly].
#  frequency: weekly
#  # Owning team. This field will be persisted to the database
#  team: ml
#
#  # Optional location of a bash setup script to run on the driver
#  # when setting up the local environment. Relative to working_dir
#  driver_setup: setup_driver.sh
#
#  # Cluster information
#  cluster:
#    # Location of cluster env, relative to working_dir
#    cluster_env: cluster_env.yaml
#    # Location of cluster compute, relative to working_dir
#    cluster_compute: cluster_compute.yaml
#    # Autosuspend parameter passed to the cluster.
#    # The cluster will automatically terminate if inactive for this
#    # many minutes. Defaults to 10 if not set.
#    autosuspend_mins: 10
#    # Optional cloud_id to use instead of the default cloud
#    cloud_id: cld_12345678
#    # Alternatively, you can specify a cloud name
#    cloud_name: anyscale_default_cloud
#
#  # Run configuration for the test
#  run:
#    # Type of test. Can be sdk_command or client (job to be implemented soon).
#    # Uses either Anyscale SDK commands or the Ray client to run the actual
#    # release test.
#    type: sdk_command
#
#    # File manager to use to transfer files to and from the cluster.
#    # Can be any of [sdk, client, job].
#    file_manager: sdk
#
#    # If you want to wait for nodes to be ready, you can specify this here:
#    wait_for_nodes:
#      # Number of nodes
#      num_nodes: 16
#      # Timeout for waiting for nodes. If nodes are not up by then, the
#      # test will fail.
#      timeout: 600
#
#    # Optional prepare script to be run on the cluster before the test script
#    prepare: python prepare.py
#    # The prepare command can have a separate timeout
#    prepare_timeout: 300
#
#    # Main script to run as the test script
#    script: python workloads/train_small.py
#    # Timeout in seconds. After this time the test is considered as failed.
#    timeout: 600
#
#  # You can specify smoke test definitions here. If a smoke test is triggered,
#  # it will deep update the main test configuration with the values provided
#  # here. Smoke tests will automatically run with IS_SMOKE_TEST=1 as an
#  # environment variable and receive the --smoke-test flag as a parameter in the
#  # run script.
#  smoke_test:
#    # Smoke tests can have different frequencies. A smoke test is only triggered
#    # when the regular test is not matched.
#    frequency: nightly
#    # Here we adjust the run timeout down and run on fewer nodes. The test script
#    # remains the same.
#    run:
#      timeout: 300
#      wait_for_nodes:
#        num_nodes: 4
#        timeout: 600
#
#  # After the test has finished, this handler (in alerts/) will process the
#  # results. It can then let the test fail, e.g. if a metric regression is observed.
#  alert: default

#######################
# XGBoost release tests
#######################
- name: xgboost_train_small
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: train_small
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
    autosuspend_mins: 10

  run:
    timeout: 600
    script: python workloads/train_small.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: client

  alert: xgboost_tests

- name: xgboost_train_moderate
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: train_moderate
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 600
    script: python workloads/train_moderate.py

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: xgboost_tests

- name: xgboost_train_gpu
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: train_gpu
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config_gpu.yaml
    cluster_compute: tpl_gpu_small.yaml

  run:
    timeout: 600
    script: python workloads/train_gpu.py

    wait_for_nodes:
      num_nodes: 5
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: xgboost_tests

- name: xgboost_distributed_api_test
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: distributed_api_test
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 600
    script: python workloads/distributed_api_test.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: xgboost_tests

- name: xgboost_ft_small_elastic
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: ft_small_elastic
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 900
    script: python workloads/ft_small_elastic.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: xgboost_tests

- name: xgboost_ft_small_non_elastic
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: ft_small_non_elastic
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 900
    script: python workloads/ft_small_non_elastic.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: xgboost_tests

- name: xgboost_tune_small
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: tune_small
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 600
    script: python workloads/tune_small.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: xgboost_tests

- name: xgboost_tune_32x4
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: tune_32x4
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    script: python workloads/tune_32x4.py

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: xgboost_tests

- name: xgboost_tune_4x32
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: tune_4x32
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    script: python workloads/tune_4x32.py

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: xgboost_tests

#######################
# Tune cloud tests
#######################
- name: tune_cloud_aws_no_sync_down
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  legacy:
    test_name: aws_no_sync_down
    test_suite: tune_cloud_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_aws_4x2.yaml

  run:
    timeout: 600
    script: python workloads/run_cloud_test.py no_sync_down

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: tune_tests

- name: tune_cloud_aws_ssh_sync
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  legacy:
    test_name: aws_ssh_sync
    test_suite: tune_cloud_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_aws_4x2.yaml

  run:
    timeout: 600
    script: python workloads/run_cloud_test.py ssh_sync

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: tune_tests

- name: tune_cloud_aws_durable_upload
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  legacy:
    test_name: aws_durable_upload
    test_suite: tune_cloud_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_aws_4x2.yaml

  run:
    timeout: 600
    # Folded scalar: the lines below are joined with single spaces into one
    # command string.
    script: >-
      python workloads/run_cloud_test.py durable_upload
      --bucket s3://data-test-ilr/durable_upload

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: tune_tests

- name: tune_cloud_aws_durable_upload_rllib_str
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  legacy:
    test_name: aws_durable_upload_rllib_str
    test_suite: tune_cloud_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config_ml.yaml
    cluster_compute: tpl_aws_4x2.yaml

  run:
    timeout: 600
    script: >-
      python workloads/run_cloud_test.py durable_upload
      --trainable rllib_str
      --bucket s3://data-test-ilr/durable_upload_rllib_str

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: tune_tests

- name: tune_cloud_aws_durable_upload_rllib_trainer
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  legacy:
    test_name: aws_durable_upload_rllib_trainer
    test_suite: tune_cloud_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config_ml.yaml
    cluster_compute: tpl_aws_4x2.yaml

  run:
    timeout: 600
    script: >-
      python workloads/run_cloud_test.py durable_upload
      --trainable rllib_trainer
      --bucket s3://data-test-ilr/durable_upload_rllib_trainer

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: tune_tests

- name: tune_cloud_gcp_k8s_no_sync_down
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  legacy:
    test_name: gcp_k8s_no_sync_down
    test_suite: tune_cloud_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM  # anyscale_k8s_gcp_cloud

  run:
    timeout: 600
    script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8

    type: client

  alert: tune_tests

- name: tune_cloud_gcp_k8s_ssh_sync
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  legacy:
    test_name: gcp_k8s_ssh_sync
    test_suite: tune_cloud_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM  # anyscale_k8s_gcp_cloud

  run:
    timeout: 600
    script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8

    type: client

  alert: tune_tests

- name: tune_cloud_gcp_k8s_durable_upload
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  legacy:
    test_name: gcp_k8s_durable_upload
    test_suite: tune_cloud_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM  # anyscale_k8s_gcp_cloud

  run:
    timeout: 600
    script: >-
      python workloads/run_cloud_test.py durable_upload
      --cpus-per-trial 8
      --bucket gs://jun-riot-test/durable_upload

    type: client

  alert: tune_tests

########################
# Tune scalability tests
########################
- name: tune_scalability_bookkeeping_overhead
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: bookkeeping_overhead
    test_suite: tune_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_1x16.yaml

  run:
    timeout: 1200
    script: python workloads/test_bookkeeping_overhead.py

    type: sdk_command
    file_manager: sdk

  alert: tune_tests

- name: tune_scalability_durable_trainable
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: durable_trainable
    test_suite: tune_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_16x2.yaml

  run:
    timeout: 900
    script: python workloads/test_durable_trainable.py --bucket data-test-ilr

    wait_for_nodes:
      num_nodes: 16
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: tune_tests

- name: tune_scalability_long_running_large_checkpoints
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: long_running_large_checkpoints
    test_suite: tune_tests

  frequency: weekly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_1x32_hd.yaml

  run:
    timeout: 86400
    script: python workloads/test_long_running_large_checkpoints.py
    long_running: true

    type: sdk_command
    file_manager: sdk

  # Nightly smoke variant: only the run timeout is overridden.
  smoke_test:
    frequency: nightly

    run:
      timeout: 3600

  alert: tune_tests

- name: tune_scalability_network_overhead
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: network_overhead
    test_suite: tune_tests

  frequency: weekly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_100x2.yaml

  run:
    timeout: 900
    prepare_timeout: 1200
    script: python workloads/test_network_overhead.py

    wait_for_nodes:
      num_nodes: 100
      timeout: 1200

    type: sdk_command
    file_manager: sdk

  smoke_test:
    frequency: nightly

    cluster:
      # Must use the same key as the main `cluster` mapping (`cluster_compute`)
      # so the smoke-test override replaces tpl_100x2.yaml; the smoke run only
      # waits for 20 nodes. (Previously spelled `compute_template`, a key used
      # nowhere else in this file.)
      cluster_compute: tpl_20x2.yaml

    run:
      timeout: 400
      prepare_timeout: 600
      wait_for_nodes:
        num_nodes: 20
        timeout: 600

  alert: tune_tests

- name: tune_scalability_result_throughput_cluster
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: result_throughput_cluster
    test_suite: tune_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_16x64.yaml

  run:
    timeout: 600
    script: python workloads/test_result_throughput_cluster.py

    wait_for_nodes:
      num_nodes: 16
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: tune_tests

- name: tune_scalability_result_throughput_single_node
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: result_throughput_single_node
    test_suite: tune_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_1x96.yaml

  run:
    timeout: 600
    script: python workloads/test_result_throughput_single_node.py

    type: sdk_command
    file_manager: sdk

  alert: tune_tests

- name: tune_scalability_xgboost_sweep
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: xgboost_sweep
    test_suite: tune_tests

  frequency: weekly
  team: ml

  cluster:
    cluster_env: app_config_data.yaml
    cluster_compute: tpl_16x64.yaml

  run:
    timeout: 3600
    script: python workloads/test_xgboost_sweep.py

    wait_for_nodes:
      num_nodes: 16
      timeout: 600

    type: sdk_command
    file_manager: sdk

  alert: tune_tests

########################
# Core Nightly Tests
########################
- name: shuffle_10gb
  group: Core nightly tests
  team: core
  # `frequency` used to be declared twice in this entry (nightly, then multi).
  # Duplicate mapping keys are invalid YAML; the later value (multi) is the one
  # last-wins loaders picked, so it is the one kept here.
  frequency: multi
  working_dir: nightly_tests
  legacy:
    test_name: shuffle_10gb
    test_suite: nightly_tests

  cluster:
    cluster_env: shuffle/shuffle_app_config.yaml
    cluster_compute: shuffle/shuffle_compute_single.yaml

  run:
    timeout: 3000
    script: >-
      python shuffle/shuffle_test.py
      --num-partitions=50
      --partition-size=200e6

    type: sdk_command
    file_manager: sdk

- name: microbenchmark
  group: Core nightly tests
  team: core
  frequency: nightly
  working_dir: microbenchmark
  legacy:
    test_name: microbenchmark
    test_suite: microbenchmark

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_64.yaml

  # NOTE(review): no `type` / `file_manager` is set here, unlike the other
  # entries; presumably the runner's defaults apply - confirm.
  run:
    timeout: 1800
    script: OMP_NUM_THREADS=64 RAY_ADDRESS= python run_microbenchmark.py

#########################
# Core Scalability Tests
#########################
- name: single_node
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: single_node
    test_suite: benchmark_tests

  frequency: multi
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: single_node.yaml

  run:
    timeout: 12000
    prepare: sleep 0
    script: python single_node/test_single_node.py

    type: sdk_command
    file_manager: sdk

- name: object_store
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: object_store
    test_suite: benchmark_tests

  frequency: multi
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: object_store.yaml

  run:
    timeout: 3600
    script: python object_store/test_object_store.py

    wait_for_nodes:
      num_nodes: 50
      timeout: 600

    type: sdk_command
    file_manager: sdk

- name: many_actors
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: many_actors
    test_suite: benchmark_tests

  frequency: nightly
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: distributed.yaml

  run:
    timeout: 3600
    script: python distributed/test_many_actors.py

    wait_for_nodes:
      num_nodes: 65
      timeout: 600

    type: sdk_command
    file_manager: sdk

- name: many_actors_smoke_test
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: many_actors_smoke_test
    test_suite: benchmark_tests

  frequency: multi
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: distributed_smoke_test.yaml

  run:
    timeout: 3600
    script: SMOKE_TEST=1 python distributed/test_many_actors.py

    wait_for_nodes:
      num_nodes: 2
      timeout: 600

    type: sdk_command
    file_manager: sdk

- name: many_tasks
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: many_tasks
    test_suite: benchmark_tests

  frequency: nightly
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: distributed.yaml

  run:
    timeout: 3600
    script: python distributed/test_many_tasks.py --num-tasks=10000

    wait_for_nodes:
      num_nodes: 65
      timeout: 600

    type: sdk_command
    file_manager: sdk

- name: many_tasks_smoke_test
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: many_tasks_smoke_test
    test_suite: benchmark_tests

  frequency: multi
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: distributed_smoke_test.yaml

  run:
    timeout: 3600
    script: python distributed/test_many_tasks.py --num-tasks=100

    wait_for_nodes:
      num_nodes: 2
      timeout: 600

    type: sdk_command
    file_manager: sdk

- name: many_pgs
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: many_pgs
    test_suite: benchmark_tests

  frequency: nightly
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: distributed.yaml

  run:
    timeout: 3600
    script: python distributed/test_many_pgs.py

    wait_for_nodes:
      num_nodes: 65
      timeout: 600

    type: sdk_command
    file_manager: sdk

- name: many_pgs_smoke_test
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: many_pgs_smoke_test
    test_suite: benchmark_tests

  frequency: multi
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: distributed_smoke_test.yaml

  run:
    timeout: 3600
    script: SMOKE_TEST=1 python distributed/test_many_pgs.py

    wait_for_nodes:
      num_nodes: 2
      timeout: 600

    type: sdk_command
    file_manager: sdk

- name: many_nodes
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: many_nodes
    test_suite: benchmark_tests

  frequency: nightly
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: many_nodes.yaml

  run:
    timeout: 3600
    script: python distributed/test_many_tasks.py --num-tasks=1000

    wait_for_nodes:
      num_nodes: 250
      timeout: 600

    type: sdk_command
    file_manager: sdk

- name: scheduling_test_many_0s_tasks_single_node
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: scheduling_test_many_0s_tasks_single_node
    test_suite: benchmark_tests

  frequency: nightly
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: scheduling.yaml

  run:
    timeout: 3600
    script: >-
      python distributed/test_scheduling.py
      --total-num-task=1984000
      --num-cpu-per-task=1
      --task-duration-s=0
      --total-num-actors=1
      --num-actors-per-nodes=1

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: sdk

- name: scheduling_test_many_0s_tasks_many_nodes
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: scheduling_test_many_0s_tasks_many_nodes
    test_suite: benchmark_tests

  frequency: nightly
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: scheduling.yaml

  run:
    timeout: 3600
    script: >-
      python distributed/test_scheduling.py
      --total-num-task=1984000
      --num-cpu-per-task=1
      --task-duration-s=0
      --total-num-actors=32
      --num-actors-per-nodes=1

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: sdk

- name: scheduling_test_many_5s_tasks_single_node
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: scheduling_test_many_5s_tasks_single_node
    test_suite: benchmark_tests

  frequency: nightly
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: scheduling.yaml

  run:
    timeout: 3600
    script: >-
      python distributed/test_scheduling.py
      --total-num-task=1984000
      --num-cpu-per-task=1
      --task-duration-s=5
      --total-num-actors=1
      --num-actors-per-nodes=1

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: sdk

  stable: false

- name: scheduling_test_many_5s_tasks_many_nodes
  group: core-scalability-test
  working_dir: benchmarks
  legacy:
    test_name: scheduling_test_many_5s_tasks_many_nodes
    test_suite: benchmark_tests

  frequency: nightly
  team: core

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: scheduling.yaml

  run:
    timeout: 3600
    script: >-
      python distributed/test_scheduling.py
      --total-num-task=1984000
      --num-cpu-per-task=1
      --task-duration-s=5
      --total-num-actors=32
      --num-actors-per-nodes=1

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: sdk

  stable: false