# Global release test configuration file.
# All your release test configuration should go here. Adding release tests here
# will automatically enable them in the Buildkite release testing schedules
# (unless they have frequency: disabled).
# Here is an example configuration for reference:
#- name: example_test
#  # Tests with the same group will be grouped in the Buildkite UI
#  group: Example group
#  # Provide the working directory, which will be uploaded to the cluster
#  working_dir: example_dir
#
#  # For release test infra migration, we provide these fields that are populated
#  # in the database
#  legacy:
#    test_name: example_test
#    test_suite: examples
#
#  # How often to run the tests.
#  # One of [disabled, any, multi, nightly, weekly].
#  frequency: weekly
#  # Owning team. This field will be persisted to the database
#  team: ml
#
#  # Optional location of a bash setup script to run on the driver
#  # when setting up the local environment. Relative to working_dir
#  driver_setup: setup_driver.sh
#
#  # Cluster information
#  cluster:
#    # Location of cluster env, relative to working_dir
#    cluster_env: cluster_env.yaml
#    # Location of cluster compute, relative to working_dir
#    cluster_compute: cluster_compute.yaml
#    # Autosuspend parameter passed to the cluster.
#    # The cluster will automatically terminate if inactive for this
#    # many minutes. Defaults to 10 if not set.
#    autosuspend_mins: 10
#    # Optional cloud_id to use instead of the default cloud
#    cloud_id: cld_12345678
#    # Alternatively, you can specify a cloud name
#    cloud_name: anyscale_default_cloud
#
#  # Run configuration for the test
#  run:
#    # Type of test. Can be sdk_command or client (job to be implemented soon).
#    # Uses either Anyscale SDK commands or the Ray client to run the actual
#    # release test.
#    type: sdk_command
#
#    # File manager to use to transfer files to and from the cluster.
#    # Can be any of [sdk, client, job].
#    file_manager: job
#
#    # If you want to wait for nodes to be ready, you can specify this here:
#    wait_for_nodes:
#      # Number of nodes
#      num_nodes: 16
#      # Timeout for waiting for nodes. If nodes are not up by then, the
#      # test will fail.
#      timeout: 600
#
#    # Optional prepare script to be run on the cluster before the test script
#    prepare: python prepare.py
#    # The prepare command can have a separate timeout
#    prepare_timeout: 300
#
#    # Main script to run as the test script
#    script: python workloads/train_small.py
#    # Timeout in seconds. After this time the test is considered failed.
#    timeout: 600
#
#  # You can specify smoke test definitions here. If a smoke test is triggered,
#  # it will deep update the main test configuration with the values provided
#  # here. Smoke tests will automatically run with IS_SMOKE_TEST=1 as an
#  # environment variable and receive the --smoke-test flag as a parameter in the
#  # run script.
#  smoke_test:
#    # Smoke tests can have different frequencies. A smoke test is only triggered
#    # when the regular test is not matched.
#    frequency: nightly
#    # Here we adjust the run timeout down and run on fewer nodes. The test script
#    # remains the same.
#    run:
#      timeout: 300
#      wait_for_nodes:
#        num_nodes: 4
#        timeout: 600
#
#  # After the test finishes, this handler (in alerts/) will process the results.
#  # It can then let the test fail, e.g. if a metric regression is observed.
#  alert: default
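# The cluster_compute files referenced below (e.g. tpl_cpu_small.yaml) are
# Anyscale cluster compute templates. For orientation, a minimal sketch of
# such a template follows; the region, instance types, and worker counts are
# illustrative assumptions, not the actual contents of the templates in
# xgboost_tests:
#
# region: us-west-2          # assumed region for illustration
# max_workers: 3
# head_node_type:
#   name: head_node
#   instance_type: m5.xlarge # assumed instance type
# worker_node_types:
#   - name: worker_node
#     instance_type: m5.xlarge # assumed instance type
#     min_workers: 3
#     max_workers: 3
#     use_spot: false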
#######################
# XGBoost release tests
#######################

- name: xgboost_train_small
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: train_small
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
    autosuspend_mins: 10

  run:
    timeout: 600
    script: python workloads/train_small.py
    wait_for_nodes:
      num_nodes: 4
      timeout: 600
    type: client

  alert: xgboost_tests

- name: xgboost_train_moderate
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: train_moderate
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 600
    script: python workloads/train_moderate.py
    wait_for_nodes:
      num_nodes: 32
      timeout: 600
    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_train_gpu
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: train_gpu
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config_gpu.yaml
    cluster_compute: tpl_gpu_small.yaml

  run:
    timeout: 600
    script: python workloads/train_gpu.py
    wait_for_nodes:
      num_nodes: 5
      timeout: 600
    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_distributed_api_test
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: distributed_api_test
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 600
    script: python workloads/distributed_api_test.py
    wait_for_nodes:
      num_nodes: 4
      timeout: 600
    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_ft_small_elastic
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: ft_small_elastic
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 900
    script: python workloads/ft_small_elastic.py
    wait_for_nodes:
      num_nodes: 4
      timeout: 600
    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_ft_small_non_elastic
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: ft_small_non_elastic
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 900
    script: python workloads/ft_small_non_elastic.py
    wait_for_nodes:
      num_nodes: 4
      timeout: 600
    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_tune_small
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: tune_small
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 600
    script: python workloads/tune_small.py
    wait_for_nodes:
      num_nodes: 4
      timeout: 600
    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_tune_32x4
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: tune_32x4
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    script: python workloads/tune_32x4.py
    wait_for_nodes:
      num_nodes: 32
      timeout: 600
    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_tune_4x32
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: tune_4x32
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    script: python workloads/tune_4x32.py
    wait_for_nodes:
      num_nodes: 32
      timeout: 600
    type: sdk_command
    file_manager: job

  alert: xgboost_tests
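# The cluster_env files referenced above (app_config.yaml, app_config_gpu.yaml)
# are Anyscale cluster environments (app configs). A minimal sketch under
# assumed contents follows; the base image and package list are illustrative
# assumptions, not the actual files in xgboost_tests:
#
# base_image: anyscale/ray:nightly-py37 # assumed base image
# env_vars: {}
# python:
#   pip_packages:
#     - xgboost_ray # assumed dependency of the workloads
#   conda_packages: []
# post_build_cmds:
#   - pip3 install -U pytest # assumed test dependency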