
Adds a unit-tested and restructured ray_release package for running release tests.

Relevant changes in behavior: By default, Buildkite will wait for the wheels of the current commit to be available. Alternatively, users can a) specify a different commit hash, b) specify a wheels URL (which we will also wait for to become available), or c) specify a branch (or user/branch combination), in which case the latest available wheels will be used (e.g. if master is passed, behavior matches the old default behavior).

The main subpackages are:

- Cluster manager: Creates cluster envs/computes, starts clusters, terminates clusters
- Command runner: Runs commands, e.g. as client commands or SDK commands
- File manager: Uploads/downloads files to/from the session
- Reporter: Reports results (e.g. to a database)

Much of the code base is unit tested, but there are probably some pieces missing.

Example build (waited for wheels to be built): https://buildkite.com/ray-project/kf-dev/builds/51#_
Wheel build: https://buildkite.com/ray-project/ray-builders-branch/builds/6023
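For illustration, the wheel-selection options could be expressed as Buildkite pipeline environment overrides. This is a sketch only; the RAY_WHEELS variable name and all values below are assumptions for illustration, not taken from this change:

    env:
      # default: leave unset and wait for the wheels of the current commit
      # a) a specific commit hash (placeholder value):
      # RAY_WHEELS: 0123456789abcdef0123456789abcdef01234567
      # b) an explicit wheels URL, which will also be waited for:
      # RAY_WHEELS: https://example.com/wheels/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
      # c) a branch or user/branch combination; the latest available wheels are used:
      RAY_WHEELS: master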
# Global release test configuration file.
# All your release test configuration should go here. Adding release tests here
# will automatically enable them in the Buildkite release testing schedules
# (unless they have frequency: disabled).

# Here is an example configuration for reference:
#- name: example_test
#  # Tests with the same group will be grouped in the Buildkite UI
#  group: Example group
#  # Provide the working directory which will be uploaded to the cluster
#  working_dir: example_dir
#
#  # For the release test infra migration, we provide these fields that are
#  # populated in the database
#  legacy:
#    test_name: example_test
#    test_suite: examples
#
#  # How often to run the tests.
#  # One of [disabled, any, multi, nightly, weekly].
#  frequency: weekly
#  # Owning team. This field will be persisted to the database
#  team: ml
#
#  # Optional location of a bash setup script to run on the driver
#  # when setting up the local environment. Relative to working_dir
#  driver_setup: setup_driver.sh
#
#  # Cluster information
#  cluster:
#    # Location of the cluster env, relative to working_dir
#    cluster_env: cluster_env.yaml
#    # Location of the cluster compute, relative to working_dir
#    cluster_compute: cluster_compute.yaml
#    # Autosuspend parameter passed to the cluster.
#    # The cluster will automatically terminate if inactive for this
#    # many minutes. Defaults to 10 if not set.
#    autosuspend_mins: 10
#    # Optional cloud_id to use instead of the default cloud
#    cloud_id: cld_12345678
#    # Alternatively, you can specify a cloud name
#    cloud_name: anyscale_default_cloud
#
#  # Run configuration for the test
#  run:
#    # Type of test. Can be sdk_command or client (job to be implemented soon).
#    # Uses either Anyscale SDK commands or the Ray client to run the actual
#    # release test.
#    type: sdk_command
#
#    # File manager to use to transfer files to and from the cluster.
#    # Can be any of [sdk, client, job].
#    file_manager: job
#
#    # If you want to wait for nodes to be ready, you can specify this here:
#    wait_for_nodes:
#      # Number of nodes
#      num_nodes: 16
#      # Timeout for waiting for nodes. If nodes are not up by then, the
#      # test will fail.
#      timeout: 600
#
#    # Optional prepare script to be run on the cluster before the test script
#    prepare: python prepare.py
#    # The prepare command can have a separate timeout
#    prepare_timeout: 300
#
#    # Main script to run as the test script
#    script: python workloads/train_small.py
#    # Timeout in seconds. After this time the test is considered failed.
#    timeout: 600
#
#  # You can specify smoke test definitions here. If a smoke test is triggered,
#  # it will deep update the main test configuration with the values provided
#  # here. Smoke tests will automatically run with IS_SMOKE_TEST=1 as an
#  # environment variable and receive the --smoke-test flag as a parameter in
#  # the run script.
#  smoke_test:
#    # Smoke tests can have different frequencies. A smoke test is only
#    # triggered when the regular test is not matched.
#    frequency: nightly
#    # Here we adjust the run timeout down and run on fewer nodes. The test
#    # script remains the same.
#    run:
#      timeout: 300
#      wait_for_nodes:
#        num_nodes: 4
#        timeout: 600
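#
#    # For illustration (derived from the deep-update semantics described
#    # above, not literal config): with the overrides above, the smoke test
#    # would effectively run with
#    #   run:
#    #     type: sdk_command                          # unchanged
#    #     script: python workloads/train_small.py   # unchanged
#    #     timeout: 300                               # overridden (was 600)
#    #     wait_for_nodes:
#    #       num_nodes: 4                             # overridden (was 16)
#    #       timeout: 600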
#
#  # After the test finishes, this handler (in alerts/) will process the
#  # results. It can then let the test fail, e.g. if a metric regression is
#  # observed.
#  alert: default

#######################
# XGBoost release tests
#######################

- name: xgboost_train_small
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: train_small
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
    autosuspend_mins: 10

  run:
    timeout: 600
    script: python workloads/train_small.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: client

  alert: xgboost_tests

- name: xgboost_train_moderate
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: train_moderate
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 600
    script: python workloads/train_moderate.py

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_train_gpu
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: train_gpu
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config_gpu.yaml
    cluster_compute: tpl_gpu_small.yaml

  run:
    timeout: 600
    script: python workloads/train_gpu.py

    wait_for_nodes:
      num_nodes: 5
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_distributed_api_test
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: distributed_api_test
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 600
    script: python workloads/distributed_api_test.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_ft_small_elastic
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: ft_small_elastic
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 900
    script: python workloads/ft_small_elastic.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_ft_small_non_elastic
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: ft_small_non_elastic
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 900
    script: python workloads/ft_small_non_elastic.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_tune_small
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: tune_small
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml

  run:
    timeout: 600
    script: python workloads/tune_small.py

    wait_for_nodes:
      num_nodes: 4
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_tune_32x4
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: tune_32x4
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    script: python workloads/tune_32x4.py

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests

- name: xgboost_tune_4x32
  group: XGBoost
  working_dir: xgboost_tests

  legacy:
    test_name: tune_4x32
    test_suite: xgboost_tests

  frequency: nightly
  team: ml

  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    script: python workloads/tune_4x32.py

    wait_for_nodes:
      num_nodes: 32
      timeout: 600

    type: sdk_command
    file_manager: job

  alert: xgboost_tests