# Global release test configuration file.
# All your release test configuration should go here. Adding release tests here
# will automatically enable them in the Buildkite release testing schedules
# (unless they have frequency: disabled).
# Here is an example configuration for reference:
#- name: example_test
#  # Tests with the same group will be grouped in the Buildkite UI
#  group: Example group
#  # Provide the working directory which will be uploaded to the cluster
#  working_dir: example_dir
#
#  # For release test infra migration, we provide these fields that are populated
#  # in the database
#  legacy:
#    test_name: example_test
#    test_suite: examples
#
#  # How often to run the tests.
#  # One of [disabled, any, multi, nightly, weekly].
#  frequency: weekly
#  # Owning team. This field will be persisted to the database
#  team: ml
#
#  # Python version. This optional field determines which Python version to run
#  # tests on. This must be a string!
#  python: "3.7"
#
#  # Optional location of a bash setup script to run on the driver
#  # when setting up the local environment. Relative to working_dir
#  driver_setup: setup_driver.sh
#
#  # Cluster information
#  cluster:
#    # Location of cluster env, relative to working_dir
#    cluster_env: cluster_env.yaml
#    # Location of cluster compute, relative to working_dir
#    cluster_compute: cluster_compute.yaml
#    # Autosuspend parameter passed to the cluster.
#    # The cluster will automatically terminate if inactive for this
#    # many minutes. Defaults to 10 if not set.
#    autosuspend_mins: 10
#    # Optional cloud_id to use instead of the default cloud
#    cloud_id: cld_12345678
#    # Alternatively, you can specify a cloud name
#    cloud_name: anyscale_default_cloud
#
#  # Run configuration for the test
#  run:
#    # Type of test. Can be sdk_command or client (job to be implemented soon).
#    # Uses either Anyscale SDK commands or the Ray client to run the actual
#    # release test.
#    type: sdk_command
#
#    # File manager to use to transfer files to and from the cluster.
#    # Can be any of [sdk, client, job].
#    file_manager: sdk
#
#    # If you want to wait for nodes to be ready, you can specify this here:
#    wait_for_nodes:
#      # Number of nodes
#      num_nodes: 16
#      # Timeout for waiting for nodes. If nodes are not up by then, the
#      # test will fail.
#      timeout: 600
#
#    # Optional prepare script to be run on the cluster before the test script
#    prepare: python prepare.py
#    # The prepare command can have a separate timeout
#    prepare_timeout: 300
#
#    # Main script to run as the test script
#    script: python workloads/train_small.py
#    # Timeout in seconds. After this time the test is considered failed.
#    timeout: 600
#
#  # You can specify smoke test definitions here. If a smoke test is triggered,
#  # it will deep update the main test configuration with the values provided
#  # here. Smoke tests will automatically run with IS_SMOKE_TEST=1 as an
#  # environment variable and receive the --smoke-test flag as a parameter in the
#  # run script.
#  smoke_test:
#    # Smoke tests can have different frequencies. A smoke test is only triggered
#    # when the regular test is not matched.
#    frequency: nightly
#    # Here we adjust the run timeout down and run on fewer nodes. The test
#    # script remains the same.
#    run:
#      timeout: 300
#      wait_for_nodes:
#        num_nodes: 4
#        timeout: 600
#
#  # After the test finishes, this handler (in alerts/) will process the results.
#  # It can then let the test fail, e.g. if a metric regression is observed.
#  alert: default
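#
# For quick reference, a minimal sketch of a concrete entry is shown below,
# using only fields demonstrated in the example above. The test name,
# directories, and script are illustrative placeholders, not a real test:
#- name: minimal_example_test
#  group: Example group
#  working_dir: example_dir
#  frequency: nightly
#  team: ml
#  cluster:
#    cluster_env: cluster_env.yaml
#    cluster_compute: cluster_compute.yaml
#  run:
#    type: sdk_command
#    file_manager: sdk
#    timeout: 600
#    script: python workloads/train_small.py
#  alert: default
#
# Note that if the smoke test in the annotated example above were triggered,
# the deep update would keep the same script but override run.timeout with 300
# and run.wait_for_nodes.num_nodes with 4, as given in its smoke_test section.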
#######################
# Cluster scaling tests
#######################
- name: cluster_tune_scale_up_down
  group: Cluster tests
  working_dir: cluster_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: cpt_autoscaling_1-3.yaml
  run:
    timeout: 3600
    script: python workloads/tune_scale_up_down.py
    wait_for_nodes:
      num_nodes: 0
    type: sdk_command
  alert: default
#########################
# AIR release tests
#########################
- name: long_running_horovod_tune_test
  group: AIR tests
  working_dir: air_tests
  frequency: weekly
  team: ml
  env: staging
  cluster:
    cluster_env: horovod/app_config_master.yaml
    cluster_compute: horovod/compute_tpl.yaml
  run:
    timeout: 36000
    script: python horovod/workloads/horovod_tune_test.py
    long_running: true
    wait_for_nodes:
      num_nodes: 3
    type: sdk_command
    file_manager: job
  smoke_test:
    frequency: disabled
    run:
      timeout: 1800
  alert: default
- name: air_benchmark_data_bulk_ingest
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: data_20_nodes.yaml
  run:
    timeout: 3600
    script: python workloads/data_benchmark.py --dataset-size-gb=200 --num-workers=20
    wait_for_nodes:
      num_nodes: 20
    type: sdk_command
    file_manager: job
  alert: default
# AIR benchmarks for XGBoost CUJ
- name: air_benchmark_xgboost_cpu_10
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: xgboost_app_config.yaml
    cluster_compute: xgboost_compute_tpl.yaml
  run:
    timeout: 36000
    script: python workloads/xgboost_benchmark.py
    wait_for_nodes:
      num_nodes: 10
    type: sdk_command
    file_manager: job
  smoke_test:
    frequency: disabled
    run:
      timeout: 1800
  alert: default
# Ray AIR distributed Torch benchmarks
- name: air_benchmark_torch_mnist_cpu_4x1
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_cpu_4.yaml
  run:
    timeout: 3600
    script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
- name: air_benchmark_torch_mnist_gpu_4x4
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: weekly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_gpu_4x4.yaml
  run:
    timeout: 4800
    script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 120 --num-workers 16 --cpus-per-worker 4 --batch-size 1024 --use-gpu
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  smoke_test:
    frequency: nightly
    cluster:
      cluster_compute: compute_gpu_2x2.yaml
    run:
      timeout: 3600
      script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu
      wait_for_nodes:
        num_nodes: 2
  alert: default
- name: air_benchmark_torch_mnist_cpu_1x4
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_cpu_1.yaml
  run:
    timeout: 3600
    script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2
    type: sdk_command
    file_manager: job
  alert: default
- name: air_benchmark_torch_batch_prediction_gpu_1x1_20gb
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_gpu_1.yaml
  run:
    timeout: 3600
    script: python workloads/gpu_batch_prediction.py --data-size-gb 20
    type: sdk_command
    file_manager: job
  alert: default
# TODO(jiaodong): This leads to CUDA OOM on the release tool but succeeds on
# workspaces; re-enable it once inline conversions are fixed
# - name: air_benchmark_torch_batch_prediction_gpu_4x4_100gb
#   group: AIR tests
#   working_dir: air_tests/air_benchmarks
#   frequency: nightly
#   team: ml
#   env: staging
#   stable: false
#   cluster:
#     cluster_env: app_config.yaml
#     cluster_compute: compute_gpu_16.yaml
#   run:
#     timeout: 10800
#     script: python workloads/gpu_batch_prediction.py --data-size-gb 100
#     type: sdk_command
#     file_manager: job
#   alert: default
- name: air_benchmark_torch_mnist_cpu_4x4
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_cpu_4.yaml
  run:
    timeout: 5400
    script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
- name: air_benchmark_tune_torch_mnist
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_cpu_8.yaml
  run:
    timeout: 3600
    script: python workloads/tune_torch_benchmark.py --num-runs 3 --num-trials 8 --num-workers 4
    wait_for_nodes:
      num_nodes: 8
    type: sdk_command
    file_manager: job
  alert: default
- name: air_benchmark_tune_torch_mnist_gpu
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_gpu_4_g4_12xl.yaml
  run:
    timeout: 3600
    script: python workloads/tune_torch_benchmark.py --num-runs 2 --num-trials 4 --num-workers 4 --use-gpu
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
- name: air_benchmark_tune_torch_mnist_large_gpu
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: weekly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_gpu_8_g4_12xl.yaml
  run:
    timeout: 3600
    script: python workloads/tune_torch_benchmark.py --num-runs 2 --num-trials 4 --num-workers 8 --use-gpu
    wait_for_nodes:
      num_nodes: 8
    type: sdk_command
    file_manager: job
  alert: default
# Ray AIR distributed Tensorflow benchmarks
- name: air_benchmark_tensorflow_mnist_cpu_4x1
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_cpu_4.yaml
  run:
    timeout: 5400
    script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
- name: air_benchmark_tensorflow_mnist_cpu_1x4
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_cpu_1.yaml
  run:
    timeout: 5400
    script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2
    type: sdk_command
    file_manager: job
  alert: default
- name: air_benchmark_tensorflow_mnist_cpu_4x4
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  stable: false
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_cpu_4.yaml
  run:
    timeout: 5400
    script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
- name: air_benchmark_tensorflow_mnist_gpu_4x4
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: weekly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_gpu_4x4.yaml
  run:
    timeout: 5400
    script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 64 --use-gpu
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  smoke_test:
    frequency: nightly
    cluster:
      cluster_compute: compute_gpu_2x2.yaml
    run:
      script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu
      wait_for_nodes:
        num_nodes: 2
  alert: default
- name: air_benchmark_pytorch_training_e2e_gpu_1x1_20gb
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_gpu_1.yaml
  run:
    timeout: 3600
    script: python workloads/pytorch_training_e2e.py --data-size-gb 20
    type: sdk_command
    file_manager: job
  alert: default
- name: air_benchmark_pytorch_training_e2e_gpu_4x4_100gb
  group: AIR tests
  working_dir: air_tests/air_benchmarks
  frequency: nightly
  team: ml
  env: staging
  stable: false
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_gpu_16.yaml
  run:
    timeout: 10800
    script: python workloads/pytorch_training_e2e.py --data-size-gb=100 --num-workers=16
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
#######################
# XGBoost release tests
#######################
- name: xgboost_train_small
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: train_small
    test_suite: xgboost_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
  run:
    timeout: 600
    script: python workloads/train_small.py
    wait_for_nodes:
      num_nodes: 4
    type: client
  alert: xgboost_tests
- name: xgboost_train_moderate
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: train_moderate
    test_suite: xgboost_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml
  run:
    timeout: 600
    script: python workloads/train_moderate.py
    wait_for_nodes:
      num_nodes: 32
    type: sdk_command
    file_manager: sdk
  alert: xgboost_tests
- name: xgboost_train_gpu
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: train_gpu
    test_suite: xgboost_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config_gpu.yaml
    cluster_compute: tpl_gpu_small.yaml
  run:
    timeout: 600
    script: python workloads/train_gpu.py
    wait_for_nodes:
      num_nodes: 5
    type: sdk_command
    file_manager: sdk
  alert: xgboost_tests
- name: xgboost_distributed_api_test
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: distributed_api_test
    test_suite: xgboost_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
  run:
    timeout: 600
    script: python workloads/distributed_api_test.py
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: sdk
  alert: xgboost_tests
- name: xgboost_ft_small_elastic
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: ft_small_elastic
    test_suite: xgboost_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
  run:
    timeout: 900
    script: python workloads/ft_small_elastic.py
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: sdk
  alert: xgboost_tests
- name: xgboost_ft_small_non_elastic
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: ft_small_non_elastic
    test_suite: xgboost_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
  run:
    timeout: 900
    script: python workloads/ft_small_non_elastic.py
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: sdk
  alert: xgboost_tests
- name: xgboost_tune_small
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: tune_small
    test_suite: xgboost_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
  run:
    timeout: 600
    script: python workloads/tune_small.py
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: sdk
  alert: xgboost_tests
- name: xgboost_tune_32x4
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: tune_32x4
    test_suite: xgboost_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml
  run:
    timeout: 900
    script: python workloads/tune_32x4.py
    wait_for_nodes:
      num_nodes: 32
    type: sdk_command
    file_manager: sdk
  alert: xgboost_tests
- name: xgboost_tune_4x32
  group: XGBoost
  working_dir: xgboost_tests
  legacy:
    test_name: tune_4x32
    test_suite: xgboost_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml
  run:
    timeout: 900
    script: python workloads/tune_4x32.py
    wait_for_nodes:
      num_nodes: 32
    type: sdk_command
    file_manager: sdk
  alert: xgboost_tests
#######################
# LightGBM tests
#######################
- name: lightgbm_train_small
  group: LightGBM tests
  working_dir: lightgbm_tests
  legacy:
    test_name: train_small
    test_suite: lightgbm_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
  run:
    timeout: 600
    script: python workloads/train_small.py
    wait_for_nodes:
      num_nodes: 4
    type: client
  alert: default
- name: lightgbm_train_moderate
  group: LightGBM tests
  working_dir: lightgbm_tests
  legacy:
    test_name: train_moderate
    test_suite: lightgbm_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml
  run:
    timeout: 600
    script: python workloads/train_moderate.py
    wait_for_nodes:
      num_nodes: 32
    type: sdk_command
    file_manager: job
  alert: default
- name: lightgbm_distributed_api_test
  group: LightGBM tests
  working_dir: lightgbm_tests
  legacy:
    test_name: distributed_api_test
    test_suite: lightgbm_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
  run:
    timeout: 600
    script: python workloads/distributed_api_test.py
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
- name: lightgbm_ft_small_non_elastic
  group: LightGBM tests
  working_dir: lightgbm_tests
  legacy:
    test_name: ft_small_non_elastic
    test_suite: lightgbm_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
  run:
    timeout: 900
    script: python workloads/ft_small_non_elastic.py
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
- name: lightgbm_tune_small
  group: LightGBM tests
  working_dir: lightgbm_tests
  legacy:
    test_name: tune_small
    test_suite: lightgbm_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_small.yaml
  run:
    timeout: 600
    script: python workloads/tune_small.py
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
- name: lightgbm_tune_16x4
  group: LightGBM tests
  working_dir: lightgbm_tests
  legacy:
    test_name: tune_16x4
    test_suite: lightgbm_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml
  run:
    timeout: 900
    script: python workloads/tune_16x4.py
    wait_for_nodes:
      num_nodes: 32
    type: sdk_command
    file_manager: job
  alert: default
- name: lightgbm_tune_4x16
  group: LightGBM tests
  working_dir: lightgbm_tests
  legacy:
    test_name: tune_4x16
    test_suite: lightgbm_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_moderate.yaml
  run:
    timeout: 900
    script: python workloads/tune_4x16.py
    wait_for_nodes:
      num_nodes: 32
    type: sdk_command
    file_manager: job
  alert: default
#######################
# ML user tests
#######################
- name: ml_user_horovod_user_test_latest
  group: ML user tests
  working_dir: ml_user_tests
  legacy:
    test_name: horovod_user_test_latest
    test_suite: ml_user_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: horovod/app_config.yaml
    cluster_compute: horovod/compute_tpl.yaml
  driver_setup: horovod/driver_setup_latest.sh
  run:
    timeout: 1200
    script: python horovod/horovod_user_test.py
    type: client
  alert: default
- name: ml_user_horovod_user_test_master
  group: ML user tests
  working_dir: ml_user_tests
  legacy:
    test_name: horovod_user_test_master
    test_suite: ml_user_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: horovod/app_config_master.yaml
    cluster_compute: horovod/compute_tpl.yaml
  driver_setup: horovod/driver_setup_master.sh
  run:
    timeout: 1200
    script: python horovod/horovod_user_test.py
    type: client
  alert: default
- name: ml_user_train_tensorflow_mnist_test
  group: ML user tests
  working_dir: ml_user_tests
  legacy:
    test_name: train_tensorflow_mnist_test
    test_suite: ml_user_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: train/app_config.yaml
    cluster_compute: train/compute_tpl.yaml
  driver_setup: train/driver_setup.sh
  run:
    timeout: 36000
    script: python train/train_tensorflow_mnist_test.py
    type: client
  alert: default
- name: ml_user_train_torch_linear_test
  group: ML user tests
  working_dir: ml_user_tests
  legacy:
    test_name: train_torch_linear_test
    test_suite: ml_user_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: train/app_config.yaml
    cluster_compute: train/compute_tpl.yaml
  driver_setup: train/driver_setup.sh
  run:
    timeout: 36000
    script: python train/train_torch_linear_test.py
    type: client
  alert: default
- name: ml_user_xgboost_gpu_connect_latest
  group: ML user tests
  working_dir: ml_user_tests
  legacy:
    test_name: xgboost_gpu_connect_latest
    test_suite: ml_user_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: xgboost/app_config_gpu.yaml
    cluster_compute: xgboost/tpl_gpu_small_scaling.yaml
  run:
    timeout: 1200
    script: python xgboost/train_gpu_connect.py
    type: client
  alert: default
- name: ml_user_xgboost_gpu_connect_master
  group: ML user tests
  working_dir: ml_user_tests
  legacy:
    test_name: xgboost_gpu_connect_master
    test_suite: ml_user_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: xgboost/app_config_gpu_master.yaml
    cluster_compute: xgboost/tpl_gpu_small_scaling.yaml
  run:
    timeout: 1200
    script: python xgboost/train_gpu_connect.py
    type: client
  alert: default
- name: ml_user_ray_lightning_user_test_latest
  group: ML user tests
  working_dir: ml_user_tests
  legacy:
    test_name: ray_lightning_user_test_latest
    test_suite: ml_user_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: ray-lightning/app_config.yaml
    cluster_compute: ray-lightning/compute_tpl.yaml
  driver_setup: ray-lightning/driver_setup.sh
  run:
    timeout: 1200
    script: python ray-lightning/ray_lightning_user_test.py
    type: client
  alert: default
- name: ml_user_ray_lightning_user_test_master
  group: ML user tests
  working_dir: ml_user_tests
  legacy:
    test_name: ray_lightning_user_test_master
    test_suite: ml_user_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: ray-lightning/app_config_master.yaml
    cluster_compute: ray-lightning/compute_tpl.yaml
  driver_setup: ray-lightning/driver_setup.sh
  run:
    timeout: 1200
    script: python ray-lightning/ray_lightning_user_test.py
    type: client
  alert: default
- name: ml_user_tune_rllib_connect_test
  group: ML user tests
  working_dir: ml_user_tests
  legacy:
    test_name: tune_rllib_connect_test
    test_suite: ml_user_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: ../rllib_tests/app_config.yaml
    cluster_compute: tune_rllib/compute_tpl.yaml
  driver_setup: tune_rllib/driver_setup.sh
  run:
    timeout: 2000
    script: python tune_rllib/run_connect_tests.py
    type: client
  alert: default
#######################
# Tune cloud tests
#######################
- name: tune_cloud_aws_no_sync_down
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  stable: true
  legacy:
    test_name: aws_no_sync_down
    test_suite: tune_cloud_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_aws_4x2.yaml
  run:
    timeout: 600
    script: python workloads/run_cloud_test.py no_sync_down
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: sdk
  alert: tune_tests
- name: tune_cloud_aws_ssh_sync
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  stable: true
  legacy:
    test_name: aws_ssh_sync
    test_suite: tune_cloud_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_aws_4x2.yaml
  run:
    timeout: 600
    script: python workloads/run_cloud_test.py ssh_sync
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: sdk
  alert: tune_tests
- name: tune_cloud_aws_durable_upload
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  stable: true
  legacy:
    test_name: aws_durable_upload
    test_suite: tune_cloud_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_aws_4x2.yaml
  run:
    timeout: 600
    script: python workloads/run_cloud_test.py durable_upload --bucket s3://tune-cloud-tests/durable_upload
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: sdk
  alert: tune_tests
- name: tune_cloud_aws_durable_upload_rllib_str
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  stable: false
  legacy:
    test_name: aws_durable_upload_rllib_str
    test_suite: tune_cloud_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config_ml.yaml
    cluster_compute: tpl_aws_4x2.yaml
  run:
    timeout: 600
    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str
      --bucket s3://tune-cloud-tests/durable_upload_rllib_str
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: sdk
  alert: tune_tests
- name: tune_cloud_aws_durable_upload_rllib_trainer
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  stable: false
  legacy:
    test_name: aws_durable_upload_rllib_trainer
    test_suite: tune_cloud_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config_ml.yaml
    cluster_compute: tpl_aws_4x2.yaml
  run:
    timeout: 600
    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer
      --bucket s3://tune-cloud-tests/durable_upload_rllib_trainer
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: sdk
  alert: tune_tests
- name: tune_cloud_gcp_k8s_no_sync_down
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  stable: true
  legacy:
    test_name: gcp_k8s_no_sync_down
    test_suite: tune_cloud_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM  # anyscale_k8s_gcp_cloud
    autosuspend_mins: 60
  run:
    timeout: 600
    script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8
    type: client
    wait_for_nodes:
      num_nodes: 4
  alert: tune_tests
- name: tune_cloud_gcp_k8s_ssh_sync
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  stable: true
  legacy:
    test_name: gcp_k8s_ssh_sync
    test_suite: tune_cloud_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM  # anyscale_k8s_gcp_cloud
    autosuspend_mins: 60
  run:
    timeout: 600
    script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8
    type: client
    wait_for_nodes:
      num_nodes: 4
  alert: tune_tests
- name: tune_cloud_gcp_k8s_durable_upload
  group: Tune cloud tests
  working_dir: tune_tests/cloud_tests
  stable: true
  legacy:
    test_name: gcp_k8s_durable_upload
    test_suite: tune_cloud_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM  # anyscale_k8s_gcp_cloud
    autosuspend_mins: 60
  run:
    timeout: 600
    script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://tune-cloud-tests/durable_upload
    type: client
    wait_for_nodes:
      num_nodes: 4
  alert: tune_tests
########################
# Tune scalability tests
########################
- name: tune_scalability_bookkeeping_overhead
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: bookkeeping_overhead
    test_suite: tune_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_1x16.yaml
  run:
    timeout: 1200
    script: python workloads/test_bookkeeping_overhead.py
    type: sdk_command
    file_manager: sdk
  alert: tune_tests
- name: tune_scalability_durable_trainable
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: durable_trainable
    test_suite: tune_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_16x2.yaml
  run:
    timeout: 900
    script: python workloads/test_durable_trainable.py --bucket tune-cloud-tests
    wait_for_nodes:
      num_nodes: 16
    type: sdk_command
    file_manager: sdk
  alert: tune_tests
- name: tune_scalability_long_running_large_checkpoints
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: long_running_large_checkpoints
    test_suite: tune_tests
  frequency: weekly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_1x32_hd.yaml
  run:
    timeout: 86400
    script: python workloads/test_long_running_large_checkpoints.py
    long_running: true
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: nightly
    run:
      timeout: 3600
  alert: tune_tests
- name: tune_scalability_network_overhead
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: network_overhead
    test_suite: tune_tests
  frequency: weekly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_100x2.yaml
  run:
    timeout: 900
    prepare_timeout: 1200
    script: python workloads/test_network_overhead.py
    wait_for_nodes:
      num_nodes: 100
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: nightly
    cluster:
      cluster_compute: tpl_20x2.yaml
    run:
      timeout: 400
      prepare_timeout: 600
      wait_for_nodes:
        num_nodes: 20
  alert: tune_tests
- name: tune_scalability_result_throughput_cluster
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: result_throughput_cluster
    test_suite: tune_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_16x64.yaml
  run:
    timeout: 600
    script: python workloads/test_result_throughput_cluster.py
    wait_for_nodes:
      num_nodes: 16
    type: sdk_command
    file_manager: sdk
  alert: tune_tests
- name: tune_scalability_result_throughput_single_node
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: result_throughput_single_node
    test_suite: tune_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_1x96.yaml
  run:
    timeout: 600
    script: python workloads/test_result_throughput_single_node.py
    type: sdk_command
    file_manager: sdk
  alert: tune_tests
- name: tune_scalability_xgboost_sweep
  group: Tune scalability tests
  working_dir: tune_tests/scalability_tests
  legacy:
    test_name: xgboost_sweep
    test_suite: tune_tests
  frequency: weekly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config_data.yaml
    cluster_compute: tpl_16x64.yaml
  run:
    timeout: 3600
    script: python workloads/test_xgboost_sweep.py
    wait_for_nodes:
      num_nodes: 16
    type: sdk_command
    file_manager: sdk
  alert: tune_tests
########################
# Golden Notebook tests
########################
- name: golden_notebook_torch_tune_serve_test
  group: Golden Notebook tests
  working_dir: golden_notebook_tests
  legacy:
    test_name: torch_tune_serve_test
    test_suite: golden_notebook_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: torch_tune_serve_app_config.yaml
    cluster_compute: gpu_tpl.yaml
  run:
    timeout: 1800
    script: python workloads/torch_tune_serve_test.py
    type: client
  alert: default
#######################
# Long running tests
#######################
- name: long_running_actor_deaths
  group: Long running tests
  working_dir: long_running_tests
  legacy:
    test_name: actor_deaths
    test_suite: long_running_tests
  frequency: nightly
  team: core
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_1.yaml
  run:
    timeout: 86400
    prepare: ray stop
    script: python workloads/actor_deaths.py
    long_running: true
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
- name: long_running_apex
  group: Long running tests
  working_dir: long_running_tests
  legacy:
    test_name: apex
    test_suite: long_running_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: ../rllib_tests/app_config.yaml
    cluster_compute: tpl_cpu_3.yaml
  run:
    timeout: 86400
    script: python workloads/apex.py
    long_running: true
    wait_for_nodes:
      num_nodes: 3
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
- name: long_running_impala
  group: Long running tests
  working_dir: long_running_tests
  legacy:
    test_name: impala
    test_suite: long_running_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config_np.yaml
    cluster_compute: tpl_cpu_1_large.yaml
  run:
    timeout: 86400
    script: python workloads/impala.py
    long_running: true
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
- name: long_running_many_actor_tasks
  group: Long running tests
  working_dir: long_running_tests
  legacy:
    test_name: many_actor_tasks
    test_suite: long_running_tests
  frequency: nightly
  team: core
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_1.yaml
  run:
    timeout: 86400
    prepare: ray stop
    script: python workloads/many_actor_tasks.py
    long_running: true
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
- name: long_running_many_drivers
  group: Long running tests
  working_dir: long_running_tests
  legacy:
    test_name: many_drivers
    test_suite: long_running_tests
  frequency: nightly
  team: core
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_1.yaml
  run:
    timeout: 86400
    prepare: ray stop
    script: python workloads/many_drivers.py --iteration-num=4000
    long_running: true
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
- name: long_running_many_ppo
  group: Long running tests
  working_dir: long_running_tests
  legacy:
    test_name: many_ppo
    test_suite: long_running_tests
  stable: false
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: ../rllib_tests/app_config.yaml
    cluster_compute: many_ppo.yaml
  run:
    timeout: 86400
    script: python workloads/many_ppo.py
    long_running: true
    wait_for_nodes:
      num_nodes: 1
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
- name: long_running_many_tasks
  group: Long running tests
  working_dir: long_running_tests
  legacy:
    test_name: many_tasks
    test_suite: long_running_tests
  frequency: nightly
  team: core
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_1.yaml
  run:
    timeout: 86400
    prepare: ray stop
    script: python workloads/many_tasks.py
    long_running: true
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
- name: long_running_many_tasks_serialized_ids
  group: Long running tests
  working_dir: long_running_tests
  legacy:
    test_name: many_tasks_serialized_ids
    test_suite: long_running_tests
  frequency: nightly
  team: core
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_1.yaml
  run:
    timeout: 86400
    prepare: ray stop
    script: python workloads/many_tasks_serialized_ids.py
    long_running: true
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
- name: long_running_node_failures
  group: Long running tests
  working_dir: long_running_tests
  legacy:
    test_name: node_failures
    test_suite: long_running_tests
  frequency: nightly
  team: core
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_1.yaml
  run:
    timeout: 86400
    prepare: ray stop
    script: python workloads/node_failures.py
    long_running: true
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
- name: long_running_pbt
  group: Long running tests
  working_dir: long_running_tests
  legacy:
    test_name: pbt
    test_suite: long_running_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: ../rllib_tests/app_config.yaml
    cluster_compute: tpl_cpu_1.yaml
  run:
    timeout: 86400
    prepare: ray stop
    script: python workloads/pbt.py
    long_running: true
    type: sdk_command
    file_manager: sdk
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
- name: long_running_serve
  group: Long running tests
  working_dir: long_running_tests
  legacy:
    test_name: serve
    test_suite: long_running_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_1.yaml
  run:
    timeout: 86400
    prepare: ray stop
    script: python workloads/serve.py
    long_running: true
    type: sdk_command
    file_manager: job
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
- name: long_running_serve_failure
  group: Long running tests
  working_dir: long_running_tests
  stable: false
  legacy:
    test_name: serve_failure
    test_suite: long_running_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: tpl_cpu_1.yaml
  run:
    timeout: 86400
    prepare: ray stop
    script: python workloads/serve_failure.py
    long_running: true
    type: sdk_command
    file_manager: job
  smoke_test:
    frequency: disabled
    run:
      timeout: 600
  alert: long_running_tests
- name: long_running_distributed_pytorch_pbt_failure
  group: Long running tests
  working_dir: long_running_distributed_tests
  legacy:
    test_name: pytorch_pbt_failure
    test_suite: long_running_distributed
  frequency: weekly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl.yaml
  run:
    timeout: 86400
    script: python workloads/pytorch_pbt_failure.py
    long_running: true
    type: sdk_command
    file_manager: job
  smoke_test:
    frequency: disabled
    run:
      timeout: 3600
  alert: long_running_tests
########################
# Jobs tests
########################
- name: jobs_basic_local_working_dir
  group: Jobs tests
  working_dir: jobs_tests
  legacy:
    test_name: jobs_basic_local_working_dir
    test_suite: jobs_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_4_xlarge.yaml
  run:
    timeout: 600
    script: python workloads/jobs_basic.py --working-dir "workloads"
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
- name: jobs_basic_remote_working_dir
  group: Jobs tests
  working_dir: jobs_tests
  legacy:
    test_name: jobs_basic_remote_working_dir
    test_suite: jobs_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_4_xlarge.yaml
  run:
    timeout: 600
    script: python workloads/jobs_basic.py --working-dir "https://github.com/anyscale/job-services-cuj-examples/archive/refs/heads/main.zip"
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
########################
# Runtime env tests
########################
- name: runtime_env_rte_many_tasks_actors
  group: Runtime env tests
  working_dir: runtime_env_tests
  legacy:
    test_name: rte_many_tasks_actors
    test_suite: runtime_env_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: rte_small.yaml
  run:
    timeout: 600
    script: python workloads/rte_many_tasks_actors.py
    wait_for_nodes:
      num_nodes: 4
    type: sdk_command
    file_manager: job
  alert: default
- name: runtime_env_wheel_urls
  group: Runtime env tests
  working_dir: runtime_env_tests
  legacy:
    test_name: wheel_urls
    test_suite: runtime_env_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: rte_minimal.yaml
  run:
    timeout: 9000
    script: python workloads/wheel_urls.py
    wait_for_nodes:
      num_nodes: 1
    type: sdk_command
    file_manager: job
  alert: default
- name: runtime_env_rte_ray_client
  group: Runtime env tests
  working_dir: runtime_env_tests
  legacy:
    test_name: rte_ray_client
    test_suite: runtime_env_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: rte_minimal.yaml
  run:
    timeout: 600
    script: python workloads/rte_ray_client.py
    wait_for_nodes:
      num_nodes: 1
    type: client
  alert: default
########################
# Serve tests
########################
- name: serve_single_deployment_1k_noop_replica
  group: Serve tests
  working_dir: serve_tests
  legacy:
    test_name: single_deployment_1k_noop_replica
    test_suite: serve_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_32_cpu.yaml
  run:
    timeout: 7200
    long_running: false
    script: python workloads/single_deployment_1k_noop_replica.py
    type: sdk_command
    file_manager: job
  alert: default
- name: serve_multi_deployment_1k_noop_replica
  group: Serve tests
  working_dir: serve_tests
  legacy:
    test_name: multi_deployment_1k_noop_replica
    test_suite: serve_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_32_cpu.yaml
  run:
    timeout: 7200
    long_running: false
    script: python workloads/multi_deployment_1k_noop_replica.py
    type: sdk_command
    file_manager: job
  alert: default
- name: serve_autoscaling_single_deployment
  group: Serve tests
  working_dir: serve_tests
  legacy:
    test_name: autoscaling_single_deployment
    test_suite: serve_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_8_cpu_autoscaling.yaml
  run:
    timeout: 7200
    long_running: false
    script: python workloads/autoscaling_single_deployment.py
    type: sdk_command
    file_manager: job
  alert: default
- name: serve_autoscaling_multi_deployment
  group: Serve tests
  working_dir: serve_tests
  legacy:
    test_name: autoscaling_multi_deployment
    test_suite: serve_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_32_cpu_autoscaling.yaml
  run:
    timeout: 7200
    long_running: false
    script: python workloads/autoscaling_multi_deployment.py
    type: sdk_command
    file_manager: job
  alert: default
- name: serve_serve_micro_benchmark
  group: Serve tests
  working_dir: serve_tests
  legacy:
    test_name: serve_micro_benchmark
    test_suite: serve_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_single_node.yaml
  run:
    timeout: 7200
    long_running: false
    script: python workloads/serve_micro_benchmark.py
    type: sdk_command
    file_manager: job
  alert: default
- name: serve_serve_micro_benchmark_k8s
  group: Serve tests
  working_dir: serve_tests
  legacy:
    test_name: serve_micro_benchmark_k8s
    test_suite: serve_tests
  # TODO(architkulkarni) Reenable after K8s migration. Currently failing
  frequency: disabled
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_single_node_k8s.yaml
  run:
    timeout: 7200
    long_running: false
    script: python workloads/serve_micro_benchmark.py
    type: sdk_command
    file_manager: job
  alert: default
- name: deployment_graph_long_chain
  group: Serve tests
  working_dir: serve_tests
  legacy:
    test_name: deployment_graph_long_chain
    test_suite: serve_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_single_node_32_cpu.yaml
  run:
    timeout: 3600
    long_running: false
    script: python workloads/deployment_graph_long_chain.py --chain-length=10 --num-clients=4 --local-test=False
    type: sdk_command
    file_manager: job
  alert: default
  stable: false
- name: deployment_graph_wide_ensemble
  group: Serve tests
  working_dir: serve_tests
  legacy:
    test_name: deployment_graph_wide_ensemble
    test_suite: serve_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_single_node_32_cpu.yaml
  run:
    timeout: 3600
    long_running: false
    script: python workloads/deployment_graph_wide_ensemble.py --fanout-degree=10 --num-clients=4 --local-test=False
    type: sdk_command
    file_manager: job
  alert: default
  stable: false
- name: serve_handle_long_chain
  group: Serve tests
  working_dir: serve_tests
  legacy:
    test_name: serve_handle_long_chain
    test_suite: serve_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_single_node_32_cpu.yaml
  run:
    timeout: 3600
    long_running: false
    script: python workloads/serve_handle_long_chain.py --chain-length=10 --num-clients=4 --local-test=False
    type: sdk_command
    file_manager: job
  alert: default
  stable: false
- name: serve_handle_wide_ensemble
  group: Serve tests
  working_dir: serve_tests
  legacy:
    test_name: serve_handle_wide_ensemble
    test_suite: serve_tests
  frequency: nightly
  team: serve
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl_single_node_32_cpu.yaml
  run:
    timeout: 3600
    long_running: false
    script: python workloads/serve_handle_wide_ensemble.py --fanout-degree=10 --num-clients=4 --local-test=False
    type: sdk_command
    file_manager: job
  alert: default
  stable: false
########################
# Train tests
########################
- name: train_horovod_multi_node_test
  group: Train tests
  working_dir: train_tests/horovod
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: compute_tpl.yaml
  run:
    timeout: 3000
    script: python train_horovod_multi_node_test.py
    wait_for_nodes:
      num_nodes: 2
    type: sdk_command
    file_manager: job
  alert: default
########################
# RLlib tests
########################
- name: rllib_learning_tests_a2c
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 2gpus_32cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=a2c
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_a3c
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 2gpus_32cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=a3c
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_apex
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 1gpu_24cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=apex
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_appo
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 2gpus_32cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=appo
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_ppo
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 2gpus_32cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=ppo
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_bc
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 1gpu_16cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=bc
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_cql
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 1gpu_16cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=cql
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_ddpg
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 2gpus_32cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=ddpg
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_dqn
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 2gpus_32cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=dqn
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_es
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 4gpus_64cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=es
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_impala
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 2gpus_32cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=impala
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_marwil
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 1gpu_16cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=marwil
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_sac
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 2gpus_32cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=sac
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_slateq
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 2gpus_32cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=slateq
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_learning_tests_td3
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 2gpus_32cpus.yaml
  run:
    timeout: 18000
    script: python learning_tests/run.py --yaml-sub-dir=td3
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_multi_gpu_learning_tests
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: multi_gpu_learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 8gpus_96cpus.yaml
  run:
    timeout: 7200
    script: python multi_gpu_learning_tests/run.py
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_multi_gpu_with_lstm_learning_tests
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: multi_gpu_with_lstm_learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 8gpus_96cpus.yaml
  run:
    timeout: 7200
    script: python multi_gpu_with_lstm_learning_tests/run.py
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_multi_gpu_with_attention_learning_tests
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: multi_gpu_with_attention_learning_tests
    test_suite: rllib_tests
  frequency: nightly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 8gpus_96cpus.yaml
  run:
    timeout: 7200
    script: python multi_gpu_with_attention_learning_tests/run.py
    type: sdk_command
    file_manager: job
  alert: default
- name: rllib_stress_tests
  group: RLlib tests
  working_dir: rllib_tests
  legacy:
    test_name: stress_tests
    test_suite: rllib_tests
  frequency: weekly
  team: ml
  env: staging
  cluster:
    cluster_env: app_config.yaml
    cluster_compute: 4gpus_544_cpus.yaml
  run:
    timeout: 5400
    script: python stress_tests/run_stress_tests.py
    wait_for_nodes:
      num_nodes: 6
    type: sdk_command
    file_manager: job
  smoke_test:
    frequency: nightly
    run:
      timeout: 2000
  alert: default
########################
# Core Nightly Tests
########################
- name: shuffle_10gb
group: core-multi-test
team: core
frequency: multi
working_dir: nightly_tests
legacy:
test_name: shuffle_10gb
test_suite: nightly_tests
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6
type: sdk_command
file_manager: sdk
- name: shuffle_50gb
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: shuffle_50gb
test_suite: nightly_tests
frequency: multi
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9
type: sdk_command
file_manager: sdk
- name: shuffle_50gb_large_partition
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: shuffle_50gb_large_partition
test_suite: nightly_tests
frequency: multi
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6
type: sdk_command
file_manager: sdk
- name: shuffle_100gb
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: shuffle_100gb
test_suite: nightly_tests
frequency: multi
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_multi.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
wait_for_nodes:
num_nodes: 4
type: sdk_command
file_manager: sdk
- name: non_streaming_shuffle_100gb
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: non_streaming_shuffle_100gb
test_suite: nightly_tests
frequency: multi
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_multi.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
--no-streaming
wait_for_nodes:
num_nodes: 4
type: sdk_command
file_manager: sdk
- name: non_streaming_shuffle_50gb_large_partition
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: non_streaming_shuffle_50gb_large_partition
test_suite: nightly_tests
frequency: multi
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6
--no-streaming
type: sdk_command
file_manager: sdk
- name: non_streaming_shuffle_50gb
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: non_streaming_shuffle_50gb
test_suite: nightly_tests
frequency: multi
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9
--no-streaming
type: sdk_command
file_manager: sdk
- name: stress_test_placement_group
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: stress_test_placement_group
test_suite: nightly_tests
frequency: multi
team: core
cluster:
cluster_env: stress_tests/stress_tests_app_config.yaml
cluster_compute: stress_tests/placement_group_tests_compute.yaml
run:
timeout: 7200
script: python stress_tests/test_placement_group.py
type: sdk_command
file_manager: sdk
- name: shuffle_1tb_1000_partition
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: shuffle_1tb_1000_partition
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_large_scale.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
- name: non_streaming_shuffle_1tb_1000_partition
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: non_streaming_shuffle_1tb_1000_partition
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_large_scale.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
--no-streaming
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
- name: shuffle_1tb_5000_partitions
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: shuffle_1tb_5000_partitions
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_large_scale.yaml
run:
timeout: 9000
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
- name: decision_tree_autoscaling
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: decision_tree_autoscaling
test_suite: nightly_tests
frequency: multi
team: core
cluster:
cluster_env: decision_tree/decision_tree_app_config.yaml
cluster_compute: decision_tree/autoscaling_compute.yaml
run:
timeout: 3000
script: python decision_tree/cart_with_tree.py
type: sdk_command
file_manager: sdk
- name: decision_tree_autoscaling_20_runs
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: decision_tree_autoscaling_20_runs
test_suite: nightly_tests
frequency: multi
team: core
cluster:
cluster_env: decision_tree/decision_tree_app_config.yaml
cluster_compute: decision_tree/autoscaling_compute.yaml
run:
timeout: 9600
script: python decision_tree/cart_with_tree.py --concurrency=20
type: sdk_command
file_manager: sdk
- name: autoscaling_shuffle_1tb_1000_partitions
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: autoscaling_shuffle_1tb_1000_partitions
test_suite: nightly_tests
frequency: multi
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_autoscaling.yaml
run:
timeout: 4000
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
--no-streaming
type: sdk_command
file_manager: sdk
- name: pg_long_running_performance_test
group: core-multi-test
working_dir: nightly_tests
legacy:
test_name: pg_long_running_performance_test
test_suite: nightly_tests
frequency: multi
team: core
cluster:
cluster_env: placement_group_tests/app_config.yaml
cluster_compute: placement_group_tests/long_running_test_compute.yaml
run:
timeout: 3600
script: python placement_group_tests/long_running_performance_test.py --num-stages
2000
wait_for_nodes:
num_nodes: 2
type: sdk_command
file_manager: sdk
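# The microbenchmarks run against a local Ray instance (RAY_ADDRESS=local)
# instead of connecting to a remote cluster; OMP_NUM_THREADS=64 presumably
# matches the 64 vCPUs implied by the tpl_64.yaml compute template.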
- name: microbenchmark
group: core-daily-test
team: core
frequency: nightly
working_dir: microbenchmark
env: prod
python: "3.7"
legacy:
test_name: microbenchmark
test_suite: microbenchmark
cluster:
cluster_env: app_config.yaml
cluster_compute: tpl_64.yaml
run:
timeout: 1800
script: OMP_NUM_THREADS=64 RAY_ADDRESS=local python run_microbenchmark.py
- name: microbenchmark_staging
group: core-daily-test
team: core
frequency: nightly
working_dir: microbenchmark
env: staging
python: "3.7"
cluster:
cluster_env: app_config.yaml
cluster_compute: tpl_64.yaml
run:
timeout: 1800
script: OMP_NUM_THREADS=64 RAY_ADDRESS=local python run_microbenchmark.py
- name: microbenchmark_38
group: core-daily-test
team: core
frequency: nightly
working_dir: microbenchmark
python: "3.8"
cluster:
cluster_env: app_config.yaml
cluster_compute: tpl_64.yaml
run:
timeout: 1800
script: OMP_NUM_THREADS=64 RAY_ADDRESS=local python run_microbenchmark.py
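# For the Dask-on-Ray sort tests, per-partition size is --nbytes divided by
# --npartitions: 10_000_000_000 / 50 = 200 MB here and
# 100_000_000_000 / 200 = 500 MB for the 100 GB run, both sorted on a single
# node with data written under /tmp/ray.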
- name: dask_on_ray_10gb_sort
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: dask_on_ray_10gb_sort
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template.yaml
run:
timeout: 7200
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions
50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
type: sdk_command
file_manager: sdk
- name: dask_on_ray_100gb_sort
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: dask_on_ray_100gb_sort
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template.yaml
run:
timeout: 7200
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions
200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
type: sdk_command
file_manager: sdk
- name: dask_on_ray_large_scale_test_no_spilling
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: dask_on_ray_large_scale_test_no_spilling
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
run:
timeout: 7200
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb
20 --error_rate 0 --data_save_path /tmp/ray
wait_for_nodes:
num_nodes: 21
type: sdk_command
file_manager: sdk
smoke_test:
frequency: multi
cluster:
      cluster_env: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
run:
timeout: 7200
script: python dask_on_ray/large_scale_test.py --num_workers 4 --worker_obj_store_size_in_gb
20 --error_rate 0 --data_save_path /tmp/ray
wait_for_nodes:
num_nodes: 5
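# The spilling variant below scales the same workload from 20 workers with
# 20 GB object stores each up to 150 workers with 70 GB each, which is
# expected to push objects out of memory and onto disk.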
- name: dask_on_ray_large_scale_test_spilling
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: dask_on_ray_large_scale_test_spilling
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
run:
timeout: 7200
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb
70 --error_rate 0 --data_save_path /tmp/ray
wait_for_nodes:
num_nodes: 21
type: sdk_command
file_manager: sdk
smoke_test:
frequency: multi
cluster:
      cluster_env: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
run:
timeout: 7200
script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb
70 --error_rate 0 --data_save_path /tmp/ray
wait_for_nodes:
num_nodes: 5
- name: stress_test_state_api_scale
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: stress_test_state_api_scale
test_suite: nightly_tests
stable: false
frequency: nightly
team: core
cluster:
cluster_env: stress_tests/stress_tests_app_config.yaml
cluster_compute: stress_tests/stress_tests_compute.yaml
run:
timeout: 3600
script: python stress_tests/test_state_api_scale.py
type: sdk_command
file_manager: sdk
smoke_test:
frequency: multi
cluster:
      cluster_env: stress_tests/stress_tests_app_config.yaml
cluster_compute: stress_tests/smoke_test_compute.yaml
run:
timeout: 3600
script: python stress_tests/test_state_api_scale.py --smoke-test
- name: shuffle_20gb_with_state_api
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: shuffle_20gb_with_state_api
test_suite: nightly_tests
stable: false
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_single.yaml
run:
timeout: 1000
script: python stress_tests/test_state_api_with_other_tests.py
nightly_tests/shuffle/shuffle_test.py --test-args="--num-partitions=100 --partition-size=200e6"
type: sdk_command
file_manager: sdk
- name: stress_test_many_tasks
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: stress_test_many_tasks
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: stress_tests/stress_tests_app_config.yaml
cluster_compute: stress_tests/stress_tests_compute.yaml
run:
timeout: 7200
script: python stress_tests/test_many_tasks.py
type: sdk_command
file_manager: sdk
smoke_test:
frequency: multi
cluster:
      cluster_env: stress_tests/stress_tests_app_config.yaml
cluster_compute: stress_tests/smoke_test_compute.yaml
run:
timeout: 3600
script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
- name: stress_test_dead_actors
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: stress_test_dead_actors
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: stress_tests/stress_tests_app_config.yaml
cluster_compute: stress_tests/stress_tests_compute.yaml
run:
timeout: 7200
script: python stress_tests/test_dead_actors.py
type: sdk_command
file_manager: sdk
smoke_test:
frequency: multi
cluster:
      cluster_env: stress_tests/stress_tests_app_config.yaml
cluster_compute: stress_tests/smoke_test_compute.yaml
run:
timeout: 3600
script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3
--num-children=3
# The full test is not stable, so run the smoke test only.
# See https://github.com/ray-project/ray/issues/23244.
- name: threaded_actors_stress_test
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: threaded_actors_stress_test
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: stress_tests/stress_tests_app_config.yaml
cluster_compute: stress_tests/smoke_test_compute.yaml
run:
timeout: 3600
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s
30
wait_for_nodes:
num_nodes: 5
# - name: threaded_actors_stress_test
# group: core-daily-test
# working_dir: nightly_tests
# legacy:
# test_name: threaded_actors_stress_test
# test_suite: nightly_tests
#
# frequency: nightly
# team: core
# cluster:
# cluster_env: stress_tests/stress_tests_app_config.yaml
# cluster_compute: stress_tests/stress_test_threaded_actor_compute.yaml
#
# run:
# timeout: 7200
# script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s
# 60
#
# wait_for_nodes:
# num_nodes: 201
# timeout: 600
#
# type: sdk_command
# file_manager: sdk
#
# smoke_test:
# frequency: nightly
# cluster:
#       cluster_env: stress_tests/stress_tests_app_config.yaml
# cluster_compute: stress_tests/smoke_test_compute.yaml
#
# run:
# timeout: 3600
# script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s
# 30
#
# wait_for_nodes:
# num_nodes: 5
# timeout: 600
- name: dask_on_ray_1tb_sort
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: dask_on_ray_1tb_sort
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/1tb_sort_compute.yaml
run:
timeout: 7200
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions
1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
wait_for_nodes:
num_nodes: 32
type: sdk_command
file_manager: sdk
- name: many_nodes_actor_test
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: many_nodes_actor_test
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: many_nodes_tests/app_config.yaml
cluster_compute: many_nodes_tests/compute_config.yaml
run:
timeout: 7200
script: python many_nodes_tests/actor_test.py
wait_for_nodes:
num_nodes: 251
type: sdk_command
file_manager: sdk
- name: pg_autoscaling_regression_test
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: pg_autoscaling_regression_test
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: placement_group_tests/app_config.yaml
cluster_compute: placement_group_tests/compute.yaml
run:
timeout: 1200
script: python placement_group_tests/pg_run.py
type: sdk_command
file_manager: sdk
- name: placement_group_performance_test
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: placement_group_performance_test
test_suite: nightly_tests
frequency: nightly
team: core
cluster:
cluster_env: placement_group_tests/app_config.yaml
cluster_compute: placement_group_tests/pg_perf_test_compute.yaml
run:
timeout: 1200
script: python placement_group_tests/placement_group_performance_test.py
wait_for_nodes:
num_nodes: 5
type: sdk_command
file_manager: sdk
#########################
# Core Scalability Tests
#########################
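# These benchmarks probe scalability limits: a single large node, an object
# store test across 50 nodes, many_actors/many_tasks/many_pgs across 65
# nodes, and many_nodes across 250 nodes. The smoke variants run the same
# scripts on 2 nodes, scaled down via SMOKE_TEST=1 or a smaller --num-tasks.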
- name: single_node
group: core-scalability-test
working_dir: benchmarks
legacy:
test_name: single_node
test_suite: benchmark_tests
frequency: multi
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: single_node.yaml
run:
timeout: 12000
prepare: sleep 0
script: python single_node/test_single_node.py
type: sdk_command
file_manager: sdk
- name: object_store
group: core-scalability-test
working_dir: benchmarks
legacy:
test_name: object_store
test_suite: benchmark_tests
frequency: multi
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: object_store.yaml
run:
timeout: 3600
script: python object_store/test_object_store.py
wait_for_nodes:
num_nodes: 50
type: sdk_command
file_manager: sdk
- name: many_actors
group: core-scalability-test
working_dir: benchmarks
legacy:
test_name: many_actors
test_suite: benchmark_tests
frequency: nightly
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: distributed.yaml
run:
timeout: 3600
script: python distributed/test_many_actors.py
wait_for_nodes:
num_nodes: 65
type: sdk_command
file_manager: sdk
- name: many_actors_smoke_test
group: core-scalability-test
working_dir: benchmarks
legacy:
test_name: many_actors_smoke_test
test_suite: benchmark_tests
frequency: multi
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: distributed_smoke_test.yaml
run:
timeout: 3600
script: SMOKE_TEST=1 python distributed/test_many_actors.py
wait_for_nodes:
num_nodes: 2
type: sdk_command
file_manager: sdk
- name: many_tasks
group: core-scalability-test
working_dir: benchmarks
legacy:
test_name: many_tasks
test_suite: benchmark_tests
frequency: nightly
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: distributed.yaml
run:
timeout: 3600
script: python distributed/test_many_tasks.py --num-tasks=10000
wait_for_nodes:
num_nodes: 65
type: sdk_command
file_manager: sdk
smoke_test:
frequency: multi
cluster:
cluster_env: app_config.yaml
cluster_compute: distributed_smoke_test.yaml
run:
timeout: 3600
script: python distributed/test_many_tasks.py --num-tasks=100
wait_for_nodes:
num_nodes: 2
- name: many_pgs
group: core-scalability-test
working_dir: benchmarks
legacy:
test_name: many_pgs
test_suite: benchmark_tests
frequency: nightly
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: distributed.yaml
run:
timeout: 3600
script: python distributed/test_many_pgs.py
wait_for_nodes:
num_nodes: 65
type: sdk_command
file_manager: sdk
- name: many_pgs_smoke_test
group: core-scalability-test
working_dir: benchmarks
legacy:
test_name: many_pgs_smoke_test
test_suite: benchmark_tests
frequency: multi
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: distributed_smoke_test.yaml
run:
timeout: 3600
script: SMOKE_TEST=1 python distributed/test_many_pgs.py
wait_for_nodes:
num_nodes: 2
type: sdk_command
file_manager: sdk
- name: many_nodes
group: core-scalability-test
working_dir: benchmarks
env: staging
legacy:
test_name: many_nodes
test_suite: benchmark_tests
frequency: nightly
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: many_nodes.yaml
run:
timeout: 3600
script: python distributed/test_many_tasks.py --num-tasks=1000
wait_for_nodes:
num_nodes: 250
type: sdk_command
file_manager: sdk
- name: scheduling_test_many_0s_tasks_single_node
group: core-scalability-test
working_dir: benchmarks
legacy:
test_name: scheduling_test_many_0s_tasks_single_node
test_suite: benchmark_tests
frequency: nightly
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: scheduling.yaml
run:
timeout: 3600
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
--task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1
wait_for_nodes:
num_nodes: 32
type: sdk_command
file_manager: sdk
- name: scheduling_test_many_0s_tasks_many_nodes
group: core-scalability-test
working_dir: benchmarks
legacy:
test_name: scheduling_test_many_0s_tasks_many_nodes
test_suite: benchmark_tests
frequency: nightly
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: scheduling.yaml
run:
timeout: 3600
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
--task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1
wait_for_nodes:
num_nodes: 32
type: sdk_command
file_manager: sdk
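# The scheduling tests submit 1,984,000 zero-duration (--task-duration-s=0)
# tasks across 32 nodes, i.e. 62,000 tasks per node, first from a single
# submitting actor and then from 32 actors, one per node.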
# - name: scheduling_test_many_5s_tasks_single_node
# group: core-scalability-test
# working_dir: benchmarks
# legacy:
# test_name: scheduling_test_many_5s_tasks_single_node
# test_suite: benchmark_tests
# frequency: nightly
# team: core
# cluster:
# cluster_env: app_config.yaml
# cluster_compute: scheduling.yaml
# run:
# timeout: 3600
# script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
# --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
# wait_for_nodes:
# num_nodes: 32
# timeout: 600
# type: sdk_command
# file_manager: sdk
# stable: false
# - name: scheduling_test_many_5s_tasks_many_nodes
# group: core-scalability-test
# working_dir: benchmarks
# legacy:
# test_name: scheduling_test_many_5s_tasks_many_nodes
# test_suite: benchmark_tests
# frequency: nightly
# team: core
# cluster:
# cluster_env: app_config.yaml
# cluster_compute: scheduling.yaml
# run:
# timeout: 3600
# script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1
# --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
# wait_for_nodes:
# num_nodes: 32
# timeout: 600
# type: sdk_command
# file_manager: sdk
# stable: false
###############
# Dataset tests
###############
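# Dataset tests cover inference, shuffled data loading, Parquet metadata
# resolution, pipelined ingest/training, and large-scale sort/shuffle. The
# 1 TB sort and shuffle runs below use 1000 partitions x 1e9 bytes across
# 20 nodes.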
- name: inference
group: core-dataset-tests
working_dir: nightly_tests/dataset
legacy:
test_name: inference
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: inference.yaml
run:
timeout: 600
script: python inference.py
wait_for_nodes:
num_nodes: 2
type: sdk_command
file_manager: sdk
- name: shuffle_data_loader
group: core-dataset-tests
working_dir: nightly_tests/dataset
legacy:
test_name: shuffle_data_loader
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: shuffle_app_config.yaml
cluster_compute: shuffle_compute.yaml
run:
timeout: 1800
script: python dataset_shuffle_data_loader.py
type: sdk_command
file_manager: sdk
- name: parquet_metadata_resolution
group: core-dataset-tests
working_dir: nightly_tests/dataset
legacy:
test_name: parquet_metadata_resolution
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: pipelined_training_app.yaml
cluster_compute: pipelined_training_compute.yaml
run:
timeout: 1200
script: python parquet_metadata_resolution.py --num-files 915
wait_for_nodes:
num_nodes: 15
type: sdk_command
file_manager: sdk
- name: dataset_random_access
group: core-dataset-tests
working_dir: nightly_tests/dataset
stable: false
frequency: multi
team: core
cluster:
cluster_env: pipelined_training_app.yaml
cluster_compute: pipelined_training_compute.yaml
run:
timeout: 1200
script: python dataset_random_access.py
wait_for_nodes:
num_nodes: 15
type: sdk_command
file_manager: sdk
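# data_ingest_benchmark streams a 200 GB dataset through 20 workers
# (10 GB per worker) and must complete within the 300 s timeout.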
- name: pipelined_data_ingest_benchmark
group: core-dataset-tests
working_dir: nightly_tests/dataset
frequency: nightly
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: data_ingest_benchmark_compute.yaml
run:
timeout: 300
script: python data_ingest_benchmark.py --dataset-size-gb=200 --num-workers=20 --streaming
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
- name: pipelined_training_50_gb
group: core-dataset-tests
working_dir: nightly_tests/dataset
legacy:
test_name: pipelined_training_50_gb
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: pipelined_training_app.yaml
cluster_compute: pipelined_training_compute.yaml
run:
timeout: 4800
script: python pipelined_training.py --epochs 1
wait_for_nodes:
num_nodes: 15
type: sdk_command
file_manager: sdk
- name: pipelined_ingestion_1500_gb
group: core-dataset-tests
working_dir: nightly_tests/dataset
legacy:
test_name: pipelined_ingestion_1500_gb
test_suite: dataset_test
frequency: nightly
team: core
cluster:
cluster_env: pipelined_ingestion_app.yaml
cluster_compute: pipelined_ingestion_compute.yaml
run:
timeout: 9600
script: python pipelined_training.py --epochs 2 --num-windows 5 --num-files 915
--debug
wait_for_nodes:
num_nodes: 21
type: sdk_command
file_manager: sdk
- name: datasets_ingest_train_infer
group: core-dataset-tests
working_dir: nightly_tests/dataset
legacy:
test_name: datasets_ingest_train_infer
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: ray_sgd_training_app.yaml
cluster_compute: ray_sgd_training_compute.yaml
run:
timeout: 14400
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu
--large-dataset
wait_for_nodes:
num_nodes: 66
type: sdk_command
file_manager: sdk
smoke_test:
frequency: multi
cluster:
      cluster_env: ray_sgd_training_app.yaml
cluster_compute: ray_sgd_training_smoke_compute.yaml
run:
timeout: 3600
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
wait_for_nodes:
num_nodes: 8
- name: datasets_preprocess_ingest
group: core-dataset-tests
working_dir: nightly_tests/dataset
legacy:
test_name: datasets_preprocess_ingest
test_suite: dataset_test
frequency: nightly
team: core
cluster:
cluster_env: ray_sgd_training_app.yaml
cluster_compute: ray_sgd_training_compute_no_gpu.yaml
run:
timeout: 7200
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu
--large-dataset --debug
wait_for_nodes:
num_nodes: 21
type: sdk_command
file_manager: sdk
- name: datasets_ingest_400G
group: core-dataset-tests
working_dir: nightly_tests/dataset
legacy:
test_name: datasets_ingest_400G
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: ray_sgd_training_app.yaml
cluster_compute: dataset_ingest_400G_compute.yaml
run:
timeout: 7200
script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1
type: sdk_command
file_manager: sdk
- name: dataset_shuffle_random_shuffle_1tb
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: dataset_shuffle_random_shuffle_1tb
test_suite: dataset_test
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
run:
timeout: 7200
script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
- name: dataset_shuffle_sort_1tb
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: dataset_shuffle_sort_1tb
test_suite: dataset_test
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
run:
timeout: 7200
script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
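# The push_based variants below run the same sort workload with
# RAY_DATASET_PUSH_BASED_SHUFFLE=1 set, which (as the name suggests) switches
# Datasets to its push-based shuffle implementation.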
- name: dataset_shuffle_push_based_random_shuffle_1tb
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: dataset_shuffle_push_based_random_shuffle_1tb
test_suite: dataset_test
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
run:
timeout: 7200
script: RAY_DATASET_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
- name: dataset_shuffle_push_based_sort_1tb
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: dataset_shuffle_push_based_sort_1tb
test_suite: dataset_test
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
run:
timeout: 7200
script: RAY_DATASET_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
- name: dataset_shuffle_push_based_random_shuffle_100tb
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: dataset_shuffle_push_based_random_shuffle_100tb
test_suite: dataset_test
stable: false
frequency: nightly
team: core
cluster:
cluster_env: shuffle/100tb_shuffle_app_config.yaml
cluster_compute: shuffle/100tb_shuffle_compute.yaml
run:
timeout: 28800
script: RAY_DATASET_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=100000 --partition-size=1e9 --shuffle
wait_for_nodes:
num_nodes: 100
type: sdk_command
file_manager: sdk
##################
# Core Chaos tests
##################
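# Chaos tests reuse the workloads above but add a prepare step running
# setup_chaos.py, which (judging by its flags) kills nodes at a configurable
# --node-kill-interval, optionally capped by --max-nodes-to-kill, to exercise
# fault tolerance.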
- name: chaos_many_tasks_no_object_store
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_many_tasks_no_object_store
test_suite: chaos_test
frequency: multi
team: core
cluster:
cluster_env: chaos_test/app_config.yaml
cluster_compute: chaos_test/compute_template.yaml
run:
timeout: 3600
wait_for_nodes:
num_nodes: 10
prepare: python setup_chaos.py --no-start
script: python chaos_test/test_chaos_basic.py --workload=tasks
type: sdk_command
file_manager: sdk
- name: chaos_many_actors
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_many_actors
test_suite: chaos_test
frequency: multi
team: core
cluster:
cluster_env: chaos_test/app_config.yaml
cluster_compute: chaos_test/compute_template.yaml
run:
timeout: 3600
wait_for_nodes:
num_nodes: 10
prepare: python setup_chaos.py --no-start
script: python chaos_test/test_chaos_basic.py --workload=actors
type: sdk_command
file_manager: sdk
- name: chaos_dask_on_ray_large_scale_test_no_spilling
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_dask_on_ray_large_scale_test_no_spilling
test_suite: chaos_test
frequency: nightly
team: core
cluster:
cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml
run:
timeout: 7200
wait_for_nodes:
num_nodes: 21
prepare: python setup_chaos.py --node-kill-interval 100
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb
20 --error_rate 0 --data_save_path /tmp/ray
type: sdk_command
file_manager: sdk
- name: chaos_dask_on_ray_large_scale_test_spilling
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_dask_on_ray_large_scale_test_spilling
test_suite: chaos_test
frequency: nightly
team: core
cluster:
cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
run:
timeout: 7200
wait_for_nodes:
num_nodes: 21
prepare: python setup_chaos.py --node-kill-interval 100
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb
70 --error_rate 0 --data_save_path /tmp/ray
type: sdk_command
file_manager: sdk
- name: chaos_pipelined_ingestion_1500_gb_15_windows
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_pipelined_ingestion_1500_gb_15_windows
test_suite: chaos_test
frequency: nightly
team: core
cluster:
cluster_env: dataset/pipelined_ingestion_app.yaml
cluster_compute: dataset/pipelined_ingestion_compute.yaml
run:
timeout: 7200
wait_for_nodes:
num_nodes: 21
    prepare: python setup_chaos.py --node-kill-interval 300
script: python dataset/pipelined_training.py --epochs 1 --num-windows 15 --num-files
915 --debug
type: sdk_command
file_manager: sdk
- name: chaos_dataset_shuffle_push_based_sort_1tb
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_dataset_shuffle_push_based_sort_1tb
test_suite: chaos_test
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
run:
timeout: 7200
    prepare: python setup_chaos.py --node-kill-interval 1200 --max-nodes-to-kill 3
script: RAY_DATASET_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
- name: chaos_dataset_shuffle_sort_1tb
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_dataset_shuffle_sort_1tb
test_suite: chaos_test
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
run:
timeout: 7200
    prepare: python setup_chaos.py --node-kill-interval 900 --max-nodes-to-kill 3
script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
- name: chaos_dataset_shuffle_random_shuffle_1tb
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_dataset_shuffle_random_shuffle_1tb
test_suite: chaos_test
stable: false
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
run:
timeout: 7200
    prepare: python setup_chaos.py --node-kill-interval 600 --max-nodes-to-kill 2
script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
- name: chaos_dataset_shuffle_push_based_random_shuffle_1tb
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_dataset_shuffle_push_based_random_shuffle_1tb
test_suite: chaos_test
stable: false
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml
run:
timeout: 7200
    prepare: python setup_chaos.py --node-kill-interval 600 --max-nodes-to-kill 2
script: RAY_DATASET_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle
wait_for_nodes:
num_nodes: 20
type: sdk_command
file_manager: sdk
- name: k8s_serve_ha_test
group: k8s-test
working_dir: k8s_tests
legacy:
test_name: k8s_serve_ha_test
test_suite: k8s_tests
stable: false
frequency: nightly
team: serve
cluster:
cluster_env: app_config.yaml
cluster_compute: compute_tpl.yaml
run:
timeout: 1800
prepare: bash prepare.sh
script: python run_gcs_ft_on_k8s.py
type: sdk_command
file_manager: sdk