mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00

This fixes the problems that led to the earlier revert of the team column change. It includes two additional changes. First, the alert handler now receives the team argument; the missing argument was the root cause of the breakage (see https://github.com/ray-project/ray/pull/21289). Second, tests without a team column previously raised an exception, but I have weakened that check so it only logs a warning. I will eventually change it back to raising an exception, but for a smoother transition we will log a warning instead for a short time.
124 lines
No EOL
2.7 KiB
YAML
124 lines
No EOL
2.7 KiB
YAML
# Test entry: horovod_user_test_latest (team: ml).
- name: horovod_user_test_latest
  team: ml
  # Cluster definition: app config and compute template paths.
  cluster:
    app_config: horovod/app_config.yaml
    compute_template: horovod/compute_tpl.yaml

  # NOTE(review): placed at entry level (sibling of cluster/run) — confirm.
  driver_setup: horovod/driver_setup_latest.sh

  # Run settings for the test script.
  run:
    use_connect: True
    autosuspend_mins: 10
    timeout: 1200  # NOTE(review): unit is not stated in this file — confirm.
    script: python horovod/horovod_user_test.py
# Test entry: horovod_user_test_master (team: ml).
- name: horovod_user_test_master
  team: ml
  # Cluster definition: app config and compute template paths.
  cluster:
    # NOTE(review): app_config uses a parent-relative path (../horovod_tests/),
    # while compute_template stays under horovod/ — confirm this is intended.
    app_config: ../horovod_tests/app_config_master.yaml
    compute_template: horovod/compute_tpl.yaml

  # NOTE(review): placed at entry level (sibling of cluster/run) — confirm.
  driver_setup: horovod/driver_setup_master.sh

  # Run settings for the test script.
  run:
    use_connect: True
    autosuspend_mins: 10
    timeout: 1200  # NOTE(review): unit is not stated in this file — confirm.
    script: python horovod/horovod_user_test.py
# Test entry: train_tensorflow_mnist_test (team: ml).
- name: train_tensorflow_mnist_test
  team: ml
  # Cluster definition: app config and compute template paths.
  cluster:
    app_config: train/app_config.yaml
    compute_template: train/compute_tpl.yaml

  # NOTE(review): placed at entry level (sibling of cluster/run) — confirm.
  driver_setup: train/driver_setup.sh

  # Run settings for the test script. This entry sets no autosuspend_mins.
  run:
    use_connect: True
    timeout: 36000  # NOTE(review): unit is not stated in this file — confirm.
    script: python train/train_tensorflow_mnist_test.py
# Test entry: train_torch_linear_test (team: ml).
- name: train_torch_linear_test
  team: ml
  # Cluster definition: app config and compute template paths.
  cluster:
    app_config: train/app_config.yaml
    compute_template: train/compute_tpl.yaml

  # NOTE(review): placed at entry level (sibling of cluster/run) — confirm.
  driver_setup: train/driver_setup.sh

  # Run settings for the test script. This entry sets no autosuspend_mins.
  run:
    use_connect: True
    timeout: 36000  # NOTE(review): unit is not stated in this file — confirm.
    script: python train/train_torch_linear_test.py
# Test entry: xgboost_gpu_connect_latest (team: ml).
# This entry defines no driver_setup and no autosuspend_mins.
- name: xgboost_gpu_connect_latest
  team: ml
  # Cluster definition: app config and compute template paths.
  cluster:
    app_config: xgboost/app_config_gpu.yaml
    compute_template: xgboost/tpl_gpu_small_scaling.yaml

  # Run settings for the test script.
  run:
    use_connect: True
    timeout: 1200  # NOTE(review): unit is not stated in this file — confirm.
    script: python xgboost/train_gpu_connect.py
# Test entry: xgboost_gpu_connect_master (team: ml).
# This entry defines no driver_setup and no autosuspend_mins.
- name: xgboost_gpu_connect_master
  team: ml
  # Cluster definition: app config and compute template paths.
  cluster:
    app_config: xgboost/app_config_gpu_master.yaml
    compute_template: xgboost/tpl_gpu_small_scaling.yaml

  # Run settings for the test script.
  run:
    use_connect: True
    timeout: 1200  # NOTE(review): unit is not stated in this file — confirm.
    script: python xgboost/train_gpu_connect.py
# Test entry: ray_lightning_user_test_latest (team: ml).
- name: ray_lightning_user_test_latest
  team: ml
  # Cluster definition: app config and compute template paths.
  cluster:
    app_config: ray-lightning/app_config.yaml
    compute_template: ray-lightning/compute_tpl.yaml

  # NOTE(review): placed at entry level (sibling of cluster/run) — confirm.
  driver_setup: ray-lightning/driver_setup.sh

  # Run settings for the test script.
  run:
    use_connect: True
    autosuspend_mins: 10
    timeout: 1200  # NOTE(review): unit is not stated in this file — confirm.
    script: python ray-lightning/ray_lightning_user_test.py
# Test entry: ray_lightning_user_test_master (team: ml).
- name: ray_lightning_user_test_master
  team: ml
  # Cluster definition: app config and compute template paths.
  cluster:
    app_config: ray-lightning/app_config_master.yaml
    compute_template: ray-lightning/compute_tpl.yaml

  # NOTE(review): placed at entry level (sibling of cluster/run) — confirm.
  driver_setup: ray-lightning/driver_setup.sh

  # Run settings for the test script.
  run:
    use_connect: True
    autosuspend_mins: 10
    timeout: 1200  # NOTE(review): unit is not stated in this file — confirm.
    script: python ray-lightning/ray_lightning_user_test.py
# Test entry: tune_rllib_connect_test (team: ml).
- name: tune_rllib_connect_test
  team: ml
  # Cluster definition: app config and compute template paths.
  cluster:
    # NOTE(review): app_config uses a parent-relative path (../rllib_tests/),
    # while compute_template stays under tune_rllib/ — confirm this is intended.
    app_config: ../rllib_tests/app_config.yaml
    compute_template: tune_rllib/compute_tpl.yaml

  # NOTE(review): placed at entry level (sibling of cluster/run) — confirm.
  driver_setup: tune_rllib/driver_setup.sh

  # Run settings for the test script.
  run:
    use_connect: True
    autosuspend_mins: 10
    timeout: 1200  # NOTE(review): unit is not stated in this file — confirm.
    script: python tune_rllib/run_connect_tests.py