# ray/release/tune_tests/cloud_tests/tune_cloud_tests.yaml
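# Release tests for Ray Tune's cloud syncing and checkpointing behavior.
# Each entry below defines one test:
#   name:     test identifier
#   team:     owning team (used by the release alerting)
#   cluster:  Anyscale cluster setup (app_config + compute_template, plus an
#             explicit cloud_id for the GCP/K8s variants)
#   run:      timeout in seconds, an optional prepare command, and the workload
#             script to execute
# The prepare step `python wait_cluster.py 4 600` presumably blocks until the
# 4-node cluster is up, waiting at most 600 seconds, before the workload runs.
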
- name: aws_no_sync_down
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py no_sync_down

- name: aws_ssh_sync
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py ssh_sync

- name: aws_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload
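
# The next two entries run the durable_upload workload with RLlib trainables
# (--trainable rllib_str / rllib_trainer); app_config_ml.yaml presumably layers
# the extra ML/RLlib dependencies on top of the base app config.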
- name: aws_durable_upload_rllib_str
  team: ml
  cluster:
    app_config: app_config_ml.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str

- name: aws_durable_upload_rllib_trainer
  team: ml
  cluster:
    app_config: app_config_ml.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer

- name: aws_no_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload
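
# GCP/Kubernetes variants: these run on the anyscale_k8s_gcp_cloud via Anyscale
# connect (use_connect: True), upload to a GCS bucket instead of S3, and pass
# --cpus-per-trial 8 as a temporary workaround until n2-standard-2 nodes are
# supported.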
- name: gcp_k8s_no_sync_down
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8

- name: gcp_k8s_ssh_sync
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8

- name: gcp_k8s_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload

- name: gcp_k8s_no_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py no_durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload