ray/release/long_running_tests/long_running_tests.yaml
SangBin Cho b1308b1c8c
[Test Infra] Unrevert team col (#21700)
This fixes the problems that caused the team column change to be reverted.

This has two additional changes:

The alert handler now receives the team argument, which was the root cause of the breakage (see https://github.com/ray-project/ray/pull/21289).

Previously, tests without a team column raised an exception; I have weakened this condition so that it only logs a warning. I will eventually change it back to raising an exception, but for a smoother transition we will log a warning instead for a short time.
2022-01-19 13:29:53 -08:00

196 lines
3.3 KiB
YAML

# Test "actor_deaths" (team: core).
# `run` holds the full-length settings; `smoke_test.run` supplies a shorter
# timeout — presumably layered over `run` by the release tooling (TODO confirm).
- name: actor_deaths
  team: core

  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    prepare: ray stop
    script: python workloads/actor_deaths.py
    long_running: true

  smoke_test:
    run:
      timeout: 3600
# Test "apex" (team: ml).
# Uses the app config from the sibling rllib_tests directory.
- name: apex
  team: ml

  cluster:
    app_config: ../rllib_tests/app_config.yaml
    compute_template: tpl_cpu_3.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    # NOTE(review): the arguments "3 600" look like a node count and a wait
    # limit — verify against wait_cluster.py.
    prepare: python wait_cluster.py 3 600
    script: python workloads/apex.py
    long_running: true

  # Shorter timeout for the smoke-test variant of this test.
  smoke_test:
    run:
      timeout: 3600
# Test "impala" (team: ml).
# This entry defines no `prepare` step; only `script` is run.
- name: impala
  team: ml

  cluster:
    app_config: app_config_np.yaml
    compute_template: tpl_cpu_1_large.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    script: python workloads/impala.py
    long_running: true

  # Shorter timeout for the smoke-test variant of this test.
  smoke_test:
    run:
      timeout: 3600
# Test "many_actor_tasks" (team: core).
- name: many_actor_tasks
  team: core

  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    prepare: ray stop
    script: python workloads/many_actor_tasks.py
    long_running: true

  # Shorter timeout for the smoke-test variant of this test.
  smoke_test:
    run:
      timeout: 3600
# Test "many_drivers" (team: core).
- name: many_drivers
  team: core

  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    prepare: ray stop
    script: python workloads/many_drivers.py
    long_running: true

  # Shorter timeout for the smoke-test variant of this test.
  smoke_test:
    run:
      timeout: 3600
# Test "many_ppo" (team: ml).
# Uses the app config from the sibling rllib_tests directory and its own
# compute template.
- name: many_ppo
  team: ml

  cluster:
    app_config: ../rllib_tests/app_config.yaml
    compute_template: many_ppo.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    # NOTE(review): the arguments "1 600" look like a node count and a wait
    # limit — verify against wait_cluster.py.
    prepare: python wait_cluster.py 1 600
    script: python workloads/many_ppo.py
    long_running: true

  # Shorter timeout for the smoke-test variant of this test.
  smoke_test:
    run:
      timeout: 3600

  # This test is flagged as not stable.
  # NOTE(review): assumed to be a test-level key rather than part of
  # `smoke_test` — confirm against the release tooling.
  stable: false
# Test "many_tasks" (team: core).
- name: many_tasks
  team: core

  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    prepare: ray stop
    script: python workloads/many_tasks.py
    long_running: true

  # Shorter timeout for the smoke-test variant of this test.
  smoke_test:
    run:
      timeout: 3600
# Test "many_tasks_serialized_ids" (team: core).
- name: many_tasks_serialized_ids
  team: core

  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    prepare: ray stop
    script: python workloads/many_tasks_serialized_ids.py
    long_running: true

  # Shorter timeout for the smoke-test variant of this test.
  smoke_test:
    run:
      timeout: 3600
# Test "node_failures" (team: core).
- name: node_failures
  team: core

  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    prepare: ray stop
    script: python workloads/node_failures.py
    long_running: true

  # Shorter timeout for the smoke-test variant of this test.
  smoke_test:
    run:
      timeout: 3600
# Test "pbt" (team: ml).
# Uses the app config from the sibling rllib_tests directory.
- name: pbt
  team: ml

  cluster:
    app_config: ../rllib_tests/app_config.yaml
    compute_template: tpl_cpu_1.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    prepare: ray stop
    script: python workloads/pbt.py
    long_running: true

  # Shorter timeout for the smoke-test variant of this test.
  smoke_test:
    run:
      timeout: 3600
# Test "serve" (team: serve).
- name: serve
  team: serve

  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    prepare: ray stop
    script: python workloads/serve.py
    long_running: true

  # Shorter timeout for the smoke-test variant of this test.
  smoke_test:
    run:
      timeout: 3600
# Test "serve_failure" (team: serve).
- name: serve_failure
  team: serve

  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml

  run:
    # Timeout unit is presumably seconds (86400 s = 24 h) — TODO confirm.
    timeout: 86400
    prepare: ray stop
    script: python workloads/serve_failure.py
    long_running: true

  # Shorter timeout for the smoke-test variant of this test.
  smoke_test:
    run:
      timeout: 3600