2021-05-08 21:38:39 +02:00
|
|
|
- name: train_small
|
2022-01-20 06:29:53 +09:00
|
|
|
team: ml
|
2021-05-08 21:38:39 +02:00
|
|
|
cluster:
|
|
|
|
app_config: app_config.yaml
|
|
|
|
compute_template: tpl_cpu_small.yaml
|
|
|
|
|
|
|
|
run:
|
2021-09-24 16:17:06 +01:00
|
|
|
use_connect: True
|
|
|
|
autosuspend_mins: 10
|
2021-05-08 21:38:39 +02:00
|
|
|
timeout: 600
|
|
|
|
prepare: python wait_cluster.py 4 600
|
|
|
|
script: python workloads/train_small.py
|
|
|
|
|
|
|
|
- name: train_moderate
|
2022-01-20 06:29:53 +09:00
|
|
|
team: ml
|
2021-05-08 21:38:39 +02:00
|
|
|
cluster:
|
|
|
|
app_config: app_config.yaml
|
|
|
|
compute_template: tpl_cpu_moderate.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 600
|
|
|
|
prepare: python wait_cluster.py 32 600
|
|
|
|
script: python workloads/train_moderate.py
|
|
|
|
|
|
|
|
- name: train_gpu
|
2022-01-20 06:29:53 +09:00
|
|
|
team: ml
|
2021-05-08 21:38:39 +02:00
|
|
|
cluster:
|
2021-06-01 20:19:15 +02:00
|
|
|
app_config: app_config_gpu.yaml
|
2021-05-08 21:38:39 +02:00
|
|
|
compute_template: tpl_gpu_small.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 600
|
|
|
|
prepare: python wait_cluster.py 5 600
|
|
|
|
script: python workloads/train_gpu.py
|
|
|
|
|
|
|
|
- name: distributed_api_test
|
2022-01-20 06:29:53 +09:00
|
|
|
team: ml
|
2021-05-08 21:38:39 +02:00
|
|
|
cluster:
|
|
|
|
app_config: app_config.yaml
|
|
|
|
compute_template: tpl_cpu_small.yaml
|
|
|
|
results:
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 600
|
|
|
|
prepare: python wait_cluster.py 4 600
|
|
|
|
script: python workloads/distributed_api_test.py
|
|
|
|
results: ""
|
|
|
|
|
|
|
|
- name: ft_small_elastic
|
2022-01-20 06:29:53 +09:00
|
|
|
team: ml
|
2021-05-08 21:38:39 +02:00
|
|
|
cluster:
|
|
|
|
app_config: app_config.yaml
|
|
|
|
compute_template: tpl_cpu_small.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 900
|
|
|
|
prepare: python wait_cluster.py 4 600
|
|
|
|
script: python workloads/ft_small_elastic.py
|
|
|
|
results: ""
|
|
|
|
|
|
|
|
- name: ft_small_non_elastic
|
2022-01-20 06:29:53 +09:00
|
|
|
team: ml
|
2021-05-08 21:38:39 +02:00
|
|
|
cluster:
|
|
|
|
app_config: app_config.yaml
|
|
|
|
compute_template: tpl_cpu_small.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 900
|
|
|
|
prepare: python wait_cluster.py 4 600
|
|
|
|
script: python workloads/ft_small_non_elastic.py
|
|
|
|
results: ""
|
|
|
|
|
|
|
|
- name: tune_small
|
2022-01-20 06:29:53 +09:00
|
|
|
team: ml
|
2021-05-08 21:38:39 +02:00
|
|
|
cluster:
|
|
|
|
app_config: app_config.yaml
|
|
|
|
compute_template: tpl_cpu_small.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 600
|
|
|
|
prepare: python wait_cluster.py 4 600
|
|
|
|
script: python workloads/tune_small.py
|
|
|
|
|
|
|
|
- name: tune_32x4
|
2022-01-20 06:29:53 +09:00
|
|
|
team: ml
|
2021-05-08 21:38:39 +02:00
|
|
|
cluster:
|
|
|
|
app_config: app_config.yaml
|
|
|
|
compute_template: tpl_cpu_moderate.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 900
|
|
|
|
prepare: python wait_cluster.py 32 600
|
|
|
|
script: python workloads/tune_32x4.py
|
|
|
|
|
|
|
|
- name: tune_4x32
|
2022-01-20 06:29:53 +09:00
|
|
|
team: ml
|
2021-05-08 21:38:39 +02:00
|
|
|
cluster:
|
|
|
|
app_config: app_config.yaml
|
|
|
|
compute_template: tpl_cpu_moderate.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 900
|
|
|
|
prepare: python wait_cluster.py 32 600
|
|
|
|
script: python workloads/tune_4x32.py
|