2021-07-16 14:15:49 -07:00
|
|
|
- name: inference
|
2021-12-28 07:42:41 +09:00
|
|
|
team: core
|
2021-07-16 14:15:49 -07:00
|
|
|
cluster:
|
|
|
|
app_config: app_config.yaml
|
|
|
|
compute_template: inference.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 600
|
2021-09-23 20:39:03 -07:00
|
|
|
prepare: python wait_cluster.py 2 600
|
2021-07-16 14:15:49 -07:00
|
|
|
script: python inference.py
|
2021-08-01 18:03:46 -07:00
|
|
|
|
2021-08-20 11:26:01 -07:00
|
|
|
- name: shuffle_data_loader
|
2021-12-28 07:42:41 +09:00
|
|
|
team: core
|
2021-08-20 11:26:01 -07:00
|
|
|
cluster:
|
|
|
|
app_config: shuffle_app_config.yaml
|
|
|
|
compute_template: shuffle_compute.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 1800
|
|
|
|
script: python dataset_shuffle_data_loader.py
|
2021-09-23 20:39:03 -07:00
|
|
|
|
|
|
|
- name: pipelined_training_50_gb
|
2021-12-28 07:42:41 +09:00
|
|
|
team: core
|
2021-09-23 20:39:03 -07:00
|
|
|
cluster:
|
|
|
|
app_config: pipelined_training_app.yaml
|
|
|
|
compute_template: pipelined_training_compute.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 4800
|
|
|
|
prepare: python wait_cluster.py 15 1200
|
2021-10-18 18:53:50 -07:00
|
|
|
script: python pipelined_training.py --epochs 1
|
2021-09-23 20:39:03 -07:00
|
|
|
|
|
|
|
- name: pipelined_ingestion_1500_gb_15_windows
|
2021-12-28 07:42:41 +09:00
|
|
|
team: core
|
2021-09-23 20:39:03 -07:00
|
|
|
cluster:
|
|
|
|
app_config: pipelined_ingestion_app.yaml
|
|
|
|
compute_template: pipelined_ingestion_compute.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 4800
|
|
|
|
prepare: python wait_cluster.py 21 2400
|
|
|
|
script: python pipelined_training.py --epochs 2 --num-windows 15 --num-files 915 --debug
|
2021-11-21 20:04:45 -08:00
|
|
|
|
|
|
|
- name: datasets_ingest_train_infer
|
2021-12-28 07:42:41 +09:00
|
|
|
team: core
|
2021-11-21 20:04:45 -08:00
|
|
|
cluster:
|
|
|
|
app_config: ray_sgd_training_app.yaml
|
|
|
|
compute_template: ray_sgd_training_compute.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 14400
|
|
|
|
prepare: python wait_cluster.py 66 2400
|
|
|
|
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset
|
|
|
|
|
2021-11-29 19:10:17 -08:00
|
|
|
stable: false
|
|
|
|
|
2021-11-21 20:04:45 -08:00
|
|
|
smoke_test:
|
|
|
|
cluster:
|
|
|
|
app_config: ray_sgd_training_app.yaml
|
|
|
|
compute_template: ray_sgd_training_smoke_compute.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 3600
|
|
|
|
prepare: python wait_cluster.py 8 2400
|
|
|
|
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
|
2021-11-29 19:10:17 -08:00
|
|
|
|
|
|
|
- name: datasets_preprocess_ingest
|
2021-12-28 07:42:41 +09:00
|
|
|
team: core
|
2021-11-29 19:10:17 -08:00
|
|
|
cluster:
|
|
|
|
app_config: ray_sgd_training_app.yaml
|
2021-12-06 09:59:21 -08:00
|
|
|
compute_template: ray_sgd_training_compute_no_gpu.yaml
|
2021-11-29 19:10:17 -08:00
|
|
|
|
|
|
|
run:
|
2021-12-06 09:59:21 -08:00
|
|
|
timeout: 7200
|
|
|
|
prepare: python wait_cluster.py 21 2400
|
2021-11-29 19:10:17 -08:00
|
|
|
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset --debug
|
|
|
|
|
|
|
|
stable: false
|
2021-12-07 02:50:17 -08:00
|
|
|
|
|
|
|
- name: datasets_ingest_400G
|
2021-12-28 07:42:41 +09:00
|
|
|
team: core
|
2021-12-07 02:50:17 -08:00
|
|
|
cluster:
|
|
|
|
app_config: ray_sgd_training_app.yaml
|
|
|
|
compute_template: dataset_ingest_400G_compute.yaml
|
|
|
|
|
|
|
|
run:
|
|
|
|
timeout: 7200
|
|
|
|
script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1
|
|
|
|
|
|
|
|
stable: false
|
|
|
|
|