mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[Test Infra] Unrevert team col (#21700)
This fixes the problems that led to the team column change being reverted. It includes 2 additional changes: (1) The alert handlers now receive the team argument; the missing argument was the root cause of the breakage. See https://github.com/ray-project/ray/pull/21289. (2) Previously, tests without a team column raised an exception; the check is now weaker and only logs a warning. I will eventually change it back to raising an exception, but for a smoother transition we will log a warning instead for a short time.
This commit is contained in:
parent
88143cdc35
commit
b1308b1c8c
24 changed files with 188 additions and 129 deletions
|
@ -1,8 +1,5 @@
|
|||
- name: single_node
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Alex Wu"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: single_node.yaml
|
||||
|
@ -13,10 +10,7 @@
|
|||
script: python single_node/test_single_node.py
|
||||
|
||||
- name: object_store
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Alex Wu"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: object_store.yaml
|
||||
|
@ -27,10 +21,7 @@
|
|||
script: python object_store/test_object_store.py
|
||||
|
||||
- name: many_actors
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Alex Wu"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: distributed.yaml
|
||||
|
@ -41,10 +32,7 @@
|
|||
script: python distributed/test_many_actors.py
|
||||
|
||||
- name: many_actors_smoke_test
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Alex Wu"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: distributed_smoke_test.yaml
|
||||
|
@ -55,10 +43,7 @@
|
|||
script: SMOKE_TEST=1 python distributed/test_many_actors.py
|
||||
|
||||
- name: many_tasks
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Alex Wu"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: distributed.yaml
|
||||
|
@ -69,10 +54,7 @@
|
|||
script: python distributed/test_many_tasks.py --num-tasks=10000
|
||||
|
||||
- name: many_tasks_smoke_test
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Alex Wu"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: distributed_smoke_test.yaml
|
||||
|
@ -83,10 +65,7 @@
|
|||
script: python distributed/test_many_tasks.py --num-tasks=100
|
||||
|
||||
- name: many_pgs
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Alex Wu"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: distributed.yaml
|
||||
|
@ -97,10 +76,7 @@
|
|||
script: python distributed/test_many_pgs.py
|
||||
|
||||
- name: many_pgs_smoke_test
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Alex Wu"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: distributed_smoke_test.yaml
|
||||
|
@ -112,10 +88,7 @@
|
|||
|
||||
# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
|
||||
- name: many_nodes
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Alex Wu"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: many_nodes.yaml
|
||||
|
@ -126,10 +99,7 @@
|
|||
script: python distributed/test_many_tasks.py --num-tasks=1000
|
||||
|
||||
- name: many_tasks_redis_ha
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Yi Cheng"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: distributed.yaml
|
||||
|
@ -146,10 +116,7 @@
|
|||
stable: false
|
||||
|
||||
- name: many_actors_redis_ha
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Yi Cheng"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: distributed.yaml
|
||||
|
@ -166,10 +133,7 @@
|
|||
stable: false
|
||||
|
||||
- name: many_nodes_redis_ha
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Yi Cheng"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: many_nodes.yaml
|
||||
|
@ -186,10 +150,7 @@
|
|||
stable: false
|
||||
|
||||
- name: many_pgs_redis_ha
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Yi Cheng"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: distributed.yaml
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Dict, Optional
|
|||
|
||||
def handle_result(created_on: datetime.datetime, category: str,
|
||||
test_suite: str, test_name: str, status: str, results: Dict,
|
||||
artifacts: Dict, last_logs: str) -> Optional[str]:
|
||||
artifacts: Dict, last_logs: str, team: str) -> Optional[str]:
|
||||
|
||||
if not status == "finished":
|
||||
return f"Test script did not finish successfully ({status})."
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Dict, Optional
|
|||
|
||||
def handle_result(created_on: datetime.datetime, category: str,
|
||||
test_suite: str, test_name: str, status: str, results: Dict,
|
||||
artifacts: Dict, last_logs: str) -> Optional[str]:
|
||||
artifacts: Dict, last_logs: str, team: str) -> Optional[str]:
|
||||
assert test_suite == "long_running_tests"
|
||||
|
||||
# elapsed_time = results.get("elapsed_time", 0.)
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Dict, Optional
|
|||
|
||||
def handle_result(created_on: datetime.datetime, category: str,
|
||||
test_suite: str, test_name: str, status: str, results: Dict,
|
||||
artifacts: Dict, last_logs: str) -> Optional[str]:
|
||||
artifacts: Dict, last_logs: str, team: str) -> Optional[str]:
|
||||
assert test_suite == "rllib_tests"
|
||||
|
||||
if not status == "finished":
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Dict, Optional
|
|||
|
||||
def handle_result(created_on: datetime.datetime, category: str,
|
||||
test_suite: str, test_name: str, status: str, results: Dict,
|
||||
artifacts: Dict, last_logs: str) -> Optional[str]:
|
||||
artifacts: Dict, last_logs: str, team: str) -> Optional[str]:
|
||||
assert test_suite == "tune_tests"
|
||||
|
||||
msg = ""
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Dict, Optional
|
|||
|
||||
def handle_result(created_on: datetime.datetime, category: str,
|
||||
test_suite: str, test_name: str, status: str, results: Dict,
|
||||
artifacts: Dict, last_logs: str) -> Optional[str]:
|
||||
artifacts: Dict, last_logs: str, team: str) -> Optional[str]:
|
||||
assert test_suite == "xgboost_tests"
|
||||
|
||||
time_taken = results.get("time_taken", float("inf"))
|
||||
|
|
|
@ -286,6 +286,7 @@ GLOBAL_CONFIG = {
|
|||
|
||||
REPORT_S = 30
|
||||
RETRY_MULTIPLIER = 2
|
||||
VALID_TEAMS = ["ml", "core", "serve"]
|
||||
|
||||
|
||||
class ExitCode(enum.Enum):
|
||||
|
@ -683,20 +684,17 @@ def maybe_get_alert_for_result(result_dict: Dict[str, Any]) -> Optional[str]:
|
|||
return alert
|
||||
|
||||
|
||||
def report_result(test_suite: str, test_name: str, status: str, last_logs: str,
|
||||
results: Dict[Any, Any], artifacts: Dict[Any, Any],
|
||||
category: str):
|
||||
def report_result(*, test_suite: str, test_name: str, status: str,
|
||||
last_logs: str, results: Dict[Any, Any],
|
||||
artifacts: Dict[Any, Any], category: str, team: str):
|
||||
# session_url: str, commit_url: str,
|
||||
# runtime: float, stable: bool, frequency: str, return_code: int):
|
||||
"""Report the test result to database."""
|
||||
now = datetime.datetime.utcnow()
|
||||
rds_data_client = boto3.client("rds-data", region_name="us-west-2")
|
||||
|
||||
schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]
|
||||
|
||||
sql = (
|
||||
f"INSERT INTO {schema} "
|
||||
f"(created_on, test_suite, test_name, status, last_logs, "
|
||||
f"results, artifacts, category) "
|
||||
f"VALUES (:created_on, :test_suite, :test_name, :status, :last_logs, "
|
||||
f":results, :artifacts, :category)")
|
||||
parameters = [{
|
||||
"name": "created_on",
|
||||
"typeHint": "TIMESTAMP",
|
||||
|
@ -740,7 +738,20 @@ def report_result(test_suite: str, test_name: str, status: str, last_logs: str,
|
|||
"value": {
|
||||
"stringValue": category
|
||||
}
|
||||
}, {
|
||||
"name": "team",
|
||||
"value": {
|
||||
"stringValue": team
|
||||
}
|
||||
}]
|
||||
columns = [param["name"] for param in parameters]
|
||||
values = [f":{param['name']}" for param in parameters]
|
||||
column_str = ", ".join(columns).strip(", ")
|
||||
value_str = ", ".join(values).strip(", ")
|
||||
|
||||
sql = (f"INSERT INTO {schema} " f"({column_str}) " f"VALUES ({value_str})")
|
||||
|
||||
logger.info(f"Query: {sql}")
|
||||
|
||||
# Default boto3 call timeout is 45 seconds.
|
||||
retry_delay_s = 64
|
||||
|
@ -2177,6 +2188,18 @@ def run_test(test_config_file: str,
|
|||
driver_setup_script = test_config.get("driver_setup", None)
|
||||
if driver_setup_script:
|
||||
run_bash_script(local_dir, driver_setup_script)
|
||||
logger.info(test_config)
|
||||
team = test_config.get("team", "unspecified").strip(" ").lower()
|
||||
# When running local test, this validates the team name.
|
||||
# If the team name is not specified, they will be recorded as "unspecified"
|
||||
if not report and team not in VALID_TEAMS:
|
||||
logger.warning(
|
||||
f"Incorrect team name {team} has given."
|
||||
"Please specify team under the name field in the test config. "
|
||||
"For example, within nightly_tests.yaml,\n"
|
||||
"\tname: test_xxx\n"
|
||||
f"\tteam: {'|'.join(VALID_TEAMS)}\n"
|
||||
"\tcluster:...")
|
||||
|
||||
result = run_test_config(
|
||||
local_dir,
|
||||
|
@ -2226,7 +2249,7 @@ def run_test(test_config_file: str,
|
|||
results=result.get("results", {}),
|
||||
artifacts=result.get("artifacts", {}),
|
||||
category=category,
|
||||
)
|
||||
team=team)
|
||||
|
||||
if not has_errored(result):
|
||||
# Check if result are met if test succeeded
|
||||
|
@ -2254,7 +2277,7 @@ def run_test(test_config_file: str,
|
|||
except Exception as e:
|
||||
# On database error the test should still pass
|
||||
# Todo: flag somewhere else?
|
||||
logger.error(f"Error persisting results to database: {e}")
|
||||
logger.exception(f"Error persisting results to database: {e}")
|
||||
else:
|
||||
logger.info(f"Usually I would now report the following results:\n"
|
||||
f"{report_kwargs}")
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
- name: dask_xgboost_test
|
||||
owner:
|
||||
mail: "antoni@anyscale.com"
|
||||
slack: "@team_ml"
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: dask_xgboost_app_config.yaml
|
||||
compute_template: compute_tpl.yaml
|
||||
|
@ -20,9 +18,7 @@
|
|||
]
|
||||
|
||||
- name: modin_xgboost_test
|
||||
owner:
|
||||
mail: "antoni@anyscale.com"
|
||||
slack: "@team_ml"
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: modin_xgboost_app_config.yaml
|
||||
compute_template: compute_tpl.yaml
|
||||
|
@ -41,10 +37,7 @@
|
|||
]
|
||||
|
||||
- name: torch_tune_serve_test
|
||||
owner:
|
||||
mail: "matt@anyscale.com"
|
||||
slack: "@team_ml"
|
||||
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: torch_tune_serve_app_config.yaml
|
||||
compute_template: gpu_tpl.yaml
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
- name: horovod_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config_master.yaml
|
||||
compute_template: compute_tpl.yaml
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
- name: train_small
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
@ -11,6 +12,7 @@
|
|||
script: python workloads/train_small.py
|
||||
|
||||
- name: train_moderate
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
@ -21,6 +23,7 @@
|
|||
script: python workloads/train_moderate.py
|
||||
|
||||
- name: train_gpu
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config_gpu.yaml
|
||||
compute_template: tpl_gpu_small.yaml
|
||||
|
@ -31,6 +34,7 @@
|
|||
script: python workloads/train_gpu.py
|
||||
|
||||
- name: distributed_api_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
@ -43,6 +47,7 @@
|
|||
results: ""
|
||||
|
||||
- name: ft_small_non_elastic
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
@ -54,6 +59,7 @@
|
|||
results: ""
|
||||
|
||||
- name: tune_small
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
@ -64,6 +70,7 @@
|
|||
script: python workloads/tune_small.py
|
||||
|
||||
- name: tune_32x4
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
@ -74,6 +81,7 @@
|
|||
script: python workloads/tune_32x4.py
|
||||
|
||||
- name: tune_4x32
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
- name: pytorch_pbt_failure
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl.yaml
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
- name: actor_deaths
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
@ -14,6 +15,7 @@
|
|||
timeout: 3600
|
||||
|
||||
- name: apex
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: tpl_cpu_3.yaml
|
||||
|
@ -30,6 +32,7 @@
|
|||
|
||||
|
||||
- name: impala
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config_np.yaml
|
||||
compute_template: tpl_cpu_1_large.yaml
|
||||
|
@ -44,6 +47,7 @@
|
|||
timeout: 3600
|
||||
|
||||
- name: many_actor_tasks
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
@ -60,6 +64,7 @@
|
|||
|
||||
|
||||
- name: many_drivers
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
@ -76,6 +81,7 @@
|
|||
|
||||
|
||||
- name: many_ppo
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: many_ppo.yaml
|
||||
|
@ -93,6 +99,7 @@
|
|||
|
||||
|
||||
- name: many_tasks
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
@ -108,6 +115,7 @@
|
|||
timeout: 3600
|
||||
|
||||
- name: many_tasks_serialized_ids
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
@ -124,6 +132,7 @@
|
|||
|
||||
|
||||
- name: node_failures
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
@ -139,6 +148,7 @@
|
|||
timeout: 3600
|
||||
|
||||
- name: pbt
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
@ -154,6 +164,7 @@
|
|||
timeout: 3600
|
||||
|
||||
- name: serve
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
@ -169,6 +180,7 @@
|
|||
timeout: 3600
|
||||
|
||||
- name: serve_failure
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
- name: microbenchmark
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_64.yaml
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
- name: horovod_user_test_latest
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: horovod/app_config.yaml
|
||||
compute_template: horovod/compute_tpl.yaml
|
||||
|
@ -13,6 +14,7 @@
|
|||
script: python horovod/horovod_user_test.py
|
||||
|
||||
- name: horovod_user_test_master
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../horovod_tests/app_config_master.yaml
|
||||
compute_template: horovod/compute_tpl.yaml
|
||||
|
@ -26,32 +28,35 @@
|
|||
script: python horovod/horovod_user_test.py
|
||||
|
||||
|
||||
- name: train_tensorflow_mnist_test
|
||||
cluster:
|
||||
app_config: train/app_config.yaml
|
||||
compute_template: train/compute_tpl.yaml
|
||||
- name: train_tensorflow_mnist_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: train/app_config.yaml
|
||||
compute_template: train/compute_tpl.yaml
|
||||
|
||||
driver_setup: train/driver_setup.sh
|
||||
driver_setup: train/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 36000
|
||||
script: python train/train_tensorflow_mnist_test.py
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 36000
|
||||
script: python train/train_tensorflow_mnist_test.py
|
||||
|
||||
- name: train_torch_linear_test
|
||||
cluster:
|
||||
app_config: train/app_config.yaml
|
||||
compute_template: train/compute_tpl.yaml
|
||||
- name: train_torch_linear_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: train/app_config.yaml
|
||||
compute_template: train/compute_tpl.yaml
|
||||
|
||||
driver_setup: train/driver_setup.sh
|
||||
driver_setup: train/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 36000
|
||||
script: python train/train_torch_linear_test.py
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 36000
|
||||
script: python train/train_torch_linear_test.py
|
||||
|
||||
|
||||
- name: xgboost_gpu_connect_latest
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: xgboost/app_config_gpu.yaml
|
||||
compute_template: xgboost/tpl_gpu_small_scaling.yaml
|
||||
|
@ -62,6 +67,7 @@
|
|||
script: python xgboost/train_gpu_connect.py
|
||||
|
||||
- name: xgboost_gpu_connect_master
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: xgboost/app_config_gpu_master.yaml
|
||||
compute_template: xgboost/tpl_gpu_small_scaling.yaml
|
||||
|
@ -72,6 +78,7 @@
|
|||
script: python xgboost/train_gpu_connect.py
|
||||
|
||||
- name: ray_lightning_user_test_latest
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ray-lightning/app_config.yaml
|
||||
compute_template: ray-lightning/compute_tpl.yaml
|
||||
|
@ -86,6 +93,7 @@
|
|||
|
||||
|
||||
- name: ray_lightning_user_test_master
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ray-lightning/app_config_master.yaml
|
||||
compute_template: ray-lightning/compute_tpl.yaml
|
||||
|
@ -101,6 +109,7 @@
|
|||
|
||||
|
||||
- name: tune_rllib_connect_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: tune_rllib/compute_tpl.yaml
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
|
||||
# Run the test that invokes many tasks without object store usage.
|
||||
- name: chaos_many_tasks_no_object_store
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/app_config.yaml
|
||||
compute_template: chaos_test/compute_template.yaml
|
||||
|
@ -14,6 +15,7 @@
|
|||
script: python chaos_test/test_chaos_basic.py --workload=tasks
|
||||
|
||||
- name: chaos_many_actors
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/app_config.yaml
|
||||
compute_template: chaos_test/compute_template.yaml
|
||||
|
@ -24,6 +26,7 @@
|
|||
script: python chaos_test/test_chaos_basic.py --workload=actors
|
||||
|
||||
- name: chaos_dask_on_ray_large_scale_test_no_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
@ -37,6 +40,7 @@
|
|||
|
||||
# Test large scale dask on ray test with spilling.
|
||||
- name: chaos_dask_on_ray_large_scale_test_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
@ -49,10 +53,7 @@
|
|||
stable: false
|
||||
|
||||
- name: chaos_pipelined_ingestion_1500_gb_15_windows
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Chen Shen"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dataset/pipelined_ingestion_app.yaml
|
||||
compute_template: dataset/pipelined_ingestion_compute.yaml
|
||||
|
|
|
@ -1,8 +1,5 @@
|
|||
- name: inference
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Alex Wu"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: inference.yaml
|
||||
|
@ -13,10 +10,7 @@
|
|||
script: python inference.py
|
||||
|
||||
- name: shuffle_data_loader
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Chen Shen"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle_app_config.yaml
|
||||
compute_template: shuffle_compute.yaml
|
||||
|
@ -26,10 +20,7 @@
|
|||
script: python dataset_shuffle_data_loader.py
|
||||
|
||||
- name: pipelined_training_50_gb
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Chen Shen"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: pipelined_training_app.yaml
|
||||
compute_template: pipelined_training_compute.yaml
|
||||
|
@ -40,10 +31,7 @@
|
|||
script: python pipelined_training.py --epochs 1
|
||||
|
||||
- name: pipelined_ingestion_1500_gb_15_windows
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Chen Shen"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: pipelined_ingestion_app.yaml
|
||||
compute_template: pipelined_ingestion_compute.yaml
|
||||
|
@ -54,10 +42,7 @@
|
|||
script: python pipelined_training.py --epochs 2 --num-windows 15 --num-files 915 --debug
|
||||
|
||||
- name: datasets_ingest_train_infer
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Chen Shen"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: ray_sgd_training_app.yaml
|
||||
compute_template: ray_sgd_training_compute.yaml
|
||||
|
@ -80,10 +65,7 @@
|
|||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
|
||||
|
||||
- name: datasets_preprocess_ingest
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Chen Shen"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: ray_sgd_training_app.yaml
|
||||
compute_template: ray_sgd_training_compute_no_gpu.yaml
|
||||
|
@ -96,10 +78,7 @@
|
|||
stable: false
|
||||
|
||||
- name: datasets_ingest_400G
|
||||
owner:
|
||||
mail: "core@anyscale.com"
|
||||
slack: "@Chen Shen"
|
||||
|
||||
team: core
|
||||
cluster:
|
||||
app_config: ray_sgd_training_app.yaml
|
||||
compute_template: dataset_ingest_400G_compute.yaml
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
#
|
||||
# Single node shuffle
|
||||
#
|
||||
|
||||
# Test basic single node 10GB shuffle with a small number of partitions.
|
||||
# This doesn't require object spilling.
|
||||
- name: shuffle_10gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
@ -15,6 +15,7 @@
|
|||
|
||||
# Test single node 50GB shuffle with a large number of partitions.
|
||||
- name: shuffle_50gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
@ -25,6 +26,7 @@
|
|||
|
||||
# Test single node 50GB shuffle with a large number of partitions.
|
||||
- name: shuffle_50gb_large_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
@ -35,6 +37,7 @@
|
|||
|
||||
# Test non streaming shuffle in a single node with a small number of partition.
|
||||
- name: non_streaming_shuffle_50gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
@ -45,6 +48,7 @@
|
|||
|
||||
# Test non streaming shuffle in a single node with a large number of partition.
|
||||
- name: non_streaming_shuffle_50gb_large_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
@ -54,6 +58,7 @@
|
|||
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming
|
||||
|
||||
- name: dask_on_ray_10gb_sort
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
||||
|
@ -63,6 +68,7 @@
|
|||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
|
||||
|
||||
- name: dask_on_ray_100gb_sort
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
||||
|
@ -77,6 +83,7 @@
|
|||
|
||||
# Test multi nodes 100GB shuffle with a small number of partitions.
|
||||
- name: shuffle_100gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
||||
|
@ -88,6 +95,7 @@
|
|||
|
||||
# Test non streaming multi nodes 100GB shuffle with a small number of partitions.
|
||||
- name: non_streaming_shuffle_100gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
||||
|
@ -99,6 +107,7 @@
|
|||
|
||||
# Test autoscaling 1TB streaming shuffle with a large number of partitions.
|
||||
- name: autoscaling_shuffle_1tb_1000_partitions
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_autoscaling.yaml
|
||||
|
@ -109,6 +118,7 @@
|
|||
|
||||
# Test multi nodes 1TB streaming shuffle with a large number of partitions.
|
||||
- name: shuffle_1tb_1000_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
@ -120,6 +130,7 @@
|
|||
|
||||
# Test multi nodes 1TB non streaming shuffle with a large number of partitions.
|
||||
- name: non_streaming_shuffle_1tb_1000_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
@ -131,6 +142,7 @@
|
|||
|
||||
# Stress test for 1TB multi node streaming shuffle.
|
||||
- name: shuffle_1tb_5000_partitions
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
@ -142,6 +154,7 @@
|
|||
|
||||
# Stress test for 1TB multi node non-streaming shuffle.
|
||||
# - name: non_streaming_shuffle_1tb_5000_partitions
|
||||
# team: core
|
||||
# stable: False
|
||||
# cluster:
|
||||
# app_config: shuffle/shuffle_app_config.yaml
|
||||
|
@ -154,6 +167,7 @@
|
|||
|
||||
# Test large scale dask on ray test without spilling.
|
||||
- name: dask_on_ray_large_scale_test_no_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
@ -175,6 +189,7 @@
|
|||
|
||||
# Test large scale dask on ray test with spilling.
|
||||
- name: dask_on_ray_large_scale_test_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
@ -196,6 +211,7 @@
|
|||
|
||||
# Stress tests with many tasks
|
||||
- name: stress_test_many_tasks
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/stress_tests_compute.yaml
|
||||
|
@ -215,6 +231,7 @@
|
|||
|
||||
# Stress tests with dead actors
|
||||
- name: stress_test_dead_actors
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/stress_tests_compute.yaml
|
||||
|
@ -234,6 +251,7 @@
|
|||
|
||||
# Stress tests with placement groups
|
||||
- name: stress_test_placement_group
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/placement_group_tests_compute.yaml
|
||||
|
@ -244,6 +262,7 @@
|
|||
|
||||
# Stress tests with many threaded actors.
|
||||
- name: threaded_actors_stress_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/stress_test_threaded_actor_compute.yaml
|
||||
|
@ -266,6 +285,7 @@
|
|||
|
||||
# Test decision tree on autoscaling compute cluster.
|
||||
- name: decision_tree_autoscaling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: decision_tree/decision_tree_app_config.yaml
|
||||
compute_template: decision_tree/autoscaling_compute.yaml
|
||||
|
@ -276,6 +296,7 @@
|
|||
|
||||
# Test 20 concurrent decision tree runs on autoscaling compute cluster.
|
||||
- name: decision_tree_autoscaling_20_runs
|
||||
team: core
|
||||
cluster:
|
||||
app_config: decision_tree/decision_tree_app_config.yaml
|
||||
compute_template: decision_tree/autoscaling_compute.yaml
|
||||
|
@ -285,6 +306,7 @@
|
|||
|
||||
# Stress test shuffle_data_loader.
|
||||
- name: shuffle_data_loader
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
|
||||
compute_template: shuffle_data_loader/shuffle_data_loader_compute.yaml
|
||||
|
@ -307,6 +329,7 @@
|
|||
|
||||
# Stress test shuffle_data_loader.
|
||||
- name: shuffle_data_loader_4_nodes
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
|
||||
compute_template: shuffle_data_loader/shuffle_data_loader_compute_4_nodes.yaml
|
||||
|
@ -329,6 +352,7 @@
|
|||
--no-stats
|
||||
|
||||
- name: dask_on_ray_1tb_sort
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/1tb_sort_compute.yaml
|
||||
|
@ -339,6 +363,7 @@
|
|||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
|
||||
|
||||
- name: many_nodes_actor_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: many_nodes_tests/app_config.yaml
|
||||
compute_template: many_nodes_tests/compute_config.yaml
|
||||
|
@ -349,6 +374,7 @@
|
|||
script: python many_nodes_tests/actor_test.py
|
||||
|
||||
- name: pg_autoscaling_regression_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: placement_group_tests/app_config.yaml
|
||||
compute_template: placement_group_tests/compute.yaml
|
||||
|
@ -358,6 +384,7 @@
|
|||
script: python placement_group_tests/pg_run.py
|
||||
|
||||
- name: pg_long_running_performance_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: placement_group_tests/app_config.yaml
|
||||
compute_template: placement_group_tests/long_running_test_compute.yaml
|
||||
|
@ -368,6 +395,7 @@
|
|||
script: python placement_group_tests/long_running_performance_test.py --num-stages 2000
|
||||
|
||||
- name: placement_group_performance_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: placement_group_tests/app_config.yaml
|
||||
compute_template: placement_group_tests/pg_perf_test_compute.yaml
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# Heavy learning tests (Atari and HalfCheetah) for major algos.
|
||||
- name: learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_64cpus.yaml
|
||||
|
@ -14,6 +15,7 @@
|
|||
|
||||
# 2-GPU learning tests (CartPole and RepeatAfterMeEnv) for major algos.
|
||||
- name: multi_gpu_learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_96cpus.yaml
|
||||
|
@ -25,6 +27,7 @@
|
|||
# 2-GPU learning tests (StatelessCartPole) + use_lstm=True for major algos
|
||||
# (that support RNN models).
|
||||
- name: multi_gpu_with_lstm_learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_96cpus.yaml
|
||||
|
@ -36,6 +39,7 @@
|
|||
# 2-GPU learning tests (StatelessCartPole) + use_attention=True for major
|
||||
# algos (that support RNN models).
|
||||
- name: multi_gpu_with_attention_learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_96cpus.yaml
|
||||
|
@ -46,6 +50,7 @@
|
|||
|
||||
# We'll have these as per-PR tests soon.
|
||||
# - name: example_scripts_on_gpu_tests
|
||||
# team: ml
|
||||
# cluster:
|
||||
# app_config: app_config.yaml
|
||||
# compute_template: 1gpu_4cpus.yaml
|
||||
|
@ -56,6 +61,7 @@
|
|||
|
||||
# IMPALA large machine stress tests (4x Atari).
|
||||
- name: stress_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 4gpus_544_cpus.yaml
|
||||
|
@ -71,6 +77,7 @@
|
|||
|
||||
# Tests that exercise auto-scaling and Anyscale connect.
|
||||
- name: connect_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: auto_scale.yaml
|
||||
|
@ -86,6 +93,7 @@
|
|||
# Performance metrics, such as reward achieved and throughput, are then
|
||||
# collected and tracked over time.
|
||||
- name: performance_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 12gpus_192cpus.yaml
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
- name: rte_many_tasks_actors
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: rte_small.yaml
|
||||
|
@ -9,6 +10,7 @@
|
|||
script: python workloads/rte_many_tasks_actors.py
|
||||
|
||||
- name: wheel_urls
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: rte_minimal.yaml
|
||||
|
@ -19,6 +21,7 @@
|
|||
script: python workloads/wheel_urls.py
|
||||
|
||||
- name: rte_ray_client
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: rte_minimal.yaml
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
- name: single_deployment_1k_noop_replica
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl_8_cpu.yaml
|
||||
|
@ -12,6 +13,7 @@
|
|||
timeout: 600
|
||||
|
||||
- name: multi_deployment_1k_noop_replica
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl_8_cpu.yaml
|
||||
|
@ -25,6 +27,7 @@
|
|||
timeout: 600
|
||||
|
||||
- name: serve_micro_benchmark
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
# 16 CPUS
|
||||
|
@ -54,6 +57,7 @@
|
|||
timeout: 600
|
||||
|
||||
- name: serve_cluster_fault_tolerance
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
# 16 CPUS
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# Test multi-node, multi-GPU Ray SGD example.
|
||||
- name: sgd_gpu
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: sgd_gpu/sgd_gpu_app_config.yaml
|
||||
compute_template: sgd_gpu/sgd_gpu_compute.yaml
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
- name: aws_no_sync_down
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_aws_4x2.yaml
|
||||
|
@ -9,6 +10,7 @@
|
|||
script: python workloads/run_cloud_test.py no_sync_down
|
||||
|
||||
- name: aws_ssh_sync
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_aws_4x2.yaml
|
||||
|
@ -19,6 +21,7 @@
|
|||
script: python workloads/run_cloud_test.py ssh_sync
|
||||
|
||||
- name: aws_durable_upload
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_aws_4x2.yaml
|
||||
|
@ -29,6 +32,7 @@
|
|||
script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload
|
||||
|
||||
- name: aws_durable_upload_rllib_str
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config_ml.yaml
|
||||
compute_template: tpl_aws_4x2.yaml
|
||||
|
@ -39,6 +43,7 @@
|
|||
script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str
|
||||
|
||||
- name: aws_durable_upload_rllib_trainer
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config_ml.yaml
|
||||
compute_template: tpl_aws_4x2.yaml
|
||||
|
@ -49,6 +54,7 @@
|
|||
script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer
|
||||
|
||||
- name: aws_no_durable_upload
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_aws_4x2.yaml
|
||||
|
@ -59,6 +65,7 @@
|
|||
script: python workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload
|
||||
|
||||
- name: gcp_k8s_no_sync_down
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
||||
|
@ -71,6 +78,7 @@
|
|||
script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8
|
||||
|
||||
- name: gcp_k8s_ssh_sync
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
||||
|
@ -83,6 +91,7 @@
|
|||
script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8
|
||||
|
||||
- name: gcp_k8s_durable_upload
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
||||
|
@ -96,6 +105,7 @@
|
|||
|
||||
|
||||
- name: gcp_k8s_no_durable_upload
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
- name: bookkeeping_overhead
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_1x16.yaml
|
||||
|
@ -9,6 +10,7 @@
|
|||
|
||||
|
||||
- name: durable_trainable
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_16x2.yaml
|
||||
|
@ -19,6 +21,7 @@
|
|||
script: python workloads/test_durable_trainable.py --bucket data-test-ilr
|
||||
|
||||
- name: long_running_large_checkpoints
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_1x32_hd.yaml
|
||||
|
@ -34,6 +37,7 @@
|
|||
|
||||
|
||||
- name: network_overhead
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_100x2.yaml
|
||||
|
@ -54,6 +58,7 @@
|
|||
prepare: python wait_cluster.py 20 600
|
||||
|
||||
- name: result_throughput_cluster
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_16x64.yaml
|
||||
|
@ -64,6 +69,7 @@
|
|||
script: python workloads/test_result_throughput_cluster.py
|
||||
|
||||
- name: result_throughput_single_node
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_1x96.yaml
|
||||
|
@ -73,6 +79,7 @@
|
|||
script: python workloads/test_result_throughput_single_node.py
|
||||
|
||||
- name: xgboost_sweep
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config_data.yaml
|
||||
compute_template: tpl_16x64.yaml
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
- name: train_small
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
@ -11,6 +12,7 @@
|
|||
script: python workloads/train_small.py
|
||||
|
||||
- name: train_moderate
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
@ -21,6 +23,7 @@
|
|||
script: python workloads/train_moderate.py
|
||||
|
||||
- name: train_gpu
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config_gpu.yaml
|
||||
compute_template: tpl_gpu_small.yaml
|
||||
|
@ -31,6 +34,7 @@
|
|||
script: python workloads/train_gpu.py
|
||||
|
||||
- name: distributed_api_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
@ -43,6 +47,7 @@
|
|||
results: ""
|
||||
|
||||
- name: ft_small_elastic
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
@ -54,6 +59,7 @@
|
|||
results: ""
|
||||
|
||||
- name: ft_small_non_elastic
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
@ -65,6 +71,7 @@
|
|||
results: ""
|
||||
|
||||
- name: tune_small
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
@ -75,6 +82,7 @@
|
|||
script: python workloads/tune_small.py
|
||||
|
||||
- name: tune_32x4
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
@ -85,6 +93,7 @@
|
|||
script: python workloads/tune_32x4.py
|
||||
|
||||
- name: tune_4x32
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
|
Loading…
Add table
Reference in a new issue