[Test Infra] Unrevert team col (#21700)

This fixes the problems that caused the previous team column change to be reverted. It contains two additional changes: (1) the alert handlers now receive the `team` argument; the missing argument was the root cause of the breakage (https://github.com/ray-project/ray/pull/21289). (2) Previously, tests without a team column raised an exception; that condition is now weaker and only logs a warning. It will eventually be changed back to raising an exception, but for a smoother transition we log a warning instead for a short time.
2025-03-06 02:21:39 -05:00 · 2022-01-20 06:29:53 +09:00 · 2022-01-20 06:29:53 +09:00 · b1308b1c8c
commit b1308b1c8c
parent 88143cdc35
24 changed files with 188 additions and 129 deletions
--- a/benchmarks/benchmark_tests.yaml
+++ b/benchmarks/benchmark_tests.yaml
@ -1,8 +1,5 @@
 - name: single_node
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Alex Wu"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: single_node.yaml
@ -13,10 +10,7 @@
    script: python single_node/test_single_node.py

 - name: object_store
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Alex Wu"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: object_store.yaml
@ -27,10 +21,7 @@
    script: python object_store/test_object_store.py

 - name: many_actors
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Alex Wu"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml
@ -41,10 +32,7 @@
    script: python distributed/test_many_actors.py

 - name: many_actors_smoke_test
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Alex Wu"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml
@ -55,10 +43,7 @@
    script: SMOKE_TEST=1 python distributed/test_many_actors.py

 - name: many_tasks
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Alex Wu"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml
@ -69,10 +54,7 @@
    script: python distributed/test_many_tasks.py --num-tasks=10000

 - name: many_tasks_smoke_test
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Alex Wu"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml
@ -83,10 +65,7 @@
    script: python distributed/test_many_tasks.py --num-tasks=100

 - name: many_pgs
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Alex Wu"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml
@ -97,10 +76,7 @@
    script: python distributed/test_many_pgs.py

 - name: many_pgs_smoke_test
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Alex Wu"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml
@ -112,10 +88,7 @@

 # NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
 - name: many_nodes
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Alex Wu"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: many_nodes.yaml
@ -126,10 +99,7 @@
    script: python distributed/test_many_tasks.py --num-tasks=1000

 - name: many_tasks_redis_ha
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Yi Cheng"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml
@ -146,10 +116,7 @@
  stable: false

 - name: many_actors_redis_ha
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Yi Cheng"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml
@ -166,10 +133,7 @@
  stable: false

 - name: many_nodes_redis_ha
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Yi Cheng"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: many_nodes.yaml
@ -186,10 +150,7 @@
  stable: false

 - name: many_pgs_redis_ha
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Yi Cheng"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml
--- a/release/alerts/default.py
+++ b/release/alerts/default.py
@ -5,7 +5,7 @@ from typing import Dict, Optional

 def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
-                  artifacts: Dict, last_logs: str) -> Optional[str]:
+                  artifacts: Dict, last_logs: str, team: str) -> Optional[str]:

    if not status == "finished":
        return f"Test script did not finish successfully ({status})."
--- a/release/alerts/long_running_tests.py
+++ b/release/alerts/long_running_tests.py
@ -5,7 +5,7 @@ from typing import Dict, Optional

 def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
-                  artifacts: Dict, last_logs: str) -> Optional[str]:
+                  artifacts: Dict, last_logs: str, team: str) -> Optional[str]:
    assert test_suite == "long_running_tests"

    # elapsed_time = results.get("elapsed_time", 0.)
--- a/release/alerts/rllib_tests.py
+++ b/release/alerts/rllib_tests.py
@ -5,7 +5,7 @@ from typing import Dict, Optional

 def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
-                  artifacts: Dict, last_logs: str) -> Optional[str]:
+                  artifacts: Dict, last_logs: str, team: str) -> Optional[str]:
    assert test_suite == "rllib_tests"

    if not status == "finished":
--- a/release/alerts/tune_tests.py
+++ b/release/alerts/tune_tests.py
@ -5,7 +5,7 @@ from typing import Dict, Optional

 def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
-                  artifacts: Dict, last_logs: str) -> Optional[str]:
+                  artifacts: Dict, last_logs: str, team: str) -> Optional[str]:
    assert test_suite == "tune_tests"

    msg = ""
--- a/release/alerts/xgboost_tests.py
+++ b/release/alerts/xgboost_tests.py
@ -5,7 +5,7 @@ from typing import Dict, Optional

 def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
-                  artifacts: Dict, last_logs: str) -> Optional[str]:
+                  artifacts: Dict, last_logs: str, team: str) -> Optional[str]:
    assert test_suite == "xgboost_tests"

    time_taken = results.get("time_taken", float("inf"))
--- a/release/e2e.py
+++ b/release/e2e.py
@ -286,6 +286,7 @@ GLOBAL_CONFIG = {

 REPORT_S = 30
 RETRY_MULTIPLIER = 2
+VALID_TEAMS = ["ml", "core", "serve"]


 class ExitCode(enum.Enum):
@ -683,20 +684,17 @@ def maybe_get_alert_for_result(result_dict: Dict[str, Any]) -> Optional[str]:
    return alert


-def report_result(test_suite: str, test_name: str, status: str, last_logs: str,
-                  results: Dict[Any, Any], artifacts: Dict[Any, Any],
-                  category: str):
+def report_result(*, test_suite: str, test_name: str, status: str,
+                  last_logs: str, results: Dict[Any, Any],
+                  artifacts: Dict[Any, Any], category: str, team: str):
+    #   session_url: str, commit_url: str,
+    #   runtime: float, stable: bool, frequency: str, return_code: int):
+    """Report the test result to database."""
    now = datetime.datetime.utcnow()
    rds_data_client = boto3.client("rds-data", region_name="us-west-2")

    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]

-    sql = (
-        f"INSERT INTO {schema} "
-        f"(created_on, test_suite, test_name, status, last_logs, "
-        f"results, artifacts, category) "
-        f"VALUES (:created_on, :test_suite, :test_name, :status, :last_logs, "
-        f":results, :artifacts, :category)")
    parameters = [{
        "name": "created_on",
        "typeHint": "TIMESTAMP",
@ -740,7 +738,20 @@ def report_result(test_suite: str, test_name: str, status: str, last_logs: str,
        "value": {
            "stringValue": category
        }
+    }, {
+        "name": "team",
+        "value": {
+            "stringValue": team
+        }
    }]
+    columns = [param["name"] for param in parameters]
+    values = [f":{param['name']}" for param in parameters]
+    column_str = ", ".join(columns).strip(", ")
+    value_str = ", ".join(values).strip(", ")
+
+    sql = (f"INSERT INTO {schema} " f"({column_str}) " f"VALUES ({value_str})")
+
+    logger.info(f"Query: {sql}")

    # Default boto3 call timeout is 45 seconds.
    retry_delay_s = 64
@ -2177,6 +2188,18 @@ def run_test(test_config_file: str,
    driver_setup_script = test_config.get("driver_setup", None)
    if driver_setup_script:
        run_bash_script(local_dir, driver_setup_script)
+    logger.info(test_config)
+    team = test_config.get("team", "unspecified").strip(" ").lower()
+    # When running local test, this validates the team name.
+    # If the team name is not specified, they will be recorded as "unspecified"
+    if not report and team not in VALID_TEAMS:
+        logger.warning(
+            f"Incorrect team name {team} has given."
+            "Please specify team under the name field in the test config. "
+            "For example, within nightly_tests.yaml,\n"
+            "\tname: test_xxx\n"
+            f"\tteam: {'|'.join(VALID_TEAMS)}\n"
+            "\tcluster:...")

    result = run_test_config(
        local_dir,
@ -2226,7 +2249,7 @@ def run_test(test_config_file: str,
            results=result.get("results", {}),
            artifacts=result.get("artifacts", {}),
            category=category,
-        )
+            team=team)

        if not has_errored(result):
            # Check if result are met if test succeeded
@ -2254,7 +2277,7 @@ def run_test(test_config_file: str,
            except Exception as e:
                # On database error the test should still pass
                # Todo: flag somewhere else?
-                logger.error(f"Error persisting results to database: {e}")
+                logger.exception(f"Error persisting results to database: {e}")
        else:
            logger.info(f"Usually I would now report the following results:\n"
                        f"{report_kwargs}")
--- a/release/golden_notebook_tests/golden_notebook_tests.yaml
+++ b/release/golden_notebook_tests/golden_notebook_tests.yaml
@ -1,7 +1,5 @@
 - name: dask_xgboost_test
-  owner:
-    mail: "antoni@anyscale.com"
-    slack: "@team_ml"
+  team: ml
  cluster:
    app_config: dask_xgboost_app_config.yaml
    compute_template: compute_tpl.yaml
@ -20,9 +18,7 @@
      ]

 - name: modin_xgboost_test
-  owner:
-    mail: "antoni@anyscale.com"
-    slack: "@team_ml"
+  team: ml
  cluster:
    app_config: modin_xgboost_app_config.yaml
    compute_template: compute_tpl.yaml
@ -41,10 +37,7 @@
      ]

 - name: torch_tune_serve_test
-  owner:
-    mail: "matt@anyscale.com"
-    slack: "@team_ml"
-
+  team: ml
  cluster:
    app_config: torch_tune_serve_app_config.yaml
    compute_template: gpu_tpl.yaml
--- a/release/horovod_tests/horovod_tests.yaml
+++ b/release/horovod_tests/horovod_tests.yaml
@ -1,4 +1,5 @@
 - name: horovod_test
+  team: ml
  cluster:
    app_config: app_config_master.yaml
    compute_template: compute_tpl.yaml
--- a/release/lightgbm_tests/lightgbm_tests.yaml
+++ b/release/lightgbm_tests/lightgbm_tests.yaml
@ -1,4 +1,5 @@
 - name: train_small
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml
@ -11,6 +12,7 @@
    script: python workloads/train_small.py

 - name: train_moderate
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml
@ -21,6 +23,7 @@
    script: python workloads/train_moderate.py

 - name: train_gpu
+  team: ml
  cluster:
    app_config: app_config_gpu.yaml
    compute_template: tpl_gpu_small.yaml
@ -31,6 +34,7 @@
    script: python workloads/train_gpu.py

 - name: distributed_api_test
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml
@ -43,6 +47,7 @@
    results: ""

 - name: ft_small_non_elastic
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml
@ -54,6 +59,7 @@
    results: ""

 - name: tune_small
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml
@ -64,6 +70,7 @@
    script: python workloads/tune_small.py

 - name: tune_32x4
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml
@ -74,6 +81,7 @@
    script: python workloads/tune_32x4.py

 - name: tune_4x32
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml
--- a/release/long_running_distributed_tests/long_running_distributed.yaml
+++ b/release/long_running_distributed_tests/long_running_distributed.yaml
@ -1,4 +1,5 @@
 - name: pytorch_pbt_failure
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: compute_tpl.yaml
--- a/release/long_running_tests/long_running_tests.yaml
+++ b/release/long_running_tests/long_running_tests.yaml
@ -1,4 +1,5 @@
 - name: actor_deaths
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml
@ -14,6 +15,7 @@
      timeout: 3600

 - name: apex
+  team: ml
  cluster:
    app_config: ../rllib_tests/app_config.yaml
    compute_template: tpl_cpu_3.yaml
@ -30,6 +32,7 @@


 - name: impala
+  team: ml
  cluster:
    app_config: app_config_np.yaml
    compute_template: tpl_cpu_1_large.yaml
@ -44,6 +47,7 @@
      timeout: 3600

 - name: many_actor_tasks
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml
@ -60,6 +64,7 @@


 - name: many_drivers
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml
@ -76,6 +81,7 @@


 - name: many_ppo
+  team: ml
  cluster:
    app_config: ../rllib_tests/app_config.yaml
    compute_template: many_ppo.yaml
@ -93,6 +99,7 @@


 - name: many_tasks
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml
@ -108,6 +115,7 @@
      timeout: 3600

 - name: many_tasks_serialized_ids
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml
@ -124,6 +132,7 @@


 - name: node_failures
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml
@ -139,6 +148,7 @@
      timeout: 3600

 - name: pbt
+  team: ml
  cluster:
    app_config: ../rllib_tests/app_config.yaml
    compute_template: tpl_cpu_1.yaml
@ -154,6 +164,7 @@
      timeout: 3600

 - name: serve
+  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml
@ -169,6 +180,7 @@
      timeout: 3600

 - name: serve_failure
+  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_1.yaml
--- a/release/microbenchmark/microbenchmark.yaml
+++ b/release/microbenchmark/microbenchmark.yaml
@ -1,4 +1,5 @@
 - name: microbenchmark
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_64.yaml
--- a/release/ml_user_tests/ml_user_tests.yaml
+++ b/release/ml_user_tests/ml_user_tests.yaml
@ -1,4 +1,5 @@
 - name: horovod_user_test_latest
+  team: ml
  cluster:
    app_config: horovod/app_config.yaml
    compute_template: horovod/compute_tpl.yaml
@ -13,6 +14,7 @@
    script: python horovod/horovod_user_test.py

 - name: horovod_user_test_master
+  team: ml
  cluster:
    app_config: ../horovod_tests/app_config_master.yaml
    compute_template: horovod/compute_tpl.yaml
@ -26,32 +28,35 @@
    script: python horovod/horovod_user_test.py


-   name: train_tensorflow_mnist_test
-    cluster:
-        app_config: train/app_config.yaml
-        compute_template: train/compute_tpl.yaml
+- name: train_tensorflow_mnist_test
+  team: ml
+  cluster:
+      app_config: train/app_config.yaml
+      compute_template: train/compute_tpl.yaml

-    driver_setup: train/driver_setup.sh
+  driver_setup: train/driver_setup.sh

-    run:
-        use_connect: True
-        timeout: 36000
-        script: python train/train_tensorflow_mnist_test.py
+  run:
+      use_connect: True
+      timeout: 36000
+      script: python train/train_tensorflow_mnist_test.py

-   name: train_torch_linear_test
-    cluster:
-        app_config: train/app_config.yaml
-        compute_template: train/compute_tpl.yaml
+- name: train_torch_linear_test
+  team: ml
+  cluster:
+      app_config: train/app_config.yaml
+      compute_template: train/compute_tpl.yaml

-    driver_setup: train/driver_setup.sh
+  driver_setup: train/driver_setup.sh

-    run:
-        use_connect: True
-        timeout: 36000
-        script: python train/train_torch_linear_test.py
+  run:
+      use_connect: True
+      timeout: 36000
+      script: python train/train_torch_linear_test.py


 - name: xgboost_gpu_connect_latest
+  team: ml
  cluster:
    app_config: xgboost/app_config_gpu.yaml
    compute_template: xgboost/tpl_gpu_small_scaling.yaml
@ -62,6 +67,7 @@
    script: python xgboost/train_gpu_connect.py

 - name: xgboost_gpu_connect_master
+  team: ml
  cluster:
    app_config: xgboost/app_config_gpu_master.yaml
    compute_template: xgboost/tpl_gpu_small_scaling.yaml
@ -72,6 +78,7 @@
    script: python xgboost/train_gpu_connect.py

 - name: ray_lightning_user_test_latest
+  team: ml
  cluster:
    app_config: ray-lightning/app_config.yaml
    compute_template: ray-lightning/compute_tpl.yaml
@ -86,6 +93,7 @@


 - name: ray_lightning_user_test_master
+  team: ml
  cluster:
    app_config: ray-lightning/app_config_master.yaml
    compute_template: ray-lightning/compute_tpl.yaml
@ -101,6 +109,7 @@


 - name: tune_rllib_connect_test
+  team: ml
  cluster:
    app_config: ../rllib_tests/app_config.yaml
    compute_template: tune_rllib/compute_tpl.yaml
--- a/release/nightly_tests/chaos_test.yaml
+++ b/release/nightly_tests/chaos_test.yaml
@ -4,6 +4,7 @@

 # Run the test that invokes many tasks without object store usage.
 - name: chaos_many_tasks_no_object_store
+  team: core
  cluster:
    app_config: chaos_test/app_config.yaml
    compute_template: chaos_test/compute_template.yaml
@ -14,6 +15,7 @@
    script: python chaos_test/test_chaos_basic.py --workload=tasks

 - name: chaos_many_actors
+  team: core
  cluster:
    app_config: chaos_test/app_config.yaml
    compute_template: chaos_test/compute_template.yaml
@ -24,6 +26,7 @@
    script: python chaos_test/test_chaos_basic.py --workload=actors

 - name: chaos_dask_on_ray_large_scale_test_no_spilling
+  team: core
  cluster:
    app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
    compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
@ -37,6 +40,7 @@

 # Test large scale dask on ray test with spilling.
 - name: chaos_dask_on_ray_large_scale_test_spilling
+  team: core
  cluster:
    app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
    compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
@ -49,10 +53,7 @@
  stable: false

 - name: chaos_pipelined_ingestion_1500_gb_15_windows
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Chen Shen"
-
+  team: core
  cluster:
    app_config: dataset/pipelined_ingestion_app.yaml
    compute_template: dataset/pipelined_ingestion_compute.yaml
--- a/release/nightly_tests/dataset/dataset_test.yaml
+++ b/release/nightly_tests/dataset/dataset_test.yaml
@ -1,8 +1,5 @@
 - name: inference
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Alex Wu"
-
+  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: inference.yaml
@ -13,10 +10,7 @@
    script: python inference.py
  
 - name: shuffle_data_loader
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Chen Shen"
-
+  team: core
  cluster:
    app_config: shuffle_app_config.yaml
    compute_template: shuffle_compute.yaml
@ -26,10 +20,7 @@
    script: python dataset_shuffle_data_loader.py

 - name: pipelined_training_50_gb
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Chen Shen"
-
+  team: core
  cluster:
    app_config: pipelined_training_app.yaml
    compute_template: pipelined_training_compute.yaml
@ -40,10 +31,7 @@
    script: python pipelined_training.py --epochs 1

 - name: pipelined_ingestion_1500_gb_15_windows
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Chen Shen"
-
+  team: core
  cluster:
    app_config: pipelined_ingestion_app.yaml
    compute_template: pipelined_ingestion_compute.yaml
@ -54,10 +42,7 @@
    script: python pipelined_training.py --epochs 2 --num-windows 15  --num-files 915 --debug

 - name: datasets_ingest_train_infer
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Chen Shen"
-
+  team: core
  cluster:
    app_config: ray_sgd_training_app.yaml
    compute_template: ray_sgd_training_compute.yaml
@ -80,10 +65,7 @@
      script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu

 - name: datasets_preprocess_ingest
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Chen Shen"
-
+  team: core
  cluster:
    app_config: ray_sgd_training_app.yaml
    compute_template: ray_sgd_training_compute_no_gpu.yaml
@ -96,10 +78,7 @@
  stable: false

 - name: datasets_ingest_400G
-  owner:
-    mail: "core@anyscale.com"
-    slack: "@Chen Shen"
-
+  team: core
  cluster:
    app_config: ray_sgd_training_app.yaml
    compute_template: dataset_ingest_400G_compute.yaml
--- a/release/nightly_tests/nightly_tests.yaml
+++ b/release/nightly_tests/nightly_tests.yaml
@ -1,10 +1,10 @@
 #
 # Single node shuffle
 #
-
 # Test basic single node 10GB shuffle with a small number of partitions.
 # This doesn't require object spilling.
 - name: shuffle_10gb
+  team: core
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml
@ -15,6 +15,7 @@

 # Test single node 50GB shuffle with a large number of partitions.
 - name: shuffle_50gb
+  team: core
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml
@ -25,6 +26,7 @@

 # Test single node 50GB shuffle with a large number of partitions.
 - name: shuffle_50gb_large_partition
+  team: core
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml
@ -35,6 +37,7 @@

 # Test non streaming shuffle in a single node with a small number of partition.
 - name: non_streaming_shuffle_50gb
+  team: core
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml
@ -45,6 +48,7 @@

 # Test non streaming shuffle in a single node with a large number of partition.
 - name: non_streaming_shuffle_50gb_large_partition
+  team: core
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_single.yaml
@ -54,6 +58,7 @@
    script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming

 - name: dask_on_ray_10gb_sort
+  team: core
  cluster:
    app_config: dask_on_ray/dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
@ -63,6 +68,7 @@
    script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray

 - name: dask_on_ray_100gb_sort
+  team: core
  cluster:
    app_config: dask_on_ray/dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
@ -77,6 +83,7 @@

 # Test multi nodes 100GB shuffle with a small number of partitions.
 - name: shuffle_100gb
+  team: core
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_multi.yaml
@ -88,6 +95,7 @@

 # Test non streaming multi nodes 100GB shuffle with a small number of partitions.
 - name: non_streaming_shuffle_100gb
+  team: core
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_multi.yaml
@ -99,6 +107,7 @@

 # Test autoscaling 1TB streaming shuffle with a large number of partitions.
 - name: autoscaling_shuffle_1tb_1000_partitions
+  team: core
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_autoscaling.yaml
@ -109,6 +118,7 @@

 # Test multi nodes 1TB streaming shuffle with a large number of partitions.
 - name: shuffle_1tb_1000_partition
+  team: core
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_large_scale.yaml
@ -120,6 +130,7 @@

 # Test multi nodes 1TB non streaming shuffle with a large number of partitions.
 - name: non_streaming_shuffle_1tb_1000_partition
+  team: core
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_large_scale.yaml
@ -131,6 +142,7 @@

 # Stress test for 1TB multi node streaming shuffle.
 - name: shuffle_1tb_5000_partitions
+  team: core
  cluster:
    app_config: shuffle/shuffle_app_config.yaml
    compute_template: shuffle/shuffle_compute_large_scale.yaml
@ -142,6 +154,7 @@

 # Stress test for 1TB multi node non-streaming shuffle.
 # - name: non_streaming_shuffle_1tb_5000_partitions
+#   team: core
 #   stable: False
 #   cluster:
 #     app_config: shuffle/shuffle_app_config.yaml
@ -154,6 +167,7 @@

 # Test large scale dask on ray test without spilling.
 - name: dask_on_ray_large_scale_test_no_spilling
+  team: core
  cluster:
    app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
@ -175,6 +189,7 @@

 # Test large scale dask on ray test with spilling.
 - name: dask_on_ray_large_scale_test_spilling
+  team: core
  cluster:
    app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
@ -196,6 +211,7 @@

 # Stress tests with many tasks
 - name: stress_test_many_tasks
+  team: core
  cluster:
    app_config: stress_tests/stress_tests_app_config.yaml
    compute_template: stress_tests/stress_tests_compute.yaml
@ -215,6 +231,7 @@

 # Stress tests with dead actors
 - name: stress_test_dead_actors
+  team: core
  cluster:
    app_config: stress_tests/stress_tests_app_config.yaml
    compute_template: stress_tests/stress_tests_compute.yaml
@ -234,6 +251,7 @@

 # Stress tests with placement groups
 - name: stress_test_placement_group
+  team: core
  cluster:
    app_config: stress_tests/stress_tests_app_config.yaml
    compute_template: stress_tests/placement_group_tests_compute.yaml
@ -244,6 +262,7 @@

 # Stress tests with many threaded actors.
 - name: threaded_actors_stress_test
+  team: core
  cluster:
    app_config: stress_tests/stress_tests_app_config.yaml
    compute_template: stress_tests/stress_test_threaded_actor_compute.yaml
@ -266,6 +285,7 @@

 # Test decision tree on autoscaling compute cluster.
 - name: decision_tree_autoscaling
+  team: core
  cluster:
    app_config: decision_tree/decision_tree_app_config.yaml
    compute_template: decision_tree/autoscaling_compute.yaml
@ -276,6 +296,7 @@

 # Test 20 concurrent decision tree runs on autoscaling compute cluster.
 - name: decision_tree_autoscaling_20_runs
+  team: core
  cluster:
    app_config: decision_tree/decision_tree_app_config.yaml
    compute_template: decision_tree/autoscaling_compute.yaml
@ -285,6 +306,7 @@

 # Stress test shuffle_data_loader.
 - name: shuffle_data_loader
+  team: core
  cluster:
    app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
    compute_template: shuffle_data_loader/shuffle_data_loader_compute.yaml
@ -307,6 +329,7 @@

 # Stress test shuffle_data_loader.
 - name: shuffle_data_loader_4_nodes
+  team: core
  cluster:
    app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
    compute_template: shuffle_data_loader/shuffle_data_loader_compute_4_nodes.yaml
@ -329,6 +352,7 @@
      --no-stats

 - name: dask_on_ray_1tb_sort
+  team: core
  cluster:
    app_config: dask_on_ray/dask_on_ray_app_config.yaml
    compute_template: dask_on_ray/1tb_sort_compute.yaml
@ -339,6 +363,7 @@
    script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test

 - name: many_nodes_actor_test
+  team: core
  cluster:
    app_config: many_nodes_tests/app_config.yaml
    compute_template: many_nodes_tests/compute_config.yaml
@ -349,6 +374,7 @@
    script: python many_nodes_tests/actor_test.py

 - name: pg_autoscaling_regression_test
+  team: core
  cluster:
    app_config: placement_group_tests/app_config.yaml
    compute_template: placement_group_tests/compute.yaml
@ -358,6 +384,7 @@
    script: python placement_group_tests/pg_run.py

 - name: pg_long_running_performance_test
+  team: core
  cluster:
    app_config: placement_group_tests/app_config.yaml
    compute_template: placement_group_tests/long_running_test_compute.yaml
@ -368,6 +395,7 @@
    script: python placement_group_tests/long_running_performance_test.py --num-stages 2000

 - name: placement_group_performance_test
+  team: core
  cluster:
    app_config: placement_group_tests/app_config.yaml
    compute_template: placement_group_tests/pg_perf_test_compute.yaml
--- a/release/rllib_tests/rllib_tests.yaml
+++ b/release/rllib_tests/rllib_tests.yaml
@ -1,5 +1,6 @@
 # Heavy learning tests (Atari and HalfCheetah) for major algos.
 - name: learning_tests
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: 8gpus_64cpus.yaml
@ -14,6 +15,7 @@

 # 2-GPU learning tests (CartPole and RepeatAfterMeEnv) for major algos.
 - name: multi_gpu_learning_tests
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: 8gpus_96cpus.yaml
@ -25,6 +27,7 @@
 # 2-GPU learning tests (StatelessCartPole) + use_lstm=True for major algos
 # (that support RNN models).
 - name: multi_gpu_with_lstm_learning_tests
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: 8gpus_96cpus.yaml
@ -36,6 +39,7 @@
 # 2-GPU learning tests (StatelessCartPole) + use_attention=True for major
 # algos (that support RNN models).
 - name: multi_gpu_with_attention_learning_tests
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: 8gpus_96cpus.yaml
@ -46,6 +50,7 @@

 # We'll have these as per-PR tests soon.
 # - name: example_scripts_on_gpu_tests
+#   team: ml
 #  cluster:
 #    app_config: app_config.yaml
 #    compute_template: 1gpu_4cpus.yaml
@ -56,6 +61,7 @@

 # IMPALA large machine stress tests (4x Atari).
 - name: stress_tests
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: 4gpus_544_cpus.yaml
@ -71,6 +77,7 @@

 # Tests that exercise auto-scaling and Anyscale connect.
 - name: connect_tests
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: auto_scale.yaml
@ -86,6 +93,7 @@
 # Performance metrics, such as reward achieved and throughput, are then
 # collected and tracked over time.
 - name: performance_tests
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: 12gpus_192cpus.yaml
--- a/release/runtime_env_tests/runtime_env_tests.yaml
+++ b/release/runtime_env_tests/runtime_env_tests.yaml
@ -1,4 +1,5 @@
 - name: rte_many_tasks_actors
+  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: rte_small.yaml
@ -9,6 +10,7 @@
    script: python workloads/rte_many_tasks_actors.py

 - name: wheel_urls
+  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: rte_minimal.yaml
@ -19,6 +21,7 @@
    script: python workloads/wheel_urls.py

 - name: rte_ray_client
+  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: rte_minimal.yaml
--- a/release/serve_tests/serve_tests.yaml
+++ b/release/serve_tests/serve_tests.yaml
@ -1,4 +1,5 @@
 - name: single_deployment_1k_noop_replica
+  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: compute_tpl_8_cpu.yaml
@ -12,6 +13,7 @@
    timeout: 600

 - name: multi_deployment_1k_noop_replica
+  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: compute_tpl_8_cpu.yaml
@ -25,6 +27,7 @@
    timeout: 600

 - name: serve_micro_benchmark
+  team: serve
  cluster:
    app_config: app_config.yaml
    # 16 CPUS
@ -54,6 +57,7 @@
    timeout: 600

 - name: serve_cluster_fault_tolerance
+  team: serve
  cluster:
    app_config: app_config.yaml
    # 16 CPUS
--- a/release/sgd_tests/sgd_tests.yaml
+++ b/release/sgd_tests/sgd_tests.yaml
@ -1,5 +1,6 @@
 # Test multi-node, multi-GPU Ray SGD example.
 - name: sgd_gpu
+  team: ml
  cluster:
    app_config: sgd_gpu/sgd_gpu_app_config.yaml
    compute_template: sgd_gpu/sgd_gpu_compute.yaml
--- a/release/tune_tests/cloud_tests/tune_cloud_tests.yaml
+++ b/release/tune_tests/cloud_tests/tune_cloud_tests.yaml
@ -1,4 +1,5 @@
 - name: aws_no_sync_down
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml
@ -9,6 +10,7 @@
    script: python workloads/run_cloud_test.py no_sync_down

 - name: aws_ssh_sync
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml
@ -19,6 +21,7 @@
    script: python workloads/run_cloud_test.py ssh_sync

 - name: aws_durable_upload
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml
@ -29,6 +32,7 @@
    script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload

 - name: aws_durable_upload_rllib_str
+  team: ml
  cluster:
    app_config: app_config_ml.yaml
    compute_template: tpl_aws_4x2.yaml
@ -39,6 +43,7 @@
    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str

 - name: aws_durable_upload_rllib_trainer
+  team: ml
  cluster:
    app_config: app_config_ml.yaml
    compute_template: tpl_aws_4x2.yaml
@ -49,6 +54,7 @@
    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer

 - name: aws_no_durable_upload
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml
@ -59,6 +65,7 @@
    script: python workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload

 - name: gcp_k8s_no_sync_down
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
@ -71,6 +78,7 @@
    script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8

 - name: gcp_k8s_ssh_sync
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
@ -83,6 +91,7 @@
    script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8

 - name: gcp_k8s_durable_upload
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
@ -96,6 +105,7 @@


 - name: gcp_k8s_no_durable_upload
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
--- a/release/tune_tests/scalability_tests/tune_tests.yaml
+++ b/release/tune_tests/scalability_tests/tune_tests.yaml
@ -1,4 +1,5 @@
 - name: bookkeeping_overhead
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_1x16.yaml
@ -9,6 +10,7 @@


 - name: durable_trainable
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_16x2.yaml
@ -19,6 +21,7 @@
    script: python workloads/test_durable_trainable.py --bucket data-test-ilr

 - name: long_running_large_checkpoints
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_1x32_hd.yaml
@ -34,6 +37,7 @@


 - name: network_overhead
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_100x2.yaml
@ -54,6 +58,7 @@
      prepare: python wait_cluster.py 20 600

 - name: result_throughput_cluster
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_16x64.yaml
@ -64,6 +69,7 @@
    script: python workloads/test_result_throughput_cluster.py

 - name: result_throughput_single_node
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_1x96.yaml
@ -73,6 +79,7 @@
    script: python workloads/test_result_throughput_single_node.py

 - name: xgboost_sweep
+  team: ml
  cluster:
    app_config: app_config_data.yaml
    compute_template: tpl_16x64.yaml
--- a/release/xgboost_tests/xgboost_tests.yaml
+++ b/release/xgboost_tests/xgboost_tests.yaml
@ -1,4 +1,5 @@
 - name: train_small
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml
@ -11,6 +12,7 @@
    script: python workloads/train_small.py

 - name: train_moderate
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml
@ -21,6 +23,7 @@
    script: python workloads/train_moderate.py

 - name: train_gpu
+  team: ml
  cluster:
    app_config: app_config_gpu.yaml
    compute_template: tpl_gpu_small.yaml
@ -31,6 +34,7 @@
    script: python workloads/train_gpu.py

 - name: distributed_api_test
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml
@ -43,6 +47,7 @@
    results: ""

 - name: ft_small_elastic
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml
@ -54,6 +59,7 @@
    results: ""

 - name: ft_small_non_elastic
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml
@ -65,6 +71,7 @@
    results: ""

 - name: tune_small
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml
@ -75,6 +82,7 @@
    script: python workloads/tune_small.py

 - name: tune_32x4
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml
@ -85,6 +93,7 @@
    script: python workloads/tune_32x4.py

 - name: tune_4x32
+  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml