Mirror of https://github.com/vale981/ray, synced 2025-03-05 10:01:43 -05:00
[ci/release] Remove old OSS release test infrastructure (#23134)
Now that we've migrated all OSS release tests to the new infrastructure, we can remove old config files and infra scripts.
This commit is contained in: parent d93fa95dd5, commit 8608b64885
39 changed files with 0 additions and 6712 deletions
@@ -1,145 +0,0 @@
- name: single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: single_node.yaml

  run:
    timeout: 12000
    prepare: sleep 0
    script: python single_node/test_single_node.py

- name: object_store
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: object_store.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=50
    script: python object_store/test_object_store.py

- name: many_actors
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_actors.py

- name: many_actors_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: SMOKE_TEST=1 python distributed/test_many_actors.py

- name: many_tasks
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_tasks.py --num-tasks=10000

- name: many_tasks_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: python distributed/test_many_tasks.py --num-tasks=100

- name: many_pgs
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_pgs.py

- name: many_pgs_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: SMOKE_TEST=1 python distributed/test_many_pgs.py

# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
- name: many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: many_nodes.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=250
    script: python distributed/test_many_tasks.py --num-tasks=1000

- name: scheduling_test_many_0s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1

- name: scheduling_test_many_0s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1

- name: scheduling_test_many_5s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
  stable: false

- name: scheduling_test_many_5s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
  stable: false

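Each entry above names a cluster app config and compute template, an optional prepare command, and the test script; the config is consumed by the release tooling (release/e2e.py, whose diff is suppressed further down). As a rough illustration only, where the helper and the file path are assumptions rather than the actual e2e.py API, a test entry can be looked up like this:

import yaml

def load_test(config_path, test_name):
    # Release test configs are a YAML list of test definitions.
    with open(config_path) as f:
        tests = yaml.safe_load(f)
    return next(t for t in tests if t["name"] == test_name)

test = load_test("benchmark_tests.yaml", "many_actors")  # path/name chosen for illustration
prepare_cmd = test["run"].get("prepare")  # "python distributed/wait_cluster.py --num-nodes=65"
script_cmd = test["run"]["script"]        # "python distributed/test_many_actors.py"
timeout_s = test["run"]["timeout"]        # 3600
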
@@ -1,24 +0,0 @@
import click
import ray
import time


def num_alive_nodes():
    n = 0
    for node in ray.nodes():
        if node["Alive"]:
            n += 1
    return n


@click.command()
@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
def wait_cluster(num_nodes: int):
    ray.init(address="auto")
    while num_alive_nodes() != num_nodes:
        print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}")
        time.sleep(5)


if __name__ == "__main__":
    wait_cluster()

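This helper backs the prepare: entries in the YAML above, e.g. `python distributed/wait_cluster.py --num-nodes=65`: it connects to the running cluster and polls every five seconds until exactly the requested number of nodes reports as alive, so the benchmark script only starts once the cluster is fully formed.
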
@@ -1,680 +0,0 @@
import copy
import logging
import os
import re
import sys

import yaml

# If you update or reorganize the periodic tests, please ensure the
# relevant portions of the Ray release instructions (go/release-ray)
# (in particular, running periodic tests and collecting release logs)
# are up to date. If you need access, please contact @zhe-thoughts.

# Env variables:

# RAY_REPO            Repo to use for finding the wheel
# RAY_BRANCH          Branch to find the wheel
# RAY_VERSION         Version to find the wheel
# RAY_WHEELS          Direct Ray wheel URL
# RAY_TEST_REPO       Repo to use for test scripts
# RAY_TEST_BRANCH     Branch for test scripts
# FILTER_FILE         File filter
# FILTER_TEST         Test name filter
# RELEASE_TEST_SUITE  Release test suite (e.g. manual, nightly)


class ReleaseTest:
    def __init__(
        self,
        name: str,
        smoke_test: bool = False,
        retry: int = 0,
    ):
        self.name = name
        self.smoke_test = smoke_test
        self.retry = retry

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def __contains__(self, item):
        return self.name.__contains__(item)

    def __iter__(self):
        return iter(self.name)

    def __len__(self):
        return len(self.name)


class SmokeTest(ReleaseTest):
    def __init__(self, name: str, retry: int = 0):
        super(SmokeTest, self).__init__(name=name, smoke_test=True, retry=retry)


CORE_NIGHTLY_TESTS = {
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "shuffle_10gb",
    #     "shuffle_50gb",
    #     "shuffle_50gb_large_partition",
    #     "shuffle_100gb",
    #     "non_streaming_shuffle_100gb",
    #     "non_streaming_shuffle_50gb_large_partition",
    #     "non_streaming_shuffle_50gb",
    #     SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
    #     SmokeTest("dask_on_ray_large_scale_test_spilling"),
    #     "stress_test_placement_group",
    #     "shuffle_1tb_1000_partition",
    #     "non_streaming_shuffle_1tb_1000_partition",
    #     "shuffle_1tb_5000_partitions",
    #     TODO(sang): It doesn't even work without spilling
    #     as it hits the scalability limit.
    #     "non_streaming_shuffle_1tb_5000_partitions",
    #     "decision_tree_autoscaling",
    #     "decision_tree_autoscaling_20_runs",
    #     "autoscaling_shuffle_1tb_1000_partitions",
    #     SmokeTest("stress_test_many_tasks"),
    #     SmokeTest("stress_test_dead_actors"),
    #     SmokeTest("threaded_actors_stress_test"),
    #     "pg_long_running_performance_test",
    # ],
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "single_node",
    #     "object_store",
    #     "many_actors_smoke_test",
    #     "many_tasks_smoke_test",
    #     "many_pgs_smoke_test",
    # ],
    # "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [
    #     "inference",
    #     "shuffle_data_loader",
    #     "parquet_metadata_resolution",
    #     "pipelined_training_50_gb",
    #     "pipelined_ingestion_1500_gb",
    #     "datasets_preprocess_ingest",
    #     "datasets_ingest_400G",
    #     SmokeTest("datasets_ingest_train_infer"),
    # ],
    # "~/ray/release/nightly_tests/chaos_test.yaml": [
    #     "chaos_many_actors",
    #     "chaos_many_tasks_no_object_store",
    #     "chaos_pipelined_ingestion_1500_gb_15_windows",
    # ],
    # "~/ray/release/microbenchmark/microbenchmark.yaml": [
    #     "microbenchmark",
    # ],
}

SERVE_NIGHTLY_TESTS = {
    # "~/ray/release/long_running_tests/long_running_tests.yaml": [
    #     SmokeTest("serve"),
    #     SmokeTest("serve_failure"),
    # ],
    # "~/ray/release/serve_tests/serve_tests.yaml": [
    #     "single_deployment_1k_noop_replica",
    #     "multi_deployment_1k_noop_replica",
    #     "autoscaling_single_deployment",
    #     "autoscaling_multi_deployment",
    #     "serve_micro_benchmark",
    #     # TODO(architkulkarni) Reenable after K8s migration. Currently failing
    #     # "serve_micro_benchmark_k8s",
    #     "serve_cluster_fault_tolerance",
    # ],
}

CORE_DAILY_TESTS = {
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "k8s_dask_on_ray_large_scale_test_no_spilling",
    #     "dask_on_ray_large_scale_test_no_spilling",
    #     "dask_on_ray_large_scale_test_spilling",
    #     "pg_autoscaling_regression_test",
    #     "threaded_actors_stress_test",
    #     "k8s_threaded_actors_stress_test",
    #     "stress_test_many_tasks",
    #     "stress_test_dead_actors",
    # ],
    # "~/ray/release/nightly_tests/chaos_test.yaml": [
    #     "chaos_dask_on_ray_large_scale_test_no_spilling",
    #     "chaos_dask_on_ray_large_scale_test_spilling",
    # ],
}

CORE_SCALABILITY_TESTS_DAILY = {
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "many_actors",
    #     "many_tasks",
    #     "many_pgs",
    #     "many_nodes",
    # ],
}

CORE_SCHEDULING_DAILY = {
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "scheduling_test_many_0s_tasks_single_node",
    #     "scheduling_test_many_0s_tasks_many_nodes",
    #     # Reenable these two once we got right setup
    #     # "scheduling_test_many_5s_tasks_single_node",
    #     # "scheduling_test_many_5s_tasks_many_nodes",
    # ],
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "many_nodes_actor_test",
    #     "dask_on_ray_10gb_sort",
    #     "dask_on_ray_100gb_sort",
    #     "dask_on_ray_1tb_sort",
    #     "placement_group_performance_test",
    # ],
}

NIGHTLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     SmokeTest("horovod_test"),
    # ],  # Should we enable this?
    # "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
    #     "dask_xgboost_test",
    #     "modin_xgboost_test",
    #     "torch_tune_serve_test",
    # ],
    # "~/ray/release/long_running_tests/long_running_tests.yaml": [
    #     SmokeTest("actor_deaths"),
    #     SmokeTest("apex"),
    #     SmokeTest("impala"),
    #     SmokeTest("many_actor_tasks"),
    #     SmokeTest("many_drivers"),
    #     SmokeTest("many_ppo"),
    #     SmokeTest("many_tasks"),
    #     SmokeTest("many_tasks_serialized_ids"),
    #     SmokeTest("node_failures"),
    #     SmokeTest("pbt"),
    #     # SmokeTest("serve"),
    #     # SmokeTest("serve_failure"),
    #     # Full long running tests (1 day runtime)
    #     "actor_deaths",
    #     "apex",
    #     "impala",
    #     "many_actor_tasks",
    #     "many_drivers",
    #     "many_ppo",
    #     "many_tasks",
    #     "many_tasks_serialized_ids",
    #     "node_failures",
    #     "pbt",
    #     "serve",
    #     "serve_failure",
    # ],
    # "~/ray/release/sgd_tests/sgd_tests.yaml": [
    #     "sgd_gpu",
    # ],
    # "~/ray/release/tune_tests/cloud_tests/tune_cloud_tests.yaml": [
    #     "aws_no_sync_down",
    #     "aws_ssh_sync",
    #     "aws_durable_upload",
    #     "aws_durable_upload_rllib_str",
    #     "aws_durable_upload_rllib_trainer",
    #     "gcp_k8s_durable_upload",
    # ],
    # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
    #     "bookkeeping_overhead",
    #     "durable_trainable",
    #     SmokeTest("long_running_large_checkpoints"),
    #     SmokeTest("network_overhead"),
    #     "result_throughput_cluster",
    #     "result_throughput_single_node",
    # ],
    # "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
    #     "train_small",
    #     "train_moderate",
    #     "train_gpu",
    #     "tune_small",
    #     "tune_4x32",
    #     "tune_32x4",
    #     "ft_small_elastic",
    #     "ft_small_non_elastic",
    #     "distributed_api_test",
    # ],
    # "~/ray/release/rllib_tests/rllib_tests.yaml": [
    #     SmokeTest("learning_tests"),
    #     SmokeTest("stress_tests"),
    #     "performance_tests",
    #     "multi_gpu_learning_tests",
    #     "multi_gpu_with_lstm_learning_tests",
    #     "multi_gpu_with_attention_learning_tests",
    #     # We'll have these as per-PR tests soon.
    #     # "example_scripts_on_gpu_tests",
    # ],
    # "~/ray/release/runtime_env_tests/runtime_env_tests.yaml": [
    #     "rte_many_tasks_actors",
    #     "wheel_urls",
    #     "rte_ray_client",
    # ],
}

WEEKLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     "horovod_test",
    # ],
    "~/ray/release/long_running_distributed_tests"
    # "/long_running_distributed.yaml": [
    #     "pytorch_pbt_failure",
    # ],
    # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
    #     "network_overhead",
    #     "long_running_large_checkpoints",
    #     "xgboost_sweep",
    # ],
    # "~/ray/release/rllib_tests/rllib_tests.yaml": [
    #     "learning_tests",
    #     "stress_tests",
    # ],
}

# This test suite holds "user" tests to test important user workflows
# in a particular environment.
# All workloads in this test suite should:
#   1. Be run in a distributed (multi-node) fashion
#   2. Use autoscaling/scale up (no wait_cluster.py)
#   3. Use GPUs if applicable
#   4. Have the `use_connect` flag set.
USER_TESTS = {
    # "~/ray/release/ml_user_tests/ml_user_tests.yaml": [
    #     "train_tensorflow_mnist_test",
    #     "train_torch_linear_test",
    #     "ray_lightning_user_test_latest",
    #     "ray_lightning_user_test_master",
    #     "horovod_user_test_latest",
    #     "horovod_user_test_master",
    #     "xgboost_gpu_connect_latest",
    #     "xgboost_gpu_connect_master",
    #     "tune_rllib_connect_test",
    # ]
}

SUITES = {
    "core-nightly": CORE_NIGHTLY_TESTS,
    "serve-nightly": SERVE_NIGHTLY_TESTS,
    "core-daily": CORE_DAILY_TESTS,
    "core-scalability": CORE_SCALABILITY_TESTS_DAILY,
    "nightly": {**NIGHTLY_TESTS, **USER_TESTS},
    "core-scheduling-daily": CORE_SCHEDULING_DAILY,
    "weekly": WEEKLY_TESTS,
}

DEFAULT_STEP_TEMPLATE = {
    "env": {
        "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
        "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
        "RELEASE_AWS_BUCKET": "ray-release-automation-results",
        "RELEASE_AWS_LOCATION": "dev",
        "RELEASE_AWS_DB_NAME": "ray_ci",
        "RELEASE_AWS_DB_TABLE": "release_test_result",
        "AWS_REGION": "us-west-2",
    },
    "agents": {"queue": "runner_queue_branch"},
    "plugins": [
        {
            "docker#v3.9.0": {
                "image": "rayproject/ray",
                "propagate-environment": True,
                "volumes": [
                    "/tmp/ray_release_test_artifacts:" "/tmp/ray_release_test_artifacts"
                ],
            }
        }
    ],
    "artifact_paths": ["/tmp/ray_release_test_artifacts/**/*"],
}


def ask_configuration():
    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
    RAY_VERSION = os.environ.get("RAY_VERSION", "")
    RAY_WHEELS = os.environ.get("RAY_WHEELS", "")

    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    RELEASE_TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    input_ask_step = {
        "input": "Input required: Please specify tests to run",
        "fields": [
            {
                "text": (
                    "RAY_REPO: Please specify the Ray repository used "
                    "to find the wheel."
                ),
                "hint": (
                    "Repository from which to fetch the latest "
                    "commits to find the Ray wheels. Usually you don't "
                    "need to change this."
                ),
                "default": RAY_REPO,
                "key": "ray_repo",
            },
            {
                "text": (
                    "RAY_BRANCH: Please specify the Ray branch used "
                    "to find the wheel."
                ),
                "hint": "For releases, this will be e.g. `releases/1.x.0`",
                "default": RAY_BRANCH,
                "key": "ray_branch",
            },
            {
                "text": (
                    "RAY_VERSION: Please specify the Ray version used "
                    "to find the wheel."
                ),
                "hint": (
                    "Leave empty for latest master. For releases, "
                    "specify the release version."
                ),
                "required": False,
                "default": RAY_VERSION,
                "key": "ray_version",
            },
            {
                "text": "RAY_WHEELS: Please specify the Ray wheel URL.",
                "hint": (
                    "ATTENTION: If you provide this, RAY_REPO, "
                    "RAY_BRANCH and RAY_VERSION will be ignored! "
                    "Please also make sure to provide the wheels URL "
                    "for Python 3.7 on Linux.\n"
                    "You can also insert a commit hash here instead "
                    "of a full URL.\n"
                    "NOTE: You can specify multiple commits or URLs "
                    "for easy bisection (one per line) - this will "
                    "run each test on each of the specified wheels."
                ),
                "required": False,
                "default": RAY_WHEELS,
                "key": "ray_wheels",
            },
            {
                "text": (
                    "RAY_TEST_REPO: Please specify the Ray repository "
                    "used to find the tests you would like to run."
                ),
                "hint": (
                    "If you're developing a new release test, this "
                    "will most likely be your GitHub fork."
                ),
                "default": RAY_TEST_REPO,
                "key": "ray_test_repo",
            },
            {
                "text": (
                    "RAY_TEST_BRANCH: Please specify the Ray branch used "
                    "to find the tests you would like to run."
                ),
                "hint": (
                    "If you're developing a new release test, this "
                    "will most likely be a branch living on your "
                    "GitHub fork."
                ),
                "default": RAY_TEST_BRANCH,
                "key": "ray_test_branch",
            },
            {
                "select": (
                    "RELEASE_TEST_SUITE: Please specify the release "
                    "test suite containing the tests you would like "
                    "to run."
                ),
                "hint": (
                    "Check in the `build_pipeline.py` if you're "
                    "unsure which suite contains your tests."
                ),
                "required": True,
                "options": sorted(SUITES.keys()),
                "default": RELEASE_TEST_SUITE,
                "key": "release_test_suite",
            },
            {
                "text": (
                    "FILTER_FILE: Please specify a filter for the "
                    "test files that should be included in this build."
                ),
                "hint": (
                    "Only test files (e.g. xgboost_tests.yml) that "
                    "match this string will be included in the test"
                ),
                "default": FILTER_FILE,
                "required": False,
                "key": "filter_file",
            },
            {
                "text": (
                    "FILTER_TEST: Please specify a filter for the "
                    "test names that should be included in this build."
                ),
                "hint": (
                    "Only test names (e.g. tune_4x32) that match "
                    "this string will be included in the test"
                ),
                "default": FILTER_TEST,
                "required": False,
                "key": "filter_test",
            },
        ],
        "key": "input_ask_step",
    }

    run_again_step = {
        "commands": [
            f'export {v}=$(buildkite-agent meta-data get "{k}")'
            for k, v in {
                "ray_branch": "RAY_BRANCH",
                "ray_repo": "RAY_REPO",
                "ray_version": "RAY_VERSION",
                "ray_wheels": "RAY_WHEELS",
                "ray_test_branch": "RAY_TEST_BRANCH",
                "ray_test_repo": "RAY_TEST_REPO",
                "release_test_suite": "RELEASE_TEST_SUITE",
                "filter_file": "FILTER_FILE",
                "filter_test": "FILTER_TEST",
            }.items()
        ]
        + [
            "export AUTOMATIC=1",
            "python3 -m pip install --user pyyaml",
            "rm -rf ~/ray || true",
            "git clone -b $${RAY_TEST_BRANCH} $${RAY_TEST_REPO} ~/ray",
            (
                "python3 ~/ray/release/.buildkite/build_pipeline.py "
                "| buildkite-agent pipeline upload"
            ),
        ],
        "label": ":pipeline: Again",
        "agents": {"queue": "runner_queue_branch"},
        "depends_on": "input_ask_step",
        "key": "run_again_step",
    }

    return [
        input_ask_step,
        run_again_step,
    ]


def create_test_step(
    ray_repo: str,
    ray_branch: str,
    ray_version: str,
    ray_wheels: str,
    ray_test_repo: str,
    ray_test_branch: str,
    test_file: str,
    test_name: ReleaseTest,
):
    custom_commit_str = "custom_wheels_url"
    if ray_wheels:
        # Extract commit from url
        p = re.compile(r"([a-f0-9]{40})")
        m = p.search(ray_wheels)
        if m is not None:
            custom_commit_str = m.group(1)

    ray_wheels_str = f" ({ray_wheels}) " if ray_wheels else ""

    logging.info(f"Creating step for {test_file}/{test_name}{ray_wheels_str}")

    cmd = (
        f"./release/run_e2e.sh "
        f'--ray-repo "{ray_repo}" '
        f'--ray-branch "{ray_branch}" '
        f'--ray-version "{ray_version}" '
        f'--ray-wheels "{ray_wheels}" '
        f'--ray-test-repo "{ray_test_repo}" '
        f'--ray-test-branch "{ray_test_branch}" '
    )

    args = (
        f"--category {ray_branch} "
        f"--test-config {test_file} "
        f"--test-name {test_name} "
        f"--keep-results-dir"
    )

    if test_name.smoke_test:
        logging.info("This test will run as a smoke test.")
        args += " --smoke-test"

    step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

    if test_name.retry:
        logging.info(f"This test will be retried up to " f"{test_name.retry} times.")
        step_conf["retry"] = {
            "automatic": [{"exit_status": "*", "limit": test_name.retry}]
        }
    else:
        # Default retry logic
        # Warning: Exit codes are currently not correctly propagated to
        # buildkite! Thus, actual retry logic is currently implemented in
        # the run_e2e.sh script!
        step_conf["retry"] = {
            "automatic": [
                {"exit_status": 7, "limit": 2},  # Prepare timeout
                {"exit_status": 9, "limit": 2},  # Session timeout
                {"exit_status": 10, "limit": 2},  # Prepare error
            ],
        }

    step_conf["command"] = cmd + args

    step_conf["label"] = (
        f"{test_name} "
        f"({custom_commit_str if ray_wheels_str else ray_branch}) - "
        f"{ray_test_branch}/{ray_test_repo}"
    )
    return step_conf


def build_pipeline(steps):
    all_steps = []

    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
    RAY_VERSION = os.environ.get("RAY_VERSION", "")
    RAY_WHEELS = os.environ.get("RAY_WHEELS", "")

    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    ray_wheels_list = [""]
    if RAY_WHEELS:
        ray_wheels_list = RAY_WHEELS.split("\n")

    if len(ray_wheels_list) > 1:
        logging.info(
            f"This will run a bisection on the following URLs/commits: "
            f"{ray_wheels_list}"
        )

    logging.info(
        f"Building pipeline \n"
        f"Ray repo/branch to test:\n"
        f" RAY_REPO = {RAY_REPO}\n"
        f" RAY_BRANCH = {RAY_BRANCH}\n\n"
        f" RAY_VERSION = {RAY_VERSION}\n\n"
        f" RAY_WHEELS = {RAY_WHEELS}\n\n"
        f"Ray repo/branch containing the test configurations and scripts:"
        f" RAY_TEST_REPO = {RAY_TEST_REPO}\n"
        f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"
        f"Filtering for these tests:\n"
        f" FILTER_FILE = {FILTER_FILE}\n"
        f" FILTER_TEST = {FILTER_TEST}\n\n"
    )

    for test_file, test_names in steps.items():
        if FILTER_FILE and FILTER_FILE not in test_file:
            continue

        test_base = os.path.basename(test_file)
        for test_name in test_names:
            if FILTER_TEST and FILTER_TEST not in test_name:
                continue

            if not isinstance(test_name, ReleaseTest):
                test_name = ReleaseTest(name=test_name)

            logging.info(f"Adding test: {test_base}/{test_name}")

            for ray_wheels in ray_wheels_list:
                step_conf = create_test_step(
                    ray_repo=RAY_REPO,
                    ray_branch=RAY_BRANCH,
                    ray_version=RAY_VERSION,
                    ray_wheels=ray_wheels,
                    ray_test_repo=RAY_TEST_REPO,
                    ray_test_branch=RAY_TEST_BRANCH,
                    test_file=test_file,
                    test_name=test_name,
                )

                all_steps.append(step_conf)

    return all_steps


def alert_pipeline(stats: bool = False):
    step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

    cmd = "python release/alert.py"
    if stats:
        cmd += " --stats"

    step_conf["commands"] = [
        "pip install -q -r release/requirements.txt",
        "pip install -U boto3 botocore",
        cmd,
    ]
    step_conf["label"] = f"Send periodic alert (stats_only = {stats})"
    return [step_conf]


if __name__ == "__main__":
    alert = os.environ.get("RELEASE_ALERT", "0")

    ask_for_config = not bool(int(os.environ.get("AUTOMATIC", "0")))

    if alert in ["1", "stats"]:
        steps = alert_pipeline(alert == "stats")
    elif ask_for_config:
        steps = ask_configuration()
    else:
        TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
        PIPELINE_SPEC = SUITES[TEST_SUITE]

        steps = build_pipeline(PIPELINE_SPEC)

    yaml.dump({"steps": steps}, sys.stdout)

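For orientation, this is roughly the Buildkite step that build_pipeline() emits for one non-smoke test when no custom wheels are given. The test name, config path, and branch are illustrative values only, and the remaining keys are copied verbatim from DEFAULT_STEP_TEMPLATE above:

# Illustrative output of create_test_step() (values are examples, not real output):
example_step = {
    "command": (
        './release/run_e2e.sh --ray-repo "https://github.com/ray-project/ray.git" '
        '--ray-branch "master" --ray-version "" --ray-wheels "" '
        '--ray-test-repo "https://github.com/ray-project/ray.git" '
        '--ray-test-branch "master" '
        "--category master --test-config ~/ray/benchmarks/benchmark_tests.yaml "
        "--test-name many_actors --keep-results-dir"
    ),
    "label": "many_actors (master) - master/https://github.com/ray-project/ray.git",
    "retry": {
        "automatic": [
            {"exit_status": 7, "limit": 2},   # Prepare timeout
            {"exit_status": 9, "limit": 2},   # Session timeout
            {"exit_status": 10, "limit": 2},  # Prepare error
        ]
    },
    # ...plus the "env", "agents", "plugins" and "artifact_paths" keys
    # taken from DEFAULT_STEP_TEMPLATE.
}
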
release/alert.py (441 deletions)
@@ -1,441 +0,0 @@
import argparse
from collections import defaultdict, Counter
from typing import Any, List, Tuple, Mapping, Optional
import datetime
import hashlib
import json
import logging
import os
import requests
import sys

import boto3

from e2e import GLOBAL_CONFIG

from alerts.default import handle_result as default_handle_result
from alerts.rllib_tests import handle_result as rllib_tests_handle_result
from alerts.long_running_tests import handle_result as long_running_tests_handle_result
from alerts.tune_tests import handle_result as tune_tests_handle_result
from alerts.xgboost_tests import handle_result as xgboost_tests_handle_result

SUITE_TO_FN = {
    "long_running_tests": long_running_tests_handle_result,
    "rllib_tests": rllib_tests_handle_result,
    "tune_tests": tune_tests_handle_result,
    "xgboost_tests": xgboost_tests_handle_result,
}

GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"] = "alert_state"
GLOBAL_CONFIG["SLACK_WEBHOOK"] = os.environ.get("SLACK_WEBHOOK", "")
GLOBAL_CONFIG["SLACK_CHANNEL"] = os.environ.get("SLACK_CHANNEL", "#oss-test-cop")

RESULTS_LIMIT = 120

logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter(
    fmt="[%(levelname)s %(asctime)s] " "%(filename)s: %(lineno)d " "%(message)s"
)
handler.setFormatter(formatter)
logger.addHandler(handler)


def maybe_fetch_slack_webhook():
    if GLOBAL_CONFIG["SLACK_WEBHOOK"] in [None, ""]:
        print("Missing SLACK_WEBHOOK, retrieving from AWS secrets store")
        GLOBAL_CONFIG["SLACK_WEBHOOK"] = boto3.client(
            "secretsmanager", region_name="us-west-2"
        ).get_secret_value(
            SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:"
            "release-automation/"
            "slack-webhook-Na0CFP"
        )[
            "SecretString"
        ]


def _obj_hash(obj: Any) -> str:
    json_str = json.dumps(obj, sort_keys=True, ensure_ascii=True)
    sha = hashlib.sha256()
    sha.update(json_str.encode())
    return sha.hexdigest()


def fetch_latest_alerts(rds_data_client):
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]

    sql = f"""
        SELECT DISTINCT ON (category, test_suite, test_name)
            category, test_suite, test_name, last_result_hash,
            last_notification_dt
        FROM {schema}
        ORDER BY category, test_suite, test_name, last_notification_dt DESC
        LIMIT {RESULTS_LIMIT}
        """

    result = rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
    )
    for row in result["records"]:
        category, test_suite, test_name, last_result_hash, last_notification_dt = (
            r["stringValue"] if "stringValue" in r else None for r in row
        )
        last_notification_dt = datetime.datetime.strptime(
            last_notification_dt, "%Y-%m-%d %H:%M:%S"
        )
        yield category, test_suite, test_name, last_result_hash, last_notification_dt


def fetch_latest_results(
    rds_data_client, fetch_since: Optional[datetime.datetime] = None
):
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]

    sql = f"""
        SELECT DISTINCT ON (category, test_suite, test_name)
            created_on, category, test_suite, test_name, status, results,
            artifacts, last_logs
        FROM {schema} """

    parameters = []
    if fetch_since is not None:
        sql += "WHERE created_on >= :created_on "
        parameters = [
            {
                "name": "created_on",
                "typeHint": "TIMESTAMP",
                "value": {"stringValue": fetch_since.strftime("%Y-%m-%d %H:%M:%S")},
            },
        ]

    sql += "ORDER BY category, test_suite, test_name, created_on DESC "
    sql += f"LIMIT {RESULTS_LIMIT}"

    result = rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
        parameters=parameters,
    )
    for row in result["records"]:
        (
            created_on,
            category,
            test_suite,
            test_name,
            status,
            results,
            artifacts,
            last_logs,
        ) = (r["stringValue"] if "stringValue" in r else None for r in row)

        # Calculate hash before converting strings to objects
        result_obj = (
            created_on,
            category,
            test_suite,
            test_name,
            status,
            results,
            artifacts,
            last_logs,
        )
        result_json = json.dumps(result_obj)
        result_hash = _obj_hash(result_json)

        # Convert some strings to python objects
        created_on = datetime.datetime.strptime(created_on, "%Y-%m-%d %H:%M:%S")
        results = json.loads(results)
        artifacts = json.loads(artifacts)

        yield result_hash, created_on, category, test_suite, test_name, status, results, artifacts, last_logs  # noqa: E501


def mark_as_handled(
    rds_data_client,
    update: bool,
    category: str,
    test_suite: str,
    test_name: str,
    result_hash: str,
    last_notification_dt: datetime.datetime,
):
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]

    if not update:
        sql = f"""
            INSERT INTO {schema}
            (category, test_suite, test_name,
            last_result_hash, last_notification_dt)
            VALUES (:category, :test_suite, :test_name,
            :last_result_hash, :last_notification_dt)
            """
    else:
        sql = f"""
            UPDATE {schema}
            SET last_result_hash=:last_result_hash,
            last_notification_dt=:last_notification_dt
            WHERE category=:category AND test_suite=:test_suite
            AND test_name=:test_name
            """

    rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        parameters=[
            {"name": "category", "value": {"stringValue": category}},
            {"name": "test_suite", "value": {"stringValue": test_suite or ""}},
            {"name": "test_name", "value": {"stringValue": test_name}},
            {"name": "last_result_hash", "value": {"stringValue": result_hash}},
            {
                "name": "last_notification_dt",
                "typeHint": "TIMESTAMP",
                "value": {
                    "stringValue": last_notification_dt.strftime("%Y-%m-%d %H:%M:%S")
                },
            },
        ],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
    )


def post_alerts_to_slack(
    channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
):
    if len(alerts) == 0:
        logger.info("No alerts to post to slack.")
        return

    markdown_lines = [
        f"* {len(alerts)} new release test failures found!*",
        "",
    ]

    category_alerts = defaultdict(list)
    for (category, test_suite, test_name, alert) in alerts:
        category_alerts[category].append(
            f" *{test_suite}/{test_name}* failed: {alert}"
        )

    for category, alert_list in category_alerts.items():
        markdown_lines.append(f"Branch: *{category}*")
        markdown_lines.extend(alert_list)
        markdown_lines.append("")

    total_non_alerts = sum(n for n in non_alerts.values())
    non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]

    markdown_lines += [
        f"Additionally, {total_non_alerts} tests passed successfully "
        f"({', '.join(non_alert_detail)})."
    ]

    slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]

    resp = requests.post(
        slack_url,
        json={
            "text": "\n".join(markdown_lines),
            "channel": channel,
            "username": "Fail Bot",
            "icon_emoji": ":red_circle:",
        },
    )
    print(resp.status_code)
    print(resp.text)


def post_statistics_to_slack(
    channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
):
    total_alerts = len(alerts)

    category_alerts = defaultdict(list)
    for (category, test_suite, test_name, alert) in alerts:
        category_alerts[category].append(f"`{test_suite}/{test_name}`")

    alert_detail = [f"{len(a)} on {c}" for c, a in category_alerts.items()]

    total_non_alerts = sum(n for n in non_alerts.values())
    non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]

    markdown_lines = [
        "*Periodic release test report*",
        "",
        f"In the past 24 hours, "
        f"*{total_non_alerts}* release tests finished successfully, and "
        f"*{total_alerts}* release tests failed.",
    ]

    markdown_lines.append("")

    if total_alerts:
        markdown_lines.append(f"*Failing:* {', '.join(alert_detail)}")
        for c, a in category_alerts.items():
            markdown_lines.append(f" *{c}*: {', '.join(sorted(a))}")
    else:
        markdown_lines.append("*Failing:* None")

    markdown_lines.append("")

    if total_non_alerts:
        markdown_lines.append(f"*Passing:* {', '.join(non_alert_detail)}")
    else:
        markdown_lines.append("*Passing:* None")

    slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]

    resp = requests.post(
        slack_url,
        json={
            "text": "\n".join(markdown_lines),
            "channel": channel,
            "username": "Fail Bot",
            "icon_emoji": ":red_circle:",
        },
    )
    print(resp.status_code)
    print(resp.text)


def handle_results_and_get_alerts(
    rds_data_client,
    fetch_since: Optional[datetime.datetime] = None,
    always_try_alert: bool = False,
    no_status_update: bool = False,
):
    # First build a map of last notifications
    last_notifications_map = {}
    for (
        category,
        test_suite,
        test_name,
        last_result_hash,
        last_notification_dt,
    ) in fetch_latest_alerts(rds_data_client):
        last_notifications_map[(category, test_suite, test_name)] = (
            last_result_hash,
            last_notification_dt,
        )

    alerts = []
    non_alerts = Counter()

    # Then fetch latest results
    for (
        result_hash,
        created_on,
        category,
        test_suite,
        test_name,
        status,
        results,
        artifacts,
        last_logs,
    ) in fetch_latest_results(rds_data_client, fetch_since=fetch_since):
        key = (category, test_suite, test_name)

        try_alert = always_try_alert
        if key in last_notifications_map:
            # If we have an alert for this key, fetch info
            last_result_hash, last_notification_dt = last_notifications_map[key]

            if last_result_hash != result_hash:
                # If we got a new result, handle new result
                try_alert = True
            # Todo: maybe alert again after some time?
        else:
            try_alert = True

        if try_alert:
            handle_fn = SUITE_TO_FN.get(test_suite, None)
            if not handle_fn:
                logger.warning(f"No handle for suite {test_suite}")
                alert = default_handle_result(
                    created_on,
                    category,
                    test_suite,
                    test_name,
                    status,
                    results,
                    artifacts,
                    last_logs,
                )
            else:
                alert = handle_fn(
                    created_on,
                    category,
                    test_suite,
                    test_name,
                    status,
                    results,
                    artifacts,
                    last_logs,
                )

            if alert:
                logger.warning(
                    f"Alert raised for test {test_suite}/{test_name} "
                    f"({category}): {alert}"
                )

                alerts.append((category, test_suite, test_name, alert))
            else:
                logger.debug(
                    f"No alert raised for test {test_suite}/{test_name} "
                    f"({category})"
                )
                non_alerts[category] += 1

            if not no_status_update:
                mark_as_handled(
                    rds_data_client,
                    key in last_notifications_map,
                    category,
                    test_suite,
                    test_name,
                    result_hash,
                    datetime.datetime.now(),
                )

    return alerts, non_alerts


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--stats",
        action="store_true",
        default=False,
        help="Finish quickly for training.",
    )
    args = parser.parse_args()

    maybe_fetch_slack_webhook()

    rds_data_client = boto3.client("rds-data", region_name="us-west-2")

    if args.stats:
        # Only update last 24 hour stats
        fetch_since = datetime.datetime.now() - datetime.timedelta(days=1)
        alerts, non_alerts = handle_results_and_get_alerts(
            rds_data_client,
            fetch_since=fetch_since,
            always_try_alert=True,
            no_status_update=True,
        )
        post_statistics_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)

    else:
        alerts, non_alerts = handle_results_and_get_alerts(rds_data_client)
        post_alerts_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)

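The per-suite handlers imported at the top (alerts.default, alerts.rllib_tests, and so on) are not part of this diff. From how they are invoked above, each exposes a handle_result function with the signature sketched below, returning an alert message or None; the body here is a made-up placeholder, not the real handler logic:

from typing import Optional

def handle_result(
    created_on,   # datetime.datetime of the run
    category,     # branch/category string, e.g. "master"
    test_suite,   # e.g. "xgboost_tests"
    test_name,    # e.g. "train_small"
    status,       # status string stored with the result
    results,      # dict decoded from the results JSON
    artifacts,    # dict decoded from the artifacts JSON
    last_logs,    # tail of the run's logs
) -> Optional[str]:
    # Placeholder logic: alert on anything that is not a success status.
    if status != "finished":  # the exact status values are an assumption
        return f"Test run ended with status {status!r}."
    return None
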
@@ -1,145 +0,0 @@
- name: single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: single_node.yaml

  run:
    timeout: 12000
    prepare: sleep 0
    script: python single_node/test_single_node.py

- name: object_store
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: object_store.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=50
    script: python object_store/test_object_store.py

- name: many_actors
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_actors.py

- name: many_actors_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: SMOKE_TEST=1 python distributed/test_many_actors.py

- name: many_tasks
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_tasks.py --num-tasks=10000

- name: many_tasks_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: python distributed/test_many_tasks.py --num-tasks=100

- name: many_pgs
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_pgs.py

- name: many_pgs_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: SMOKE_TEST=1 python distributed/test_many_pgs.py

# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
- name: many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: many_nodes.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=250
    script: python distributed/test_many_tasks.py --num-tasks=1000

- name: scheduling_test_many_0s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1

- name: scheduling_test_many_0s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1

- name: scheduling_test_many_5s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
  stable: false

- name: scheduling_test_many_5s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
  stable: false

@@ -1,24 +0,0 @@
import click
import ray
import time


def num_alive_nodes():
    n = 0
    for node in ray.nodes():
        if node["Alive"]:
            n += 1
    return n


@click.command()
@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
def wait_cluster(num_nodes: int):
    ray.init(address="auto")
    while num_alive_nodes() != num_nodes:
        print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}")
        time.sleep(5)


if __name__ == "__main__":
    wait_cluster()

@@ -1,54 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s

while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)

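Unlike the click-based wait_cluster.py earlier in this diff, which takes a --num-nodes option and waits indefinitely, this variant takes positional num_nodes and max_time_s arguments (invoked from the prepare: entries as, for example, `python wait_cluster.py 3 600`) and raises a RuntimeError if the cluster has not reached the requested size before the deadline.
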
@@ -1,214 +0,0 @@
<!doctype html>
<html>
<head>
    <meta charset="utf-8">
    <title>Releaser config generator</title>
    <style type="text/css">
        html {
            background: #cccccc;
        }
        body {
            background: #ffffff;
            font-family: sans-serif;
            padding: 1em 2em;
            max-width: 800px;
            margin: 0 auto;
        }
        textarea {
            width: 600px;
            height: 200px;
        }
        form .use {
            white-space: nowrap;
            padding-right: 1em;
        }
        form .val {
            min-width: 300px;
        }
        form .val input {
            width: 90%;
        }
        form .desc {
        }
    </style>
    <script type="text/javascript">
        var env_vars = [
            {
                "name": "RAY_TEST_REPO",
                "short": "Git repo with test files",
                "long": "Repository in which the test files are which you would like to run. Note that this doesn't have to be the same repo from which the wheels are installed.",
                "default": "https://github.com/ray-project/ray.git",
                "enabled": false,
            },
            {
                "name": "RAY_TEST_BRANCH",
                "short": "Git branch for test repo",
                "long": "Git branch that is checked out from RAY_TEST_REPO and which contains the test files you would like to run. Note that this doesn't have to be the same branch you're fetching the Ray wheels from.",
                "default": "master",
                "enabled": false,
            },
            {
                "name": "RAY_REPO",
                "short": "Git repo for the Ray wheels",
                "long": "Repository from which to fetch the latest commits to find the Ray wheels",
                "default": "https://github.com/ray-project/ray.git",
                "enabled": false,
            },
            {
                "name": "RAY_BRANCH",
                "short": "Git branch for the Ray wheels",
                "long": "Branch that is checked out from RAY_REPO from which the latest commits are fetched to find the Ray wheels",
                "default": "master",
                "enabled": true,
            },
            {
                "name": "RELEASE_TEST_SUITE",
                "short": "Release test suite (nightly/weekly/manual)",
                "long": "Release test suite as defined in releaser's build_pipeline.py",
                "default": "nightly",
                "enabled": true,
            },
            {
                "name": "FILTER_FILE",
                "short": "Filter test file by this string",
                "long": "Only test files (e.g. xgboost_tests.yml) that match this string will be included in the test",
                "default": "",
                "enabled": false,
            },
            {
                "name": "FILTER_TEST",
                "short": "Filter test name by this string",
                "long": "Only test names (e.g. tune_4x32) that match this string will be included in the test",
                "default": "",
                "enabled": false,
            },
        ]

        window.addEventListener('load', function () {

            var table = document.getElementById("gen_table");

            for (var env_var of env_vars) {

                var use_td = document.createElement("td");
                use_td.setAttribute("class", "use");

                var use_input = document.createElement("input");
                use_input.setAttribute("type", "checkbox");
                use_input.setAttribute("data-activate", env_var["name"] + "_val");
                use_input.setAttribute("id", env_var["name"] + "_use");
                use_input.setAttribute("class", "input_use");
                if (env_var["enabled"]) {
                    use_input.checked = true;
                }

                var use_label = document.createElement("label");
                use_label.setAttribute("for", env_var["name"] + "_use");
                use_label.innerHTML = env_var["name"];

                use_td.append(use_input);
                use_td.append(use_label);

                val_td = document.createElement("td");
                val_td.setAttribute("class", "val");

                val_input = document.createElement("input");
                val_input.setAttribute("type", "text");
                if (!env_var["enabled"]) {
                    val_input.setAttribute("disabled", "disabled");
                }
                val_input.setAttribute("id", env_var["name"] + "_val");
                val_input.setAttribute("name", env_var["name"]);
                val_input.setAttribute("value", env_var["default"]);
                val_input.setAttribute("class", "input_val");

                val_td.append(val_input);

                use_input.addEventListener("click", function(e) {
                    var toggle_val = document.getElementById(e.target.getAttribute("data-activate"))

                    if (toggle_val.disabled) {
                        toggle_val.removeAttribute("disabled");
                    } else {
                        toggle_val.setAttribute("disabled", "disabled");
                    }
                    generate_snippet();
                });

                val_input.addEventListener("change", function() { generate_snippet(); });
                val_input.addEventListener("keydown", function() { generate_snippet(); });
                val_input.addEventListener("keyup", function() { generate_snippet(); });

                var desc_td = document.createElement("td");
                desc_td.setAttribute("class", "desc");

                var desc_a = document.createElement("a");
                desc_a.setAttribute("title", env_var["long"]);
                desc_a.innerHTML = env_var["short"];

                desc_td.append(desc_a);

                var tr = document.createElement("tr");
                tr.append(use_td);
                tr.append(val_td);
                tr.append(desc_td);

                table.append(tr);
            }

            var button = document.getElementById("generate");
            button.addEventListener("click", function() {
                generate_snippet();
            })

            generate_snippet()
        })

        function generate_snippet() {
            full_snippet = ""
            for (env_var of env_vars) {
                var val_input = document.getElementById(env_var["name"] + "_val")

                if (!val_input.disabled) {
                    full_snippet += env_var["name"] + "=\"" + val_input.value + "\"\n"
                }
            }

            document.getElementById("snippet").innerHTML = full_snippet;
        }

    </script>
</head>
<body>
<header class="header">
    <h1>Releaser config generator</h1>
    <p>Use this form to generate a list of environment variables.</p>
    <p>These variables can be passed to Buildkite to run a subset of release tests
        and choose the correct wheels/release test branch</p>
</header>
<section class="main">
    <form id="gen">
        <table id="gen_table">
            <tr>
                <th>Set</th>
                <th>Value</th>
                <th>Description</th>
            </tr>

        </table>

    </form>

    <div>
        <button id="generate">Generate snippet</button>
    </div>

    <div>
        <textarea id="snippet">

        </textarea>
    </div>
</section>
</body>
</html>

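With the defaults above (only RAY_BRANCH and RELEASE_TEST_SUITE are enabled initially), generate_snippet() produces the following text, which is then set as Buildkite environment variables for build_pipeline.py:

RAY_BRANCH="master"
RELEASE_TEST_SUITE="nightly"
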
release/e2e.py (2585 deletions)
File diff suppressed because it is too large.
@@ -1,15 +0,0 @@
- name: horovod_test
  team: ml
  cluster:
    app_config: app_config_master.yaml
    compute_template: compute_tpl.yaml

  run:
    timeout: 36000
    prepare: python wait_cluster.py 3 600
    script: python workloads/horovod_tune_test.py
    long_running: True

  smoke_test:
    run:
      timeout: 1800

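The smoke_test block presumably overrides the matching fields of the base definition when a test is run with --smoke-test, here shrinking the 36000-second timeout to 1800 seconds. The merge itself happened in release/e2e.py, whose diff is suppressed above, so the following is only a sketch of the assumed behavior:

def apply_smoke_test(test):
    # Assumed semantics: each section under "smoke_test" shallow-overrides
    # the corresponding section of the base test definition.
    merged = dict(test)
    for section, overrides in test.get("smoke_test", {}).items():
        base = dict(merged.get(section, {}))
        base.update(overrides)
        merged[section] = base
    return merged
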
@@ -1,53 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)

|
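This helper blocks until the requested number of nodes (head included) has registered with Ray, or raises once max_time_s is exceeded. The prepare entries in the test configs in this diff invoke it as shown below; the optional flag is the script's own argument, and the node count and timeout are just values used elsewhere in this diff:

# Typical prepare step: wait for 3 nodes (including the head),
# giving up after 600 seconds.
python wait_cluster.py 3 600

# Print progress every 30 seconds instead of the default 10.
python wait_cluster.py 3 600 --feedback_interval_s 30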
@ -1,92 +0,0 @@
|
|||
- name: train_small
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/train_small.py
|
||||
|
||||
- name: train_moderate
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 32 600
|
||||
script: python workloads/train_moderate.py
|
||||
|
||||
- name: train_gpu
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config_gpu.yaml
|
||||
compute_template: tpl_gpu_small.yaml
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 5 600
|
||||
script: python workloads/train_gpu.py
|
||||
|
||||
- name: distributed_api_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
results:
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/distributed_api_test.py
|
||||
results: ""
|
||||
|
||||
- name: ft_small_non_elastic
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
||||
run:
|
||||
timeout: 900
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/ft_small_non_elastic.py
|
||||
results: ""
|
||||
|
||||
- name: tune_small
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/tune_small.py
|
||||
|
||||
- name: tune_32x4
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
||||
run:
|
||||
timeout: 900
|
||||
prepare: python wait_cluster.py 32 600
|
||||
script: python workloads/tune_32x4.py
|
||||
|
||||
- name: tune_4x32
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
||||
run:
|
||||
timeout: 900
|
||||
prepare: python wait_cluster.py 32 600
|
||||
script: python workloads/tune_4x32.py
|
|
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@@ -1,13 +0,0 @@
- name: pytorch_pbt_failure
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: compute_tpl.yaml

  run:
    timeout: 86400
    script: python workloads/pytorch_pbt_failure.py
    long_running: True

  smoke_test:
    timeout: 3600
@ -1,196 +0,0 @@
|
|||
- name: actor_deaths
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/actor_deaths.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: apex
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: tpl_cpu_3.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: python wait_cluster.py 3 600
|
||||
script: python workloads/apex.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
|
||||
- name: impala
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config_np.yaml
|
||||
compute_template: tpl_cpu_1_large.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
script: python workloads/impala.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: many_actor_tasks
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/many_actor_tasks.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
|
||||
- name: many_drivers
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/many_drivers.py --iteration-num=4000
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
|
||||
- name: many_ppo
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: many_ppo.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: python wait_cluster.py 1 600
|
||||
script: python workloads/many_ppo.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: many_tasks
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/many_tasks.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: many_tasks_serialized_ids
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/many_tasks_serialized_ids.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
|
||||
- name: node_failures
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/node_failures.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: pbt
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/pbt.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: serve
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/serve.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: serve_failure
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/serve_failure.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 600
|
||||
|
||||
stable: False
|
|
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@@ -1,9 +0,0 @@
# - name: microbenchmark
#   team: core
#   cluster:
#     app_config: app_config.yaml
#     compute_template: tpl_64.yaml

#   run:
#     timeout: 1800
#     script: OMP_NUM_THREADS=64 RAY_ADDRESS= python run_microbenchmark.py
@ -1,124 +0,0 @@
|
|||
- name: horovod_user_test_latest
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: horovod/app_config.yaml
|
||||
compute_template: horovod/compute_tpl.yaml
|
||||
|
||||
|
||||
driver_setup: horovod/driver_setup_latest.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python horovod/horovod_user_test.py
|
||||
|
||||
- name: horovod_user_test_master
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../horovod_tests/app_config_master.yaml
|
||||
compute_template: horovod/compute_tpl.yaml
|
||||
|
||||
driver_setup: horovod/driver_setup_master.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python horovod/horovod_user_test.py
|
||||
|
||||
|
||||
- name: train_tensorflow_mnist_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: train/app_config.yaml
|
||||
compute_template: train/compute_tpl.yaml
|
||||
|
||||
driver_setup: train/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 36000
|
||||
script: python train/train_tensorflow_mnist_test.py
|
||||
|
||||
- name: train_torch_linear_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: train/app_config.yaml
|
||||
compute_template: train/compute_tpl.yaml
|
||||
|
||||
driver_setup: train/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 36000
|
||||
script: python train/train_torch_linear_test.py
|
||||
|
||||
|
||||
- name: xgboost_gpu_connect_latest
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: xgboost/app_config_gpu.yaml
|
||||
compute_template: xgboost/tpl_gpu_small_scaling.yaml
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 1200
|
||||
script: python xgboost/train_gpu_connect.py
|
||||
|
||||
- name: xgboost_gpu_connect_master
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: xgboost/app_config_gpu_master.yaml
|
||||
compute_template: xgboost/tpl_gpu_small_scaling.yaml
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 1200
|
||||
script: python xgboost/train_gpu_connect.py
|
||||
|
||||
- name: ray_lightning_user_test_latest
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ray-lightning/app_config.yaml
|
||||
compute_template: ray-lightning/compute_tpl.yaml
|
||||
|
||||
driver_setup: ray-lightning/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python ray-lightning/ray_lightning_user_test.py
|
||||
|
||||
|
||||
- name: ray_lightning_user_test_master
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ray-lightning/app_config_master.yaml
|
||||
compute_template: ray-lightning/compute_tpl.yaml
|
||||
|
||||
|
||||
driver_setup: ray-lightning/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python ray-lightning/ray_lightning_user_test.py
|
||||
|
||||
|
||||
- name: tune_rllib_connect_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: tune_rllib/compute_tpl.yaml
|
||||
|
||||
|
||||
driver_setup: tune_rllib/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python tune_rllib/run_connect_tests.py
|
|
@ -1,64 +0,0 @@
|
|||
#
|
||||
# Chaos tests.
|
||||
#
|
||||
|
||||
# Run the test that invokes many tasks without object store usage.
|
||||
- name: chaos_many_tasks_no_object_store
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/app_config.yaml
|
||||
compute_template: chaos_test/compute_template.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
|
||||
script: python chaos_test/test_chaos_basic.py --workload=tasks
|
||||
|
||||
- name: chaos_many_actors
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/app_config.yaml
|
||||
compute_template: chaos_test/compute_template.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
|
||||
script: python chaos_test/test_chaos_basic.py --workload=actors
|
||||
|
||||
- name: chaos_dask_on_ray_large_scale_test_no_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
# Total run time without failures is about 300-400s.
|
||||
prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
|
||||
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
# Test large scale dask on ray test with spilling.
|
||||
- name: chaos_dask_on_ray_large_scale_test_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
# Total run time without failures is about 300-400s.
|
||||
prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
|
||||
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
- name: chaos_pipelined_ingestion_1500_gb_15_windows
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dataset/pipelined_ingestion_app.yaml
|
||||
compute_template: dataset/pipelined_ingestion_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 21 2400; python setup_chaos.py --node-kill-interval 300
|
||||
script: python dataset/pipelined_training.py --epochs 1 --num-windows 15 --num-files 915 --debug
|
||||
|
||||
stable: false
|
|
@ -1,95 +0,0 @@
|
|||
- name: inference
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: inference.yaml
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 2 600
|
||||
script: python inference.py
|
||||
|
||||
- name: shuffle_data_loader
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle_app_config.yaml
|
||||
compute_template: shuffle_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 1800
|
||||
script: python dataset_shuffle_data_loader.py
|
||||
|
||||
- name: parquet_metadata_resolution
|
||||
team: core
|
||||
cluster:
|
||||
app_config: pipelined_training_app.yaml
|
||||
compute_template: pipelined_training_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 1200
|
||||
prepare: python wait_cluster.py 15 1200
|
||||
script: python parquet_metadata_resolution.py --num-files 915
|
||||
|
||||
- name: pipelined_training_50_gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: pipelined_training_app.yaml
|
||||
compute_template: pipelined_training_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 4800
|
||||
prepare: python wait_cluster.py 15 1200
|
||||
script: python pipelined_training.py --epochs 1
|
||||
|
||||
- name: pipelined_ingestion_1500_gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: pipelined_ingestion_app.yaml
|
||||
compute_template: pipelined_ingestion_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 9600
|
||||
prepare: python wait_cluster.py 21 2400
|
||||
script: python pipelined_training.py --epochs 2 --num-windows 2 --num-files 915 --debug
|
||||
|
||||
- name: datasets_ingest_train_infer
|
||||
team: core
|
||||
cluster:
|
||||
app_config: ray_sgd_training_app.yaml
|
||||
compute_template: ray_sgd_training_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 14400
|
||||
prepare: python wait_cluster.py 66 2400
|
||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset
|
||||
|
||||
smoke_test:
|
||||
cluster:
|
||||
app_config: ray_sgd_training_app.yaml
|
||||
compute_template: ray_sgd_training_smoke_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 8 2400
|
||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
|
||||
|
||||
- name: datasets_preprocess_ingest
|
||||
team: core
|
||||
cluster:
|
||||
app_config: ray_sgd_training_app.yaml
|
||||
compute_template: ray_sgd_training_compute_no_gpu.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 21 2400
|
||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset --debug
|
||||
|
||||
- name: datasets_ingest_400G
|
||||
team: core
|
||||
cluster:
|
||||
app_config: ray_sgd_training_app.yaml
|
||||
compute_template: dataset_ingest_400G_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1
|
|
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@ -1,390 +0,0 @@
|
|||
#
|
||||
# Single node shuffle
|
||||
#
|
||||
# Test basic single node 10GB shuffle with a small number of partitions.
|
||||
# This doesn't require object spilling.
|
||||
# - name: shuffle_10gb
|
||||
# team: core
|
||||
# cluster:
|
||||
# app_config: shuffle/shuffle_app_config.yaml
|
||||
# compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
||||
# run:
|
||||
# timeout: 3000
|
||||
# script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6
|
||||
|
||||
# Test single node 50GB shuffle with a large number of partitions.
|
||||
- name: shuffle_50gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9
|
||||
|
||||
# Test single node 50GB shuffle with a large number of partitions.
|
||||
- name: shuffle_50gb_large_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6
|
||||
|
||||
# Test non streaming shuffle in a single node with a small number of partition.
|
||||
- name: non_streaming_shuffle_50gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming
|
||||
|
||||
# Test non streaming shuffle in a single node with a large number of partition.
|
||||
- name: non_streaming_shuffle_50gb_large_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming
|
||||
|
||||
- name: dask_on_ray_10gb_sort
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
|
||||
|
||||
- name: dask_on_ray_100gb_sort
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
|
||||
|
||||
#
|
||||
# Multi node shuffle
|
||||
#
|
||||
|
||||
# Test multi nodes 100GB shuffle with a small number of partitions.
|
||||
- name: shuffle_100gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
|
||||
|
||||
# Test non streaming multi nodes 100GB shuffle with a small number of partitions.
|
||||
- name: non_streaming_shuffle_100gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
|
||||
|
||||
# Test autoscaling 1TB streaming shuffle with a large number of partitions.
|
||||
- name: autoscaling_shuffle_1tb_1000_partitions
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_autoscaling.yaml
|
||||
|
||||
run:
|
||||
timeout: 4000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
|
||||
|
||||
# Test multi nodes 1TB streaming shuffle with a large number of partitions.
|
||||
- name: shuffle_1tb_1000_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 20 900
|
||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
|
||||
|
||||
# Test multi nodes 1TB non streaming shuffle with a large number of partitions.
|
||||
- name: non_streaming_shuffle_1tb_1000_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 20 900
|
||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
|
||||
|
||||
# Stress test for 1TB multi node streaming shuffle.
|
||||
- name: shuffle_1tb_5000_partitions
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
||||
run:
|
||||
timeout: 9000
|
||||
prepare: python wait_cluster.py 20 900
|
||||
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
|
||||
|
||||
# Stress test for 1TB multi node non-streaming shuffle.
|
||||
# - name: non_streaming_shuffle_1tb_5000_partitions
|
||||
# team: core
|
||||
# stable: False
|
||||
# cluster:
|
||||
# app_config: shuffle/shuffle_app_config.yaml
|
||||
# compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
||||
# run:
|
||||
# timeout: 7200
|
||||
# prepare: python wait_cluster.py 20 900
|
||||
# script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming
|
||||
|
||||
- name: k8s_dask_on_ray_large_scale_test_no_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute_k8s.yaml
|
||||
compute_on_k8s: True
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 21 600
|
||||
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
||||
stable: false
|
||||
|
||||
# # Test large scale dask on ray test without spilling.
|
||||
# - name: dask_on_ray_large_scale_test_no_spilling
|
||||
# team: core
|
||||
# cluster:
|
||||
# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
# compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
||||
# run:
|
||||
# timeout: 7200
|
||||
# prepare: python wait_cluster.py 21 600
|
||||
# script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
# smoke_test:
|
||||
# cluster:
|
||||
# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
# compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
|
||||
|
||||
# run:
|
||||
# timeout: 7200
|
||||
# prepare: python wait_cluster.py 5 600
|
||||
# script: python dask_on_ray/large_scale_test.py --num_workers 4 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
# Test large scale dask on ray test with spilling.
|
||||
- name: dask_on_ray_large_scale_test_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 21 600
|
||||
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
smoke_test:
|
||||
cluster:
|
||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 5 600
|
||||
script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
# Stress tests with many tasks
|
||||
- name: stress_test_many_tasks
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/stress_tests_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python stress_tests/test_many_tasks.py
|
||||
|
||||
smoke_test:
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/smoke_test_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
|
||||
|
||||
# Stress tests with dead actors
|
||||
- name: stress_test_dead_actors
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/stress_tests_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python stress_tests/test_dead_actors.py
|
||||
|
||||
smoke_test:
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/smoke_test_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3 --num-children=3
|
||||
|
||||
# Stress tests with placement groups
|
||||
- name: stress_test_placement_group
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/placement_group_tests_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python stress_tests/test_placement_group.py
|
||||
|
||||
# Stress tests with many threaded actors.
|
||||
- name: threaded_actors_stress_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/stress_test_threaded_actor_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 201 600
|
||||
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
|
||||
|
||||
smoke_test:
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/smoke_test_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 5 600
|
||||
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
|
||||
stable: false
|
||||
|
||||
- name: k8s_threaded_actors_stress_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/k8s_stress_test_threaded_actor_compute.yaml
|
||||
compute_on_k8s: True
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 201 600
|
||||
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 5 600
|
||||
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
|
||||
stable: false
|
||||
|
||||
# Test decision tree on autoscaling compute cluster.
|
||||
- name: decision_tree_autoscaling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: decision_tree/decision_tree_app_config.yaml
|
||||
compute_template: decision_tree/autoscaling_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python decision_tree/cart_with_tree.py
|
||||
|
||||
# Test 20 concurrent decision tree runs on autoscaling compute cluster.
|
||||
- name: decision_tree_autoscaling_20_runs
|
||||
team: core
|
||||
cluster:
|
||||
app_config: decision_tree/decision_tree_app_config.yaml
|
||||
compute_template: decision_tree/autoscaling_compute.yaml
|
||||
run:
|
||||
timeout: 9600
|
||||
script: python decision_tree/cart_with_tree.py --concurrency=20
|
||||
|
||||
- name: dask_on_ray_1tb_sort
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/1tb_sort_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 32 1000
|
||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
|
||||
|
||||
- name: many_nodes_actor_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: many_nodes_tests/app_config.yaml
|
||||
compute_template: many_nodes_tests/compute_config.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 251 5400
|
||||
script: python many_nodes_tests/actor_test.py
|
||||
|
||||
- name: pg_autoscaling_regression_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: placement_group_tests/app_config.yaml
|
||||
compute_template: placement_group_tests/compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 1200
|
||||
script: python placement_group_tests/pg_run.py
|
||||
|
||||
- name: pg_long_running_performance_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: placement_group_tests/app_config.yaml
|
||||
compute_template: placement_group_tests/long_running_test_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 2 600
|
||||
script: python placement_group_tests/long_running_performance_test.py --num-stages 2000
|
||||
|
||||
- name: placement_group_performance_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: placement_group_tests/app_config.yaml
|
||||
compute_template: placement_group_tests/pg_perf_test_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 1200
|
||||
prepare: python wait_cluster.py 5 600
|
||||
script: python placement_group_tests/placement_group_performance_test.py
|
|
@ -1,54 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@ -1,103 +0,0 @@
|
|||
# Heavy learning tests (Atari and HalfCheetah) for major algos.
|
||||
- name: learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_64cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 14400
|
||||
script: python learning_tests/run.py
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 1200
|
||||
|
||||
# 2-GPU learning tests (CartPole and RepeatAfterMeEnv) for major algos.
|
||||
- name: multi_gpu_learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_96cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python multi_gpu_learning_tests/run.py
|
||||
|
||||
# 2-GPU learning tests (StatelessCartPole) + use_lstm=True for major algos
|
||||
# (that support RNN models).
|
||||
- name: multi_gpu_with_lstm_learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_96cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python multi_gpu_with_lstm_learning_tests/run.py
|
||||
|
||||
# 2-GPU learning tests (StatelessCartPole) + use_attention=True for major
|
||||
# algos (that support RNN models).
|
||||
- name: multi_gpu_with_attention_learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_96cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python multi_gpu_with_attention_learning_tests/run.py
|
||||
|
||||
# We'll have these as per-PR tests soon.
|
||||
# - name: example_scripts_on_gpu_tests
|
||||
# team: ml
|
||||
# cluster:
|
||||
# app_config: app_config.yaml
|
||||
# compute_template: 1gpu_4cpus.yaml
|
||||
|
||||
# run:
|
||||
# timeout: 7200
|
||||
# script: bash unit_gpu_tests/run.sh
|
||||
|
||||
# IMPALA large machine stress tests (4x Atari).
|
||||
- name: stress_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 4gpus_544_cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 5400
|
||||
prepare: python wait_cluster.py 6 600
|
||||
script: python stress_tests/run_stress_tests.py
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 2000
|
||||
|
||||
# Tests that exercise auto-scaling and Anyscale connect.
|
||||
- name: connect_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: auto_scale.yaml
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 3000
|
||||
script: python connect_tests/run_connect_tests.py
|
||||
|
||||
# Nightly performance regression for popular algorithms.
|
||||
# These algorithms run nightly for pre-determined amount of time without
|
||||
# passing criteria.
|
||||
# Performance metrics, such as reward achieved and throughput, are then
|
||||
# collected and tracked over time.
|
||||
- name: performance_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 12gpus_192cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 10800
|
||||
script: python performance_tests/run.py
|
|
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@ -1,176 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
cd "${0%/*}" || exit 1
|
||||
|
||||
reason() {
|
||||
# Keep in sync with e2e.py ExitCode enum
|
||||
case $1 in
|
||||
0)
|
||||
REASON="success"
|
||||
;;
|
||||
2)
|
||||
REASON="unspecified"
|
||||
;;
|
||||
3)
|
||||
REASON="unknown"
|
||||
;;
|
||||
4)
|
||||
REASON="runtime error"
|
||||
;;
|
||||
5)
|
||||
REASON="command error"
|
||||
;;
|
||||
6)
|
||||
REASON="command timeout"
|
||||
;;
|
||||
7)
|
||||
REASON="prepare timeout"
|
||||
;;
|
||||
8)
|
||||
REASON="filesync timeout"
|
||||
;;
|
||||
9)
|
||||
REASON="session timeout"
|
||||
;;
|
||||
10)
|
||||
REASON="prepare error"
|
||||
;;
|
||||
11)
|
||||
REASON="app config build error"
|
||||
;;
|
||||
12)
|
||||
REASON="infra error"
|
||||
;;
|
||||
*)
|
||||
REASON="untracked error"
|
||||
;;
|
||||
esac
|
||||
echo "${REASON}"
|
||||
}
|
||||
|
||||
while [[ $# -gt 0 ]]
|
||||
do
|
||||
key="$1"
|
||||
case $key in
|
||||
--ray-repo)
|
||||
shift
|
||||
RAY_REPO=$1
|
||||
;;
|
||||
--ray-branch)
|
||||
shift
|
||||
RAY_BRANCH=$1
|
||||
;;
|
||||
--ray-version)
|
||||
shift
|
||||
RAY_VERSION=$1
|
||||
;;
|
||||
--ray-wheels)
|
||||
shift
|
||||
RAY_WHEELS=$1
|
||||
;;
|
||||
--ray-test-repo)
|
||||
shift
|
||||
RAY_TEST_REPO=$1
|
||||
;;
|
||||
--ray-test-branch)
|
||||
shift
|
||||
RAY_TEST_BRANCH=$1
|
||||
;;
|
||||
--release-results-dir)
|
||||
shift
|
||||
RELEASE_RESULTS_DIR=$1
|
||||
;;
|
||||
*)
|
||||
break
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
|
||||
RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
|
||||
RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}
|
||||
|
||||
export RAY_REPO RAY_BRANCH RAY_VERSION RAY_WHEELS RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR
|
||||
|
||||
pip uninstall -q -y ray
|
||||
pip install -q -r requirements.txt
|
||||
pip install -q -U boto3 botocore
|
||||
git clone -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" ~/ray
|
||||
|
||||
RETRY_NUM=0
|
||||
MAX_RETRIES=${MAX_RETRIES-3}
|
||||
|
||||
if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
|
||||
echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
|
||||
MAX_RETRIES=1
|
||||
fi
|
||||
|
||||
ALL_EXIT_CODES=()
|
||||
while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
|
||||
RETRY_NUM=$((RETRY_NUM + 1))
|
||||
|
||||
if [ "$RETRY_NUM" -gt 1 ]; then
|
||||
# Sleep for random time between 30 and 90 minutes
|
||||
SLEEP_TIME=$((1800 + RANDOM % 5400))
|
||||
echo "----------------------------------------"
|
||||
echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
|
||||
echo "----------------------------------------"
|
||||
sleep ${SLEEP_TIME}
|
||||
fi
|
||||
|
||||
sudo rm -rf "${RELEASE_RESULTS_DIR}"/* || true
|
||||
|
||||
python e2e.py "$@"
|
||||
EXIT_CODE=$?
|
||||
REASON=$(reason "${EXIT_CODE}")
|
||||
ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE
|
||||
|
||||
case ${EXIT_CODE} in
|
||||
0)
|
||||
echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
|
||||
break
|
||||
;;
|
||||
7 | 9 | 10)
|
||||
echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
|
||||
;;
|
||||
*)
|
||||
echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
|
||||
break
|
||||
;;
|
||||
esac
|
||||
|
||||
done
|
||||
|
||||
sudo rm -rf /tmp/ray_release_test_artifacts/* || true
|
||||
sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true
|
||||
|
||||
echo "----------------------------------------"
|
||||
echo "e2e test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
|
||||
echo "Run results:"
|
||||
|
||||
COUNTER=1
|
||||
for EX in "${ALL_EXIT_CODES[@]}"; do
|
||||
REASON=$(reason "${EX}")
|
||||
echo " Run $COUNTER: Exit code = ${EX} (${REASON})"
|
||||
COUNTER=$((COUNTER + 1))
|
||||
done
|
||||
|
||||
echo "----------------------------------------"
|
||||
|
||||
REASON=$(reason "${EXIT_CODE}")
|
||||
echo "Final e2e exit code is ${EXIT_CODE} (${REASON})"
|
||||
|
||||
case ${EXIT_CODE} in
|
||||
0)
|
||||
;;
|
||||
7 | 9 | 10)
|
||||
echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."
|
||||
;;
|
||||
*)
|
||||
echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
|
||||
;;
|
||||
esac
|
||||
|
||||
exit $EXIT_CODE
|
|
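The wrapper above retries e2e.py on the infra-related exit codes (7, 9, 10), sleeps a random 30-90 minutes between attempts, and copies the results into /tmp/ray_release_test_artifacts. A hedged sketch of a manual invocation, assuming a hypothetical file name run_e2e.sh (the file name is not shown in this diff); the flags are the ones parsed in the while-loop above, the values are placeholders, and any remaining arguments are forwarded verbatim to e2e.py:

# Hypothetical manual run; flag names match the parser above, values are placeholders.
bash run_e2e.sh \
  --ray-repo https://github.com/ray-project/ray.git \
  --ray-branch master \
  --ray-test-branch master \
  --release-results-dir /tmp/artifacts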
@@ -1,34 +0,0 @@
- name: rte_many_tasks_actors
  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: rte_small.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/rte_many_tasks_actors.py

- name: wheel_urls
  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: rte_minimal.yaml

  run:
    timeout: 9000 # 2h30m
    prepare: python wait_cluster.py 1 600
    script: python workloads/wheel_urls.py

- name: rte_ray_client
  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: rte_minimal.yaml

  run:
    use_connect: True
    autosuspend_mins: 10
    timeout: 600
    prepare: python wait_cluster.py 1 600
    script: python workloads/rte_ray_client.py
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@ -1,101 +0,0 @@
|
|||
- name: single_deployment_1k_noop_replica
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl_32_cpu.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/single_deployment_1k_noop_replica.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: multi_deployment_1k_noop_replica
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl_32_cpu.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/multi_deployment_1k_noop_replica.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: autoscaling_single_deployment
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl_8_cpu_autoscaling.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/autoscaling_single_deployment.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: autoscaling_multi_deployment
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl_8_cpu_autoscaling.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/autoscaling_multi_deployment.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: serve_micro_benchmark
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
# 16 CPUS
|
||||
compute_template: compute_tpl_single_node.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/serve_micro_benchmark.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: serve_micro_benchmark_k8s
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
# 16 CPUS
|
||||
compute_template: compute_tpl_single_node_k8s.yaml
|
||||
compute_on_k8s: True
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/serve_micro_benchmark.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: serve_cluster_fault_tolerance
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
# 16 CPUS
|
||||
compute_template: compute_tpl_single_node.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/serve_cluster_fault_tolerance.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
|
@@ -1,11 +0,0 @@
# Test multi-node, multi-GPU Ray SGD example.
- name: sgd_gpu
  team: ml
  cluster:
    app_config: sgd_gpu/sgd_gpu_app_config.yaml
    compute_template: sgd_gpu/sgd_gpu_compute.yaml

  run:
    timeout: 3000
    prepare: python wait_cluster.py 2 600
    script: python sgd_gpu/sgd_gpu_test.py --num-workers=2 --use-gpu --address=auto
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@@ -1,27 +0,0 @@
# Specify the test owners (teams) here.
# The root key should be the name of the test yaml file without the .yaml.
# To specify owners of subtests, use a sub dict (see e.g. long_running_tests).
golden_notebook_tests: ml
horovod_tests: ml
lightgbm_tests: ml
long_running_distributed_tests: ml
long_running_tests:
  actor_deaths: core
  apex: ml
  impala: ml
  many_actor_tasks: core
  many_drivers: core
  many_ppo: core
  many_tasks: core
  many_tasks_serialized_ids: core
  node_failures: core
  pbt: ml
  serve: serve
  serve_failure: serve
microbenchmark: core
nightly_tests: core
rllib_tests: ml
runtime_env_tests: serve
serve_tests: serve
sgd_tests: ml
xgboost_tests: ml
@ -1,118 +0,0 @@
|
|||
- name: aws_no_sync_down
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py no_sync_down

- name: aws_ssh_sync
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py ssh_sync

- name: aws_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload

- name: aws_durable_upload_rllib_str
  team: ml
  cluster:
    app_config: app_config_ml.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str

- name: aws_durable_upload_rllib_trainer
  team: ml
  cluster:
    app_config: app_config_ml.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer

- name: aws_no_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload

- name: gcp_k8s_no_sync_down
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8

- name: gcp_k8s_ssh_sync
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8

- name: gcp_k8s_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload

- name: gcp_k8s_no_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py no_durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
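Each entry above follows the same shape: a cluster section (app config plus compute template) and a run section with a timeout, an optional prepare command, and a script. A minimal sketch of how such an entry could be executed is shown below; run_release_test is a hypothetical helper written for illustration, not the actual release runner.

import shlex
import subprocess

def run_release_test(test: dict) -> None:
    """Execute one release-test entry: optional prepare step, then the script."""
    run_cfg = test["run"]
    prepare = run_cfg.get("prepare")
    if prepare:
        # e.g. "python wait_cluster.py 4 600" blocks until the cluster is ready.
        subprocess.run(shlex.split(prepare), check=True)
    subprocess.run(
        shlex.split(run_cfg["script"]), check=True, timeout=run_cfg["timeout"]
    )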
@@ -1,54 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s

while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)
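This is the wait_cluster.py helper referenced by the prepare commands above; for example, "prepare: python wait_cluster.py 4 600" blocks until four nodes (including the head) are up or fails after 600 seconds. A minimal sketch of the equivalent programmatic invocation, assuming the script sits in the working directory:

import subprocess

# Equivalent to the YAML prepare step: wait for 4 nodes, give up after 600 s.
subprocess.run(["python", "wait_cluster.py", "4", "600"], check=True)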
@@ -1,90 +0,0 @@
- name: bookkeeping_overhead
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_1x16.yaml

  run:
    timeout: 1200
    script: python workloads/test_bookkeeping_overhead.py


- name: durable_trainable
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_16x2.yaml

  run:
    timeout: 900
    prepare: python wait_cluster.py 16 600
    script: python workloads/test_durable_trainable.py --bucket data-test-ilr

- name: long_running_large_checkpoints
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_1x32_hd.yaml

  run:
    timeout: 86400
    script: python workloads/test_long_running_large_checkpoints.py
    long_running: True

  smoke_test:
    run:
      timeout: 3600


- name: network_overhead
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_100x2.yaml

  run:
    timeout: 900
    prepare_timeout: 1200
    prepare: python wait_cluster.py 100 1200
    script: python workloads/test_network_overhead.py

  smoke_test:
    cluster:
      compute_template: tpl_20x2.yaml

    run:
      timeout: 400
      prepare_timeout: 600
      prepare: python wait_cluster.py 20 600

- name: result_throughput_cluster
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_16x64.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 16 600
    script: python workloads/test_result_throughput_cluster.py

- name: result_throughput_single_node
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_1x96.yaml

  run:
    timeout: 600
    script: python workloads/test_result_throughput_single_node.py

- name: xgboost_sweep
  team: ml
  cluster:
    app_config: app_config_data.yaml
    compute_template: tpl_16x64.yaml

  run:
    timeout: 3600
    prepare: python wait_cluster.py 16 600
    script: python workloads/test_xgboost_sweep.py
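Entries such as network_overhead carry a smoke_test section whose keys override the base cluster and run settings for smoke-test runs. The merge below is only a sketch under that assumption (a per-section shallow override); the removed infrastructure may have combined these sections differently.

import copy

def apply_smoke_test(test: dict) -> dict:
    """Overlay the smoke_test section (if any) onto the base test definition."""
    merged = copy.deepcopy(test)
    for section, overrides in merged.pop("smoke_test", {}).items():
        merged.setdefault(section, {}).update(overrides)
    return merged

# For network_overhead this would swap in tpl_20x2.yaml and the shorter timeouts.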
@@ -1,53 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)
@@ -1,53 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)
@@ -1,53 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)
@@ -1,104 +0,0 @@
- name: train_small
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml

  run:
    use_connect: True
    autosuspend_mins: 10
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/train_small.py

- name: train_moderate
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 32 600
    script: python workloads/train_moderate.py

- name: train_gpu
  team: ml
  cluster:
    app_config: app_config_gpu.yaml
    compute_template: tpl_gpu_small.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 5 600
    script: python workloads/train_gpu.py

- name: distributed_api_test
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml
  results:

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/distributed_api_test.py
  results: ""

- name: ft_small_elastic
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml

  run:
    timeout: 900
    prepare: python wait_cluster.py 4 600
    script: python workloads/ft_small_elastic.py
  results: ""

- name: ft_small_non_elastic
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml

  run:
    timeout: 900
    prepare: python wait_cluster.py 4 600
    script: python workloads/ft_small_non_elastic.py
  results: ""

- name: tune_small
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/tune_small.py

- name: tune_32x4
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    prepare: python wait_cluster.py 32 600
    script: python workloads/tune_32x4.py

- name: tune_4x32
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    prepare: python wait_cluster.py 32 600
    script: python workloads/tune_4x32.py
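As a closing illustration, a quick sanity check over a suite file like the one above could look as follows. validate_suite is a hypothetical helper and the required-key list is an assumption inferred from the entries shown, not a documented schema.

import yaml  # PyYAML

REQUIRED_KEYS = ("name", "team", "cluster", "run")

def validate_suite(path: str) -> None:
    """Check that every entry in a suite file has the fields seen above."""
    with open(path) as f:
        tests = yaml.safe_load(f)
    for test in tests:
        missing = [key for key in REQUIRED_KEYS if key not in test]
        if missing:
            raise ValueError(f"{test.get('name', '<unnamed>')} is missing {missing}")
        if "script" not in test["run"]:
            raise ValueError(f"{test['name']} has no run.script")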