mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00
[ci/release] Remove old OSS release test infrastructure (#23134)
Now that we've migrated all OSS release tests to the new infrastructure, we can remove old config files and infra scripts.
This commit is contained in:
parent
d93fa95dd5
commit
8608b64885
39 changed files with 0 additions and 6712 deletions
|
@ -1,145 +0,0 @@
|
||||||
# Release test definitions for the Ray benchmark suite
# (old OSS release test infrastructure).
#
# Each entry defines:
#   cluster: the Anyscale app config and compute template to launch, and
#   run:     timeout (seconds), a `prepare` command that blocks until the
#            cluster is ready, and the test `script` itself.

- name: single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: single_node.yaml

  run:
    timeout: 12000
    # Single node: nothing to wait for before the test starts.
    prepare: sleep 0
    script: python single_node/test_single_node.py

- name: object_store
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: object_store.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=50
    script: python object_store/test_object_store.py

- name: many_actors
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_actors.py

- name: many_actors_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    # SMOKE_TEST=1 switches the shared script into its reduced-size mode.
    script: SMOKE_TEST=1 python distributed/test_many_actors.py

- name: many_tasks
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_tasks.py --num-tasks=10000

- name: many_tasks_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: python distributed/test_many_tasks.py --num-tasks=100

- name: many_pgs
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_pgs.py

- name: many_pgs_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: SMOKE_TEST=1 python distributed/test_many_pgs.py

# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
- name: many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: many_nodes.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=250
    script: python distributed/test_many_tasks.py --num-tasks=1000

- name: scheduling_test_many_0s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1

- name: scheduling_test_many_0s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1

- name: scheduling_test_many_5s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
  # Marked unstable: failures do not page the release-test owner.
  stable: false

- name: scheduling_test_many_5s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
  stable: false
|
|
@ -1,24 +0,0 @@
|
||||||
import click
|
|
||||||
import ray
|
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
def num_alive_nodes():
    """Return the number of cluster nodes currently marked alive.

    Assumes ``ray.init()`` has already been called by the caller.
    """
    # ray.nodes() also lists dead nodes; count only the live ones.
    return sum(1 for node in ray.nodes() if node["Alive"])
|
|
||||||
|
|
||||||
|
|
||||||
@click.command()
@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
def wait_cluster(num_nodes: int):
    """Block until exactly ``num_nodes`` nodes of the cluster are alive.

    Polls the cluster state every 5 seconds and prints progress; never
    times out on its own (the calling release-test harness enforces the
    overall timeout).
    """
    ray.init(address="auto")
    while True:
        # Query the cluster state once per iteration (the original
        # implementation queried it twice: once for the check and once
        # for the progress message).
        alive = num_alive_nodes()
        if alive == num_nodes:
            break
        print(f"Waiting for nodes: {alive}/{num_nodes}")
        time.sleep(5)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # click parses --num-nodes from the command line and invokes the command.
    wait_cluster()
|
|
|
@ -1,680 +0,0 @@
|
||||||
import copy
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
# If you update or reorganize the periodic tests, please ensure the
|
|
||||||
# relevant portions of the Ray release instructions (go/release-ray)
|
|
||||||
# (in particular, running periodic tests and collecting release logs)
|
|
||||||
# are up to date. If you need access, please contact @zhe-thoughts.
|
|
||||||
|
|
||||||
# Env variables:
|
|
||||||
|
|
||||||
# RAY_REPO Repo to use for finding the wheel
|
|
||||||
# RAY_BRANCH Branch to find the wheel
|
|
||||||
# RAY_VERSION Version to find the wheel
|
|
||||||
# RAY_WHEELS Direct Ray wheel URL
|
|
||||||
# RAY_TEST_REPO Repo to use for test scripts
|
|
||||||
# RAY_TEST_BRANCH Branch for test scripts
|
|
||||||
# FILTER_FILE File filter
|
|
||||||
# FILTER_TEST Test name filter
|
|
||||||
# RELEASE_TEST_SUITE Release test suite (e.g. manual, nightly)
|
|
||||||
|
|
||||||
|
|
||||||
class ReleaseTest:
    """A named release test plus its smoke-test and retry settings.

    The object delegates ``in``, ``iter()`` and ``len()`` to its ``name``
    so it can be used interchangeably with a plain test-name string
    (e.g. for substring filtering).
    """

    def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):
        self.name = name
        self.smoke_test = smoke_test
        self.retry = retry

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def __contains__(self, item):
        # Substring membership, exactly as on the underlying string.
        return item in self.name

    def __iter__(self):
        return iter(self.name)

    def __len__(self):
        return len(self.name)
|
|
||||||
|
|
||||||
|
|
||||||
class SmokeTest(ReleaseTest):
    """A :class:`ReleaseTest` that always runs in smoke-test mode."""

    def __init__(self, name: str, retry: int = 0):
        # Force smoke_test=True; everything else is inherited.
        super().__init__(name=name, smoke_test=True, retry=retry)
|
|
||||||
|
|
||||||
|
|
||||||
# Test-suite definitions: mapping of test config file path -> list of test
# names (strings or SmokeTest wrappers). All entries below are currently
# disabled (commented out) — presumably because the tests were migrated to
# the new release-test infrastructure; the dicts are kept as empty shells.

CORE_NIGHTLY_TESTS = {
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "shuffle_10gb",
    #     "shuffle_50gb",
    #     "shuffle_50gb_large_partition",
    #     "shuffle_100gb",
    #     "non_streaming_shuffle_100gb",
    #     "non_streaming_shuffle_50gb_large_partition",
    #     "non_streaming_shuffle_50gb",
    #     SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
    #     SmokeTest("dask_on_ray_large_scale_test_spilling"),
    #     "stress_test_placement_group",
    #     "shuffle_1tb_1000_partition",
    #     "non_streaming_shuffle_1tb_1000_partition",
    #     "shuffle_1tb_5000_partitions",
    #     # TODO(sang): It doesn't even work without spilling
    #     # as it hits the scalability limit.
    #     # "non_streaming_shuffle_1tb_5000_partitions",
    #     "decision_tree_autoscaling",
    #     "decision_tree_autoscaling_20_runs",
    #     "autoscaling_shuffle_1tb_1000_partitions",
    #     SmokeTest("stress_test_many_tasks"),
    #     SmokeTest("stress_test_dead_actors"),
    #     SmokeTest("threaded_actors_stress_test"),
    #     "pg_long_running_performance_test",
    # ],
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "single_node",
    #     "object_store",
    #     "many_actors_smoke_test",
    #     "many_tasks_smoke_test",
    #     "many_pgs_smoke_test",
    # ],
    # "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [
    #     "inference",
    #     "shuffle_data_loader",
    #     "parquet_metadata_resolution",
    #     "pipelined_training_50_gb",
    #     "pipelined_ingestion_1500_gb",
    #     "datasets_preprocess_ingest",
    #     "datasets_ingest_400G",
    #     SmokeTest("datasets_ingest_train_infer"),
    # ],
    # "~/ray/release/nightly_tests/chaos_test.yaml": [
    #     "chaos_many_actors",
    #     "chaos_many_tasks_no_object_store",
    #     "chaos_pipelined_ingestion_1500_gb_15_windows",
    # ],
    # "~/ray/release/microbenchmark/microbenchmark.yaml": [
    #     "microbenchmark",
    # ],
}

SERVE_NIGHTLY_TESTS = {
    # "~/ray/release/long_running_tests/long_running_tests.yaml": [
    #     SmokeTest("serve"),
    #     SmokeTest("serve_failure"),
    # ],
    # "~/ray/release/serve_tests/serve_tests.yaml": [
    #     "single_deployment_1k_noop_replica",
    #     "multi_deployment_1k_noop_replica",
    #     "autoscaling_single_deployment",
    #     "autoscaling_multi_deployment",
    #     "serve_micro_benchmark",
    #     # TODO(architkulkarni) Reenable after K8s migration. Currently failing
    #     # "serve_micro_benchmark_k8s",
    #     "serve_cluster_fault_tolerance",
    # ],
}

CORE_DAILY_TESTS = {
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "k8s_dask_on_ray_large_scale_test_no_spilling",
    #     "dask_on_ray_large_scale_test_no_spilling",
    #     "dask_on_ray_large_scale_test_spilling",
    #     "pg_autoscaling_regression_test",
    #     "threaded_actors_stress_test",
    #     "k8s_threaded_actors_stress_test",
    #     "stress_test_many_tasks",
    #     "stress_test_dead_actors",
    # ],
    # "~/ray/release/nightly_tests/chaos_test.yaml": [
    #     "chaos_dask_on_ray_large_scale_test_no_spilling",
    #     "chaos_dask_on_ray_large_scale_test_spilling",
    # ],
}

CORE_SCALABILITY_TESTS_DAILY = {
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "many_actors",
    #     "many_tasks",
    #     "many_pgs",
    #     "many_nodes",
    # ],
}

CORE_SCHEDULING_DAILY = {
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "scheduling_test_many_0s_tasks_single_node",
    #     "scheduling_test_many_0s_tasks_many_nodes",
    #     # Reenable these two once we got right setup
    #     # "scheduling_test_many_5s_tasks_single_node",
    #     # "scheduling_test_many_5s_tasks_many_nodes",
    # ],
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "many_nodes_actor_test",
    #     "dask_on_ray_10gb_sort",
    #     "dask_on_ray_100gb_sort",
    #     "dask_on_ray_1tb_sort",
    #     "placement_group_performance_test",
    # ],
}

NIGHTLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     SmokeTest("horovod_test"),
    # ],  # Should we enable this?
    # "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
    #     "dask_xgboost_test",
    #     "modin_xgboost_test",
    #     "torch_tune_serve_test",
    # ],
    # "~/ray/release/long_running_tests/long_running_tests.yaml": [
    #     SmokeTest("actor_deaths"),
    #     SmokeTest("apex"),
    #     SmokeTest("impala"),
    #     SmokeTest("many_actor_tasks"),
    #     SmokeTest("many_drivers"),
    #     SmokeTest("many_ppo"),
    #     SmokeTest("many_tasks"),
    #     SmokeTest("many_tasks_serialized_ids"),
    #     SmokeTest("node_failures"),
    #     SmokeTest("pbt"),
    #     # SmokeTest("serve"),
    #     # SmokeTest("serve_failure"),
    #     # Full long running tests (1 day runtime)
    #     "actor_deaths",
    #     "apex",
    #     "impala",
    #     "many_actor_tasks",
    #     "many_drivers",
    #     "many_ppo",
    #     "many_tasks",
    #     "many_tasks_serialized_ids",
    #     "node_failures",
    #     "pbt",
    #     "serve",
    #     "serve_failure",
    # ],
    # "~/ray/release/sgd_tests/sgd_tests.yaml": [
    #     "sgd_gpu",
    # ],
    # "~/ray/release/tune_tests/cloud_tests/tune_cloud_tests.yaml": [
    #     "aws_no_sync_down",
    #     "aws_ssh_sync",
    #     "aws_durable_upload",
    #     "aws_durable_upload_rllib_str",
    #     "aws_durable_upload_rllib_trainer",
    #     "gcp_k8s_durable_upload",
    # ],
    # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
    #     "bookkeeping_overhead",
    #     "durable_trainable",
    #     SmokeTest("long_running_large_checkpoints"),
    #     SmokeTest("network_overhead"),
    #     "result_throughput_cluster",
    #     "result_throughput_single_node",
    # ],
    # "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
    #     "train_small",
    #     "train_moderate",
    #     "train_gpu",
    #     "tune_small",
    #     "tune_4x32",
    #     "tune_32x4",
    #     "ft_small_elastic",
    #     "ft_small_non_elastic",
    #     "distributed_api_test",
    # ],
    # "~/ray/release/rllib_tests/rllib_tests.yaml": [
    #     SmokeTest("learning_tests"),
    #     SmokeTest("stress_tests"),
    #     "performance_tests",
    #     "multi_gpu_learning_tests",
    #     "multi_gpu_with_lstm_learning_tests",
    #     "multi_gpu_with_attention_learning_tests",
    #     # We'll have these as per-PR tests soon.
    #     # "example_scripts_on_gpu_tests",
    # ],
    # "~/ray/release/runtime_env_tests/runtime_env_tests.yaml": [
    #     "rte_many_tasks_actors",
    #     "wheel_urls",
    #     "rte_ray_client",
    # ],
}
|
|
||||||
|
|
||||||
# Tests run on a weekly cadence. All entries are currently disabled, like
# the other suite dicts above.
#
# BUG FIX: the previous version left the bare string
# "~/ray/release/long_running_distributed_tests" uncommented while the
# rest of its (implicitly concatenated) key and its value were commented
# out. That turned WEEKLY_TESTS into a one-element *set* instead of a
# dict, so selecting the "weekly" suite would crash build_pipeline() on
# `steps.items()`. The stray string is now commented out as well, making
# WEEKLY_TESTS an empty dict, consistent with the other disabled suites.
WEEKLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     "horovod_test",
    # ],
    # "~/ray/release/long_running_distributed_tests"
    # "/long_running_distributed.yaml": [
    #     "pytorch_pbt_failure",
    # ],
    # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
    #     "network_overhead",
    #     "long_running_large_checkpoints",
    #     "xgboost_sweep",
    # ],
    # "~/ray/release/rllib_tests/rllib_tests.yaml": [
    #     "learning_tests",
    #     "stress_tests",
    # ],
}
|
|
||||||
|
|
||||||
# This test suite holds "user" tests to test important user workflows
# in a particular environment.
# All workloads in this test suite should:
# 1. Be run in a distributed (multi-node) fashion
# 2. Use autoscaling/scale up (no wait_cluster.py)
# 3. Use GPUs if applicable
# 4. Have the `use_connect` flag set.
#
# All entries are currently disabled (commented out).
USER_TESTS = {
    # "~/ray/release/ml_user_tests/ml_user_tests.yaml": [
    #     "train_tensorflow_mnist_test",
    #     "train_torch_linear_test",
    #     "ray_lightning_user_test_latest",
    #     "ray_lightning_user_test_master",
    #     "horovod_user_test_latest",
    #     "horovod_user_test_master",
    #     "xgboost_gpu_connect_latest",
    #     "xgboost_gpu_connect_master",
    #     "tune_rllib_connect_test",
    # ]
}
|
|
||||||
|
|
||||||
# Maps the RELEASE_TEST_SUITE env variable value to the suite dict
# (test config file -> test names) that build_pipeline() consumes.
SUITES = {
    "core-nightly": CORE_NIGHTLY_TESTS,
    "serve-nightly": SERVE_NIGHTLY_TESTS,
    "core-daily": CORE_DAILY_TESTS,
    "core-scalability": CORE_SCALABILITY_TESTS_DAILY,
    # "nightly" merges the general nightly tests with the user-workflow tests.
    "nightly": {**NIGHTLY_TESTS, **USER_TESTS},
    "core-scheduling-daily": CORE_SCHEDULING_DAILY,
    "weekly": WEEKLY_TESTS,
}
|
|
||||||
|
|
||||||
# Base Buildkite step configuration shared by all generated steps.
# create_test_step()/alert_pipeline() deep-copy this and then set the
# step-specific "command"/"commands", "label" and "retry" keys.
DEFAULT_STEP_TEMPLATE = {
    "env": {
        "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
        "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
        "RELEASE_AWS_BUCKET": "ray-release-automation-results",
        "RELEASE_AWS_LOCATION": "dev",
        "RELEASE_AWS_DB_NAME": "ray_ci",
        "RELEASE_AWS_DB_TABLE": "release_test_result",
        "AWS_REGION": "us-west-2",
    },
    "agents": {"queue": "runner_queue_branch"},
    "plugins": [
        {
            "docker#v3.9.0": {
                "image": "rayproject/ray",
                "propagate-environment": True,
                # Implicit string concatenation: a single
                # "host_path:container_path" volume mapping.
                "volumes": [
                    "/tmp/ray_release_test_artifacts:" "/tmp/ray_release_test_artifacts"
                ],
            }
        }
    ],
    # Everything written under this path is uploaded as a build artifact.
    "artifact_paths": ["/tmp/ray_release_test_artifacts/**/*"],
}
|
|
||||||
|
|
||||||
|
|
||||||
def ask_configuration():
    """Build the interactive Buildkite steps that ask the user for a config.

    Returns a two-step pipeline:
      1. ``input_ask_step`` — a Buildkite "input" step that collects
         repo/branch/version/wheel/suite/filter settings as build metadata.
      2. ``run_again_step`` — a step that exports the collected metadata as
         environment variables, sets AUTOMATIC=1, and re-runs this script so
         the real test pipeline gets uploaded.
    """
    # Defaults for the form fields come from the current environment.
    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
    RAY_VERSION = os.environ.get("RAY_VERSION", "")
    RAY_WHEELS = os.environ.get("RAY_WHEELS", "")

    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    RELEASE_TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    input_ask_step = {
        "input": "Input required: Please specify tests to run",
        "fields": [
            {
                "text": (
                    "RAY_REPO: Please specify the Ray repository used "
                    "to find the wheel."
                ),
                "hint": (
                    "Repository from which to fetch the latest "
                    "commits to find the Ray wheels. Usually you don't "
                    "need to change this."
                ),
                "default": RAY_REPO,
                "key": "ray_repo",
            },
            {
                "text": (
                    "RAY_BRANCH: Please specify the Ray branch used "
                    "to find the wheel."
                ),
                "hint": "For releases, this will be e.g. `releases/1.x.0`",
                "default": RAY_BRANCH,
                "key": "ray_branch",
            },
            {
                "text": (
                    "RAY_VERSION: Please specify the Ray version used "
                    "to find the wheel."
                ),
                "hint": (
                    "Leave empty for latest master. For releases, "
                    "specify the release version."
                ),
                "required": False,
                "default": RAY_VERSION,
                "key": "ray_version",
            },
            {
                "text": "RAY_WHEELS: Please specify the Ray wheel URL.",
                "hint": (
                    "ATTENTION: If you provide this, RAY_REPO, "
                    "RAY_BRANCH and RAY_VERSION will be ignored! "
                    "Please also make sure to provide the wheels URL "
                    "for Python 3.7 on Linux.\n"
                    "You can also insert a commit hash here instead "
                    "of a full URL.\n"
                    "NOTE: You can specify multiple commits or URLs "
                    "for easy bisection (one per line) - this will "
                    "run each test on each of the specified wheels."
                ),
                "required": False,
                "default": RAY_WHEELS,
                "key": "ray_wheels",
            },
            {
                "text": (
                    "RAY_TEST_REPO: Please specify the Ray repository "
                    "used to find the tests you would like to run."
                ),
                "hint": (
                    "If you're developing a new release test, this "
                    "will most likely be your GitHub fork."
                ),
                "default": RAY_TEST_REPO,
                "key": "ray_test_repo",
            },
            {
                "text": (
                    "RAY_TEST_BRANCH: Please specify the Ray branch used "
                    "to find the tests you would like to run."
                ),
                "hint": (
                    "If you're developing a new release test, this "
                    "will most likely be a branch living on your "
                    "GitHub fork."
                ),
                "default": RAY_TEST_BRANCH,
                "key": "ray_test_branch",
            },
            {
                # "select" (not "text") renders a dropdown in Buildkite.
                "select": (
                    "RELEASE_TEST_SUITE: Please specify the release "
                    "test suite containing the tests you would like "
                    "to run."
                ),
                "hint": (
                    "Check in the `build_pipeline.py` if you're "
                    "unsure which suite contains your tests."
                ),
                "required": True,
                "options": sorted(SUITES.keys()),
                "default": RELEASE_TEST_SUITE,
                "key": "release_test_suite",
            },
            {
                "text": (
                    "FILTER_FILE: Please specify a filter for the "
                    "test files that should be included in this build."
                ),
                "hint": (
                    "Only test files (e.g. xgboost_tests.yml) that "
                    "match this string will be included in the test"
                ),
                "default": FILTER_FILE,
                "required": False,
                "key": "filter_file",
            },
            {
                "text": (
                    "FILTER_TEST: Please specify a filter for the "
                    "test names that should be included in this build."
                ),
                "hint": (
                    "Only test names (e.g. tune_4x32) that match "
                    "this string will be included in the test"
                ),
                "default": FILTER_TEST,
                "required": False,
                "key": "filter_test",
            },
        ],
        "key": "input_ask_step",
    }

    run_again_step = {
        "commands": [
            # Turn each collected metadata key back into an env variable.
            f'export {v}=$(buildkite-agent meta-data get "{k}")'
            for k, v in {
                "ray_branch": "RAY_BRANCH",
                "ray_repo": "RAY_REPO",
                "ray_version": "RAY_VERSION",
                "ray_wheels": "RAY_WHEELS",
                "ray_test_branch": "RAY_TEST_BRANCH",
                "ray_test_repo": "RAY_TEST_REPO",
                "release_test_suite": "RELEASE_TEST_SUITE",
                "filter_file": "FILTER_FILE",
                "filter_test": "FILTER_TEST",
            }.items()
        ]
        + [
            # AUTOMATIC=1 makes the re-run build the pipeline directly
            # instead of asking for input again.
            "export AUTOMATIC=1",
            "python3 -m pip install --user pyyaml",
            "rm -rf ~/ray || true",
            # $$ escapes $ for Buildkite so expansion happens at runtime.
            "git clone -b $${RAY_TEST_BRANCH} $${RAY_TEST_REPO} ~/ray",
            (
                "python3 ~/ray/release/.buildkite/build_pipeline.py "
                "| buildkite-agent pipeline upload"
            ),
        ],
        "label": ":pipeline: Again",
        "agents": {"queue": "runner_queue_branch"},
        "depends_on": "input_ask_step",
        "key": "run_again_step",
    }

    return [
        input_ask_step,
        run_again_step,
    ]
|
|
||||||
|
|
||||||
|
|
||||||
def create_test_step(
    ray_repo: str,
    ray_branch: str,
    ray_version: str,
    ray_wheels: str,
    ray_test_repo: str,
    ray_test_branch: str,
    test_file: str,
    test_name: ReleaseTest,
):
    """Build the Buildkite step dict that runs a single release test.

    Args:
        ray_repo/ray_branch/ray_version: Where run_e2e.sh looks for the
            Ray wheel to test.
        ray_wheels: Direct wheel URL or commit hash; when set it overrides
            repo/branch/version for wheel lookup.
        ray_test_repo/ray_test_branch: Where the test configs/scripts live.
        test_file: Path to the test suite yaml file.
        test_name: Test to run; its smoke_test/retry attributes are honored.

    Returns:
        A step dict based on DEFAULT_STEP_TEMPLATE with command, label and
        retry configuration filled in.
    """
    custom_commit_str = "custom_wheels_url"
    if ray_wheels:
        # If the wheel URL embeds a full 40-char commit sha, show that in
        # the step label instead of the generic placeholder.
        # (re.search instead of a one-shot re.compile + search.)
        m = re.search(r"([a-f0-9]{40})", ray_wheels)
        if m is not None:
            custom_commit_str = m.group(1)

    ray_wheels_str = f" ({ray_wheels}) " if ray_wheels else ""

    logging.info(f"Creating step for {test_file}/{test_name}{ray_wheels_str}")

    # Base command: the e2e runner plus where to find wheels and tests.
    # (Constant first segment no longer carries a pointless f-prefix.)
    cmd = (
        "./release/run_e2e.sh "
        f'--ray-repo "{ray_repo}" '
        f'--ray-branch "{ray_branch}" '
        f'--ray-version "{ray_version}" '
        f'--ray-wheels "{ray_wheels}" '
        f'--ray-test-repo "{ray_test_repo}" '
        f'--ray-test-branch "{ray_test_branch}" '
    )

    args = (
        f"--category {ray_branch} "
        f"--test-config {test_file} "
        f"--test-name {test_name} "
        "--keep-results-dir"
    )

    if test_name.smoke_test:
        logging.info("This test will run as a smoke test.")
        args += " --smoke-test"

    step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

    if test_name.retry:
        logging.info(f"This test will be retried up to {test_name.retry} times.")
        # Explicit retry count: retry on any exit status.
        step_conf["retry"] = {
            "automatic": [{"exit_status": "*", "limit": test_name.retry}]
        }
    else:
        # Default retry logic
        # Warning: Exit codes are currently not correctly propagated to
        # buildkite! Thus, actual retry logic is currently implemented in
        # the run_e2e.sh script!
        step_conf["retry"] = {
            "automatic": [
                {"exit_status": 7, "limit": 2},  # Prepare timeout
                {"exit_status": 9, "limit": 2},  # Session timeout
                {"exit_status": 10, "limit": 2},  # Prepare error
            ],
        }

    step_conf["command"] = cmd + args

    # Label: test name, then either the wheel commit/placeholder (when a
    # direct wheel was given) or the branch, then the test repo/branch.
    # (Truth-tests ray_wheels directly instead of the derived display
    # string ray_wheels_str — same truthiness, clearer intent.)
    step_conf["label"] = (
        f"{test_name} "
        f"({custom_commit_str if ray_wheels else ray_branch}) - "
        f"{ray_test_branch}/{ray_test_repo}"
    )
    return step_conf
|
|
||||||
|
|
||||||
|
|
||||||
def build_pipeline(steps):
    """Create Buildkite steps for every test selected by the env filters.

    Args:
        steps: Mapping of test config file path -> list of test names
            (plain strings or ReleaseTest instances).

    Returns:
        List of Buildkite step dicts, one per (test, wheel) combination.
    """
    all_steps = []

    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
    RAY_VERSION = os.environ.get("RAY_VERSION", "")
    RAY_WHEELS = os.environ.get("RAY_WHEELS", "")

    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    # Multiple newline-separated wheels mean "run every test once per
    # wheel" (used for bisection).
    ray_wheels_list = [""]
    if RAY_WHEELS:
        ray_wheels_list = RAY_WHEELS.split("\n")

    if len(ray_wheels_list) > 1:
        # Fixed typo in the log message ("bisec" -> "bisection").
        logging.info(
            f"This will run a bisection on the following URLs/commits: "
            f"{ray_wheels_list}"
        )

    # Fixed: the "configurations and scripts:" line was missing its
    # trailing newline, which glued it to the RAY_TEST_REPO line.
    logging.info(
        f"Building pipeline \n"
        f"Ray repo/branch to test:\n"
        f" RAY_REPO = {RAY_REPO}\n"
        f" RAY_BRANCH = {RAY_BRANCH}\n\n"
        f" RAY_VERSION = {RAY_VERSION}\n\n"
        f" RAY_WHEELS = {RAY_WHEELS}\n\n"
        f"Ray repo/branch containing the test configurations and scripts:\n"
        f" RAY_TEST_REPO = {RAY_TEST_REPO}\n"
        f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"
        f"Filtering for these tests:\n"
        f" FILTER_FILE = {FILTER_FILE}\n"
        f" FILTER_TEST = {FILTER_TEST}\n\n"
    )

    for test_file, test_names in steps.items():
        # FILTER_FILE is a substring filter on the config file path.
        if FILTER_FILE and FILTER_FILE not in test_file:
            continue

        test_base = os.path.basename(test_file)
        for test_name in test_names:
            # FILTER_TEST is a substring filter on the test name
            # (ReleaseTest.__contains__ delegates to the name string).
            if FILTER_TEST and FILTER_TEST not in test_name:
                continue

            if not isinstance(test_name, ReleaseTest):
                test_name = ReleaseTest(name=test_name)

            logging.info(f"Adding test: {test_base}/{test_name}")

            for ray_wheels in ray_wheels_list:
                step_conf = create_test_step(
                    ray_repo=RAY_REPO,
                    ray_branch=RAY_BRANCH,
                    ray_version=RAY_VERSION,
                    ray_wheels=ray_wheels,
                    ray_test_repo=RAY_TEST_REPO,
                    ray_test_branch=RAY_TEST_BRANCH,
                    test_file=test_file,
                    test_name=test_name,
                )

                all_steps.append(step_conf)

    return all_steps
|
|
||||||
|
|
||||||
|
|
||||||
def alert_pipeline(stats: bool = False):
    """Return a one-step pipeline that sends the periodic release alert.

    Args:
        stats: When True, run alert.py in stats-only mode (--stats).
    """
    alert_cmd = "python release/alert.py"
    if stats:
        alert_cmd += " --stats"

    step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
    step_conf["commands"] = [
        "pip install -q -r release/requirements.txt",
        "pip install -U boto3 botocore",
        alert_cmd,
    ]
    step_conf["label"] = f"Send periodic alert (stats_only = {stats})"
    return [step_conf]
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # RELEASE_ALERT: "1" builds the alert step, "stats" the stats-only alert.
    alert = os.environ.get("RELEASE_ALERT", "0")

    # AUTOMATIC=1 means we were re-triggered by the "Again" step (see
    # ask_configuration) and should build the pipeline directly instead of
    # asking for input.
    ask_for_config = not bool(int(os.environ.get("AUTOMATIC", "0")))

    if alert in ["1", "stats"]:
        steps = alert_pipeline(alert == "stats")
    elif ask_for_config:
        steps = ask_configuration()
    else:
        TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
        # Raises KeyError for an unknown suite name.
        PIPELINE_SPEC = SUITES[TEST_SUITE]

        steps = build_pipeline(PIPELINE_SPEC)

    # Emit the pipeline as YAML on stdout for `buildkite-agent pipeline upload`.
    yaml.dump({"steps": steps}, sys.stdout)
|
|
441
release/alert.py
441
release/alert.py
|
@ -1,441 +0,0 @@
|
||||||
import argparse
|
|
||||||
from collections import defaultdict, Counter
|
|
||||||
from typing import Any, List, Tuple, Mapping, Optional
|
|
||||||
import datetime
|
|
||||||
import hashlib
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import requests
|
|
||||||
import sys
|
|
||||||
|
|
||||||
import boto3
|
|
||||||
|
|
||||||
from e2e import GLOBAL_CONFIG
|
|
||||||
|
|
||||||
from alerts.default import handle_result as default_handle_result
|
|
||||||
from alerts.rllib_tests import handle_result as rllib_tests_handle_result
|
|
||||||
from alerts.long_running_tests import handle_result as long_running_tests_handle_result
|
|
||||||
from alerts.tune_tests import handle_result as tune_tests_handle_result
|
|
||||||
from alerts.xgboost_tests import handle_result as xgboost_tests_handle_result
|
|
||||||
|
|
||||||
SUITE_TO_FN = {
|
|
||||||
"long_running_tests": long_running_tests_handle_result,
|
|
||||||
"rllib_tests": rllib_tests_handle_result,
|
|
||||||
"tune_tests": tune_tests_handle_result,
|
|
||||||
"xgboost_tests": xgboost_tests_handle_result,
|
|
||||||
}
|
|
||||||
|
|
||||||
GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"] = "alert_state"
|
|
||||||
GLOBAL_CONFIG["SLACK_WEBHOOK"] = os.environ.get("SLACK_WEBHOOK", "")
|
|
||||||
GLOBAL_CONFIG["SLACK_CHANNEL"] = os.environ.get("SLACK_CHANNEL", "#oss-test-cop")
|
|
||||||
|
|
||||||
RESULTS_LIMIT = 120
|
|
||||||
|
|
||||||
logger = logging.getLogger()
|
|
||||||
logger.setLevel(logging.INFO)
|
|
||||||
handler = logging.StreamHandler(stream=sys.stdout)
|
|
||||||
formatter = logging.Formatter(
|
|
||||||
fmt="[%(levelname)s %(asctime)s] " "%(filename)s: %(lineno)d " "%(message)s"
|
|
||||||
)
|
|
||||||
handler.setFormatter(formatter)
|
|
||||||
logger.addHandler(handler)
|
|
||||||
|
|
||||||
|
|
||||||
def maybe_fetch_slack_webhook():
|
|
||||||
if GLOBAL_CONFIG["SLACK_WEBHOOK"] in [None, ""]:
|
|
||||||
print("Missing SLACK_WEBHOOK, retrieving from AWS secrets store")
|
|
||||||
GLOBAL_CONFIG["SLACK_WEBHOOK"] = boto3.client(
|
|
||||||
"secretsmanager", region_name="us-west-2"
|
|
||||||
).get_secret_value(
|
|
||||||
SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:"
|
|
||||||
"release-automation/"
|
|
||||||
"slack-webhook-Na0CFP"
|
|
||||||
)[
|
|
||||||
"SecretString"
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def _obj_hash(obj: Any) -> str:
|
|
||||||
json_str = json.dumps(obj, sort_keys=True, ensure_ascii=True)
|
|
||||||
sha = hashlib.sha256()
|
|
||||||
sha.update(json_str.encode())
|
|
||||||
return sha.hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_latest_alerts(rds_data_client):
|
|
||||||
schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]
|
|
||||||
|
|
||||||
sql = f"""
|
|
||||||
SELECT DISTINCT ON (category, test_suite, test_name)
|
|
||||||
category, test_suite, test_name, last_result_hash,
|
|
||||||
last_notification_dt
|
|
||||||
FROM {schema}
|
|
||||||
ORDER BY category, test_suite, test_name, last_notification_dt DESC
|
|
||||||
LIMIT {RESULTS_LIMIT}
|
|
||||||
"""
|
|
||||||
|
|
||||||
result = rds_data_client.execute_statement(
|
|
||||||
database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
|
|
||||||
secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
|
|
||||||
resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
|
|
||||||
schema=schema,
|
|
||||||
sql=sql,
|
|
||||||
)
|
|
||||||
for row in result["records"]:
|
|
||||||
category, test_suite, test_name, last_result_hash, last_notification_dt = (
|
|
||||||
r["stringValue"] if "stringValue" in r else None for r in row
|
|
||||||
)
|
|
||||||
last_notification_dt = datetime.datetime.strptime(
|
|
||||||
last_notification_dt, "%Y-%m-%d %H:%M:%S"
|
|
||||||
)
|
|
||||||
yield category, test_suite, test_name, last_result_hash, last_notification_dt
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_latest_results(
|
|
||||||
rds_data_client, fetch_since: Optional[datetime.datetime] = None
|
|
||||||
):
|
|
||||||
schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]
|
|
||||||
|
|
||||||
sql = f"""
|
|
||||||
SELECT DISTINCT ON (category, test_suite, test_name)
|
|
||||||
created_on, category, test_suite, test_name, status, results,
|
|
||||||
artifacts, last_logs
|
|
||||||
FROM {schema} """
|
|
||||||
|
|
||||||
parameters = []
|
|
||||||
if fetch_since is not None:
|
|
||||||
sql += "WHERE created_on >= :created_on "
|
|
||||||
parameters = [
|
|
||||||
{
|
|
||||||
"name": "created_on",
|
|
||||||
"typeHint": "TIMESTAMP",
|
|
||||||
"value": {"stringValue": fetch_since.strftime("%Y-%m-%d %H:%M:%S")},
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
sql += "ORDER BY category, test_suite, test_name, created_on DESC "
|
|
||||||
sql += f"LIMIT {RESULTS_LIMIT}"
|
|
||||||
|
|
||||||
result = rds_data_client.execute_statement(
|
|
||||||
database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
|
|
||||||
secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
|
|
||||||
resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
|
|
||||||
schema=schema,
|
|
||||||
sql=sql,
|
|
||||||
parameters=parameters,
|
|
||||||
)
|
|
||||||
for row in result["records"]:
|
|
||||||
(
|
|
||||||
created_on,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
status,
|
|
||||||
results,
|
|
||||||
artifacts,
|
|
||||||
last_logs,
|
|
||||||
) = (r["stringValue"] if "stringValue" in r else None for r in row)
|
|
||||||
|
|
||||||
# Calculate hash before converting strings to objects
|
|
||||||
result_obj = (
|
|
||||||
created_on,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
status,
|
|
||||||
results,
|
|
||||||
artifacts,
|
|
||||||
last_logs,
|
|
||||||
)
|
|
||||||
result_json = json.dumps(result_obj)
|
|
||||||
result_hash = _obj_hash(result_json)
|
|
||||||
|
|
||||||
# Convert some strings to python objects
|
|
||||||
created_on = datetime.datetime.strptime(created_on, "%Y-%m-%d %H:%M:%S")
|
|
||||||
results = json.loads(results)
|
|
||||||
artifacts = json.loads(artifacts)
|
|
||||||
|
|
||||||
yield result_hash, created_on, category, test_suite, test_name, status, results, artifacts, last_logs # noqa: E501
|
|
||||||
|
|
||||||
|
|
||||||
def mark_as_handled(
|
|
||||||
rds_data_client,
|
|
||||||
update: bool,
|
|
||||||
category: str,
|
|
||||||
test_suite: str,
|
|
||||||
test_name: str,
|
|
||||||
result_hash: str,
|
|
||||||
last_notification_dt: datetime.datetime,
|
|
||||||
):
|
|
||||||
schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]
|
|
||||||
|
|
||||||
if not update:
|
|
||||||
sql = f"""
|
|
||||||
INSERT INTO {schema}
|
|
||||||
(category, test_suite, test_name,
|
|
||||||
last_result_hash, last_notification_dt)
|
|
||||||
VALUES (:category, :test_suite, :test_name,
|
|
||||||
:last_result_hash, :last_notification_dt)
|
|
||||||
"""
|
|
||||||
else:
|
|
||||||
sql = f"""
|
|
||||||
UPDATE {schema}
|
|
||||||
SET last_result_hash=:last_result_hash,
|
|
||||||
last_notification_dt=:last_notification_dt
|
|
||||||
WHERE category=:category AND test_suite=:test_suite
|
|
||||||
AND test_name=:test_name
|
|
||||||
"""
|
|
||||||
|
|
||||||
rds_data_client.execute_statement(
|
|
||||||
database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
|
|
||||||
parameters=[
|
|
||||||
{"name": "category", "value": {"stringValue": category}},
|
|
||||||
{"name": "test_suite", "value": {"stringValue": test_suite or ""}},
|
|
||||||
{"name": "test_name", "value": {"stringValue": test_name}},
|
|
||||||
{"name": "last_result_hash", "value": {"stringValue": result_hash}},
|
|
||||||
{
|
|
||||||
"name": "last_notification_dt",
|
|
||||||
"typeHint": "TIMESTAMP",
|
|
||||||
"value": {
|
|
||||||
"stringValue": last_notification_dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
|
|
||||||
resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
|
|
||||||
schema=schema,
|
|
||||||
sql=sql,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def post_alerts_to_slack(
|
|
||||||
channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
|
|
||||||
):
|
|
||||||
if len(alerts) == 0:
|
|
||||||
logger.info("No alerts to post to slack.")
|
|
||||||
return
|
|
||||||
|
|
||||||
markdown_lines = [
|
|
||||||
f"* {len(alerts)} new release test failures found!*",
|
|
||||||
"",
|
|
||||||
]
|
|
||||||
|
|
||||||
category_alerts = defaultdict(list)
|
|
||||||
for (category, test_suite, test_name, alert) in alerts:
|
|
||||||
category_alerts[category].append(
|
|
||||||
f" *{test_suite}/{test_name}* failed: {alert}"
|
|
||||||
)
|
|
||||||
|
|
||||||
for category, alert_list in category_alerts.items():
|
|
||||||
markdown_lines.append(f"Branch: *{category}*")
|
|
||||||
markdown_lines.extend(alert_list)
|
|
||||||
markdown_lines.append("")
|
|
||||||
|
|
||||||
total_non_alerts = sum(n for n in non_alerts.values())
|
|
||||||
non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]
|
|
||||||
|
|
||||||
markdown_lines += [
|
|
||||||
f"Additionally, {total_non_alerts} tests passed successfully "
|
|
||||||
f"({', '.join(non_alert_detail)})."
|
|
||||||
]
|
|
||||||
|
|
||||||
slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]
|
|
||||||
|
|
||||||
resp = requests.post(
|
|
||||||
slack_url,
|
|
||||||
json={
|
|
||||||
"text": "\n".join(markdown_lines),
|
|
||||||
"channel": channel,
|
|
||||||
"username": "Fail Bot",
|
|
||||||
"icon_emoji": ":red_circle:",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
print(resp.status_code)
|
|
||||||
print(resp.text)
|
|
||||||
|
|
||||||
|
|
||||||
def post_statistics_to_slack(
|
|
||||||
channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
|
|
||||||
):
|
|
||||||
total_alerts = len(alerts)
|
|
||||||
|
|
||||||
category_alerts = defaultdict(list)
|
|
||||||
for (category, test_suite, test_name, alert) in alerts:
|
|
||||||
category_alerts[category].append(f"`{test_suite}/{test_name}`")
|
|
||||||
|
|
||||||
alert_detail = [f"{len(a)} on {c}" for c, a in category_alerts.items()]
|
|
||||||
|
|
||||||
total_non_alerts = sum(n for n in non_alerts.values())
|
|
||||||
non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]
|
|
||||||
|
|
||||||
markdown_lines = [
|
|
||||||
"*Periodic release test report*",
|
|
||||||
"",
|
|
||||||
f"In the past 24 hours, "
|
|
||||||
f"*{total_non_alerts}* release tests finished successfully, and "
|
|
||||||
f"*{total_alerts}* release tests failed.",
|
|
||||||
]
|
|
||||||
|
|
||||||
markdown_lines.append("")
|
|
||||||
|
|
||||||
if total_alerts:
|
|
||||||
markdown_lines.append(f"*Failing:* {', '.join(alert_detail)}")
|
|
||||||
for c, a in category_alerts.items():
|
|
||||||
markdown_lines.append(f" *{c}*: {', '.join(sorted(a))}")
|
|
||||||
else:
|
|
||||||
markdown_lines.append("*Failing:* None")
|
|
||||||
|
|
||||||
markdown_lines.append("")
|
|
||||||
|
|
||||||
if total_non_alerts:
|
|
||||||
markdown_lines.append(f"*Passing:* {', '.join(non_alert_detail)}")
|
|
||||||
else:
|
|
||||||
markdown_lines.append("*Passing:* None")
|
|
||||||
|
|
||||||
slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]
|
|
||||||
|
|
||||||
resp = requests.post(
|
|
||||||
slack_url,
|
|
||||||
json={
|
|
||||||
"text": "\n".join(markdown_lines),
|
|
||||||
"channel": channel,
|
|
||||||
"username": "Fail Bot",
|
|
||||||
"icon_emoji": ":red_circle:",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
print(resp.status_code)
|
|
||||||
print(resp.text)
|
|
||||||
|
|
||||||
|
|
||||||
def handle_results_and_get_alerts(
|
|
||||||
rds_data_client,
|
|
||||||
fetch_since: Optional[datetime.datetime] = None,
|
|
||||||
always_try_alert: bool = False,
|
|
||||||
no_status_update: bool = False,
|
|
||||||
):
|
|
||||||
# First build a map of last notifications
|
|
||||||
last_notifications_map = {}
|
|
||||||
for (
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
last_result_hash,
|
|
||||||
last_notification_dt,
|
|
||||||
) in fetch_latest_alerts(rds_data_client):
|
|
||||||
last_notifications_map[(category, test_suite, test_name)] = (
|
|
||||||
last_result_hash,
|
|
||||||
last_notification_dt,
|
|
||||||
)
|
|
||||||
|
|
||||||
alerts = []
|
|
||||||
non_alerts = Counter()
|
|
||||||
|
|
||||||
# Then fetch latest results
|
|
||||||
for (
|
|
||||||
result_hash,
|
|
||||||
created_on,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
status,
|
|
||||||
results,
|
|
||||||
artifacts,
|
|
||||||
last_logs,
|
|
||||||
) in fetch_latest_results(rds_data_client, fetch_since=fetch_since):
|
|
||||||
key = (category, test_suite, test_name)
|
|
||||||
|
|
||||||
try_alert = always_try_alert
|
|
||||||
if key in last_notifications_map:
|
|
||||||
# If we have an alert for this key, fetch info
|
|
||||||
last_result_hash, last_notification_dt = last_notifications_map[key]
|
|
||||||
|
|
||||||
if last_result_hash != result_hash:
|
|
||||||
# If we got a new result, handle new result
|
|
||||||
try_alert = True
|
|
||||||
# Todo: maybe alert again after some time?
|
|
||||||
else:
|
|
||||||
try_alert = True
|
|
||||||
|
|
||||||
if try_alert:
|
|
||||||
handle_fn = SUITE_TO_FN.get(test_suite, None)
|
|
||||||
if not handle_fn:
|
|
||||||
logger.warning(f"No handle for suite {test_suite}")
|
|
||||||
alert = default_handle_result(
|
|
||||||
created_on,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
status,
|
|
||||||
results,
|
|
||||||
artifacts,
|
|
||||||
last_logs,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
alert = handle_fn(
|
|
||||||
created_on,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
status,
|
|
||||||
results,
|
|
||||||
artifacts,
|
|
||||||
last_logs,
|
|
||||||
)
|
|
||||||
|
|
||||||
if alert:
|
|
||||||
logger.warning(
|
|
||||||
f"Alert raised for test {test_suite}/{test_name} "
|
|
||||||
f"({category}): {alert}"
|
|
||||||
)
|
|
||||||
|
|
||||||
alerts.append((category, test_suite, test_name, alert))
|
|
||||||
else:
|
|
||||||
logger.debug(
|
|
||||||
f"No alert raised for test {test_suite}/{test_name} "
|
|
||||||
f"({category})"
|
|
||||||
)
|
|
||||||
non_alerts[category] += 1
|
|
||||||
|
|
||||||
if not no_status_update:
|
|
||||||
mark_as_handled(
|
|
||||||
rds_data_client,
|
|
||||||
key in last_notifications_map,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
result_hash,
|
|
||||||
datetime.datetime.now(),
|
|
||||||
)
|
|
||||||
|
|
||||||
return alerts, non_alerts
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"--stats",
|
|
||||||
action="store_true",
|
|
||||||
default=False,
|
|
||||||
help="Finish quickly for training.",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
maybe_fetch_slack_webhook()
|
|
||||||
|
|
||||||
rds_data_client = boto3.client("rds-data", region_name="us-west-2")
|
|
||||||
|
|
||||||
if args.stats:
|
|
||||||
# Only update last 24 hour stats
|
|
||||||
fetch_since = datetime.datetime.now() - datetime.timedelta(days=1)
|
|
||||||
alerts, non_alerts = handle_results_and_get_alerts(
|
|
||||||
rds_data_client,
|
|
||||||
fetch_since=fetch_since,
|
|
||||||
always_try_alert=True,
|
|
||||||
no_status_update=True,
|
|
||||||
)
|
|
||||||
post_statistics_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)
|
|
||||||
|
|
||||||
else:
|
|
||||||
alerts, non_alerts = handle_results_and_get_alerts(rds_data_client)
|
|
||||||
post_alerts_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)
|
|
|
@ -1,145 +0,0 @@
|
||||||
- name: single_node
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: single_node.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 12000
|
|
||||||
prepare: sleep 0
|
|
||||||
script: python single_node/test_single_node.py
|
|
||||||
|
|
||||||
- name: object_store
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: object_store.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=50
|
|
||||||
script: python object_store/test_object_store.py
|
|
||||||
|
|
||||||
- name: many_actors
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=65
|
|
||||||
script: python distributed/test_many_actors.py
|
|
||||||
|
|
||||||
- name: many_actors_smoke_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed_smoke_test.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=2
|
|
||||||
script: SMOKE_TEST=1 python distributed/test_many_actors.py
|
|
||||||
|
|
||||||
- name: many_tasks
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=65
|
|
||||||
script: python distributed/test_many_tasks.py --num-tasks=10000
|
|
||||||
|
|
||||||
- name: many_tasks_smoke_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed_smoke_test.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=2
|
|
||||||
script: python distributed/test_many_tasks.py --num-tasks=100
|
|
||||||
|
|
||||||
- name: many_pgs
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=65
|
|
||||||
script: python distributed/test_many_pgs.py
|
|
||||||
|
|
||||||
- name: many_pgs_smoke_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed_smoke_test.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=2
|
|
||||||
script: SMOKE_TEST=1 python distributed/test_many_pgs.py
|
|
||||||
|
|
||||||
# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
|
|
||||||
- name: many_nodes
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: many_nodes.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=250
|
|
||||||
script: python distributed/test_many_tasks.py --num-tasks=1000
|
|
||||||
|
|
||||||
- name: scheduling_test_many_0s_tasks_single_node
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: scheduling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=32
|
|
||||||
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1
|
|
||||||
|
|
||||||
- name: scheduling_test_many_0s_tasks_many_nodes
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: scheduling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=32
|
|
||||||
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1
|
|
||||||
|
|
||||||
- name: scheduling_test_many_5s_tasks_single_node
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: scheduling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=32
|
|
||||||
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
|
|
||||||
stable: false
|
|
||||||
|
|
||||||
- name: scheduling_test_many_5s_tasks_many_nodes
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: scheduling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=32
|
|
||||||
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
|
|
||||||
stable: false
|
|
|
@ -1,24 +0,0 @@
|
||||||
import click
|
|
||||||
import ray
|
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
def num_alive_nodes():
|
|
||||||
n = 0
|
|
||||||
for node in ray.nodes():
|
|
||||||
if node["Alive"]:
|
|
||||||
n += 1
|
|
||||||
return n
|
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
|
||||||
@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
|
|
||||||
def wait_cluster(num_nodes: int):
|
|
||||||
ray.init(address="auto")
|
|
||||||
while num_alive_nodes() != num_nodes:
|
|
||||||
print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}")
|
|
||||||
time.sleep(5)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
wait_cluster()
|
|
|
@ -1,54 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,214 +0,0 @@
|
||||||
<!doctype html>
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<meta charset="utf-8">
|
|
||||||
<title>Releaser config generator</title>
|
|
||||||
<style type="text/css">
|
|
||||||
html {
|
|
||||||
background: #cccccc;
|
|
||||||
}
|
|
||||||
body {
|
|
||||||
background: #ffffff;
|
|
||||||
font-family: sans-serif;
|
|
||||||
padding: 1em 2em;
|
|
||||||
max-width: 800px;
|
|
||||||
margin: 0 auto;
|
|
||||||
}
|
|
||||||
textarea {
|
|
||||||
width: 600px;
|
|
||||||
height: 200px;
|
|
||||||
}
|
|
||||||
form .use {
|
|
||||||
white-space: nowrap;
|
|
||||||
padding-right: 1em;
|
|
||||||
}
|
|
||||||
form .val {
|
|
||||||
min-width: 300px;
|
|
||||||
}
|
|
||||||
form .val input {
|
|
||||||
width: 90%;
|
|
||||||
}
|
|
||||||
form .desc {
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
<script type="text/javascript">
|
|
||||||
var env_vars = [
|
|
||||||
{
|
|
||||||
"name": "RAY_TEST_REPO",
|
|
||||||
"short": "Git repo with test files",
|
|
||||||
"long": "Repository in which the test files are which you would like to run. Note that this doesn't have to be the same repo from which the wheels are installed.",
|
|
||||||
"default": "https://github.com/ray-project/ray.git",
|
|
||||||
"enabled": false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "RAY_TEST_BRANCH",
|
|
||||||
"short": "Git branch for test repo",
|
|
||||||
"long": "Git branch that is checked out from RAY_TEST_REPO and which contains the test files you would like to run. Note that this doesnt' have to be the same branch you're fetching the Ray wheels from.",
|
|
||||||
"default": "master",
|
|
||||||
"enabled": false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "RAY_REPO",
|
|
||||||
"short": "Git repo for the Ray wheels",
|
|
||||||
"long": "Repository from which to fetch the latest commits to find the Ray wheels",
|
|
||||||
"default": "https://github.com/ray-project/ray.git",
|
|
||||||
"enabled": false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "RAY_BRANCH",
|
|
||||||
"short": "Git branch for the Ray wheels",
|
|
||||||
"long": "Branch that is check out from RAY_REPO from which the latest commits are fetched to find the Ray wheels",
|
|
||||||
"default": "master",
|
|
||||||
"enabled": true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "RELEASE_TEST_SUITE",
|
|
||||||
"short": "Release test suite (nightly/weekly/manual)",
|
|
||||||
"long": "Release test suite as defined in releaser's build_pipeline.py",
|
|
||||||
"default": "nightly",
|
|
||||||
"enabled": true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "FILTER_FILE",
|
|
||||||
"short": "Filter test file by this string",
|
|
||||||
"long": "Only test files (e.g. xgboost_tests.yml) that match this string will be included in the test",
|
|
||||||
"default": "",
|
|
||||||
"enabled": false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "FILTER_TEST",
|
|
||||||
"short": "Filter test name by this string",
|
|
||||||
"long": "Only test names (e.g. tune_4x32) that match this string will be included in the test",
|
|
||||||
"default": "",
|
|
||||||
"enabled": false,
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
window.addEventListener('load', function () {
|
|
||||||
|
|
||||||
var table = document.getElementById("gen_table");
|
|
||||||
|
|
||||||
for (var env_var of env_vars) {
|
|
||||||
|
|
||||||
var use_td = document.createElement("td");
|
|
||||||
use_td.setAttribute("class", "use");
|
|
||||||
|
|
||||||
var use_input = document.createElement("input");
|
|
||||||
use_input.setAttribute("type", "checkbox");
|
|
||||||
use_input.setAttribute("data-activate", env_var["name"] + "_val");
|
|
||||||
use_input.setAttribute("id", env_var["name"] + "_use");
|
|
||||||
use_input.setAttribute("class", "input_use");
|
|
||||||
if (env_var["enabled"]) {
|
|
||||||
use_input.checked = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
var use_label = document.createElement("label");
|
|
||||||
use_label.setAttribute("for", env_var["name"] + "_use");
|
|
||||||
use_label.innerHTML = env_var["name"];
|
|
||||||
|
|
||||||
use_td.append(use_input);
|
|
||||||
use_td.append(use_label);
|
|
||||||
|
|
||||||
val_td = document.createElement("td");
|
|
||||||
val_td.setAttribute("class", "val");
|
|
||||||
|
|
||||||
val_input = document.createElement("input");
|
|
||||||
val_input.setAttribute("type", "text");
|
|
||||||
if (!env_var["enabled"]) {
|
|
||||||
val_input.setAttribute("disabled", "disabled");
|
|
||||||
}
|
|
||||||
val_input.setAttribute("id", env_var["name"] + "_val");
|
|
||||||
val_input.setAttribute("name", env_var["name"]);
|
|
||||||
val_input.setAttribute("value", env_var["default"]);
|
|
||||||
val_input.setAttribute("class", "input_val");
|
|
||||||
|
|
||||||
val_td.append(val_input);
|
|
||||||
|
|
||||||
use_input.addEventListener("click", function(e) {
|
|
||||||
var toggle_val = document.getElementById(e.target.getAttribute("data-activate"))
|
|
||||||
|
|
||||||
if (toggle_val.disabled) {
|
|
||||||
toggle_val.removeAttribute("disabled");
|
|
||||||
} else {
|
|
||||||
toggle_val.setAttribute("disabled", "disabled");
|
|
||||||
}
|
|
||||||
generate_snippet();
|
|
||||||
});
|
|
||||||
|
|
||||||
val_input.addEventListener("change", function() { generate_snippet(); });
|
|
||||||
val_input.addEventListener("keydown", function() { generate_snippet(); });
|
|
||||||
val_input.addEventListener("keyup", function() { generate_snippet(); });
|
|
||||||
|
|
||||||
var desc_td = document.createElement("td");
|
|
||||||
desc_td.setAttribute("class", "desc");
|
|
||||||
|
|
||||||
var desc_a = document.createElement("a");
|
|
||||||
desc_a.setAttribute("title", env_var["long"]);
|
|
||||||
desc_a.innerHTML = env_var["short"];
|
|
||||||
|
|
||||||
desc_td.append(desc_a);
|
|
||||||
|
|
||||||
var tr = document.createElement("tr");
|
|
||||||
tr.append(use_td);
|
|
||||||
tr.append(val_td);
|
|
||||||
tr.append(desc_td);
|
|
||||||
|
|
||||||
table.append(tr);
|
|
||||||
}
|
|
||||||
|
|
||||||
var button = document.getElementById("generate");
|
|
||||||
button.addEventListener("click", function() {
|
|
||||||
generate_snippet();
|
|
||||||
})
|
|
||||||
|
|
||||||
generate_snippet()
|
|
||||||
})
|
|
||||||
|
|
||||||
function generate_snippet() {
|
|
||||||
full_snippet = ""
|
|
||||||
for (env_var of env_vars) {
|
|
||||||
var val_input = document.getElementById(env_var["name"] + "_val")
|
|
||||||
|
|
||||||
if (!val_input.disabled) {
|
|
||||||
full_snippet += env_var["name"] + "=\"" + val_input.value + "\"\n"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
document.getElementById("snippet").innerHTML = full_snippet;
|
|
||||||
}
|
|
||||||
|
|
||||||
</script>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<header class="header">
|
|
||||||
<h1>Releaser config generator</h1>
|
|
||||||
<p>Use this form to generate a list of environment variables.</p>
|
|
||||||
<p>These variables can be passed to Buildkite to run a subset of release tests
|
|
||||||
and choose the correct wheels/release test branch</p>
|
|
||||||
</header>
|
|
||||||
<section class="main">
|
|
||||||
<form id="gen">
|
|
||||||
<table id="gen_table">
|
|
||||||
<tr>
|
|
||||||
<th>Set</th>
|
|
||||||
<th>Value</th>
|
|
||||||
<th>Description</th>
|
|
||||||
</tr>
|
|
||||||
|
|
||||||
</table>
|
|
||||||
|
|
||||||
</form>
|
|
||||||
|
|
||||||
<div>
|
|
||||||
<button id="generate">Generate snippet</button>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div>
|
|
||||||
<textarea id="snippet">
|
|
||||||
|
|
||||||
</textarea>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
2585
release/e2e.py
2585
release/e2e.py
File diff suppressed because it is too large
Load diff
|
@ -1,15 +0,0 @@
|
||||||
- name: horovod_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_master.yaml
|
|
||||||
compute_template: compute_tpl.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 36000
|
|
||||||
prepare: python wait_cluster.py 3 600
|
|
||||||
script: python workloads/horovod_tune_test.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 1800
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,92 +0,0 @@
|
||||||
- name: train_small
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/train_small.py
|
|
||||||
|
|
||||||
- name: train_moderate
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/train_moderate.py
|
|
||||||
|
|
||||||
- name: train_gpu
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_gpu.yaml
|
|
||||||
compute_template: tpl_gpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python workloads/train_gpu.py
|
|
||||||
|
|
||||||
- name: distributed_api_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
results:
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/distributed_api_test.py
|
|
||||||
results: ""
|
|
||||||
|
|
||||||
- name: ft_small_non_elastic
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/ft_small_non_elastic.py
|
|
||||||
results: ""
|
|
||||||
|
|
||||||
- name: tune_small
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/tune_small.py
|
|
||||||
|
|
||||||
- name: tune_32x4
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/tune_32x4.py
|
|
||||||
|
|
||||||
- name: tune_4x32
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/tune_4x32.py
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,13 +0,0 @@
|
||||||
- name: pytorch_pbt_failure
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: compute_tpl.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
script: python workloads/pytorch_pbt_failure.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 3600
|
|
|
@ -1,196 +0,0 @@
|
||||||
- name: actor_deaths
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/actor_deaths.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: apex
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ../rllib_tests/app_config.yaml
|
|
||||||
compute_template: tpl_cpu_3.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: python wait_cluster.py 3 600
|
|
||||||
script: python workloads/apex.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
|
|
||||||
- name: impala
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_np.yaml
|
|
||||||
compute_template: tpl_cpu_1_large.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
script: python workloads/impala.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: many_actor_tasks
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/many_actor_tasks.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
|
|
||||||
- name: many_drivers
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/many_drivers.py --iteration-num=4000
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
|
|
||||||
- name: many_ppo
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ../rllib_tests/app_config.yaml
|
|
||||||
compute_template: many_ppo.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: python wait_cluster.py 1 600
|
|
||||||
script: python workloads/many_ppo.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: many_tasks
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/many_tasks.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: many_tasks_serialized_ids
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/many_tasks_serialized_ids.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
|
|
||||||
- name: node_failures
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/node_failures.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: pbt
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ../rllib_tests/app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/pbt.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: serve
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/serve.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: serve_failure
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/serve_failure.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
stable: False
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,9 +0,0 @@
|
||||||
# - name: microbenchmark
|
|
||||||
# team: core
|
|
||||||
# cluster:
|
|
||||||
# app_config: app_config.yaml
|
|
||||||
# compute_template: tpl_64.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 1800
|
|
||||||
# script: OMP_NUM_THREADS=64 RAY_ADDRESS= python run_microbenchmark.py
|
|
|
@ -1,124 +0,0 @@
|
||||||
- name: horovod_user_test_latest
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: horovod/app_config.yaml
|
|
||||||
compute_template: horovod/compute_tpl.yaml
|
|
||||||
|
|
||||||
|
|
||||||
driver_setup: horovod/driver_setup_latest.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 1200
|
|
||||||
script: python horovod/horovod_user_test.py
|
|
||||||
|
|
||||||
- name: horovod_user_test_master
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ../horovod_tests/app_config_master.yaml
|
|
||||||
compute_template: horovod/compute_tpl.yaml
|
|
||||||
|
|
||||||
driver_setup: horovod/driver_setup_master.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 1200
|
|
||||||
script: python horovod/horovod_user_test.py
|
|
||||||
|
|
||||||
|
|
||||||
- name: train_tensorflow_mnist_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: train/app_config.yaml
|
|
||||||
compute_template: train/compute_tpl.yaml
|
|
||||||
|
|
||||||
driver_setup: train/driver_setup.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 36000
|
|
||||||
script: python train/train_tensorflow_mnist_test.py
|
|
||||||
|
|
||||||
- name: train_torch_linear_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: train/app_config.yaml
|
|
||||||
compute_template: train/compute_tpl.yaml
|
|
||||||
|
|
||||||
driver_setup: train/driver_setup.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 36000
|
|
||||||
script: python train/train_torch_linear_test.py
|
|
||||||
|
|
||||||
|
|
||||||
- name: xgboost_gpu_connect_latest
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: xgboost/app_config_gpu.yaml
|
|
||||||
compute_template: xgboost/tpl_gpu_small_scaling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 1200
|
|
||||||
script: python xgboost/train_gpu_connect.py
|
|
||||||
|
|
||||||
- name: xgboost_gpu_connect_master
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: xgboost/app_config_gpu_master.yaml
|
|
||||||
compute_template: xgboost/tpl_gpu_small_scaling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 1200
|
|
||||||
script: python xgboost/train_gpu_connect.py
|
|
||||||
|
|
||||||
- name: ray_lightning_user_test_latest
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ray-lightning/app_config.yaml
|
|
||||||
compute_template: ray-lightning/compute_tpl.yaml
|
|
||||||
|
|
||||||
driver_setup: ray-lightning/driver_setup.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 1200
|
|
||||||
script: python ray-lightning/ray_lightning_user_test.py
|
|
||||||
|
|
||||||
|
|
||||||
- name: ray_lightning_user_test_master
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ray-lightning/app_config_master.yaml
|
|
||||||
compute_template: ray-lightning/compute_tpl.yaml
|
|
||||||
|
|
||||||
|
|
||||||
driver_setup: ray-lightning/driver_setup.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 1200
|
|
||||||
script: python ray-lightning/ray_lightning_user_test.py
|
|
||||||
|
|
||||||
|
|
||||||
- name: tune_rllib_connect_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ../rllib_tests/app_config.yaml
|
|
||||||
compute_template: tune_rllib/compute_tpl.yaml
|
|
||||||
|
|
||||||
|
|
||||||
driver_setup: tune_rllib/driver_setup.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 1200
|
|
||||||
script: python tune_rllib/run_connect_tests.py
|
|
|
@ -1,64 +0,0 @@
|
||||||
#
|
|
||||||
# Chaos tests.
|
|
||||||
#
|
|
||||||
|
|
||||||
# Run the test that invokes many tasks without object store usage.
|
|
||||||
- name: chaos_many_tasks_no_object_store
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: chaos_test/app_config.yaml
|
|
||||||
compute_template: chaos_test/compute_template.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
|
|
||||||
script: python chaos_test/test_chaos_basic.py --workload=tasks
|
|
||||||
|
|
||||||
- name: chaos_many_actors
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: chaos_test/app_config.yaml
|
|
||||||
compute_template: chaos_test/compute_template.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
|
|
||||||
script: python chaos_test/test_chaos_basic.py --workload=actors
|
|
||||||
|
|
||||||
- name: chaos_dask_on_ray_large_scale_test_no_spilling
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
# Total run time without failures is about 300-400s.
|
|
||||||
prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
|
|
||||||
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
# Test large scale dask on ray test with spilling.
|
|
||||||
- name: chaos_dask_on_ray_large_scale_test_spilling
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
# Total run time without failures is about 300-400s.
|
|
||||||
prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
|
|
||||||
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
- name: chaos_pipelined_ingestion_1500_gb_15_windows
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dataset/pipelined_ingestion_app.yaml
|
|
||||||
compute_template: dataset/pipelined_ingestion_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 21 2400; python setup_chaos.py --node-kill-interval 300
|
|
||||||
script: python dataset/pipelined_training.py --epochs 1 --num-windows 15 --num-files 915 --debug
|
|
||||||
|
|
||||||
stable: false
|
|
|
@ -1,95 +0,0 @@
|
||||||
- name: inference
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: inference.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 2 600
|
|
||||||
script: python inference.py
|
|
||||||
|
|
||||||
- name: shuffle_data_loader
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 1800
|
|
||||||
script: python dataset_shuffle_data_loader.py
|
|
||||||
|
|
||||||
- name: parquet_metadata_resolution
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: pipelined_training_app.yaml
|
|
||||||
compute_template: pipelined_training_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 1200
|
|
||||||
prepare: python wait_cluster.py 15 1200
|
|
||||||
script: python parquet_metadata_resolution.py --num-files 915
|
|
||||||
|
|
||||||
- name: pipelined_training_50_gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: pipelined_training_app.yaml
|
|
||||||
compute_template: pipelined_training_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 4800
|
|
||||||
prepare: python wait_cluster.py 15 1200
|
|
||||||
script: python pipelined_training.py --epochs 1
|
|
||||||
|
|
||||||
- name: pipelined_ingestion_1500_gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: pipelined_ingestion_app.yaml
|
|
||||||
compute_template: pipelined_ingestion_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 9600
|
|
||||||
prepare: python wait_cluster.py 21 2400
|
|
||||||
script: python pipelined_training.py --epochs 2 --num-windows 2 --num-files 915 --debug
|
|
||||||
|
|
||||||
- name: datasets_ingest_train_infer
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: ray_sgd_training_app.yaml
|
|
||||||
compute_template: ray_sgd_training_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 14400
|
|
||||||
prepare: python wait_cluster.py 66 2400
|
|
||||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
app_config: ray_sgd_training_app.yaml
|
|
||||||
compute_template: ray_sgd_training_smoke_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 8 2400
|
|
||||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
|
|
||||||
|
|
||||||
- name: datasets_preprocess_ingest
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: ray_sgd_training_app.yaml
|
|
||||||
compute_template: ray_sgd_training_compute_no_gpu.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 21 2400
|
|
||||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset --debug
|
|
||||||
|
|
||||||
- name: datasets_ingest_400G
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: ray_sgd_training_app.yaml
|
|
||||||
compute_template: dataset_ingest_400G_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,390 +0,0 @@
|
||||||
#
|
|
||||||
# Single node shuffle
|
|
||||||
#
|
|
||||||
# Test basic single node 10GB shuffle with a small number of partitions.
|
|
||||||
# This doesn't require object spilling.
|
|
||||||
# - name: shuffle_10gb
|
|
||||||
# team: core
|
|
||||||
# cluster:
|
|
||||||
# app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
# compute_template: shuffle/shuffle_compute_single.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 3000
|
|
||||||
# script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6
|
|
||||||
|
|
||||||
# Test single node 50GB shuffle with a large number of partitions.
|
|
||||||
- name: shuffle_50gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_single.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9
|
|
||||||
|
|
||||||
# Test single node 50GB shuffle with a large number of partitions.
|
|
||||||
- name: shuffle_50gb_large_partition
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_single.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6
|
|
||||||
|
|
||||||
# Test non streaming shuffle in a single node with a small number of partition.
|
|
||||||
- name: non_streaming_shuffle_50gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_single.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming
|
|
||||||
|
|
||||||
# Test non streaming shuffle in a single node with a large number of partition.
|
|
||||||
- name: non_streaming_shuffle_50gb_large_partition
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_single.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming
|
|
||||||
|
|
||||||
- name: dask_on_ray_10gb_sort
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
|
|
||||||
|
|
||||||
- name: dask_on_ray_100gb_sort
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
|
|
||||||
|
|
||||||
#
|
|
||||||
# Multi node shuffle
|
|
||||||
#
|
|
||||||
|
|
||||||
# Test multi nodes 100GB shuffle with a small number of partitions.
|
|
||||||
- name: shuffle_100gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
|
|
||||||
|
|
||||||
# Test non streaming multi nodes 100GB shuffle with a small number of partitions.
|
|
||||||
- name: non_streaming_shuffle_100gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
|
|
||||||
|
|
||||||
# Test autoscaling 1TB streaming shuffle with a large number of partitions.
|
|
||||||
- name: autoscaling_shuffle_1tb_1000_partitions
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_autoscaling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 4000
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
|
|
||||||
|
|
||||||
# Test multi nodes 1TB streaming shuffle with a large number of partitions.
|
|
||||||
- name: shuffle_1tb_1000_partition
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
prepare: python wait_cluster.py 20 900
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
|
|
||||||
|
|
||||||
# Test multi nodes 1TB non streaming shuffle with a large number of partitions.
|
|
||||||
- name: non_streaming_shuffle_1tb_1000_partition
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
prepare: python wait_cluster.py 20 900
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
|
|
||||||
|
|
||||||
# Stress test for 1TB multi node streaming shuffle.
|
|
||||||
- name: shuffle_1tb_5000_partitions
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 9000
|
|
||||||
prepare: python wait_cluster.py 20 900
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
|
|
||||||
|
|
||||||
# Stress test for 1TB multi node non-streaming shuffle.
|
|
||||||
# - name: non_streaming_shuffle_1tb_5000_partitions
|
|
||||||
# team: core
|
|
||||||
# stable: False
|
|
||||||
# cluster:
|
|
||||||
# app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
# compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 7200
|
|
||||||
# prepare: python wait_cluster.py 20 900
|
|
||||||
# script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming
|
|
||||||
|
|
||||||
- name: k8s_dask_on_ray_large_scale_test_no_spilling
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_stress_compute_k8s.yaml
|
|
||||||
compute_on_k8s: True
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 21 600
|
|
||||||
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
stable: false
|
|
||||||
|
|
||||||
# # Test large scale dask on ray test without spilling.
|
|
||||||
# - name: dask_on_ray_large_scale_test_no_spilling
|
|
||||||
# team: core
|
|
||||||
# cluster:
|
|
||||||
# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
||||||
# compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 7200
|
|
||||||
# prepare: python wait_cluster.py 21 600
|
|
||||||
# script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
# smoke_test:
|
|
||||||
# cluster:
|
|
||||||
# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
||||||
# compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 7200
|
|
||||||
# prepare: python wait_cluster.py 5 600
|
|
||||||
# script: python dask_on_ray/large_scale_test.py --num_workers 4 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
# Test large scale dask on ray test with spilling.
|
|
||||||
- name: dask_on_ray_large_scale_test_spilling
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 21 600
|
|
||||||
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
# Stress tests with many tasks
|
|
||||||
- name: stress_test_many_tasks
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/stress_tests_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python stress_tests/test_many_tasks.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/smoke_test_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
|
|
||||||
|
|
||||||
# Stress tests with dead actors
|
|
||||||
- name: stress_test_dead_actors
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/stress_tests_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python stress_tests/test_dead_actors.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/smoke_test_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3 --num-children=3
|
|
||||||
|
|
||||||
# Stress tests with placement groups
|
|
||||||
- name: stress_test_placement_group
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/placement_group_tests_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python stress_tests/test_placement_group.py
|
|
||||||
|
|
||||||
# Stress tests with many threaded actors.
|
|
||||||
- name: threaded_actors_stress_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/stress_test_threaded_actor_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 201 600
|
|
||||||
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/smoke_test_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
|
|
||||||
stable: false
|
|
||||||
|
|
||||||
- name: k8s_threaded_actors_stress_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/k8s_stress_test_threaded_actor_compute.yaml
|
|
||||||
compute_on_k8s: True
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 201 600
|
|
||||||
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
|
|
||||||
stable: false
|
|
||||||
|
|
||||||
# Test decision tree on autoscaling compute cluster.
|
|
||||||
- name: decision_tree_autoscaling
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: decision_tree/decision_tree_app_config.yaml
|
|
||||||
compute_template: decision_tree/autoscaling_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
script: python decision_tree/cart_with_tree.py
|
|
||||||
|
|
||||||
# Test 20 concurrent decision tree runs on autoscaling compute cluster.
|
|
||||||
- name: decision_tree_autoscaling_20_runs
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: decision_tree/decision_tree_app_config.yaml
|
|
||||||
compute_template: decision_tree/autoscaling_compute.yaml
|
|
||||||
run:
|
|
||||||
timeout: 9600
|
|
||||||
script: python decision_tree/cart_with_tree.py --concurrency=20
|
|
||||||
|
|
||||||
- name: dask_on_ray_1tb_sort
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/1tb_sort_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 32 1000
|
|
||||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
|
|
||||||
|
|
||||||
- name: many_nodes_actor_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: many_nodes_tests/app_config.yaml
|
|
||||||
compute_template: many_nodes_tests/compute_config.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 251 5400
|
|
||||||
script: python many_nodes_tests/actor_test.py
|
|
||||||
|
|
||||||
- name: pg_autoscaling_regression_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: placement_group_tests/app_config.yaml
|
|
||||||
compute_template: placement_group_tests/compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 1200
|
|
||||||
script: python placement_group_tests/pg_run.py
|
|
||||||
|
|
||||||
- name: pg_long_running_performance_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: placement_group_tests/app_config.yaml
|
|
||||||
compute_template: placement_group_tests/long_running_test_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 2 600
|
|
||||||
script: python placement_group_tests/long_running_performance_test.py --num-stages 2000
|
|
||||||
|
|
||||||
- name: placement_group_performance_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: placement_group_tests/app_config.yaml
|
|
||||||
compute_template: placement_group_tests/pg_perf_test_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 1200
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python placement_group_tests/placement_group_performance_test.py
|
|
|
@ -1,54 +0,0 @@
|
||||||
"""Block until a Ray cluster reaches a target node count.

Usage: python wait_cluster.py <num_nodes> <max_time_s> [--feedback_interval_s N]

Polls ``ray.nodes()`` every 5 seconds and raises RuntimeError if the cluster
does not reach the requested size within ``max_time_s`` seconds.
"""
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)
args = parser.parse_args()

start_time = time.time()
deadline = start_time + args.max_time_s
next_report = start_time
node_count = 0

# Poll until enough nodes have registered with the cluster.
while node_count < args.num_nodes:
    now = time.time()

    if now >= deadline:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{node_count}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_report:
        elapsed = now - start_time
        print(
            f"Waiting for more nodes to come up: "
            f"{node_count}/{args.num_nodes} "
            f"({elapsed:.0f} seconds passed)"
        )
        next_report = now + args.feedback_interval_s

    time.sleep(5)
    node_count = len(ray.nodes())

elapsed = time.time() - start_time
print(
    f"Cluster is up: {node_count}/{args.num_nodes} nodes online after "
    f"{elapsed:.0f} seconds"
)
|
|
|
@ -1,103 +0,0 @@
|
||||||
# Heavy learning tests (Atari and HalfCheetah) for major algos.
|
|
||||||
- name: learning_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 8gpus_64cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 14400
|
|
||||||
script: python learning_tests/run.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 1200
|
|
||||||
|
|
||||||
# 2-GPU learning tests (CartPole and RepeatAfterMeEnv) for major algos.
|
|
||||||
- name: multi_gpu_learning_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 8gpus_96cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python multi_gpu_learning_tests/run.py
|
|
||||||
|
|
||||||
# 2-GPU learning tests (StatelessCartPole) + use_lstm=True for major algos
|
|
||||||
# (that support RNN models).
|
|
||||||
- name: multi_gpu_with_lstm_learning_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 8gpus_96cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python multi_gpu_with_lstm_learning_tests/run.py
|
|
||||||
|
|
||||||
# 2-GPU learning tests (StatelessCartPole) + use_attention=True for major
|
|
||||||
# algos (that support RNN models).
|
|
||||||
- name: multi_gpu_with_attention_learning_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 8gpus_96cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python multi_gpu_with_attention_learning_tests/run.py
|
|
||||||
|
|
||||||
# We'll have these as per-PR tests soon.
|
|
||||||
# - name: example_scripts_on_gpu_tests
|
|
||||||
# team: ml
|
|
||||||
# cluster:
|
|
||||||
# app_config: app_config.yaml
|
|
||||||
# compute_template: 1gpu_4cpus.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 7200
|
|
||||||
# script: bash unit_gpu_tests/run.sh
|
|
||||||
|
|
||||||
# IMPALA large machine stress tests (4x Atari).
|
|
||||||
- name: stress_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 4gpus_544_cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 5400
|
|
||||||
prepare: python wait_cluster.py 6 600
|
|
||||||
script: python stress_tests/run_stress_tests.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 2000
|
|
||||||
|
|
||||||
# Tests that exercise auto-scaling and Anyscale connect.
|
|
||||||
- name: connect_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: auto_scale.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 3000
|
|
||||||
script: python connect_tests/run_connect_tests.py
|
|
||||||
|
|
||||||
# Nightly performance regression for popular algorithms.
|
|
||||||
# These algorithms run nightly for pre-determined amount of time without
|
|
||||||
# passing criteria.
|
|
||||||
# Performance metrics, such as reward achieved and throughput, are then
|
|
||||||
# collected and tracked over time.
|
|
||||||
- name: performance_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 12gpus_192cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 10800
|
|
||||||
script: python performance_tests/run.py
|
|
|
@ -1,53 +0,0 @@
|
||||||
"""Block until a Ray cluster reaches a target node count.

Usage: python wait_cluster.py <num_nodes> <max_time_s> [--feedback_interval_s N]

Polls ``ray.nodes()`` every 5 seconds and raises RuntimeError if the cluster
does not reach the requested size within ``max_time_s`` seconds.
"""
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)
args = parser.parse_args()

start_time = time.time()
deadline = start_time + args.max_time_s
next_report = start_time
node_count = 0

# Poll until enough nodes have registered with the cluster.
while node_count < args.num_nodes:
    now = time.time()

    if now >= deadline:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{node_count}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_report:
        elapsed = now - start_time
        print(
            f"Waiting for more nodes to come up: "
            f"{node_count}/{args.num_nodes} "
            f"({elapsed:.0f} seconds passed)"
        )
        next_report = now + args.feedback_interval_s

    time.sleep(5)
    node_count = len(ray.nodes())

elapsed = time.time() - start_time
print(
    f"Cluster is up: {node_count}/{args.num_nodes} nodes online after "
    f"{elapsed:.0f} seconds"
)
|
|
|
@ -1,176 +0,0 @@
|
||||||
#!/bin/bash
# Release-test entry point: installs dependencies, clones the test repo,
# then runs e2e.py, automatically retrying on infra-related exit codes
# (prepare timeout, session timeout, prepare error).

set -ex

cd "${0%/*}" || exit 1

# Translate an e2e.py exit code into a human-readable reason string.
reason() {
    # Keep in sync with e2e.py ExitCode enum
    case $1 in
        0) REASON="success" ;;
        2) REASON="unspecified" ;;
        3) REASON="unknown" ;;
        4) REASON="runtime error" ;;
        5) REASON="command error" ;;
        6) REASON="command timeout" ;;
        7) REASON="prepare timeout" ;;
        8) REASON="filesync timeout" ;;
        9) REASON="session timeout" ;;
        10) REASON="prepare error" ;;
        11) REASON="app config build error" ;;
        12) REASON="infra error" ;;
        *) REASON="untracked error" ;;
    esac
    echo "${REASON}"
}

# Consume the flags we understand; everything after the first unknown
# argument is forwarded verbatim to e2e.py via "$@".
while [[ $# -gt 0 ]]
do
    key="$1"
    case $key in
        --ray-repo)
            shift
            RAY_REPO=$1
            ;;
        --ray-branch)
            shift
            RAY_BRANCH=$1
            ;;
        --ray-version)
            shift
            RAY_VERSION=$1
            ;;
        --ray-wheels)
            shift
            RAY_WHEELS=$1
            ;;
        --ray-test-repo)
            shift
            RAY_TEST_REPO=$1
            ;;
        --ray-test-branch)
            shift
            RAY_TEST_BRANCH=$1
            ;;
        --release-results-dir)
            shift
            RELEASE_RESULTS_DIR=$1
            ;;
        *)
            break
    esac
    shift
done

# Fall back to defaults for anything not provided on the command line.
RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}

export RAY_REPO RAY_BRANCH RAY_VERSION RAY_WHEELS RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR

pip uninstall -q -y ray
pip install -q -r requirements.txt
pip install -q -U boto3 botocore
git clone -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" ~/ray

RETRY_NUM=0
MAX_RETRIES=${MAX_RETRIES-3}

if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
    echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
    MAX_RETRIES=1
fi

ALL_EXIT_CODES=()
while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
    RETRY_NUM=$((RETRY_NUM + 1))

    if [ "$RETRY_NUM" -gt 1 ]; then
        # Back off for a random delay (1800-7199 seconds) before retrying.
        SLEEP_TIME=$((1800 + RANDOM % 5400))
        echo "----------------------------------------"
        echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
        echo "----------------------------------------"
        sleep ${SLEEP_TIME}
    fi

    # Start each attempt from a clean results directory.
    sudo rm -rf "${RELEASE_RESULTS_DIR}"/* || true

    python e2e.py "$@"
    EXIT_CODE=$?
    REASON=$(reason "${EXIT_CODE}")
    ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE

    case ${EXIT_CODE} in
        0)
            echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
            break
            ;;
        7 | 9 | 10)
            # Infra-flavored failures: loop around and retry.
            echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
            ;;
        *)
            echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
            break
            ;;
    esac

done

# Publish the last attempt's artifacts to the well-known location.
sudo rm -rf /tmp/ray_release_test_artifacts/* || true
sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true

echo "----------------------------------------"
echo "e2e test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
echo "Run results:"

COUNTER=1
for EX in "${ALL_EXIT_CODES[@]}"; do
    REASON=$(reason "${EX}")
    echo "  Run $COUNTER: Exit code = ${EX} (${REASON})"
    COUNTER=$((COUNTER + 1))
done

echo "----------------------------------------"

REASON=$(reason "${EXIT_CODE}")
echo "Final e2e exit code is ${EXIT_CODE} (${REASON})"

case ${EXIT_CODE} in
    0)
        ;;
    7 | 9 | 10)
        echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."
        ;;
    *)
        echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
        ;;
esac

exit $EXIT_CODE
|
|
|
@ -1,34 +0,0 @@
|
||||||
- name: rte_many_tasks_actors
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: rte_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/rte_many_tasks_actors.py
|
|
||||||
|
|
||||||
- name: wheel_urls
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: rte_minimal.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 9000 # 2h30m
|
|
||||||
prepare: python wait_cluster.py 1 600
|
|
||||||
script: python workloads/wheel_urls.py
|
|
||||||
|
|
||||||
- name: rte_ray_client
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: rte_minimal.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 1 600
|
|
||||||
script: python workloads/rte_ray_client.py
|
|
|
@ -1,53 +0,0 @@
|
||||||
"""Block until a Ray cluster reaches a target node count.

Usage: python wait_cluster.py <num_nodes> <max_time_s> [--feedback_interval_s N]

Polls ``ray.nodes()`` every 5 seconds and raises RuntimeError if the cluster
does not reach the requested size within ``max_time_s`` seconds.
"""
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)
args = parser.parse_args()

start_time = time.time()
deadline = start_time + args.max_time_s
next_report = start_time
node_count = 0

# Poll until enough nodes have registered with the cluster.
while node_count < args.num_nodes:
    now = time.time()

    if now >= deadline:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{node_count}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_report:
        elapsed = now - start_time
        print(
            f"Waiting for more nodes to come up: "
            f"{node_count}/{args.num_nodes} "
            f"({elapsed:.0f} seconds passed)"
        )
        next_report = now + args.feedback_interval_s

    time.sleep(5)
    node_count = len(ray.nodes())

elapsed = time.time() - start_time
print(
    f"Cluster is up: {node_count}/{args.num_nodes} nodes online after "
    f"{elapsed:.0f} seconds"
)
|
|
|
@ -1,101 +0,0 @@
|
||||||
- name: single_deployment_1k_noop_replica
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: compute_tpl_32_cpu.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/single_deployment_1k_noop_replica.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: multi_deployment_1k_noop_replica
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: compute_tpl_32_cpu.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/multi_deployment_1k_noop_replica.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: autoscaling_single_deployment
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: compute_tpl_8_cpu_autoscaling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/autoscaling_single_deployment.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: autoscaling_multi_deployment
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: compute_tpl_8_cpu_autoscaling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/autoscaling_multi_deployment.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: serve_micro_benchmark
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
# 16 CPUS
|
|
||||||
compute_template: compute_tpl_single_node.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/serve_micro_benchmark.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: serve_micro_benchmark_k8s
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
# 16 CPUS
|
|
||||||
compute_template: compute_tpl_single_node_k8s.yaml
|
|
||||||
compute_on_k8s: True
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/serve_micro_benchmark.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: serve_cluster_fault_tolerance
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
# 16 CPUS
|
|
||||||
compute_template: compute_tpl_single_node.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/serve_cluster_fault_tolerance.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
|
@ -1,11 +0,0 @@
|
||||||
# Test multi-node, multi-GPU Ray SGD example.
- name: sgd_gpu
  team: ml
  cluster:
    app_config: sgd_gpu/sgd_gpu_app_config.yaml
    compute_template: sgd_gpu/sgd_gpu_compute.yaml

  run:
    timeout: 3000
    prepare: python wait_cluster.py 2 600
    script: python sgd_gpu/sgd_gpu_test.py --num-workers=2 --use-gpu --address=auto
|
|
@ -1,53 +0,0 @@
|
||||||
"""Block until a Ray cluster reaches a target node count.

Usage: python wait_cluster.py <num_nodes> <max_time_s> [--feedback_interval_s N]

Polls ``ray.nodes()`` every 5 seconds and raises RuntimeError if the cluster
does not reach the requested size within ``max_time_s`` seconds.
"""
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)
args = parser.parse_args()

start_time = time.time()
deadline = start_time + args.max_time_s
next_report = start_time
node_count = 0

# Poll until enough nodes have registered with the cluster.
while node_count < args.num_nodes:
    now = time.time()

    if now >= deadline:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{node_count}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_report:
        elapsed = now - start_time
        print(
            f"Waiting for more nodes to come up: "
            f"{node_count}/{args.num_nodes} "
            f"({elapsed:.0f} seconds passed)"
        )
        next_report = now + args.feedback_interval_s

    time.sleep(5)
    node_count = len(ray.nodes())

elapsed = time.time() - start_time
print(
    f"Cluster is up: {node_count}/{args.num_nodes} nodes online after "
    f"{elapsed:.0f} seconds"
)
|
|
|
@ -1,27 +0,0 @@
|
||||||
# Specify the test owners (teams) here.
# The root key should be the name of the test yaml file without the .yaml.
# To specify owners of subtests, use a sub dict (see e.g. long_running_tests).
golden_notebook_tests: ml
horovod_tests: ml
lightgbm_tests: ml
long_running_distributed_tests: ml
long_running_tests:
  actor_deaths: core
  apex: ml
  impala: ml
  many_actor_tasks: core
  many_drivers: core
  many_ppo: core
  many_tasks: core
  many_tasks_serialized_ids: core
  node_failures: core
  pbt: ml
  serve: serve
  serve_failure: serve
microbenchmark: core
nightly_tests: core
rllib_tests: ml
runtime_env_tests: serve
serve_tests: serve
sgd_tests: ml
xgboost_tests: ml
|
|
@ -1,118 +0,0 @@
|
||||||
- name: aws_no_sync_down
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py no_sync_down
|
|
||||||
|
|
||||||
- name: aws_ssh_sync
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py ssh_sync
|
|
||||||
|
|
||||||
- name: aws_durable_upload
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload
|
|
||||||
|
|
||||||
- name: aws_durable_upload_rllib_str
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_ml.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str
|
|
||||||
|
|
||||||
- name: aws_durable_upload_rllib_trainer
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_ml.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer
|
|
||||||
|
|
||||||
- name: aws_no_durable_upload
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload
|
|
||||||
|
|
||||||
- name: gcp_k8s_no_sync_down
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
|
||||||
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 600
|
|
||||||
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
|
|
||||||
script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8
|
|
||||||
|
|
||||||
- name: gcp_k8s_ssh_sync
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
|
||||||
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 600
|
|
||||||
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
|
|
||||||
script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8
|
|
||||||
|
|
||||||
- name: gcp_k8s_durable_upload
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
|
||||||
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 600
|
|
||||||
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
|
|
||||||
script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
|
|
||||||
|
|
||||||
|
|
||||||
- name: gcp_k8s_no_durable_upload
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
|
||||||
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 600
|
|
||||||
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
|
|
||||||
script: python workloads/run_cloud_test.py no_durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
|
|
|
@ -1,54 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,90 +0,0 @@
|
||||||
- name: bookkeeping_overhead
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_1x16.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 1200
|
|
||||||
script: python workloads/test_bookkeeping_overhead.py
|
|
||||||
|
|
||||||
|
|
||||||
- name: durable_trainable
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_16x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 16 600
|
|
||||||
script: python workloads/test_durable_trainable.py --bucket data-test-ilr
|
|
||||||
|
|
||||||
- name: long_running_large_checkpoints
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_1x32_hd.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
script: python workloads/test_long_running_large_checkpoints.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
|
|
||||||
- name: network_overhead
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_100x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare_timeout: 1200
|
|
||||||
prepare: python wait_cluster.py 100 1200
|
|
||||||
script: python workloads/test_network_overhead.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
compute_template: tpl_20x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 400
|
|
||||||
prepare_timeout: 600
|
|
||||||
prepare: python wait_cluster.py 20 600
|
|
||||||
|
|
||||||
- name: result_throughput_cluster
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_16x64.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 16 600
|
|
||||||
script: python workloads/test_result_throughput_cluster.py
|
|
||||||
|
|
||||||
- name: result_throughput_single_node
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_1x96.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
script: python workloads/test_result_throughput_single_node.py
|
|
||||||
|
|
||||||
- name: xgboost_sweep
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_data.yaml
|
|
||||||
compute_template: tpl_16x64.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 16 600
|
|
||||||
script: python workloads/test_xgboost_sweep.py
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,104 +0,0 @@
|
||||||
- name: train_small
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/train_small.py
|
|
||||||
|
|
||||||
- name: train_moderate
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/train_moderate.py
|
|
||||||
|
|
||||||
- name: train_gpu
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_gpu.yaml
|
|
||||||
compute_template: tpl_gpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python workloads/train_gpu.py
|
|
||||||
|
|
||||||
- name: distributed_api_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
results:
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/distributed_api_test.py
|
|
||||||
results: ""
|
|
||||||
|
|
||||||
- name: ft_small_elastic
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/ft_small_elastic.py
|
|
||||||
results: ""
|
|
||||||
|
|
||||||
- name: ft_small_non_elastic
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/ft_small_non_elastic.py
|
|
||||||
results: ""
|
|
||||||
|
|
||||||
- name: tune_small
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/tune_small.py
|
|
||||||
|
|
||||||
- name: tune_32x4
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/tune_32x4.py
|
|
||||||
|
|
||||||
- name: tune_4x32
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/tune_4x32.py
|
|
Loading…
Add table
Reference in a new issue