diff --git a/benchmarks/benchmark_tests.yaml b/benchmarks/benchmark_tests.yaml
deleted file mode 100644
index a89e3deb9..000000000
--- a/benchmarks/benchmark_tests.yaml
+++ /dev/null
@@ -1,145 +0,0 @@
-- name: single_node
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: single_node.yaml
-
- run:
- timeout: 12000
- prepare: sleep 0
- script: python single_node/test_single_node.py
-
-- name: object_store
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: object_store.yaml
-
- run:
- timeout: 3600
- prepare: python distributed/wait_cluster.py --num-nodes=50
- script: python object_store/test_object_store.py
-
-- name: many_actors
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=65
- script: python distributed/test_many_actors.py
-
-- name: many_actors_smoke_test
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed_smoke_test.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=2
- script: SMOKE_TEST=1 python distributed/test_many_actors.py
-
-- name: many_tasks
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=65
- script: python distributed/test_many_tasks.py --num-tasks=10000
-
-- name: many_tasks_smoke_test
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed_smoke_test.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=2
- script: python distributed/test_many_tasks.py --num-tasks=100
-
-- name: many_pgs
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=65
- script: python distributed/test_many_pgs.py
-
-- name: many_pgs_smoke_test
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed_smoke_test.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=2
- script: SMOKE_TEST=1 python distributed/test_many_pgs.py
-
-# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
-- name: many_nodes
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: many_nodes.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=250
- script: python distributed/test_many_tasks.py --num-tasks=1000
-
-- name: scheduling_test_many_0s_tasks_single_node
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: scheduling.yaml
-
- run:
- timeout: 3600
- prepare: python distributed/wait_cluster.py --num-nodes=32
- script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1
-
-- name: scheduling_test_many_0s_tasks_many_nodes
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: scheduling.yaml
-
- run:
- timeout: 3600
- prepare: python distributed/wait_cluster.py --num-nodes=32
- script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1
-
-- name: scheduling_test_many_5s_tasks_single_node
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: scheduling.yaml
-
- run:
- timeout: 3600
- prepare: python distributed/wait_cluster.py --num-nodes=32
- script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
- stable: false
-
-- name: scheduling_test_many_5s_tasks_many_nodes
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: scheduling.yaml
-
- run:
- timeout: 3600
- prepare: python distributed/wait_cluster.py --num-nodes=32
- script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
- stable: false
diff --git a/benchmarks/distributed/wait_cluster.py b/benchmarks/distributed/wait_cluster.py
deleted file mode 100644
index 12a8a1677..000000000
--- a/benchmarks/distributed/wait_cluster.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import click
-import ray
-import time
-
-
-def num_alive_nodes():
- n = 0
- for node in ray.nodes():
- if node["Alive"]:
- n += 1
- return n
-
-
-@click.command()
-@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
-def wait_cluster(num_nodes: int):
- ray.init(address="auto")
- while num_alive_nodes() != num_nodes:
- print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}")
- time.sleep(5)
-
-
-if __name__ == "__main__":
- wait_cluster()
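
Note that the helper above loops until the alive-node count matches exactly, with no upper bound on how long it waits. A bounded variant, in the spirit of the release/benchmarks/wait_cluster.py script removed further down in this diff, could look like the following sketch; the --max-time-s option and its default are illustrative and were not part of the removed file:

    import time

    import click
    import ray


    def num_alive_nodes() -> int:
        # Count the nodes the cluster currently reports as alive.
        return sum(1 for node in ray.nodes() if node["Alive"])


    @click.command()
    @click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
    @click.option("--max-time-s", default=600, type=int, help="Give up after this many seconds")
    def wait_cluster(num_nodes: int, max_time_s: int):
        ray.init(address="auto")
        deadline = time.time() + max_time_s
        while num_alive_nodes() < num_nodes:
            if time.time() > deadline:
                raise RuntimeError(
                    f"Only {num_alive_nodes()}/{num_nodes} nodes came up "
                    f"within {max_time_s} seconds."
                )
            print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}")
            time.sleep(5)


    if __name__ == "__main__":
        wait_cluster()
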
diff --git a/release/.buildkite/build_pipeline.py b/release/.buildkite/build_pipeline.py
deleted file mode 100644
index 4c0e09099..000000000
--- a/release/.buildkite/build_pipeline.py
+++ /dev/null
@@ -1,680 +0,0 @@
-import copy
-import logging
-import os
-import re
-import sys
-
-import yaml
-
-# If you update or reorganize the periodic tests, please ensure the
-# relevant portions of the Ray release instructions (go/release-ray)
-# (in particular, running periodic tests and collecting release logs)
-# are up to date. If you need access, please contact @zhe-thoughts.
-
-# Env variables:
-
-# RAY_REPO Repo to use for finding the wheel
-# RAY_BRANCH Branch to find the wheel
-# RAY_VERSION Version to find the wheel
-# RAY_WHEELS Direct Ray wheel URL
-# RAY_TEST_REPO Repo to use for test scripts
-# RAY_TEST_BRANCH Branch for test scripts
-# FILTER_FILE File filter
-# FILTER_TEST Test name filter
-# RELEASE_TEST_SUITE Release test suite (e.g. manual, nightly)
-
-
-class ReleaseTest:
- def __init__(
- self,
- name: str,
- smoke_test: bool = False,
- retry: int = 0,
- ):
- self.name = name
- self.smoke_test = smoke_test
- self.retry = retry
-
- def __str__(self):
- return self.name
-
- def __repr__(self):
- return self.name
-
- def __contains__(self, item):
- return self.name.__contains__(item)
-
- def __iter__(self):
- return iter(self.name)
-
- def __len__(self):
- return len(self.name)
-
-
-class SmokeTest(ReleaseTest):
- def __init__(self, name: str, retry: int = 0):
- super(SmokeTest, self).__init__(name=name, smoke_test=True, retry=retry)
-
-
-CORE_NIGHTLY_TESTS = {
- # "~/ray/release/nightly_tests/nightly_tests.yaml": [
- # "shuffle_10gb",
- # "shuffle_50gb",
- # "shuffle_50gb_large_partition",
- # "shuffle_100gb",
- # "non_streaming_shuffle_100gb",
- # "non_streaming_shuffle_50gb_large_partition",
- # "non_streaming_shuffle_50gb",
- # SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
- # SmokeTest("dask_on_ray_large_scale_test_spilling"),
- # "stress_test_placement_group",
- # "shuffle_1tb_1000_partition",
- # "non_streaming_shuffle_1tb_1000_partition",
- # "shuffle_1tb_5000_partitions",
- # TODO(sang): It doesn't even work without spilling
- # as it hits the scalability limit.
- # "non_streaming_shuffle_1tb_5000_partitions",
- # "decision_tree_autoscaling",
- # "decision_tree_autoscaling_20_runs",
- # "autoscaling_shuffle_1tb_1000_partitions",
- # SmokeTest("stress_test_many_tasks"),
- # SmokeTest("stress_test_dead_actors"),
- # SmokeTest("threaded_actors_stress_test"),
- # "pg_long_running_performance_test",
- # ],
- # "~/ray/benchmarks/benchmark_tests.yaml": [
- # "single_node",
- # "object_store",
- # "many_actors_smoke_test",
- # "many_tasks_smoke_test",
- # "many_pgs_smoke_test",
- # ],
- # "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [
- # "inference",
- # "shuffle_data_loader",
- # "parquet_metadata_resolution",
- # "pipelined_training_50_gb",
- # "pipelined_ingestion_1500_gb",
- # "datasets_preprocess_ingest",
- # "datasets_ingest_400G",
- # SmokeTest("datasets_ingest_train_infer"),
- # ],
- # "~/ray/release/nightly_tests/chaos_test.yaml": [
- # "chaos_many_actors",
- # "chaos_many_tasks_no_object_store",
- # "chaos_pipelined_ingestion_1500_gb_15_windows",
- # ],
- # "~/ray/release/microbenchmark/microbenchmark.yaml": [
- # "microbenchmark",
- # ],
-}
-
-SERVE_NIGHTLY_TESTS = {
- # "~/ray/release/long_running_tests/long_running_tests.yaml": [
- # SmokeTest("serve"),
- # SmokeTest("serve_failure"),
- # ],
- # "~/ray/release/serve_tests/serve_tests.yaml": [
- # "single_deployment_1k_noop_replica",
- # "multi_deployment_1k_noop_replica",
- # "autoscaling_single_deployment",
- # "autoscaling_multi_deployment",
- # "serve_micro_benchmark",
- # # TODO(architkulkarni) Reenable after K8s migration. Currently failing
- # # "serve_micro_benchmark_k8s",
- # "serve_cluster_fault_tolerance",
- # ],
-}
-
-CORE_DAILY_TESTS = {
- # "~/ray/release/nightly_tests/nightly_tests.yaml": [
- # "k8s_dask_on_ray_large_scale_test_no_spilling",
- # "dask_on_ray_large_scale_test_no_spilling",
- # "dask_on_ray_large_scale_test_spilling",
- # "pg_autoscaling_regression_test",
- # "threaded_actors_stress_test",
- # "k8s_threaded_actors_stress_test",
- # "stress_test_many_tasks",
- # "stress_test_dead_actors",
- # ],
- # "~/ray/release/nightly_tests/chaos_test.yaml": [
- # "chaos_dask_on_ray_large_scale_test_no_spilling",
- # "chaos_dask_on_ray_large_scale_test_spilling",
- # ],
-}
-
-CORE_SCALABILITY_TESTS_DAILY = {
- # "~/ray/benchmarks/benchmark_tests.yaml": [
- # "many_actors",
- # "many_tasks",
- # "many_pgs",
- # "many_nodes",
- # ],
-}
-
-CORE_SCHEDULING_DAILY = {
- # "~/ray/benchmarks/benchmark_tests.yaml": [
- # "scheduling_test_many_0s_tasks_single_node",
- # "scheduling_test_many_0s_tasks_many_nodes",
- #     # Re-enable these two once we have the right setup
- # # "scheduling_test_many_5s_tasks_single_node",
- # # "scheduling_test_many_5s_tasks_many_nodes",
- # ],
- # "~/ray/release/nightly_tests/nightly_tests.yaml": [
- # "many_nodes_actor_test",
- # "dask_on_ray_10gb_sort",
- # "dask_on_ray_100gb_sort",
- # "dask_on_ray_1tb_sort",
- # "placement_group_performance_test",
- # ],
-}
-
-NIGHTLY_TESTS = {
- # "~/ray/release/horovod_tests/horovod_tests.yaml": [
- # SmokeTest("horovod_test"),
- # ], # Should we enable this?
- # "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
- # "dask_xgboost_test",
- # "modin_xgboost_test",
- # "torch_tune_serve_test",
- # ],
- # "~/ray/release/long_running_tests/long_running_tests.yaml": [
- # SmokeTest("actor_deaths"),
- # SmokeTest("apex"),
- # SmokeTest("impala"),
- # SmokeTest("many_actor_tasks"),
- # SmokeTest("many_drivers"),
- # SmokeTest("many_ppo"),
- # SmokeTest("many_tasks"),
- # SmokeTest("many_tasks_serialized_ids"),
- # SmokeTest("node_failures"),
- # SmokeTest("pbt"),
- # # SmokeTest("serve"),
- # # SmokeTest("serve_failure"),
- # # Full long running tests (1 day runtime)
- # "actor_deaths",
- # "apex",
- # "impala",
- # "many_actor_tasks",
- # "many_drivers",
- # "many_ppo",
- # "many_tasks",
- # "many_tasks_serialized_ids",
- # "node_failures",
- # "pbt",
- # "serve",
- # "serve_failure",
- # ],
- # "~/ray/release/sgd_tests/sgd_tests.yaml": [
- # "sgd_gpu",
- # ],
- # "~/ray/release/tune_tests/cloud_tests/tune_cloud_tests.yaml": [
- # "aws_no_sync_down",
- # "aws_ssh_sync",
- # "aws_durable_upload",
- # "aws_durable_upload_rllib_str",
- # "aws_durable_upload_rllib_trainer",
- # "gcp_k8s_durable_upload",
- # ],
- # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
- # "bookkeeping_overhead",
- # "durable_trainable",
- # SmokeTest("long_running_large_checkpoints"),
- # SmokeTest("network_overhead"),
- # "result_throughput_cluster",
- # "result_throughput_single_node",
- # ],
- # "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
- # "train_small",
- # "train_moderate",
- # "train_gpu",
- # "tune_small",
- # "tune_4x32",
- # "tune_32x4",
- # "ft_small_elastic",
- # "ft_small_non_elastic",
- # "distributed_api_test",
- # ],
- # "~/ray/release/rllib_tests/rllib_tests.yaml": [
- # SmokeTest("learning_tests"),
- # SmokeTest("stress_tests"),
- # "performance_tests",
- # "multi_gpu_learning_tests",
- # "multi_gpu_with_lstm_learning_tests",
- # "multi_gpu_with_attention_learning_tests",
- # # We'll have these as per-PR tests soon.
- # # "example_scripts_on_gpu_tests",
- # ],
- # "~/ray/release/runtime_env_tests/runtime_env_tests.yaml": [
- # "rte_many_tasks_actors",
- # "wheel_urls",
- # "rte_ray_client",
- # ],
-}
-
-WEEKLY_TESTS = {
- # "~/ray/release/horovod_tests/horovod_tests.yaml": [
- # "horovod_test",
- # ],
- # "~/ray/release/long_running_distributed_tests"
- # "/long_running_distributed.yaml": [
- # "pytorch_pbt_failure",
- # ],
- # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
- # "network_overhead",
- # "long_running_large_checkpoints",
- # "xgboost_sweep",
- # ],
- # "~/ray/release/rllib_tests/rllib_tests.yaml": [
- # "learning_tests",
- # "stress_tests",
- # ],
-}
-
-# This test suite holds "user" tests to test important user workflows
-# in a particular environment.
-# All workloads in this test suite should:
-# 1. Be run in a distributed (multi-node) fashion
-# 2. Use autoscaling/scale up (no wait_cluster.py)
-# 3. Use GPUs if applicable
-# 4. Have the `use_connect` flag set.
-USER_TESTS = {
- # "~/ray/release/ml_user_tests/ml_user_tests.yaml": [
- # "train_tensorflow_mnist_test",
- # "train_torch_linear_test",
- # "ray_lightning_user_test_latest",
- # "ray_lightning_user_test_master",
- # "horovod_user_test_latest",
- # "horovod_user_test_master",
- # "xgboost_gpu_connect_latest",
- # "xgboost_gpu_connect_master",
- # "tune_rllib_connect_test",
- # ]
-}
-
-SUITES = {
- "core-nightly": CORE_NIGHTLY_TESTS,
- "serve-nightly": SERVE_NIGHTLY_TESTS,
- "core-daily": CORE_DAILY_TESTS,
- "core-scalability": CORE_SCALABILITY_TESTS_DAILY,
- "nightly": {**NIGHTLY_TESTS, **USER_TESTS},
- "core-scheduling-daily": CORE_SCHEDULING_DAILY,
- "weekly": WEEKLY_TESTS,
-}
-
-DEFAULT_STEP_TEMPLATE = {
- "env": {
- "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
- "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
- "RELEASE_AWS_BUCKET": "ray-release-automation-results",
- "RELEASE_AWS_LOCATION": "dev",
- "RELEASE_AWS_DB_NAME": "ray_ci",
- "RELEASE_AWS_DB_TABLE": "release_test_result",
- "AWS_REGION": "us-west-2",
- },
- "agents": {"queue": "runner_queue_branch"},
- "plugins": [
- {
- "docker#v3.9.0": {
- "image": "rayproject/ray",
- "propagate-environment": True,
- "volumes": [
- "/tmp/ray_release_test_artifacts:" "/tmp/ray_release_test_artifacts"
- ],
- }
- }
- ],
- "artifact_paths": ["/tmp/ray_release_test_artifacts/**/*"],
-}
-
-
-def ask_configuration():
- RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
- RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
- RAY_VERSION = os.environ.get("RAY_VERSION", "")
- RAY_WHEELS = os.environ.get("RAY_WHEELS", "")
-
- RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
- RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)
-
- RELEASE_TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
- FILTER_FILE = os.environ.get("FILTER_FILE", "")
- FILTER_TEST = os.environ.get("FILTER_TEST", "")
-
- input_ask_step = {
- "input": "Input required: Please specify tests to run",
- "fields": [
- {
- "text": (
- "RAY_REPO: Please specify the Ray repository used "
- "to find the wheel."
- ),
- "hint": (
- "Repository from which to fetch the latest "
- "commits to find the Ray wheels. Usually you don't "
- "need to change this."
- ),
- "default": RAY_REPO,
- "key": "ray_repo",
- },
- {
- "text": (
- "RAY_BRANCH: Please specify the Ray branch used "
- "to find the wheel."
- ),
- "hint": "For releases, this will be e.g. `releases/1.x.0`",
- "default": RAY_BRANCH,
- "key": "ray_branch",
- },
- {
- "text": (
- "RAY_VERSION: Please specify the Ray version used "
- "to find the wheel."
- ),
- "hint": (
- "Leave empty for latest master. For releases, "
- "specify the release version."
- ),
- "required": False,
- "default": RAY_VERSION,
- "key": "ray_version",
- },
- {
- "text": "RAY_WHEELS: Please specify the Ray wheel URL.",
- "hint": (
- "ATTENTION: If you provide this, RAY_REPO, "
- "RAY_BRANCH and RAY_VERSION will be ignored! "
- "Please also make sure to provide the wheels URL "
- "for Python 3.7 on Linux.\n"
- "You can also insert a commit hash here instead "
- "of a full URL.\n"
- "NOTE: You can specify multiple commits or URLs "
- "for easy bisection (one per line) - this will "
- "run each test on each of the specified wheels."
- ),
- "required": False,
- "default": RAY_WHEELS,
- "key": "ray_wheels",
- },
- {
- "text": (
- "RAY_TEST_REPO: Please specify the Ray repository "
- "used to find the tests you would like to run."
- ),
- "hint": (
- "If you're developing a new release test, this "
- "will most likely be your GitHub fork."
- ),
- "default": RAY_TEST_REPO,
- "key": "ray_test_repo",
- },
- {
- "text": (
- "RAY_TEST_BRANCH: Please specify the Ray branch used "
- "to find the tests you would like to run."
- ),
- "hint": (
- "If you're developing a new release test, this "
- "will most likely be a branch living on your "
- "GitHub fork."
- ),
- "default": RAY_TEST_BRANCH,
- "key": "ray_test_branch",
- },
- {
- "select": (
- "RELEASE_TEST_SUITE: Please specify the release "
- "test suite containing the tests you would like "
- "to run."
- ),
- "hint": (
- "Check in the `build_pipeline.py` if you're "
- "unsure which suite contains your tests."
- ),
- "required": True,
- "options": sorted(SUITES.keys()),
- "default": RELEASE_TEST_SUITE,
- "key": "release_test_suite",
- },
- {
- "text": (
- "FILTER_FILE: Please specify a filter for the "
- "test files that should be included in this build."
- ),
- "hint": (
- "Only test files (e.g. xgboost_tests.yml) that "
- "match this string will be included in the test"
- ),
- "default": FILTER_FILE,
- "required": False,
- "key": "filter_file",
- },
- {
- "text": (
- "FILTER_TEST: Please specify a filter for the "
- "test names that should be included in this build."
- ),
- "hint": (
- "Only test names (e.g. tune_4x32) that match "
- "this string will be included in the test"
- ),
- "default": FILTER_TEST,
- "required": False,
- "key": "filter_test",
- },
- ],
- "key": "input_ask_step",
- }
-
- run_again_step = {
- "commands": [
- f'export {v}=$(buildkite-agent meta-data get "{k}")'
- for k, v in {
- "ray_branch": "RAY_BRANCH",
- "ray_repo": "RAY_REPO",
- "ray_version": "RAY_VERSION",
- "ray_wheels": "RAY_WHEELS",
- "ray_test_branch": "RAY_TEST_BRANCH",
- "ray_test_repo": "RAY_TEST_REPO",
- "release_test_suite": "RELEASE_TEST_SUITE",
- "filter_file": "FILTER_FILE",
- "filter_test": "FILTER_TEST",
- }.items()
- ]
- + [
- "export AUTOMATIC=1",
- "python3 -m pip install --user pyyaml",
- "rm -rf ~/ray || true",
- "git clone -b $${RAY_TEST_BRANCH} $${RAY_TEST_REPO} ~/ray",
- (
- "python3 ~/ray/release/.buildkite/build_pipeline.py "
- "| buildkite-agent pipeline upload"
- ),
- ],
- "label": ":pipeline: Again",
- "agents": {"queue": "runner_queue_branch"},
- "depends_on": "input_ask_step",
- "key": "run_again_step",
- }
-
- return [
- input_ask_step,
- run_again_step,
- ]
-
-
-def create_test_step(
- ray_repo: str,
- ray_branch: str,
- ray_version: str,
- ray_wheels: str,
- ray_test_repo: str,
- ray_test_branch: str,
- test_file: str,
- test_name: ReleaseTest,
-):
- custom_commit_str = "custom_wheels_url"
- if ray_wheels:
- # Extract commit from url
- p = re.compile(r"([a-f0-9]{40})")
- m = p.search(ray_wheels)
- if m is not None:
- custom_commit_str = m.group(1)
-
- ray_wheels_str = f" ({ray_wheels}) " if ray_wheels else ""
-
- logging.info(f"Creating step for {test_file}/{test_name}{ray_wheels_str}")
-
- cmd = (
- f"./release/run_e2e.sh "
- f'--ray-repo "{ray_repo}" '
- f'--ray-branch "{ray_branch}" '
- f'--ray-version "{ray_version}" '
- f'--ray-wheels "{ray_wheels}" '
- f'--ray-test-repo "{ray_test_repo}" '
- f'--ray-test-branch "{ray_test_branch}" '
- )
-
- args = (
- f"--category {ray_branch} "
- f"--test-config {test_file} "
- f"--test-name {test_name} "
- f"--keep-results-dir"
- )
-
- if test_name.smoke_test:
- logging.info("This test will run as a smoke test.")
- args += " --smoke-test"
-
- step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
-
- if test_name.retry:
- logging.info(f"This test will be retried up to " f"{test_name.retry} times.")
- step_conf["retry"] = {
- "automatic": [{"exit_status": "*", "limit": test_name.retry}]
- }
- else:
- # Default retry logic
- # Warning: Exit codes are currently not correctly propagated to
- # buildkite! Thus, actual retry logic is currently implemented in
- # the run_e2e.sh script!
- step_conf["retry"] = {
- "automatic": [
- {"exit_status": 7, "limit": 2}, # Prepare timeout
- {"exit_status": 9, "limit": 2}, # Session timeout
- {"exit_status": 10, "limit": 2}, # Prepare error
- ],
- }
-
- step_conf["command"] = cmd + args
-
- step_conf["label"] = (
- f"{test_name} "
- f"({custom_commit_str if ray_wheels_str else ray_branch}) - "
- f"{ray_test_branch}/{ray_test_repo}"
- )
- return step_conf
-
-
-def build_pipeline(steps):
- all_steps = []
-
- RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
- RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
- RAY_VERSION = os.environ.get("RAY_VERSION", "")
- RAY_WHEELS = os.environ.get("RAY_WHEELS", "")
-
- RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
- RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)
-
- FILTER_FILE = os.environ.get("FILTER_FILE", "")
- FILTER_TEST = os.environ.get("FILTER_TEST", "")
-
- ray_wheels_list = [""]
- if RAY_WHEELS:
- ray_wheels_list = RAY_WHEELS.split("\n")
-
- if len(ray_wheels_list) > 1:
- logging.info(
- f"This will run a bisection on the following URLs/commits: "
- f"{ray_wheels_list}"
- )
-
- logging.info(
- f"Building pipeline \n"
- f"Ray repo/branch to test:\n"
- f" RAY_REPO = {RAY_REPO}\n"
- f" RAY_BRANCH = {RAY_BRANCH}\n\n"
- f" RAY_VERSION = {RAY_VERSION}\n\n"
- f" RAY_WHEELS = {RAY_WHEELS}\n\n"
- f"Ray repo/branch containing the test configurations and scripts:\n"
- f" RAY_TEST_REPO = {RAY_TEST_REPO}\n"
- f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"
- f"Filtering for these tests:\n"
- f" FILTER_FILE = {FILTER_FILE}\n"
- f" FILTER_TEST = {FILTER_TEST}\n\n"
- )
-
- for test_file, test_names in steps.items():
- if FILTER_FILE and FILTER_FILE not in test_file:
- continue
-
- test_base = os.path.basename(test_file)
- for test_name in test_names:
- if FILTER_TEST and FILTER_TEST not in test_name:
- continue
-
- if not isinstance(test_name, ReleaseTest):
- test_name = ReleaseTest(name=test_name)
-
- logging.info(f"Adding test: {test_base}/{test_name}")
-
- for ray_wheels in ray_wheels_list:
- step_conf = create_test_step(
- ray_repo=RAY_REPO,
- ray_branch=RAY_BRANCH,
- ray_version=RAY_VERSION,
- ray_wheels=ray_wheels,
- ray_test_repo=RAY_TEST_REPO,
- ray_test_branch=RAY_TEST_BRANCH,
- test_file=test_file,
- test_name=test_name,
- )
-
- all_steps.append(step_conf)
-
- return all_steps
-
-
-def alert_pipeline(stats: bool = False):
- step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
-
- cmd = "python release/alert.py"
- if stats:
- cmd += " --stats"
-
- step_conf["commands"] = [
- "pip install -q -r release/requirements.txt",
- "pip install -U boto3 botocore",
- cmd,
- ]
- step_conf["label"] = f"Send periodic alert (stats_only = {stats})"
- return [step_conf]
-
-
-if __name__ == "__main__":
- alert = os.environ.get("RELEASE_ALERT", "0")
-
- ask_for_config = not bool(int(os.environ.get("AUTOMATIC", "0")))
-
- if alert in ["1", "stats"]:
- steps = alert_pipeline(alert == "stats")
- elif ask_for_config:
- steps = ask_configuration()
- else:
- TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
- PIPELINE_SPEC = SUITES[TEST_SUITE]
-
- steps = build_pipeline(PIPELINE_SPEC)
-
- yaml.dump({"steps": steps}, sys.stdout)
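
For reference, the environment-variable interface documented at the top of the deleted build_pipeline.py could be driven non-interactively roughly as sketched below, mirroring the "run again" Buildkite step; the suite and filter values are examples only, not anything mandated by the removed script:

    import os
    import subprocess

    # Mirror the non-interactive ("AUTOMATIC") path of the deleted generator.
    env = dict(
        os.environ,
        AUTOMATIC="1",                      # skip the interactive input step
        RAY_BRANCH="master",                # branch used to locate the Ray wheels
        RELEASE_TEST_SUITE="core-nightly",  # one of the SUITES keys
        FILTER_FILE="benchmark_tests",      # only test files matching this substring
        FILTER_TEST="many_actors",          # only test names matching this substring
    )

    # The generator writes the pipeline YAML to stdout; hand it to the
    # Buildkite agent, as the deleted "run again" step did.
    pipeline_yaml = subprocess.check_output(
        ["python3", "release/.buildkite/build_pipeline.py"], env=env
    )
    subprocess.run(
        ["buildkite-agent", "pipeline", "upload"], input=pipeline_yaml, check=True
    )
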
diff --git a/release/alert.py b/release/alert.py
deleted file mode 100644
index d0d1d433d..000000000
--- a/release/alert.py
+++ /dev/null
@@ -1,441 +0,0 @@
-import argparse
-from collections import defaultdict, Counter
-from typing import Any, List, Tuple, Mapping, Optional
-import datetime
-import hashlib
-import json
-import logging
-import os
-import requests
-import sys
-
-import boto3
-
-from e2e import GLOBAL_CONFIG
-
-from alerts.default import handle_result as default_handle_result
-from alerts.rllib_tests import handle_result as rllib_tests_handle_result
-from alerts.long_running_tests import handle_result as long_running_tests_handle_result
-from alerts.tune_tests import handle_result as tune_tests_handle_result
-from alerts.xgboost_tests import handle_result as xgboost_tests_handle_result
-
-SUITE_TO_FN = {
- "long_running_tests": long_running_tests_handle_result,
- "rllib_tests": rllib_tests_handle_result,
- "tune_tests": tune_tests_handle_result,
- "xgboost_tests": xgboost_tests_handle_result,
-}
-
-GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"] = "alert_state"
-GLOBAL_CONFIG["SLACK_WEBHOOK"] = os.environ.get("SLACK_WEBHOOK", "")
-GLOBAL_CONFIG["SLACK_CHANNEL"] = os.environ.get("SLACK_CHANNEL", "#oss-test-cop")
-
-RESULTS_LIMIT = 120
-
-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-handler = logging.StreamHandler(stream=sys.stdout)
-formatter = logging.Formatter(
- fmt="[%(levelname)s %(asctime)s] " "%(filename)s: %(lineno)d " "%(message)s"
-)
-handler.setFormatter(formatter)
-logger.addHandler(handler)
-
-
-def maybe_fetch_slack_webhook():
- if GLOBAL_CONFIG["SLACK_WEBHOOK"] in [None, ""]:
- print("Missing SLACK_WEBHOOK, retrieving from AWS secrets store")
- GLOBAL_CONFIG["SLACK_WEBHOOK"] = boto3.client(
- "secretsmanager", region_name="us-west-2"
- ).get_secret_value(
- SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:"
- "release-automation/"
- "slack-webhook-Na0CFP"
- )[
- "SecretString"
- ]
-
-
-def _obj_hash(obj: Any) -> str:
- json_str = json.dumps(obj, sort_keys=True, ensure_ascii=True)
- sha = hashlib.sha256()
- sha.update(json_str.encode())
- return sha.hexdigest()
-
-
-def fetch_latest_alerts(rds_data_client):
- schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]
-
- sql = f"""
- SELECT DISTINCT ON (category, test_suite, test_name)
- category, test_suite, test_name, last_result_hash,
- last_notification_dt
- FROM {schema}
- ORDER BY category, test_suite, test_name, last_notification_dt DESC
- LIMIT {RESULTS_LIMIT}
- """
-
- result = rds_data_client.execute_statement(
- database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
- secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
- resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
- schema=schema,
- sql=sql,
- )
- for row in result["records"]:
- category, test_suite, test_name, last_result_hash, last_notification_dt = (
- r["stringValue"] if "stringValue" in r else None for r in row
- )
- last_notification_dt = datetime.datetime.strptime(
- last_notification_dt, "%Y-%m-%d %H:%M:%S"
- )
- yield category, test_suite, test_name, last_result_hash, last_notification_dt
-
-
-def fetch_latest_results(
- rds_data_client, fetch_since: Optional[datetime.datetime] = None
-):
- schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]
-
- sql = f"""
- SELECT DISTINCT ON (category, test_suite, test_name)
- created_on, category, test_suite, test_name, status, results,
- artifacts, last_logs
- FROM {schema} """
-
- parameters = []
- if fetch_since is not None:
- sql += "WHERE created_on >= :created_on "
- parameters = [
- {
- "name": "created_on",
- "typeHint": "TIMESTAMP",
- "value": {"stringValue": fetch_since.strftime("%Y-%m-%d %H:%M:%S")},
- },
- ]
-
- sql += "ORDER BY category, test_suite, test_name, created_on DESC "
- sql += f"LIMIT {RESULTS_LIMIT}"
-
- result = rds_data_client.execute_statement(
- database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
- secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
- resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
- schema=schema,
- sql=sql,
- parameters=parameters,
- )
- for row in result["records"]:
- (
- created_on,
- category,
- test_suite,
- test_name,
- status,
- results,
- artifacts,
- last_logs,
- ) = (r["stringValue"] if "stringValue" in r else None for r in row)
-
- # Calculate hash before converting strings to objects
- result_obj = (
- created_on,
- category,
- test_suite,
- test_name,
- status,
- results,
- artifacts,
- last_logs,
- )
- result_json = json.dumps(result_obj)
- result_hash = _obj_hash(result_json)
-
- # Convert some strings to python objects
- created_on = datetime.datetime.strptime(created_on, "%Y-%m-%d %H:%M:%S")
- results = json.loads(results)
- artifacts = json.loads(artifacts)
-
- yield result_hash, created_on, category, test_suite, test_name, status, results, artifacts, last_logs # noqa: E501
-
-
-def mark_as_handled(
- rds_data_client,
- update: bool,
- category: str,
- test_suite: str,
- test_name: str,
- result_hash: str,
- last_notification_dt: datetime.datetime,
-):
- schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]
-
- if not update:
- sql = f"""
- INSERT INTO {schema}
- (category, test_suite, test_name,
- last_result_hash, last_notification_dt)
- VALUES (:category, :test_suite, :test_name,
- :last_result_hash, :last_notification_dt)
- """
- else:
- sql = f"""
- UPDATE {schema}
- SET last_result_hash=:last_result_hash,
- last_notification_dt=:last_notification_dt
- WHERE category=:category AND test_suite=:test_suite
- AND test_name=:test_name
- """
-
- rds_data_client.execute_statement(
- database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
- parameters=[
- {"name": "category", "value": {"stringValue": category}},
- {"name": "test_suite", "value": {"stringValue": test_suite or ""}},
- {"name": "test_name", "value": {"stringValue": test_name}},
- {"name": "last_result_hash", "value": {"stringValue": result_hash}},
- {
- "name": "last_notification_dt",
- "typeHint": "TIMESTAMP",
- "value": {
- "stringValue": last_notification_dt.strftime("%Y-%m-%d %H:%M:%S")
- },
- },
- ],
- secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
- resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
- schema=schema,
- sql=sql,
- )
-
-
-def post_alerts_to_slack(
- channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
-):
- if len(alerts) == 0:
- logger.info("No alerts to post to slack.")
- return
-
- markdown_lines = [
- f"* {len(alerts)} new release test failures found!*",
- "",
- ]
-
- category_alerts = defaultdict(list)
- for (category, test_suite, test_name, alert) in alerts:
- category_alerts[category].append(
- f" *{test_suite}/{test_name}* failed: {alert}"
- )
-
- for category, alert_list in category_alerts.items():
- markdown_lines.append(f"Branch: *{category}*")
- markdown_lines.extend(alert_list)
- markdown_lines.append("")
-
- total_non_alerts = sum(n for n in non_alerts.values())
- non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]
-
- markdown_lines += [
- f"Additionally, {total_non_alerts} tests passed successfully "
- f"({', '.join(non_alert_detail)})."
- ]
-
- slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]
-
- resp = requests.post(
- slack_url,
- json={
- "text": "\n".join(markdown_lines),
- "channel": channel,
- "username": "Fail Bot",
- "icon_emoji": ":red_circle:",
- },
- )
- print(resp.status_code)
- print(resp.text)
-
-
-def post_statistics_to_slack(
- channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
-):
- total_alerts = len(alerts)
-
- category_alerts = defaultdict(list)
- for (category, test_suite, test_name, alert) in alerts:
- category_alerts[category].append(f"`{test_suite}/{test_name}`")
-
- alert_detail = [f"{len(a)} on {c}" for c, a in category_alerts.items()]
-
- total_non_alerts = sum(n for n in non_alerts.values())
- non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]
-
- markdown_lines = [
- "*Periodic release test report*",
- "",
- f"In the past 24 hours, "
- f"*{total_non_alerts}* release tests finished successfully, and "
- f"*{total_alerts}* release tests failed.",
- ]
-
- markdown_lines.append("")
-
- if total_alerts:
- markdown_lines.append(f"*Failing:* {', '.join(alert_detail)}")
- for c, a in category_alerts.items():
- markdown_lines.append(f" *{c}*: {', '.join(sorted(a))}")
- else:
- markdown_lines.append("*Failing:* None")
-
- markdown_lines.append("")
-
- if total_non_alerts:
- markdown_lines.append(f"*Passing:* {', '.join(non_alert_detail)}")
- else:
- markdown_lines.append("*Passing:* None")
-
- slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]
-
- resp = requests.post(
- slack_url,
- json={
- "text": "\n".join(markdown_lines),
- "channel": channel,
- "username": "Fail Bot",
- "icon_emoji": ":red_circle:",
- },
- )
- print(resp.status_code)
- print(resp.text)
-
-
-def handle_results_and_get_alerts(
- rds_data_client,
- fetch_since: Optional[datetime.datetime] = None,
- always_try_alert: bool = False,
- no_status_update: bool = False,
-):
- # First build a map of last notifications
- last_notifications_map = {}
- for (
- category,
- test_suite,
- test_name,
- last_result_hash,
- last_notification_dt,
- ) in fetch_latest_alerts(rds_data_client):
- last_notifications_map[(category, test_suite, test_name)] = (
- last_result_hash,
- last_notification_dt,
- )
-
- alerts = []
- non_alerts = Counter()
-
- # Then fetch latest results
- for (
- result_hash,
- created_on,
- category,
- test_suite,
- test_name,
- status,
- results,
- artifacts,
- last_logs,
- ) in fetch_latest_results(rds_data_client, fetch_since=fetch_since):
- key = (category, test_suite, test_name)
-
- try_alert = always_try_alert
- if key in last_notifications_map:
- # If we have an alert for this key, fetch info
- last_result_hash, last_notification_dt = last_notifications_map[key]
-
- if last_result_hash != result_hash:
- # If we got a new result, handle new result
- try_alert = True
- # Todo: maybe alert again after some time?
- else:
- try_alert = True
-
- if try_alert:
- handle_fn = SUITE_TO_FN.get(test_suite, None)
- if not handle_fn:
- logger.warning(f"No handle for suite {test_suite}")
- alert = default_handle_result(
- created_on,
- category,
- test_suite,
- test_name,
- status,
- results,
- artifacts,
- last_logs,
- )
- else:
- alert = handle_fn(
- created_on,
- category,
- test_suite,
- test_name,
- status,
- results,
- artifacts,
- last_logs,
- )
-
- if alert:
- logger.warning(
- f"Alert raised for test {test_suite}/{test_name} "
- f"({category}): {alert}"
- )
-
- alerts.append((category, test_suite, test_name, alert))
- else:
- logger.debug(
- f"No alert raised for test {test_suite}/{test_name} "
- f"({category})"
- )
- non_alerts[category] += 1
-
- if not no_status_update:
- mark_as_handled(
- rds_data_client,
- key in last_notifications_map,
- category,
- test_suite,
- test_name,
- result_hash,
- datetime.datetime.now(),
- )
-
- return alerts, non_alerts
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument(
- "--stats",
- action="store_true",
- default=False,
- help="Post a statistics report for the last 24 hours instead of new alerts.",
- )
- args = parser.parse_args()
-
- maybe_fetch_slack_webhook()
-
- rds_data_client = boto3.client("rds-data", region_name="us-west-2")
-
- if args.stats:
- # Only update last 24 hour stats
- fetch_since = datetime.datetime.now() - datetime.timedelta(days=1)
- alerts, non_alerts = handle_results_and_get_alerts(
- rds_data_client,
- fetch_since=fetch_since,
- always_try_alert=True,
- no_status_update=True,
- )
- post_statistics_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)
-
- else:
- alerts, non_alerts = handle_results_and_get_alerts(rds_data_client)
- post_alerts_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)
diff --git a/release/benchmarks/benchmark_tests.yaml b/release/benchmarks/benchmark_tests.yaml
deleted file mode 100644
index a89e3deb9..000000000
--- a/release/benchmarks/benchmark_tests.yaml
+++ /dev/null
@@ -1,145 +0,0 @@
-- name: single_node
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: single_node.yaml
-
- run:
- timeout: 12000
- prepare: sleep 0
- script: python single_node/test_single_node.py
-
-- name: object_store
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: object_store.yaml
-
- run:
- timeout: 3600
- prepare: python distributed/wait_cluster.py --num-nodes=50
- script: python object_store/test_object_store.py
-
-- name: many_actors
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=65
- script: python distributed/test_many_actors.py
-
-- name: many_actors_smoke_test
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed_smoke_test.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=2
- script: SMOKE_TEST=1 python distributed/test_many_actors.py
-
-- name: many_tasks
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=65
- script: python distributed/test_many_tasks.py --num-tasks=10000
-
-- name: many_tasks_smoke_test
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed_smoke_test.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=2
- script: python distributed/test_many_tasks.py --num-tasks=100
-
-- name: many_pgs
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=65
- script: python distributed/test_many_pgs.py
-
-- name: many_pgs_smoke_test
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: distributed_smoke_test.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=2
- script: SMOKE_TEST=1 python distributed/test_many_pgs.py
-
-# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
-- name: many_nodes
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: many_nodes.yaml
-
- run:
- timeout: 3600 # 1hr
- prepare: python distributed/wait_cluster.py --num-nodes=250
- script: python distributed/test_many_tasks.py --num-tasks=1000
-
-- name: scheduling_test_many_0s_tasks_single_node
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: scheduling.yaml
-
- run:
- timeout: 3600
- prepare: python distributed/wait_cluster.py --num-nodes=32
- script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1
-
-- name: scheduling_test_many_0s_tasks_many_nodes
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: scheduling.yaml
-
- run:
- timeout: 3600
- prepare: python distributed/wait_cluster.py --num-nodes=32
- script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1
-
-- name: scheduling_test_many_5s_tasks_single_node
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: scheduling.yaml
-
- run:
- timeout: 3600
- prepare: python distributed/wait_cluster.py --num-nodes=32
- script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
- stable: false
-
-- name: scheduling_test_many_5s_tasks_many_nodes
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: scheduling.yaml
-
- run:
- timeout: 3600
- prepare: python distributed/wait_cluster.py --num-nodes=32
- script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
- stable: false
diff --git a/release/benchmarks/distributed/wait_cluster.py b/release/benchmarks/distributed/wait_cluster.py
deleted file mode 100644
index 12a8a1677..000000000
--- a/release/benchmarks/distributed/wait_cluster.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import click
-import ray
-import time
-
-
-def num_alive_nodes():
- n = 0
- for node in ray.nodes():
- if node["Alive"]:
- n += 1
- return n
-
-
-@click.command()
-@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
-def wait_cluster(num_nodes: int):
- ray.init(address="auto")
- while num_alive_nodes() != num_nodes:
- print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}")
- time.sleep(5)
-
-
-if __name__ == "__main__":
- wait_cluster()
diff --git a/release/benchmarks/wait_cluster.py b/release/benchmarks/wait_cluster.py
deleted file mode 100644
index f70088289..000000000
--- a/release/benchmarks/wait_cluster.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
- "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
- "--feedback_interval_s",
- type=int,
- default=10,
- help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-
-while not curr_nodes >= args.num_nodes:
- now = time.time()
-
- if now >= max_time:
- raise RuntimeError(
- f"Maximum wait time reached, but only "
- f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
- )
-
- if now >= next_feedback:
- passed = now - start
- print(
- f"Waiting for more nodes to come up: "
- f"{curr_nodes}/{args.num_nodes} "
- f"({passed:.0f} seconds passed)"
- )
- next_feedback = now + args.feedback_interval_s
-
- time.sleep(5)
- curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
- f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
- f"{passed:.0f} seconds"
-)
diff --git a/release/config_generator.html b/release/config_generator.html
deleted file mode 100644
index 179bd6320..000000000
--- a/release/config_generator.html
+++ /dev/null
@@ -1,214 +0,0 @@
- Releaser config generator
\ No newline at end of file
diff --git a/release/e2e.py b/release/e2e.py
deleted file mode 100644
index 3f458d56a..000000000
--- a/release/e2e.py
+++ /dev/null
@@ -1,2585 +0,0 @@
-"""
-This is an end-to-end release test automation script used to kick off periodic
-release tests, running on Anyscale.
-
-The tool leverages app configs and compute templates.
-
-Calling this script will run a single release test.
-
-Example:
-
-python e2e.py --test-config ~/ray/release/xgboost_tests/xgboost_tests.yaml --test-name tune_small
-
-The following steps are then performed:
-
-1. It will look up the test tune_small in the file xgboost_tests.yaml
-2. It will fetch the specified app config and compute template and register
- those with anyscale (if they don’t exist yet)
-3. It waits until the app config is built
-4. It then kicks off the script defined in the run block
-5. When the script is finished, it will fetch the latest logs, the full log
- output, and any artifacts specified in the artifacts block.
-6. The full logs and artifacts will be stored in a s3 bucket
-7. It will also fetch the json file specified in the run block as results.
- This is the file to which you should write your metrics.
-8. All results are then stored in a database.
- Specifically it will store the following fields:
- - Timestamp
- - Test name
- - Status (finished, error, timeout, invalid)
- - Last logs (50 lines)
- - results (see above)
- - artifacts (links to s3 files)
-
-Then the script exits. If an error occurs at any time, a fail result is
-written to the database.
-
-Exit codes
-----------
-The script exits with code 0 on success, i.e. if the test has been run
-end to end without failures and the subsequent results checks have passed.
-In all other cases, an exit code > 0 is returned.
-
-Exit code 1 is the general failure exit code returned by Python when we
-encounter an error that isn't caught by the rest of the script.
-
-Generally, we try to catch errors as they occur, and return a specific exit
-code that can be used in automation tools to e.g. retry a test when nodes
-didn't come up in time.
-
-These exit codes are defined in the ``ExitCode`` enum below.
-
-Writing a new release test
---------------------------
-Each release test requires the following:
-
-1. It has to be added in a release test yaml file, describing meta information
- about the test (e.g. name, command to run, timeout)
-2. You need an app config yaml
-3. You need a compute template yaml
-4. You need to define a command to run. This is usually a python script.
- The command should accept (or ignore) a single optional
- `--smoke-test` argument.
- Usually the command should write its result metrics to a json file.
- The json filename is available in the TEST_OUTPUT_JSON env variable.
-5. Add your test in release/.buildkite/build_pipeline.py.
-
-The script will have access to these environment variables:
-
- "RAY_ADDRESS": os.environ.get("RAY_ADDRESS", "auto")
- "TEST_OUTPUT_JSON": results_json_filename
- "IS_SMOKE_TEST": "1" if smoke_test else "0"
-
-For an example, take a look at the XGBoost test suite:
-
-https://github.com/ray-project/ray/blob/master/release/xgboost_tests/xgboost_tests.yaml
-
-These all use the same app configs and similar compute templates. This means
-that app configs can be re-used across runs and only have to be built once.
-
-App configs and compute templates can interpret environment variables.
-A notable one is the `RAY_WHEELS` variable which points to the wheels that
-should be tested (e.g. latest master wheels). You might want to include
-something like this in your `post_build_cmds`:
-
- - pip3 uninstall ray -y || true
- - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
-
-If you want to force rebuilds, consider using something like
-
- - echo {{ env["TIMESTAMP"] }}
-
-so that your app config changes each time the script is executed. If you
-only want to trigger rebuilds once per day, use `DATESTAMP` instead:
-
- - echo {{ env["DATESTAMP"] }}
-
-Local testing
--------------
-Make sure to set these environment variables:
-
-- ANYSCALE_CLI_TOKEN (should contain your anyscale credential token)
-- ANYSCALE_PROJECT (should point to a project ID you have access to)
-
-A test can then be run like this:
-
-python e2e.py --test-config ~/ray/release/xgboost_tests/xgboost_tests.yaml --test-name tune_small
-
-Using Compilation on Product + App Config Override
---------------------------------------------------
-For quick iteration when debugging a release test, go/compile-on-product allows
-you to easily modify and recompile Ray, such that the recompilation happens
-within an app build step and can benefit from a warm Bazel cache. See
-go/compile-on-product for more information.
-
-After kicking off the app build, you can give the app config ID to this script
-as an app config override, where the indicated app config will be used instead
-of the app config given in the test config. E.g., running
-
-python e2e.py --test-config ~/ray/benchmarks/benchmark_tests.yaml --test-name=single_node --app-config-id-override=apt_TBngEXXXrhipMXgexVcrpC9i
-
-would run the single_node benchmark test with the apt_TBngEXXXrhipMXgexVcrpC9i
-app config instead of the app config given in
-~/ray/benchmarks/benchmark_tests.yaml. If the build for the app config is still
-in progress, the script will wait until it completes, same as for a locally
-defined app config.
-
-Running on Head Node vs Running with Anyscale Connect
------------------------------------------------------
-By default release tests run their drivers on the head node. Support is being
-added to run release tests that execute the driver as a subprocess and run
-the workload on Anyscale product via Anyscale connect.
-Note that when the driver in the test is a subprocess of releaser, releaser
-cannot be terminated before the test finishes.
-Other known feature gaps when running with Anyscale connect:
-- Kicking off a test or checking progress is not supported.
-- Downloading / uploading logs and artifacts are unsupported.
-- Logs from remote may not have finished streaming, before the driver exits.
-
-Long running tests
-------------------
-Long running tests can be kicked off by adding the --kick-off-only
-parameter to the e2e script. The status can then be checked with the
---check command.
-
-Long running test sessions will be terminated after `timeout` seconds, after
-which the latest result in the TEST_OUTPUT_JSON will be reported. Thus,
-long running release tests should update this file periodically.
-
-There are also two config options that control this behavior. The `time_key` is
-needed to track the latest update of the TEST_OUTPUT_JSON and should contain
-a floating point number (usually `time.time()`). The `max_update_delay` then
-specifies the maximum time in seconds that may pass without an update
-to the results json. If the output file hasn't been updated in e.g. 60 seconds,
-this could indicate that the command is stale/frozen, and thus should fail.
-
-Release test yaml example
--------------------------
-- name: example
- owner:
- mail: "kai@anyscale.com" # Currently not used
- slack: "@tune-team" # Currently not used
-
- cluster:
- app_config: app_config.yaml # Relative to the release test yaml
- compute_template: tpl_cpu.yaml
-
- run:
- timeout: 600 # in seconds
- prepare: python wait_cluster.py 4 600 # prepare cmd to run before test
- script: python workloads/train.py # actual release test command
-
- # Only needed for long running test
- time_key: last_update # Key in the results json indicating current time
- max_update_delay: 30 # If state hasn't been updated in 30s, terminate
-
- # This block is optional
- artifacts:
- # Artifact name: location on head node
- - detailed_output: detailed_output.csv
-
- # This block is optional. If present, the contents will be
- # deep updated for smoke testing
- smoke_test:
- cluster:
- compute_template: tpl_cpu_smoketest.yaml
-
-""" # noqa: E501
-import argparse
-import enum
-import random
-import shlex
-import string
-
-import boto3
-import collections
-import copy
-import datetime
-import hashlib
-import jinja2
-import json
-import logging
-import multiprocessing
-import os
-import requests
-import shutil
-import subprocess
-import sys
-import re
-import tempfile
-import time
-from queue import Empty
-from typing import Any, Dict, Optional, Tuple, List
-
-import yaml
-
-import anyscale
-import anyscale.conf
-from anyscale.authenticate import get_auth_api_client
-from anyscale.controllers.session_controller import SessionController
-from anyscale.sdk.anyscale_client.sdk import AnyscaleSDK
-
-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-handler = logging.StreamHandler(stream=sys.stdout)
-formatter = logging.Formatter(
- fmt="[%(levelname)s %(asctime)s] " "%(filename)s: %(lineno)d " "%(message)s"
-)
-handler.setFormatter(formatter)
-logger.addHandler(handler)
-
-
-def _format_link(link: str):
- # Use ANSI escape code to allow link to be clickable
- # https://buildkite.com/docs/pipelines/links-and-images
- # -in-log-output
- return "\033]1339;url='" + link + "'\a\n"
-
-
-def getenv_default(key: str, default: Optional[str] = None):
- """Return environment variable with default value"""
- # If the environment variable is set but "", still return default
- return os.environ.get(key, None) or default
-
-
-GLOBAL_CONFIG = {
- "ANYSCALE_USER": getenv_default("ANYSCALE_USER", "release-automation@anyscale.com"),
- "ANYSCALE_HOST": getenv_default("ANYSCALE_HOST", "https://console.anyscale.com"),
- "ANYSCALE_CLI_TOKEN": getenv_default("ANYSCALE_CLI_TOKEN"),
- "ANYSCALE_CLOUD_ID": getenv_default(
- "ANYSCALE_CLOUD_ID", "cld_4F7k8814aZzGG8TNUGPKnc"
- ), # anyscale_default_cloud
- "ANYSCALE_PROJECT": getenv_default("ANYSCALE_PROJECT", ""),
- "RAY_VERSION": getenv_default("RAY_VERSION", "2.0.0.dev0"),
- "RAY_REPO": getenv_default("RAY_REPO", "https://github.com/ray-project/ray.git"),
- "RAY_BRANCH": getenv_default("RAY_BRANCH", "master"),
- "RELEASE_AWS_BUCKET": getenv_default(
- "RELEASE_AWS_BUCKET", "ray-release-automation-results"
- ),
- "RELEASE_AWS_LOCATION": getenv_default("RELEASE_AWS_LOCATION", "dev"),
- "RELEASE_AWS_DB_NAME": getenv_default("RELEASE_AWS_DB_NAME", "ray_ci"),
- "RELEASE_AWS_DB_TABLE": getenv_default(
- "RELEASE_AWS_DB_TABLE", "release_test_result"
- ),
- "RELEASE_AWS_DB_SECRET_ARN": getenv_default(
- "RELEASE_AWS_DB_SECRET_ARN",
- "arn:aws:secretsmanager:us-west-2:029272617770:secret:"
- "rds-db-credentials/cluster-7RB7EYTTBK2EUC3MMTONYRBJLE/ray_ci-MQN2hh",
- ),
- "RELEASE_AWS_DB_RESOURCE_ARN": getenv_default(
- "RELEASE_AWS_DB_RESOURCE_ARN",
- "arn:aws:rds:us-west-2:029272617770:cluster:ci-reporting",
- ),
- "RELEASE_RESULTS_DIR": getenv_default(
- "RELEASE_RESULTS_DIR", "/tmp/ray_release_test_artifacts"
- ),
- "DATESTAMP": str(datetime.datetime.now().strftime("%Y%m%d")),
- "TIMESTAMP": str(int(datetime.datetime.now().timestamp())),
- "EXPIRATION_1D": str(
- (datetime.datetime.now() + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
- ),
- "EXPIRATION_2D": str(
- (datetime.datetime.now() + datetime.timedelta(days=2)).strftime("%Y-%m-%d")
- ),
- "EXPIRATION_3D": str(
- (datetime.datetime.now() + datetime.timedelta(days=3)).strftime("%Y-%m-%d")
- ),
- "REPORT_RESULT": getenv_default("REPORT_RESULT", ""),
-}
-
-REPORT_S = 30
-RETRY_MULTIPLIER = 2
-VALID_TEAMS = ["ml", "core", "serve"]
-
-
-class ExitCode(enum.Enum):
- # If you change these, also change the `retry` section
- # in `build_pipeline.py` and the `reason()` function in `run_e2e.sh`
- UNSPECIFIED = 2
- UNKNOWN = 3
- RUNTIME_ERROR = 4
- COMMAND_ERROR = 5
- COMMAND_TIMEOUT = 6
- PREPARE_TIMEOUT = 7
- FILESYNC_TIMEOUT = 8
- SESSION_TIMEOUT = 9
- PREPARE_ERROR = 10
- APPCONFIG_BUILD_ERROR = 11
- INFRA_ERROR = 12
-
-
-def exponential_backoff_retry(f, retry_exceptions, initial_retry_delay_s, max_retries):
- retry_cnt = 0
- retry_delay_s = initial_retry_delay_s
- while True:
- try:
- return f()
- except retry_exceptions as e:
- retry_cnt += 1
- if retry_cnt > max_retries:
- raise
- logger.info(
- f"Function call failed due to {e}, "
- f"retrying in {retry_delay_s} seconds..."
- )
- time.sleep(retry_delay_s)
- retry_delay_s *= RETRY_MULTIPLIER
-
-
-def maybe_fetch_api_token():
- if GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"] is None:
- logger.info("Missing ANYSCALE_CLI_TOKEN, retrieving from AWS secrets store")
- # NOTE(simon) This should automatically retrieve
- # release-automation@anyscale.com's anyscale token
- GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"] = boto3.client(
- "secretsmanager", region_name="us-west-2"
- ).get_secret_value(
- SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:"
- "release-automation/"
- "anyscale-token20210505220406333800000001-BcUuKB"
- )[
- "SecretString"
- ]
-
-
-class PrepareCommandRuntimeError(RuntimeError):
- pass
-
-
-class ReleaseTestRuntimeError(RuntimeError):
- pass
-
-
-class ReleaseTestInfraError(ReleaseTestRuntimeError):
- pass
-
-
-class ReleaseTestTimeoutError(ReleaseTestRuntimeError):
- pass
-
-
-class SessionTimeoutError(ReleaseTestTimeoutError):
- pass
-
-
-class FileSyncTimeoutError(ReleaseTestTimeoutError):
- pass
-
-
-class CommandTimeoutError(ReleaseTestTimeoutError):
- pass
-
-
-class PrepareCommandTimeoutError(ReleaseTestTimeoutError):
- pass
-
-
-# e.g., App config failure.
-class AppConfigBuildFailure(RuntimeError):
- pass
-
-
-class State:
- def __init__(self, state: str, timestamp: float, data: Any):
- self.state = state
- self.timestamp = timestamp
- self.data = data
-
-
-class CommandRunnerHack:
- def __init__(self):
- self.subprocess_pool: Dict[int, subprocess.Popen] = dict()
- self.start_time: Dict[int, float] = dict()
- self.counter = 0
-
- def run_command(self, session_name, cmd_to_run, env_vars) -> int:
- self.counter += 1
- command_id = self.counter
- env = os.environ.copy()
- env["RAY_ADDRESS"] = f"anyscale://{session_name}"
- env["ANYSCALE_CLI_TOKEN"] = GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"]
- env["ANYSCALE_HOST"] = GLOBAL_CONFIG["ANYSCALE_HOST"]
- full_cmd = " ".join(f"{k}={v}" for k, v in env_vars.items()) + " " + cmd_to_run
- logger.info(f"Executing {cmd_to_run} with {env_vars} via ray job submit")
- proc = subprocess.Popen(
- f"ray job submit -- bash -c {shlex.quote(full_cmd)}",
- shell=True,
- stdout=sys.stdout,
- stderr=sys.stderr,
- env=env,
- )
- self.subprocess_pool[command_id] = proc
- self.start_time[command_id] = time.time()
- return command_id
-
- def wait_command(self, command_id: int):
- retcode = self.subprocess_pool[command_id].wait()
- duration = time.time() - self.start_time[command_id]
- return retcode, duration
-
-
-global_command_runner = CommandRunnerHack()
-
-
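-# SessionController variant that transfers files between the local machine
-# and the cluster through an intermediate S3 bucket. Used when the test's
-# cluster config sets `compute_on_k8s`.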
-class S3SyncSessionController(SessionController):
- def __init__(self, sdk, result_queue):
- self.sdk = sdk
- self.result_queue = result_queue
- self.s3_client = boto3.client("s3")
- self.bucket = GLOBAL_CONFIG["RELEASE_AWS_BUCKET"]
- super().__init__()
-
- def _generate_tmp_s3_path(self):
- fn = "".join(random.choice(string.ascii_lowercase) for i in range(10))
- location = f"tmp/{fn}"
- return location
-
- def pull(self, session_name, source, target):
- remote_upload_to = self._generate_tmp_s3_path()
- # remote source -> s3
- cid = global_command_runner.run_command(
- session_name,
- (
- f"pip install -q awscli && aws s3 cp {source} "
- f"s3://{self.bucket}/{remote_upload_to} "
- "--acl bucket-owner-full-control"
- ),
- {},
- )
- global_command_runner.wait_command(cid)
-
- # s3 -> local target
- self.s3_client.download_file(
- Bucket=self.bucket,
- Key=remote_upload_to,
- Filename=target,
- )
-
- def _push_local_dir(self, session_name):
- remote_upload_to = self._generate_tmp_s3_path()
- # pack local dir
- _, local_path = tempfile.mkstemp()
- shutil.make_archive(local_path, "gztar", os.getcwd())
- # local source -> s3
- self.s3_client.upload_file(
- Filename=local_path + ".tar.gz",
- Bucket=self.bucket,
- Key=remote_upload_to,
- )
- # s3 -> remote target
- cid = global_command_runner.run_command(
- session_name,
- (
- "pip install -q awscli && "
- f"aws s3 cp s3://{self.bucket}/{remote_upload_to} "
- f"archive.tar.gz && "
- "tar xf archive.tar.gz"
- ),
- {},
- )
- global_command_runner.wait_command(cid)
-
- def push(
- self,
- session_name: str,
- source: Optional[str],
- target: Optional[str],
- config: Optional[str],
- all_nodes: bool,
- no_warning: bool = False,
- ):
- if source is None and target is None:
- self._push_local_dir(session_name)
- return
-
- assert isinstance(source, str)
- assert isinstance(target, str)
-
- remote_upload_to = self._generate_tmp_s3_path()
- # local source -> s3
- self.s3_client.upload_file(
- Filename=source,
- Bucket=self.bucket,
- Key=remote_upload_to,
- )
- # s3 -> remote target
- cid = global_command_runner.run_command(
- session_name,
- "pip install -q awscli && "
- f"aws s3 cp s3://{self.bucket}/{remote_upload_to} {target}",
- {},
- )
- global_command_runner.wait_command(cid)
-
-
-sys.path.insert(0, anyscale.ANYSCALE_RAY_DIR)
-
-
-def anyscale_project_url(project_id: str):
- return (
- f"{GLOBAL_CONFIG['ANYSCALE_HOST']}"
- f"/o/anyscale-internal/projects/{project_id}"
- f"/?tab=session-list"
- )
-
-
-def anyscale_session_url(project_id: str, session_id: str):
- return (
- f"{GLOBAL_CONFIG['ANYSCALE_HOST']}"
- f"/o/anyscale-internal/projects/{project_id}"
- f"/clusters/{session_id}"
- )
-
-
-def anyscale_compute_tpl_url(compute_tpl_id: str):
- return (
- f"{GLOBAL_CONFIG['ANYSCALE_HOST']}"
- f"/o/anyscale-internal/configurations/cluster-computes"
- f"/{compute_tpl_id}"
- )
-
-
-def anyscale_app_config_build_url(build_id: str):
- return (
- f"{GLOBAL_CONFIG['ANYSCALE_HOST']}"
- f"/o/anyscale-internal/configurations/app-config-details"
- f"/{build_id}"
- )
-
-
-def wheel_url(ray_version, git_branch, git_commit):
- return (
- f"https://s3-us-west-2.amazonaws.com/ray-wheels/"
- f"{git_branch}/{git_commit}/"
- f"ray-{ray_version}-cp37-cp37m-manylinux2014_x86_64.whl"
- )
-
-
-def wheel_exists(ray_version, git_branch, git_commit):
- url = wheel_url(ray_version, git_branch, git_commit)
- return requests.head(url).status_code == 200
-
-
-def commit_or_url(commit_or_url: str) -> str:
- if commit_or_url.startswith("http"):
- url = None
- # Directly return the S3 url
- if "s3" in commit_or_url and "amazonaws.com" in commit_or_url:
- url = commit_or_url
- # Resolve the redirects for buildkite artifacts
- # This is needed because otherwise pip won't recognize the file name.
- elif "buildkite.com" in commit_or_url and "artifacts" in commit_or_url:
- url = requests.head(commit_or_url, allow_redirects=True).url
- if url is not None:
- # Extract commit from url so that we can do the
- # commit sanity check later.
- p = re.compile("/([a-f0-9]{40})/")
- m = p.search(url)
- if m is not None:
- os.environ["RAY_COMMIT"] = m.group(1)
- return url
-
- # Else, assume commit
- os.environ["RAY_COMMIT"] = commit_or_url
- return wheel_url(
- GLOBAL_CONFIG["RAY_VERSION"], GLOBAL_CONFIG["RAY_BRANCH"], commit_or_url
- )
-
-
-def get_latest_commits(repo: str, branch: str = "master") -> List[str]:
- cur = os.getcwd()
- with tempfile.TemporaryDirectory() as tmpdir:
- os.chdir(tmpdir)
-
- clone_cmd = [
- "git",
- "clone",
- "--filter=tree:0",
- "--no-checkout",
- # "--single-branch",
- # "--depth=10",
- f"--branch={branch}",
- repo,
- tmpdir,
- ]
- log_cmd = [
- "git",
- "log",
- "-n",
- "10",
- "--pretty=format:%H",
- ]
-
- subprocess.check_output(clone_cmd)
- commits = (
- subprocess.check_output(log_cmd).decode(sys.stdout.encoding).split("\n")
- )
- os.chdir(cur)
- return commits
-
-
-def find_ray_wheels(repo: str, branch: str, version: str):
- url = None
- commits = get_latest_commits(repo, branch)
- logger.info(f"Latest 10 commits for branch {branch}: {commits}")
- for commit in commits:
- if wheel_exists(version, branch, commit):
- url = wheel_url(version, branch, commit)
- os.environ["RAY_WHEELS"] = url
- os.environ["RAY_COMMIT"] = commit
- logger.info(
- f"Found wheels URL for Ray {version}, branch {branch}: " f"{url}"
- )
- break
- return url
-
-
-def populate_wheels_sanity_check(commit: Optional[str] = None):
- if not commit:
- cmd = (
- "python -c 'import ray; print("
- '"No commit sanity check available, but this is the '
- "Ray wheel commit:\", ray.__commit__)'"
- )
- else:
- cmd = (
- f"python -c 'import ray; "
- f'assert ray.__commit__ == "{commit}", ray.__commit__\''
- )
- os.environ["RAY_WHEELS_SANITY_CHECK"] = cmd
-
-
-def _check_stop(stop_event: multiprocessing.Event, timeout_type: str):
- if stop_event.is_set():
- if timeout_type == "prepare_command":
- raise PrepareCommandTimeoutError(
- "Process timed out in the prepare command stage."
- )
-        elif timeout_type == "command":
- raise CommandTimeoutError("Process timed out while running a command.")
- elif timeout_type == "file_sync":
- raise FileSyncTimeoutError("Process timed out while syncing files.")
- elif timeout_type == "session":
- raise SessionTimeoutError("Process timed out while starting a session.")
- else:
- assert False, "Unexpected timeout type."
-
-
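-# Recursively merge mapping `u` into `d`, e.g. to apply smoke test
-# overrides on top of a base test config.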
-def _deep_update(d, u):
- for k, v in u.items():
- if isinstance(v, collections.abc.Mapping):
- d[k] = _deep_update(d.get(k, {}), v)
- else:
- d[k] = v
- return d
-
-
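-# Deterministic SHA-256 hash of a config dict, used to derive stable
-# compute template and app config names.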
-def _dict_hash(dt: Dict[Any, Any]) -> str:
- json_str = json.dumps(dt, sort_keys=True, ensure_ascii=True)
- sha = hashlib.sha256()
- sha.update(json_str.encode())
- return sha.hexdigest()
-
-
-def _load_config(local_dir: str, config_file: Optional[str]) -> Optional[Dict]:
- if not config_file:
- return None
-
- config_path = os.path.join(local_dir, config_file)
-    with open(config_path, "rt") as f:
-        content = f.read()
-
- env = copy.deepcopy(os.environ)
- env.update(GLOBAL_CONFIG)
-
- content = jinja2.Template(content).render(env=env)
- return yaml.safe_load(content)
-
-
-def has_errored(result: Dict[Any, Any]) -> bool:
- return result.get("status", "invalid") != "finished"
-
-
-def maybe_get_alert_for_result(result_dict: Dict[str, Any]) -> Optional[str]:
- # If we get a result dict, check if any alerts should be raised
- from alert import SUITE_TO_FN, default_handle_result
-
- logger.info("Checking if results are valid...")
-
- # Copy dict because we modify kwargs here
- handle_result_kwargs = result_dict.copy()
- handle_result_kwargs["created_on"] = None
-
- test_suite = handle_result_kwargs.get("test_suite", None)
-
- handle_fn = SUITE_TO_FN.get(test_suite, None)
- if not handle_fn:
- logger.warning(f"No handle for suite {test_suite}")
- alert = default_handle_result(**handle_result_kwargs)
- else:
- alert = handle_fn(**handle_result_kwargs)
-
- return alert
-
-
-def report_result(
- *,
- test_suite: str,
- test_name: str,
- status: str,
- last_logs: str,
- results: Dict[Any, Any],
- artifacts: Dict[Any, Any],
- category: str,
- team: str,
-):
- # session_url: str, commit_url: str,
- # runtime: float, stable: bool, frequency: str, return_code: int):
- """Report the test result to database."""
- now = datetime.datetime.utcnow()
- rds_data_client = boto3.client("rds-data", region_name="us-west-2")
-
- schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]
-
- parameters = [
- {
- "name": "created_on",
- "typeHint": "TIMESTAMP",
- "value": {"stringValue": now.strftime("%Y-%m-%d %H:%M:%S")},
- },
- {"name": "test_suite", "value": {"stringValue": test_suite}},
- {"name": "test_name", "value": {"stringValue": test_name}},
- {"name": "status", "value": {"stringValue": status}},
- {"name": "last_logs", "value": {"stringValue": last_logs}},
- {
- "name": "results",
- "typeHint": "JSON",
- "value": {"stringValue": json.dumps(results)},
- },
- {
- "name": "artifacts",
- "typeHint": "JSON",
- "value": {"stringValue": json.dumps(artifacts)},
- },
- {"name": "category", "value": {"stringValue": category}},
- {"name": "team", "value": {"stringValue": team}},
- ]
- columns = [param["name"] for param in parameters]
- values = [f":{param['name']}" for param in parameters]
-    column_str = ", ".join(columns)
-    value_str = ", ".join(values)
-
- sql = f"INSERT INTO {schema} " f"({column_str}) " f"VALUES ({value_str})"
-
- logger.info(f"Query: {sql}")
-
- # Default boto3 call timeout is 45 seconds.
- retry_delay_s = 64
- MAX_RDS_RETRY = 3
- exponential_backoff_retry(
- lambda: rds_data_client.execute_statement(
- database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
- parameters=parameters,
- secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
- resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
- schema=schema,
- sql=sql,
- ),
- retry_exceptions=rds_data_client.exceptions.StatementTimeoutException,
- initial_retry_delay_s=retry_delay_s,
- max_retries=MAX_RDS_RETRY,
- )
- logger.info("Result has been persisted to the database")
-
- # TODO(jjyao) Migrate to new infra later
- logger.info("Persisting results to the databricks delta lake...")
-
- result_json = {
- "_table": "release_test_result",
- "created_on": now.strftime("%Y-%m-%d %H:%M:%S"),
- "status": status,
- "results": results,
- "test_name": test_name,
- "team": team,
- "cluster_url": results["_session_url"],
- "wheel_url": results["_commit_url"],
- "runtime": results["_runtime"],
- "stable": results["_stable"],
- }
-
- logger.debug(f"Result json: {json.dumps(result_json)}")
-
- firehose_client = boto3.client("firehose", region_name="us-west-2")
- firehose_client.put_record(
- DeliveryStreamName="ray-ci-results", Record={"Data": json.dumps(result_json)}
- )
-
- logger.info("Result has been persisted to the databricks delta lake")
-
-
-def log_results_and_artifacts(result: Dict):
- results = result.get("results", {})
- if results:
- msg = "Observed the following results:\n\n"
-
- for key, val in results.items():
- msg += f" {key} = {val}\n"
- else:
- msg = "Did not find any results."
- logger.info(msg)
-
- artifacts = result.get("artifacts", {})
- if artifacts:
- msg = "Saved the following artifacts:\n\n"
-
- for key, val in artifacts.items():
- msg += f" {key} = {val}\n"
- else:
- msg = "Did not find any artifacts."
- logger.info(msg)
-
-
-def _cleanup_session(sdk: AnyscaleSDK, session_id: str):
- if session_id:
- # Just trigger a request. No need to wait until session shutdown.
- sdk.terminate_session(session_id=session_id, terminate_session_options={})
-
-
-def search_running_session(
- sdk: AnyscaleSDK, project_id: str, session_name: str
-) -> Optional[str]:
- session_id = None
-
- logger.info(f"Looking for existing session with name {session_name}")
-
- result = sdk.search_sessions(
- project_id=project_id, sessions_query=dict(name=dict(equals=session_name))
- )
-
- if len(result.results) > 0 and result.results[0].state == "Running":
- logger.info("Found existing session.")
- session_id = result.results[0].id
- return session_id
-
-
-def find_cloud_by_name(
- sdk: AnyscaleSDK, cloud_name: str, _repeat: bool = True
-) -> Optional[str]:
- cloud_id = None
- logger.info(f"Looking up cloud with name `{cloud_name}`. ")
-
- paging_token = None
- while not cloud_id:
- result = sdk.search_clouds(
- clouds_query=dict(paging=dict(count=50, paging_token=paging_token))
- )
-
- paging_token = result.metadata.next_paging_token
-
- for res in result.results:
- if res.name == cloud_name:
- cloud_id = res.id
- logger.info(f"Found cloud with name `{cloud_name}` as `{cloud_id}`")
- break
-
- if not paging_token or cloud_id or not len(result.results):
- break
-
- return cloud_id
-
-
-def create_or_find_compute_template(
- sdk: AnyscaleSDK, project_id: str, compute_tpl: Dict[Any, Any], _repeat: bool = True
-) -> Tuple[Optional[str], Optional[str]]:
- compute_tpl_id = None
- compute_tpl_name = None
- if compute_tpl:
- # As of Anyscale 0.4.1, it is an error to use the same compute template
- # name within the same organization, between different projects.
- compute_tpl_name = f"{project_id}/compute/{_dict_hash(compute_tpl)}"
-
- logger.info(
-            f"Test uses a compute template "
- f"with name {compute_tpl_name}. Looking up existing "
- f"templates."
- )
-
- paging_token = None
- while not compute_tpl_id:
- result = sdk.search_compute_templates(
- dict(
- project_id=project_id,
- name=dict(equals=compute_tpl_name),
- include_anonymous=True,
- ),
- paging_token=paging_token,
- )
- paging_token = result.metadata.next_paging_token
-
- for res in result.results:
- if res.name == compute_tpl_name:
- compute_tpl_id = res.id
- logger.info(f"Template already exists with ID {compute_tpl_id}")
- break
-
- if not paging_token:
- break
-
- if not compute_tpl_id:
- logger.info(
- f"Compute template not found. "
- f"Creating with name {compute_tpl_name}."
- )
- try:
- result = sdk.create_compute_template(
- dict(
- name=compute_tpl_name, project_id=project_id, config=compute_tpl
- )
- )
- compute_tpl_id = result.result.id
- except Exception as e:
- if _repeat:
- logger.warning(
- f"Got exception when trying to create compute "
- f"template: {e}. Sleeping for 10 seconds and then "
- f"try again once..."
- )
- time.sleep(10)
- return create_or_find_compute_template(
- sdk=sdk,
- project_id=project_id,
- compute_tpl=compute_tpl,
- _repeat=False,
- )
-
- raise e
-
- logger.info(f"Compute template created with ID {compute_tpl_id}")
-
- return compute_tpl_id, compute_tpl_name
-
-
-def create_or_find_app_config(
- sdk: AnyscaleSDK, project_id: str, app_config: Dict[Any, Any], _repeat: bool = True
-) -> Tuple[Optional[str], Optional[str]]:
- app_config_id = None
- app_config_name = None
- if app_config:
- app_config_name = f"{project_id}-{_dict_hash(app_config)}"
-
- logger.info(
- f"Test uses an app config with hash {app_config_name}. "
- f"Looking up existing app configs with this name."
- )
-
- paging_token = None
- while not app_config_id:
- result = sdk.list_app_configs(
- project_id=project_id, count=50, paging_token=paging_token
- )
- paging_token = result.metadata.next_paging_token
-
- for res in result.results:
- if res.name == app_config_name:
- app_config_id = res.id
- logger.info(f"App config already exists with ID {app_config_id}")
- break
-
- if not paging_token or app_config_id:
- break
-
- if not app_config_id:
- logger.info("App config not found. Creating new one.")
- try:
- result = sdk.create_app_config(
- dict(
- name=app_config_name,
- project_id=project_id,
- config_json=app_config,
- )
- )
- app_config_id = result.result.id
- except Exception as e:
- if _repeat:
- logger.warning(
- f"Got exception when trying to create app "
- f"config: {e}. Sleeping for 10 seconds and then "
- f"try again once..."
- )
- time.sleep(10)
- return create_or_find_app_config(
- sdk=sdk,
- project_id=project_id,
- app_config=app_config,
- _repeat=False,
- )
-
- raise e
-
- logger.info(f"App config created with ID {app_config_id}")
-
- return app_config_id, app_config_name
-
-
-def run_bash_script(local_dir: str, bash_script: str):
- previous_dir = os.getcwd()
-
- bash_script_local_dir = os.path.dirname(bash_script)
- file_name = os.path.basename(bash_script)
-
- full_local_dir = os.path.join(local_dir, bash_script_local_dir)
- os.chdir(full_local_dir)
-
- subprocess.run("./" + file_name, shell=True, check=True)
-
- os.chdir(previous_dir)
-
-
-def install_app_config_packages(app_config: Dict[Any, Any]):
- os.environ.update(app_config.get("env_vars", {}))
- packages = app_config["python"]["pip_packages"]
- for package in packages:
- subprocess.check_output(["pip", "install", "-U", package], text=True)
-
-
-def install_matching_ray():
- wheel = os.environ.get("RAY_WHEELS", None)
- if not wheel:
- return
- assert "manylinux2014_x86_64" in wheel, wheel
- if sys.platform == "darwin":
- platform = "macosx_10_15_intel"
- elif sys.platform == "win32":
- platform = "win_amd64"
- else:
- platform = "manylinux2014_x86_64"
- wheel = wheel.replace("manylinux2014_x86_64", platform)
- subprocess.check_output(["pip", "uninstall", "-y", "ray"], text=True)
- subprocess.check_output(["pip", "install", "-U", wheel], text=True)
-
-
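-# Return the ID of a successful build for the given app config, waiting for
-# in-progress builds and raising AppConfigBuildFailure if the build fails.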
-def wait_for_build_or_raise(
- sdk: AnyscaleSDK, app_config_id: Optional[str]
-) -> Optional[str]:
- if not app_config_id:
- return None
-
- # Fetch build
- build_id = None
- last_status = None
- result = sdk.list_builds(app_config_id)
- for build in sorted(result.results, key=lambda b: b.created_at):
- build_id = build.id
- last_status = build.status
-
- if build.status == "failed":
- continue
-
- if build.status == "succeeded":
- logger.info(
- f"Link to app config build: "
- f"{_format_link(anyscale_app_config_build_url(build_id))}"
- )
- return build_id
-
- if last_status == "failed":
- raise AppConfigBuildFailure("App config build failed.")
-
- if not build_id:
- raise AppConfigBuildFailure("No build found for app config.")
-
- # Build found but not failed/finished yet
- completed = False
- start_wait = time.time()
- next_report = start_wait + REPORT_S
- logger.info(f"Waiting for build {build_id} to finish...")
- logger.info(
- f"Track progress here: "
- f"{_format_link(anyscale_app_config_build_url(build_id))}"
- )
- while not completed:
- now = time.time()
- if now > next_report:
- logger.info(
- f"... still waiting for build {build_id} to finish "
- f"({int(now - start_wait)} seconds) ..."
- )
- next_report = next_report + REPORT_S
-
- result = sdk.get_build(build_id)
- build = result.result
-
- if build.status == "failed":
- raise AppConfigBuildFailure(
- f"App config build failed. Please see "
- f"{anyscale_app_config_build_url(build_id)} for details"
- )
-
- if build.status == "succeeded":
- logger.info("Build succeeded.")
- return build_id
-
- completed = build.status not in ["in_progress", "pending"]
-
- if completed:
- raise AppConfigBuildFailure(
- f"Unknown build status: {build.status}. Please see "
- f"{anyscale_app_config_build_url(build_id)} for details"
- )
-
- time.sleep(1)
-
- return build_id
-
-
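-# Run the test script locally against the cluster via Ray Client
-# (`anyscale://` address), streaming and capturing its output.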
-def run_job(
- cluster_name: str,
- compute_tpl_name: str,
- cluster_env_name: str,
- job_name: str,
- min_workers: str,
- script: str,
- script_args: List[str],
- env_vars: Dict[str, str],
- autosuspend: int,
-) -> Tuple[int, str]:
- # Start cluster and job
- address = f"anyscale://{cluster_name}?autosuspend={autosuspend}"
- logger.info(f"Starting job {job_name} with Ray address: {address}")
- env = copy.deepcopy(os.environ)
- env.update(GLOBAL_CONFIG)
- env.update(env_vars)
- env["RAY_ADDRESS"] = address
- env["RAY_JOB_NAME"] = job_name
- env["RAY_RELEASE_MIN_WORKERS"] = str(min_workers)
- proc = subprocess.Popen(
- script.split(" ") + script_args,
- env=env,
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- text=True,
- )
- proc.stdout.reconfigure(line_buffering=True)
- logs = ""
- for line in proc.stdout:
- logs += line
- sys.stdout.write(line)
- proc.wait()
- return proc.returncode, logs
-
-
-def create_and_wait_for_session(
- sdk: AnyscaleSDK,
- stop_event: multiprocessing.Event,
- session_name: str,
- session_options: Dict[Any, Any],
- project_id: str,
-) -> str:
- # Create session
- logger.info(f"Creating session {session_name}")
- result = sdk.create_session(session_options)
- session_id = result.result.id
-
- # Trigger session start
- logger.info(f"Starting session {session_name} ({session_id})")
- session_url = anyscale_session_url(
- project_id=GLOBAL_CONFIG["ANYSCALE_PROJECT"], session_id=session_id
- )
- logger.info(f"URL: {session_url}")
- logger.info(f"Link to session: {_format_link(session_url)}")
-
- result = sdk.start_session(session_id, start_session_options={})
- sop_id = result.result.id
- completed = result.result.completed
-
- # Wait for session
- logger.info(f"Waiting for session {session_name}...")
- start_wait = time.time()
- next_report = start_wait + REPORT_S
- while not completed:
- # Sleep 1 sec before next check.
- time.sleep(1)
-
- session_operation_response = sdk.get_session_operation(
- sop_id, _request_timeout=30
- )
- session_operation = session_operation_response.result
- completed = session_operation.completed
-
- try:
- _check_stop(stop_event, "session")
- except SessionTimeoutError as e:
- # Always queue session termination.
- # We can't do this later as we won't return anything here
- # and the session ID will not be set in the control loop
- _cleanup_session(sdk=sdk, session_id=session_id)
- raise e
-
- now = time.time()
- if now > next_report:
- logger.info(
- f"... still waiting for session {session_name} "
- f"({int(now - start_wait)} seconds) ..."
- )
- next_report = next_report + REPORT_S
-
- result = sdk.get_session(session_id)
-    if result.result.state != "Active":
- raise ReleaseTestInfraError(
- f"Cluster did not come up - most likely the nodes are currently "
- f"not available. Please check the cluster startup logs: "
- f"{anyscale_session_url(project_id, session_id)}"
- )
-
- return session_id
-
-
-def run_session_command(
- sdk: AnyscaleSDK,
- session_id: str,
- cmd_to_run: str,
- result_queue: multiprocessing.Queue,
- env_vars: Dict[str, str],
- state_str: str = "CMD_RUN",
-) -> Tuple[str, Any]:
- full_cmd = " ".join(f"{k}={v}" for k, v in env_vars.items()) + " " + cmd_to_run
-
- logger.info(f"Running command in session {session_id}: \n" f"{full_cmd}")
- session_url = anyscale_session_url(
- project_id=GLOBAL_CONFIG["ANYSCALE_PROJECT"], session_id=session_id
- )
- logger.info(f"URL: {session_url}")
- logger.info(f"Link to session: {_format_link(session_url)}")
- result_queue.put(State(state_str, time.time(), None))
- result = sdk.create_session_command(
- dict(session_id=session_id, shell_command=full_cmd)
- )
-
- scd_id = result.result.id
- return scd_id, result
-
-
-def wait_for_session_command_to_complete(
- create_session_command_result,
- sdk: AnyscaleSDK,
- scd_id: str,
- stop_event: multiprocessing.Event,
- state_str: str = "CMD_RUN",
-):
- result = create_session_command_result
- completed = result.result.finished_at is not None
- start_wait = time.time()
- next_report = start_wait + REPORT_S
- while not completed:
- # Sleep 1 sec before next check.
- time.sleep(1)
-
- result = exponential_backoff_retry(
- lambda: sdk.get_session_command(session_command_id=scd_id),
- retry_exceptions=Exception,
- initial_retry_delay_s=10,
- max_retries=3,
- )
-        completed = result.result.finished_at is not None
-
- if state_str == "CMD_RUN":
- _check_stop(stop_event, "command")
- elif state_str == "CMD_PREPARE":
- _check_stop(stop_event, "prepare_command")
-
- now = time.time()
- if now > next_report:
- logger.info(
- f"... still waiting for command to finish "
- f"({int(now - start_wait)} seconds) ..."
- )
- next_report = next_report + REPORT_S
-
- status_code = result.result.status_code
- runtime = time.time() - start_wait
-
- if status_code != 0:
- if state_str == "CMD_RUN":
- raise RuntimeError(f"Command returned non-success status: {status_code}")
- elif state_str == "CMD_PREPARE":
- raise PrepareCommandRuntimeError(
- f"Prepare command returned non-success status: {status_code}"
- )
-
- return status_code, runtime
-
-
-def get_command_logs(
- session_controller: SessionController, scd_id: str, lines: int = 50
-):
- result = exponential_backoff_retry(
- lambda: session_controller.api_client.get_execution_logs_api_v2_session_commands_session_command_id_execution_logs_get( # noqa: E501
- session_command_id=scd_id, start_line=-1 * lines, end_line=0
- ),
- retry_exceptions=Exception,
- initial_retry_delay_s=10,
- max_retries=3,
- )
-
- return result.result.lines
-
-
-def get_remote_json_content(
- temp_dir: str,
- session_name: str,
- remote_file: Optional[str],
- session_controller: SessionController,
-):
- if not remote_file:
- logger.warning("No remote file specified, returning empty dict")
- return {}
- local_target_file = os.path.join(temp_dir, ".tmp.json")
- session_controller.pull(
- session_name=session_name, source=remote_file, target=local_target_file
- )
- with open(local_target_file, "rt") as f:
- return json.load(f)
-
-
-def get_local_json_content(
- local_file: Optional[str],
-):
- if not local_file:
- logger.warning("No local file specified, returning empty dict")
- return {}
- with open(local_file, "rt") as f:
- return json.load(f)
-
-
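-# Upload the command logs and any requested artifacts to the release S3
-# bucket and return a mapping from artifact name to S3 URL.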
-def pull_artifacts_and_store_in_cloud(
- temp_dir: str,
- logs: str,
- session_name: str,
- test_name: str,
- artifacts: Optional[Dict[Any, Any]],
- session_controller: SessionController,
-):
- output_log_file = os.path.join(temp_dir, "output.log")
- with open(output_log_file, "wt") as f:
- f.write(logs)
-
- bucket = GLOBAL_CONFIG["RELEASE_AWS_BUCKET"]
- location = f"{GLOBAL_CONFIG['RELEASE_AWS_LOCATION']}" f"/{session_name}/{test_name}"
- saved_artifacts = {}
-
- s3_client = boto3.client("s3")
- s3_client.upload_file(output_log_file, bucket, f"{location}/output.log")
- saved_artifacts["output.log"] = f"s3://{bucket}/{location}/output.log"
-
- # Download artifacts
- if artifacts:
- for name, remote_file in artifacts.items():
- logger.info(f"Downloading artifact `{name}` from " f"{remote_file}")
- local_target_file = os.path.join(temp_dir, name)
- session_controller.pull(
- session_name=session_name, source=remote_file, target=local_target_file
- )
-
- # Upload artifacts to s3
- s3_client.upload_file(local_target_file, bucket, f"{location}/{name}")
- saved_artifacts[name] = f"s3://{bucket}/{location}/{name}"
-
- return saved_artifacts
-
-
-def find_session_by_test_name(
- sdk: AnyscaleSDK,
- session_controller: SessionController,
- temp_dir: str,
- state_json: str,
- project_id: str,
- test_name: str,
-) -> Optional[Tuple[str, str, Dict[Any, Any]]]:
- paging_token = None
-
- while True: # Will break if paging_token is None after first search
- result = sdk.search_sessions(
- project_id=project_id,
- sessions_query=dict(
- name=dict(contains=test_name),
- state_filter=["Running"],
- paging=dict(count=20, paging_token=paging_token),
- ),
- )
-
- for session in result.results:
-            logger.info(f"Found session {session.name}")
- if not session.name.startswith(test_name):
- continue
-
- try:
- session_state = get_remote_json_content(
- temp_dir=temp_dir,
- session_name=session.name,
- remote_file=state_json,
- session_controller=session_controller,
- )
- except Exception as exc:
- raise RuntimeError(
- f"Could not get remote json content " f"for session {session.name}"
- ) from exc
-
- if session_state.get("test_name") == test_name:
- return session.id, session.name, session_state
-
-        paging_token = result.metadata.next_paging_token
-
-        if not paging_token:
-            return None
-
-
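-# Return the most recent session command ID together with a success flag:
-# True/False once all commands finished, None if a command is still running.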
-def get_latest_running_command_id(
- sdk: AnyscaleSDK, session_id: str
-) -> Tuple[Optional[str], Optional[bool]]:
- scd_id = None
- paging_token = None
-
- success = None
-
- while not scd_id:
- result = sdk.list_session_commands(
- session_id=session_id, paging_token=paging_token
- )
-
- paging_token = result.metadata.next_paging_token
-
- for cmd in result.results:
- if not scd_id:
- scd_id = cmd.id
-
- completed = cmd.finished_at is not None
-
- if completed:
- if success is None:
- success = True
-
- success = success and cmd.status_code == 0
-
- if not completed:
- return cmd.id, None
-
- return scd_id, success or False
-
-
-def run_test_config(
- local_dir: str,
- project_id: str,
- test_name: str,
- test_config: Dict[Any, Any],
- commit_url: str,
- session_name: str = None,
- smoke_test: bool = False,
- no_terminate: bool = False,
- kick_off_only: bool = False,
- check_progress: bool = False,
- upload_artifacts: bool = True,
- keep_results_dir: bool = False,
- app_config_id_override: Optional[str] = None,
-) -> Dict[Any, Any]:
-    """Run a single release test and collect its results.
-
- Returns:
- Dict with the following entries:
- status (str): One of [finished, error, timeout]
- command_link (str): Link to command (Anyscale web UI)
- last_logs (str): Last logs (excerpt) to send to owner
- artifacts (dict): Dict of artifacts
- Key: Name
- Value: S3 URL
- """
- stop_event = multiprocessing.Event()
- result_queue = multiprocessing.Queue()
-
- if not session_name:
- session_name = f"{test_name}_{int(time.time())}"
-
- temp_dir = tempfile.mkdtemp()
-
- # Result and state files
- results_json = test_config["run"].get("results", None)
- if results_json is None:
- results_json = "/tmp/release_test_out.json"
-
- state_json = test_config["run"].get("state", None)
- if state_json is None:
- state_json = "/tmp/release_test_state.json"
-
- env_vars = {
- "RAY_ADDRESS": os.environ.get("RAY_ADDRESS", "auto"),
- "TEST_OUTPUT_JSON": results_json,
- "TEST_STATE_JSON": state_json,
- "IS_SMOKE_TEST": "1" if smoke_test else "0",
- }
-
- with open(os.path.join(local_dir, ".anyscale.yaml"), "wt") as f:
- f.write(f"project_id: {project_id}")
- os.chdir(local_dir)
-
- # Setup interface
- # Unfortunately, there currently seems to be no great way to
- # transfer files with the Anyscale SDK.
- # So we use the session controller instead.
- sdk = AnyscaleSDK(
- auth_token=GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"],
- host=GLOBAL_CONFIG["ANYSCALE_HOST"],
- )
-
- get_auth_api_client(
- cli_token=GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"],
- host=GLOBAL_CONFIG["ANYSCALE_HOST"],
- )
- on_k8s = test_config["cluster"].get("compute_on_k8s")
- if on_k8s:
- session_controller = S3SyncSessionController(sdk, result_queue)
- else:
- session_controller = SessionController()
-
- cloud_id = test_config["cluster"].get("cloud_id", None)
- cloud_name = test_config["cluster"].get("cloud_name", None)
- if cloud_id and cloud_name:
- raise RuntimeError(
- f"You can't supply both a `cloud_name` ({cloud_name}) and a "
- f"`cloud_id` ({cloud_id}) in the test cluster configuration. "
- f"Please provide only one."
- )
- elif cloud_name and not cloud_id:
- cloud_id = find_cloud_by_name(sdk, cloud_name)
- if not cloud_id:
- raise RuntimeError(f"Couldn't find cloud with name `{cloud_name}`.")
- else:
- cloud_id = cloud_id or GLOBAL_CONFIG["ANYSCALE_CLOUD_ID"]
-
- # Overwrite global config so that `_load_config` sets the correct cloud
- GLOBAL_CONFIG["ANYSCALE_CLOUD_ID"] = cloud_id
-
- cluster_config_rel_path = test_config["cluster"].get("cluster_config", None)
- cluster_config = _load_config(local_dir, cluster_config_rel_path)
-
- app_config_rel_path = test_config["cluster"].get("app_config", None)
- app_config = _load_config(local_dir, app_config_rel_path)
- if app_config.get("env_vars") is None:
- app_config["env_vars"] = {}
-    # Many staging tests share the same app config yaml and only differ in flags.
-    # `app_env_vars` in the test config covers this case: the app config's
-    # env_vars are extended with the ones specified in the test config.
- if test_config.get("app_env_vars") is not None:
- app_config["env_vars"].update(test_config["app_env_vars"])
- logger.info(f"Using app config:\n{app_config}")
-
- # Flags for redisless ray.
- # TODO: remove them once done.
- app_config["env_vars"]["MATCH_AUTOSCALER_AND_RAY_IMAGES"] = "1"
- app_config["env_vars"]["RAY_bootstrap_with_gcs"] = "1"
- app_config["env_vars"]["RAY_gcs_storage"] = "memory"
- app_config["env_vars"]["RAY_USAGE_STATS_ENABLED"] = "1"
- app_config["env_vars"]["RAY_USAGE_STATS_SOURCE"] = "nightly-tests"
-
- compute_tpl_rel_path = test_config["cluster"].get("compute_template", None)
- compute_tpl = _load_config(local_dir, compute_tpl_rel_path)
-
- timeout = test_config["run"].get("timeout", 1800)
- if "RELEASE_OVERRIDE_TIMEOUT" in os.environ:
- previous_timeout = timeout
- timeout = int(os.environ.get("RELEASE_OVERRIDE_TIMEOUT", str(timeout)))
- logger.warning(
- f"Release test timeout override: {timeout} "
- f"(would have been {previous_timeout})"
- )
-
- # If a test is long running, timeout does not mean it failed
- is_long_running = test_config["run"].get("long_running", False)
-
- build_id_override = None
- if test_config["run"].get("use_connect"):
- autosuspend_mins = test_config["run"].get("autosuspend_mins", 5)
- assert not kick_off_only, "Unsupported for running with Anyscale connect."
- if app_config_id_override is not None:
- logger.info(
- "Using connect and an app config override, waiting until "
- "build finishes so we can fetch the app config in order to "
- "install its pip packages locally."
- )
- build_id_override = wait_for_build_or_raise(sdk, app_config_id_override)
- response = sdk.get_cluster_environment_build(build_id_override)
- app_config = response.result.config_json
- install_app_config_packages(app_config)
- install_matching_ray()
- elif "autosuspend_mins" in test_config["run"]:
- raise ValueError(
- "'autosuspend_mins' is only supported if 'use_connect' is True."
- )
-
- # Add information to results dict
- def _update_results(results: Dict):
- if "last_update" in results:
- results["last_update_diff"] = time.time() - results["last_update"]
- if smoke_test:
- results["smoke_test"] = True
-
- def _process_finished_command(
- session_controller: SessionController,
- scd_id: str,
- results: Optional[Dict] = None,
- runtime: int = None,
- commit_url: str = None,
- session_url: str = None,
- ):
- logger.info("Command finished successfully.")
- if results_json:
- results = results or get_remote_json_content(
- temp_dir=temp_dir,
- session_name=session_name,
- remote_file=results_json,
- session_controller=session_controller,
- )
- else:
- results = {"passed": 1}
-
- _update_results(results)
-
- if scd_id:
- try:
- logs = get_command_logs(
- session_controller, scd_id, test_config.get("log_lines", 50)
- )
- except Exception as e:
- raise ReleaseTestInfraError(
- f"Could not fetch command logs: {e}. This is an "
- f"infrastructure error on the Anyscale side."
- )
- else:
- logs = "No command found to fetch logs for"
-
- if upload_artifacts:
- saved_artifacts = pull_artifacts_and_store_in_cloud(
- temp_dir=temp_dir,
- logs=logs, # Also save logs in cloud
- session_name=session_name,
- test_name=test_name,
- artifacts=test_config.get("artifacts", {}),
- session_controller=session_controller,
- )
-
- logger.info("Fetched results and stored on the cloud. Returning.")
- else:
- saved_artifacts = {}
- logger.info(
- "Usually I would have fetched the results and "
- "artifacts and stored them on S3."
- )
-
- # Add these metadata here to avoid changing SQL schema.
- results["_runtime"] = runtime
- results["_session_url"] = session_url
- results["_commit_url"] = commit_url
- results["_stable"] = test_config.get("stable", True)
- result_queue.put(
- State(
- "END",
- time.time(),
- {
- "status": "finished",
- "last_logs": logs,
- "results": results,
- "artifacts": saved_artifacts,
- },
- )
- )
-
- # When running the test script in client mode, the finish command is a
- # completed local process.
- def _process_finished_client_command(returncode: int, logs: str):
- if returncode != 0:
- raise RuntimeError(f"Client returned non-success status: {returncode}")
- if upload_artifacts:
- saved_artifacts = pull_artifacts_and_store_in_cloud(
- temp_dir=temp_dir,
- logs=logs, # Also save logs in cloud
- session_name=session_name,
- test_name=test_name,
- artifacts=None,
- session_controller=None,
- )
- logger.info("Stored results on the cloud. Returning.")
- else:
- saved_artifacts = {}
- logger.info(
- "Usually I would have fetched the results and "
- "artifacts and stored them on S3."
- )
-
- if results_json:
- results = get_local_json_content(
- local_file=results_json,
- )
- else:
- results = {
- "passed": int(returncode == 0),
- }
-
- results["returncode"] = returncode
-
- _update_results(results)
-
- result_queue.put(
- State(
- "END",
- time.time(),
- {
- "status": "finished",
- "last_logs": logs,
- "results": results,
- "artifacts": saved_artifacts,
- },
- )
- )
-
- def _run(logger):
- # These values will be set as the test runs.
- session_url = None
- runtime = None
- anyscale.conf.CLI_TOKEN = GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"]
- test_uses_ray_connect = test_config["run"].get("use_connect")
-
- session_id = None
- scd_id = None
- try:
- # First, look for running sessions
- session_id = search_running_session(sdk, project_id, session_name)
- compute_tpl_name = None
- app_config_id = app_config_id_override
- app_config_name = None
- build_id = build_id_override
- if not session_id:
- logger.info("No session found.")
- # Start session
- session_options = dict(name=session_name, project_id=project_id)
-
- if cluster_config is not None:
- logging.info("Starting session with cluster config")
- cluster_config_str = json.dumps(cluster_config)
- session_options["cluster_config"] = cluster_config_str
- session_options["cloud_id"] = cloud_id
- session_options["uses_app_config"] = False
- else:
- logging.info("Starting session with app/compute config")
-
- # Find/create compute template
- compute_tpl_id, compute_tpl_name = create_or_find_compute_template(
- sdk, project_id, compute_tpl
- )
-
- url = _format_link(anyscale_compute_tpl_url(compute_tpl_id))
-
- logger.info(f"Link to compute template: {url}")
-
- # Find/create app config
- if app_config_id is None:
- (
- app_config_id,
- app_config_name,
- ) = create_or_find_app_config(sdk, project_id, app_config)
- else:
- logger.info(f"Using override app config {app_config_id}")
- app_config_name = sdk.get_app_config(app_config_id).result.name
- if build_id is None:
- # We might have already retrieved the build ID when
- # installing app config packages locally if using
- # connect, so only get the build ID if it's not set.
- build_id = wait_for_build_or_raise(sdk, app_config_id)
-
- session_options["compute_template_id"] = compute_tpl_id
- session_options["build_id"] = build_id
- session_options["uses_app_config"] = True
-
- # Start session
- session_id = create_and_wait_for_session(
- sdk=sdk,
- stop_event=stop_event,
- session_name=session_name,
- session_options=session_options,
- project_id=project_id,
- )
-
- prepare_command = test_config["run"].get("prepare")
-
- # Write test state json
- test_state_file = os.path.join(local_dir, "test_state.json")
- with open(test_state_file, "wt") as f:
- json.dump({"start_time": time.time(), "test_name": test_name}, f)
-
- on_k8s = test_config["cluster"].get("compute_on_k8s")
- if prepare_command or not test_uses_ray_connect:
- if test_uses_ray_connect:
- logger.info(
- "Found a prepare command, so pushing it " "to the session."
- )
- # Rsync up
- logger.info("Syncing files to session...")
- session_controller.push(
- session_name=session_name,
- source=None,
- target=None,
- config=None,
- all_nodes=False,
- )
-
- logger.info("Syncing test state to session...")
- session_controller.push(
- session_name=session_name,
- source=test_state_file,
- target=state_json,
- config=None,
- all_nodes=False,
- )
-
- session_url = anyscale_session_url(
- project_id=GLOBAL_CONFIG["ANYSCALE_PROJECT"], session_id=session_id
- )
- _check_stop(stop_event, "file_sync")
-
- # Optionally run preparation command
- if prepare_command:
- logger.info(f"Running preparation command: {prepare_command}")
- if on_k8s:
- cid = global_command_runner.run_command(
- session_name, prepare_command, env_vars
- )
- status_code, _ = global_command_runner.wait_command(cid)
- if status_code != 0:
- raise PrepareCommandRuntimeError()
- else:
- scd_id, result = run_session_command(
- sdk=sdk,
- session_id=session_id,
- cmd_to_run=prepare_command,
- result_queue=result_queue,
- env_vars=env_vars,
- state_str="CMD_PREPARE",
- )
- _, _ = wait_for_session_command_to_complete(
- result,
- sdk=sdk,
- scd_id=scd_id,
- stop_event=stop_event,
- state_str="CMD_PREPARE",
- )
-
- if test_uses_ray_connect:
- script_args = test_config["run"].get("args", [])
- if smoke_test:
- script_args += ["--smoke-test"]
- min_workers = 0
- for node_type in compute_tpl["worker_node_types"]:
- min_workers += node_type["min_workers"]
- # Build completed, use job timeout
- result_queue.put(State("CMD_RUN", time.time(), None))
- returncode, logs = run_job(
- cluster_name=session_name,
- compute_tpl_name=compute_tpl_name,
- cluster_env_name=app_config_name,
- job_name=session_name,
- min_workers=min_workers,
- script=test_config["run"]["script"],
- script_args=script_args,
- env_vars=env_vars,
- autosuspend=autosuspend_mins,
- )
- _process_finished_client_command(returncode, logs)
- return
-
- # Run release test command
- cmd_to_run = test_config["run"]["script"] + " "
-
- args = test_config["run"].get("args", [])
- if args:
- cmd_to_run += " ".join(args) + " "
-
- if smoke_test:
- cmd_to_run += " --smoke-test"
-
- if on_k8s:
- cmd_id = global_command_runner.run_command(
- session_name, cmd_to_run, env_vars=env_vars
- )
- else:
- scd_id, result = run_session_command(
- sdk=sdk,
- session_id=session_id,
- cmd_to_run=cmd_to_run,
- result_queue=result_queue,
- env_vars=env_vars,
- state_str="CMD_RUN",
- )
-
- if not kick_off_only:
- if on_k8s:
- retcode, runtime = global_command_runner.wait_command(cmd_id)
- if retcode != 0:
- raise RuntimeError("Command errored")
- _process_finished_command(
- session_controller=session_controller,
- scd_id="",
- runtime=runtime,
- session_url=session_url,
- commit_url=commit_url,
- )
- else:
- _, runtime = wait_for_session_command_to_complete(
- result,
- sdk=sdk,
- scd_id=scd_id,
- stop_event=stop_event,
- state_str="CMD_RUN",
- )
- _process_finished_command(
- session_controller=session_controller,
- scd_id=scd_id,
- runtime=runtime,
- session_url=session_url,
- commit_url=commit_url,
- )
- else:
- result_queue.put(
- State("END", time.time(), {"status": "kickoff", "last_logs": ""})
- )
-
- except (ReleaseTestTimeoutError, Exception) as e:
- logger.error(e, exc_info=True)
-
- logs = str(e)
- if scd_id is not None:
- try:
- logs = (
- logs
- + "; Command logs:"
- + get_command_logs(
- session_controller, scd_id, test_config.get("log_lines", 50)
- )
- )
- except Exception as e2:
- logger.error(e2, exc_info=True)
-
- # Long running tests are "finished" successfully when
- # timed out
- if isinstance(e, ReleaseTestTimeoutError) and is_long_running:
- _process_finished_command(
- session_controller=session_controller, scd_id=scd_id
- )
- else:
- runtime = None
- if isinstance(e, CommandTimeoutError):
- error_type = "timeout"
- runtime = 0
- exit_code = ExitCode.COMMAND_TIMEOUT
- elif isinstance(e, PrepareCommandTimeoutError):
- error_type = "infra_timeout"
- runtime = None
- exit_code = ExitCode.PREPARE_TIMEOUT
- elif isinstance(e, FileSyncTimeoutError):
- error_type = "infra_timeout"
- runtime = None
- exit_code = ExitCode.FILESYNC_TIMEOUT
- elif isinstance(e, SessionTimeoutError):
- error_type = "infra_timeout"
- runtime = None
- exit_code = ExitCode.SESSION_TIMEOUT
- elif isinstance(e, PrepareCommandRuntimeError):
- error_type = "infra_timeout"
- runtime = None
- exit_code = ExitCode.PREPARE_ERROR
- elif isinstance(e, AppConfigBuildFailure):
- error_type = "infra_timeout"
- runtime = None
- exit_code = ExitCode.APPCONFIG_BUILD_ERROR
- elif isinstance(e, ReleaseTestInfraError):
- error_type = "infra_error"
- exit_code = ExitCode.INFRA_ERROR
- elif isinstance(e, RuntimeError):
- error_type = "runtime_error"
- runtime = 0
- exit_code = ExitCode.RUNTIME_ERROR
- else:
- error_type = "unknown timeout"
- runtime = None
- exit_code = ExitCode.UNKNOWN
-
- # Add these metadata here to avoid changing SQL schema.
- results = {}
- results["_runtime"] = runtime
- results["_session_url"] = session_url
- results["_commit_url"] = commit_url
- results["_stable"] = test_config.get("stable", True)
- result_queue.put(
- State(
- "END",
- time.time(),
- {
- "status": error_type,
- "last_logs": logs,
- "results": results,
- "exit_code": exit_code.value,
- },
- )
- )
- finally:
- if no_terminate:
- logger.warning(
- "`no_terminate` is set to True, so the session will "
- "*not* be terminated!"
- )
- else:
- _cleanup_session(sdk, session_id)
-
- def _check_progress(logger):
- anyscale.conf.CLI_TOKEN = GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"]
-
- should_terminate = False
- session_id = None
- scd_id = None
- try:
- existing_session = find_session_by_test_name(
- sdk=sdk,
- session_controller=session_controller,
- temp_dir=temp_dir,
- state_json=state_json,
- project_id=project_id,
- test_name=test_name,
- )
-
- if existing_session is None:
- logger.info(f"Found no existing session for {test_name}")
- result_queue.put(
- State("END", time.time(), {"status": "nosession", "last_logs": ""})
- )
- return
-
- session_id, session_name, session_state = existing_session
-
- logger.info(f"Found existing session for {test_name}: " f"{session_name}")
-
- scd_id, success = get_latest_running_command_id(
- sdk=sdk, session_id=session_id
- )
-
- latest_result = get_remote_json_content(
- temp_dir=temp_dir,
- session_name=session_name,
- remote_file=results_json,
- session_controller=session_controller,
- )
-
- # Fetch result json and check if it has been updated recently
- result_time_key = test_config["run"].get("time_key", None)
- maximum_update_delay = test_config["run"].get("max_update_delay", None)
-
- if result_time_key and maximum_update_delay:
- last_update = latest_result.get(result_time_key, None)
-
- if not last_update:
- result_queue.put(
- State(
- "END",
- time.time(),
- {
- "status": "error",
- "last_logs": f"Test did not store "
- f"{result_time_key} in the "
- f"results json.",
- },
- )
- )
- return
-
- delay = time.time() - last_update
- logger.info(
- f"Last update was at {last_update:.2f}. "
- f"This was {delay:.2f} seconds ago "
- f"(maximum allowed: {maximum_update_delay})"
- )
-
- if delay > maximum_update_delay:
- raise RuntimeError(
- f"Test did not update the results json within "
- f"the last {maximum_update_delay} seconds."
- )
-
- if time.time() - session_state["start_time"] > timeout:
- # Long running test reached timeout
- logger.info(f"Test command reached timeout after {timeout} seconds")
- _process_finished_command(
- session_controller=session_controller,
- scd_id=scd_id,
- results=latest_result,
- )
- should_terminate = True
-
- elif success:
- logger.info("All commands finished.")
- _process_finished_command(
- session_controller=session_controller,
- scd_id=scd_id,
- results=latest_result,
- )
- should_terminate = True
-
- else:
- rest_time = timeout - time.time() + session_state["start_time"]
- logger.info(
- f"Test command should continue running " f"for {rest_time} seconds"
- )
- result_queue.put(
- State(
- "END",
- time.time(),
- {"status": "kickoff", "last_logs": "Test is still running"},
- )
- )
-
- except Exception as e:
- logger.error(e, exc_info=True)
-
- logs = str(e)
- if scd_id is not None:
- try:
- logs = get_command_logs(
- session_controller, scd_id, test_config.get("log_lines", 50)
- )
- logs += f"\n{str(e)}"
- except Exception as e2:
- logger.error(e2, exc_info=True)
-
- result_queue.put(
- State("END", time.time(), {"status": "error", "last_logs": logs})
- )
- should_terminate = True
- finally:
- if should_terminate:
- logger.warning("Terminating session")
- _cleanup_session(sdk, session_id)
-
- if not check_progress:
- process = multiprocessing.Process(target=_run, args=(logger,))
- else:
- process = multiprocessing.Process(target=_check_progress, args=(logger,))
-
- build_timeout = test_config["run"].get("build_timeout", 1800)
- prepare_timeout = test_config["run"].get("prepare_timeout", timeout)
-
- project_url = anyscale_project_url(project_id=GLOBAL_CONFIG["ANYSCALE_PROJECT"])
- logger.info(f"Link to project: {_format_link(project_url)}")
-
- msg = f"This will now run test {test_name}."
- if smoke_test:
- msg += " This is a smoke test."
- if is_long_running:
- msg += " This is a long running test."
- logger.info(msg)
-
- logger.info(
- f"Starting process with timeout {timeout} "
- f"(prepare timeout {prepare_timeout}, "
- f"build timeout {build_timeout})"
- )
- process.start()
-
- # The timeout time will be updated after the build finished
- # Build = App config + compute template build and session start
- timeout_time = time.time() + build_timeout
-
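-    # Control loop: poll the result queue for State updates from the child
-    # process and extend the timeout as the prepare and run phases start.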
- result = {}
- while process.is_alive():
- try:
- state: State = result_queue.get(timeout=1)
- except (Empty, TimeoutError):
- if time.time() > timeout_time:
- stop_event.set()
- logger.warning("Process timed out.")
-
- if not is_long_running:
- logger.warning("Terminating process in 10 seconds.")
- time.sleep(10)
- logger.warning("Terminating process now.")
- process.terminate()
- else:
- logger.info(
-                        "Process is long running. Giving it 2 minutes to "
-                        "fetch results before terminating."
- )
- start_terminate = time.time()
- while time.time() < start_terminate + 120 and process.is_alive():
- time.sleep(1)
- if process.is_alive():
- logger.warning("Terminating forcefully now.")
- process.terminate()
- else:
- logger.info("Long running results collected.")
- break
- continue
-
- if not isinstance(state, State):
-            raise RuntimeError(f"Expected `State` object, got {state}")
-
- if state.state == "CMD_PREPARE":
- # Reset timeout after build finished
- timeout_time = state.timestamp + prepare_timeout
-
-        elif state.state == "CMD_RUN":
- # Reset timeout after prepare command or build finished
- timeout_time = state.timestamp + timeout
-
- elif state.state == "END":
- result = state.data
- break
-
- while not result_queue.empty():
- state = result_queue.get_nowait()
- result = state.data
-
-    logger.info("Final check if everything worked.")
-    result.setdefault("status", "error (status not found)")
-
- logger.info(f"Final results: {result}")
-
- log_results_and_artifacts(result)
-
- if not keep_results_dir:
- logger.info(f"Removing results dir {temp_dir}")
- shutil.rmtree(temp_dir)
- else:
- # Write results.json
- with open(os.path.join(temp_dir, "results.json"), "wt") as fp:
- json.dump(result, fp)
-
- out_dir = os.path.expanduser(GLOBAL_CONFIG["RELEASE_RESULTS_DIR"])
-
- logger.info(
- f"Moving results dir {temp_dir} to persistent location " f"{out_dir}"
- )
-
- try:
- shutil.rmtree(out_dir)
- except Exception:
- logger.exception(
- f"Ran into error when clearing the destination dir: {out_dir}"
- )
-
- try:
- # Use distutils.dir_util.copy_tree() instead of shutil.cptree(),
- # which allows existing output directory.
- from distutils.dir_util import copy_tree
-
- copy_tree(temp_dir, out_dir)
- except Exception:
- logger.exception(
- "Ran into error when copying results dir to persistent "
- f"location: {out_dir}"
- )
-
- logger.info(f"Dir contents: {os.listdir(out_dir)}")
-
- return result
-
-
-def run_test(
- test_config_file: str,
- test_name: str,
- project_id: str,
- commit_url: str,
- category: str = "unspecified",
- smoke_test: bool = False,
- no_terminate: bool = False,
- kick_off_only: bool = False,
- check_progress: bool = False,
- report: bool = True,
- keep_results_dir: bool = False,
- session_name: Optional[str] = None,
- app_config_id_override=None,
-) -> Dict[str, Any]:
- with open(test_config_file, "rt") as f:
- test_configs = yaml.safe_load(f)
-
- test_config_dict = {}
- for test_config in test_configs:
- name = test_config.pop("name")
- test_config_dict[name] = test_config
-
- if test_name not in test_config_dict:
- raise ValueError(
- f"Test with name `{test_name}` not found in test config file "
- f"at `{test_config_file}`."
- )
-
- test_config = test_config_dict[test_name]
-
- if smoke_test and "smoke_test" in test_config:
- smoke_test_config = test_config.pop("smoke_test")
- test_config = _deep_update(test_config, smoke_test_config)
-
- local_dir = os.path.dirname(test_config_file)
- if "local_dir" in test_config:
- # local_dir is relative to test_config_file
- local_dir = os.path.join(local_dir, test_config["local_dir"])
-
- if test_config["run"].get("use_connect"):
- assert not kick_off_only, (
- "--kick-off-only is unsupported when running with " "Anyscale connect."
- )
- assert (
- not check_progress
- ), "--check is unsupported when running with Anyscale connect."
- if test_config.get("artifacts", {}):
- logger.error(
- "Saving artifacts are not yet supported when running with "
- "Anyscale connect."
- )
-
- # Perform necessary driver side setup.
- driver_setup_script = test_config.get("driver_setup", None)
- if driver_setup_script:
- run_bash_script(local_dir, driver_setup_script)
- logger.info(test_config)
- team = test_config.get("team", "unspecified").strip(" ").lower()
- # When running local test, this validates the team name.
- # If the team name is not specified, they will be recorded as "unspecified"
- if not report and team not in VALID_TEAMS:
- logger.warning(
-            f"Incorrect team name {team} has been given. "
-            "Please specify the team under the `team` field in the test config. "
- "For example, within nightly_tests.yaml,\n"
- "\tname: test_xxx\n"
- f"\tteam: {'|'.join(VALID_TEAMS)}\n"
- "\tcluster:..."
- )
-
- result = run_test_config(
- local_dir,
- project_id,
- test_name,
- test_config,
- commit_url,
- session_name=session_name,
- smoke_test=smoke_test,
- no_terminate=no_terminate,
- kick_off_only=kick_off_only,
- check_progress=check_progress,
- upload_artifacts=report,
- keep_results_dir=keep_results_dir,
- app_config_id_override=app_config_id_override,
- )
-
- status = result.get("status", "invalid")
-
- if kick_off_only:
- if status != "kickoff":
- raise RuntimeError("Error kicking off test.")
-
- logger.info(
- "Kicked off test. It's now up to the `--check` "
-            "part of the script to track its progress."
- )
- return {}
- else:
- # `--check` or no kick off only
-
- if status == "nosession":
- logger.info(
- f"No running session found for test {test_name}, so "
- f"assuming everything is fine."
- )
- return {}
-
- if status == "kickoff":
- logger.info(f"Test {test_name} is still running.")
- return {}
-
- last_logs = result.get("last_logs", "No logs.")
-
- test_suite = os.path.basename(test_config_file).replace(".yaml", "")
-
- report_kwargs = dict(
- test_suite=test_suite,
- test_name=test_name,
- status=status,
- last_logs=last_logs,
- results=result.get("results", {}),
- artifacts=result.get("artifacts", {}),
- category=category,
- team=team,
- )
-
- if not has_errored(result):
- # Check if result are met if test succeeded
- alert = maybe_get_alert_for_result(report_kwargs)
-
- if alert:
- # If we get an alert, the test failed.
- logger.error(
- f"Alert has been raised for "
- f"{test_suite}/{test_name} "
- f"({category}): {alert}"
- )
- result["status"] = "error (alert raised)"
- report_kwargs["status"] = "error (alert raised)"
-
- # For printing/reporting to the database
- report_kwargs["last_logs"] = alert
- last_logs = alert
- else:
- logger.info(
- f"No alert raised for test "
- f"{test_suite}/{test_name} "
- f"({category}) - the test successfully passed!"
- )
-
- if report:
- try:
- report_result(**report_kwargs)
- except Exception as e:
- # On database error the test should still pass
- # Todo: flag somewhere else?
- logger.exception(f"Error persisting results to database: {e}")
- else:
- logger.info(
- f"Usually I would now report the following results:\n"
- f"{report_kwargs}"
- )
-
- if has_errored(result):
- # If the script terminates due to an uncaught error, it
- # will return exit code 1, so we use 2 per default to
- # catch these cases.
- exit_code = result.get("exit_code", ExitCode.UNSPECIFIED.value)
- logger.error(last_logs)
- logger.info(f"Exiting with exit code {exit_code}")
- sys.exit(exit_code)
-
- return report_kwargs
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
- )
- parser.add_argument(
- "--test-config", type=str, required=True, help="Test config file"
- )
- parser.add_argument("--test-name", type=str, help="Test name in config")
- parser.add_argument(
- "--ray-wheels", required=False, type=str, help="URL to ray wheels"
- )
- parser.add_argument(
- "--no-terminate",
- action="store_true",
- default=False,
- help="Don't terminate session after failure",
- )
- parser.add_argument(
- "--report",
- action="store_true",
- default=False,
- help="Whether to report results and upload to S3",
- )
- parser.add_argument(
- "--kick-off-only",
- action="store_true",
- default=False,
- help="Kick off only (don't wait for command to finish)",
- )
- parser.add_argument(
- "--check",
- action="store_true",
- default=False,
- help="Check (long running) status",
- )
- parser.add_argument(
- "--keep-results-dir",
- action="store_true",
- default=False,
- help="Keep results in directory (named RELEASE_RESULTS_DIR), e.g. "
- "for Buildkite artifact upload.",
- )
- parser.add_argument(
- "--category",
- type=str,
- default="unspecified",
- help="Category name, e.g. `release-1.3.0` (will be saved in database)",
- )
- parser.add_argument(
- "--smoke-test", action="store_true", help="Finish quickly for testing"
- )
- parser.add_argument(
- "--session-name",
- required=False,
- type=str,
- help="Name of the session to run this test.",
- )
- parser.add_argument(
- "--app-config-id-override",
- required=False,
- type=str,
- help=("An app config ID, which will override the test config app " "config."),
- )
- args, _ = parser.parse_known_args()
-
- if not GLOBAL_CONFIG["ANYSCALE_PROJECT"]:
- raise RuntimeError("You have to set the ANYSCALE_PROJECT environment variable!")
-
- ray_wheels = args.ray_wheels or os.environ.get("RAY_WHEELS", "")
-
- maybe_fetch_api_token()
- if ray_wheels:
- logger.info(f"Using Ray wheels provided from URL/commit: " f"{ray_wheels}")
- url = commit_or_url(str(ray_wheels))
- logger.info(f"Resolved url link is: {url}")
- # Overwrite with actual URL
- os.environ["RAY_WHEELS"] = url
- elif not args.check:
- url = find_ray_wheels(
- GLOBAL_CONFIG["RAY_REPO"],
- GLOBAL_CONFIG["RAY_BRANCH"],
- GLOBAL_CONFIG["RAY_VERSION"],
- )
- if not url:
- raise RuntimeError(
- f"Could not find wheels for "
- f"Ray {GLOBAL_CONFIG['RAY_VERSION']}, "
- f"branch {GLOBAL_CONFIG['RAY_BRANCH']}"
- )
-
- # RAY_COMMIT is set by commit_or_url and find_ray_wheels
- populate_wheels_sanity_check(os.environ.get("RAY_COMMIT", ""))
-
- test_config_file = os.path.abspath(os.path.expanduser(args.test_config))
-
- # The REPORT_RESULT global config variable overrides the CLI flag.
- report = GLOBAL_CONFIG["REPORT_RESULT"]
- if report.lower() == "1" or report.lower() == "true":
- report = True
- else:
- report = args.report
-
- run_test(
- test_config_file=test_config_file,
- test_name=args.test_name,
- project_id=GLOBAL_CONFIG["ANYSCALE_PROJECT"],
- commit_url=url,
- category=args.category,
- smoke_test=args.smoke_test,
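- # Never terminate the session when only kicking off, so a later --check run can track it.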
- no_terminate=args.no_terminate or args.kick_off_only,
- kick_off_only=args.kick_off_only,
- check_progress=args.check,
- report=report,
- session_name=args.session_name,
- keep_results_dir=args.keep_results_dir,
- app_config_id_override=args.app_config_id_override,
- )
diff --git a/release/horovod_tests/horovod_tests.yaml b/release/horovod_tests/horovod_tests.yaml
deleted file mode 100644
index ce0abe719..000000000
--- a/release/horovod_tests/horovod_tests.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-- name: horovod_test
- team: ml
- cluster:
- app_config: app_config_master.yaml
- compute_template: compute_tpl.yaml
-
- run:
- timeout: 36000
- prepare: python wait_cluster.py 3 600
- script: python workloads/horovod_tune_test.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 1800
diff --git a/release/horovod_tests/wait_cluster.py b/release/horovod_tests/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/horovod_tests/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
- "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
- "--feedback_interval_s",
- type=int,
- default=10,
- help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
- now = time.time()
-
- if now >= max_time:
- raise RuntimeError(
- f"Maximum wait time reached, but only "
- f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
- )
-
- if now >= next_feedback:
- passed = now - start
- print(
- f"Waiting for more nodes to come up: "
- f"{curr_nodes}/{args.num_nodes} "
- f"({passed:.0f} seconds passed)"
- )
- next_feedback = now + args.feedback_interval_s
-
- time.sleep(5)
- curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
- f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
- f"{passed:.0f} seconds"
-)
diff --git a/release/lightgbm_tests/lightgbm_tests.yaml b/release/lightgbm_tests/lightgbm_tests.yaml
deleted file mode 100644
index 07aa9e5cf..000000000
--- a/release/lightgbm_tests/lightgbm_tests.yaml
+++ /dev/null
@@ -1,92 +0,0 @@
-- name: train_small
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_small.yaml
-
- run:
- use_connect: True
- autosuspend_mins: 10
- timeout: 600
- prepare: python wait_cluster.py 4 600
- script: python workloads/train_small.py
-
-- name: train_moderate
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_moderate.yaml
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 32 600
- script: python workloads/train_moderate.py
-
-- name: train_gpu
- team: ml
- cluster:
- app_config: app_config_gpu.yaml
- compute_template: tpl_gpu_small.yaml
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 5 600
- script: python workloads/train_gpu.py
-
-- name: distributed_api_test
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_small.yaml
- results:
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 4 600
- script: python workloads/distributed_api_test.py
- results: ""
-
-- name: ft_small_non_elastic
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_small.yaml
-
- run:
- timeout: 900
- prepare: python wait_cluster.py 4 600
- script: python workloads/ft_small_non_elastic.py
- results: ""
-
-- name: tune_small
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_small.yaml
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 4 600
- script: python workloads/tune_small.py
-
-- name: tune_32x4
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_moderate.yaml
-
- run:
- timeout: 900
- prepare: python wait_cluster.py 32 600
- script: python workloads/tune_32x4.py
-
-- name: tune_4x32
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_moderate.yaml
-
- run:
- timeout: 900
- prepare: python wait_cluster.py 32 600
- script: python workloads/tune_4x32.py
diff --git a/release/lightgbm_tests/wait_cluster.py b/release/lightgbm_tests/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/lightgbm_tests/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
- "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
- "--feedback_interval_s",
- type=int,
- default=10,
- help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
- now = time.time()
-
- if now >= max_time:
- raise RuntimeError(
- f"Maximum wait time reached, but only "
- f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
- )
-
- if now >= next_feedback:
- passed = now - start
- print(
- f"Waiting for more nodes to come up: "
- f"{curr_nodes}/{args.num_nodes} "
- f"({passed:.0f} seconds passed)"
- )
- next_feedback = now + args.feedback_interval_s
-
- time.sleep(5)
- curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
- f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
- f"{passed:.0f} seconds"
-)
diff --git a/release/long_running_distributed_tests/long_running_distributed.yaml b/release/long_running_distributed_tests/long_running_distributed.yaml
deleted file mode 100644
index 189ffd3f9..000000000
--- a/release/long_running_distributed_tests/long_running_distributed.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-- name: pytorch_pbt_failure
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: compute_tpl.yaml
-
- run:
- timeout: 86400
- script: python workloads/pytorch_pbt_failure.py
- long_running: True
-
- smoke_test:
- timeout: 3600
diff --git a/release/long_running_tests/long_running_tests.yaml b/release/long_running_tests/long_running_tests.yaml
deleted file mode 100644
index 05d4245d0..000000000
--- a/release/long_running_tests/long_running_tests.yaml
+++ /dev/null
@@ -1,196 +0,0 @@
-- name: actor_deaths
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_1.yaml
-
- run:
- timeout: 86400
- prepare: ray stop
- script: python workloads/actor_deaths.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 3600
-
-- name: apex
- team: ml
- cluster:
- app_config: ../rllib_tests/app_config.yaml
- compute_template: tpl_cpu_3.yaml
-
- run:
- timeout: 86400
- prepare: python wait_cluster.py 3 600
- script: python workloads/apex.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 3600
-
-
-- name: impala
- team: ml
- cluster:
- app_config: app_config_np.yaml
- compute_template: tpl_cpu_1_large.yaml
-
- run:
- timeout: 86400
- script: python workloads/impala.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 3600
-
-- name: many_actor_tasks
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_1.yaml
-
- run:
- timeout: 86400
- prepare: ray stop
- script: python workloads/many_actor_tasks.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 3600
-
-
-- name: many_drivers
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_1.yaml
-
- run:
- timeout: 86400
- prepare: ray stop
- script: python workloads/many_drivers.py --iteration-num=4000
- long_running: True
-
- smoke_test:
- run:
- timeout: 3600
-
-
-- name: many_ppo
- team: ml
- cluster:
- app_config: ../rllib_tests/app_config.yaml
- compute_template: many_ppo.yaml
-
- run:
- timeout: 86400
- prepare: python wait_cluster.py 1 600
- script: python workloads/many_ppo.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 3600
-
-- name: many_tasks
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_1.yaml
-
- run:
- timeout: 86400
- prepare: ray stop
- script: python workloads/many_tasks.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 3600
-
-- name: many_tasks_serialized_ids
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_1.yaml
-
- run:
- timeout: 86400
- prepare: ray stop
- script: python workloads/many_tasks_serialized_ids.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 3600
-
-
-- name: node_failures
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_1.yaml
-
- run:
- timeout: 86400
- prepare: ray stop
- script: python workloads/node_failures.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 3600
-
-- name: pbt
- team: ml
- cluster:
- app_config: ../rllib_tests/app_config.yaml
- compute_template: tpl_cpu_1.yaml
-
- run:
- timeout: 86400
- prepare: ray stop
- script: python workloads/pbt.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 3600
-
-- name: serve
- team: serve
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_1.yaml
-
- run:
- timeout: 86400
- prepare: ray stop
- script: python workloads/serve.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 3600
-
-- name: serve_failure
- team: serve
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_cpu_1.yaml
-
- run:
- timeout: 86400
- prepare: ray stop
- script: python workloads/serve_failure.py
- long_running: True
-
- smoke_test:
- run:
- timeout: 600
-
- stable: False
diff --git a/release/long_running_tests/wait_cluster.py b/release/long_running_tests/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/long_running_tests/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
- "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
- "--feedback_interval_s",
- type=int,
- default=10,
- help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
- now = time.time()
-
- if now >= max_time:
- raise RuntimeError(
- f"Maximum wait time reached, but only "
- f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
- )
-
- if now >= next_feedback:
- passed = now - start
- print(
- f"Waiting for more nodes to come up: "
- f"{curr_nodes}/{args.num_nodes} "
- f"({passed:.0f} seconds passed)"
- )
- next_feedback = now + args.feedback_interval_s
-
- time.sleep(5)
- curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
- f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
- f"{passed:.0f} seconds"
-)
diff --git a/release/microbenchmark/microbenchmark.yaml b/release/microbenchmark/microbenchmark.yaml
deleted file mode 100644
index 7b1c6c336..000000000
--- a/release/microbenchmark/microbenchmark.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-# - name: microbenchmark
-# team: core
-# cluster:
-# app_config: app_config.yaml
-# compute_template: tpl_64.yaml
-
-# run:
-# timeout: 1800
-# script: OMP_NUM_THREADS=64 RAY_ADDRESS= python run_microbenchmark.py
diff --git a/release/ml_user_tests/ml_user_tests.yaml b/release/ml_user_tests/ml_user_tests.yaml
deleted file mode 100644
index 8c6a8162e..000000000
--- a/release/ml_user_tests/ml_user_tests.yaml
+++ /dev/null
@@ -1,124 +0,0 @@
-- name: horovod_user_test_latest
- team: ml
- cluster:
- app_config: horovod/app_config.yaml
- compute_template: horovod/compute_tpl.yaml
-
-
- driver_setup: horovod/driver_setup_latest.sh
-
- run:
- use_connect: True
- autosuspend_mins: 10
- timeout: 1200
- script: python horovod/horovod_user_test.py
-
-- name: horovod_user_test_master
- team: ml
- cluster:
- app_config: ../horovod_tests/app_config_master.yaml
- compute_template: horovod/compute_tpl.yaml
-
- driver_setup: horovod/driver_setup_master.sh
-
- run:
- use_connect: True
- autosuspend_mins: 10
- timeout: 1200
- script: python horovod/horovod_user_test.py
-
-
-- name: train_tensorflow_mnist_test
- team: ml
- cluster:
- app_config: train/app_config.yaml
- compute_template: train/compute_tpl.yaml
-
- driver_setup: train/driver_setup.sh
-
- run:
- use_connect: True
- timeout: 36000
- script: python train/train_tensorflow_mnist_test.py
-
-- name: train_torch_linear_test
- team: ml
- cluster:
- app_config: train/app_config.yaml
- compute_template: train/compute_tpl.yaml
-
- driver_setup: train/driver_setup.sh
-
- run:
- use_connect: True
- timeout: 36000
- script: python train/train_torch_linear_test.py
-
-
-- name: xgboost_gpu_connect_latest
- team: ml
- cluster:
- app_config: xgboost/app_config_gpu.yaml
- compute_template: xgboost/tpl_gpu_small_scaling.yaml
-
- run:
- use_connect: True
- timeout: 1200
- script: python xgboost/train_gpu_connect.py
-
-- name: xgboost_gpu_connect_master
- team: ml
- cluster:
- app_config: xgboost/app_config_gpu_master.yaml
- compute_template: xgboost/tpl_gpu_small_scaling.yaml
-
- run:
- use_connect: True
- timeout: 1200
- script: python xgboost/train_gpu_connect.py
-
-- name: ray_lightning_user_test_latest
- team: ml
- cluster:
- app_config: ray-lightning/app_config.yaml
- compute_template: ray-lightning/compute_tpl.yaml
-
- driver_setup: ray-lightning/driver_setup.sh
-
- run:
- use_connect: True
- autosuspend_mins: 10
- timeout: 1200
- script: python ray-lightning/ray_lightning_user_test.py
-
-
-- name: ray_lightning_user_test_master
- team: ml
- cluster:
- app_config: ray-lightning/app_config_master.yaml
- compute_template: ray-lightning/compute_tpl.yaml
-
-
- driver_setup: ray-lightning/driver_setup.sh
-
- run:
- use_connect: True
- autosuspend_mins: 10
- timeout: 1200
- script: python ray-lightning/ray_lightning_user_test.py
-
-
-- name: tune_rllib_connect_test
- team: ml
- cluster:
- app_config: ../rllib_tests/app_config.yaml
- compute_template: tune_rllib/compute_tpl.yaml
-
-
- driver_setup: tune_rllib/driver_setup.sh
-
- run:
- use_connect: True
- autosuspend_mins: 10
- timeout: 1200
- script: python tune_rllib/run_connect_tests.py
\ No newline at end of file
diff --git a/release/nightly_tests/chaos_test.yaml b/release/nightly_tests/chaos_test.yaml
deleted file mode 100644
index f24cdcf16..000000000
--- a/release/nightly_tests/chaos_test.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-#
-# Chaos tests.
-#
-
-# Run the test that invokes many tasks without object store usage.
-- name: chaos_many_tasks_no_object_store
- team: core
- cluster:
- app_config: chaos_test/app_config.yaml
- compute_template: chaos_test/compute_template.yaml
-
- run:
- timeout: 3600
- prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
- script: python chaos_test/test_chaos_basic.py --workload=tasks
-
-- name: chaos_many_actors
- team: core
- cluster:
- app_config: chaos_test/app_config.yaml
- compute_template: chaos_test/compute_template.yaml
-
- run:
- timeout: 3600
- prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
- script: python chaos_test/test_chaos_basic.py --workload=actors
-
-- name: chaos_dask_on_ray_large_scale_test_no_spilling
- team: core
- cluster:
- app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
- compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
-
- run:
- timeout: 7200
- # Total run time without failures is about 300-400s.
- prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
- script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
-
-# Large-scale Dask-on-Ray test with spilling.
-- name: chaos_dask_on_ray_large_scale_test_spilling
- team: core
- cluster:
- app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
- compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
-
- run:
- timeout: 7200
- # Total run time without failures is about 300-400s.
- prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
- script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
-
-- name: chaos_pipelined_ingestion_1500_gb_15_windows
- team: core
- cluster:
- app_config: dataset/pipelined_ingestion_app.yaml
- compute_template: dataset/pipelined_ingestion_compute.yaml
-
- run:
- timeout: 7200
- prepare: python wait_cluster.py 21 2400; python setup_chaos.py --node-kill-interval 300
- script: python dataset/pipelined_training.py --epochs 1 --num-windows 15 --num-files 915 --debug
-
- stable: false
diff --git a/release/nightly_tests/dataset/dataset_test.yaml b/release/nightly_tests/dataset/dataset_test.yaml
deleted file mode 100644
index 8ac02a36a..000000000
--- a/release/nightly_tests/dataset/dataset_test.yaml
+++ /dev/null
@@ -1,95 +0,0 @@
-- name: inference
- team: core
- cluster:
- app_config: app_config.yaml
- compute_template: inference.yaml
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 2 600
- script: python inference.py
-
-- name: shuffle_data_loader
- team: core
- cluster:
- app_config: shuffle_app_config.yaml
- compute_template: shuffle_compute.yaml
-
- run:
- timeout: 1800
- script: python dataset_shuffle_data_loader.py
-
-- name: parquet_metadata_resolution
- team: core
- cluster:
- app_config: pipelined_training_app.yaml
- compute_template: pipelined_training_compute.yaml
-
- run:
- timeout: 1200
- prepare: python wait_cluster.py 15 1200
- script: python parquet_metadata_resolution.py --num-files 915
-
-- name: pipelined_training_50_gb
- team: core
- cluster:
- app_config: pipelined_training_app.yaml
- compute_template: pipelined_training_compute.yaml
-
- run:
- timeout: 4800
- prepare: python wait_cluster.py 15 1200
- script: python pipelined_training.py --epochs 1
-
-- name: pipelined_ingestion_1500_gb
- team: core
- cluster:
- app_config: pipelined_ingestion_app.yaml
- compute_template: pipelined_ingestion_compute.yaml
-
- run:
- timeout: 9600
- prepare: python wait_cluster.py 21 2400
- script: python pipelined_training.py --epochs 2 --num-windows 2 --num-files 915 --debug
-
-- name: datasets_ingest_train_infer
- team: core
- cluster:
- app_config: ray_sgd_training_app.yaml
- compute_template: ray_sgd_training_compute.yaml
-
- run:
- timeout: 14400
- prepare: python wait_cluster.py 66 2400
- script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset
-
- smoke_test:
- cluster:
- app_config: ray_sgd_training_app.yaml
- compute_template: ray_sgd_training_smoke_compute.yaml
-
- run:
- timeout: 3600
- prepare: python wait_cluster.py 8 2400
- script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
-
-- name: datasets_preprocess_ingest
- team: core
- cluster:
- app_config: ray_sgd_training_app.yaml
- compute_template: ray_sgd_training_compute_no_gpu.yaml
-
- run:
- timeout: 7200
- prepare: python wait_cluster.py 21 2400
- script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset --debug
-
-- name: datasets_ingest_400G
- team: core
- cluster:
- app_config: ray_sgd_training_app.yaml
- compute_template: dataset_ingest_400G_compute.yaml
-
- run:
- timeout: 7200
- script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1
diff --git a/release/nightly_tests/dataset/wait_cluster.py b/release/nightly_tests/dataset/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/nightly_tests/dataset/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
- "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
- "--feedback_interval_s",
- type=int,
- default=10,
- help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
- now = time.time()
-
- if now >= max_time:
- raise RuntimeError(
- f"Maximum wait time reached, but only "
- f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
- )
-
- if now >= next_feedback:
- passed = now - start
- print(
- f"Waiting for more nodes to come up: "
- f"{curr_nodes}/{args.num_nodes} "
- f"({passed:.0f} seconds passed)"
- )
- next_feedback = now + args.feedback_interval_s
-
- time.sleep(5)
- curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
- f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
- f"{passed:.0f} seconds"
-)
diff --git a/release/nightly_tests/nightly_tests.yaml b/release/nightly_tests/nightly_tests.yaml
deleted file mode 100644
index 2d0f90b94..000000000
--- a/release/nightly_tests/nightly_tests.yaml
+++ /dev/null
@@ -1,390 +0,0 @@
-#
-# Single node shuffle
-#
-# Test basic single node 10GB shuffle with a small number of partitions.
-# This doesn't require object spilling.
-# - name: shuffle_10gb
-# team: core
-# cluster:
-# app_config: shuffle/shuffle_app_config.yaml
-# compute_template: shuffle/shuffle_compute_single.yaml
-
-# run:
-# timeout: 3000
-# script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6
-
-# Test single node 50GB shuffle with a small number of partitions.
-- name: shuffle_50gb
- team: core
- cluster:
- app_config: shuffle/shuffle_app_config.yaml
- compute_template: shuffle/shuffle_compute_single.yaml
-
- run:
- timeout: 3000
- script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9
-
-# Test single node 50GB shuffle with a large number of partitions.
-- name: shuffle_50gb_large_partition
- team: core
- cluster:
- app_config: shuffle/shuffle_app_config.yaml
- compute_template: shuffle/shuffle_compute_single.yaml
-
- run:
- timeout: 3000
- script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6
-
-# Test non-streaming shuffle on a single node with a small number of partitions.
-- name: non_streaming_shuffle_50gb
- team: core
- cluster:
- app_config: shuffle/shuffle_app_config.yaml
- compute_template: shuffle/shuffle_compute_single.yaml
-
- run:
- timeout: 3000
- script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming
-
-# Test non-streaming shuffle on a single node with a large number of partitions.
-- name: non_streaming_shuffle_50gb_large_partition
- team: core
- cluster:
- app_config: shuffle/shuffle_app_config.yaml
- compute_template: shuffle/shuffle_compute_single.yaml
-
- run:
- timeout: 3000
- script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming
-
-- name: dask_on_ray_10gb_sort
- team: core
- cluster:
- app_config: dask_on_ray/dask_on_ray_app_config.yaml
- compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
-
- run:
- timeout: 7200
- script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
-
-- name: dask_on_ray_100gb_sort
- team: core
- cluster:
- app_config: dask_on_ray/dask_on_ray_app_config.yaml
- compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
-
- run:
- timeout: 7200
- script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
-
-#
-# Multi node shuffle
-#
-
-# Test multi-node 100GB shuffle with a small number of partitions.
-- name: shuffle_100gb
- team: core
- cluster:
- app_config: shuffle/shuffle_app_config.yaml
- compute_template: shuffle/shuffle_compute_multi.yaml
-
- run:
- timeout: 3000
- prepare: python wait_cluster.py 4 600
- script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
-
-# Test non-streaming multi-node 100GB shuffle with a small number of partitions.
-- name: non_streaming_shuffle_100gb
- team: core
- cluster:
- app_config: shuffle/shuffle_app_config.yaml
- compute_template: shuffle/shuffle_compute_multi.yaml
-
- run:
- timeout: 3000
- prepare: python wait_cluster.py 4 600
- script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
-
-# Test autoscaling 1TB streaming shuffle with a large number of partitions.
-- name: autoscaling_shuffle_1tb_1000_partitions
- team: core
- cluster:
- app_config: shuffle/shuffle_app_config.yaml
- compute_template: shuffle/shuffle_compute_autoscaling.yaml
-
- run:
- timeout: 4000
- script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
-
-# Test multi-node 1TB streaming shuffle with a large number of partitions.
-- name: shuffle_1tb_1000_partition
- team: core
- cluster:
- app_config: shuffle/shuffle_app_config.yaml
- compute_template: shuffle/shuffle_compute_large_scale.yaml
-
- run:
- timeout: 3000
- prepare: python wait_cluster.py 20 900
- script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
-
-# Test multi-node 1TB non-streaming shuffle with a large number of partitions.
-- name: non_streaming_shuffle_1tb_1000_partition
- team: core
- cluster:
- app_config: shuffle/shuffle_app_config.yaml
- compute_template: shuffle/shuffle_compute_large_scale.yaml
-
- run:
- timeout: 3000
- prepare: python wait_cluster.py 20 900
- script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
-
-# Stress test for 1TB multi-node streaming shuffle.
-- name: shuffle_1tb_5000_partitions
- team: core
- cluster:
- app_config: shuffle/shuffle_app_config.yaml
- compute_template: shuffle/shuffle_compute_large_scale.yaml
-
- run:
- timeout: 9000
- prepare: python wait_cluster.py 20 900
- script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
-
-# Stress test for 1TB multi-node non-streaming shuffle.
-# - name: non_streaming_shuffle_1tb_5000_partitions
-# team: core
-# stable: False
-# cluster:
-# app_config: shuffle/shuffle_app_config.yaml
-# compute_template: shuffle/shuffle_compute_large_scale.yaml
-
-# run:
-# timeout: 7200
-# prepare: python wait_cluster.py 20 900
-# script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming
-
-- name: k8s_dask_on_ray_large_scale_test_no_spilling
- team: core
- cluster:
- app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
- compute_template: dask_on_ray/dask_on_ray_stress_compute_k8s.yaml
- compute_on_k8s: True
-
- run:
- timeout: 7200
- prepare: python wait_cluster.py 21 600
- script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
- stable: false
-
-# # Test large scale dask on ray test without spilling.
-# - name: dask_on_ray_large_scale_test_no_spilling
-# team: core
-# cluster:
-# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
-# compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
-
-# run:
-# timeout: 7200
-# prepare: python wait_cluster.py 21 600
-# script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
-
-# smoke_test:
-# cluster:
-# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
-# compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
-
-# run:
-# timeout: 7200
-# prepare: python wait_cluster.py 5 600
-# script: python dask_on_ray/large_scale_test.py --num_workers 4 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
-
-# Large-scale Dask-on-Ray test with spilling.
-- name: dask_on_ray_large_scale_test_spilling
- team: core
- cluster:
- app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
- compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
-
- run:
- timeout: 7200
- prepare: python wait_cluster.py 21 600
- script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
-
- smoke_test:
- cluster:
- app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
- compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
-
- run:
- timeout: 7200
- prepare: python wait_cluster.py 5 600
- script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
-
-# Stress tests with many tasks
-- name: stress_test_many_tasks
- team: core
- cluster:
- app_config: stress_tests/stress_tests_app_config.yaml
- compute_template: stress_tests/stress_tests_compute.yaml
-
- run:
- timeout: 7200
- script: python stress_tests/test_many_tasks.py
-
- smoke_test:
- cluster:
- app_config: stress_tests/stress_tests_app_config.yaml
- compute_template: stress_tests/smoke_test_compute.yaml
-
- run:
- timeout: 3600
- script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
-
-# Stress tests with dead actors
-- name: stress_test_dead_actors
- team: core
- cluster:
- app_config: stress_tests/stress_tests_app_config.yaml
- compute_template: stress_tests/stress_tests_compute.yaml
-
- run:
- timeout: 7200
- script: python stress_tests/test_dead_actors.py
-
- smoke_test:
- cluster:
- app_config: stress_tests/stress_tests_app_config.yaml
- compute_template: stress_tests/smoke_test_compute.yaml
-
- run:
- timeout: 3600
- script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3 --num-children=3
-
-# Stress tests with placement groups
-- name: stress_test_placement_group
- team: core
- cluster:
- app_config: stress_tests/stress_tests_app_config.yaml
- compute_template: stress_tests/placement_group_tests_compute.yaml
-
- run:
- timeout: 7200
- script: python stress_tests/test_placement_group.py
-
-# Stress tests with many threaded actors.
-- name: threaded_actors_stress_test
- team: core
- cluster:
- app_config: stress_tests/stress_tests_app_config.yaml
- compute_template: stress_tests/stress_test_threaded_actor_compute.yaml
-
- run:
- timeout: 7200
- prepare: python wait_cluster.py 201 600
- script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
-
- smoke_test:
- cluster:
- app_config: stress_tests/stress_tests_app_config.yaml
- compute_template: stress_tests/smoke_test_compute.yaml
-
- run:
- timeout: 3600
- prepare: python wait_cluster.py 5 600
- script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
- stable: false
-
-- name: k8s_threaded_actors_stress_test
- team: core
- cluster:
- app_config: stress_tests/stress_tests_app_config.yaml
- compute_template: stress_tests/k8s_stress_test_threaded_actor_compute.yaml
- compute_on_k8s: True
-
- run:
- timeout: 7200
- prepare: python wait_cluster.py 201 600
- script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
-
- run:
- timeout: 3600
- prepare: python wait_cluster.py 5 600
- script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
- stable: false
-
-# Test decision tree on autoscaling compute cluster.
-- name: decision_tree_autoscaling
- team: core
- cluster:
- app_config: decision_tree/decision_tree_app_config.yaml
- compute_template: decision_tree/autoscaling_compute.yaml
-
- run:
- timeout: 3000
- script: python decision_tree/cart_with_tree.py
-
-# Test 20 concurrent decision tree runs on autoscaling compute cluster.
-- name: decision_tree_autoscaling_20_runs
- team: core
- cluster:
- app_config: decision_tree/decision_tree_app_config.yaml
- compute_template: decision_tree/autoscaling_compute.yaml
- run:
- timeout: 9600
- script: python decision_tree/cart_with_tree.py --concurrency=20
-
-- name: dask_on_ray_1tb_sort
- team: core
- cluster:
- app_config: dask_on_ray/dask_on_ray_app_config.yaml
- compute_template: dask_on_ray/1tb_sort_compute.yaml
-
- run:
- timeout: 7200
- prepare: python wait_cluster.py 32 1000
- script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
-
-- name: many_nodes_actor_test
- team: core
- cluster:
- app_config: many_nodes_tests/app_config.yaml
- compute_template: many_nodes_tests/compute_config.yaml
-
- run:
- timeout: 7200
- prepare: python wait_cluster.py 251 5400
- script: python many_nodes_tests/actor_test.py
-
-- name: pg_autoscaling_regression_test
- team: core
- cluster:
- app_config: placement_group_tests/app_config.yaml
- compute_template: placement_group_tests/compute.yaml
-
- run:
- timeout: 1200
- script: python placement_group_tests/pg_run.py
-
-- name: pg_long_running_performance_test
- team: core
- cluster:
- app_config: placement_group_tests/app_config.yaml
- compute_template: placement_group_tests/long_running_test_compute.yaml
-
- run:
- timeout: 3600
- prepare: python wait_cluster.py 2 600
- script: python placement_group_tests/long_running_performance_test.py --num-stages 2000
-
-- name: placement_group_performance_test
- team: core
- cluster:
- app_config: placement_group_tests/app_config.yaml
- compute_template: placement_group_tests/pg_perf_test_compute.yaml
-
- run:
- timeout: 1200
- prepare: python wait_cluster.py 5 600
- script: python placement_group_tests/placement_group_performance_test.py
diff --git a/release/nightly_tests/wait_cluster.py b/release/nightly_tests/wait_cluster.py
deleted file mode 100644
index f70088289..000000000
--- a/release/nightly_tests/wait_cluster.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
- "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
- "--feedback_interval_s",
- type=int,
- default=10,
- help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-
-while not curr_nodes >= args.num_nodes:
- now = time.time()
-
- if now >= max_time:
- raise RuntimeError(
- f"Maximum wait time reached, but only "
- f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
- )
-
- if now >= next_feedback:
- passed = now - start
- print(
- f"Waiting for more nodes to come up: "
- f"{curr_nodes}/{args.num_nodes} "
- f"({passed:.0f} seconds passed)"
- )
- next_feedback = now + args.feedback_interval_s
-
- time.sleep(5)
- curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
- f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
- f"{passed:.0f} seconds"
-)
diff --git a/release/rllib_tests/rllib_tests.yaml b/release/rllib_tests/rllib_tests.yaml
deleted file mode 100644
index d0b15dc07..000000000
--- a/release/rllib_tests/rllib_tests.yaml
+++ /dev/null
@@ -1,103 +0,0 @@
-# Heavy learning tests (Atari and HalfCheetah) for major algos.
-- name: learning_tests
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: 8gpus_64cpus.yaml
-
- run:
- timeout: 14400
- script: python learning_tests/run.py
-
- smoke_test:
- run:
- timeout: 1200
-
-# 2-GPU learning tests (CartPole and RepeatAfterMeEnv) for major algos.
-- name: multi_gpu_learning_tests
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: 8gpus_96cpus.yaml
-
- run:
- timeout: 7200
- script: python multi_gpu_learning_tests/run.py
-
-# 2-GPU learning tests (StatelessCartPole) + use_lstm=True for major algos
-# (that support RNN models).
-- name: multi_gpu_with_lstm_learning_tests
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: 8gpus_96cpus.yaml
-
- run:
- timeout: 7200
- script: python multi_gpu_with_lstm_learning_tests/run.py
-
-# 2-GPU learning tests (StatelessCartPole) + use_attention=True for major
-# algos (that support RNN models).
-- name: multi_gpu_with_attention_learning_tests
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: 8gpus_96cpus.yaml
-
- run:
- timeout: 7200
- script: python multi_gpu_with_attention_learning_tests/run.py
-
-# We'll have these as per-PR tests soon.
-# - name: example_scripts_on_gpu_tests
-# team: ml
-# cluster:
-# app_config: app_config.yaml
-# compute_template: 1gpu_4cpus.yaml
-
-# run:
-# timeout: 7200
-# script: bash unit_gpu_tests/run.sh
-
-# IMPALA large machine stress tests (4x Atari).
-- name: stress_tests
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: 4gpus_544_cpus.yaml
-
- run:
- timeout: 5400
- prepare: python wait_cluster.py 6 600
- script: python stress_tests/run_stress_tests.py
-
- smoke_test:
- run:
- timeout: 2000
-
-# Tests that exercise auto-scaling and Anyscale connect.
-- name: connect_tests
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: auto_scale.yaml
-
- run:
- use_connect: True
- timeout: 3000
- script: python connect_tests/run_connect_tests.py
-
-# Nightly performance regression tests for popular algorithms.
-# These algorithms run nightly for a pre-determined amount of time without
-# explicit pass/fail criteria.
-# Performance metrics, such as reward achieved and throughput, are then
-# collected and tracked over time.
-- name: performance_tests
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: 12gpus_192cpus.yaml
-
- run:
- timeout: 10800
- script: python performance_tests/run.py
diff --git a/release/rllib_tests/wait_cluster.py b/release/rllib_tests/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/rllib_tests/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
- "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
- "--feedback_interval_s",
- type=int,
- default=10,
- help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
- now = time.time()
-
- if now >= max_time:
- raise RuntimeError(
- f"Maximum wait time reached, but only "
- f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
- )
-
- if now >= next_feedback:
- passed = now - start
- print(
- f"Waiting for more nodes to come up: "
- f"{curr_nodes}/{args.num_nodes} "
- f"({passed:.0f} seconds passed)"
- )
- next_feedback = now + args.feedback_interval_s
-
- time.sleep(5)
- curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
- f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
- f"{passed:.0f} seconds"
-)
diff --git a/release/run_e2e.sh b/release/run_e2e.sh
deleted file mode 100755
index 9f1ae16fc..000000000
--- a/release/run_e2e.sh
+++ /dev/null
@@ -1,176 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-cd "${0%/*}" || exit 1
-
-reason() {
- # Keep in sync with e2e.py ExitCode enum
- case $1 in
- 0)
- REASON="success"
- ;;
- 2)
- REASON="unspecified"
- ;;
- 3)
- REASON="unknown"
- ;;
- 4)
- REASON="runtime error"
- ;;
- 5)
- REASON="command error"
- ;;
- 6)
- REASON="command timeout"
- ;;
- 7)
- REASON="prepare timeout"
- ;;
- 8)
- REASON="filesync timeout"
- ;;
- 9)
- REASON="session timeout"
- ;;
- 10)
- REASON="prepare error"
- ;;
- 11)
- REASON="app config build error"
- ;;
- 12)
- REASON="infra error"
- ;;
- *)
- REASON="untracked error"
- ;;
- esac
- echo "${REASON}"
-}
-
-while [[ $# -gt 0 ]]
-do
-key="$1"
-case $key in
- --ray-repo)
- shift
- RAY_REPO=$1
- ;;
- --ray-branch)
- shift
- RAY_BRANCH=$1
- ;;
- --ray-version)
- shift
- RAY_VERSION=$1
- ;;
- --ray-wheels)
- shift
- RAY_WHEELS=$1
- ;;
- --ray-test-repo)
- shift
- RAY_TEST_REPO=$1
- ;;
- --ray-test-branch)
- shift
- RAY_TEST_BRANCH=$1
- ;;
- --release-results-dir)
- shift
- RELEASE_RESULTS_DIR=$1
- ;;
- *)
- break
-esac
-shift
-done
-
-RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
-RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
-RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}
-
-export RAY_REPO RAY_BRANCH RAY_VERSION RAY_WHEELS RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR
-
-pip uninstall -q -y ray
-pip install -q -r requirements.txt
-pip install -q -U boto3 botocore
-git clone -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" ~/ray
-
-RETRY_NUM=0
-MAX_RETRIES=${MAX_RETRIES-3}
-
-if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
- echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
- MAX_RETRIES=1
-fi
-
-ALL_EXIT_CODES=()
-while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
- RETRY_NUM=$((RETRY_NUM + 1))
-
- if [ "$RETRY_NUM" -gt 1 ]; then
- # Sleep for a random time between 30 and 120 minutes
- SLEEP_TIME=$((1800 + RANDOM % 5400))
- echo "----------------------------------------"
- echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
- echo "----------------------------------------"
- sleep ${SLEEP_TIME}
- fi
-
- sudo rm -rf "${RELEASE_RESULTS_DIR}"/* || true
-
- python e2e.py "$@"
- EXIT_CODE=$?
- REASON=$(reason "${EXIT_CODE}")
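- # Record this attempt's exit code for the summary printed below.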
- ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE
-
- case ${EXIT_CODE} in
- 0)
- echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
- break
- ;;
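- # Exit codes 7, 9 and 10 (prepare timeout, session timeout, prepare error) are infra-related and retried.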
- 7 | 9 | 10)
- echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
- ;;
- *)
- echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
- break
- ;;
- esac
-
-done
-
-sudo rm -rf /tmp/ray_release_test_artifacts/* || true
-sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true
-
-echo "----------------------------------------"
-echo "e2e test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
-echo "Run results:"
-
-COUNTER=1
-for EX in "${ALL_EXIT_CODES[@]}"; do
- REASON=$(reason "${EX}")
- echo " Run $COUNTER: Exit code = ${EX} (${REASON})"
- COUNTER=$((COUNTER + 1))
-done
-
-echo "----------------------------------------"
-
-REASON=$(reason "${EXIT_CODE}")
-echo "Final e2e exit code is ${EXIT_CODE} (${REASON})"
-
-case ${EXIT_CODE} in
- 0)
- ;;
- 7 | 9 | 10)
- echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."
- ;;
- *)
- echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
- ;;
-esac
-
-exit $EXIT_CODE
diff --git a/release/runtime_env_tests/runtime_env_tests.yaml b/release/runtime_env_tests/runtime_env_tests.yaml
deleted file mode 100644
index 7a55da490..000000000
--- a/release/runtime_env_tests/runtime_env_tests.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-- name: rte_many_tasks_actors
- team: serve
- cluster:
- app_config: app_config.yaml
- compute_template: rte_small.yaml
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 4 600
- script: python workloads/rte_many_tasks_actors.py
-
-- name: wheel_urls
- team: serve
- cluster:
- app_config: app_config.yaml
- compute_template: rte_minimal.yaml
-
- run:
- timeout: 9000 # 2h30m
- prepare: python wait_cluster.py 1 600
- script: python workloads/wheel_urls.py
-
-- name: rte_ray_client
- team: serve
- cluster:
- app_config: app_config.yaml
- compute_template: rte_minimal.yaml
-
- run:
- use_connect: True
- autosuspend_mins: 10
- timeout: 600
- prepare: python wait_cluster.py 1 600
- script: python workloads/rte_ray_client.py
diff --git a/release/runtime_env_tests/wait_cluster.py b/release/runtime_env_tests/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/runtime_env_tests/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
- "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
- "--feedback_interval_s",
- type=int,
- default=10,
- help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
- now = time.time()
-
- if now >= max_time:
- raise RuntimeError(
- f"Maximum wait time reached, but only "
- f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
- )
-
- if now >= next_feedback:
- passed = now - start
- print(
- f"Waiting for more nodes to come up: "
- f"{curr_nodes}/{args.num_nodes} "
- f"({passed:.0f} seconds passed)"
- )
- next_feedback = now + args.feedback_interval_s
-
- time.sleep(5)
- curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
- f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
- f"{passed:.0f} seconds"
-)
diff --git a/release/serve_tests/serve_tests.yaml b/release/serve_tests/serve_tests.yaml
deleted file mode 100644
index 87058d891..000000000
--- a/release/serve_tests/serve_tests.yaml
+++ /dev/null
@@ -1,101 +0,0 @@
-- name: single_deployment_1k_noop_replica
- team: serve
- cluster:
- app_config: app_config.yaml
- compute_template: compute_tpl_32_cpu.yaml
-
- run:
- timeout: 7200
- long_running: False
- script: python workloads/single_deployment_1k_noop_replica.py
-
- smoke_test:
- timeout: 600
-
-- name: multi_deployment_1k_noop_replica
- team: serve
- cluster:
- app_config: app_config.yaml
- compute_template: compute_tpl_32_cpu.yaml
-
- run:
- timeout: 7200
- long_running: False
- script: python workloads/multi_deployment_1k_noop_replica.py
-
- smoke_test:
- timeout: 600
-
-- name: autoscaling_single_deployment
- team: serve
- cluster:
- app_config: app_config.yaml
- compute_template: compute_tpl_8_cpu_autoscaling.yaml
-
- run:
- timeout: 7200
- long_running: False
- script: python workloads/autoscaling_single_deployment.py
-
- smoke_test:
- timeout: 600
-
-- name: autoscaling_multi_deployment
- team: serve
- cluster:
- app_config: app_config.yaml
- compute_template: compute_tpl_8_cpu_autoscaling.yaml
-
- run:
- timeout: 7200
- long_running: False
- script: python workloads/autoscaling_multi_deployment.py
-
- smoke_test:
- timeout: 600
-
-- name: serve_micro_benchmark
- team: serve
- cluster:
- app_config: app_config.yaml
- # 16 CPUS
- compute_template: compute_tpl_single_node.yaml
-
- run:
- timeout: 7200
- long_running: False
- script: python workloads/serve_micro_benchmark.py
-
- smoke_test:
- timeout: 600
-
-- name: serve_micro_benchmark_k8s
- team: serve
- cluster:
- app_config: app_config.yaml
- # 16 CPUS
- compute_template: compute_tpl_single_node_k8s.yaml
- compute_on_k8s: True
-
- run:
- timeout: 7200
- long_running: False
- script: python workloads/serve_micro_benchmark.py
-
- smoke_test:
- timeout: 600
-
-- name: serve_cluster_fault_tolerance
- team: serve
- cluster:
- app_config: app_config.yaml
- # 16 CPUS
- compute_template: compute_tpl_single_node.yaml
-
- run:
- timeout: 7200
- long_running: False
- script: python workloads/serve_cluster_fault_tolerance.py
-
- smoke_test:
- timeout: 600
diff --git a/release/sgd_tests/sgd_tests.yaml b/release/sgd_tests/sgd_tests.yaml
deleted file mode 100644
index cb0d4d5c3..000000000
--- a/release/sgd_tests/sgd_tests.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-# Test multi-node, multi-GPU Ray SGD example.
-- name: sgd_gpu
- team: ml
- cluster:
- app_config: sgd_gpu/sgd_gpu_app_config.yaml
- compute_template: sgd_gpu/sgd_gpu_compute.yaml
-
- run:
- timeout: 3000
- prepare: python wait_cluster.py 2 600
- script: python sgd_gpu/sgd_gpu_test.py --num-workers=2 --use-gpu --address=auto
\ No newline at end of file
diff --git a/release/sgd_tests/wait_cluster.py b/release/sgd_tests/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/sgd_tests/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
- "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
- "--feedback_interval_s",
- type=int,
- default=10,
- help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
- now = time.time()
-
- if now >= max_time:
- raise RuntimeError(
- f"Maximum wait time reached, but only "
- f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
- )
-
- if now >= next_feedback:
- passed = now - start
- print(
- f"Waiting for more nodes to come up: "
- f"{curr_nodes}/{args.num_nodes} "
- f"({passed:.0f} seconds passed)"
- )
- next_feedback = now + args.feedback_interval_s
-
- time.sleep(5)
- curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
- f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
- f"{passed:.0f} seconds"
-)
diff --git a/release/test_owners.yaml b/release/test_owners.yaml
deleted file mode 100644
index b898529a8..000000000
--- a/release/test_owners.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-# Specify the test owners (teams) here.
-# The root key should be the name of the test yaml file without the .yaml.
-# To specify owners of subtests, use a sub dict (see e.g. long_running_tests).
-golden_notebook_tests: ml
-horovod_tests: ml
-lightgbm_tests: ml
-long_running_distributed_tests: ml
-long_running_tests:
- actor_deaths: core
- apex: ml
- impala: ml
- many_actor_tasks: core
- many_drivers: core
- many_ppo: core
- many_tasks: core
- many_tasks_serialized_ids: core
- node_failures: core
- pbt: ml
- serve: serve
- serve_failure: serve
-microbenchmark: core
-nightly_tests: core
-rllib_tests: ml
-runtime_env_tests: serve
-serve_tests: serve
-sgd_tests: ml
-xgboost_tests: ml
diff --git a/release/tune_tests/cloud_tests/tune_cloud_tests.yaml b/release/tune_tests/cloud_tests/tune_cloud_tests.yaml
deleted file mode 100644
index 72279931e..000000000
--- a/release/tune_tests/cloud_tests/tune_cloud_tests.yaml
+++ /dev/null
@@ -1,118 +0,0 @@
-- name: aws_no_sync_down
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_aws_4x2.yaml
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 4 600
- script: python workloads/run_cloud_test.py no_sync_down
-
-- name: aws_ssh_sync
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_aws_4x2.yaml
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 4 600
- script: python workloads/run_cloud_test.py ssh_sync
-
-- name: aws_durable_upload
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_aws_4x2.yaml
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 4 600
- script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload
-
-- name: aws_durable_upload_rllib_str
- team: ml
- cluster:
- app_config: app_config_ml.yaml
- compute_template: tpl_aws_4x2.yaml
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 4 600
- script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str
-
-- name: aws_durable_upload_rllib_trainer
- team: ml
- cluster:
- app_config: app_config_ml.yaml
- compute_template: tpl_aws_4x2.yaml
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 4 600
- script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer
-
-- name: aws_no_durable_upload
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_aws_4x2.yaml
-
- run:
- timeout: 600
- prepare: python wait_cluster.py 4 600
- script: python workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload
-
-- name: gcp_k8s_no_sync_down
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_gcp_k8s_4x8.yaml
- cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
-
- run:
- use_connect: True
- timeout: 600
- # Remove --cpus-per-trial 8 once n2-standard-2 is supported
- script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8
-
-- name: gcp_k8s_ssh_sync
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_gcp_k8s_4x8.yaml
- cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
-
- run:
- use_connect: True
- timeout: 600
- # Remove --cpus-per-trial 8 once n2-standard-2 is supported
- script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8
-
-- name: gcp_k8s_durable_upload
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_gcp_k8s_4x8.yaml
- cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
-
- run:
- use_connect: True
- timeout: 600
- # Remove --cpus-per-trial 8 once n2-standard-2 is supported
- script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
-
-
-- name: gcp_k8s_no_durable_upload
- team: ml
- cluster:
- app_config: app_config.yaml
- compute_template: tpl_gcp_k8s_4x8.yaml
- cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
-
- run:
- use_connect: True
- timeout: 600
- # Remove --cpus-per-trial 8 once n2-standard-2 is supported
- script: python workloads/run_cloud_test.py no_durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
diff --git a/release/tune_tests/cloud_tests/wait_cluster.py b/release/tune_tests/cloud_tests/wait_cluster.py
deleted file mode 100644
index f70088289..000000000
--- a/release/tune_tests/cloud_tests/wait_cluster.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
-    "--feedback_interval_s",
-    type=int,
-    default=10,
-    help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-
-while not curr_nodes >= args.num_nodes:
-    now = time.time()
-
-    if now >= max_time:
-        raise RuntimeError(
-            f"Maximum wait time reached, but only "
-            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
-        )
-
-    if now >= next_feedback:
-        passed = now - start
-        print(
-            f"Waiting for more nodes to come up: "
-            f"{curr_nodes}/{args.num_nodes} "
-            f"({passed:.0f} seconds passed)"
-        )
-        next_feedback = now + args.feedback_interval_s
-
-    time.sleep(5)
-    curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
-    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
-    f"{passed:.0f} seconds"
-)
diff --git a/release/tune_tests/scalability_tests/tune_tests.yaml b/release/tune_tests/scalability_tests/tune_tests.yaml
deleted file mode 100644
index ba8a5a230..000000000
--- a/release/tune_tests/scalability_tests/tune_tests.yaml
+++ /dev/null
@@ -1,90 +0,0 @@
-- name: bookkeeping_overhead
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_1x16.yaml
-
-  run:
-    timeout: 1200
-    script: python workloads/test_bookkeeping_overhead.py
-
-
-- name: durable_trainable
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_16x2.yaml
-
-  run:
-    timeout: 900
-    prepare: python wait_cluster.py 16 600
-    script: python workloads/test_durable_trainable.py --bucket data-test-ilr
-
-- name: long_running_large_checkpoints
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_1x32_hd.yaml
-
-  run:
-    timeout: 86400
-    script: python workloads/test_long_running_large_checkpoints.py
-    long_running: True
-
-  smoke_test:
-    run:
-      timeout: 3600
-
-
-- name: network_overhead
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_100x2.yaml
-
-  run:
-    timeout: 900
-    prepare_timeout: 1200
-    prepare: python wait_cluster.py 100 1200
-    script: python workloads/test_network_overhead.py
-
-  smoke_test:
-    cluster:
-      compute_template: tpl_20x2.yaml
-
-    run:
-      timeout: 400
-      prepare_timeout: 600
-      prepare: python wait_cluster.py 20 600
-
-- name: result_throughput_cluster
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_16x64.yaml
-
-  run:
-    timeout: 600
-    prepare: python wait_cluster.py 16 600
-    script: python workloads/test_result_throughput_cluster.py
-
-- name: result_throughput_single_node
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_1x96.yaml
-
-  run:
-    timeout: 600
-    script: python workloads/test_result_throughput_single_node.py
-
-- name: xgboost_sweep
-  team: ml
-  cluster:
-    app_config: app_config_data.yaml
-    compute_template: tpl_16x64.yaml
-
-  run:
-    timeout: 3600
-    prepare: python wait_cluster.py 16 600
-    script: python workloads/test_xgboost_sweep.py
diff --git a/release/tune_tests/scalability_tests/wait_cluster.py b/release/tune_tests/scalability_tests/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/tune_tests/scalability_tests/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
-    "--feedback_interval_s",
-    type=int,
-    default=10,
-    help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
-    now = time.time()
-
-    if now >= max_time:
-        raise RuntimeError(
-            f"Maximum wait time reached, but only "
-            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
-        )
-
-    if now >= next_feedback:
-        passed = now - start
-        print(
-            f"Waiting for more nodes to come up: "
-            f"{curr_nodes}/{args.num_nodes} "
-            f"({passed:.0f} seconds passed)"
-        )
-        next_feedback = now + args.feedback_interval_s
-
-    time.sleep(5)
-    curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
-    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
-    f"{passed:.0f} seconds"
-)
diff --git a/release/util/wait_cluster.py b/release/util/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/util/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
-    "--feedback_interval_s",
-    type=int,
-    default=10,
-    help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
-    now = time.time()
-
-    if now >= max_time:
-        raise RuntimeError(
-            f"Maximum wait time reached, but only "
-            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
-        )
-
-    if now >= next_feedback:
-        passed = now - start
-        print(
-            f"Waiting for more nodes to come up: "
-            f"{curr_nodes}/{args.num_nodes} "
-            f"({passed:.0f} seconds passed)"
-        )
-        next_feedback = now + args.feedback_interval_s
-
-    time.sleep(5)
-    curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
-    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
-    f"{passed:.0f} seconds"
-)
diff --git a/release/xgboost_tests/wait_cluster.py b/release/xgboost_tests/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/xgboost_tests/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
-    "--feedback_interval_s",
-    type=int,
-    default=10,
-    help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
-    now = time.time()
-
-    if now >= max_time:
-        raise RuntimeError(
-            f"Maximum wait time reached, but only "
-            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
-        )
-
-    if now >= next_feedback:
-        passed = now - start
-        print(
-            f"Waiting for more nodes to come up: "
-            f"{curr_nodes}/{args.num_nodes} "
-            f"({passed:.0f} seconds passed)"
-        )
-        next_feedback = now + args.feedback_interval_s
-
-    time.sleep(5)
-    curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
-    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
-    f"{passed:.0f} seconds"
-)
diff --git a/release/xgboost_tests/xgboost_tests.yaml b/release/xgboost_tests/xgboost_tests.yaml
deleted file mode 100644
index 264443308..000000000
--- a/release/xgboost_tests/xgboost_tests.yaml
+++ /dev/null
@@ -1,104 +0,0 @@
-- name: train_small
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_small.yaml
-
-  run:
-    use_connect: True
-    autosuspend_mins: 10
-    timeout: 600
-    prepare: python wait_cluster.py 4 600
-    script: python workloads/train_small.py
-
-- name: train_moderate
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_moderate.yaml
-
-  run:
-    timeout: 600
-    prepare: python wait_cluster.py 32 600
-    script: python workloads/train_moderate.py
-
-- name: train_gpu
-  team: ml
-  cluster:
-    app_config: app_config_gpu.yaml
-    compute_template: tpl_gpu_small.yaml
-
-  run:
-    timeout: 600
-    prepare: python wait_cluster.py 5 600
-    script: python workloads/train_gpu.py
-
-- name: distributed_api_test
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_small.yaml
-    results:
-
-  run:
-    timeout: 600
-    prepare: python wait_cluster.py 4 600
-    script: python workloads/distributed_api_test.py
-    results: ""
-
-- name: ft_small_elastic
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_small.yaml
-
-  run:
-    timeout: 900
-    prepare: python wait_cluster.py 4 600
-    script: python workloads/ft_small_elastic.py
-    results: ""
-
-- name: ft_small_non_elastic
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_small.yaml
-
-  run:
-    timeout: 900
-    prepare: python wait_cluster.py 4 600
-    script: python workloads/ft_small_non_elastic.py
-    results: ""
-
-- name: tune_small
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_small.yaml
-
-  run:
-    timeout: 600
-    prepare: python wait_cluster.py 4 600
-    script: python workloads/tune_small.py
-
-- name: tune_32x4
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_moderate.yaml
-
-  run:
-    timeout: 900
-    prepare: python wait_cluster.py 32 600
-    script: python workloads/tune_32x4.py
-
-- name: tune_4x32
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_moderate.yaml
-
-  run:
-    timeout: 900
-    prepare: python wait_cluster.py 32 600
-    script: python workloads/tune_4x32.py