diff --git a/benchmarks/benchmark_tests.yaml b/benchmarks/benchmark_tests.yaml deleted file mode 100644 index a89e3deb9..000000000 --- a/benchmarks/benchmark_tests.yaml +++ /dev/null @@ -1,145 +0,0 @@ -- name: single_node - team: core - cluster: - app_config: app_config.yaml - compute_template: single_node.yaml - - run: - timeout: 12000 - prepare: sleep 0 - script: python single_node/test_single_node.py - -- name: object_store - team: core - cluster: - app_config: app_config.yaml - compute_template: object_store.yaml - - run: - timeout: 3600 - prepare: python distributed/wait_cluster.py --num-nodes=50 - script: python object_store/test_object_store.py - -- name: many_actors - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=65 - script: python distributed/test_many_actors.py - -- name: many_actors_smoke_test - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed_smoke_test.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=2 - script: SMOKE_TEST=1 python distributed/test_many_actors.py - -- name: many_tasks - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=65 - script: python distributed/test_many_tasks.py --num-tasks=10000 - -- name: many_tasks_smoke_test - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed_smoke_test.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=2 - script: python distributed/test_many_tasks.py --num-tasks=100 - -- name: many_pgs - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=65 - script: python distributed/test_many_pgs.py - -- name: many_pgs_smoke_test - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed_smoke_test.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=2 - script: SMOKE_TEST=1 python distributed/test_many_pgs.py - -# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test -- name: many_nodes - team: core - cluster: - app_config: app_config.yaml - compute_template: many_nodes.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=250 - script: python distributed/test_many_tasks.py --num-tasks=1000 - -- name: scheduling_test_many_0s_tasks_single_node - team: core - cluster: - app_config: app_config.yaml - compute_template: scheduling.yaml - - run: - timeout: 3600 - prepare: python distributed/wait_cluster.py --num-nodes=32 - script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1 - -- name: scheduling_test_many_0s_tasks_many_nodes - team: core - cluster: - app_config: app_config.yaml - compute_template: scheduling.yaml - - run: - timeout: 3600 - prepare: python distributed/wait_cluster.py --num-nodes=32 - script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1 - -- name: scheduling_test_many_5s_tasks_single_node - team: core - cluster: - app_config: app_config.yaml - compute_template: 
scheduling.yaml - - run: - timeout: 3600 - prepare: python distributed/wait_cluster.py --num-nodes=32 - script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1 - stable: false - -- name: scheduling_test_many_5s_tasks_many_nodes - team: core - cluster: - app_config: app_config.yaml - compute_template: scheduling.yaml - - run: - timeout: 3600 - prepare: python distributed/wait_cluster.py --num-nodes=32 - script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1 - stable: false diff --git a/benchmarks/distributed/wait_cluster.py b/benchmarks/distributed/wait_cluster.py deleted file mode 100644 index 12a8a1677..000000000 --- a/benchmarks/distributed/wait_cluster.py +++ /dev/null @@ -1,24 +0,0 @@ -import click -import ray -import time - - -def num_alive_nodes(): - n = 0 - for node in ray.nodes(): - if node["Alive"]: - n += 1 - return n - - -@click.command() -@click.option("--num-nodes", required=True, type=int, help="The target number of nodes") -def wait_cluster(num_nodes: int): - ray.init(address="auto") - while num_alive_nodes() != num_nodes: - print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}") - time.sleep(5) - - -if __name__ == "__main__": - wait_cluster() diff --git a/release/.buildkite/build_pipeline.py b/release/.buildkite/build_pipeline.py deleted file mode 100644 index 4c0e09099..000000000 --- a/release/.buildkite/build_pipeline.py +++ /dev/null @@ -1,680 +0,0 @@ -import copy -import logging -import os -import re -import sys - -import yaml - -# If you update or reorganize the periodic tests, please ensure the -# relevant portions of the Ray release instructions (go/release-ray) -# (in particular, running periodic tests and collecting release logs) -# are up to date. If you need access, please contact @zhe-thoughts. - -# Env variables: - -# RAY_REPO Repo to use for finding the wheel -# RAY_BRANCH Branch to find the wheel -# RAY_VERSION Version to find the wheel -# RAY_WHEELS Direct Ray wheel URL -# RAY_TEST_REPO Repo to use for test scripts -# RAY_TEST_BRANCH Branch for test scripts -# FILTER_FILE File filter -# FILTER_TEST Test name filter -# RELEASE_TEST_SUITE Release test suite (e.g. 
manual, nightly) - - -class ReleaseTest: - def __init__( - self, - name: str, - smoke_test: bool = False, - retry: int = 0, - ): - self.name = name - self.smoke_test = smoke_test - self.retry = retry - - def __str__(self): - return self.name - - def __repr__(self): - return self.name - - def __contains__(self, item): - return self.name.__contains__(item) - - def __iter__(self): - return iter(self.name) - - def __len__(self): - return len(self.name) - - -class SmokeTest(ReleaseTest): - def __init__(self, name: str, retry: int = 0): - super(SmokeTest, self).__init__(name=name, smoke_test=True, retry=retry) - - -CORE_NIGHTLY_TESTS = { - # "~/ray/release/nightly_tests/nightly_tests.yaml": [ - # "shuffle_10gb", - # "shuffle_50gb", - # "shuffle_50gb_large_partition", - # "shuffle_100gb", - # "non_streaming_shuffle_100gb", - # "non_streaming_shuffle_50gb_large_partition", - # "non_streaming_shuffle_50gb", - # SmokeTest("dask_on_ray_large_scale_test_no_spilling"), - # SmokeTest("dask_on_ray_large_scale_test_spilling"), - # "stress_test_placement_group", - # "shuffle_1tb_1000_partition", - # "non_streaming_shuffle_1tb_1000_partition", - # "shuffle_1tb_5000_partitions", - # TODO(sang): It doesn't even work without spilling - # as it hits the scalability limit. - # "non_streaming_shuffle_1tb_5000_partitions", - # "decision_tree_autoscaling", - # "decision_tree_autoscaling_20_runs", - # "autoscaling_shuffle_1tb_1000_partitions", - # SmokeTest("stress_test_many_tasks"), - # SmokeTest("stress_test_dead_actors"), - # SmokeTest("threaded_actors_stress_test"), - # "pg_long_running_performance_test", - # ], - # "~/ray/benchmarks/benchmark_tests.yaml": [ - # "single_node", - # "object_store", - # "many_actors_smoke_test", - # "many_tasks_smoke_test", - # "many_pgs_smoke_test", - # ], - # "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [ - # "inference", - # "shuffle_data_loader", - # "parquet_metadata_resolution", - # "pipelined_training_50_gb", - # "pipelined_ingestion_1500_gb", - # "datasets_preprocess_ingest", - # "datasets_ingest_400G", - # SmokeTest("datasets_ingest_train_infer"), - # ], - # "~/ray/release/nightly_tests/chaos_test.yaml": [ - # "chaos_many_actors", - # "chaos_many_tasks_no_object_store", - # "chaos_pipelined_ingestion_1500_gb_15_windows", - # ], - # "~/ray/release/microbenchmark/microbenchmark.yaml": [ - # "microbenchmark", - # ], -} - -SERVE_NIGHTLY_TESTS = { - # "~/ray/release/long_running_tests/long_running_tests.yaml": [ - # SmokeTest("serve"), - # SmokeTest("serve_failure"), - # ], - # "~/ray/release/serve_tests/serve_tests.yaml": [ - # "single_deployment_1k_noop_replica", - # "multi_deployment_1k_noop_replica", - # "autoscaling_single_deployment", - # "autoscaling_multi_deployment", - # "serve_micro_benchmark", - # # TODO(architkulkarni) Reenable after K8s migration. 
Currently failing - # # "serve_micro_benchmark_k8s", - # "serve_cluster_fault_tolerance", - # ], -} - -CORE_DAILY_TESTS = { - # "~/ray/release/nightly_tests/nightly_tests.yaml": [ - # "k8s_dask_on_ray_large_scale_test_no_spilling", - # "dask_on_ray_large_scale_test_no_spilling", - # "dask_on_ray_large_scale_test_spilling", - # "pg_autoscaling_regression_test", - # "threaded_actors_stress_test", - # "k8s_threaded_actors_stress_test", - # "stress_test_many_tasks", - # "stress_test_dead_actors", - # ], - # "~/ray/release/nightly_tests/chaos_test.yaml": [ - # "chaos_dask_on_ray_large_scale_test_no_spilling", - # "chaos_dask_on_ray_large_scale_test_spilling", - # ], -} - -CORE_SCALABILITY_TESTS_DAILY = { - # "~/ray/benchmarks/benchmark_tests.yaml": [ - # "many_actors", - # "many_tasks", - # "many_pgs", - # "many_nodes", - # ], -} - -CORE_SCHEDULING_DAILY = { - # "~/ray/benchmarks/benchmark_tests.yaml": [ - # "scheduling_test_many_0s_tasks_single_node", - # "scheduling_test_many_0s_tasks_many_nodes", - # # Reenable these two once we got right setup - # # "scheduling_test_many_5s_tasks_single_node", - # # "scheduling_test_many_5s_tasks_many_nodes", - # ], - # "~/ray/release/nightly_tests/nightly_tests.yaml": [ - # "many_nodes_actor_test", - # "dask_on_ray_10gb_sort", - # "dask_on_ray_100gb_sort", - # "dask_on_ray_1tb_sort", - # "placement_group_performance_test", - # ], -} - -NIGHTLY_TESTS = { - # "~/ray/release/horovod_tests/horovod_tests.yaml": [ - # SmokeTest("horovod_test"), - # ], # Should we enable this? - # "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [ - # "dask_xgboost_test", - # "modin_xgboost_test", - # "torch_tune_serve_test", - # ], - # "~/ray/release/long_running_tests/long_running_tests.yaml": [ - # SmokeTest("actor_deaths"), - # SmokeTest("apex"), - # SmokeTest("impala"), - # SmokeTest("many_actor_tasks"), - # SmokeTest("many_drivers"), - # SmokeTest("many_ppo"), - # SmokeTest("many_tasks"), - # SmokeTest("many_tasks_serialized_ids"), - # SmokeTest("node_failures"), - # SmokeTest("pbt"), - # # SmokeTest("serve"), - # # SmokeTest("serve_failure"), - # # Full long running tests (1 day runtime) - # "actor_deaths", - # "apex", - # "impala", - # "many_actor_tasks", - # "many_drivers", - # "many_ppo", - # "many_tasks", - # "many_tasks_serialized_ids", - # "node_failures", - # "pbt", - # "serve", - # "serve_failure", - # ], - # "~/ray/release/sgd_tests/sgd_tests.yaml": [ - # "sgd_gpu", - # ], - # "~/ray/release/tune_tests/cloud_tests/tune_cloud_tests.yaml": [ - # "aws_no_sync_down", - # "aws_ssh_sync", - # "aws_durable_upload", - # "aws_durable_upload_rllib_str", - # "aws_durable_upload_rllib_trainer", - # "gcp_k8s_durable_upload", - # ], - # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [ - # "bookkeeping_overhead", - # "durable_trainable", - # SmokeTest("long_running_large_checkpoints"), - # SmokeTest("network_overhead"), - # "result_throughput_cluster", - # "result_throughput_single_node", - # ], - # "~/ray/release/xgboost_tests/xgboost_tests.yaml": [ - # "train_small", - # "train_moderate", - # "train_gpu", - # "tune_small", - # "tune_4x32", - # "tune_32x4", - # "ft_small_elastic", - # "ft_small_non_elastic", - # "distributed_api_test", - # ], - # "~/ray/release/rllib_tests/rllib_tests.yaml": [ - # SmokeTest("learning_tests"), - # SmokeTest("stress_tests"), - # "performance_tests", - # "multi_gpu_learning_tests", - # "multi_gpu_with_lstm_learning_tests", - # "multi_gpu_with_attention_learning_tests", - # # We'll have these as per-PR tests soon. 
- # # "example_scripts_on_gpu_tests", - # ], - # "~/ray/release/runtime_env_tests/runtime_env_tests.yaml": [ - # "rte_many_tasks_actors", - # "wheel_urls", - # "rte_ray_client", - # ], -} - -WEEKLY_TESTS = { - # "~/ray/release/horovod_tests/horovod_tests.yaml": [ - # "horovod_test", - # ], - "~/ray/release/long_running_distributed_tests" - # "/long_running_distributed.yaml": [ - # "pytorch_pbt_failure", - # ], - # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [ - # "network_overhead", - # "long_running_large_checkpoints", - # "xgboost_sweep", - # ], - # "~/ray/release/rllib_tests/rllib_tests.yaml": [ - # "learning_tests", - # "stress_tests", - # ], -} - -# This test suite holds "user" tests to test important user workflows -# in a particular environment. -# All workloads in this test suite should: -# 1. Be run in a distributed (multi-node) fashion -# 2. Use autoscaling/scale up (no wait_cluster.py) -# 3. Use GPUs if applicable -# 4. Have the `use_connect` flag set. -USER_TESTS = { - # "~/ray/release/ml_user_tests/ml_user_tests.yaml": [ - # "train_tensorflow_mnist_test", - # "train_torch_linear_test", - # "ray_lightning_user_test_latest", - # "ray_lightning_user_test_master", - # "horovod_user_test_latest", - # "horovod_user_test_master", - # "xgboost_gpu_connect_latest", - # "xgboost_gpu_connect_master", - # "tune_rllib_connect_test", - # ] -} - -SUITES = { - "core-nightly": CORE_NIGHTLY_TESTS, - "serve-nightly": SERVE_NIGHTLY_TESTS, - "core-daily": CORE_DAILY_TESTS, - "core-scalability": CORE_SCALABILITY_TESTS_DAILY, - "nightly": {**NIGHTLY_TESTS, **USER_TESTS}, - "core-scheduling-daily": CORE_SCHEDULING_DAILY, - "weekly": WEEKLY_TESTS, -} - -DEFAULT_STEP_TEMPLATE = { - "env": { - "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc", - "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle", - "RELEASE_AWS_BUCKET": "ray-release-automation-results", - "RELEASE_AWS_LOCATION": "dev", - "RELEASE_AWS_DB_NAME": "ray_ci", - "RELEASE_AWS_DB_TABLE": "release_test_result", - "AWS_REGION": "us-west-2", - }, - "agents": {"queue": "runner_queue_branch"}, - "plugins": [ - { - "docker#v3.9.0": { - "image": "rayproject/ray", - "propagate-environment": True, - "volumes": [ - "/tmp/ray_release_test_artifacts:" "/tmp/ray_release_test_artifacts" - ], - } - } - ], - "artifact_paths": ["/tmp/ray_release_test_artifacts/**/*"], -} - - -def ask_configuration(): - RAY_BRANCH = os.environ.get("RAY_BRANCH", "master") - RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git") - RAY_VERSION = os.environ.get("RAY_VERSION", "") - RAY_WHEELS = os.environ.get("RAY_WHEELS", "") - - RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH) - RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO) - - RELEASE_TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly") - FILTER_FILE = os.environ.get("FILTER_FILE", "") - FILTER_TEST = os.environ.get("FILTER_TEST", "") - - input_ask_step = { - "input": "Input required: Please specify tests to run", - "fields": [ - { - "text": ( - "RAY_REPO: Please specify the Ray repository used " - "to find the wheel." - ), - "hint": ( - "Repository from which to fetch the latest " - "commits to find the Ray wheels. Usually you don't " - "need to change this." - ), - "default": RAY_REPO, - "key": "ray_repo", - }, - { - "text": ( - "RAY_BRANCH: Please specify the Ray branch used " - "to find the wheel." - ), - "hint": "For releases, this will be e.g. 
`releases/1.x.0`", - "default": RAY_BRANCH, - "key": "ray_branch", - }, - { - "text": ( - "RAY_VERSION: Please specify the Ray version used " - "to find the wheel." - ), - "hint": ( - "Leave empty for latest master. For releases, " - "specify the release version." - ), - "required": False, - "default": RAY_VERSION, - "key": "ray_version", - }, - { - "text": "RAY_WHEELS: Please specify the Ray wheel URL.", - "hint": ( - "ATTENTION: If you provide this, RAY_REPO, " - "RAY_BRANCH and RAY_VERSION will be ignored! " - "Please also make sure to provide the wheels URL " - "for Python 3.7 on Linux.\n" - "You can also insert a commit hash here instead " - "of a full URL.\n" - "NOTE: You can specify multiple commits or URLs " - "for easy bisection (one per line) - this will " - "run each test on each of the specified wheels." - ), - "required": False, - "default": RAY_WHEELS, - "key": "ray_wheels", - }, - { - "text": ( - "RAY_TEST_REPO: Please specify the Ray repository " - "used to find the tests you would like to run." - ), - "hint": ( - "If you're developing a new release test, this " - "will most likely be your GitHub fork." - ), - "default": RAY_TEST_REPO, - "key": "ray_test_repo", - }, - { - "text": ( - "RAY_TEST_BRANCH: Please specify the Ray branch used " - "to find the tests you would like to run." - ), - "hint": ( - "If you're developing a new release test, this " - "will most likely be a branch living on your " - "GitHub fork." - ), - "default": RAY_TEST_BRANCH, - "key": "ray_test_branch", - }, - { - "select": ( - "RELEASE_TEST_SUITE: Please specify the release " - "test suite containing the tests you would like " - "to run." - ), - "hint": ( - "Check in the `build_pipeline.py` if you're " - "unsure which suite contains your tests." - ), - "required": True, - "options": sorted(SUITES.keys()), - "default": RELEASE_TEST_SUITE, - "key": "release_test_suite", - }, - { - "text": ( - "FILTER_FILE: Please specify a filter for the " - "test files that should be included in this build." - ), - "hint": ( - "Only test files (e.g. xgboost_tests.yml) that " - "match this string will be included in the test" - ), - "default": FILTER_FILE, - "required": False, - "key": "filter_file", - }, - { - "text": ( - "FILTER_TEST: Please specify a filter for the " - "test names that should be included in this build." - ), - "hint": ( - "Only test names (e.g. 
tune_4x32) that match " - "this string will be included in the test" - ), - "default": FILTER_TEST, - "required": False, - "key": "filter_test", - }, - ], - "key": "input_ask_step", - } - - run_again_step = { - "commands": [ - f'export {v}=$(buildkite-agent meta-data get "{k}")' - for k, v in { - "ray_branch": "RAY_BRANCH", - "ray_repo": "RAY_REPO", - "ray_version": "RAY_VERSION", - "ray_wheels": "RAY_WHEELS", - "ray_test_branch": "RAY_TEST_BRANCH", - "ray_test_repo": "RAY_TEST_REPO", - "release_test_suite": "RELEASE_TEST_SUITE", - "filter_file": "FILTER_FILE", - "filter_test": "FILTER_TEST", - }.items() - ] - + [ - "export AUTOMATIC=1", - "python3 -m pip install --user pyyaml", - "rm -rf ~/ray || true", - "git clone -b $${RAY_TEST_BRANCH} $${RAY_TEST_REPO} ~/ray", - ( - "python3 ~/ray/release/.buildkite/build_pipeline.py " - "| buildkite-agent pipeline upload" - ), - ], - "label": ":pipeline: Again", - "agents": {"queue": "runner_queue_branch"}, - "depends_on": "input_ask_step", - "key": "run_again_step", - } - - return [ - input_ask_step, - run_again_step, - ] - - -def create_test_step( - ray_repo: str, - ray_branch: str, - ray_version: str, - ray_wheels: str, - ray_test_repo: str, - ray_test_branch: str, - test_file: str, - test_name: ReleaseTest, -): - custom_commit_str = "custom_wheels_url" - if ray_wheels: - # Extract commit from url - p = re.compile(r"([a-f0-9]{40})") - m = p.search(ray_wheels) - if m is not None: - custom_commit_str = m.group(1) - - ray_wheels_str = f" ({ray_wheels}) " if ray_wheels else "" - - logging.info(f"Creating step for {test_file}/{test_name}{ray_wheels_str}") - - cmd = ( - f"./release/run_e2e.sh " - f'--ray-repo "{ray_repo}" ' - f'--ray-branch "{ray_branch}" ' - f'--ray-version "{ray_version}" ' - f'--ray-wheels "{ray_wheels}" ' - f'--ray-test-repo "{ray_test_repo}" ' - f'--ray-test-branch "{ray_test_branch}" ' - ) - - args = ( - f"--category {ray_branch} " - f"--test-config {test_file} " - f"--test-name {test_name} " - f"--keep-results-dir" - ) - - if test_name.smoke_test: - logging.info("This test will run as a smoke test.") - args += " --smoke-test" - - step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE) - - if test_name.retry: - logging.info(f"This test will be retried up to " f"{test_name.retry} times.") - step_conf["retry"] = { - "automatic": [{"exit_status": "*", "limit": test_name.retry}] - } - else: - # Default retry logic - # Warning: Exit codes are currently not correctly propagated to - # buildkite! Thus, actual retry logic is currently implemented in - # the run_e2e.sh script! 
- step_conf["retry"] = { - "automatic": [ - {"exit_status": 7, "limit": 2}, # Prepare timeout - {"exit_status": 9, "limit": 2}, # Session timeout - {"exit_status": 10, "limit": 2}, # Prepare error - ], - } - - step_conf["command"] = cmd + args - - step_conf["label"] = ( - f"{test_name} " - f"({custom_commit_str if ray_wheels_str else ray_branch}) - " - f"{ray_test_branch}/{ray_test_repo}" - ) - return step_conf - - -def build_pipeline(steps): - all_steps = [] - - RAY_BRANCH = os.environ.get("RAY_BRANCH", "master") - RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git") - RAY_VERSION = os.environ.get("RAY_VERSION", "") - RAY_WHEELS = os.environ.get("RAY_WHEELS", "") - - RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH) - RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO) - - FILTER_FILE = os.environ.get("FILTER_FILE", "") - FILTER_TEST = os.environ.get("FILTER_TEST", "") - - ray_wheels_list = [""] - if RAY_WHEELS: - ray_wheels_list = RAY_WHEELS.split("\n") - - if len(ray_wheels_list) > 1: - logging.info( - f"This will run a bisec on the following URLs/commits: " - f"{ray_wheels_list}" - ) - - logging.info( - f"Building pipeline \n" - f"Ray repo/branch to test:\n" - f" RAY_REPO = {RAY_REPO}\n" - f" RAY_BRANCH = {RAY_BRANCH}\n\n" - f" RAY_VERSION = {RAY_VERSION}\n\n" - f" RAY_WHEELS = {RAY_WHEELS}\n\n" - f"Ray repo/branch containing the test configurations and scripts:" - f" RAY_TEST_REPO = {RAY_TEST_REPO}\n" - f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n" - f"Filtering for these tests:\n" - f" FILTER_FILE = {FILTER_FILE}\n" - f" FILTER_TEST = {FILTER_TEST}\n\n" - ) - - for test_file, test_names in steps.items(): - if FILTER_FILE and FILTER_FILE not in test_file: - continue - - test_base = os.path.basename(test_file) - for test_name in test_names: - if FILTER_TEST and FILTER_TEST not in test_name: - continue - - if not isinstance(test_name, ReleaseTest): - test_name = ReleaseTest(name=test_name) - - logging.info(f"Adding test: {test_base}/{test_name}") - - for ray_wheels in ray_wheels_list: - step_conf = create_test_step( - ray_repo=RAY_REPO, - ray_branch=RAY_BRANCH, - ray_version=RAY_VERSION, - ray_wheels=ray_wheels, - ray_test_repo=RAY_TEST_REPO, - ray_test_branch=RAY_TEST_BRANCH, - test_file=test_file, - test_name=test_name, - ) - - all_steps.append(step_conf) - - return all_steps - - -def alert_pipeline(stats: bool = False): - step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE) - - cmd = "python release/alert.py" - if stats: - cmd += " --stats" - - step_conf["commands"] = [ - "pip install -q -r release/requirements.txt", - "pip install -U boto3 botocore", - cmd, - ] - step_conf["label"] = f"Send periodic alert (stats_only = {stats})" - return [step_conf] - - -if __name__ == "__main__": - alert = os.environ.get("RELEASE_ALERT", "0") - - ask_for_config = not bool(int(os.environ.get("AUTOMATIC", "0"))) - - if alert in ["1", "stats"]: - steps = alert_pipeline(alert == "stats") - elif ask_for_config: - steps = ask_configuration() - else: - TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly") - PIPELINE_SPEC = SUITES[TEST_SUITE] - - steps = build_pipeline(PIPELINE_SPEC) - - yaml.dump({"steps": steps}, sys.stdout) diff --git a/release/alert.py b/release/alert.py deleted file mode 100644 index d0d1d433d..000000000 --- a/release/alert.py +++ /dev/null @@ -1,441 +0,0 @@ -import argparse -from collections import defaultdict, Counter -from typing import Any, List, Tuple, Mapping, Optional -import datetime -import hashlib -import json -import logging 
-import os -import requests -import sys - -import boto3 - -from e2e import GLOBAL_CONFIG - -from alerts.default import handle_result as default_handle_result -from alerts.rllib_tests import handle_result as rllib_tests_handle_result -from alerts.long_running_tests import handle_result as long_running_tests_handle_result -from alerts.tune_tests import handle_result as tune_tests_handle_result -from alerts.xgboost_tests import handle_result as xgboost_tests_handle_result - -SUITE_TO_FN = { - "long_running_tests": long_running_tests_handle_result, - "rllib_tests": rllib_tests_handle_result, - "tune_tests": tune_tests_handle_result, - "xgboost_tests": xgboost_tests_handle_result, -} - -GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"] = "alert_state" -GLOBAL_CONFIG["SLACK_WEBHOOK"] = os.environ.get("SLACK_WEBHOOK", "") -GLOBAL_CONFIG["SLACK_CHANNEL"] = os.environ.get("SLACK_CHANNEL", "#oss-test-cop") - -RESULTS_LIMIT = 120 - -logger = logging.getLogger() -logger.setLevel(logging.INFO) -handler = logging.StreamHandler(stream=sys.stdout) -formatter = logging.Formatter( - fmt="[%(levelname)s %(asctime)s] " "%(filename)s: %(lineno)d " "%(message)s" -) -handler.setFormatter(formatter) -logger.addHandler(handler) - - -def maybe_fetch_slack_webhook(): - if GLOBAL_CONFIG["SLACK_WEBHOOK"] in [None, ""]: - print("Missing SLACK_WEBHOOK, retrieving from AWS secrets store") - GLOBAL_CONFIG["SLACK_WEBHOOK"] = boto3.client( - "secretsmanager", region_name="us-west-2" - ).get_secret_value( - SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:" - "release-automation/" - "slack-webhook-Na0CFP" - )[ - "SecretString" - ] - - -def _obj_hash(obj: Any) -> str: - json_str = json.dumps(obj, sort_keys=True, ensure_ascii=True) - sha = hashlib.sha256() - sha.update(json_str.encode()) - return sha.hexdigest() - - -def fetch_latest_alerts(rds_data_client): - schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"] - - sql = f""" - SELECT DISTINCT ON (category, test_suite, test_name) - category, test_suite, test_name, last_result_hash, - last_notification_dt - FROM {schema} - ORDER BY category, test_suite, test_name, last_notification_dt DESC - LIMIT {RESULTS_LIMIT} - """ - - result = rds_data_client.execute_statement( - database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"], - secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"], - resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"], - schema=schema, - sql=sql, - ) - for row in result["records"]: - category, test_suite, test_name, last_result_hash, last_notification_dt = ( - r["stringValue"] if "stringValue" in r else None for r in row - ) - last_notification_dt = datetime.datetime.strptime( - last_notification_dt, "%Y-%m-%d %H:%M:%S" - ) - yield category, test_suite, test_name, last_result_hash, last_notification_dt - - -def fetch_latest_results( - rds_data_client, fetch_since: Optional[datetime.datetime] = None -): - schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"] - - sql = f""" - SELECT DISTINCT ON (category, test_suite, test_name) - created_on, category, test_suite, test_name, status, results, - artifacts, last_logs - FROM {schema} """ - - parameters = [] - if fetch_since is not None: - sql += "WHERE created_on >= :created_on " - parameters = [ - { - "name": "created_on", - "typeHint": "TIMESTAMP", - "value": {"stringValue": fetch_since.strftime("%Y-%m-%d %H:%M:%S")}, - }, - ] - - sql += "ORDER BY category, test_suite, test_name, created_on DESC " - sql += f"LIMIT {RESULTS_LIMIT}" - - result = rds_data_client.execute_statement( - 
database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"], - secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"], - resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"], - schema=schema, - sql=sql, - parameters=parameters, - ) - for row in result["records"]: - ( - created_on, - category, - test_suite, - test_name, - status, - results, - artifacts, - last_logs, - ) = (r["stringValue"] if "stringValue" in r else None for r in row) - - # Calculate hash before converting strings to objects - result_obj = ( - created_on, - category, - test_suite, - test_name, - status, - results, - artifacts, - last_logs, - ) - result_json = json.dumps(result_obj) - result_hash = _obj_hash(result_json) - - # Convert some strings to python objects - created_on = datetime.datetime.strptime(created_on, "%Y-%m-%d %H:%M:%S") - results = json.loads(results) - artifacts = json.loads(artifacts) - - yield result_hash, created_on, category, test_suite, test_name, status, results, artifacts, last_logs # noqa: E501 - - -def mark_as_handled( - rds_data_client, - update: bool, - category: str, - test_suite: str, - test_name: str, - result_hash: str, - last_notification_dt: datetime.datetime, -): - schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"] - - if not update: - sql = f""" - INSERT INTO {schema} - (category, test_suite, test_name, - last_result_hash, last_notification_dt) - VALUES (:category, :test_suite, :test_name, - :last_result_hash, :last_notification_dt) - """ - else: - sql = f""" - UPDATE {schema} - SET last_result_hash=:last_result_hash, - last_notification_dt=:last_notification_dt - WHERE category=:category AND test_suite=:test_suite - AND test_name=:test_name - """ - - rds_data_client.execute_statement( - database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"], - parameters=[ - {"name": "category", "value": {"stringValue": category}}, - {"name": "test_suite", "value": {"stringValue": test_suite or ""}}, - {"name": "test_name", "value": {"stringValue": test_name}}, - {"name": "last_result_hash", "value": {"stringValue": result_hash}}, - { - "name": "last_notification_dt", - "typeHint": "TIMESTAMP", - "value": { - "stringValue": last_notification_dt.strftime("%Y-%m-%d %H:%M:%S") - }, - }, - ], - secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"], - resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"], - schema=schema, - sql=sql, - ) - - -def post_alerts_to_slack( - channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int] -): - if len(alerts) == 0: - logger.info("No alerts to post to slack.") - return - - markdown_lines = [ - f"* {len(alerts)} new release test failures found!*", - "", - ] - - category_alerts = defaultdict(list) - for (category, test_suite, test_name, alert) in alerts: - category_alerts[category].append( - f" *{test_suite}/{test_name}* failed: {alert}" - ) - - for category, alert_list in category_alerts.items(): - markdown_lines.append(f"Branch: *{category}*") - markdown_lines.extend(alert_list) - markdown_lines.append("") - - total_non_alerts = sum(n for n in non_alerts.values()) - non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()] - - markdown_lines += [ - f"Additionally, {total_non_alerts} tests passed successfully " - f"({', '.join(non_alert_detail)})." 
- ] - - slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"] - - resp = requests.post( - slack_url, - json={ - "text": "\n".join(markdown_lines), - "channel": channel, - "username": "Fail Bot", - "icon_emoji": ":red_circle:", - }, - ) - print(resp.status_code) - print(resp.text) - - -def post_statistics_to_slack( - channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int] -): - total_alerts = len(alerts) - - category_alerts = defaultdict(list) - for (category, test_suite, test_name, alert) in alerts: - category_alerts[category].append(f"`{test_suite}/{test_name}`") - - alert_detail = [f"{len(a)} on {c}" for c, a in category_alerts.items()] - - total_non_alerts = sum(n for n in non_alerts.values()) - non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()] - - markdown_lines = [ - "*Periodic release test report*", - "", - f"In the past 24 hours, " - f"*{total_non_alerts}* release tests finished successfully, and " - f"*{total_alerts}* release tests failed.", - ] - - markdown_lines.append("") - - if total_alerts: - markdown_lines.append(f"*Failing:* {', '.join(alert_detail)}") - for c, a in category_alerts.items(): - markdown_lines.append(f" *{c}*: {', '.join(sorted(a))}") - else: - markdown_lines.append("*Failing:* None") - - markdown_lines.append("") - - if total_non_alerts: - markdown_lines.append(f"*Passing:* {', '.join(non_alert_detail)}") - else: - markdown_lines.append("*Passing:* None") - - slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"] - - resp = requests.post( - slack_url, - json={ - "text": "\n".join(markdown_lines), - "channel": channel, - "username": "Fail Bot", - "icon_emoji": ":red_circle:", - }, - ) - print(resp.status_code) - print(resp.text) - - -def handle_results_and_get_alerts( - rds_data_client, - fetch_since: Optional[datetime.datetime] = None, - always_try_alert: bool = False, - no_status_update: bool = False, -): - # First build a map of last notifications - last_notifications_map = {} - for ( - category, - test_suite, - test_name, - last_result_hash, - last_notification_dt, - ) in fetch_latest_alerts(rds_data_client): - last_notifications_map[(category, test_suite, test_name)] = ( - last_result_hash, - last_notification_dt, - ) - - alerts = [] - non_alerts = Counter() - - # Then fetch latest results - for ( - result_hash, - created_on, - category, - test_suite, - test_name, - status, - results, - artifacts, - last_logs, - ) in fetch_latest_results(rds_data_client, fetch_since=fetch_since): - key = (category, test_suite, test_name) - - try_alert = always_try_alert - if key in last_notifications_map: - # If we have an alert for this key, fetch info - last_result_hash, last_notification_dt = last_notifications_map[key] - - if last_result_hash != result_hash: - # If we got a new result, handle new result - try_alert = True - # Todo: maybe alert again after some time? 
- else: - try_alert = True - - if try_alert: - handle_fn = SUITE_TO_FN.get(test_suite, None) - if not handle_fn: - logger.warning(f"No handle for suite {test_suite}") - alert = default_handle_result( - created_on, - category, - test_suite, - test_name, - status, - results, - artifacts, - last_logs, - ) - else: - alert = handle_fn( - created_on, - category, - test_suite, - test_name, - status, - results, - artifacts, - last_logs, - ) - - if alert: - logger.warning( - f"Alert raised for test {test_suite}/{test_name} " - f"({category}): {alert}" - ) - - alerts.append((category, test_suite, test_name, alert)) - else: - logger.debug( - f"No alert raised for test {test_suite}/{test_name} " - f"({category})" - ) - non_alerts[category] += 1 - - if not no_status_update: - mark_as_handled( - rds_data_client, - key in last_notifications_map, - category, - test_suite, - test_name, - result_hash, - datetime.datetime.now(), - ) - - return alerts, non_alerts - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--stats", - action="store_true", - default=False, - help="Finish quickly for training.", - ) - args = parser.parse_args() - - maybe_fetch_slack_webhook() - - rds_data_client = boto3.client("rds-data", region_name="us-west-2") - - if args.stats: - # Only update last 24 hour stats - fetch_since = datetime.datetime.now() - datetime.timedelta(days=1) - alerts, non_alerts = handle_results_and_get_alerts( - rds_data_client, - fetch_since=fetch_since, - always_try_alert=True, - no_status_update=True, - ) - post_statistics_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts) - - else: - alerts, non_alerts = handle_results_and_get_alerts(rds_data_client) - post_alerts_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts) diff --git a/release/benchmarks/benchmark_tests.yaml b/release/benchmarks/benchmark_tests.yaml deleted file mode 100644 index a89e3deb9..000000000 --- a/release/benchmarks/benchmark_tests.yaml +++ /dev/null @@ -1,145 +0,0 @@ -- name: single_node - team: core - cluster: - app_config: app_config.yaml - compute_template: single_node.yaml - - run: - timeout: 12000 - prepare: sleep 0 - script: python single_node/test_single_node.py - -- name: object_store - team: core - cluster: - app_config: app_config.yaml - compute_template: object_store.yaml - - run: - timeout: 3600 - prepare: python distributed/wait_cluster.py --num-nodes=50 - script: python object_store/test_object_store.py - -- name: many_actors - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=65 - script: python distributed/test_many_actors.py - -- name: many_actors_smoke_test - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed_smoke_test.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=2 - script: SMOKE_TEST=1 python distributed/test_many_actors.py - -- name: many_tasks - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=65 - script: python distributed/test_many_tasks.py --num-tasks=10000 - -- name: many_tasks_smoke_test - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed_smoke_test.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=2 - script: python 
distributed/test_many_tasks.py --num-tasks=100 - -- name: many_pgs - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=65 - script: python distributed/test_many_pgs.py - -- name: many_pgs_smoke_test - team: core - cluster: - app_config: app_config.yaml - compute_template: distributed_smoke_test.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=2 - script: SMOKE_TEST=1 python distributed/test_many_pgs.py - -# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test -- name: many_nodes - team: core - cluster: - app_config: app_config.yaml - compute_template: many_nodes.yaml - - run: - timeout: 3600 # 1hr - prepare: python distributed/wait_cluster.py --num-nodes=250 - script: python distributed/test_many_tasks.py --num-tasks=1000 - -- name: scheduling_test_many_0s_tasks_single_node - team: core - cluster: - app_config: app_config.yaml - compute_template: scheduling.yaml - - run: - timeout: 3600 - prepare: python distributed/wait_cluster.py --num-nodes=32 - script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1 - -- name: scheduling_test_many_0s_tasks_many_nodes - team: core - cluster: - app_config: app_config.yaml - compute_template: scheduling.yaml - - run: - timeout: 3600 - prepare: python distributed/wait_cluster.py --num-nodes=32 - script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1 - -- name: scheduling_test_many_5s_tasks_single_node - team: core - cluster: - app_config: app_config.yaml - compute_template: scheduling.yaml - - run: - timeout: 3600 - prepare: python distributed/wait_cluster.py --num-nodes=32 - script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1 - stable: false - -- name: scheduling_test_many_5s_tasks_many_nodes - team: core - cluster: - app_config: app_config.yaml - compute_template: scheduling.yaml - - run: - timeout: 3600 - prepare: python distributed/wait_cluster.py --num-nodes=32 - script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1 - stable: false diff --git a/release/benchmarks/distributed/wait_cluster.py b/release/benchmarks/distributed/wait_cluster.py deleted file mode 100644 index 12a8a1677..000000000 --- a/release/benchmarks/distributed/wait_cluster.py +++ /dev/null @@ -1,24 +0,0 @@ -import click -import ray -import time - - -def num_alive_nodes(): - n = 0 - for node in ray.nodes(): - if node["Alive"]: - n += 1 - return n - - -@click.command() -@click.option("--num-nodes", required=True, type=int, help="The target number of nodes") -def wait_cluster(num_nodes: int): - ray.init(address="auto") - while num_alive_nodes() != num_nodes: - print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}") - time.sleep(5) - - -if __name__ == "__main__": - wait_cluster() diff --git a/release/benchmarks/wait_cluster.py b/release/benchmarks/wait_cluster.py deleted file mode 100644 index f70088289..000000000 --- a/release/benchmarks/wait_cluster.py +++ /dev/null @@ -1,54 +0,0 @@ -import argparse -import time - -import ray - -ray.init(address="auto") - -parser = 
argparse.ArgumentParser() -parser.add_argument( - "num_nodes", type=int, help="Wait for this number of nodes (includes head)" -) - -parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds") - -parser.add_argument( - "--feedback_interval_s", - type=int, - default=10, - help="Wait for this number of seconds", -) - -args = parser.parse_args() - -curr_nodes = 0 -start = time.time() -next_feedback = start -max_time = start + args.max_time_s - -while not curr_nodes >= args.num_nodes: - now = time.time() - - if now >= max_time: - raise RuntimeError( - f"Maximum wait time reached, but only " - f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting." - ) - - if now >= next_feedback: - passed = now - start - print( - f"Waiting for more nodes to come up: " - f"{curr_nodes}/{args.num_nodes} " - f"({passed:.0f} seconds passed)" - ) - next_feedback = now + args.feedback_interval_s - - time.sleep(5) - curr_nodes = len(ray.nodes()) - -passed = time.time() - start -print( - f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after " - f"{passed:.0f} seconds" -) diff --git a/release/config_generator.html b/release/config_generator.html deleted file mode 100644 index 179bd6320..000000000 --- a/release/config_generator.html +++ /dev/null @@ -1,214 +0,0 @@ - - - - - Releaser config generator - - - - -
[Remainder of the deleted config_generator.html lost to extraction: the page rendered a "Releaser config generator" form with the blurb "Use this form to generate a list of environment variables. These variables can be passed to Buildkite to run a subset of release tests and choose the correct wheels/release test branch", followed by a Set / Value / Description table of input fields.]
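The variable names that generator emitted are the same ones build_pipeline.py (earlier in this diff) reads at startup. For orientation only, a sketch of that hand-off with purely illustrative values (the branch, file and test names are examples taken from elsewhere in this diff, not recovered form contents):

    import os

    # Hypothetical output of the form; the names come from build_pipeline.py,
    # the values are examples only.
    example_form_output = {
        "RAY_BRANCH": "releases/1.x.0",
        "RELEASE_TEST_SUITE": "nightly",      # one of the SUITES keys
        "FILTER_FILE": "xgboost_tests.yaml",  # substring match against test files
        "FILTER_TEST": "tune_small",          # substring match against test names
    }
    os.environ.update(example_form_output)

    # build_pipeline.py then picks these up from the environment:
    RELEASE_TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")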
- - \ No newline at end of file diff --git a/release/e2e.py b/release/e2e.py deleted file mode 100644 index 3f458d56a..000000000 --- a/release/e2e.py +++ /dev/null @@ -1,2585 +0,0 @@ -""" -This is an end to end release test automation script used to kick off periodic -release tests, running on Anyscale. - -The tool leverages app configs and compute templates. - -Calling this script will run a single release test. - -Example: - -python e2e.py --test-config ~/ray/release/xgboost_tests/xgboost_tests.yaml --test-name tune_small - -The following steps are then performed: - -1. It will look up the test tune_small in the file xgboost_tests.yaml -2. It will fetch the specified app config and compute template and register - those with anyscale (if they don’t exist yet) -3. It waits until the app config is built -4. It then kicks off the script defined in the run block -5. When the script is finished, it will fetch the latest logs, the full log - output, and any artifacts specified in the artifacts block. -6. The full logs and artifacts will be stored in a s3 bucket -7. It will also fetch the json file specified in the run block as results. - This is the file where you should write your metrics to. -8. All results are then stored in a database. - Specifically it will store the following fields: - - Timestamp - - Test name - - Status (finished, error, timeout, invalid) - - Last logs (50 lines) - - results (see above) - - artifacts (links to s3 files) - -Then the script exits. If an error occurs at any time, a fail result is -written to the database. - -Exit codes ----------- -The script exits with code 0 on success, i.e. if the test has been run -end to end without failures and the subsequent results checks have passed. -In all other cases, an exit code > 0 is returned. - -Exit code 1 is the general failure exit code returned by Python when we -encounter an error that isn't caught by the rest of the script. - -Generally, we try to catch errors as they occur, and return a specific exit -code that can be used in automation tools to e.g. retry a test when nodes -didn't come up in time. - -These exit codes are defined in the ``ExitCode`` enum below. - -Writing a new release test --------------------------- -Each release test requires the following: - -1. It has to be added in a release test yaml file, describing meta information - about the test (e.g. name, command to run, timeout) -2. You need an app config yaml -3. You need a compute template yaml -4. You need to define a command to run. This is usually a python script. - The command should accept (or ignore) a single optional - `--smoke-test` argument. - Usually the command should write its result metrics to a json file. - The json filename is available in the TEST_OUTPUT_JSON env variable. -5. Add your test in release/.buildkite/build_pipeline.py. - -The script will have access to these environment variables: - - "RAY_ADDRESS": os.environ.get("RAY_ADDRESS", "auto") - "TEST_OUTPUT_JSON": results_json_filename - "IS_SMOKE_TEST": "1" if smoke_test else "0" - -For an example, take a look at the XGBoost test suite: - -https://github.com/ray-project/ray/blob/master/release/xgboost_tests/xgboost_tests.yaml - -These all use the same app configs and similar compute templates. This means -that app configs can be re-used across runs and only have to be built ones. - -App configs and compute templates can interpret environment variables. -A notable one is the `RAY_WHEELS` variable which points to the wheels that -should be tested (e.g. latest master wheels). 
You might want to include -something like this in your `post_build_cmds`: - - - pip3 uninstall ray -y || true - - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} - -If you want to force rebuilds, consider using something like - - - echo {{ env["TIMESTAMP"] }} - -so that your app configs changes each time the script is executed. If you -only want to trigger rebuilds once per day, use `DATESTAMP` instead: - - - echo {{ env["DATESTAMP"] }} - -Local testing -------------- -Make sure to set these environment variables: - -- ANYSCALE_CLI_TOKEN (should contain your anyscale credential token) -- ANYSCALE_PROJECT (should point to a project ID you have access to) - -A test can then be run like this: - -python e2e.py --test-config ~/ray/release/xgboost_tests/xgboost_tests.yaml --test-name tune_small - -Using Compilation on Product + App Config Override --------------------------------------------------- -For quick iteration when debugging a release test, go/compile-on-product allows -you to easily modify and recompile Ray, such that the recompilation happens -within an app build step and can benefit from a warm Bazel cache. See -go/compile-on-product for more information. - -After kicking off the app build, you can give the app config ID to this script -as an app config override, where the indicated app config will be used instead -of the app config given in the test config. E.g., running - -python e2e.py --test-config ~/ray/benchmarks/benchmark_tests.yaml --test-name=single_node --app-config-id-override=apt_TBngEXXXrhipMXgexVcrpC9i - -would run the single_node benchmark test with the apt_TBngEXXXrhipMXgexVcrpC9i -app config instead of the app config given in -~/ray/benchmarks/benchmark_tests.yaml. If the build for the app config is still -in progress, the script will wait until it completes, same as for a locally -defined app config. - -Running on Head Node vs Running with Anyscale Connect ------------------------------------------------------ -By default release tests run their drivers on the head node. Support is being -added to run release tests that execute the driver as a subprocess and run -the workload on Anyscale product via Anyscale connect. -Note that when the driver in the test is a subprocess of releaser, releaser -cannot be terminated before the test finishes. -Other known feature gaps when running with Anyscale connect: -- Kicking off a test or checking progress is not supported. -- Downloading / uploading logs and artifacts are unsupported. -- Logs from remote may not have finished streaming, before the driver exits. - -Long running tests ------------------- -Long running tests can be kicked off with by adding the --kick-off-only -parameters to the e2e script. The status can then be checked with the ---check command. - -Long running test sessions will be terminated after `timeout` seconds, after -which the latest result in the TEST_OUTPUT_JSON will be reported. Thus, -long running release tests should update this file periodically. - -There are also two config options to configure behavior. The `time_key` is -needed to track the latest update of the TEST_OUTPUT_JSON and should contain -a floating point number (usually `time.time()`). The `max_update_delay` then -specified the maximum time in seconds that can be passed without an update -to the results json. If the output file hasn't been updated in e.g. 60 seconds, -this could indicate that the command is stale/frozen, and thus should fail. 
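A minimal sketch of such a periodic update, assuming only the TEST_OUTPUT_JSON and `time_key` conventions described above (the helper name `report_progress` is made up for illustration):

    import json
    import os
    import time

    results_file = os.environ.get("TEST_OUTPUT_JSON", "/tmp/release_test_out.json")

    def report_progress(metrics: dict) -> None:
        # "last_update" matches the `time_key` configured in the test yaml, so the
        # harness can see that the workload is still making progress.
        metrics["last_update"] = time.time()
        with open(results_file, "w") as f:
            json.dump(metrics, f)

    # Call this at least once every `max_update_delay` seconds, e.g. per iteration:
    report_progress({"iterations_finished": 0})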
- -Release test yaml example -------------------------- -- name: example - owner: - mail: "kai@anyscale.com" # Currently not used - slack: "@tune-team" # Currentl not used - - cluster: - app_config: app_config.yaml # Relative to the release test yaml - compute_template: tpl_cpu.yaml - - run: - timeout: 600 # in seconds - prepare: python wait_cluster.py 4 600 # prepare cmd to run before test - script: python workloads/train.py # actual release test command - - # Only needed for long running test - time_key: last_update # Key in the results json indicating current time - max_update_delay: 30 # If state hasn't been updated in 30s, terminate - - # This block is optional - artifacts: - # Artifact name: location on head node - - detailed_output: detailed_output.csv - - # This block is optional. If present, the contents will be - # deep updated for smoke testing - smoke_test: - cluster: - compute_template: tpl_cpu_smoketest.yaml - -""" # noqa: E501 -import argparse -import enum -import random -import shlex -import string - -import boto3 -import collections -import copy -import datetime -import hashlib -import jinja2 -import json -import logging -import multiprocessing -import os -import requests -import shutil -import subprocess -import sys -import re -import tempfile -import time -from queue import Empty -from typing import Any, Dict, Optional, Tuple, List - -import yaml - -import anyscale -import anyscale.conf -from anyscale.authenticate import get_auth_api_client -from anyscale.controllers.session_controller import SessionController -from anyscale.sdk.anyscale_client.sdk import AnyscaleSDK - -logger = logging.getLogger() -logger.setLevel(logging.INFO) -handler = logging.StreamHandler(stream=sys.stdout) -formatter = logging.Formatter( - fmt="[%(levelname)s %(asctime)s] " "%(filename)s: %(lineno)d " "%(message)s" -) -handler.setFormatter(formatter) -logger.addHandler(handler) - - -def _format_link(link: str): - # Use ANSI escape code to allow link to be clickable - # https://buildkite.com/docs/pipelines/links-and-images - # -in-log-output - return "\033]1339;url='" + link + "'\a\n" - - -def getenv_default(key: str, default: Optional[str] = None): - """Return environment variable with default value""" - # If the environment variable is set but "", still return default - return os.environ.get(key, None) or default - - -GLOBAL_CONFIG = { - "ANYSCALE_USER": getenv_default("ANYSCALE_USER", "release-automation@anyscale.com"), - "ANYSCALE_HOST": getenv_default("ANYSCALE_HOST", "https://console.anyscale.com"), - "ANYSCALE_CLI_TOKEN": getenv_default("ANYSCALE_CLI_TOKEN"), - "ANYSCALE_CLOUD_ID": getenv_default( - "ANYSCALE_CLOUD_ID", "cld_4F7k8814aZzGG8TNUGPKnc" - ), # anyscale_default_cloud - "ANYSCALE_PROJECT": getenv_default("ANYSCALE_PROJECT", ""), - "RAY_VERSION": getenv_default("RAY_VERSION", "2.0.0.dev0"), - "RAY_REPO": getenv_default("RAY_REPO", "https://github.com/ray-project/ray.git"), - "RAY_BRANCH": getenv_default("RAY_BRANCH", "master"), - "RELEASE_AWS_BUCKET": getenv_default( - "RELEASE_AWS_BUCKET", "ray-release-automation-results" - ), - "RELEASE_AWS_LOCATION": getenv_default("RELEASE_AWS_LOCATION", "dev"), - "RELEASE_AWS_DB_NAME": getenv_default("RELEASE_AWS_DB_NAME", "ray_ci"), - "RELEASE_AWS_DB_TABLE": getenv_default( - "RELEASE_AWS_DB_TABLE", "release_test_result" - ), - "RELEASE_AWS_DB_SECRET_ARN": getenv_default( - "RELEASE_AWS_DB_SECRET_ARN", - "arn:aws:secretsmanager:us-west-2:029272617770:secret:" - "rds-db-credentials/cluster-7RB7EYTTBK2EUC3MMTONYRBJLE/ray_ci-MQN2hh", - ), - 
"RELEASE_AWS_DB_RESOURCE_ARN": getenv_default( - "RELEASE_AWS_DB_RESOURCE_ARN", - "arn:aws:rds:us-west-2:029272617770:cluster:ci-reporting", - ), - "RELEASE_RESULTS_DIR": getenv_default( - "RELEASE_RESULTS_DIR", "/tmp/ray_release_test_artifacts" - ), - "DATESTAMP": str(datetime.datetime.now().strftime("%Y%m%d")), - "TIMESTAMP": str(int(datetime.datetime.now().timestamp())), - "EXPIRATION_1D": str( - (datetime.datetime.now() + datetime.timedelta(days=1)).strftime("%Y-%m-%d") - ), - "EXPIRATION_2D": str( - (datetime.datetime.now() + datetime.timedelta(days=2)).strftime("%Y-%m-%d") - ), - "EXPIRATION_3D": str( - (datetime.datetime.now() + datetime.timedelta(days=3)).strftime("%Y-%m-%d") - ), - "REPORT_RESULT": getenv_default("REPORT_RESULT", ""), -} - -REPORT_S = 30 -RETRY_MULTIPLIER = 2 -VALID_TEAMS = ["ml", "core", "serve"] - - -class ExitCode(enum.Enum): - # If you change these, also change the `retry` section - # in `build_pipeline.py` and the `reason()` function in `run_e2e.sh` - UNSPECIFIED = 2 - UNKNOWN = 3 - RUNTIME_ERROR = 4 - COMMAND_ERROR = 5 - COMMAND_TIMEOUT = 6 - PREPARE_TIMEOUT = 7 - FILESYNC_TIMEOUT = 8 - SESSION_TIMEOUT = 9 - PREPARE_ERROR = 10 - APPCONFIG_BUILD_ERROR = 11 - INFRA_ERROR = 12 - - -def exponential_backoff_retry(f, retry_exceptions, initial_retry_delay_s, max_retries): - retry_cnt = 0 - retry_delay_s = initial_retry_delay_s - while True: - try: - return f() - except retry_exceptions as e: - retry_cnt += 1 - if retry_cnt > max_retries: - raise - logger.info( - f"Retry function call failed due to {e} " - f"in {retry_delay_s} seconds..." - ) - time.sleep(retry_delay_s) - retry_delay_s *= RETRY_MULTIPLIER - - -def maybe_fetch_api_token(): - if GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"] is None: - logger.info("Missing ANYSCALE_CLI_TOKEN, retrieving from AWS secrets store") - # NOTE(simon) This should automatically retrieve - # release-automation@anyscale.com's anyscale token - GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"] = boto3.client( - "secretsmanager", region_name="us-west-2" - ).get_secret_value( - SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:" - "release-automation/" - "anyscale-token20210505220406333800000001-BcUuKB" - )[ - "SecretString" - ] - - -class PrepareCommandRuntimeError(RuntimeError): - pass - - -class ReleaseTestRuntimeError(RuntimeError): - pass - - -class ReleaseTestInfraError(ReleaseTestRuntimeError): - pass - - -class ReleaseTestTimeoutError(ReleaseTestRuntimeError): - pass - - -class SessionTimeoutError(ReleaseTestTimeoutError): - pass - - -class FileSyncTimeoutError(ReleaseTestTimeoutError): - pass - - -class CommandTimeoutError(ReleaseTestTimeoutError): - pass - - -class PrepareCommandTimeoutError(ReleaseTestTimeoutError): - pass - - -# e.g., App config failure. 
-class AppConfigBuildFailure(RuntimeError): - pass - - -class State: - def __init__(self, state: str, timestamp: float, data: Any): - self.state = state - self.timestamp = timestamp - self.data = data - - -class CommandRunnerHack: - def __init__(self): - self.subprocess_pool: Dict[int, subprocess.Popen] = dict() - self.start_time: Dict[int, float] = dict() - self.counter = 0 - - def run_command(self, session_name, cmd_to_run, env_vars) -> int: - self.counter += 1 - command_id = self.counter - env = os.environ.copy() - env["RAY_ADDRESS"] = f"anyscale://{session_name}" - env["ANYSCALE_CLI_TOKEN"] = GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"] - env["ANYSCALE_HOST"] = GLOBAL_CONFIG["ANYSCALE_HOST"] - full_cmd = " ".join(f"{k}={v}" for k, v in env_vars.items()) + " " + cmd_to_run - logger.info(f"Executing {cmd_to_run} with {env_vars} via ray job submit") - proc = subprocess.Popen( - f"ray job submit -- bash -c {shlex.quote(full_cmd)}", - shell=True, - stdout=sys.stdout, - stderr=sys.stderr, - env=env, - ) - self.subprocess_pool[command_id] = proc - self.start_time[command_id] = time.time() - return command_id - - def wait_command(self, command_id: int): - retcode = self.subprocess_pool[command_id].wait() - duration = time.time() - self.start_time[command_id] - return retcode, duration - - -global_command_runner = CommandRunnerHack() - - -class S3SyncSessionController(SessionController): - def __init__(self, sdk, result_queue): - self.sdk = sdk - self.result_queue = result_queue - self.s3_client = boto3.client("s3") - self.bucket = GLOBAL_CONFIG["RELEASE_AWS_BUCKET"] - super().__init__() - - def _generate_tmp_s3_path(self): - fn = "".join(random.choice(string.ascii_lowercase) for i in range(10)) - location = f"tmp/{fn}" - return location - - def pull(self, session_name, source, target): - remote_upload_to = self._generate_tmp_s3_path() - # remote source -> s3 - cid = global_command_runner.run_command( - session_name, - ( - f"pip install -q awscli && aws s3 cp {source} " - f"s3://{self.bucket}/{remote_upload_to} " - "--acl bucket-owner-full-control" - ), - {}, - ) - global_command_runner.wait_command(cid) - - # s3 -> local target - self.s3_client.download_file( - Bucket=self.bucket, - Key=remote_upload_to, - Filename=target, - ) - - def _push_local_dir(self, session_name): - remote_upload_to = self._generate_tmp_s3_path() - # pack local dir - _, local_path = tempfile.mkstemp() - shutil.make_archive(local_path, "gztar", os.getcwd()) - # local source -> s3 - self.s3_client.upload_file( - Filename=local_path + ".tar.gz", - Bucket=self.bucket, - Key=remote_upload_to, - ) - # s3 -> remote target - cid = global_command_runner.run_command( - session_name, - ( - "pip install -q awscli && " - f"aws s3 cp s3://{self.bucket}/{remote_upload_to} " - f"archive.tar.gz && " - "tar xf archive.tar.gz" - ), - {}, - ) - global_command_runner.wait_command(cid) - - def push( - self, - session_name: str, - source: Optional[str], - target: Optional[str], - config: Optional[str], - all_nodes: bool, - no_warning: bool = False, - ): - if source is None and target is None: - self._push_local_dir(session_name) - return - - assert isinstance(source, str) - assert isinstance(target, str) - - remote_upload_to = self._generate_tmp_s3_path() - # local source -> s3 - self.s3_client.upload_file( - Filename=source, - Bucket=self.bucket, - Key=remote_upload_to, - ) - # s3 -> remote target - cid = global_command_runner.run_command( - session_name, - "pip install -q awscli && " - f"aws s3 cp s3://{self.bucket}/{remote_upload_to} {target}", - 
{}, - ) - global_command_runner.wait_command(cid) - - -sys.path.insert(0, anyscale.ANYSCALE_RAY_DIR) - - -def anyscale_project_url(project_id: str): - return ( - f"{GLOBAL_CONFIG['ANYSCALE_HOST']}" - f"/o/anyscale-internal/projects/{project_id}" - f"/?tab=session-list" - ) - - -def anyscale_session_url(project_id: str, session_id: str): - return ( - f"{GLOBAL_CONFIG['ANYSCALE_HOST']}" - f"/o/anyscale-internal/projects/{project_id}" - f"/clusters/{session_id}" - ) - - -def anyscale_compute_tpl_url(compute_tpl_id: str): - return ( - f"{GLOBAL_CONFIG['ANYSCALE_HOST']}" - f"/o/anyscale-internal/configurations/cluster-computes" - f"/{compute_tpl_id}" - ) - - -def anyscale_app_config_build_url(build_id: str): - return ( - f"{GLOBAL_CONFIG['ANYSCALE_HOST']}" - f"/o/anyscale-internal/configurations/app-config-details" - f"/{build_id}" - ) - - -def wheel_url(ray_version, git_branch, git_commit): - return ( - f"https://s3-us-west-2.amazonaws.com/ray-wheels/" - f"{git_branch}/{git_commit}/" - f"ray-{ray_version}-cp37-cp37m-manylinux2014_x86_64.whl" - ) - - -def wheel_exists(ray_version, git_branch, git_commit): - url = wheel_url(ray_version, git_branch, git_commit) - return requests.head(url).status_code == 200 - - -def commit_or_url(commit_or_url: str) -> str: - if commit_or_url.startswith("http"): - url = None - # Directly return the S3 url - if "s3" in commit_or_url and "amazonaws.com" in commit_or_url: - url = commit_or_url - # Resolve the redirects for buildkite artifacts - # This is needed because otherwise pip won't recognize the file name. - elif "buildkite.com" in commit_or_url and "artifacts" in commit_or_url: - url = requests.head(commit_or_url, allow_redirects=True).url - if url is not None: - # Extract commit from url so that we can do the - # commit sanity check later. 
- p = re.compile("/([a-f0-9]{40})/") - m = p.search(url) - if m is not None: - os.environ["RAY_COMMIT"] = m.group(1) - return url - - # Else, assume commit - os.environ["RAY_COMMIT"] = commit_or_url - return wheel_url( - GLOBAL_CONFIG["RAY_VERSION"], GLOBAL_CONFIG["RAY_BRANCH"], commit_or_url - ) - - -def get_latest_commits(repo: str, branch: str = "master") -> List[str]: - cur = os.getcwd() - with tempfile.TemporaryDirectory() as tmpdir: - os.chdir(tmpdir) - - clone_cmd = [ - "git", - "clone", - "--filter=tree:0", - "--no-checkout", - # "--single-branch", - # "--depth=10", - f"--branch={branch}", - repo, - tmpdir, - ] - log_cmd = [ - "git", - "log", - "-n", - "10", - "--pretty=format:%H", - ] - - subprocess.check_output(clone_cmd) - commits = ( - subprocess.check_output(log_cmd).decode(sys.stdout.encoding).split("\n") - ) - os.chdir(cur) - return commits - - -def find_ray_wheels(repo: str, branch: str, version: str): - url = None - commits = get_latest_commits(repo, branch) - logger.info(f"Latest 10 commits for branch {branch}: {commits}") - for commit in commits: - if wheel_exists(version, branch, commit): - url = wheel_url(version, branch, commit) - os.environ["RAY_WHEELS"] = url - os.environ["RAY_COMMIT"] = commit - logger.info( - f"Found wheels URL for Ray {version}, branch {branch}: " f"{url}" - ) - break - return url - - -def populate_wheels_sanity_check(commit: Optional[str] = None): - if not commit: - cmd = ( - "python -c 'import ray; print(" - '"No commit sanity check available, but this is the ' - "Ray wheel commit:\", ray.__commit__)'" - ) - else: - cmd = ( - f"python -c 'import ray; " - f'assert ray.__commit__ == "{commit}", ray.__commit__\'' - ) - os.environ["RAY_WHEELS_SANITY_CHECK"] = cmd - - -def _check_stop(stop_event: multiprocessing.Event, timeout_type: str): - if stop_event.is_set(): - if timeout_type == "prepare_command": - raise PrepareCommandTimeoutError( - "Process timed out in the prepare command stage." - ) - if timeout_type == "command": - raise CommandTimeoutError("Process timed out while running a command.") - elif timeout_type == "file_sync": - raise FileSyncTimeoutError("Process timed out while syncing files.") - elif timeout_type == "session": - raise SessionTimeoutError("Process timed out while starting a session.") - else: - assert False, "Unexpected timeout type." 
- - -def _deep_update(d, u): - for k, v in u.items(): - if isinstance(v, collections.abc.Mapping): - d[k] = _deep_update(d.get(k, {}), v) - else: - d[k] = v - return d - - -def _dict_hash(dt: Dict[Any, Any]) -> str: - json_str = json.dumps(dt, sort_keys=True, ensure_ascii=True) - sha = hashlib.sha256() - sha.update(json_str.encode()) - return sha.hexdigest() - - -def _load_config(local_dir: str, config_file: Optional[str]) -> Optional[Dict]: - if not config_file: - return None - - config_path = os.path.join(local_dir, config_file) - with open(config_path, "rt") as f: - # Todo: jinja2 render - content = f.read() - - env = copy.deepcopy(os.environ) - env.update(GLOBAL_CONFIG) - - content = jinja2.Template(content).render(env=env) - return yaml.safe_load(content) - - -def has_errored(result: Dict[Any, Any]) -> bool: - return result.get("status", "invalid") != "finished" - - -def maybe_get_alert_for_result(result_dict: Dict[str, Any]) -> Optional[str]: - # If we get a result dict, check if any alerts should be raised - from alert import SUITE_TO_FN, default_handle_result - - logger.info("Checking if results are valid...") - - # Copy dict because we modify kwargs here - handle_result_kwargs = result_dict.copy() - handle_result_kwargs["created_on"] = None - - test_suite = handle_result_kwargs.get("test_suite", None) - - handle_fn = SUITE_TO_FN.get(test_suite, None) - if not handle_fn: - logger.warning(f"No handle for suite {test_suite}") - alert = default_handle_result(**handle_result_kwargs) - else: - alert = handle_fn(**handle_result_kwargs) - - return alert - - -def report_result( - *, - test_suite: str, - test_name: str, - status: str, - last_logs: str, - results: Dict[Any, Any], - artifacts: Dict[Any, Any], - category: str, - team: str, -): - # session_url: str, commit_url: str, - # runtime: float, stable: bool, frequency: str, return_code: int): - """Report the test result to database.""" - now = datetime.datetime.utcnow() - rds_data_client = boto3.client("rds-data", region_name="us-west-2") - - schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"] - - parameters = [ - { - "name": "created_on", - "typeHint": "TIMESTAMP", - "value": {"stringValue": now.strftime("%Y-%m-%d %H:%M:%S")}, - }, - {"name": "test_suite", "value": {"stringValue": test_suite}}, - {"name": "test_name", "value": {"stringValue": test_name}}, - {"name": "status", "value": {"stringValue": status}}, - {"name": "last_logs", "value": {"stringValue": last_logs}}, - { - "name": "results", - "typeHint": "JSON", - "value": {"stringValue": json.dumps(results)}, - }, - { - "name": "artifacts", - "typeHint": "JSON", - "value": {"stringValue": json.dumps(artifacts)}, - }, - {"name": "category", "value": {"stringValue": category}}, - {"name": "team", "value": {"stringValue": team}}, - ] - columns = [param["name"] for param in parameters] - values = [f":{param['name']}" for param in parameters] - column_str = ", ".join(columns).strip(", ") - value_str = ", ".join(values).strip(", ") - - sql = f"INSERT INTO {schema} " f"({column_str}) " f"VALUES ({value_str})" - - logger.info(f"Query: {sql}") - - # Default boto3 call timeout is 45 seconds. 
- retry_delay_s = 64 - MAX_RDS_RETRY = 3 - exponential_backoff_retry( - lambda: rds_data_client.execute_statement( - database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"], - parameters=parameters, - secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"], - resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"], - schema=schema, - sql=sql, - ), - retry_exceptions=rds_data_client.exceptions.StatementTimeoutException, - initial_retry_delay_s=retry_delay_s, - max_retries=MAX_RDS_RETRY, - ) - logger.info("Result has been persisted to the database") - - # TODO(jjyao) Migrate to new infra later - logger.info("Persisting results to the databricks delta lake...") - - result_json = { - "_table": "release_test_result", - "created_on": now.strftime("%Y-%m-%d %H:%M:%S"), - "status": status, - "results": results, - "test_name": test_name, - "team": team, - "cluster_url": results["_session_url"], - "wheel_url": results["_commit_url"], - "runtime": results["_runtime"], - "stable": results["_stable"], - } - - logger.debug(f"Result json: {json.dumps(result_json)}") - - firehose_client = boto3.client("firehose", region_name="us-west-2") - firehose_client.put_record( - DeliveryStreamName="ray-ci-results", Record={"Data": json.dumps(result_json)} - ) - - logger.info("Result has been persisted to the databricks delta lake") - - -def log_results_and_artifacts(result: Dict): - results = result.get("results", {}) - if results: - msg = "Observed the following results:\n\n" - - for key, val in results.items(): - msg += f" {key} = {val}\n" - else: - msg = "Did not find any results." - logger.info(msg) - - artifacts = result.get("artifacts", {}) - if artifacts: - msg = "Saved the following artifacts:\n\n" - - for key, val in artifacts.items(): - msg += f" {key} = {val}\n" - else: - msg = "Did not find any artifacts." - logger.info(msg) - - -def _cleanup_session(sdk: AnyscaleSDK, session_id: str): - if session_id: - # Just trigger a request. No need to wait until session shutdown. - sdk.terminate_session(session_id=session_id, terminate_session_options={}) - - -def search_running_session( - sdk: AnyscaleSDK, project_id: str, session_name: str -) -> Optional[str]: - session_id = None - - logger.info(f"Looking for existing session with name {session_name}") - - result = sdk.search_sessions( - project_id=project_id, sessions_query=dict(name=dict(equals=session_name)) - ) - - if len(result.results) > 0 and result.results[0].state == "Running": - logger.info("Found existing session.") - session_id = result.results[0].id - return session_id - - -def find_cloud_by_name( - sdk: AnyscaleSDK, cloud_name: str, _repeat: bool = True -) -> Optional[str]: - cloud_id = None - logger.info(f"Looking up cloud with name `{cloud_name}`. 
") - - paging_token = None - while not cloud_id: - result = sdk.search_clouds( - clouds_query=dict(paging=dict(count=50, paging_token=paging_token)) - ) - - paging_token = result.metadata.next_paging_token - - for res in result.results: - if res.name == cloud_name: - cloud_id = res.id - logger.info(f"Found cloud with name `{cloud_name}` as `{cloud_id}`") - break - - if not paging_token or cloud_id or not len(result.results): - break - - return cloud_id - - -def create_or_find_compute_template( - sdk: AnyscaleSDK, project_id: str, compute_tpl: Dict[Any, Any], _repeat: bool = True -) -> Tuple[Optional[str], Optional[str]]: - compute_tpl_id = None - compute_tpl_name = None - if compute_tpl: - # As of Anyscale 0.4.1, it is an error to use the same compute template - # name within the same organization, between different projects. - compute_tpl_name = f"{project_id}/compute/{_dict_hash(compute_tpl)}" - - logger.info( - f"Tests uses compute template " - f"with name {compute_tpl_name}. Looking up existing " - f"templates." - ) - - paging_token = None - while not compute_tpl_id: - result = sdk.search_compute_templates( - dict( - project_id=project_id, - name=dict(equals=compute_tpl_name), - include_anonymous=True, - ), - paging_token=paging_token, - ) - paging_token = result.metadata.next_paging_token - - for res in result.results: - if res.name == compute_tpl_name: - compute_tpl_id = res.id - logger.info(f"Template already exists with ID {compute_tpl_id}") - break - - if not paging_token: - break - - if not compute_tpl_id: - logger.info( - f"Compute template not found. " - f"Creating with name {compute_tpl_name}." - ) - try: - result = sdk.create_compute_template( - dict( - name=compute_tpl_name, project_id=project_id, config=compute_tpl - ) - ) - compute_tpl_id = result.result.id - except Exception as e: - if _repeat: - logger.warning( - f"Got exception when trying to create compute " - f"template: {e}. Sleeping for 10 seconds and then " - f"try again once..." - ) - time.sleep(10) - return create_or_find_compute_template( - sdk=sdk, - project_id=project_id, - compute_tpl=compute_tpl, - _repeat=False, - ) - - raise e - - logger.info(f"Compute template created with ID {compute_tpl_id}") - - return compute_tpl_id, compute_tpl_name - - -def create_or_find_app_config( - sdk: AnyscaleSDK, project_id: str, app_config: Dict[Any, Any], _repeat: bool = True -) -> Tuple[Optional[str], Optional[str]]: - app_config_id = None - app_config_name = None - if app_config: - app_config_name = f"{project_id}-{_dict_hash(app_config)}" - - logger.info( - f"Test uses an app config with hash {app_config_name}. " - f"Looking up existing app configs with this name." - ) - - paging_token = None - while not app_config_id: - result = sdk.list_app_configs( - project_id=project_id, count=50, paging_token=paging_token - ) - paging_token = result.metadata.next_paging_token - - for res in result.results: - if res.name == app_config_name: - app_config_id = res.id - logger.info(f"App config already exists with ID {app_config_id}") - break - - if not paging_token or app_config_id: - break - - if not app_config_id: - logger.info("App config not found. Creating new one.") - try: - result = sdk.create_app_config( - dict( - name=app_config_name, - project_id=project_id, - config_json=app_config, - ) - ) - app_config_id = result.result.id - except Exception as e: - if _repeat: - logger.warning( - f"Got exception when trying to create app " - f"config: {e}. Sleeping for 10 seconds and then " - f"try again once..." 
- ) - time.sleep(10) - return create_or_find_app_config( - sdk=sdk, - project_id=project_id, - app_config=app_config, - _repeat=False, - ) - - raise e - - logger.info(f"App config created with ID {app_config_id}") - - return app_config_id, app_config_name - - -def run_bash_script(local_dir: str, bash_script: str): - previous_dir = os.getcwd() - - bash_script_local_dir = os.path.dirname(bash_script) - file_name = os.path.basename(bash_script) - - full_local_dir = os.path.join(local_dir, bash_script_local_dir) - os.chdir(full_local_dir) - - subprocess.run("./" + file_name, shell=True, check=True) - - os.chdir(previous_dir) - - -def install_app_config_packages(app_config: Dict[Any, Any]): - os.environ.update(app_config.get("env_vars", {})) - packages = app_config["python"]["pip_packages"] - for package in packages: - subprocess.check_output(["pip", "install", "-U", package], text=True) - - -def install_matching_ray(): - wheel = os.environ.get("RAY_WHEELS", None) - if not wheel: - return - assert "manylinux2014_x86_64" in wheel, wheel - if sys.platform == "darwin": - platform = "macosx_10_15_intel" - elif sys.platform == "win32": - platform = "win_amd64" - else: - platform = "manylinux2014_x86_64" - wheel = wheel.replace("manylinux2014_x86_64", platform) - subprocess.check_output(["pip", "uninstall", "-y", "ray"], text=True) - subprocess.check_output(["pip", "install", "-U", wheel], text=True) - - -def wait_for_build_or_raise( - sdk: AnyscaleSDK, app_config_id: Optional[str] -) -> Optional[str]: - if not app_config_id: - return None - - # Fetch build - build_id = None - last_status = None - result = sdk.list_builds(app_config_id) - for build in sorted(result.results, key=lambda b: b.created_at): - build_id = build.id - last_status = build.status - - if build.status == "failed": - continue - - if build.status == "succeeded": - logger.info( - f"Link to app config build: " - f"{_format_link(anyscale_app_config_build_url(build_id))}" - ) - return build_id - - if last_status == "failed": - raise AppConfigBuildFailure("App config build failed.") - - if not build_id: - raise AppConfigBuildFailure("No build found for app config.") - - # Build found but not failed/finished yet - completed = False - start_wait = time.time() - next_report = start_wait + REPORT_S - logger.info(f"Waiting for build {build_id} to finish...") - logger.info( - f"Track progress here: " - f"{_format_link(anyscale_app_config_build_url(build_id))}" - ) - while not completed: - now = time.time() - if now > next_report: - logger.info( - f"... still waiting for build {build_id} to finish " - f"({int(now - start_wait)} seconds) ..." - ) - next_report = next_report + REPORT_S - - result = sdk.get_build(build_id) - build = result.result - - if build.status == "failed": - raise AppConfigBuildFailure( - f"App config build failed. Please see " - f"{anyscale_app_config_build_url(build_id)} for details" - ) - - if build.status == "succeeded": - logger.info("Build succeeded.") - return build_id - - completed = build.status not in ["in_progress", "pending"] - - if completed: - raise AppConfigBuildFailure( - f"Unknown build status: {build.status}. 
Please see " - f"{anyscale_app_config_build_url(build_id)} for details" - ) - - time.sleep(1) - - return build_id - - -def run_job( - cluster_name: str, - compute_tpl_name: str, - cluster_env_name: str, - job_name: str, - min_workers: str, - script: str, - script_args: List[str], - env_vars: Dict[str, str], - autosuspend: int, -) -> Tuple[int, str]: - # Start cluster and job - address = f"anyscale://{cluster_name}?autosuspend={autosuspend}" - logger.info(f"Starting job {job_name} with Ray address: {address}") - env = copy.deepcopy(os.environ) - env.update(GLOBAL_CONFIG) - env.update(env_vars) - env["RAY_ADDRESS"] = address - env["RAY_JOB_NAME"] = job_name - env["RAY_RELEASE_MIN_WORKERS"] = str(min_workers) - proc = subprocess.Popen( - script.split(" ") + script_args, - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) - proc.stdout.reconfigure(line_buffering=True) - logs = "" - for line in proc.stdout: - logs += line - sys.stdout.write(line) - proc.wait() - return proc.returncode, logs - - -def create_and_wait_for_session( - sdk: AnyscaleSDK, - stop_event: multiprocessing.Event, - session_name: str, - session_options: Dict[Any, Any], - project_id: str, -) -> str: - # Create session - logger.info(f"Creating session {session_name}") - result = sdk.create_session(session_options) - session_id = result.result.id - - # Trigger session start - logger.info(f"Starting session {session_name} ({session_id})") - session_url = anyscale_session_url( - project_id=GLOBAL_CONFIG["ANYSCALE_PROJECT"], session_id=session_id - ) - logger.info(f"URL: {session_url}") - logger.info(f"Link to session: {_format_link(session_url)}") - - result = sdk.start_session(session_id, start_session_options={}) - sop_id = result.result.id - completed = result.result.completed - - # Wait for session - logger.info(f"Waiting for session {session_name}...") - start_wait = time.time() - next_report = start_wait + REPORT_S - while not completed: - # Sleep 1 sec before next check. - time.sleep(1) - - session_operation_response = sdk.get_session_operation( - sop_id, _request_timeout=30 - ) - session_operation = session_operation_response.result - completed = session_operation.completed - - try: - _check_stop(stop_event, "session") - except SessionTimeoutError as e: - # Always queue session termination. - # We can't do this later as we won't return anything here - # and the session ID will not be set in the control loop - _cleanup_session(sdk=sdk, session_id=session_id) - raise e - - now = time.time() - if now > next_report: - logger.info( - f"... still waiting for session {session_name} " - f"({int(now - start_wait)} seconds) ..." - ) - next_report = next_report + REPORT_S - - result = sdk.get_session(session_id) - if not result.result.state != "Active": - raise ReleaseTestInfraError( - f"Cluster did not come up - most likely the nodes are currently " - f"not available. 
Please check the cluster startup logs: " - f"{anyscale_session_url(project_id, session_id)}" - ) - - return session_id - - -def run_session_command( - sdk: AnyscaleSDK, - session_id: str, - cmd_to_run: str, - result_queue: multiprocessing.Queue, - env_vars: Dict[str, str], - state_str: str = "CMD_RUN", -) -> Tuple[str, int]: - full_cmd = " ".join(f"{k}={v}" for k, v in env_vars.items()) + " " + cmd_to_run - - logger.info(f"Running command in session {session_id}: \n" f"{full_cmd}") - session_url = anyscale_session_url( - project_id=GLOBAL_CONFIG["ANYSCALE_PROJECT"], session_id=session_id - ) - logger.info(f"URL: {session_url}") - logger.info(f"Link to session: {_format_link(session_url)}") - result_queue.put(State(state_str, time.time(), None)) - result = sdk.create_session_command( - dict(session_id=session_id, shell_command=full_cmd) - ) - - scd_id = result.result.id - return scd_id, result - - -def wait_for_session_command_to_complete( - create_session_command_result, - sdk: AnyscaleSDK, - scd_id: str, - stop_event: multiprocessing.Event, - state_str: str = "CMD_RUN", -): - result = create_session_command_result - completed = result.result.finished_at is not None - start_wait = time.time() - next_report = start_wait + REPORT_S - while not completed: - # Sleep 1 sec before next check. - time.sleep(1) - - result = exponential_backoff_retry( - lambda: sdk.get_session_command(session_command_id=scd_id), - retry_exceptions=Exception, - initial_retry_delay_s=10, - max_retries=3, - ) - completed = result.result.finished_at - - if state_str == "CMD_RUN": - _check_stop(stop_event, "command") - elif state_str == "CMD_PREPARE": - _check_stop(stop_event, "prepare_command") - - now = time.time() - if now > next_report: - logger.info( - f"... still waiting for command to finish " - f"({int(now - start_wait)} seconds) ..." 
- ) - next_report = next_report + REPORT_S - - status_code = result.result.status_code - runtime = time.time() - start_wait - - if status_code != 0: - if state_str == "CMD_RUN": - raise RuntimeError(f"Command returned non-success status: {status_code}") - elif state_str == "CMD_PREPARE": - raise PrepareCommandRuntimeError( - f"Prepare command returned non-success status: {status_code}" - ) - - return status_code, runtime - - -def get_command_logs( - session_controller: SessionController, scd_id: str, lines: int = 50 -): - result = exponential_backoff_retry( - lambda: session_controller.api_client.get_execution_logs_api_v2_session_commands_session_command_id_execution_logs_get( # noqa: E501 - session_command_id=scd_id, start_line=-1 * lines, end_line=0 - ), - retry_exceptions=Exception, - initial_retry_delay_s=10, - max_retries=3, - ) - - return result.result.lines - - -def get_remote_json_content( - temp_dir: str, - session_name: str, - remote_file: Optional[str], - session_controller: SessionController, -): - if not remote_file: - logger.warning("No remote file specified, returning empty dict") - return {} - local_target_file = os.path.join(temp_dir, ".tmp.json") - session_controller.pull( - session_name=session_name, source=remote_file, target=local_target_file - ) - with open(local_target_file, "rt") as f: - return json.load(f) - - -def get_local_json_content( - local_file: Optional[str], -): - if not local_file: - logger.warning("No local file specified, returning empty dict") - return {} - with open(local_file, "rt") as f: - return json.load(f) - - -def pull_artifacts_and_store_in_cloud( - temp_dir: str, - logs: str, - session_name: str, - test_name: str, - artifacts: Optional[Dict[Any, Any]], - session_controller: SessionController, -): - output_log_file = os.path.join(temp_dir, "output.log") - with open(output_log_file, "wt") as f: - f.write(logs) - - bucket = GLOBAL_CONFIG["RELEASE_AWS_BUCKET"] - location = f"{GLOBAL_CONFIG['RELEASE_AWS_LOCATION']}" f"/{session_name}/{test_name}" - saved_artifacts = {} - - s3_client = boto3.client("s3") - s3_client.upload_file(output_log_file, bucket, f"{location}/output.log") - saved_artifacts["output.log"] = f"s3://{bucket}/{location}/output.log" - - # Download artifacts - if artifacts: - for name, remote_file in artifacts.items(): - logger.info(f"Downloading artifact `{name}` from " f"{remote_file}") - local_target_file = os.path.join(temp_dir, name) - session_controller.pull( - session_name=session_name, source=remote_file, target=local_target_file - ) - - # Upload artifacts to s3 - s3_client.upload_file(local_target_file, bucket, f"{location}/{name}") - saved_artifacts[name] = f"s3://{bucket}/{location}/{name}" - - return saved_artifacts - - -def find_session_by_test_name( - sdk: AnyscaleSDK, - session_controller: SessionController, - temp_dir: str, - state_json: str, - project_id: str, - test_name: str, -) -> Optional[Tuple[str, str, Dict[Any, Any]]]: - paging_token = None - - while True: # Will break if paging_token is None after first search - result = sdk.search_sessions( - project_id=project_id, - sessions_query=dict( - name=dict(contains=test_name), - state_filter=["Running"], - paging=dict(count=20, paging_token=paging_token), - ), - ) - - for session in result.results: - logger.info(f"Found sessions {session.name}") - if not session.name.startswith(test_name): - continue - - try: - session_state = get_remote_json_content( - temp_dir=temp_dir, - session_name=session.name, - remote_file=state_json, - 
session_controller=session_controller, - ) - except Exception as exc: - raise RuntimeError( - f"Could not get remote json content " f"for session {session.name}" - ) from exc - - if session_state.get("test_name") == test_name: - return session.id, session.name, session_state - - session_token = result.metadata.next_paging_token - - if not session_token: - return None - - -def get_latest_running_command_id( - sdk: AnyscaleSDK, session_id: str -) -> Tuple[Optional[str], Optional[bool]]: - scd_id = None - paging_token = None - - success = None - - while not scd_id: - result = sdk.list_session_commands( - session_id=session_id, paging_token=paging_token - ) - - paging_token = result.metadata.next_paging_token - - for cmd in result.results: - if not scd_id: - scd_id = cmd.id - - completed = cmd.finished_at is not None - - if completed: - if success is None: - success = True - - success = success and cmd.status_code == 0 - - if not completed: - return cmd.id, None - - return scd_id, success or False - - -def run_test_config( - local_dir: str, - project_id: str, - test_name: str, - test_config: Dict[Any, Any], - commit_url: str, - session_name: str = None, - smoke_test: bool = False, - no_terminate: bool = False, - kick_off_only: bool = False, - check_progress: bool = False, - upload_artifacts: bool = True, - keep_results_dir: bool = False, - app_config_id_override: Optional[str] = None, -) -> Dict[Any, Any]: - """ - - Returns: - Dict with the following entries: - status (str): One of [finished, error, timeout] - command_link (str): Link to command (Anyscale web UI) - last_logs (str): Last logs (excerpt) to send to owner - artifacts (dict): Dict of artifacts - Key: Name - Value: S3 URL - """ - stop_event = multiprocessing.Event() - result_queue = multiprocessing.Queue() - - if not session_name: - session_name = f"{test_name}_{int(time.time())}" - - temp_dir = tempfile.mkdtemp() - - # Result and state files - results_json = test_config["run"].get("results", None) - if results_json is None: - results_json = "/tmp/release_test_out.json" - - state_json = test_config["run"].get("state", None) - if state_json is None: - state_json = "/tmp/release_test_state.json" - - env_vars = { - "RAY_ADDRESS": os.environ.get("RAY_ADDRESS", "auto"), - "TEST_OUTPUT_JSON": results_json, - "TEST_STATE_JSON": state_json, - "IS_SMOKE_TEST": "1" if smoke_test else "0", - } - - with open(os.path.join(local_dir, ".anyscale.yaml"), "wt") as f: - f.write(f"project_id: {project_id}") - os.chdir(local_dir) - - # Setup interface - # Unfortunately, there currently seems to be no great way to - # transfer files with the Anyscale SDK. - # So we use the session controller instead. - sdk = AnyscaleSDK( - auth_token=GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"], - host=GLOBAL_CONFIG["ANYSCALE_HOST"], - ) - - get_auth_api_client( - cli_token=GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"], - host=GLOBAL_CONFIG["ANYSCALE_HOST"], - ) - on_k8s = test_config["cluster"].get("compute_on_k8s") - if on_k8s: - session_controller = S3SyncSessionController(sdk, result_queue) - else: - session_controller = SessionController() - - cloud_id = test_config["cluster"].get("cloud_id", None) - cloud_name = test_config["cluster"].get("cloud_name", None) - if cloud_id and cloud_name: - raise RuntimeError( - f"You can't supply both a `cloud_name` ({cloud_name}) and a " - f"`cloud_id` ({cloud_id}) in the test cluster configuration. " - f"Please provide only one." 
- ) - elif cloud_name and not cloud_id: - cloud_id = find_cloud_by_name(sdk, cloud_name) - if not cloud_id: - raise RuntimeError(f"Couldn't find cloud with name `{cloud_name}`.") - else: - cloud_id = cloud_id or GLOBAL_CONFIG["ANYSCALE_CLOUD_ID"] - - # Overwrite global config so that `_load_config` sets the correct cloud - GLOBAL_CONFIG["ANYSCALE_CLOUD_ID"] = cloud_id - - cluster_config_rel_path = test_config["cluster"].get("cluster_config", None) - cluster_config = _load_config(local_dir, cluster_config_rel_path) - - app_config_rel_path = test_config["cluster"].get("app_config", None) - app_config = _load_config(local_dir, app_config_rel_path) - if app_config.get("env_vars") is None: - app_config["env_vars"] = {} - # A lot of staging tests share the same app config yaml, except the flags. - # `app_env_vars` in test config will help this one. - # Here we extend the env_vars to use the one specified in the test config. - if test_config.get("app_env_vars") is not None: - app_config["env_vars"].update(test_config["app_env_vars"]) - logger.info(f"Using app config:\n{app_config}") - - # Flags for redisless ray. - # TODO: remove them once done. - app_config["env_vars"]["MATCH_AUTOSCALER_AND_RAY_IMAGES"] = "1" - app_config["env_vars"]["RAY_bootstrap_with_gcs"] = "1" - app_config["env_vars"]["RAY_gcs_storage"] = "memory" - app_config["env_vars"]["RAY_USAGE_STATS_ENABLED"] = "1" - app_config["env_vars"]["RAY_USAGE_STATS_SOURCE"] = "nightly-tests" - - compute_tpl_rel_path = test_config["cluster"].get("compute_template", None) - compute_tpl = _load_config(local_dir, compute_tpl_rel_path) - - timeout = test_config["run"].get("timeout", 1800) - if "RELEASE_OVERRIDE_TIMEOUT" in os.environ: - previous_timeout = timeout - timeout = int(os.environ.get("RELEASE_OVERRIDE_TIMEOUT", str(timeout))) - logger.warning( - f"Release test timeout override: {timeout} " - f"(would have been {previous_timeout})" - ) - - # If a test is long running, timeout does not mean it failed - is_long_running = test_config["run"].get("long_running", False) - - build_id_override = None - if test_config["run"].get("use_connect"): - autosuspend_mins = test_config["run"].get("autosuspend_mins", 5) - assert not kick_off_only, "Unsupported for running with Anyscale connect." - if app_config_id_override is not None: - logger.info( - "Using connect and an app config override, waiting until " - "build finishes so we can fetch the app config in order to " - "install its pip packages locally." - ) - build_id_override = wait_for_build_or_raise(sdk, app_config_id_override) - response = sdk.get_cluster_environment_build(build_id_override) - app_config = response.result.config_json - install_app_config_packages(app_config) - install_matching_ray() - elif "autosuspend_mins" in test_config["run"]: - raise ValueError( - "'autosuspend_mins' is only supported if 'use_connect' is True." 
- ) - - # Add information to results dict - def _update_results(results: Dict): - if "last_update" in results: - results["last_update_diff"] = time.time() - results["last_update"] - if smoke_test: - results["smoke_test"] = True - - def _process_finished_command( - session_controller: SessionController, - scd_id: str, - results: Optional[Dict] = None, - runtime: int = None, - commit_url: str = None, - session_url: str = None, - ): - logger.info("Command finished successfully.") - if results_json: - results = results or get_remote_json_content( - temp_dir=temp_dir, - session_name=session_name, - remote_file=results_json, - session_controller=session_controller, - ) - else: - results = {"passed": 1} - - _update_results(results) - - if scd_id: - try: - logs = get_command_logs( - session_controller, scd_id, test_config.get("log_lines", 50) - ) - except Exception as e: - raise ReleaseTestInfraError( - f"Could not fetch command logs: {e}. This is an " - f"infrastructure error on the Anyscale side." - ) - else: - logs = "No command found to fetch logs for" - - if upload_artifacts: - saved_artifacts = pull_artifacts_and_store_in_cloud( - temp_dir=temp_dir, - logs=logs, # Also save logs in cloud - session_name=session_name, - test_name=test_name, - artifacts=test_config.get("artifacts", {}), - session_controller=session_controller, - ) - - logger.info("Fetched results and stored on the cloud. Returning.") - else: - saved_artifacts = {} - logger.info( - "Usually I would have fetched the results and " - "artifacts and stored them on S3." - ) - - # Add these metadata here to avoid changing SQL schema. - results["_runtime"] = runtime - results["_session_url"] = session_url - results["_commit_url"] = commit_url - results["_stable"] = test_config.get("stable", True) - result_queue.put( - State( - "END", - time.time(), - { - "status": "finished", - "last_logs": logs, - "results": results, - "artifacts": saved_artifacts, - }, - ) - ) - - # When running the test script in client mode, the finish command is a - # completed local process. - def _process_finished_client_command(returncode: int, logs: str): - if returncode != 0: - raise RuntimeError(f"Client returned non-success status: {returncode}") - if upload_artifacts: - saved_artifacts = pull_artifacts_and_store_in_cloud( - temp_dir=temp_dir, - logs=logs, # Also save logs in cloud - session_name=session_name, - test_name=test_name, - artifacts=None, - session_controller=None, - ) - logger.info("Stored results on the cloud. Returning.") - else: - saved_artifacts = {} - logger.info( - "Usually I would have fetched the results and " - "artifacts and stored them on S3." - ) - - if results_json: - results = get_local_json_content( - local_file=results_json, - ) - else: - results = { - "passed": int(returncode == 0), - } - - results["returncode"] = returncode - - _update_results(results) - - result_queue.put( - State( - "END", - time.time(), - { - "status": "finished", - "last_logs": logs, - "results": results, - "artifacts": saved_artifacts, - }, - ) - ) - - def _run(logger): - # These values will be set as the test runs. 
- session_url = None - runtime = None - anyscale.conf.CLI_TOKEN = GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"] - test_uses_ray_connect = test_config["run"].get("use_connect") - - session_id = None - scd_id = None - try: - # First, look for running sessions - session_id = search_running_session(sdk, project_id, session_name) - compute_tpl_name = None - app_config_id = app_config_id_override - app_config_name = None - build_id = build_id_override - if not session_id: - logger.info("No session found.") - # Start session - session_options = dict(name=session_name, project_id=project_id) - - if cluster_config is not None: - logging.info("Starting session with cluster config") - cluster_config_str = json.dumps(cluster_config) - session_options["cluster_config"] = cluster_config_str - session_options["cloud_id"] = cloud_id - session_options["uses_app_config"] = False - else: - logging.info("Starting session with app/compute config") - - # Find/create compute template - compute_tpl_id, compute_tpl_name = create_or_find_compute_template( - sdk, project_id, compute_tpl - ) - - url = _format_link(anyscale_compute_tpl_url(compute_tpl_id)) - - logger.info(f"Link to compute template: {url}") - - # Find/create app config - if app_config_id is None: - ( - app_config_id, - app_config_name, - ) = create_or_find_app_config(sdk, project_id, app_config) - else: - logger.info(f"Using override app config {app_config_id}") - app_config_name = sdk.get_app_config(app_config_id).result.name - if build_id is None: - # We might have already retrieved the build ID when - # installing app config packages locally if using - # connect, so only get the build ID if it's not set. - build_id = wait_for_build_or_raise(sdk, app_config_id) - - session_options["compute_template_id"] = compute_tpl_id - session_options["build_id"] = build_id - session_options["uses_app_config"] = True - - # Start session - session_id = create_and_wait_for_session( - sdk=sdk, - stop_event=stop_event, - session_name=session_name, - session_options=session_options, - project_id=project_id, - ) - - prepare_command = test_config["run"].get("prepare") - - # Write test state json - test_state_file = os.path.join(local_dir, "test_state.json") - with open(test_state_file, "wt") as f: - json.dump({"start_time": time.time(), "test_name": test_name}, f) - - on_k8s = test_config["cluster"].get("compute_on_k8s") - if prepare_command or not test_uses_ray_connect: - if test_uses_ray_connect: - logger.info( - "Found a prepare command, so pushing it " "to the session." 
- ) - # Rsync up - logger.info("Syncing files to session...") - session_controller.push( - session_name=session_name, - source=None, - target=None, - config=None, - all_nodes=False, - ) - - logger.info("Syncing test state to session...") - session_controller.push( - session_name=session_name, - source=test_state_file, - target=state_json, - config=None, - all_nodes=False, - ) - - session_url = anyscale_session_url( - project_id=GLOBAL_CONFIG["ANYSCALE_PROJECT"], session_id=session_id - ) - _check_stop(stop_event, "file_sync") - - # Optionally run preparation command - if prepare_command: - logger.info(f"Running preparation command: {prepare_command}") - if on_k8s: - cid = global_command_runner.run_command( - session_name, prepare_command, env_vars - ) - status_code, _ = global_command_runner.wait_command(cid) - if status_code != 0: - raise PrepareCommandRuntimeError() - else: - scd_id, result = run_session_command( - sdk=sdk, - session_id=session_id, - cmd_to_run=prepare_command, - result_queue=result_queue, - env_vars=env_vars, - state_str="CMD_PREPARE", - ) - _, _ = wait_for_session_command_to_complete( - result, - sdk=sdk, - scd_id=scd_id, - stop_event=stop_event, - state_str="CMD_PREPARE", - ) - - if test_uses_ray_connect: - script_args = test_config["run"].get("args", []) - if smoke_test: - script_args += ["--smoke-test"] - min_workers = 0 - for node_type in compute_tpl["worker_node_types"]: - min_workers += node_type["min_workers"] - # Build completed, use job timeout - result_queue.put(State("CMD_RUN", time.time(), None)) - returncode, logs = run_job( - cluster_name=session_name, - compute_tpl_name=compute_tpl_name, - cluster_env_name=app_config_name, - job_name=session_name, - min_workers=min_workers, - script=test_config["run"]["script"], - script_args=script_args, - env_vars=env_vars, - autosuspend=autosuspend_mins, - ) - _process_finished_client_command(returncode, logs) - return - - # Run release test command - cmd_to_run = test_config["run"]["script"] + " " - - args = test_config["run"].get("args", []) - if args: - cmd_to_run += " ".join(args) + " " - - if smoke_test: - cmd_to_run += " --smoke-test" - - if on_k8s: - cmd_id = global_command_runner.run_command( - session_name, cmd_to_run, env_vars=env_vars - ) - else: - scd_id, result = run_session_command( - sdk=sdk, - session_id=session_id, - cmd_to_run=cmd_to_run, - result_queue=result_queue, - env_vars=env_vars, - state_str="CMD_RUN", - ) - - if not kick_off_only: - if on_k8s: - retcode, runtime = global_command_runner.wait_command(cmd_id) - if retcode != 0: - raise RuntimeError("Command errored") - _process_finished_command( - session_controller=session_controller, - scd_id="", - runtime=runtime, - session_url=session_url, - commit_url=commit_url, - ) - else: - _, runtime = wait_for_session_command_to_complete( - result, - sdk=sdk, - scd_id=scd_id, - stop_event=stop_event, - state_str="CMD_RUN", - ) - _process_finished_command( - session_controller=session_controller, - scd_id=scd_id, - runtime=runtime, - session_url=session_url, - commit_url=commit_url, - ) - else: - result_queue.put( - State("END", time.time(), {"status": "kickoff", "last_logs": ""}) - ) - - except (ReleaseTestTimeoutError, Exception) as e: - logger.error(e, exc_info=True) - - logs = str(e) - if scd_id is not None: - try: - logs = ( - logs - + "; Command logs:" - + get_command_logs( - session_controller, scd_id, test_config.get("log_lines", 50) - ) - ) - except Exception as e2: - logger.error(e2, exc_info=True) - - # Long running tests are "finished" 
successfully when - # timed out - if isinstance(e, ReleaseTestTimeoutError) and is_long_running: - _process_finished_command( - session_controller=session_controller, scd_id=scd_id - ) - else: - runtime = None - if isinstance(e, CommandTimeoutError): - error_type = "timeout" - runtime = 0 - exit_code = ExitCode.COMMAND_TIMEOUT - elif isinstance(e, PrepareCommandTimeoutError): - error_type = "infra_timeout" - runtime = None - exit_code = ExitCode.PREPARE_TIMEOUT - elif isinstance(e, FileSyncTimeoutError): - error_type = "infra_timeout" - runtime = None - exit_code = ExitCode.FILESYNC_TIMEOUT - elif isinstance(e, SessionTimeoutError): - error_type = "infra_timeout" - runtime = None - exit_code = ExitCode.SESSION_TIMEOUT - elif isinstance(e, PrepareCommandRuntimeError): - error_type = "infra_timeout" - runtime = None - exit_code = ExitCode.PREPARE_ERROR - elif isinstance(e, AppConfigBuildFailure): - error_type = "infra_timeout" - runtime = None - exit_code = ExitCode.APPCONFIG_BUILD_ERROR - elif isinstance(e, ReleaseTestInfraError): - error_type = "infra_error" - exit_code = ExitCode.INFRA_ERROR - elif isinstance(e, RuntimeError): - error_type = "runtime_error" - runtime = 0 - exit_code = ExitCode.RUNTIME_ERROR - else: - error_type = "unknown timeout" - runtime = None - exit_code = ExitCode.UNKNOWN - - # Add these metadata here to avoid changing SQL schema. - results = {} - results["_runtime"] = runtime - results["_session_url"] = session_url - results["_commit_url"] = commit_url - results["_stable"] = test_config.get("stable", True) - result_queue.put( - State( - "END", - time.time(), - { - "status": error_type, - "last_logs": logs, - "results": results, - "exit_code": exit_code.value, - }, - ) - ) - finally: - if no_terminate: - logger.warning( - "`no_terminate` is set to True, so the session will " - "*not* be terminated!" - ) - else: - _cleanup_session(sdk, session_id) - - def _check_progress(logger): - anyscale.conf.CLI_TOKEN = GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"] - - should_terminate = False - session_id = None - scd_id = None - try: - existing_session = find_session_by_test_name( - sdk=sdk, - session_controller=session_controller, - temp_dir=temp_dir, - state_json=state_json, - project_id=project_id, - test_name=test_name, - ) - - if existing_session is None: - logger.info(f"Found no existing session for {test_name}") - result_queue.put( - State("END", time.time(), {"status": "nosession", "last_logs": ""}) - ) - return - - session_id, session_name, session_state = existing_session - - logger.info(f"Found existing session for {test_name}: " f"{session_name}") - - scd_id, success = get_latest_running_command_id( - sdk=sdk, session_id=session_id - ) - - latest_result = get_remote_json_content( - temp_dir=temp_dir, - session_name=session_name, - remote_file=results_json, - session_controller=session_controller, - ) - - # Fetch result json and check if it has been updated recently - result_time_key = test_config["run"].get("time_key", None) - maximum_update_delay = test_config["run"].get("max_update_delay", None) - - if result_time_key and maximum_update_delay: - last_update = latest_result.get(result_time_key, None) - - if not last_update: - result_queue.put( - State( - "END", - time.time(), - { - "status": "error", - "last_logs": f"Test did not store " - f"{result_time_key} in the " - f"results json.", - }, - ) - ) - return - - delay = time.time() - last_update - logger.info( - f"Last update was at {last_update:.2f}. 
" - f"This was {delay:.2f} seconds ago " - f"(maximum allowed: {maximum_update_delay})" - ) - - if delay > maximum_update_delay: - raise RuntimeError( - f"Test did not update the results json within " - f"the last {maximum_update_delay} seconds." - ) - - if time.time() - session_state["start_time"] > timeout: - # Long running test reached timeout - logger.info(f"Test command reached timeout after {timeout} seconds") - _process_finished_command( - session_controller=session_controller, - scd_id=scd_id, - results=latest_result, - ) - should_terminate = True - - elif success: - logger.info("All commands finished.") - _process_finished_command( - session_controller=session_controller, - scd_id=scd_id, - results=latest_result, - ) - should_terminate = True - - else: - rest_time = timeout - time.time() + session_state["start_time"] - logger.info( - f"Test command should continue running " f"for {rest_time} seconds" - ) - result_queue.put( - State( - "END", - time.time(), - {"status": "kickoff", "last_logs": "Test is still running"}, - ) - ) - - except Exception as e: - logger.error(e, exc_info=True) - - logs = str(e) - if scd_id is not None: - try: - logs = get_command_logs( - session_controller, scd_id, test_config.get("log_lines", 50) - ) - logs += f"\n{str(e)}" - except Exception as e2: - logger.error(e2, exc_info=True) - - result_queue.put( - State("END", time.time(), {"status": "error", "last_logs": logs}) - ) - should_terminate = True - finally: - if should_terminate: - logger.warning("Terminating session") - _cleanup_session(sdk, session_id) - - if not check_progress: - process = multiprocessing.Process(target=_run, args=(logger,)) - else: - process = multiprocessing.Process(target=_check_progress, args=(logger,)) - - build_timeout = test_config["run"].get("build_timeout", 1800) - prepare_timeout = test_config["run"].get("prepare_timeout", timeout) - - project_url = anyscale_project_url(project_id=GLOBAL_CONFIG["ANYSCALE_PROJECT"]) - logger.info(f"Link to project: {_format_link(project_url)}") - - msg = f"This will now run test {test_name}." - if smoke_test: - msg += " This is a smoke test." - if is_long_running: - msg += " This is a long running test." - logger.info(msg) - - logger.info( - f"Starting process with timeout {timeout} " - f"(prepare timeout {prepare_timeout}, " - f"build timeout {build_timeout})" - ) - process.start() - - # The timeout time will be updated after the build finished - # Build = App config + compute template build and session start - timeout_time = time.time() + build_timeout - - result = {} - while process.is_alive(): - try: - state: State = result_queue.get(timeout=1) - except (Empty, TimeoutError): - if time.time() > timeout_time: - stop_event.set() - logger.warning("Process timed out.") - - if not is_long_running: - logger.warning("Terminating process in 10 seconds.") - time.sleep(10) - logger.warning("Terminating process now.") - process.terminate() - else: - logger.info( - "Process is long running. Give 2 minutes to " - "fetch result and terminate." 
- ) - start_terminate = time.time() - while time.time() < start_terminate + 120 and process.is_alive(): - time.sleep(1) - if process.is_alive(): - logger.warning("Terminating forcefully now.") - process.terminate() - else: - logger.info("Long running results collected.") - break - continue - - if not isinstance(state, State): - raise RuntimeError(f"Expected `State` object, got {result}") - - if state.state == "CMD_PREPARE": - # Reset timeout after build finished - timeout_time = state.timestamp + prepare_timeout - - if state.state == "CMD_RUN": - # Reset timeout after prepare command or build finished - timeout_time = state.timestamp + timeout - - elif state.state == "END": - result = state.data - break - - while not result_queue.empty(): - state = result_queue.get_nowait() - result = state.data - - logger.info("Final check if everything worked.") - try: - result.setdefault("status", "error (status not found)") - except (TimeoutError, Empty): - result = {"status": "timeout", "last_logs": "Test timed out."} - - logger.info(f"Final results: {result}") - - log_results_and_artifacts(result) - - if not keep_results_dir: - logger.info(f"Removing results dir {temp_dir}") - shutil.rmtree(temp_dir) - else: - # Write results.json - with open(os.path.join(temp_dir, "results.json"), "wt") as fp: - json.dump(result, fp) - - out_dir = os.path.expanduser(GLOBAL_CONFIG["RELEASE_RESULTS_DIR"]) - - logger.info( - f"Moving results dir {temp_dir} to persistent location " f"{out_dir}" - ) - - try: - shutil.rmtree(out_dir) - except Exception: - logger.exception( - f"Ran into error when clearing the destination dir: {out_dir}" - ) - - try: - # Use distutils.dir_util.copy_tree() instead of shutil.cptree(), - # which allows existing output directory. - from distutils.dir_util import copy_tree - - copy_tree(temp_dir, out_dir) - except Exception: - logger.exception( - "Ran into error when copying results dir to persistent " - f"location: {out_dir}" - ) - - logger.info(f"Dir contents: {os.listdir(out_dir)}") - - return result - - -def run_test( - test_config_file: str, - test_name: str, - project_id: str, - commit_url: str, - category: str = "unspecified", - smoke_test: bool = False, - no_terminate: bool = False, - kick_off_only: bool = False, - check_progress: bool = False, - report: bool = True, - keep_results_dir: bool = False, - session_name: Optional[str] = None, - app_config_id_override=None, -) -> Dict[str, Any]: - with open(test_config_file, "rt") as f: - test_configs = yaml.safe_load(f) - - test_config_dict = {} - for test_config in test_configs: - name = test_config.pop("name") - test_config_dict[name] = test_config - - if test_name not in test_config_dict: - raise ValueError( - f"Test with name `{test_name}` not found in test config file " - f"at `{test_config_file}`." - ) - - test_config = test_config_dict[test_name] - - if smoke_test and "smoke_test" in test_config: - smoke_test_config = test_config.pop("smoke_test") - test_config = _deep_update(test_config, smoke_test_config) - - local_dir = os.path.dirname(test_config_file) - if "local_dir" in test_config: - # local_dir is relative to test_config_file - local_dir = os.path.join(local_dir, test_config["local_dir"]) - - if test_config["run"].get("use_connect"): - assert not kick_off_only, ( - "--kick-off-only is unsupported when running with " "Anyscale connect." - ) - assert ( - not check_progress - ), "--check is unsupported when running with Anyscale connect." 
- if test_config.get("artifacts", {}): - logger.error( - "Saving artifacts are not yet supported when running with " - "Anyscale connect." - ) - - # Perform necessary driver side setup. - driver_setup_script = test_config.get("driver_setup", None) - if driver_setup_script: - run_bash_script(local_dir, driver_setup_script) - logger.info(test_config) - team = test_config.get("team", "unspecified").strip(" ").lower() - # When running local test, this validates the team name. - # If the team name is not specified, they will be recorded as "unspecified" - if not report and team not in VALID_TEAMS: - logger.warning( - f"Incorrect team name {team} has given." - "Please specify team under the name field in the test config. " - "For example, within nightly_tests.yaml,\n" - "\tname: test_xxx\n" - f"\tteam: {'|'.join(VALID_TEAMS)}\n" - "\tcluster:..." - ) - - result = run_test_config( - local_dir, - project_id, - test_name, - test_config, - commit_url, - session_name=session_name, - smoke_test=smoke_test, - no_terminate=no_terminate, - kick_off_only=kick_off_only, - check_progress=check_progress, - upload_artifacts=report, - keep_results_dir=keep_results_dir, - app_config_id_override=app_config_id_override, - ) - - status = result.get("status", "invalid") - - if kick_off_only: - if status != "kickoff": - raise RuntimeError("Error kicking off test.") - - logger.info( - "Kicked off test. It's now up to the `--check` " - "part of the script to track its process." - ) - return {} - else: - # `--check` or no kick off only - - if status == "nosession": - logger.info( - f"No running session found for test {test_name}, so " - f"assuming everything is fine." - ) - return {} - - if status == "kickoff": - logger.info(f"Test {test_name} is still running.") - return {} - - last_logs = result.get("last_logs", "No logs.") - - test_suite = os.path.basename(test_config_file).replace(".yaml", "") - - report_kwargs = dict( - test_suite=test_suite, - test_name=test_name, - status=status, - last_logs=last_logs, - results=result.get("results", {}), - artifacts=result.get("artifacts", {}), - category=category, - team=team, - ) - - if not has_errored(result): - # Check if result are met if test succeeded - alert = maybe_get_alert_for_result(report_kwargs) - - if alert: - # If we get an alert, the test failed. - logger.error( - f"Alert has been raised for " - f"{test_suite}/{test_name} " - f"({category}): {alert}" - ) - result["status"] = "error (alert raised)" - report_kwargs["status"] = "error (alert raised)" - - # For printing/reporting to the database - report_kwargs["last_logs"] = alert - last_logs = alert - else: - logger.info( - f"No alert raised for test " - f"{test_suite}/{test_name} " - f"({category}) - the test successfully passed!" - ) - - if report: - try: - report_result(**report_kwargs) - except Exception as e: - # On database error the test should still pass - # Todo: flag somewhere else? - logger.exception(f"Error persisting results to database: {e}") - else: - logger.info( - f"Usually I would now report the following results:\n" - f"{report_kwargs}" - ) - - if has_errored(result): - # If the script terminates due to an uncaught error, it - # will return exit code 1, so we use 2 per default to - # catch these cases. 
- exit_code = result.get("exit_code", ExitCode.UNSPECIFIED.value) - logger.error(last_logs) - logger.info(f"Exiting with exit code {exit_code}") - sys.exit(exit_code) - - return report_kwargs - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument( - "--test-config", type=str, required=True, help="Test config file" - ) - parser.add_argument("--test-name", type=str, help="Test name in config") - parser.add_argument( - "--ray-wheels", required=False, type=str, help="URL to ray wheels" - ) - parser.add_argument( - "--no-terminate", - action="store_true", - default=False, - help="Don't terminate session after failure", - ) - parser.add_argument( - "--report", - action="store_true", - default=False, - help="Whether to report results and upload to S3", - ) - parser.add_argument( - "--kick-off-only", - action="store_true", - default=False, - help="Kick off only (don't wait for command to finish)", - ) - parser.add_argument( - "--check", - action="store_true", - default=False, - help="Check (long running) status", - ) - parser.add_argument( - "--keep-results-dir", - action="store_true", - default=False, - help="Keep results in directory (named RELEASE_RESULTS_DIR), e.g. " - "for Buildkite artifact upload.", - ) - parser.add_argument( - "--category", - type=str, - default="unspecified", - help="Category name, e.g. `release-1.3.0` (will be saved in database)", - ) - parser.add_argument( - "--smoke-test", action="store_true", help="Finish quickly for testing" - ) - parser.add_argument( - "--session-name", - required=False, - type=str, - help="Name of the session to run this test.", - ) - parser.add_argument( - "--app-config-id-override", - required=False, - type=str, - help=("An app config ID, which will override the test config app " "config."), - ) - args, _ = parser.parse_known_args() - - if not GLOBAL_CONFIG["ANYSCALE_PROJECT"]: - raise RuntimeError("You have to set the ANYSCALE_PROJECT environment variable!") - - ray_wheels = args.ray_wheels or os.environ.get("RAY_WHEELS", "") - - maybe_fetch_api_token() - if ray_wheels: - logger.info(f"Using Ray wheels provided from URL/commit: " f"{ray_wheels}") - url = commit_or_url(str(ray_wheels)) - logger.info(f"Resolved url link is: {url}") - # Overwrite with actual URL - os.environ["RAY_WHEELS"] = url - elif not args.check: - url = find_ray_wheels( - GLOBAL_CONFIG["RAY_REPO"], - GLOBAL_CONFIG["RAY_BRANCH"], - GLOBAL_CONFIG["RAY_VERSION"], - ) - if not url: - raise RuntimeError( - f"Could not find wheels for " - f"Ray {GLOBAL_CONFIG['RAY_VERSION']}, " - f"branch {GLOBAL_CONFIG['RAY_BRANCH']}" - ) - - # RAY_COMMIT is set by commit_or_url and find_ray_wheels - populate_wheels_sanity_check(os.environ.get("RAY_COMMIT", "")) - - test_config_file = os.path.abspath(os.path.expanduser(args.test_config)) - - # Override it from the global variable. 
- report = GLOBAL_CONFIG["REPORT_RESULT"] - if report.lower() == "1" or report.lower() == "true": - report = True - else: - report = args.report - - run_test( - test_config_file=test_config_file, - test_name=args.test_name, - project_id=GLOBAL_CONFIG["ANYSCALE_PROJECT"], - commit_url=url, - category=args.category, - smoke_test=args.smoke_test, - no_terminate=args.no_terminate or args.kick_off_only, - kick_off_only=args.kick_off_only, - check_progress=args.check, - report=report, - session_name=args.session_name, - keep_results_dir=args.keep_results_dir, - app_config_id_override=args.app_config_id_override, - ) diff --git a/release/horovod_tests/horovod_tests.yaml b/release/horovod_tests/horovod_tests.yaml deleted file mode 100644 index ce0abe719..000000000 --- a/release/horovod_tests/horovod_tests.yaml +++ /dev/null @@ -1,15 +0,0 @@ -- name: horovod_test - team: ml - cluster: - app_config: app_config_master.yaml - compute_template: compute_tpl.yaml - - run: - timeout: 36000 - prepare: python wait_cluster.py 3 600 - script: python workloads/horovod_tune_test.py - long_running: True - - smoke_test: - run: - timeout: 1800 diff --git a/release/horovod_tests/wait_cluster.py b/release/horovod_tests/wait_cluster.py deleted file mode 100644 index c02330db2..000000000 --- a/release/horovod_tests/wait_cluster.py +++ /dev/null @@ -1,53 +0,0 @@ -import argparse -import time - -import ray - -ray.init(address="auto") - -parser = argparse.ArgumentParser() -parser.add_argument( - "num_nodes", type=int, help="Wait for this number of nodes (includes head)" -) - -parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds") - -parser.add_argument( - "--feedback_interval_s", - type=int, - default=10, - help="Wait for this number of seconds", -) - -args = parser.parse_args() - -curr_nodes = 0 -start = time.time() -next_feedback = start -max_time = start + args.max_time_s -while not curr_nodes >= args.num_nodes: - now = time.time() - - if now >= max_time: - raise RuntimeError( - f"Maximum wait time reached, but only " - f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting." 
- ) - - if now >= next_feedback: - passed = now - start - print( - f"Waiting for more nodes to come up: " - f"{curr_nodes}/{args.num_nodes} " - f"({passed:.0f} seconds passed)" - ) - next_feedback = now + args.feedback_interval_s - - time.sleep(5) - curr_nodes = len(ray.nodes()) - -passed = time.time() - start -print( - f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after " - f"{passed:.0f} seconds" -) diff --git a/release/lightgbm_tests/lightgbm_tests.yaml b/release/lightgbm_tests/lightgbm_tests.yaml deleted file mode 100644 index 07aa9e5cf..000000000 --- a/release/lightgbm_tests/lightgbm_tests.yaml +++ /dev/null @@ -1,92 +0,0 @@ -- name: train_small - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_small.yaml - - run: - use_connect: True - autosuspend_mins: 10 - timeout: 600 - prepare: python wait_cluster.py 4 600 - script: python workloads/train_small.py - -- name: train_moderate - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_moderate.yaml - - run: - timeout: 600 - prepare: python wait_cluster.py 32 600 - script: python workloads/train_moderate.py - -- name: train_gpu - team: ml - cluster: - app_config: app_config_gpu.yaml - compute_template: tpl_gpu_small.yaml - - run: - timeout: 600 - prepare: python wait_cluster.py 5 600 - script: python workloads/train_gpu.py - -- name: distributed_api_test - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_small.yaml - results: - - run: - timeout: 600 - prepare: python wait_cluster.py 4 600 - script: python workloads/distributed_api_test.py - results: "" - -- name: ft_small_non_elastic - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_small.yaml - - run: - timeout: 900 - prepare: python wait_cluster.py 4 600 - script: python workloads/ft_small_non_elastic.py - results: "" - -- name: tune_small - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_small.yaml - - run: - timeout: 600 - prepare: python wait_cluster.py 4 600 - script: python workloads/tune_small.py - -- name: tune_32x4 - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_moderate.yaml - - run: - timeout: 900 - prepare: python wait_cluster.py 32 600 - script: python workloads/tune_32x4.py - -- name: tune_4x32 - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_moderate.yaml - - run: - timeout: 900 - prepare: python wait_cluster.py 32 600 - script: python workloads/tune_4x32.py diff --git a/release/lightgbm_tests/wait_cluster.py b/release/lightgbm_tests/wait_cluster.py deleted file mode 100644 index c02330db2..000000000 --- a/release/lightgbm_tests/wait_cluster.py +++ /dev/null @@ -1,53 +0,0 @@ -import argparse -import time - -import ray - -ray.init(address="auto") - -parser = argparse.ArgumentParser() -parser.add_argument( - "num_nodes", type=int, help="Wait for this number of nodes (includes head)" -) - -parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds") - -parser.add_argument( - "--feedback_interval_s", - type=int, - default=10, - help="Wait for this number of seconds", -) - -args = parser.parse_args() - -curr_nodes = 0 -start = time.time() -next_feedback = start -max_time = start + args.max_time_s -while not curr_nodes >= args.num_nodes: - now = time.time() - - if now >= max_time: - raise RuntimeError( - f"Maximum wait time reached, but only " - f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting." 
- ) - - if now >= next_feedback: - passed = now - start - print( - f"Waiting for more nodes to come up: " - f"{curr_nodes}/{args.num_nodes} " - f"({passed:.0f} seconds passed)" - ) - next_feedback = now + args.feedback_interval_s - - time.sleep(5) - curr_nodes = len(ray.nodes()) - -passed = time.time() - start -print( - f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after " - f"{passed:.0f} seconds" -) diff --git a/release/long_running_distributed_tests/long_running_distributed.yaml b/release/long_running_distributed_tests/long_running_distributed.yaml deleted file mode 100644 index 189ffd3f9..000000000 --- a/release/long_running_distributed_tests/long_running_distributed.yaml +++ /dev/null @@ -1,13 +0,0 @@ -- name: pytorch_pbt_failure - team: ml - cluster: - app_config: app_config.yaml - compute_template: compute_tpl.yaml - - run: - timeout: 86400 - script: python workloads/pytorch_pbt_failure.py - long_running: True - - smoke_test: - timeout: 3600 diff --git a/release/long_running_tests/long_running_tests.yaml b/release/long_running_tests/long_running_tests.yaml deleted file mode 100644 index 05d4245d0..000000000 --- a/release/long_running_tests/long_running_tests.yaml +++ /dev/null @@ -1,196 +0,0 @@ -- name: actor_deaths - team: core - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_1.yaml - - run: - timeout: 86400 - prepare: ray stop - script: python workloads/actor_deaths.py - long_running: True - - smoke_test: - run: - timeout: 3600 - -- name: apex - team: ml - cluster: - app_config: ../rllib_tests/app_config.yaml - compute_template: tpl_cpu_3.yaml - - run: - timeout: 86400 - prepare: python wait_cluster.py 3 600 - script: python workloads/apex.py - long_running: True - - smoke_test: - run: - timeout: 3600 - - -- name: impala - team: ml - cluster: - app_config: app_config_np.yaml - compute_template: tpl_cpu_1_large.yaml - - run: - timeout: 86400 - script: python workloads/impala.py - long_running: True - - smoke_test: - run: - timeout: 3600 - -- name: many_actor_tasks - team: core - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_1.yaml - - run: - timeout: 86400 - prepare: ray stop - script: python workloads/many_actor_tasks.py - long_running: True - - smoke_test: - run: - timeout: 3600 - - -- name: many_drivers - team: core - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_1.yaml - - run: - timeout: 86400 - prepare: ray stop - script: python workloads/many_drivers.py --iteration-num=4000 - long_running: True - - smoke_test: - run: - timeout: 3600 - - -- name: many_ppo - team: ml - cluster: - app_config: ../rllib_tests/app_config.yaml - compute_template: many_ppo.yaml - - run: - timeout: 86400 - prepare: python wait_cluster.py 1 600 - script: python workloads/many_ppo.py - long_running: True - - smoke_test: - run: - timeout: 3600 - -- name: many_tasks - team: core - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_1.yaml - - run: - timeout: 86400 - prepare: ray stop - script: python workloads/many_tasks.py - long_running: True - - smoke_test: - run: - timeout: 3600 - -- name: many_tasks_serialized_ids - team: core - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_1.yaml - - run: - timeout: 86400 - prepare: ray stop - script: python workloads/many_tasks_serialized_ids.py - long_running: True - - smoke_test: - run: - timeout: 3600 - - -- name: node_failures - team: core - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_1.yaml - - run: - timeout: 86400 - prepare: 
ray stop - script: python workloads/node_failures.py - long_running: True - - smoke_test: - run: - timeout: 3600 - -- name: pbt - team: ml - cluster: - app_config: ../rllib_tests/app_config.yaml - compute_template: tpl_cpu_1.yaml - - run: - timeout: 86400 - prepare: ray stop - script: python workloads/pbt.py - long_running: True - - smoke_test: - run: - timeout: 3600 - -- name: serve - team: serve - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_1.yaml - - run: - timeout: 86400 - prepare: ray stop - script: python workloads/serve.py - long_running: True - - smoke_test: - run: - timeout: 3600 - -- name: serve_failure - team: serve - cluster: - app_config: app_config.yaml - compute_template: tpl_cpu_1.yaml - - run: - timeout: 86400 - prepare: ray stop - script: python workloads/serve_failure.py - long_running: True - - smoke_test: - run: - timeout: 600 - - stable: False diff --git a/release/long_running_tests/wait_cluster.py b/release/long_running_tests/wait_cluster.py deleted file mode 100644 index c02330db2..000000000 --- a/release/long_running_tests/wait_cluster.py +++ /dev/null @@ -1,53 +0,0 @@ -import argparse -import time - -import ray - -ray.init(address="auto") - -parser = argparse.ArgumentParser() -parser.add_argument( - "num_nodes", type=int, help="Wait for this number of nodes (includes head)" -) - -parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds") - -parser.add_argument( - "--feedback_interval_s", - type=int, - default=10, - help="Wait for this number of seconds", -) - -args = parser.parse_args() - -curr_nodes = 0 -start = time.time() -next_feedback = start -max_time = start + args.max_time_s -while not curr_nodes >= args.num_nodes: - now = time.time() - - if now >= max_time: - raise RuntimeError( - f"Maximum wait time reached, but only " - f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting." 
- ) - - if now >= next_feedback: - passed = now - start - print( - f"Waiting for more nodes to come up: " - f"{curr_nodes}/{args.num_nodes} " - f"({passed:.0f} seconds passed)" - ) - next_feedback = now + args.feedback_interval_s - - time.sleep(5) - curr_nodes = len(ray.nodes()) - -passed = time.time() - start -print( - f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after " - f"{passed:.0f} seconds" -) diff --git a/release/microbenchmark/microbenchmark.yaml b/release/microbenchmark/microbenchmark.yaml deleted file mode 100644 index 7b1c6c336..000000000 --- a/release/microbenchmark/microbenchmark.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# - name: microbenchmark -# team: core -# cluster: -# app_config: app_config.yaml -# compute_template: tpl_64.yaml - -# run: -# timeout: 1800 -# script: OMP_NUM_THREADS=64 RAY_ADDRESS= python run_microbenchmark.py diff --git a/release/ml_user_tests/ml_user_tests.yaml b/release/ml_user_tests/ml_user_tests.yaml deleted file mode 100644 index 8c6a8162e..000000000 --- a/release/ml_user_tests/ml_user_tests.yaml +++ /dev/null @@ -1,124 +0,0 @@ -- name: horovod_user_test_latest - team: ml - cluster: - app_config: horovod/app_config.yaml - compute_template: horovod/compute_tpl.yaml - - - driver_setup: horovod/driver_setup_latest.sh - - run: - use_connect: True - autosuspend_mins: 10 - timeout: 1200 - script: python horovod/horovod_user_test.py - -- name: horovod_user_test_master - team: ml - cluster: - app_config: ../horovod_tests/app_config_master.yaml - compute_template: horovod/compute_tpl.yaml - - driver_setup: horovod/driver_setup_master.sh - - run: - use_connect: True - autosuspend_mins: 10 - timeout: 1200 - script: python horovod/horovod_user_test.py - - -- name: train_tensorflow_mnist_test - team: ml - cluster: - app_config: train/app_config.yaml - compute_template: train/compute_tpl.yaml - - driver_setup: train/driver_setup.sh - - run: - use_connect: True - timeout: 36000 - script: python train/train_tensorflow_mnist_test.py - -- name: train_torch_linear_test - team: ml - cluster: - app_config: train/app_config.yaml - compute_template: train/compute_tpl.yaml - - driver_setup: train/driver_setup.sh - - run: - use_connect: True - timeout: 36000 - script: python train/train_torch_linear_test.py - - -- name: xgboost_gpu_connect_latest - team: ml - cluster: - app_config: xgboost/app_config_gpu.yaml - compute_template: xgboost/tpl_gpu_small_scaling.yaml - - run: - use_connect: True - timeout: 1200 - script: python xgboost/train_gpu_connect.py - -- name: xgboost_gpu_connect_master - team: ml - cluster: - app_config: xgboost/app_config_gpu_master.yaml - compute_template: xgboost/tpl_gpu_small_scaling.yaml - - run: - use_connect: True - timeout: 1200 - script: python xgboost/train_gpu_connect.py - -- name: ray_lightning_user_test_latest - team: ml - cluster: - app_config: ray-lightning/app_config.yaml - compute_template: ray-lightning/compute_tpl.yaml - - driver_setup: ray-lightning/driver_setup.sh - - run: - use_connect: True - autosuspend_mins: 10 - timeout: 1200 - script: python ray-lightning/ray_lightning_user_test.py - - -- name: ray_lightning_user_test_master - team: ml - cluster: - app_config: ray-lightning/app_config_master.yaml - compute_template: ray-lightning/compute_tpl.yaml - - - driver_setup: ray-lightning/driver_setup.sh - - run: - use_connect: True - autosuspend_mins: 10 - timeout: 1200 - script: python ray-lightning/ray_lightning_user_test.py - - -- name: tune_rllib_connect_test - team: ml - cluster: - app_config: 
../rllib_tests/app_config.yaml - compute_template: tune_rllib/compute_tpl.yaml - - - driver_setup: tune_rllib/driver_setup.sh - - run: - use_connect: True - autosuspend_mins: 10 - timeout: 1200 - script: python tune_rllib/run_connect_tests.py \ No newline at end of file diff --git a/release/nightly_tests/chaos_test.yaml b/release/nightly_tests/chaos_test.yaml deleted file mode 100644 index f24cdcf16..000000000 --- a/release/nightly_tests/chaos_test.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# -# Chaos tests. -# - -# Run the test that invokes many tasks without object store usage. -- name: chaos_many_tasks_no_object_store - team: core - cluster: - app_config: chaos_test/app_config.yaml - compute_template: chaos_test/compute_template.yaml - - run: - timeout: 3600 - prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start - script: python chaos_test/test_chaos_basic.py --workload=tasks - -- name: chaos_many_actors - team: core - cluster: - app_config: chaos_test/app_config.yaml - compute_template: chaos_test/compute_template.yaml - - run: - timeout: 3600 - prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start - script: python chaos_test/test_chaos_basic.py --workload=actors - -- name: chaos_dask_on_ray_large_scale_test_no_spilling - team: core - cluster: - app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml - compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml - - run: - timeout: 7200 - # Total run time without failures is about 300-400s. - prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100 - script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray - -# Test large scale dask on ray test with spilling. -- name: chaos_dask_on_ray_large_scale_test_spilling - team: core - cluster: - app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml - compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml - - run: - timeout: 7200 - # Total run time without failures is about 300-400s. 
- prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100 - script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray - -- name: chaos_pipelined_ingestion_1500_gb_15_windows - team: core - cluster: - app_config: dataset/pipelined_ingestion_app.yaml - compute_template: dataset/pipelined_ingestion_compute.yaml - - run: - timeout: 7200 - prepare: python wait_cluster.py 21 2400; python setup_chaos.py --node-kill-interval 300 - script: python dataset/pipelined_training.py --epochs 1 --num-windows 15 --num-files 915 --debug - - stable: false diff --git a/release/nightly_tests/dataset/dataset_test.yaml b/release/nightly_tests/dataset/dataset_test.yaml deleted file mode 100644 index 8ac02a36a..000000000 --- a/release/nightly_tests/dataset/dataset_test.yaml +++ /dev/null @@ -1,95 +0,0 @@ -- name: inference - team: core - cluster: - app_config: app_config.yaml - compute_template: inference.yaml - - run: - timeout: 600 - prepare: python wait_cluster.py 2 600 - script: python inference.py - -- name: shuffle_data_loader - team: core - cluster: - app_config: shuffle_app_config.yaml - compute_template: shuffle_compute.yaml - - run: - timeout: 1800 - script: python dataset_shuffle_data_loader.py - -- name: parquet_metadata_resolution - team: core - cluster: - app_config: pipelined_training_app.yaml - compute_template: pipelined_training_compute.yaml - - run: - timeout: 1200 - prepare: python wait_cluster.py 15 1200 - script: python parquet_metadata_resolution.py --num-files 915 - -- name: pipelined_training_50_gb - team: core - cluster: - app_config: pipelined_training_app.yaml - compute_template: pipelined_training_compute.yaml - - run: - timeout: 4800 - prepare: python wait_cluster.py 15 1200 - script: python pipelined_training.py --epochs 1 - -- name: pipelined_ingestion_1500_gb - team: core - cluster: - app_config: pipelined_ingestion_app.yaml - compute_template: pipelined_ingestion_compute.yaml - - run: - timeout: 9600 - prepare: python wait_cluster.py 21 2400 - script: python pipelined_training.py --epochs 2 --num-windows 2 --num-files 915 --debug - -- name: datasets_ingest_train_infer - team: core - cluster: - app_config: ray_sgd_training_app.yaml - compute_template: ray_sgd_training_compute.yaml - - run: - timeout: 14400 - prepare: python wait_cluster.py 66 2400 - script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset - - smoke_test: - cluster: - app_config: ray_sgd_training_app.yaml - compute_template: ray_sgd_training_smoke_compute.yaml - - run: - timeout: 3600 - prepare: python wait_cluster.py 8 2400 - script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu - -- name: datasets_preprocess_ingest - team: core - cluster: - app_config: ray_sgd_training_app.yaml - compute_template: ray_sgd_training_compute_no_gpu.yaml - - run: - timeout: 7200 - prepare: python wait_cluster.py 21 2400 - script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset --debug - -- name: datasets_ingest_400G - team: core - cluster: - app_config: ray_sgd_training_app.yaml - compute_template: dataset_ingest_400G_compute.yaml - - run: - timeout: 7200 - script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1 diff --git a/release/nightly_tests/dataset/wait_cluster.py b/release/nightly_tests/dataset/wait_cluster.py deleted file mode 100644 index c02330db2..000000000 --- 
a/release/nightly_tests/dataset/wait_cluster.py +++ /dev/null @@ -1,53 +0,0 @@ -import argparse -import time - -import ray - -ray.init(address="auto") - -parser = argparse.ArgumentParser() -parser.add_argument( - "num_nodes", type=int, help="Wait for this number of nodes (includes head)" -) - -parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds") - -parser.add_argument( - "--feedback_interval_s", - type=int, - default=10, - help="Wait for this number of seconds", -) - -args = parser.parse_args() - -curr_nodes = 0 -start = time.time() -next_feedback = start -max_time = start + args.max_time_s -while not curr_nodes >= args.num_nodes: - now = time.time() - - if now >= max_time: - raise RuntimeError( - f"Maximum wait time reached, but only " - f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting." - ) - - if now >= next_feedback: - passed = now - start - print( - f"Waiting for more nodes to come up: " - f"{curr_nodes}/{args.num_nodes} " - f"({passed:.0f} seconds passed)" - ) - next_feedback = now + args.feedback_interval_s - - time.sleep(5) - curr_nodes = len(ray.nodes()) - -passed = time.time() - start -print( - f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after " - f"{passed:.0f} seconds" -) diff --git a/release/nightly_tests/nightly_tests.yaml b/release/nightly_tests/nightly_tests.yaml deleted file mode 100644 index 2d0f90b94..000000000 --- a/release/nightly_tests/nightly_tests.yaml +++ /dev/null @@ -1,390 +0,0 @@ -# -# Single node shuffle -# -# Test basic single node 10GB shuffle with a small number of partitions. -# This doesn't require object spilling. -# - name: shuffle_10gb -# team: core -# cluster: -# app_config: shuffle/shuffle_app_config.yaml -# compute_template: shuffle/shuffle_compute_single.yaml - -# run: -# timeout: 3000 -# script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6 - -# Test single node 50GB shuffle with a large number of partitions. -- name: shuffle_50gb - team: core - cluster: - app_config: shuffle/shuffle_app_config.yaml - compute_template: shuffle/shuffle_compute_single.yaml - - run: - timeout: 3000 - script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 - -# Test single node 50GB shuffle with a large number of partitions. -- name: shuffle_50gb_large_partition - team: core - cluster: - app_config: shuffle/shuffle_app_config.yaml - compute_template: shuffle/shuffle_compute_single.yaml - - run: - timeout: 3000 - script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 - -# Test non streaming shuffle in a single node with a small number of partition. -- name: non_streaming_shuffle_50gb - team: core - cluster: - app_config: shuffle/shuffle_app_config.yaml - compute_template: shuffle/shuffle_compute_single.yaml - - run: - timeout: 3000 - script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming - -# Test non streaming shuffle in a single node with a large number of partition. 
-- name: non_streaming_shuffle_50gb_large_partition - team: core - cluster: - app_config: shuffle/shuffle_app_config.yaml - compute_template: shuffle/shuffle_compute_single.yaml - - run: - timeout: 3000 - script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming - -- name: dask_on_ray_10gb_sort - team: core - cluster: - app_config: dask_on_ray/dask_on_ray_app_config.yaml - compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml - - run: - timeout: 7200 - script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray - -- name: dask_on_ray_100gb_sort - team: core - cluster: - app_config: dask_on_ray/dask_on_ray_app_config.yaml - compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml - - run: - timeout: 7200 - script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray - -# -# Multi node shuffle -# - -# Test multi nodes 100GB shuffle with a small number of partitions. -- name: shuffle_100gb - team: core - cluster: - app_config: shuffle/shuffle_app_config.yaml - compute_template: shuffle/shuffle_compute_multi.yaml - - run: - timeout: 3000 - prepare: python wait_cluster.py 4 600 - script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 - -# Test non streaming multi nodes 100GB shuffle with a small number of partitions. -- name: non_streaming_shuffle_100gb - team: core - cluster: - app_config: shuffle/shuffle_app_config.yaml - compute_template: shuffle/shuffle_compute_multi.yaml - - run: - timeout: 3000 - prepare: python wait_cluster.py 4 600 - script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming - -# Test autoscaling 1TB streaming shuffle with a large number of partitions. -- name: autoscaling_shuffle_1tb_1000_partitions - team: core - cluster: - app_config: shuffle/shuffle_app_config.yaml - compute_template: shuffle/shuffle_compute_autoscaling.yaml - - run: - timeout: 4000 - script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming - -# Test multi nodes 1TB streaming shuffle with a large number of partitions. -- name: shuffle_1tb_1000_partition - team: core - cluster: - app_config: shuffle/shuffle_app_config.yaml - compute_template: shuffle/shuffle_compute_large_scale.yaml - - run: - timeout: 3000 - prepare: python wait_cluster.py 20 900 - script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 - -# Test multi nodes 1TB non streaming shuffle with a large number of partitions. -- name: non_streaming_shuffle_1tb_1000_partition - team: core - cluster: - app_config: shuffle/shuffle_app_config.yaml - compute_template: shuffle/shuffle_compute_large_scale.yaml - - run: - timeout: 3000 - prepare: python wait_cluster.py 20 900 - script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming - -# Stress test for 1TB multi node streaming shuffle. -- name: shuffle_1tb_5000_partitions - team: core - cluster: - app_config: shuffle/shuffle_app_config.yaml - compute_template: shuffle/shuffle_compute_large_scale.yaml - - run: - timeout: 9000 - prepare: python wait_cluster.py 20 900 - script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 - -# Stress test for 1TB multi node non-streaming shuffle. 
-# - name: non_streaming_shuffle_1tb_5000_partitions -# team: core -# stable: False -# cluster: -# app_config: shuffle/shuffle_app_config.yaml -# compute_template: shuffle/shuffle_compute_large_scale.yaml - -# run: -# timeout: 7200 -# prepare: python wait_cluster.py 20 900 -# script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming - -- name: k8s_dask_on_ray_large_scale_test_no_spilling - team: core - cluster: - app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml - compute_template: dask_on_ray/dask_on_ray_stress_compute_k8s.yaml - compute_on_k8s: True - - run: - timeout: 7200 - prepare: python wait_cluster.py 21 600 - script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray - stable: false - -# # Test large scale dask on ray test without spilling. -# - name: dask_on_ray_large_scale_test_no_spilling -# team: core -# cluster: -# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml -# compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml - -# run: -# timeout: 7200 -# prepare: python wait_cluster.py 21 600 -# script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray - -# smoke_test: -# cluster: -# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml -# compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml - -# run: -# timeout: 7200 -# prepare: python wait_cluster.py 5 600 -# script: python dask_on_ray/large_scale_test.py --num_workers 4 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray - -# Test large scale dask on ray test with spilling. -- name: dask_on_ray_large_scale_test_spilling - team: core - cluster: - app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml - compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml - - run: - timeout: 7200 - prepare: python wait_cluster.py 21 600 - script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray - - smoke_test: - cluster: - app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml - compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml - - run: - timeout: 7200 - prepare: python wait_cluster.py 5 600 - script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray - -# Stress tests with many tasks -- name: stress_test_many_tasks - team: core - cluster: - app_config: stress_tests/stress_tests_app_config.yaml - compute_template: stress_tests/stress_tests_compute.yaml - - run: - timeout: 7200 - script: python stress_tests/test_many_tasks.py - - smoke_test: - cluster: - app_config: stress_tests/stress_tests_app_config.yaml - compute_template: stress_tests/smoke_test_compute.yaml - - run: - timeout: 3600 - script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test - -# Stress tests with dead actors -- name: stress_test_dead_actors - team: core - cluster: - app_config: stress_tests/stress_tests_app_config.yaml - compute_template: stress_tests/stress_tests_compute.yaml - - run: - timeout: 7200 - script: python stress_tests/test_dead_actors.py - - smoke_test: - cluster: - app_config: stress_tests/stress_tests_app_config.yaml - compute_template: stress_tests/smoke_test_compute.yaml - - run: - timeout: 3600 - script: python stress_tests/test_dead_actors.py 
--num-nodes=4 --num-parents=3 --num-children=3 - -# Stress tests with placement groups -- name: stress_test_placement_group - team: core - cluster: - app_config: stress_tests/stress_tests_app_config.yaml - compute_template: stress_tests/placement_group_tests_compute.yaml - - run: - timeout: 7200 - script: python stress_tests/test_placement_group.py - -# Stress tests with many threaded actors. -- name: threaded_actors_stress_test - team: core - cluster: - app_config: stress_tests/stress_tests_app_config.yaml - compute_template: stress_tests/stress_test_threaded_actor_compute.yaml - - run: - timeout: 7200 - prepare: python wait_cluster.py 201 600 - script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60 - - smoke_test: - cluster: - app_config: stress_tests/stress_tests_app_config.yaml - compute_template: stress_tests/smoke_test_compute.yaml - - run: - timeout: 3600 - prepare: python wait_cluster.py 5 600 - script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30 - stable: false - -- name: k8s_threaded_actors_stress_test - team: core - cluster: - app_config: stress_tests/stress_tests_app_config.yaml - compute_template: stress_tests/k8s_stress_test_threaded_actor_compute.yaml - compute_on_k8s: True - - run: - timeout: 7200 - prepare: python wait_cluster.py 201 600 - script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60 - - run: - timeout: 3600 - prepare: python wait_cluster.py 5 600 - script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30 - stable: false - -# Test decision tree on autoscaling compute cluster. -- name: decision_tree_autoscaling - team: core - cluster: - app_config: decision_tree/decision_tree_app_config.yaml - compute_template: decision_tree/autoscaling_compute.yaml - - run: - timeout: 3000 - script: python decision_tree/cart_with_tree.py - -# Test 20 concurrent decision tree runs on autoscaling compute cluster. 
-- name: decision_tree_autoscaling_20_runs - team: core - cluster: - app_config: decision_tree/decision_tree_app_config.yaml - compute_template: decision_tree/autoscaling_compute.yaml - run: - timeout: 9600 - script: python decision_tree/cart_with_tree.py --concurrency=20 - -- name: dask_on_ray_1tb_sort - team: core - cluster: - app_config: dask_on_ray/dask_on_ray_app_config.yaml - compute_template: dask_on_ray/1tb_sort_compute.yaml - - run: - timeout: 7200 - prepare: python wait_cluster.py 32 1000 - script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test - -- name: many_nodes_actor_test - team: core - cluster: - app_config: many_nodes_tests/app_config.yaml - compute_template: many_nodes_tests/compute_config.yaml - - run: - timeout: 7200 - prepare: python wait_cluster.py 251 5400 - script: python many_nodes_tests/actor_test.py - -- name: pg_autoscaling_regression_test - team: core - cluster: - app_config: placement_group_tests/app_config.yaml - compute_template: placement_group_tests/compute.yaml - - run: - timeout: 1200 - script: python placement_group_tests/pg_run.py - -- name: pg_long_running_performance_test - team: core - cluster: - app_config: placement_group_tests/app_config.yaml - compute_template: placement_group_tests/long_running_test_compute.yaml - - run: - timeout: 3600 - prepare: python wait_cluster.py 2 600 - script: python placement_group_tests/long_running_performance_test.py --num-stages 2000 - -- name: placement_group_performance_test - team: core - cluster: - app_config: placement_group_tests/app_config.yaml - compute_template: placement_group_tests/pg_perf_test_compute.yaml - - run: - timeout: 1200 - prepare: python wait_cluster.py 5 600 - script: python placement_group_tests/placement_group_performance_test.py diff --git a/release/nightly_tests/wait_cluster.py b/release/nightly_tests/wait_cluster.py deleted file mode 100644 index f70088289..000000000 --- a/release/nightly_tests/wait_cluster.py +++ /dev/null @@ -1,54 +0,0 @@ -import argparse -import time - -import ray - -ray.init(address="auto") - -parser = argparse.ArgumentParser() -parser.add_argument( - "num_nodes", type=int, help="Wait for this number of nodes (includes head)" -) - -parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds") - -parser.add_argument( - "--feedback_interval_s", - type=int, - default=10, - help="Wait for this number of seconds", -) - -args = parser.parse_args() - -curr_nodes = 0 -start = time.time() -next_feedback = start -max_time = start + args.max_time_s - -while not curr_nodes >= args.num_nodes: - now = time.time() - - if now >= max_time: - raise RuntimeError( - f"Maximum wait time reached, but only " - f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting." - ) - - if now >= next_feedback: - passed = now - start - print( - f"Waiting for more nodes to come up: " - f"{curr_nodes}/{args.num_nodes} " - f"({passed:.0f} seconds passed)" - ) - next_feedback = now + args.feedback_interval_s - - time.sleep(5) - curr_nodes = len(ray.nodes()) - -passed = time.time() - start -print( - f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after " - f"{passed:.0f} seconds" -) diff --git a/release/rllib_tests/rllib_tests.yaml b/release/rllib_tests/rllib_tests.yaml deleted file mode 100644 index d0b15dc07..000000000 --- a/release/rllib_tests/rllib_tests.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Heavy learning tests (Atari and HalfCheetah) for major algos. 
-- name: learning_tests - team: ml - cluster: - app_config: app_config.yaml - compute_template: 8gpus_64cpus.yaml - - run: - timeout: 14400 - script: python learning_tests/run.py - - smoke_test: - run: - timeout: 1200 - -# 2-GPU learning tests (CartPole and RepeatAfterMeEnv) for major algos. -- name: multi_gpu_learning_tests - team: ml - cluster: - app_config: app_config.yaml - compute_template: 8gpus_96cpus.yaml - - run: - timeout: 7200 - script: python multi_gpu_learning_tests/run.py - -# 2-GPU learning tests (StatelessCartPole) + use_lstm=True for major algos -# (that support RNN models). -- name: multi_gpu_with_lstm_learning_tests - team: ml - cluster: - app_config: app_config.yaml - compute_template: 8gpus_96cpus.yaml - - run: - timeout: 7200 - script: python multi_gpu_with_lstm_learning_tests/run.py - -# 2-GPU learning tests (StatelessCartPole) + use_attention=True for major -# algos (that support RNN models). -- name: multi_gpu_with_attention_learning_tests - team: ml - cluster: - app_config: app_config.yaml - compute_template: 8gpus_96cpus.yaml - - run: - timeout: 7200 - script: python multi_gpu_with_attention_learning_tests/run.py - -# We'll have these as per-PR tests soon. -# - name: example_scripts_on_gpu_tests -# team: ml -# cluster: -# app_config: app_config.yaml -# compute_template: 1gpu_4cpus.yaml - -# run: -# timeout: 7200 -# script: bash unit_gpu_tests/run.sh - -# IMPALA large machine stress tests (4x Atari). -- name: stress_tests - team: ml - cluster: - app_config: app_config.yaml - compute_template: 4gpus_544_cpus.yaml - - run: - timeout: 5400 - prepare: python wait_cluster.py 6 600 - script: python stress_tests/run_stress_tests.py - - smoke_test: - run: - timeout: 2000 - -# Tests that exercise auto-scaling and Anyscale connect. -- name: connect_tests - team: ml - cluster: - app_config: app_config.yaml - compute_template: auto_scale.yaml - - run: - use_connect: True - timeout: 3000 - script: python connect_tests/run_connect_tests.py - -# Nightly performance regression for popular algorithms. -# These algorithms run nightly for pre-determined amount of time without -# passing criteria. -# Performance metrics, such as reward achieved and throughput, are then -# collected and tracked over time. -- name: performance_tests - team: ml - cluster: - app_config: app_config.yaml - compute_template: 12gpus_192cpus.yaml - - run: - timeout: 10800 - script: python performance_tests/run.py diff --git a/release/rllib_tests/wait_cluster.py b/release/rllib_tests/wait_cluster.py deleted file mode 100644 index c02330db2..000000000 --- a/release/rllib_tests/wait_cluster.py +++ /dev/null @@ -1,53 +0,0 @@ -import argparse -import time - -import ray - -ray.init(address="auto") - -parser = argparse.ArgumentParser() -parser.add_argument( - "num_nodes", type=int, help="Wait for this number of nodes (includes head)" -) - -parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds") - -parser.add_argument( - "--feedback_interval_s", - type=int, - default=10, - help="Wait for this number of seconds", -) - -args = parser.parse_args() - -curr_nodes = 0 -start = time.time() -next_feedback = start -max_time = start + args.max_time_s -while not curr_nodes >= args.num_nodes: - now = time.time() - - if now >= max_time: - raise RuntimeError( - f"Maximum wait time reached, but only " - f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting." 
- ) - - if now >= next_feedback: - passed = now - start - print( - f"Waiting for more nodes to come up: " - f"{curr_nodes}/{args.num_nodes} " - f"({passed:.0f} seconds passed)" - ) - next_feedback = now + args.feedback_interval_s - - time.sleep(5) - curr_nodes = len(ray.nodes()) - -passed = time.time() - start -print( - f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after " - f"{passed:.0f} seconds" -) diff --git a/release/run_e2e.sh b/release/run_e2e.sh deleted file mode 100755 index 9f1ae16fc..000000000 --- a/release/run_e2e.sh +++ /dev/null @@ -1,176 +0,0 @@ -#!/bin/bash - -set -ex - -cd "${0%/*}" || exit 1 - -reason() { - # Keep in sync with e2e.py ExitCode enum - case $1 in - 0) - REASON="success" - ;; - 2) - REASON="unspecified" - ;; - 3) - REASON="unknown" - ;; - 4) - REASON="runtime error" - ;; - 5) - REASON="command error" - ;; - 6) - REASON="command timeout" - ;; - 7) - REASON="prepare timeout" - ;; - 8) - REASON="filesync timeout" - ;; - 9) - REASON="session timeout" - ;; - 10) - REASON="prepare error" - ;; - 11) - REASON="app config build error" - ;; - 12) - REASON="infra error" - ;; - *) - REASON="untracked error" - ;; - esac - echo "${REASON}" -} - -while [[ $# -gt 0 ]] -do -key="$1" -case $key in - --ray-repo) - shift - RAY_REPO=$1 - ;; - --ray-branch) - shift - RAY_BRANCH=$1 - ;; - --ray-version) - shift - RAY_VERSION=$1 - ;; - --ray-wheels) - shift - RAY_WHEELS=$1 - ;; - --ray-test-repo) - shift - RAY_TEST_REPO=$1 - ;; - --ray-test-branch) - shift - RAY_TEST_BRANCH=$1 - ;; - --release-results-dir) - shift - RELEASE_RESULTS_DIR=$1 - ;; - *) - break -esac -shift -done - -RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git} -RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master} -RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts} - -export RAY_REPO RAY_BRANCH RAY_VERSION RAY_WHEELS RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR - -pip uninstall -q -y ray -pip install -q -r requirements.txt -pip install -q -U boto3 botocore -git clone -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" ~/ray - -RETRY_NUM=0 -MAX_RETRIES=${MAX_RETRIES-3} - -if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then - echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1." - MAX_RETRIES=1 -fi - -ALL_EXIT_CODES=() -while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do - RETRY_NUM=$((RETRY_NUM + 1)) - - if [ "$RETRY_NUM" -gt 1 ]; then - # Sleep for random time between 30 and 90 minutes - SLEEP_TIME=$((1800 + RANDOM % 5400)) - echo "----------------------------------------" - echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run." - echo "----------------------------------------" - sleep ${SLEEP_TIME} - fi - - sudo rm -rf "${RELEASE_RESULTS_DIR}"/* || true - - python e2e.py "$@" - EXIT_CODE=$? - REASON=$(reason "${EXIT_CODE}") - ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE - - case ${EXIT_CODE} in - 0) - echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}" - break - ;; - 7 | 9 | 10) - echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})." - ;; - *) - echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting." 
- break - ;; - esac - -done - -sudo rm -rf /tmp/ray_release_test_artifacts/* || true -sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true - -echo "----------------------------------------" -echo "e2e test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries" -echo "Run results:" - -COUNTER=1 -for EX in "${ALL_EXIT_CODES[@]}"; do - REASON=$(reason "${EX}") - echo " Run $COUNTER: Exit code = ${EX} (${REASON})" - COUNTER=$((COUNTER + 1)) -done - -echo "----------------------------------------" - -REASON=$(reason "${EXIT_CODE}") -echo "Final e2e exit code is ${EXIT_CODE} (${REASON})" - -case ${EXIT_CODE} in - 0) - ;; - 7 | 9 | 10) - echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test." - ;; - *) - echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner." - ;; -esac - -exit $EXIT_CODE diff --git a/release/runtime_env_tests/runtime_env_tests.yaml b/release/runtime_env_tests/runtime_env_tests.yaml deleted file mode 100644 index 7a55da490..000000000 --- a/release/runtime_env_tests/runtime_env_tests.yaml +++ /dev/null @@ -1,34 +0,0 @@ -- name: rte_many_tasks_actors - team: serve - cluster: - app_config: app_config.yaml - compute_template: rte_small.yaml - - run: - timeout: 600 - prepare: python wait_cluster.py 4 600 - script: python workloads/rte_many_tasks_actors.py - -- name: wheel_urls - team: serve - cluster: - app_config: app_config.yaml - compute_template: rte_minimal.yaml - - run: - timeout: 9000 # 2h30m - prepare: python wait_cluster.py 1 600 - script: python workloads/wheel_urls.py - -- name: rte_ray_client - team: serve - cluster: - app_config: app_config.yaml - compute_template: rte_minimal.yaml - - run: - use_connect: True - autosuspend_mins: 10 - timeout: 600 - prepare: python wait_cluster.py 1 600 - script: python workloads/rte_ray_client.py diff --git a/release/runtime_env_tests/wait_cluster.py b/release/runtime_env_tests/wait_cluster.py deleted file mode 100644 index c02330db2..000000000 --- a/release/runtime_env_tests/wait_cluster.py +++ /dev/null @@ -1,53 +0,0 @@ -import argparse -import time - -import ray - -ray.init(address="auto") - -parser = argparse.ArgumentParser() -parser.add_argument( - "num_nodes", type=int, help="Wait for this number of nodes (includes head)" -) - -parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds") - -parser.add_argument( - "--feedback_interval_s", - type=int, - default=10, - help="Wait for this number of seconds", -) - -args = parser.parse_args() - -curr_nodes = 0 -start = time.time() -next_feedback = start -max_time = start + args.max_time_s -while not curr_nodes >= args.num_nodes: - now = time.time() - - if now >= max_time: - raise RuntimeError( - f"Maximum wait time reached, but only " - f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting." 
- ) - - if now >= next_feedback: - passed = now - start - print( - f"Waiting for more nodes to come up: " - f"{curr_nodes}/{args.num_nodes} " - f"({passed:.0f} seconds passed)" - ) - next_feedback = now + args.feedback_interval_s - - time.sleep(5) - curr_nodes = len(ray.nodes()) - -passed = time.time() - start -print( - f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after " - f"{passed:.0f} seconds" -) diff --git a/release/serve_tests/serve_tests.yaml b/release/serve_tests/serve_tests.yaml deleted file mode 100644 index 87058d891..000000000 --- a/release/serve_tests/serve_tests.yaml +++ /dev/null @@ -1,101 +0,0 @@ -- name: single_deployment_1k_noop_replica - team: serve - cluster: - app_config: app_config.yaml - compute_template: compute_tpl_32_cpu.yaml - - run: - timeout: 7200 - long_running: False - script: python workloads/single_deployment_1k_noop_replica.py - - smoke_test: - timeout: 600 - -- name: multi_deployment_1k_noop_replica - team: serve - cluster: - app_config: app_config.yaml - compute_template: compute_tpl_32_cpu.yaml - - run: - timeout: 7200 - long_running: False - script: python workloads/multi_deployment_1k_noop_replica.py - - smoke_test: - timeout: 600 - -- name: autoscaling_single_deployment - team: serve - cluster: - app_config: app_config.yaml - compute_template: compute_tpl_8_cpu_autoscaling.yaml - - run: - timeout: 7200 - long_running: False - script: python workloads/autoscaling_single_deployment.py - - smoke_test: - timeout: 600 - -- name: autoscaling_multi_deployment - team: serve - cluster: - app_config: app_config.yaml - compute_template: compute_tpl_8_cpu_autoscaling.yaml - - run: - timeout: 7200 - long_running: False - script: python workloads/autoscaling_multi_deployment.py - - smoke_test: - timeout: 600 - -- name: serve_micro_benchmark - team: serve - cluster: - app_config: app_config.yaml - # 16 CPUS - compute_template: compute_tpl_single_node.yaml - - run: - timeout: 7200 - long_running: False - script: python workloads/serve_micro_benchmark.py - - smoke_test: - timeout: 600 - -- name: serve_micro_benchmark_k8s - team: serve - cluster: - app_config: app_config.yaml - # 16 CPUS - compute_template: compute_tpl_single_node_k8s.yaml - compute_on_k8s: True - - run: - timeout: 7200 - long_running: False - script: python workloads/serve_micro_benchmark.py - - smoke_test: - timeout: 600 - -- name: serve_cluster_fault_tolerance - team: serve - cluster: - app_config: app_config.yaml - # 16 CPUS - compute_template: compute_tpl_single_node.yaml - - run: - timeout: 7200 - long_running: False - script: python workloads/serve_cluster_fault_tolerance.py - - smoke_test: - timeout: 600 diff --git a/release/sgd_tests/sgd_tests.yaml b/release/sgd_tests/sgd_tests.yaml deleted file mode 100644 index cb0d4d5c3..000000000 --- a/release/sgd_tests/sgd_tests.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# Test multi-node, multi-GPU Ray SGD example. 
-- name: sgd_gpu - team: ml - cluster: - app_config: sgd_gpu/sgd_gpu_app_config.yaml - compute_template: sgd_gpu/sgd_gpu_compute.yaml - - run: - timeout: 3000 - prepare: python wait_cluster.py 2 600 - script: python sgd_gpu/sgd_gpu_test.py --num-workers=2 --use-gpu --address=auto \ No newline at end of file diff --git a/release/sgd_tests/wait_cluster.py b/release/sgd_tests/wait_cluster.py deleted file mode 100644 index c02330db2..000000000 --- a/release/sgd_tests/wait_cluster.py +++ /dev/null @@ -1,53 +0,0 @@ -import argparse -import time - -import ray - -ray.init(address="auto") - -parser = argparse.ArgumentParser() -parser.add_argument( - "num_nodes", type=int, help="Wait for this number of nodes (includes head)" -) - -parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds") - -parser.add_argument( - "--feedback_interval_s", - type=int, - default=10, - help="Wait for this number of seconds", -) - -args = parser.parse_args() - -curr_nodes = 0 -start = time.time() -next_feedback = start -max_time = start + args.max_time_s -while not curr_nodes >= args.num_nodes: - now = time.time() - - if now >= max_time: - raise RuntimeError( - f"Maximum wait time reached, but only " - f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting." - ) - - if now >= next_feedback: - passed = now - start - print( - f"Waiting for more nodes to come up: " - f"{curr_nodes}/{args.num_nodes} " - f"({passed:.0f} seconds passed)" - ) - next_feedback = now + args.feedback_interval_s - - time.sleep(5) - curr_nodes = len(ray.nodes()) - -passed = time.time() - start -print( - f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after " - f"{passed:.0f} seconds" -) diff --git a/release/test_owners.yaml b/release/test_owners.yaml deleted file mode 100644 index b898529a8..000000000 --- a/release/test_owners.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# Specify the test owners (teams) here. -# The root key should be the name of the test yaml file without the .yaml. -# To specify owners of subtests, use a sub dict (see e.g. long_running_tests). 
-golden_notebook_tests: ml -horovod_tests: ml -lightgbm_tests: ml -long_running_distributed_tests: ml -long_running_tests: - actor_deaths: core - apex: ml - impala: ml - many_actor_tasks: core - many_drivers: core - many_ppo: core - many_tasks: core - many_tasks_serialized_ids: core - node_failures: core - pbt: ml - serve: serve - serve_failure: serve -microbenchmark: core -nightly_tests: core -rllib_tests: ml -runtime_env_tests: serve -serve_tests: serve -sgd_tests: ml -xgboost_tests: ml diff --git a/release/tune_tests/cloud_tests/tune_cloud_tests.yaml b/release/tune_tests/cloud_tests/tune_cloud_tests.yaml deleted file mode 100644 index 72279931e..000000000 --- a/release/tune_tests/cloud_tests/tune_cloud_tests.yaml +++ /dev/null @@ -1,118 +0,0 @@ -- name: aws_no_sync_down - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_aws_4x2.yaml - - run: - timeout: 600 - prepare: python wait_cluster.py 4 600 - script: python workloads/run_cloud_test.py no_sync_down - -- name: aws_ssh_sync - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_aws_4x2.yaml - - run: - timeout: 600 - prepare: python wait_cluster.py 4 600 - script: python workloads/run_cloud_test.py ssh_sync - -- name: aws_durable_upload - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_aws_4x2.yaml - - run: - timeout: 600 - prepare: python wait_cluster.py 4 600 - script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload - -- name: aws_durable_upload_rllib_str - team: ml - cluster: - app_config: app_config_ml.yaml - compute_template: tpl_aws_4x2.yaml - - run: - timeout: 600 - prepare: python wait_cluster.py 4 600 - script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str - -- name: aws_durable_upload_rllib_trainer - team: ml - cluster: - app_config: app_config_ml.yaml - compute_template: tpl_aws_4x2.yaml - - run: - timeout: 600 - prepare: python wait_cluster.py 4 600 - script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer - -- name: aws_no_durable_upload - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_aws_4x2.yaml - - run: - timeout: 600 - prepare: python wait_cluster.py 4 600 - script: python workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload - -- name: gcp_k8s_no_sync_down - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_gcp_k8s_4x8.yaml - cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud - - run: - use_connect: True - timeout: 600 - # Remove --cpus-per-trial 8 once n2-standard-2 is supported - script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8 - -- name: gcp_k8s_ssh_sync - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_gcp_k8s_4x8.yaml - cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud - - run: - use_connect: True - timeout: 600 - # Remove --cpus-per-trial 8 once n2-standard-2 is supported - script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8 - -- name: gcp_k8s_durable_upload - team: ml - cluster: - app_config: app_config.yaml - compute_template: tpl_gcp_k8s_4x8.yaml - cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud - - run: - use_connect: True - timeout: 600 - # Remove --cpus-per-trial 8 once n2-standard-2 is supported - script: python 
workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
-
-
-- name: gcp_k8s_no_durable_upload
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_gcp_k8s_4x8.yaml
-    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
-
-  run:
-    use_connect: True
-    timeout: 600
-    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
-    script: python workloads/run_cloud_test.py no_durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
diff --git a/release/tune_tests/cloud_tests/wait_cluster.py b/release/tune_tests/cloud_tests/wait_cluster.py
deleted file mode 100644
index f70088289..000000000
--- a/release/tune_tests/cloud_tests/wait_cluster.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
-    "--feedback_interval_s",
-    type=int,
-    default=10,
-    help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-
-while not curr_nodes >= args.num_nodes:
-    now = time.time()
-
-    if now >= max_time:
-        raise RuntimeError(
-            f"Maximum wait time reached, but only "
-            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
-        )
-
-    if now >= next_feedback:
-        passed = now - start
-        print(
-            f"Waiting for more nodes to come up: "
-            f"{curr_nodes}/{args.num_nodes} "
-            f"({passed:.0f} seconds passed)"
-        )
-        next_feedback = now + args.feedback_interval_s
-
-    time.sleep(5)
-    curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
-    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
-    f"{passed:.0f} seconds"
-)
diff --git a/release/tune_tests/scalability_tests/tune_tests.yaml b/release/tune_tests/scalability_tests/tune_tests.yaml
deleted file mode 100644
index ba8a5a230..000000000
--- a/release/tune_tests/scalability_tests/tune_tests.yaml
+++ /dev/null
@@ -1,90 +0,0 @@
-- name: bookkeeping_overhead
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_1x16.yaml
-
-  run:
-    timeout: 1200
-    script: python workloads/test_bookkeeping_overhead.py
-
-
-- name: durable_trainable
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_16x2.yaml
-
-  run:
-    timeout: 900
-    prepare: python wait_cluster.py 16 600
-    script: python workloads/test_durable_trainable.py --bucket data-test-ilr
-
-- name: long_running_large_checkpoints
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_1x32_hd.yaml
-
-  run:
-    timeout: 86400
-    script: python workloads/test_long_running_large_checkpoints.py
-    long_running: True
-
-  smoke_test:
-    run:
-      timeout: 3600
-
-
-- name: network_overhead
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_100x2.yaml
-
-  run:
-    timeout: 900
-    prepare_timeout: 1200
-    prepare: python wait_cluster.py 100 1200
-    script: python workloads/test_network_overhead.py
-
-  smoke_test:
-    cluster:
-      compute_template: tpl_20x2.yaml
-
-    run:
-      timeout: 400
-      prepare_timeout: 600
-      prepare: python wait_cluster.py 20 600
-
-- name: result_throughput_cluster
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_16x64.yaml
-
-  run:
-    timeout: 600
-    prepare: python wait_cluster.py 16 600
-    script: python workloads/test_result_throughput_cluster.py
-
-- name: result_throughput_single_node
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_1x96.yaml
-
-  run:
-    timeout: 600
-    script: python workloads/test_result_throughput_single_node.py
-
-- name: xgboost_sweep
-  team: ml
-  cluster:
-    app_config: app_config_data.yaml
-    compute_template: tpl_16x64.yaml
-
-  run:
-    timeout: 3600
-    prepare: python wait_cluster.py 16 600
-    script: python workloads/test_xgboost_sweep.py
diff --git a/release/tune_tests/scalability_tests/wait_cluster.py b/release/tune_tests/scalability_tests/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/tune_tests/scalability_tests/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
-    "--feedback_interval_s",
-    type=int,
-    default=10,
-    help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
-    now = time.time()
-
-    if now >= max_time:
-        raise RuntimeError(
-            f"Maximum wait time reached, but only "
-            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
-        )
-
-    if now >= next_feedback:
-        passed = now - start
-        print(
-            f"Waiting for more nodes to come up: "
-            f"{curr_nodes}/{args.num_nodes} "
-            f"({passed:.0f} seconds passed)"
-        )
-        next_feedback = now + args.feedback_interval_s
-
-    time.sleep(5)
-    curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
-    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
-    f"{passed:.0f} seconds"
-)
diff --git a/release/util/wait_cluster.py b/release/util/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/util/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
-    "--feedback_interval_s",
-    type=int,
-    default=10,
-    help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
-    now = time.time()
-
-    if now >= max_time:
-        raise RuntimeError(
-            f"Maximum wait time reached, but only "
-            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
-        )
-
-    if now >= next_feedback:
-        passed = now - start
-        print(
-            f"Waiting for more nodes to come up: "
-            f"{curr_nodes}/{args.num_nodes} "
-            f"({passed:.0f} seconds passed)"
-        )
-        next_feedback = now + args.feedback_interval_s
-
-    time.sleep(5)
-    curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
-    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
-    f"{passed:.0f} seconds"
-)
diff --git a/release/xgboost_tests/wait_cluster.py b/release/xgboost_tests/wait_cluster.py
deleted file mode 100644
index c02330db2..000000000
--- a/release/xgboost_tests/wait_cluster.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import argparse
-import time
-
-import ray
-
-ray.init(address="auto")
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
-)
-
-parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
-
-parser.add_argument(
-    "--feedback_interval_s",
-    type=int,
-    default=10,
-    help="Wait for this number of seconds",
-)
-
-args = parser.parse_args()
-
-curr_nodes = 0
-start = time.time()
-next_feedback = start
-max_time = start + args.max_time_s
-while not curr_nodes >= args.num_nodes:
-    now = time.time()
-
-    if now >= max_time:
-        raise RuntimeError(
-            f"Maximum wait time reached, but only "
-            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
-        )
-
-    if now >= next_feedback:
-        passed = now - start
-        print(
-            f"Waiting for more nodes to come up: "
-            f"{curr_nodes}/{args.num_nodes} "
-            f"({passed:.0f} seconds passed)"
-        )
-        next_feedback = now + args.feedback_interval_s
-
-    time.sleep(5)
-    curr_nodes = len(ray.nodes())
-
-passed = time.time() - start
-print(
-    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
-    f"{passed:.0f} seconds"
-)
diff --git a/release/xgboost_tests/xgboost_tests.yaml b/release/xgboost_tests/xgboost_tests.yaml
deleted file mode 100644
index 264443308..000000000
--- a/release/xgboost_tests/xgboost_tests.yaml
+++ /dev/null
@@ -1,104 +0,0 @@
-- name: train_small
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_small.yaml
-
-  run:
-    use_connect: True
-    autosuspend_mins: 10
-    timeout: 600
-    prepare: python wait_cluster.py 4 600
-    script: python workloads/train_small.py
-
-- name: train_moderate
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_moderate.yaml
-
-  run:
-    timeout: 600
-    prepare: python wait_cluster.py 32 600
-    script: python workloads/train_moderate.py
-
-- name: train_gpu
-  team: ml
-  cluster:
-    app_config: app_config_gpu.yaml
-    compute_template: tpl_gpu_small.yaml
-
-  run:
-    timeout: 600
-    prepare: python wait_cluster.py 5 600
-    script: python workloads/train_gpu.py
-
-- name: distributed_api_test
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_small.yaml
-    results:
-
-  run:
-    timeout: 600
-    prepare: python wait_cluster.py 4 600
-    script: python workloads/distributed_api_test.py
-    results: ""
-
-- name: ft_small_elastic
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_small.yaml
-
-  run:
-    timeout: 900
-    prepare: python wait_cluster.py 4 600
-    script: python workloads/ft_small_elastic.py
-    results: ""
-
-- name: ft_small_non_elastic
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_small.yaml
-
-  run:
-    timeout: 900
-    prepare: python wait_cluster.py 4 600
-    script: python workloads/ft_small_non_elastic.py
-    results: ""
-
-- name: tune_small
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_small.yaml
-
-  run:
-    timeout: 600
-    prepare: python wait_cluster.py 4 600
-    script: python workloads/tune_small.py
-
-- name: tune_32x4
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_moderate.yaml
-
-  run:
-    timeout: 900
-    prepare: python wait_cluster.py 32 600
-    script: python workloads/tune_32x4.py
-
-- name: tune_4x32
-  team: ml
-  cluster:
-    app_config: app_config.yaml
-    compute_template: tpl_cpu_moderate.yaml
-
-  run:
-    timeout: 900
-    prepare: python wait_cluster.py 32 600
-    script: python workloads/tune_4x32.py
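
For orientation when reading the deletions above: every `prepare:` line in these YAML entries invokes the co-located `wait_cluster.py`, and the deleted copies are essentially identical (the three with index c02330db2 are byte-for-byte the same; the cloud_tests copy differs only by a blank line). A minimal sketch of that polling pattern as a single reusable helper follows; the function name `wait_for_nodes` and its keyword defaults are illustrative assumptions and not part of this diff, while the Ray calls (`ray.init(address="auto")`, `ray.nodes()`) are taken directly from the deleted scripts.

# Sketch only: a consolidated equivalent of the deleted wait_cluster.py copies.
# The function name and defaults are assumptions; the polling logic mirrors the
# deleted scripts (poll ray.nodes() until enough nodes report in, or time out).
import time

import ray


def wait_for_nodes(num_nodes: int, max_time_s: int, feedback_interval_s: int = 10) -> None:
    """Block until the cluster reports `num_nodes` nodes (head included)."""
    ray.init(address="auto")
    start = time.time()
    next_feedback = start
    curr_nodes = 0
    while curr_nodes < num_nodes:
        now = time.time()
        if now - start >= max_time_s:
            raise RuntimeError(
                f"Maximum wait time reached, but only "
                f"{curr_nodes}/{num_nodes} nodes came up. Aborting."
            )
        if now >= next_feedback:
            print(
                f"Waiting for more nodes to come up: {curr_nodes}/{num_nodes} "
                f"({now - start:.0f} seconds passed)"
            )
            next_feedback = now + feedback_interval_s
        time.sleep(5)
        curr_nodes = len(ray.nodes())
    print(f"Cluster is up: {curr_nodes}/{num_nodes} nodes online.")


if __name__ == "__main__":
    # Matches e.g. `prepare: python wait_cluster.py 4 600` in the YAML above.
    wait_for_nodes(num_nodes=4, max_time_s=600)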