import copy import logging import os import re import sys import yaml # If you update or reorganize the periodic tests, please ensure the # relevant portions of the Ray release instructions (go/release-ray) # (in particular, running periodic tests and collecting release logs) # are up to date. If you need access, please contact @zhe-thoughts. # Env variables: # RAY_REPO Repo to use for finding the wheel # RAY_BRANCH Branch to find the wheel # RAY_VERSION Version to find the wheel # RAY_WHEELS Direct Ray wheel URL # RAY_TEST_REPO Repo to use for test scripts # RAY_TEST_BRANCH Branch for test scripts # FILTER_FILE File filter # FILTER_TEST Test name filter # RELEASE_TEST_SUITE Release test suite (e.g. manual, nightly) class ReleaseTest: def __init__( self, name: str, smoke_test: bool = False, retry: int = 0, ): self.name = name self.smoke_test = smoke_test self.retry = retry def __str__(self): return self.name def __repr__(self): return self.name def __contains__(self, item): return self.name.__contains__(item) def __iter__(self): return iter(self.name) def __len__(self): return len(self.name) class SmokeTest(ReleaseTest): def __init__(self, name: str, retry: int = 0): super(SmokeTest, self).__init__(name=name, smoke_test=True, retry=retry) CORE_NIGHTLY_TESTS = { "~/ray/release/nightly_tests/nightly_tests.yaml": [ "shuffle_10gb", "shuffle_50gb", "shuffle_50gb_large_partition", "shuffle_100gb", "non_streaming_shuffle_100gb", "non_streaming_shuffle_50gb_large_partition", "non_streaming_shuffle_50gb", SmokeTest("dask_on_ray_large_scale_test_no_spilling"), SmokeTest("dask_on_ray_large_scale_test_spilling"), "stress_test_placement_group", "shuffle_1tb_1000_partition", "non_streaming_shuffle_1tb_1000_partition", "shuffle_1tb_5000_partitions", # TODO(sang): It doesn't even work without spilling # as it hits the scalability limit. # "non_streaming_shuffle_1tb_5000_partitions", "decision_tree_autoscaling", "decision_tree_autoscaling_20_runs", "autoscaling_shuffle_1tb_1000_partitions", SmokeTest("stress_test_many_tasks"), SmokeTest("stress_test_dead_actors"), SmokeTest("threaded_actors_stress_test"), "pg_long_running_performance_test", ], "~/ray/benchmarks/benchmark_tests.yaml": [ "single_node", "object_store", "many_actors_smoke_test", "many_tasks_smoke_test", "many_pgs_smoke_test", ], "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [ "inference", "shuffle_data_loader", "pipelined_training_50_gb", "pipelined_ingestion_1500_gb", "datasets_preprocess_ingest", "datasets_ingest_400G", SmokeTest("datasets_ingest_train_infer"), ], "~/ray/release/nightly_tests/chaos_test.yaml": [ "chaos_many_actors", "chaos_many_tasks_no_object_store", "chaos_pipelined_ingestion_1500_gb_15_windows", ], "~/ray/release/microbenchmark/microbenchmark.yaml": [ "microbenchmark", ], } SERVE_NIGHTLY_TESTS = { "~/ray/release/long_running_tests/long_running_tests.yaml": [ SmokeTest("serve"), SmokeTest("serve_failure"), ], "~/ray/release/serve_tests/serve_tests.yaml": [ "single_deployment_1k_noop_replica", "multi_deployment_1k_noop_replica", "autoscaling_single_deployment", "autoscaling_multi_deployment", "serve_micro_benchmark", # TODO(architkulkarni) Reenable after K8s migration. Currently failing # "serve_micro_benchmark_k8s", "serve_cluster_fault_tolerance", ], } CORE_DAILY_TESTS = { "~/ray/release/nightly_tests/nightly_tests.yaml": [ "k8s_dask_on_ray_large_scale_test_no_spilling", "dask_on_ray_large_scale_test_no_spilling", "dask_on_ray_large_scale_test_spilling", "pg_autoscaling_regression_test", "threaded_actors_stress_test", "k8s_threaded_actors_stress_test", "stress_test_many_tasks", "stress_test_dead_actors", ], "~/ray/release/nightly_tests/chaos_test.yaml": [ "chaos_dask_on_ray_large_scale_test_no_spilling", "chaos_dask_on_ray_large_scale_test_spilling", ], } CORE_SCALABILITY_TESTS_DAILY = { "~/ray/benchmarks/benchmark_tests.yaml": [ "many_actors", "many_tasks", "many_pgs", "many_nodes", ], } CORE_SCHEDULING_DAILY = { "~/ray/benchmarks/benchmark_tests.yaml": [ "scheduling_test_many_0s_tasks_single_node", "scheduling_test_many_0s_tasks_many_nodes", # Reenable these two once we got right setup # "scheduling_test_many_5s_tasks_single_node", # "scheduling_test_many_5s_tasks_many_nodes", ], "~/ray/release/nightly_tests/nightly_tests.yaml": [ "many_nodes_actor_test", "dask_on_ray_10gb_sort", "dask_on_ray_100gb_sort", "dask_on_ray_1tb_sort", "placement_group_performance_test", ], } NIGHTLY_TESTS = { # "~/ray/release/horovod_tests/horovod_tests.yaml": [ # SmokeTest("horovod_test"), # ], # Should we enable this? "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [ "dask_xgboost_test", "modin_xgboost_test", "torch_tune_serve_test", ], "~/ray/release/long_running_tests/long_running_tests.yaml": [ SmokeTest("actor_deaths"), SmokeTest("apex"), SmokeTest("impala"), SmokeTest("many_actor_tasks"), SmokeTest("many_drivers"), SmokeTest("many_ppo"), SmokeTest("many_tasks"), SmokeTest("many_tasks_serialized_ids"), SmokeTest("node_failures"), SmokeTest("pbt"), # SmokeTest("serve"), # SmokeTest("serve_failure"), # Full long running tests (1 day runtime) "actor_deaths", "apex", "impala", "many_actor_tasks", "many_drivers", "many_ppo", "many_tasks", "many_tasks_serialized_ids", "node_failures", "pbt", "serve", "serve_failure", ], "~/ray/release/sgd_tests/sgd_tests.yaml": [ "sgd_gpu", ], "~/ray/release/tune_tests/cloud_tests/tune_cloud_tests.yaml": [ "aws_no_sync_down", "aws_ssh_sync", "aws_durable_upload", "aws_durable_upload_rllib_str", "aws_durable_upload_rllib_trainer", "gcp_k8s_durable_upload", ], "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [ "bookkeeping_overhead", "durable_trainable", SmokeTest("long_running_large_checkpoints"), SmokeTest("network_overhead"), "result_throughput_cluster", "result_throughput_single_node", ], "~/ray/release/xgboost_tests/xgboost_tests.yaml": [ "train_small", "train_moderate", "train_gpu", "tune_small", "tune_4x32", "tune_32x4", "ft_small_elastic", "ft_small_non_elastic", "distributed_api_test", ], "~/ray/release/rllib_tests/rllib_tests.yaml": [ SmokeTest("learning_tests"), SmokeTest("stress_tests"), "performance_tests", "multi_gpu_learning_tests", "multi_gpu_with_lstm_learning_tests", "multi_gpu_with_attention_learning_tests", # We'll have these as per-PR tests soon. # "example_scripts_on_gpu_tests", ], "~/ray/release/runtime_env_tests/runtime_env_tests.yaml": [ "rte_many_tasks_actors", "wheel_urls", "rte_ray_client", ], } WEEKLY_TESTS = { "~/ray/release/horovod_tests/horovod_tests.yaml": [ "horovod_test", ], "~/ray/release/long_running_distributed_tests" "/long_running_distributed.yaml": [ "pytorch_pbt_failure", ], "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [ "network_overhead", "long_running_large_checkpoints", "xgboost_sweep", ], "~/ray/release/rllib_tests/rllib_tests.yaml": [ "learning_tests", "stress_tests", ], } # This test suite holds "user" tests to test important user workflows # in a particular environment. # All workloads in this test suite should: # 1. Be run in a distributed (multi-node) fashion # 2. Use autoscaling/scale up (no wait_cluster.py) # 3. Use GPUs if applicable # 4. Have the `use_connect` flag set. USER_TESTS = { "~/ray/release/ml_user_tests/ml_user_tests.yaml": [ "train_tensorflow_mnist_test", "train_torch_linear_test", "ray_lightning_user_test_latest", "ray_lightning_user_test_master", "horovod_user_test_latest", "horovod_user_test_master", "xgboost_gpu_connect_latest", "xgboost_gpu_connect_master", "tune_rllib_connect_test", ] } SUITES = { "core-nightly": CORE_NIGHTLY_TESTS, "serve-nightly": SERVE_NIGHTLY_TESTS, "core-daily": CORE_DAILY_TESTS, "core-scalability": CORE_SCALABILITY_TESTS_DAILY, "nightly": {**NIGHTLY_TESTS, **USER_TESTS}, "core-scheduling-daily": CORE_SCHEDULING_DAILY, "weekly": WEEKLY_TESTS, } DEFAULT_STEP_TEMPLATE = { "env": { "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc", "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle", "RELEASE_AWS_BUCKET": "ray-release-automation-results", "RELEASE_AWS_LOCATION": "dev", "RELEASE_AWS_DB_NAME": "ray_ci", "RELEASE_AWS_DB_TABLE": "release_test_result", "AWS_REGION": "us-west-2", }, "agents": {"queue": "runner_queue_branch"}, "plugins": [ { "docker#v3.9.0": { "image": "rayproject/ray", "propagate-environment": True, "volumes": [ "/tmp/ray_release_test_artifacts:" "/tmp/ray_release_test_artifacts" ], } } ], "artifact_paths": ["/tmp/ray_release_test_artifacts/**/*"], } def ask_configuration(): RAY_BRANCH = os.environ.get("RAY_BRANCH", "master") RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git") RAY_VERSION = os.environ.get("RAY_VERSION", "") RAY_WHEELS = os.environ.get("RAY_WHEELS", "") RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH) RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO) RELEASE_TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly") FILTER_FILE = os.environ.get("FILTER_FILE", "") FILTER_TEST = os.environ.get("FILTER_TEST", "") input_ask_step = { "input": "Input required: Please specify tests to run", "fields": [ { "text": ( "RAY_REPO: Please specify the Ray repository used " "to find the wheel." ), "hint": ( "Repository from which to fetch the latest " "commits to find the Ray wheels. Usually you don't " "need to change this." ), "default": RAY_REPO, "key": "ray_repo", }, { "text": ( "RAY_BRANCH: Please specify the Ray branch used " "to find the wheel." ), "hint": "For releases, this will be e.g. `releases/1.x.0`", "default": RAY_BRANCH, "key": "ray_branch", }, { "text": ( "RAY_VERSION: Please specify the Ray version used " "to find the wheel." ), "hint": ( "Leave empty for latest master. For releases, " "specify the release version." ), "required": False, "default": RAY_VERSION, "key": "ray_version", }, { "text": "RAY_WHEELS: Please specify the Ray wheel URL.", "hint": ( "ATTENTION: If you provide this, RAY_REPO, " "RAY_BRANCH and RAY_VERSION will be ignored! " "Please also make sure to provide the wheels URL " "for Python 3.7 on Linux.\n" "You can also insert a commit hash here instead " "of a full URL.\n" "NOTE: You can specify multiple commits or URLs " "for easy bisection (one per line) - this will " "run each test on each of the specified wheels." ), "required": False, "default": RAY_WHEELS, "key": "ray_wheels", }, { "text": ( "RAY_TEST_REPO: Please specify the Ray repository " "used to find the tests you would like to run." ), "hint": ( "If you're developing a new release test, this " "will most likely be your GitHub fork." ), "default": RAY_TEST_REPO, "key": "ray_test_repo", }, { "text": ( "RAY_TEST_BRANCH: Please specify the Ray branch used " "to find the tests you would like to run." ), "hint": ( "If you're developing a new release test, this " "will most likely be a branch living on your " "GitHub fork." ), "default": RAY_TEST_BRANCH, "key": "ray_test_branch", }, { "select": ( "RELEASE_TEST_SUITE: Please specify the release " "test suite containing the tests you would like " "to run." ), "hint": ( "Check in the `build_pipeline.py` if you're " "unsure which suite contains your tests." ), "required": True, "options": sorted(SUITES.keys()), "default": RELEASE_TEST_SUITE, "key": "release_test_suite", }, { "text": ( "FILTER_FILE: Please specify a filter for the " "test files that should be included in this build." ), "hint": ( "Only test files (e.g. xgboost_tests.yml) that " "match this string will be included in the test" ), "default": FILTER_FILE, "required": False, "key": "filter_file", }, { "text": ( "FILTER_TEST: Please specify a filter for the " "test names that should be included in this build." ), "hint": ( "Only test names (e.g. tune_4x32) that match " "this string will be included in the test" ), "default": FILTER_TEST, "required": False, "key": "filter_test", }, ], "key": "input_ask_step", } run_again_step = { "commands": [ f'export {v}=$(buildkite-agent meta-data get "{k}")' for k, v in { "ray_branch": "RAY_BRANCH", "ray_repo": "RAY_REPO", "ray_version": "RAY_VERSION", "ray_wheels": "RAY_WHEELS", "ray_test_branch": "RAY_TEST_BRANCH", "ray_test_repo": "RAY_TEST_REPO", "release_test_suite": "RELEASE_TEST_SUITE", "filter_file": "FILTER_FILE", "filter_test": "FILTER_TEST", }.items() ] + [ "export AUTOMATIC=1", "python3 -m pip install --user pyyaml", "rm -rf ~/ray || true", "git clone -b $${RAY_TEST_BRANCH} $${RAY_TEST_REPO} ~/ray", ( "python3 ~/ray/release/.buildkite/build_pipeline.py " "| buildkite-agent pipeline upload" ), ], "label": ":pipeline: Again", "agents": {"queue": "runner_queue_branch"}, "depends_on": "input_ask_step", "key": "run_again_step", } return [ input_ask_step, run_again_step, ] def create_test_step( ray_repo: str, ray_branch: str, ray_version: str, ray_wheels: str, ray_test_repo: str, ray_test_branch: str, test_file: str, test_name: ReleaseTest, ): custom_commit_str = "custom_wheels_url" if ray_wheels: # Extract commit from url p = re.compile(r"([a-f0-9]{40})") m = p.search(ray_wheels) if m is not None: custom_commit_str = m.group(1) ray_wheels_str = f" ({ray_wheels}) " if ray_wheels else "" logging.info(f"Creating step for {test_file}/{test_name}{ray_wheels_str}") cmd = ( f"./release/run_e2e.sh " f'--ray-repo "{ray_repo}" ' f'--ray-branch "{ray_branch}" ' f'--ray-version "{ray_version}" ' f'--ray-wheels "{ray_wheels}" ' f'--ray-test-repo "{ray_test_repo}" ' f'--ray-test-branch "{ray_test_branch}" ' ) args = ( f"--category {ray_branch} " f"--test-config {test_file} " f"--test-name {test_name} " f"--keep-results-dir" ) if test_name.smoke_test: logging.info("This test will run as a smoke test.") args += " --smoke-test" step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE) if test_name.retry: logging.info(f"This test will be retried up to " f"{test_name.retry} times.") step_conf["retry"] = { "automatic": [{"exit_status": "*", "limit": test_name.retry}] } else: # Default retry logic # Warning: Exit codes are currently not correctly propagated to # buildkite! Thus, actual retry logic is currently implemented in # the run_e2e.sh script! step_conf["retry"] = { "automatic": [ {"exit_status": 7, "limit": 2}, # Prepare timeout {"exit_status": 9, "limit": 2}, # Session timeout {"exit_status": 10, "limit": 2}, # Prepare error ], } step_conf["command"] = cmd + args step_conf["label"] = ( f"{test_name} " f"({custom_commit_str if ray_wheels_str else ray_branch}) - " f"{ray_test_branch}/{ray_test_repo}" ) return step_conf def build_pipeline(steps): all_steps = [] RAY_BRANCH = os.environ.get("RAY_BRANCH", "master") RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git") RAY_VERSION = os.environ.get("RAY_VERSION", "") RAY_WHEELS = os.environ.get("RAY_WHEELS", "") RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH) RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO) FILTER_FILE = os.environ.get("FILTER_FILE", "") FILTER_TEST = os.environ.get("FILTER_TEST", "") ray_wheels_list = [""] if RAY_WHEELS: ray_wheels_list = RAY_WHEELS.split("\n") if len(ray_wheels_list) > 1: logging.info( f"This will run a bisec on the following URLs/commits: " f"{ray_wheels_list}" ) logging.info( f"Building pipeline \n" f"Ray repo/branch to test:\n" f" RAY_REPO = {RAY_REPO}\n" f" RAY_BRANCH = {RAY_BRANCH}\n\n" f" RAY_VERSION = {RAY_VERSION}\n\n" f" RAY_WHEELS = {RAY_WHEELS}\n\n" f"Ray repo/branch containing the test configurations and scripts:" f" RAY_TEST_REPO = {RAY_TEST_REPO}\n" f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n" f"Filtering for these tests:\n" f" FILTER_FILE = {FILTER_FILE}\n" f" FILTER_TEST = {FILTER_TEST}\n\n" ) for test_file, test_names in steps.items(): if FILTER_FILE and FILTER_FILE not in test_file: continue test_base = os.path.basename(test_file) for test_name in test_names: if FILTER_TEST and FILTER_TEST not in test_name: continue if not isinstance(test_name, ReleaseTest): test_name = ReleaseTest(name=test_name) logging.info(f"Adding test: {test_base}/{test_name}") for ray_wheels in ray_wheels_list: step_conf = create_test_step( ray_repo=RAY_REPO, ray_branch=RAY_BRANCH, ray_version=RAY_VERSION, ray_wheels=ray_wheels, ray_test_repo=RAY_TEST_REPO, ray_test_branch=RAY_TEST_BRANCH, test_file=test_file, test_name=test_name, ) all_steps.append(step_conf) return all_steps def alert_pipeline(stats: bool = False): step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE) cmd = "python release/alert.py" if stats: cmd += " --stats" step_conf["commands"] = [ "pip install -q -r release/requirements.txt", "pip install -U boto3 botocore", cmd, ] step_conf["label"] = f"Send periodic alert (stats_only = {stats})" return [step_conf] if __name__ == "__main__": alert = os.environ.get("RELEASE_ALERT", "0") ask_for_config = not bool(int(os.environ.get("AUTOMATIC", "0"))) if alert in ["1", "stats"]: steps = alert_pipeline(alert == "stats") elif ask_for_config: steps = ask_configuration() else: TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly") PIPELINE_SPEC = SUITES[TEST_SUITE] steps = build_pipeline(PIPELINE_SPEC) yaml.dump({"steps": steps}, sys.stdout)