diff --git a/release/BUILD b/release/BUILD
index f8a169de1..c5988d982 100644
--- a/release/BUILD
+++ b/release/BUILD
@@ -85,10 +85,10 @@ py_test(
 )
 
 py_test(
-    name = "test_repeat",
+    name = "test_run_script",
     tags = ["team:ml", "release_unit"],
     size = "small",
-    srcs = ["ray_release/tests/test_repeat.py"]
+    srcs = ["ray_release/tests/test_run_script.py"]
 )
 
 py_test(
diff --git a/release/ray_release/cluster_manager/cluster_manager.py b/release/ray_release/cluster_manager/cluster_manager.py
index d920cf2cd..7d826b8b1 100644
--- a/release/ray_release/cluster_manager/cluster_manager.py
+++ b/release/ray_release/cluster_manager/cluster_manager.py
@@ -34,6 +34,13 @@ class ClusterManager(abc.ABC):
     def set_cluster_env(self, cluster_env: Dict[str, Any]):
         self.cluster_env = cluster_env
+
+        # Add flags for redisless Ray
+        self.cluster_env.setdefault("env_vars", {})
+        self.cluster_env["env_vars"]["MATCH_AUTOSCALER_AND_RAY_IMAGES"] = "1"
+        self.cluster_env["env_vars"]["RAY_bootstrap_with_gcs"] = "1"
+        self.cluster_env["env_vars"]["RAY_gcs_storage"] = "memory"
+
         self.cluster_env_name = (
             f"{self.project_name}_{self.project_id[4:8]}"
             f"__env__{self.test_name}__"
diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py
index 2ceb3537b..9ff3cbb79 100644
--- a/release/ray_release/glue.py
+++ b/release/ray_release/glue.py
@@ -214,6 +214,8 @@ def run_release_test(
         command = f"{command} --smoke-test"
         command_env["IS_SMOKE_TEST"] = "1"
 
+    is_long_running = test["run"].get("long_running", False)
+
     try:
         command_runner.run_command(
             command, env=command_env, timeout=command_timeout
@@ -221,7 +223,9 @@
     except CommandError as e:
         raise TestCommandError(e)
     except CommandTimeout as e:
-        raise TestCommandTimeout(e)
+        if not is_long_running:
+            # Only raise error if command is not long running
+            raise TestCommandTimeout(e)
 
     try:
         command_results = command_runner.fetch_results()
diff --git a/release/ray_release/scripts/build_pipeline.py b/release/ray_release/scripts/build_pipeline.py
index 92410ed47..200050214 100644
--- a/release/ray_release/scripts/build_pipeline.py
+++ b/release/ray_release/scripts/build_pipeline.py
@@ -20,6 +20,9 @@
 from ray_release.logger import logger
 from ray_release.wheels import find_and_wait_for_ray_wheels_url
 
+PIPELINE_ARTIFACT_PATH = "/tmp/pipeline_artifacts"
+
+
 @click.command()
 @click.option(
     "--test-collection-file",
@@ -118,6 +121,19 @@ def main(test_collection_file: Optional[str] = None):
         group_step = {"group": group, "steps": group_steps}
         steps.append(group_step)
 
+    if "BUILDKITE" in os.environ:
+        if os.path.exists(PIPELINE_ARTIFACT_PATH):
+            shutil.rmtree(PIPELINE_ARTIFACT_PATH)
+
+        os.makedirs(PIPELINE_ARTIFACT_PATH, exist_ok=True, mode=0o755)
+
+        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "pipeline.json"), "wt") as fp:
+            json.dump(steps, fp)
+
+        settings["frequency"] = settings["frequency"].value
+        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "settings.json"), "wt") as fp:
+            json.dump(settings, fp)
+
     steps_str = json.dumps(steps)
     print(steps_str)
diff --git a/release/ray_release/scripts/convert_legacy_config.py b/release/ray_release/scripts/convert_legacy_config.py
index 4eda97dd7..eb115958d 100644
--- a/release/ray_release/scripts/convert_legacy_config.py
+++ b/release/ray_release/scripts/convert_legacy_config.py
@@ -6,6 +6,17 @@
 import click
 import yaml
 
+
+class FormatDumper(yaml.SafeDumper):
+    last_indent = 0
+
+    def write_line_break(self, data=None):
+        if (self.indent or 0) < self.last_indent:
+            super().write_line_break()
+
+        super().write_line_break(data)
+        self.last_indent = self.indent or 0
+
 
 def replace_prepare(dt: Dict):
     if "prepare" in dt and "wait_cluster" in dt["prepare"]:
         _, _, nodes, timeout = dt.pop("prepare").split(" ")
@@ -42,6 +53,11 @@ def main(legacy_config: str, prefix: str, group: str, alert: str):
             "cluster_compute": old["cluster"]["compute_template"],
         }
 
+        if "cloud_id" in old["cluster"]:
+            test["cluster"]["cloud_id"] = old["cluster"]["cloud_id"]
+        if "cloud_name" in old["cluster"]:
+            test["cluster"]["cloud_name"] = old["cluster"]["cloud_name"]
+
         if "driver_setup" in old:
             test["driver_setup"] = "driver_setup"
 
@@ -65,7 +81,7 @@
         tests.append(test)
 
-    yaml.dump(tests, sys.stdout, sort_keys=False)
+    yaml.dump(tests, sys.stdout, Dumper=FormatDumper, sort_keys=False)
     sys.stdout.flush()
diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py
index 6729c6a66..e070d57ff 100644
--- a/release/ray_release/scripts/run_release_test.py
+++ b/release/ray_release/scripts/run_release_test.py
@@ -124,6 +124,7 @@ def main(
         result=result,
         ray_wheels_url=ray_wheels_url,
         reporters=reporters,
+        smoke_test=smoke_test,
         cluster_id=cluster_id,
         cluster_env_id=cluster_env_id,
         no_terminate=no_terminate,
diff --git a/release/ray_release/tests/_test_catch_args.py b/release/ray_release/tests/_test_catch_args.py
new file mode 100644
index 000000000..736a01e64
--- /dev/null
+++ b/release/ray_release/tests/_test_catch_args.py
@@ -0,0 +1,14 @@
+import json
+import sys
+
+
+def main():
+    argv_file = sys.argv[1]
+    with open(argv_file, "wt") as fp:
+        json.dump(sys.argv, fp)
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/release/ray_release/tests/test_buildkite.py b/release/ray_release/tests/test_buildkite.py
index 6c806c7f2..074c19552 100644
--- a/release/ray_release/tests/test_buildkite.py
+++ b/release/ray_release/tests/test_buildkite.py
@@ -11,6 +11,7 @@ from ray_release.buildkite.settings import (
     Frequency,
     update_settings_from_buildkite,
 )
+from ray_release.buildkite.step import get_step
 from ray_release.config import Test
 from ray_release.exception import ReleaseTestConfigError
 from ray_release.wheels import (
@@ -221,3 +222,19 @@
             [t["name"] for t, _ in grouped["x"]], ["x1", "x2", "x3"]
         )
         self.assertEqual(len(grouped["y"]), 1)
+
+    def testGetStep(self):
+        test = Test(
+            {
+                "name": "test",
+                "frequency": "nightly",
+                "run": {"script": "test_script.py"},
+                "smoke_test": {"frequency": "multi"},
+            }
+        )
+
+        step = get_step(test, smoke_test=False)
+        self.assertNotIn("--smoke-test", step["command"])
+
+        step = get_step(test, smoke_test=True)
+        self.assertIn("--smoke-test", step["command"])
diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py
index 05e190026..a9ab3a85e 100644
--- a/release/ray_release/tests/test_glue.py
+++ b/release/ray_release/tests/test_glue.py
@@ -1,6 +1,7 @@
 import os
 import shutil
 import tempfile
+import time
 import unittest
 from typing import Type, Callable
 from unittest.mock import patch
@@ -213,7 +214,10 @@
         if until == "test_command":
             return
 
-        self.command_runner_return["fetch_results"] = {"time_taken": 50}
+        self.command_runner_return["fetch_results"] = {
+            "time_taken": 50,
+            "last_update": time.time() - 60,
+        }
 
         if until == "fetch_results":
             return
@@ -495,6 +499,26 @@
         # Ensure cluster was terminated
         self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
 
+    def testTestCommandTimeoutLongRunning(self):
+        result = Result()
+
+        self._succeed_until("fetch_results")
+
+        # Test command times out
+        self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout)
+        with self.assertRaises(TestCommandTimeout):
+            self._run(result)
+        self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value)
+
+        # But now set test to long running
+        self.test["run"]["long_running"] = True
+        self._run(result)  # Will not fail this time
+
+        self.assertGreaterEqual(result.results["last_update_diff"], 60.0)
+
+        # Ensure cluster was terminated
+        self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
+
     def testFetchResultFails(self):
         result = Result()
diff --git a/release/ray_release/tests/test_repeat.py b/release/ray_release/tests/test_run_script.py
similarity index 81%
rename from release/ray_release/tests/test_repeat.py
rename to release/ray_release/tests/test_run_script.py
index fca41669e..1aa944f7e 100644
--- a/release/ray_release/tests/test_repeat.py
+++ b/release/ray_release/tests/test_run_script.py
@@ -1,3 +1,4 @@
+import json
 import os
 import shutil
 import subprocess
@@ -7,7 +8,7 @@ import unittest
 from ray_release.result import ExitCode
 
 
-class WheelsFinderTest(unittest.TestCase):
+class RunScriptTest(unittest.TestCase):
     def setUp(self) -> None:
         self.tempdir = tempfile.mkdtemp()
         self.state_file = os.path.join(self.tempdir, "state.txt")
@@ -18,9 +19,7 @@
         os.environ["NO_INSTALL"] = "1"
         os.environ["NO_CLONE"] = "1"
         os.environ["NO_ARTIFACTS"] = "1"
-        os.environ["RAY_TEST_SCRIPT"] = (
-            "ray_release/tests/" "_test_run_release_test_sh.py"
-        )
+        os.environ["RAY_TEST_SCRIPT"] = "ray_release/tests/_test_run_release_test_sh.py"
         os.environ["OVERRIDE_SLEEP_TIME"] = "0"
 
     def tearDown(self) -> None:
@@ -86,3 +85,19 @@
             ExitCode.COMMAND_ALERT.value,
         )
         self.assertEquals(self._read_state(), 2)
+
+    def testParameters(self):
+        os.environ["RAY_TEST_SCRIPT"] = "ray_release/tests/_test_catch_args.py"
+        argv_file = tempfile.mktemp()
+
+        subprocess.check_call(
+            f"{self.test_script} " f"{argv_file} " f"--smoke-test",
+            shell=True,
+        )
+
+        with open(argv_file, "rt") as fp:
+            data = json.load(fp)
+
+        os.unlink(argv_file)
+
+        self.assertIn("--smoke-test", data)
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 75b75dcd9..ea3a09c00 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -17,7 +17,7 @@
 #
 #   # How often to run the tests.
 #   # One of [disabled, any, multi, nightly, weekly].
-#   frequency: weekly
+#   frequency: disabled # weekly
 #
 #   # Owning team. This field will be persisted to the database
 #   team: ml
 #
@@ -49,7 +49,7 @@
 #
 #     # File manager to use to transfer files to and from the cluster.
 #     # Can be any of [sdk, client, job].
-#     file_manager: job
+#     file_manager: sdk
 #
 #     # If you want to wait for nodes to be ready, you can specify this here:
 #     wait_for_nodes:
@@ -77,7 +77,7 @@
 #   smoke_test:
 #     # Smoke tests can have different frequencies. A smoke test is only triggered
 #     # when the regular test is not matched.
-#     frequency: nightly
+#     frequency: disabled # nightly
 #
 #     # Here we adjust the run timeout down and run on less nodes. The test script
 #     # remains the same.
 #     run:
@@ -130,7 +130,7 @@
     test_name: train_moderate
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -146,7 +146,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
 
@@ -158,7 +158,7 @@
     test_name: train_gpu
    test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -174,7 +174,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
 
@@ -186,7 +186,7 @@
     test_name: distributed_api_test
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -201,7 +201,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
 
@@ -213,7 +213,7 @@
     test_name: ft_small_elastic
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -229,7 +229,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
 
@@ -241,7 +241,7 @@
     test_name: ft_small_non_elastic
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
  cluster:
@@ -257,7 +257,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
 
@@ -269,7 +269,7 @@
     test_name: tune_small
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -285,7 +285,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
 
@@ -297,7 +297,7 @@
     test_name: tune_32x4
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -313,7 +313,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
 
@@ -325,7 +325,7 @@
     test_name: tune_4x32
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -341,6 +341,423 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
+
+#######################
+# Tune cloud tests
+#######################
+- name: tune_cloud_aws_no_sync_down
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: aws_no_sync_down
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py no_sync_down
+
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_ssh_sync
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: aws_ssh_sync
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py ssh_sync
+
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_durable_upload
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: aws_durable_upload
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload
+
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_durable_upload_rllib_str
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: aws_durable_upload_rllib_str
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config_ml.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+
+    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str
+      --bucket s3://data-test-ilr/durable_upload_rllib_str
+
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_durable_upload_rllib_trainer
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: aws_durable_upload_rllib_trainer
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config_ml.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer
+      --bucket s3://data-test-ilr/durable_upload_rllib_trainer
+
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_gcp_k8s_no_sync_down
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: gcp_k8s_no_sync_down
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_gcp_k8s_4x8.yaml
+    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8
+    type: client
+
+  alert: tune_tests
+
+- name: tune_cloud_gcp_k8s_ssh_sync
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: gcp_k8s_ssh_sync
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_gcp_k8s_4x8.yaml
+    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8
+    type: client
+
+  alert: tune_tests
+
+- name: tune_cloud_gcp_k8s_durable_upload
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: gcp_k8s_durable_upload
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_gcp_k8s_4x8.yaml
+    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
+    type: client
+
+  alert: tune_tests
+
+
+########################
+# Tune scalability tests
+########################
+
+- name: tune_scalability_bookkeeping_overhead
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: bookkeeping_overhead
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_1x16.yaml
+
+  run:
+    timeout: 1200
+    script: python workloads/test_bookkeeping_overhead.py
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_durable_trainable
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: durable_trainable
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_16x2.yaml
+
+  run:
+    timeout: 900
+    script: python workloads/test_durable_trainable.py --bucket data-test-ilr
+    wait_for_nodes:
+      num_nodes: 16
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_long_running_large_checkpoints
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: long_running_large_checkpoints
+    test_suite: tune_tests
+
+  frequency: disabled # weekly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_1x32_hd.yaml
+
+  run:
+    timeout: 86400
+    script: python workloads/test_long_running_large_checkpoints.py
+    long_running: true
+    type: sdk_command
+    file_manager: sdk
+
+  smoke_test:
+    frequency: disabled # nightly
+
+    run:
+      timeout: 3600
+
+  alert: tune_tests
+
+- name: tune_scalability_network_overhead
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+  legacy:
+    test_name: network_overhead
+    test_suite: tune_tests
+
+  frequency: disabled # weekly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_100x2.yaml
+
+  run:
+    timeout: 900
+    prepare_timeout: 1200
+    script: python workloads/test_network_overhead.py
+    wait_for_nodes:
+      num_nodes: 100
+      timeout: 1200
+
+    type: sdk_command
+    file_manager: sdk
+
+  smoke_test:
+    frequency: disabled # nightly
+
+    cluster:
+      compute_template: tpl_20x2.yaml
+
+    run:
+      timeout: 400
+      prepare_timeout: 600
+      wait_for_nodes:
+        num_nodes: 20
+        timeout: 600
+
+  alert: tune_tests
+
+- name: tune_scalability_result_throughput_cluster
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: result_throughput_cluster
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_16x64.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/test_result_throughput_cluster.py
+
+    wait_for_nodes:
+      num_nodes: 16
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_result_throughput_single_node
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: result_throughput_single_node
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_1x96.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/test_result_throughput_single_node.py
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_xgboost_sweep
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: xgboost_sweep
+    test_suite: tune_tests
+
+  frequency: disabled # weekly
+  team: ml
+
+  cluster:
+    cluster_env: app_config_data.yaml
+    cluster_compute: tpl_16x64.yaml
+
+  run:
+    timeout: 3600
+    script: python workloads/test_xgboost_sweep.py
+
+    wait_for_nodes:
+      num_nodes: 16
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
diff --git a/release/run_release_test.sh b/release/run_release_test.sh
index 6654f6e63..669a18c5c 100755
--- a/release/run_release_test.sh
+++ b/release/run_release_test.sh
@@ -26,28 +26,6 @@ reason() {
   echo "${REASON}"
 }
 
-while [[ $# -gt 0 ]]
-do
-key="$1"
-case $key in
-    --ray-test-repo)
-    shift
-    RAY_TEST_REPO=$1
-    ;;
-    --ray-test-branch)
-    shift
-    RAY_TEST_BRANCH=$1
-    ;;
-    --release-results-dir)
-    shift
-    RELEASE_RESULTS_DIR=$1
-    ;;
-    *)
-    break
-esac
-shift
-done
-
 RAY_TEST_SCRIPT=${RAY_TEST_SCRIPT-ray_release/scripts/run_release_test.py}
 RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
 RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}