[release] Fix special cases in release test package (e.g. smoke test) (#22442)

This fixes special cases (e.g. smoke tests, long-running tests) in the release test package infrastructure and prepares the migration of the Tune and XGBoost tests.
Kai Fricke 2022-02-28 21:05:01 +01:00 committed by GitHub
parent ba4f1423c7
commit 3695408a85
12 changed files with 559 additions and 50 deletions

View file

@@ -85,10 +85,10 @@ py_test(
 )
 
 py_test(
-    name = "test_repeat",
+    name = "test_run_script",
     tags = ["team:ml", "release_unit"],
     size = "small",
-    srcs = ["ray_release/tests/test_repeat.py"]
+    srcs = ["ray_release/tests/test_run_script.py"]
 )
 
 py_test(

View file

@@ -34,6 +34,13 @@ class ClusterManager(abc.ABC):
     def set_cluster_env(self, cluster_env: Dict[str, Any]):
         self.cluster_env = cluster_env
+
+        # Add flags for redisless Ray
+        self.cluster_env.setdefault("env_vars", {})
+        self.cluster_env["env_vars"]["MATCH_AUTOSCALER_AND_RAY_IMAGES"] = "1"
+        self.cluster_env["env_vars"]["RAY_bootstrap_with_gcs"] = "1"
+        self.cluster_env["env_vars"]["RAY_gcs_storage"] = "memory"
+
         self.cluster_env_name = (
             f"{self.project_name}_{self.project_id[4:8]}"
             f"__env__{self.test_name}__"

View file

@@ -214,6 +214,8 @@ def run_release_test(
         command = f"{command} --smoke-test"
         command_env["IS_SMOKE_TEST"] = "1"
 
+    is_long_running = test["run"].get("long_running", False)
+
     try:
         command_runner.run_command(
             command, env=command_env, timeout=command_timeout
@@ -221,7 +223,9 @@ def run_release_test(
     except CommandError as e:
         raise TestCommandError(e)
     except CommandTimeout as e:
-        raise TestCommandTimeout(e)
+        if not is_long_running:
+            # Only raise error if command is not long running
+            raise TestCommandTimeout(e)
 
     try:
         command_results = command_runner.fetch_results()
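
A hedged, runnable sketch of the control flow introduced here: long-running tests are expected to outlive the command timeout, so the timeout is only fatal for regular tests (the runner stub and test dict are stand-ins; exception names follow the diff):

class CommandTimeout(Exception):
    pass


class TestCommandTimeout(Exception):
    pass


def run_test_command(run_command, test: dict, timeout: int) -> None:
    is_long_running = test["run"].get("long_running", False)
    try:
        run_command(timeout=timeout)
    except CommandTimeout as e:
        if not is_long_running:
            # Only fatal for tests that should finish within the timeout.
            raise TestCommandTimeout(e)


def never_finishes(timeout):
    raise CommandTimeout(f"timed out after {timeout}s")


run_test_command(never_finishes, {"run": {"long_running": True}}, 600)  # no error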

View file

@@ -20,6 +20,9 @@ from ray_release.logger import logger
 from ray_release.wheels import find_and_wait_for_ray_wheels_url
 
+PIPELINE_ARTIFACT_PATH = "/tmp/pipeline_artifacts"
+
+
 @click.command()
 @click.option(
     "--test-collection-file",
@@ -118,6 +121,19 @@ def main(test_collection_file: Optional[str] = None):
         group_step = {"group": group, "steps": group_steps}
         steps.append(group_step)
 
+    if "BUILDKITE" in os.environ:
+        if os.path.exists(PIPELINE_ARTIFACT_PATH):
+            shutil.rmtree(PIPELINE_ARTIFACT_PATH)
+        os.makedirs(PIPELINE_ARTIFACT_PATH, exist_ok=True, mode=0o755)
+
+        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "pipeline.json"), "wt") as fp:
+            json.dump(steps, fp)
+
+        settings["frequency"] = settings["frequency"].value
+        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "settings.json"), "wt") as fp:
+            json.dump(settings, fp)
+
     steps_str = json.dumps(steps)
     print(steps_str)
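
On Buildkite, the generated steps and the resolved settings are now persisted as JSON artifacts; a sketch of a consumer reading them back (hypothetical helper; only the writer side is part of this commit):

import json
import os

PIPELINE_ARTIFACT_PATH = "/tmp/pipeline_artifacts"


def load_pipeline_artifacts():
    with open(os.path.join(PIPELINE_ARTIFACT_PATH, "pipeline.json")) as fp:
        steps = json.load(fp)
    with open(os.path.join(PIPELINE_ARTIFACT_PATH, "settings.json")) as fp:
        # Note: "frequency" was converted to its enum .value before dumping.
        settings = json.load(fp)
    return steps, settings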

View file

@@ -6,6 +6,17 @@ import click
 import yaml
 
 
+class FormatDumper(yaml.SafeDumper):
+    last_indent = 0
+
+    def write_line_break(self, data=None):
+        if (self.indent or 0) < self.last_indent:
+            super().write_line_break()
+
+        super().write_line_break(data)
+        self.last_indent = self.indent or 0
+
+
 def replace_prepare(dt: Dict):
     if "prepare" in dt and "wait_cluster" in dt["prepare"]:
         _, _, nodes, timeout = dt.pop("prepare").split(" ")
@@ -42,6 +53,11 @@ def main(legacy_config: str, prefix: str, group: str, alert: str):
             "cluster_compute": old["cluster"]["compute_template"],
         }
 
+        if "cloud_id" in old["cluster"]:
+            test["cluster"]["cloud_id"] = old["cluster"]["cloud_id"]
+        if "cloud_name" in old["cluster"]:
+            test["cluster"]["cloud_name"] = old["cluster"]["cloud_name"]
+
         if "driver_setup" in old:
             test["driver_setup"] = old["driver_setup"]
@@ -65,7 +81,7 @@ def main(legacy_config: str, prefix: str, group: str, alert: str):
         tests.append(test)
 
-    yaml.dump(tests, sys.stdout, sort_keys=False)
+    yaml.dump(tests, sys.stdout, Dumper=FormatDumper, sort_keys=False)
     sys.stdout.flush()
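
FormatDumper emits an extra line break whenever the indentation level drops, so top-level list items in the generated YAML come out separated by blank lines. A self-contained demo of the effect (dumper copied from the diff; the sample data is made up):

import sys
import yaml


class FormatDumper(yaml.SafeDumper):
    last_indent = 0

    def write_line_break(self, data=None):
        # Extra break when dedenting, i.e. when a nested block ends.
        if (self.indent or 0) < self.last_indent:
            super().write_line_break()
        super().write_line_break(data)
        self.last_indent = self.indent or 0


tests = [
    {"name": "a", "run": {"timeout": 600}},
    {"name": "b", "run": {"timeout": 900}},
]
# Entries are emitted with separating blank lines instead of packed together.
yaml.dump(tests, sys.stdout, Dumper=FormatDumper, sort_keys=False)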

View file

@@ -124,6 +124,7 @@ def main(
         result=result,
         ray_wheels_url=ray_wheels_url,
         reporters=reporters,
+        smoke_test=smoke_test,
         cluster_id=cluster_id,
         cluster_env_id=cluster_env_id,
         no_terminate=no_terminate,

View file

@@ -0,0 +1,14 @@
+import json
+import sys
+
+
+def main():
+    argv_file = sys.argv[1]
+
+    with open(argv_file, "wt") as fp:
+        json.dump(sys.argv, fp)
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
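
The new helper simply records its own argv as JSON so tests can assert which parameters a wrapper forwarded; a usage sketch (paths assumed relative to the release/ directory, mirroring testParameters below):

import json
import subprocess
import sys
import tempfile

argv_file = tempfile.mktemp()
subprocess.check_call(
    [sys.executable, "ray_release/tests/_test_catch_args.py", argv_file, "--smoke-test"]
)
with open(argv_file) as fp:
    argv = json.load(fp)
assert "--smoke-test" in argv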

View file

@@ -11,6 +11,7 @@ from ray_release.buildkite.settings import (
     Frequency,
     update_settings_from_buildkite,
 )
+from ray_release.buildkite.step import get_step
 from ray_release.config import Test
 from ray_release.exception import ReleaseTestConfigError
 from ray_release.wheels import (
@@ -221,3 +222,19 @@ class BuildkiteSettingsTest(unittest.TestCase):
             [t["name"] for t, _ in grouped["x"]], ["x1", "x2", "x3"]
         )
         self.assertEqual(len(grouped["y"]), 1)
+
+    def testGetStep(self):
+        test = Test(
+            {
+                "name": "test",
+                "frequency": "nightly",
+                "run": {"script": "test_script.py"},
+                "smoke_test": {"frequency": "multi"},
+            }
+        )
+
+        step = get_step(test, smoke_test=False)
+        self.assertNotIn("--smoke-test", step["command"])
+
+        step = get_step(test, smoke_test=True)
+        self.assertIn("--smoke-test", step["command"])

View file

@@ -1,6 +1,7 @@
 import os
 import shutil
 import tempfile
+import time
 import unittest
 from typing import Type, Callable
 from unittest.mock import patch
@@ -213,7 +214,10 @@ class GlueTest(unittest.TestCase):
         if until == "test_command":
             return
 
-        self.command_runner_return["fetch_results"] = {"time_taken": 50}
+        self.command_runner_return["fetch_results"] = {
+            "time_taken": 50,
+            "last_update": time.time() - 60,
+        }
 
         if until == "fetch_results":
             return
@@ -495,6 +499,26 @@ class GlueTest(unittest.TestCase):
         # Ensure cluster was terminated
         self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
 
+    def testTestCommandTimeoutLongRunning(self):
+        result = Result()
+
+        self._succeed_until("fetch_results")
+
+        # Test command times out
+        self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout)
+        with self.assertRaises(TestCommandTimeout):
+            self._run(result)
+        self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value)
+
+        # But now set test to long running
+        self.test["run"]["long_running"] = True
+        self._run(result)  # Will not fail this time
+
+        self.assertGreaterEqual(result.results["last_update_diff"], 60.0)
+
+        # Ensure cluster was terminated
+        self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
+
     def testFetchResultFails(self):
         result = Result()
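
The mocked fetch_results above now reports a last_update timestamp 60 seconds in the past; presumably glue.py derives result.results["last_update_diff"] roughly like this (a hedged sketch, not code from this diff):

import time


def last_update_diff(fetched: dict) -> float:
    # Seconds since the test last reported progress.
    return time.time() - fetched["last_update"]


assert last_update_diff({"last_update": time.time() - 60}) >= 60.0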

View file

@@ -1,3 +1,4 @@
+import json
 import os
 import shutil
 import subprocess
@@ -7,7 +8,7 @@ import unittest
 from ray_release.result import ExitCode
 
-class WheelsFinderTest(unittest.TestCase):
+class RunScriptTest(unittest.TestCase):
     def setUp(self) -> None:
         self.tempdir = tempfile.mkdtemp()
         self.state_file = os.path.join(self.tempdir, "state.txt")
@@ -18,9 +19,7 @@ class WheelsFinderTest(unittest.TestCase):
         os.environ["NO_INSTALL"] = "1"
         os.environ["NO_CLONE"] = "1"
         os.environ["NO_ARTIFACTS"] = "1"
-        os.environ["RAY_TEST_SCRIPT"] = (
-            "ray_release/tests/" "_test_run_release_test_sh.py"
-        )
+        os.environ["RAY_TEST_SCRIPT"] = "ray_release/tests/_test_run_release_test_sh.py"
         os.environ["OVERRIDE_SLEEP_TIME"] = "0"
 
     def tearDown(self) -> None:
@@ -86,3 +85,19 @@ class WheelsFinderTest(unittest.TestCase):
             ExitCode.COMMAND_ALERT.value,
         )
         self.assertEquals(self._read_state(), 2)
+
+    def testParameters(self):
+        os.environ["RAY_TEST_SCRIPT"] = "ray_release/tests/_test_catch_args.py"
+        argv_file = tempfile.mktemp()
+
+        subprocess.check_call(
+            f"{self.test_script} {argv_file} --smoke-test",
+            shell=True,
+        )
+
+        with open(argv_file, "rt") as fp:
+            data = json.load(fp)
+
+        os.unlink(argv_file)
+
+        self.assertIn("--smoke-test", data)

View file

@@ -17,7 +17,7 @@
 #
 # # How often to run the tests.
 # # One of [disabled, any, multi, nightly, weekly].
-# frequency: weekly
+# frequency: disabled # weekly
 # # Owning team. This field will be persisted to the database
 # team: ml
 #
@@ -49,7 +49,7 @@
 #
 # # File manager to use to transfer files to and from the cluster.
 # # Can be any of [sdk, client, job].
-# file_manager: job
+# file_manager: sdk
 #
 # # If you want to wait for nodes to be ready, you can specify this here:
 # wait_for_nodes:
@@ -77,7 +77,7 @@
 # smoke_test:
 #   # Smoke tests can have different frequencies. A smoke test is only triggered
 #   # when the regular test is not matched.
-#   frequency: nightly
+#   frequency: disabled # nightly
 #   # Here we adjust the run timeout down and run on less nodes. The test script
 #   # remains the same.
 #   run:
@@ -130,7 +130,7 @@
     test_name: train_moderate
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -146,7 +146,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
@@ -158,7 +158,7 @@
     test_name: train_gpu
    test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -174,7 +174,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
@@ -186,7 +186,7 @@
     test_name: distributed_api_test
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -201,7 +201,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
@@ -213,7 +213,7 @@
     test_name: ft_small_elastic
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -229,7 +229,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
@@ -241,7 +241,7 @@
     test_name: ft_small_non_elastic
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -257,7 +257,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
@@ -269,7 +269,7 @@
     test_name: tune_small
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -285,7 +285,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
@@ -297,7 +297,7 @@
     test_name: tune_32x4
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -313,7 +313,7 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
@@ -325,7 +325,7 @@
     test_name: tune_4x32
     test_suite: xgboost_tests
 
-  frequency: nightly
+  frequency: disabled # nightly
   team: ml
 
   cluster:
@@ -341,6 +341,423 @@
       timeout: 600
 
     type: sdk_command
-    file_manager: job
+    file_manager: sdk
 
   alert: xgboost_tests
+
+
+#######################
+# Tune cloud tests
+#######################
+
+- name: tune_cloud_aws_no_sync_down
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+  legacy:
+    test_name: aws_no_sync_down
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py no_sync_down
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_ssh_sync
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+  legacy:
+    test_name: aws_ssh_sync
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py ssh_sync
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_durable_upload
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+  legacy:
+    test_name: aws_durable_upload
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_durable_upload_rllib_str
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+  legacy:
+    test_name: aws_durable_upload_rllib_str
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config_ml.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str
+      --bucket s3://data-test-ilr/durable_upload_rllib_str
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_durable_upload_rllib_trainer
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+  legacy:
+    test_name: aws_durable_upload_rllib_trainer
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config_ml.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer
+      --bucket s3://data-test-ilr/durable_upload_rllib_trainer
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_gcp_k8s_no_sync_down
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+  legacy:
+    test_name: gcp_k8s_no_sync_down
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_gcp_k8s_4x8.yaml
+    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8
+    type: client
+
+  alert: tune_tests
+
+- name: tune_cloud_gcp_k8s_ssh_sync
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+  legacy:
+    test_name: gcp_k8s_ssh_sync
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_gcp_k8s_4x8.yaml
+    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8
+    type: client
+
+  alert: tune_tests
+
+- name: tune_cloud_gcp_k8s_durable_upload
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+  legacy:
+    test_name: gcp_k8s_durable_upload
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_gcp_k8s_4x8.yaml
+    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
+    type: client
+
+  alert: tune_tests
+
+
+########################
+# Tune scalability tests
+########################
+
+- name: tune_scalability_bookkeeping_overhead
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+  legacy:
+    test_name: bookkeeping_overhead
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_1x16.yaml
+
+  run:
+    timeout: 1200
+    script: python workloads/test_bookkeeping_overhead.py
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_durable_trainable
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+  legacy:
+    test_name: durable_trainable
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_16x2.yaml
+
+  run:
+    timeout: 900
+    script: python workloads/test_durable_trainable.py --bucket data-test-ilr
+    wait_for_nodes:
+      num_nodes: 16
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_long_running_large_checkpoints
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+  legacy:
+    test_name: long_running_large_checkpoints
+    test_suite: tune_tests
+
+  frequency: disabled # weekly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_1x32_hd.yaml
+
+  run:
+    timeout: 86400
+    script: python workloads/test_long_running_large_checkpoints.py
+    long_running: true
+    type: sdk_command
+    file_manager: sdk
+
+  smoke_test:
+    frequency: disabled # nightly
+    run:
+      timeout: 3600
+
+  alert: tune_tests
+
+- name: tune_scalability_network_overhead
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+  legacy:
+    test_name: network_overhead
+    test_suite: tune_tests
+
+  frequency: disabled # weekly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_100x2.yaml
+
+  run:
+    timeout: 900
+    prepare_timeout: 1200
+    script: python workloads/test_network_overhead.py
+    wait_for_nodes:
+      num_nodes: 100
+      timeout: 1200
+
+    type: sdk_command
+    file_manager: sdk
+
+  smoke_test:
+    frequency: disabled # nightly
+    cluster:
+      compute_template: tpl_20x2.yaml
+
+    run:
+      timeout: 400
+      prepare_timeout: 600
+      wait_for_nodes:
+        num_nodes: 20
+        timeout: 600
+
+  alert: tune_tests
+
+- name: tune_scalability_result_throughput_cluster
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+  legacy:
+    test_name: result_throughput_cluster
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_16x64.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/test_result_throughput_cluster.py
+    wait_for_nodes:
+      num_nodes: 16
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_result_throughput_single_node
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+  legacy:
+    test_name: result_throughput_single_node
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_1x96.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/test_result_throughput_single_node.py
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_xgboost_sweep
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+  legacy:
+    test_name: xgboost_sweep
+    test_suite: tune_tests
+
+  frequency: disabled # weekly
+  team: ml
+  cluster:
+    cluster_env: app_config_data.yaml
+    cluster_compute: tpl_16x64.yaml
+
+  run:
+    timeout: 3600
+    script: python workloads/test_xgboost_sweep.py
+    wait_for_nodes:
+      num_nodes: 16
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests

View file

@@ -26,28 +26,6 @@ reason() {
   echo "${REASON}"
 }
 
-while [[ $# -gt 0 ]]
-do
-key="$1"
-case $key in
-    --ray-test-repo)
-    shift
-    RAY_TEST_REPO=$1
-    ;;
-    --ray-test-branch)
-    shift
-    RAY_TEST_BRANCH=$1
-    ;;
-    --release-results-dir)
-    shift
-    RELEASE_RESULTS_DIR=$1
-    ;;
-    *)
-    break
-esac
-shift
-done
-
 RAY_TEST_SCRIPT=${RAY_TEST_SCRIPT-ray_release/scripts/run_release_test.py}
 RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
 RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
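
With the flag parsing removed, the wrapper is configured purely via environment variables, falling back to the defaults above; a hedged sketch of an equivalent invocation from Python (the positional test arguments are illustrative):

import os
import subprocess

env = dict(os.environ)
env["RAY_TEST_REPO"] = "https://github.com/ray-project/ray.git"
env["RAY_TEST_BRANCH"] = "master"
# Remaining arguments are passed through to run_release_test.py untouched.
subprocess.check_call(["./run_release_test.sh", "my_test", "--smoke-test"], env=env)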