[release] Fix special cases in release test package (e.g. smoke test) (#22442)
This fixes special cases (e.g. smoke tests, long-running tests) in the release test package infrastructure and prepares the migration of the Tune and XGBoost tests.
Parent: ba4f1423c7
Commit: 3695408a85
12 changed files with 559 additions and 50 deletions
@@ -85,10 +85,10 @@ py_test(
 )

 py_test(
-    name = "test_repeat",
+    name = "test_run_script",
     tags = ["team:ml", "release_unit"],
     size = "small",
-    srcs = ["ray_release/tests/test_repeat.py"]
+    srcs = ["ray_release/tests/test_run_script.py"]
 )

 py_test(
@@ -34,6 +34,13 @@ class ClusterManager(abc.ABC):

     def set_cluster_env(self, cluster_env: Dict[str, Any]):
         self.cluster_env = cluster_env
+
+        # Add flags for redisless Ray
+        self.cluster_env.setdefault("env_vars", {})
+        self.cluster_env["env_vars"]["MATCH_AUTOSCALER_AND_RAY_IMAGES"] = "1"
+        self.cluster_env["env_vars"]["RAY_bootstrap_with_gcs"] = "1"
+        self.cluster_env["env_vars"]["RAY_gcs_storage"] = "memory"
+
         self.cluster_env_name = (
             f"{self.project_name}_{self.project_id[4:8]}"
             f"__env__{self.test_name}__"
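To illustrate the effect of the new set_cluster_env logic, here is a minimal, self-contained sketch. FakeManager and its attribute values are made up for the example, and the real cluster_env_name may contain further components below the hunk cutoff.

from typing import Any, Dict

class FakeManager:
    # Made-up stand-in for ClusterManager, with just the attributes the hunk uses.
    project_name = "release"
    project_id = "prj_1234abcd"
    test_name = "tune_small"

    def set_cluster_env(self, cluster_env: Dict[str, Any]):
        self.cluster_env = cluster_env
        # Add flags for redisless Ray (as in the diff above)
        self.cluster_env.setdefault("env_vars", {})
        self.cluster_env["env_vars"]["MATCH_AUTOSCALER_AND_RAY_IMAGES"] = "1"
        self.cluster_env["env_vars"]["RAY_bootstrap_with_gcs"] = "1"
        self.cluster_env["env_vars"]["RAY_gcs_storage"] = "memory"
        self.cluster_env_name = (
            f"{self.project_name}_{self.project_id[4:8]}"
            f"__env__{self.test_name}__"
        )

mgr = FakeManager()
mgr.set_cluster_env({})
print(mgr.cluster_env["env_vars"])  # the three redisless-Ray flags are always set
print(mgr.cluster_env_name)         # release_1234__env__tune_small__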
@@ -214,6 +214,8 @@ def run_release_test(
         command = f"{command} --smoke-test"
+        command_env["IS_SMOKE_TEST"] = "1"

+    is_long_running = test["run"].get("long_running", False)

     try:
         command_runner.run_command(
             command, env=command_env, timeout=command_timeout
@@ -221,7 +223,9 @@ def run_release_test(
     except CommandError as e:
         raise TestCommandError(e)
     except CommandTimeout as e:
-        raise TestCommandTimeout(e)
+        if not is_long_running:
+            # Only raise error if command is not long running
+            raise TestCommandTimeout(e)

     try:
         command_results = command_runner.fetch_results()
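Taken together, the two hunks above implement a pattern worth spelling out: a command timeout is swallowed for long-running tests, and the subsequent result fetch decides whether the run was healthy. A standalone sketch, with stand-in exception classes replacing the ray_release ones:

class CommandTimeout(Exception):
    pass

class TestCommandTimeout(Exception):
    pass

def run(test, run_command, fetch_results):
    is_long_running = test["run"].get("long_running", False)
    try:
        run_command()
    except CommandTimeout as e:
        if not is_long_running:
            # Only raise error if command is not long running
            raise TestCommandTimeout(e)
    # Long-running tests fall through: the timeout is expected, and the
    # fetched results decide whether the run was healthy.
    return fetch_results()

def timed_out_command():
    raise CommandTimeout("timed out")

# A long-running test survives the timeout and still reports results:
print(run({"run": {"long_running": True}}, timed_out_command,
          lambda: {"time_taken": 50}))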
@@ -20,6 +20,9 @@ from ray_release.logger import logger
 from ray_release.wheels import find_and_wait_for_ray_wheels_url

+
+PIPELINE_ARTIFACT_PATH = "/tmp/pipeline_artifacts"
+

 @click.command()
 @click.option(
     "--test-collection-file",
@@ -118,6 +121,19 @@ def main(test_collection_file: Optional[str] = None):
         group_step = {"group": group, "steps": group_steps}
         steps.append(group_step)

+    if "BUILDKITE" in os.environ:
+        if os.path.exists(PIPELINE_ARTIFACT_PATH):
+            shutil.rmtree(PIPELINE_ARTIFACT_PATH)
+
+        os.makedirs(PIPELINE_ARTIFACT_PATH, exist_ok=True, mode=0o755)
+
+        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "pipeline.json"), "wt") as fp:
+            json.dump(steps, fp)
+
+        settings["frequency"] = settings["frequency"].value
+        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "settings.json"), "wt") as fp:
+            json.dump(settings, fp)
+
     steps_str = json.dumps(steps)
     print(steps_str)
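For local debugging it can be handy to read those artifacts back. A minimal sketch, using the path and file names from the hunk above:

import json
import os

PIPELINE_ARTIFACT_PATH = "/tmp/pipeline_artifacts"

# Print whichever pipeline artifacts have been written so far.
for name in ("pipeline.json", "settings.json"):
    path = os.path.join(PIPELINE_ARTIFACT_PATH, name)
    if os.path.exists(path):
        with open(path, "rt") as fp:
            print(name, json.load(fp))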
@@ -6,6 +6,17 @@ import click
 import yaml


+class FormatDumper(yaml.SafeDumper):
+    last_indent = 0
+
+    def write_line_break(self, data=None):
+        if (self.indent or 0) < self.last_indent:
+            super().write_line_break()
+
+        super().write_line_break(data)
+        self.last_indent = self.indent or 0
+
+
 def replace_prepare(dt: Dict):
     if "prepare" in dt and "wait_cluster" in dt["prepare"]:
         _, _, nodes, timeout = dt.pop("prepare").split(" ")
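FormatDumper emits one extra line break whenever the emitter's indentation decreases, which should insert a blank line between top-level list items in the dumped YAML. A minimal usage sketch (the class body is copied from the hunk above; the expected output is an assumption about PyYAML's emitter behavior):

import sys
import yaml

class FormatDumper(yaml.SafeDumper):
    last_indent = 0

    def write_line_break(self, data=None):
        if (self.indent or 0) < self.last_indent:
            super().write_line_break()
        super().write_line_break(data)
        self.last_indent = self.indent or 0

tests = [
    {"name": "tune_small", "frequency": "nightly"},
    {"name": "train_moderate", "frequency": "weekly"},
]
# Expected to print the two list items separated by a blank line.
yaml.dump(tests, sys.stdout, Dumper=FormatDumper, sort_keys=False)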
@@ -42,6 +53,11 @@ def main(legacy_config: str, prefix: str, group: str, alert: str):
         "cluster_compute": old["cluster"]["compute_template"],
     }

+    if "cloud_id" in old["cluster"]:
+        test["cluster"]["cloud_id"] = old["cluster"]["cloud_id"]
+    if "cloud_name" in old["cluster"]:
+        test["cluster"]["cloud_name"] = old["cluster"]["cloud_name"]
+
     if "driver_setup" in old:
         test["driver_setup"] = "driver_setup"
@@ -65,7 +81,7 @@ def main(legacy_config: str, prefix: str, group: str, alert: str):

         tests.append(test)

-    yaml.dump(tests, sys.stdout, sort_keys=False)
+    yaml.dump(tests, sys.stdout, Dumper=FormatDumper, sort_keys=False)
     sys.stdout.flush()
@@ -124,6 +124,7 @@ def main(
         result=result,
         ray_wheels_url=ray_wheels_url,
         reporters=reporters,
+        smoke_test=smoke_test,
         cluster_id=cluster_id,
         cluster_env_id=cluster_env_id,
         no_terminate=no_terminate,
release/ray_release/tests/_test_catch_args.py (new file, +14 lines)
@@ -0,0 +1,14 @@
+import json
+import sys
+
+
+def main():
+    argv_file = sys.argv[1]
+    with open(argv_file, "wt") as fp:
+        json.dump(sys.argv, fp)
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
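The helper dumps its own sys.argv to the file named by its first argument; the testParameters test further down uses it to check that extra CLI flags are forwarded by the run script. A hypothetical direct invocation:

import json
import subprocess
import sys
import tempfile

argv_file = tempfile.mktemp()
# Run the helper directly; it writes its argv to argv_file and exits 0.
subprocess.check_call(
    [sys.executable, "release/ray_release/tests/_test_catch_args.py",
     argv_file, "--smoke-test"]
)
with open(argv_file, "rt") as fp:
    print(json.load(fp))  # ends with the argv_file path and "--smoke-test"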
@@ -11,6 +11,7 @@ from ray_release.buildkite.settings import (
     Frequency,
     update_settings_from_buildkite,
 )
+from ray_release.buildkite.step import get_step
 from ray_release.config import Test
 from ray_release.exception import ReleaseTestConfigError
 from ray_release.wheels import (
@@ -221,3 +222,19 @@ class BuildkiteSettingsTest(unittest.TestCase):
             [t["name"] for t, _ in grouped["x"]], ["x1", "x2", "x3"]
         )
         self.assertEqual(len(grouped["y"]), 1)
+
+    def testGetStep(self):
+        test = Test(
+            {
+                "name": "test",
+                "frequency": "nightly",
+                "run": {"script": "test_script.py"},
+                "smoke_test": {"frequency": "multi"},
+            }
+        )
+
+        step = get_step(test, smoke_test=False)
+        self.assertNotIn("--smoke-test", step["command"])
+
+        step = get_step(test, smoke_test=True)
+        self.assertIn("--smoke-test", step["command"])
@@ -1,6 +1,7 @@
 import os
 import shutil
 import tempfile
+import time
 import unittest
 from typing import Type, Callable
 from unittest.mock import patch
@@ -213,7 +214,10 @@ class GlueTest(unittest.TestCase):
         if until == "test_command":
             return

-        self.command_runner_return["fetch_results"] = {"time_taken": 50}
+        self.command_runner_return["fetch_results"] = {
+            "time_taken": 50,
+            "last_update": time.time() - 60,
+        }

         if until == "fetch_results":
             return
@@ -495,6 +499,26 @@ class GlueTest(unittest.TestCase):
         # Ensure cluster was terminated
         self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)

+    def testTestCommandTimeoutLongRunning(self):
+        result = Result()
+
+        self._succeed_until("fetch_results")
+
+        # Test command times out
+        self.command_runner_return["run_command"] = _fail_on_call(CommandTimeout)
+        with self.assertRaises(TestCommandTimeout):
+            self._run(result)
+        self.assertEqual(result.return_code, ExitCode.COMMAND_TIMEOUT.value)
+
+        # But now set test to long running
+        self.test["run"]["long_running"] = True
+        self._run(result)  # Will not fail this time
+
+        self.assertGreaterEqual(result.results["last_update_diff"], 60.0)
+
+        # Ensure cluster was terminated
+        self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
+
     def testFetchResultFails(self):
         result = Result()
@@ -1,3 +1,4 @@
+import json
 import os
 import shutil
 import subprocess
@@ -7,7 +8,7 @@ import unittest
 from ray_release.result import ExitCode


-class WheelsFinderTest(unittest.TestCase):
+class RunScriptTest(unittest.TestCase):
     def setUp(self) -> None:
         self.tempdir = tempfile.mkdtemp()
         self.state_file = os.path.join(self.tempdir, "state.txt")
@@ -18,9 +19,7 @@ class WheelsFinderTest(unittest.TestCase):
         os.environ["NO_INSTALL"] = "1"
         os.environ["NO_CLONE"] = "1"
         os.environ["NO_ARTIFACTS"] = "1"
-        os.environ["RAY_TEST_SCRIPT"] = (
-            "ray_release/tests/" "_test_run_release_test_sh.py"
-        )
+        os.environ["RAY_TEST_SCRIPT"] = "ray_release/tests/_test_run_release_test_sh.py"
         os.environ["OVERRIDE_SLEEP_TIME"] = "0"

     def tearDown(self) -> None:
@@ -86,3 +85,19 @@ class WheelsFinderTest(unittest.TestCase):
             ExitCode.COMMAND_ALERT.value,
         )
         self.assertEquals(self._read_state(), 2)
+
+    def testParameters(self):
+        os.environ["RAY_TEST_SCRIPT"] = "ray_release/tests/_test_catch_args.py"
+        argv_file = tempfile.mktemp()
+
+        subprocess.check_call(
+            f"{self.test_script} " f"{argv_file} " f"--smoke-test",
+            shell=True,
+        )
+
+        with open(argv_file, "rt") as fp:
+            data = json.load(fp)
+
+        os.unlink(argv_file)
+
+        self.assertIn("--smoke-test", data)
@@ -17,7 +17,7 @@
 #
 # # How often to run the tests.
 # # One of [disabled, any, multi, nightly, weekly].
-# frequency: weekly
+# frequency: disabled # weekly
 # # Owning team. This field will be persisted to the database
 # team: ml
 #

@@ -49,7 +49,7 @@
 #
 # # File manager to use to transfer files to and from the cluster.
 # # Can be any of [sdk, client, job].
-# file_manager: job
+# file_manager: sdk
 #
 # # If you want to wait for nodes to be ready, you can specify this here:
 # wait_for_nodes:

@@ -77,7 +77,7 @@
 # smoke_test:
 #   # Smoke tests can have different frequencies. A smoke test is only triggered
 #   # when the regular test is not matched.
-#   frequency: nightly
+#   frequency: disabled # nightly
 #   # Here we adjust the run timeout down and run on less nodes. The test script
 #   # remains the same.
 #   run:
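The comments describe the scheduling contract: a test runs when its frequency matches the pipeline frequency, and its smoke test is only considered when the regular test does not match. A minimal sketch of that logic, assumed for illustration (the real implementation lives in ray_release.buildkite.settings and may match frequencies less literally):

VALID_FREQUENCIES = ("disabled", "any", "multi", "nightly", "weekly")

def should_schedule(test, pipeline_frequency):
    """Return (run_test, as_smoke_test) for one pipeline frequency (assumed logic)."""
    assert pipeline_frequency in VALID_FREQUENCIES
    if test.get("frequency") == pipeline_frequency:
        return True, False
    # A smoke test is only triggered when the regular test is not matched.
    smoke = test.get("smoke_test", {})
    if smoke.get("frequency") == pipeline_frequency:
        return True, True
    return False, False

print(should_schedule(
    {"frequency": "weekly", "smoke_test": {"frequency": "nightly"}}, "nightly"
))  # (True, True): the smoke test runs in the nightly pipeline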
@@ -130,7 +130,7 @@
     test_name: train_moderate
     test_suite: xgboost_tests

-  frequency: nightly
+  frequency: disabled # nightly
   team: ml

   cluster:

@@ -146,7 +146,7 @@
       timeout: 600

     type: sdk_command
-    file_manager: job
+    file_manager: sdk

   alert: xgboost_tests

@@ -158,7 +158,7 @@
     test_name: train_gpu
     test_suite: xgboost_tests

-  frequency: nightly
+  frequency: disabled # nightly
   team: ml

   cluster:

@@ -174,7 +174,7 @@
       timeout: 600

     type: sdk_command
-    file_manager: job
+    file_manager: sdk

   alert: xgboost_tests

@@ -186,7 +186,7 @@
     test_name: distributed_api_test
     test_suite: xgboost_tests

-  frequency: nightly
+  frequency: disabled # nightly
   team: ml

   cluster:

@@ -201,7 +201,7 @@
       timeout: 600

     type: sdk_command
-    file_manager: job
+    file_manager: sdk

   alert: xgboost_tests

@@ -213,7 +213,7 @@
     test_name: ft_small_elastic
     test_suite: xgboost_tests

-  frequency: nightly
+  frequency: disabled # nightly
   team: ml

   cluster:

@@ -229,7 +229,7 @@
       timeout: 600

     type: sdk_command
-    file_manager: job
+    file_manager: sdk

   alert: xgboost_tests

@@ -241,7 +241,7 @@
     test_name: ft_small_non_elastic
     test_suite: xgboost_tests

-  frequency: nightly
+  frequency: disabled # nightly
   team: ml

   cluster:

@@ -257,7 +257,7 @@
       timeout: 600

     type: sdk_command
-    file_manager: job
+    file_manager: sdk

   alert: xgboost_tests

@@ -269,7 +269,7 @@
     test_name: tune_small
     test_suite: xgboost_tests

-  frequency: nightly
+  frequency: disabled # nightly
   team: ml

   cluster:

@@ -285,7 +285,7 @@
       timeout: 600

     type: sdk_command
-    file_manager: job
+    file_manager: sdk

   alert: xgboost_tests

@@ -297,7 +297,7 @@
     test_name: tune_32x4
     test_suite: xgboost_tests

-  frequency: nightly
+  frequency: disabled # nightly
   team: ml

   cluster:

@@ -313,7 +313,7 @@
       timeout: 600

     type: sdk_command
-    file_manager: job
+    file_manager: sdk

   alert: xgboost_tests

@@ -325,7 +325,7 @@
     test_name: tune_4x32
     test_suite: xgboost_tests

-  frequency: nightly
+  frequency: disabled # nightly
   team: ml

   cluster:
@@ -341,6 +341,423 @@
       timeout: 600

     type: sdk_command
-    file_manager: job
+    file_manager: sdk

   alert: xgboost_tests
+
+#######################
+# Tune cloud tests
+#######################
+- name: tune_cloud_aws_no_sync_down
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: aws_no_sync_down
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py no_sync_down
+
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_ssh_sync
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: aws_ssh_sync
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py ssh_sync
+
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_durable_upload
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: aws_durable_upload
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload
+
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_durable_upload_rllib_str
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: aws_durable_upload_rllib_str
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config_ml.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+
+    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str
+      --bucket s3://data-test-ilr/durable_upload_rllib_str
+
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_aws_durable_upload_rllib_trainer
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: aws_durable_upload_rllib_trainer
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config_ml.yaml
+    cluster_compute: tpl_aws_4x2.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer
+      --bucket s3://data-test-ilr/durable_upload_rllib_trainer
+
+    wait_for_nodes:
+      num_nodes: 4
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_cloud_gcp_k8s_no_sync_down
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: gcp_k8s_no_sync_down
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_gcp_k8s_4x8.yaml
+    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8
+    type: client
+
+  alert: tune_tests
+
+- name: tune_cloud_gcp_k8s_ssh_sync
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: gcp_k8s_ssh_sync
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_gcp_k8s_4x8.yaml
+    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8
+    type: client
+
+  alert: tune_tests
+
+- name: tune_cloud_gcp_k8s_durable_upload
+  group: Tune cloud tests
+  working_dir: tune_tests/cloud_tests
+
+  legacy:
+    test_name: gcp_k8s_durable_upload
+    test_suite: tune_cloud_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_gcp_k8s_4x8.yaml
+    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
+
+  run:
+    timeout: 600
+    script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
+    type: client
+
+  alert: tune_tests
+
+
+########################
+# Tune scalability tests
+########################
+
+- name: tune_scalability_bookkeeping_overhead
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: bookkeeping_overhead
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_1x16.yaml
+
+  run:
+    timeout: 1200
+    script: python workloads/test_bookkeeping_overhead.py
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_durable_trainable
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: durable_trainable
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_16x2.yaml
+
+  run:
+    timeout: 900
+    script: python workloads/test_durable_trainable.py --bucket data-test-ilr
+    wait_for_nodes:
+      num_nodes: 16
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_long_running_large_checkpoints
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: long_running_large_checkpoints
+    test_suite: tune_tests
+
+  frequency: disabled # weekly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_1x32_hd.yaml
+
+  run:
+    timeout: 86400
+    script: python workloads/test_long_running_large_checkpoints.py
+    long_running: true
+    type: sdk_command
+    file_manager: sdk
+
+  smoke_test:
+    frequency: disabled # nightly
+
+    run:
+      timeout: 3600
+
+  alert: tune_tests
+
+- name: tune_scalability_network_overhead
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+  legacy:
+    test_name: network_overhead
+    test_suite: tune_tests
+
+  frequency: disabled # weekly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_100x2.yaml
+
+  run:
+    timeout: 900
+    prepare_timeout: 1200
+    script: python workloads/test_network_overhead.py
+    wait_for_nodes:
+      num_nodes: 100
+      timeout: 1200
+
+    type: sdk_command
+    file_manager: sdk
+
+  smoke_test:
+    frequency: disabled # nightly
+
+    cluster:
+      compute_template: tpl_20x2.yaml
+
+    run:
+      timeout: 400
+      prepare_timeout: 600
+      wait_for_nodes:
+        num_nodes: 20
+        timeout: 600
+
+  alert: tune_tests
+
+- name: tune_scalability_result_throughput_cluster
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: result_throughput_cluster
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_16x64.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/test_result_throughput_cluster.py
+
+    wait_for_nodes:
+      num_nodes: 16
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_result_throughput_single_node
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: result_throughput_single_node
+    test_suite: tune_tests
+
+  frequency: disabled # nightly
+  team: ml
+
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: tpl_1x96.yaml
+
+  run:
+    timeout: 600
+    script: python workloads/test_result_throughput_single_node.py
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
+
+- name: tune_scalability_xgboost_sweep
+  group: Tune scalability tests
+  working_dir: tune_tests/scalability_tests
+
+  legacy:
+    test_name: xgboost_sweep
+    test_suite: tune_tests
+
+  frequency: disabled # weekly
+  team: ml
+
+  cluster:
+    cluster_env: app_config_data.yaml
+    cluster_compute: tpl_16x64.yaml
+
+  run:
+    timeout: 3600
+    script: python workloads/test_xgboost_sweep.py
+
+    wait_for_nodes:
+      num_nodes: 16
+      timeout: 600
+
+    type: sdk_command
+    file_manager: sdk
+
+  alert: tune_tests
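Since every migrated test above now carries an explicit frequency, a quick sanity check can catch typos when editing the collection. A minimal sketch; the collection file name is assumed for illustration, and the valid values come from the comment block earlier in this file:

import yaml

VALID_FREQUENCIES = {"disabled", "any", "multi", "nightly", "weekly"}

with open("release/release_tests.yaml", "rt") as fp:  # assumed file name
    tests = yaml.safe_load(fp)

for test in tests:
    freq = test.get("frequency")
    assert freq in VALID_FREQUENCIES, f"{test['name']}: bad frequency {freq!r}"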
@@ -26,28 +26,6 @@ reason() {
   echo "${REASON}"
 }

-while [[ $# -gt 0 ]]
-do
-key="$1"
-case $key in
-    --ray-test-repo)
-    shift
-    RAY_TEST_REPO=$1
-    ;;
-    --ray-test-branch)
-    shift
-    RAY_TEST_BRANCH=$1
-    ;;
-    --release-results-dir)
-    shift
-    RELEASE_RESULTS_DIR=$1
-    ;;
-    *)
-    break
-esac
-shift
-done
-
 RAY_TEST_SCRIPT=${RAY_TEST_SCRIPT-ray_release/scripts/run_release_test.py}
 RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
 RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
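With the inline argument parsing removed, the wrapper script is configured entirely through the environment defaults shown above, and remaining CLI arguments pass through to the Python entry point. A hedged invocation sketch from Python; the shell script name and the test name are assumptions based on the test helper file names:

import os
import subprocess

env = os.environ.copy()
env.update({
    # Defaults shown in the hunk above; override as needed.
    "RAY_TEST_REPO": "https://github.com/ray-project/ray.git",
    "RAY_TEST_BRANCH": "master",
    "RAY_TEST_SCRIPT": "ray_release/scripts/run_release_test.py",
})
# Remaining arguments (e.g. --smoke-test) are forwarded to the entry point;
# "my_test" is a hypothetical test name.
subprocess.check_call(["./run_release_test.sh", "my_test", "--smoke-test"], env=env)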