mirror of
https://github.com/vale981/ray
synced 2025-03-09 04:46:38 -04:00
308 lines
9 KiB
Python
308 lines
9 KiB
Python
![]() |
import copy
|
||
|
import logging
|
||
|
import os
|
||
|
import sys
|
||
|
|
||
|
import yaml
|
||
|
|
||
|
# Env variables:

# RAY_REPO            Repo to use for finding the wheel
# RAY_BRANCH          Branch to find the wheel
# RAY_TEST_REPO       Repo to use for test scripts
# RAY_TEST_BRANCH     Branch for test scripts
# FILTER_FILE         File filter
# FILTER_TEST         Test name filter
# RELEASE_TEST_SUITE  Release test suite (core-nightly, nightly, weekly,
#                     manual; default: nightly)
# RELEASE_ALERT       Set to "1" or "stats" to emit the alert step instead
#                     of a test suite
|
||
|
|
||
|
|
||
|
class ReleaseTest:
    """A release test entry identified by its name.

    Besides the name, an entry records whether it should run as a smoke
    test and how many times it may be retried. For ``str()``, ``repr()``,
    ``in``, iteration and ``len()`` an instance defers to its name, so it
    can be used in places that otherwise handle plain test-name strings.
    """

    def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):
        self.name, self.smoke_test, self.retry = name, smoke_test, retry

    def __str__(self):
        # The textual form is just the test name.
        return self.name

    def __repr__(self):
        # Deliberately the bare name as well (not a constructor-style repr).
        return self.name

    def __contains__(self, item):
        # Substring check against the test name.
        return item in self.name

    def __iter__(self):
        # Iterates over the characters of the test name.
        return iter(self.name)

    def __len__(self):
        # Length of the test name.
        return len(self.name)
|
||
|
|
||
|
|
||
|
class SmokeTest(ReleaseTest):
    """A ``ReleaseTest`` whose ``smoke_test`` flag is always ``True``."""

    def __init__(self, name: str, retry: int = 0):
        super().__init__(name=name, retry=retry, smoke_test=True)
|
||
|
|
||
|
|
||
|
# Tests of the "core-nightly" suite, keyed by test config file. Each value
# lists the tests to run from that file: plain names run in full, SmokeTest
# entries run with the smoke-test flag.
CORE_NIGHTLY_TESTS = {
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "shuffle_10gb",
        "shuffle_50gb",
        "shuffle_50gb_large_partition",
        "shuffle_100gb",
        "non_streaming_shuffle_100gb",
        "non_streaming_shuffle_50gb_large_partition",
        "non_streaming_shuffle_50gb",
        "dask_on_ray_10gb_sort",
        "dask_on_ray_100gb_sort",
        "dask_on_ray_large_scale_test_no_spilling",
        "dask_on_ray_large_scale_test_spilling",
        "stress_test_placement_group",
        "shuffle_1tb_1000_partition",
        "non_streaming_shuffle_1tb_1000_partition",
        "shuffle_1tb_5000_partitions",
        "non_streaming_shuffle_1tb_5000_partitions",
        "decision_tree_autoscaling",
        "autoscaling_shuffle_1tb_1000_partitions",
        SmokeTest("stress_test_many_tasks"),
        SmokeTest("stress_test_dead_actors"),
    ],
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "single_node",
        "object_store",
    ],
}
|
||
|
|
||
|
# Tests of the "nightly" suite, keyed by test config file. Plain names run
# in full; SmokeTest entries run with the smoke-test flag.
NIGHTLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     SmokeTest("horovod_test"),
    # ],  # Should we enable this?
    "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
        "dask_xgboost_test",
        "modin_xgboost_test",
        "torch_tune_serve_test",
    ],
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("actor_deaths"),
        SmokeTest("apex"),
        SmokeTest("impala"),
        SmokeTest("many_actor_tasks"),
        SmokeTest("many_drivers"),
        SmokeTest("many_ppo"),
        SmokeTest("many_tasks"),
        SmokeTest("many_tasks_serialized_ids"),
        SmokeTest("node_failures"),
        SmokeTest("pbt"),
        # SmokeTest("serve"),
        # SmokeTest("serve_failure"),
    ],
    "~/ray/release/microbenchmark/microbenchmark.yaml": [
        "microbenchmark",
    ],
    "~/ray/release/sgd_tests/sgd_tests.yaml": [
        "sgd_gpu",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "bookkeeping_overhead",
        "durable_trainable",
        SmokeTest("long_running_large_checkpoints"),
        SmokeTest("network_overhead"),
        "result_throughput_cluster",
        "result_throughput_single_node",
        "xgboost_sweep",
    ],
    "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
        "train_small",
        "train_moderate",
        "train_gpu",
        "tune_small",
        "tune_4x32",
        "tune_32x4",
        "ft_small_elastic",
        "ft_small_non_elastic",
        "distributed_api_test",
    ],
}
|
||
|
|
||
|
# Tests of the "weekly" suite, keyed by test config file. All entries are
# plain test names, i.e. none of them is marked as a smoke test.
WEEKLY_TESTS = {
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "distributed",
    ],
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "stress_test_many_tasks",
        "stress_test_dead_actors",
    ],
    "~/ray/release/horovod_tests/horovod_tests.yaml": [
        "horovod_test",
    ],
    # Key is split over two literals only to keep the line short.
    "~/ray/release/long_running_distributed_tests"
    "/long_running_distributed.yaml": [
        "pytorch_pbt_failure",
    ],
    # Full long running tests (1 day runtime)
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        "actor_deaths",
        "apex",
        "impala",
        "many_actor_tasks",
        "many_drivers",
        "many_ppo",
        "many_tasks",
        "many_tasks_serialized_ids",
        "node_failures",
        "pbt",
        # "serve",
        # "serve_failure",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "network_overhead",
        "long_running_large_checkpoints",
    ],
}
|
||
|
|
||
|
# Tests of the "manual" suite, keyed by test config file. Plain names run
# in full; SmokeTest entries run with the smoke-test flag.
MANUAL_TESTS = {
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        "learning_tests",
        "example_scripts_on_gpu_tests",
        "stress_tests",
    ],
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("serve"),
        SmokeTest("serve_failure"),
    ],
}
|
||
|
|
||
|
# Maps a suite name (as given in RELEASE_TEST_SUITE) to its test
# specification.
SUITES = {
    "core-nightly": CORE_NIGHTLY_TESTS,
    "nightly": NIGHTLY_TESTS,
    "weekly": WEEKLY_TESTS,
    "manual": MANUAL_TESTS,
}
|
||
|
|
||
|
# Base configuration shared by every generated pipeline step. It is
# deep-copied per step; "commands" starts empty and is filled in by the
# function that builds the step.
DEFAULT_STEP_TEMPLATE = {
    # Environment variables set for the step.
    "env": {
        "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
        "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
        "RELEASE_AWS_BUCKET": "ray-release-automation-results",
        "RELEASE_AWS_LOCATION": "dev",
        "RELEASE_AWS_DB_NAME": "ray_ci",
        "RELEASE_AWS_DB_TABLE": "release_test_result",
        "AWS_REGION": "us-west-2",
    },
    "agents": {
        "queue": "runner_queue_branch",
    },
    "plugins": [{
        "docker#v3.8.0": {
            "image": "rayproject/ray",
            "propagate-environment": True,
        },
    }],
    "commands": [],
}
|
||
|
|
||
|
|
||
|
def build_pipeline(steps):
    """Build one pipeline step configuration per selected release test.

    Args:
        steps: Mapping of test config file path to a list of tests. Each
            test is either a plain test name string or a ``ReleaseTest``
            instance.

    Returns:
        A list of step dicts. Each one is a deep copy of
        ``DEFAULT_STEP_TEMPLATE`` with ``commands`` and ``label`` filled
        in, plus a ``retry`` section when the test requests retries.

    Environment variables read:
        RAY_BRANCH (default ``master``), RAY_REPO, RAY_TEST_BRANCH
        (defaults to RAY_BRANCH), RAY_TEST_REPO (defaults to RAY_REPO),
        and the substring filters FILTER_FILE / FILTER_TEST (an empty
        value disables the respective filter).
    """
    all_steps = []

    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO",
                              "https://github.com/ray-project/ray.git")

    # The test scripts default to the same repo/branch as the wheel.
    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    logging.info(
        f"Building pipeline \n"
        f"Ray repo/branch to test:\n"
        f" RAY_REPO = {RAY_REPO}\n"
        f" RAY_BRANCH = {RAY_BRANCH}\n\n"
        # The newline after this heading was missing, which glued the
        # RAY_TEST_REPO line onto the heading in the log output.
        f"Ray repo/branch containing the test configurations and scripts:\n"
        f" RAY_TEST_REPO = {RAY_TEST_REPO}\n"
        f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"
        f"Filtering for these tests:\n"
        f" FILTER_FILE = {FILTER_FILE}\n"
        f" FILTER_TEST = {FILTER_TEST}\n\n")

    for test_file, test_names in steps.items():
        # Substring match on the config file path.
        if FILTER_FILE and FILTER_FILE not in test_file:
            continue

        test_base = os.path.basename(test_file)
        for test_name in test_names:
            # Substring match on the test name; works for both plain
            # strings and ReleaseTest (via ReleaseTest.__contains__).
            if FILTER_TEST and FILTER_TEST not in test_name:
                continue

            # Normalize plain names so the attributes below always exist.
            if not isinstance(test_name, ReleaseTest):
                test_name = ReleaseTest(name=test_name)

            logging.info(f"Adding test: {test_base}/{test_name}")

            cmd = (f"python release/e2e.py "
                   f"--ray-branch {RAY_BRANCH} "
                   f"--category {RAY_BRANCH} "
                   f"--test-config {test_file} "
                   f"--test-name {test_name}")

            if test_name.smoke_test:
                logging.info("This test will run as a smoke test.")
                cmd += " --smoke-test"

            # Deep copy so per-step changes never leak into the template.
            step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

            if test_name.retry:
                logging.info(f"This test will be retried up to "
                             f"{test_name.retry} times.")
                step_conf["retry"] = {
                    "automatic": [{
                        "exit_status": "*",
                        "limit": test_name.retry
                    }]
                }

            step_conf["commands"] = [
                "pip install -q -r release/requirements.txt",
                "pip install -U boto3 botocore",
                f"git clone -b {RAY_TEST_BRANCH} {RAY_TEST_REPO} ~/ray",
                cmd,
            ]

            step_conf["label"] = f"{test_name} ({RAY_BRANCH}) - " \
                                 f"{RAY_TEST_BRANCH}/{test_base}"
            all_steps.append(step_conf)

    return all_steps
|
||
|
|
||
|
|
||
|
def alert_pipeline(stats: bool = False):
    """Build the single-step pipeline that runs the alert script.

    Args:
        stats: When truthy, ``--stats`` is appended to the alert command.

    Returns:
        A one-element list holding a deep copy of
        ``DEFAULT_STEP_TEMPLATE`` with ``commands`` and ``label`` set.
    """
    alert_cmd = "python release/alert.py"
    alert_cmd = alert_cmd + " --stats" if stats else alert_cmd

    # Work on a private copy so the shared template stays untouched.
    alert_step = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
    alert_step["commands"] = [
        "pip install -q -r release/requirements.txt",
        "pip install -U boto3 botocore",
        alert_cmd,
    ]
    alert_step["label"] = f"Send periodic alert (stats_only = {stats})"

    return [alert_step]
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # RELEASE_ALERT selects between the alert step ("1" or "stats") and a
    # regular test suite (any other value, default "0").
    alert = os.environ.get("RELEASE_ALERT", "0")

    if alert not in ("1", "stats"):
        # Regular run: look up the requested suite and build its steps.
        TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
        PIPELINE_SPEC = SUITES[TEST_SUITE]

        steps = build_pipeline(PIPELINE_SPEC)
    else:
        steps = alert_pipeline(alert == "stats")

    # The resulting pipeline definition is written to stdout as YAML.
    yaml.dump({"steps": steps}, sys.stdout)
|