[release] move release testing end to end script to main ray repo (#17070)
This commit is contained in:
parent
92f19170ab
commit
ed131f87da
12 changed files with 2823 additions and 0 deletions
307 release/.buildkite/build_pipeline.py Normal file
@@ -0,0 +1,307 @@
import copy
import logging
import os
import sys

import yaml

# Env variables:

# RAY_REPO            Repo to use for finding the wheel
# RAY_BRANCH          Branch to find the wheel
# RAY_TEST_REPO       Repo to use for test scripts
# RAY_TEST_BRANCH     Branch for test scripts
# FILTER_FILE         File filter
# FILTER_TEST         Test name filter
# RELEASE_TEST_SUITE  Release test suite (e.g. manual, nightly)


class ReleaseTest:
    def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):
        self.name = name
        self.smoke_test = smoke_test
        self.retry = retry

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def __contains__(self, item):
        return self.name.__contains__(item)

    def __iter__(self):
        return iter(self.name)

    def __len__(self):
        return len(self.name)


class SmokeTest(ReleaseTest):
    def __init__(self, name: str, retry: int = 0):
        super(SmokeTest, self).__init__(
            name=name, smoke_test=True, retry=retry)


CORE_NIGHTLY_TESTS = {
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "shuffle_10gb",
        "shuffle_50gb",
        "shuffle_50gb_large_partition",
        "shuffle_100gb",
        "non_streaming_shuffle_100gb",
        "non_streaming_shuffle_50gb_large_partition",
        "non_streaming_shuffle_50gb",
        "dask_on_ray_10gb_sort",
        "dask_on_ray_100gb_sort",
        "dask_on_ray_large_scale_test_no_spilling",
        "dask_on_ray_large_scale_test_spilling",
        "stress_test_placement_group",
        "shuffle_1tb_1000_partition",
        "non_streaming_shuffle_1tb_1000_partition",
        "shuffle_1tb_5000_partitions",
        "non_streaming_shuffle_1tb_5000_partitions",
        "decision_tree_autoscaling",
        "autoscaling_shuffle_1tb_1000_partitions",
        SmokeTest("stress_test_many_tasks"),
        SmokeTest("stress_test_dead_actors"),
    ],
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "single_node",
        "object_store",
    ],
}

NIGHTLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     SmokeTest("horovod_test"),
    # ],  # Should we enable this?
    "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
        "dask_xgboost_test",
        "modin_xgboost_test",
        "torch_tune_serve_test",
    ],
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("actor_deaths"),
        SmokeTest("apex"),
        SmokeTest("impala"),
        SmokeTest("many_actor_tasks"),
        SmokeTest("many_drivers"),
        SmokeTest("many_ppo"),
        SmokeTest("many_tasks"),
        SmokeTest("many_tasks_serialized_ids"),
        SmokeTest("node_failures"),
        SmokeTest("pbt"),
        # SmokeTest("serve"),
        # SmokeTest("serve_failure"),
    ],
    "~/ray/release/microbenchmark/microbenchmark.yaml": [
        "microbenchmark",
    ],
    "~/ray/release/sgd_tests/sgd_tests.yaml": [
        "sgd_gpu",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "bookkeeping_overhead",
        "durable_trainable",
        SmokeTest("long_running_large_checkpoints"),
        SmokeTest("network_overhead"),
        "result_throughput_cluster",
        "result_throughput_single_node",
        "xgboost_sweep",
    ],
    "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
        "train_small",
        "train_moderate",
        "train_gpu",
        "tune_small",
        "tune_4x32",
        "tune_32x4",
        "ft_small_elastic",
        "ft_small_non_elastic",
        "distributed_api_test",
    ],
}

WEEKLY_TESTS = {
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "distributed",
    ],
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "stress_test_many_tasks",
        "stress_test_dead_actors",
    ],
    "~/ray/release/horovod_tests/horovod_tests.yaml": [
        "horovod_test",
    ],
    "~/ray/release/long_running_distributed_tests"
    "/long_running_distributed.yaml": [
        "pytorch_pbt_failure",
    ],
    # Full long running tests (1 day runtime)
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        "actor_deaths",
        "apex",
        "impala",
        "many_actor_tasks",
        "many_drivers",
        "many_ppo",
        "many_tasks",
        "many_tasks_serialized_ids",
        "node_failures",
        "pbt",
        # "serve",
        # "serve_failure",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "network_overhead",
        "long_running_large_checkpoints",
    ],
}

MANUAL_TESTS = {
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        "learning_tests",
        "example_scripts_on_gpu_tests",
        "stress_tests",
    ],
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("serve"),
        SmokeTest("serve_failure"),
    ]
}

SUITES = {
    "core-nightly": CORE_NIGHTLY_TESTS,
    "nightly": NIGHTLY_TESTS,
    "weekly": WEEKLY_TESTS,
    "manual": MANUAL_TESTS,
}

DEFAULT_STEP_TEMPLATE = {
    "env": {
        "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
        "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
        "RELEASE_AWS_BUCKET": "ray-release-automation-results",
        "RELEASE_AWS_LOCATION": "dev",
        "RELEASE_AWS_DB_NAME": "ray_ci",
        "RELEASE_AWS_DB_TABLE": "release_test_result",
        "AWS_REGION": "us-west-2"
    },
    "agents": {
        "queue": "runner_queue_branch"
    },
    "plugins": [{
        "docker#v3.8.0": {
            "image": "rayproject/ray",
            "propagate-environment": True
        }
    }],
    "commands": []
}


def build_pipeline(steps):
    all_steps = []

    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO",
                              "https://github.com/ray-project/ray.git")

    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    logging.info(
        f"Building pipeline \n"
        f"Ray repo/branch to test:\n"
        f"  RAY_REPO = {RAY_REPO}\n"
        f"  RAY_BRANCH = {RAY_BRANCH}\n\n"
        f"Ray repo/branch containing the test configurations and scripts:\n"
        f"  RAY_TEST_REPO = {RAY_TEST_REPO}\n"
        f"  RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"
        f"Filtering for these tests:\n"
        f"  FILTER_FILE = {FILTER_FILE}\n"
        f"  FILTER_TEST = {FILTER_TEST}\n\n")

    for test_file, test_names in steps.items():
        if FILTER_FILE and FILTER_FILE not in test_file:
            continue

        test_base = os.path.basename(test_file)
        for test_name in test_names:
            if FILTER_TEST and FILTER_TEST not in test_name:
                continue

            if not isinstance(test_name, ReleaseTest):
                test_name = ReleaseTest(name=test_name)

            logging.info(f"Adding test: {test_base}/{test_name}")

            cmd = (f"python release/e2e.py "
                   f"--ray-branch {RAY_BRANCH} "
                   f"--category {RAY_BRANCH} "
                   f"--test-config {test_file} "
                   f"--test-name {test_name}")

            if test_name.smoke_test:
                logging.info("This test will run as a smoke test.")
                cmd += " --smoke-test"

            step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

            if test_name.retry:
                logging.info(f"This test will be retried up to "
                             f"{test_name.retry} times.")
                step_conf["retry"] = {
                    "automatic": [{
                        "exit_status": "*",
                        "limit": test_name.retry
                    }]
                }

            step_conf["commands"] = [
                "pip install -q -r release/requirements.txt",
                "pip install -U boto3 botocore",
                f"git clone -b {RAY_TEST_BRANCH} {RAY_TEST_REPO} ~/ray",
                cmd,
            ]

            step_conf["label"] = f"{test_name} ({RAY_BRANCH}) - " \
                                 f"{RAY_TEST_BRANCH}/{test_base}"
            all_steps.append(step_conf)

    return all_steps


def alert_pipeline(stats: bool = False):
    step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

    cmd = "python release/alert.py"
    if stats:
        cmd += " --stats"

    step_conf["commands"] = [
        "pip install -q -r release/requirements.txt",
        "pip install -U boto3 botocore",
        cmd,
    ]
    step_conf["label"] = f"Send periodic alert (stats_only = {stats})"
    return [step_conf]


if __name__ == "__main__":
    alert = os.environ.get("RELEASE_ALERT", "0")

    if alert in ["1", "stats"]:
        steps = alert_pipeline(alert == "stats")
    else:
        TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
        PIPELINE_SPEC = SUITES[TEST_SUITE]

        steps = build_pipeline(PIPELINE_SPEC)

    yaml.dump({"steps": steps}, sys.stdout)
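The string-style dunder methods on ReleaseTest are what let the FILTER_TEST substring check treat plain strings and SmokeTest entries uniformly: `FILTER_TEST not in test_name` works whether test_name is a str or a ReleaseTest. A minimal illustrative sketch of that behavior, assuming the ReleaseTest/SmokeTest classes defined above (the test names here are picked for illustration):

tests = ["shuffle_10gb", SmokeTest("stress_test_many_tasks", retry=2)]

for test_name in tests:
    # Substring filtering works for both entry types: str uses native `in`,
    # ReleaseTest delegates to __contains__ on self.name.
    if "stress" not in test_name:
        continue
    # build_pipeline wraps plain strings in ReleaseTest before this point,
    # so attribute access is safe for the surviving entry.
    print(test_name, test_name.smoke_test, test_name.retry)
# -> stress_test_many_tasks True 2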
0 release/__init__.py Normal file
401 release/alert.py Normal file
@@ -0,0 +1,401 @@
import argparse
from collections import defaultdict, Counter
from typing import Any, List, Tuple, Mapping, Optional
import datetime
import hashlib
import json
import logging
import os
import requests
import sys

import boto3

from e2e import GLOBAL_CONFIG

from alerts.default import handle_result as default_handle_result
from alerts.rllib_tests import handle_result as rllib_tests_handle_result
from alerts.long_running_tests import handle_result as \
    long_running_tests_handle_result
from alerts.tune_tests import handle_result as tune_tests_handle_result
from alerts.xgboost_tests import handle_result as xgboost_tests_handle_result

SUITE_TO_FN = {
    "long_running_tests": long_running_tests_handle_result,
    "rllib_tests": rllib_tests_handle_result,
    "tune_tests": tune_tests_handle_result,
    "xgboost_tests": xgboost_tests_handle_result,
}

GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"] = "alert_state"
GLOBAL_CONFIG["SLACK_WEBHOOK"] = os.environ.get("SLACK_WEBHOOK", "")
GLOBAL_CONFIG["SLACK_CHANNEL"] = os.environ.get("SLACK_CHANNEL",
                                                "#oss-test-cop")

logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter(fmt="[%(levelname)s %(asctime)s] "
                                  "%(filename)s: %(lineno)d "
                                  "%(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


def maybe_fetch_slack_webhook():
    if GLOBAL_CONFIG["SLACK_WEBHOOK"] in [None, ""]:
        print("Missing SLACK_WEBHOOK, retrieving from AWS secrets store")
        GLOBAL_CONFIG["SLACK_WEBHOOK"] = boto3.client(
            "secretsmanager", region_name="us-west-2"
        ).get_secret_value(
            SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:"
                     "release-automation/"
                     "slack-webhook-Na0CFP")["SecretString"]


def _obj_hash(obj: Any) -> str:
    json_str = json.dumps(obj, sort_keys=True, ensure_ascii=True)
    sha = hashlib.sha256()
    sha.update(json_str.encode())
    return sha.hexdigest()


def fetch_latest_alerts(rds_data_client):
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]

    sql = (f"""
        SELECT DISTINCT ON (category, test_suite, test_name)
            category, test_suite, test_name, last_result_hash,
            last_notification_dt
        FROM {schema}
        ORDER BY category, test_suite, test_name, last_notification_dt DESC
        """)

    result = rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
    )
    for row in result["records"]:
        category, test_suite, test_name, last_result_hash, \
            last_notification_dt = (
                r["stringValue"]
                if "stringValue" in r else None
                for r in row
            )
        last_notification_dt = datetime.datetime.strptime(
            last_notification_dt, "%Y-%m-%d %H:%M:%S")
        yield category, test_suite, test_name, last_result_hash, \
            last_notification_dt


def fetch_latest_results(rds_data_client,
                         fetch_since: Optional[datetime.datetime] = None):
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]

    sql = (f"""
        SELECT DISTINCT ON (category, test_suite, test_name)
            created_on, category, test_suite, test_name, status, results,
            artifacts, last_logs
        FROM {schema} """)

    parameters = []
    if fetch_since is not None:
        sql += "WHERE created_on >= :created_on "
        parameters = [
            {
                "name": "created_on",
                "typeHint": "TIMESTAMP",
                "value": {
                    "stringValue": fetch_since.strftime("%Y-%m-%d %H:%M:%S")
                },
            },
        ]

    sql += "ORDER BY category, test_suite, test_name, created_on DESC"

    result = rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
        parameters=parameters,
    )
    for row in result["records"]:
        created_on, category, test_suite, test_name, status, results, \
            artifacts, last_logs = (
                r["stringValue"] if "stringValue" in r else None for r in row)

        # Calculate hash before converting strings to objects
        result_obj = (created_on, category, test_suite, test_name, status,
                      results, artifacts, last_logs)
        result_json = json.dumps(result_obj)
        result_hash = _obj_hash(result_json)

        # Convert some strings to python objects
        created_on = datetime.datetime.strptime(created_on,
                                                "%Y-%m-%d %H:%M:%S")
        results = json.loads(results)
        artifacts = json.loads(artifacts)

        yield result_hash, created_on, category, test_suite, test_name, \
            status, results, artifacts, last_logs


def mark_as_handled(rds_data_client, update: bool, category: str,
                    test_suite: str, test_name: str, result_hash: str,
                    last_notification_dt: datetime.datetime):
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]

    if not update:
        sql = (f"""
            INSERT INTO {schema}
            (category, test_suite, test_name,
             last_result_hash, last_notification_dt)
            VALUES (:category, :test_suite, :test_name,
                    :last_result_hash, :last_notification_dt)
            """)
    else:
        sql = (f"""
            UPDATE {schema}
            SET last_result_hash=:last_result_hash,
                last_notification_dt=:last_notification_dt
            WHERE category=:category AND test_suite=:test_suite
                AND test_name=:test_name
            """)

    rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        parameters=[
            {
                "name": "category",
                "value": {
                    "stringValue": category
                }
            },
            {
                "name": "test_suite",
                "value": {
                    "stringValue": test_suite or ""
                }
            },
            {
                "name": "test_name",
                "value": {
                    "stringValue": test_name
                }
            },
            {
                "name": "last_result_hash",
                "value": {
                    "stringValue": result_hash
                }
            },
            {
                "name": "last_notification_dt",
                "typeHint": "TIMESTAMP",
                "value": {
                    "stringValue": last_notification_dt.strftime(
                        "%Y-%m-%d %H:%M:%S")
                },
            },
        ],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
    )


def post_alerts_to_slack(channel: str, alerts: List[Tuple[str, str, str, str]],
                         non_alerts: Mapping[str, int]):
    if len(alerts) == 0:
        logger.info("No alerts to post to slack.")
        return

    markdown_lines = [
        f"*{len(alerts)} new release test failures found!*",
        "",
    ]

    category_alerts = defaultdict(list)
    for (category, test_suite, test_name, alert) in alerts:
        category_alerts[category].append(
            f"  *{test_suite}/{test_name}* failed: {alert}")

    for category, alert_list in category_alerts.items():
        markdown_lines.append(f"Branch: *{category}*")
        markdown_lines.extend(alert_list)
        markdown_lines.append("")

    total_non_alerts = sum(n for n in non_alerts.values())
    non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]

    markdown_lines += [
        f"Additionally, {total_non_alerts} tests passed successfully "
        f"({', '.join(non_alert_detail)})."
    ]

    slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]

    resp = requests.post(
        slack_url,
        json={
            "text": "\n".join(markdown_lines),
            "channel": channel,
            "username": "Fail Bot",
            "icon_emoji": ":red_circle:",
        },
    )
    print(resp.status_code)
    print(resp.text)


def post_statistics_to_slack(channel: str,
                             alerts: List[Tuple[str, str, str, str]],
                             non_alerts: Mapping[str, int]):
    total_alerts = len(alerts)

    category_alerts = defaultdict(list)
    for (category, test_suite, test_name, alert) in alerts:
        category_alerts[category].append(f"`{test_suite}/{test_name}`")

    alert_detail = [f"{len(a)} on {c}" for c, a in category_alerts.items()]

    total_non_alerts = sum(n for n in non_alerts.values())
    non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]

    markdown_lines = [
        "*Periodic release test report*", "", f"In the past 24 hours, "
        f"*{total_non_alerts}* release tests finished successfully, and "
        f"*{total_alerts}* release tests failed."
    ]

    markdown_lines.append("")

    if total_alerts:
        markdown_lines.append(f"*Failing:* {', '.join(alert_detail)}")
        for c, a in category_alerts.items():
            markdown_lines.append(f"  *{c}*: {', '.join(sorted(a))}")
    else:
        markdown_lines.append("*Failing:* None")

    markdown_lines.append("")

    if total_non_alerts:
        markdown_lines.append(f"*Passing:* {', '.join(non_alert_detail)}")
    else:
        markdown_lines.append("*Passing:* None")

    slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]

    resp = requests.post(
        slack_url,
        json={
            "text": "\n".join(markdown_lines),
            "channel": channel,
            "username": "Fail Bot",
            "icon_emoji": ":red_circle:",
        },
    )
    print(resp.status_code)
    print(resp.text)


def handle_results_and_get_alerts(
        rds_data_client,
        fetch_since: Optional[datetime.datetime] = None,
        always_try_alert: bool = False,
        no_status_update: bool = False):
    # First build a map of last notifications
    last_notifications_map = {}
    for category, test_suite, test_name, last_result_hash, \
            last_notification_dt in fetch_latest_alerts(rds_data_client):
        last_notifications_map[(category, test_suite,
                                test_name)] = (last_result_hash,
                                               last_notification_dt)

    alerts = []
    non_alerts = Counter()

    # Then fetch latest results
    for result_hash, created_on, category, test_suite, test_name, status, \
            results, artifacts, last_logs in fetch_latest_results(
                rds_data_client, fetch_since=fetch_since):
        key = (category, test_suite, test_name)

        try_alert = always_try_alert
        if key in last_notifications_map:
            # If we have an alert for this key, fetch info
            last_result_hash, last_notification_dt = last_notifications_map[
                key]

            if last_result_hash != result_hash:
                # If we got a new result, handle new result
                try_alert = True
            # Todo: maybe alert again after some time?
        else:
            try_alert = True

        if try_alert:
            handle_fn = SUITE_TO_FN.get(test_suite, None)
            if not handle_fn:
                logger.warning(f"No handle for suite {test_suite}")
                alert = default_handle_result(created_on, category, test_suite,
                                              test_name, status, results,
                                              artifacts, last_logs)
            else:
                alert = handle_fn(created_on, category, test_suite, test_name,
                                  status, results, artifacts, last_logs)

            if alert:
                logger.warning(
                    f"Alert raised for test {test_suite}/{test_name} "
                    f"({category}): {alert}")

                alerts.append((category, test_suite, test_name, alert))
            else:
                logger.debug(
                    f"No alert raised for test {test_suite}/{test_name} "
                    f"({category})")
                non_alerts[category] += 1

            if not no_status_update:
                mark_as_handled(rds_data_client, key in last_notifications_map,
                                category, test_suite, test_name, result_hash,
                                datetime.datetime.now())

    return alerts, non_alerts


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--stats",
        action="store_true",
        default=False,
        help="Send a periodic statistics report instead of failure alerts.")
    args = parser.parse_args()

    maybe_fetch_slack_webhook()

    rds_data_client = boto3.client("rds-data", region_name="us-west-2")

    if args.stats:
        # Only update last 24 hour stats
        fetch_since = datetime.datetime.now() - datetime.timedelta(days=1)
        alerts, non_alerts = handle_results_and_get_alerts(
            rds_data_client,
            fetch_since=fetch_since,
            always_try_alert=True,
            no_status_update=True)
        post_statistics_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts,
                                 non_alerts)

    else:
        alerts, non_alerts = handle_results_and_get_alerts(rds_data_client)
        post_alerts_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts,
                             non_alerts)
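The alert deduplication hinges on _obj_hash being deterministic: json.dumps with sort_keys=True canonicalizes the payload, so an unchanged result re-read on the next run hashes to the same value stored in alert_state and no duplicate notification fires. A small standalone sketch of that property (values invented for illustration):

import hashlib
import json


def obj_hash(obj):
    # Same approach as _obj_hash above: canonical JSON, then SHA-256.
    json_str = json.dumps(obj, sort_keys=True, ensure_ascii=True)
    return hashlib.sha256(json_str.encode()).hexdigest()


a = obj_hash({"status": "finished", "results": {"time_taken": 42}})
b = obj_hash({"results": {"time_taken": 42}, "status": "finished"})
assert a == b  # key order is irrelevant, so unchanged results dedupe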
0 release/alerts/__init__.py Normal file
13 release/alerts/default.py Normal file
@@ -0,0 +1,13 @@
import datetime

from typing import Dict, Optional


def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
                  artifacts: Dict, last_logs: str) -> Optional[str]:

    if not status == "finished":
        return f"Test script did not finish successfully ({status})."

    return None
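All alert handlers share this signature and contract: return None when the result looks healthy, or a human-readable message when an alert should fire; alert.py dispatches to suite-specific handlers via SUITE_TO_FN and falls back to this default. A hypothetical invocation (argument values invented for illustration; the import assumes the working directory is release/):

import datetime

from alerts.default import handle_result

alert = handle_result(
    created_on=datetime.datetime.now(), category="master",
    test_suite="some_suite", test_name="some_test", status="error",
    results={}, artifacts={}, last_logs="")
print(alert)  # -> Test script did not finish successfully (error).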
32 release/alerts/long_running_tests.py Normal file
@@ -0,0 +1,32 @@
import datetime

from typing import Dict, Optional


def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
                  artifacts: Dict, last_logs: str) -> Optional[str]:
    assert test_suite == "long_running_tests"

    # elapsed_time = results.get("elapsed_time", 0.)
    last_update_diff = results.get("last_update_diff", float("inf"))

    if test_name in [
            "actor_deaths", "many_actor_tasks", "many_drivers", "many_tasks",
            "many_tasks_serialized_ids", "node_failures",
            "object_spilling_shuffle", "serve", "serve_failure"
    ]:
        # Core tests
        target_update_diff = 120

    elif test_name in ["apex", "impala", "many_ppo", "pbt"]:
        # Tune/RLLib style tests
        target_update_diff = 360
    else:
        return None

    if last_update_diff > target_update_diff:
        return f"Last update to results json was too long ago " \
               f"({last_update_diff:.2f} > {target_update_diff})"

    return None
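Long-running tests report a last_update_diff, the number of seconds since the test last wrote its results file, and the handler alerts on staleness: more than 120s for core tests, more than 360s for the Tune/RLlib-style tests. A hypothetical example (payload invented for illustration; the import assumes the working directory is release/):

import datetime

from alerts.long_running_tests import handle_result

# "many_drivers" is in the core list, so the freshness threshold is 120s.
alert = handle_result(
    datetime.datetime.now(), "nightly", "long_running_tests", "many_drivers",
    "finished", {"last_update_diff": 200.0}, {}, "")
print(alert)  # -> Last update to results json was too long ago (200.00 > 120)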
14 release/alerts/rllib_tests.py Normal file
@@ -0,0 +1,14 @@
import datetime

from typing import Dict, Optional


def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
                  artifacts: Dict, last_logs: str) -> Optional[str]:
    assert test_suite == "rllib_tests"

    if not status == "finished":
        return f"Test script did not finish successfully ({status})."

    return None
60 release/alerts/tune_tests.py Normal file
@@ -0,0 +1,60 @@
import datetime

from typing import Dict, Optional


def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
                  artifacts: Dict, last_logs: str) -> Optional[str]:
    assert test_suite == "tune_tests"

    msg = ""
    success = status == "finished"
    time_taken = results.get("time_taken", float("inf"))
    num_terminated = results.get("trial_states", {}).get("TERMINATED", 0)
    was_smoke_test = results.get("smoke_test", False)

    if not success:
        if status == "timeout":
            msg += "Test timed out."
        else:
            msg += "Test script failed. "

    if test_name == "long_running_large_checkpoints":
        last_update_diff = results.get("last_update_diff", float("inf"))
        target_update_diff = 360

        if last_update_diff > target_update_diff:
            return f"Last update to results json was too long ago " \
                   f"({last_update_diff:.2f} > {target_update_diff})"
        return None

    elif test_name == "bookkeeping_overhead":
        target_terminated = 10000
        target_time = 800
    elif test_name == "durable_trainable":
        target_terminated = 16
        target_time = 600
    elif test_name == "network_overhead":
        target_terminated = 100 if not was_smoke_test else 20
        target_time = 900 if not was_smoke_test else 400
    elif test_name == "result_throughput_cluster":
        target_terminated = 1000
        target_time = 120
    elif test_name == "result_throughput_single_node":
        target_terminated = 96
        target_time = 120
    elif test_name == "xgboost_sweep":
        target_terminated = 31
        target_time = 3600
    else:
        return None

    if num_terminated < target_terminated:
        msg += f"Some trials failed " \
               f"(num_terminated={num_terminated} < {target_terminated}). "
    if time_taken > target_time:
        msg += f"Took too long to complete " \
               f"(time_taken={time_taken:.2f} > {target_time}). "

    return msg or None
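Each Tune test is judged on two axes, trial completion count and wall-clock time, and the handler accumulates a message covering every violated target rather than stopping at the first. A hypothetical run against the durable_trainable targets of 16 terminated trials within 600s (payload invented for illustration; the import assumes the working directory is release/):

import datetime

from alerts.tune_tests import handle_result

results = {"time_taken": 700.0, "trial_states": {"TERMINATED": 12}}
alert = handle_result(
    datetime.datetime.now(), "nightly", "tune_tests", "durable_trainable",
    "finished", results, {}, "")
print(alert)
# -> Some trials failed (num_terminated=12 < 16). Took too long to
#    complete (time_taken=700.00 > 600).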
58 release/alerts/xgboost_tests.py Normal file
@@ -0,0 +1,58 @@
import datetime

from typing import Dict, Optional


def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
                  artifacts: Dict, last_logs: str) -> Optional[str]:
    assert test_suite == "xgboost_tests"

    time_taken = results.get("time_taken", float("inf"))
    num_terminated = results.get("trial_states", {}).get("TERMINATED", 0)

    if test_name in [
            "distributed_api_test", "ft_small_elastic", "ft_small_non_elastic"
    ]:
        if not status == "finished":
            return f"Test script did not finish successfully ({status})."

        return None
    elif test_name.startswith("tune_"):
        msg = ""
        if test_name == "tune_small":
            target_terminated = 4
            target_time = 90
        elif test_name == "tune_4x32":
            target_terminated = 4
            target_time = 120
        elif test_name == "tune_32x4":
            target_terminated = 32
            target_time = 600
        else:
            return None

        if num_terminated < target_terminated:
            msg += f"Some trials failed " \
                   f"(num_terminated={num_terminated} < {target_terminated}). "
        if time_taken > target_time:
            msg += f"Took too long to complete " \
                   f"(time_taken={time_taken} > {target_time}). "

        return msg or None
    else:
        # train scripts
        if test_name == "train_small":
            target_time = 30
        elif test_name == "train_moderate":
            target_time = 60
        elif test_name == "train_gpu":
            target_time = 40
        else:
            return None

        if time_taken > target_time:
            return f"Took too long to complete " \
                   f"(time_taken={time_taken:.2f} > {target_time}). "

        return None
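The xgboost handler splits on test flavor: API and fault-tolerance tests only need a clean finish, tune_* tests are held to trial-count and runtime targets, and train_* scripts only to runtime. A hypothetical passing train_small run (payload invented for illustration; the import assumes the working directory is release/):

import datetime

from alerts.xgboost_tests import handle_result

alert = handle_result(
    datetime.datetime.now(), "nightly", "xgboost_tests", "train_small",
    "finished", {"time_taken": 25.0}, {}, "")
assert alert is None  # 25s is within the 30s target, so no alert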
214 release/config_generator.html Normal file
@@ -0,0 +1,214 @@
<!doctype html>
<html>
<head>
  <meta charset="utf-8">
  <title>Releaser config generator</title>
  <style type="text/css">
    html {
      background: #cccccc;
    }
    body {
      background: #ffffff;
      font-family: sans-serif;
      padding: 1em 2em;
      max-width: 800px;
      margin: 0 auto;
    }
    textarea {
      width: 600px;
      height: 200px;
    }
    form .use {
      white-space: nowrap;
      padding-right: 1em;
    }
    form .val {
      min-width: 300px;
    }
    form .val input {
      width: 90%;
    }
    form .desc {
    }
  </style>
  <script type="text/javascript">
    var env_vars = [
      {
        "name": "RAY_TEST_REPO",
        "short": "Git repo with test files",
        "long": "Repository containing the test files you would like to run. Note that this doesn't have to be the same repo from which the wheels are installed.",
        "default": "https://github.com/ray-project/ray.git",
        "enabled": false,
      },
      {
        "name": "RAY_TEST_BRANCH",
        "short": "Git branch for test repo",
        "long": "Git branch that is checked out from RAY_TEST_REPO and which contains the test files you would like to run. Note that this doesn't have to be the same branch you're fetching the Ray wheels from.",
        "default": "master",
        "enabled": false,
      },
      {
        "name": "RAY_REPO",
        "short": "Git repo for the Ray wheels",
        "long": "Repository from which to fetch the latest commits to find the Ray wheels",
        "default": "https://github.com/ray-project/ray.git",
        "enabled": false,
      },
      {
        "name": "RAY_BRANCH",
        "short": "Git branch for the Ray wheels",
        "long": "Branch that is checked out from RAY_REPO and from which the latest commits are fetched to find the Ray wheels",
        "default": "master",
        "enabled": true,
      },
      {
        "name": "RELEASE_TEST_SUITE",
        "short": "Release test suite (nightly/weekly/manual)",
        "long": "Release test suite as defined in releaser's build_pipeline.py",
        "default": "nightly",
        "enabled": true,
      },
      {
        "name": "FILTER_FILE",
        "short": "Filter test file by this string",
        "long": "Only test files (e.g. xgboost_tests.yaml) that match this string will be included in the test",
        "default": "",
        "enabled": false,
      },
      {
        "name": "FILTER_TEST",
        "short": "Filter test name by this string",
        "long": "Only test names (e.g. tune_4x32) that match this string will be included in the test",
        "default": "",
        "enabled": false,
      },
    ]

    window.addEventListener('load', function () {

      var table = document.getElementById("gen_table");

      for (var env_var of env_vars) {

        var use_td = document.createElement("td");
        use_td.setAttribute("class", "use");

        var use_input = document.createElement("input");
        use_input.setAttribute("type", "checkbox");
        use_input.setAttribute("data-activate", env_var["name"] + "_val");
        use_input.setAttribute("id", env_var["name"] + "_use");
        use_input.setAttribute("class", "input_use");
        if (env_var["enabled"]) {
          use_input.checked = true;
        }

        var use_label = document.createElement("label");
        use_label.setAttribute("for", env_var["name"] + "_use");
        use_label.innerHTML = env_var["name"];

        use_td.append(use_input);
        use_td.append(use_label);

        var val_td = document.createElement("td");
        val_td.setAttribute("class", "val");

        var val_input = document.createElement("input");
        val_input.setAttribute("type", "text");
        if (!env_var["enabled"]) {
          val_input.setAttribute("disabled", "disabled");
        }
        val_input.setAttribute("id", env_var["name"] + "_val");
        val_input.setAttribute("name", env_var["name"]);
        val_input.setAttribute("value", env_var["default"]);
        val_input.setAttribute("class", "input_val");

        val_td.append(val_input);

        use_input.addEventListener("click", function(e) {
          var toggle_val = document.getElementById(e.target.getAttribute("data-activate"))

          if (toggle_val.disabled) {
            toggle_val.removeAttribute("disabled");
          } else {
            toggle_val.setAttribute("disabled", "disabled");
          }
          generate_snippet();
        });

        val_input.addEventListener("change", function() { generate_snippet(); });
        val_input.addEventListener("keydown", function() { generate_snippet(); });
        val_input.addEventListener("keyup", function() { generate_snippet(); });

        var desc_td = document.createElement("td");
        desc_td.setAttribute("class", "desc");

        var desc_a = document.createElement("a");
        desc_a.setAttribute("title", env_var["long"]);
        desc_a.innerHTML = env_var["short"];

        desc_td.append(desc_a);

        var tr = document.createElement("tr");
        tr.append(use_td);
        tr.append(val_td);
        tr.append(desc_td);

        table.append(tr);
      }

      var button = document.getElementById("generate");
      button.addEventListener("click", function() {
        generate_snippet();
      })

      generate_snippet()
    })

    function generate_snippet() {
      var full_snippet = ""
      for (var env_var of env_vars) {
        var val_input = document.getElementById(env_var["name"] + "_val")

        if (!val_input.disabled) {
          full_snippet += env_var["name"] + "=\"" + val_input.value + "\"\n"
        }
      }

      document.getElementById("snippet").innerHTML = full_snippet;
    }

  </script>
</head>
<body>
  <header class="header">
    <h1>Releaser config generator</h1>
    <p>Use this form to generate a list of environment variables.</p>
    <p>These variables can be passed to Buildkite to run a subset of release tests
      and choose the correct wheels/release test branch</p>
  </header>
  <section class="main">
    <form id="gen">
      <table id="gen_table">
        <tr>
          <th>Set</th>
          <th>Value</th>
          <th>Description</th>
        </tr>

      </table>

    </form>

    <div>
      <button id="generate">Generate snippet</button>
    </div>

    <div>
      <textarea id="snippet">
      </textarea>
    </div>
  </section>
</body>
</html>
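With the defaults above, only RAY_BRANCH and RELEASE_TEST_SUITE start out enabled, so on page load the snippet textarea would be filled with:

RAY_BRANCH="master"
RELEASE_TEST_SUITE="nightly"

Pasting such a snippet into a Buildkite build's environment selects the wheels branch and the test suite defined in build_pipeline.py.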
1709 release/e2e.py Normal file
File diff suppressed because it is too large
15 release/requirements.txt Normal file
@@ -0,0 +1,15 @@
ray
click
anyscale
slackclient
boto3
PyGithub
pydantic
pyyaml
typer[all]
toml
python-dotenv
expiringdict
requests
pytz
git+https://github.com/ray-project/xgboost_ray.git#xgboost_ray