[release] move release testing end to end script to main ray repo (#17070)

This commit is contained in:
Kai Fricke 2021-07-14 21:39:07 +02:00 committed by GitHub
parent 92f19170ab
commit ed131f87da
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 2823 additions and 0 deletions

View file

@@ -0,0 +1,307 @@
import copy
import logging
import os
import sys
import yaml
# Env variables:
# RAY_REPO Repo to use for finding the wheel
# RAY_BRANCH Branch to find the wheel
# RAY_TEST_REPO Repo to use for test scripts
# RAY_TEST_BRANCH Branch for test scripts
# FILTER_FILE File filter
# FILTER_TEST Test name filter
# RELEASE_TEST_SUITE Release test suite (e.g. manual, nightly)
class ReleaseTest:
    """A single release test entry.

    For printing, iteration, length and substring checks this behaves
    like its ``name`` string, so plain test-name strings and ReleaseTest
    instances can be used interchangeably in the suite definitions.
    It additionally carries the smoke-test flag and retry count.
    """

    def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):
        # Public attributes, read directly by build_pipeline().
        self.name = name
        self.smoke_test = smoke_test
        self.retry = retry

    def __repr__(self):
        return self.name

    # str() and repr() both show the bare test name.
    __str__ = __repr__

    def __contains__(self, item):
        return item in self.name

    def __iter__(self):
        return iter(self.name)

    def __len__(self):
        return len(self.name)
class SmokeTest(ReleaseTest):
    """A ReleaseTest that always runs in smoke-test (scaled-down) mode."""

    def __init__(self, name: str, retry: int = 0):
        super().__init__(name=name, smoke_test=True, retry=retry)
# Core nightly suite. Keys are test-config YAML files (paths as they exist
# on the runner after the test repo is cloned to ~/ray); values are the
# test names to run from that file. SmokeTest entries run with the
# --smoke-test flag (scaled-down variant).
CORE_NIGHTLY_TESTS = {
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "shuffle_10gb",
        "shuffle_50gb",
        "shuffle_50gb_large_partition",
        "shuffle_100gb",
        "non_streaming_shuffle_100gb",
        "non_streaming_shuffle_50gb_large_partition",
        "non_streaming_shuffle_50gb",
        "dask_on_ray_10gb_sort",
        "dask_on_ray_100gb_sort",
        "dask_on_ray_large_scale_test_no_spilling",
        "dask_on_ray_large_scale_test_spilling",
        "stress_test_placement_group",
        "shuffle_1tb_1000_partition",
        "non_streaming_shuffle_1tb_1000_partition",
        "shuffle_1tb_5000_partitions",
        "non_streaming_shuffle_1tb_5000_partitions",
        "decision_tree_autoscaling",
        "autoscaling_shuffle_1tb_1000_partitions",
        # Stress tests run nightly only as smoke tests; the full versions
        # are in WEEKLY_TESTS.
        SmokeTest("stress_test_many_tasks"),
        SmokeTest("stress_test_dead_actors"),
    ],
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "single_node",
        "object_store",
    ],
}
# Library nightly suite: golden notebooks, smoke-test variants of the long
# running tests, microbenchmark, and the SGD/Tune/XGBoost release tests.
NIGHTLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     SmokeTest("horovod_test"),
    # ],  # Should we enable this?
    "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
        "dask_xgboost_test",
        "modin_xgboost_test",
        "torch_tune_serve_test",
    ],
    # Long running tests run nightly only in smoke-test mode; the full
    # 1-day versions are in WEEKLY_TESTS.
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("actor_deaths"),
        SmokeTest("apex"),
        SmokeTest("impala"),
        SmokeTest("many_actor_tasks"),
        SmokeTest("many_drivers"),
        SmokeTest("many_ppo"),
        SmokeTest("many_tasks"),
        SmokeTest("many_tasks_serialized_ids"),
        SmokeTest("node_failures"),
        SmokeTest("pbt"),
        # SmokeTest("serve"),
        # SmokeTest("serve_failure"),
    ],
    "~/ray/release/microbenchmark/microbenchmark.yaml": [
        "microbenchmark",
    ],
    "~/ray/release/sgd_tests/sgd_tests.yaml": [
        "sgd_gpu",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "bookkeeping_overhead",
        "durable_trainable",
        SmokeTest("long_running_large_checkpoints"),
        SmokeTest("network_overhead"),
        "result_throughput_cluster",
        "result_throughput_single_node",
        "xgboost_sweep",
    ],
    "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
        "train_small",
        "train_moderate",
        "train_gpu",
        "tune_small",
        "tune_4x32",
        "tune_32x4",
        "ft_small_elastic",
        "ft_small_non_elastic",
        "distributed_api_test",
    ],
}
# Weekly suite: distributed benchmarks plus the full (non-smoke) versions
# of the stress, Horovod and long running tests.
WEEKLY_TESTS = {
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "distributed",
    ],
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "stress_test_many_tasks",
        "stress_test_dead_actors",
    ],
    "~/ray/release/horovod_tests/horovod_tests.yaml": [
        "horovod_test",
    ],
    # Implicit string concatenation: the key is one path split over two
    # lines to keep line length down.
    "~/ray/release/long_running_distributed_tests"
    "/long_running_distributed.yaml": [
        "pytorch_pbt_failure",
    ],
    # Full long running tests (1 day runtime)
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        "actor_deaths",
        "apex",
        "impala",
        "many_actor_tasks",
        "many_drivers",
        "many_ppo",
        "many_tasks",
        "many_tasks_serialized_ids",
        "node_failures",
        "pbt",
        # "serve",
        # "serve_failure",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "network_overhead",
        "long_running_large_checkpoints",
    ],
}
# Tests that are only triggered manually (RELEASE_TEST_SUITE=manual):
# RLLib suites and the serve long running tests in smoke-test mode.
MANUAL_TESTS = {
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        "learning_tests",
        "example_scripts_on_gpu_tests",
        "stress_tests",
    ],
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("serve"),
        SmokeTest("serve_failure"),
    ]
}
# Map RELEASE_TEST_SUITE values to the test collections defined above.
SUITES = {
    "core-nightly": CORE_NIGHTLY_TESTS,
    "nightly": NIGHTLY_TESTS,
    "weekly": WEEKLY_TESTS,
    "manual": MANUAL_TESTS,
}
# Base Buildkite step configuration shared by all generated steps.
# build_pipeline()/alert_pipeline() deep-copy this template and fill in
# "commands", "label" and (optionally) "retry" per step.
DEFAULT_STEP_TEMPLATE = {
    "env": {
        # Anyscale cloud/project identifiers and AWS result locations —
        # presumably consumed by release/e2e.py; TODO confirm against e2e.py.
        "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
        "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
        "RELEASE_AWS_BUCKET": "ray-release-automation-results",
        "RELEASE_AWS_LOCATION": "dev",
        "RELEASE_AWS_DB_NAME": "ray_ci",
        "RELEASE_AWS_DB_TABLE": "release_test_result",
        "AWS_REGION": "us-west-2"
    },
    "agents": {
        "queue": "runner_queue_branch"
    },
    "plugins": [{
        # Run each step inside the Ray docker image and forward the env
        # variables defined above into the container.
        "docker#v3.8.0": {
            "image": "rayproject/ray",
            "propagate-environment": True
        }
    }],
    # Filled in per step by the pipeline builders.
    "commands": []
}
def build_pipeline(steps):
    """Create Buildkite step configurations for a release test suite.

    Args:
        steps: Mapping from test-config YAML file path to a list of test
            names (plain strings or ReleaseTest/SmokeTest instances) to
            run from that file.

    Returns:
        List of Buildkite step dicts, one per selected test.
    """
    all_steps = []

    # Defaults: test the master branch of the upstream Ray repository,
    # taking the test scripts from the same repo/branch as the wheels.
    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO",
                              "https://github.com/ray-project/ray.git")
    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    # Optional substring filters to select a subset of files/tests.
    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    logging.info(
        f"Building pipeline \n"
        f"Ray repo/branch to test:\n"
        f" RAY_REPO = {RAY_REPO}\n"
        f" RAY_BRANCH = {RAY_BRANCH}\n\n"
        # Bugfix: this line was missing its trailing \n, so the
        # RAY_TEST_REPO line was joined onto the header in the log output.
        f"Ray repo/branch containing the test configurations and scripts:\n"
        f" RAY_TEST_REPO = {RAY_TEST_REPO}\n"
        f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"
        f"Filtering for these tests:\n"
        f" FILTER_FILE = {FILTER_FILE}\n"
        f" FILTER_TEST = {FILTER_TEST}\n\n")

    for test_file, test_names in steps.items():
        if FILTER_FILE and FILTER_FILE not in test_file:
            continue

        test_base = os.path.basename(test_file)
        for test_name in test_names:
            # ReleaseTest implements __contains__ on its name, so this
            # filter works for both strings and ReleaseTest instances.
            if FILTER_TEST and FILTER_TEST not in test_name:
                continue

            if not isinstance(test_name, ReleaseTest):
                test_name = ReleaseTest(name=test_name)

            logging.info(f"Adding test: {test_base}/{test_name}")

            # The branch name doubles as the result category in the DB.
            cmd = (f"python release/e2e.py "
                   f"--ray-branch {RAY_BRANCH} "
                   f"--category {RAY_BRANCH} "
                   f"--test-config {test_file} "
                   f"--test-name {test_name}")

            if test_name.smoke_test:
                logging.info("This test will run as a smoke test.")
                cmd += " --smoke-test"

            step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

            if test_name.retry:
                logging.info(f"This test will be retried up to "
                             f"{test_name.retry} times.")
                # Buildkite automatic retry on any non-zero exit status.
                step_conf["retry"] = {
                    "automatic": [{
                        "exit_status": "*",
                        "limit": test_name.retry
                    }]
                }

            step_conf["commands"] = [
                "pip install -q -r release/requirements.txt",
                "pip install -U boto3 botocore",
                f"git clone -b {RAY_TEST_BRANCH} {RAY_TEST_REPO} ~/ray",
                cmd,
            ]

            step_conf["label"] = f"{test_name} ({RAY_BRANCH}) - " \
                                 f"{RAY_TEST_BRANCH}/{test_base}"

            all_steps.append(step_conf)

    return all_steps
def alert_pipeline(stats: bool = False):
    """Create the single Buildkite step that runs the alerting script.

    Args:
        stats: If True, run release/alert.py with --stats to send the
            periodic statistics report instead of failure alerts.

    Returns:
        A one-element list with the alert step configuration.
    """
    alert_cmd = "python release/alert.py" + (" --stats" if stats else "")

    step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
    step_conf["commands"] = [
        "pip install -q -r release/requirements.txt",
        "pip install -U boto3 botocore",
        alert_cmd,
    ]
    step_conf["label"] = f"Send periodic alert (stats_only = {stats})"
    return [step_conf]
if __name__ == "__main__":
    # RELEASE_ALERT selects the alerting pipeline instead of test steps:
    # "1" sends failure alerts, "stats" sends the periodic stats report.
    alert = os.environ.get("RELEASE_ALERT", "0")

    if alert in ["1", "stats"]:
        steps = alert_pipeline(alert == "stats")
    else:
        # Otherwise build the requested test suite (default: nightly).
        TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
        PIPELINE_SPEC = SUITES[TEST_SUITE]

        steps = build_pipeline(PIPELINE_SPEC)

    # Buildkite consumes the generated pipeline as YAML on stdout.
    yaml.dump({"steps": steps}, sys.stdout)

0
release/__init__.py Normal file
View file

401
release/alert.py Normal file
View file

@@ -0,0 +1,401 @@
import argparse
from collections import defaultdict, Counter
from typing import Any, List, Tuple, Mapping, Optional
import datetime
import hashlib
import json
import logging
import os
import requests
import sys
import boto3
from e2e import GLOBAL_CONFIG
from alerts.default import handle_result as default_handle_result
from alerts.rllib_tests import handle_result as rllib_tests_handle_result
from alerts.long_running_tests import handle_result as \
long_running_tests_handle_result
from alerts.tune_tests import handle_result as tune_tests_handle_result
from alerts.xgboost_tests import handle_result as xgboost_tests_handle_result
# Map a test suite name to its specialized result handler; suites not
# listed here fall back to default_handle_result.
SUITE_TO_FN = {
    "long_running_tests": long_running_tests_handle_result,
    "rllib_tests": rllib_tests_handle_result,
    "tune_tests": tune_tests_handle_result,
    "xgboost_tests": xgboost_tests_handle_result,
}

# Extend the shared e2e configuration with alerting-specific settings.
GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"] = "alert_state"
GLOBAL_CONFIG["SLACK_WEBHOOK"] = os.environ.get("SLACK_WEBHOOK", "")
GLOBAL_CONFIG["SLACK_CHANNEL"] = os.environ.get("SLACK_CHANNEL",
                                                "#oss-test-cop")

# Root logger writes to stdout with level, timestamp and source location.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter(fmt="[%(levelname)s %(asctime)s] "
                              "%(filename)s: %(lineno)d "
                              "%(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
def maybe_fetch_slack_webhook():
    """Populate GLOBAL_CONFIG["SLACK_WEBHOOK"] if it is unset.

    When the SLACK_WEBHOOK environment variable was empty or missing, the
    webhook URL is fetched from the AWS secrets manager instead.
    """
    if GLOBAL_CONFIG["SLACK_WEBHOOK"] not in [None, ""]:
        return

    print("Missing SLACK_WEBHOOK, retrieving from AWS secrets store")
    secrets_client = boto3.client("secretsmanager", region_name="us-west-2")
    secret = secrets_client.get_secret_value(
        SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:"
        "release-automation/"
        "slack-webhook-Na0CFP")
    GLOBAL_CONFIG["SLACK_WEBHOOK"] = secret["SecretString"]
def _obj_hash(obj: Any) -> str:
json_str = json.dumps(obj, sort_keys=True, ensure_ascii=True)
sha = hashlib.sha256()
sha.update(json_str.encode())
return sha.hexdigest()
def fetch_latest_alerts(rds_data_client):
    """Yield the latest alert state per (category, test_suite, test_name).

    Reads the alert state table and yields, for each test, the hash of
    the last handled result and the timestamp of the last notification.

    Args:
        rds_data_client: boto3 RDS Data Service client.

    Yields:
        Tuples (category, test_suite, test_name, last_result_hash,
        last_notification_dt).
    """
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]

    sql = (f"""
        SELECT DISTINCT ON (category, test_suite, test_name)
        category, test_suite, test_name, last_result_hash,
        last_notification_dt
        FROM {schema}
        ORDER BY category, test_suite, test_name, last_notification_dt DESC
        """)

    result = rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
    )
    for row in result["records"]:
        # NULL columns come back without a "stringValue" key; map to None.
        category, test_suite, test_name, last_result_hash, \
            last_notification_dt = (
                r["stringValue"]
                if "stringValue" in r else None
                for r in row
            )

        # Timestamps are returned as strings; parse into datetime.
        last_notification_dt = datetime.datetime.strptime(
            last_notification_dt, "%Y-%m-%d %H:%M:%S")

        yield category, test_suite, test_name, last_result_hash, \
            last_notification_dt
def fetch_latest_results(rds_data_client,
                         fetch_since: Optional[datetime.datetime] = None):
    """Yield the most recent result per (category, test_suite, test_name).

    Args:
        rds_data_client: boto3 RDS Data Service client.
        fetch_since: If given, only consider results created at or after
            this timestamp.

    Yields:
        Tuples (result_hash, created_on, category, test_suite, test_name,
        status, results, artifacts, last_logs), where result_hash is a
        stable hash of the raw row used for change detection.
    """
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]

    sql = (f"""
        SELECT DISTINCT ON (category, test_suite, test_name)
        created_on, category, test_suite, test_name, status, results,
        artifacts, last_logs
        FROM {schema} """)

    parameters = []
    if fetch_since is not None:
        sql += "WHERE created_on >= :created_on "
        parameters = [
            {
                "name": "created_on",
                "typeHint": "TIMESTAMP",
                "value": {
                    "stringValue": fetch_since.strftime("%Y-%m-%d %H:%M:%S")
                },
            },
        ]

    sql += "ORDER BY category, test_suite, test_name, created_on DESC"

    result = rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
        parameters=parameters,
    )
    for row in result["records"]:
        # NULL columns come back without a "stringValue" key; map to None.
        created_on, category, test_suite, test_name, status, results, \
            artifacts, last_logs = (
                r["stringValue"] if "stringValue" in r else None for r in row)

        # Calculate hash before converting strings to objects
        result_obj = (created_on, category, test_suite, test_name, status,
                      results, artifacts, last_logs)
        result_json = json.dumps(result_obj)
        result_hash = _obj_hash(result_json)

        # Convert some strings to python objects
        created_on = datetime.datetime.strptime(created_on,
                                                "%Y-%m-%d %H:%M:%S")
        results = json.loads(results)
        artifacts = json.loads(artifacts)

        yield result_hash, created_on, category, test_suite, test_name, \
            status, results, artifacts, last_logs
def mark_as_handled(rds_data_client, update: bool, category: str,
                    test_suite: str, test_name: str, result_hash: str,
                    last_notification_dt: datetime.datetime):
    """Record in the alert state table that a result has been processed.

    Args:
        rds_data_client: boto3 RDS Data Service client.
        update: If True, update the existing state row for this test;
            otherwise insert a new row.
        category: Result category (the tested branch).
        test_suite: Test suite name (may be empty).
        test_name: Test name.
        result_hash: Hash of the handled result row.
        last_notification_dt: Timestamp to store as the last notification.
    """
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]

    if not update:
        sql = (f"""
            INSERT INTO {schema}
            (category, test_suite, test_name,
            last_result_hash, last_notification_dt)
            VALUES (:category, :test_suite, :test_name,
            :last_result_hash, :last_notification_dt)
            """)
    else:
        sql = (f"""
            UPDATE {schema}
            SET last_result_hash=:last_result_hash,
            last_notification_dt=:last_notification_dt
            WHERE category=:category AND test_suite=:test_suite
            AND test_name=:test_name
            """)

    rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        parameters=[
            {
                "name": "category",
                "value": {
                    "stringValue": category
                }
            },
            {
                # test_suite may be None; the column stores "" instead.
                "name": "test_suite",
                "value": {
                    "stringValue": test_suite or ""
                }
            },
            {
                "name": "test_name",
                "value": {
                    "stringValue": test_name
                }
            },
            {
                "name": "last_result_hash",
                "value": {
                    "stringValue": result_hash
                }
            },
            {
                "name": "last_notification_dt",
                "typeHint": "TIMESTAMP",
                "value": {
                    "stringValue": last_notification_dt.strftime(
                        "%Y-%m-%d %H:%M:%S")
                },
            },
        ],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
    )
def post_alerts_to_slack(channel: str, alerts: List[Tuple[str, str, str, str]],
                         non_alerts: Mapping[str, int]):
    """Post new release test failures to the Slack webhook.

    Does nothing when there are no alerts.

    Args:
        channel: Slack channel to post to.
        alerts: List of (category, test_suite, test_name, alert message)
            tuples for failing tests.
        non_alerts: Number of passing tests per category.
    """
    if len(alerts) == 0:
        logger.info("No alerts to post to slack.")
        return

    markdown_lines = [
        f"* {len(alerts)} new release test failures found!*",
        "",
    ]

    # Group failure messages by category (the tested branch).
    category_alerts = defaultdict(list)
    for (category, test_suite, test_name, alert) in alerts:
        category_alerts[category].append(
            f" *{test_suite}/{test_name}* failed: {alert}")

    for category, alert_list in category_alerts.items():
        markdown_lines.append(f"Branch: *{category}*")
        markdown_lines.extend(alert_list)
        markdown_lines.append("")

    total_non_alerts = sum(n for n in non_alerts.values())
    non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]

    markdown_lines += [
        f"Additionally, {total_non_alerts} tests passed successfully "
        f"({', '.join(non_alert_detail)})."
    ]

    slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]

    # Post via incoming webhook; response is printed for the build log.
    resp = requests.post(
        slack_url,
        json={
            "text": "\n".join(markdown_lines),
            "channel": channel,
            "username": "Fail Bot",
            "icon_emoji": ":red_circle:",
        },
    )
    print(resp.status_code)
    print(resp.text)
def post_statistics_to_slack(channel: str,
                             alerts: List[Tuple[str, str, str, str]],
                             non_alerts: Mapping[str, int]):
    """Post the periodic pass/fail summary report to Slack.

    Unlike post_alerts_to_slack this always posts, even when there are no
    failures.

    Args:
        channel: Slack channel to post to.
        alerts: List of (category, test_suite, test_name, alert message)
            tuples for failing tests.
        non_alerts: Number of passing tests per category.
    """
    total_alerts = len(alerts)

    # Group failing test names by category (the tested branch).
    category_alerts = defaultdict(list)
    for (category, test_suite, test_name, alert) in alerts:
        category_alerts[category].append(f"`{test_suite}/{test_name}`")

    alert_detail = [f"{len(a)} on {c}" for c, a in category_alerts.items()]

    total_non_alerts = sum(n for n in non_alerts.values())
    non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]

    markdown_lines = [
        "*Periodic release test report*", "", f"In the past 24 hours, "
        f"*{total_non_alerts}* release tests finished successfully, and "
        f"*{total_alerts}* release tests failed."
    ]

    markdown_lines.append("")

    if total_alerts:
        markdown_lines.append(f"*Failing:* {', '.join(alert_detail)}")
        for c, a in category_alerts.items():
            markdown_lines.append(f" *{c}*: {', '.join(sorted(a))}")
    else:
        markdown_lines.append("*Failing:* None")

    markdown_lines.append("")

    if total_non_alerts:
        markdown_lines.append(f"*Passing:* {', '.join(non_alert_detail)}")
    else:
        markdown_lines.append("*Passing:* None")

    slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]

    # Post via incoming webhook; response is printed for the build log.
    resp = requests.post(
        slack_url,
        json={
            "text": "\n".join(markdown_lines),
            "channel": channel,
            "username": "Fail Bot",
            "icon_emoji": ":red_circle:",
        },
    )
    print(resp.status_code)
    print(resp.text)
def handle_results_and_get_alerts(
        rds_data_client,
        fetch_since: Optional[datetime.datetime] = None,
        always_try_alert: bool = False,
        no_status_update: bool = False):
    """Run the per-suite alert handlers over the latest test results.

    A result is (re-)evaluated when it is new for its test, when its hash
    differs from the last handled result, or when ``always_try_alert`` is
    set.

    Args:
        rds_data_client: boto3 RDS Data Service client.
        fetch_since: Only consider results created at or after this time.
        always_try_alert: Evaluate every result even if its hash matches
            the last handled one.
        no_status_update: Do not write back to the alert state table.

    Returns:
        Tuple ``(alerts, non_alerts)``: a list of (category, test_suite,
        test_name, alert message) for failing tests and a Counter of
        passing tests per category.
    """
    # First build a map of last notifications
    last_notifications_map = {}
    for category, test_suite, test_name, last_result_hash, \
            last_notification_dt in fetch_latest_alerts(rds_data_client):
        last_notifications_map[(category, test_suite,
                                test_name)] = (last_result_hash,
                                               last_notification_dt)

    alerts = []
    non_alerts = Counter()

    # Then fetch latest results
    for result_hash, created_on, category, test_suite, test_name, status, \
            results, artifacts, last_logs in fetch_latest_results(
                rds_data_client, fetch_since=fetch_since):
        key = (category, test_suite, test_name)

        try_alert = always_try_alert
        if key in last_notifications_map:
            # If we have an alert for this key, fetch info
            last_result_hash, last_notification_dt = last_notifications_map[
                key]
            if last_result_hash != result_hash:
                # If we got a new result, handle new result
                try_alert = True
            # Todo: maybe alert again after some time?
        else:
            # Completely new test: always evaluate.
            try_alert = True

        if try_alert:
            # Dispatch to the suite-specific handler, falling back to the
            # default status-only check for unknown suites.
            handle_fn = SUITE_TO_FN.get(test_suite, None)
            if not handle_fn:
                logger.warning(f"No handle for suite {test_suite}")
                alert = default_handle_result(created_on, category, test_suite,
                                              test_name, status, results,
                                              artifacts, last_logs)
            else:
                alert = handle_fn(created_on, category, test_suite, test_name,
                                  status, results, artifacts, last_logs)

            if alert:
                logger.warning(
                    f"Alert raised for test {test_suite}/{test_name} "
                    f"({category}): {alert}")

                alerts.append((category, test_suite, test_name, alert))
            else:
                logger.debug(
                    f"No alert raised for test {test_suite}/{test_name} "
                    f"({category})")

                non_alerts[category] += 1

            if not no_status_update:
                mark_as_handled(rds_data_client, key in last_notifications_map,
                                category, test_suite, test_name, result_hash,
                                datetime.datetime.now())

    return alerts, non_alerts
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--stats",
        action="store_true",
        default=False,
        # Bugfix: the previous help text ("Finish quickly for training.")
        # was copy-pasted from an unrelated script and did not describe
        # this flag.
        help="Send the periodic statistics report instead of failure "
        "alerts.")
    args = parser.parse_args()

    maybe_fetch_slack_webhook()

    rds_data_client = boto3.client("rds-data", region_name="us-west-2")

    if args.stats:
        # Only update last 24 hour stats
        fetch_since = datetime.datetime.now() - datetime.timedelta(days=1)
        alerts, non_alerts = handle_results_and_get_alerts(
            rds_data_client,
            fetch_since=fetch_since,
            always_try_alert=True,
            no_status_update=True)
        post_statistics_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts,
                                 non_alerts)
    else:
        alerts, non_alerts = handle_results_and_get_alerts(rds_data_client)
        post_alerts_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts,
                             non_alerts)

View file

13
release/alerts/default.py Normal file
View file

@@ -0,0 +1,13 @@
import datetime
from typing import Dict, Optional
def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
                  artifacts: Dict, last_logs: str) -> Optional[str]:
    """Fallback alert handler: alert unless the test script finished.

    Returns:
        An alert message for any non-"finished" status, else None.
    """
    if status == "finished":
        return None
    return f"Test script did not finish successfully ({status})."

View file

@@ -0,0 +1,32 @@
import datetime
from typing import Dict, Optional
def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
                  artifacts: Dict, last_logs: str) -> Optional[str]:
    """Alert handler for long running tests.

    These tests periodically update a results JSON; if the last update is
    older than the per-test threshold the test is considered stuck and an
    alert message is returned. Unknown test names are never alerted.
    """
    assert test_suite == "long_running_tests"

    # Per-test threshold (seconds) between result updates before alerting.
    core_tests = (
        "actor_deaths", "many_actor_tasks", "many_drivers", "many_tasks",
        "many_tasks_serialized_ids", "node_failures",
        "object_spilling_shuffle", "serve", "serve_failure",
    )
    tune_rllib_tests = ("apex", "impala", "many_ppo", "pbt")

    if test_name in core_tests:
        # Core tests
        target_update_diff = 120
    elif test_name in tune_rllib_tests:
        # Tune/RLLib style tests
        target_update_diff = 360
    else:
        return None

    last_update_diff = results.get("last_update_diff", float("inf"))
    if last_update_diff > target_update_diff:
        return f"Last update to results json was too long ago " \
               f"({last_update_diff:.2f} > {target_update_diff})"
    return None

View file

@@ -0,0 +1,14 @@
import datetime
from typing import Dict, Optional
def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
                  artifacts: Dict, last_logs: str) -> Optional[str]:
    """Alert handler for RLLib tests: alert unless the script finished."""
    assert test_suite == "rllib_tests"

    if status == "finished":
        return None
    return f"Test script did not finish successfully ({status})."

View file

@@ -0,0 +1,60 @@
import datetime
from typing import Dict, Optional
def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
                  artifacts: Dict, last_logs: str) -> Optional[str]:
    """Alert handler for Tune scalability tests.

    Checks script status, the number of terminated trials and the runtime
    against per-test targets and returns a combined alert message, or
    None when every check passes (or the test name is unknown).
    """
    assert test_suite == "tune_tests"

    msg = ""
    if status != "finished":
        msg += "Test timed out." if status == "timeout" else \
            "Test script failed. "

    time_taken = results.get("time_taken", float("inf"))
    num_terminated = results.get("trial_states", {}).get("TERMINATED", 0)
    was_smoke_test = results.get("smoke_test", False)

    # This test only checks that results keep being written, not trial
    # throughput; it returns directly without the generic checks below.
    if test_name == "long_running_large_checkpoints":
        last_update_diff = results.get("last_update_diff", float("inf"))
        target_update_diff = 360

        if last_update_diff > target_update_diff:
            return f"Last update to results json was too long ago " \
                   f"({last_update_diff:.2f} > {target_update_diff})"
        return None

    # (minimum terminated trials, maximum runtime in seconds) per test.
    targets = {
        "bookkeeping_overhead": (10000, 800),
        "durable_trainable": (16, 600),
        "network_overhead": (20, 400) if was_smoke_test else (100, 900),
        "result_throughput_cluster": (1000, 120),
        "result_throughput_single_node": (96, 120),
        "xgboost_sweep": (31, 3600),
    }
    if test_name not in targets:
        return None
    target_terminated, target_time = targets[test_name]

    if num_terminated < target_terminated:
        msg += f"Some trials failed " \
               f"(num_terminated={num_terminated} < {target_terminated}). "
    if time_taken > target_time:
        msg += f"Took too long to complete " \
               f"(time_taken={time_taken:.2f} > {target_time}). "

    return msg or None

View file

@@ -0,0 +1,58 @@
import datetime
from typing import Dict, Optional
def handle_result(created_on: datetime.datetime, category: str,
                  test_suite: str, test_name: str, status: str, results: Dict,
                  artifacts: Dict, last_logs: str) -> Optional[str]:
    """Alert handler for XGBoost release tests.

    API/fault-tolerance tests only check the script status; tune_* tests
    check terminated trials and runtime; train_* tests check runtime.
    Unknown test names are never alerted.
    """
    assert test_suite == "xgboost_tests"

    time_taken = results.get("time_taken", float("inf"))
    num_terminated = results.get("trial_states", {}).get("TERMINATED", 0)

    # Bugfix: this previously listed "ft_small_nonelastic", but the test
    # is registered as "ft_small_non_elastic" in xgboost_tests.yaml, so
    # its failures fell through to the train-script branch below and were
    # silently ignored.
    if test_name in [
            "distributed_api_test", "ft_small_elastic", "ft_small_non_elastic"
    ]:
        if not status == "finished":
            return f"Test script did not finish successfully ({status})."
        return None
    elif test_name.startswith("tune_"):
        msg = ""

        # (minimum terminated trials, maximum runtime in seconds).
        if test_name == "tune_small":
            target_terminated = 4
            target_time = 90
        elif test_name == "tune_4x32":
            target_terminated = 4
            target_time = 120
        elif test_name == "tune_32x4":
            target_terminated = 32
            target_time = 600
        else:
            return None

        if num_terminated < target_terminated:
            msg += f"Some trials failed " \
                   f"(num_terminated={num_terminated} < {target_terminated}). "
        if time_taken > target_time:
            msg += f"Took too long to complete " \
                   f"(time_taken={time_taken} > {target_time}). "

        return msg or None
    else:
        # train scripts
        if test_name == "train_small":
            target_time = 30
        elif test_name == "train_moderate":
            target_time = 60
        elif test_name == "train_gpu":
            target_time = 40
        else:
            return None

        if time_taken > target_time:
            return f"Took too long to complete " \
                   f"(time_taken={time_taken:.2f} > {target_time}). "

        return None

View file

@@ -0,0 +1,214 @@
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>Releaser config generator</title>
<style type="text/css">
html {
background: #cccccc;
}
body {
background: #ffffff;
font-family: sans-serif;
padding: 1em 2em;
max-width: 800px;
margin: 0 auto;
}
textarea {
width: 600px;
height: 200px;
}
form .use {
white-space: nowrap;
padding-right: 1em;
}
form .val {
min-width: 300px;
}
form .val input {
width: 90%;
}
form .desc {
}
</style>
<script type="text/javascript">
// Environment variables understood by the release pipeline. Each entry
// drives one row of the generator form: "short" is the row label, "long"
// the hover tooltip, "default" the prefilled value, and "enabled"
// whether the variable is included in the snippet by default.
// (Fixed typos in the RAY_TEST_BRANCH and RAY_BRANCH tooltips:
// "doesnt'" -> "doesn't", "check out" -> "checked out".)
var env_vars = [
    {
        "name": "RAY_TEST_REPO",
        "short": "Git repo with test files",
        "long": "Repository in which the test files are which you would like to run. Note that this doesn't have to be the same repo from which the wheels are installed.",
        "default": "https://github.com/ray-project/ray.git",
        "enabled": false,
    },
    {
        "name": "RAY_TEST_BRANCH",
        "short": "Git branch for test repo",
        "long": "Git branch that is checked out from RAY_TEST_REPO and which contains the test files you would like to run. Note that this doesn't have to be the same branch you're fetching the Ray wheels from.",
        "default": "master",
        "enabled": false,
    },
    {
        "name": "RAY_REPO",
        "short": "Git repo for the Ray wheels",
        "long": "Repository from which to fetch the latest commits to find the Ray wheels",
        "default": "https://github.com/ray-project/ray.git",
        "enabled": false,
    },
    {
        "name": "RAY_BRANCH",
        "short": "Git branch for the Ray wheels",
        "long": "Branch that is checked out from RAY_REPO from which the latest commits are fetched to find the Ray wheels",
        "default": "master",
        "enabled": true,
    },
    {
        "name": "RELEASE_TEST_SUITE",
        "short": "Release test suite (nightly/weekly/manual)",
        "long": "Release test suite as defined in releaser's build_pipeline.py",
        "default": "nightly",
        "enabled": true,
    },
    {
        "name": "FILTER_FILE",
        "short": "Filter test file by this string",
        "long": "Only test files (e.g. xgboost_tests.yml) that match this string will be included in the test",
        "default": "",
        "enabled": false,
    },
    {
        "name": "FILTER_TEST",
        "short": "Filter test name by this string",
        "long": "Only test names (e.g. tune_4x32) that match this string will be included in the test",
        "default": "",
        "enabled": false,
    },
]
// Build the generator form on page load: one table row per environment
// variable, with an enable checkbox, a value input and a description.
window.addEventListener('load', function () {
    var table = document.getElementById("gen_table");
    for (var env_var of env_vars) {
        // Checkbox cell: toggles whether this variable is emitted.
        var use_td = document.createElement("td");
        use_td.setAttribute("class", "use");
        var use_input = document.createElement("input");
        use_input.setAttribute("type", "checkbox");
        // data-activate holds the id of the value input to enable/disable.
        use_input.setAttribute("data-activate", env_var["name"] + "_val");
        use_input.setAttribute("id", env_var["name"] + "_use");
        use_input.setAttribute("class", "input_use");
        if (env_var["enabled"]) {
            use_input.checked = true;
        }
        var use_label = document.createElement("label");
        use_label.setAttribute("for", env_var["name"] + "_use");
        use_label.innerHTML = env_var["name"];
        use_td.append(use_input);
        use_td.append(use_label);

        // Value cell: text input, disabled while the checkbox is off.
        val_td = document.createElement("td");
        val_td.setAttribute("class", "val");
        val_input = document.createElement("input");
        val_input.setAttribute("type", "text");
        if (!env_var["enabled"]) {
            val_input.setAttribute("disabled", "disabled");
        }
        val_input.setAttribute("id", env_var["name"] + "_val");
        val_input.setAttribute("name", env_var["name"]);
        val_input.setAttribute("value", env_var["default"]);
        val_input.setAttribute("class", "input_val");
        val_td.append(val_input);

        // Toggle the matching value input and refresh the snippet when
        // the checkbox is clicked.
        use_input.addEventListener("click", function(e) {
            var toggle_val = document.getElementById(e.target.getAttribute("data-activate"))
            if (toggle_val.disabled) {
                toggle_val.removeAttribute("disabled");
            } else {
                toggle_val.setAttribute("disabled", "disabled");
            }
            generate_snippet();
        });

        // Regenerate the snippet on any edit of the value input.
        val_input.addEventListener("change", function() { generate_snippet(); });
        val_input.addEventListener("keydown", function() { generate_snippet(); });
        val_input.addEventListener("keyup", function() { generate_snippet(); });

        // Description cell: short label with the long text as a tooltip.
        var desc_td = document.createElement("td");
        desc_td.setAttribute("class", "desc");
        var desc_a = document.createElement("a");
        desc_a.setAttribute("title", env_var["long"]);
        desc_a.innerHTML = env_var["short"];
        desc_td.append(desc_a);

        var tr = document.createElement("tr");
        tr.append(use_td);
        tr.append(val_td);
        tr.append(desc_td);
        table.append(tr);
    }

    var button = document.getElementById("generate");
    button.addEventListener("click", function() {
        generate_snippet();
    })

    // Render the initial snippet for the default selection.
    generate_snippet()
})
// Rebuild the environment-variable snippet from all enabled form inputs
// and display it in the output textarea.
function generate_snippet() {
    var lines = [];
    for (var env_var of env_vars) {
        var input = document.getElementById(env_var["name"] + "_val");
        if (input.disabled) {
            continue;
        }
        lines.push(env_var["name"] + "=\"" + input.value + "\"\n");
    }
    document.getElementById("snippet").innerHTML = lines.join("");
}
</script>
</head>
<body>
<header class="header">
<h1>Releaser config generator</h1>
<p>Use this form to generate a list of environment variables.</p>
<p>These variables can be passed to Buildkite to run a subset of release tests
and choose the correct wheels/release test branch</p>
</header>
<section class="main">
<form id="gen">
<table id="gen_table">
<tr>
<th>Set</th>
<th>Value</th>
<th>Description</th>
</tr>
</table>
</form>
<div>
<button id="generate">Generate snippet</button>
</div>
<div>
<textarea id="snippet">
</textarea>
</div>
</section>
</body>
</html>

1709
release/e2e.py Normal file

File diff suppressed because it is too large Load diff

15
release/requirements.txt Normal file
View file

@@ -0,0 +1,15 @@
ray
click
anyscale
slackclient
boto3
PyGithub
pydantic
pyyaml
typer[all]
toml
python-dotenv
expiringdict
requests
pytz
git+https://github.com/ray-project/xgboost_ray.git#xgboost_ray