ray/release/.buildkite/build_pipeline.py
Yi Cheng 90093769df
[nightly] Add more many tasks tests (#21727)
This PR adds four tests for many tasks:

many short tasks sent from a single node
many short tasks sent from multiple nodes
many long tasks sent from multiple nodes
many long tasks sent from a single node
TODO: migrate many nodes actor tests to this one.

The scheduling envelope should contain:

(tasks): scheduling_test_many_xx_tasks_yy_nodes
(actors):many_nodes_actor_test (to be combined with this one)
(shuffle): pipelined_ingestion_1500_gb_15_windows
(shuffle): dask_on_ray_1tb_sort
2022-01-20 14:52:26 -08:00

646 lines
21 KiB
Python

import copy
import logging
import os
import re
import sys
import yaml
# Env variables:
# RAY_REPO Repo to use for finding the wheel
# RAY_BRANCH Branch to find the wheel
# RAY_VERSION Version to find the wheel
# RAY_WHEELS Direct Ray wheel URL
# RAY_TEST_REPO Repo to use for test scripts
# RAY_TEST_BRANCH Branch for test scripts
# FILTER_FILE File filter
# FILTER_TEST Test name filter
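# RELEASE_TEST_SUITE Release test suite (e.g. nightly, weekly)
# RELEASE_ALERT Set to "1" or "stats" to emit the alert step instead
# AUTOMATIC Set to 1 to build the test pipeline without asking for input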
class ReleaseTest:
    """A release test entry that behaves like its name in string contexts.

    Besides the test name, an entry carries a smoke-test flag and a retry
    count. String conversion, membership tests, iteration and ``len`` are
    all forwarded to the name, so an instance can be filtered and formatted
    the same way as a plain test-name string.
    """

    def __init__(
            self,
            name: str,
            smoke_test: bool = False,
            retry: int = 0,
    ):
        """Store the test name, the smoke-test flag and the retry count."""
        self.name = name
        self.smoke_test = smoke_test
        self.retry = retry

    def __str__(self):
        """Return the test name."""
        return self.name

    def __repr__(self):
        """Return the test name (same as ``str``)."""
        return self.name

    def __contains__(self, item):
        """Report whether ``item`` is contained in the test name."""
        return item in self.name

    def __iter__(self):
        """Iterate over the test name."""
        return iter(self.name)

    def __len__(self):
        """Return the length of the test name."""
        return len(self.name)
class SmokeTest(ReleaseTest):
    """A ``ReleaseTest`` whose ``smoke_test`` flag is always set."""

    def __init__(self, name: str, retry: int = 0):
        """Create the entry for ``name`` with ``smoke_test=True``."""
        super().__init__(name=name, retry=retry, smoke_test=True)
# Tests of the "core-nightly" suite (see SUITES).
# Maps a test config file to the tests to run from that file. Entries are
# either plain test names or SmokeTest instances; for the latter,
# create_test_step() appends `--smoke-test` to the command.
CORE_NIGHTLY_TESTS = {
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "shuffle_10gb",
        "shuffle_50gb",
        "shuffle_50gb_large_partition",
        "shuffle_100gb",
        "non_streaming_shuffle_100gb",
        "non_streaming_shuffle_50gb_large_partition",
        "non_streaming_shuffle_50gb",
        "dask_on_ray_10gb_sort",
        "dask_on_ray_100gb_sort",
        SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
        SmokeTest("dask_on_ray_large_scale_test_spilling"),
        "stress_test_placement_group",
        "shuffle_1tb_1000_partition",
        "non_streaming_shuffle_1tb_1000_partition",
        "shuffle_1tb_5000_partitions",
        # TODO(sang): It doesn't even work without spilling
        # as it hits the scalability limit.
        # "non_streaming_shuffle_1tb_5000_partitions",
        "decision_tree_autoscaling",
        "decision_tree_autoscaling_20_runs",
        "autoscaling_shuffle_1tb_1000_partitions",
        SmokeTest("stress_test_many_tasks"),
        SmokeTest("stress_test_dead_actors"),
        "shuffle_data_loader",
        "dask_on_ray_1tb_sort",
        SmokeTest("threaded_actors_stress_test"),
        "placement_group_performance_test",
        "pg_long_running_performance_test",
    ],
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "single_node",
        "object_store",
        "many_actors_smoke_test",
        "many_tasks_smoke_test",
        "many_pgs_smoke_test",
    ],
    "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [
        "inference",
        "shuffle_data_loader",
        "pipelined_training_50_gb",
        "pipelined_ingestion_1500_gb_15_windows",
        "datasets_preprocess_ingest",
        "datasets_ingest_400G",
        SmokeTest("datasets_ingest_train_infer"),
    ],
    "~/ray/release/nightly_tests/chaos_test.yaml": [
        "chaos_many_actors",
        "chaos_many_tasks_no_object_store",
        "chaos_pipelined_ingestion_1500_gb_15_windows",
    ],
    "~/ray/release/microbenchmark/microbenchmark.yaml": [
        "microbenchmark",
    ],
}
# Tests of the "serve-nightly" suite (see SUITES).
# Maps a test config file to the tests to run from that file.
SERVE_NIGHTLY_TESTS = {
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("serve"),
        SmokeTest("serve_failure"),
    ],
    "~/ray/release/serve_tests/serve_tests.yaml": [
        "single_deployment_1k_noop_replica",
        "multi_deployment_1k_noop_replica",
        "serve_micro_benchmark",
        "serve_micro_benchmark_k8s",
        "serve_cluster_fault_tolerance",
    ],
}
# Tests of the "core-daily" suite (see SUITES).
# Maps a test config file to the tests to run from that file.
CORE_DAILY_TESTS = {
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "k8s_dask_on_ray_large_scale_test_no_spilling",
        "dask_on_ray_large_scale_test_no_spilling",
        "dask_on_ray_large_scale_test_spilling",
        "pg_autoscaling_regression_test",
        "threaded_actors_stress_test",
        "stress_test_many_tasks",
        "stress_test_dead_actors",
        "many_nodes_actor_test",
    ],
    "~/ray/release/nightly_tests/chaos_test.yaml": [
        "chaos_dask_on_ray_large_scale_test_no_spilling",
        "chaos_dask_on_ray_large_scale_test_spilling",
    ],
}
# Tests of the "core-scalability" suite (see SUITES).
# Maps a test config file to the tests to run from that file.
CORE_SCALABILITY_TESTS_DAILY = {
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "many_actors",
        "many_tasks",
        "many_pgs",
        "many_nodes",
        "scheduling_test_many_0s_tasks_single_node",
        "scheduling_test_many_0s_tasks_many_nodes",
        "scheduling_test_many_5s_tasks_single_node",
        "scheduling_test_many_5s_tasks_many_nodes",
    ],
}
# Tests of the "core-redis-ha" suite (see SUITES).
# Maps a test config file to the tests to run from that file.
CORE_REDIS_HA_TESTS_DAILY = {
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "many_actors_redis_ha",
        "many_tasks_redis_ha",
        "many_pgs_redis_ha",
        "many_nodes_redis_ha",
    ],
}
# Tests that, merged with USER_TESTS, form the "nightly" suite (see SUITES).
# Maps a test config file to the tests to run from that file. Entries are
# either plain test names or SmokeTest instances.
NIGHTLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     SmokeTest("horovod_test"),
    # ],  # Should we enable this?
    "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
        "dask_xgboost_test",
        "modin_xgboost_test",
        "torch_tune_serve_test",
    ],
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("actor_deaths"),
        SmokeTest("apex"),
        SmokeTest("impala"),
        SmokeTest("many_actor_tasks"),
        SmokeTest("many_drivers"),
        SmokeTest("many_ppo"),
        SmokeTest("many_tasks"),
        SmokeTest("many_tasks_serialized_ids"),
        SmokeTest("node_failures"),
        SmokeTest("pbt"),
        # SmokeTest("serve"),
        # SmokeTest("serve_failure"),
    ],
    "~/ray/release/sgd_tests/sgd_tests.yaml": [
        "sgd_gpu",
    ],
    "~/ray/release/tune_tests/cloud_tests/tune_cloud_tests.yaml": [
        "aws_no_sync_down",
        "aws_ssh_sync",
        "aws_durable_upload",
        "aws_durable_upload_rllib_str",
        "aws_durable_upload_rllib_trainer",
        "gcp_k8s_durable_upload",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "bookkeeping_overhead",
        "durable_trainable",
        SmokeTest("long_running_large_checkpoints"),
        SmokeTest("network_overhead"),
        "result_throughput_cluster",
        "result_throughput_single_node",
    ],
    "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
        "train_small",
        "train_moderate",
        "train_gpu",
        "tune_small",
        "tune_4x32",
        "tune_32x4",
        "ft_small_elastic",
        "ft_small_non_elastic",
        "distributed_api_test",
    ],
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        SmokeTest("learning_tests"),
        SmokeTest("stress_tests"),
        "performance_tests",
        "multi_gpu_learning_tests",
        "multi_gpu_with_lstm_learning_tests",
        "multi_gpu_with_attention_learning_tests",
        # We'll have these as per-PR tests soon.
        # "example_scripts_on_gpu_tests",
    ],
    "~/ray/release/runtime_env_tests/runtime_env_tests.yaml": [
        "rte_many_tasks_actors", "wheel_urls", "rte_ray_client"
    ],
}
# Tests of the "weekly" suite (see SUITES).
# Maps a test config file to the tests to run from that file.
WEEKLY_TESTS = {
    "~/ray/release/horovod_tests/horovod_tests.yaml": [
        "horovod_test",
    ],
    # The key below is a single path split over two adjacent string literals.
    "~/ray/release/long_running_distributed_tests"
    "/long_running_distributed.yaml": [
        "pytorch_pbt_failure",
    ],
    # Full long running tests (1 day runtime)
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        "actor_deaths",
        "apex",
        "impala",
        "many_actor_tasks",
        "many_drivers",
        "many_ppo",
        "many_tasks",
        "many_tasks_serialized_ids",
        "node_failures",
        "pbt",
        "serve",
        "serve_failure",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "network_overhead",
        "long_running_large_checkpoints",
        "xgboost_sweep",
    ],
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        "learning_tests",
        "stress_tests",
    ],
}
# This test suite holds "user" tests to test important user workflows
# in a particular environment.
# All workloads in this test suite should:
#   1. Be run in a distributed (multi-node) fashion
#   2. Use autoscaling/scale up (no wait_cluster.py)
#   3. Use GPUs if applicable
#   4. Have the `use_connect` flag set.
# These tests are merged with NIGHTLY_TESTS into the "nightly" suite
# (see SUITES).
USER_TESTS = {
    "~/ray/release/ml_user_tests/ml_user_tests.yaml": [
        "train_tensorflow_mnist_test", "train_torch_linear_test",
        "ray_lightning_user_test_latest", "ray_lightning_user_test_master",
        "horovod_user_test_latest", "horovod_user_test_master",
        "xgboost_gpu_connect_latest", "xgboost_gpu_connect_master",
        "tune_rllib_connect_test"
    ]
}
# Maps a suite name (the value of RELEASE_TEST_SUITE) to its test
# specification. The sorted keys are also offered as the options of the
# suite selection field in ask_configuration().
SUITES = {
    "core-nightly": CORE_NIGHTLY_TESTS,
    "serve-nightly": SERVE_NIGHTLY_TESTS,
    "core-daily": CORE_DAILY_TESTS,
    "core-scalability": CORE_SCALABILITY_TESTS_DAILY,
    "core-redis-ha": CORE_REDIS_HA_TESTS_DAILY,
    # The "nightly" suite is the union of the nightly and the user tests.
    "nightly": {
        **NIGHTLY_TESTS,
        **USER_TESTS
    },
    "weekly": WEEKLY_TESTS,
}
# Base configuration shared by the generated steps. create_test_step() and
# alert_pipeline() deep-copy this template and then add their own
# command(s), label and (for test steps) retry settings.
DEFAULT_STEP_TEMPLATE = {
    "env": {
        "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
        "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
        "RELEASE_AWS_BUCKET": "ray-release-automation-results",
        "RELEASE_AWS_LOCATION": "dev",
        "RELEASE_AWS_DB_NAME": "ray_ci",
        "RELEASE_AWS_DB_TABLE": "release_test_result",
        "AWS_REGION": "us-west-2"
    },
    "agents": {
        "queue": "runner_queue_branch"
    },
    "plugins": [{
        "docker#v3.9.0": {
            "image": "rayproject/ray",
            "propagate-environment": True,
            # Single "source:target" volume spec split over two literals.
            "volumes": [
                "/tmp/ray_release_test_artifacts:"
                "/tmp/ray_release_test_artifacts"
            ],
        }
    }],
    "artifact_paths": ["/tmp/ray_release_test_artifacts/**/*"],
}
def ask_configuration():
    """Build the two steps that interactively ask for the run configuration.

    The default of every input field is taken from the corresponding
    environment variable (falling back to the same defaults that
    ``build_pipeline`` uses).

    Returns:
        A list with two step definitions: the input step holding the
        configuration fields, and a dependent step that exports each
        answer (read via ``buildkite-agent meta-data get``) as an
        environment variable, sets ``AUTOMATIC=1``, clones the test
        repository and pipes the output of this script into
        ``buildkite-agent pipeline upload``.
    """
    env = os.environ
    branch_default = env.get("RAY_BRANCH", "master")
    repo_default = env.get("RAY_REPO",
                           "https://github.com/ray-project/ray.git")
    version_default = env.get("RAY_VERSION", "")
    wheels_default = env.get("RAY_WHEELS", "")
    # The test repo/branch default to the repo/branch providing the wheels.
    test_branch_default = env.get("RAY_TEST_BRANCH", branch_default)
    test_repo_default = env.get("RAY_TEST_REPO", repo_default)
    suite_default = env.get("RELEASE_TEST_SUITE", "nightly")
    filter_file_default = env.get("FILTER_FILE", "")
    filter_test_default = env.get("FILTER_TEST", "")

    fields = [
        {
            "text": ("RAY_REPO: Please specify the Ray repository used "
                     "to find the wheel."),
            "hint": ("Repository from which to fetch the latest "
                     "commits to find the Ray wheels. Usually you don't "
                     "need to change this."),
            "default": repo_default,
            "key": "ray_repo"
        },
        {
            "text": ("RAY_BRANCH: Please specify the Ray branch used "
                     "to find the wheel."),
            "hint": "For releases, this will be e.g. `releases/1.x.0`",
            "default": branch_default,
            "key": "ray_branch"
        },
        {
            "text": ("RAY_VERSION: Please specify the Ray version used "
                     "to find the wheel."),
            "hint": ("Leave empty for latest master. For releases, "
                     "specify the release version."),
            "required": False,
            "default": version_default,
            "key": "ray_version"
        },
        {
            "text": "RAY_WHEELS: Please specify the Ray wheel URL.",
            "hint": ("ATTENTION: If you provide this, RAY_REPO, "
                     "RAY_BRANCH and RAY_VERSION will be ignored! "
                     "Please also make sure to provide the wheels URL "
                     "for Python 3.7 on Linux.\n"
                     "You can also insert a commit hash here instead "
                     "of a full URL.\n"
                     "NOTE: You can specify multiple commits or URLs "
                     "for easy bisection (one per line) - this will "
                     "run each test on each of the specified wheels."),
            "required": False,
            "default": wheels_default,
            "key": "ray_wheels"
        },
        {
            "text": ("RAY_TEST_REPO: Please specify the Ray repository "
                     "used to find the tests you would like to run."),
            "hint": ("If you're developing a new release test, this "
                     "will most likely be your GitHub fork."),
            "default": test_repo_default,
            "key": "ray_test_repo"
        },
        {
            "text": ("RAY_TEST_BRANCH: Please specify the Ray branch used "
                     "to find the tests you would like to run."),
            "hint": ("If you're developing a new release test, this "
                     "will most likely be a branch living on your "
                     "GitHub fork."),
            "default": test_branch_default,
            "key": "ray_test_branch"
        },
        {
            "select": ("RELEASE_TEST_SUITE: Please specify the release "
                       "test suite containing the tests you would like "
                       "to run."),
            "hint": ("Check in the `build_pipeline.py` if you're "
                     "unsure which suite contains your tests."),
            "required": True,
            "options": sorted(SUITES.keys()),
            "default": suite_default,
            "key": "release_test_suite"
        },
        {
            "text": ("FILTER_FILE: Please specify a filter for the "
                     "test files that should be included in this build."),
            "hint": ("Only test files (e.g. xgboost_tests.yml) that "
                     "match this string will be included in the test"),
            "default": filter_file_default,
            "required": False,
            "key": "filter_file"
        },
        {
            "text": ("FILTER_TEST: Please specify a filter for the "
                     "test names that should be included in this build."),
            "hint": ("Only test names (e.g. tune_4x32) that match "
                     "this string will be included in the test"),
            "default": filter_test_default,
            "required": False,
            "key": "filter_test"
        },
    ]

    input_step = {
        "input": "Input required: Please specify tests to run",
        "fields": fields,
        "key": "input_ask_step",
    }

    # (meta-data key of the input field, environment variable to export)
    exported_answers = [
        ("ray_branch", "RAY_BRANCH"),
        ("ray_repo", "RAY_REPO"),
        ("ray_version", "RAY_VERSION"),
        ("ray_wheels", "RAY_WHEELS"),
        ("ray_test_branch", "RAY_TEST_BRANCH"),
        ("ray_test_repo", "RAY_TEST_REPO"),
        ("release_test_suite", "RELEASE_TEST_SUITE"),
        ("filter_file", "FILTER_FILE"),
        ("filter_test", "FILTER_TEST"),
    ]
    export_commands = [
        f"export {env_name}=$(buildkite-agent meta-data get \"{meta_key}\")"
        for meta_key, env_name in exported_answers
    ]
    rerun_commands = [
        "export AUTOMATIC=1",
        "python3 -m pip install --user pyyaml",
        "rm -rf ~/ray || true",
        "git clone -b $${RAY_TEST_BRANCH} $${RAY_TEST_REPO} ~/ray",
        ("python3 ~/ray/release/.buildkite/build_pipeline.py "
         "| buildkite-agent pipeline upload"),
    ]

    rerun_step = {
        "commands": export_commands + rerun_commands,
        "label": ":pipeline: Again",
        "agents": {
            "queue": "runner_queue_branch"
        },
        "depends_on": "input_ask_step",
        "key": "run_again_step",
    }

    return [input_step, rerun_step]
def create_test_step(
        ray_repo: str,
        ray_branch: str,
        ray_version: str,
        ray_wheels: str,
        ray_test_repo: str,
        ray_test_branch: str,
        test_file: str,
        test_name: ReleaseTest,
):
    """Create the step configuration that runs a single release test.

    Args:
        ray_repo: Value passed as ``--ray-repo``.
        ray_branch: Value passed as ``--ray-branch`` and ``--category``.
        ray_version: Value passed as ``--ray-version``.
        ray_wheels: Value passed as ``--ray-wheels``; may be empty.
        ray_test_repo: Value passed as ``--ray-test-repo``.
        ray_test_branch: Value passed as ``--ray-test-branch``.
        test_file: Test config file, passed as ``--test-config``.
        test_name: Test to run; its ``smoke_test`` and ``retry``
            attributes control the smoke-test flag and the retry policy.

    Returns:
        A deep copy of ``DEFAULT_STEP_TEMPLATE`` extended with the
        ``retry``, ``command`` and ``label`` entries.
    """
    # What is shown in the label: the branch, or - when wheels are given -
    # the 40-character hex commit found in the wheels string, if any.
    if ray_wheels:
        commit_match = re.search(r"([a-f0-9]{40})", ray_wheels)
        if commit_match is None:
            label_source = "custom_wheels_url"
        else:
            label_source = commit_match.group(1)
        wheels_note = f" ({ray_wheels}) "
    else:
        label_source = ray_branch
        wheels_note = ""

    logging.info(f"Creating step for {test_file}/{test_name}{wheels_note}")

    command_parts = [
        "./release/run_e2e.sh",
        f"--ray-repo \"{ray_repo}\"",
        f"--ray-branch \"{ray_branch}\"",
        f"--ray-version \"{ray_version}\"",
        f"--ray-wheels \"{ray_wheels}\"",
        f"--ray-test-repo \"{ray_test_repo}\"",
        f"--ray-test-branch \"{ray_test_branch}\"",
        f"--category {ray_branch}",
        f"--test-config {test_file}",
        f"--test-name {test_name}",
        "--keep-results-dir",
    ]
    if test_name.smoke_test:
        logging.info("This test will run as a smoke test.")
        command_parts.append("--smoke-test")

    if test_name.retry:
        logging.info(f"This test will be retried up to "
                     f"{test_name.retry} times.")
        retry_rules = [{
            "exit_status": "*",
            "limit": test_name.retry
        }]
    else:
        # Default retry logic
        # Warning: Exit codes are currently not correctly propagated to
        # buildkite! Thus, actual retry logic is currently implemented in
        # the run_e2e.sh script!
        retry_rules = [
            {
                "exit_status": 7,  # Prepare timeout
                "limit": 2
            },
            {
                "exit_status": 9,  # Session timeout
                "limit": 2
            },
            {
                "exit_status": 10,  # Prepare error
                "limit": 2
            },
        ]

    step = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
    step["retry"] = {"automatic": retry_rules}
    step["command"] = " ".join(command_parts)
    step["label"] = (f"{test_name} "
                     f"({label_source}) - "
                     f"{ray_test_branch}/{ray_test_repo}")
    return step
def build_pipeline(steps):
    """Create one step per selected test and per requested Ray wheel.

    The Ray and test repository settings (``RAY_REPO``, ``RAY_BRANCH``,
    ``RAY_VERSION``, ``RAY_WHEELS``, ``RAY_TEST_REPO``,
    ``RAY_TEST_BRANCH``) and the filters (``FILTER_FILE``,
    ``FILTER_TEST``) are read from the environment.

    Args:
        steps: Mapping of test config file to a list of tests. Each test
            is either a plain test name or a ``ReleaseTest``.

    Returns:
        The list of step configurations created by ``create_test_step``,
        one per (test, wheel) combination that passes the filters.
    """
    all_steps = []

    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO",
                              "https://github.com/ray-project/ray.git")
    RAY_VERSION = os.environ.get("RAY_VERSION", "")
    RAY_WHEELS = os.environ.get("RAY_WHEELS", "")

    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    # One wheel URL/commit per line. Blank lines, surrounding whitespace and
    # CRLF line endings are dropped so that e.g. a trailing newline does not
    # schedule an extra run without custom wheels. A single empty entry
    # means "no custom wheels".
    ray_wheels_list = [
        line.strip() for line in RAY_WHEELS.splitlines() if line.strip()
    ] or [""]

    if len(ray_wheels_list) > 1:
        logging.info(f"This will run a bisec on the following URLs/commits: "
                     f"{ray_wheels_list}")

    logging.info(
        f"Building pipeline \n"
        f"Ray repo/branch to test:\n"
        f" RAY_REPO   = {RAY_REPO}\n"
        f" RAY_BRANCH = {RAY_BRANCH}\n\n"
        f" RAY_VERSION = {RAY_VERSION}\n\n"
        f" RAY_WHEELS = {RAY_WHEELS}\n\n"
        f"Ray repo/branch containing the test configurations and scripts:\n"
        f" RAY_TEST_REPO   = {RAY_TEST_REPO}\n"
        f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"
        f"Filtering for these tests:\n"
        f" FILTER_FILE = {FILTER_FILE}\n"
        f" FILTER_TEST = {FILTER_TEST}\n\n")

    for test_file, test_names in steps.items():
        # Substring match on the config file path.
        if FILTER_FILE and FILTER_FILE not in test_file:
            continue

        test_base = os.path.basename(test_file)
        for test_name in test_names:
            # Substring match on the test name; works for plain strings and
            # for ReleaseTest, which forwards `in` to its name.
            if FILTER_TEST and FILTER_TEST not in test_name:
                continue

            if not isinstance(test_name, ReleaseTest):
                test_name = ReleaseTest(name=test_name)

            logging.info(f"Adding test: {test_base}/{test_name}")

            for ray_wheels in ray_wheels_list:
                step_conf = create_test_step(
                    ray_repo=RAY_REPO,
                    ray_branch=RAY_BRANCH,
                    ray_version=RAY_VERSION,
                    ray_wheels=ray_wheels,
                    ray_test_repo=RAY_TEST_REPO,
                    ray_test_branch=RAY_TEST_BRANCH,
                    test_file=test_file,
                    test_name=test_name)

                all_steps.append(step_conf)

    return all_steps
def alert_pipeline(stats: bool = False):
    """Build the single step that runs ``release/alert.py``.

    Args:
        stats: If true, ``--stats`` is appended to the alert command.

    Returns:
        A one-element list holding a deep copy of ``DEFAULT_STEP_TEMPLATE``
        extended with the ``commands`` and ``label`` entries.
    """
    alert_command = "python release/alert.py"
    if stats:
        alert_command = alert_command + " --stats"

    step = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
    step.update({
        "commands": [
            "pip install -q -r release/requirements.txt",
            "pip install -U boto3 botocore",
            alert_command,
        ],
        "label": f"Send periodic alert (stats_only = {stats})",
    })
    return [step]
if __name__ == "__main__":
    # Pick the pipeline to emit from the environment:
    #   RELEASE_ALERT in ("1", "stats") -> alert step
    #   AUTOMATIC unset or 0            -> interactive configuration steps
    #   otherwise                       -> test steps of RELEASE_TEST_SUITE
    alert_mode = os.environ.get("RELEASE_ALERT", "0")
    automatic = bool(int(os.environ.get("AUTOMATIC", "0")))

    if alert_mode in ("1", "stats"):
        steps = alert_pipeline(alert_mode == "stats")
    elif not automatic:
        steps = ask_configuration()
    else:
        suite_name = os.environ.get("RELEASE_TEST_SUITE", "nightly")
        steps = build_pipeline(SUITES[suite_name])

    # The resulting pipeline definition is written to stdout as YAML.
    yaml.dump({"steps": steps}, sys.stdout)