[ci/release] Remove old OSS release test infrastructure (#23134)

Now that we've migrated all OSS release tests to the new infrastructure, we can remove old config files and infra scripts.
Kai Fricke 2022-03-14 15:10:52 +00:00 committed by GitHub
parent d93fa95dd5
commit 8608b64885
39 changed files with 0 additions and 6712 deletions

@@ -1,145 +0,0 @@
- name: single_node
team: core
cluster:
app_config: app_config.yaml
compute_template: single_node.yaml
run:
timeout: 12000
prepare: sleep 0
script: python single_node/test_single_node.py
- name: object_store
team: core
cluster:
app_config: app_config.yaml
compute_template: object_store.yaml
run:
timeout: 3600
prepare: python distributed/wait_cluster.py --num-nodes=50
script: python object_store/test_object_store.py
- name: many_actors
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=65
script: python distributed/test_many_actors.py
- name: many_actors_smoke_test
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=2
script: SMOKE_TEST=1 python distributed/test_many_actors.py
- name: many_tasks
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=65
script: python distributed/test_many_tasks.py --num-tasks=10000
- name: many_tasks_smoke_test
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=2
script: python distributed/test_many_tasks.py --num-tasks=100
- name: many_pgs
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=65
script: python distributed/test_many_pgs.py
- name: many_pgs_smoke_test
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=2
script: SMOKE_TEST=1 python distributed/test_many_pgs.py
# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
- name: many_nodes
team: core
cluster:
app_config: app_config.yaml
compute_template: many_nodes.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=250
script: python distributed/test_many_tasks.py --num-tasks=1000
- name: scheduling_test_many_0s_tasks_single_node
team: core
cluster:
app_config: app_config.yaml
compute_template: scheduling.yaml
run:
timeout: 3600
prepare: python distributed/wait_cluster.py --num-nodes=32
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1
- name: scheduling_test_many_0s_tasks_many_nodes
team: core
cluster:
app_config: app_config.yaml
compute_template: scheduling.yaml
run:
timeout: 3600
prepare: python distributed/wait_cluster.py --num-nodes=32
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1
- name: scheduling_test_many_5s_tasks_single_node
team: core
cluster:
app_config: app_config.yaml
compute_template: scheduling.yaml
run:
timeout: 3600
prepare: python distributed/wait_cluster.py --num-nodes=32
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
stable: false
- name: scheduling_test_many_5s_tasks_many_nodes
team: core
cluster:
app_config: app_config.yaml
compute_template: scheduling.yaml
run:
timeout: 3600
prepare: python distributed/wait_cluster.py --num-nodes=32
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
stable: false

@@ -1,24 +0,0 @@
import click
import ray
import time
def num_alive_nodes():
n = 0
for node in ray.nodes():
if node["Alive"]:
n += 1
return n
@click.command()
@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
def wait_cluster(num_nodes: int):
ray.init(address="auto")
while num_alive_nodes() != num_nodes:
print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}")
time.sleep(5)
if __name__ == "__main__":
wait_cluster()

@@ -1,680 +0,0 @@
import copy
import logging
import os
import re
import sys
import yaml
# If you update or reorganize the periodic tests, please ensure the
# relevant portions of the Ray release instructions (go/release-ray)
# (in particular, running periodic tests and collecting release logs)
# are up to date. If you need access, please contact @zhe-thoughts.
# Env variables:
# RAY_REPO Repo to use for finding the wheel
# RAY_BRANCH Branch to find the wheel
# RAY_VERSION Version to find the wheel
# RAY_WHEELS Direct Ray wheel URL
# RAY_TEST_REPO Repo to use for test scripts
# RAY_TEST_BRANCH Branch for test scripts
# FILTER_FILE File filter
# FILTER_TEST Test name filter
# RELEASE_TEST_SUITE Release test suite (e.g. manual, nightly)
class ReleaseTest:
def __init__(
self,
name: str,
smoke_test: bool = False,
retry: int = 0,
):
self.name = name
self.smoke_test = smoke_test
self.retry = retry
def __str__(self):
return self.name
def __repr__(self):
return self.name
def __contains__(self, item):
return self.name.__contains__(item)
def __iter__(self):
return iter(self.name)
def __len__(self):
return len(self.name)
class SmokeTest(ReleaseTest):
def __init__(self, name: str, retry: int = 0):
super(SmokeTest, self).__init__(name=name, smoke_test=True, retry=retry)
CORE_NIGHTLY_TESTS = {
# "~/ray/release/nightly_tests/nightly_tests.yaml": [
# "shuffle_10gb",
# "shuffle_50gb",
# "shuffle_50gb_large_partition",
# "shuffle_100gb",
# "non_streaming_shuffle_100gb",
# "non_streaming_shuffle_50gb_large_partition",
# "non_streaming_shuffle_50gb",
# SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
# SmokeTest("dask_on_ray_large_scale_test_spilling"),
# "stress_test_placement_group",
# "shuffle_1tb_1000_partition",
# "non_streaming_shuffle_1tb_1000_partition",
# "shuffle_1tb_5000_partitions",
# TODO(sang): It doesn't even work without spilling
# as it hits the scalability limit.
# "non_streaming_shuffle_1tb_5000_partitions",
# "decision_tree_autoscaling",
# "decision_tree_autoscaling_20_runs",
# "autoscaling_shuffle_1tb_1000_partitions",
# SmokeTest("stress_test_many_tasks"),
# SmokeTest("stress_test_dead_actors"),
# SmokeTest("threaded_actors_stress_test"),
# "pg_long_running_performance_test",
# ],
# "~/ray/benchmarks/benchmark_tests.yaml": [
# "single_node",
# "object_store",
# "many_actors_smoke_test",
# "many_tasks_smoke_test",
# "many_pgs_smoke_test",
# ],
# "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [
# "inference",
# "shuffle_data_loader",
# "parquet_metadata_resolution",
# "pipelined_training_50_gb",
# "pipelined_ingestion_1500_gb",
# "datasets_preprocess_ingest",
# "datasets_ingest_400G",
# SmokeTest("datasets_ingest_train_infer"),
# ],
# "~/ray/release/nightly_tests/chaos_test.yaml": [
# "chaos_many_actors",
# "chaos_many_tasks_no_object_store",
# "chaos_pipelined_ingestion_1500_gb_15_windows",
# ],
# "~/ray/release/microbenchmark/microbenchmark.yaml": [
# "microbenchmark",
# ],
}
SERVE_NIGHTLY_TESTS = {
# "~/ray/release/long_running_tests/long_running_tests.yaml": [
# SmokeTest("serve"),
# SmokeTest("serve_failure"),
# ],
# "~/ray/release/serve_tests/serve_tests.yaml": [
# "single_deployment_1k_noop_replica",
# "multi_deployment_1k_noop_replica",
# "autoscaling_single_deployment",
# "autoscaling_multi_deployment",
# "serve_micro_benchmark",
# # TODO(architkulkarni) Reenable after K8s migration. Currently failing
# # "serve_micro_benchmark_k8s",
# "serve_cluster_fault_tolerance",
# ],
}
CORE_DAILY_TESTS = {
# "~/ray/release/nightly_tests/nightly_tests.yaml": [
# "k8s_dask_on_ray_large_scale_test_no_spilling",
# "dask_on_ray_large_scale_test_no_spilling",
# "dask_on_ray_large_scale_test_spilling",
# "pg_autoscaling_regression_test",
# "threaded_actors_stress_test",
# "k8s_threaded_actors_stress_test",
# "stress_test_many_tasks",
# "stress_test_dead_actors",
# ],
# "~/ray/release/nightly_tests/chaos_test.yaml": [
# "chaos_dask_on_ray_large_scale_test_no_spilling",
# "chaos_dask_on_ray_large_scale_test_spilling",
# ],
}
CORE_SCALABILITY_TESTS_DAILY = {
# "~/ray/benchmarks/benchmark_tests.yaml": [
# "many_actors",
# "many_tasks",
# "many_pgs",
# "many_nodes",
# ],
}
CORE_SCHEDULING_DAILY = {
# "~/ray/benchmarks/benchmark_tests.yaml": [
# "scheduling_test_many_0s_tasks_single_node",
# "scheduling_test_many_0s_tasks_many_nodes",
# # Reenable these two once we got right setup
# # "scheduling_test_many_5s_tasks_single_node",
# # "scheduling_test_many_5s_tasks_many_nodes",
# ],
# "~/ray/release/nightly_tests/nightly_tests.yaml": [
# "many_nodes_actor_test",
# "dask_on_ray_10gb_sort",
# "dask_on_ray_100gb_sort",
# "dask_on_ray_1tb_sort",
# "placement_group_performance_test",
# ],
}
NIGHTLY_TESTS = {
# "~/ray/release/horovod_tests/horovod_tests.yaml": [
# SmokeTest("horovod_test"),
# ], # Should we enable this?
# "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
# "dask_xgboost_test",
# "modin_xgboost_test",
# "torch_tune_serve_test",
# ],
# "~/ray/release/long_running_tests/long_running_tests.yaml": [
# SmokeTest("actor_deaths"),
# SmokeTest("apex"),
# SmokeTest("impala"),
# SmokeTest("many_actor_tasks"),
# SmokeTest("many_drivers"),
# SmokeTest("many_ppo"),
# SmokeTest("many_tasks"),
# SmokeTest("many_tasks_serialized_ids"),
# SmokeTest("node_failures"),
# SmokeTest("pbt"),
# # SmokeTest("serve"),
# # SmokeTest("serve_failure"),
# # Full long running tests (1 day runtime)
# "actor_deaths",
# "apex",
# "impala",
# "many_actor_tasks",
# "many_drivers",
# "many_ppo",
# "many_tasks",
# "many_tasks_serialized_ids",
# "node_failures",
# "pbt",
# "serve",
# "serve_failure",
# ],
# "~/ray/release/sgd_tests/sgd_tests.yaml": [
# "sgd_gpu",
# ],
# "~/ray/release/tune_tests/cloud_tests/tune_cloud_tests.yaml": [
# "aws_no_sync_down",
# "aws_ssh_sync",
# "aws_durable_upload",
# "aws_durable_upload_rllib_str",
# "aws_durable_upload_rllib_trainer",
# "gcp_k8s_durable_upload",
# ],
# "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
# "bookkeeping_overhead",
# "durable_trainable",
# SmokeTest("long_running_large_checkpoints"),
# SmokeTest("network_overhead"),
# "result_throughput_cluster",
# "result_throughput_single_node",
# ],
# "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
# "train_small",
# "train_moderate",
# "train_gpu",
# "tune_small",
# "tune_4x32",
# "tune_32x4",
# "ft_small_elastic",
# "ft_small_non_elastic",
# "distributed_api_test",
# ],
# "~/ray/release/rllib_tests/rllib_tests.yaml": [
# SmokeTest("learning_tests"),
# SmokeTest("stress_tests"),
# "performance_tests",
# "multi_gpu_learning_tests",
# "multi_gpu_with_lstm_learning_tests",
# "multi_gpu_with_attention_learning_tests",
# # We'll have these as per-PR tests soon.
# # "example_scripts_on_gpu_tests",
# ],
# "~/ray/release/runtime_env_tests/runtime_env_tests.yaml": [
# "rte_many_tasks_actors",
# "wheel_urls",
# "rte_ray_client",
# ],
}
WEEKLY_TESTS = {
# "~/ray/release/horovod_tests/horovod_tests.yaml": [
# "horovod_test",
# ],
"~/ray/release/long_running_distributed_tests"
# "/long_running_distributed.yaml": [
# "pytorch_pbt_failure",
# ],
# "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
# "network_overhead",
# "long_running_large_checkpoints",
# "xgboost_sweep",
# ],
# "~/ray/release/rllib_tests/rllib_tests.yaml": [
# "learning_tests",
# "stress_tests",
# ],
}
# This test suite holds "user" tests to test important user workflows
# in a particular environment.
# All workloads in this test suite should:
# 1. Be run in a distributed (multi-node) fashion
# 2. Use autoscaling/scale up (no wait_cluster.py)
# 3. Use GPUs if applicable
# 4. Have the `use_connect` flag set.
USER_TESTS = {
# "~/ray/release/ml_user_tests/ml_user_tests.yaml": [
# "train_tensorflow_mnist_test",
# "train_torch_linear_test",
# "ray_lightning_user_test_latest",
# "ray_lightning_user_test_master",
# "horovod_user_test_latest",
# "horovod_user_test_master",
# "xgboost_gpu_connect_latest",
# "xgboost_gpu_connect_master",
# "tune_rllib_connect_test",
# ]
}
SUITES = {
"core-nightly": CORE_NIGHTLY_TESTS,
"serve-nightly": SERVE_NIGHTLY_TESTS,
"core-daily": CORE_DAILY_TESTS,
"core-scalability": CORE_SCALABILITY_TESTS_DAILY,
"nightly": {**NIGHTLY_TESTS, **USER_TESTS},
"core-scheduling-daily": CORE_SCHEDULING_DAILY,
"weekly": WEEKLY_TESTS,
}
DEFAULT_STEP_TEMPLATE = {
"env": {
"ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
"ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
"RELEASE_AWS_BUCKET": "ray-release-automation-results",
"RELEASE_AWS_LOCATION": "dev",
"RELEASE_AWS_DB_NAME": "ray_ci",
"RELEASE_AWS_DB_TABLE": "release_test_result",
"AWS_REGION": "us-west-2",
},
"agents": {"queue": "runner_queue_branch"},
"plugins": [
{
"docker#v3.9.0": {
"image": "rayproject/ray",
"propagate-environment": True,
"volumes": [
"/tmp/ray_release_test_artifacts:" "/tmp/ray_release_test_artifacts"
],
}
}
],
"artifact_paths": ["/tmp/ray_release_test_artifacts/**/*"],
}
def ask_configuration():
RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
RAY_VERSION = os.environ.get("RAY_VERSION", "")
RAY_WHEELS = os.environ.get("RAY_WHEELS", "")
RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)
RELEASE_TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
FILTER_FILE = os.environ.get("FILTER_FILE", "")
FILTER_TEST = os.environ.get("FILTER_TEST", "")
input_ask_step = {
"input": "Input required: Please specify tests to run",
"fields": [
{
"text": (
"RAY_REPO: Please specify the Ray repository used "
"to find the wheel."
),
"hint": (
"Repository from which to fetch the latest "
"commits to find the Ray wheels. Usually you don't "
"need to change this."
),
"default": RAY_REPO,
"key": "ray_repo",
},
{
"text": (
"RAY_BRANCH: Please specify the Ray branch used "
"to find the wheel."
),
"hint": "For releases, this will be e.g. `releases/1.x.0`",
"default": RAY_BRANCH,
"key": "ray_branch",
},
{
"text": (
"RAY_VERSION: Please specify the Ray version used "
"to find the wheel."
),
"hint": (
"Leave empty for latest master. For releases, "
"specify the release version."
),
"required": False,
"default": RAY_VERSION,
"key": "ray_version",
},
{
"text": "RAY_WHEELS: Please specify the Ray wheel URL.",
"hint": (
"ATTENTION: If you provide this, RAY_REPO, "
"RAY_BRANCH and RAY_VERSION will be ignored! "
"Please also make sure to provide the wheels URL "
"for Python 3.7 on Linux.\n"
"You can also insert a commit hash here instead "
"of a full URL.\n"
"NOTE: You can specify multiple commits or URLs "
"for easy bisection (one per line) - this will "
"run each test on each of the specified wheels."
),
"required": False,
"default": RAY_WHEELS,
"key": "ray_wheels",
},
{
"text": (
"RAY_TEST_REPO: Please specify the Ray repository "
"used to find the tests you would like to run."
),
"hint": (
"If you're developing a new release test, this "
"will most likely be your GitHub fork."
),
"default": RAY_TEST_REPO,
"key": "ray_test_repo",
},
{
"text": (
"RAY_TEST_BRANCH: Please specify the Ray branch used "
"to find the tests you would like to run."
),
"hint": (
"If you're developing a new release test, this "
"will most likely be a branch living on your "
"GitHub fork."
),
"default": RAY_TEST_BRANCH,
"key": "ray_test_branch",
},
{
"select": (
"RELEASE_TEST_SUITE: Please specify the release "
"test suite containing the tests you would like "
"to run."
),
"hint": (
"Check in the `build_pipeline.py` if you're "
"unsure which suite contains your tests."
),
"required": True,
"options": sorted(SUITES.keys()),
"default": RELEASE_TEST_SUITE,
"key": "release_test_suite",
},
{
"text": (
"FILTER_FILE: Please specify a filter for the "
"test files that should be included in this build."
),
"hint": (
"Only test files (e.g. xgboost_tests.yml) that "
"match this string will be included in the test"
),
"default": FILTER_FILE,
"required": False,
"key": "filter_file",
},
{
"text": (
"FILTER_TEST: Please specify a filter for the "
"test names that should be included in this build."
),
"hint": (
"Only test names (e.g. tune_4x32) that match "
"this string will be included in the test"
),
"default": FILTER_TEST,
"required": False,
"key": "filter_test",
},
],
"key": "input_ask_step",
}
run_again_step = {
"commands": [
f'export {v}=$(buildkite-agent meta-data get "{k}")'
for k, v in {
"ray_branch": "RAY_BRANCH",
"ray_repo": "RAY_REPO",
"ray_version": "RAY_VERSION",
"ray_wheels": "RAY_WHEELS",
"ray_test_branch": "RAY_TEST_BRANCH",
"ray_test_repo": "RAY_TEST_REPO",
"release_test_suite": "RELEASE_TEST_SUITE",
"filter_file": "FILTER_FILE",
"filter_test": "FILTER_TEST",
}.items()
]
+ [
"export AUTOMATIC=1",
"python3 -m pip install --user pyyaml",
"rm -rf ~/ray || true",
"git clone -b $${RAY_TEST_BRANCH} $${RAY_TEST_REPO} ~/ray",
(
"python3 ~/ray/release/.buildkite/build_pipeline.py "
"| buildkite-agent pipeline upload"
),
],
"label": ":pipeline: Again",
"agents": {"queue": "runner_queue_branch"},
"depends_on": "input_ask_step",
"key": "run_again_step",
}
return [
input_ask_step,
run_again_step,
]
def create_test_step(
ray_repo: str,
ray_branch: str,
ray_version: str,
ray_wheels: str,
ray_test_repo: str,
ray_test_branch: str,
test_file: str,
test_name: ReleaseTest,
):
custom_commit_str = "custom_wheels_url"
if ray_wheels:
# Extract commit from url
p = re.compile(r"([a-f0-9]{40})")
m = p.search(ray_wheels)
if m is not None:
custom_commit_str = m.group(1)
ray_wheels_str = f" ({ray_wheels}) " if ray_wheels else ""
logging.info(f"Creating step for {test_file}/{test_name}{ray_wheels_str}")
cmd = (
f"./release/run_e2e.sh "
f'--ray-repo "{ray_repo}" '
f'--ray-branch "{ray_branch}" '
f'--ray-version "{ray_version}" '
f'--ray-wheels "{ray_wheels}" '
f'--ray-test-repo "{ray_test_repo}" '
f'--ray-test-branch "{ray_test_branch}" '
)
args = (
f"--category {ray_branch} "
f"--test-config {test_file} "
f"--test-name {test_name} "
f"--keep-results-dir"
)
if test_name.smoke_test:
logging.info("This test will run as a smoke test.")
args += " --smoke-test"
step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
if test_name.retry:
logging.info(f"This test will be retried up to " f"{test_name.retry} times.")
step_conf["retry"] = {
"automatic": [{"exit_status": "*", "limit": test_name.retry}]
}
else:
# Default retry logic
# Warning: Exit codes are currently not correctly propagated to
# buildkite! Thus, actual retry logic is currently implemented in
# the run_e2e.sh script!
step_conf["retry"] = {
"automatic": [
{"exit_status": 7, "limit": 2}, # Prepare timeout
{"exit_status": 9, "limit": 2}, # Session timeout
{"exit_status": 10, "limit": 2}, # Prepare error
],
}
step_conf["command"] = cmd + args
step_conf["label"] = (
f"{test_name} "
f"({custom_commit_str if ray_wheels_str else ray_branch}) - "
f"{ray_test_branch}/{ray_test_repo}"
)
return step_conf
def build_pipeline(steps):
all_steps = []
RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
RAY_VERSION = os.environ.get("RAY_VERSION", "")
RAY_WHEELS = os.environ.get("RAY_WHEELS", "")
RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)
FILTER_FILE = os.environ.get("FILTER_FILE", "")
FILTER_TEST = os.environ.get("FILTER_TEST", "")
ray_wheels_list = [""]
if RAY_WHEELS:
ray_wheels_list = RAY_WHEELS.split("\n")
if len(ray_wheels_list) > 1:
logging.info(
f"This will run a bisec on the following URLs/commits: "
f"{ray_wheels_list}"
)
logging.info(
f"Building pipeline \n"
f"Ray repo/branch to test:\n"
f" RAY_REPO = {RAY_REPO}\n"
f" RAY_BRANCH = {RAY_BRANCH}\n\n"
f" RAY_VERSION = {RAY_VERSION}\n\n"
f" RAY_WHEELS = {RAY_WHEELS}\n\n"
f"Ray repo/branch containing the test configurations and scripts:"
f" RAY_TEST_REPO = {RAY_TEST_REPO}\n"
f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"
f"Filtering for these tests:\n"
f" FILTER_FILE = {FILTER_FILE}\n"
f" FILTER_TEST = {FILTER_TEST}\n\n"
)
for test_file, test_names in steps.items():
if FILTER_FILE and FILTER_FILE not in test_file:
continue
test_base = os.path.basename(test_file)
for test_name in test_names:
if FILTER_TEST and FILTER_TEST not in test_name:
continue
if not isinstance(test_name, ReleaseTest):
test_name = ReleaseTest(name=test_name)
logging.info(f"Adding test: {test_base}/{test_name}")
for ray_wheels in ray_wheels_list:
step_conf = create_test_step(
ray_repo=RAY_REPO,
ray_branch=RAY_BRANCH,
ray_version=RAY_VERSION,
ray_wheels=ray_wheels,
ray_test_repo=RAY_TEST_REPO,
ray_test_branch=RAY_TEST_BRANCH,
test_file=test_file,
test_name=test_name,
)
all_steps.append(step_conf)
return all_steps
def alert_pipeline(stats: bool = False):
step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
cmd = "python release/alert.py"
if stats:
cmd += " --stats"
step_conf["commands"] = [
"pip install -q -r release/requirements.txt",
"pip install -U boto3 botocore",
cmd,
]
step_conf["label"] = f"Send periodic alert (stats_only = {stats})"
return [step_conf]
if __name__ == "__main__":
alert = os.environ.get("RELEASE_ALERT", "0")
ask_for_config = not bool(int(os.environ.get("AUTOMATIC", "0")))
if alert in ["1", "stats"]:
steps = alert_pipeline(alert == "stats")
elif ask_for_config:
steps = ask_configuration()
else:
TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
PIPELINE_SPEC = SUITES[TEST_SUITE]
steps = build_pipeline(PIPELINE_SPEC)
yaml.dump({"steps": steps}, sys.stdout)

@@ -1,441 +0,0 @@
import argparse
from collections import defaultdict, Counter
from typing import Any, List, Tuple, Mapping, Optional
import datetime
import hashlib
import json
import logging
import os
import requests
import sys
import boto3
from e2e import GLOBAL_CONFIG
from alerts.default import handle_result as default_handle_result
from alerts.rllib_tests import handle_result as rllib_tests_handle_result
from alerts.long_running_tests import handle_result as long_running_tests_handle_result
from alerts.tune_tests import handle_result as tune_tests_handle_result
from alerts.xgboost_tests import handle_result as xgboost_tests_handle_result
SUITE_TO_FN = {
"long_running_tests": long_running_tests_handle_result,
"rllib_tests": rllib_tests_handle_result,
"tune_tests": tune_tests_handle_result,
"xgboost_tests": xgboost_tests_handle_result,
}
GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"] = "alert_state"
GLOBAL_CONFIG["SLACK_WEBHOOK"] = os.environ.get("SLACK_WEBHOOK", "")
GLOBAL_CONFIG["SLACK_CHANNEL"] = os.environ.get("SLACK_CHANNEL", "#oss-test-cop")
RESULTS_LIMIT = 120
logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter(
fmt="[%(levelname)s %(asctime)s] " "%(filename)s: %(lineno)d " "%(message)s"
)
handler.setFormatter(formatter)
logger.addHandler(handler)
def maybe_fetch_slack_webhook():
if GLOBAL_CONFIG["SLACK_WEBHOOK"] in [None, ""]:
print("Missing SLACK_WEBHOOK, retrieving from AWS secrets store")
GLOBAL_CONFIG["SLACK_WEBHOOK"] = boto3.client(
"secretsmanager", region_name="us-west-2"
).get_secret_value(
SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:"
"release-automation/"
"slack-webhook-Na0CFP"
)[
"SecretString"
]
def _obj_hash(obj: Any) -> str:
json_str = json.dumps(obj, sort_keys=True, ensure_ascii=True)
sha = hashlib.sha256()
sha.update(json_str.encode())
return sha.hexdigest()
def fetch_latest_alerts(rds_data_client):
schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]
sql = f"""
SELECT DISTINCT ON (category, test_suite, test_name)
category, test_suite, test_name, last_result_hash,
last_notification_dt
FROM {schema}
ORDER BY category, test_suite, test_name, last_notification_dt DESC
LIMIT {RESULTS_LIMIT}
"""
result = rds_data_client.execute_statement(
database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
schema=schema,
sql=sql,
)
for row in result["records"]:
category, test_suite, test_name, last_result_hash, last_notification_dt = (
r["stringValue"] if "stringValue" in r else None for r in row
)
last_notification_dt = datetime.datetime.strptime(
last_notification_dt, "%Y-%m-%d %H:%M:%S"
)
yield category, test_suite, test_name, last_result_hash, last_notification_dt
def fetch_latest_results(
rds_data_client, fetch_since: Optional[datetime.datetime] = None
):
schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]
sql = f"""
SELECT DISTINCT ON (category, test_suite, test_name)
created_on, category, test_suite, test_name, status, results,
artifacts, last_logs
FROM {schema} """
parameters = []
if fetch_since is not None:
sql += "WHERE created_on >= :created_on "
parameters = [
{
"name": "created_on",
"typeHint": "TIMESTAMP",
"value": {"stringValue": fetch_since.strftime("%Y-%m-%d %H:%M:%S")},
},
]
sql += "ORDER BY category, test_suite, test_name, created_on DESC "
sql += f"LIMIT {RESULTS_LIMIT}"
result = rds_data_client.execute_statement(
database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
schema=schema,
sql=sql,
parameters=parameters,
)
for row in result["records"]:
(
created_on,
category,
test_suite,
test_name,
status,
results,
artifacts,
last_logs,
) = (r["stringValue"] if "stringValue" in r else None for r in row)
# Calculate hash before converting strings to objects
result_obj = (
created_on,
category,
test_suite,
test_name,
status,
results,
artifacts,
last_logs,
)
result_json = json.dumps(result_obj)
result_hash = _obj_hash(result_json)
# Convert some strings to python objects
created_on = datetime.datetime.strptime(created_on, "%Y-%m-%d %H:%M:%S")
results = json.loads(results)
artifacts = json.loads(artifacts)
yield result_hash, created_on, category, test_suite, test_name, status, results, artifacts, last_logs # noqa: E501
def mark_as_handled(
rds_data_client,
update: bool,
category: str,
test_suite: str,
test_name: str,
result_hash: str,
last_notification_dt: datetime.datetime,
):
schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]
if not update:
sql = f"""
INSERT INTO {schema}
(category, test_suite, test_name,
last_result_hash, last_notification_dt)
VALUES (:category, :test_suite, :test_name,
:last_result_hash, :last_notification_dt)
"""
else:
sql = f"""
UPDATE {schema}
SET last_result_hash=:last_result_hash,
last_notification_dt=:last_notification_dt
WHERE category=:category AND test_suite=:test_suite
AND test_name=:test_name
"""
rds_data_client.execute_statement(
database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
parameters=[
{"name": "category", "value": {"stringValue": category}},
{"name": "test_suite", "value": {"stringValue": test_suite or ""}},
{"name": "test_name", "value": {"stringValue": test_name}},
{"name": "last_result_hash", "value": {"stringValue": result_hash}},
{
"name": "last_notification_dt",
"typeHint": "TIMESTAMP",
"value": {
"stringValue": last_notification_dt.strftime("%Y-%m-%d %H:%M:%S")
},
},
],
secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
schema=schema,
sql=sql,
)
def post_alerts_to_slack(
channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
):
if len(alerts) == 0:
logger.info("No alerts to post to slack.")
return
markdown_lines = [
f"* {len(alerts)} new release test failures found!*",
"",
]
category_alerts = defaultdict(list)
for (category, test_suite, test_name, alert) in alerts:
category_alerts[category].append(
f" *{test_suite}/{test_name}* failed: {alert}"
)
for category, alert_list in category_alerts.items():
markdown_lines.append(f"Branch: *{category}*")
markdown_lines.extend(alert_list)
markdown_lines.append("")
total_non_alerts = sum(n for n in non_alerts.values())
non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]
markdown_lines += [
f"Additionally, {total_non_alerts} tests passed successfully "
f"({', '.join(non_alert_detail)})."
]
slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]
resp = requests.post(
slack_url,
json={
"text": "\n".join(markdown_lines),
"channel": channel,
"username": "Fail Bot",
"icon_emoji": ":red_circle:",
},
)
print(resp.status_code)
print(resp.text)
def post_statistics_to_slack(
channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
):
total_alerts = len(alerts)
category_alerts = defaultdict(list)
for (category, test_suite, test_name, alert) in alerts:
category_alerts[category].append(f"`{test_suite}/{test_name}`")
alert_detail = [f"{len(a)} on {c}" for c, a in category_alerts.items()]
total_non_alerts = sum(n for n in non_alerts.values())
non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]
markdown_lines = [
"*Periodic release test report*",
"",
f"In the past 24 hours, "
f"*{total_non_alerts}* release tests finished successfully, and "
f"*{total_alerts}* release tests failed.",
]
markdown_lines.append("")
if total_alerts:
markdown_lines.append(f"*Failing:* {', '.join(alert_detail)}")
for c, a in category_alerts.items():
markdown_lines.append(f" *{c}*: {', '.join(sorted(a))}")
else:
markdown_lines.append("*Failing:* None")
markdown_lines.append("")
if total_non_alerts:
markdown_lines.append(f"*Passing:* {', '.join(non_alert_detail)}")
else:
markdown_lines.append("*Passing:* None")
slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]
resp = requests.post(
slack_url,
json={
"text": "\n".join(markdown_lines),
"channel": channel,
"username": "Fail Bot",
"icon_emoji": ":red_circle:",
},
)
print(resp.status_code)
print(resp.text)
def handle_results_and_get_alerts(
rds_data_client,
fetch_since: Optional[datetime.datetime] = None,
always_try_alert: bool = False,
no_status_update: bool = False,
):
# First build a map of last notifications
last_notifications_map = {}
for (
category,
test_suite,
test_name,
last_result_hash,
last_notification_dt,
) in fetch_latest_alerts(rds_data_client):
last_notifications_map[(category, test_suite, test_name)] = (
last_result_hash,
last_notification_dt,
)
alerts = []
non_alerts = Counter()
# Then fetch latest results
for (
result_hash,
created_on,
category,
test_suite,
test_name,
status,
results,
artifacts,
last_logs,
) in fetch_latest_results(rds_data_client, fetch_since=fetch_since):
key = (category, test_suite, test_name)
try_alert = always_try_alert
if key in last_notifications_map:
# If we have an alert for this key, fetch info
last_result_hash, last_notification_dt = last_notifications_map[key]
if last_result_hash != result_hash:
# If we got a new result, handle new result
try_alert = True
# Todo: maybe alert again after some time?
else:
try_alert = True
if try_alert:
handle_fn = SUITE_TO_FN.get(test_suite, None)
if not handle_fn:
logger.warning(f"No handle for suite {test_suite}")
alert = default_handle_result(
created_on,
category,
test_suite,
test_name,
status,
results,
artifacts,
last_logs,
)
else:
alert = handle_fn(
created_on,
category,
test_suite,
test_name,
status,
results,
artifacts,
last_logs,
)
if alert:
logger.warning(
f"Alert raised for test {test_suite}/{test_name} "
f"({category}): {alert}"
)
alerts.append((category, test_suite, test_name, alert))
else:
logger.debug(
f"No alert raised for test {test_suite}/{test_name} "
f"({category})"
)
non_alerts[category] += 1
if not no_status_update:
mark_as_handled(
rds_data_client,
key in last_notifications_map,
category,
test_suite,
test_name,
result_hash,
datetime.datetime.now(),
)
return alerts, non_alerts
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--stats",
action="store_true",
default=False,
help="Finish quickly for training.",
)
args = parser.parse_args()
maybe_fetch_slack_webhook()
rds_data_client = boto3.client("rds-data", region_name="us-west-2")
if args.stats:
# Only update last 24 hour stats
fetch_since = datetime.datetime.now() - datetime.timedelta(days=1)
alerts, non_alerts = handle_results_and_get_alerts(
rds_data_client,
fetch_since=fetch_since,
always_try_alert=True,
no_status_update=True,
)
post_statistics_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)
else:
alerts, non_alerts = handle_results_and_get_alerts(rds_data_client)
post_alerts_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)

@@ -1,145 +0,0 @@
- name: single_node
team: core
cluster:
app_config: app_config.yaml
compute_template: single_node.yaml
run:
timeout: 12000
prepare: sleep 0
script: python single_node/test_single_node.py
- name: object_store
team: core
cluster:
app_config: app_config.yaml
compute_template: object_store.yaml
run:
timeout: 3600
prepare: python distributed/wait_cluster.py --num-nodes=50
script: python object_store/test_object_store.py
- name: many_actors
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=65
script: python distributed/test_many_actors.py
- name: many_actors_smoke_test
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=2
script: SMOKE_TEST=1 python distributed/test_many_actors.py
- name: many_tasks
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=65
script: python distributed/test_many_tasks.py --num-tasks=10000
- name: many_tasks_smoke_test
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=2
script: python distributed/test_many_tasks.py --num-tasks=100
- name: many_pgs
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=65
script: python distributed/test_many_pgs.py
- name: many_pgs_smoke_test
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=2
script: SMOKE_TEST=1 python distributed/test_many_pgs.py
# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
- name: many_nodes
team: core
cluster:
app_config: app_config.yaml
compute_template: many_nodes.yaml
run:
timeout: 3600 # 1hr
prepare: python distributed/wait_cluster.py --num-nodes=250
script: python distributed/test_many_tasks.py --num-tasks=1000
- name: scheduling_test_many_0s_tasks_single_node
team: core
cluster:
app_config: app_config.yaml
compute_template: scheduling.yaml
run:
timeout: 3600
prepare: python distributed/wait_cluster.py --num-nodes=32
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1
- name: scheduling_test_many_0s_tasks_many_nodes
team: core
cluster:
app_config: app_config.yaml
compute_template: scheduling.yaml
run:
timeout: 3600
prepare: python distributed/wait_cluster.py --num-nodes=32
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1
- name: scheduling_test_many_5s_tasks_single_node
team: core
cluster:
app_config: app_config.yaml
compute_template: scheduling.yaml
run:
timeout: 3600
prepare: python distributed/wait_cluster.py --num-nodes=32
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
stable: false
- name: scheduling_test_many_5s_tasks_many_nodes
team: core
cluster:
app_config: app_config.yaml
compute_template: scheduling.yaml
run:
timeout: 3600
prepare: python distributed/wait_cluster.py --num-nodes=32
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
stable: false

@@ -1,24 +0,0 @@
import click
import ray
import time
def num_alive_nodes():
n = 0
for node in ray.nodes():
if node["Alive"]:
n += 1
return n
@click.command()
@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
def wait_cluster(num_nodes: int):
ray.init(address="auto")
while num_alive_nodes() != num_nodes:
print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}")
time.sleep(5)
if __name__ == "__main__":
wait_cluster()

@@ -1,54 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,214 +0,0 @@
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>Releaser config generator</title>
<style type="text/css">
html {
background: #cccccc;
}
body {
background: #ffffff;
font-family: sans-serif;
padding: 1em 2em;
max-width: 800px;
margin: 0 auto;
}
textarea {
width: 600px;
height: 200px;
}
form .use {
white-space: nowrap;
padding-right: 1em;
}
form .val {
min-width: 300px;
}
form .val input {
width: 90%;
}
form .desc {
}
</style>
<script type="text/javascript">
var env_vars = [
{
"name": "RAY_TEST_REPO",
"short": "Git repo with test files",
"long": "Repository in which the test files are which you would like to run. Note that this doesn't have to be the same repo from which the wheels are installed.",
"default": "https://github.com/ray-project/ray.git",
"enabled": false,
},
{
"name": "RAY_TEST_BRANCH",
"short": "Git branch for test repo",
"long": "Git branch that is checked out from RAY_TEST_REPO and which contains the test files you would like to run. Note that this doesnt' have to be the same branch you're fetching the Ray wheels from.",
"default": "master",
"enabled": false,
},
{
"name": "RAY_REPO",
"short": "Git repo for the Ray wheels",
"long": "Repository from which to fetch the latest commits to find the Ray wheels",
"default": "https://github.com/ray-project/ray.git",
"enabled": false,
},
{
"name": "RAY_BRANCH",
"short": "Git branch for the Ray wheels",
"long": "Branch that is check out from RAY_REPO from which the latest commits are fetched to find the Ray wheels",
"default": "master",
"enabled": true,
},
{
"name": "RELEASE_TEST_SUITE",
"short": "Release test suite (nightly/weekly/manual)",
"long": "Release test suite as defined in releaser's build_pipeline.py",
"default": "nightly",
"enabled": true,
},
{
"name": "FILTER_FILE",
"short": "Filter test file by this string",
"long": "Only test files (e.g. xgboost_tests.yml) that match this string will be included in the test",
"default": "",
"enabled": false,
},
{
"name": "FILTER_TEST",
"short": "Filter test name by this string",
"long": "Only test names (e.g. tune_4x32) that match this string will be included in the test",
"default": "",
"enabled": false,
},
]
window.addEventListener('load', function () {
var table = document.getElementById("gen_table");
for (var env_var of env_vars) {
var use_td = document.createElement("td");
use_td.setAttribute("class", "use");
var use_input = document.createElement("input");
use_input.setAttribute("type", "checkbox");
use_input.setAttribute("data-activate", env_var["name"] + "_val");
use_input.setAttribute("id", env_var["name"] + "_use");
use_input.setAttribute("class", "input_use");
if (env_var["enabled"]) {
use_input.checked = true;
}
var use_label = document.createElement("label");
use_label.setAttribute("for", env_var["name"] + "_use");
use_label.innerHTML = env_var["name"];
use_td.append(use_input);
use_td.append(use_label);
val_td = document.createElement("td");
val_td.setAttribute("class", "val");
val_input = document.createElement("input");
val_input.setAttribute("type", "text");
if (!env_var["enabled"]) {
val_input.setAttribute("disabled", "disabled");
}
val_input.setAttribute("id", env_var["name"] + "_val");
val_input.setAttribute("name", env_var["name"]);
val_input.setAttribute("value", env_var["default"]);
val_input.setAttribute("class", "input_val");
val_td.append(val_input);
use_input.addEventListener("click", function(e) {
var toggle_val = document.getElementById(e.target.getAttribute("data-activate"))
if (toggle_val.disabled) {
toggle_val.removeAttribute("disabled");
} else {
toggle_val.setAttribute("disabled", "disabled");
}
generate_snippet();
});
val_input.addEventListener("change", function() { generate_snippet(); });
val_input.addEventListener("keydown", function() { generate_snippet(); });
val_input.addEventListener("keyup", function() { generate_snippet(); });
var desc_td = document.createElement("td");
desc_td.setAttribute("class", "desc");
var desc_a = document.createElement("a");
desc_a.setAttribute("title", env_var["long"]);
desc_a.innerHTML = env_var["short"];
desc_td.append(desc_a);
var tr = document.createElement("tr");
tr.append(use_td);
tr.append(val_td);
tr.append(desc_td);
table.append(tr);
}
var button = document.getElementById("generate");
button.addEventListener("click", function() {
generate_snippet();
})
generate_snippet()
})
function generate_snippet() {
full_snippet = ""
for (env_var of env_vars) {
var val_input = document.getElementById(env_var["name"] + "_val")
if (!val_input.disabled) {
full_snippet += env_var["name"] + "=\"" + val_input.value + "\"\n"
}
}
document.getElementById("snippet").innerHTML = full_snippet;
}
</script>
</head>
<body>
<header class="header">
<h1>Releaser config generator</h1>
<p>Use this form to generate a list of environment variables.</p>
<p>These variables can be passed to Buildkite to run a subset of release tests
and choose the correct wheels/release test branch</p>
</header>
<section class="main">
<form id="gen">
<table id="gen_table">
<tr>
<th>Set</th>
<th>Value</th>
<th>Description</th>
</tr>
</table>
</form>
<div>
<button id="generate">Generate snippet</button>
</div>
<div>
<textarea id="snippet">
</textarea>
</div>
</section>
</body>
</html>
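
With the default selections in this form (only RAY_BRANCH and RELEASE_TEST_SUITE enabled), generate_snippet() fills the textarea with one NAME="value" assignment per enabled variable, for example:

RAY_BRANCH="master"
RELEASE_TEST_SUITE="nightly"

Per the form's own description, these lines were intended to be passed to Buildkite to run a subset of release tests against the chosen wheels and test branch.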

File diff suppressed because it is too large.

@@ -1,15 +0,0 @@
- name: horovod_test
team: ml
cluster:
app_config: app_config_master.yaml
compute_template: compute_tpl.yaml
run:
timeout: 36000
prepare: python wait_cluster.py 3 600
script: python workloads/horovod_tune_test.py
long_running: True
smoke_test:
run:
timeout: 1800

@@ -1,53 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,92 +0,0 @@
- name: train_small
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
run:
use_connect: True
autosuspend_mins: 10
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/train_small.py
- name: train_moderate
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml
run:
timeout: 600
prepare: python wait_cluster.py 32 600
script: python workloads/train_moderate.py
- name: train_gpu
team: ml
cluster:
app_config: app_config_gpu.yaml
compute_template: tpl_gpu_small.yaml
run:
timeout: 600
prepare: python wait_cluster.py 5 600
script: python workloads/train_gpu.py
- name: distributed_api_test
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
results:
run:
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/distributed_api_test.py
results: ""
- name: ft_small_non_elastic
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
run:
timeout: 900
prepare: python wait_cluster.py 4 600
script: python workloads/ft_small_non_elastic.py
results: ""
- name: tune_small
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
run:
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/tune_small.py
- name: tune_32x4
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml
run:
timeout: 900
prepare: python wait_cluster.py 32 600
script: python workloads/tune_32x4.py
- name: tune_4x32
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml
run:
timeout: 900
prepare: python wait_cluster.py 32 600
script: python workloads/tune_4x32.py

@@ -1,53 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,13 +0,0 @@
- name: pytorch_pbt_failure
team: ml
cluster:
app_config: app_config.yaml
compute_template: compute_tpl.yaml
run:
timeout: 86400
script: python workloads/pytorch_pbt_failure.py
long_running: True
smoke_test:
timeout: 3600

@@ -1,196 +0,0 @@
- name: actor_deaths
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
run:
timeout: 86400
prepare: ray stop
script: python workloads/actor_deaths.py
long_running: True
smoke_test:
run:
timeout: 3600
- name: apex
team: ml
cluster:
app_config: ../rllib_tests/app_config.yaml
compute_template: tpl_cpu_3.yaml
run:
timeout: 86400
prepare: python wait_cluster.py 3 600
script: python workloads/apex.py
long_running: True
smoke_test:
run:
timeout: 3600
- name: impala
team: ml
cluster:
app_config: app_config_np.yaml
compute_template: tpl_cpu_1_large.yaml
run:
timeout: 86400
script: python workloads/impala.py
long_running: True
smoke_test:
run:
timeout: 3600
- name: many_actor_tasks
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
run:
timeout: 86400
prepare: ray stop
script: python workloads/many_actor_tasks.py
long_running: True
smoke_test:
run:
timeout: 3600
- name: many_drivers
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
run:
timeout: 86400
prepare: ray stop
script: python workloads/many_drivers.py --iteration-num=4000
long_running: True
smoke_test:
run:
timeout: 3600
- name: many_ppo
team: ml
cluster:
app_config: ../rllib_tests/app_config.yaml
compute_template: many_ppo.yaml
run:
timeout: 86400
prepare: python wait_cluster.py 1 600
script: python workloads/many_ppo.py
long_running: True
smoke_test:
run:
timeout: 3600
- name: many_tasks
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
run:
timeout: 86400
prepare: ray stop
script: python workloads/many_tasks.py
long_running: True
smoke_test:
run:
timeout: 3600
- name: many_tasks_serialized_ids
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
run:
timeout: 86400
prepare: ray stop
script: python workloads/many_tasks_serialized_ids.py
long_running: True
smoke_test:
run:
timeout: 3600
- name: node_failures
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
run:
timeout: 86400
prepare: ray stop
script: python workloads/node_failures.py
long_running: True
smoke_test:
run:
timeout: 3600
- name: pbt
team: ml
cluster:
app_config: ../rllib_tests/app_config.yaml
compute_template: tpl_cpu_1.yaml
run:
timeout: 86400
prepare: ray stop
script: python workloads/pbt.py
long_running: True
smoke_test:
run:
timeout: 3600
- name: serve
team: serve
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
run:
timeout: 86400
prepare: ray stop
script: python workloads/serve.py
long_running: True
smoke_test:
run:
timeout: 3600
- name: serve_failure
team: serve
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
run:
timeout: 86400
prepare: ray stop
script: python workloads/serve_failure.py
long_running: True
smoke_test:
run:
timeout: 600
stable: False

@@ -1,53 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,9 +0,0 @@
# - name: microbenchmark
# team: core
# cluster:
# app_config: app_config.yaml
# compute_template: tpl_64.yaml
# run:
# timeout: 1800
# script: OMP_NUM_THREADS=64 RAY_ADDRESS= python run_microbenchmark.py

@@ -1,124 +0,0 @@
- name: horovod_user_test_latest
team: ml
cluster:
app_config: horovod/app_config.yaml
compute_template: horovod/compute_tpl.yaml
driver_setup: horovod/driver_setup_latest.sh
run:
use_connect: True
autosuspend_mins: 10
timeout: 1200
script: python horovod/horovod_user_test.py
- name: horovod_user_test_master
team: ml
cluster:
app_config: ../horovod_tests/app_config_master.yaml
compute_template: horovod/compute_tpl.yaml
driver_setup: horovod/driver_setup_master.sh
run:
use_connect: True
autosuspend_mins: 10
timeout: 1200
script: python horovod/horovod_user_test.py
- name: train_tensorflow_mnist_test
team: ml
cluster:
app_config: train/app_config.yaml
compute_template: train/compute_tpl.yaml
driver_setup: train/driver_setup.sh
run:
use_connect: True
timeout: 36000
script: python train/train_tensorflow_mnist_test.py
- name: train_torch_linear_test
team: ml
cluster:
app_config: train/app_config.yaml
compute_template: train/compute_tpl.yaml
driver_setup: train/driver_setup.sh
run:
use_connect: True
timeout: 36000
script: python train/train_torch_linear_test.py
- name: xgboost_gpu_connect_latest
team: ml
cluster:
app_config: xgboost/app_config_gpu.yaml
compute_template: xgboost/tpl_gpu_small_scaling.yaml
run:
use_connect: True
timeout: 1200
script: python xgboost/train_gpu_connect.py
- name: xgboost_gpu_connect_master
team: ml
cluster:
app_config: xgboost/app_config_gpu_master.yaml
compute_template: xgboost/tpl_gpu_small_scaling.yaml
run:
use_connect: True
timeout: 1200
script: python xgboost/train_gpu_connect.py
- name: ray_lightning_user_test_latest
team: ml
cluster:
app_config: ray-lightning/app_config.yaml
compute_template: ray-lightning/compute_tpl.yaml
driver_setup: ray-lightning/driver_setup.sh
run:
use_connect: True
autosuspend_mins: 10
timeout: 1200
script: python ray-lightning/ray_lightning_user_test.py
- name: ray_lightning_user_test_master
team: ml
cluster:
app_config: ray-lightning/app_config_master.yaml
compute_template: ray-lightning/compute_tpl.yaml
driver_setup: ray-lightning/driver_setup.sh
run:
use_connect: True
autosuspend_mins: 10
timeout: 1200
script: python ray-lightning/ray_lightning_user_test.py
- name: tune_rllib_connect_test
team: ml
cluster:
app_config: ../rllib_tests/app_config.yaml
compute_template: tune_rllib/compute_tpl.yaml
driver_setup: tune_rllib/driver_setup.sh
run:
use_connect: True
autosuspend_mins: 10
timeout: 1200
script: python tune_rllib/run_connect_tests.py

@@ -1,64 +0,0 @@
#
# Chaos tests.
#
# Run the test that invokes many tasks without object store usage.
- name: chaos_many_tasks_no_object_store
team: core
cluster:
app_config: chaos_test/app_config.yaml
compute_template: chaos_test/compute_template.yaml
run:
timeout: 3600
prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
script: python chaos_test/test_chaos_basic.py --workload=tasks
- name: chaos_many_actors
team: core
cluster:
app_config: chaos_test/app_config.yaml
compute_template: chaos_test/compute_template.yaml
run:
timeout: 3600
prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
script: python chaos_test/test_chaos_basic.py --workload=actors
- name: chaos_dask_on_ray_large_scale_test_no_spilling
team: core
cluster:
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
run:
timeout: 7200
# Total run time without failures is about 300-400s.
prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
# Test large scale dask on ray test with spilling.
- name: chaos_dask_on_ray_large_scale_test_spilling
team: core
cluster:
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
run:
timeout: 7200
# Total run time without failures is about 300-400s.
prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
- name: chaos_pipelined_ingestion_1500_gb_15_windows
team: core
cluster:
app_config: dataset/pipelined_ingestion_app.yaml
compute_template: dataset/pipelined_ingestion_compute.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 21 2400; python setup_chaos.py --node-kill-interval 300
script: python dataset/pipelined_training.py --epochs 1 --num-windows 15 --num-files 915 --debug
stable: false

@@ -1,95 +0,0 @@
- name: inference
team: core
cluster:
app_config: app_config.yaml
compute_template: inference.yaml
run:
timeout: 600
prepare: python wait_cluster.py 2 600
script: python inference.py
- name: shuffle_data_loader
team: core
cluster:
app_config: shuffle_app_config.yaml
compute_template: shuffle_compute.yaml
run:
timeout: 1800
script: python dataset_shuffle_data_loader.py
- name: parquet_metadata_resolution
team: core
cluster:
app_config: pipelined_training_app.yaml
compute_template: pipelined_training_compute.yaml
run:
timeout: 1200
prepare: python wait_cluster.py 15 1200
script: python parquet_metadata_resolution.py --num-files 915
- name: pipelined_training_50_gb
team: core
cluster:
app_config: pipelined_training_app.yaml
compute_template: pipelined_training_compute.yaml
run:
timeout: 4800
prepare: python wait_cluster.py 15 1200
script: python pipelined_training.py --epochs 1
- name: pipelined_ingestion_1500_gb
team: core
cluster:
app_config: pipelined_ingestion_app.yaml
compute_template: pipelined_ingestion_compute.yaml
run:
timeout: 9600
prepare: python wait_cluster.py 21 2400
script: python pipelined_training.py --epochs 2 --num-windows 2 --num-files 915 --debug
- name: datasets_ingest_train_infer
team: core
cluster:
app_config: ray_sgd_training_app.yaml
compute_template: ray_sgd_training_compute.yaml
run:
timeout: 14400
prepare: python wait_cluster.py 66 2400
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset
smoke_test:
cluster:
app_config: ray_sgd_training_app.yaml
compute_template: ray_sgd_training_smoke_compute.yaml
run:
timeout: 3600
prepare: python wait_cluster.py 8 2400
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
- name: datasets_preprocess_ingest
team: core
cluster:
app_config: ray_sgd_training_app.yaml
compute_template: ray_sgd_training_compute_no_gpu.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 21 2400
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset --debug
- name: datasets_ingest_400G
team: core
cluster:
app_config: ray_sgd_training_app.yaml
compute_template: dataset_ingest_400G_compute.yaml
run:
timeout: 7200
script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1

@@ -1,53 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
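# Poll every 5 seconds until the requested number of nodes has registered,
# printing progress every --feedback_interval_s seconds and raising an error
# once max_time_s is exceeded.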
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,390 +0,0 @@
#
# Single node shuffle
#
# Test basic single node 10GB shuffle with a small number of partitions.
# This doesn't require object spilling.
# - name: shuffle_10gb
# team: core
# cluster:
# app_config: shuffle/shuffle_app_config.yaml
# compute_template: shuffle/shuffle_compute_single.yaml
# run:
# timeout: 3000
# script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6
# Test single-node 50GB shuffle with a small number of partitions.
- name: shuffle_50gb
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9
# Test single node 50GB shuffle with a large number of partitions.
- name: shuffle_50gb_large_partition
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6
# Test non-streaming shuffle on a single node with a small number of partitions.
- name: non_streaming_shuffle_50gb
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming
# Test non-streaming shuffle on a single node with a large number of partitions.
- name: non_streaming_shuffle_50gb_large_partition
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming
- name: dask_on_ray_10gb_sort
team: core
cluster:
app_config: dask_on_ray/dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
run:
timeout: 7200
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
- name: dask_on_ray_100gb_sort
team: core
cluster:
app_config: dask_on_ray/dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
run:
timeout: 7200
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
#
# Multi node shuffle
#
# Test multi-node 100GB shuffle with a small number of partitions.
- name: shuffle_100gb
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_multi.yaml
run:
timeout: 3000
prepare: python wait_cluster.py 4 600
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
# Test non-streaming multi-node 100GB shuffle with a small number of partitions.
- name: non_streaming_shuffle_100gb
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_multi.yaml
run:
timeout: 3000
prepare: python wait_cluster.py 4 600
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
# Test autoscaling 1TB non-streaming shuffle with a large number of partitions.
- name: autoscaling_shuffle_1tb_1000_partitions
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_autoscaling.yaml
run:
timeout: 4000
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
# Test multi-node 1TB streaming shuffle with a large number of partitions.
- name: shuffle_1tb_1000_partition
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_large_scale.yaml
run:
timeout: 3000
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
# Test multi-node 1TB non-streaming shuffle with a large number of partitions.
- name: non_streaming_shuffle_1tb_1000_partition
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_large_scale.yaml
run:
timeout: 3000
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
# Stress test for 1TB multi node streaming shuffle.
- name: shuffle_1tb_5000_partitions
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_large_scale.yaml
run:
timeout: 9000
prepare: python wait_cluster.py 20 900
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
# Stress test for 1TB multi node non-streaming shuffle.
# - name: non_streaming_shuffle_1tb_5000_partitions
# team: core
# stable: False
# cluster:
# app_config: shuffle/shuffle_app_config.yaml
# compute_template: shuffle/shuffle_compute_large_scale.yaml
# run:
# timeout: 7200
# prepare: python wait_cluster.py 20 900
# script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming
- name: k8s_dask_on_ray_large_scale_test_no_spilling
team: core
cluster:
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_stress_compute_k8s.yaml
compute_on_k8s: True
run:
timeout: 7200
prepare: python wait_cluster.py 21 600
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
stable: false
# # Test large scale dask on ray test without spilling.
# - name: dask_on_ray_large_scale_test_no_spilling
# team: core
# cluster:
# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
# compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
# run:
# timeout: 7200
# prepare: python wait_cluster.py 21 600
# script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
# smoke_test:
# cluster:
# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
# compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
# run:
# timeout: 7200
# prepare: python wait_cluster.py 5 600
# script: python dask_on_ray/large_scale_test.py --num_workers 4 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
# Test large-scale Dask-on-Ray with spilling.
- name: dask_on_ray_large_scale_test_spilling
team: core
cluster:
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 21 600
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
smoke_test:
cluster:
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 5 600
script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
# Stress tests with many tasks
- name: stress_test_many_tasks
team: core
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/stress_tests_compute.yaml
run:
timeout: 7200
script: python stress_tests/test_many_tasks.py
smoke_test:
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/smoke_test_compute.yaml
run:
timeout: 3600
script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
# Stress tests with dead actors
- name: stress_test_dead_actors
team: core
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/stress_tests_compute.yaml
run:
timeout: 7200
script: python stress_tests/test_dead_actors.py
smoke_test:
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/smoke_test_compute.yaml
run:
timeout: 3600
script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3 --num-children=3
# Stress tests with placement groups
- name: stress_test_placement_group
team: core
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/placement_group_tests_compute.yaml
run:
timeout: 7200
script: python stress_tests/test_placement_group.py
# Stress tests with many threaded actors.
- name: threaded_actors_stress_test
team: core
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/stress_test_threaded_actor_compute.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 201 600
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
smoke_test:
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/smoke_test_compute.yaml
run:
timeout: 3600
prepare: python wait_cluster.py 5 600
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
stable: false
- name: k8s_threaded_actors_stress_test
team: core
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/k8s_stress_test_threaded_actor_compute.yaml
compute_on_k8s: True
run:
timeout: 7200
prepare: python wait_cluster.py 201 600
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
smoke_test:
run:
timeout: 3600
prepare: python wait_cluster.py 5 600
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
stable: false
# Test decision tree on an autoscaling compute cluster.
- name: decision_tree_autoscaling
team: core
cluster:
app_config: decision_tree/decision_tree_app_config.yaml
compute_template: decision_tree/autoscaling_compute.yaml
run:
timeout: 3000
script: python decision_tree/cart_with_tree.py
# Test 20 concurrent decision tree runs on an autoscaling compute cluster.
- name: decision_tree_autoscaling_20_runs
team: core
cluster:
app_config: decision_tree/decision_tree_app_config.yaml
compute_template: decision_tree/autoscaling_compute.yaml
run:
timeout: 9600
script: python decision_tree/cart_with_tree.py --concurrency=20
- name: dask_on_ray_1tb_sort
team: core
cluster:
app_config: dask_on_ray/dask_on_ray_app_config.yaml
compute_template: dask_on_ray/1tb_sort_compute.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 32 1000
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
- name: many_nodes_actor_test
team: core
cluster:
app_config: many_nodes_tests/app_config.yaml
compute_template: many_nodes_tests/compute_config.yaml
run:
timeout: 7200
prepare: python wait_cluster.py 251 5400
script: python many_nodes_tests/actor_test.py
- name: pg_autoscaling_regression_test
team: core
cluster:
app_config: placement_group_tests/app_config.yaml
compute_template: placement_group_tests/compute.yaml
run:
timeout: 1200
script: python placement_group_tests/pg_run.py
- name: pg_long_running_performance_test
team: core
cluster:
app_config: placement_group_tests/app_config.yaml
compute_template: placement_group_tests/long_running_test_compute.yaml
run:
timeout: 3600
prepare: python wait_cluster.py 2 600
script: python placement_group_tests/long_running_performance_test.py --num-stages 2000
- name: placement_group_performance_test
team: core
cluster:
app_config: placement_group_tests/app_config.yaml
compute_template: placement_group_tests/pg_perf_test_compute.yaml
run:
timeout: 1200
prepare: python wait_cluster.py 5 600
script: python placement_group_tests/placement_group_performance_test.py
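Each test entry in these suite files shares the same top-level schema: name, team, cluster (app_config, compute_template), run (timeout, optional prepare, script), plus optional smoke_test and stable flags. As a rough illustration only, with a hypothetical file name and assuming PyYAML is installed, a few lines of Python are enough to list the tests in a suite and their timeouts:

import yaml  # assumption: PyYAML is available

# Hypothetical path; point this at whichever suite file you want to inspect.
with open("nightly_tests.yaml") as f:
    tests = yaml.safe_load(f)

# Every entry is a mapping with name, team, cluster and run keys.
for test in tests:
    run = test.get("run", {})
    print(f"{test['name']}: team={test['team']}, timeout={run.get('timeout')}s")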

@@ -1,54 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,103 +0,0 @@
# Heavy learning tests (Atari and HalfCheetah) for major algos.
- name: learning_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 8gpus_64cpus.yaml
run:
timeout: 14400
script: python learning_tests/run.py
smoke_test:
run:
timeout: 1200
# 2-GPU learning tests (CartPole and RepeatAfterMeEnv) for major algos.
- name: multi_gpu_learning_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 8gpus_96cpus.yaml
run:
timeout: 7200
script: python multi_gpu_learning_tests/run.py
# 2-GPU learning tests (StatelessCartPole) + use_lstm=True for major algos
# (that support RNN models).
- name: multi_gpu_with_lstm_learning_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 8gpus_96cpus.yaml
run:
timeout: 7200
script: python multi_gpu_with_lstm_learning_tests/run.py
# 2-GPU learning tests (StatelessCartPole) + use_attention=True for major
# algos (that support RNN models).
- name: multi_gpu_with_attention_learning_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 8gpus_96cpus.yaml
run:
timeout: 7200
script: python multi_gpu_with_attention_learning_tests/run.py
# We'll have these as per-PR tests soon.
# - name: example_scripts_on_gpu_tests
# team: ml
# cluster:
# app_config: app_config.yaml
# compute_template: 1gpu_4cpus.yaml
# run:
# timeout: 7200
# script: bash unit_gpu_tests/run.sh
# IMPALA large machine stress tests (4x Atari).
- name: stress_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 4gpus_544_cpus.yaml
run:
timeout: 5400
prepare: python wait_cluster.py 6 600
script: python stress_tests/run_stress_tests.py
smoke_test:
run:
timeout: 2000
# Tests that exercise auto-scaling and Anyscale connect.
- name: connect_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: auto_scale.yaml
run:
use_connect: True
timeout: 3000
script: python connect_tests/run_connect_tests.py
# Nightly performance regression tests for popular algorithms.
# These algorithms run nightly for a pre-determined amount of time without
# pass/fail criteria.
# Performance metrics, such as reward achieved and throughput, are then
# collected and tracked over time.
- name: performance_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 12gpus_192cpus.yaml
run:
timeout: 10800
script: python performance_tests/run.py

@@ -1,53 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,176 +0,0 @@
#!/bin/bash
set -ex
cd "${0%/*}" || exit 1
reason() {
# Keep in sync with e2e.py ExitCode enum
case $1 in
0)
REASON="success"
;;
2)
REASON="unspecified"
;;
3)
REASON="unknown"
;;
4)
REASON="runtime error"
;;
5)
REASON="command error"
;;
6)
REASON="command timeout"
;;
7)
REASON="prepare timeout"
;;
8)
REASON="filesync timeout"
;;
9)
REASON="session timeout"
;;
10)
REASON="prepare error"
;;
11)
REASON="app config build error"
;;
12)
REASON="infra error"
;;
*)
REASON="untracked error"
;;
esac
echo "${REASON}"
}
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--ray-repo)
shift
RAY_REPO=$1
;;
--ray-branch)
shift
RAY_BRANCH=$1
;;
--ray-version)
shift
RAY_VERSION=$1
;;
--ray-wheels)
shift
RAY_WHEELS=$1
;;
--ray-test-repo)
shift
RAY_TEST_REPO=$1
;;
--ray-test-branch)
shift
RAY_TEST_BRANCH=$1
;;
--release-results-dir)
shift
RELEASE_RESULTS_DIR=$1
;;
*)
break
esac
shift
done
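# Default to the upstream ray-project/ray repository for test code, the master
# branch, and /tmp/artifacts for results when not overridden on the command line.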
RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}
export RAY_REPO RAY_BRANCH RAY_VERSION RAY_WHEELS RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR
pip uninstall -q -y ray
pip install -q -r requirements.txt
pip install -q -U boto3 botocore
git clone -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" ~/ray
RETRY_NUM=0
MAX_RETRIES=${MAX_RETRIES-3}
if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
MAX_RETRIES=1
fi
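# Retry loop: exit codes 7 (prepare timeout), 9 (session timeout) and 10
# (prepare error) are treated as transient infrastructure failures and retried
# up to MAX_RETRIES times with a randomized backoff; any other non-zero exit
# code aborts immediately.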
ALL_EXIT_CODES=()
while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
RETRY_NUM=$((RETRY_NUM + 1))
if [ "$RETRY_NUM" -gt 1 ]; then
# Sleep for a random time between 30 and 120 minutes
SLEEP_TIME=$((1800 + RANDOM % 5400))
echo "----------------------------------------"
echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
echo "----------------------------------------"
sleep ${SLEEP_TIME}
fi
sudo rm -rf "${RELEASE_RESULTS_DIR}"/* || true
# Capture the exit code from e2e.py without letting `set -e` abort the retry loop.
python e2e.py "$@" && EXIT_CODE=$? || EXIT_CODE=$?
REASON=$(reason "${EXIT_CODE}")
ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE
case ${EXIT_CODE} in
0)
echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
break
;;
7 | 9 | 10)
echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
;;
*)
echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
break
;;
esac
done
sudo rm -rf /tmp/ray_release_test_artifacts/* || true
sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true
echo "----------------------------------------"
echo "e2e test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
echo "Run results:"
COUNTER=1
for EX in "${ALL_EXIT_CODES[@]}"; do
REASON=$(reason "${EX}")
echo " Run $COUNTER: Exit code = ${EX} (${REASON})"
COUNTER=$((COUNTER + 1))
done
echo "----------------------------------------"
REASON=$(reason "${EXIT_CODE}")
echo "Final e2e exit code is ${EXIT_CODE} (${REASON})"
case ${EXIT_CODE} in
0)
;;
7 | 9 | 10)
echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."
;;
*)
echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
;;
esac
exit $EXIT_CODE

@@ -1,34 +0,0 @@
- name: rte_many_tasks_actors
team: serve
cluster:
app_config: app_config.yaml
compute_template: rte_small.yaml
run:
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/rte_many_tasks_actors.py
- name: wheel_urls
team: serve
cluster:
app_config: app_config.yaml
compute_template: rte_minimal.yaml
run:
timeout: 9000 # 2h30m
prepare: python wait_cluster.py 1 600
script: python workloads/wheel_urls.py
- name: rte_ray_client
team: serve
cluster:
app_config: app_config.yaml
compute_template: rte_minimal.yaml
run:
use_connect: True
autosuspend_mins: 10
timeout: 600
prepare: python wait_cluster.py 1 600
script: python workloads/rte_ray_client.py

@@ -1,53 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,101 +0,0 @@
- name: single_deployment_1k_noop_replica
team: serve
cluster:
app_config: app_config.yaml
compute_template: compute_tpl_32_cpu.yaml
run:
timeout: 7200
long_running: False
script: python workloads/single_deployment_1k_noop_replica.py
smoke_test:
timeout: 600
- name: multi_deployment_1k_noop_replica
team: serve
cluster:
app_config: app_config.yaml
compute_template: compute_tpl_32_cpu.yaml
run:
timeout: 7200
long_running: False
script: python workloads/multi_deployment_1k_noop_replica.py
smoke_test:
timeout: 600
- name: autoscaling_single_deployment
team: serve
cluster:
app_config: app_config.yaml
compute_template: compute_tpl_8_cpu_autoscaling.yaml
run:
timeout: 7200
long_running: False
script: python workloads/autoscaling_single_deployment.py
smoke_test:
timeout: 600
- name: autoscaling_multi_deployment
team: serve
cluster:
app_config: app_config.yaml
compute_template: compute_tpl_8_cpu_autoscaling.yaml
run:
timeout: 7200
long_running: False
script: python workloads/autoscaling_multi_deployment.py
smoke_test:
timeout: 600
- name: serve_micro_benchmark
team: serve
cluster:
app_config: app_config.yaml
# 16 CPUs
compute_template: compute_tpl_single_node.yaml
run:
timeout: 7200
long_running: False
script: python workloads/serve_micro_benchmark.py
smoke_test:
timeout: 600
- name: serve_micro_benchmark_k8s
team: serve
cluster:
app_config: app_config.yaml
# 16 CPUs
compute_template: compute_tpl_single_node_k8s.yaml
compute_on_k8s: True
run:
timeout: 7200
long_running: False
script: python workloads/serve_micro_benchmark.py
smoke_test:
timeout: 600
- name: serve_cluster_fault_tolerance
team: serve
cluster:
app_config: app_config.yaml
# 16 CPUs
compute_template: compute_tpl_single_node.yaml
run:
timeout: 7200
long_running: False
script: python workloads/serve_cluster_fault_tolerance.py
smoke_test:
timeout: 600

@@ -1,11 +0,0 @@
# Test multi-node, multi-GPU Ray SGD example.
- name: sgd_gpu
team: ml
cluster:
app_config: sgd_gpu/sgd_gpu_app_config.yaml
compute_template: sgd_gpu/sgd_gpu_compute.yaml
run:
timeout: 3000
prepare: python wait_cluster.py 2 600
script: python sgd_gpu/sgd_gpu_test.py --num-workers=2 --use-gpu --address=auto

@@ -1,53 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,27 +0,0 @@
# Specify the test owners (teams) here.
# The root key should be the name of the test yaml file without the .yaml.
# To specify owners of subtests, use a sub dict (see e.g. long_running_tests).
golden_notebook_tests: ml
horovod_tests: ml
lightgbm_tests: ml
long_running_distributed_tests: ml
long_running_tests:
actor_deaths: core
apex: ml
impala: ml
many_actor_tasks: core
many_drivers: core
many_ppo: core
many_tasks: core
many_tasks_serialized_ids: core
node_failures: core
pbt: ml
serve: serve
serve_failure: serve
microbenchmark: core
nightly_tests: core
rllib_tests: ml
runtime_env_tests: serve
serve_tests: serve
sgd_tests: ml
xgboost_tests: ml
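To make the sub-dict convention concrete, here is a minimal sketch (the loader path and helper name are hypothetical; PyYAML is assumed) of resolving the owning team for a test file, using the subtest entry when the root key maps to a dict:

import yaml  # assumption: PyYAML is available

def lookup_owner(owners, test_file, subtest=None):
    # Root keys are the test yaml file names without the .yaml suffix.
    entry = owners[test_file]
    # Sub-dict entries (e.g. long_running_tests) map individual subtests to teams.
    return entry[subtest] if isinstance(entry, dict) else entry

with open("owners.yaml") as f:  # hypothetical file name for the mapping above
    owners = yaml.safe_load(f)

print(lookup_owner(owners, "xgboost_tests"))                        # -> ml
print(lookup_owner(owners, "long_running_tests", "serve_failure"))  # -> serve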

@@ -1,118 +0,0 @@
- name: aws_no_sync_down
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_aws_4x2.yaml
run:
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/run_cloud_test.py no_sync_down
- name: aws_ssh_sync
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_aws_4x2.yaml
run:
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/run_cloud_test.py ssh_sync
- name: aws_durable_upload
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_aws_4x2.yaml
run:
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload
- name: aws_durable_upload_rllib_str
team: ml
cluster:
app_config: app_config_ml.yaml
compute_template: tpl_aws_4x2.yaml
run:
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str
- name: aws_durable_upload_rllib_trainer
team: ml
cluster:
app_config: app_config_ml.yaml
compute_template: tpl_aws_4x2.yaml
run:
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer
- name: aws_no_durable_upload
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_aws_4x2.yaml
run:
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload
- name: gcp_k8s_no_sync_down
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_gcp_k8s_4x8.yaml
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
run:
use_connect: True
timeout: 600
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8
- name: gcp_k8s_ssh_sync
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_gcp_k8s_4x8.yaml
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
run:
use_connect: True
timeout: 600
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8
- name: gcp_k8s_durable_upload
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_gcp_k8s_4x8.yaml
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
run:
use_connect: True
timeout: 600
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
- name: gcp_k8s_no_durable_upload
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_gcp_k8s_4x8.yaml
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
run:
use_connect: True
timeout: 600
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
script: python workloads/run_cloud_test.py no_durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload

@@ -1,54 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,90 +0,0 @@
- name: bookkeeping_overhead
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_1x16.yaml
run:
timeout: 1200
script: python workloads/test_bookkeeping_overhead.py
- name: durable_trainable
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_16x2.yaml
run:
timeout: 900
prepare: python wait_cluster.py 16 600
script: python workloads/test_durable_trainable.py --bucket data-test-ilr
- name: long_running_large_checkpoints
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_1x32_hd.yaml
run:
timeout: 86400
script: python workloads/test_long_running_large_checkpoints.py
long_running: True
smoke_test:
run:
timeout: 3600
- name: network_overhead
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_100x2.yaml
run:
timeout: 900
prepare_timeout: 1200
prepare: python wait_cluster.py 100 1200
script: python workloads/test_network_overhead.py
smoke_test:
cluster:
compute_template: tpl_20x2.yaml
run:
timeout: 400
prepare_timeout: 600
prepare: python wait_cluster.py 20 600
- name: result_throughput_cluster
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_16x64.yaml
run:
timeout: 600
prepare: python wait_cluster.py 16 600
script: python workloads/test_result_throughput_cluster.py
- name: result_throughput_single_node
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_1x96.yaml
run:
timeout: 600
script: python workloads/test_result_throughput_single_node.py
- name: xgboost_sweep
team: ml
cluster:
app_config: app_config_data.yaml
compute_template: tpl_16x64.yaml
run:
timeout: 3600
prepare: python wait_cluster.py 16 600
script: python workloads/test_xgboost_sweep.py

@@ -1,53 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,53 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,53 +0,0 @@
import argparse
import time
import ray
ray.init(address="auto")
parser = argparse.ArgumentParser()
parser.add_argument(
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
"--feedback_interval_s",
type=int,
default=10,
help="Wait for this number of seconds",
)
args = parser.parse_args()
curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
now = time.time()
if now >= max_time:
raise RuntimeError(
f"Maximum wait time reached, but only "
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
)
if now >= next_feedback:
passed = now - start
print(
f"Waiting for more nodes to come up: "
f"{curr_nodes}/{args.num_nodes} "
f"({passed:.0f} seconds passed)"
)
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
passed = time.time() - start
print(
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
f"{passed:.0f} seconds"
)

@@ -1,104 +0,0 @@
- name: train_small
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
run:
use_connect: True
autosuspend_mins: 10
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/train_small.py
- name: train_moderate
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml
run:
timeout: 600
prepare: python wait_cluster.py 32 600
script: python workloads/train_moderate.py
- name: train_gpu
team: ml
cluster:
app_config: app_config_gpu.yaml
compute_template: tpl_gpu_small.yaml
run:
timeout: 600
prepare: python wait_cluster.py 5 600
script: python workloads/train_gpu.py
- name: distributed_api_test
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
run:
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/distributed_api_test.py
results: ""
- name: ft_small_elastic
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
run:
timeout: 900
prepare: python wait_cluster.py 4 600
script: python workloads/ft_small_elastic.py
results: ""
- name: ft_small_non_elastic
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
run:
timeout: 900
prepare: python wait_cluster.py 4 600
script: python workloads/ft_small_non_elastic.py
results: ""
- name: tune_small
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
run:
timeout: 600
prepare: python wait_cluster.py 4 600
script: python workloads/tune_small.py
- name: tune_32x4
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml
run:
timeout: 900
prepare: python wait_cluster.py 32 600
script: python workloads/tune_32x4.py
- name: tune_4x32
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml
run:
timeout: 900
prepare: python wait_cluster.py 32 600
script: python workloads/tune_4x32.py