Mirror of https://github.com/vale981/ray, synced 2025-03-05 10:01:43 -05:00
[ci/release] Remove old OSS release test infrastructure (#23134)
Now that we've migrated all OSS release tests to the new infrastructure, we can remove old config files and infra scripts.
This commit is contained in: parent d93fa95dd5, commit 8608b64885
39 changed files with 0 additions and 6712 deletions
@@ -1,145 +0,0 @@
- name: single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: single_node.yaml

  run:
    timeout: 12000
    prepare: sleep 0
    script: python single_node/test_single_node.py

- name: object_store
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: object_store.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=50
    script: python object_store/test_object_store.py

- name: many_actors
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_actors.py

- name: many_actors_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: SMOKE_TEST=1 python distributed/test_many_actors.py

- name: many_tasks
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_tasks.py --num-tasks=10000

- name: many_tasks_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: python distributed/test_many_tasks.py --num-tasks=100

- name: many_pgs
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_pgs.py

- name: many_pgs_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: SMOKE_TEST=1 python distributed/test_many_pgs.py

# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
- name: many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: many_nodes.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=250
    script: python distributed/test_many_tasks.py --num-tasks=1000

- name: scheduling_test_many_0s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1

- name: scheduling_test_many_0s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1

- name: scheduling_test_many_5s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
  stable: false

- name: scheduling_test_many_5s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
  stable: false

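Each entry above names a cluster app config and compute template, an optional prepare command, and the test script; the config is consumed by the release tooling (release/e2e.py, whose diff is suppressed further down). As a rough illustration only, where the helper and the file path are assumptions rather than the actual e2e.py API, a test entry can be looked up like this:

import yaml

def load_test(config_path, test_name):
    # Release test configs are a YAML list of test definitions.
    with open(config_path) as f:
        tests = yaml.safe_load(f)
    return next(t for t in tests if t["name"] == test_name)

test = load_test("benchmark_tests.yaml", "many_actors")  # path/name chosen for illustration
prepare_cmd = test["run"].get("prepare")  # "python distributed/wait_cluster.py --num-nodes=65"
script_cmd = test["run"]["script"]        # "python distributed/test_many_actors.py"
timeout_s = test["run"]["timeout"]        # 3600
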
@@ -1,24 +0,0 @@
import click
import ray
import time


def num_alive_nodes():
    n = 0
    for node in ray.nodes():
        if node["Alive"]:
            n += 1
    return n


@click.command()
@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
def wait_cluster(num_nodes: int):
    ray.init(address="auto")
    while num_alive_nodes() != num_nodes:
        print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}")
        time.sleep(5)


if __name__ == "__main__":
    wait_cluster()

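This helper backs the prepare: entries in the YAML above, e.g. `python distributed/wait_cluster.py --num-nodes=65`: it connects to the running cluster and polls every five seconds until exactly the requested number of nodes reports as alive, so the benchmark script only starts once the cluster is fully formed.
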
@@ -1,680 +0,0 @@
import copy
import logging
import os
import re
import sys

import yaml

# If you update or reorganize the periodic tests, please ensure the
# relevant portions of the Ray release instructions (go/release-ray)
# (in particular, running periodic tests and collecting release logs)
# are up to date. If you need access, please contact @zhe-thoughts.

# Env variables:

# RAY_REPO            Repo to use for finding the wheel
# RAY_BRANCH          Branch to find the wheel
# RAY_VERSION         Version to find the wheel
# RAY_WHEELS          Direct Ray wheel URL
# RAY_TEST_REPO       Repo to use for test scripts
# RAY_TEST_BRANCH     Branch for test scripts
# FILTER_FILE         File filter
# FILTER_TEST         Test name filter
# RELEASE_TEST_SUITE  Release test suite (e.g. manual, nightly)


class ReleaseTest:
    def __init__(
        self,
        name: str,
        smoke_test: bool = False,
        retry: int = 0,
    ):
        self.name = name
        self.smoke_test = smoke_test
        self.retry = retry

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def __contains__(self, item):
        return self.name.__contains__(item)

    def __iter__(self):
        return iter(self.name)

    def __len__(self):
        return len(self.name)


class SmokeTest(ReleaseTest):
    def __init__(self, name: str, retry: int = 0):
        super(SmokeTest, self).__init__(name=name, smoke_test=True, retry=retry)


CORE_NIGHTLY_TESTS = {
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "shuffle_10gb",
    #     "shuffle_50gb",
    #     "shuffle_50gb_large_partition",
    #     "shuffle_100gb",
    #     "non_streaming_shuffle_100gb",
    #     "non_streaming_shuffle_50gb_large_partition",
    #     "non_streaming_shuffle_50gb",
    #     SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
    #     SmokeTest("dask_on_ray_large_scale_test_spilling"),
    #     "stress_test_placement_group",
    #     "shuffle_1tb_1000_partition",
    #     "non_streaming_shuffle_1tb_1000_partition",
    #     "shuffle_1tb_5000_partitions",
    #     TODO(sang): It doesn't even work without spilling
    #     as it hits the scalability limit.
    #     "non_streaming_shuffle_1tb_5000_partitions",
    #     "decision_tree_autoscaling",
    #     "decision_tree_autoscaling_20_runs",
    #     "autoscaling_shuffle_1tb_1000_partitions",
    #     SmokeTest("stress_test_many_tasks"),
    #     SmokeTest("stress_test_dead_actors"),
    #     SmokeTest("threaded_actors_stress_test"),
    #     "pg_long_running_performance_test",
    # ],
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "single_node",
    #     "object_store",
    #     "many_actors_smoke_test",
    #     "many_tasks_smoke_test",
    #     "many_pgs_smoke_test",
    # ],
    # "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [
    #     "inference",
    #     "shuffle_data_loader",
    #     "parquet_metadata_resolution",
    #     "pipelined_training_50_gb",
    #     "pipelined_ingestion_1500_gb",
    #     "datasets_preprocess_ingest",
    #     "datasets_ingest_400G",
    #     SmokeTest("datasets_ingest_train_infer"),
    # ],
    # "~/ray/release/nightly_tests/chaos_test.yaml": [
    #     "chaos_many_actors",
    #     "chaos_many_tasks_no_object_store",
    #     "chaos_pipelined_ingestion_1500_gb_15_windows",
    # ],
    # "~/ray/release/microbenchmark/microbenchmark.yaml": [
    #     "microbenchmark",
    # ],
}

SERVE_NIGHTLY_TESTS = {
    # "~/ray/release/long_running_tests/long_running_tests.yaml": [
    #     SmokeTest("serve"),
    #     SmokeTest("serve_failure"),
    # ],
    # "~/ray/release/serve_tests/serve_tests.yaml": [
    #     "single_deployment_1k_noop_replica",
    #     "multi_deployment_1k_noop_replica",
    #     "autoscaling_single_deployment",
    #     "autoscaling_multi_deployment",
    #     "serve_micro_benchmark",
    #     # TODO(architkulkarni) Reenable after K8s migration. Currently failing
    #     # "serve_micro_benchmark_k8s",
    #     "serve_cluster_fault_tolerance",
    # ],
}

CORE_DAILY_TESTS = {
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "k8s_dask_on_ray_large_scale_test_no_spilling",
    #     "dask_on_ray_large_scale_test_no_spilling",
    #     "dask_on_ray_large_scale_test_spilling",
    #     "pg_autoscaling_regression_test",
    #     "threaded_actors_stress_test",
    #     "k8s_threaded_actors_stress_test",
    #     "stress_test_many_tasks",
    #     "stress_test_dead_actors",
    # ],
    # "~/ray/release/nightly_tests/chaos_test.yaml": [
    #     "chaos_dask_on_ray_large_scale_test_no_spilling",
    #     "chaos_dask_on_ray_large_scale_test_spilling",
    # ],
}

CORE_SCALABILITY_TESTS_DAILY = {
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "many_actors",
    #     "many_tasks",
    #     "many_pgs",
    #     "many_nodes",
    # ],
}

CORE_SCHEDULING_DAILY = {
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "scheduling_test_many_0s_tasks_single_node",
    #     "scheduling_test_many_0s_tasks_many_nodes",
    #     # Reenable these two once we got right setup
    #     # "scheduling_test_many_5s_tasks_single_node",
    #     # "scheduling_test_many_5s_tasks_many_nodes",
    # ],
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "many_nodes_actor_test",
    #     "dask_on_ray_10gb_sort",
    #     "dask_on_ray_100gb_sort",
    #     "dask_on_ray_1tb_sort",
    #     "placement_group_performance_test",
    # ],
}

NIGHTLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     SmokeTest("horovod_test"),
    # ],  # Should we enable this?
    # "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
    #     "dask_xgboost_test",
    #     "modin_xgboost_test",
    #     "torch_tune_serve_test",
    # ],
    # "~/ray/release/long_running_tests/long_running_tests.yaml": [
    #     SmokeTest("actor_deaths"),
    #     SmokeTest("apex"),
    #     SmokeTest("impala"),
    #     SmokeTest("many_actor_tasks"),
    #     SmokeTest("many_drivers"),
    #     SmokeTest("many_ppo"),
    #     SmokeTest("many_tasks"),
    #     SmokeTest("many_tasks_serialized_ids"),
    #     SmokeTest("node_failures"),
    #     SmokeTest("pbt"),
    #     # SmokeTest("serve"),
    #     # SmokeTest("serve_failure"),
    #     # Full long running tests (1 day runtime)
    #     "actor_deaths",
    #     "apex",
    #     "impala",
    #     "many_actor_tasks",
    #     "many_drivers",
    #     "many_ppo",
    #     "many_tasks",
    #     "many_tasks_serialized_ids",
    #     "node_failures",
    #     "pbt",
    #     "serve",
    #     "serve_failure",
    # ],
    # "~/ray/release/sgd_tests/sgd_tests.yaml": [
    #     "sgd_gpu",
    # ],
    # "~/ray/release/tune_tests/cloud_tests/tune_cloud_tests.yaml": [
    #     "aws_no_sync_down",
    #     "aws_ssh_sync",
    #     "aws_durable_upload",
    #     "aws_durable_upload_rllib_str",
    #     "aws_durable_upload_rllib_trainer",
    #     "gcp_k8s_durable_upload",
    # ],
    # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
    #     "bookkeeping_overhead",
    #     "durable_trainable",
    #     SmokeTest("long_running_large_checkpoints"),
    #     SmokeTest("network_overhead"),
    #     "result_throughput_cluster",
    #     "result_throughput_single_node",
    # ],
    # "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
    #     "train_small",
    #     "train_moderate",
    #     "train_gpu",
    #     "tune_small",
    #     "tune_4x32",
    #     "tune_32x4",
    #     "ft_small_elastic",
    #     "ft_small_non_elastic",
    #     "distributed_api_test",
    # ],
    # "~/ray/release/rllib_tests/rllib_tests.yaml": [
    #     SmokeTest("learning_tests"),
    #     SmokeTest("stress_tests"),
    #     "performance_tests",
    #     "multi_gpu_learning_tests",
    #     "multi_gpu_with_lstm_learning_tests",
    #     "multi_gpu_with_attention_learning_tests",
    #     # We'll have these as per-PR tests soon.
    #     # "example_scripts_on_gpu_tests",
    # ],
    # "~/ray/release/runtime_env_tests/runtime_env_tests.yaml": [
    #     "rte_many_tasks_actors",
    #     "wheel_urls",
    #     "rte_ray_client",
    # ],
}

WEEKLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     "horovod_test",
    # ],
    "~/ray/release/long_running_distributed_tests"
    # "/long_running_distributed.yaml": [
    #     "pytorch_pbt_failure",
    # ],
    # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
    #     "network_overhead",
    #     "long_running_large_checkpoints",
    #     "xgboost_sweep",
    # ],
    # "~/ray/release/rllib_tests/rllib_tests.yaml": [
    #     "learning_tests",
    #     "stress_tests",
    # ],
}

# This test suite holds "user" tests to test important user workflows
# in a particular environment.
# All workloads in this test suite should:
#   1. Be run in a distributed (multi-node) fashion
#   2. Use autoscaling/scale up (no wait_cluster.py)
#   3. Use GPUs if applicable
#   4. Have the `use_connect` flag set.
USER_TESTS = {
    # "~/ray/release/ml_user_tests/ml_user_tests.yaml": [
    #     "train_tensorflow_mnist_test",
    #     "train_torch_linear_test",
    #     "ray_lightning_user_test_latest",
    #     "ray_lightning_user_test_master",
    #     "horovod_user_test_latest",
    #     "horovod_user_test_master",
    #     "xgboost_gpu_connect_latest",
    #     "xgboost_gpu_connect_master",
    #     "tune_rllib_connect_test",
    # ]
}

SUITES = {
    "core-nightly": CORE_NIGHTLY_TESTS,
    "serve-nightly": SERVE_NIGHTLY_TESTS,
    "core-daily": CORE_DAILY_TESTS,
    "core-scalability": CORE_SCALABILITY_TESTS_DAILY,
    "nightly": {**NIGHTLY_TESTS, **USER_TESTS},
    "core-scheduling-daily": CORE_SCHEDULING_DAILY,
    "weekly": WEEKLY_TESTS,
}

DEFAULT_STEP_TEMPLATE = {
    "env": {
        "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
        "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
        "RELEASE_AWS_BUCKET": "ray-release-automation-results",
        "RELEASE_AWS_LOCATION": "dev",
        "RELEASE_AWS_DB_NAME": "ray_ci",
        "RELEASE_AWS_DB_TABLE": "release_test_result",
        "AWS_REGION": "us-west-2",
    },
    "agents": {"queue": "runner_queue_branch"},
    "plugins": [
        {
            "docker#v3.9.0": {
                "image": "rayproject/ray",
                "propagate-environment": True,
                "volumes": [
                    "/tmp/ray_release_test_artifacts:" "/tmp/ray_release_test_artifacts"
                ],
            }
        }
    ],
    "artifact_paths": ["/tmp/ray_release_test_artifacts/**/*"],
}


def ask_configuration():
    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
    RAY_VERSION = os.environ.get("RAY_VERSION", "")
    RAY_WHEELS = os.environ.get("RAY_WHEELS", "")

    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    RELEASE_TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    input_ask_step = {
        "input": "Input required: Please specify tests to run",
        "fields": [
            {
                "text": (
                    "RAY_REPO: Please specify the Ray repository used "
                    "to find the wheel."
                ),
                "hint": (
                    "Repository from which to fetch the latest "
                    "commits to find the Ray wheels. Usually you don't "
                    "need to change this."
                ),
                "default": RAY_REPO,
                "key": "ray_repo",
            },
            {
                "text": (
                    "RAY_BRANCH: Please specify the Ray branch used "
                    "to find the wheel."
                ),
                "hint": "For releases, this will be e.g. `releases/1.x.0`",
                "default": RAY_BRANCH,
                "key": "ray_branch",
            },
            {
                "text": (
                    "RAY_VERSION: Please specify the Ray version used "
                    "to find the wheel."
                ),
                "hint": (
                    "Leave empty for latest master. For releases, "
                    "specify the release version."
                ),
                "required": False,
                "default": RAY_VERSION,
                "key": "ray_version",
            },
            {
                "text": "RAY_WHEELS: Please specify the Ray wheel URL.",
                "hint": (
                    "ATTENTION: If you provide this, RAY_REPO, "
                    "RAY_BRANCH and RAY_VERSION will be ignored! "
                    "Please also make sure to provide the wheels URL "
                    "for Python 3.7 on Linux.\n"
                    "You can also insert a commit hash here instead "
                    "of a full URL.\n"
                    "NOTE: You can specify multiple commits or URLs "
                    "for easy bisection (one per line) - this will "
                    "run each test on each of the specified wheels."
                ),
                "required": False,
                "default": RAY_WHEELS,
                "key": "ray_wheels",
            },
            {
                "text": (
                    "RAY_TEST_REPO: Please specify the Ray repository "
                    "used to find the tests you would like to run."
                ),
                "hint": (
                    "If you're developing a new release test, this "
                    "will most likely be your GitHub fork."
                ),
                "default": RAY_TEST_REPO,
                "key": "ray_test_repo",
            },
            {
                "text": (
                    "RAY_TEST_BRANCH: Please specify the Ray branch used "
                    "to find the tests you would like to run."
                ),
                "hint": (
                    "If you're developing a new release test, this "
                    "will most likely be a branch living on your "
                    "GitHub fork."
                ),
                "default": RAY_TEST_BRANCH,
                "key": "ray_test_branch",
            },
            {
                "select": (
                    "RELEASE_TEST_SUITE: Please specify the release "
                    "test suite containing the tests you would like "
                    "to run."
                ),
                "hint": (
                    "Check in the `build_pipeline.py` if you're "
                    "unsure which suite contains your tests."
                ),
                "required": True,
                "options": sorted(SUITES.keys()),
                "default": RELEASE_TEST_SUITE,
                "key": "release_test_suite",
            },
            {
                "text": (
                    "FILTER_FILE: Please specify a filter for the "
                    "test files that should be included in this build."
                ),
                "hint": (
                    "Only test files (e.g. xgboost_tests.yml) that "
                    "match this string will be included in the test"
                ),
                "default": FILTER_FILE,
                "required": False,
                "key": "filter_file",
            },
            {
                "text": (
                    "FILTER_TEST: Please specify a filter for the "
                    "test names that should be included in this build."
                ),
                "hint": (
                    "Only test names (e.g. tune_4x32) that match "
                    "this string will be included in the test"
                ),
                "default": FILTER_TEST,
                "required": False,
                "key": "filter_test",
            },
        ],
        "key": "input_ask_step",
    }

    run_again_step = {
        "commands": [
            f'export {v}=$(buildkite-agent meta-data get "{k}")'
            for k, v in {
                "ray_branch": "RAY_BRANCH",
                "ray_repo": "RAY_REPO",
                "ray_version": "RAY_VERSION",
                "ray_wheels": "RAY_WHEELS",
                "ray_test_branch": "RAY_TEST_BRANCH",
                "ray_test_repo": "RAY_TEST_REPO",
                "release_test_suite": "RELEASE_TEST_SUITE",
                "filter_file": "FILTER_FILE",
                "filter_test": "FILTER_TEST",
            }.items()
        ]
        + [
            "export AUTOMATIC=1",
            "python3 -m pip install --user pyyaml",
            "rm -rf ~/ray || true",
            "git clone -b $${RAY_TEST_BRANCH} $${RAY_TEST_REPO} ~/ray",
            (
                "python3 ~/ray/release/.buildkite/build_pipeline.py "
                "| buildkite-agent pipeline upload"
            ),
        ],
        "label": ":pipeline: Again",
        "agents": {"queue": "runner_queue_branch"},
        "depends_on": "input_ask_step",
        "key": "run_again_step",
    }

    return [
        input_ask_step,
        run_again_step,
    ]


def create_test_step(
    ray_repo: str,
    ray_branch: str,
    ray_version: str,
    ray_wheels: str,
    ray_test_repo: str,
    ray_test_branch: str,
    test_file: str,
    test_name: ReleaseTest,
):
    custom_commit_str = "custom_wheels_url"
    if ray_wheels:
        # Extract commit from url
        p = re.compile(r"([a-f0-9]{40})")
        m = p.search(ray_wheels)
        if m is not None:
            custom_commit_str = m.group(1)

    ray_wheels_str = f" ({ray_wheels}) " if ray_wheels else ""

    logging.info(f"Creating step for {test_file}/{test_name}{ray_wheels_str}")

    cmd = (
        f"./release/run_e2e.sh "
        f'--ray-repo "{ray_repo}" '
        f'--ray-branch "{ray_branch}" '
        f'--ray-version "{ray_version}" '
        f'--ray-wheels "{ray_wheels}" '
        f'--ray-test-repo "{ray_test_repo}" '
        f'--ray-test-branch "{ray_test_branch}" '
    )

    args = (
        f"--category {ray_branch} "
        f"--test-config {test_file} "
        f"--test-name {test_name} "
        f"--keep-results-dir"
    )

    if test_name.smoke_test:
        logging.info("This test will run as a smoke test.")
        args += " --smoke-test"

    step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

    if test_name.retry:
        logging.info(f"This test will be retried up to " f"{test_name.retry} times.")
        step_conf["retry"] = {
            "automatic": [{"exit_status": "*", "limit": test_name.retry}]
        }
    else:
        # Default retry logic
        # Warning: Exit codes are currently not correctly propagated to
        # buildkite! Thus, actual retry logic is currently implemented in
        # the run_e2e.sh script!
        step_conf["retry"] = {
            "automatic": [
                {"exit_status": 7, "limit": 2},  # Prepare timeout
                {"exit_status": 9, "limit": 2},  # Session timeout
                {"exit_status": 10, "limit": 2},  # Prepare error
            ],
        }

    step_conf["command"] = cmd + args

    step_conf["label"] = (
        f"{test_name} "
        f"({custom_commit_str if ray_wheels_str else ray_branch}) - "
        f"{ray_test_branch}/{ray_test_repo}"
    )
    return step_conf


def build_pipeline(steps):
    all_steps = []

    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
    RAY_VERSION = os.environ.get("RAY_VERSION", "")
    RAY_WHEELS = os.environ.get("RAY_WHEELS", "")

    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    ray_wheels_list = [""]
    if RAY_WHEELS:
        ray_wheels_list = RAY_WHEELS.split("\n")

    if len(ray_wheels_list) > 1:
        logging.info(
            f"This will run a bisection on the following URLs/commits: "
            f"{ray_wheels_list}"
        )

    logging.info(
        f"Building pipeline \n"
        f"Ray repo/branch to test:\n"
        f" RAY_REPO = {RAY_REPO}\n"
        f" RAY_BRANCH = {RAY_BRANCH}\n\n"
        f" RAY_VERSION = {RAY_VERSION}\n\n"
        f" RAY_WHEELS = {RAY_WHEELS}\n\n"
        f"Ray repo/branch containing the test configurations and scripts:"
        f" RAY_TEST_REPO = {RAY_TEST_REPO}\n"
        f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"
        f"Filtering for these tests:\n"
        f" FILTER_FILE = {FILTER_FILE}\n"
        f" FILTER_TEST = {FILTER_TEST}\n\n"
    )

    for test_file, test_names in steps.items():
        if FILTER_FILE and FILTER_FILE not in test_file:
            continue

        test_base = os.path.basename(test_file)
        for test_name in test_names:
            if FILTER_TEST and FILTER_TEST not in test_name:
                continue

            if not isinstance(test_name, ReleaseTest):
                test_name = ReleaseTest(name=test_name)

            logging.info(f"Adding test: {test_base}/{test_name}")

            for ray_wheels in ray_wheels_list:
                step_conf = create_test_step(
                    ray_repo=RAY_REPO,
                    ray_branch=RAY_BRANCH,
                    ray_version=RAY_VERSION,
                    ray_wheels=ray_wheels,
                    ray_test_repo=RAY_TEST_REPO,
                    ray_test_branch=RAY_TEST_BRANCH,
                    test_file=test_file,
                    test_name=test_name,
                )

                all_steps.append(step_conf)

    return all_steps


def alert_pipeline(stats: bool = False):
    step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

    cmd = "python release/alert.py"
    if stats:
        cmd += " --stats"

    step_conf["commands"] = [
        "pip install -q -r release/requirements.txt",
        "pip install -U boto3 botocore",
        cmd,
    ]
    step_conf["label"] = f"Send periodic alert (stats_only = {stats})"
    return [step_conf]


if __name__ == "__main__":
    alert = os.environ.get("RELEASE_ALERT", "0")

    ask_for_config = not bool(int(os.environ.get("AUTOMATIC", "0")))

    if alert in ["1", "stats"]:
        steps = alert_pipeline(alert == "stats")
    elif ask_for_config:
        steps = ask_configuration()
    else:
        TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
        PIPELINE_SPEC = SUITES[TEST_SUITE]

        steps = build_pipeline(PIPELINE_SPEC)

    yaml.dump({"steps": steps}, sys.stdout)

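For orientation, this is roughly the Buildkite step that build_pipeline() emits for one non-smoke test when no custom wheels are given. The test name, config path, and branch are illustrative values only, and the remaining keys are copied verbatim from DEFAULT_STEP_TEMPLATE above:

# Illustrative output of create_test_step() (values are examples, not real output):
example_step = {
    "command": (
        './release/run_e2e.sh --ray-repo "https://github.com/ray-project/ray.git" '
        '--ray-branch "master" --ray-version "" --ray-wheels "" '
        '--ray-test-repo "https://github.com/ray-project/ray.git" '
        '--ray-test-branch "master" '
        "--category master --test-config ~/ray/benchmarks/benchmark_tests.yaml "
        "--test-name many_actors --keep-results-dir"
    ),
    "label": "many_actors (master) - master/https://github.com/ray-project/ray.git",
    "retry": {
        "automatic": [
            {"exit_status": 7, "limit": 2},   # Prepare timeout
            {"exit_status": 9, "limit": 2},   # Session timeout
            {"exit_status": 10, "limit": 2},  # Prepare error
        ]
    },
    # ...plus the "env", "agents", "plugins" and "artifact_paths" keys
    # taken from DEFAULT_STEP_TEMPLATE.
}
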
release/alert.py (441 deletions)
@@ -1,441 +0,0 @@
import argparse
from collections import defaultdict, Counter
from typing import Any, List, Tuple, Mapping, Optional
import datetime
import hashlib
import json
import logging
import os
import requests
import sys

import boto3

from e2e import GLOBAL_CONFIG

from alerts.default import handle_result as default_handle_result
from alerts.rllib_tests import handle_result as rllib_tests_handle_result
from alerts.long_running_tests import handle_result as long_running_tests_handle_result
from alerts.tune_tests import handle_result as tune_tests_handle_result
from alerts.xgboost_tests import handle_result as xgboost_tests_handle_result

SUITE_TO_FN = {
    "long_running_tests": long_running_tests_handle_result,
    "rllib_tests": rllib_tests_handle_result,
    "tune_tests": tune_tests_handle_result,
    "xgboost_tests": xgboost_tests_handle_result,
}

GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"] = "alert_state"
GLOBAL_CONFIG["SLACK_WEBHOOK"] = os.environ.get("SLACK_WEBHOOK", "")
GLOBAL_CONFIG["SLACK_CHANNEL"] = os.environ.get("SLACK_CHANNEL", "#oss-test-cop")

RESULTS_LIMIT = 120

logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter(
    fmt="[%(levelname)s %(asctime)s] " "%(filename)s: %(lineno)d " "%(message)s"
)
handler.setFormatter(formatter)
logger.addHandler(handler)


def maybe_fetch_slack_webhook():
    if GLOBAL_CONFIG["SLACK_WEBHOOK"] in [None, ""]:
        print("Missing SLACK_WEBHOOK, retrieving from AWS secrets store")
        GLOBAL_CONFIG["SLACK_WEBHOOK"] = boto3.client(
            "secretsmanager", region_name="us-west-2"
        ).get_secret_value(
            SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:"
            "release-automation/"
            "slack-webhook-Na0CFP"
        )[
            "SecretString"
        ]


def _obj_hash(obj: Any) -> str:
    json_str = json.dumps(obj, sort_keys=True, ensure_ascii=True)
    sha = hashlib.sha256()
    sha.update(json_str.encode())
    return sha.hexdigest()


def fetch_latest_alerts(rds_data_client):
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]

    sql = f"""
        SELECT DISTINCT ON (category, test_suite, test_name)
            category, test_suite, test_name, last_result_hash,
            last_notification_dt
        FROM {schema}
        ORDER BY category, test_suite, test_name, last_notification_dt DESC
        LIMIT {RESULTS_LIMIT}
        """

    result = rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
    )
    for row in result["records"]:
        category, test_suite, test_name, last_result_hash, last_notification_dt = (
            r["stringValue"] if "stringValue" in r else None for r in row
        )
        last_notification_dt = datetime.datetime.strptime(
            last_notification_dt, "%Y-%m-%d %H:%M:%S"
        )
        yield category, test_suite, test_name, last_result_hash, last_notification_dt


def fetch_latest_results(
    rds_data_client, fetch_since: Optional[datetime.datetime] = None
):
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]

    sql = f"""
        SELECT DISTINCT ON (category, test_suite, test_name)
            created_on, category, test_suite, test_name, status, results,
            artifacts, last_logs
        FROM {schema} """

    parameters = []
    if fetch_since is not None:
        sql += "WHERE created_on >= :created_on "
        parameters = [
            {
                "name": "created_on",
                "typeHint": "TIMESTAMP",
                "value": {"stringValue": fetch_since.strftime("%Y-%m-%d %H:%M:%S")},
            },
        ]

    sql += "ORDER BY category, test_suite, test_name, created_on DESC "
    sql += f"LIMIT {RESULTS_LIMIT}"

    result = rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
        parameters=parameters,
    )
    for row in result["records"]:
        (
            created_on,
            category,
            test_suite,
            test_name,
            status,
            results,
            artifacts,
            last_logs,
        ) = (r["stringValue"] if "stringValue" in r else None for r in row)

        # Calculate hash before converting strings to objects
        result_obj = (
            created_on,
            category,
            test_suite,
            test_name,
            status,
            results,
            artifacts,
            last_logs,
        )
        result_json = json.dumps(result_obj)
        result_hash = _obj_hash(result_json)

        # Convert some strings to python objects
        created_on = datetime.datetime.strptime(created_on, "%Y-%m-%d %H:%M:%S")
        results = json.loads(results)
        artifacts = json.loads(artifacts)

        yield result_hash, created_on, category, test_suite, test_name, status, results, artifacts, last_logs  # noqa: E501


def mark_as_handled(
    rds_data_client,
    update: bool,
    category: str,
    test_suite: str,
    test_name: str,
    result_hash: str,
    last_notification_dt: datetime.datetime,
):
    schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]

    if not update:
        sql = f"""
            INSERT INTO {schema}
            (category, test_suite, test_name,
            last_result_hash, last_notification_dt)
            VALUES (:category, :test_suite, :test_name,
            :last_result_hash, :last_notification_dt)
            """
    else:
        sql = f"""
            UPDATE {schema}
            SET last_result_hash=:last_result_hash,
            last_notification_dt=:last_notification_dt
            WHERE category=:category AND test_suite=:test_suite
            AND test_name=:test_name
            """

    rds_data_client.execute_statement(
        database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
        parameters=[
            {"name": "category", "value": {"stringValue": category}},
            {"name": "test_suite", "value": {"stringValue": test_suite or ""}},
            {"name": "test_name", "value": {"stringValue": test_name}},
            {"name": "last_result_hash", "value": {"stringValue": result_hash}},
            {
                "name": "last_notification_dt",
                "typeHint": "TIMESTAMP",
                "value": {
                    "stringValue": last_notification_dt.strftime("%Y-%m-%d %H:%M:%S")
                },
            },
        ],
        secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
        resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
        schema=schema,
        sql=sql,
    )


def post_alerts_to_slack(
    channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
):
    if len(alerts) == 0:
        logger.info("No alerts to post to slack.")
        return

    markdown_lines = [
        f"* {len(alerts)} new release test failures found!*",
        "",
    ]

    category_alerts = defaultdict(list)
    for (category, test_suite, test_name, alert) in alerts:
        category_alerts[category].append(
            f" *{test_suite}/{test_name}* failed: {alert}"
        )

    for category, alert_list in category_alerts.items():
        markdown_lines.append(f"Branch: *{category}*")
        markdown_lines.extend(alert_list)
        markdown_lines.append("")

    total_non_alerts = sum(n for n in non_alerts.values())
    non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]

    markdown_lines += [
        f"Additionally, {total_non_alerts} tests passed successfully "
        f"({', '.join(non_alert_detail)})."
    ]

    slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]

    resp = requests.post(
        slack_url,
        json={
            "text": "\n".join(markdown_lines),
            "channel": channel,
            "username": "Fail Bot",
            "icon_emoji": ":red_circle:",
        },
    )
    print(resp.status_code)
    print(resp.text)


def post_statistics_to_slack(
    channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
):
    total_alerts = len(alerts)

    category_alerts = defaultdict(list)
    for (category, test_suite, test_name, alert) in alerts:
        category_alerts[category].append(f"`{test_suite}/{test_name}`")

    alert_detail = [f"{len(a)} on {c}" for c, a in category_alerts.items()]

    total_non_alerts = sum(n for n in non_alerts.values())
    non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]

    markdown_lines = [
        "*Periodic release test report*",
        "",
        f"In the past 24 hours, "
        f"*{total_non_alerts}* release tests finished successfully, and "
        f"*{total_alerts}* release tests failed.",
    ]

    markdown_lines.append("")

    if total_alerts:
        markdown_lines.append(f"*Failing:* {', '.join(alert_detail)}")
        for c, a in category_alerts.items():
            markdown_lines.append(f" *{c}*: {', '.join(sorted(a))}")
    else:
        markdown_lines.append("*Failing:* None")

    markdown_lines.append("")

    if total_non_alerts:
        markdown_lines.append(f"*Passing:* {', '.join(non_alert_detail)}")
    else:
        markdown_lines.append("*Passing:* None")

    slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]

    resp = requests.post(
        slack_url,
        json={
            "text": "\n".join(markdown_lines),
            "channel": channel,
            "username": "Fail Bot",
            "icon_emoji": ":red_circle:",
        },
    )
    print(resp.status_code)
    print(resp.text)


def handle_results_and_get_alerts(
    rds_data_client,
    fetch_since: Optional[datetime.datetime] = None,
    always_try_alert: bool = False,
    no_status_update: bool = False,
):
    # First build a map of last notifications
    last_notifications_map = {}
    for (
        category,
        test_suite,
        test_name,
        last_result_hash,
        last_notification_dt,
    ) in fetch_latest_alerts(rds_data_client):
        last_notifications_map[(category, test_suite, test_name)] = (
            last_result_hash,
            last_notification_dt,
        )

    alerts = []
    non_alerts = Counter()

    # Then fetch latest results
    for (
        result_hash,
        created_on,
        category,
        test_suite,
        test_name,
        status,
        results,
        artifacts,
        last_logs,
    ) in fetch_latest_results(rds_data_client, fetch_since=fetch_since):
        key = (category, test_suite, test_name)

        try_alert = always_try_alert
        if key in last_notifications_map:
            # If we have an alert for this key, fetch info
            last_result_hash, last_notification_dt = last_notifications_map[key]

            if last_result_hash != result_hash:
                # If we got a new result, handle new result
                try_alert = True
            # Todo: maybe alert again after some time?
        else:
            try_alert = True

        if try_alert:
            handle_fn = SUITE_TO_FN.get(test_suite, None)
            if not handle_fn:
                logger.warning(f"No handle for suite {test_suite}")
                alert = default_handle_result(
                    created_on,
                    category,
                    test_suite,
                    test_name,
                    status,
                    results,
                    artifacts,
                    last_logs,
                )
            else:
                alert = handle_fn(
                    created_on,
                    category,
                    test_suite,
                    test_name,
                    status,
                    results,
                    artifacts,
                    last_logs,
                )

            if alert:
                logger.warning(
                    f"Alert raised for test {test_suite}/{test_name} "
                    f"({category}): {alert}"
                )

                alerts.append((category, test_suite, test_name, alert))
            else:
                logger.debug(
                    f"No alert raised for test {test_suite}/{test_name} "
                    f"({category})"
                )
                non_alerts[category] += 1

            if not no_status_update:
                mark_as_handled(
                    rds_data_client,
                    key in last_notifications_map,
                    category,
                    test_suite,
                    test_name,
                    result_hash,
                    datetime.datetime.now(),
                )

    return alerts, non_alerts


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--stats",
        action="store_true",
        default=False,
        help="Finish quickly for training.",
    )
    args = parser.parse_args()

    maybe_fetch_slack_webhook()

    rds_data_client = boto3.client("rds-data", region_name="us-west-2")

    if args.stats:
        # Only update last 24 hour stats
        fetch_since = datetime.datetime.now() - datetime.timedelta(days=1)
        alerts, non_alerts = handle_results_and_get_alerts(
            rds_data_client,
            fetch_since=fetch_since,
            always_try_alert=True,
            no_status_update=True,
        )
        post_statistics_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)

    else:
        alerts, non_alerts = handle_results_and_get_alerts(rds_data_client)
        post_alerts_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)

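The per-suite handlers imported at the top (alerts.default, alerts.rllib_tests, and so on) are not part of this diff. From how they are invoked above, each exposes a handle_result function with the signature sketched below, returning an alert message or None; the body here is a made-up placeholder, not the real handler logic:

from typing import Optional

def handle_result(
    created_on,   # datetime.datetime of the run
    category,     # branch/category string, e.g. "master"
    test_suite,   # e.g. "xgboost_tests"
    test_name,    # e.g. "train_small"
    status,       # status string stored with the result
    results,      # dict decoded from the results JSON
    artifacts,    # dict decoded from the artifacts JSON
    last_logs,    # tail of the run's logs
) -> Optional[str]:
    # Placeholder logic: alert on anything that is not a success status.
    if status != "finished":  # the exact status values are an assumption
        return f"Test run ended with status {status!r}."
    return None
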
@@ -1,145 +0,0 @@
- name: single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: single_node.yaml

  run:
    timeout: 12000
    prepare: sleep 0
    script: python single_node/test_single_node.py

- name: object_store
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: object_store.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=50
    script: python object_store/test_object_store.py

- name: many_actors
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_actors.py

- name: many_actors_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: SMOKE_TEST=1 python distributed/test_many_actors.py

- name: many_tasks
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_tasks.py --num-tasks=10000

- name: many_tasks_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: python distributed/test_many_tasks.py --num-tasks=100

- name: many_pgs
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_pgs.py

- name: many_pgs_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: SMOKE_TEST=1 python distributed/test_many_pgs.py

# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
- name: many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: many_nodes.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=250
    script: python distributed/test_many_tasks.py --num-tasks=1000

- name: scheduling_test_many_0s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1

- name: scheduling_test_many_0s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1

- name: scheduling_test_many_5s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
  stable: false

- name: scheduling_test_many_5s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
  stable: false

@@ -1,24 +0,0 @@
import click
import ray
import time


def num_alive_nodes():
    n = 0
    for node in ray.nodes():
        if node["Alive"]:
            n += 1
    return n


@click.command()
@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
def wait_cluster(num_nodes: int):
    ray.init(address="auto")
    while num_alive_nodes() != num_nodes:
        print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}")
        time.sleep(5)


if __name__ == "__main__":
    wait_cluster()

@@ -1,54 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s

while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)

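Unlike the click-based wait_cluster.py earlier in this diff, which takes a --num-nodes option and waits indefinitely, this variant takes positional num_nodes and max_time_s arguments (invoked from the prepare: entries as, for example, `python wait_cluster.py 3 600`) and raises a RuntimeError if the cluster has not reached the requested size before the deadline.
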
@@ -1,214 +0,0 @@
<!doctype html>
<html>
<head>
    <meta charset="utf-8">
    <title>Releaser config generator</title>
    <style type="text/css">
        html {
            background: #cccccc;
        }
        body {
            background: #ffffff;
            font-family: sans-serif;
            padding: 1em 2em;
            max-width: 800px;
            margin: 0 auto;
        }
        textarea {
            width: 600px;
            height: 200px;
        }
        form .use {
            white-space: nowrap;
            padding-right: 1em;
        }
        form .val {
            min-width: 300px;
        }
        form .val input {
            width: 90%;
        }
        form .desc {
        }
    </style>
    <script type="text/javascript">
        var env_vars = [
            {
                "name": "RAY_TEST_REPO",
                "short": "Git repo with test files",
                "long": "Repository in which the test files are which you would like to run. Note that this doesn't have to be the same repo from which the wheels are installed.",
                "default": "https://github.com/ray-project/ray.git",
                "enabled": false,
            },
            {
                "name": "RAY_TEST_BRANCH",
                "short": "Git branch for test repo",
                "long": "Git branch that is checked out from RAY_TEST_REPO and which contains the test files you would like to run. Note that this doesn't have to be the same branch you're fetching the Ray wheels from.",
                "default": "master",
                "enabled": false,
            },
            {
                "name": "RAY_REPO",
                "short": "Git repo for the Ray wheels",
                "long": "Repository from which to fetch the latest commits to find the Ray wheels",
                "default": "https://github.com/ray-project/ray.git",
                "enabled": false,
            },
            {
                "name": "RAY_BRANCH",
                "short": "Git branch for the Ray wheels",
                "long": "Branch that is checked out from RAY_REPO from which the latest commits are fetched to find the Ray wheels",
                "default": "master",
                "enabled": true,
            },
            {
                "name": "RELEASE_TEST_SUITE",
                "short": "Release test suite (nightly/weekly/manual)",
                "long": "Release test suite as defined in releaser's build_pipeline.py",
                "default": "nightly",
                "enabled": true,
            },
            {
                "name": "FILTER_FILE",
                "short": "Filter test file by this string",
                "long": "Only test files (e.g. xgboost_tests.yml) that match this string will be included in the test",
                "default": "",
                "enabled": false,
            },
            {
                "name": "FILTER_TEST",
                "short": "Filter test name by this string",
                "long": "Only test names (e.g. tune_4x32) that match this string will be included in the test",
                "default": "",
                "enabled": false,
            },
        ]

        window.addEventListener('load', function () {

            var table = document.getElementById("gen_table");

            for (var env_var of env_vars) {

                var use_td = document.createElement("td");
                use_td.setAttribute("class", "use");

                var use_input = document.createElement("input");
                use_input.setAttribute("type", "checkbox");
                use_input.setAttribute("data-activate", env_var["name"] + "_val");
                use_input.setAttribute("id", env_var["name"] + "_use");
                use_input.setAttribute("class", "input_use");
                if (env_var["enabled"]) {
                    use_input.checked = true;
                }

                var use_label = document.createElement("label");
                use_label.setAttribute("for", env_var["name"] + "_use");
                use_label.innerHTML = env_var["name"];

                use_td.append(use_input);
                use_td.append(use_label);

                val_td = document.createElement("td");
                val_td.setAttribute("class", "val");

                val_input = document.createElement("input");
                val_input.setAttribute("type", "text");
                if (!env_var["enabled"]) {
                    val_input.setAttribute("disabled", "disabled");
                }
                val_input.setAttribute("id", env_var["name"] + "_val");
                val_input.setAttribute("name", env_var["name"]);
                val_input.setAttribute("value", env_var["default"]);
                val_input.setAttribute("class", "input_val");

                val_td.append(val_input);

                use_input.addEventListener("click", function(e) {
                    var toggle_val = document.getElementById(e.target.getAttribute("data-activate"))

                    if (toggle_val.disabled) {
                        toggle_val.removeAttribute("disabled");
                    } else {
                        toggle_val.setAttribute("disabled", "disabled");
                    }
                    generate_snippet();
                });

                val_input.addEventListener("change", function() { generate_snippet(); });
                val_input.addEventListener("keydown", function() { generate_snippet(); });
                val_input.addEventListener("keyup", function() { generate_snippet(); });

                var desc_td = document.createElement("td");
                desc_td.setAttribute("class", "desc");

                var desc_a = document.createElement("a");
                desc_a.setAttribute("title", env_var["long"]);
                desc_a.innerHTML = env_var["short"];

                desc_td.append(desc_a);

                var tr = document.createElement("tr");
                tr.append(use_td);
                tr.append(val_td);
                tr.append(desc_td);

                table.append(tr);
            }

            var button = document.getElementById("generate");
            button.addEventListener("click", function() {
                generate_snippet();
            })

            generate_snippet()
        })

        function generate_snippet() {
            full_snippet = ""
            for (env_var of env_vars) {
                var val_input = document.getElementById(env_var["name"] + "_val")

                if (!val_input.disabled) {
                    full_snippet += env_var["name"] + "=\"" + val_input.value + "\"\n"
                }
            }

            document.getElementById("snippet").innerHTML = full_snippet;
        }

    </script>
</head>
<body>
<header class="header">
    <h1>Releaser config generator</h1>
    <p>Use this form to generate a list of environment variables.</p>
    <p>These variables can be passed to Buildkite to run a subset of release tests
        and choose the correct wheels/release test branch</p>
</header>
<section class="main">
    <form id="gen">
        <table id="gen_table">
            <tr>
                <th>Set</th>
                <th>Value</th>
                <th>Description</th>
            </tr>

        </table>

    </form>

    <div>
        <button id="generate">Generate snippet</button>
    </div>

    <div>
        <textarea id="snippet">

        </textarea>
    </div>
</section>
</body>
</html>

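With the defaults above (only RAY_BRANCH and RELEASE_TEST_SUITE are enabled initially), generate_snippet() produces the following text, which is then set as Buildkite environment variables for build_pipeline.py:

RAY_BRANCH="master"
RELEASE_TEST_SUITE="nightly"
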
release/e2e.py (2585 deletions)
File diff suppressed because it is too large.
@@ -1,15 +0,0 @@
- name: horovod_test
  team: ml
  cluster:
    app_config: app_config_master.yaml
    compute_template: compute_tpl.yaml

  run:
    timeout: 36000
    prepare: python wait_cluster.py 3 600
    script: python workloads/horovod_tune_test.py
    long_running: True

  smoke_test:
    run:
      timeout: 1800

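The smoke_test block presumably overrides the matching fields of the base definition when a test is run with --smoke-test, here shrinking the 36000-second timeout to 1800 seconds. The merge itself happened in release/e2e.py, whose diff is suppressed above, so the following is only a sketch of the assumed behavior:

def apply_smoke_test(test):
    # Assumed semantics: each section under "smoke_test" shallow-overrides
    # the corresponding section of the base test definition.
    merged = dict(test)
    for section, overrides in test.get("smoke_test", {}).items():
        base = dict(merged.get(section, {}))
        base.update(overrides)
        merged[section] = base
    return merged
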
@@ -1,53 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)

|
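This helper blocks until the requested number of nodes (head included) has registered with Ray, or raises once max_time_s is exceeded. The prepare entries in the test configs in this diff invoke it as shown below; the optional flag is the script's own argument, and the node count and timeout are just values used elsewhere in this diff:

# Typical prepare step: wait for 3 nodes (including the head),
# giving up after 600 seconds.
python wait_cluster.py 3 600

# Print progress every 30 seconds instead of the default 10.
python wait_cluster.py 3 600 --feedback_interval_s 30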
@ -1,92 +0,0 @@
|
|||
- name: train_small
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/train_small.py
|
||||
|
||||
- name: train_moderate
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 32 600
|
||||
script: python workloads/train_moderate.py
|
||||
|
||||
- name: train_gpu
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config_gpu.yaml
|
||||
compute_template: tpl_gpu_small.yaml
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 5 600
|
||||
script: python workloads/train_gpu.py
|
||||
|
||||
- name: distributed_api_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
results:
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/distributed_api_test.py
|
||||
results: ""
|
||||
|
||||
- name: ft_small_non_elastic
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
||||
run:
|
||||
timeout: 900
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/ft_small_non_elastic.py
|
||||
results: ""
|
||||
|
||||
- name: tune_small
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_small.yaml
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/tune_small.py
|
||||
|
||||
- name: tune_32x4
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
||||
run:
|
||||
timeout: 900
|
||||
prepare: python wait_cluster.py 32 600
|
||||
script: python workloads/tune_32x4.py
|
||||
|
||||
- name: tune_4x32
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_moderate.yaml
|
||||
|
||||
run:
|
||||
timeout: 900
|
||||
prepare: python wait_cluster.py 32 600
|
||||
script: python workloads/tune_4x32.py
|
|
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@@ -1,13 +0,0 @@
- name: pytorch_pbt_failure
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: compute_tpl.yaml

  run:
    timeout: 86400
    script: python workloads/pytorch_pbt_failure.py
    long_running: True

  smoke_test:
    timeout: 3600
@ -1,196 +0,0 @@
|
|||
- name: actor_deaths
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/actor_deaths.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: apex
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: tpl_cpu_3.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: python wait_cluster.py 3 600
|
||||
script: python workloads/apex.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
|
||||
- name: impala
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config_np.yaml
|
||||
compute_template: tpl_cpu_1_large.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
script: python workloads/impala.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: many_actor_tasks
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/many_actor_tasks.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
|
||||
- name: many_drivers
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/many_drivers.py --iteration-num=4000
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
|
||||
- name: many_ppo
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: many_ppo.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: python wait_cluster.py 1 600
|
||||
script: python workloads/many_ppo.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: many_tasks
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/many_tasks.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: many_tasks_serialized_ids
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/many_tasks_serialized_ids.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
|
||||
- name: node_failures
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/node_failures.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: pbt
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/pbt.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: serve
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/serve.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 3600
|
||||
|
||||
- name: serve_failure
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: tpl_cpu_1.yaml
|
||||
|
||||
run:
|
||||
timeout: 86400
|
||||
prepare: ray stop
|
||||
script: python workloads/serve_failure.py
|
||||
long_running: True
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 600
|
||||
|
||||
stable: False
|
|
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@@ -1,9 +0,0 @@
# - name: microbenchmark
#   team: core
#   cluster:
#     app_config: app_config.yaml
#     compute_template: tpl_64.yaml

#   run:
#     timeout: 1800
#     script: OMP_NUM_THREADS=64 RAY_ADDRESS= python run_microbenchmark.py
@ -1,124 +0,0 @@
|
|||
- name: horovod_user_test_latest
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: horovod/app_config.yaml
|
||||
compute_template: horovod/compute_tpl.yaml
|
||||
|
||||
|
||||
driver_setup: horovod/driver_setup_latest.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python horovod/horovod_user_test.py
|
||||
|
||||
- name: horovod_user_test_master
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../horovod_tests/app_config_master.yaml
|
||||
compute_template: horovod/compute_tpl.yaml
|
||||
|
||||
driver_setup: horovod/driver_setup_master.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python horovod/horovod_user_test.py
|
||||
|
||||
|
||||
- name: train_tensorflow_mnist_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: train/app_config.yaml
|
||||
compute_template: train/compute_tpl.yaml
|
||||
|
||||
driver_setup: train/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 36000
|
||||
script: python train/train_tensorflow_mnist_test.py
|
||||
|
||||
- name: train_torch_linear_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: train/app_config.yaml
|
||||
compute_template: train/compute_tpl.yaml
|
||||
|
||||
driver_setup: train/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 36000
|
||||
script: python train/train_torch_linear_test.py
|
||||
|
||||
|
||||
- name: xgboost_gpu_connect_latest
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: xgboost/app_config_gpu.yaml
|
||||
compute_template: xgboost/tpl_gpu_small_scaling.yaml
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 1200
|
||||
script: python xgboost/train_gpu_connect.py
|
||||
|
||||
- name: xgboost_gpu_connect_master
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: xgboost/app_config_gpu_master.yaml
|
||||
compute_template: xgboost/tpl_gpu_small_scaling.yaml
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 1200
|
||||
script: python xgboost/train_gpu_connect.py
|
||||
|
||||
- name: ray_lightning_user_test_latest
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ray-lightning/app_config.yaml
|
||||
compute_template: ray-lightning/compute_tpl.yaml
|
||||
|
||||
driver_setup: ray-lightning/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python ray-lightning/ray_lightning_user_test.py
|
||||
|
||||
|
||||
- name: ray_lightning_user_test_master
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ray-lightning/app_config_master.yaml
|
||||
compute_template: ray-lightning/compute_tpl.yaml
|
||||
|
||||
|
||||
driver_setup: ray-lightning/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python ray-lightning/ray_lightning_user_test.py
|
||||
|
||||
|
||||
- name: tune_rllib_connect_test
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: ../rllib_tests/app_config.yaml
|
||||
compute_template: tune_rllib/compute_tpl.yaml
|
||||
|
||||
|
||||
driver_setup: tune_rllib/driver_setup.sh
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 1200
|
||||
script: python tune_rllib/run_connect_tests.py
|
|
@ -1,64 +0,0 @@
|
|||
#
|
||||
# Chaos tests.
|
||||
#
|
||||
|
||||
# Run the test that invokes many tasks without object store usage.
|
||||
- name: chaos_many_tasks_no_object_store
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/app_config.yaml
|
||||
compute_template: chaos_test/compute_template.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
|
||||
script: python chaos_test/test_chaos_basic.py --workload=tasks
|
||||
|
||||
- name: chaos_many_actors
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/app_config.yaml
|
||||
compute_template: chaos_test/compute_template.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
|
||||
script: python chaos_test/test_chaos_basic.py --workload=actors
|
||||
|
||||
- name: chaos_dask_on_ray_large_scale_test_no_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
# Total run time without failures is about 300-400s.
|
||||
prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
|
||||
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
# Test large scale dask on ray test with spilling.
|
||||
- name: chaos_dask_on_ray_large_scale_test_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
# Total run time without failures is about 300-400s.
|
||||
prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
|
||||
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
- name: chaos_pipelined_ingestion_1500_gb_15_windows
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dataset/pipelined_ingestion_app.yaml
|
||||
compute_template: dataset/pipelined_ingestion_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 21 2400; python setup_chaos.py --node-kill-interval 300
|
||||
script: python dataset/pipelined_training.py --epochs 1 --num-windows 15 --num-files 915 --debug
|
||||
|
||||
stable: false
|
|
@ -1,95 +0,0 @@
|
|||
- name: inference
|
||||
team: core
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: inference.yaml
|
||||
|
||||
run:
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 2 600
|
||||
script: python inference.py
|
||||
|
||||
- name: shuffle_data_loader
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle_app_config.yaml
|
||||
compute_template: shuffle_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 1800
|
||||
script: python dataset_shuffle_data_loader.py
|
||||
|
||||
- name: parquet_metadata_resolution
|
||||
team: core
|
||||
cluster:
|
||||
app_config: pipelined_training_app.yaml
|
||||
compute_template: pipelined_training_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 1200
|
||||
prepare: python wait_cluster.py 15 1200
|
||||
script: python parquet_metadata_resolution.py --num-files 915
|
||||
|
||||
- name: pipelined_training_50_gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: pipelined_training_app.yaml
|
||||
compute_template: pipelined_training_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 4800
|
||||
prepare: python wait_cluster.py 15 1200
|
||||
script: python pipelined_training.py --epochs 1
|
||||
|
||||
- name: pipelined_ingestion_1500_gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: pipelined_ingestion_app.yaml
|
||||
compute_template: pipelined_ingestion_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 9600
|
||||
prepare: python wait_cluster.py 21 2400
|
||||
script: python pipelined_training.py --epochs 2 --num-windows 2 --num-files 915 --debug
|
||||
|
||||
- name: datasets_ingest_train_infer
|
||||
team: core
|
||||
cluster:
|
||||
app_config: ray_sgd_training_app.yaml
|
||||
compute_template: ray_sgd_training_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 14400
|
||||
prepare: python wait_cluster.py 66 2400
|
||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset
|
||||
|
||||
smoke_test:
|
||||
cluster:
|
||||
app_config: ray_sgd_training_app.yaml
|
||||
compute_template: ray_sgd_training_smoke_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 8 2400
|
||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
|
||||
|
||||
- name: datasets_preprocess_ingest
|
||||
team: core
|
||||
cluster:
|
||||
app_config: ray_sgd_training_app.yaml
|
||||
compute_template: ray_sgd_training_compute_no_gpu.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 21 2400
|
||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset --debug
|
||||
|
||||
- name: datasets_ingest_400G
|
||||
team: core
|
||||
cluster:
|
||||
app_config: ray_sgd_training_app.yaml
|
||||
compute_template: dataset_ingest_400G_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1
|
|
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@ -1,390 +0,0 @@
|
|||
#
|
||||
# Single node shuffle
|
||||
#
|
||||
# Test basic single node 10GB shuffle with a small number of partitions.
|
||||
# This doesn't require object spilling.
|
||||
# - name: shuffle_10gb
|
||||
# team: core
|
||||
# cluster:
|
||||
# app_config: shuffle/shuffle_app_config.yaml
|
||||
# compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
||||
# run:
|
||||
# timeout: 3000
|
||||
# script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6
|
||||
|
||||
# Test single node 50GB shuffle with a large number of partitions.
|
||||
- name: shuffle_50gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9
|
||||
|
||||
# Test single node 50GB shuffle with a large number of partitions.
|
||||
- name: shuffle_50gb_large_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6
|
||||
|
||||
# Test non streaming shuffle in a single node with a small number of partition.
|
||||
- name: non_streaming_shuffle_50gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming
|
||||
|
||||
# Test non streaming shuffle in a single node with a large number of partition.
|
||||
- name: non_streaming_shuffle_50gb_large_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_single.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming
|
||||
|
||||
- name: dask_on_ray_10gb_sort
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
|
||||
|
||||
- name: dask_on_ray_100gb_sort
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
|
||||
|
||||
#
|
||||
# Multi node shuffle
|
||||
#
|
||||
|
||||
# Test multi nodes 100GB shuffle with a small number of partitions.
|
||||
- name: shuffle_100gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
|
||||
|
||||
# Test non streaming multi nodes 100GB shuffle with a small number of partitions.
|
||||
- name: non_streaming_shuffle_100gb
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
|
||||
|
||||
# Test autoscaling 1TB streaming shuffle with a large number of partitions.
|
||||
- name: autoscaling_shuffle_1tb_1000_partitions
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_autoscaling.yaml
|
||||
|
||||
run:
|
||||
timeout: 4000
|
||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
|
||||
|
||||
# Test multi nodes 1TB streaming shuffle with a large number of partitions.
|
||||
- name: shuffle_1tb_1000_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 20 900
|
||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
|
||||
|
||||
# Test multi nodes 1TB non streaming shuffle with a large number of partitions.
|
||||
- name: non_streaming_shuffle_1tb_1000_partition
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
prepare: python wait_cluster.py 20 900
|
||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
|
||||
|
||||
# Stress test for 1TB multi node streaming shuffle.
|
||||
- name: shuffle_1tb_5000_partitions
|
||||
team: core
|
||||
cluster:
|
||||
app_config: shuffle/shuffle_app_config.yaml
|
||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
||||
run:
|
||||
timeout: 9000
|
||||
prepare: python wait_cluster.py 20 900
|
||||
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
|
||||
|
||||
# Stress test for 1TB multi node non-streaming shuffle.
|
||||
# - name: non_streaming_shuffle_1tb_5000_partitions
|
||||
# team: core
|
||||
# stable: False
|
||||
# cluster:
|
||||
# app_config: shuffle/shuffle_app_config.yaml
|
||||
# compute_template: shuffle/shuffle_compute_large_scale.yaml
|
||||
|
||||
# run:
|
||||
# timeout: 7200
|
||||
# prepare: python wait_cluster.py 20 900
|
||||
# script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming
|
||||
|
||||
- name: k8s_dask_on_ray_large_scale_test_no_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute_k8s.yaml
|
||||
compute_on_k8s: True
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 21 600
|
||||
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
||||
stable: false
|
||||
|
||||
# # Test large scale dask on ray test without spilling.
|
||||
# - name: dask_on_ray_large_scale_test_no_spilling
|
||||
# team: core
|
||||
# cluster:
|
||||
# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
# compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
||||
# run:
|
||||
# timeout: 7200
|
||||
# prepare: python wait_cluster.py 21 600
|
||||
# script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
# smoke_test:
|
||||
# cluster:
|
||||
# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
# compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
|
||||
|
||||
# run:
|
||||
# timeout: 7200
|
||||
# prepare: python wait_cluster.py 5 600
|
||||
# script: python dask_on_ray/large_scale_test.py --num_workers 4 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
# Test large scale dask on ray test with spilling.
|
||||
- name: dask_on_ray_large_scale_test_spilling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 21 600
|
||||
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
smoke_test:
|
||||
cluster:
|
||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 5 600
|
||||
script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
||||
|
||||
# Stress tests with many tasks
|
||||
- name: stress_test_many_tasks
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/stress_tests_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python stress_tests/test_many_tasks.py
|
||||
|
||||
smoke_test:
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/smoke_test_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
|
||||
|
||||
# Stress tests with dead actors
|
||||
- name: stress_test_dead_actors
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/stress_tests_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python stress_tests/test_dead_actors.py
|
||||
|
||||
smoke_test:
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/smoke_test_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3 --num-children=3
|
||||
|
||||
# Stress tests with placement groups
|
||||
- name: stress_test_placement_group
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/placement_group_tests_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python stress_tests/test_placement_group.py
|
||||
|
||||
# Stress tests with many threaded actors.
|
||||
- name: threaded_actors_stress_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/stress_test_threaded_actor_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 201 600
|
||||
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
|
||||
|
||||
smoke_test:
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/smoke_test_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 5 600
|
||||
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
|
||||
stable: false
|
||||
|
||||
- name: k8s_threaded_actors_stress_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: stress_tests/stress_tests_app_config.yaml
|
||||
compute_template: stress_tests/k8s_stress_test_threaded_actor_compute.yaml
|
||||
compute_on_k8s: True
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 201 600
|
||||
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 5 600
|
||||
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
|
||||
stable: false
|
||||
|
||||
# Test decision tree on autoscaling compute cluster.
|
||||
- name: decision_tree_autoscaling
|
||||
team: core
|
||||
cluster:
|
||||
app_config: decision_tree/decision_tree_app_config.yaml
|
||||
compute_template: decision_tree/autoscaling_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3000
|
||||
script: python decision_tree/cart_with_tree.py
|
||||
|
||||
# Test 20 concurrent decision tree runs on autoscaling compute cluster.
|
||||
- name: decision_tree_autoscaling_20_runs
|
||||
team: core
|
||||
cluster:
|
||||
app_config: decision_tree/decision_tree_app_config.yaml
|
||||
compute_template: decision_tree/autoscaling_compute.yaml
|
||||
run:
|
||||
timeout: 9600
|
||||
script: python decision_tree/cart_with_tree.py --concurrency=20
|
||||
|
||||
- name: dask_on_ray_1tb_sort
|
||||
team: core
|
||||
cluster:
|
||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
||||
compute_template: dask_on_ray/1tb_sort_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 32 1000
|
||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
|
||||
|
||||
- name: many_nodes_actor_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: many_nodes_tests/app_config.yaml
|
||||
compute_template: many_nodes_tests/compute_config.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
prepare: python wait_cluster.py 251 5400
|
||||
script: python many_nodes_tests/actor_test.py
|
||||
|
||||
- name: pg_autoscaling_regression_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: placement_group_tests/app_config.yaml
|
||||
compute_template: placement_group_tests/compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 1200
|
||||
script: python placement_group_tests/pg_run.py
|
||||
|
||||
- name: pg_long_running_performance_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: placement_group_tests/app_config.yaml
|
||||
compute_template: placement_group_tests/long_running_test_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 3600
|
||||
prepare: python wait_cluster.py 2 600
|
||||
script: python placement_group_tests/long_running_performance_test.py --num-stages 2000
|
||||
|
||||
- name: placement_group_performance_test
|
||||
team: core
|
||||
cluster:
|
||||
app_config: placement_group_tests/app_config.yaml
|
||||
compute_template: placement_group_tests/pg_perf_test_compute.yaml
|
||||
|
||||
run:
|
||||
timeout: 1200
|
||||
prepare: python wait_cluster.py 5 600
|
||||
script: python placement_group_tests/placement_group_performance_test.py
|
|
@ -1,54 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@ -1,103 +0,0 @@
|
|||
# Heavy learning tests (Atari and HalfCheetah) for major algos.
|
||||
- name: learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_64cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 14400
|
||||
script: python learning_tests/run.py
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 1200
|
||||
|
||||
# 2-GPU learning tests (CartPole and RepeatAfterMeEnv) for major algos.
|
||||
- name: multi_gpu_learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_96cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python multi_gpu_learning_tests/run.py
|
||||
|
||||
# 2-GPU learning tests (StatelessCartPole) + use_lstm=True for major algos
|
||||
# (that support RNN models).
|
||||
- name: multi_gpu_with_lstm_learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_96cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python multi_gpu_with_lstm_learning_tests/run.py
|
||||
|
||||
# 2-GPU learning tests (StatelessCartPole) + use_attention=True for major
|
||||
# algos (that support RNN models).
|
||||
- name: multi_gpu_with_attention_learning_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 8gpus_96cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
script: python multi_gpu_with_attention_learning_tests/run.py
|
||||
|
||||
# We'll have these as per-PR tests soon.
|
||||
# - name: example_scripts_on_gpu_tests
|
||||
# team: ml
|
||||
# cluster:
|
||||
# app_config: app_config.yaml
|
||||
# compute_template: 1gpu_4cpus.yaml
|
||||
|
||||
# run:
|
||||
# timeout: 7200
|
||||
# script: bash unit_gpu_tests/run.sh
|
||||
|
||||
# IMPALA large machine stress tests (4x Atari).
|
||||
- name: stress_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 4gpus_544_cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 5400
|
||||
prepare: python wait_cluster.py 6 600
|
||||
script: python stress_tests/run_stress_tests.py
|
||||
|
||||
smoke_test:
|
||||
run:
|
||||
timeout: 2000
|
||||
|
||||
# Tests that exercise auto-scaling and Anyscale connect.
|
||||
- name: connect_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: auto_scale.yaml
|
||||
|
||||
run:
|
||||
use_connect: True
|
||||
timeout: 3000
|
||||
script: python connect_tests/run_connect_tests.py
|
||||
|
||||
# Nightly performance regression for popular algorithms.
|
||||
# These algorithms run nightly for pre-determined amount of time without
|
||||
# passing criteria.
|
||||
# Performance metrics, such as reward achieved and throughput, are then
|
||||
# collected and tracked over time.
|
||||
- name: performance_tests
|
||||
team: ml
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: 12gpus_192cpus.yaml
|
||||
|
||||
run:
|
||||
timeout: 10800
|
||||
script: python performance_tests/run.py
|
|
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@ -1,176 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
cd "${0%/*}" || exit 1
|
||||
|
||||
reason() {
|
||||
# Keep in sync with e2e.py ExitCode enum
|
||||
case $1 in
|
||||
0)
|
||||
REASON="success"
|
||||
;;
|
||||
2)
|
||||
REASON="unspecified"
|
||||
;;
|
||||
3)
|
||||
REASON="unknown"
|
||||
;;
|
||||
4)
|
||||
REASON="runtime error"
|
||||
;;
|
||||
5)
|
||||
REASON="command error"
|
||||
;;
|
||||
6)
|
||||
REASON="command timeout"
|
||||
;;
|
||||
7)
|
||||
REASON="prepare timeout"
|
||||
;;
|
||||
8)
|
||||
REASON="filesync timeout"
|
||||
;;
|
||||
9)
|
||||
REASON="session timeout"
|
||||
;;
|
||||
10)
|
||||
REASON="prepare error"
|
||||
;;
|
||||
11)
|
||||
REASON="app config build error"
|
||||
;;
|
||||
12)
|
||||
REASON="infra error"
|
||||
;;
|
||||
*)
|
||||
REASON="untracked error"
|
||||
;;
|
||||
esac
|
||||
echo "${REASON}"
|
||||
}
|
||||
|
||||
while [[ $# -gt 0 ]]
|
||||
do
|
||||
key="$1"
|
||||
case $key in
|
||||
--ray-repo)
|
||||
shift
|
||||
RAY_REPO=$1
|
||||
;;
|
||||
--ray-branch)
|
||||
shift
|
||||
RAY_BRANCH=$1
|
||||
;;
|
||||
--ray-version)
|
||||
shift
|
||||
RAY_VERSION=$1
|
||||
;;
|
||||
--ray-wheels)
|
||||
shift
|
||||
RAY_WHEELS=$1
|
||||
;;
|
||||
--ray-test-repo)
|
||||
shift
|
||||
RAY_TEST_REPO=$1
|
||||
;;
|
||||
--ray-test-branch)
|
||||
shift
|
||||
RAY_TEST_BRANCH=$1
|
||||
;;
|
||||
--release-results-dir)
|
||||
shift
|
||||
RELEASE_RESULTS_DIR=$1
|
||||
;;
|
||||
*)
|
||||
break
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
|
||||
RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
|
||||
RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}
|
||||
|
||||
export RAY_REPO RAY_BRANCH RAY_VERSION RAY_WHEELS RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR
|
||||
|
||||
pip uninstall -q -y ray
|
||||
pip install -q -r requirements.txt
|
||||
pip install -q -U boto3 botocore
|
||||
git clone -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" ~/ray
|
||||
|
||||
RETRY_NUM=0
|
||||
MAX_RETRIES=${MAX_RETRIES-3}
|
||||
|
||||
if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
|
||||
echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
|
||||
MAX_RETRIES=1
|
||||
fi
|
||||
|
||||
ALL_EXIT_CODES=()
|
||||
while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
|
||||
RETRY_NUM=$((RETRY_NUM + 1))
|
||||
|
||||
if [ "$RETRY_NUM" -gt 1 ]; then
|
||||
# Sleep for random time between 30 and 90 minutes
|
||||
SLEEP_TIME=$((1800 + RANDOM % 5400))
|
||||
echo "----------------------------------------"
|
||||
echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
|
||||
echo "----------------------------------------"
|
||||
sleep ${SLEEP_TIME}
|
||||
fi
|
||||
|
||||
sudo rm -rf "${RELEASE_RESULTS_DIR}"/* || true
|
||||
|
||||
python e2e.py "$@"
|
||||
EXIT_CODE=$?
|
||||
REASON=$(reason "${EXIT_CODE}")
|
||||
ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE
|
||||
|
||||
case ${EXIT_CODE} in
|
||||
0)
|
||||
echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
|
||||
break
|
||||
;;
|
||||
7 | 9 | 10)
|
||||
echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
|
||||
;;
|
||||
*)
|
||||
echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
|
||||
break
|
||||
;;
|
||||
esac
|
||||
|
||||
done
|
||||
|
||||
sudo rm -rf /tmp/ray_release_test_artifacts/* || true
|
||||
sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true
|
||||
|
||||
echo "----------------------------------------"
|
||||
echo "e2e test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
|
||||
echo "Run results:"
|
||||
|
||||
COUNTER=1
|
||||
for EX in "${ALL_EXIT_CODES[@]}"; do
|
||||
REASON=$(reason "${EX}")
|
||||
echo " Run $COUNTER: Exit code = ${EX} (${REASON})"
|
||||
COUNTER=$((COUNTER + 1))
|
||||
done
|
||||
|
||||
echo "----------------------------------------"
|
||||
|
||||
REASON=$(reason "${EXIT_CODE}")
|
||||
echo "Final e2e exit code is ${EXIT_CODE} (${REASON})"
|
||||
|
||||
case ${EXIT_CODE} in
|
||||
0)
|
||||
;;
|
||||
7 | 9 | 10)
|
||||
echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."
|
||||
;;
|
||||
*)
|
||||
echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
|
||||
;;
|
||||
esac
|
||||
|
||||
exit $EXIT_CODE
|
|
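The wrapper above retries e2e.py on the infra-related exit codes (7, 9, 10), sleeps a random 30-90 minutes between attempts, and copies the results into /tmp/ray_release_test_artifacts. A hedged sketch of a manual invocation, assuming a hypothetical file name run_e2e.sh (the file name is not shown in this diff); the flags are the ones parsed in the while-loop above, the values are placeholders, and any remaining arguments are forwarded verbatim to e2e.py:

# Hypothetical manual run; flag names match the parser above, values are placeholders.
bash run_e2e.sh \
  --ray-repo https://github.com/ray-project/ray.git \
  --ray-branch master \
  --ray-test-branch master \
  --release-results-dir /tmp/artifacts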
@@ -1,34 +0,0 @@
- name: rte_many_tasks_actors
  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: rte_small.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/rte_many_tasks_actors.py

- name: wheel_urls
  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: rte_minimal.yaml

  run:
    timeout: 9000 # 2h30m
    prepare: python wait_cluster.py 1 600
    script: python workloads/wheel_urls.py

- name: rte_ray_client
  team: serve
  cluster:
    app_config: app_config.yaml
    compute_template: rte_minimal.yaml

  run:
    use_connect: True
    autosuspend_mins: 10
    timeout: 600
    prepare: python wait_cluster.py 1 600
    script: python workloads/rte_ray_client.py
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@ -1,101 +0,0 @@
|
|||
- name: single_deployment_1k_noop_replica
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl_32_cpu.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/single_deployment_1k_noop_replica.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: multi_deployment_1k_noop_replica
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl_32_cpu.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/multi_deployment_1k_noop_replica.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: autoscaling_single_deployment
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl_8_cpu_autoscaling.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/autoscaling_single_deployment.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: autoscaling_multi_deployment
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
compute_template: compute_tpl_8_cpu_autoscaling.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/autoscaling_multi_deployment.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: serve_micro_benchmark
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
# 16 CPUS
|
||||
compute_template: compute_tpl_single_node.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/serve_micro_benchmark.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: serve_micro_benchmark_k8s
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
# 16 CPUS
|
||||
compute_template: compute_tpl_single_node_k8s.yaml
|
||||
compute_on_k8s: True
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/serve_micro_benchmark.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
||||
|
||||
- name: serve_cluster_fault_tolerance
|
||||
team: serve
|
||||
cluster:
|
||||
app_config: app_config.yaml
|
||||
# 16 CPUS
|
||||
compute_template: compute_tpl_single_node.yaml
|
||||
|
||||
run:
|
||||
timeout: 7200
|
||||
long_running: False
|
||||
script: python workloads/serve_cluster_fault_tolerance.py
|
||||
|
||||
smoke_test:
|
||||
timeout: 600
|
|
@@ -1,11 +0,0 @@
# Test multi-node, multi-GPU Ray SGD example.
- name: sgd_gpu
  team: ml
  cluster:
    app_config: sgd_gpu/sgd_gpu_app_config.yaml
    compute_template: sgd_gpu/sgd_gpu_compute.yaml

  run:
    timeout: 3000
    prepare: python wait_cluster.py 2 600
    script: python sgd_gpu/sgd_gpu_test.py --num-workers=2 --use-gpu --address=auto
@ -1,53 +0,0 @@
|
|||
import argparse
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
ray.init(address="auto")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
||||
)
|
||||
|
||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
||||
|
||||
parser.add_argument(
|
||||
"--feedback_interval_s",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Wait for this number of seconds",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
curr_nodes = 0
|
||||
start = time.time()
|
||||
next_feedback = start
|
||||
max_time = start + args.max_time_s
|
||||
while not curr_nodes >= args.num_nodes:
|
||||
now = time.time()
|
||||
|
||||
if now >= max_time:
|
||||
raise RuntimeError(
|
||||
f"Maximum wait time reached, but only "
|
||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
||||
)
|
||||
|
||||
if now >= next_feedback:
|
||||
passed = now - start
|
||||
print(
|
||||
f"Waiting for more nodes to come up: "
|
||||
f"{curr_nodes}/{args.num_nodes} "
|
||||
f"({passed:.0f} seconds passed)"
|
||||
)
|
||||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
||||
f"{passed:.0f} seconds"
|
||||
)
|
|
@@ -1,27 +0,0 @@
# Specify the test owners (teams) here.
# The root key should be the name of the test yaml file without the .yaml.
# To specify owners of subtests, use a sub dict (see e.g. long_running_tests).
golden_notebook_tests: ml
horovod_tests: ml
lightgbm_tests: ml
long_running_distributed_tests: ml
long_running_tests:
  actor_deaths: core
  apex: ml
  impala: ml
  many_actor_tasks: core
  many_drivers: core
  many_ppo: core
  many_tasks: core
  many_tasks_serialized_ids: core
  node_failures: core
  pbt: ml
  serve: serve
  serve_failure: serve
microbenchmark: core
nightly_tests: core
rllib_tests: ml
runtime_env_tests: serve
serve_tests: serve
sgd_tests: ml
xgboost_tests: ml
@ -1,118 +0,0 @@
|
|||
- name: aws_no_sync_down
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py no_sync_down

- name: aws_ssh_sync
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py ssh_sync

- name: aws_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload

- name: aws_durable_upload_rllib_str
  team: ml
  cluster:
    app_config: app_config_ml.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str

- name: aws_durable_upload_rllib_trainer
  team: ml
  cluster:
    app_config: app_config_ml.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer

- name: aws_no_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_aws_4x2.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload

- name: gcp_k8s_no_sync_down
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8

- name: gcp_k8s_ssh_sync
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8

- name: gcp_k8s_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload

- name: gcp_k8s_no_durable_upload
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_gcp_k8s_4x8.yaml
    cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud

  run:
    use_connect: True
    timeout: 600
    # Remove --cpus-per-trial 8 once n2-standard-2 is supported
    script: python workloads/run_cloud_test.py no_durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
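Each entry above follows the same shape: a cluster section (app config plus compute template) and a run section with a timeout, an optional prepare command, and a script. A minimal sketch of how such an entry could be executed is shown below; run_release_test is a hypothetical helper written for illustration, not the actual release runner.

import shlex
import subprocess

def run_release_test(test: dict) -> None:
    """Execute one release-test entry: optional prepare step, then the script."""
    run_cfg = test["run"]
    prepare = run_cfg.get("prepare")
    if prepare:
        # e.g. "python wait_cluster.py 4 600" blocks until the cluster is ready.
        subprocess.run(shlex.split(prepare), check=True)
    subprocess.run(
        shlex.split(run_cfg["script"]), check=True, timeout=run_cfg["timeout"]
    )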
@@ -1,54 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s

while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)
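This is the wait_cluster.py helper referenced by the prepare commands above; for example, "prepare: python wait_cluster.py 4 600" blocks until four nodes (including the head) are up or fails after 600 seconds. A minimal sketch of the equivalent programmatic invocation, assuming the script sits in the working directory:

import subprocess

# Equivalent to the YAML prepare step: wait for 4 nodes, give up after 600 s.
subprocess.run(["python", "wait_cluster.py", "4", "600"], check=True)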
@@ -1,90 +0,0 @@
- name: bookkeeping_overhead
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_1x16.yaml

  run:
    timeout: 1200
    script: python workloads/test_bookkeeping_overhead.py


- name: durable_trainable
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_16x2.yaml

  run:
    timeout: 900
    prepare: python wait_cluster.py 16 600
    script: python workloads/test_durable_trainable.py --bucket data-test-ilr

- name: long_running_large_checkpoints
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_1x32_hd.yaml

  run:
    timeout: 86400
    script: python workloads/test_long_running_large_checkpoints.py
    long_running: True

  smoke_test:
    run:
      timeout: 3600


- name: network_overhead
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_100x2.yaml

  run:
    timeout: 900
    prepare_timeout: 1200
    prepare: python wait_cluster.py 100 1200
    script: python workloads/test_network_overhead.py

  smoke_test:
    cluster:
      compute_template: tpl_20x2.yaml

    run:
      timeout: 400
      prepare_timeout: 600
      prepare: python wait_cluster.py 20 600

- name: result_throughput_cluster
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_16x64.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 16 600
    script: python workloads/test_result_throughput_cluster.py

- name: result_throughput_single_node
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_1x96.yaml

  run:
    timeout: 600
    script: python workloads/test_result_throughput_single_node.py

- name: xgboost_sweep
  team: ml
  cluster:
    app_config: app_config_data.yaml
    compute_template: tpl_16x64.yaml

  run:
    timeout: 3600
    prepare: python wait_cluster.py 16 600
    script: python workloads/test_xgboost_sweep.py
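Entries such as network_overhead carry a smoke_test section whose keys override the base cluster and run settings for smoke-test runs. The merge below is only a sketch under that assumption (a per-section shallow override); the removed infrastructure may have combined these sections differently.

import copy

def apply_smoke_test(test: dict) -> dict:
    """Overlay the smoke_test section (if any) onto the base test definition."""
    merged = copy.deepcopy(test)
    for section, overrides in merged.pop("smoke_test", {}).items():
        merged.setdefault(section, {}).update(overrides)
    return merged

# For network_overhead this would swap in tpl_20x2.yaml and the shorter timeouts.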
@@ -1,53 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)
@@ -1,53 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)
@@ -1,53 +0,0 @@
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)

parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")

parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)

args = parser.parse_args()

curr_nodes = 0
start = time.time()
next_feedback = start
max_time = start + args.max_time_s
while not curr_nodes >= args.num_nodes:
    now = time.time()

    if now >= max_time:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_feedback:
        passed = now - start
        print(
            f"Waiting for more nodes to come up: "
            f"{curr_nodes}/{args.num_nodes} "
            f"({passed:.0f} seconds passed)"
        )
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
    curr_nodes = len(ray.nodes())

passed = time.time() - start
print(
    f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
    f"{passed:.0f} seconds"
)
@@ -1,104 +0,0 @@
- name: train_small
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml

  run:
    use_connect: True
    autosuspend_mins: 10
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/train_small.py

- name: train_moderate
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 32 600
    script: python workloads/train_moderate.py

- name: train_gpu
  team: ml
  cluster:
    app_config: app_config_gpu.yaml
    compute_template: tpl_gpu_small.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 5 600
    script: python workloads/train_gpu.py

- name: distributed_api_test
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml
  results:

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/distributed_api_test.py
  results: ""

- name: ft_small_elastic
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml

  run:
    timeout: 900
    prepare: python wait_cluster.py 4 600
    script: python workloads/ft_small_elastic.py
  results: ""

- name: ft_small_non_elastic
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml

  run:
    timeout: 900
    prepare: python wait_cluster.py 4 600
    script: python workloads/ft_small_non_elastic.py
  results: ""

- name: tune_small
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_small.yaml

  run:
    timeout: 600
    prepare: python wait_cluster.py 4 600
    script: python workloads/tune_small.py

- name: tune_32x4
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    prepare: python wait_cluster.py 32 600
    script: python workloads/tune_32x4.py

- name: tune_4x32
  team: ml
  cluster:
    app_config: app_config.yaml
    compute_template: tpl_cpu_moderate.yaml

  run:
    timeout: 900
    prepare: python wait_cluster.py 32 600
    script: python workloads/tune_4x32.py
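As a closing illustration, a quick sanity check over a suite file like the one above could look as follows. validate_suite is a hypothetical helper and the required-key list is an assumption inferred from the entries shown, not a documented schema.

import yaml  # PyYAML

REQUIRED_KEYS = ("name", "team", "cluster", "run")

def validate_suite(path: str) -> None:
    """Check that every entry in a suite file has the fields seen above."""
    with open(path) as f:
        tests = yaml.safe_load(f)
    for test in tests:
        missing = [key for key in REQUIRED_KEYS if key not in test]
        if missing:
            raise ValueError(f"{test.get('name', '<unnamed>')} is missing {missing}")
        if "script" not in test["run"]:
            raise ValueError(f"{test['name']} has no run.script")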