mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00
[ci/release] Remove old OSS release test infrastructure (#23134)
Now that we've migrated all OSS release tests to the new infrastructure, we can remove old config files and infra scripts.
This commit is contained in:
parent
d93fa95dd5
commit
8608b64885
39 changed files with 0 additions and 6712 deletions
|
@ -1,145 +0,0 @@
|
||||||
# Release test definitions for the Ray benchmark suite
# (old OSS release test infrastructure).
#
# Each entry defines:
#   cluster: the Anyscale app config and compute template to launch, and
#   run:     timeout (seconds), a `prepare` command that blocks until the
#            cluster is ready, and the test `script` itself.

- name: single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: single_node.yaml

  run:
    timeout: 12000
    # Single node: nothing to wait for before the test starts.
    prepare: sleep 0
    script: python single_node/test_single_node.py

- name: object_store
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: object_store.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=50
    script: python object_store/test_object_store.py

- name: many_actors
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_actors.py

- name: many_actors_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    # SMOKE_TEST=1 switches the shared script into its reduced-size mode.
    script: SMOKE_TEST=1 python distributed/test_many_actors.py

- name: many_tasks
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_tasks.py --num-tasks=10000

- name: many_tasks_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: python distributed/test_many_tasks.py --num-tasks=100

- name: many_pgs
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=65
    script: python distributed/test_many_pgs.py

- name: many_pgs_smoke_test
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: distributed_smoke_test.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=2
    script: SMOKE_TEST=1 python distributed/test_many_pgs.py

# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
- name: many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: many_nodes.yaml

  run:
    timeout: 3600 # 1hr
    prepare: python distributed/wait_cluster.py --num-nodes=250
    script: python distributed/test_many_tasks.py --num-tasks=1000

- name: scheduling_test_many_0s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1

- name: scheduling_test_many_0s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1

- name: scheduling_test_many_5s_tasks_single_node
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
  # Marked unstable: failures do not page the release-test owner.
  stable: false

- name: scheduling_test_many_5s_tasks_many_nodes
  team: core
  cluster:
    app_config: app_config.yaml
    compute_template: scheduling.yaml

  run:
    timeout: 3600
    prepare: python distributed/wait_cluster.py --num-nodes=32
    script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
  stable: false
|
|
@ -1,24 +0,0 @@
|
||||||
import click
|
|
||||||
import ray
|
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
def num_alive_nodes():
    """Return the number of cluster nodes currently marked alive.

    Assumes ``ray.init()`` has already been called by the caller.
    """
    # ray.nodes() also lists dead nodes; count only the live ones.
    return sum(1 for node in ray.nodes() if node["Alive"])
|
|
||||||
|
|
||||||
|
|
||||||
@click.command()
@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
def wait_cluster(num_nodes: int):
    """Block until exactly ``num_nodes`` nodes of the cluster are alive.

    Polls the cluster state every 5 seconds and prints progress; never
    times out on its own (the calling release-test harness enforces the
    overall timeout).
    """
    ray.init(address="auto")
    while True:
        # Query the cluster state once per iteration (the original
        # implementation queried it twice: once for the check and once
        # for the progress message).
        alive = num_alive_nodes()
        if alive == num_nodes:
            break
        print(f"Waiting for nodes: {alive}/{num_nodes}")
        time.sleep(5)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # click parses --num-nodes from the command line and invokes the command.
    wait_cluster()
|
|
|
@ -1,680 +0,0 @@
|
||||||
import copy
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
# If you update or reorganize the periodic tests, please ensure the
|
|
||||||
# relevant portions of the Ray release instructions (go/release-ray)
|
|
||||||
# (in particular, running periodic tests and collecting release logs)
|
|
||||||
# are up to date. If you need access, please contact @zhe-thoughts.
|
|
||||||
|
|
||||||
# Env variables:
|
|
||||||
|
|
||||||
# RAY_REPO Repo to use for finding the wheel
|
|
||||||
# RAY_BRANCH Branch to find the wheel
|
|
||||||
# RAY_VERSION Version to find the wheel
|
|
||||||
# RAY_WHEELS Direct Ray wheel URL
|
|
||||||
# RAY_TEST_REPO Repo to use for test scripts
|
|
||||||
# RAY_TEST_BRANCH Branch for test scripts
|
|
||||||
# FILTER_FILE File filter
|
|
||||||
# FILTER_TEST Test name filter
|
|
||||||
# RELEASE_TEST_SUITE Release test suite (e.g. manual, nightly)
|
|
||||||
|
|
||||||
|
|
||||||
class ReleaseTest:
    """A named release test plus its smoke-test and retry settings.

    The object delegates ``in``, ``iter()`` and ``len()`` to its ``name``
    so it can be used interchangeably with a plain test-name string
    (e.g. for substring filtering).
    """

    def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):
        self.name = name
        self.smoke_test = smoke_test
        self.retry = retry

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def __contains__(self, item):
        # Substring membership, exactly as on the underlying string.
        return item in self.name

    def __iter__(self):
        return iter(self.name)

    def __len__(self):
        return len(self.name)
|
|
||||||
|
|
||||||
|
|
||||||
class SmokeTest(ReleaseTest):
    """A :class:`ReleaseTest` that always runs in smoke-test mode."""

    def __init__(self, name: str, retry: int = 0):
        # Force smoke_test=True; everything else is inherited.
        super().__init__(name=name, smoke_test=True, retry=retry)
|
|
||||||
|
|
||||||
|
|
||||||
# Test-suite definitions: mapping of test config file path -> list of test
# names (strings or SmokeTest wrappers). All entries below are currently
# disabled (commented out) — presumably because the tests were migrated to
# the new release-test infrastructure; the dicts are kept as empty shells.

CORE_NIGHTLY_TESTS = {
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "shuffle_10gb",
    #     "shuffle_50gb",
    #     "shuffle_50gb_large_partition",
    #     "shuffle_100gb",
    #     "non_streaming_shuffle_100gb",
    #     "non_streaming_shuffle_50gb_large_partition",
    #     "non_streaming_shuffle_50gb",
    #     SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
    #     SmokeTest("dask_on_ray_large_scale_test_spilling"),
    #     "stress_test_placement_group",
    #     "shuffle_1tb_1000_partition",
    #     "non_streaming_shuffle_1tb_1000_partition",
    #     "shuffle_1tb_5000_partitions",
    #     # TODO(sang): It doesn't even work without spilling
    #     # as it hits the scalability limit.
    #     # "non_streaming_shuffle_1tb_5000_partitions",
    #     "decision_tree_autoscaling",
    #     "decision_tree_autoscaling_20_runs",
    #     "autoscaling_shuffle_1tb_1000_partitions",
    #     SmokeTest("stress_test_many_tasks"),
    #     SmokeTest("stress_test_dead_actors"),
    #     SmokeTest("threaded_actors_stress_test"),
    #     "pg_long_running_performance_test",
    # ],
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "single_node",
    #     "object_store",
    #     "many_actors_smoke_test",
    #     "many_tasks_smoke_test",
    #     "many_pgs_smoke_test",
    # ],
    # "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [
    #     "inference",
    #     "shuffle_data_loader",
    #     "parquet_metadata_resolution",
    #     "pipelined_training_50_gb",
    #     "pipelined_ingestion_1500_gb",
    #     "datasets_preprocess_ingest",
    #     "datasets_ingest_400G",
    #     SmokeTest("datasets_ingest_train_infer"),
    # ],
    # "~/ray/release/nightly_tests/chaos_test.yaml": [
    #     "chaos_many_actors",
    #     "chaos_many_tasks_no_object_store",
    #     "chaos_pipelined_ingestion_1500_gb_15_windows",
    # ],
    # "~/ray/release/microbenchmark/microbenchmark.yaml": [
    #     "microbenchmark",
    # ],
}

SERVE_NIGHTLY_TESTS = {
    # "~/ray/release/long_running_tests/long_running_tests.yaml": [
    #     SmokeTest("serve"),
    #     SmokeTest("serve_failure"),
    # ],
    # "~/ray/release/serve_tests/serve_tests.yaml": [
    #     "single_deployment_1k_noop_replica",
    #     "multi_deployment_1k_noop_replica",
    #     "autoscaling_single_deployment",
    #     "autoscaling_multi_deployment",
    #     "serve_micro_benchmark",
    #     # TODO(architkulkarni) Reenable after K8s migration. Currently failing
    #     # "serve_micro_benchmark_k8s",
    #     "serve_cluster_fault_tolerance",
    # ],
}

CORE_DAILY_TESTS = {
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "k8s_dask_on_ray_large_scale_test_no_spilling",
    #     "dask_on_ray_large_scale_test_no_spilling",
    #     "dask_on_ray_large_scale_test_spilling",
    #     "pg_autoscaling_regression_test",
    #     "threaded_actors_stress_test",
    #     "k8s_threaded_actors_stress_test",
    #     "stress_test_many_tasks",
    #     "stress_test_dead_actors",
    # ],
    # "~/ray/release/nightly_tests/chaos_test.yaml": [
    #     "chaos_dask_on_ray_large_scale_test_no_spilling",
    #     "chaos_dask_on_ray_large_scale_test_spilling",
    # ],
}

CORE_SCALABILITY_TESTS_DAILY = {
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "many_actors",
    #     "many_tasks",
    #     "many_pgs",
    #     "many_nodes",
    # ],
}

CORE_SCHEDULING_DAILY = {
    # "~/ray/benchmarks/benchmark_tests.yaml": [
    #     "scheduling_test_many_0s_tasks_single_node",
    #     "scheduling_test_many_0s_tasks_many_nodes",
    #     # Reenable these two once we got right setup
    #     # "scheduling_test_many_5s_tasks_single_node",
    #     # "scheduling_test_many_5s_tasks_many_nodes",
    # ],
    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
    #     "many_nodes_actor_test",
    #     "dask_on_ray_10gb_sort",
    #     "dask_on_ray_100gb_sort",
    #     "dask_on_ray_1tb_sort",
    #     "placement_group_performance_test",
    # ],
}

NIGHTLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     SmokeTest("horovod_test"),
    # ],  # Should we enable this?
    # "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
    #     "dask_xgboost_test",
    #     "modin_xgboost_test",
    #     "torch_tune_serve_test",
    # ],
    # "~/ray/release/long_running_tests/long_running_tests.yaml": [
    #     SmokeTest("actor_deaths"),
    #     SmokeTest("apex"),
    #     SmokeTest("impala"),
    #     SmokeTest("many_actor_tasks"),
    #     SmokeTest("many_drivers"),
    #     SmokeTest("many_ppo"),
    #     SmokeTest("many_tasks"),
    #     SmokeTest("many_tasks_serialized_ids"),
    #     SmokeTest("node_failures"),
    #     SmokeTest("pbt"),
    #     # SmokeTest("serve"),
    #     # SmokeTest("serve_failure"),
    #     # Full long running tests (1 day runtime)
    #     "actor_deaths",
    #     "apex",
    #     "impala",
    #     "many_actor_tasks",
    #     "many_drivers",
    #     "many_ppo",
    #     "many_tasks",
    #     "many_tasks_serialized_ids",
    #     "node_failures",
    #     "pbt",
    #     "serve",
    #     "serve_failure",
    # ],
    # "~/ray/release/sgd_tests/sgd_tests.yaml": [
    #     "sgd_gpu",
    # ],
    # "~/ray/release/tune_tests/cloud_tests/tune_cloud_tests.yaml": [
    #     "aws_no_sync_down",
    #     "aws_ssh_sync",
    #     "aws_durable_upload",
    #     "aws_durable_upload_rllib_str",
    #     "aws_durable_upload_rllib_trainer",
    #     "gcp_k8s_durable_upload",
    # ],
    # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
    #     "bookkeeping_overhead",
    #     "durable_trainable",
    #     SmokeTest("long_running_large_checkpoints"),
    #     SmokeTest("network_overhead"),
    #     "result_throughput_cluster",
    #     "result_throughput_single_node",
    # ],
    # "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
    #     "train_small",
    #     "train_moderate",
    #     "train_gpu",
    #     "tune_small",
    #     "tune_4x32",
    #     "tune_32x4",
    #     "ft_small_elastic",
    #     "ft_small_non_elastic",
    #     "distributed_api_test",
    # ],
    # "~/ray/release/rllib_tests/rllib_tests.yaml": [
    #     SmokeTest("learning_tests"),
    #     SmokeTest("stress_tests"),
    #     "performance_tests",
    #     "multi_gpu_learning_tests",
    #     "multi_gpu_with_lstm_learning_tests",
    #     "multi_gpu_with_attention_learning_tests",
    #     # We'll have these as per-PR tests soon.
    #     # "example_scripts_on_gpu_tests",
    # ],
    # "~/ray/release/runtime_env_tests/runtime_env_tests.yaml": [
    #     "rte_many_tasks_actors",
    #     "wheel_urls",
    #     "rte_ray_client",
    # ],
}
|
|
||||||
|
|
||||||
# Tests run on a weekly cadence. All entries are currently disabled, like
# the other suite dicts above.
#
# BUG FIX: the previous version left the bare string
# "~/ray/release/long_running_distributed_tests" uncommented while the
# rest of its (implicitly concatenated) key and its value were commented
# out. That turned WEEKLY_TESTS into a one-element *set* instead of a
# dict, so selecting the "weekly" suite would crash build_pipeline() on
# `steps.items()`. The stray string is now commented out as well, making
# WEEKLY_TESTS an empty dict, consistent with the other disabled suites.
WEEKLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     "horovod_test",
    # ],
    # "~/ray/release/long_running_distributed_tests"
    # "/long_running_distributed.yaml": [
    #     "pytorch_pbt_failure",
    # ],
    # "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
    #     "network_overhead",
    #     "long_running_large_checkpoints",
    #     "xgboost_sweep",
    # ],
    # "~/ray/release/rllib_tests/rllib_tests.yaml": [
    #     "learning_tests",
    #     "stress_tests",
    # ],
}
|
|
||||||
|
|
||||||
# This test suite holds "user" tests to test important user workflows
# in a particular environment.
# All workloads in this test suite should:
# 1. Be run in a distributed (multi-node) fashion
# 2. Use autoscaling/scale up (no wait_cluster.py)
# 3. Use GPUs if applicable
# 4. Have the `use_connect` flag set.
#
# All entries are currently disabled (commented out).
USER_TESTS = {
    # "~/ray/release/ml_user_tests/ml_user_tests.yaml": [
    #     "train_tensorflow_mnist_test",
    #     "train_torch_linear_test",
    #     "ray_lightning_user_test_latest",
    #     "ray_lightning_user_test_master",
    #     "horovod_user_test_latest",
    #     "horovod_user_test_master",
    #     "xgboost_gpu_connect_latest",
    #     "xgboost_gpu_connect_master",
    #     "tune_rllib_connect_test",
    # ]
}
|
|
||||||
|
|
||||||
# Maps the RELEASE_TEST_SUITE env variable value to the suite dict
# (test config file -> test names) that build_pipeline() consumes.
SUITES = {
    "core-nightly": CORE_NIGHTLY_TESTS,
    "serve-nightly": SERVE_NIGHTLY_TESTS,
    "core-daily": CORE_DAILY_TESTS,
    "core-scalability": CORE_SCALABILITY_TESTS_DAILY,
    # "nightly" merges the general nightly tests with the user-workflow tests.
    "nightly": {**NIGHTLY_TESTS, **USER_TESTS},
    "core-scheduling-daily": CORE_SCHEDULING_DAILY,
    "weekly": WEEKLY_TESTS,
}
|
|
||||||
|
|
||||||
# Base Buildkite step configuration shared by all generated steps.
# create_test_step()/alert_pipeline() deep-copy this and then set the
# step-specific "command"/"commands", "label" and "retry" keys.
DEFAULT_STEP_TEMPLATE = {
    "env": {
        "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
        "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
        "RELEASE_AWS_BUCKET": "ray-release-automation-results",
        "RELEASE_AWS_LOCATION": "dev",
        "RELEASE_AWS_DB_NAME": "ray_ci",
        "RELEASE_AWS_DB_TABLE": "release_test_result",
        "AWS_REGION": "us-west-2",
    },
    "agents": {"queue": "runner_queue_branch"},
    "plugins": [
        {
            "docker#v3.9.0": {
                "image": "rayproject/ray",
                "propagate-environment": True,
                # Implicit string concatenation: a single
                # "host_path:container_path" volume mapping.
                "volumes": [
                    "/tmp/ray_release_test_artifacts:" "/tmp/ray_release_test_artifacts"
                ],
            }
        }
    ],
    # Everything written under this path is uploaded as a build artifact.
    "artifact_paths": ["/tmp/ray_release_test_artifacts/**/*"],
}
|
|
||||||
|
|
||||||
|
|
||||||
def ask_configuration():
    """Build the interactive Buildkite steps that ask the user for a config.

    Returns a two-step pipeline:
      1. ``input_ask_step`` — a Buildkite "input" step that collects
         repo/branch/version/wheel/suite/filter settings as build metadata.
      2. ``run_again_step`` — a step that exports the collected metadata as
         environment variables, sets AUTOMATIC=1, and re-runs this script so
         the real test pipeline gets uploaded.
    """
    # Defaults for the form fields come from the current environment.
    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
    RAY_VERSION = os.environ.get("RAY_VERSION", "")
    RAY_WHEELS = os.environ.get("RAY_WHEELS", "")

    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    RELEASE_TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    input_ask_step = {
        "input": "Input required: Please specify tests to run",
        "fields": [
            {
                "text": (
                    "RAY_REPO: Please specify the Ray repository used "
                    "to find the wheel."
                ),
                "hint": (
                    "Repository from which to fetch the latest "
                    "commits to find the Ray wheels. Usually you don't "
                    "need to change this."
                ),
                "default": RAY_REPO,
                "key": "ray_repo",
            },
            {
                "text": (
                    "RAY_BRANCH: Please specify the Ray branch used "
                    "to find the wheel."
                ),
                "hint": "For releases, this will be e.g. `releases/1.x.0`",
                "default": RAY_BRANCH,
                "key": "ray_branch",
            },
            {
                "text": (
                    "RAY_VERSION: Please specify the Ray version used "
                    "to find the wheel."
                ),
                "hint": (
                    "Leave empty for latest master. For releases, "
                    "specify the release version."
                ),
                "required": False,
                "default": RAY_VERSION,
                "key": "ray_version",
            },
            {
                "text": "RAY_WHEELS: Please specify the Ray wheel URL.",
                "hint": (
                    "ATTENTION: If you provide this, RAY_REPO, "
                    "RAY_BRANCH and RAY_VERSION will be ignored! "
                    "Please also make sure to provide the wheels URL "
                    "for Python 3.7 on Linux.\n"
                    "You can also insert a commit hash here instead "
                    "of a full URL.\n"
                    "NOTE: You can specify multiple commits or URLs "
                    "for easy bisection (one per line) - this will "
                    "run each test on each of the specified wheels."
                ),
                "required": False,
                "default": RAY_WHEELS,
                "key": "ray_wheels",
            },
            {
                "text": (
                    "RAY_TEST_REPO: Please specify the Ray repository "
                    "used to find the tests you would like to run."
                ),
                "hint": (
                    "If you're developing a new release test, this "
                    "will most likely be your GitHub fork."
                ),
                "default": RAY_TEST_REPO,
                "key": "ray_test_repo",
            },
            {
                "text": (
                    "RAY_TEST_BRANCH: Please specify the Ray branch used "
                    "to find the tests you would like to run."
                ),
                "hint": (
                    "If you're developing a new release test, this "
                    "will most likely be a branch living on your "
                    "GitHub fork."
                ),
                "default": RAY_TEST_BRANCH,
                "key": "ray_test_branch",
            },
            {
                # "select" (not "text") renders a dropdown in Buildkite.
                "select": (
                    "RELEASE_TEST_SUITE: Please specify the release "
                    "test suite containing the tests you would like "
                    "to run."
                ),
                "hint": (
                    "Check in the `build_pipeline.py` if you're "
                    "unsure which suite contains your tests."
                ),
                "required": True,
                "options": sorted(SUITES.keys()),
                "default": RELEASE_TEST_SUITE,
                "key": "release_test_suite",
            },
            {
                "text": (
                    "FILTER_FILE: Please specify a filter for the "
                    "test files that should be included in this build."
                ),
                "hint": (
                    "Only test files (e.g. xgboost_tests.yml) that "
                    "match this string will be included in the test"
                ),
                "default": FILTER_FILE,
                "required": False,
                "key": "filter_file",
            },
            {
                "text": (
                    "FILTER_TEST: Please specify a filter for the "
                    "test names that should be included in this build."
                ),
                "hint": (
                    "Only test names (e.g. tune_4x32) that match "
                    "this string will be included in the test"
                ),
                "default": FILTER_TEST,
                "required": False,
                "key": "filter_test",
            },
        ],
        "key": "input_ask_step",
    }

    run_again_step = {
        "commands": [
            # Turn each collected metadata key back into an env variable.
            f'export {v}=$(buildkite-agent meta-data get "{k}")'
            for k, v in {
                "ray_branch": "RAY_BRANCH",
                "ray_repo": "RAY_REPO",
                "ray_version": "RAY_VERSION",
                "ray_wheels": "RAY_WHEELS",
                "ray_test_branch": "RAY_TEST_BRANCH",
                "ray_test_repo": "RAY_TEST_REPO",
                "release_test_suite": "RELEASE_TEST_SUITE",
                "filter_file": "FILTER_FILE",
                "filter_test": "FILTER_TEST",
            }.items()
        ]
        + [
            # AUTOMATIC=1 makes the re-run build the pipeline directly
            # instead of asking for input again.
            "export AUTOMATIC=1",
            "python3 -m pip install --user pyyaml",
            "rm -rf ~/ray || true",
            # $$ escapes $ for Buildkite so expansion happens at runtime.
            "git clone -b $${RAY_TEST_BRANCH} $${RAY_TEST_REPO} ~/ray",
            (
                "python3 ~/ray/release/.buildkite/build_pipeline.py "
                "| buildkite-agent pipeline upload"
            ),
        ],
        "label": ":pipeline: Again",
        "agents": {"queue": "runner_queue_branch"},
        "depends_on": "input_ask_step",
        "key": "run_again_step",
    }

    return [
        input_ask_step,
        run_again_step,
    ]
|
|
||||||
|
|
||||||
|
|
||||||
def create_test_step(
    ray_repo: str,
    ray_branch: str,
    ray_version: str,
    ray_wheels: str,
    ray_test_repo: str,
    ray_test_branch: str,
    test_file: str,
    test_name: ReleaseTest,
):
    """Build the Buildkite step dict that runs a single release test.

    Args:
        ray_repo/ray_branch/ray_version: Where run_e2e.sh looks for the
            Ray wheel to test.
        ray_wheels: Direct wheel URL or commit hash; when set it overrides
            repo/branch/version for wheel lookup.
        ray_test_repo/ray_test_branch: Where the test configs/scripts live.
        test_file: Path to the test suite yaml file.
        test_name: Test to run; its smoke_test/retry attributes are honored.

    Returns:
        A step dict based on DEFAULT_STEP_TEMPLATE with command, label and
        retry configuration filled in.
    """
    custom_commit_str = "custom_wheels_url"
    if ray_wheels:
        # If the wheel URL embeds a full 40-char commit sha, show that in
        # the step label instead of the generic placeholder.
        # (re.search instead of a one-shot re.compile + search.)
        m = re.search(r"([a-f0-9]{40})", ray_wheels)
        if m is not None:
            custom_commit_str = m.group(1)

    ray_wheels_str = f" ({ray_wheels}) " if ray_wheels else ""

    logging.info(f"Creating step for {test_file}/{test_name}{ray_wheels_str}")

    # Base command: the e2e runner plus where to find wheels and tests.
    # (Constant first segment no longer carries a pointless f-prefix.)
    cmd = (
        "./release/run_e2e.sh "
        f'--ray-repo "{ray_repo}" '
        f'--ray-branch "{ray_branch}" '
        f'--ray-version "{ray_version}" '
        f'--ray-wheels "{ray_wheels}" '
        f'--ray-test-repo "{ray_test_repo}" '
        f'--ray-test-branch "{ray_test_branch}" '
    )

    args = (
        f"--category {ray_branch} "
        f"--test-config {test_file} "
        f"--test-name {test_name} "
        "--keep-results-dir"
    )

    if test_name.smoke_test:
        logging.info("This test will run as a smoke test.")
        args += " --smoke-test"

    step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

    if test_name.retry:
        logging.info(f"This test will be retried up to {test_name.retry} times.")
        # Explicit retry count: retry on any exit status.
        step_conf["retry"] = {
            "automatic": [{"exit_status": "*", "limit": test_name.retry}]
        }
    else:
        # Default retry logic
        # Warning: Exit codes are currently not correctly propagated to
        # buildkite! Thus, actual retry logic is currently implemented in
        # the run_e2e.sh script!
        step_conf["retry"] = {
            "automatic": [
                {"exit_status": 7, "limit": 2},  # Prepare timeout
                {"exit_status": 9, "limit": 2},  # Session timeout
                {"exit_status": 10, "limit": 2},  # Prepare error
            ],
        }

    step_conf["command"] = cmd + args

    # Label: test name, then either the wheel commit/placeholder (when a
    # direct wheel was given) or the branch, then the test repo/branch.
    # (Truth-tests ray_wheels directly instead of the derived display
    # string ray_wheels_str — same truthiness, clearer intent.)
    step_conf["label"] = (
        f"{test_name} "
        f"({custom_commit_str if ray_wheels else ray_branch}) - "
        f"{ray_test_branch}/{ray_test_repo}"
    )
    return step_conf
|
|
||||||
|
|
||||||
|
|
||||||
def build_pipeline(steps):
    """Create Buildkite steps for every test selected by the env filters.

    Args:
        steps: Mapping of test config file path -> list of test names
            (plain strings or ReleaseTest instances).

    Returns:
        List of Buildkite step dicts, one per (test, wheel) combination.
    """
    all_steps = []

    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO", "https://github.com/ray-project/ray.git")
    RAY_VERSION = os.environ.get("RAY_VERSION", "")
    RAY_WHEELS = os.environ.get("RAY_WHEELS", "")

    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    # Multiple newline-separated wheels mean "run every test once per
    # wheel" (used for bisection).
    ray_wheels_list = [""]
    if RAY_WHEELS:
        ray_wheels_list = RAY_WHEELS.split("\n")

    if len(ray_wheels_list) > 1:
        # Fixed typo in the log message ("bisec" -> "bisection").
        logging.info(
            f"This will run a bisection on the following URLs/commits: "
            f"{ray_wheels_list}"
        )

    # Fixed: the "configurations and scripts:" line was missing its
    # trailing newline, which glued it to the RAY_TEST_REPO line.
    logging.info(
        f"Building pipeline \n"
        f"Ray repo/branch to test:\n"
        f" RAY_REPO = {RAY_REPO}\n"
        f" RAY_BRANCH = {RAY_BRANCH}\n\n"
        f" RAY_VERSION = {RAY_VERSION}\n\n"
        f" RAY_WHEELS = {RAY_WHEELS}\n\n"
        f"Ray repo/branch containing the test configurations and scripts:\n"
        f" RAY_TEST_REPO = {RAY_TEST_REPO}\n"
        f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"
        f"Filtering for these tests:\n"
        f" FILTER_FILE = {FILTER_FILE}\n"
        f" FILTER_TEST = {FILTER_TEST}\n\n"
    )

    for test_file, test_names in steps.items():
        # FILTER_FILE is a substring filter on the config file path.
        if FILTER_FILE and FILTER_FILE not in test_file:
            continue

        test_base = os.path.basename(test_file)
        for test_name in test_names:
            # FILTER_TEST is a substring filter on the test name
            # (ReleaseTest.__contains__ delegates to the name string).
            if FILTER_TEST and FILTER_TEST not in test_name:
                continue

            if not isinstance(test_name, ReleaseTest):
                test_name = ReleaseTest(name=test_name)

            logging.info(f"Adding test: {test_base}/{test_name}")

            for ray_wheels in ray_wheels_list:
                step_conf = create_test_step(
                    ray_repo=RAY_REPO,
                    ray_branch=RAY_BRANCH,
                    ray_version=RAY_VERSION,
                    ray_wheels=ray_wheels,
                    ray_test_repo=RAY_TEST_REPO,
                    ray_test_branch=RAY_TEST_BRANCH,
                    test_file=test_file,
                    test_name=test_name,
                )

                all_steps.append(step_conf)

    return all_steps
|
|
||||||
|
|
||||||
|
|
||||||
def alert_pipeline(stats: bool = False):
    """Return a one-step pipeline that sends the periodic release alert.

    Args:
        stats: When True, run alert.py in stats-only mode (--stats).
    """
    alert_cmd = "python release/alert.py"
    if stats:
        alert_cmd += " --stats"

    step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
    step_conf["commands"] = [
        "pip install -q -r release/requirements.txt",
        "pip install -U boto3 botocore",
        alert_cmd,
    ]
    step_conf["label"] = f"Send periodic alert (stats_only = {stats})"
    return [step_conf]
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # RELEASE_ALERT: "1" builds the alert step, "stats" the stats-only alert.
    alert = os.environ.get("RELEASE_ALERT", "0")

    # AUTOMATIC=1 means we were re-triggered by the "Again" step (see
    # ask_configuration) and should build the pipeline directly instead of
    # asking for input.
    ask_for_config = not bool(int(os.environ.get("AUTOMATIC", "0")))

    if alert in ["1", "stats"]:
        steps = alert_pipeline(alert == "stats")
    elif ask_for_config:
        steps = ask_configuration()
    else:
        TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
        # Raises KeyError for an unknown suite name.
        PIPELINE_SPEC = SUITES[TEST_SUITE]

        steps = build_pipeline(PIPELINE_SPEC)

    # Emit the pipeline as YAML on stdout for `buildkite-agent pipeline upload`.
    yaml.dump({"steps": steps}, sys.stdout)
|
|
441
release/alert.py
441
release/alert.py
|
@ -1,441 +0,0 @@
|
||||||
import argparse
|
|
||||||
from collections import defaultdict, Counter
|
|
||||||
from typing import Any, List, Tuple, Mapping, Optional
|
|
||||||
import datetime
|
|
||||||
import hashlib
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import requests
|
|
||||||
import sys
|
|
||||||
|
|
||||||
import boto3
|
|
||||||
|
|
||||||
from e2e import GLOBAL_CONFIG
|
|
||||||
|
|
||||||
from alerts.default import handle_result as default_handle_result
|
|
||||||
from alerts.rllib_tests import handle_result as rllib_tests_handle_result
|
|
||||||
from alerts.long_running_tests import handle_result as long_running_tests_handle_result
|
|
||||||
from alerts.tune_tests import handle_result as tune_tests_handle_result
|
|
||||||
from alerts.xgboost_tests import handle_result as xgboost_tests_handle_result
|
|
||||||
|
|
||||||
SUITE_TO_FN = {
|
|
||||||
"long_running_tests": long_running_tests_handle_result,
|
|
||||||
"rllib_tests": rllib_tests_handle_result,
|
|
||||||
"tune_tests": tune_tests_handle_result,
|
|
||||||
"xgboost_tests": xgboost_tests_handle_result,
|
|
||||||
}
|
|
||||||
|
|
||||||
GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"] = "alert_state"
|
|
||||||
GLOBAL_CONFIG["SLACK_WEBHOOK"] = os.environ.get("SLACK_WEBHOOK", "")
|
|
||||||
GLOBAL_CONFIG["SLACK_CHANNEL"] = os.environ.get("SLACK_CHANNEL", "#oss-test-cop")
|
|
||||||
|
|
||||||
RESULTS_LIMIT = 120
|
|
||||||
|
|
||||||
logger = logging.getLogger()
|
|
||||||
logger.setLevel(logging.INFO)
|
|
||||||
handler = logging.StreamHandler(stream=sys.stdout)
|
|
||||||
formatter = logging.Formatter(
|
|
||||||
fmt="[%(levelname)s %(asctime)s] " "%(filename)s: %(lineno)d " "%(message)s"
|
|
||||||
)
|
|
||||||
handler.setFormatter(formatter)
|
|
||||||
logger.addHandler(handler)
|
|
||||||
|
|
||||||
|
|
||||||
def maybe_fetch_slack_webhook():
|
|
||||||
if GLOBAL_CONFIG["SLACK_WEBHOOK"] in [None, ""]:
|
|
||||||
print("Missing SLACK_WEBHOOK, retrieving from AWS secrets store")
|
|
||||||
GLOBAL_CONFIG["SLACK_WEBHOOK"] = boto3.client(
|
|
||||||
"secretsmanager", region_name="us-west-2"
|
|
||||||
).get_secret_value(
|
|
||||||
SecretId="arn:aws:secretsmanager:us-west-2:029272617770:secret:"
|
|
||||||
"release-automation/"
|
|
||||||
"slack-webhook-Na0CFP"
|
|
||||||
)[
|
|
||||||
"SecretString"
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def _obj_hash(obj: Any) -> str:
|
|
||||||
json_str = json.dumps(obj, sort_keys=True, ensure_ascii=True)
|
|
||||||
sha = hashlib.sha256()
|
|
||||||
sha.update(json_str.encode())
|
|
||||||
return sha.hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_latest_alerts(rds_data_client):
|
|
||||||
schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]
|
|
||||||
|
|
||||||
sql = f"""
|
|
||||||
SELECT DISTINCT ON (category, test_suite, test_name)
|
|
||||||
category, test_suite, test_name, last_result_hash,
|
|
||||||
last_notification_dt
|
|
||||||
FROM {schema}
|
|
||||||
ORDER BY category, test_suite, test_name, last_notification_dt DESC
|
|
||||||
LIMIT {RESULTS_LIMIT}
|
|
||||||
"""
|
|
||||||
|
|
||||||
result = rds_data_client.execute_statement(
|
|
||||||
database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
|
|
||||||
secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
|
|
||||||
resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
|
|
||||||
schema=schema,
|
|
||||||
sql=sql,
|
|
||||||
)
|
|
||||||
for row in result["records"]:
|
|
||||||
category, test_suite, test_name, last_result_hash, last_notification_dt = (
|
|
||||||
r["stringValue"] if "stringValue" in r else None for r in row
|
|
||||||
)
|
|
||||||
last_notification_dt = datetime.datetime.strptime(
|
|
||||||
last_notification_dt, "%Y-%m-%d %H:%M:%S"
|
|
||||||
)
|
|
||||||
yield category, test_suite, test_name, last_result_hash, last_notification_dt
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_latest_results(
|
|
||||||
rds_data_client, fetch_since: Optional[datetime.datetime] = None
|
|
||||||
):
|
|
||||||
schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]
|
|
||||||
|
|
||||||
sql = f"""
|
|
||||||
SELECT DISTINCT ON (category, test_suite, test_name)
|
|
||||||
created_on, category, test_suite, test_name, status, results,
|
|
||||||
artifacts, last_logs
|
|
||||||
FROM {schema} """
|
|
||||||
|
|
||||||
parameters = []
|
|
||||||
if fetch_since is not None:
|
|
||||||
sql += "WHERE created_on >= :created_on "
|
|
||||||
parameters = [
|
|
||||||
{
|
|
||||||
"name": "created_on",
|
|
||||||
"typeHint": "TIMESTAMP",
|
|
||||||
"value": {"stringValue": fetch_since.strftime("%Y-%m-%d %H:%M:%S")},
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
sql += "ORDER BY category, test_suite, test_name, created_on DESC "
|
|
||||||
sql += f"LIMIT {RESULTS_LIMIT}"
|
|
||||||
|
|
||||||
result = rds_data_client.execute_statement(
|
|
||||||
database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
|
|
||||||
secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
|
|
||||||
resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
|
|
||||||
schema=schema,
|
|
||||||
sql=sql,
|
|
||||||
parameters=parameters,
|
|
||||||
)
|
|
||||||
for row in result["records"]:
|
|
||||||
(
|
|
||||||
created_on,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
status,
|
|
||||||
results,
|
|
||||||
artifacts,
|
|
||||||
last_logs,
|
|
||||||
) = (r["stringValue"] if "stringValue" in r else None for r in row)
|
|
||||||
|
|
||||||
# Calculate hash before converting strings to objects
|
|
||||||
result_obj = (
|
|
||||||
created_on,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
status,
|
|
||||||
results,
|
|
||||||
artifacts,
|
|
||||||
last_logs,
|
|
||||||
)
|
|
||||||
result_json = json.dumps(result_obj)
|
|
||||||
result_hash = _obj_hash(result_json)
|
|
||||||
|
|
||||||
# Convert some strings to python objects
|
|
||||||
created_on = datetime.datetime.strptime(created_on, "%Y-%m-%d %H:%M:%S")
|
|
||||||
results = json.loads(results)
|
|
||||||
artifacts = json.loads(artifacts)
|
|
||||||
|
|
||||||
yield result_hash, created_on, category, test_suite, test_name, status, results, artifacts, last_logs # noqa: E501
|
|
||||||
|
|
||||||
|
|
||||||
def mark_as_handled(
|
|
||||||
rds_data_client,
|
|
||||||
update: bool,
|
|
||||||
category: str,
|
|
||||||
test_suite: str,
|
|
||||||
test_name: str,
|
|
||||||
result_hash: str,
|
|
||||||
last_notification_dt: datetime.datetime,
|
|
||||||
):
|
|
||||||
schema = GLOBAL_CONFIG["RELEASE_AWS_DB_STATE_TABLE"]
|
|
||||||
|
|
||||||
if not update:
|
|
||||||
sql = f"""
|
|
||||||
INSERT INTO {schema}
|
|
||||||
(category, test_suite, test_name,
|
|
||||||
last_result_hash, last_notification_dt)
|
|
||||||
VALUES (:category, :test_suite, :test_name,
|
|
||||||
:last_result_hash, :last_notification_dt)
|
|
||||||
"""
|
|
||||||
else:
|
|
||||||
sql = f"""
|
|
||||||
UPDATE {schema}
|
|
||||||
SET last_result_hash=:last_result_hash,
|
|
||||||
last_notification_dt=:last_notification_dt
|
|
||||||
WHERE category=:category AND test_suite=:test_suite
|
|
||||||
AND test_name=:test_name
|
|
||||||
"""
|
|
||||||
|
|
||||||
rds_data_client.execute_statement(
|
|
||||||
database=GLOBAL_CONFIG["RELEASE_AWS_DB_NAME"],
|
|
||||||
parameters=[
|
|
||||||
{"name": "category", "value": {"stringValue": category}},
|
|
||||||
{"name": "test_suite", "value": {"stringValue": test_suite or ""}},
|
|
||||||
{"name": "test_name", "value": {"stringValue": test_name}},
|
|
||||||
{"name": "last_result_hash", "value": {"stringValue": result_hash}},
|
|
||||||
{
|
|
||||||
"name": "last_notification_dt",
|
|
||||||
"typeHint": "TIMESTAMP",
|
|
||||||
"value": {
|
|
||||||
"stringValue": last_notification_dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
secretArn=GLOBAL_CONFIG["RELEASE_AWS_DB_SECRET_ARN"],
|
|
||||||
resourceArn=GLOBAL_CONFIG["RELEASE_AWS_DB_RESOURCE_ARN"],
|
|
||||||
schema=schema,
|
|
||||||
sql=sql,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def post_alerts_to_slack(
|
|
||||||
channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
|
|
||||||
):
|
|
||||||
if len(alerts) == 0:
|
|
||||||
logger.info("No alerts to post to slack.")
|
|
||||||
return
|
|
||||||
|
|
||||||
markdown_lines = [
|
|
||||||
f"* {len(alerts)} new release test failures found!*",
|
|
||||||
"",
|
|
||||||
]
|
|
||||||
|
|
||||||
category_alerts = defaultdict(list)
|
|
||||||
for (category, test_suite, test_name, alert) in alerts:
|
|
||||||
category_alerts[category].append(
|
|
||||||
f" *{test_suite}/{test_name}* failed: {alert}"
|
|
||||||
)
|
|
||||||
|
|
||||||
for category, alert_list in category_alerts.items():
|
|
||||||
markdown_lines.append(f"Branch: *{category}*")
|
|
||||||
markdown_lines.extend(alert_list)
|
|
||||||
markdown_lines.append("")
|
|
||||||
|
|
||||||
total_non_alerts = sum(n for n in non_alerts.values())
|
|
||||||
non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]
|
|
||||||
|
|
||||||
markdown_lines += [
|
|
||||||
f"Additionally, {total_non_alerts} tests passed successfully "
|
|
||||||
f"({', '.join(non_alert_detail)})."
|
|
||||||
]
|
|
||||||
|
|
||||||
slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]
|
|
||||||
|
|
||||||
resp = requests.post(
|
|
||||||
slack_url,
|
|
||||||
json={
|
|
||||||
"text": "\n".join(markdown_lines),
|
|
||||||
"channel": channel,
|
|
||||||
"username": "Fail Bot",
|
|
||||||
"icon_emoji": ":red_circle:",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
print(resp.status_code)
|
|
||||||
print(resp.text)
|
|
||||||
|
|
||||||
|
|
||||||
def post_statistics_to_slack(
|
|
||||||
channel: str, alerts: List[Tuple[str, str, str, str]], non_alerts: Mapping[str, int]
|
|
||||||
):
|
|
||||||
total_alerts = len(alerts)
|
|
||||||
|
|
||||||
category_alerts = defaultdict(list)
|
|
||||||
for (category, test_suite, test_name, alert) in alerts:
|
|
||||||
category_alerts[category].append(f"`{test_suite}/{test_name}`")
|
|
||||||
|
|
||||||
alert_detail = [f"{len(a)} on {c}" for c, a in category_alerts.items()]
|
|
||||||
|
|
||||||
total_non_alerts = sum(n for n in non_alerts.values())
|
|
||||||
non_alert_detail = [f"{n} on {c}" for c, n in non_alerts.items()]
|
|
||||||
|
|
||||||
markdown_lines = [
|
|
||||||
"*Periodic release test report*",
|
|
||||||
"",
|
|
||||||
f"In the past 24 hours, "
|
|
||||||
f"*{total_non_alerts}* release tests finished successfully, and "
|
|
||||||
f"*{total_alerts}* release tests failed.",
|
|
||||||
]
|
|
||||||
|
|
||||||
markdown_lines.append("")
|
|
||||||
|
|
||||||
if total_alerts:
|
|
||||||
markdown_lines.append(f"*Failing:* {', '.join(alert_detail)}")
|
|
||||||
for c, a in category_alerts.items():
|
|
||||||
markdown_lines.append(f" *{c}*: {', '.join(sorted(a))}")
|
|
||||||
else:
|
|
||||||
markdown_lines.append("*Failing:* None")
|
|
||||||
|
|
||||||
markdown_lines.append("")
|
|
||||||
|
|
||||||
if total_non_alerts:
|
|
||||||
markdown_lines.append(f"*Passing:* {', '.join(non_alert_detail)}")
|
|
||||||
else:
|
|
||||||
markdown_lines.append("*Passing:* None")
|
|
||||||
|
|
||||||
slack_url = GLOBAL_CONFIG["SLACK_WEBHOOK"]
|
|
||||||
|
|
||||||
resp = requests.post(
|
|
||||||
slack_url,
|
|
||||||
json={
|
|
||||||
"text": "\n".join(markdown_lines),
|
|
||||||
"channel": channel,
|
|
||||||
"username": "Fail Bot",
|
|
||||||
"icon_emoji": ":red_circle:",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
print(resp.status_code)
|
|
||||||
print(resp.text)
|
|
||||||
|
|
||||||
|
|
||||||
def handle_results_and_get_alerts(
|
|
||||||
rds_data_client,
|
|
||||||
fetch_since: Optional[datetime.datetime] = None,
|
|
||||||
always_try_alert: bool = False,
|
|
||||||
no_status_update: bool = False,
|
|
||||||
):
|
|
||||||
# First build a map of last notifications
|
|
||||||
last_notifications_map = {}
|
|
||||||
for (
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
last_result_hash,
|
|
||||||
last_notification_dt,
|
|
||||||
) in fetch_latest_alerts(rds_data_client):
|
|
||||||
last_notifications_map[(category, test_suite, test_name)] = (
|
|
||||||
last_result_hash,
|
|
||||||
last_notification_dt,
|
|
||||||
)
|
|
||||||
|
|
||||||
alerts = []
|
|
||||||
non_alerts = Counter()
|
|
||||||
|
|
||||||
# Then fetch latest results
|
|
||||||
for (
|
|
||||||
result_hash,
|
|
||||||
created_on,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
status,
|
|
||||||
results,
|
|
||||||
artifacts,
|
|
||||||
last_logs,
|
|
||||||
) in fetch_latest_results(rds_data_client, fetch_since=fetch_since):
|
|
||||||
key = (category, test_suite, test_name)
|
|
||||||
|
|
||||||
try_alert = always_try_alert
|
|
||||||
if key in last_notifications_map:
|
|
||||||
# If we have an alert for this key, fetch info
|
|
||||||
last_result_hash, last_notification_dt = last_notifications_map[key]
|
|
||||||
|
|
||||||
if last_result_hash != result_hash:
|
|
||||||
# If we got a new result, handle new result
|
|
||||||
try_alert = True
|
|
||||||
# Todo: maybe alert again after some time?
|
|
||||||
else:
|
|
||||||
try_alert = True
|
|
||||||
|
|
||||||
if try_alert:
|
|
||||||
handle_fn = SUITE_TO_FN.get(test_suite, None)
|
|
||||||
if not handle_fn:
|
|
||||||
logger.warning(f"No handle for suite {test_suite}")
|
|
||||||
alert = default_handle_result(
|
|
||||||
created_on,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
status,
|
|
||||||
results,
|
|
||||||
artifacts,
|
|
||||||
last_logs,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
alert = handle_fn(
|
|
||||||
created_on,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
status,
|
|
||||||
results,
|
|
||||||
artifacts,
|
|
||||||
last_logs,
|
|
||||||
)
|
|
||||||
|
|
||||||
if alert:
|
|
||||||
logger.warning(
|
|
||||||
f"Alert raised for test {test_suite}/{test_name} "
|
|
||||||
f"({category}): {alert}"
|
|
||||||
)
|
|
||||||
|
|
||||||
alerts.append((category, test_suite, test_name, alert))
|
|
||||||
else:
|
|
||||||
logger.debug(
|
|
||||||
f"No alert raised for test {test_suite}/{test_name} "
|
|
||||||
f"({category})"
|
|
||||||
)
|
|
||||||
non_alerts[category] += 1
|
|
||||||
|
|
||||||
if not no_status_update:
|
|
||||||
mark_as_handled(
|
|
||||||
rds_data_client,
|
|
||||||
key in last_notifications_map,
|
|
||||||
category,
|
|
||||||
test_suite,
|
|
||||||
test_name,
|
|
||||||
result_hash,
|
|
||||||
datetime.datetime.now(),
|
|
||||||
)
|
|
||||||
|
|
||||||
return alerts, non_alerts
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"--stats",
|
|
||||||
action="store_true",
|
|
||||||
default=False,
|
|
||||||
help="Finish quickly for training.",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
maybe_fetch_slack_webhook()
|
|
||||||
|
|
||||||
rds_data_client = boto3.client("rds-data", region_name="us-west-2")
|
|
||||||
|
|
||||||
if args.stats:
|
|
||||||
# Only update last 24 hour stats
|
|
||||||
fetch_since = datetime.datetime.now() - datetime.timedelta(days=1)
|
|
||||||
alerts, non_alerts = handle_results_and_get_alerts(
|
|
||||||
rds_data_client,
|
|
||||||
fetch_since=fetch_since,
|
|
||||||
always_try_alert=True,
|
|
||||||
no_status_update=True,
|
|
||||||
)
|
|
||||||
post_statistics_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)
|
|
||||||
|
|
||||||
else:
|
|
||||||
alerts, non_alerts = handle_results_and_get_alerts(rds_data_client)
|
|
||||||
post_alerts_to_slack(GLOBAL_CONFIG["SLACK_CHANNEL"], alerts, non_alerts)
|
|
|
@ -1,145 +0,0 @@
|
||||||
- name: single_node
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: single_node.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 12000
|
|
||||||
prepare: sleep 0
|
|
||||||
script: python single_node/test_single_node.py
|
|
||||||
|
|
||||||
- name: object_store
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: object_store.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=50
|
|
||||||
script: python object_store/test_object_store.py
|
|
||||||
|
|
||||||
- name: many_actors
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=65
|
|
||||||
script: python distributed/test_many_actors.py
|
|
||||||
|
|
||||||
- name: many_actors_smoke_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed_smoke_test.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=2
|
|
||||||
script: SMOKE_TEST=1 python distributed/test_many_actors.py
|
|
||||||
|
|
||||||
- name: many_tasks
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=65
|
|
||||||
script: python distributed/test_many_tasks.py --num-tasks=10000
|
|
||||||
|
|
||||||
- name: many_tasks_smoke_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed_smoke_test.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=2
|
|
||||||
script: python distributed/test_many_tasks.py --num-tasks=100
|
|
||||||
|
|
||||||
- name: many_pgs
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=65
|
|
||||||
script: python distributed/test_many_pgs.py
|
|
||||||
|
|
||||||
- name: many_pgs_smoke_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: distributed_smoke_test.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=2
|
|
||||||
script: SMOKE_TEST=1 python distributed/test_many_pgs.py
|
|
||||||
|
|
||||||
# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
|
|
||||||
- name: many_nodes
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: many_nodes.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600 # 1hr
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=250
|
|
||||||
script: python distributed/test_many_tasks.py --num-tasks=1000
|
|
||||||
|
|
||||||
- name: scheduling_test_many_0s_tasks_single_node
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: scheduling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=32
|
|
||||||
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=1 --num-actors-per-nodes=1
|
|
||||||
|
|
||||||
- name: scheduling_test_many_0s_tasks_many_nodes
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: scheduling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=32
|
|
||||||
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=0 --total-num-actors=32 --num-actors-per-nodes=1
|
|
||||||
|
|
||||||
- name: scheduling_test_many_5s_tasks_single_node
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: scheduling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=32
|
|
||||||
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=1 --num-actors-per-nodes=1
|
|
||||||
stable: false
|
|
||||||
|
|
||||||
- name: scheduling_test_many_5s_tasks_many_nodes
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: scheduling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python distributed/wait_cluster.py --num-nodes=32
|
|
||||||
script: python distributed/test_scheduling.py --total-num-task=1984000 --num-cpu-per-task=1 --task-duration-s=5 --total-num-actors=32 --num-actors-per-nodes=1
|
|
||||||
stable: false
|
|
|
@ -1,24 +0,0 @@
|
||||||
import click
|
|
||||||
import ray
|
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
def num_alive_nodes():
|
|
||||||
n = 0
|
|
||||||
for node in ray.nodes():
|
|
||||||
if node["Alive"]:
|
|
||||||
n += 1
|
|
||||||
return n
|
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
|
||||||
@click.option("--num-nodes", required=True, type=int, help="The target number of nodes")
|
|
||||||
def wait_cluster(num_nodes: int):
|
|
||||||
ray.init(address="auto")
|
|
||||||
while num_alive_nodes() != num_nodes:
|
|
||||||
print(f"Waiting for nodes: {num_alive_nodes()}/{num_nodes}")
|
|
||||||
time.sleep(5)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
wait_cluster()
|
|
|
@ -1,54 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,214 +0,0 @@
|
||||||
<!doctype html>
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<meta charset="utf-8">
|
|
||||||
<title>Releaser config generator</title>
|
|
||||||
<style type="text/css">
|
|
||||||
html {
|
|
||||||
background: #cccccc;
|
|
||||||
}
|
|
||||||
body {
|
|
||||||
background: #ffffff;
|
|
||||||
font-family: sans-serif;
|
|
||||||
padding: 1em 2em;
|
|
||||||
max-width: 800px;
|
|
||||||
margin: 0 auto;
|
|
||||||
}
|
|
||||||
textarea {
|
|
||||||
width: 600px;
|
|
||||||
height: 200px;
|
|
||||||
}
|
|
||||||
form .use {
|
|
||||||
white-space: nowrap;
|
|
||||||
padding-right: 1em;
|
|
||||||
}
|
|
||||||
form .val {
|
|
||||||
min-width: 300px;
|
|
||||||
}
|
|
||||||
form .val input {
|
|
||||||
width: 90%;
|
|
||||||
}
|
|
||||||
form .desc {
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
<script type="text/javascript">
|
|
||||||
var env_vars = [
|
|
||||||
{
|
|
||||||
"name": "RAY_TEST_REPO",
|
|
||||||
"short": "Git repo with test files",
|
|
||||||
"long": "Repository in which the test files are which you would like to run. Note that this doesn't have to be the same repo from which the wheels are installed.",
|
|
||||||
"default": "https://github.com/ray-project/ray.git",
|
|
||||||
"enabled": false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "RAY_TEST_BRANCH",
|
|
||||||
"short": "Git branch for test repo",
|
|
||||||
"long": "Git branch that is checked out from RAY_TEST_REPO and which contains the test files you would like to run. Note that this doesnt' have to be the same branch you're fetching the Ray wheels from.",
|
|
||||||
"default": "master",
|
|
||||||
"enabled": false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "RAY_REPO",
|
|
||||||
"short": "Git repo for the Ray wheels",
|
|
||||||
"long": "Repository from which to fetch the latest commits to find the Ray wheels",
|
|
||||||
"default": "https://github.com/ray-project/ray.git",
|
|
||||||
"enabled": false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "RAY_BRANCH",
|
|
||||||
"short": "Git branch for the Ray wheels",
|
|
||||||
"long": "Branch that is check out from RAY_REPO from which the latest commits are fetched to find the Ray wheels",
|
|
||||||
"default": "master",
|
|
||||||
"enabled": true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "RELEASE_TEST_SUITE",
|
|
||||||
"short": "Release test suite (nightly/weekly/manual)",
|
|
||||||
"long": "Release test suite as defined in releaser's build_pipeline.py",
|
|
||||||
"default": "nightly",
|
|
||||||
"enabled": true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "FILTER_FILE",
|
|
||||||
"short": "Filter test file by this string",
|
|
||||||
"long": "Only test files (e.g. xgboost_tests.yml) that match this string will be included in the test",
|
|
||||||
"default": "",
|
|
||||||
"enabled": false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "FILTER_TEST",
|
|
||||||
"short": "Filter test name by this string",
|
|
||||||
"long": "Only test names (e.g. tune_4x32) that match this string will be included in the test",
|
|
||||||
"default": "",
|
|
||||||
"enabled": false,
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
window.addEventListener('load', function () {
|
|
||||||
|
|
||||||
var table = document.getElementById("gen_table");
|
|
||||||
|
|
||||||
for (var env_var of env_vars) {
|
|
||||||
|
|
||||||
var use_td = document.createElement("td");
|
|
||||||
use_td.setAttribute("class", "use");
|
|
||||||
|
|
||||||
var use_input = document.createElement("input");
|
|
||||||
use_input.setAttribute("type", "checkbox");
|
|
||||||
use_input.setAttribute("data-activate", env_var["name"] + "_val");
|
|
||||||
use_input.setAttribute("id", env_var["name"] + "_use");
|
|
||||||
use_input.setAttribute("class", "input_use");
|
|
||||||
if (env_var["enabled"]) {
|
|
||||||
use_input.checked = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
var use_label = document.createElement("label");
|
|
||||||
use_label.setAttribute("for", env_var["name"] + "_use");
|
|
||||||
use_label.innerHTML = env_var["name"];
|
|
||||||
|
|
||||||
use_td.append(use_input);
|
|
||||||
use_td.append(use_label);
|
|
||||||
|
|
||||||
val_td = document.createElement("td");
|
|
||||||
val_td.setAttribute("class", "val");
|
|
||||||
|
|
||||||
val_input = document.createElement("input");
|
|
||||||
val_input.setAttribute("type", "text");
|
|
||||||
if (!env_var["enabled"]) {
|
|
||||||
val_input.setAttribute("disabled", "disabled");
|
|
||||||
}
|
|
||||||
val_input.setAttribute("id", env_var["name"] + "_val");
|
|
||||||
val_input.setAttribute("name", env_var["name"]);
|
|
||||||
val_input.setAttribute("value", env_var["default"]);
|
|
||||||
val_input.setAttribute("class", "input_val");
|
|
||||||
|
|
||||||
val_td.append(val_input);
|
|
||||||
|
|
||||||
use_input.addEventListener("click", function(e) {
|
|
||||||
var toggle_val = document.getElementById(e.target.getAttribute("data-activate"))
|
|
||||||
|
|
||||||
if (toggle_val.disabled) {
|
|
||||||
toggle_val.removeAttribute("disabled");
|
|
||||||
} else {
|
|
||||||
toggle_val.setAttribute("disabled", "disabled");
|
|
||||||
}
|
|
||||||
generate_snippet();
|
|
||||||
});
|
|
||||||
|
|
||||||
val_input.addEventListener("change", function() { generate_snippet(); });
|
|
||||||
val_input.addEventListener("keydown", function() { generate_snippet(); });
|
|
||||||
val_input.addEventListener("keyup", function() { generate_snippet(); });
|
|
||||||
|
|
||||||
var desc_td = document.createElement("td");
|
|
||||||
desc_td.setAttribute("class", "desc");
|
|
||||||
|
|
||||||
var desc_a = document.createElement("a");
|
|
||||||
desc_a.setAttribute("title", env_var["long"]);
|
|
||||||
desc_a.innerHTML = env_var["short"];
|
|
||||||
|
|
||||||
desc_td.append(desc_a);
|
|
||||||
|
|
||||||
var tr = document.createElement("tr");
|
|
||||||
tr.append(use_td);
|
|
||||||
tr.append(val_td);
|
|
||||||
tr.append(desc_td);
|
|
||||||
|
|
||||||
table.append(tr);
|
|
||||||
}
|
|
||||||
|
|
||||||
var button = document.getElementById("generate");
|
|
||||||
button.addEventListener("click", function() {
|
|
||||||
generate_snippet();
|
|
||||||
})
|
|
||||||
|
|
||||||
generate_snippet()
|
|
||||||
})
|
|
||||||
|
|
||||||
function generate_snippet() {
|
|
||||||
full_snippet = ""
|
|
||||||
for (env_var of env_vars) {
|
|
||||||
var val_input = document.getElementById(env_var["name"] + "_val")
|
|
||||||
|
|
||||||
if (!val_input.disabled) {
|
|
||||||
full_snippet += env_var["name"] + "=\"" + val_input.value + "\"\n"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
document.getElementById("snippet").innerHTML = full_snippet;
|
|
||||||
}
|
|
||||||
|
|
||||||
</script>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<header class="header">
|
|
||||||
<h1>Releaser config generator</h1>
|
|
||||||
<p>Use this form to generate a list of environment variables.</p>
|
|
||||||
<p>These variables can be passed to Buildkite to run a subset of release tests
|
|
||||||
and choose the correct wheels/release test branch</p>
|
|
||||||
</header>
|
|
||||||
<section class="main">
|
|
||||||
<form id="gen">
|
|
||||||
<table id="gen_table">
|
|
||||||
<tr>
|
|
||||||
<th>Set</th>
|
|
||||||
<th>Value</th>
|
|
||||||
<th>Description</th>
|
|
||||||
</tr>
|
|
||||||
|
|
||||||
</table>
|
|
||||||
|
|
||||||
</form>
|
|
||||||
|
|
||||||
<div>
|
|
||||||
<button id="generate">Generate snippet</button>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div>
|
|
||||||
<textarea id="snippet">
|
|
||||||
|
|
||||||
</textarea>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
2585
release/e2e.py
2585
release/e2e.py
File diff suppressed because it is too large
Load diff
|
@ -1,15 +0,0 @@
|
||||||
- name: horovod_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_master.yaml
|
|
||||||
compute_template: compute_tpl.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 36000
|
|
||||||
prepare: python wait_cluster.py 3 600
|
|
||||||
script: python workloads/horovod_tune_test.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 1800
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,92 +0,0 @@
|
||||||
- name: train_small
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/train_small.py
|
|
||||||
|
|
||||||
- name: train_moderate
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/train_moderate.py
|
|
||||||
|
|
||||||
- name: train_gpu
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_gpu.yaml
|
|
||||||
compute_template: tpl_gpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python workloads/train_gpu.py
|
|
||||||
|
|
||||||
- name: distributed_api_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
results:
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/distributed_api_test.py
|
|
||||||
results: ""
|
|
||||||
|
|
||||||
- name: ft_small_non_elastic
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/ft_small_non_elastic.py
|
|
||||||
results: ""
|
|
||||||
|
|
||||||
- name: tune_small
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/tune_small.py
|
|
||||||
|
|
||||||
- name: tune_32x4
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/tune_32x4.py
|
|
||||||
|
|
||||||
- name: tune_4x32
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/tune_4x32.py
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,13 +0,0 @@
|
||||||
- name: pytorch_pbt_failure
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: compute_tpl.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
script: python workloads/pytorch_pbt_failure.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 3600
|
|
|
@ -1,196 +0,0 @@
|
||||||
- name: actor_deaths
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/actor_deaths.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: apex
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ../rllib_tests/app_config.yaml
|
|
||||||
compute_template: tpl_cpu_3.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: python wait_cluster.py 3 600
|
|
||||||
script: python workloads/apex.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
|
|
||||||
- name: impala
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_np.yaml
|
|
||||||
compute_template: tpl_cpu_1_large.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
script: python workloads/impala.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: many_actor_tasks
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/many_actor_tasks.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
|
|
||||||
- name: many_drivers
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/many_drivers.py --iteration-num=4000
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
|
|
||||||
- name: many_ppo
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ../rllib_tests/app_config.yaml
|
|
||||||
compute_template: many_ppo.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: python wait_cluster.py 1 600
|
|
||||||
script: python workloads/many_ppo.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: many_tasks
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/many_tasks.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: many_tasks_serialized_ids
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/many_tasks_serialized_ids.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
|
|
||||||
- name: node_failures
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/node_failures.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: pbt
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ../rllib_tests/app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/pbt.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: serve
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/serve.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
- name: serve_failure
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_1.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
prepare: ray stop
|
|
||||||
script: python workloads/serve_failure.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
stable: False
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,9 +0,0 @@
|
||||||
# - name: microbenchmark
|
|
||||||
# team: core
|
|
||||||
# cluster:
|
|
||||||
# app_config: app_config.yaml
|
|
||||||
# compute_template: tpl_64.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 1800
|
|
||||||
# script: OMP_NUM_THREADS=64 RAY_ADDRESS= python run_microbenchmark.py
|
|
|
@ -1,124 +0,0 @@
|
||||||
- name: horovod_user_test_latest
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: horovod/app_config.yaml
|
|
||||||
compute_template: horovod/compute_tpl.yaml
|
|
||||||
|
|
||||||
|
|
||||||
driver_setup: horovod/driver_setup_latest.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 1200
|
|
||||||
script: python horovod/horovod_user_test.py
|
|
||||||
|
|
||||||
- name: horovod_user_test_master
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ../horovod_tests/app_config_master.yaml
|
|
||||||
compute_template: horovod/compute_tpl.yaml
|
|
||||||
|
|
||||||
driver_setup: horovod/driver_setup_master.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 1200
|
|
||||||
script: python horovod/horovod_user_test.py
|
|
||||||
|
|
||||||
|
|
||||||
- name: train_tensorflow_mnist_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: train/app_config.yaml
|
|
||||||
compute_template: train/compute_tpl.yaml
|
|
||||||
|
|
||||||
driver_setup: train/driver_setup.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 36000
|
|
||||||
script: python train/train_tensorflow_mnist_test.py
|
|
||||||
|
|
||||||
- name: train_torch_linear_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: train/app_config.yaml
|
|
||||||
compute_template: train/compute_tpl.yaml
|
|
||||||
|
|
||||||
driver_setup: train/driver_setup.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 36000
|
|
||||||
script: python train/train_torch_linear_test.py
|
|
||||||
|
|
||||||
|
|
||||||
- name: xgboost_gpu_connect_latest
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: xgboost/app_config_gpu.yaml
|
|
||||||
compute_template: xgboost/tpl_gpu_small_scaling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 1200
|
|
||||||
script: python xgboost/train_gpu_connect.py
|
|
||||||
|
|
||||||
- name: xgboost_gpu_connect_master
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: xgboost/app_config_gpu_master.yaml
|
|
||||||
compute_template: xgboost/tpl_gpu_small_scaling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 1200
|
|
||||||
script: python xgboost/train_gpu_connect.py
|
|
||||||
|
|
||||||
- name: ray_lightning_user_test_latest
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ray-lightning/app_config.yaml
|
|
||||||
compute_template: ray-lightning/compute_tpl.yaml
|
|
||||||
|
|
||||||
driver_setup: ray-lightning/driver_setup.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 1200
|
|
||||||
script: python ray-lightning/ray_lightning_user_test.py
|
|
||||||
|
|
||||||
|
|
||||||
- name: ray_lightning_user_test_master
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ray-lightning/app_config_master.yaml
|
|
||||||
compute_template: ray-lightning/compute_tpl.yaml
|
|
||||||
|
|
||||||
|
|
||||||
driver_setup: ray-lightning/driver_setup.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 1200
|
|
||||||
script: python ray-lightning/ray_lightning_user_test.py
|
|
||||||
|
|
||||||
|
|
||||||
- name: tune_rllib_connect_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: ../rllib_tests/app_config.yaml
|
|
||||||
compute_template: tune_rllib/compute_tpl.yaml
|
|
||||||
|
|
||||||
|
|
||||||
driver_setup: tune_rllib/driver_setup.sh
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 1200
|
|
||||||
script: python tune_rllib/run_connect_tests.py
|
|
|
@ -1,64 +0,0 @@
|
||||||
#
|
|
||||||
# Chaos tests.
|
|
||||||
#
|
|
||||||
|
|
||||||
# Run the test that invokes many tasks without object store usage.
|
|
||||||
- name: chaos_many_tasks_no_object_store
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: chaos_test/app_config.yaml
|
|
||||||
compute_template: chaos_test/compute_template.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
|
|
||||||
script: python chaos_test/test_chaos_basic.py --workload=tasks
|
|
||||||
|
|
||||||
- name: chaos_many_actors
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: chaos_test/app_config.yaml
|
|
||||||
compute_template: chaos_test/compute_template.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 10 600; python setup_chaos.py --no-start
|
|
||||||
script: python chaos_test/test_chaos_basic.py --workload=actors
|
|
||||||
|
|
||||||
- name: chaos_dask_on_ray_large_scale_test_no_spilling
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
# Total run time without failures is about 300-400s.
|
|
||||||
prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
|
|
||||||
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
# Test large scale dask on ray test with spilling.
|
|
||||||
- name: chaos_dask_on_ray_large_scale_test_spilling
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
# Total run time without failures is about 300-400s.
|
|
||||||
prepare: python wait_cluster.py 21 600; python setup_chaos.py --node-kill-interval 100
|
|
||||||
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
- name: chaos_pipelined_ingestion_1500_gb_15_windows
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dataset/pipelined_ingestion_app.yaml
|
|
||||||
compute_template: dataset/pipelined_ingestion_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 21 2400; python setup_chaos.py --node-kill-interval 300
|
|
||||||
script: python dataset/pipelined_training.py --epochs 1 --num-windows 15 --num-files 915 --debug
|
|
||||||
|
|
||||||
stable: false
|
|
|
@ -1,95 +0,0 @@
|
||||||
- name: inference
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: inference.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 2 600
|
|
||||||
script: python inference.py
|
|
||||||
|
|
||||||
- name: shuffle_data_loader
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 1800
|
|
||||||
script: python dataset_shuffle_data_loader.py
|
|
||||||
|
|
||||||
- name: parquet_metadata_resolution
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: pipelined_training_app.yaml
|
|
||||||
compute_template: pipelined_training_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 1200
|
|
||||||
prepare: python wait_cluster.py 15 1200
|
|
||||||
script: python parquet_metadata_resolution.py --num-files 915
|
|
||||||
|
|
||||||
- name: pipelined_training_50_gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: pipelined_training_app.yaml
|
|
||||||
compute_template: pipelined_training_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 4800
|
|
||||||
prepare: python wait_cluster.py 15 1200
|
|
||||||
script: python pipelined_training.py --epochs 1
|
|
||||||
|
|
||||||
- name: pipelined_ingestion_1500_gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: pipelined_ingestion_app.yaml
|
|
||||||
compute_template: pipelined_ingestion_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 9600
|
|
||||||
prepare: python wait_cluster.py 21 2400
|
|
||||||
script: python pipelined_training.py --epochs 2 --num-windows 2 --num-files 915 --debug
|
|
||||||
|
|
||||||
- name: datasets_ingest_train_infer
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: ray_sgd_training_app.yaml
|
|
||||||
compute_template: ray_sgd_training_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 14400
|
|
||||||
prepare: python wait_cluster.py 66 2400
|
|
||||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
app_config: ray_sgd_training_app.yaml
|
|
||||||
compute_template: ray_sgd_training_smoke_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 8 2400
|
|
||||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
|
|
||||||
|
|
||||||
- name: datasets_preprocess_ingest
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: ray_sgd_training_app.yaml
|
|
||||||
compute_template: ray_sgd_training_compute_no_gpu.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 21 2400
|
|
||||||
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu --large-dataset --debug
|
|
||||||
|
|
||||||
- name: datasets_ingest_400G
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: ray_sgd_training_app.yaml
|
|
||||||
compute_template: dataset_ingest_400G_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,390 +0,0 @@
|
||||||
#
|
|
||||||
# Single node shuffle
|
|
||||||
#
|
|
||||||
# Test basic single node 10GB shuffle with a small number of partitions.
|
|
||||||
# This doesn't require object spilling.
|
|
||||||
# - name: shuffle_10gb
|
|
||||||
# team: core
|
|
||||||
# cluster:
|
|
||||||
# app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
# compute_template: shuffle/shuffle_compute_single.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 3000
|
|
||||||
# script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=200e6
|
|
||||||
|
|
||||||
# Test single node 50GB shuffle with a large number of partitions.
|
|
||||||
- name: shuffle_50gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_single.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9
|
|
||||||
|
|
||||||
# Test single node 50GB shuffle with a large number of partitions.
|
|
||||||
- name: shuffle_50gb_large_partition
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_single.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6
|
|
||||||
|
|
||||||
# Test non streaming shuffle in a single node with a small number of partition.
|
|
||||||
- name: non_streaming_shuffle_50gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_single.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming
|
|
||||||
|
|
||||||
# Test non streaming shuffle in a single node with a large number of partition.
|
|
||||||
- name: non_streaming_shuffle_50gb_large_partition
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_single.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming
|
|
||||||
|
|
||||||
- name: dask_on_ray_10gb_sort
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
|
|
||||||
|
|
||||||
- name: dask_on_ray_100gb_sort
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 100_000_000_000 --npartitions 200 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
|
|
||||||
|
|
||||||
#
|
|
||||||
# Multi node shuffle
|
|
||||||
#
|
|
||||||
|
|
||||||
# Test multi nodes 100GB shuffle with a small number of partitions.
|
|
||||||
- name: shuffle_100gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6
|
|
||||||
|
|
||||||
# Test non streaming multi nodes 100GB shuffle with a small number of partitions.
|
|
||||||
- name: non_streaming_shuffle_100gb
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_multi.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
|
|
||||||
|
|
||||||
# Test autoscaling 1TB streaming shuffle with a large number of partitions.
|
|
||||||
- name: autoscaling_shuffle_1tb_1000_partitions
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_autoscaling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 4000
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
|
|
||||||
|
|
||||||
# Test multi nodes 1TB streaming shuffle with a large number of partitions.
|
|
||||||
- name: shuffle_1tb_1000_partition
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
prepare: python wait_cluster.py 20 900
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9
|
|
||||||
|
|
||||||
# Test multi nodes 1TB non streaming shuffle with a large number of partitions.
|
|
||||||
- name: non_streaming_shuffle_1tb_1000_partition
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
prepare: python wait_cluster.py 20 900
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=1000 --partition-size=1e9 --no-streaming
|
|
||||||
|
|
||||||
# Stress test for 1TB multi node streaming shuffle.
|
|
||||||
- name: shuffle_1tb_5000_partitions
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 9000
|
|
||||||
prepare: python wait_cluster.py 20 900
|
|
||||||
script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6
|
|
||||||
|
|
||||||
# Stress test for 1TB multi node non-streaming shuffle.
|
|
||||||
# - name: non_streaming_shuffle_1tb_5000_partitions
|
|
||||||
# team: core
|
|
||||||
# stable: False
|
|
||||||
# cluster:
|
|
||||||
# app_config: shuffle/shuffle_app_config.yaml
|
|
||||||
# compute_template: shuffle/shuffle_compute_large_scale.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 7200
|
|
||||||
# prepare: python wait_cluster.py 20 900
|
|
||||||
# script: python shuffle/shuffle_test.py --num-partitions=5000 --partition-size=200e6 --no-streaming
|
|
||||||
|
|
||||||
- name: k8s_dask_on_ray_large_scale_test_no_spilling
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_stress_compute_k8s.yaml
|
|
||||||
compute_on_k8s: True
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 21 600
|
|
||||||
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
stable: false
|
|
||||||
|
|
||||||
# # Test large scale dask on ray test without spilling.
|
|
||||||
# - name: dask_on_ray_large_scale_test_no_spilling
|
|
||||||
# team: core
|
|
||||||
# cluster:
|
|
||||||
# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
||||||
# compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 7200
|
|
||||||
# prepare: python wait_cluster.py 21 600
|
|
||||||
# script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
# smoke_test:
|
|
||||||
# cluster:
|
|
||||||
# app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
||||||
# compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 7200
|
|
||||||
# prepare: python wait_cluster.py 5 600
|
|
||||||
# script: python dask_on_ray/large_scale_test.py --num_workers 4 --worker_obj_store_size_in_gb 20 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
# Test large scale dask on ray test with spilling.
|
|
||||||
- name: dask_on_ray_large_scale_test_spilling
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 21 600
|
|
||||||
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/large_scale_dask_on_ray_compute_template.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python dask_on_ray/large_scale_test.py --num_workers 32 --worker_obj_store_size_in_gb 70 --error_rate 0 --data_save_path /tmp/ray
|
|
||||||
|
|
||||||
# Stress tests with many tasks
|
|
||||||
- name: stress_test_many_tasks
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/stress_tests_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python stress_tests/test_many_tasks.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/smoke_test_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
script: python stress_tests/test_many_tasks.py --num-nodes=4 --smoke-test
|
|
||||||
|
|
||||||
# Stress tests with dead actors
|
|
||||||
- name: stress_test_dead_actors
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/stress_tests_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python stress_tests/test_dead_actors.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/smoke_test_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
script: python stress_tests/test_dead_actors.py --num-nodes=4 --num-parents=3 --num-children=3
|
|
||||||
|
|
||||||
# Stress tests with placement groups
|
|
||||||
- name: stress_test_placement_group
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/placement_group_tests_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python stress_tests/test_placement_group.py
|
|
||||||
|
|
||||||
# Stress tests with many threaded actors.
|
|
||||||
- name: threaded_actors_stress_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/stress_test_threaded_actor_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 201 600
|
|
||||||
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/smoke_test_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
|
|
||||||
stable: false
|
|
||||||
|
|
||||||
- name: k8s_threaded_actors_stress_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: stress_tests/stress_tests_app_config.yaml
|
|
||||||
compute_template: stress_tests/k8s_stress_test_threaded_actor_compute.yaml
|
|
||||||
compute_on_k8s: True
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 201 600
|
|
||||||
script: python stress_tests/test_threaded_actors.py --test-runtime 3600 --kill-interval_s 60
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python stress_tests/test_threaded_actors.py --test-runtime 1800 --kill-interval_s 30
|
|
||||||
stable: false
|
|
||||||
|
|
||||||
# Test decision tree on autoscaling compute cluster.
|
|
||||||
- name: decision_tree_autoscaling
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: decision_tree/decision_tree_app_config.yaml
|
|
||||||
compute_template: decision_tree/autoscaling_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3000
|
|
||||||
script: python decision_tree/cart_with_tree.py
|
|
||||||
|
|
||||||
# Test 20 concurrent decision tree runs on autoscaling compute cluster.
|
|
||||||
- name: decision_tree_autoscaling_20_runs
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: decision_tree/decision_tree_app_config.yaml
|
|
||||||
compute_template: decision_tree/autoscaling_compute.yaml
|
|
||||||
run:
|
|
||||||
timeout: 9600
|
|
||||||
script: python decision_tree/cart_with_tree.py --concurrency=20
|
|
||||||
|
|
||||||
- name: dask_on_ray_1tb_sort
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: dask_on_ray/dask_on_ray_app_config.yaml
|
|
||||||
compute_template: dask_on_ray/1tb_sort_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 32 1000
|
|
||||||
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
|
|
||||||
|
|
||||||
- name: many_nodes_actor_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: many_nodes_tests/app_config.yaml
|
|
||||||
compute_template: many_nodes_tests/compute_config.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
prepare: python wait_cluster.py 251 5400
|
|
||||||
script: python many_nodes_tests/actor_test.py
|
|
||||||
|
|
||||||
- name: pg_autoscaling_regression_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: placement_group_tests/app_config.yaml
|
|
||||||
compute_template: placement_group_tests/compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 1200
|
|
||||||
script: python placement_group_tests/pg_run.py
|
|
||||||
|
|
||||||
- name: pg_long_running_performance_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: placement_group_tests/app_config.yaml
|
|
||||||
compute_template: placement_group_tests/long_running_test_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 2 600
|
|
||||||
script: python placement_group_tests/long_running_performance_test.py --num-stages 2000
|
|
||||||
|
|
||||||
- name: placement_group_performance_test
|
|
||||||
team: core
|
|
||||||
cluster:
|
|
||||||
app_config: placement_group_tests/app_config.yaml
|
|
||||||
compute_template: placement_group_tests/pg_perf_test_compute.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 1200
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python placement_group_tests/placement_group_performance_test.py
|
|
|
@ -1,54 +0,0 @@
|
||||||
"""Block until a Ray cluster reaches a target node count.

Usage: python wait_cluster.py <num_nodes> <max_time_s> [--feedback_interval_s N]

Polls ``ray.nodes()`` every 5 seconds and raises RuntimeError if the cluster
does not reach the requested size within ``max_time_s`` seconds.
"""
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)
args = parser.parse_args()

start_time = time.time()
deadline = start_time + args.max_time_s
next_report = start_time
node_count = 0

# Poll until enough nodes have registered with the cluster.
while node_count < args.num_nodes:
    now = time.time()

    if now >= deadline:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{node_count}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_report:
        elapsed = now - start_time
        print(
            f"Waiting for more nodes to come up: "
            f"{node_count}/{args.num_nodes} "
            f"({elapsed:.0f} seconds passed)"
        )
        next_report = now + args.feedback_interval_s

    time.sleep(5)
    node_count = len(ray.nodes())

elapsed = time.time() - start_time
print(
    f"Cluster is up: {node_count}/{args.num_nodes} nodes online after "
    f"{elapsed:.0f} seconds"
)
|
|
|
@ -1,103 +0,0 @@
|
||||||
# Heavy learning tests (Atari and HalfCheetah) for major algos.
|
|
||||||
- name: learning_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 8gpus_64cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 14400
|
|
||||||
script: python learning_tests/run.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 1200
|
|
||||||
|
|
||||||
# 2-GPU learning tests (CartPole and RepeatAfterMeEnv) for major algos.
|
|
||||||
- name: multi_gpu_learning_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 8gpus_96cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python multi_gpu_learning_tests/run.py
|
|
||||||
|
|
||||||
# 2-GPU learning tests (StatelessCartPole) + use_lstm=True for major algos
|
|
||||||
# (that support RNN models).
|
|
||||||
- name: multi_gpu_with_lstm_learning_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 8gpus_96cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python multi_gpu_with_lstm_learning_tests/run.py
|
|
||||||
|
|
||||||
# 2-GPU learning tests (StatelessCartPole) + use_attention=True for major
|
|
||||||
# algos (that support RNN models).
|
|
||||||
- name: multi_gpu_with_attention_learning_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 8gpus_96cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
script: python multi_gpu_with_attention_learning_tests/run.py
|
|
||||||
|
|
||||||
# We'll have these as per-PR tests soon.
|
|
||||||
# - name: example_scripts_on_gpu_tests
|
|
||||||
# team: ml
|
|
||||||
# cluster:
|
|
||||||
# app_config: app_config.yaml
|
|
||||||
# compute_template: 1gpu_4cpus.yaml
|
|
||||||
|
|
||||||
# run:
|
|
||||||
# timeout: 7200
|
|
||||||
# script: bash unit_gpu_tests/run.sh
|
|
||||||
|
|
||||||
# IMPALA large machine stress tests (4x Atari).
|
|
||||||
- name: stress_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 4gpus_544_cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 5400
|
|
||||||
prepare: python wait_cluster.py 6 600
|
|
||||||
script: python stress_tests/run_stress_tests.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 2000
|
|
||||||
|
|
||||||
# Tests that exercise auto-scaling and Anyscale connect.
|
|
||||||
- name: connect_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: auto_scale.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 3000
|
|
||||||
script: python connect_tests/run_connect_tests.py
|
|
||||||
|
|
||||||
# Nightly performance regression for popular algorithms.
|
|
||||||
# These algorithms run nightly for pre-determined amount of time without
|
|
||||||
# passing criteria.
|
|
||||||
# Performance metrics, such as reward achieved and throughput, are then
|
|
||||||
# collected and tracked over time.
|
|
||||||
- name: performance_tests
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: 12gpus_192cpus.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 10800
|
|
||||||
script: python performance_tests/run.py
|
|
|
@ -1,53 +0,0 @@
|
||||||
"""Block until a Ray cluster reaches a target node count.

Usage: python wait_cluster.py <num_nodes> <max_time_s> [--feedback_interval_s N]

Polls ``ray.nodes()`` every 5 seconds and raises RuntimeError if the cluster
does not reach the requested size within ``max_time_s`` seconds.
"""
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)
args = parser.parse_args()

start_time = time.time()
deadline = start_time + args.max_time_s
next_report = start_time
node_count = 0

# Poll until enough nodes have registered with the cluster.
while node_count < args.num_nodes:
    now = time.time()

    if now >= deadline:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{node_count}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_report:
        elapsed = now - start_time
        print(
            f"Waiting for more nodes to come up: "
            f"{node_count}/{args.num_nodes} "
            f"({elapsed:.0f} seconds passed)"
        )
        next_report = now + args.feedback_interval_s

    time.sleep(5)
    node_count = len(ray.nodes())

elapsed = time.time() - start_time
print(
    f"Cluster is up: {node_count}/{args.num_nodes} nodes online after "
    f"{elapsed:.0f} seconds"
)
|
|
|
@ -1,176 +0,0 @@
|
||||||
#!/bin/bash
# Release-test entry point: installs dependencies, clones the test repo,
# then runs e2e.py, automatically retrying on infra-related exit codes
# (prepare timeout, session timeout, prepare error).

set -ex

cd "${0%/*}" || exit 1

# Translate an e2e.py exit code into a human-readable reason string.
reason() {
    # Keep in sync with e2e.py ExitCode enum
    case $1 in
        0) REASON="success" ;;
        2) REASON="unspecified" ;;
        3) REASON="unknown" ;;
        4) REASON="runtime error" ;;
        5) REASON="command error" ;;
        6) REASON="command timeout" ;;
        7) REASON="prepare timeout" ;;
        8) REASON="filesync timeout" ;;
        9) REASON="session timeout" ;;
        10) REASON="prepare error" ;;
        11) REASON="app config build error" ;;
        12) REASON="infra error" ;;
        *) REASON="untracked error" ;;
    esac
    echo "${REASON}"
}

# Consume the flags we understand; everything after the first unknown
# argument is forwarded verbatim to e2e.py via "$@".
while [[ $# -gt 0 ]]
do
    key="$1"
    case $key in
        --ray-repo)
            shift
            RAY_REPO=$1
            ;;
        --ray-branch)
            shift
            RAY_BRANCH=$1
            ;;
        --ray-version)
            shift
            RAY_VERSION=$1
            ;;
        --ray-wheels)
            shift
            RAY_WHEELS=$1
            ;;
        --ray-test-repo)
            shift
            RAY_TEST_REPO=$1
            ;;
        --ray-test-branch)
            shift
            RAY_TEST_BRANCH=$1
            ;;
        --release-results-dir)
            shift
            RELEASE_RESULTS_DIR=$1
            ;;
        *)
            break
    esac
    shift
done

# Fall back to defaults for anything not provided on the command line.
RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}

export RAY_REPO RAY_BRANCH RAY_VERSION RAY_WHEELS RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR

pip uninstall -q -y ray
pip install -q -r requirements.txt
pip install -q -U boto3 botocore
git clone -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" ~/ray

RETRY_NUM=0
MAX_RETRIES=${MAX_RETRIES-3}

if [ "${BUILDKITE_RETRY_COUNT-0}" -ge 1 ]; then
    echo "This is a manually triggered retry from the Buildkite web UI, so we set the number of infra retries to 1."
    MAX_RETRIES=1
fi

ALL_EXIT_CODES=()
while [ "$RETRY_NUM" -lt "$MAX_RETRIES" ]; do
    RETRY_NUM=$((RETRY_NUM + 1))

    if [ "$RETRY_NUM" -gt 1 ]; then
        # Back off for a random delay (1800-7199 seconds) before retrying.
        SLEEP_TIME=$((1800 + RANDOM % 5400))
        echo "----------------------------------------"
        echo "Retry count: ${RETRY_NUM}/${MAX_RETRIES}. Sleeping for ${SLEEP_TIME} seconds before retrying the run."
        echo "----------------------------------------"
        sleep ${SLEEP_TIME}
    fi

    # Start each attempt from a clean results directory.
    sudo rm -rf "${RELEASE_RESULTS_DIR}"/* || true

    python e2e.py "$@"
    EXIT_CODE=$?
    REASON=$(reason "${EXIT_CODE}")
    ALL_EXIT_CODES[${#ALL_EXIT_CODES[@]}]=$EXIT_CODE

    case ${EXIT_CODE} in
        0)
            echo "Script finished successfully on try ${RETRY_NUM}/${MAX_RETRIES}"
            break
            ;;
        7 | 9 | 10)
            # Infra-flavored failures: loop around and retry.
            echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON})."
            ;;
        *)
            echo "Script failed on try ${RETRY_NUM}/${MAX_RETRIES} with exit code ${EXIT_CODE} (${REASON}), aborting."
            break
            ;;
    esac

done

# Publish the last attempt's artifacts to the well-known location.
sudo rm -rf /tmp/ray_release_test_artifacts/* || true
sudo cp -rf "${RELEASE_RESULTS_DIR}"/* /tmp/ray_release_test_artifacts/ || true

echo "----------------------------------------"
echo "e2e test finished with final exit code ${EXIT_CODE} after ${RETRY_NUM}/${MAX_RETRIES} tries"
echo "Run results:"

COUNTER=1
for EX in "${ALL_EXIT_CODES[@]}"; do
    REASON=$(reason "${EX}")
    echo "  Run $COUNTER: Exit code = ${EX} (${REASON})"
    COUNTER=$((COUNTER + 1))
done

echo "----------------------------------------"

REASON=$(reason "${EXIT_CODE}")
echo "Final e2e exit code is ${EXIT_CODE} (${REASON})"

case ${EXIT_CODE} in
    0)
        ;;
    7 | 9 | 10)
        echo "RELEASE MANAGER: This is likely an infra error that can be solved by RESTARTING this test."
        ;;
    *)
        echo "RELEASE MANAGER: This could be an error in the test. Please REVIEW THE LOGS and ping the test owner."
        ;;
esac

exit $EXIT_CODE
|
|
|
@ -1,34 +0,0 @@
|
||||||
- name: rte_many_tasks_actors
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: rte_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/rte_many_tasks_actors.py
|
|
||||||
|
|
||||||
- name: wheel_urls
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: rte_minimal.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 9000 # 2h30m
|
|
||||||
prepare: python wait_cluster.py 1 600
|
|
||||||
script: python workloads/wheel_urls.py
|
|
||||||
|
|
||||||
- name: rte_ray_client
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: rte_minimal.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 1 600
|
|
||||||
script: python workloads/rte_ray_client.py
|
|
|
@ -1,53 +0,0 @@
|
||||||
"""Block until a Ray cluster reaches a target node count.

Usage: python wait_cluster.py <num_nodes> <max_time_s> [--feedback_interval_s N]

Polls ``ray.nodes()`` every 5 seconds and raises RuntimeError if the cluster
does not reach the requested size within ``max_time_s`` seconds.
"""
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)
args = parser.parse_args()

start_time = time.time()
deadline = start_time + args.max_time_s
next_report = start_time
node_count = 0

# Poll until enough nodes have registered with the cluster.
while node_count < args.num_nodes:
    now = time.time()

    if now >= deadline:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{node_count}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_report:
        elapsed = now - start_time
        print(
            f"Waiting for more nodes to come up: "
            f"{node_count}/{args.num_nodes} "
            f"({elapsed:.0f} seconds passed)"
        )
        next_report = now + args.feedback_interval_s

    time.sleep(5)
    node_count = len(ray.nodes())

elapsed = time.time() - start_time
print(
    f"Cluster is up: {node_count}/{args.num_nodes} nodes online after "
    f"{elapsed:.0f} seconds"
)
|
|
|
@ -1,101 +0,0 @@
|
||||||
- name: single_deployment_1k_noop_replica
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: compute_tpl_32_cpu.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/single_deployment_1k_noop_replica.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: multi_deployment_1k_noop_replica
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: compute_tpl_32_cpu.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/multi_deployment_1k_noop_replica.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: autoscaling_single_deployment
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: compute_tpl_8_cpu_autoscaling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/autoscaling_single_deployment.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: autoscaling_multi_deployment
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: compute_tpl_8_cpu_autoscaling.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/autoscaling_multi_deployment.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: serve_micro_benchmark
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
# 16 CPUS
|
|
||||||
compute_template: compute_tpl_single_node.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/serve_micro_benchmark.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: serve_micro_benchmark_k8s
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
# 16 CPUS
|
|
||||||
compute_template: compute_tpl_single_node_k8s.yaml
|
|
||||||
compute_on_k8s: True
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/serve_micro_benchmark.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
||||||
|
|
||||||
- name: serve_cluster_fault_tolerance
|
|
||||||
team: serve
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
# 16 CPUS
|
|
||||||
compute_template: compute_tpl_single_node.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 7200
|
|
||||||
long_running: False
|
|
||||||
script: python workloads/serve_cluster_fault_tolerance.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
timeout: 600
|
|
|
@ -1,11 +0,0 @@
|
||||||
# Test multi-node, multi-GPU Ray SGD example.
- name: sgd_gpu
  team: ml
  cluster:
    app_config: sgd_gpu/sgd_gpu_app_config.yaml
    compute_template: sgd_gpu/sgd_gpu_compute.yaml

  run:
    timeout: 3000
    prepare: python wait_cluster.py 2 600
    script: python sgd_gpu/sgd_gpu_test.py --num-workers=2 --use-gpu --address=auto
|
|
@ -1,53 +0,0 @@
|
||||||
"""Block until a Ray cluster reaches a target node count.

Usage: python wait_cluster.py <num_nodes> <max_time_s> [--feedback_interval_s N]

Polls ``ray.nodes()`` every 5 seconds and raises RuntimeError if the cluster
does not reach the requested size within ``max_time_s`` seconds.
"""
import argparse
import time

import ray

ray.init(address="auto")

parser = argparse.ArgumentParser()
parser.add_argument(
    "num_nodes", type=int, help="Wait for this number of nodes (includes head)"
)
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
parser.add_argument(
    "--feedback_interval_s",
    type=int,
    default=10,
    help="Wait for this number of seconds",
)
args = parser.parse_args()

start_time = time.time()
deadline = start_time + args.max_time_s
next_report = start_time
node_count = 0

# Poll until enough nodes have registered with the cluster.
while node_count < args.num_nodes:
    now = time.time()

    if now >= deadline:
        raise RuntimeError(
            f"Maximum wait time reached, but only "
            f"{node_count}/{args.num_nodes} nodes came up. Aborting."
        )

    if now >= next_report:
        elapsed = now - start_time
        print(
            f"Waiting for more nodes to come up: "
            f"{node_count}/{args.num_nodes} "
            f"({elapsed:.0f} seconds passed)"
        )
        next_report = now + args.feedback_interval_s

    time.sleep(5)
    node_count = len(ray.nodes())

elapsed = time.time() - start_time
print(
    f"Cluster is up: {node_count}/{args.num_nodes} nodes online after "
    f"{elapsed:.0f} seconds"
)
|
|
|
@ -1,27 +0,0 @@
|
||||||
# Specify the test owners (teams) here.
# The root key should be the name of the test yaml file without the .yaml.
# To specify owners of subtests, use a sub dict (see e.g. long_running_tests).
golden_notebook_tests: ml
horovod_tests: ml
lightgbm_tests: ml
long_running_distributed_tests: ml
long_running_tests:
  actor_deaths: core
  apex: ml
  impala: ml
  many_actor_tasks: core
  many_drivers: core
  many_ppo: core
  many_tasks: core
  many_tasks_serialized_ids: core
  node_failures: core
  pbt: ml
  serve: serve
  serve_failure: serve
microbenchmark: core
nightly_tests: core
rllib_tests: ml
runtime_env_tests: serve
serve_tests: serve
sgd_tests: ml
xgboost_tests: ml
|
|
@ -1,118 +0,0 @@
|
||||||
- name: aws_no_sync_down
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py no_sync_down
|
|
||||||
|
|
||||||
- name: aws_ssh_sync
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py ssh_sync
|
|
||||||
|
|
||||||
- name: aws_durable_upload
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload
|
|
||||||
|
|
||||||
- name: aws_durable_upload_rllib_str
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_ml.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str
|
|
||||||
|
|
||||||
- name: aws_durable_upload_rllib_trainer
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_ml.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer
|
|
||||||
|
|
||||||
- name: aws_no_durable_upload
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_aws_4x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload
|
|
||||||
|
|
||||||
- name: gcp_k8s_no_sync_down
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
|
||||||
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 600
|
|
||||||
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
|
|
||||||
script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8
|
|
||||||
|
|
||||||
- name: gcp_k8s_ssh_sync
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
|
||||||
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 600
|
|
||||||
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
|
|
||||||
script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8
|
|
||||||
|
|
||||||
- name: gcp_k8s_durable_upload
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
|
||||||
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 600
|
|
||||||
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
|
|
||||||
script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
|
|
||||||
|
|
||||||
|
|
||||||
- name: gcp_k8s_no_durable_upload
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_gcp_k8s_4x8.yaml
|
|
||||||
cloud_id: cld_k8WcxPgjUtSE8RVmfZpTLuKM # anyscale_k8s_gcp_cloud
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
timeout: 600
|
|
||||||
# Remove --cpus-per-trial 8 once n2-standard-2 is supported
|
|
||||||
script: python workloads/run_cloud_test.py no_durable_upload --cpus-per-trial 8 --bucket gs://jun-riot-test/durable_upload
|
|
|
@ -1,54 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,90 +0,0 @@
|
||||||
- name: bookkeeping_overhead
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_1x16.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 1200
|
|
||||||
script: python workloads/test_bookkeeping_overhead.py
|
|
||||||
|
|
||||||
|
|
||||||
- name: durable_trainable
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_16x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 16 600
|
|
||||||
script: python workloads/test_durable_trainable.py --bucket data-test-ilr
|
|
||||||
|
|
||||||
- name: long_running_large_checkpoints
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_1x32_hd.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 86400
|
|
||||||
script: python workloads/test_long_running_large_checkpoints.py
|
|
||||||
long_running: True
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
|
|
||||||
|
|
||||||
- name: network_overhead
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_100x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare_timeout: 1200
|
|
||||||
prepare: python wait_cluster.py 100 1200
|
|
||||||
script: python workloads/test_network_overhead.py
|
|
||||||
|
|
||||||
smoke_test:
|
|
||||||
cluster:
|
|
||||||
compute_template: tpl_20x2.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 400
|
|
||||||
prepare_timeout: 600
|
|
||||||
prepare: python wait_cluster.py 20 600
|
|
||||||
|
|
||||||
- name: result_throughput_cluster
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_16x64.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 16 600
|
|
||||||
script: python workloads/test_result_throughput_cluster.py
|
|
||||||
|
|
||||||
- name: result_throughput_single_node
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_1x96.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
script: python workloads/test_result_throughput_single_node.py
|
|
||||||
|
|
||||||
- name: xgboost_sweep
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_data.yaml
|
|
||||||
compute_template: tpl_16x64.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 3600
|
|
||||||
prepare: python wait_cluster.py 16 600
|
|
||||||
script: python workloads/test_xgboost_sweep.py
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,53 +0,0 @@
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
|
|
||||||
import ray
|
|
||||||
|
|
||||||
ray.init(address="auto")
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"num_nodes", type=int, help="Wait for this number of nodes (includes head)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("max_time_s", type=int, help="Wait for this number of seconds")
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--feedback_interval_s",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Wait for this number of seconds",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
curr_nodes = 0
|
|
||||||
start = time.time()
|
|
||||||
next_feedback = start
|
|
||||||
max_time = start + args.max_time_s
|
|
||||||
while not curr_nodes >= args.num_nodes:
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
if now >= max_time:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Maximum wait time reached, but only "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} nodes came up. Aborting."
|
|
||||||
)
|
|
||||||
|
|
||||||
if now >= next_feedback:
|
|
||||||
passed = now - start
|
|
||||||
print(
|
|
||||||
f"Waiting for more nodes to come up: "
|
|
||||||
f"{curr_nodes}/{args.num_nodes} "
|
|
||||||
f"({passed:.0f} seconds passed)"
|
|
||||||
)
|
|
||||||
next_feedback = now + args.feedback_interval_s
|
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
curr_nodes = len(ray.nodes())
|
|
||||||
|
|
||||||
passed = time.time() - start
|
|
||||||
print(
|
|
||||||
f"Cluster is up: {curr_nodes}/{args.num_nodes} nodes online after "
|
|
||||||
f"{passed:.0f} seconds"
|
|
||||||
)
|
|
|
@ -1,104 +0,0 @@
|
||||||
- name: train_small
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
use_connect: True
|
|
||||||
autosuspend_mins: 10
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/train_small.py
|
|
||||||
|
|
||||||
- name: train_moderate
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/train_moderate.py
|
|
||||||
|
|
||||||
- name: train_gpu
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config_gpu.yaml
|
|
||||||
compute_template: tpl_gpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 5 600
|
|
||||||
script: python workloads/train_gpu.py
|
|
||||||
|
|
||||||
- name: distributed_api_test
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
results:
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/distributed_api_test.py
|
|
||||||
results: ""
|
|
||||||
|
|
||||||
- name: ft_small_elastic
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/ft_small_elastic.py
|
|
||||||
results: ""
|
|
||||||
|
|
||||||
- name: ft_small_non_elastic
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/ft_small_non_elastic.py
|
|
||||||
results: ""
|
|
||||||
|
|
||||||
- name: tune_small
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_small.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 600
|
|
||||||
prepare: python wait_cluster.py 4 600
|
|
||||||
script: python workloads/tune_small.py
|
|
||||||
|
|
||||||
- name: tune_32x4
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/tune_32x4.py
|
|
||||||
|
|
||||||
- name: tune_4x32
|
|
||||||
team: ml
|
|
||||||
cluster:
|
|
||||||
app_config: app_config.yaml
|
|
||||||
compute_template: tpl_cpu_moderate.yaml
|
|
||||||
|
|
||||||
run:
|
|
||||||
timeout: 900
|
|
||||||
prepare: python wait_cluster.py 32 600
|
|
||||||
script: python workloads/tune_4x32.py
|
|
Loading…
Add table
Reference in a new issue