[Nightly Test] Add a team column to each test config. (#21198)

Please review **e2e.py and the test suites belonging to your team**! 

This is the first part of https://docs.google.com/document/d/16IrwerYi2oJugnRf5hvzukgpJ6FAVEpB6stH_CiNMjY/edit#

This PR adds a team name to each test suite.

If the team name is not specified, it will be reported as "unspecified". 

If you are running a local test and the new test suite doesn't have a team name specified, an exception is raised (this way, we can avoid missing team names in the future).
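
For illustration, a configured test entry would look roughly like this (the test name and script are hypothetical; per `VALID_TEAMS` in e2e.py, valid team values are ml, core, and serve):

```yaml
- name: example_test                          # hypothetical test name
  team: core                                  # must be one of: ml, core, serve
  cluster:
    app_config: app_config.yaml
    compute_template: compute_tpl.yaml
  run:
    timeout: 3600
    script: python workloads/example_test.py  # hypothetical script
```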

Note that we will aggregate all of the test configs into a single file, nightly_test.yaml.
Commit b5b11b2d06 (parent 3de18d2ada)
SangBin Cho, 2021-12-28 07:42:41 +09:00, committed by GitHub
19 changed files with 183 additions and 124 deletions


@ -1,8 +1,5 @@
- name: single_node
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"
team: core
cluster:
app_config: app_config.yaml
compute_template: single_node.yaml
@ -13,10 +10,7 @@
script: python single_node/test_single_node.py
- name: object_store
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"
team: core
cluster:
app_config: app_config.yaml
compute_template: object_store.yaml
@ -27,10 +21,7 @@
script: python object_store/test_object_store.py
- name: many_actors
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
@ -41,10 +32,7 @@
script: python distributed/test_many_actors.py
- name: many_actors_smoke_test
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
@ -55,10 +43,7 @@
script: SMOKE_TEST=1 python distributed/test_many_actors.py
- name: many_tasks
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
@ -69,10 +54,7 @@
script: python distributed/test_many_tasks.py --num-tasks=10000
- name: many_tasks_smoke_test
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
@ -83,10 +65,7 @@
script: python distributed/test_many_tasks.py --num-tasks=100
- name: many_pgs
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
@ -97,10 +76,7 @@
script: python distributed/test_many_pgs.py
- name: many_pgs_smoke_test
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
@ -112,10 +88,7 @@
# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
- name: many_nodes
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"
team: core
cluster:
app_config: app_config.yaml
compute_template: many_nodes.yaml
@ -126,10 +99,7 @@
script: python distributed/test_many_tasks.py --num-tasks=1000
- name: many_tasks_redis_ha
owner:
mail: "core@anyscale.com"
slack: "@Yi Cheng"
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
@ -146,10 +116,7 @@
stable: false
- name: many_actors_redis_ha
owner:
mail: "core@anyscale.com"
slack: "@Yi Cheng"
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
@ -166,10 +133,7 @@
stable: false
- name: many_nodes_redis_ha
owner:
mail: "core@anyscale.com"
slack: "@Yi Cheng"
team: core
cluster:
app_config: app_config.yaml
compute_template: many_nodes.yaml
@ -186,10 +150,7 @@
stable: false
- name: many_pgs_redis_ha
owner:
mail: "core@anyscale.com"
slack: "@Yi Cheng"
team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml


@ -283,6 +283,7 @@ GLOBAL_CONFIG = {
REPORT_S = 30
RETRY_MULTIPLIER = 2
VALID_TEAMS = ["ml", "core", "serve"]
class ExitCode(enum.Enum):
@ -573,20 +574,17 @@ def maybe_get_alert_for_result(result_dict: Dict[str, Any]) -> Optional[str]:
return alert
def report_result(test_suite: str, test_name: str, status: str, last_logs: str,
results: Dict[Any, Any], artifacts: Dict[Any, Any],
category: str):
def report_result(*, test_suite: str, test_name: str, status: str,
last_logs: str, results: Dict[Any, Any],
artifacts: Dict[Any, Any], category: str, team: str):
# session_url: str, commit_url: str,
# runtime: float, stable: bool, frequency: str, return_code: int):
"""Report the test result to database."""
now = datetime.datetime.utcnow()
rds_data_client = boto3.client("rds-data", region_name="us-west-2")
schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"]
sql = (
f"INSERT INTO {schema} "
f"(created_on, test_suite, test_name, status, last_logs, "
f"results, artifacts, category) "
f"VALUES (:created_on, :test_suite, :test_name, :status, :last_logs, "
f":results, :artifacts, :category)")
parameters = [{
"name": "created_on",
"typeHint": "TIMESTAMP",
@ -630,7 +628,20 @@ def report_result(test_suite: str, test_name: str, status: str, last_logs: str,
"value": {
"stringValue": category
}
}, {
"name": "team",
"value": {
"stringValue": team
}
}]
columns = [param["name"] for param in parameters]
values = [f":{param['name']}" for param in parameters]
column_str = ", ".join(columns).strip(", ")
value_str = ", ".join(values).strip(", ")
sql = (f"INSERT INTO {schema} " f"({column_str}) " f"VALUES ({value_str})")
logger.info(f"Query: {sql}")
# Default boto3 call timeout is 45 seconds.
retry_delay_s = 64
@ -2041,6 +2052,18 @@ def run_test(test_config_file: str,
driver_setup_script = test_config.get("driver_setup", None)
if driver_setup_script:
run_bash_script(local_dir, driver_setup_script)
logger.info(test_config)
team = test_config.get("team", "unspecified").strip(" ").lower()
# When running a local test, this validates the team name.
# If the team name is not specified, it will be recorded as "unspecified".
if not report and team not in VALID_TEAMS:
raise ValueError(
f"Incorrect team name {team} has been given. "
"Please specify the team field under the name field in the test config. "
"For example, within nightly_tests.yaml,\n"
"\tname: test_xxx\n"
f"\tteam: {'|'.join(VALID_TEAMS)}\n"
"\tcluster:...")
result = run_test_config(
local_dir,
@ -2090,7 +2113,7 @@ def run_test(test_config_file: str,
results=result.get("results", {}),
artifacts=result.get("artifacts", {}),
category=category,
)
team=team)
if not has_errored(result):
# Check if result are met if test succeeded
@ -2118,7 +2141,7 @@ def run_test(test_config_file: str,
except Exception as e:
# On database error the test should still pass
# Todo: flag somewhere else?
logger.error(f"Error persisting results to database: {e}")
logger.exception(f"Error persisting results to database: {e}")
else:
logger.info(f"Usually I would now report the following results:\n"
f"{report_kwargs}")


@ -1,7 +1,5 @@
- name: dask_xgboost_test
owner:
mail: "antoni@anyscale.com"
slack: "@team_ml"
team: ml
cluster:
app_config: dask_xgboost_app_config.yaml
compute_template: compute_tpl.yaml
@ -20,9 +18,7 @@
]
- name: modin_xgboost_test
owner:
mail: "antoni@anyscale.com"
slack: "@team_ml"
team: ml
cluster:
app_config: modin_xgboost_app_config.yaml
compute_template: compute_tpl.yaml
@ -41,10 +37,7 @@
]
- name: torch_tune_serve_test
owner:
mail: "matt@anyscale.com"
slack: "@team_ml"
team: ml
cluster:
app_config: torch_tune_serve_app_config.yaml
compute_template: gpu_tpl.yaml


@ -1,4 +1,5 @@
- name: horovod_test
team: ml
cluster:
app_config: app_config_master.yaml
compute_template: compute_tpl.yaml


@ -1,4 +1,5 @@
- name: train_small
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
@ -11,6 +12,7 @@
script: python workloads/train_small.py
- name: train_moderate
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml
@ -21,6 +23,7 @@
script: python workloads/train_moderate.py
- name: train_gpu
team: ml
cluster:
app_config: app_config_gpu.yaml
compute_template: tpl_gpu_small.yaml
@ -31,6 +34,7 @@
script: python workloads/train_gpu.py
- name: distributed_api_test
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
@ -43,6 +47,7 @@
results: ""
- name: ft_small_non_elastic
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
@ -54,6 +59,7 @@
results: ""
- name: tune_small
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
@ -64,6 +70,7 @@
script: python workloads/tune_small.py
- name: tune_32x4
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml
@ -74,6 +81,7 @@
script: python workloads/tune_32x4.py
- name: tune_4x32
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml


@ -1,4 +1,5 @@
- name: pytorch_pbt_failure
team: ml
cluster:
app_config: app_config.yaml
compute_template: compute_tpl.yaml


@ -1,4 +1,5 @@
- name: actor_deaths
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
@ -14,6 +15,7 @@
timeout: 3600
- name: apex
team: ml
cluster:
app_config: ../rllib_tests/app_config.yaml
compute_template: tpl_cpu_3.yaml
@ -30,6 +32,7 @@
- name: impala
team: ml
cluster:
app_config: app_config_np.yaml
compute_template: tpl_cpu_1_large.yaml
@ -44,6 +47,7 @@
timeout: 3600
- name: many_actor_tasks
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
@ -60,6 +64,7 @@
- name: many_drivers
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
@ -76,6 +81,7 @@
- name: many_ppo
team: ml
cluster:
app_config: ../rllib_tests/app_config.yaml
compute_template: many_ppo.yaml
@ -93,6 +99,7 @@
- name: many_tasks
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
@ -108,6 +115,7 @@
timeout: 3600
- name: many_tasks_serialized_ids
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
@ -124,6 +132,7 @@
- name: node_failures
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
@ -139,6 +148,7 @@
timeout: 3600
- name: pbt
team: ml
cluster:
app_config: ../rllib_tests/app_config.yaml
compute_template: tpl_cpu_1.yaml
@ -154,6 +164,7 @@
timeout: 3600
- name: serve
team: serve
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml
@ -169,6 +180,7 @@
timeout: 3600
- name: serve_failure
team: serve
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_1.yaml


@ -1,4 +1,5 @@
- name: microbenchmark
team: core
cluster:
app_config: app_config.yaml
compute_template: tpl_64.yaml


@ -1,4 +1,5 @@
- name: horovod_user_test_latest
team: ml
cluster:
app_config: horovod/app_config.yaml
compute_template: horovod/compute_tpl.yaml
@ -13,6 +14,7 @@
script: python horovod/horovod_user_test.py
- name: horovod_user_test_master
team: ml
cluster:
app_config: ../horovod_tests/app_config_master.yaml
compute_template: horovod/compute_tpl.yaml
@ -26,32 +28,35 @@
script: python horovod/horovod_user_test.py
- name: train_tensorflow_mnist_test
cluster:
app_config: train/app_config.yaml
compute_template: train/compute_tpl.yaml
- name: train_tensorflow_mnist_test
team: ml
cluster:
app_config: train/app_config.yaml
compute_template: train/compute_tpl.yaml
driver_setup: train/driver_setup.sh
driver_setup: train/driver_setup.sh
run:
use_connect: True
timeout: 36000
script: python train/train_tensorflow_mnist_test.py
run:
use_connect: True
timeout: 36000
script: python train/train_tensorflow_mnist_test.py
- name: train_torch_linear_test
cluster:
app_config: train/app_config.yaml
compute_template: train/compute_tpl.yaml
- name: train_torch_linear_test
team: ml
cluster:
app_config: train/app_config.yaml
compute_template: train/compute_tpl.yaml
driver_setup: train/driver_setup.sh
driver_setup: train/driver_setup.sh
run:
use_connect: True
timeout: 36000
script: python train/train_torch_linear_test.py
run:
use_connect: True
timeout: 36000
script: python train/train_torch_linear_test.py
- name: xgboost_gpu_connect_latest
team: ml
cluster:
app_config: xgboost/app_config_gpu.yaml
compute_template: xgboost/tpl_gpu_small_scaling.yaml
@ -62,6 +67,7 @@
script: python xgboost/train_gpu_connect.py
- name: xgboost_gpu_connect_master
team: ml
cluster:
app_config: xgboost/app_config_gpu_master.yaml
compute_template: xgboost/tpl_gpu_small_scaling.yaml
@ -72,6 +78,7 @@
script: python xgboost/train_gpu_connect.py
- name: ray_lightning_user_test_latest
team: ml
cluster:
app_config: ray-lightning/app_config.yaml
compute_template: ray-lightning/compute_tpl.yaml
@ -86,6 +93,7 @@
- name: ray_lightning_user_test_master
team: ml
cluster:
app_config: ray-lightning/app_config_master.yaml
compute_template: ray-lightning/compute_tpl.yaml
@ -101,6 +109,7 @@
- name: tune_rllib_connect_test
team: ml
cluster:
app_config: ../rllib_tests/app_config.yaml
compute_template: tune_rllib/compute_tpl.yaml


@ -4,6 +4,7 @@
# Run the test that invokes many tasks without object store usage.
- name: chaos_many_tasks_no_object_store
team: core
cluster:
app_config: chaos_test/app_config.yaml
compute_template: chaos_test/compute_template.yaml
@ -14,6 +15,7 @@
script: python chaos_test/test_chaos_basic.py --workload=tasks
- name: chaos_many_actors
team: core
cluster:
app_config: chaos_test/app_config.yaml
compute_template: chaos_test/compute_template.yaml
@ -24,6 +26,7 @@
script: python chaos_test/test_chaos_basic.py --workload=actors
- name: chaos_dask_on_ray_large_scale_test_no_spilling
team: core
cluster:
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
@ -37,6 +40,7 @@
# Test large scale dask on ray test with spilling.
- name: chaos_dask_on_ray_large_scale_test_spilling
team: core
cluster:
app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
@ -49,10 +53,7 @@
stable: false
- name: chaos_pipelined_ingestion_1500_gb_15_windows
owner:
mail: "core@anyscale.com"
slack: "@Chen Shen"
team: core
cluster:
app_config: dataset/pipelined_ingestion_app.yaml
compute_template: dataset/pipelined_ingestion_compute.yaml


@ -1,8 +1,5 @@
- name: inference
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"
team: core
cluster:
app_config: app_config.yaml
compute_template: inference.yaml
@ -13,10 +10,7 @@
script: python inference.py
- name: shuffle_data_loader
owner:
mail: "core@anyscale.com"
slack: "@Chen Shen"
team: core
cluster:
app_config: shuffle_app_config.yaml
compute_template: shuffle_compute.yaml
@ -26,10 +20,7 @@
script: python dataset_shuffle_data_loader.py
- name: pipelined_training_50_gb
owner:
mail: "core@anyscale.com"
slack: "@Chen Shen"
team: core
cluster:
app_config: pipelined_training_app.yaml
compute_template: pipelined_training_compute.yaml
@ -40,10 +31,7 @@
script: python pipelined_training.py --epochs 1
- name: pipelined_ingestion_1500_gb_15_windows
owner:
mail: "core@anyscale.com"
slack: "@Chen Shen"
team: core
cluster:
app_config: pipelined_ingestion_app.yaml
compute_template: pipelined_ingestion_compute.yaml
@ -54,10 +42,7 @@
script: python pipelined_training.py --epochs 2 --num-windows 15 --num-files 915 --debug
- name: datasets_ingest_train_infer
owner:
mail: "core@anyscale.com"
slack: "@Chen Shen"
team: core
cluster:
app_config: ray_sgd_training_app.yaml
compute_template: ray_sgd_training_compute.yaml
@ -80,10 +65,7 @@
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
- name: datasets_preprocess_ingest
owner:
mail: "core@anyscale.com"
slack: "@Chen Shen"
team: core
cluster:
app_config: ray_sgd_training_app.yaml
compute_template: ray_sgd_training_compute_no_gpu.yaml
@ -96,10 +78,7 @@
stable: false
- name: datasets_ingest_400G
owner:
mail: "core@anyscale.com"
slack: "@Chen Shen"
team: core
cluster:
app_config: ray_sgd_training_app.yaml
compute_template: dataset_ingest_400G_compute.yaml


@ -1,10 +1,10 @@
#
# Single node shuffle
#
# Test basic single node 10GB shuffle with a small number of partitions.
# This doesn't require object spilling.
- name: shuffle_10gb
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
@ -15,6 +15,7 @@
# Test single node 50GB shuffle with a large number of partitions.
- name: shuffle_50gb
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
@ -25,6 +26,7 @@
# Test single node 50GB shuffle with a large number of partitions.
- name: shuffle_50gb_large_partition
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
@ -35,6 +37,7 @@
# Test non streaming shuffle in a single node with a small number of partition.
- name: non_streaming_shuffle_50gb
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
@ -45,6 +48,7 @@
# Test non streaming shuffle in a single node with a large number of partition.
- name: non_streaming_shuffle_50gb_large_partition
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
@ -54,6 +58,7 @@
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming
- name: dask_on_ray_10gb_sort
team: core
cluster:
app_config: dask_on_ray/dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
@ -63,6 +68,7 @@
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray
- name: dask_on_ray_100gb_sort
team: core
cluster:
app_config: dask_on_ray/dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml
@ -77,6 +83,7 @@
# Test multi nodes 100GB shuffle with a small number of partitions.
- name: shuffle_100gb
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_multi.yaml
@ -88,6 +95,7 @@
# Test non streaming multi nodes 100GB shuffle with a small number of partitions.
- name: non_streaming_shuffle_100gb
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_multi.yaml
@ -99,6 +107,7 @@
# Test autoscaling 1TB streaming shuffle with a large number of partitions.
- name: autoscaling_shuffle_1tb_1000_partitions
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_autoscaling.yaml
@ -109,6 +118,7 @@
# Test multi nodes 1TB streaming shuffle with a large number of partitions.
- name: shuffle_1tb_1000_partition
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_large_scale.yaml
@ -120,6 +130,7 @@
# Test multi nodes 1TB non streaming shuffle with a large number of partitions.
- name: non_streaming_shuffle_1tb_1000_partition
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_large_scale.yaml
@ -131,6 +142,7 @@
# Stress test for 1TB multi node streaming shuffle.
- name: shuffle_1tb_5000_partitions
team: core
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_large_scale.yaml
@ -142,6 +154,7 @@
# Stress test for 1TB multi node non-streaming shuffle.
# - name: non_streaming_shuffle_1tb_5000_partitions
# team: core
# stable: False
# cluster:
# app_config: shuffle/shuffle_app_config.yaml
@ -154,6 +167,7 @@
# Test large scale dask on ray test without spilling.
- name: dask_on_ray_large_scale_test_no_spilling
team: core
cluster:
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
@ -175,6 +189,7 @@
# Test large scale dask on ray test with spilling.
- name: dask_on_ray_large_scale_test_spilling
team: core
cluster:
app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml
@ -196,6 +211,7 @@
# Stress tests with many tasks
- name: stress_test_many_tasks
team: core
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/stress_tests_compute.yaml
@ -215,6 +231,7 @@
# Stress tests with dead actors
- name: stress_test_dead_actors
team: core
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/stress_tests_compute.yaml
@ -234,6 +251,7 @@
# Stress tests with placement groups
- name: stress_test_placement_group
team: core
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/placement_group_tests_compute.yaml
@ -244,6 +262,7 @@
# Stress tests with many threaded actors.
- name: threaded_actors_stress_test
team: core
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
compute_template: stress_tests/stress_test_threaded_actor_compute.yaml
@ -266,6 +285,7 @@
# Test decision tree on autoscaling compute cluster.
- name: decision_tree_autoscaling
team: core
cluster:
app_config: decision_tree/decision_tree_app_config.yaml
compute_template: decision_tree/autoscaling_compute.yaml
@ -276,6 +296,7 @@
# Test 20 concurrent decision tree runs on autoscaling compute cluster.
- name: decision_tree_autoscaling_20_runs
team: core
cluster:
app_config: decision_tree/decision_tree_app_config.yaml
compute_template: decision_tree/autoscaling_compute.yaml
@ -285,6 +306,7 @@
# Stress test shuffle_data_loader.
- name: shuffle_data_loader
team: core
cluster:
app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
compute_template: shuffle_data_loader/shuffle_data_loader_compute.yaml
@ -307,6 +329,7 @@
# Stress test shuffle_data_loader.
- name: shuffle_data_loader_4_nodes
team: core
cluster:
app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml
compute_template: shuffle_data_loader/shuffle_data_loader_compute_4_nodes.yaml
@ -329,6 +352,7 @@
--no-stats
- name: dask_on_ray_1tb_sort
team: core
cluster:
app_config: dask_on_ray/dask_on_ray_app_config.yaml
compute_template: dask_on_ray/1tb_sort_compute.yaml
@ -339,6 +363,7 @@
script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test
- name: many_nodes_actor_test
team: core
cluster:
app_config: many_nodes_tests/app_config.yaml
compute_template: many_nodes_tests/compute_config.yaml
@ -349,6 +374,7 @@
script: python many_nodes_tests/actor_test.py
- name: pg_autoscaling_regression_test
team: core
cluster:
app_config: placement_group_tests/app_config.yaml
compute_template: placement_group_tests/compute.yaml
@ -358,6 +384,7 @@
script: python placement_group_tests/pg_run.py
- name: pg_long_running_performance_test
team: core
cluster:
app_config: placement_group_tests/app_config.yaml
compute_template: placement_group_tests/long_running_test_compute.yaml
@ -368,6 +395,7 @@
script: python placement_group_tests/long_running_performance_test.py --num-stages 2000
- name: placement_group_performance_test
team: core
cluster:
app_config: placement_group_tests/app_config.yaml
compute_template: placement_group_tests/pg_perf_test_compute.yaml


@ -1,5 +1,6 @@
# Heavy learning tests (Atari and HalfCheetah) for major algos.
- name: learning_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 8gpus_64cpus.yaml
@ -14,6 +15,7 @@
# 2-GPU learning tests (CartPole and RepeatAfterMeEnv) for major algos.
- name: multi_gpu_learning_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 8gpus_96cpus.yaml
@ -25,6 +27,7 @@
# 2-GPU learning tests (StatelessCartPole) + use_lstm=True for major algos
# (that support RNN models).
- name: multi_gpu_with_lstm_learning_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 8gpus_96cpus.yaml
@ -36,6 +39,7 @@
# 2-GPU learning tests (StatelessCartPole) + use_attention=True for major
# algos (that support RNN models).
- name: multi_gpu_with_attention_learning_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 8gpus_96cpus.yaml
@ -46,6 +50,7 @@
# We'll have these as per-PR tests soon.
# - name: example_scripts_on_gpu_tests
# team: ml
# cluster:
# app_config: app_config.yaml
# compute_template: 1gpu_4cpus.yaml
@ -56,6 +61,7 @@
# IMPALA large machine stress tests (4x Atari).
- name: stress_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 4gpus_544_cpus.yaml
@ -71,6 +77,7 @@
# Tests that exercise auto-scaling and Anyscale connect.
- name: connect_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: auto_scale.yaml
@ -86,6 +93,7 @@
# Performance metrics, such as reward achieved and throughput, are then
# collected and tracked over time.
- name: performance_tests
team: ml
cluster:
app_config: app_config.yaml
compute_template: 12gpus_192cpus.yaml


@ -1,4 +1,5 @@
- name: rte_many_tasks_actors
team: serve
cluster:
app_config: app_config.yaml
compute_template: rte_small.yaml
@ -9,6 +10,7 @@
script: python workloads/rte_many_tasks_actors.py
- name: wheel_urls
team: serve
cluster:
app_config: app_config.yaml
compute_template: rte_minimal.yaml
@ -19,6 +21,7 @@
script: python workloads/wheel_urls.py
- name: rte_ray_client
team: serve
cluster:
app_config: app_config.yaml
compute_template: rte_minimal.yaml


@ -1,4 +1,5 @@
- name: single_deployment_1k_noop_replica
team: serve
cluster:
app_config: app_config.yaml
compute_template: compute_tpl_8_cpu.yaml
@ -12,6 +13,7 @@
timeout: 600
- name: multi_deployment_1k_noop_replica
team: serve
cluster:
app_config: app_config.yaml
compute_template: compute_tpl_8_cpu.yaml
@ -25,6 +27,7 @@
timeout: 600
- name: serve_micro_benchmark
team: serve
cluster:
app_config: app_config.yaml
# 16 CPUS
@ -39,6 +42,7 @@
timeout: 600
- name: serve_cluster_fault_tolerance
team: serve
cluster:
app_config: app_config.yaml
# 16 CPUS


@ -1,5 +1,6 @@
# Test multi-node, multi-GPU Ray SGD example.
- name: sgd_gpu
team: ml
cluster:
app_config: sgd_gpu/sgd_gpu_app_config.yaml
compute_template: sgd_gpu/sgd_gpu_compute.yaml


@ -1,4 +1,5 @@
- name: aws_no_sync_down
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_aws_4x2.yaml
@ -9,6 +10,7 @@
script: python workloads/run_cloud_test.py no_sync_down
- name: aws_ssh_sync
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_aws_4x2.yaml
@ -19,6 +21,7 @@
script: python workloads/run_cloud_test.py ssh_sync
- name: aws_durable_upload
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_aws_4x2.yaml
@ -29,6 +32,7 @@
script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload
- name: aws_durable_upload_rllib_str
team: ml
cluster:
app_config: app_config_ml.yaml
compute_template: tpl_aws_4x2.yaml
@ -39,6 +43,7 @@
script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str
- name: aws_durable_upload_rllib_trainer
team: ml
cluster:
app_config: app_config_ml.yaml
compute_template: tpl_aws_4x2.yaml
@ -49,6 +54,7 @@
script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer
- name: aws_no_durable_upload
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_aws_4x2.yaml
@ -59,6 +65,7 @@
script: python workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload
- name: gcp_k8s_no_sync_down
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_gcp_k8s_4x8.yaml
@ -71,6 +78,7 @@
script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8
- name: gcp_k8s_ssh_sync
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_gcp_k8s_4x8.yaml
@ -83,6 +91,7 @@
script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8
- name: gcp_k8s_durable_upload
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_gcp_k8s_4x8.yaml
@ -96,6 +105,7 @@
- name: gcp_k8s_no_durable_upload
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_gcp_k8s_4x8.yaml


@ -1,4 +1,5 @@
- name: bookkeeping_overhead
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_1x16.yaml
@ -9,6 +10,7 @@
- name: durable_trainable
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_16x2.yaml
@ -19,6 +21,7 @@
script: python workloads/test_durable_trainable.py --bucket data-test-ilr
- name: long_running_large_checkpoints
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_1x32_hd.yaml
@ -34,6 +37,7 @@
- name: network_overhead
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_100x2.yaml
@ -54,6 +58,7 @@
prepare: python wait_cluster.py 20 600
- name: result_throughput_cluster
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_16x64.yaml
@ -64,6 +69,7 @@
script: python workloads/test_result_throughput_cluster.py
- name: result_throughput_single_node
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_1x96.yaml
@ -73,6 +79,7 @@
script: python workloads/test_result_throughput_single_node.py
- name: xgboost_sweep
team: ml
cluster:
app_config: app_config_data.yaml
compute_template: tpl_16x64.yaml


@ -1,4 +1,5 @@
- name: train_small
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
@ -11,6 +12,7 @@
script: python workloads/train_small.py
- name: train_moderate
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml
@ -21,6 +23,7 @@
script: python workloads/train_moderate.py
- name: train_gpu
team: ml
cluster:
app_config: app_config_gpu.yaml
compute_template: tpl_gpu_small.yaml
@ -31,6 +34,7 @@
script: python workloads/train_gpu.py
- name: distributed_api_test
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
@ -43,6 +47,7 @@
results: ""
- name: ft_small_elastic
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
@ -54,6 +59,7 @@
results: ""
- name: ft_small_non_elastic
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
@ -65,6 +71,7 @@
results: ""
- name: tune_small
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_small.yaml
@ -75,6 +82,7 @@
script: python workloads/tune_small.py
- name: tune_32x4
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml
@ -85,6 +93,7 @@
script: python workloads/tune_32x4.py
- name: tune_4x32
team: ml
cluster:
app_config: app_config.yaml
compute_template: tpl_cpu_moderate.yaml