diff --git a/benchmarks/benchmark_tests.yaml b/benchmarks/benchmark_tests.yaml index 13fa72917..3172ae69b 100644 --- a/benchmarks/benchmark_tests.yaml +++ b/benchmarks/benchmark_tests.yaml @@ -1,8 +1,5 @@ - name: single_node - owner: - mail: "core@anyscale.com" - slack: "@Alex Wu" - + team: core cluster: app_config: app_config.yaml compute_template: single_node.yaml @@ -13,10 +10,7 @@ script: python single_node/test_single_node.py - name: object_store - owner: - mail: "core@anyscale.com" - slack: "@Alex Wu" - + team: core cluster: app_config: app_config.yaml compute_template: object_store.yaml @@ -27,10 +21,7 @@ script: python object_store/test_object_store.py - name: many_actors - owner: - mail: "core@anyscale.com" - slack: "@Alex Wu" - + team: core cluster: app_config: app_config.yaml compute_template: distributed.yaml @@ -41,10 +32,7 @@ script: python distributed/test_many_actors.py - name: many_actors_smoke_test - owner: - mail: "core@anyscale.com" - slack: "@Alex Wu" - + team: core cluster: app_config: app_config.yaml compute_template: distributed_smoke_test.yaml @@ -55,10 +43,7 @@ script: SMOKE_TEST=1 python distributed/test_many_actors.py - name: many_tasks - owner: - mail: "core@anyscale.com" - slack: "@Alex Wu" - + team: core cluster: app_config: app_config.yaml compute_template: distributed.yaml @@ -69,10 +54,7 @@ script: python distributed/test_many_tasks.py --num-tasks=10000 - name: many_tasks_smoke_test - owner: - mail: "core@anyscale.com" - slack: "@Alex Wu" - + team: core cluster: app_config: app_config.yaml compute_template: distributed_smoke_test.yaml @@ -83,10 +65,7 @@ script: python distributed/test_many_tasks.py --num-tasks=100 - name: many_pgs - owner: - mail: "core@anyscale.com" - slack: "@Alex Wu" - + team: core cluster: app_config: app_config.yaml compute_template: distributed.yaml @@ -97,10 +76,7 @@ script: python distributed/test_many_pgs.py - name: many_pgs_smoke_test - owner: - mail: "core@anyscale.com" - slack: "@Alex Wu" - + team: core 
cluster: app_config: app_config.yaml compute_template: distributed_smoke_test.yaml @@ -112,10 +88,7 @@ # NOTE: No smoke test since this shares a script with the many_tasks_smoke_test - name: many_nodes - owner: - mail: "core@anyscale.com" - slack: "@Alex Wu" - + team: core cluster: app_config: app_config.yaml compute_template: many_nodes.yaml @@ -126,10 +99,7 @@ script: python distributed/test_many_tasks.py --num-tasks=1000 - name: many_tasks_redis_ha - owner: - mail: "core@anyscale.com" - slack: "@Yi Cheng" - + team: core cluster: app_config: app_config.yaml compute_template: distributed.yaml @@ -146,10 +116,7 @@ stable: false - name: many_actors_redis_ha - owner: - mail: "core@anyscale.com" - slack: "@Yi Cheng" - + team: core cluster: app_config: app_config.yaml compute_template: distributed.yaml @@ -166,10 +133,7 @@ stable: false - name: many_nodes_redis_ha - owner: - mail: "core@anyscale.com" - slack: "@Yi Cheng" - + team: core cluster: app_config: app_config.yaml compute_template: many_nodes.yaml @@ -186,10 +150,7 @@ stable: false - name: many_pgs_redis_ha - owner: - mail: "core@anyscale.com" - slack: "@Yi Cheng" - + team: core cluster: app_config: app_config.yaml compute_template: distributed.yaml diff --git a/release/e2e.py b/release/e2e.py index 4366fbaa3..5cdb8115b 100644 --- a/release/e2e.py +++ b/release/e2e.py @@ -283,6 +283,7 @@ GLOBAL_CONFIG = { REPORT_S = 30 RETRY_MULTIPLIER = 2 +VALID_TEAMS = ["ml", "core", "serve"] class ExitCode(enum.Enum): @@ -573,20 +574,17 @@ def maybe_get_alert_for_result(result_dict: Dict[str, Any]) -> Optional[str]: return alert -def report_result(test_suite: str, test_name: str, status: str, last_logs: str, - results: Dict[Any, Any], artifacts: Dict[Any, Any], - category: str): +def report_result(*, test_suite: str, test_name: str, status: str, + last_logs: str, results: Dict[Any, Any], + artifacts: Dict[Any, Any], category: str, team: str): + # session_url: str, commit_url: str, + # runtime: float, stable: bool, frequency: 
str, return_code: int): + """Report the test result to database.""" now = datetime.datetime.utcnow() rds_data_client = boto3.client("rds-data", region_name="us-west-2") schema = GLOBAL_CONFIG["RELEASE_AWS_DB_TABLE"] - sql = ( - f"INSERT INTO {schema} " - f"(created_on, test_suite, test_name, status, last_logs, " - f"results, artifacts, category) " - f"VALUES (:created_on, :test_suite, :test_name, :status, :last_logs, " - f":results, :artifacts, :category)") parameters = [{ "name": "created_on", "typeHint": "TIMESTAMP", @@ -630,7 +628,20 @@ def report_result(test_suite: str, test_name: str, status: str, last_logs: str, "value": { "stringValue": category } + }, { + "name": "team", + "value": { + "stringValue": team + } }] + columns = [param["name"] for param in parameters] + values = [f":{param['name']}" for param in parameters] + column_str = ", ".join(columns).strip(", ") + value_str = ", ".join(values).strip(", ") + + sql = (f"INSERT INTO {schema} " f"({column_str}) " f"VALUES ({value_str})") + + logger.info(f"Query: {sql}") # Default boto3 call timeout is 45 seconds. retry_delay_s = 64 @@ -2041,6 +2052,18 @@ def run_test(test_config_file: str, driver_setup_script = test_config.get("driver_setup", None) if driver_setup_script: run_bash_script(local_dir, driver_setup_script) + logger.info(test_config) + team = test_config.get("team", "unspecified").strip(" ").lower() + # When running local test, this validates the team name. + # If the team name is not specified, it will be recorded as "unspecified" + if not report and team not in VALID_TEAMS: + raise ValueError( + f"Incorrect team name {team} was given. " + "Please specify team under the team field in the test config. 
" + "For example, within nightly_tests.yaml,\n" + "\tname: test_xxx\n" + f"\tteam: {'|'.join(VALID_TEAMS)}\n" + "\tcluster:...") result = run_test_config( local_dir, @@ -2090,7 +2113,7 @@ def run_test(test_config_file: str, results=result.get("results", {}), artifacts=result.get("artifacts", {}), category=category, - ) + team=team) if not has_errored(result): # Check if result are met if test succeeded @@ -2118,7 +2141,7 @@ def run_test(test_config_file: str, except Exception as e: # On database error the test should still pass # Todo: flag somewhere else? - logger.error(f"Error persisting results to database: {e}") + logger.exception(f"Error persisting results to database: {e}") else: logger.info(f"Usually I would now report the following results:\n" f"{report_kwargs}") diff --git a/release/golden_notebook_tests/golden_notebook_tests.yaml b/release/golden_notebook_tests/golden_notebook_tests.yaml index e6d5838d1..fd1d04302 100644 --- a/release/golden_notebook_tests/golden_notebook_tests.yaml +++ b/release/golden_notebook_tests/golden_notebook_tests.yaml @@ -1,7 +1,5 @@ - name: dask_xgboost_test - owner: - mail: "antoni@anyscale.com" - slack: "@team_ml" + team: ml cluster: app_config: dask_xgboost_app_config.yaml compute_template: compute_tpl.yaml @@ -20,9 +18,7 @@ ] - name: modin_xgboost_test - owner: - mail: "antoni@anyscale.com" - slack: "@team_ml" + team: ml cluster: app_config: modin_xgboost_app_config.yaml compute_template: compute_tpl.yaml @@ -41,10 +37,7 @@ ] - name: torch_tune_serve_test - owner: - mail: "matt@anyscale.com" - slack: "@team_ml" - + team: ml cluster: app_config: torch_tune_serve_app_config.yaml compute_template: gpu_tpl.yaml diff --git a/release/horovod_tests/horovod_tests.yaml b/release/horovod_tests/horovod_tests.yaml index 87c99a420..ce0abe719 100644 --- a/release/horovod_tests/horovod_tests.yaml +++ b/release/horovod_tests/horovod_tests.yaml @@ -1,4 +1,5 @@ - name: horovod_test + team: ml cluster: app_config: app_config_master.yaml 
compute_template: compute_tpl.yaml diff --git a/release/lightgbm_tests/lightgbm_tests.yaml b/release/lightgbm_tests/lightgbm_tests.yaml index 2ac0b182e..07aa9e5cf 100644 --- a/release/lightgbm_tests/lightgbm_tests.yaml +++ b/release/lightgbm_tests/lightgbm_tests.yaml @@ -1,4 +1,5 @@ - name: train_small + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_small.yaml @@ -11,6 +12,7 @@ script: python workloads/train_small.py - name: train_moderate + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_moderate.yaml @@ -21,6 +23,7 @@ script: python workloads/train_moderate.py - name: train_gpu + team: ml cluster: app_config: app_config_gpu.yaml compute_template: tpl_gpu_small.yaml @@ -31,6 +34,7 @@ script: python workloads/train_gpu.py - name: distributed_api_test + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_small.yaml @@ -43,6 +47,7 @@ results: "" - name: ft_small_non_elastic + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_small.yaml @@ -54,6 +59,7 @@ results: "" - name: tune_small + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_small.yaml @@ -64,6 +70,7 @@ script: python workloads/tune_small.py - name: tune_32x4 + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_moderate.yaml @@ -74,6 +81,7 @@ script: python workloads/tune_32x4.py - name: tune_4x32 + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_moderate.yaml diff --git a/release/long_running_distributed_tests/long_running_distributed.yaml b/release/long_running_distributed_tests/long_running_distributed.yaml index 132cb9ea3..189ffd3f9 100644 --- a/release/long_running_distributed_tests/long_running_distributed.yaml +++ b/release/long_running_distributed_tests/long_running_distributed.yaml @@ -1,4 +1,5 @@ - name: pytorch_pbt_failure + team: ml cluster: app_config: app_config.yaml compute_template: compute_tpl.yaml diff --git 
a/release/long_running_tests/long_running_tests.yaml b/release/long_running_tests/long_running_tests.yaml index bc2f195f8..d7be5f10b 100644 --- a/release/long_running_tests/long_running_tests.yaml +++ b/release/long_running_tests/long_running_tests.yaml @@ -1,4 +1,5 @@ - name: actor_deaths + team: core cluster: app_config: app_config.yaml compute_template: tpl_cpu_1.yaml @@ -14,6 +15,7 @@ timeout: 3600 - name: apex + team: ml cluster: app_config: ../rllib_tests/app_config.yaml compute_template: tpl_cpu_3.yaml @@ -30,6 +32,7 @@ - name: impala + team: ml cluster: app_config: app_config_np.yaml compute_template: tpl_cpu_1_large.yaml @@ -44,6 +47,7 @@ timeout: 3600 - name: many_actor_tasks + team: core cluster: app_config: app_config.yaml compute_template: tpl_cpu_1.yaml @@ -60,6 +64,7 @@ - name: many_drivers + team: core cluster: app_config: app_config.yaml compute_template: tpl_cpu_1.yaml @@ -76,6 +81,7 @@ - name: many_ppo + team: ml cluster: app_config: ../rllib_tests/app_config.yaml compute_template: many_ppo.yaml @@ -93,6 +99,7 @@ - name: many_tasks + team: core cluster: app_config: app_config.yaml compute_template: tpl_cpu_1.yaml @@ -108,6 +115,7 @@ timeout: 3600 - name: many_tasks_serialized_ids + team: core cluster: app_config: app_config.yaml compute_template: tpl_cpu_1.yaml @@ -124,6 +132,7 @@ - name: node_failures + team: core cluster: app_config: app_config.yaml compute_template: tpl_cpu_1.yaml @@ -139,6 +148,7 @@ timeout: 3600 - name: pbt + team: ml cluster: app_config: ../rllib_tests/app_config.yaml compute_template: tpl_cpu_1.yaml @@ -154,6 +164,7 @@ timeout: 3600 - name: serve + team: serve cluster: app_config: app_config.yaml compute_template: tpl_cpu_1.yaml @@ -169,6 +180,7 @@ timeout: 3600 - name: serve_failure + team: serve cluster: app_config: app_config.yaml compute_template: tpl_cpu_1.yaml diff --git a/release/microbenchmark/microbenchmark.yaml b/release/microbenchmark/microbenchmark.yaml index 1f33bca81..ffe8137f1 100644 --- 
a/release/microbenchmark/microbenchmark.yaml +++ b/release/microbenchmark/microbenchmark.yaml @@ -1,4 +1,5 @@ - name: microbenchmark + team: core cluster: app_config: app_config.yaml compute_template: tpl_64.yaml diff --git a/release/ml_user_tests/ml_user_tests.yaml b/release/ml_user_tests/ml_user_tests.yaml index ebdb5ebc6..8c6a8162e 100644 --- a/release/ml_user_tests/ml_user_tests.yaml +++ b/release/ml_user_tests/ml_user_tests.yaml @@ -1,4 +1,5 @@ - name: horovod_user_test_latest + team: ml cluster: app_config: horovod/app_config.yaml compute_template: horovod/compute_tpl.yaml @@ -13,6 +14,7 @@ script: python horovod/horovod_user_test.py - name: horovod_user_test_master + team: ml cluster: app_config: ../horovod_tests/app_config_master.yaml compute_template: horovod/compute_tpl.yaml @@ -26,32 +28,35 @@ script: python horovod/horovod_user_test.py -- name: train_tensorflow_mnist_test - cluster: - app_config: train/app_config.yaml - compute_template: train/compute_tpl.yaml +- name: train_tensorflow_mnist_test + team: ml + cluster: + app_config: train/app_config.yaml + compute_template: train/compute_tpl.yaml - driver_setup: train/driver_setup.sh + driver_setup: train/driver_setup.sh - run: - use_connect: True - timeout: 36000 - script: python train/train_tensorflow_mnist_test.py + run: + use_connect: True + timeout: 36000 + script: python train/train_tensorflow_mnist_test.py -- name: train_torch_linear_test - cluster: - app_config: train/app_config.yaml - compute_template: train/compute_tpl.yaml +- name: train_torch_linear_test + team: ml + cluster: + app_config: train/app_config.yaml + compute_template: train/compute_tpl.yaml - driver_setup: train/driver_setup.sh + driver_setup: train/driver_setup.sh - run: - use_connect: True - timeout: 36000 - script: python train/train_torch_linear_test.py + run: + use_connect: True + timeout: 36000 + script: python train/train_torch_linear_test.py - name: xgboost_gpu_connect_latest + team: ml cluster: app_config: 
xgboost/app_config_gpu.yaml compute_template: xgboost/tpl_gpu_small_scaling.yaml @@ -62,6 +67,7 @@ script: python xgboost/train_gpu_connect.py - name: xgboost_gpu_connect_master + team: ml cluster: app_config: xgboost/app_config_gpu_master.yaml compute_template: xgboost/tpl_gpu_small_scaling.yaml @@ -72,6 +78,7 @@ script: python xgboost/train_gpu_connect.py - name: ray_lightning_user_test_latest + team: ml cluster: app_config: ray-lightning/app_config.yaml compute_template: ray-lightning/compute_tpl.yaml @@ -86,6 +93,7 @@ - name: ray_lightning_user_test_master + team: ml cluster: app_config: ray-lightning/app_config_master.yaml compute_template: ray-lightning/compute_tpl.yaml @@ -101,6 +109,7 @@ - name: tune_rllib_connect_test + team: ml cluster: app_config: ../rllib_tests/app_config.yaml compute_template: tune_rllib/compute_tpl.yaml diff --git a/release/nightly_tests/chaos_test.yaml b/release/nightly_tests/chaos_test.yaml index c9eec413b..cc2838cf5 100644 --- a/release/nightly_tests/chaos_test.yaml +++ b/release/nightly_tests/chaos_test.yaml @@ -4,6 +4,7 @@ # Run the test that invokes many tasks without object store usage. - name: chaos_many_tasks_no_object_store + team: core cluster: app_config: chaos_test/app_config.yaml compute_template: chaos_test/compute_template.yaml @@ -14,6 +15,7 @@ script: python chaos_test/test_chaos_basic.py --workload=tasks - name: chaos_many_actors + team: core cluster: app_config: chaos_test/app_config.yaml compute_template: chaos_test/compute_template.yaml @@ -24,6 +26,7 @@ script: python chaos_test/test_chaos_basic.py --workload=actors - name: chaos_dask_on_ray_large_scale_test_no_spilling + team: core cluster: app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml @@ -37,6 +40,7 @@ # Test large scale dask on ray test with spilling. 
- name: chaos_dask_on_ray_large_scale_test_spilling + team: core cluster: app_config: chaos_test/dask_on_ray_app_config_reconstruction.yaml compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml @@ -49,10 +53,7 @@ stable: false - name: chaos_pipelined_ingestion_1500_gb_15_windows - owner: - mail: "core@anyscale.com" - slack: "@Chen Shen" - + team: core cluster: app_config: dataset/pipelined_ingestion_app.yaml compute_template: dataset/pipelined_ingestion_compute.yaml diff --git a/release/nightly_tests/dataset/dataset_test.yaml b/release/nightly_tests/dataset/dataset_test.yaml index 7103f5158..c65a7c6fc 100644 --- a/release/nightly_tests/dataset/dataset_test.yaml +++ b/release/nightly_tests/dataset/dataset_test.yaml @@ -1,8 +1,5 @@ - name: inference - owner: - mail: "core@anyscale.com" - slack: "@Alex Wu" - + team: core cluster: app_config: app_config.yaml compute_template: inference.yaml @@ -13,10 +10,7 @@ script: python inference.py - name: shuffle_data_loader - owner: - mail: "core@anyscale.com" - slack: "@Chen Shen" - + team: core cluster: app_config: shuffle_app_config.yaml compute_template: shuffle_compute.yaml @@ -26,10 +20,7 @@ script: python dataset_shuffle_data_loader.py - name: pipelined_training_50_gb - owner: - mail: "core@anyscale.com" - slack: "@Chen Shen" - + team: core cluster: app_config: pipelined_training_app.yaml compute_template: pipelined_training_compute.yaml @@ -40,10 +31,7 @@ script: python pipelined_training.py --epochs 1 - name: pipelined_ingestion_1500_gb_15_windows - owner: - mail: "core@anyscale.com" - slack: "@Chen Shen" - + team: core cluster: app_config: pipelined_ingestion_app.yaml compute_template: pipelined_ingestion_compute.yaml @@ -54,10 +42,7 @@ script: python pipelined_training.py --epochs 2 --num-windows 15 --num-files 915 --debug - name: datasets_ingest_train_infer - owner: - mail: "core@anyscale.com" - slack: "@Chen Shen" - + team: core cluster: app_config: ray_sgd_training_app.yaml compute_template: 
ray_sgd_training_compute.yaml @@ -80,10 +65,7 @@ script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu - name: datasets_preprocess_ingest - owner: - mail: "core@anyscale.com" - slack: "@Chen Shen" - + team: core cluster: app_config: ray_sgd_training_app.yaml compute_template: ray_sgd_training_compute_no_gpu.yaml @@ -96,10 +78,7 @@ stable: false - name: datasets_ingest_400G - owner: - mail: "core@anyscale.com" - slack: "@Chen Shen" - + team: core cluster: app_config: ray_sgd_training_app.yaml compute_template: dataset_ingest_400G_compute.yaml diff --git a/release/nightly_tests/nightly_tests.yaml b/release/nightly_tests/nightly_tests.yaml index 8a6e25602..fc48f81ef 100644 --- a/release/nightly_tests/nightly_tests.yaml +++ b/release/nightly_tests/nightly_tests.yaml @@ -1,10 +1,10 @@ # # Single node shuffle # - # Test basic single node 10GB shuffle with a small number of partitions. # This doesn't require object spilling. - name: shuffle_10gb + team: core cluster: app_config: shuffle/shuffle_app_config.yaml compute_template: shuffle/shuffle_compute_single.yaml @@ -15,6 +15,7 @@ # Test single node 50GB shuffle with a large number of partitions. - name: shuffle_50gb + team: core cluster: app_config: shuffle/shuffle_app_config.yaml compute_template: shuffle/shuffle_compute_single.yaml @@ -25,6 +26,7 @@ # Test single node 50GB shuffle with a large number of partitions. - name: shuffle_50gb_large_partition + team: core cluster: app_config: shuffle/shuffle_app_config.yaml compute_template: shuffle/shuffle_compute_single.yaml @@ -35,6 +37,7 @@ # Test non streaming shuffle in a single node with a small number of partition. - name: non_streaming_shuffle_50gb + team: core cluster: app_config: shuffle/shuffle_app_config.yaml compute_template: shuffle/shuffle_compute_single.yaml @@ -45,6 +48,7 @@ # Test non streaming shuffle in a single node with a large number of partition. 
- name: non_streaming_shuffle_50gb_large_partition + team: core cluster: app_config: shuffle/shuffle_app_config.yaml compute_template: shuffle/shuffle_compute_single.yaml @@ -54,6 +58,7 @@ script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming - name: dask_on_ray_10gb_sort + team: core cluster: app_config: dask_on_ray/dask_on_ray_app_config.yaml compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml @@ -63,6 +68,7 @@ script: python dask_on_ray/dask_on_ray_sort.py --nbytes 10_000_000_000 --npartitions 50 --num-nodes 1 --ray --data-dir /tmp/ray --file-path /tmp/ray - name: dask_on_ray_100gb_sort + team: core cluster: app_config: dask_on_ray/dask_on_ray_app_config.yaml compute_template: dask_on_ray/dask_on_ray_sort_compute_template.yaml @@ -77,6 +83,7 @@ # Test multi nodes 100GB shuffle with a small number of partitions. - name: shuffle_100gb + team: core cluster: app_config: shuffle/shuffle_app_config.yaml compute_template: shuffle/shuffle_compute_multi.yaml @@ -88,6 +95,7 @@ # Test non streaming multi nodes 100GB shuffle with a small number of partitions. - name: non_streaming_shuffle_100gb + team: core cluster: app_config: shuffle/shuffle_app_config.yaml compute_template: shuffle/shuffle_compute_multi.yaml @@ -99,6 +107,7 @@ # Test autoscaling 1TB streaming shuffle with a large number of partitions. - name: autoscaling_shuffle_1tb_1000_partitions + team: core cluster: app_config: shuffle/shuffle_app_config.yaml compute_template: shuffle/shuffle_compute_autoscaling.yaml @@ -109,6 +118,7 @@ # Test multi nodes 1TB streaming shuffle with a large number of partitions. - name: shuffle_1tb_1000_partition + team: core cluster: app_config: shuffle/shuffle_app_config.yaml compute_template: shuffle/shuffle_compute_large_scale.yaml @@ -120,6 +130,7 @@ # Test multi nodes 1TB non streaming shuffle with a large number of partitions. 
- name: non_streaming_shuffle_1tb_1000_partition + team: core cluster: app_config: shuffle/shuffle_app_config.yaml compute_template: shuffle/shuffle_compute_large_scale.yaml @@ -131,6 +142,7 @@ # Stress test for 1TB multi node streaming shuffle. - name: shuffle_1tb_5000_partitions + team: core cluster: app_config: shuffle/shuffle_app_config.yaml compute_template: shuffle/shuffle_compute_large_scale.yaml @@ -142,6 +154,7 @@ # Stress test for 1TB multi node non-streaming shuffle. # - name: non_streaming_shuffle_1tb_5000_partitions +# team: core # stable: False # cluster: # app_config: shuffle/shuffle_app_config.yaml @@ -154,6 +167,7 @@ # Test large scale dask on ray test without spilling. - name: dask_on_ray_large_scale_test_no_spilling + team: core cluster: app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml @@ -175,6 +189,7 @@ # Test large scale dask on ray test with spilling. - name: dask_on_ray_large_scale_test_spilling + team: core cluster: app_config: dask_on_ray/large_scale_dask_on_ray_app_config.yaml compute_template: dask_on_ray/dask_on_ray_stress_compute.yaml @@ -196,6 +211,7 @@ # Stress tests with many tasks - name: stress_test_many_tasks + team: core cluster: app_config: stress_tests/stress_tests_app_config.yaml compute_template: stress_tests/stress_tests_compute.yaml @@ -215,6 +231,7 @@ # Stress tests with dead actors - name: stress_test_dead_actors + team: core cluster: app_config: stress_tests/stress_tests_app_config.yaml compute_template: stress_tests/stress_tests_compute.yaml @@ -234,6 +251,7 @@ # Stress tests with placement groups - name: stress_test_placement_group + team: core cluster: app_config: stress_tests/stress_tests_app_config.yaml compute_template: stress_tests/placement_group_tests_compute.yaml @@ -244,6 +262,7 @@ # Stress tests with many threaded actors. 
- name: threaded_actors_stress_test + team: core cluster: app_config: stress_tests/stress_tests_app_config.yaml compute_template: stress_tests/stress_test_threaded_actor_compute.yaml @@ -266,6 +285,7 @@ # Test decision tree on autoscaling compute cluster. - name: decision_tree_autoscaling + team: core cluster: app_config: decision_tree/decision_tree_app_config.yaml compute_template: decision_tree/autoscaling_compute.yaml @@ -276,6 +296,7 @@ # Test 20 concurrent decision tree runs on autoscaling compute cluster. - name: decision_tree_autoscaling_20_runs + team: core cluster: app_config: decision_tree/decision_tree_app_config.yaml compute_template: decision_tree/autoscaling_compute.yaml @@ -285,6 +306,7 @@ # Stress test shuffle_data_loader. - name: shuffle_data_loader + team: core cluster: app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml compute_template: shuffle_data_loader/shuffle_data_loader_compute.yaml @@ -307,6 +329,7 @@ # Stress test shuffle_data_loader. - name: shuffle_data_loader_4_nodes + team: core cluster: app_config: shuffle_data_loader/shuffle_data_loader_app_config.yaml compute_template: shuffle_data_loader/shuffle_data_loader_compute_4_nodes.yaml @@ -329,6 +352,7 @@ --no-stats - name: dask_on_ray_1tb_sort + team: core cluster: app_config: dask_on_ray/dask_on_ray_app_config.yaml compute_template: dask_on_ray/1tb_sort_compute.yaml @@ -339,6 +363,7 @@ script: python dask_on_ray/dask_on_ray_sort.py --nbytes 1_000_000_000_000 --npartitions 1000 --num-nodes 31 --ray --data-dir /tmp/ray --s3-bucket core-nightly-test - name: many_nodes_actor_test + team: core cluster: app_config: many_nodes_tests/app_config.yaml compute_template: many_nodes_tests/compute_config.yaml @@ -349,6 +374,7 @@ script: python many_nodes_tests/actor_test.py - name: pg_autoscaling_regression_test + team: core cluster: app_config: placement_group_tests/app_config.yaml compute_template: placement_group_tests/compute.yaml @@ -358,6 +384,7 @@ script: python 
placement_group_tests/pg_run.py - name: pg_long_running_performance_test + team: core cluster: app_config: placement_group_tests/app_config.yaml compute_template: placement_group_tests/long_running_test_compute.yaml @@ -368,6 +395,7 @@ script: python placement_group_tests/long_running_performance_test.py --num-stages 2000 - name: placement_group_performance_test + team: core cluster: app_config: placement_group_tests/app_config.yaml compute_template: placement_group_tests/pg_perf_test_compute.yaml diff --git a/release/rllib_tests/rllib_tests.yaml b/release/rllib_tests/rllib_tests.yaml index 452040601..67c15d2ab 100644 --- a/release/rllib_tests/rllib_tests.yaml +++ b/release/rllib_tests/rllib_tests.yaml @@ -1,5 +1,6 @@ # Heavy learning tests (Atari and HalfCheetah) for major algos. - name: learning_tests + team: ml cluster: app_config: app_config.yaml compute_template: 8gpus_64cpus.yaml @@ -14,6 +15,7 @@ # 2-GPU learning tests (CartPole and RepeatAfterMeEnv) for major algos. - name: multi_gpu_learning_tests + team: ml cluster: app_config: app_config.yaml compute_template: 8gpus_96cpus.yaml @@ -25,6 +27,7 @@ # 2-GPU learning tests (StatelessCartPole) + use_lstm=True for major algos # (that support RNN models). - name: multi_gpu_with_lstm_learning_tests + team: ml cluster: app_config: app_config.yaml compute_template: 8gpus_96cpus.yaml @@ -36,6 +39,7 @@ # 2-GPU learning tests (StatelessCartPole) + use_attention=True for major # algos (that support RNN models). - name: multi_gpu_with_attention_learning_tests + team: ml cluster: app_config: app_config.yaml compute_template: 8gpus_96cpus.yaml @@ -46,6 +50,7 @@ # We'll have these as per-PR tests soon. # - name: example_scripts_on_gpu_tests +# team: ml # cluster: # app_config: app_config.yaml # compute_template: 1gpu_4cpus.yaml @@ -56,6 +61,7 @@ # IMPALA large machine stress tests (4x Atari). 
- name: stress_tests + team: ml cluster: app_config: app_config.yaml compute_template: 4gpus_544_cpus.yaml @@ -71,6 +77,7 @@ # Tests that exercise auto-scaling and Anyscale connect. - name: connect_tests + team: ml cluster: app_config: app_config.yaml compute_template: auto_scale.yaml @@ -86,6 +93,7 @@ # Performance metrics, such as reward achieved and throughput, are then # collected and tracked over time. - name: performance_tests + team: ml cluster: app_config: app_config.yaml compute_template: 12gpus_192cpus.yaml diff --git a/release/runtime_env_tests/runtime_env_tests.yaml b/release/runtime_env_tests/runtime_env_tests.yaml index 3fd06a04b..7a55da490 100644 --- a/release/runtime_env_tests/runtime_env_tests.yaml +++ b/release/runtime_env_tests/runtime_env_tests.yaml @@ -1,4 +1,5 @@ - name: rte_many_tasks_actors + team: serve cluster: app_config: app_config.yaml compute_template: rte_small.yaml @@ -9,6 +10,7 @@ script: python workloads/rte_many_tasks_actors.py - name: wheel_urls + team: serve cluster: app_config: app_config.yaml compute_template: rte_minimal.yaml @@ -19,6 +21,7 @@ script: python workloads/wheel_urls.py - name: rte_ray_client + team: serve cluster: app_config: app_config.yaml compute_template: rte_minimal.yaml diff --git a/release/serve_tests/serve_tests.yaml b/release/serve_tests/serve_tests.yaml index 4362ca296..6fb007c89 100644 --- a/release/serve_tests/serve_tests.yaml +++ b/release/serve_tests/serve_tests.yaml @@ -1,4 +1,5 @@ - name: single_deployment_1k_noop_replica + team: serve cluster: app_config: app_config.yaml compute_template: compute_tpl_8_cpu.yaml @@ -12,6 +13,7 @@ timeout: 600 - name: multi_deployment_1k_noop_replica + team: serve cluster: app_config: app_config.yaml compute_template: compute_tpl_8_cpu.yaml @@ -25,6 +27,7 @@ timeout: 600 - name: serve_micro_benchmark + team: serve cluster: app_config: app_config.yaml # 16 CPUS @@ -39,6 +42,7 @@ timeout: 600 - name: serve_cluster_fault_tolerance + team: serve cluster: app_config: 
app_config.yaml # 16 CPUS diff --git a/release/sgd_tests/sgd_tests.yaml b/release/sgd_tests/sgd_tests.yaml index 7c538ef58..cb0d4d5c3 100644 --- a/release/sgd_tests/sgd_tests.yaml +++ b/release/sgd_tests/sgd_tests.yaml @@ -1,5 +1,6 @@ # Test multi-node, multi-GPU Ray SGD example. - name: sgd_gpu + team: ml cluster: app_config: sgd_gpu/sgd_gpu_app_config.yaml compute_template: sgd_gpu/sgd_gpu_compute.yaml diff --git a/release/tune_tests/cloud_tests/tune_cloud_tests.yaml b/release/tune_tests/cloud_tests/tune_cloud_tests.yaml index f4c3f0933..72279931e 100644 --- a/release/tune_tests/cloud_tests/tune_cloud_tests.yaml +++ b/release/tune_tests/cloud_tests/tune_cloud_tests.yaml @@ -1,4 +1,5 @@ - name: aws_no_sync_down + team: ml cluster: app_config: app_config.yaml compute_template: tpl_aws_4x2.yaml @@ -9,6 +10,7 @@ script: python workloads/run_cloud_test.py no_sync_down - name: aws_ssh_sync + team: ml cluster: app_config: app_config.yaml compute_template: tpl_aws_4x2.yaml @@ -19,6 +21,7 @@ script: python workloads/run_cloud_test.py ssh_sync - name: aws_durable_upload + team: ml cluster: app_config: app_config.yaml compute_template: tpl_aws_4x2.yaml @@ -29,6 +32,7 @@ script: python workloads/run_cloud_test.py durable_upload --bucket s3://data-test-ilr/durable_upload - name: aws_durable_upload_rllib_str + team: ml cluster: app_config: app_config_ml.yaml compute_template: tpl_aws_4x2.yaml @@ -39,6 +43,7 @@ script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str --bucket s3://data-test-ilr/durable_upload_rllib_str - name: aws_durable_upload_rllib_trainer + team: ml cluster: app_config: app_config_ml.yaml compute_template: tpl_aws_4x2.yaml @@ -49,6 +54,7 @@ script: python workloads/run_cloud_test.py durable_upload --trainable rllib_trainer --bucket s3://data-test-ilr/durable_upload_rllib_trainer - name: aws_no_durable_upload + team: ml cluster: app_config: app_config.yaml compute_template: tpl_aws_4x2.yaml @@ -59,6 +65,7 @@ script: python 
workloads/run_cloud_test.py no_durable_upload --bucket s3://data-test-ilr/durable_upload - name: gcp_k8s_no_sync_down + team: ml cluster: app_config: app_config.yaml compute_template: tpl_gcp_k8s_4x8.yaml @@ -71,6 +78,7 @@ script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8 - name: gcp_k8s_ssh_sync + team: ml cluster: app_config: app_config.yaml compute_template: tpl_gcp_k8s_4x8.yaml @@ -83,6 +91,7 @@ script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8 - name: gcp_k8s_durable_upload + team: ml cluster: app_config: app_config.yaml compute_template: tpl_gcp_k8s_4x8.yaml @@ -96,6 +105,7 @@ - name: gcp_k8s_no_durable_upload + team: ml cluster: app_config: app_config.yaml compute_template: tpl_gcp_k8s_4x8.yaml diff --git a/release/tune_tests/scalability_tests/tune_tests.yaml b/release/tune_tests/scalability_tests/tune_tests.yaml index 30ab44769..ba8a5a230 100644 --- a/release/tune_tests/scalability_tests/tune_tests.yaml +++ b/release/tune_tests/scalability_tests/tune_tests.yaml @@ -1,4 +1,5 @@ - name: bookkeeping_overhead + team: ml cluster: app_config: app_config.yaml compute_template: tpl_1x16.yaml @@ -9,6 +10,7 @@ - name: durable_trainable + team: ml cluster: app_config: app_config.yaml compute_template: tpl_16x2.yaml @@ -19,6 +21,7 @@ script: python workloads/test_durable_trainable.py --bucket data-test-ilr - name: long_running_large_checkpoints + team: ml cluster: app_config: app_config.yaml compute_template: tpl_1x32_hd.yaml @@ -34,6 +37,7 @@ - name: network_overhead + team: ml cluster: app_config: app_config.yaml compute_template: tpl_100x2.yaml @@ -54,6 +58,7 @@ prepare: python wait_cluster.py 20 600 - name: result_throughput_cluster + team: ml cluster: app_config: app_config.yaml compute_template: tpl_16x64.yaml @@ -64,6 +69,7 @@ script: python workloads/test_result_throughput_cluster.py - name: result_throughput_single_node + team: ml cluster: app_config: app_config.yaml compute_template: tpl_1x96.yaml @@ -73,6 +79,7 
@@ script: python workloads/test_result_throughput_single_node.py - name: xgboost_sweep + team: ml cluster: app_config: app_config_data.yaml compute_template: tpl_16x64.yaml diff --git a/release/xgboost_tests/xgboost_tests.yaml b/release/xgboost_tests/xgboost_tests.yaml index e095500fe..264443308 100644 --- a/release/xgboost_tests/xgboost_tests.yaml +++ b/release/xgboost_tests/xgboost_tests.yaml @@ -1,4 +1,5 @@ - name: train_small + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_small.yaml @@ -11,6 +12,7 @@ script: python workloads/train_small.py - name: train_moderate + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_moderate.yaml @@ -21,6 +23,7 @@ script: python workloads/train_moderate.py - name: train_gpu + team: ml cluster: app_config: app_config_gpu.yaml compute_template: tpl_gpu_small.yaml @@ -31,6 +34,7 @@ script: python workloads/train_gpu.py - name: distributed_api_test + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_small.yaml @@ -43,6 +47,7 @@ results: "" - name: ft_small_elastic + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_small.yaml @@ -54,6 +59,7 @@ results: "" - name: ft_small_non_elastic + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_small.yaml @@ -65,6 +71,7 @@ results: "" - name: tune_small + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_small.yaml @@ -75,6 +82,7 @@ script: python workloads/tune_small.py - name: tune_32x4 + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_moderate.yaml @@ -85,6 +93,7 @@ script: python workloads/tune_32x4.py - name: tune_4x32 + team: ml cluster: app_config: app_config.yaml compute_template: tpl_cpu_moderate.yaml