From 2b38fe89e2737403891c205a63d6c193e4ecd3bb Mon Sep 17 00:00:00 2001
From: SangBin Cho
Date: Sat, 12 Mar 2022 03:41:14 +0900
Subject: [PATCH] [Nightly tests] Migrate rest of core tests (#23085)

Migrate the rest of core tests
---
 release/.buildkite/build_pipeline.py |  88 ++++----
 release/release_tests.yaml           | 326 +++++++++++++++++++++++++++
 2 files changed, 370 insertions(+), 44 deletions(-)

diff --git a/release/.buildkite/build_pipeline.py b/release/.buildkite/build_pipeline.py
index 98d37a4b2..e86f33b5a 100644
--- a/release/.buildkite/build_pipeline.py
+++ b/release/.buildkite/build_pipeline.py
@@ -57,31 +57,31 @@ class SmokeTest(ReleaseTest):
 
 CORE_NIGHTLY_TESTS = {
-    "~/ray/release/nightly_tests/nightly_tests.yaml": [
-        # "shuffle_10gb",
-        # "shuffle_50gb",
-        # "shuffle_50gb_large_partition",
-        # "shuffle_100gb",
-        # "non_streaming_shuffle_100gb",
-        # "non_streaming_shuffle_50gb_large_partition",
-        # "non_streaming_shuffle_50gb",
-        # SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
-        # SmokeTest("dask_on_ray_large_scale_test_spilling"),
-        # "stress_test_placement_group",
-        # "shuffle_1tb_1000_partition",
-        # "non_streaming_shuffle_1tb_1000_partition",
-        # "shuffle_1tb_5000_partitions",
-        # TODO(sang): It doesn't even work without spilling
-        # as it hits the scalability limit.
-        # "non_streaming_shuffle_1tb_5000_partitions",
-        # "decision_tree_autoscaling",
-        # "decision_tree_autoscaling_20_runs",
-        # "autoscaling_shuffle_1tb_1000_partitions",
-        # SmokeTest("stress_test_many_tasks"),
-        # SmokeTest("stress_test_dead_actors"),
-        # SmokeTest("threaded_actors_stress_test"),
-        # "pg_long_running_performance_test",
-    ],
+    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
+    # "shuffle_10gb",
+    # "shuffle_50gb",
+    # "shuffle_50gb_large_partition",
+    # "shuffle_100gb",
+    # "non_streaming_shuffle_100gb",
+    # "non_streaming_shuffle_50gb_large_partition",
+    # "non_streaming_shuffle_50gb",
+    # SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
+    # SmokeTest("dask_on_ray_large_scale_test_spilling"),
+    # "stress_test_placement_group",
+    # "shuffle_1tb_1000_partition",
+    # "non_streaming_shuffle_1tb_1000_partition",
+    # "shuffle_1tb_5000_partitions",
+    # TODO(sang): It doesn't even work without spilling
+    # as it hits the scalability limit.
+ # "non_streaming_shuffle_1tb_5000_partitions", + # "decision_tree_autoscaling", + # "decision_tree_autoscaling_20_runs", + # "autoscaling_shuffle_1tb_1000_partitions", + # SmokeTest("stress_test_many_tasks"), + # SmokeTest("stress_test_dead_actors"), + # SmokeTest("threaded_actors_stress_test"), + # "pg_long_running_performance_test", + # ], # "~/ray/benchmarks/benchmark_tests.yaml": [ # "single_node", # "object_store", @@ -89,21 +89,21 @@ CORE_NIGHTLY_TESTS = { # "many_tasks_smoke_test", # "many_pgs_smoke_test", # ], - "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [ - "inference", - "shuffle_data_loader", - "parquet_metadata_resolution", - "pipelined_training_50_gb", - "pipelined_ingestion_1500_gb", - "datasets_preprocess_ingest", - "datasets_ingest_400G", - SmokeTest("datasets_ingest_train_infer"), - ], - "~/ray/release/nightly_tests/chaos_test.yaml": [ - "chaos_many_actors", - "chaos_many_tasks_no_object_store", - "chaos_pipelined_ingestion_1500_gb_15_windows", - ], + # "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [ + # "inference", + # "shuffle_data_loader", + # "parquet_metadata_resolution", + # "pipelined_training_50_gb", + # "pipelined_ingestion_1500_gb", + # "datasets_preprocess_ingest", + # "datasets_ingest_400G", + # SmokeTest("datasets_ingest_train_infer"), + # ], + # "~/ray/release/nightly_tests/chaos_test.yaml": [ + # "chaos_many_actors", + # "chaos_many_tasks_no_object_store", + # "chaos_pipelined_ingestion_1500_gb_15_windows", + # ], # "~/ray/release/microbenchmark/microbenchmark.yaml": [ # "microbenchmark", # ], @@ -137,10 +137,10 @@ CORE_DAILY_TESTS = { # "stress_test_many_tasks", # "stress_test_dead_actors", # ], - "~/ray/release/nightly_tests/chaos_test.yaml": [ - "chaos_dask_on_ray_large_scale_test_no_spilling", - "chaos_dask_on_ray_large_scale_test_spilling", - ], + # "~/ray/release/nightly_tests/chaos_test.yaml": [ + # "chaos_dask_on_ray_large_scale_test_no_spilling", + # "chaos_dask_on_ray_large_scale_test_spilling", + # ], } CORE_SCALABILITY_TESTS_DAILY = { diff --git a/release/release_tests.yaml b/release/release_tests.yaml index bce4f456c..9085a68cf 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -2391,6 +2391,204 @@ # file_manager: sdk # stable: false +############### +# Dataset tests +############### + +- name: inference + group: core-dataset-tests + working_dir: dataset + legacy: + test_name: inference + test_suite: dataset_test + + frequency: multi + team: core + cluster: + cluster_env: app_config.yaml + cluster_compute: inference.yaml + + run: + timeout: 600 + script: python inference.py + wait_for_nodes: + num_nodes: 2 + timeout: 600 + + type: sdk_command + file_manager: sdk + +- name: shuffle_data_loader + group: core-dataset-tests + working_dir: dataset + legacy: + test_name: shuffle_data_loader + test_suite: dataset_test + + frequency: multi + team: core + cluster: + cluster_env: shuffle_app_config.yaml + cluster_compute: shuffle_compute.yaml + + run: + timeout: 1800 + script: python dataset_shuffle_data_loader.py + type: sdk_command + file_manager: sdk + +- name: parquet_metadata_resolution + group: core-dataset-tests + working_dir: dataset + legacy: + test_name: parquet_metadata_resolution + test_suite: dataset_test + + frequency: multi + team: core + cluster: + cluster_env: pipelined_training_app.yaml + cluster_compute: pipelined_training_compute.yaml + + run: + timeout: 1200 + script: python parquet_metadata_resolution.py --num-files 915 + wait_for_nodes: + num_nodes: 15 + timeout: 1200 + + type: 
+  file_manager: sdk
+
+- name: pipelined_training_50_gb
+  group: core-dataset-tests
+  working_dir: dataset
+  legacy:
+    test_name: pipelined_training_50_gb
+    test_suite: dataset_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: pipelined_training_app.yaml
+    cluster_compute: pipelined_training_compute.yaml
+
+  run:
+    timeout: 4800
+    script: python pipelined_training.py --epochs 1
+    wait_for_nodes:
+      num_nodes: 15
+      timeout: 1200
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: pipelined_ingestion_1500_gb
+  group: core-dataset-tests
+  working_dir: dataset
+  legacy:
+    test_name: pipelined_ingestion_1500_gb
+    test_suite: dataset_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: pipelined_ingestion_app.yaml
+    cluster_compute: pipelined_ingestion_compute.yaml
+
+  run:
+    timeout: 9600
+    script: python pipelined_training.py --epochs 2 --num-windows 2 --num-files 915
+      --debug
+
+    wait_for_nodes:
+      num_nodes: 21
+      timeout: 2400
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: datasets_ingest_train_infer
+  group: core-dataset-tests
+  working_dir: dataset
+  legacy:
+    test_name: datasets_ingest_train_infer
+    test_suite: dataset_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: ray_sgd_training_app.yaml
+    cluster_compute: ray_sgd_training_compute.yaml
+
+  run:
+    timeout: 14400
+    script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu
+      --large-dataset
+
+    wait_for_nodes:
+      num_nodes: 66
+      timeout: 2400
+
+  type: sdk_command
+  file_manager: sdk
+
+  smoke_test:
+    cluster:
+      app_config: ray_sgd_training_app.yaml
+      compute_template: ray_sgd_training_smoke_compute.yaml
+
+    run:
+      timeout: 3600
+      script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
+      wait_for_nodes:
+        num_nodes: 8
+        timeout: 2400
+
+- name: datasets_preprocess_ingest
+  group: core-dataset-tests
+  working_dir: dataset
+  legacy:
+    test_name: datasets_preprocess_ingest
+    test_suite: dataset_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: ray_sgd_training_app.yaml
+    cluster_compute: ray_sgd_training_compute_no_gpu.yaml
+
+  run:
+    timeout: 7200
+    script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu
+      --large-dataset --debug
+
+    wait_for_nodes:
+      num_nodes: 21
+      timeout: 2400
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: datasets_ingest_400G
+  group: core-dataset-tests
+  working_dir: dataset
+  legacy:
+    test_name: datasets_ingest_400G
+    test_suite: dataset_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: ray_sgd_training_app.yaml
+    cluster_compute: dataset_ingest_400G_compute.yaml
+
+  run:
+    timeout: 7200
+    script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1
+  type: sdk_command
+  file_manager: sdk
+
 ################
 # Core K8s tests
 ################
@@ -2472,3 +2670,131 @@
   file_manager: job
 
   stable: false
+
+##################
+# Core Chaos tests
+##################
+
+- name: chaos_many_tasks_no_object_store
+  group: core-dataset-tests
+  working_dir: nightly_tests
+  legacy:
+    test_name: chaos_many_tasks_no_object_store
+    test_suite: chaos_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: chaos_test/app_config.yaml
+    cluster_compute: chaos_test/compute_template.yaml
+
+  run:
+    timeout: 3600
+    wait_for_nodes:
+      num_nodes: 10
+      timeout: 600
+    prepare: python setup_chaos.py --no-start
+    script: python chaos_test/test_chaos_basic.py --workload=tasks
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: chaos_many_actors
+  group: core-dataset-tests
+  working_dir: nightly_tests
+  legacy:
+    test_name: chaos_many_actors
+    test_suite: chaos_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: chaos_test/app_config.yaml
+    cluster_compute: chaos_test/compute_template.yaml
+
+  run:
+    timeout: 3600
+    wait_for_nodes:
+      num_nodes: 10
+      timeout: 600
+    prepare: python setup_chaos.py --no-start
+    script: python chaos_test/test_chaos_basic.py --workload=actors
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: chaos_dask_on_ray_large_scale_test_no_spilling
+  group: core-dataset-tests
+  working_dir: nightly_tests
+  legacy:
+    test_name: chaos_dask_on_ray_large_scale_test_no_spilling
+    test_suite: chaos_test
+
+  frequency: nightly
+  team: core
+  cluster:
+    cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
+    cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
+
+  run:
+    timeout: 7200
+    wait_for_nodes:
+      num_nodes: 21
+      timeout: 600
+    prepare: python setup_chaos.py --node-kill-interval 100
+    script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb
+      20 --error_rate 0 --data_save_path /tmp/ray
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: chaos_dask_on_ray_large_scale_test_spilling
+  group: core-dataset-tests
+  working_dir: nightly_tests
+  legacy:
+    test_name: chaos_dask_on_ray_large_scale_test_spilling
+    test_suite: chaos_test
+
+  frequency: nightly
+  team: core
+  cluster:
+    cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
+    cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
+
+  run:
+    timeout: 7200
+    wait_for_nodes:
+      num_nodes: 21
+      timeout: 600
+    prepare: python setup_chaos.py --node-kill-interval 100
+    script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb
+      70 --error_rate 0 --data_save_path /tmp/ray
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: chaos_pipelined_ingestion_1500_gb_15_windows
+  group: core-dataset-tests
+  working_dir: nightly_tests
+  legacy:
+    test_name: chaos_pipelined_ingestion_1500_gb_15_windows
+    test_suite: chaos_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: dataset/pipelined_ingestion_app.yaml
+    cluster_compute: dataset/pipelined_ingestion_compute.yaml
+
+  run:
+    timeout: 7200
+    wait_for_nodes:
+      num_nodes: 21
+      timeout: 2400
+    prepare: ' python setup_chaos.py --node-kill-interval 300'
+    script: python dataset/pipelined_training.py --epochs 1 --num-windows 15 --num-files
+      915 --debug
+
+  type: sdk_command
+  file_manager: sdk
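
For reference, a minimal sketch (not part of the patch) of how the entries migrated into release/release_tests.yaml above could be enumerated. The file path and the name, group, frequency, and run.script fields are taken from the diff; loading the file directly with PyYAML is an assumption for illustration, not necessarily how the release tooling consumes it.

    # Sketch only: list the migrated core-dataset-tests entries from release_tests.yaml.
    # Assumes the repo-relative path below and that PyYAML is installed; the release
    # tooling may read this file through its own loader instead.
    import yaml

    with open("release/release_tests.yaml") as f:
        tests = yaml.safe_load(f)  # the top-level structure is a list of test dicts

    for test in tests:
        if test.get("group") == "core-dataset-tests":
            print(test["name"], test.get("frequency"), test["run"]["script"])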