From 2b38fe89e2737403891c205a63d6c193e4ecd3bb Mon Sep 17 00:00:00 2001
From: SangBin Cho
Date: Sat, 12 Mar 2022 03:41:14 +0900
Subject: [PATCH] [Nightly tests] Migrate rest of core tests (#23085)

Migrate the rest of core tests
---
 release/.buildkite/build_pipeline.py |  88 ++++----
 release/release_tests.yaml           | 326 +++++++++++++++++++++++++++
 2 files changed, 370 insertions(+), 44 deletions(-)

diff --git a/release/.buildkite/build_pipeline.py b/release/.buildkite/build_pipeline.py
index 98d37a4b2..e86f33b5a 100644
--- a/release/.buildkite/build_pipeline.py
+++ b/release/.buildkite/build_pipeline.py
@@ -57,31 +57,31 @@ class SmokeTest(ReleaseTest):
 
 CORE_NIGHTLY_TESTS = {
-    "~/ray/release/nightly_tests/nightly_tests.yaml": [
-        # "shuffle_10gb",
-        # "shuffle_50gb",
-        # "shuffle_50gb_large_partition",
-        # "shuffle_100gb",
-        # "non_streaming_shuffle_100gb",
-        # "non_streaming_shuffle_50gb_large_partition",
-        # "non_streaming_shuffle_50gb",
-        # SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
-        # SmokeTest("dask_on_ray_large_scale_test_spilling"),
-        # "stress_test_placement_group",
-        # "shuffle_1tb_1000_partition",
-        # "non_streaming_shuffle_1tb_1000_partition",
-        # "shuffle_1tb_5000_partitions",
-        # TODO(sang): It doesn't even work without spilling
-        # as it hits the scalability limit.
-        # "non_streaming_shuffle_1tb_5000_partitions",
-        # "decision_tree_autoscaling",
-        # "decision_tree_autoscaling_20_runs",
-        # "autoscaling_shuffle_1tb_1000_partitions",
-        # SmokeTest("stress_test_many_tasks"),
-        # SmokeTest("stress_test_dead_actors"),
-        # SmokeTest("threaded_actors_stress_test"),
-        # "pg_long_running_performance_test",
-    ],
+    # "~/ray/release/nightly_tests/nightly_tests.yaml": [
+    # "shuffle_10gb",
+    # "shuffle_50gb",
+    # "shuffle_50gb_large_partition",
+    # "shuffle_100gb",
+    # "non_streaming_shuffle_100gb",
+    # "non_streaming_shuffle_50gb_large_partition",
+    # "non_streaming_shuffle_50gb",
+    # SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
+    # SmokeTest("dask_on_ray_large_scale_test_spilling"),
+    # "stress_test_placement_group",
+    # "shuffle_1tb_1000_partition",
+    # "non_streaming_shuffle_1tb_1000_partition",
+    # "shuffle_1tb_5000_partitions",
+    # TODO(sang): It doesn't even work without spilling
+    # as it hits the scalability limit.
+ # "non_streaming_shuffle_1tb_5000_partitions", + # "decision_tree_autoscaling", + # "decision_tree_autoscaling_20_runs", + # "autoscaling_shuffle_1tb_1000_partitions", + # SmokeTest("stress_test_many_tasks"), + # SmokeTest("stress_test_dead_actors"), + # SmokeTest("threaded_actors_stress_test"), + # "pg_long_running_performance_test", + # ], # "~/ray/benchmarks/benchmark_tests.yaml": [ # "single_node", # "object_store", @@ -89,21 +89,21 @@ CORE_NIGHTLY_TESTS = { # "many_tasks_smoke_test", # "many_pgs_smoke_test", # ], - "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [ - "inference", - "shuffle_data_loader", - "parquet_metadata_resolution", - "pipelined_training_50_gb", - "pipelined_ingestion_1500_gb", - "datasets_preprocess_ingest", - "datasets_ingest_400G", - SmokeTest("datasets_ingest_train_infer"), - ], - "~/ray/release/nightly_tests/chaos_test.yaml": [ - "chaos_many_actors", - "chaos_many_tasks_no_object_store", - "chaos_pipelined_ingestion_1500_gb_15_windows", - ], + # "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [ + # "inference", + # "shuffle_data_loader", + # "parquet_metadata_resolution", + # "pipelined_training_50_gb", + # "pipelined_ingestion_1500_gb", + # "datasets_preprocess_ingest", + # "datasets_ingest_400G", + # SmokeTest("datasets_ingest_train_infer"), + # ], + # "~/ray/release/nightly_tests/chaos_test.yaml": [ + # "chaos_many_actors", + # "chaos_many_tasks_no_object_store", + # "chaos_pipelined_ingestion_1500_gb_15_windows", + # ], # "~/ray/release/microbenchmark/microbenchmark.yaml": [ # "microbenchmark", # ], @@ -137,10 +137,10 @@ CORE_DAILY_TESTS = { # "stress_test_many_tasks", # "stress_test_dead_actors", # ], - "~/ray/release/nightly_tests/chaos_test.yaml": [ - "chaos_dask_on_ray_large_scale_test_no_spilling", - "chaos_dask_on_ray_large_scale_test_spilling", - ], + # "~/ray/release/nightly_tests/chaos_test.yaml": [ + # "chaos_dask_on_ray_large_scale_test_no_spilling", + # "chaos_dask_on_ray_large_scale_test_spilling", + # ], } CORE_SCALABILITY_TESTS_DAILY = { diff --git a/release/release_tests.yaml b/release/release_tests.yaml index bce4f456c..9085a68cf 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -2391,6 +2391,204 @@ # file_manager: sdk # stable: false +############### +# Dataset tests +############### + +- name: inference + group: core-dataset-tests + working_dir: dataset + legacy: + test_name: inference + test_suite: dataset_test + + frequency: multi + team: core + cluster: + cluster_env: app_config.yaml + cluster_compute: inference.yaml + + run: + timeout: 600 + script: python inference.py + wait_for_nodes: + num_nodes: 2 + timeout: 600 + + type: sdk_command + file_manager: sdk + +- name: shuffle_data_loader + group: core-dataset-tests + working_dir: dataset + legacy: + test_name: shuffle_data_loader + test_suite: dataset_test + + frequency: multi + team: core + cluster: + cluster_env: shuffle_app_config.yaml + cluster_compute: shuffle_compute.yaml + + run: + timeout: 1800 + script: python dataset_shuffle_data_loader.py + type: sdk_command + file_manager: sdk + +- name: parquet_metadata_resolution + group: core-dataset-tests + working_dir: dataset + legacy: + test_name: parquet_metadata_resolution + test_suite: dataset_test + + frequency: multi + team: core + cluster: + cluster_env: pipelined_training_app.yaml + cluster_compute: pipelined_training_compute.yaml + + run: + timeout: 1200 + script: python parquet_metadata_resolution.py --num-files 915 + wait_for_nodes: + num_nodes: 15 + timeout: 1200 + + type: 
+  file_manager: sdk
+
+- name: pipelined_training_50_gb
+  group: core-dataset-tests
+  working_dir: dataset
+  legacy:
+    test_name: pipelined_training_50_gb
+    test_suite: dataset_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: pipelined_training_app.yaml
+    cluster_compute: pipelined_training_compute.yaml
+
+  run:
+    timeout: 4800
+    script: python pipelined_training.py --epochs 1
+    wait_for_nodes:
+      num_nodes: 15
+      timeout: 1200
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: pipelined_ingestion_1500_gb
+  group: core-dataset-tests
+  working_dir: dataset
+  legacy:
+    test_name: pipelined_ingestion_1500_gb
+    test_suite: dataset_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: pipelined_ingestion_app.yaml
+    cluster_compute: pipelined_ingestion_compute.yaml
+
+  run:
+    timeout: 9600
+    script: python pipelined_training.py --epochs 2 --num-windows 2 --num-files 915
+      --debug
+
+    wait_for_nodes:
+      num_nodes: 21
+      timeout: 2400
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: datasets_ingest_train_infer
+  group: core-dataset-tests
+  working_dir: dataset
+  legacy:
+    test_name: datasets_ingest_train_infer
+    test_suite: dataset_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: ray_sgd_training_app.yaml
+    cluster_compute: ray_sgd_training_compute.yaml
+
+  run:
+    timeout: 14400
+    script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu
+      --large-dataset
+
+    wait_for_nodes:
+      num_nodes: 66
+      timeout: 2400
+
+  type: sdk_command
+  file_manager: sdk
+
+  smoke_test:
+    cluster:
+      app_config: ray_sgd_training_app.yaml
+      compute_template: ray_sgd_training_smoke_compute.yaml
+
+    run:
+      timeout: 3600
+      script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
+      wait_for_nodes:
+        num_nodes: 8
+        timeout: 2400
+
+- name: datasets_preprocess_ingest
+  group: core-dataset-tests
+  working_dir: dataset
+  legacy:
+    test_name: datasets_preprocess_ingest
+    test_suite: dataset_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: ray_sgd_training_app.yaml
+    cluster_compute: ray_sgd_training_compute_no_gpu.yaml
+
+  run:
+    timeout: 7200
+    script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu
+      --large-dataset --debug
+
+    wait_for_nodes:
+      num_nodes: 21
+      timeout: 2400
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: datasets_ingest_400G
+  group: core-dataset-tests
+  working_dir: dataset
+  legacy:
+    test_name: datasets_ingest_400G
+    test_suite: dataset_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: ray_sgd_training_app.yaml
+    cluster_compute: dataset_ingest_400G_compute.yaml
+
+  run:
+    timeout: 7200
+    script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1
+  type: sdk_command
+  file_manager: sdk
+
 ################
 # Core K8s tests
 ################
@@ -2472,3 +2670,131 @@
   file_manager: job
 
   stable: false
+
+##################
+# Core Chaos tests
+##################
+
+- name: chaos_many_tasks_no_object_store
+  group: core-dataset-tests
+  working_dir: nightly_tests
+  legacy:
+    test_name: chaos_many_tasks_no_object_store
+    test_suite: chaos_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: chaos_test/app_config.yaml
+    cluster_compute: chaos_test/compute_template.yaml
+
+  run:
+    timeout: 3600
+    wait_for_nodes:
+      num_nodes: 10
+      timeout: 600
+    prepare: python setup_chaos.py --no-start
+    script: python chaos_test/test_chaos_basic.py --workload=tasks
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: chaos_many_actors
+  group: core-dataset-tests
+  working_dir: nightly_tests
+  legacy:
+    test_name: chaos_many_actors
+    test_suite: chaos_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: chaos_test/app_config.yaml
+    cluster_compute: chaos_test/compute_template.yaml
+
+  run:
+    timeout: 3600
+    wait_for_nodes:
+      num_nodes: 10
+      timeout: 600
+    prepare: python setup_chaos.py --no-start
+    script: python chaos_test/test_chaos_basic.py --workload=actors
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: chaos_dask_on_ray_large_scale_test_no_spilling
+  group: core-dataset-tests
+  working_dir: nightly_tests
+  legacy:
+    test_name: chaos_dask_on_ray_large_scale_test_no_spilling
+    test_suite: chaos_test
+
+  frequency: nightly
+  team: core
+  cluster:
+    cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
+    cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
+
+  run:
+    timeout: 7200
+    wait_for_nodes:
+      num_nodes: 21
+      timeout: 600
+    prepare: python setup_chaos.py --node-kill-interval 100
+    script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb
+      20 --error_rate 0 --data_save_path /tmp/ray
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: chaos_dask_on_ray_large_scale_test_spilling
+  group: core-dataset-tests
+  working_dir: nightly_tests
+  legacy:
+    test_name: chaos_dask_on_ray_large_scale_test_spilling
+    test_suite: chaos_test
+
+  frequency: nightly
+  team: core
+  cluster:
+    cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
+    cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
+
+  run:
+    timeout: 7200
+    wait_for_nodes:
+      num_nodes: 21
+      timeout: 600
+    prepare: python setup_chaos.py --node-kill-interval 100
+    script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb
+      70 --error_rate 0 --data_save_path /tmp/ray
+
+  type: sdk_command
+  file_manager: sdk
+
+- name: chaos_pipelined_ingestion_1500_gb_15_windows
+  group: core-dataset-tests
+  working_dir: nightly_tests
+  legacy:
+    test_name: chaos_pipelined_ingestion_1500_gb_15_windows
+    test_suite: chaos_test
+
+  frequency: multi
+  team: core
+  cluster:
+    cluster_env: dataset/pipelined_ingestion_app.yaml
+    cluster_compute: dataset/pipelined_ingestion_compute.yaml
+
+  run:
+    timeout: 7200
+    wait_for_nodes:
+      num_nodes: 21
+      timeout: 2400
+    prepare: ' python setup_chaos.py --node-kill-interval 300'
+    script: python dataset/pipelined_training.py --epochs 1 --num-windows 15 --num-files
+      915 --debug
+
+  type: sdk_command
+  file_manager: sdk
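
For reference, a minimal sketch (not part of the patch) of how the entries migrated into release/release_tests.yaml above could be enumerated. The file path and the name, group, frequency, and run.script fields are taken from the diff; loading the file directly with PyYAML is an assumption for illustration, not necessarily how the release tooling consumes it.

    # Sketch only: list the migrated core-dataset-tests entries from release_tests.yaml.
    # Assumes the repo-relative path below and that PyYAML is installed; the release
    # tooling may read this file through its own loader instead.
    import yaml

    with open("release/release_tests.yaml") as f:
        tests = yaml.safe_load(f)  # the top-level structure is a list of test dicts

    for test in tests:
        if test.get("group") == "core-dataset-tests":
            print(test["name"], test.get("frequency"), test["run"]["script"])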