[Nightly tests] Migrate rest of core tests (#23085)

Migrate the rest of core tests
This commit is contained in:
SangBin Cho 2022-03-12 03:41:14 +09:00 committed by GitHub
parent 04ea180dfb
commit 2b38fe89e2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 370 additions and 44 deletions

View file

@@ -57,31 +57,31 @@ class SmokeTest(ReleaseTest):
CORE_NIGHTLY_TESTS = {
"~/ray/release/nightly_tests/nightly_tests.yaml": [
# "shuffle_10gb",
# "shuffle_50gb",
# "shuffle_50gb_large_partition",
# "shuffle_100gb",
# "non_streaming_shuffle_100gb",
# "non_streaming_shuffle_50gb_large_partition",
# "non_streaming_shuffle_50gb",
# SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
# SmokeTest("dask_on_ray_large_scale_test_spilling"),
# "stress_test_placement_group",
# "shuffle_1tb_1000_partition",
# "non_streaming_shuffle_1tb_1000_partition",
# "shuffle_1tb_5000_partitions",
# TODO(sang): It doesn't even work without spilling
# as it hits the scalability limit.
# "non_streaming_shuffle_1tb_5000_partitions",
# "decision_tree_autoscaling",
# "decision_tree_autoscaling_20_runs",
# "autoscaling_shuffle_1tb_1000_partitions",
# SmokeTest("stress_test_many_tasks"),
# SmokeTest("stress_test_dead_actors"),
# SmokeTest("threaded_actors_stress_test"),
# "pg_long_running_performance_test",
],
# "~/ray/release/nightly_tests/nightly_tests.yaml": [
# "shuffle_10gb",
# "shuffle_50gb",
# "shuffle_50gb_large_partition",
# "shuffle_100gb",
# "non_streaming_shuffle_100gb",
# "non_streaming_shuffle_50gb_large_partition",
# "non_streaming_shuffle_50gb",
# SmokeTest("dask_on_ray_large_scale_test_no_spilling"),
# SmokeTest("dask_on_ray_large_scale_test_spilling"),
# "stress_test_placement_group",
# "shuffle_1tb_1000_partition",
# "non_streaming_shuffle_1tb_1000_partition",
# "shuffle_1tb_5000_partitions",
# TODO(sang): It doesn't even work without spilling
# as it hits the scalability limit.
# "non_streaming_shuffle_1tb_5000_partitions",
# "decision_tree_autoscaling",
# "decision_tree_autoscaling_20_runs",
# "autoscaling_shuffle_1tb_1000_partitions",
# SmokeTest("stress_test_many_tasks"),
# SmokeTest("stress_test_dead_actors"),
# SmokeTest("threaded_actors_stress_test"),
# "pg_long_running_performance_test",
# ],
# "~/ray/benchmarks/benchmark_tests.yaml": [
# "single_node",
# "object_store",
@@ -89,21 +89,21 @@ CORE_NIGHTLY_TESTS = {
# "many_tasks_smoke_test",
# "many_pgs_smoke_test",
# ],
"~/ray/release/nightly_tests/dataset/dataset_test.yaml": [
"inference",
"shuffle_data_loader",
"parquet_metadata_resolution",
"pipelined_training_50_gb",
"pipelined_ingestion_1500_gb",
"datasets_preprocess_ingest",
"datasets_ingest_400G",
SmokeTest("datasets_ingest_train_infer"),
],
"~/ray/release/nightly_tests/chaos_test.yaml": [
"chaos_many_actors",
"chaos_many_tasks_no_object_store",
"chaos_pipelined_ingestion_1500_gb_15_windows",
],
# "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [
# "inference",
# "shuffle_data_loader",
# "parquet_metadata_resolution",
# "pipelined_training_50_gb",
# "pipelined_ingestion_1500_gb",
# "datasets_preprocess_ingest",
# "datasets_ingest_400G",
# SmokeTest("datasets_ingest_train_infer"),
# ],
# "~/ray/release/nightly_tests/chaos_test.yaml": [
# "chaos_many_actors",
# "chaos_many_tasks_no_object_store",
# "chaos_pipelined_ingestion_1500_gb_15_windows",
# ],
# "~/ray/release/microbenchmark/microbenchmark.yaml": [
# "microbenchmark",
# ],
@@ -137,10 +137,10 @@ CORE_DAILY_TESTS = {
# "stress_test_many_tasks",
# "stress_test_dead_actors",
# ],
"~/ray/release/nightly_tests/chaos_test.yaml": [
"chaos_dask_on_ray_large_scale_test_no_spilling",
"chaos_dask_on_ray_large_scale_test_spilling",
],
# "~/ray/release/nightly_tests/chaos_test.yaml": [
# "chaos_dask_on_ray_large_scale_test_no_spilling",
# "chaos_dask_on_ray_large_scale_test_spilling",
# ],
}
CORE_SCALABILITY_TESTS_DAILY = {

View file

@@ -2391,6 +2391,204 @@
# file_manager: sdk
# stable: false
###############
# Dataset tests
###############
- name: inference
group: core-dataset-tests
working_dir: dataset
legacy:
test_name: inference
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: app_config.yaml
cluster_compute: inference.yaml
run:
timeout: 600
script: python inference.py
wait_for_nodes:
num_nodes: 2
timeout: 600
type: sdk_command
file_manager: sdk
- name: shuffle_data_loader
group: core-dataset-tests
working_dir: dataset
legacy:
test_name: shuffle_data_loader
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: shuffle_app_config.yaml
cluster_compute: shuffle_compute.yaml
run:
timeout: 1800
script: python dataset_shuffle_data_loader.py
type: sdk_command
file_manager: sdk
- name: parquet_metadata_resolution
group: core-dataset-tests
working_dir: dataset
legacy:
test_name: parquet_metadata_resolution
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: pipelined_training_app.yaml
cluster_compute: pipelined_training_compute.yaml
run:
timeout: 1200
script: python parquet_metadata_resolution.py --num-files 915
wait_for_nodes:
num_nodes: 15
timeout: 1200
type: sdk_command
file_manager: sdk
- name: pipelined_training_50_gb
group: core-dataset-tests
working_dir: dataset
legacy:
test_name: pipelined_training_50_gb
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: pipelined_training_app.yaml
cluster_compute: pipelined_training_compute.yaml
run:
timeout: 4800
script: python pipelined_training.py --epochs 1
wait_for_nodes:
num_nodes: 15
timeout: 1200
type: sdk_command
file_manager: sdk
- name: pipelined_ingestion_1500_gb
group: core-dataset-tests
working_dir: dataset
legacy:
test_name: pipelined_ingestion_1500_gb
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: pipelined_ingestion_app.yaml
cluster_compute: pipelined_ingestion_compute.yaml
run:
timeout: 9600
script: python pipelined_training.py --epochs 2 --num-windows 2 --num-files 915
--debug
wait_for_nodes:
num_nodes: 21
timeout: 2400
type: sdk_command
file_manager: sdk
- name: datasets_ingest_train_infer
group: core-dataset-tests
working_dir: dataset
legacy:
test_name: datasets_ingest_train_infer
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: ray_sgd_training_app.yaml
cluster_compute: ray_sgd_training_compute.yaml
run:
timeout: 14400
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu
--large-dataset
wait_for_nodes:
num_nodes: 66
timeout: 2400
type: sdk_command
file_manager: sdk
smoke_test:
cluster:
app_config: ray_sgd_training_app.yaml
compute_template: ray_sgd_training_smoke_compute.yaml
run:
timeout: 3600
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 8 --use-gpu
wait_for_nodes:
num_nodes: 8
timeout: 2400
- name: datasets_preprocess_ingest
group: core-dataset-tests
working_dir: dataset
legacy:
test_name: datasets_preprocess_ingest
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: ray_sgd_training_app.yaml
cluster_compute: ray_sgd_training_compute_no_gpu.yaml
run:
timeout: 7200
script: python ray_sgd_training.py --address auto --use-s3 --num-workers 16 --use-gpu
--large-dataset --debug
wait_for_nodes:
num_nodes: 21
timeout: 2400
type: sdk_command
file_manager: sdk
- name: datasets_ingest_400G
group: core-dataset-tests
working_dir: dataset
legacy:
test_name: datasets_ingest_400G
test_suite: dataset_test
frequency: multi
team: core
cluster:
cluster_env: ray_sgd_training_app.yaml
cluster_compute: dataset_ingest_400G_compute.yaml
run:
timeout: 7200
script: python ray_sgd_runner.py --address auto --use-gpu --num-epochs 1
type: sdk_command
file_manager: sdk
################
# Core K8s tests
################
@@ -2472,3 +2670,131 @@
file_manager: job
stable: false
##################
# Core Chaos tests
##################
- name: chaos_many_tasks_no_object_store
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_many_tasks_no_object_store
test_suite: chaos_test
frequency: multi
team: core
cluster:
cluster_env: chaos_test/app_config.yaml
cluster_compute: chaos_test/compute_template.yaml
run:
timeout: 3600
wait_for_nodes:
num_nodes: 10
timeout: 600
prepare: python setup_chaos.py --no-start
script: python chaos_test/test_chaos_basic.py --workload=tasks
type: sdk_command
file_manager: sdk
- name: chaos_many_actors
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_many_actors
test_suite: chaos_test
frequency: multi
team: core
cluster:
cluster_env: chaos_test/app_config.yaml
cluster_compute: chaos_test/compute_template.yaml
run:
timeout: 3600
wait_for_nodes:
num_nodes: 10
timeout: 600
prepare: python setup_chaos.py --no-start
script: python chaos_test/test_chaos_basic.py --workload=actors
type: sdk_command
file_manager: sdk
- name: chaos_dask_on_ray_large_scale_test_no_spilling
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_dask_on_ray_large_scale_test_no_spilling
test_suite: chaos_test
frequency: nightly
team: core
cluster:
cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
run:
timeout: 7200
wait_for_nodes:
num_nodes: 21
timeout: 600
prepare: python setup_chaos.py --node-kill-interval 100
script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb
20 --error_rate 0 --data_save_path /tmp/ray
type: sdk_command
file_manager: sdk
- name: chaos_dask_on_ray_large_scale_test_spilling
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_dask_on_ray_large_scale_test_spilling
test_suite: chaos_test
frequency: nightly
team: core
cluster:
cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml
run:
timeout: 7200
wait_for_nodes:
num_nodes: 21
timeout: 600
prepare: python setup_chaos.py --node-kill-interval 100
script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb
70 --error_rate 0 --data_save_path /tmp/ray
type: sdk_command
file_manager: sdk
- name: chaos_pipelined_ingestion_1500_gb_15_windows
group: core-dataset-tests
working_dir: nightly_tests
legacy:
test_name: chaos_pipelined_ingestion_1500_gb_15_windows
test_suite: chaos_test
frequency: multi
team: core
cluster:
cluster_env: dataset/pipelined_ingestion_app.yaml
cluster_compute: dataset/pipelined_ingestion_compute.yaml
run:
timeout: 7200
wait_for_nodes:
num_nodes: 21
timeout: 2400
prepare: ' python setup_chaos.py --node-kill-interval 300'
script: python dataset/pipelined_training.py --epochs 1 --num-windows 15 --num-files
915 --debug
type: sdk_command
file_manager: sdk