From 9fa3b9f6f3d9ea2da6ae4aefeefd613d1c7cf8b7 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Mon, 31 May 2021 15:28:02 -0700 Subject: [PATCH] [Nightly test] Test non streaming shuffle (#16150) --- release/nightly_tests/nightly_tests.yaml | 63 +++++++++++++++++++ .../shuffle/shuffle_compute_multi.yaml | 2 +- .../shuffle/shuffle_compute_single.yaml | 2 +- release/nightly_tests/shuffle/shuffle_test.py | 18 ++++-- 4 files changed, 77 insertions(+), 8 deletions(-) diff --git a/release/nightly_tests/nightly_tests.yaml b/release/nightly_tests/nightly_tests.yaml index d9f87b4ae..6500055ee 100644 --- a/release/nightly_tests/nightly_tests.yaml +++ b/release/nightly_tests/nightly_tests.yaml @@ -1,3 +1,7 @@ +# +# Single node shuffle +# + # Test basic single node 10GB shuffle with a small number of partitions. # This doesn't require object spilling. - name: shuffle_10gb @@ -53,6 +57,46 @@ cluster: compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet +# Test non streaming shuffle in a single node with a small number of partition. +- name: non_streaming_shuffle_50gb + owner: + mail: "sangcho@anyscale.com" + slack: "@proj-data-processing" + + cluster: + app_config: shuffle/shuffle_app_config.yaml + compute_template: shuffle/shuffle_compute_single.yaml + + run: + timeout: 3000 + script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming + + smoke_test: + cluster: + compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet + +# Test non streaming shuffle in a single node with a large number of partition. +- name: non_streaming_shuffle_50gb_large_partition + owner: + mail: "sangcho@anyscale.com" + slack: "@proj-data-processing" + + cluster: + app_config: shuffle/shuffle_app_config.yaml + compute_template: shuffle/shuffle_compute_single.yaml + + run: + timeout: 3000 + script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming + + smoke_test: + cluster: + compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet + +# +# Multi node shuffle +# + # Test multi nodes 100GB shuffle with a small number of partitions. - name: shuffle_100gb owner: @@ -72,6 +116,25 @@ cluster: compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet +# Test non streaming multi nodes 100GB shuffle with a small number of partitions. +- name: non_streaming_shuffle_100gb + owner: + mail: "sangcho@anyscale.com" + slack: "@proj-data-processing" + + cluster: + app_config: shuffle/shuffle_app_config.yaml + compute_template: shuffle/shuffle_compute_multi.yaml + + run: + timeout: 3000 + prepare: python wait_cluster.py 4 600 + script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming + + smoke_test: + cluster: + compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet + # Test multi nodes 100GB shuffle with a large number of partitions. # TODO(sang): Not working due to a bug https://github.com/ray-project/ray/issues/16025. # - name: shuffle_100gb_large_partition diff --git a/release/nightly_tests/shuffle/shuffle_compute_multi.yaml b/release/nightly_tests/shuffle/shuffle_compute_multi.yaml index d24706b4f..5b760c0c0 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_multi.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_multi.yaml @@ -10,7 +10,7 @@ head_node_type: worker_node_types: - name: worker_node - instance_type: i3.8xlarge + instance_type: i3.4xlarge min_workers: 3 max_workers: 3 use_spot: false diff --git a/release/nightly_tests/shuffle/shuffle_compute_single.yaml b/release/nightly_tests/shuffle/shuffle_compute_single.yaml index 85802afeb..eb9007984 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_single.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_single.yaml @@ -9,7 +9,7 @@ head_node_type: instance_type: i3.2xlarge worker_node_types: - - name: worker_node + - name: worker_node4 instance_type: i3.2xlarge min_workers: 0 max_workers: 0 diff --git a/release/nightly_tests/shuffle/shuffle_test.py b/release/nightly_tests/shuffle/shuffle_test.py index 47361d99e..7d66f25d5 100644 --- a/release/nightly_tests/shuffle/shuffle_test.py +++ b/release/nightly_tests/shuffle/shuffle_test.py @@ -13,17 +13,23 @@ if __name__ == "__main__": help="number of reducer actors used", default="200e6", type=str) + parser.add_argument( + "--no-streaming", help="Non streaming shuffle", action="store_true") args = parser.parse_args() start = time.time() success = 1 + commands = [ + "python", "-m", "ray.experimental.shuffle", "--ray-address={}".format( + os.environ["RAY_ADDRESS"]), + f"--num-partitions={args.num_partitions}", + f"--partition-size={args.partition_size}" + ] + if args.no_streaming: + commands.append("--no-streaming") + try: - subprocess.check_call([ - "python", "-m", "ray.experimental.shuffle", - "--ray-address={}".format(os.environ["RAY_ADDRESS"]), - f"--num-partitions={args.num_partitions}", - f"--partition-size={args.partition_size}" - ]) + subprocess.check_call(commands) except Exception as e: print(f"The test failed with {e}") success = 0