[Nightly test] Test non streaming shuffle (#16150)

This commit is contained in:
SangBin Cho 2021-05-31 15:28:02 -07:00 committed by GitHub
parent 45d2331d5a
commit 9fa3b9f6f3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 77 additions and 8 deletions

View file

@ -1,3 +1,7 @@
#
# Single node shuffle
#
# Test basic single node 10GB shuffle with a small number of partitions.
# This doesn't require object spilling.
- name: shuffle_10gb
@ -53,6 +57,46 @@
cluster:
compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet
# Test non streaming shuffle in a single node with a small number of partition.
- name: non_streaming_shuffle_50gb
owner:
mail: "sangcho@anyscale.com"
slack: "@proj-data-processing"
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=50 --partition-size=1e9 --no-streaming
smoke_test:
cluster:
compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet
# Test non streaming shuffle in a single node with a large number of partition.
- name: non_streaming_shuffle_50gb_large_partition
owner:
mail: "sangcho@anyscale.com"
slack: "@proj-data-processing"
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_single.yaml
run:
timeout: 3000
script: python shuffle/shuffle_test.py --num-partitions=500 --partition-size=100e6 --no-streaming
smoke_test:
cluster:
compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet
#
# Multi node shuffle
#
# Test multi nodes 100GB shuffle with a small number of partitions.
- name: shuffle_100gb
owner:
@ -72,6 +116,25 @@
cluster:
compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet
# Test non streaming multi nodes 100GB shuffle with a small number of partitions.
- name: non_streaming_shuffle_100gb
owner:
mail: "sangcho@anyscale.com"
slack: "@proj-data-processing"
cluster:
app_config: shuffle/shuffle_app_config.yaml
compute_template: shuffle/shuffle_compute_multi.yaml
run:
timeout: 3000
prepare: python wait_cluster.py 4 600
script: python shuffle/shuffle_test.py --num-partitions=200 --partition-size=500e6 --no-streaming
smoke_test:
cluster:
compute_template: shuffle/shuffle_compute_smoke.yaml # Does not exist yet
# Test multi nodes 100GB shuffle with a large number of partitions.
# TODO(sang): Not working due to a bug https://github.com/ray-project/ray/issues/16025.
# - name: shuffle_100gb_large_partition

View file

@ -10,7 +10,7 @@ head_node_type:
worker_node_types:
- name: worker_node
instance_type: i3.8xlarge
instance_type: i3.4xlarge
min_workers: 3
max_workers: 3
use_spot: false

View file

@ -9,7 +9,7 @@ head_node_type:
instance_type: i3.2xlarge
worker_node_types:
- name: worker_node
- name: worker_node4
instance_type: i3.2xlarge
min_workers: 0
max_workers: 0

View file

@ -13,17 +13,23 @@ if __name__ == "__main__":
help="number of reducer actors used",
default="200e6",
type=str)
parser.add_argument(
"--no-streaming", help="Non streaming shuffle", action="store_true")
args = parser.parse_args()
start = time.time()
success = 1
commands = [
"python", "-m", "ray.experimental.shuffle", "--ray-address={}".format(
os.environ["RAY_ADDRESS"]),
f"--num-partitions={args.num_partitions}",
f"--partition-size={args.partition_size}"
]
if args.no_streaming:
commands.append("--no-streaming")
try:
subprocess.check_call([
"python", "-m", "ray.experimental.shuffle",
"--ray-address={}".format(os.environ["RAY_ADDRESS"]),
f"--num-partitions={args.num_partitions}",
f"--partition-size={args.partition_size}"
])
subprocess.check_call(commands)
except Exception as e:
print(f"The test failed with {e}")
success = 0