Scalability Envelope Tests (#13464)

Author: Alex Wu (committed by GitHub)
Date: 2021-01-25 18:48:31 -08:00
Parent: f2867b0609
Commit: 840987c7af
8 changed files with 629 additions and 1 deletion

benchmarks/README.md (new file)

@@ -0,0 +1,35 @@
# Ray Scalability Envelope
### Note: This document is a WIP. This is not a scalability guarantee (yet).
## Distributed Benchmarks
All distributed tests are run on 64 nodes with 64 cores/node. The maximum-node-count test reaches its target by adding additional 4-core nodes.
| Dimension | Quantity |
| --------- | -------- |
| # nodes in cluster (with trivial task workload) | 250+ |
| # actors in cluster (with trivial workload) | 10k+ |
| # simultaneously running tasks | 10k+ |
| # simultaneously running placement groups | 1k+ |
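
For a sense of what "trivial workload" means above, the pattern is roughly the sketch below (illustrative only, not the actual benchmark code): a no-op remote function with a fractional CPU request, fanned out across the cluster.

```python
import ray

ray.init(address="auto")


# A trivial task: it does no work, so the benchmark measures Ray's
# scheduling and RPC overhead rather than the workload itself.
@ray.remote(num_cpus=0.25)
def noop():
    pass


# Fan many such tasks out across the cluster and wait for all of them.
refs = [noop.remote() for _ in range(10000)]
ray.get(refs)
```

The tests in this directory follow the same pattern, adding sleeps and fractional resource requests so that many tasks, actors, and placement groups are alive at once.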
## Object Store Benchmarks
| Dimension | Quantity |
| --------- | -------- |
| 1 GiB object broadcast (# of nodes) | 50+ |
## Single Node Benchmarks
All single node benchmarks are run on a single m4.16xlarge instance.
| Dimension | Quantity |
| --------- | -------- |
| # of object arguments to a single task | 10,000+ |
| # of objects returned from a single task | 3,000+ |
| # of plasma objects in a single `ray.get` call | 10,000+ |
| # of tasks queued on a single node | 1,000,000+ |
| Maximum `ray.get` numpy object size | 100 GiB+ |

@@ -0,0 +1,58 @@
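# Autoscaler config for the distributed scalability tests: one m5.16xlarge
# head node, 63 m5.16xlarge workers (64 x 64-core nodes in total), plus small
# m5.xlarge workers used only to push the cluster to the maximum node count.
# Every node type exports a custom `node: 1` resource so the test script can
# count nodes and request an exact number of them.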
cluster_name: distributed-benchmarks
min_workers: 0
max_workers: 999999
upscaling_speed: 9999999
provider:
type: aws
region: us-west-2
availability_zone: us-west-2a, us-west-2b, us-west-2c, us-west-2d
auth:
ssh_user: ubuntu
available_node_types:
head_node:
node_config:
InstanceType: m5.16xlarge
ImageId: ami-098555c9b343eb09c
resources:
node: 1
small: 1
max_workers: 999999
worker_node:
node_config:
InstanceType: m5.16xlarge
ImageId: ami-098555c9b343eb09c
resources:
node: 1
min_workers: 63
max_workers: 63
small_worker_node:
node_config:
InstanceType: m5.xlarge
ImageId: ami-098555c9b343eb09c
resources:
node: 1
max_workers: 999999
head_node_type: head_node
worker_default_node_type: worker_node
setup_commands:
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
- pip install tqdm
- sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 65535" >> /etc/security/limits.conf; echo "* hard nofile 65535" >> /etc/security/limits.conf;'
idle_timeout_minutes: 1
head_start_ray_commands:
- ray stop
- ulimit -n 65535; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65535; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

@@ -0,0 +1,204 @@
import ray
import ray.autoscaler.sdk
from ray.test_utils import Semaphore
from ray.util.placement_group import placement_group, remove_placement_group
from time import sleep, perf_counter
from tqdm import tqdm, trange
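# Scale targets for each benchmark dimension (see benchmarks/README.md).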
TEST_NUM_NODES = 64
MAX_ACTORS_IN_CLUSTER = 10000
MAX_RUNNING_TASKS_IN_CLUSTER = 10000
MAX_PLACEMENT_GROUPS = 1000
MAX_NUM_NODES = 250
def num_alive_nodes():
n = 0
for node in ray.nodes():
if node["Alive"]:
n += 1
return n
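# Ask the autoscaler for `target` nodes (one custom `node` resource each) and
# poll until that many nodes are alive.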
def scale_to(target):
while num_alive_nodes() != target:
ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target)
print(f"Current # nodes: {num_alive_nodes()}, target: {target}")
print("Waiting ...")
sleep(5)
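# Scale the cluster to MAX_NUM_NODES, then run the trivial task workload to
# check that every node is actually usable.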
def test_nodes():
scale_to(MAX_NUM_NODES)
assert num_alive_nodes() == MAX_NUM_NODES
# Treat this as a trivial task to ensure the nodes are all functioning
test_max_running_tasks()
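# Launch MAX_ACTORS_IN_CLUSTER no-op actors (0.25 CPU each) and check that
# every one of them can serve a method call.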
def test_max_actors():
# TODO (Alex): Dynamically set this based on number of cores
cpus_per_actor = 0.25
@ray.remote(num_cpus=cpus_per_actor)
class Actor:
def foo(self):
pass
actors = [
Actor.remote()
for _ in trange(MAX_ACTORS_IN_CLUSTER, desc="Launching actors")
]
for actor in tqdm(actors, desc="Ensuring actors have started"):
assert ray.get(actor.foo.remote()) is None
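# Launch MAX_RUNNING_TASKS_IN_CLUSTER sleeping tasks (0.25 CPU each), verify
# that most of the cluster's CPUs are busy at some point, then wait for all
# of the tasks to finish.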
def test_max_running_tasks():
counter = Semaphore.remote(0)
blocker = Semaphore.remote(0)
@ray.remote(num_cpus=0.25)
def task(counter, blocker):
sleep(300)
refs = [
task.remote(counter, blocker)
for _ in trange(MAX_RUNNING_TASKS_IN_CLUSTER, desc="Launching tasks")
]
max_cpus = ray.cluster_resources()["CPU"]
min_cpus_available = max_cpus
for _ in trange(int(300 / 0.1), desc="Waiting"):
try:
cur_cpus = ray.available_resources().get("CPU", 0)
min_cpus_available = min(min_cpus_available, cur_cpus)
        except Exception:
            # There is a race condition: `.get` can fail if a new heartbeat
            # arrives at the same time.
            pass
sleep(0.1)
# There are some relevant magic numbers in this check. 10k tasks each
# require 1/4 cpus. Therefore, ideally 2.5k cpus will be used.
err_str = f"Only {max_cpus - min_cpus_available}/{max_cpus} cpus used."
assert max_cpus - min_cpus_available > 2000, err_str
for _ in trange(
MAX_RUNNING_TASKS_IN_CLUSTER,
desc="Ensuring all tasks have finished"):
done, refs = ray.wait(refs)
assert ray.get(done[0]) is None
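# Create MAX_PLACEMENT_GROUPS placement groups of three bundles each, schedule
# tasks into every group, wait for the tasks to finish, then remove the
# groups.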
def test_many_placement_groups():
@ray.remote(num_cpus=1, resources={"node": 0.02})
def f1():
sleep(10)
pass
@ray.remote(num_cpus=1)
def f2():
sleep(10)
pass
@ray.remote(resources={"node": 0.02})
def f3():
sleep(10)
pass
bundle1 = {"node": 0.02, "CPU": 1}
bundle2 = {"CPU": 1}
bundle3 = {"node": 0.02}
pgs = []
for _ in trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"):
pg = placement_group(bundles=[bundle1, bundle2, bundle3])
pgs.append(pg)
for pg in tqdm(pgs, desc="Waiting for pgs to be ready"):
ray.get(pg.ready())
refs = []
for pg in tqdm(pgs, desc="Scheduling tasks"):
ref1 = f1.options(placement_group=pg).remote()
ref2 = f2.options(placement_group=pg).remote()
ref3 = f3.options(placement_group=pg).remote()
refs.extend([ref1, ref2, ref3])
for _ in trange(10, desc="Waiting"):
sleep(1)
with tqdm() as p_bar:
while refs:
done, refs = ray.wait(refs)
p_bar.update()
for pg in tqdm(pgs, desc="Cleaning up pgs"):
remove_placement_group(pg)
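# Driver: run each benchmark in sequence, checking in between that the cluster
# still has the expected number of nodes and that all resources have been
# released, then report per-benchmark timings.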
ray.init(address="auto")
scale_to(TEST_NUM_NODES)
assert num_alive_nodes() == TEST_NUM_NODES, (
    "Wrong number of nodes in cluster: " + str(len(ray.nodes())))
cluster_resources = ray.cluster_resources()
available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
str(available_resources) + " != " + str(cluster_resources))
print("Done launching nodes")
actor_start = perf_counter()
test_max_actors()
actor_end = perf_counter()
sleep(1)
assert num_alive_nodes() == TEST_NUM_NODES, (
    "Wrong number of nodes in cluster: " + str(len(ray.nodes())))
available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
    str(available_resources) + " != " + str(cluster_resources))
print("Done testing actors")
task_start = perf_counter()
test_max_running_tasks()
task_end = perf_counter()
sleep(1)
assert num_alive_nodes() == TEST_NUM_NODES, (
    "Wrong number of nodes in cluster: " + str(len(ray.nodes())))
available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
    str(available_resources) + " != " + str(cluster_resources))
print("Done testing tasks")
pg_start = perf_counter()
test_many_placement_groups()
pg_end = perf_counter()
sleep(1)
assert num_alive_nodes() == TEST_NUM_NODES, (
    "Wrong number of nodes in cluster: " + str(len(ray.nodes())))
available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
    str(available_resources) + " != " + str(cluster_resources))
print("Done testing placement groups")
launch_start = perf_counter()
test_nodes()
launch_end = perf_counter()
sleep(1)
assert num_alive_nodes() == MAX_NUM_NODES, (
    "Wrong number of nodes in cluster: " + str(len(ray.nodes())))
print("Done.")
actor_time = actor_end - actor_start
task_time = task_end - task_start
pg_time = pg_end - pg_start
launch_time = launch_end - launch_start
print(f"Actor time: {actor_time} ({MAX_ACTORS_IN_CLUSTER} actors)")
print(f"Task time: {task_time} ({MAX_RUNNING_TASKS_IN_CLUSTER} tasks)")
print(f"PG time: {pg_time} ({MAX_PLACEMENT_GROUPS} placement groups)")
print(f"Node launch time: {launch_time} ({MAX_NUM_NODES} nodes)")

@@ -0,0 +1,48 @@
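# Autoscaler config for the object store broadcast benchmark: an m4.4xlarge
# head node and m4.xlarge workers. Each node exports a custom `node: 1`
# resource so the test can place exactly one actor per node.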
cluster_name: object-store-benchmarks
min_workers: 0
max_workers: 999999
upscaling_speed: 9999999
provider:
type: aws
region: us-west-2
availability_zone: us-west-2a
auth:
ssh_user: ubuntu
available_node_types:
head_node:
node_config:
InstanceType: m4.4xlarge
ImageId: ami-098555c9b343eb09c
resources:
node: 1
max_workers: 999999
worker_node:
node_config:
InstanceType: m4.xlarge
ImageId: ami-098555c9b343eb09c
resources:
node: 1
max_workers: 999999
head_node_type: head_node
worker_default_node_type: worker_node
setup_commands:
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
- pip install tqdm numpy
idle_timeout_minutes: 5
head_start_ray_commands:
- ray stop
- ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 1000000; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

@@ -0,0 +1,61 @@
import numpy as np
import ray
import ray.autoscaler.sdk
from time import sleep, perf_counter
from tqdm import tqdm
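# Broadcast a 1 GiB (2**30 byte) object to NUM_NODES nodes.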
NUM_NODES = 50
OBJECT_SIZE = 2**30
def num_alive_nodes():
n = 0
for node in ray.nodes():
if node["Alive"]:
n += 1
return n
def scale_to(target):
while num_alive_nodes() != target:
ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target)
print(f"Current # nodes: {num_alive_nodes()}, target: {target}")
print("Waiting ...")
sleep(5)
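# Place one actor per node (via the custom `node` resource), `ray.put` a 1 GiB
# numpy array, and have every actor read and sum it.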
def test_object_broadcast():
scale_to(NUM_NODES)
@ray.remote(num_cpus=1, resources={"node": 1})
class Actor:
def foo(self):
pass
def sum(self, arr):
return np.sum(arr)
actors = [Actor.remote() for _ in range(NUM_NODES)]
arr = np.ones(OBJECT_SIZE, dtype=np.uint8)
ref = ray.put(arr)
for actor in tqdm(actors, desc="Ensure all actors have started."):
ray.get(actor.foo.remote())
result_refs = []
for actor in tqdm(actors, desc="Broadcasting objects"):
result_refs.append(actor.sum.remote(ref))
results = ray.get(result_refs)
for result in results:
assert result == OBJECT_SIZE
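# Driver: run the broadcast benchmark and report the total time.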
ray.init(address="auto")
start = perf_counter()
test_object_broadcast()
end = perf_counter()
print(f"Broadcast time: {end - start} ({OBJECT_SIZE} B x {NUM_NODES} nodes)")

@@ -0,0 +1,41 @@
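# Config for the single-node benchmarks: a single m4.16xlarge head node with a
# 128 GB object store and no workers (the top-level max_workers is 0).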
cluster_name: single-node-benchmarks
min_workers: 0
max_workers: 0
upscaling_speed: 9999999
provider:
type: aws
region: us-west-2
availability_zone: us-west-2a
auth:
ssh_user: ubuntu
available_node_types:
head_node:
node_config:
InstanceType: m4.16xlarge
ImageId: ami-098555c9b343eb09c
resources:
node: 1
max_workers: 999999
worker_node:
node_config:
InstanceType: m4.xlarge
ImageId: ami-098555c9b343eb09c
head_node_type: head_node
worker_default_node_type: worker_node
setup_commands:
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
- pip install numpy tqdm
- sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1000000" >> /etc/security/limits.conf; echo "* hard nofile 1000000" >> /etc/security/limits.conf;'
idle_timeout_minutes: 5
head_start_ray_commands:
- ray stop
- ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --object-store-memory=128000000000 --autoscaling-config=~/ray_bootstrap_config.yaml

@@ -0,0 +1,175 @@
import numpy as np
import ray
import ray.autoscaler.sdk
from ray.test_utils import Semaphore
from time import perf_counter
from tqdm import trange, tqdm
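# Per-dimension limits exercised by the single-node benchmarks (see
# benchmarks/README.md).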
MAX_ARGS = 10000
MAX_RETURNS = 3000
MAX_RAY_GET_ARGS = 10000
MAX_QUEUED_TASKS = 1_000_000
MAX_RAY_GET_SIZE = 100 * 2**30
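# Pass MAX_ARGS list arguments to a single task and check the computed sum.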
def test_many_args():
@ray.remote
def sum_args(*args):
return sum(sum(arg) for arg in args)
args = [[1 for _ in range(10000)] for _ in range(MAX_ARGS)]
result = ray.get(sum_args.remote(*args))
assert result == MAX_ARGS * 10000
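# Return MAX_RETURNS objects from a single task and check each of them.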
def test_many_returns():
@ray.remote(num_returns=MAX_RETURNS)
def f():
to_return = []
for _ in range(MAX_RETURNS):
obj = list(range(10000))
to_return.append(obj)
return tuple(to_return)
returned_refs = f.remote()
assert len(returned_refs) == MAX_RETURNS
for ref in returned_refs:
expected = list(range(10000))
obj = ray.get(ref)
assert obj == expected
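# Call `ray.get` on MAX_RAY_GET_ARGS small objects, once with plain Python
# lists (full deserialization) and once with numpy arrays (zero-copy reads).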
def test_ray_get_args():
def with_dese():
print("Putting test objects:")
refs = []
for _ in trange(MAX_RAY_GET_ARGS):
obj = list(range(10000))
refs.append(ray.put(obj))
print("Getting objects")
results = ray.get(refs)
assert len(results) == MAX_RAY_GET_ARGS
print("Asserting correctness")
for obj in tqdm(results):
expected = list(range(10000))
assert obj == expected
def with_zero_copy():
print("Putting test objects:")
refs = []
for _ in trange(MAX_RAY_GET_ARGS):
obj = np.arange(10000)
refs.append(ray.put(obj))
print("Getting objects")
results = ray.get(refs)
assert len(results) == MAX_RAY_GET_ARGS
print("Asserting correctness")
for obj in tqdm(results):
expected = np.arange(10000)
assert (obj == expected).all()
with_dese()
print("Done with dese")
with_zero_copy()
print("Done with zero copy")
def test_many_queued_tasks():
sema = Semaphore.remote(0)
@ray.remote(num_cpus=1)
def block():
ray.get(sema.acquire.remote())
@ray.remote(num_cpus=1)
def f():
pass
num_cpus = int(ray.cluster_resources()["CPU"])
blocked_tasks = []
for _ in range(num_cpus):
blocked_tasks.append(block.remote())
print("Submitting many tasks")
pending_tasks = []
for _ in trange(MAX_QUEUED_TASKS):
pending_tasks.append(f.remote())
# Make sure all the tasks can actually run.
for _ in range(num_cpus):
sema.release.remote()
print("Unblocking tasks")
for ref in tqdm(pending_tasks):
assert ray.get(ref) is None
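# Put and then get a single MAX_RAY_GET_SIZE (100 GiB) numpy array.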
def test_large_object():
print("Generating object")
obj = np.zeros(MAX_RAY_GET_SIZE, dtype=np.int8)
print("Putting object")
ref = ray.put(obj)
del obj
print("Getting object")
big_obj = ray.get(ref)
assert big_obj[0] == 0
assert big_obj[-1] == 0
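# Driver: run each benchmark in sequence, checking that all resources are
# released in between, and report per-benchmark timings.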
ray.init(address="auto")
args_start = perf_counter()
test_many_args()
args_end = perf_counter()
assert ray.cluster_resources() == ray.available_resources()
print("Finished many args")
returns_start = perf_counter()
test_many_returns()
returns_end = perf_counter()
assert ray.cluster_resources() == ray.available_resources()
print("Finished many returns")
get_start = perf_counter()
test_ray_get_args()
get_end = perf_counter()
assert ray.cluster_resources() == ray.available_resources()
print("Finished ray.get on many objects")
queued_start = perf_counter()
test_many_queued_tasks()
queued_end = perf_counter()
assert ray.cluster_resources() == ray.available_resources()
print("Finished queueing many tasks")
large_object_start = perf_counter()
test_large_object()
large_object_end = perf_counter()
assert ray.cluster_resources() == ray.available_resources()
print("Done")
args_time = args_end - args_start
returns_time = returns_end - returns_start
get_time = get_end - get_start
queued_time = queued_end - queued_start
large_object_time = large_object_end - large_object_start
print(f"Many args time: {args_time} ({MAX_ARGS} args)")
print(f"Many returns time: {returns_time} ({MAX_RETURNS} returns)")
print(f"Ray.get time: {get_time} ({MAX_RAY_GET_ARGS} args)")
print(f"Queued task time: {queued_time} ({MAX_QUEUED_TASKS} tasks)")
print(f"Ray.get large object time: {large_object_time} "
f"({MAX_RAY_GET_SIZE} bytes)")

@@ -134,7 +134,13 @@ is generally the easiest way to run release tests.
The summaries printed by each test should be checked in under
``release_logs/<version>`` on the **master** branch (make a pull request).
5. **ASAN tests**
5. **Scalability envelope tests**
- Run the tests in `benchmarks/` (with `ray submit --start cluster.yaml <test file>`)
- Record the reported timings.
- Whether the results are acceptable is a judgement call.
6. **ASAN tests**
Run ``ci/asan_tests`` with the commit. This enables the ASAN build and runs the full Python test suite to detect memory leaks.