Mirror of https://github.com/vale981/ray (synced 2025-03-06 02:21:39 -05:00)
Scalability Envelope Tests (#13464)
Parent: f2867b0609
Commit: 840987c7af

8 changed files with 629 additions and 1 deletion
benchmarks/README.md (new file, 35 lines)
@@ -0,0 +1,35 @@

# Ray Scalability Envelope

### Note: This document is a WIP. This is not a scalability guarantee (yet).

## Distributed Benchmarks

All distributed tests are run on 64 nodes with 64 cores each. The maximum node count is reached by adding 4-core nodes.

| Dimension | Quantity |
| --------- | -------- |
| # nodes in cluster (with trivial task workload) | 250+ |
| # actors in cluster (with trivial workload) | 10k+ |
| # simultaneously running tasks | 10k+ |
| # simultaneously running placement groups | 1k+ |

## Object Store Benchmarks

| Dimension | Quantity |
| --------- | -------- |
| 1 GiB object broadcast (# of nodes) | 50+ |

## Single Node Benchmarks

All single-node benchmarks are run on a single m4.16xlarge instance.

| Dimension | Quantity |
| --------- | -------- |
| # of object arguments to a single task | 10,000+ |
| # of objects returned from a single task | 3,000+ |
| # of plasma objects in a single `ray.get` call | 10,000+ |
| # of tasks queued on a single node | 1,000,000+ |
| Maximum `ray.get` numpy object size | 100 GiB+ |
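The dimensions above map directly onto the benchmark scripts added in this commit. As a rough illustration, the "trivial task workload" used for the node-count and running-task rows boils down to fanning out no-op remote tasks, roughly like the sketch below (a simplified version of `test_max_running_tasks` in `benchmarks/distributed/test_distributed.py`; it assumes a cluster is already running):

```python
import ray

ray.init(address="auto")

# A trivial task; a fractional num_cpus lets several run per core.
@ray.remote(num_cpus=0.25)
def noop():
    pass

# Fan out many no-op tasks across the cluster, then wait for all of them.
refs = [noop.remote() for _ in range(10_000)]
ray.get(refs)
```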
benchmarks/distributed/config.yaml (new file, 58 lines)
@@ -0,0 +1,58 @@

cluster_name: distributed-benchmarks
min_workers: 0
max_workers: 999999

upscaling_speed: 9999999

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a, us-west-2b, us-west-2c, us-west-2d

auth:
    ssh_user: ubuntu

available_node_types:
    head_node:
        node_config:
            InstanceType: m5.16xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
            small: 1
        max_workers: 999999
    worker_node:
        node_config:
            InstanceType: m5.16xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
        min_workers: 63
        max_workers: 63
    small_worker_node:
        node_config:
            InstanceType: m5.xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
        max_workers: 999999

head_node_type: head_node

worker_default_node_type: worker_node

setup_commands:
    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
    - pip install tqdm
    - sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 65535" >> /etc/security/limits.conf; echo "* hard nofile 65535" >> /etc/security/limits.conf;'

idle_timeout_minutes: 1

head_start_ray_commands:
    - ray stop
    - ulimit -n 65535; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65535; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
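The `node: 1` entry under each node type's `resources` is a custom, user-defined Ray resource rather than a built-in one; the benchmark scripts use it to request an exact number of machines from the autoscaler and to place work on distinct nodes. A minimal sketch of how such a custom resource is consumed follows (the target of 64 nodes and the helper task are illustrative, not part of the benchmarks):

```python
import socket

import ray
import ray.autoscaler.sdk

ray.init(address="auto")

# Ask the autoscaler for 64 machines by requesting 64 bundles of the
# custom "node" resource; each node type in the config provides exactly one.
ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * 64)

# Requiring a whole unit of "node" means at most one copy of this task
# can run on any given machine at a time.
@ray.remote(resources={"node": 1})
def where_am_i():
    return socket.gethostname()

print(ray.get([where_am_i.remote() for _ in range(4)]))
```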
benchmarks/distributed/test_distributed.py (new file, 204 lines)
@@ -0,0 +1,204 @@

import ray
import ray.autoscaler.sdk
from ray.test_utils import Semaphore
from ray.util.placement_group import placement_group, remove_placement_group

from time import sleep, perf_counter
from tqdm import tqdm, trange

TEST_NUM_NODES = 64
MAX_ACTORS_IN_CLUSTER = 10000
MAX_RUNNING_TASKS_IN_CLUSTER = 10000
MAX_PLACEMENT_GROUPS = 1000
MAX_NUM_NODES = 250


def num_alive_nodes():
    n = 0
    for node in ray.nodes():
        if node["Alive"]:
            n += 1
    return n


def scale_to(target):
    while num_alive_nodes() != target:
        ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target)
        print(f"Current # nodes: {num_alive_nodes()}, target: {target}")
        print("Waiting ...")
        sleep(5)


def test_nodes():
    scale_to(MAX_NUM_NODES)
    assert num_alive_nodes() == MAX_NUM_NODES
    # Treat this as a trivial task workload to ensure the nodes are all
    # functioning.
    test_max_running_tasks()


def test_max_actors():
    # TODO (Alex): Dynamically set this based on number of cores
    cpus_per_actor = 0.25

    @ray.remote(num_cpus=cpus_per_actor)
    class Actor:
        def foo(self):
            pass

    actors = [
        Actor.remote()
        for _ in trange(MAX_ACTORS_IN_CLUSTER, desc="Launching actors")
    ]

    for actor in tqdm(actors, desc="Ensuring actors have started"):
        assert ray.get(actor.foo.remote()) is None


def test_max_running_tasks():
    counter = Semaphore.remote(0)
    blocker = Semaphore.remote(0)

    @ray.remote(num_cpus=0.25)
    def task(counter, blocker):
        sleep(300)

    refs = [
        task.remote(counter, blocker)
        for _ in trange(MAX_RUNNING_TASKS_IN_CLUSTER, desc="Launching tasks")
    ]

    max_cpus = ray.cluster_resources()["CPU"]
    min_cpus_available = max_cpus
    for _ in trange(int(300 / 0.1), desc="Waiting"):
        try:
            cur_cpus = ray.available_resources().get("CPU", 0)
            min_cpus_available = min(min_cpus_available, cur_cpus)
        except Exception:
            # There are race conditions: `.get` can fail if a new heartbeat
            # arrives at the same time.
            pass
        sleep(0.1)

    # There are some relevant magic numbers in this check. 10k tasks each
    # require 1/4 CPU, so ideally 2.5k CPUs will be in use at the peak.
    err_str = f"Only {max_cpus - min_cpus_available}/{max_cpus} cpus used."
    assert max_cpus - min_cpus_available > 2000, err_str

    for _ in trange(
            MAX_RUNNING_TASKS_IN_CLUSTER,
            desc="Ensuring all tasks have finished"):
        done, refs = ray.wait(refs)
        assert ray.get(done[0]) is None


def test_many_placement_groups():
    @ray.remote(num_cpus=1, resources={"node": 0.02})
    def f1():
        sleep(10)

    @ray.remote(num_cpus=1)
    def f2():
        sleep(10)

    @ray.remote(resources={"node": 0.02})
    def f3():
        sleep(10)

    bundle1 = {"node": 0.02, "CPU": 1}
    bundle2 = {"CPU": 1}
    bundle3 = {"node": 0.02}

    pgs = []
    for _ in trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"):
        pg = placement_group(bundles=[bundle1, bundle2, bundle3])
        pgs.append(pg)

    for pg in tqdm(pgs, desc="Waiting for pgs to be ready"):
        ray.get(pg.ready())

    refs = []
    for pg in tqdm(pgs, desc="Scheduling tasks"):
        ref1 = f1.options(placement_group=pg).remote()
        ref2 = f2.options(placement_group=pg).remote()
        ref3 = f3.options(placement_group=pg).remote()
        refs.extend([ref1, ref2, ref3])

    for _ in trange(10, desc="Waiting"):
        sleep(1)

    with tqdm() as p_bar:
        while refs:
            done, refs = ray.wait(refs)
            p_bar.update()

    for pg in tqdm(pgs, desc="Cleaning up pgs"):
        remove_placement_group(pg)


ray.init(address="auto")

scale_to(TEST_NUM_NODES)
assert num_alive_nodes() == TEST_NUM_NODES, (
    f"Wrong number of nodes in cluster: {len(ray.nodes())}")

cluster_resources = ray.cluster_resources()

available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
    str(available_resources) + " != " + str(cluster_resources))
print("Done launching nodes")

actor_start = perf_counter()
test_max_actors()
actor_end = perf_counter()

sleep(1)
assert num_alive_nodes() == TEST_NUM_NODES, (
    f"Wrong number of nodes in cluster: {len(ray.nodes())}")
available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
    str(available_resources) + " != " + str(cluster_resources))
print("Done testing actors")

task_start = perf_counter()
test_max_running_tasks()
task_end = perf_counter()

sleep(1)
assert num_alive_nodes() == TEST_NUM_NODES, (
    f"Wrong number of nodes in cluster: {len(ray.nodes())}")
available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
    str(available_resources) + " != " + str(cluster_resources))
print("Done testing tasks")

pg_start = perf_counter()
test_many_placement_groups()
pg_end = perf_counter()

sleep(1)
assert num_alive_nodes() == TEST_NUM_NODES, (
    f"Wrong number of nodes in cluster: {len(ray.nodes())}")
available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
    str(available_resources) + " != " + str(cluster_resources))
print("Done testing placement groups")

launch_start = perf_counter()
test_nodes()
launch_end = perf_counter()

sleep(1)
assert num_alive_nodes() == MAX_NUM_NODES, (
    f"Wrong number of nodes in cluster: {len(ray.nodes())}")
print("Done.")

actor_time = actor_end - actor_start
task_time = task_end - task_start
pg_time = pg_end - pg_start
launch_time = launch_end - launch_start

print(f"Actor time: {actor_time} ({MAX_ACTORS_IN_CLUSTER} actors)")
print(f"Task time: {task_time} ({MAX_RUNNING_TASKS_IN_CLUSTER} tasks)")
print(f"PG time: {pg_time} ({MAX_PLACEMENT_GROUPS} placement groups)")
print(f"Node launch time: {launch_time} ({MAX_NUM_NODES} nodes)")
benchmarks/object_store/config.yaml (new file, 48 lines)
@@ -0,0 +1,48 @@

cluster_name: object-store-benchmarks
min_workers: 0
max_workers: 999999

upscaling_speed: 9999999

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a

auth:
    ssh_user: ubuntu

available_node_types:
    head_node:
        node_config:
            InstanceType: m4.4xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
        max_workers: 999999
    worker_node:
        node_config:
            InstanceType: m4.xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
        max_workers: 999999

head_node_type: head_node

worker_default_node_type: worker_node

setup_commands:
    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
    - pip install tqdm numpy

idle_timeout_minutes: 5

head_start_ray_commands:
    - ray stop
    - ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 1000000; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
benchmarks/object_store/test_object_store.py (new file, 61 lines)
@@ -0,0 +1,61 @@

import numpy as np

import ray
import ray.autoscaler.sdk

from time import sleep, perf_counter
from tqdm import tqdm

NUM_NODES = 50
OBJECT_SIZE = 2**30


def num_alive_nodes():
    n = 0
    for node in ray.nodes():
        if node["Alive"]:
            n += 1
    return n


def scale_to(target):
    while num_alive_nodes() != target:
        ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target)
        print(f"Current # nodes: {num_alive_nodes()}, target: {target}")
        print("Waiting ...")
        sleep(5)


def test_object_broadcast():
    scale_to(NUM_NODES)

    @ray.remote(num_cpus=1, resources={"node": 1})
    class Actor:
        def foo(self):
            pass

        def sum(self, arr):
            return np.sum(arr)

    actors = [Actor.remote() for _ in range(NUM_NODES)]

    arr = np.ones(OBJECT_SIZE, dtype=np.uint8)
    ref = ray.put(arr)

    for actor in tqdm(actors, desc="Ensure all actors have started."):
        ray.get(actor.foo.remote())

    result_refs = []
    for actor in tqdm(actors, desc="Broadcasting objects"):
        result_refs.append(actor.sum.remote(ref))

    results = ray.get(result_refs)
    for result in results:
        assert result == OBJECT_SIZE


ray.init(address="auto")
start = perf_counter()
test_object_broadcast()
end = perf_counter()
print(f"Broadcast time: {end - start} ({OBJECT_SIZE} B x {NUM_NODES} nodes)")
benchmarks/single_node/config.yaml (new file, 41 lines)
@@ -0,0 +1,41 @@

cluster_name: single-node-benchmarks
min_workers: 0
max_workers: 0

upscaling_speed: 9999999

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a

auth:
    ssh_user: ubuntu

available_node_types:
    head_node:
        node_config:
            InstanceType: m4.16xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
        max_workers: 999999
    worker_node:
        node_config:
            InstanceType: m4.xlarge
            ImageId: ami-098555c9b343eb09c

head_node_type: head_node

worker_default_node_type: worker_node

setup_commands:
    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
    - pip install numpy tqdm
    - sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1000000" >> /etc/security/limits.conf; echo "* hard nofile 1000000" >> /etc/security/limits.conf;'

idle_timeout_minutes: 5

head_start_ray_commands:
    - ray stop
    - ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --object-store-memory=128000000000 --autoscaling-config=~/ray_bootstrap_config.yaml
benchmarks/single_node/test_single_node.py (new file, 175 lines)
@@ -0,0 +1,175 @@

import numpy as np
import ray
import ray.autoscaler.sdk
from ray.test_utils import Semaphore

from time import perf_counter
from tqdm import trange, tqdm

MAX_ARGS = 10000
MAX_RETURNS = 3000
MAX_RAY_GET_ARGS = 10000
MAX_QUEUED_TASKS = 1_000_000
MAX_RAY_GET_SIZE = 100 * 2**30


def test_many_args():
    @ray.remote
    def sum_args(*args):
        return sum(sum(arg) for arg in args)

    args = [[1 for _ in range(10000)] for _ in range(MAX_ARGS)]
    result = ray.get(sum_args.remote(*args))
    assert result == MAX_ARGS * 10000


def test_many_returns():
    @ray.remote(num_returns=MAX_RETURNS)
    def f():
        to_return = []
        for _ in range(MAX_RETURNS):
            obj = list(range(10000))
            to_return.append(obj)

        return tuple(to_return)

    returned_refs = f.remote()
    assert len(returned_refs) == MAX_RETURNS

    for ref in returned_refs:
        expected = list(range(10000))
        obj = ray.get(ref)
        assert obj == expected


def test_ray_get_args():
    def with_dese():
        print("Putting test objects:")
        refs = []
        for _ in trange(MAX_RAY_GET_ARGS):
            obj = list(range(10000))
            refs.append(ray.put(obj))

        print("Getting objects")
        results = ray.get(refs)
        assert len(results) == MAX_RAY_GET_ARGS

        print("Asserting correctness")
        for obj in tqdm(results):
            expected = list(range(10000))
            assert obj == expected

    def with_zero_copy():
        print("Putting test objects:")
        refs = []
        for _ in trange(MAX_RAY_GET_ARGS):
            obj = np.arange(10000)
            refs.append(ray.put(obj))

        print("Getting objects")
        results = ray.get(refs)
        assert len(results) == MAX_RAY_GET_ARGS

        print("Asserting correctness")
        for obj in tqdm(results):
            expected = np.arange(10000)
            assert (obj == expected).all()

    with_dese()
    print("Done with dese")
    with_zero_copy()
    print("Done with zero copy")


def test_many_queued_tasks():
    sema = Semaphore.remote(0)

    @ray.remote(num_cpus=1)
    def block():
        ray.get(sema.acquire.remote())

    @ray.remote(num_cpus=1)
    def f():
        pass

    num_cpus = int(ray.cluster_resources()["CPU"])
    blocked_tasks = []
    for _ in range(num_cpus):
        blocked_tasks.append(block.remote())

    print("Submitting many tasks")
    pending_tasks = []
    for _ in trange(MAX_QUEUED_TASKS):
        pending_tasks.append(f.remote())

    # Make sure all the tasks can actually run.
    for _ in range(num_cpus):
        sema.release.remote()

    print("Unblocking tasks")
    for ref in tqdm(pending_tasks):
        assert ray.get(ref) is None


def test_large_object():
    print("Generating object")
    obj = np.zeros(MAX_RAY_GET_SIZE, dtype=np.int8)
    print("Putting object")
    ref = ray.put(obj)
    del obj
    print("Getting object")
    big_obj = ray.get(ref)

    assert big_obj[0] == 0
    assert big_obj[-1] == 0


ray.init(address="auto")

args_start = perf_counter()
test_many_args()
args_end = perf_counter()

assert ray.cluster_resources() == ray.available_resources()
print("Finished many args")

returns_start = perf_counter()
test_many_returns()
returns_end = perf_counter()

assert ray.cluster_resources() == ray.available_resources()
print("Finished many returns")

get_start = perf_counter()
test_ray_get_args()
get_end = perf_counter()

assert ray.cluster_resources() == ray.available_resources()
print("Finished ray.get on many objects")

queued_start = perf_counter()
test_many_queued_tasks()
queued_end = perf_counter()

assert ray.cluster_resources() == ray.available_resources()
print("Finished queueing many tasks")

large_object_start = perf_counter()
test_large_object()
large_object_end = perf_counter()

assert ray.cluster_resources() == ray.available_resources()
print("Done")

args_time = args_end - args_start
returns_time = returns_end - returns_start
get_time = get_end - get_start
queued_time = queued_end - queued_start
large_object_time = large_object_end - large_object_start

print(f"Many args time: {args_time} ({MAX_ARGS} args)")
print(f"Many returns time: {returns_time} ({MAX_RETURNS} returns)")
print(f"Ray.get time: {get_time} ({MAX_RAY_GET_ARGS} args)")
print(f"Queued task time: {queued_time} ({MAX_QUEUED_TASKS} tasks)")
print(f"Ray.get large object time: {large_object_time} "
      f"({MAX_RAY_GET_SIZE} bytes)")
@@ -134,7 +134,13 @@ is generally the easiest way to run release tests.

 The summaries printed by each test should be checked in under
 ``release_logs/<version>`` on the **master** branch (make a pull request).

-5. **ASAN tests**
+5. **Scalability envelope tests**
+
+- Run the tests in `benchmarks/` (with `ray submit --start cluster.yaml <test file>`).
+- Record the reported times.
+- Whether the results are acceptable is a judgment call.
+
+6. **ASAN tests**

 Run the ``ci/asan_tests`` with the commit. This will enable the ASAN build and run the whole Python test suite to detect memory leaks.