Mirror of https://github.com/vale981/ray (synced 2025-03-06 02:21:39 -05:00)
Scalability Envelope Tests (#13464)
Parent: f2867b0609
Commit: 840987c7af

8 changed files with 629 additions and 1 deletion
benchmarks/README.md (new file, 35 lines)
@@ -0,0 +1,35 @@

# Ray Scalability Envelope

### Note: This document is a WIP. This is not a scalability guarantee (yet).

## Distributed Benchmarks

All distributed tests are run on 64 nodes with 64 cores each. The maximum node count is reached by adding 4-core nodes.

| Dimension | Quantity |
| --------- | -------- |
| # nodes in cluster (with trivial task workload) | 250+ |
| # actors in cluster (with trivial workload) | 10k+ |
| # simultaneously running tasks | 10k+ |
| # simultaneously running placement groups | 1k+ |

## Object Store Benchmarks

| Dimension | Quantity |
| --------- | -------- |
| 1 GiB object broadcast (# of nodes) | 50+ |

## Single Node Benchmarks

All single-node benchmarks are run on a single m4.16xlarge instance.

| Dimension | Quantity |
| --------- | -------- |
| # of object arguments to a single task | 10,000+ |
| # of objects returned from a single task | 3,000+ |
| # of plasma objects in a single `ray.get` call | 10,000+ |
| # of tasks queued on a single node | 1,000,000+ |
| Maximum `ray.get` numpy object size | 100 GiB+ |
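The dimensions above map directly onto the benchmark scripts added in this commit. As a rough illustration, the "trivial task workload" used for the node-count and running-task rows boils down to fanning out no-op remote tasks, roughly like the sketch below (a simplified version of `test_max_running_tasks` in `benchmarks/distributed/test_distributed.py`; it assumes a cluster is already running):

```python
import ray

ray.init(address="auto")

# A trivial task; a fractional num_cpus lets several run per core.
@ray.remote(num_cpus=0.25)
def noop():
    pass

# Fan out many no-op tasks across the cluster, then wait for all of them.
refs = [noop.remote() for _ in range(10_000)]
ray.get(refs)
```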
benchmarks/distributed/config.yaml (new file, 58 lines)
@@ -0,0 +1,58 @@

cluster_name: distributed-benchmarks
min_workers: 0
max_workers: 999999

upscaling_speed: 9999999

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a, us-west-2b, us-west-2c, us-west-2d

auth:
    ssh_user: ubuntu

available_node_types:
    head_node:
        node_config:
            InstanceType: m5.16xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
            small: 1
        max_workers: 999999
    worker_node:
        node_config:
            InstanceType: m5.16xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
        min_workers: 63
        max_workers: 63
    small_worker_node:
        node_config:
            InstanceType: m5.xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
        max_workers: 999999

head_node_type: head_node

worker_default_node_type: worker_node

setup_commands:
    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
    - pip install tqdm
    - sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 65535" >> /etc/security/limits.conf; echo "* hard nofile 65535" >> /etc/security/limits.conf;'

idle_timeout_minutes: 1

head_start_ray_commands:
    - ray stop
    - ulimit -n 65535; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65535; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
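The `node: 1` entry under each node type's `resources` is a custom, user-defined Ray resource rather than a built-in one; the benchmark scripts use it to request an exact number of machines from the autoscaler and to place work on distinct nodes. A minimal sketch of how such a custom resource is consumed follows (the target of 64 nodes and the helper task are illustrative, not part of the benchmarks):

```python
import socket

import ray
import ray.autoscaler.sdk

ray.init(address="auto")

# Ask the autoscaler for 64 machines by requesting 64 bundles of the
# custom "node" resource; each node type in the config provides exactly one.
ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * 64)

# Requiring a whole unit of "node" means at most one copy of this task
# can run on any given machine at a time.
@ray.remote(resources={"node": 1})
def where_am_i():
    return socket.gethostname()

print(ray.get([where_am_i.remote() for _ in range(4)]))
```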
benchmarks/distributed/test_distributed.py (new file, 204 lines)
@@ -0,0 +1,204 @@

import ray
import ray.autoscaler.sdk
from ray.test_utils import Semaphore
from ray.util.placement_group import placement_group, remove_placement_group

from time import sleep, perf_counter
from tqdm import tqdm, trange

TEST_NUM_NODES = 64
MAX_ACTORS_IN_CLUSTER = 10000
MAX_RUNNING_TASKS_IN_CLUSTER = 10000
MAX_PLACEMENT_GROUPS = 1000
MAX_NUM_NODES = 250


def num_alive_nodes():
    n = 0
    for node in ray.nodes():
        if node["Alive"]:
            n += 1
    return n


def scale_to(target):
    while num_alive_nodes() != target:
        ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target)
        print(f"Current # nodes: {num_alive_nodes()}, target: {target}")
        print("Waiting ...")
        sleep(5)


def test_nodes():
    scale_to(MAX_NUM_NODES)
    assert num_alive_nodes() == MAX_NUM_NODES
    # Treat this as a trivial task workload to ensure the nodes are all
    # functioning.
    test_max_running_tasks()


def test_max_actors():
    # TODO (Alex): Dynamically set this based on number of cores
    cpus_per_actor = 0.25

    @ray.remote(num_cpus=cpus_per_actor)
    class Actor:
        def foo(self):
            pass

    actors = [
        Actor.remote()
        for _ in trange(MAX_ACTORS_IN_CLUSTER, desc="Launching actors")
    ]

    for actor in tqdm(actors, desc="Ensuring actors have started"):
        assert ray.get(actor.foo.remote()) is None


def test_max_running_tasks():
    counter = Semaphore.remote(0)
    blocker = Semaphore.remote(0)

    @ray.remote(num_cpus=0.25)
    def task(counter, blocker):
        sleep(300)

    refs = [
        task.remote(counter, blocker)
        for _ in trange(MAX_RUNNING_TASKS_IN_CLUSTER, desc="Launching tasks")
    ]

    max_cpus = ray.cluster_resources()["CPU"]
    min_cpus_available = max_cpus
    for _ in trange(int(300 / 0.1), desc="Waiting"):
        try:
            cur_cpus = ray.available_resources().get("CPU", 0)
            min_cpus_available = min(min_cpus_available, cur_cpus)
        except Exception:
            # There are race conditions: `.get` can fail if a new heartbeat
            # arrives at the same time.
            pass
        sleep(0.1)

    # There are some relevant magic numbers in this check. 10k tasks each
    # require 1/4 CPU, so ideally 2.5k CPUs will be in use at the peak.
    err_str = f"Only {max_cpus - min_cpus_available}/{max_cpus} cpus used."
    assert max_cpus - min_cpus_available > 2000, err_str

    for _ in trange(
            MAX_RUNNING_TASKS_IN_CLUSTER,
            desc="Ensuring all tasks have finished"):
        done, refs = ray.wait(refs)
        assert ray.get(done[0]) is None


def test_many_placement_groups():
    @ray.remote(num_cpus=1, resources={"node": 0.02})
    def f1():
        sleep(10)

    @ray.remote(num_cpus=1)
    def f2():
        sleep(10)

    @ray.remote(resources={"node": 0.02})
    def f3():
        sleep(10)

    bundle1 = {"node": 0.02, "CPU": 1}
    bundle2 = {"CPU": 1}
    bundle3 = {"node": 0.02}

    pgs = []
    for _ in trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"):
        pg = placement_group(bundles=[bundle1, bundle2, bundle3])
        pgs.append(pg)

    for pg in tqdm(pgs, desc="Waiting for pgs to be ready"):
        ray.get(pg.ready())

    refs = []
    for pg in tqdm(pgs, desc="Scheduling tasks"):
        ref1 = f1.options(placement_group=pg).remote()
        ref2 = f2.options(placement_group=pg).remote()
        ref3 = f3.options(placement_group=pg).remote()
        refs.extend([ref1, ref2, ref3])

    for _ in trange(10, desc="Waiting"):
        sleep(1)

    with tqdm() as p_bar:
        while refs:
            done, refs = ray.wait(refs)
            p_bar.update()

    for pg in tqdm(pgs, desc="Cleaning up pgs"):
        remove_placement_group(pg)


ray.init(address="auto")

scale_to(TEST_NUM_NODES)
assert num_alive_nodes() == TEST_NUM_NODES, (
    f"Wrong number of nodes in cluster: {len(ray.nodes())}")

cluster_resources = ray.cluster_resources()

available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
    str(available_resources) + " != " + str(cluster_resources))
print("Done launching nodes")

actor_start = perf_counter()
test_max_actors()
actor_end = perf_counter()

sleep(1)
assert num_alive_nodes() == TEST_NUM_NODES, (
    f"Wrong number of nodes in cluster: {len(ray.nodes())}")
available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
    str(available_resources) + " != " + str(cluster_resources))
print("Done testing actors")

task_start = perf_counter()
test_max_running_tasks()
task_end = perf_counter()

sleep(1)
assert num_alive_nodes() == TEST_NUM_NODES, (
    f"Wrong number of nodes in cluster: {len(ray.nodes())}")
available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
    str(available_resources) + " != " + str(cluster_resources))
print("Done testing tasks")

pg_start = perf_counter()
test_many_placement_groups()
pg_end = perf_counter()

sleep(1)
assert num_alive_nodes() == TEST_NUM_NODES, (
    f"Wrong number of nodes in cluster: {len(ray.nodes())}")
available_resources = ray.available_resources()
assert available_resources == cluster_resources, (
    str(available_resources) + " != " + str(cluster_resources))
print("Done testing placement groups")

launch_start = perf_counter()
test_nodes()
launch_end = perf_counter()

sleep(1)
assert num_alive_nodes() == MAX_NUM_NODES, (
    f"Wrong number of nodes in cluster: {len(ray.nodes())}")
print("Done.")

actor_time = actor_end - actor_start
task_time = task_end - task_start
pg_time = pg_end - pg_start
launch_time = launch_end - launch_start

print(f"Actor time: {actor_time} ({MAX_ACTORS_IN_CLUSTER} actors)")
print(f"Task time: {task_time} ({MAX_RUNNING_TASKS_IN_CLUSTER} tasks)")
print(f"PG time: {pg_time} ({MAX_PLACEMENT_GROUPS} placement groups)")
print(f"Node launch time: {launch_time} ({MAX_NUM_NODES} nodes)")
benchmarks/object_store/config.yaml (new file, 48 lines)
@@ -0,0 +1,48 @@

cluster_name: object-store-benchmarks
min_workers: 0
max_workers: 999999

upscaling_speed: 9999999

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a

auth:
    ssh_user: ubuntu

available_node_types:
    head_node:
        node_config:
            InstanceType: m4.4xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
        max_workers: 999999
    worker_node:
        node_config:
            InstanceType: m4.xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
        max_workers: 999999

head_node_type: head_node

worker_default_node_type: worker_node

setup_commands:
    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
    - pip install tqdm numpy

idle_timeout_minutes: 5

head_start_ray_commands:
    - ray stop
    - ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 1000000; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
benchmarks/object_store/test_object_store.py (new file, 61 lines)
@@ -0,0 +1,61 @@

import numpy as np

import ray
import ray.autoscaler.sdk

from time import sleep, perf_counter
from tqdm import tqdm

NUM_NODES = 50
OBJECT_SIZE = 2**30


def num_alive_nodes():
    n = 0
    for node in ray.nodes():
        if node["Alive"]:
            n += 1
    return n


def scale_to(target):
    while num_alive_nodes() != target:
        ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target)
        print(f"Current # nodes: {num_alive_nodes()}, target: {target}")
        print("Waiting ...")
        sleep(5)


def test_object_broadcast():
    scale_to(NUM_NODES)

    @ray.remote(num_cpus=1, resources={"node": 1})
    class Actor:
        def foo(self):
            pass

        def sum(self, arr):
            return np.sum(arr)

    actors = [Actor.remote() for _ in range(NUM_NODES)]

    arr = np.ones(OBJECT_SIZE, dtype=np.uint8)
    ref = ray.put(arr)

    for actor in tqdm(actors, desc="Ensure all actors have started."):
        ray.get(actor.foo.remote())

    result_refs = []
    for actor in tqdm(actors, desc="Broadcasting objects"):
        result_refs.append(actor.sum.remote(ref))

    results = ray.get(result_refs)
    for result in results:
        assert result == OBJECT_SIZE


ray.init(address="auto")
start = perf_counter()
test_object_broadcast()
end = perf_counter()
print(f"Broadcast time: {end - start} ({OBJECT_SIZE} B x {NUM_NODES} nodes)")
benchmarks/single_node/config.yaml (new file, 41 lines)
@@ -0,0 +1,41 @@

cluster_name: single-node-benchmarks
min_workers: 0
max_workers: 0

upscaling_speed: 9999999

provider:
    type: aws
    region: us-west-2
    availability_zone: us-west-2a

auth:
    ssh_user: ubuntu

available_node_types:
    head_node:
        node_config:
            InstanceType: m4.16xlarge
            ImageId: ami-098555c9b343eb09c
        resources:
            node: 1
        max_workers: 999999
    worker_node:
        node_config:
            InstanceType: m4.xlarge
            ImageId: ami-098555c9b343eb09c

head_node_type: head_node

worker_default_node_type: worker_node

setup_commands:
    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
    - pip install numpy tqdm
    - sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1000000" >> /etc/security/limits.conf; echo "* hard nofile 1000000" >> /etc/security/limits.conf;'

idle_timeout_minutes: 5

head_start_ray_commands:
    - ray stop
    - ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --object-store-memory=128000000000 --autoscaling-config=~/ray_bootstrap_config.yaml
benchmarks/single_node/test_single_node.py (new file, 175 lines)
@@ -0,0 +1,175 @@

import numpy as np
import ray
import ray.autoscaler.sdk
from ray.test_utils import Semaphore

from time import perf_counter
from tqdm import trange, tqdm

MAX_ARGS = 10000
MAX_RETURNS = 3000
MAX_RAY_GET_ARGS = 10000
MAX_QUEUED_TASKS = 1_000_000
MAX_RAY_GET_SIZE = 100 * 2**30


def test_many_args():
    @ray.remote
    def sum_args(*args):
        return sum(sum(arg) for arg in args)

    args = [[1 for _ in range(10000)] for _ in range(MAX_ARGS)]
    result = ray.get(sum_args.remote(*args))
    assert result == MAX_ARGS * 10000


def test_many_returns():
    @ray.remote(num_returns=MAX_RETURNS)
    def f():
        to_return = []
        for _ in range(MAX_RETURNS):
            obj = list(range(10000))
            to_return.append(obj)

        return tuple(to_return)

    returned_refs = f.remote()
    assert len(returned_refs) == MAX_RETURNS

    for ref in returned_refs:
        expected = list(range(10000))
        obj = ray.get(ref)
        assert obj == expected


def test_ray_get_args():
    def with_dese():
        print("Putting test objects:")
        refs = []
        for _ in trange(MAX_RAY_GET_ARGS):
            obj = list(range(10000))
            refs.append(ray.put(obj))

        print("Getting objects")
        results = ray.get(refs)
        assert len(results) == MAX_RAY_GET_ARGS

        print("Asserting correctness")
        for obj in tqdm(results):
            expected = list(range(10000))
            assert obj == expected

    def with_zero_copy():
        print("Putting test objects:")
        refs = []
        for _ in trange(MAX_RAY_GET_ARGS):
            obj = np.arange(10000)
            refs.append(ray.put(obj))

        print("Getting objects")
        results = ray.get(refs)
        assert len(results) == MAX_RAY_GET_ARGS

        print("Asserting correctness")
        for obj in tqdm(results):
            expected = np.arange(10000)
            assert (obj == expected).all()

    with_dese()
    print("Done with dese")
    with_zero_copy()
    print("Done with zero copy")


def test_many_queued_tasks():
    sema = Semaphore.remote(0)

    @ray.remote(num_cpus=1)
    def block():
        ray.get(sema.acquire.remote())

    @ray.remote(num_cpus=1)
    def f():
        pass

    num_cpus = int(ray.cluster_resources()["CPU"])
    blocked_tasks = []
    for _ in range(num_cpus):
        blocked_tasks.append(block.remote())

    print("Submitting many tasks")
    pending_tasks = []
    for _ in trange(MAX_QUEUED_TASKS):
        pending_tasks.append(f.remote())

    # Make sure all the tasks can actually run.
    for _ in range(num_cpus):
        sema.release.remote()

    print("Unblocking tasks")
    for ref in tqdm(pending_tasks):
        assert ray.get(ref) is None


def test_large_object():
    print("Generating object")
    obj = np.zeros(MAX_RAY_GET_SIZE, dtype=np.int8)
    print("Putting object")
    ref = ray.put(obj)
    del obj
    print("Getting object")
    big_obj = ray.get(ref)

    assert big_obj[0] == 0
    assert big_obj[-1] == 0


ray.init(address="auto")

args_start = perf_counter()
test_many_args()
args_end = perf_counter()

assert ray.cluster_resources() == ray.available_resources()
print("Finished many args")

returns_start = perf_counter()
test_many_returns()
returns_end = perf_counter()

assert ray.cluster_resources() == ray.available_resources()
print("Finished many returns")

get_start = perf_counter()
test_ray_get_args()
get_end = perf_counter()

assert ray.cluster_resources() == ray.available_resources()
print("Finished ray.get on many objects")

queued_start = perf_counter()
test_many_queued_tasks()
queued_end = perf_counter()

assert ray.cluster_resources() == ray.available_resources()
print("Finished queueing many tasks")

large_object_start = perf_counter()
test_large_object()
large_object_end = perf_counter()

assert ray.cluster_resources() == ray.available_resources()
print("Done")

args_time = args_end - args_start
returns_time = returns_end - returns_start
get_time = get_end - get_start
queued_time = queued_end - queued_start
large_object_time = large_object_end - large_object_start

print(f"Many args time: {args_time} ({MAX_ARGS} args)")
print(f"Many returns time: {returns_time} ({MAX_RETURNS} returns)")
print(f"Ray.get time: {get_time} ({MAX_RAY_GET_ARGS} args)")
print(f"Queued task time: {queued_time} ({MAX_QUEUED_TASKS} tasks)")
print(f"Ray.get large object time: {large_object_time} "
      f"({MAX_RAY_GET_SIZE} bytes)")
@@ -134,7 +134,13 @@ is generally the easiest way to run release tests.

 The summaries printed by each test should be checked in under
 ``release_logs/<version>`` on the **master** branch (make a pull request).

-5. **ASAN tests**
+5. **Scalability envelope tests**
+
+- Run the tests in `benchmarks/` (with `ray submit --start cluster.yaml <test file>`).
+- Record the reported times.
+- Whether the results are acceptable is a judgment call.
+
+6. **ASAN tests**

 Run the ``ci/asan_tests`` with the commit. This will enable the ASAN build and run the whole Python test suite to detect memory leaks.