mirror of
https://github.com/vale981/ray
synced 2025-03-06 18:41:40 -05:00
[serve] Add basic upscaling test using cluster_utils (#15044)
This commit is contained in:
parent
70f45af541
commit
126b9a6c14
3 changed files with 120 additions and 1 deletions
|
@ -183,6 +183,14 @@ py_test(
|
|||
deps = [":serve_lib"],
|
||||
)
|
||||
|
||||
# Bazel target for the Serve cluster upscaling tests (test_cluster.py).
# "medium" size and the "exclusive" tag keep it from sharing a machine with
# other tests, since it starts its own multi-node Ray cluster.
py_test(
    name = "test_cluster",
    size = "medium",
    srcs = serve_tests_srcs,
    tags = ["exclusive"],
    deps = [":serve_lib"],
)
|
||||
|
||||
py_test(
|
||||
name = "test_util",
|
||||
size = "small",
|
||||
|
|
|
@ -1166,7 +1166,7 @@ def make_deployment_cls(
|
|||
_ray_actor_options = ray_actor_options
|
||||
|
||||
@classmethod
|
||||
def deploy(cls, *init_args):
|
||||
def deploy(cls, *init_args, _blocking=True):
|
||||
"""Deploy this deployment.
|
||||
|
||||
Args:
|
||||
|
@ -1183,6 +1183,7 @@ def make_deployment_cls(
|
|||
ray_actor_options=cls._ray_actor_options,
|
||||
config=cls._config,
|
||||
version=cls._version,
|
||||
_blocking=_blocking,
|
||||
_internal=True)
|
||||
|
||||
@classmethod
|
||||
|
|
110
python/ray/serve/tests/test_cluster.py
Normal file
110
python/ray/serve/tests/test_cluster.py
Normal file
|
@ -0,0 +1,110 @@
|
|||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
import ray
|
||||
from ray import serve
|
||||
from ray.cluster_utils import Cluster
|
||||
|
||||
|
||||
@pytest.fixture
def ray_cluster():
    """Yield a fresh ``Cluster`` and tear everything down after the test.

    Teardown order matters: shut down Serve first, then the driver's Ray
    connection, then the cluster processes themselves.
    """
    cluster = Cluster()
    # Bug fix: the original did ``yield Cluster()``, handing the test a
    # second, brand-new cluster object. The test then added nodes to an
    # object that this teardown never touched, while the ``cluster`` created
    # above leaked. Yield the same instance that gets shut down below.
    yield cluster
    serve.shutdown()
    ray.shutdown()
    cluster.shutdown()
|
||||
|
||||
|
||||
def test_scale_up(ray_cluster):
    """Replicas of a deployment get placed as CPU-bearing nodes join.

    Starts on a single head node, requests more replicas than can fit,
    and checks that each added 1-CPU node lets exactly one more replica
    get scheduled (observed via distinct replica PIDs over HTTP).
    """
    cluster = ray_cluster
    # 3 CPUs on the head: presumably enough for Serve's system actors plus
    # one replica, but not for three — TODO confirm against Serve's CPU
    # requirements.
    head_node = cluster.add_node(num_cpus=3)

    @serve.deployment("D", version="1", num_replicas=1)
    def D(*args):
        # Each replica reports its own process ID, so distinct PIDs in the
        # responses count distinct replicas.
        return os.getpid()

    def get_pids(expected, timeout=30):
        # Poll the endpoint until `expected` distinct replica PIDs have
        # responded; raise TimeoutError after `timeout` seconds.
        pids = set()
        start = time.time()
        while len(pids) < expected:
            pids.add(requests.get("http://localhost:8000/D").text)
            if time.time() - start >= timeout:
                raise TimeoutError("Timed out waiting for pids.")
        return pids

    ray.init(head_node.address)
    serve.start(detached=True)
    client = serve.connect()

    D.deploy()
    pids1 = get_pids(1)

    # Ask for 3 replicas without blocking. No capacity for the extra two
    # yet, so the goal must not complete and traffic still hits only the
    # original replica.
    goal_ref = D.options(num_replicas=3).deploy(_blocking=False)
    assert not client._wait_for_goal(goal_ref, timeout=0.1)
    assert get_pids(1) == pids1

    # Add a node with another CPU, another replica should get placed.
    cluster.add_node(num_cpus=1)
    # Still one replica short of the goal, so it must remain incomplete.
    assert not client._wait_for_goal(goal_ref, timeout=0.1)
    pids2 = get_pids(2)
    # Scaling up must not restart the existing replica.
    assert pids1.issubset(pids2)

    # Add a node with another CPU, the final replica should get placed
    # and the deploy goal should be done.
    cluster.add_node(num_cpus=1)
    assert client._wait_for_goal(goal_ref)
    pids3 = get_pids(3)
    assert pids2.issubset(pids3)
|
||||
|
||||
|
||||
@pytest.mark.skip("Currently hangs due to max_task_retries=-1.")
def test_node_failure(ray_cluster):
    """Replicas surviving a node loss keep serving; lost ones are re-placed
    as replacement nodes join.

    NOTE(review): skipped upstream — see the skip reason above.
    """
    cluster = ray_cluster
    cluster.add_node(num_cpus=3)
    # Second node holds (presumably) two of the three replicas, so removing
    # it should leave exactly one running — TODO confirm placement is not
    # merely likely but guaranteed here.
    worker_node = cluster.add_node(num_cpus=2)

    @serve.deployment("D", version="1", num_replicas=3)
    def D(*args):
        # Each replica reports its own PID; distinct PIDs in responses
        # count distinct replicas.
        return os.getpid()

    def get_pids(expected, timeout=30):
        # Poll the endpoint until `expected` distinct replica PIDs have
        # responded; raise TimeoutError after `timeout` seconds.
        pids = set()
        start = time.time()
        while len(pids) < expected:
            pids.add(requests.get("http://localhost:8000/D").text)
            if time.time() - start >= timeout:
                raise TimeoutError("Timed out waiting for pids.")
        return pids

    ray.init(cluster.address)
    serve.start(detached=True)

    print("Initial deploy.")
    D.deploy()
    pids1 = get_pids(3)

    # Remove the node. There should still be one replica running.
    print("Kill node.")
    cluster.remove_node(worker_node)
    pids2 = get_pids(1)
    # The survivor must be one of the original replicas, not a restart.
    assert pids2.issubset(pids1)

    # Add a worker node back. One replica should get placed.
    print("Add back first node.")
    cluster.add_node(num_cpus=1)
    pids3 = get_pids(2)
    assert pids2.issubset(pids3)

    # Add another worker node. One more replica should get placed.
    print("Add back second node.")
    cluster.add_node(num_cpus=1)
    pids4 = get_pids(3)
    assert pids3.issubset(pids4)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this file directly; propagate pytest's exit status.
    sys.exit(pytest.main(["-v", "-s", __file__]))
|
Loading…
Add table
Reference in a new issue