diff --git a/python/ray/serve/BUILD b/python/ray/serve/BUILD index 50e3be10c..98d400d0f 100644 --- a/python/ray/serve/BUILD +++ b/python/ray/serve/BUILD @@ -224,7 +224,7 @@ py_test( py_test( name = "test_cluster", - size = "small", + size = "medium", srcs = serve_tests_srcs, tags = ["exclusive", "team:serve"], deps = [":serve_lib"], diff --git a/python/ray/serve/tests/test_cluster.py b/python/ray/serve/tests/test_cluster.py index b76ce793b..d23556e20 100644 --- a/python/ray/serve/tests/test_cluster.py +++ b/python/ray/serve/tests/test_cluster.py @@ -126,16 +126,16 @@ def test_replica_startup_status_transitions(ray_cluster): signal = SignalActor.remote() @serve.deployment(version="1", ray_actor_options={"num_cpus": 2}) - class D: + class E: def __init__(self): ray.get(signal.wait.remote()) - D.deploy(_blocking=False) + E.deploy(_blocking=False) def get_replicas(replica_state): controller = serve_instance._controller replicas = ray.get( - controller._dump_replica_states_for_testing.remote(D.name)) + controller._dump_replica_states_for_testing.remote(E.name)) return replicas.get([replica_state]) # wait for serve to start the replica, and catch a reference to it. @@ -152,8 +152,15 @@ def test_replica_startup_status_transitions(ray_cluster): # add the necessary resources to allocate the replica cluster.add_node(num_cpus=4) - wait_for_condition( - lambda: (replica.check_started() == PENDING_INITIALIZATION)) + wait_for_condition(lambda: (ray.cluster_resources().get("CPU", 0) >= 4)) + wait_for_condition(lambda: (ray.available_resources().get("CPU", 0) >= 2)) + + def is_replica_pending_initialization(): + status = replica.check_started() + print(status) + return status == PENDING_INITIALIZATION + + wait_for_condition(is_replica_pending_initialization) # send signal to complete replica intialization signal.send.remote()