[Serve] Use "serve" namespace during controller recovery (#25987)

This commit is contained in:
shrekris-anyscale 2022-06-22 16:08:07 -07:00 committed by GitHub
parent 67140f2d26
commit 4d8a82bdf6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 39 additions and 2 deletions

View file

@ -28,6 +28,7 @@ from ray.serve.constants import (
CONTROL_LOOP_PERIOD_S,
SERVE_LOGGER_NAME,
SERVE_ROOT_URL_ENV_KEY,
SERVE_NAMESPACE,
)
from ray.serve.deployment_state import DeploymentStateManager, ReplicaState
from ray.serve.endpoint_state import EndpointState
@ -110,15 +111,22 @@ class ServeController:
http_config,
)
self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
# Fetch all running actors in current cluster as source of current
# replica state for controller failure recovery
all_current_actor_names = ray.util.list_named_actors()
all_current_actors = ray.util.list_named_actors(all_namespaces=True)
all_serve_actor_names = [
actor["name"]
for actor in all_current_actors
if actor["namespace"] == SERVE_NAMESPACE
]
self.deployment_state_manager = DeploymentStateManager(
controller_name,
detached,
self.kv_store,
self.long_poll_host,
all_current_actor_names,
all_serve_actor_names,
)
# Reference to Ray task executing most recent deployment request

View file

@ -672,5 +672,34 @@ def test_serve_start_different_http_checkpoint_options_warning(caplog):
ray.shutdown()
def test_recovering_controller_no_redeploy():
    """Ensure controller doesn't redeploy running deployments when recovering.

    Connects a driver in namespace "x", starts a detached Serve instance,
    deploys one function, then kills the controller with restart allowed.
    Once a controller with a different PID is answering, the total number of
    named actors (across all namespaces) must not grow.

    Serve and Ray are shut down in ``finally`` blocks so that a failure in the
    middle of the test does not leak a running cluster into later tests.
    """
    ray.init(namespace="x")
    try:
        client = serve.start(detached=True)
        try:

            @serve.deployment
            def f():
                pass

            f.deploy()

            # Baseline: named actors across every namespace before the kill.
            num_actors = len(ray.util.list_named_actors(all_namespaces=True))
            pid = ray.get(client._controller.get_pid.remote())

            # no_restart=False asks Ray not to suppress the actor's restart.
            ray.kill(client._controller, no_restart=False)

            # Wait until the controller handle reports a different PID.
            wait_for_condition(
                lambda: ray.get(client._controller.get_pid.remote()) != pid
            )

            # Confirm that no new named actor shows up over the next 5
            # seconds. The test expects wait_for_condition to raise
            # RuntimeError when its condition never becomes true.
            with pytest.raises(RuntimeError):
                wait_for_condition(
                    lambda: len(ray.util.list_named_actors(all_namespaces=True))
                    > num_actors,
                    timeout=5,
                )
        finally:
            serve.shutdown()
    finally:
        ray.shutdown()
if __name__ == "__main__":
    # Run this file's tests verbosely with output capture disabled, and use
    # pytest's return value as the process exit status.
    exit_code = pytest.main(["-v", "-s", __file__])
    sys.exit(exit_code)