From 4d8a82bdf6f0f11c17b6a7b987fcbce6cbeb1bbf Mon Sep 17 00:00:00 2001 From: shrekris-anyscale <92341594+shrekris-anyscale@users.noreply.github.com> Date: Wed, 22 Jun 2022 16:08:07 -0700 Subject: [PATCH] [Serve] Use `"serve"` namespace during controller recovery (#25987) --- python/ray/serve/controller.py | 12 ++++++++-- python/ray/serve/tests/test_standalone.py | 29 +++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index 1d2845398..001ab0652 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -28,6 +28,7 @@ from ray.serve.constants import ( CONTROL_LOOP_PERIOD_S, SERVE_LOGGER_NAME, SERVE_ROOT_URL_ENV_KEY, + SERVE_NAMESPACE, ) from ray.serve.deployment_state import DeploymentStateManager, ReplicaState from ray.serve.endpoint_state import EndpointState @@ -110,15 +111,22 @@ class ServeController: http_config, ) self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host) + # Fetch all running actors in current cluster as source of current # replica state for controller failure recovery - all_current_actor_names = ray.util.list_named_actors() + all_current_actors = ray.util.list_named_actors(all_namespaces=True) + all_serve_actor_names = [ + actor["name"] + for actor in all_current_actors + if actor["namespace"] == SERVE_NAMESPACE + ] + self.deployment_state_manager = DeploymentStateManager( controller_name, detached, self.kv_store, self.long_poll_host, - all_current_actor_names, + all_serve_actor_names, ) # Reference to Ray task executing most recent deployment request diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 4f65cc1f2..90c1f0dc6 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -672,5 +672,34 @@ def test_serve_start_different_http_checkpoint_options_warning(caplog): ray.shutdown() +def 
test_recovering_controller_no_redeploy(): + """Ensure controller doesn't redeploy running deployments when recovering.""" + ray.init(namespace="x") + client = serve.start(detached=True) + + @serve.deployment + def f(): + pass + + f.deploy() + + num_actors = len(ray.util.list_named_actors(all_namespaces=True)) + pid = ray.get(client._controller.get_pid.remote()) + + ray.kill(client._controller, no_restart=False) + + wait_for_condition(lambda: ray.get(client._controller.get_pid.remote()) != pid) + + # Confirm that no new deployment is deployed over the next 5 seconds + with pytest.raises(RuntimeError): + wait_for_condition( + lambda: len(ray.util.list_named_actors(all_namespaces=True)) > num_actors, + timeout=5, + ) + + serve.shutdown() + ray.shutdown() + + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__]))