mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
[Serve] Use "serve" namespace during controller recovery (#25987)
This commit is contained in:
parent
67140f2d26
commit
4d8a82bdf6
2 changed files with 39 additions and 2 deletions
|
@ -28,6 +28,7 @@ from ray.serve.constants import (
|
|||
CONTROL_LOOP_PERIOD_S,
|
||||
SERVE_LOGGER_NAME,
|
||||
SERVE_ROOT_URL_ENV_KEY,
|
||||
SERVE_NAMESPACE,
|
||||
)
|
||||
from ray.serve.deployment_state import DeploymentStateManager, ReplicaState
|
||||
from ray.serve.endpoint_state import EndpointState
|
||||
|
@ -110,15 +111,22 @@ class ServeController:
|
|||
http_config,
|
||||
)
|
||||
self.endpoint_state = EndpointState(self.kv_store, self.long_poll_host)
|
||||
|
||||
# Fetch all running actors in current cluster as source of current
|
||||
# replica state for controller failure recovery
|
||||
all_current_actor_names = ray.util.list_named_actors()
|
||||
all_current_actors = ray.util.list_named_actors(all_namespaces=True)
|
||||
all_serve_actor_names = [
|
||||
actor["name"]
|
||||
for actor in all_current_actors
|
||||
if actor["namespace"] == SERVE_NAMESPACE
|
||||
]
|
||||
|
||||
self.deployment_state_manager = DeploymentStateManager(
|
||||
controller_name,
|
||||
detached,
|
||||
self.kv_store,
|
||||
self.long_poll_host,
|
||||
all_current_actor_names,
|
||||
all_serve_actor_names,
|
||||
)
|
||||
|
||||
# Reference to Ray task executing most recent deployment request
|
||||
|
|
|
@ -672,5 +672,34 @@ def test_serve_start_different_http_checkpoint_options_warning(caplog):
|
|||
ray.shutdown()
|
||||
|
||||
|
||||
def test_recovering_controller_no_redeploy():
    """Ensure controller doesn't redeploy running deployments when recovering.

    The driver connects with namespace "x", starts a detached Serve instance,
    deploys one function, then kills the controller while allowing it to
    restart. After the controller reports a new pid, the number of named
    actors across all namespaces must not grow.
    """
    ray.init(namespace="x")
    try:
        client = serve.start(detached=True)

        @serve.deployment
        def f():
            pass

        f.deploy()

        # Baseline taken across all namespaces, since the driver itself is
        # connected to namespace "x".
        num_actors = len(ray.util.list_named_actors(all_namespaces=True))
        pid = ray.get(client._controller.get_pid.remote())

        # no_restart=False leaves the controller eligible for restart.
        ray.kill(client._controller, no_restart=False)

        # A different pid means a new controller process is answering.
        wait_for_condition(lambda: ray.get(client._controller.get_pid.remote()) != pid)

        # Confirm that no new actor shows up within the 5 second timeout.
        # The test expects wait_for_condition to raise RuntimeError when the
        # condition never becomes true.
        with pytest.raises(RuntimeError):
            wait_for_condition(
                lambda: len(ray.util.list_named_actors(all_namespaces=True)) > num_actors,
                timeout=5,
            )
    finally:
        # Always tear down, even when an assertion above fails, so that the
        # detached Serve instance does not leak into subsequent tests.
        try:
            serve.shutdown()
        finally:
            ray.shutdown()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this file's tests verbosely with output capture disabled, and
    # propagate pytest's exit status to the calling process.
    exit_status = pytest.main(["-v", "-s", __file__])
    sys.exit(exit_status)
|
||||
|
|
Loading…
Add table
Reference in a new issue