mirror of
https://github.com/vale981/ray
synced 2025-03-13 14:46:38 -04:00

This PR fixes several issues that block the Serve agent when GCS is down. We need to make sure the Serve agent is always alive, so that external requests can still be sent to the agent to check the status. - The internal KV used in the dashboard/agent blocks the agent; we use the async one instead. - The Serve controller uses ray.nodes, which is a blocking call that can block forever; change it to use the GCS client with a timeout. - The agent uses the Serve controller client, which is a blocking call with max retries = -1; this blocks until the controller is back. To enable Serve HA, we also need to set: - RAY_gcs_server_request_timeout_seconds=5 - RAY_SERVE_KV_TIMEOUT_S=5, which we should set in KubeRay.
53 lines
1.8 KiB
Python
53 lines
1.8 KiB
Python
import ray.dashboard.utils as dashboard_utils
|
|
import ray.dashboard.optional_utils as optional_utils
|
|
from ray.dashboard.modules.healthz.utils import HealthChecker
|
|
from aiohttp.web import Request, Response
|
|
import grpc
|
|
|
|
# Short alias for the class-method route table; its `get` decorator is applied
# to the HTTP handler defined on HealthzAgent in this module.
routes = optional_utils.ClassMethodRouteTable
|
|
|
|
|
|
class HealthzAgent(dashboard_utils.DashboardAgentModule):
    """Agent module that serves a health endpoint for the local raylet.

    It registers an HTTP endpoint on the agent that reports whether the
    components running next to the agent (the local raylet) are healthy.
    """

    def __init__(self, dashboard_agent):
        """Build the health checker from the agent's GCS client and raylet address.

        Args:
            dashboard_agent: The owning agent; its ``gcs_aio_client``, ``ip`` and
                ``node_manager_port`` attributes are read here.
        """
        super().__init__(dashboard_agent)
        gcs_client = dashboard_agent.gcs_aio_client
        raylet_address = f"{dashboard_agent.ip}:{dashboard_agent.node_manager_port}"
        self._health_checker = HealthChecker(gcs_client, raylet_address)

    @routes.get("/api/local_raylet_healthz")
    async def health_check(self, req: Request) -> Response:
        """Report the liveness of the local raylet.

        Args:
            req: The incoming HTTP request (not inspected).

        Returns:
            A 503 response when the liveness check returns ``False`` or fails
            with a gRPC error that is not one of the "GCS unreachable" codes;
            otherwise a "success" response.
        """
        try:
            raylet_alive = await self._health_checker.check_local_raylet_liveness()
        except grpc.RpcError as rpc_error:
            # Errors that merely indicate GCS cannot be reached are not counted
            # as a raylet failure, to avoid false positives. If GCS stays down,
            # the raylet eventually crashes, and the check then fails anyway
            # because the agent cannot live without its local raylet.
            status_code = rpc_error.code()
            gcs_unreachable_codes = (
                grpc.StatusCode.UNAVAILABLE,
                grpc.StatusCode.UNKNOWN,
                grpc.StatusCode.DEADLINE_EXCEEDED,
            )
            if status_code not in gcs_unreachable_codes:
                return Response(
                    status=503,
                    text=f"Health check failed due to: {rpc_error}",
                )
        else:
            # Only an explicit False is a failure; any other result passes.
            if raylet_alive is False:
                return Response(status=503, text="Local Raylet failed")

        return Response(text="success", content_type="application/text")

    async def run(self, server):
        """Do nothing; this module has no work to perform in ``run``."""
        return None

    @staticmethod
    def is_minimal_module():
        """Return ``True``.

        NOTE(review): presumably flags this module as usable in a minimal
        installation -- confirm against the dashboard module loader.
        """
        return True
|