mirror of
https://github.com/vale981/ray
synced 2025-03-11 21:56:39 -04:00

## Why are these changes needed? As in https://github.com/ray-project/ray/pull/26405, we added health checks for GCS and raylets. This PR exposes them as endpoints in the dashboard and the dashboard agent. For the dashboard, we added `http://host:port/api/gcs_healthz`, which sends an RPC directly to GCS to see whether GCS is alive or not. For the agent, we added `http://host:port/api/local_raylet_healthz`, which sends an RPC to GCS to check whether the raylet is alive or not. We consider the raylet alive if - GCS is dead - GCS is alive and GCS thinks the raylet is alive. If GCS is dead for more than X seconds (60 by default), the raylet will just crash itself, so KubeRay can still catch it.
53 lines
1.8 KiB
Python
53 lines
1.8 KiB
Python
import ray.dashboard.utils as dashboard_utils
|
|
import ray.dashboard.optional_utils as optional_utils
|
|
from ray.dashboard.modules.healthz.utils import HealthChecker
|
|
from aiohttp.web import Request, Response, HTTPServiceUnavailable
|
|
import grpc
|
|
|
|
# Route table whose decorators (e.g. ``@routes.get``) are used below to
# register this module's HTTP handlers.
routes = optional_utils.ClassMethodRouteTable
|
|
|
|
|
|
class HealthzAgent(dashboard_utils.DashboardAgentModule):
    """Health check in the agent.

    This module adds health check related endpoint to the agent to check
    local components' health.
    """

    def __init__(self, dashboard_agent):
        """Build the health checker for the raylet on this node.

        Args:
            dashboard_agent: The owning dashboard agent. Its
                ``gcs_aio_client``, ``ip`` and ``node_manager_port``
                attributes are read to configure the checker.
        """
        super().__init__(dashboard_agent)
        self._health_checker = HealthChecker(
            dashboard_agent.gcs_aio_client,
            f"{dashboard_agent.ip}:{dashboard_agent.node_manager_port}",
        )

    @routes.get("/api/local_raylet_healthz")
    async def health_check(self, req: Request) -> Response:
        """Report whether the local raylet is considered alive.

        Args:
            req: The incoming HTTP request (unused).

        Returns:
            A 503 ``HTTPServiceUnavailable`` response when the liveness check
            returns ``False`` or fails with a gRPC status other than
            UNAVAILABLE / UNKNOWN / DEADLINE_EXCEEDED; otherwise a plain
            ``"success"`` response.
        """
        try:
            alive = await self._health_checker.check_local_raylet_liveness()
            if alive is False:
                return HTTPServiceUnavailable(reason="Local Raylet failed")
        except grpc.RpcError as e:
            # We only consider the error other than GCS unreachable as raylet failure
            # to avoid false positive.
            # In case of GCS failed, Raylet will crash eventually if GCS is not back
            # within a given time and the check will fail since agent can't live
            # without a local raylet.
            if e.code() not in (
                grpc.StatusCode.UNAVAILABLE,
                grpc.StatusCode.UNKNOWN,
                grpc.StatusCode.DEADLINE_EXCEEDED,
            ):
                # gRPC errors describe themselves via ``details()``; they have
                # no ``message()`` method, so calling it would raise
                # AttributeError and turn the intended 503 into a 500.
                details = getattr(e, "details", None)
                described = details() if callable(details) else None
                # An HTTP reason phrase must be a single line, so collapse any
                # newlines/extra whitespace in the gRPC description.
                reason = " ".join(str(described or e).split())
                # Fall back to the default reason phrase if nothing is left.
                return HTTPServiceUnavailable(reason=reason or None)

        return Response(
            text="success",
            content_type="application/text",
        )

    async def run(self, server):
        """No-op: this module only serves the HTTP route registered above."""
        pass

    @staticmethod
    def is_minimal_module():
        """Return ``True`` to mark this module as a minimal module."""
        return True
|