mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00

## Why are these changes needed? In https://github.com/ray-project/ray/pull/26405 we added health checks for GCS and raylets. This PR exposes them as endpoints in the dashboard and the dashboard agent. For the dashboard, we added `http://host:port/api/gcs_healthz`, which sends an RPC to GCS directly to see whether GCS is alive or not. For the agent, we added `http://host:port/api/local_raylet_healthz`, which sends an RPC to GCS to check whether the raylet is alive or not. We consider the raylet alive if either (1) GCS is dead, or (2) GCS is alive and GCS thinks the raylet is alive. If GCS is dead for more than X seconds (60 by default), the raylet will just crash itself, so KubeRay can still catch it.
40 lines
1.3 KiB
Python
40 lines
1.3 KiB
Python
import ray.dashboard.utils as dashboard_utils
|
|
import ray.dashboard.optional_utils as optional_utils
|
|
from ray.dashboard.modules.healthz.utils import HealthChecker
|
|
from aiohttp.web import Request, Response, HTTPServiceUnavailable
|
|
|
|
# Short alias for the class-method route table; handlers in this module
# register their HTTP endpoints through it (e.g. ``@routes.get(...)``).
routes = optional_utils.ClassMethodRouteTable
|
|
|
|
|
|
class HealthzHead(dashboard_utils.DashboardHeadModule):
    """Health check in the head.

    This module adds a health check related endpoint to the head to check
    GCS's health.
    """

    def __init__(self, dashboard_head):
        """Create the module and its GCS-backed health checker.

        Args:
            dashboard_head: The dashboard head this module is attached to;
                its ``gcs_aio_client`` is handed to the ``HealthChecker``.
        """
        super().__init__(dashboard_head)
        self._health_checker = HealthChecker(dashboard_head.gcs_aio_client)

    @routes.get("/api/gcs_healthz")
    async def health_check(self, req: Request) -> Response:
        """Handle ``GET /api/gcs_healthz``.

        Args:
            req: The incoming request; it is not inspected.

        Returns:
            A ``"success"`` text response when the liveness check returns
            ``True``; otherwise an ``HTTPServiceUnavailable`` response whose
            reason carries the failure detail when the check raised.
        """
        try:
            alive = await self._health_checker.check_gcs_liveness()
        except Exception as e:
            # The reason phrase is sent on the HTTP status line, so it must
            # stay on a single line: collapse any line breaks (and other
            # whitespace runs) in the exception text into single spaces.
            detail = " ".join(str(e).split())
            return HTTPServiceUnavailable(reason=f"Health check failed: {detail}")

        # Only a literal ``True`` counts as healthy.
        if alive is True:
            return Response(
                text="success",
                content_type="application/text",
            )

        return HTTPServiceUnavailable(reason="Health check failed")

    async def run(self, server):
        """Do nothing; this module has no work to perform in ``run``."""
        pass

    @staticmethod
    def is_minimal_module():
        """Return ``True``, marking this module as a minimal module."""
        return True
|