mirror of
https://github.com/vale981/ray
synced 2025-03-12 14:16:39 -04:00
54 lines
1.8 KiB
Python
54 lines
1.8 KiB
Python
![]() |
import ray.dashboard.utils as dashboard_utils
|
||
|
import ray.dashboard.optional_utils as optional_utils
|
||
|
from ray.dashboard.modules.healthz.utils import HealthChecker
|
||
|
from aiohttp.web import Request, Response, HTTPServiceUnavailable
|
||
|
import grpc
|
||
|
|
||
|
routes = optional_utils.ClassMethodRouteTable
|
||
|
|
||
|
|
||
|
class HealthzAgent(dashboard_utils.DashboardAgentModule):
|
||
|
"""Health check in the agent.
|
||
|
|
||
|
This module adds health check related endpoint to the agent to check
|
||
|
local components' health.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, dashboard_agent):
|
||
|
super().__init__(dashboard_agent)
|
||
|
self._health_checker = HealthChecker(
|
||
|
dashboard_agent.gcs_aio_client,
|
||
|
f"{dashboard_agent.ip}:{dashboard_agent.node_manager_port}",
|
||
|
)
|
||
|
|
||
|
@routes.get("/api/local_raylet_healthz")
|
||
|
async def health_check(self, req: Request) -> Response:
|
||
|
try:
|
||
|
alive = await self._health_checker.check_local_raylet_liveness()
|
||
|
if alive is False:
|
||
|
return HTTPServiceUnavailable(reason="Local Raylet failed")
|
||
|
except grpc.RpcError as e:
|
||
|
# We only consider the error other than GCS unreachable as raylet failure
|
||
|
# to avoid false positive.
|
||
|
# In case of GCS failed, Raylet will crash eventually if GCS is not back
|
||
|
# within a given time and the check will fail since agent can't live
|
||
|
# without a local raylet.
|
||
|
if e.code() not in (
|
||
|
grpc.StatusCode.UNAVAILABLE,
|
||
|
grpc.StatusCode.UNKNOWN,
|
||
|
grpc.StatusCode.DEADLINE_EXCEEDED,
|
||
|
):
|
||
|
return HTTPServiceUnavailable(reason=e.message())
|
||
|
|
||
|
return Response(
|
||
|
text="success",
|
||
|
content_type="application/text",
|
||
|
)
|
||
|
|
||
|
async def run(self, server):
|
||
|
pass
|
||
|
|
||
|
@staticmethod
|
||
|
def is_minimal_module():
|
||
|
return True
|