ray/dashboard/modules/healthz/healthz_agent.py

54 lines
1.8 KiB
Python
Raw Normal View History

import ray.dashboard.utils as dashboard_utils
import ray.dashboard.optional_utils as optional_utils
from ray.dashboard.modules.healthz.utils import HealthChecker
from aiohttp.web import Request, Response, HTTPServiceUnavailable
import grpc
# Route table whose decorators (e.g. ``@routes.get``) register the HTTP
# handlers defined on the agent module below.
routes = optional_utils.ClassMethodRouteTable
class HealthzAgent(dashboard_utils.DashboardAgentModule):
    """Health check in the agent.

    This module adds a health-check related endpoint to the agent so that the
    health of local components can be checked.
    """

    def __init__(self, dashboard_agent):
        """Build the health checker used by the endpoint.

        Args:
            dashboard_agent: The hosting dashboard agent. Its
                ``gcs_aio_client``, ``ip`` and ``node_manager_port``
                attributes are read to construct the ``HealthChecker``.
        """
        super().__init__(dashboard_agent)
        self._health_checker = HealthChecker(
            dashboard_agent.gcs_aio_client,
            f"{dashboard_agent.ip}:{dashboard_agent.node_manager_port}",
        )

    @routes.get("/api/local_raylet_healthz")
    async def health_check(self, req: Request) -> Response:
        """Report whether the local raylet is alive.

        Args:
            req: The incoming HTTP request (not inspected).

        Returns:
            ``HTTPServiceUnavailable`` when the liveness check explicitly
            returns ``False`` or fails with a gRPC status other than
            ``UNAVAILABLE``, ``UNKNOWN`` or ``DEADLINE_EXCEEDED``; otherwise a
            ``"success"`` response.
        """
        try:
            alive = await self._health_checker.check_local_raylet_liveness()
            # Only an explicit ``False`` counts as a failure.
            if alive is False:
                return HTTPServiceUnavailable(reason="Local Raylet failed")
        except grpc.RpcError as e:
            # We only consider errors other than "GCS unreachable" as a raylet
            # failure, to avoid false positives.
            # If GCS has failed, the raylet will crash eventually if GCS is not
            # back within a given time, and the check will then fail since the
            # agent can't live without a local raylet.
            if e.code() not in (
                grpc.StatusCode.UNAVAILABLE,
                grpc.StatusCode.UNKNOWN,
                grpc.StatusCode.DEADLINE_EXCEEDED,
            ):
                # gRPC errors carry their status text in ``details()``; they
                # have no ``message()`` method, so calling it would raise
                # AttributeError instead of reporting the failure.
                return HTTPServiceUnavailable(reason=e.details())
        return Response(
            text="success",
            content_type="application/text",
        )

    async def run(self, server):
        """Do nothing; this module has no background work to start.

        Args:
            server: Unused.
        """
        pass

    @staticmethod
    def is_minimal_module():
        """Return ``True`` to mark this module as a minimal module."""
        return True