mirror of
https://github.com/vale981/ray
synced 2025-03-09 12:56:46 -04:00

## Why are these changes needed? In https://github.com/ray-project/ray/pull/26405 we added health checks for the GCS and raylets. This PR exposes them as endpoints in the dashboard and the dashboard agent. For the dashboard, we added `http://host:port/api/gcs_healthz`, which sends an RPC directly to the GCS to see whether the GCS is alive. For the agent, we added `http://host:port/api/local_raylet_healthz`, which sends an RPC to the GCS to check whether the raylet is alive. We consider the raylet alive if either (1) the GCS is dead, or (2) the GCS is alive and reports the raylet as alive. If the GCS is dead for more than X seconds (60 by default), the raylet will crash itself, so KubeRay can still catch it.
23 lines
708 B
Python
23 lines
708 B
Python
from typing import Optional
|
|
from ray._private.gcs_utils import GcsAioClient
|
|
|
|
|
|
class HealthChecker:
    """Liveness probes for the GCS and the local raylet, answered by the GCS.

    Both probes go through ``check_alive`` on the supplied GCS client. Any
    exception raised by that call is propagated to the caller unchanged.
    """

    def __init__(
        self, gcs_aio_client: GcsAioClient, local_node_address: Optional[str] = None
    ):
        """Store the GCS client and the address of the node to probe.

        Args:
            gcs_aio_client: Async client used to issue ``check_alive`` calls.
            local_node_address: Address of the local node, sent to the GCS
                when checking the raylet. When ``None``, the raylet check
                always reports ``False``.
        """
        self._gcs_aio_client = gcs_aio_client
        self._local_node_address = local_node_address

    async def check_local_raylet_liveness(self) -> bool:
        """Return whether the GCS reports the local raylet as alive.

        Returns:
            ``False`` when no local node address was configured; otherwise the
            first entry of the ``check_alive`` reply for that address.
        """
        if self._local_node_address is None:
            # Without an address there is nothing to ask the GCS about.
            return False

        # NOTE(review): the trailing ``1`` is presumably the RPC timeout in
        # seconds -- confirm against ``GcsAioClient.check_alive``.
        liveness = await self._gcs_aio_client.check_alive(
            [self._local_node_address.encode()], 1
        )
        # Exactly one address was sent, so only the first entry is read.
        return liveness[0]

    async def check_gcs_liveness(self) -> bool:
        """Return ``True`` once the GCS has answered a ``check_alive`` call.

        The call is made with an empty address list and its result is
        discarded, so this method never returns ``False``: it either returns
        ``True`` or lets the exception raised by the call propagate.
        """
        await self._gcs_aio_client.check_alive([], 1)
        return True
|