mirror of
https://github.com/vale981/ray
synced 2025-03-10 05:16:49 -04:00

## Why are these changes needed? In https://github.com/ray-project/ray/pull/26405 we added health checks for the GCS and raylets. This PR exposes them as endpoints in the dashboard and the dashboard agent. For the dashboard, we added `http://host:port/api/gcs_healthz`, which sends an RPC directly to the GCS to check whether the GCS is alive. For the agent, we added `http://host:port/api/local_raylet_healthz`, which sends an RPC to the GCS to check whether the raylet is alive. We consider the raylet alive if - the GCS is dead, or - the GCS is alive and the GCS does not think the raylet is dead. If the GCS is dead for more than X seconds (60 by default), the raylet will just crash itself, so KubeRay can still catch it.
58 lines
2.1 KiB
Python
58 lines
2.1 KiB
Python
import sys
|
|
import pytest
|
|
import requests
|
|
|
|
import ray._private.ray_constants as ray_constants
|
|
from ray.tests.conftest import * # noqa: F401 F403
|
|
from ray._private.test_utils import find_free_port, wait_for_condition
|
|
|
|
|
|
def test_healthz_head(ray_start_cluster):
    """Check that the dashboard's ``/api/gcs_healthz`` endpoint follows the GCS.

    Starts a head node with the dashboard on a free port, waits until the
    endpoint answers 200, kills the GCS server process, and then expects the
    endpoint to stop answering 200 (either a non-200 status or a request
    timeout).

    Args:
        ray_start_cluster: pytest fixture providing a cluster object with
            ``add_node``.
    """
    dashboard_port = find_free_port()
    h = ray_start_cluster.add_node(dashboard_port=dashboard_port)
    uri = f"http://localhost:{dashboard_port}/api/gcs_healthz"
    wait_for_condition(lambda: requests.get(uri).status_code == 200)

    h.all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER][0].process.kill()

    # It'll either timeout or just return an error.
    # Compare the response's status code, not the Response object itself:
    # a Response never equals 200, which made this check pass vacuously.
    try:
        wait_for_condition(
            lambda: requests.get(uri, timeout=1).status_code != 200,
            timeout=4,
        )
    except RuntimeError as e:
        # NOTE(review): presumably wait_for_condition raises RuntimeError on
        # timeout and includes the last exception's text -- confirm against
        # ray._private.test_utils.
        assert "Read timed out" in str(e)
|
|
|
|
|
|
def test_healthz_agent_1(ray_start_cluster):
    """The agent's raylet health endpoint keeps answering 200 after GCS dies.

    Args:
        ray_start_cluster: pytest fixture providing a cluster object with
            ``add_node``.
    """
    listen_port = find_free_port()
    node = ray_start_cluster.add_node(dashboard_agent_listen_port=listen_port)
    healthz_url = f"http://localhost:{listen_port}/api/local_raylet_healthz"

    def healthz_status():
        # One probe of the agent endpoint; returns the HTTP status code.
        return requests.get(healthz_url).status_code

    wait_for_condition(lambda: healthz_status() == 200)

    gcs_server = node.all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER][0]
    gcs_server.process.kill()

    # A GCS failure on its own must not make the raylet health check fail.
    assert healthz_status() == 200
|
|
|
|
|
|
@pytest.mark.skipif(sys.platform == "win32", reason="SIGSTOP only on posix")
def test_healthz_agent_2(monkeypatch, ray_start_cluster):
    """The agent's raylet health endpoint fails once a stopped raylet times out.

    The raylet process is suspended with SIGSTOP. The endpoint is expected to
    still answer 200 immediately afterwards and to stop answering 200 once
    the heartbeat timeout has elapsed.

    Args:
        monkeypatch: pytest fixture, used to set ``RAY_num_heartbeats_timeout``.
        ray_start_cluster: pytest fixture providing a cluster object with
            ``add_node``.
    """
    import signal

    monkeypatch.setenv("RAY_num_heartbeats_timeout", "3")

    listen_port = find_free_port()
    node = ray_start_cluster.add_node(dashboard_agent_listen_port=listen_port)
    healthz_url = f"http://localhost:{listen_port}/api/local_raylet_healthz"

    def healthz_status():
        # One probe of the agent endpoint; returns the HTTP status code.
        return requests.get(healthz_url).status_code

    wait_for_condition(lambda: healthz_status() == 200)

    raylet = node.all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet.process.send_signal(signal.SIGSTOP)

    # Right after the stop, GCS still considers the raylet alive.
    assert healthz_status() == 200
    # Once the heartbeat timeout passes, GCS treats the raylet as down.
    wait_for_condition(lambda: healthz_status() != 200)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run this file's tests verbosely and exit with pytest's return code.
    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
|