mirror of
https://github.com/vale981/ray
synced 2025-03-12 22:26:39 -04:00

This PR fixes several issues that block the serve agent when GCS is down. We need to make sure the serve agent is always alive, so that external requests can still be sent to the agent to check the status. - The internal KV used in dashboard/agent blocks the agent; we use the async one instead. - The serve controller uses ray.nodes, which is a blocking call that blocks forever; change it to use the GCS client with a timeout. - The agent uses the serve controller client, which is a blocking call with max retries = -1; this blocks until the controller is back. To enable Serve HA, we also need to set: - RAY_gcs_server_request_timeout_seconds=5 - RAY_SERVE_KV_TIMEOUT_S=5, which we should set in KubeRay.
68 lines
2.2 KiB
Python
68 lines
2.2 KiB
Python
import requests
|
|
import pytest
|
|
import ray
|
|
import sys
|
|
from ray import serve
|
|
from ray.tests.conftest import * # noqa: F401,F403
|
|
from ray._private.test_utils import generate_system_config_map
|
|
|
|
# Deployments endpoint on localhost:52365; requested by
# test_deployments_get_tolerane before and after the GCS server is killed.
DEPLOYMENTS_URL = "http://localhost:52365/api/serve/deployments/"
|
|
# Deployment status endpoint on the same host and port; requested by
# test_status_url_get_tolerane before and after the GCS server is killed.
STATUS_URL = "http://localhost:52365/api/serve/deployments/status"
|
|
|
|
|
|
@pytest.mark.skipif(sys.platform == "darwin", reason="Flaky on OSX.")
@pytest.mark.parametrize(
    "ray_start_regular_with_external_redis",
    [
        {
            **generate_system_config_map(
                gcs_failover_worker_reconnect_timeout=20,
                gcs_rpc_server_reconnect_timeout_s=3600,
                gcs_server_request_timeout_seconds=3,
            ),
        }
    ],
    indirect=True,
)
def test_deployments_get_tolerane(monkeypatch, ray_start_regular_with_external_redis):
    """Check the serve agent's deployments endpoint while GCS is down.

    A GET on DEPLOYMENTS_URL must return 200 while GCS is running, and
    must still come back (with a 503) after the GCS server is killed.
    """
    monkeypatch.setenv("RAY_SERVE_KV_TIMEOUT_S", "3")
    serve.start(detached=True)

    # Baseline: GCS is up, so the endpoint answers normally.
    healthy_reply = requests.get(DEPLOYMENTS_URL, timeout=15)
    assert healthy_reply.status_code == 200

    # Take GCS down through the node handle held by the current worker.
    head_node = ray._private.worker._global_node
    head_node.kill_gcs_server()

    # The request must complete within the client timeout and report 503.
    outage_reply = requests.get(DEPLOYMENTS_URL, timeout=30)
    assert outage_reply.status_code == 503
|
|
|
|
|
|
@pytest.mark.skipif(sys.platform == "darwin", reason="Flaky on OSX.")
@pytest.mark.parametrize(
    "ray_start_regular_with_external_redis",
    [
        {
            **generate_system_config_map(
                gcs_failover_worker_reconnect_timeout=20,
                gcs_rpc_server_reconnect_timeout_s=3600,
                gcs_server_request_timeout_seconds=1,
            ),
        }
    ],
    indirect=True,
)
def test_status_url_get_tolerane(monkeypatch, ray_start_regular_with_external_redis):
    """Check the serve agent's status endpoint while GCS is down.

    A GET on STATUS_URL must return 200 both while GCS is running and
    after the GCS server has been killed.
    """
    monkeypatch.setenv("RAY_SERVE_KV_TIMEOUT_S", "3")
    serve.start(detached=True)

    # Baseline: GCS is up, so the endpoint answers normally.
    healthy_reply = requests.get(STATUS_URL, timeout=15)
    assert healthy_reply.status_code == 200

    # Take GCS down through the node handle held by the current worker.
    head_node = ray._private.worker._global_node
    head_node.kill_gcs_server()

    # The status endpoint is expected to keep answering 200 during the outage.
    outage_reply = requests.get(STATUS_URL, timeout=30)
    assert outage_reply.status_code == 200
|
|
|
|
|
|
if __name__ == "__main__":
    # Run this file's tests when executed directly and propagate pytest's
    # return value as the process exit status.
    exit_code = pytest.main(["-vs", "--forked", __file__])
    sys.exit(exit_code)
|