mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[ray client] [runtime env] Print error logs in driver upon connection failure (#18451)
This commit is contained in:
parent
d477fd7205
commit
0126837868
2 changed files with 41 additions and 3 deletions
|
@ -211,6 +211,30 @@ def test_startup_error_yields_clean_result(shutdown_only):
|
||||||
server.stop(0)
|
server.stop(0)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
    sys.platform == "win32",
    reason="PSUtil does not work the same on windows.")
@pytest.mark.parametrize(
    "call_ray_start", [
        "ray start --head --ray-client-server-port 25031 "
        "--port 0 --redis-password=password"
    ],
    indirect=True)
def test_runtime_install_error_message(call_ray_start):
    """
    Check that an error while preparing the runtime environment for the client
    server yields an actionable, clear error on the *client side*.

    The client asks for a pip package that does not exist, so connecting is
    expected to raise ``ConnectionAbortedError`` whose message contains the
    "No matching distribution found" text for that package.

    Args:
        call_ray_start: Indirectly parametrized fixture; it receives the
            ``ray start`` command line above (presumably it launches the
            cluster with the client server on port 25031 -- the fixture is
            defined outside this view, confirm there).
    """
    try:
        with pytest.raises(ConnectionAbortedError) as excinfo:
            ray.client("localhost:25031").env({
                "pip": ["ray-this-doesnt-exist"]
            }).connect()
        assert ("No matching distribution found for ray-this-doesnt-exist" in
                str(excinfo.value))
    finally:
        # Disconnect even when the expectations above fail (e.g. connect()
        # unexpectedly succeeds), so a live client connection cannot leak
        # into subsequent tests.
        ray.util.disconnect()
|
||||||
|
|
||||||
|
|
||||||
def test_prepare_runtime_init_req_fails():
|
def test_prepare_runtime_init_req_fails():
|
||||||
"""
|
"""
|
||||||
Check that a connection that is initiated with a non-Init request
|
Check that a connection that is initiated with a non-Init request
|
||||||
|
|
|
@ -534,17 +534,31 @@ class DataServicerProxy(ray_client_pb2_grpc.RayletDataStreamerServicer):
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Server startup failed for client: {client_id}, "
|
f"Server startup failed for client: {client_id}, "
|
||||||
f"using JobConfig: {job_config}!")
|
f"using JobConfig: {job_config}!")
|
||||||
|
# TODO(architkulkarni): Once the client server runtime env
|
||||||
|
# setup is moved into the runtime env agent, revisit this
|
||||||
|
# and double check where the error logs end up being saved.
|
||||||
|
try:
|
||||||
|
with open("/tmp/ray/session_latest/logs/"
|
||||||
|
f"ray_client_server_{server.port}.err") as f:
|
||||||
|
runtime_env_error_str = f.read()
|
||||||
|
except FileNotFoundError:
|
||||||
|
runtime_env_error_str = "(File not found)"
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Starting Ray client server failed. This is most "
|
"Starting Ray client server failed. This is most "
|
||||||
"likely because the runtime_env failed to be "
|
"likely because the runtime_env failed to be "
|
||||||
"installed. See ray_client_server_[port].err on the "
|
f"installed. Printing the contents of "
|
||||||
"head node of the cluster for the relevant logs.")
|
f"ray_client_server_{server.port}.err below: \n"
|
||||||
|
f"{runtime_env_error_str}")
|
||||||
channel = self.proxy_manager.get_channel(client_id)
|
channel = self.proxy_manager.get_channel(client_id)
|
||||||
if channel is None:
|
if channel is None:
|
||||||
logger.error(f"Channel not found for {client_id}")
|
logger.error(f"Channel not found for {client_id}")
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Proxy failed to Connect to backend! Check "
|
"Proxy failed to Connect to backend! Check "
|
||||||
"`ray_client_server.err` on the cluster.")
|
"`ray_client_server.err` and "
|
||||||
|
f"`ray_client_server_{server.port}.err` on the head "
|
||||||
|
"node of the cluster for the relevant logs. "
|
||||||
|
"By default these are located at "
|
||||||
|
"/tmp/ray/session_latest/logs.")
|
||||||
stub = ray_client_pb2_grpc.RayletDataStreamerStub(channel)
|
stub = ray_client_pb2_grpc.RayletDataStreamerStub(channel)
|
||||||
except Exception:
|
except Exception:
|
||||||
init_resp = ray_client_pb2.DataResponse(
|
init_resp = ray_client_pb2.DataResponse(
|
||||||
|
|
Loading…
Add table
Reference in a new issue