Mirror of https://github.com/vale981/ray (synced 2025-03-06 02:21:39 -05:00)
[ray client] [runtime env] Print error logs in driver upon connection failure (#18451)
This commit is contained in:
parent d477fd7205
commit 0126837868
2 changed files with 41 additions and 3 deletions
|
@ -211,6 +211,30 @@ def test_startup_error_yields_clean_result(shutdown_only):
|
|||
server.stop(0)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    sys.platform == "win32",
    reason="PSUtil does not work the same on windows.")
@pytest.mark.parametrize(
    "call_ray_start",
    ["ray start --head --ray-client-server-port 25031 "
     "--port 0 --redis-password=password"],
    indirect=True)
def test_runtime_install_error_message(call_ray_start):
    """Verify a failed runtime-env setup is reported to the connecting client.

    The client connects to the Ray client server on port 25031 while
    requesting a pip package that does not exist. The connection attempt
    must raise ``ConnectionAbortedError``, and the text of that exception
    must contain the "No matching distribution found" message for the
    requested package, so the cause is visible on the client side.

    Args:
        call_ray_start: Fixture parametrized indirectly with the
            ``ray start`` command line above (presumably it launches that
            head node for the duration of the test -- confirm against the
            fixture definition).
    """
    bad_runtime_env = {"pip": ["ray-this-doesnt-exist"]}
    expected_fragment = (
        "No matching distribution found for ray-this-doesnt-exist")

    with pytest.raises(ConnectionAbortedError) as excinfo:
        builder = ray.client("localhost:25031").env(bad_runtime_env)
        builder.connect()

    # Checked after the ``with`` block: code following the raising call
    # inside the block would never run.
    reported_error = str(excinfo.value)
    assert expected_fragment in reported_error

    ray.util.disconnect()
|
||||
|
||||
|
||||
def test_prepare_runtime_init_req_fails():
|
||||
"""
|
||||
Check that a connection that is initiated with a non-Init request
|
||||
|
|
|
@ -534,17 +534,31 @@ class DataServicerProxy(ray_client_pb2_grpc.RayletDataStreamerServicer):
|
|||
logger.error(
|
||||
f"Server startup failed for client: {client_id}, "
|
||||
f"using JobConfig: {job_config}!")
|
||||
# TODO(architkulkarni): Once the client server runtime env
|
||||
# setup is moved into the runtime env agent, revisit this
|
||||
# and double check where the error logs end up being saved.
|
||||
try:
|
||||
with open("/tmp/ray/session_latest/logs/"
|
||||
f"ray_client_server_{server.port}.err") as f:
|
||||
runtime_env_error_str = f.read()
|
||||
except FileNotFoundError:
|
||||
runtime_env_error_str = "(File not found)"
|
||||
raise RuntimeError(
|
||||
"Starting Ray client server failed. This is most "
|
||||
"likely because the runtime_env failed to be "
|
||||
"installed. See ray_client_server_[port].err on the "
|
||||
"head node of the cluster for the relevant logs.")
|
||||
f"installed. Printing the contents of "
|
||||
f"ray_client_server_{server.port}.err below: \n"
|
||||
f"{runtime_env_error_str}")
|
||||
channel = self.proxy_manager.get_channel(client_id)
|
||||
if channel is None:
|
||||
logger.error(f"Channel not found for {client_id}")
|
||||
raise RuntimeError(
|
||||
"Proxy failed to Connect to backend! Check "
|
||||
"`ray_client_server.err` on the cluster.")
|
||||
"`ray_client_server.err` and "
|
||||
f"`ray_client_server_{server.port}.err` on the head "
|
||||
"node of the cluster for the relevant logs. "
|
||||
"By default these are located at "
|
||||
"/tmp/ray/session_latest/logs.")
|
||||
stub = ray_client_pb2_grpc.RayletDataStreamerStub(channel)
|
||||
except Exception:
|
||||
init_resp = ray_client_pb2.DataResponse(
|
||||
|
|
Loading…
Add table
Reference in a new issue