[ray client] [runtime env] Print error logs in driver upon connection failure (#18451)

This commit is contained in:
architkulkarni 2021-09-09 11:50:55 -07:00 committed by GitHub
parent d477fd7205
commit 0126837868
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 41 additions and 3 deletions

View file

@ -211,6 +211,30 @@ def test_startup_error_yields_clean_result(shutdown_only):
server.stop(0)
@pytest.mark.skipif(
    sys.platform == "win32",
    reason="PSUtil does not work the same on windows.")
@pytest.mark.parametrize(
    "call_ray_start", [
        "ray start --head --ray-client-server-port 25031 "
        "--port 0 --redis-password=password"
    ],
    indirect=True)
def test_runtime_install_error_message(call_ray_start):
    """
    Verify that a runtime environment setup failure for the client server
    is reported to the connecting client with a clear, actionable message.

    The client requests a pip dependency named "ray-this-doesnt-exist";
    connecting must raise ConnectionAbortedError, and the text of that
    exception must contain the "No matching distribution found" message
    for that package.
    """
    # A runtime_env requesting a pip package that should not be installable.
    broken_runtime_env = {"pip": ["ray-this-doesnt-exist"]}
    expected_fragment = (
        "No matching distribution found for ray-this-doesnt-exist")

    with pytest.raises(ConnectionAbortedError) as excinfo:
        ray.client("localhost:25031").env(broken_runtime_env).connect()

    # The installer output must be part of the client-side exception text.
    client_error_text = str(excinfo.value)
    assert expected_fragment in client_error_text

    # Tear down any client state left over from the connection attempt.
    ray.util.disconnect()
def test_prepare_runtime_init_req_fails():
"""
Check that a connection that is initiated with a non-Init request

View file

@ -534,17 +534,31 @@ class DataServicerProxy(ray_client_pb2_grpc.RayletDataStreamerServicer):
logger.error(
f"Server startup failed for client: {client_id}, "
f"using JobConfig: {job_config}!")
# TODO(architkulkarni): Once the client server runtime env
# setup is moved into the runtime env agent, revisit this
# and double check where the error logs end up being saved.
try:
with open("/tmp/ray/session_latest/logs/"
f"ray_client_server_{server.port}.err") as f:
runtime_env_error_str = f.read()
except FileNotFoundError:
runtime_env_error_str = "(File not found)"
raise RuntimeError(
"Starting Ray client server failed. This is most "
"likely because the runtime_env failed to be "
"installed. See ray_client_server_[port].err on the "
"head node of the cluster for the relevant logs.")
f"installed. Printing the contents of "
f"ray_client_server_{server.port}.err below: \n"
f"{runtime_env_error_str}")
channel = self.proxy_manager.get_channel(client_id)
if channel is None:
logger.error(f"Channel not found for {client_id}")
raise RuntimeError(
"Proxy failed to Connect to backend! Check "
"`ray_client_server.err` on the cluster.")
"`ray_client_server.err` and "
f"`ray_client_server_{server.port}.err` on the head "
"node of the cluster for the relevant logs. "
"By default these are located at "
"/tmp/ray/session_latest/logs.")
stub = ray_client_pb2_grpc.RayletDataStreamerStub(channel)
except Exception:
init_resp = ray_client_pb2.DataResponse(