From f4011754d68cc4a9b700991efeb7ded4bf274e8c Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 26 Dec 2018 03:01:53 +0800 Subject: [PATCH] Fix: ServerConnection should be closed before being removed (#3626) Otherwise, in the event of a remote raylet crashing, the connection might be held by boost asio forever, and the pending callbacks will never get invoked. See also #3586. --- src/ray/common/client_connection.cc | 8 +++++++- src/ray/raylet/node_manager.cc | 9 ++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/ray/common/client_connection.cc b/src/ray/common/client_connection.cc index db7caaeb6..1ca93ba2a 100644 --- a/src/ray/common/client_connection.cc +++ b/src/ray/common/client_connection.cc @@ -16,7 +16,13 @@ ray::Status TcpConnect(boost::asio::ip::tcp::socket &socket, boost::asio::ip::tcp::endpoint endpoint(ip_address, port); boost::system::error_code error; socket.connect(endpoint, error); - return boost_to_ray_status(error); + const auto status = boost_to_ray_status(error); + if (!status.ok()) { + // Close the socket if the connect failed. + boost::system::error_code close_error; + socket.close(close_error); + } + return status; } template diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 958fecef5..fd7013248 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -405,7 +405,14 @@ void NodeManager::ClientRemoved(const ClientTableDataT &client_data) { cluster_resource_map_.erase(client_id); // Remove the remote server connection. - remote_server_connections_.erase(client_id); + const auto connection_entry = remote_server_connections_.find(client_id); + if (connection_entry != remote_server_connections_.end()) { + connection_entry->second->Close(); + remote_server_connections_.erase(connection_entry); + } else { + RAY_LOG(WARNING) << "Received ClientRemoved callback for an unknown client " + << client_id << "."; + } // For any live actors that were on the dead node, broadcast a notification // about the actor's death