Fix: ServerConnection should be closed before being removed (#3626)

Otherwise (that is, if the connection is removed without being closed first), when a remote raylet crashes the connection might be held by Boost.Asio forever, and its pending callbacks will never be invoked. See also #3586.
This commit is contained in:
Hao Chen 2018-12-26 03:01:53 +08:00 committed by Robert Nishihara
parent 5426234cd8
commit f4011754d6
2 changed files with 15 additions and 2 deletions

View file

@ -16,7 +16,13 @@ ray::Status TcpConnect(boost::asio::ip::tcp::socket &socket,
boost::asio::ip::tcp::endpoint endpoint(ip_address, port);
boost::system::error_code error;
socket.connect(endpoint, error);
return boost_to_ray_status(error);
const auto status = boost_to_ray_status(error);
if (!status.ok()) {
// Close the socket if the connect failed.
boost::system::error_code close_error;
socket.close(close_error);
}
return status;
}
template <class T>

View file

@ -405,7 +405,14 @@ void NodeManager::ClientRemoved(const ClientTableDataT &client_data) {
cluster_resource_map_.erase(client_id);
// Remove the remote server connection.
remote_server_connections_.erase(client_id);
const auto connection_entry = remote_server_connections_.find(client_id);
if (connection_entry != remote_server_connections_.end()) {
connection_entry->second->Close();
remote_server_connections_.erase(connection_entry);
} else {
RAY_LOG(WARNING) << "Received ClientRemoved callback for an unknown client "
<< client_id << ".";
}
// For any live actors that were on the dead node, broadcast a notification
// about the actor's death