[Core] Set keepalive only at gcs (#18086)

This commit is contained in:
SangBin Cho 2021-08-27 01:26:51 -07:00 committed by GitHub
parent 56089ae926
commit a25cc47399
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 21 additions and 6 deletions

View file

@ -449,6 +449,11 @@ RAY_CONFIG(bool, worker_resource_limits_enabled, false)
RAY_CONFIG(int64_t, gcs_max_active_rpcs_per_handler, 100)
/// The interval at which gRPC keepalive pings are sent.
/// This is only configured in the GCS server for now.
/// NOTE: It is not ideal for other components because
/// they have a failure model that considers network failures as component failures,
/// and this configuration breaks that assumption. We should apply it to every other
/// component after we change this failure assumption in the code.
RAY_CONFIG(int64_t, grpc_keepalive_time_ms, 10000);
/// grpc keepalive timeout

View file

@ -35,7 +35,8 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config,
: config_(config),
main_service_(main_service),
rpc_server_(config.grpc_server_name, config.grpc_server_port,
config.grpc_server_thread_num),
config.grpc_server_thread_num,
/*keepalive_time_ms=*/RayConfig::instance().grpc_keepalive_time_ms()),
client_call_manager_(main_service),
raylet_client_pool_(
std::make_shared<rpc::NodeManagerClientPool>(client_call_manager_)),

View file

@ -35,8 +35,13 @@ DEFINE_stats(grpc_server_req_finished, "Finished request number in grpc server",
namespace ray {
namespace rpc {
GrpcServer::GrpcServer(std::string name, const uint32_t port, int num_threads)
: name_(std::move(name)), port_(port), is_closed_(true), num_threads_(num_threads) {
GrpcServer::GrpcServer(std::string name, const uint32_t port, int num_threads,
int64_t keepalive_time_ms)
: name_(std::move(name)),
port_(port),
is_closed_(true),
num_threads_(num_threads),
keepalive_time_ms_(keepalive_time_ms) {
cqs_.resize(num_threads_);
}
@ -52,8 +57,7 @@ void GrpcServer::Run() {
RayConfig::instance().max_grpc_message_size());
builder.AddChannelArgument(GRPC_ARG_MAX_RECEIVE_MESSAGE_LENGTH,
RayConfig::instance().max_grpc_message_size());
builder.AddChannelArgument(GRPC_ARG_KEEPALIVE_TIME_MS,
RayConfig::instance().grpc_keepalive_time_ms());
builder.AddChannelArgument(GRPC_ARG_KEEPALIVE_TIME_MS, keepalive_time_ms_);
builder.AddChannelArgument(GRPC_ARG_KEEPALIVE_TIMEOUT_MS,
RayConfig::instance().grpc_keepalive_timeout_ms());
builder.AddChannelArgument(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 0);

View file

@ -61,7 +61,8 @@ class GrpcServer {
/// \param[in] name Name of this server, used for logging and debugging purpose.
/// \param[in] port The port to bind this server to. If it's 0, a random available port
/// will be chosen.
GrpcServer(std::string name, const uint32_t port, int num_threads = 1);
GrpcServer(std::string name, const uint32_t port, int num_threads = 1,
int64_t keepalive_time_ms = 7200000 /*2 hours, grpc default*/);
/// Destruct this gRPC server.
~GrpcServer() { Shutdown(); }
@ -120,6 +121,10 @@ class GrpcServer {
std::unique_ptr<grpc::Server> server_;
/// The polling threads used to check the completion queues.
std::vector<std::thread> polling_threads_;
/// The interval at which a gRPC keepalive ping is sent from server -> client.
/// If the gRPC server does not get the ping response in time, it triggers
/// the watchdog timer fired error, which will close the connection.
const int64_t keepalive_time_ms_;
};
/// Base class that represents an abstract gRPC service.