mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
Optimize gcs server resubscribe (#8896)
This commit is contained in:
parent
c6ee3cdff4
commit
c295284370
9 changed files with 403 additions and 302 deletions
|
@ -1171,8 +1171,6 @@ cc_test(
|
|||
"//:redis-cli",
|
||||
"//:redis-server",
|
||||
],
|
||||
# TODO(swang): Enable again once pubsub client supports GCS server restart.
|
||||
tags = ["manual"],
|
||||
deps = [
|
||||
":gcs_server_lib",
|
||||
":gcs_test_util_lib",
|
||||
|
|
|
@ -152,9 +152,12 @@ class ActorInfoAccessor {
|
|||
|
||||
/// Reestablish subscription.
|
||||
/// This should be called when GCS server restarts from a failure.
|
||||
/// PubSub server restart will cause GCS server restart. In this case, we need to
|
||||
/// resubscribe from PubSub server, otherwise we only need to fetch data from GCS
|
||||
/// server.
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status AsyncReSubscribe() = 0;
|
||||
/// \param is_pubsub_server_restarted Whether pubsub server is restarted.
|
||||
virtual void AsyncResubscribe(bool is_pubsub_server_restarted) = 0;
|
||||
|
||||
protected:
|
||||
ActorInfoAccessor() = default;
|
||||
|
@ -202,9 +205,12 @@ class JobInfoAccessor {
|
|||
|
||||
/// Reestablish subscription.
|
||||
/// This should be called when GCS server restarts from a failure.
|
||||
/// PubSub server restart will cause GCS server restart. In this case, we need to
|
||||
/// resubscribe from PubSub server, otherwise we only need to fetch data from GCS
|
||||
/// server.
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status AsyncReSubscribe() = 0;
|
||||
/// \param is_pubsub_server_restarted Whether pubsub server is restarted.
|
||||
virtual void AsyncResubscribe(bool is_pubsub_server_restarted) = 0;
|
||||
|
||||
protected:
|
||||
JobInfoAccessor() = default;
|
||||
|
@ -310,9 +316,12 @@ class TaskInfoAccessor {
|
|||
|
||||
/// Reestablish subscription.
|
||||
/// This should be called when GCS server restarts from a failure.
|
||||
/// PubSub server restart will cause GCS server restart. In this case, we need to
|
||||
/// resubscribe from PubSub server, otherwise we only need to fetch data from GCS
|
||||
/// server.
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status AsyncReSubscribe() = 0;
|
||||
/// \param is_pubsub_server_restarted Whether pubsub server is restarted.
|
||||
virtual void AsyncResubscribe(bool is_pubsub_server_restarted) = 0;
|
||||
|
||||
protected:
|
||||
TaskInfoAccessor() = default;
|
||||
|
@ -379,9 +388,12 @@ class ObjectInfoAccessor {
|
|||
|
||||
/// Reestablish subscription.
|
||||
/// This should be called when GCS server restarts from a failure.
|
||||
/// PubSub server restart will cause GCS server restart. In this case, we need to
|
||||
/// resubscribe from PubSub server, otherwise we only need to fetch data from GCS
|
||||
/// server.
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status AsyncReSubscribe() = 0;
|
||||
/// \param is_pubsub_server_restarted Whether pubsub server is restarted.
|
||||
virtual void AsyncResubscribe(bool is_pubsub_server_restarted) = 0;
|
||||
|
||||
protected:
|
||||
ObjectInfoAccessor() = default;
|
||||
|
@ -555,9 +567,12 @@ class NodeInfoAccessor {
|
|||
|
||||
/// Reestablish subscription.
|
||||
/// This should be called when GCS server restarts from a failure.
|
||||
/// PubSub server restart will cause GCS server restart. In this case, we need to
|
||||
/// resubscribe from PubSub server, otherwise we only need to fetch data from GCS
|
||||
/// server.
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status AsyncReSubscribe() = 0;
|
||||
/// \param is_pubsub_server_restarted Whether pubsub server is restarted.
|
||||
virtual void AsyncResubscribe(bool is_pubsub_server_restarted) = 0;
|
||||
|
||||
protected:
|
||||
NodeInfoAccessor() = default;
|
||||
|
@ -657,9 +672,12 @@ class WorkerInfoAccessor {
|
|||
|
||||
/// Reestablish subscription.
|
||||
/// This should be called when GCS server restarts from a failure.
|
||||
/// PubSub server restart will cause GCS server restart. In this case, we need to
|
||||
/// resubscribe from PubSub server, otherwise we only need to fetch data from GCS
|
||||
/// server.
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status AsyncReSubscribe() = 0;
|
||||
/// \param is_pubsub_server_restarted Whether pubsub server is restarted.
|
||||
virtual void AsyncResubscribe(bool is_pubsub_server_restarted) = 0;
|
||||
|
||||
protected:
|
||||
WorkerInfoAccessor() = default;
|
||||
|
|
|
@ -64,7 +64,6 @@ Status ServiceBasedJobInfoAccessor::AsyncSubscribeToFinishedJobs(
|
|||
const SubscribeCallback<JobID, JobTableData> &subscribe, const StatusCallback &done) {
|
||||
RAY_CHECK(subscribe != nullptr);
|
||||
subscribe_operation_ = [this, subscribe](const StatusCallback &done) {
|
||||
RAY_LOG(DEBUG) << "Subscribing finished job.";
|
||||
auto on_subscribe = [subscribe](const std::string &id, const std::string &data) {
|
||||
JobTableData job_data;
|
||||
job_data.ParseFromString(data);
|
||||
|
@ -72,20 +71,17 @@ Status ServiceBasedJobInfoAccessor::AsyncSubscribeToFinishedJobs(
|
|||
subscribe(JobID::FromBinary(id), job_data);
|
||||
}
|
||||
};
|
||||
Status status =
|
||||
client_impl_->GetGcsPubSub().SubscribeAll(JOB_CHANNEL, on_subscribe, done);
|
||||
RAY_LOG(DEBUG) << "Finished subscribing finished job.";
|
||||
return status;
|
||||
return client_impl_->GetGcsPubSub().SubscribeAll(JOB_CHANNEL, on_subscribe, done);
|
||||
};
|
||||
return subscribe_operation_(done);
|
||||
}
|
||||
|
||||
Status ServiceBasedJobInfoAccessor::AsyncReSubscribe() {
|
||||
void ServiceBasedJobInfoAccessor::AsyncResubscribe(bool is_pubsub_server_restarted) {
|
||||
RAY_LOG(INFO) << "Reestablishing subscription for job info.";
|
||||
if (subscribe_operation_ != nullptr) {
|
||||
return subscribe_operation_(nullptr);
|
||||
// If the pub-sub server has restarted, we need to resubscribe to the pub-sub server.
|
||||
if (subscribe_operation_ != nullptr && is_pubsub_server_restarted) {
|
||||
RAY_CHECK_OK(subscribe_operation_(nullptr));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ServiceBasedJobInfoAccessor::AsyncGetAll(
|
||||
|
@ -152,8 +148,7 @@ Status ServiceBasedActorInfoAccessor::AsyncGetByName(
|
|||
request,
|
||||
[name, callback](const Status &status, const rpc::GetNamedActorInfoReply &reply) {
|
||||
if (reply.has_actor_table_data()) {
|
||||
rpc::ActorTableData actor_table_data(reply.actor_table_data());
|
||||
callback(status, actor_table_data);
|
||||
callback(status, reply.actor_table_data());
|
||||
} else {
|
||||
callback(status, boost::none);
|
||||
}
|
||||
|
@ -234,37 +229,32 @@ Status ServiceBasedActorInfoAccessor::AsyncUpdate(
|
|||
Status ServiceBasedActorInfoAccessor::AsyncSubscribeAll(
|
||||
const SubscribeCallback<ActorID, rpc::ActorTableData> &subscribe,
|
||||
const StatusCallback &done) {
|
||||
RAY_LOG(DEBUG) << "Subscribing register or update operations of actors.";
|
||||
RAY_CHECK(subscribe != nullptr);
|
||||
fetch_all_data_operation_ = [this, subscribe](const StatusCallback &done) {
|
||||
auto callback = [subscribe, done](
|
||||
const Status &status,
|
||||
const std::vector<rpc::ActorTableData> &actor_info_list) {
|
||||
for (auto &actor_info : actor_info_list) {
|
||||
subscribe(ActorID::FromBinary(actor_info.actor_id()), actor_info);
|
||||
}
|
||||
if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGetAll(callback));
|
||||
};
|
||||
|
||||
subscribe_all_operation_ = [this, subscribe](const StatusCallback &done) {
|
||||
auto on_subscribe = [subscribe](const std::string &id, const std::string &data) {
|
||||
ActorTableData actor_data;
|
||||
actor_data.ParseFromString(data);
|
||||
subscribe(ActorID::FromBinary(actor_data.actor_id()), actor_data);
|
||||
};
|
||||
auto on_done = [this, subscribe, done](const Status &status) {
|
||||
if (status.ok()) {
|
||||
auto callback = [subscribe, done](
|
||||
const Status &status,
|
||||
const std::vector<rpc::ActorTableData> &actor_info_list) {
|
||||
for (auto &actor_info : actor_info_list) {
|
||||
subscribe(ActorID::FromBinary(actor_info.actor_id()), actor_info);
|
||||
}
|
||||
if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGetAll(callback));
|
||||
} else if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
auto status =
|
||||
client_impl_->GetGcsPubSub().SubscribeAll(ACTOR_CHANNEL, on_subscribe, on_done);
|
||||
RAY_LOG(DEBUG) << "Finished subscribing register or update operations of actors.";
|
||||
return status;
|
||||
return client_impl_->GetGcsPubSub().SubscribeAll(ACTOR_CHANNEL, on_subscribe, done);
|
||||
};
|
||||
return subscribe_all_operation_(done);
|
||||
|
||||
return subscribe_all_operation_(
|
||||
[this, done](const Status &status) { fetch_all_data_operation_(done); });
|
||||
}
|
||||
|
||||
Status ServiceBasedActorInfoAccessor::AsyncSubscribe(
|
||||
|
@ -273,43 +263,44 @@ Status ServiceBasedActorInfoAccessor::AsyncSubscribe(
|
|||
const StatusCallback &done) {
|
||||
RAY_LOG(DEBUG) << "Subscribing update operations of actor, actor id = " << actor_id;
|
||||
RAY_CHECK(subscribe != nullptr) << "Failed to subscribe actor, actor id = " << actor_id;
|
||||
auto subscribe_operation = [this, actor_id, subscribe](const StatusCallback &done) {
|
||||
|
||||
auto fetch_data_operation = [this, actor_id,
|
||||
subscribe](const StatusCallback &fetch_done) {
|
||||
auto callback = [actor_id, subscribe, fetch_done](
|
||||
const Status &status,
|
||||
const boost::optional<rpc::ActorTableData> &result) {
|
||||
if (result) {
|
||||
subscribe(actor_id, *result);
|
||||
}
|
||||
if (fetch_done) {
|
||||
fetch_done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGet(actor_id, callback));
|
||||
};
|
||||
|
||||
auto subscribe_operation = [this, actor_id,
|
||||
subscribe](const StatusCallback &subscribe_done) {
|
||||
auto on_subscribe = [subscribe](const std::string &id, const std::string &data) {
|
||||
ActorTableData actor_data;
|
||||
actor_data.ParseFromString(data);
|
||||
subscribe(ActorID::FromBinary(actor_data.actor_id()), actor_data);
|
||||
};
|
||||
auto on_done = [this, actor_id, subscribe, done](const Status &status) {
|
||||
if (status.ok()) {
|
||||
auto callback = [actor_id, subscribe, done](
|
||||
const Status &status,
|
||||
const boost::optional<rpc::ActorTableData> &result) {
|
||||
if (result) {
|
||||
subscribe(actor_id, *result);
|
||||
}
|
||||
if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGet(actor_id, callback));
|
||||
} else if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
auto status = client_impl_->GetGcsPubSub().Subscribe(ACTOR_CHANNEL, actor_id.Hex(),
|
||||
on_subscribe, on_done);
|
||||
RAY_LOG(DEBUG) << "Finished subscribing update operations of actor, actor id = "
|
||||
<< actor_id;
|
||||
return status;
|
||||
return client_impl_->GetGcsPubSub().Subscribe(ACTOR_CHANNEL, actor_id.Hex(),
|
||||
on_subscribe, subscribe_done);
|
||||
};
|
||||
|
||||
subscribe_operations_[actor_id] = subscribe_operation;
|
||||
return subscribe_operation(done);
|
||||
fetch_data_operations_[actor_id] = fetch_data_operation;
|
||||
return subscribe_operation(
|
||||
[fetch_data_operation, done](const Status &status) { fetch_data_operation(done); });
|
||||
}
|
||||
|
||||
Status ServiceBasedActorInfoAccessor::AsyncUnsubscribe(const ActorID &actor_id) {
|
||||
RAY_LOG(DEBUG) << "Cancelling subscription to an actor, actor id = " << actor_id;
|
||||
auto status = client_impl_->GetGcsPubSub().Unsubscribe(ACTOR_CHANNEL, actor_id.Hex());
|
||||
subscribe_operations_.erase(actor_id);
|
||||
fetch_data_operations_.erase(actor_id);
|
||||
RAY_LOG(DEBUG) << "Finished cancelling subscription to an actor, actor id = "
|
||||
<< actor_id;
|
||||
return status;
|
||||
|
@ -386,15 +377,30 @@ Status ServiceBasedActorInfoAccessor::AsyncGetCheckpointID(
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ServiceBasedActorInfoAccessor::AsyncReSubscribe() {
|
||||
void ServiceBasedActorInfoAccessor::AsyncResubscribe(bool is_pubsub_server_restarted) {
|
||||
RAY_LOG(INFO) << "Reestablishing subscription for actor info.";
|
||||
if (subscribe_all_operation_ != nullptr) {
|
||||
RAY_CHECK_OK(subscribe_all_operation_(nullptr));
|
||||
// If only the GCS sever has restarted, we only need to fetch data from the GCS server.
|
||||
// If the pub-sub server has also restarted, we need to resubscribe to the pub-sub
|
||||
// server first, then fetch data from the GCS server.
|
||||
if (is_pubsub_server_restarted) {
|
||||
if (subscribe_all_operation_ != nullptr) {
|
||||
RAY_CHECK_OK(subscribe_all_operation_(
|
||||
[this](const Status &status) { fetch_all_data_operation_(nullptr); }));
|
||||
}
|
||||
for (auto &item : subscribe_operations_) {
|
||||
auto &actor_id = item.first;
|
||||
RAY_CHECK_OK(item.second([this, actor_id](const Status &status) {
|
||||
fetch_data_operations_[actor_id](nullptr);
|
||||
}));
|
||||
}
|
||||
} else {
|
||||
if (fetch_all_data_operation_ != nullptr) {
|
||||
fetch_all_data_operation_(nullptr);
|
||||
}
|
||||
for (auto &item : fetch_data_operations_) {
|
||||
item.second(nullptr);
|
||||
}
|
||||
}
|
||||
for (auto &item : subscribe_operations_) {
|
||||
RAY_CHECK_OK(item.second(nullptr));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
ServiceBasedNodeInfoAccessor::ServiceBasedNodeInfoAccessor(
|
||||
|
@ -411,7 +417,7 @@ Status ServiceBasedNodeInfoAccessor::RegisterSelf(const GcsNodeInfo &local_node_
|
|||
request.mutable_node_info()->CopyFrom(local_node_info);
|
||||
|
||||
auto operation = [this, request, local_node_info,
|
||||
node_id](SequencerDoneCallback done_callback) {
|
||||
node_id](const SequencerDoneCallback &done_callback) {
|
||||
client_impl_->GetGcsRpcClient().RegisterNode(
|
||||
request, [this, node_id, local_node_info, done_callback](
|
||||
const Status &status, const rpc::RegisterNodeReply &reply) {
|
||||
|
@ -510,40 +516,35 @@ Status ServiceBasedNodeInfoAccessor::AsyncGetAll(
|
|||
Status ServiceBasedNodeInfoAccessor::AsyncSubscribeToNodeChange(
|
||||
const SubscribeCallback<ClientID, GcsNodeInfo> &subscribe,
|
||||
const StatusCallback &done) {
|
||||
RAY_LOG(DEBUG) << "Subscribing node change.";
|
||||
RAY_CHECK(subscribe != nullptr);
|
||||
RAY_CHECK(node_change_callback_ == nullptr);
|
||||
node_change_callback_ = subscribe;
|
||||
|
||||
RAY_CHECK(subscribe != nullptr);
|
||||
subscribe_node_operation_ = [this, subscribe](const StatusCallback &done) {
|
||||
fetch_node_data_operation_ = [this](const StatusCallback &done) {
|
||||
auto callback = [this, done](const Status &status,
|
||||
const std::vector<GcsNodeInfo> &node_info_list) {
|
||||
for (auto &node_info : node_info_list) {
|
||||
HandleNotification(node_info);
|
||||
}
|
||||
if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGetAll(callback));
|
||||
};
|
||||
|
||||
subscribe_node_operation_ = [this](const StatusCallback &done) {
|
||||
auto on_subscribe = [this](const std::string &id, const std::string &data) {
|
||||
GcsNodeInfo node_info;
|
||||
node_info.ParseFromString(data);
|
||||
HandleNotification(node_info);
|
||||
};
|
||||
|
||||
auto on_done = [this, subscribe, done](const Status &status) {
|
||||
// Get nodes from GCS Service.
|
||||
auto callback = [this, subscribe, done](
|
||||
const Status &status,
|
||||
const std::vector<GcsNodeInfo> &node_info_list) {
|
||||
for (auto &node_info : node_info_list) {
|
||||
HandleNotification(node_info);
|
||||
}
|
||||
if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGetAll(callback));
|
||||
};
|
||||
|
||||
auto status =
|
||||
client_impl_->GetGcsPubSub().SubscribeAll(NODE_CHANNEL, on_subscribe, on_done);
|
||||
RAY_LOG(DEBUG) << "Finished subscribing node change.";
|
||||
return status;
|
||||
return client_impl_->GetGcsPubSub().SubscribeAll(NODE_CHANNEL, on_subscribe, done);
|
||||
};
|
||||
return subscribe_node_operation_(done);
|
||||
|
||||
return subscribe_node_operation_([this, subscribe, done](const Status &status) {
|
||||
fetch_node_data_operation_(done);
|
||||
});
|
||||
}
|
||||
|
||||
boost::optional<GcsNodeInfo> ServiceBasedNodeInfoAccessor::Get(
|
||||
|
@ -596,7 +597,7 @@ Status ServiceBasedNodeInfoAccessor::AsyncUpdateResources(
|
|||
}
|
||||
|
||||
auto operation = [this, request, node_id,
|
||||
callback](SequencerDoneCallback done_callback) {
|
||||
callback](const SequencerDoneCallback &done_callback) {
|
||||
client_impl_->GetGcsRpcClient().UpdateResources(
|
||||
request, [node_id, callback, done_callback](
|
||||
const Status &status, const rpc::UpdateResourcesReply &reply) {
|
||||
|
@ -624,7 +625,7 @@ Status ServiceBasedNodeInfoAccessor::AsyncDeleteResources(
|
|||
}
|
||||
|
||||
auto operation = [this, request, node_id,
|
||||
callback](SequencerDoneCallback done_callback) {
|
||||
callback](const SequencerDoneCallback &done_callback) {
|
||||
client_impl_->GetGcsRpcClient().DeleteResources(
|
||||
request, [node_id, callback, done_callback](
|
||||
const Status &status, const rpc::DeleteResourcesReply &reply) {
|
||||
|
@ -643,20 +644,15 @@ Status ServiceBasedNodeInfoAccessor::AsyncDeleteResources(
|
|||
|
||||
Status ServiceBasedNodeInfoAccessor::AsyncSubscribeToResources(
|
||||
const ItemCallback<rpc::NodeResourceChange> &subscribe, const StatusCallback &done) {
|
||||
RAY_LOG(DEBUG) << "Subscribing node resources change.";
|
||||
RAY_CHECK(subscribe != nullptr);
|
||||
|
||||
subscribe_resource_operation_ = [this, subscribe](const StatusCallback &done) {
|
||||
auto on_subscribe = [subscribe](const std::string &id, const std::string &data) {
|
||||
rpc::NodeResourceChange node_resource_change;
|
||||
node_resource_change.ParseFromString(data);
|
||||
subscribe(node_resource_change);
|
||||
};
|
||||
|
||||
auto status = client_impl_->GetGcsPubSub().SubscribeAll(NODE_RESOURCE_CHANNEL,
|
||||
on_subscribe, done);
|
||||
RAY_LOG(DEBUG) << "Finished subscribing node resources change.";
|
||||
return status;
|
||||
return client_impl_->GetGcsPubSub().SubscribeAll(NODE_RESOURCE_CHANNEL, on_subscribe,
|
||||
done);
|
||||
};
|
||||
return subscribe_resource_operation_(done);
|
||||
}
|
||||
|
@ -696,19 +692,15 @@ Status ServiceBasedNodeInfoAccessor::AsyncReportBatchHeartbeat(
|
|||
Status ServiceBasedNodeInfoAccessor::AsyncSubscribeBatchHeartbeat(
|
||||
const ItemCallback<rpc::HeartbeatBatchTableData> &subscribe,
|
||||
const StatusCallback &done) {
|
||||
RAY_LOG(DEBUG) << "Subscribing batch heartbeat.";
|
||||
RAY_CHECK(subscribe != nullptr);
|
||||
|
||||
subscribe_batch_heartbeat_operation_ = [this, subscribe](const StatusCallback &done) {
|
||||
auto on_subscribe = [subscribe](const std::string &id, const std::string &data) {
|
||||
rpc::HeartbeatBatchTableData heartbeat_batch_table_data;
|
||||
heartbeat_batch_table_data.ParseFromString(data);
|
||||
subscribe(heartbeat_batch_table_data);
|
||||
};
|
||||
auto status = client_impl_->GetGcsPubSub().Subscribe(HEARTBEAT_BATCH_CHANNEL, "",
|
||||
on_subscribe, done);
|
||||
RAY_LOG(DEBUG) << "Finished subscribing batch heartbeat.";
|
||||
return status;
|
||||
return client_impl_->GetGcsPubSub().Subscribe(HEARTBEAT_BATCH_CHANNEL, "",
|
||||
on_subscribe, done);
|
||||
};
|
||||
return subscribe_batch_heartbeat_operation_(done);
|
||||
}
|
||||
|
@ -752,18 +744,27 @@ void ServiceBasedNodeInfoAccessor::HandleNotification(const GcsNodeInfo &node_in
|
|||
}
|
||||
}
|
||||
|
||||
Status ServiceBasedNodeInfoAccessor::AsyncReSubscribe() {
|
||||
void ServiceBasedNodeInfoAccessor::AsyncResubscribe(bool is_pubsub_server_restarted) {
|
||||
RAY_LOG(INFO) << "Reestablishing subscription for node info.";
|
||||
if (subscribe_node_operation_ != nullptr) {
|
||||
return subscribe_node_operation_(nullptr);
|
||||
// If only the GCS sever has restarted, we only need to fetch data from the GCS server.
|
||||
// If the pub-sub server has also restarted, we need to resubscribe to the pub-sub
|
||||
// server first, then fetch data from the GCS server.
|
||||
if (is_pubsub_server_restarted) {
|
||||
if (subscribe_node_operation_ != nullptr) {
|
||||
RAY_CHECK_OK(subscribe_node_operation_(
|
||||
[this](const Status &status) { fetch_node_data_operation_(nullptr); }));
|
||||
}
|
||||
if (subscribe_resource_operation_ != nullptr) {
|
||||
RAY_CHECK_OK(subscribe_resource_operation_(nullptr));
|
||||
}
|
||||
if (subscribe_batch_heartbeat_operation_ != nullptr) {
|
||||
RAY_CHECK_OK(subscribe_batch_heartbeat_operation_(nullptr));
|
||||
}
|
||||
} else {
|
||||
if (fetch_node_data_operation_ != nullptr) {
|
||||
fetch_node_data_operation_(nullptr);
|
||||
}
|
||||
}
|
||||
if (subscribe_resource_operation_ != nullptr) {
|
||||
return subscribe_resource_operation_(nullptr);
|
||||
}
|
||||
if (subscribe_batch_heartbeat_operation_ != nullptr) {
|
||||
return subscribe_batch_heartbeat_operation_(nullptr);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
ServiceBasedTaskInfoAccessor::ServiceBasedTaskInfoAccessor(
|
||||
|
@ -829,46 +830,46 @@ Status ServiceBasedTaskInfoAccessor::AsyncDelete(const std::vector<TaskID> &task
|
|||
Status ServiceBasedTaskInfoAccessor::AsyncSubscribe(
|
||||
const TaskID &task_id, const SubscribeCallback<TaskID, rpc::TaskTableData> &subscribe,
|
||||
const StatusCallback &done) {
|
||||
RAY_LOG(DEBUG) << "Subscribing task, task id = " << task_id;
|
||||
RAY_CHECK(subscribe != nullptr) << "Failed to subscribe task, task id = " << task_id;
|
||||
|
||||
auto subscribe_operation = [this, task_id, subscribe](const StatusCallback &done) {
|
||||
auto fetch_data_operation = [this, task_id,
|
||||
subscribe](const StatusCallback &fetch_done) {
|
||||
auto callback = [task_id, subscribe, fetch_done](
|
||||
const Status &status,
|
||||
const boost::optional<rpc::TaskTableData> &result) {
|
||||
if (result) {
|
||||
subscribe(task_id, *result);
|
||||
}
|
||||
if (fetch_done) {
|
||||
fetch_done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGet(task_id, callback));
|
||||
};
|
||||
|
||||
auto subscribe_operation = [this, task_id,
|
||||
subscribe](const StatusCallback &subscribe_done) {
|
||||
auto on_subscribe = [task_id, subscribe](const std::string &id,
|
||||
const std::string &data) {
|
||||
TaskTableData task_data;
|
||||
task_data.ParseFromString(data);
|
||||
subscribe(task_id, task_data);
|
||||
};
|
||||
auto on_done = [this, task_id, subscribe, done](const Status &status) {
|
||||
if (status.ok()) {
|
||||
auto callback = [task_id, subscribe, done](
|
||||
const Status &status,
|
||||
const boost::optional<rpc::TaskTableData> &result) {
|
||||
if (result) {
|
||||
subscribe(task_id, *result);
|
||||
}
|
||||
if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGet(task_id, callback));
|
||||
} else if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
auto status = client_impl_->GetGcsPubSub().Subscribe(TASK_CHANNEL, task_id.Hex(),
|
||||
on_subscribe, on_done);
|
||||
RAY_LOG(DEBUG) << "Finished subscribing task, task id = " << task_id;
|
||||
return status;
|
||||
return client_impl_->GetGcsPubSub().Subscribe(TASK_CHANNEL, task_id.Hex(),
|
||||
on_subscribe, subscribe_done);
|
||||
};
|
||||
|
||||
subscribe_task_operations_[task_id] = subscribe_operation;
|
||||
return subscribe_operation(done);
|
||||
fetch_task_data_operations_[task_id] = fetch_data_operation;
|
||||
return subscribe_operation(
|
||||
[fetch_data_operation, done](const Status &status) { fetch_data_operation(done); });
|
||||
}
|
||||
|
||||
Status ServiceBasedTaskInfoAccessor::AsyncUnsubscribe(const TaskID &task_id) {
|
||||
RAY_LOG(DEBUG) << "Unsubscribing task, task id = " << task_id;
|
||||
auto status = client_impl_->GetGcsPubSub().Unsubscribe(TASK_CHANNEL, task_id.Hex());
|
||||
subscribe_task_operations_.erase(task_id);
|
||||
fetch_task_data_operations_.erase(task_id);
|
||||
RAY_LOG(DEBUG) << "Finished unsubscribing task, task id = " << task_id;
|
||||
return status;
|
||||
}
|
||||
|
@ -916,39 +917,38 @@ Status ServiceBasedTaskInfoAccessor::AsyncSubscribeTaskLease(
|
|||
const TaskID &task_id,
|
||||
const SubscribeCallback<TaskID, boost::optional<rpc::TaskLeaseData>> &subscribe,
|
||||
const StatusCallback &done) {
|
||||
RAY_LOG(DEBUG) << "Subscribing task lease, task id = " << task_id;
|
||||
RAY_CHECK(subscribe != nullptr)
|
||||
<< "Failed to subscribe task lease, task id = " << task_id;
|
||||
|
||||
auto subscribe_operation = [this, task_id, subscribe](const StatusCallback &done) {
|
||||
auto fetch_data_operation = [this, task_id,
|
||||
subscribe](const StatusCallback &fetch_done) {
|
||||
auto callback = [task_id, subscribe, fetch_done](
|
||||
const Status &status,
|
||||
const boost::optional<rpc::TaskLeaseData> &result) {
|
||||
subscribe(task_id, result);
|
||||
if (fetch_done) {
|
||||
fetch_done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGetTaskLease(task_id, callback));
|
||||
};
|
||||
|
||||
auto subscribe_operation = [this, task_id,
|
||||
subscribe](const StatusCallback &subscribe_done) {
|
||||
auto on_subscribe = [task_id, subscribe](const std::string &id,
|
||||
const std::string &data) {
|
||||
TaskLeaseData task_lease_data;
|
||||
task_lease_data.ParseFromString(data);
|
||||
subscribe(task_id, task_lease_data);
|
||||
};
|
||||
auto on_done = [this, task_id, subscribe, done](const Status &status) {
|
||||
if (status.ok()) {
|
||||
auto callback = [task_id, subscribe, done](
|
||||
const Status &status,
|
||||
const boost::optional<rpc::TaskLeaseData> &result) {
|
||||
subscribe(task_id, result);
|
||||
if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGetTaskLease(task_id, callback));
|
||||
} else if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
auto status = client_impl_->GetGcsPubSub().Subscribe(
|
||||
TASK_LEASE_CHANNEL, task_id.Hex(), on_subscribe, on_done);
|
||||
RAY_LOG(DEBUG) << "Finished subscribing task lease, task id = " << task_id;
|
||||
return status;
|
||||
return client_impl_->GetGcsPubSub().Subscribe(TASK_LEASE_CHANNEL, task_id.Hex(),
|
||||
on_subscribe, subscribe_done);
|
||||
};
|
||||
|
||||
subscribe_task_lease_operations_[task_id] = subscribe_operation;
|
||||
return subscribe_operation(done);
|
||||
fetch_task_lease_data_operations_[task_id] = fetch_data_operation;
|
||||
return subscribe_operation(
|
||||
[fetch_data_operation, done](const Status &status) { fetch_data_operation(done); });
|
||||
}
|
||||
|
||||
Status ServiceBasedTaskInfoAccessor::AsyncUnsubscribeTaskLease(const TaskID &task_id) {
|
||||
|
@ -956,6 +956,7 @@ Status ServiceBasedTaskInfoAccessor::AsyncUnsubscribeTaskLease(const TaskID &tas
|
|||
auto status =
|
||||
client_impl_->GetGcsPubSub().Unsubscribe(TASK_LEASE_CHANNEL, task_id.Hex());
|
||||
subscribe_task_lease_operations_.erase(task_id);
|
||||
fetch_task_lease_data_operations_.erase(task_id);
|
||||
RAY_LOG(DEBUG) << "Finished unsubscribing task lease, task id = " << task_id;
|
||||
return status;
|
||||
}
|
||||
|
@ -982,15 +983,32 @@ Status ServiceBasedTaskInfoAccessor::AttemptTaskReconstruction(
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ServiceBasedTaskInfoAccessor::AsyncReSubscribe() {
|
||||
void ServiceBasedTaskInfoAccessor::AsyncResubscribe(bool is_pubsub_server_restarted) {
|
||||
RAY_LOG(INFO) << "Reestablishing subscription for task info.";
|
||||
for (auto &item : subscribe_task_operations_) {
|
||||
RAY_CHECK_OK(item.second(nullptr));
|
||||
// If only the GCS sever has restarted, we only need to fetch data from the GCS server.
|
||||
// If the pub-sub server has also restarted, we need to resubscribe to the pub-sub
|
||||
// server first, then fetch data from the GCS server.
|
||||
if (is_pubsub_server_restarted) {
|
||||
for (auto &item : subscribe_task_operations_) {
|
||||
auto &task_id = item.first;
|
||||
RAY_CHECK_OK(item.second([this, task_id](const Status &status) {
|
||||
fetch_task_data_operations_[task_id](nullptr);
|
||||
}));
|
||||
}
|
||||
for (auto &item : subscribe_task_lease_operations_) {
|
||||
auto &task_id = item.first;
|
||||
RAY_CHECK_OK(item.second([this, task_id](const Status &status) {
|
||||
fetch_task_lease_data_operations_[task_id](nullptr);
|
||||
}));
|
||||
}
|
||||
} else {
|
||||
for (auto &item : fetch_task_data_operations_) {
|
||||
item.second(nullptr);
|
||||
}
|
||||
for (auto &item : fetch_task_lease_data_operations_) {
|
||||
item.second(nullptr);
|
||||
}
|
||||
}
|
||||
for (auto &item : subscribe_task_lease_operations_) {
|
||||
RAY_CHECK_OK(item.second(nullptr));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
ServiceBasedObjectInfoAccessor::ServiceBasedObjectInfoAccessor(
|
||||
|
@ -1093,10 +1111,28 @@ Status ServiceBasedObjectInfoAccessor::AsyncSubscribeToLocations(
|
|||
const ObjectID &object_id,
|
||||
const SubscribeCallback<ObjectID, ObjectChangeNotification> &subscribe,
|
||||
const StatusCallback &done) {
|
||||
RAY_LOG(DEBUG) << "Subscribing object location, object id = " << object_id;
|
||||
RAY_CHECK(subscribe != nullptr)
|
||||
<< "Failed to subscribe object location, object id = " << object_id;
|
||||
auto subscribe_operation = [this, object_id, subscribe](const StatusCallback &done) {
|
||||
|
||||
auto fetch_data_operation = [this, object_id,
|
||||
subscribe](const StatusCallback &fetch_done) {
|
||||
auto callback = [object_id, subscribe, fetch_done](
|
||||
const Status &status,
|
||||
const std::vector<rpc::ObjectTableData> &result) {
|
||||
if (status.ok()) {
|
||||
gcs::ObjectChangeNotification notification(rpc::GcsChangeMode::APPEND_OR_ADD,
|
||||
result);
|
||||
subscribe(object_id, notification);
|
||||
}
|
||||
if (fetch_done) {
|
||||
fetch_done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGetLocations(object_id, callback));
|
||||
};
|
||||
|
||||
auto subscribe_operation = [this, object_id,
|
||||
subscribe](const StatusCallback &subscribe_done) {
|
||||
auto on_subscribe = [object_id, subscribe](const std::string &id,
|
||||
const std::string &data) {
|
||||
rpc::ObjectLocationChange object_location_change;
|
||||
|
@ -1109,40 +1145,32 @@ Status ServiceBasedObjectInfoAccessor::AsyncSubscribeToLocations(
|
|||
gcs::ObjectChangeNotification notification(change_mode, object_data_vector);
|
||||
subscribe(object_id, notification);
|
||||
};
|
||||
auto on_done = [this, object_id, subscribe, done](const Status &status) {
|
||||
if (status.ok()) {
|
||||
auto callback = [object_id, subscribe, done](
|
||||
const Status &status,
|
||||
const std::vector<rpc::ObjectTableData> &result) {
|
||||
if (status.ok()) {
|
||||
gcs::ObjectChangeNotification notification(rpc::GcsChangeMode::APPEND_OR_ADD,
|
||||
result);
|
||||
subscribe(object_id, notification);
|
||||
}
|
||||
if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
RAY_CHECK_OK(AsyncGetLocations(object_id, callback));
|
||||
} else if (done) {
|
||||
done(status);
|
||||
}
|
||||
};
|
||||
auto status = client_impl_->GetGcsPubSub().Subscribe(OBJECT_CHANNEL, object_id.Hex(),
|
||||
on_subscribe, on_done);
|
||||
RAY_LOG(DEBUG) << "Finished subscribing object location, object id = " << object_id;
|
||||
return status;
|
||||
return client_impl_->GetGcsPubSub().Subscribe(OBJECT_CHANNEL, object_id.Hex(),
|
||||
on_subscribe, subscribe_done);
|
||||
};
|
||||
|
||||
subscribe_object_operations_[object_id] = subscribe_operation;
|
||||
return subscribe_operation(done);
|
||||
fetch_object_data_operations_[object_id] = fetch_data_operation;
|
||||
return subscribe_operation(
|
||||
[fetch_data_operation, done](const Status &status) { fetch_data_operation(done); });
|
||||
}
|
||||
|
||||
Status ServiceBasedObjectInfoAccessor::AsyncReSubscribe() {
|
||||
void ServiceBasedObjectInfoAccessor::AsyncResubscribe(bool is_pubsub_server_restarted) {
|
||||
RAY_LOG(INFO) << "Reestablishing subscription for object locations.";
|
||||
for (auto &item : subscribe_object_operations_) {
|
||||
RAY_CHECK_OK(item.second(nullptr));
|
||||
// If only the GCS sever has restarted, we only need to fetch data from the GCS server.
|
||||
// If the pub-sub server has also restarted, we need to resubscribe to the pub-sub
|
||||
// server first, then fetch data from the GCS server.
|
||||
if (is_pubsub_server_restarted) {
|
||||
for (auto &item : subscribe_object_operations_) {
|
||||
RAY_CHECK_OK(item.second([this, item](const Status &status) {
|
||||
fetch_object_data_operations_[item.first](nullptr);
|
||||
}));
|
||||
}
|
||||
} else {
|
||||
for (auto &item : fetch_object_data_operations_) {
|
||||
item.second(nullptr);
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ServiceBasedObjectInfoAccessor::AsyncUnsubscribeToLocations(
|
||||
|
@ -1150,6 +1178,7 @@ Status ServiceBasedObjectInfoAccessor::AsyncUnsubscribeToLocations(
|
|||
RAY_LOG(DEBUG) << "Unsubscribing object location, object id = " << object_id;
|
||||
auto status = client_impl_->GetGcsPubSub().Unsubscribe(OBJECT_CHANNEL, object_id.Hex());
|
||||
subscribe_object_operations_.erase(object_id);
|
||||
fetch_object_data_operations_.erase(object_id);
|
||||
RAY_LOG(DEBUG) << "Finished unsubscribing object location, object id = " << object_id;
|
||||
return status;
|
||||
}
|
||||
|
@ -1225,7 +1254,6 @@ ServiceBasedWorkerInfoAccessor::ServiceBasedWorkerInfoAccessor(
|
|||
Status ServiceBasedWorkerInfoAccessor::AsyncSubscribeToWorkerFailures(
|
||||
const SubscribeCallback<WorkerID, rpc::WorkerFailureData> &subscribe,
|
||||
const StatusCallback &done) {
|
||||
RAY_LOG(DEBUG) << "Subscribing worker failures.";
|
||||
RAY_CHECK(subscribe != nullptr);
|
||||
subscribe_operation_ = [this, subscribe](const StatusCallback &done) {
|
||||
auto on_subscribe = [subscribe](const std::string &id, const std::string &data) {
|
||||
|
@ -1233,20 +1261,18 @@ Status ServiceBasedWorkerInfoAccessor::AsyncSubscribeToWorkerFailures(
|
|||
worker_failure_data.ParseFromString(data);
|
||||
subscribe(WorkerID::FromBinary(id), worker_failure_data);
|
||||
};
|
||||
auto status = client_impl_->GetGcsPubSub().SubscribeAll(WORKER_FAILURE_CHANNEL,
|
||||
on_subscribe, done);
|
||||
RAY_LOG(DEBUG) << "Finished subscribing worker failures.";
|
||||
return status;
|
||||
return client_impl_->GetGcsPubSub().SubscribeAll(WORKER_FAILURE_CHANNEL, on_subscribe,
|
||||
done);
|
||||
};
|
||||
return subscribe_operation_(done);
|
||||
}
|
||||
|
||||
Status ServiceBasedWorkerInfoAccessor::AsyncReSubscribe() {
|
||||
void ServiceBasedWorkerInfoAccessor::AsyncResubscribe(bool is_pubsub_server_restarted) {
|
||||
RAY_LOG(INFO) << "Reestablishing subscription for worker failures.";
|
||||
if (subscribe_operation_ != nullptr) {
|
||||
return subscribe_operation_(nullptr);
|
||||
// If the pub-sub server has restarted, we need to resubscribe to the pub-sub server.
|
||||
if (subscribe_operation_ != nullptr && is_pubsub_server_restarted) {
|
||||
RAY_CHECK_OK(subscribe_operation_(nullptr));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ServiceBasedWorkerInfoAccessor::AsyncReportWorkerFailure(
|
||||
|
|
|
@ -24,6 +24,8 @@ namespace gcs {
|
|||
|
||||
using SubscribeOperation = std::function<Status(const StatusCallback &done)>;
|
||||
|
||||
using FetchDataOperation = std::function<void(const StatusCallback &done)>;
|
||||
|
||||
class ServiceBasedGcsClient;
|
||||
|
||||
/// \class ServiceBasedJobInfoAccessor
|
||||
|
@ -46,11 +48,11 @@ class ServiceBasedJobInfoAccessor : public JobInfoAccessor {
|
|||
|
||||
Status AsyncGetAll(const MultiItemCallback<rpc::JobTableData> &callback) override;
|
||||
|
||||
Status AsyncReSubscribe() override;
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override;
|
||||
|
||||
private:
|
||||
/// Save the subscribe operation in this function, so we can call it again when GCS
|
||||
/// restarts from a failure.
|
||||
/// Save the subscribe operation in this function, so we can call it again when PubSub
|
||||
/// server restarts from a failure.
|
||||
SubscribeOperation subscribe_operation_;
|
||||
|
||||
ServiceBasedGcsClient *client_impl_;
|
||||
|
@ -107,15 +109,23 @@ class ServiceBasedActorInfoAccessor : public ActorInfoAccessor {
|
|||
const ActorID &actor_id,
|
||||
const OptionalItemCallback<rpc::ActorCheckpointIdData> &callback) override;
|
||||
|
||||
Status AsyncReSubscribe() override;
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override;
|
||||
|
||||
private:
|
||||
/// Save the subscribe operation in this function, so we can call it again when GCS
|
||||
/// restarts from a failure.
|
||||
/// Save the subscribe operation in this function, so we can call it again when PubSub
|
||||
/// server restarts from a failure.
|
||||
SubscribeOperation subscribe_all_operation_;
|
||||
|
||||
/// Save the fetch data operation in this function, so we can call it again when GCS
|
||||
/// server restarts from a failure.
|
||||
FetchDataOperation fetch_all_data_operation_;
|
||||
|
||||
/// Save the subscribe operation of actors.
|
||||
std::unordered_map<ActorID, SubscribeOperation> subscribe_operations_;
|
||||
|
||||
/// Save the fetch data operation of actors.
|
||||
std::unordered_map<ActorID, FetchDataOperation> fetch_data_operations_;
|
||||
|
||||
ServiceBasedGcsClient *client_impl_;
|
||||
|
||||
Sequencer<ActorID> sequencer_;
|
||||
|
@ -184,15 +194,19 @@ class ServiceBasedNodeInfoAccessor : public NodeInfoAccessor {
|
|||
const ItemCallback<rpc::HeartbeatBatchTableData> &subscribe,
|
||||
const StatusCallback &done) override;
|
||||
|
||||
Status AsyncReSubscribe() override;
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override;
|
||||
|
||||
private:
|
||||
/// Save the subscribe operation in this function, so we can call it again when GCS
|
||||
/// restarts from a failure.
|
||||
/// Save the subscribe operation in this function, so we can call it again when PubSub
|
||||
/// server restarts from a failure.
|
||||
SubscribeOperation subscribe_node_operation_;
|
||||
SubscribeOperation subscribe_resource_operation_;
|
||||
SubscribeOperation subscribe_batch_heartbeat_operation_;
|
||||
|
||||
/// Save the fetch data operation in this function, so we can call it again when GCS
|
||||
/// server restarts from a failure.
|
||||
FetchDataOperation fetch_node_data_operation_;
|
||||
|
||||
void HandleNotification(const GcsNodeInfo &node_info);
|
||||
|
||||
ServiceBasedGcsClient *client_impl_;
|
||||
|
@ -256,14 +270,19 @@ class ServiceBasedTaskInfoAccessor : public TaskInfoAccessor {
|
|||
const std::shared_ptr<rpc::TaskReconstructionData> &data_ptr,
|
||||
const StatusCallback &callback) override;
|
||||
|
||||
Status AsyncReSubscribe() override;
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override;
|
||||
|
||||
private:
|
||||
/// Save the subscribe operation in this function, so we can call it again when GCS
|
||||
/// restarts from a failure.
|
||||
/// Save the subscribe operations, so we can call them again when PubSub
|
||||
/// server restarts from a failure.
|
||||
std::unordered_map<TaskID, SubscribeOperation> subscribe_task_operations_;
|
||||
std::unordered_map<TaskID, SubscribeOperation> subscribe_task_lease_operations_;
|
||||
|
||||
/// Save the fetch data operation in this function, so we can call it again when GCS
|
||||
/// server restarts from a failure.
|
||||
std::unordered_map<TaskID, FetchDataOperation> fetch_task_data_operations_;
|
||||
std::unordered_map<TaskID, FetchDataOperation> fetch_task_lease_data_operations_;
|
||||
|
||||
ServiceBasedGcsClient *client_impl_;
|
||||
};
|
||||
|
||||
|
@ -295,13 +314,17 @@ class ServiceBasedObjectInfoAccessor : public ObjectInfoAccessor {
|
|||
|
||||
Status AsyncUnsubscribeToLocations(const ObjectID &object_id) override;
|
||||
|
||||
Status AsyncReSubscribe() override;
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override;
|
||||
|
||||
private:
|
||||
/// Save the subscribe operation in this function, so we can call it again when GCS
|
||||
/// restarts from a failure.
|
||||
/// Save the subscribe operations, so we can call them again when PubSub
|
||||
/// server restarts from a failure.
|
||||
std::unordered_map<ObjectID, SubscribeOperation> subscribe_object_operations_;
|
||||
|
||||
/// Save the fetch data operation in this function, so we can call it again when GCS
|
||||
/// server restarts from a failure.
|
||||
std::unordered_map<ObjectID, FetchDataOperation> fetch_object_data_operations_;
|
||||
|
||||
ServiceBasedGcsClient *client_impl_;
|
||||
|
||||
Sequencer<ObjectID> sequencer_;
|
||||
|
@ -362,7 +385,7 @@ class ServiceBasedWorkerInfoAccessor : public WorkerInfoAccessor {
|
|||
const std::unordered_map<std::string, std::string> &worker_info,
|
||||
const StatusCallback &callback) override;
|
||||
|
||||
Status AsyncReSubscribe() override;
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override;
|
||||
|
||||
private:
|
||||
/// Save the subscribe operation in this function, so we can call it again when GCS
|
||||
|
|
|
@ -47,13 +47,13 @@ Status ServiceBasedGcsClient::Connect(boost::asio::io_service &io_service) {
|
|||
};
|
||||
std::pair<std::string, int> address = get_server_address();
|
||||
|
||||
auto re_subscribe = [this]() {
|
||||
RAY_CHECK_OK(job_accessor_->AsyncReSubscribe());
|
||||
RAY_CHECK_OK(actor_accessor_->AsyncReSubscribe());
|
||||
RAY_CHECK_OK(node_accessor_->AsyncReSubscribe());
|
||||
RAY_CHECK_OK(task_accessor_->AsyncReSubscribe());
|
||||
RAY_CHECK_OK(object_accessor_->AsyncReSubscribe());
|
||||
RAY_CHECK_OK(worker_accessor_->AsyncReSubscribe());
|
||||
auto re_subscribe = [this](bool is_pubsub_server_restarted) {
|
||||
job_accessor_->AsyncResubscribe(is_pubsub_server_restarted);
|
||||
actor_accessor_->AsyncResubscribe(is_pubsub_server_restarted);
|
||||
node_accessor_->AsyncResubscribe(is_pubsub_server_restarted);
|
||||
task_accessor_->AsyncResubscribe(is_pubsub_server_restarted);
|
||||
object_accessor_->AsyncResubscribe(is_pubsub_server_restarted);
|
||||
worker_accessor_->AsyncResubscribe(is_pubsub_server_restarted);
|
||||
};
|
||||
|
||||
// Connect to gcs service.
|
||||
|
|
|
@ -483,6 +483,11 @@ class ServiceBasedGcsClientTest : public ::testing::Test {
|
|||
EXPECT_TRUE(WaitForCondition(condition, timeout_ms_.count()));
|
||||
}
|
||||
|
||||
void CheckActorData(const gcs::ActorTableData &actor,
|
||||
rpc::ActorTableData_ActorState expected_state) {
|
||||
ASSERT_TRUE(actor.state() == expected_state);
|
||||
}
|
||||
|
||||
// GCS server.
|
||||
gcs::GcsServerConfig config_;
|
||||
std::unique_ptr<gcs::GcsServer> gcs_server_;
|
||||
|
@ -835,7 +840,7 @@ TEST_F(ServiceBasedGcsClientTest, TestErrorInfo) {
|
|||
ASSERT_TRUE(ReportJobError(error_table_data));
|
||||
}
|
||||
|
||||
TEST_F(ServiceBasedGcsClientTest, TestJobTableReSubscribe) {
|
||||
TEST_F(ServiceBasedGcsClientTest, TestJobTableResubscribe) {
|
||||
// Test that subscription of the job table can still work when GCS server restarts.
|
||||
JobID job_id = JobID::FromInt(1);
|
||||
auto job_table_data = Mocker::GenJobTableData(job_id);
|
||||
|
@ -854,53 +859,79 @@ TEST_F(ServiceBasedGcsClientTest, TestJobTableReSubscribe) {
|
|||
WaitPendingDone(job_update_count, 1);
|
||||
}
|
||||
|
||||
TEST_F(ServiceBasedGcsClientTest, TestActorTableReSubscribe) {
|
||||
TEST_F(ServiceBasedGcsClientTest, TestActorTableResubscribe) {
|
||||
// Test that subscription of the actor table can still work when GCS server restarts.
|
||||
JobID job_id = JobID::FromInt(1);
|
||||
auto actor1_table_data = Mocker::GenActorTableData(job_id);
|
||||
auto actor1_id = ActorID::FromBinary(actor1_table_data->actor_id());
|
||||
auto actor2_table_data = Mocker::GenActorTableData(job_id);
|
||||
auto actor2_id = ActorID::FromBinary(actor2_table_data->actor_id());
|
||||
auto actor_table_data = Mocker::GenActorTableData(job_id);
|
||||
auto actor_id = ActorID::FromBinary(actor_table_data->actor_id());
|
||||
|
||||
// Subscribe to any register or update operations of actors.
|
||||
std::atomic<int> actors_update_count(0);
|
||||
auto subscribe_all = [&actors_update_count](const ActorID &id,
|
||||
const rpc::ActorTableData &result) {
|
||||
++actors_update_count;
|
||||
// Number of notifications for the following `SubscribeAllActors` operation.
|
||||
std::atomic<int> num_subscribe_all_notifications(0);
|
||||
// All the notifications for the following `SubscribeAllActors` operation.
|
||||
std::vector<gcs::ActorTableData> subscribe_all_notifications;
|
||||
auto subscribe_all = [&num_subscribe_all_notifications, &subscribe_all_notifications](
|
||||
const ActorID &id, const rpc::ActorTableData &data) {
|
||||
subscribe_all_notifications.emplace_back(data);
|
||||
++num_subscribe_all_notifications;
|
||||
};
|
||||
// Subscribe to updates of all actors.
|
||||
ASSERT_TRUE(SubscribeAllActors(subscribe_all));
|
||||
|
||||
// Subscribe to any update operations of actor1.
|
||||
std::atomic<int> actor1_update_count(0);
|
||||
auto actor1_subscribe = [&actor1_update_count](const ActorID &actor_id,
|
||||
const gcs::ActorTableData &data) {
|
||||
++actor1_update_count;
|
||||
// Number of notifications for the following `SubscribeActor` operation.
|
||||
std::atomic<int> num_subscribe_one_notifications(0);
|
||||
// All the notifications for the following `SubscribeActor` operation.
|
||||
std::vector<gcs::ActorTableData> subscribe_one_notifications;
|
||||
auto actor_subscribe = [&num_subscribe_one_notifications, &subscribe_one_notifications](
|
||||
const ActorID &actor_id, const gcs::ActorTableData &data) {
|
||||
subscribe_one_notifications.emplace_back(data);
|
||||
++num_subscribe_one_notifications;
|
||||
};
|
||||
ASSERT_TRUE(SubscribeActor(actor1_id, actor1_subscribe));
|
||||
// Subscribe to updates for this actor.
|
||||
ASSERT_TRUE(SubscribeActor(actor_id, actor_subscribe));
|
||||
|
||||
// Subscribe to any update operations of actor2.
|
||||
std::atomic<int> actor2_update_count(0);
|
||||
auto actor2_subscribe = [&actor2_update_count](const ActorID &actor_id,
|
||||
const gcs::ActorTableData &data) {
|
||||
++actor2_update_count;
|
||||
};
|
||||
ASSERT_TRUE(SubscribeActor(actor2_id, actor2_subscribe));
|
||||
ASSERT_TRUE(RegisterActor(actor_table_data));
|
||||
|
||||
ASSERT_TRUE(RegisterActor(actor1_table_data));
|
||||
ASSERT_TRUE(RegisterActor(actor2_table_data));
|
||||
WaitPendingDone(actor2_update_count, 1);
|
||||
UnsubscribeActor(actor2_id);
|
||||
// We should receive a new ALIVE notification from the subscribe channel.
|
||||
WaitPendingDone(num_subscribe_all_notifications, 1);
|
||||
WaitPendingDone(num_subscribe_one_notifications, 1);
|
||||
CheckActorData(subscribe_all_notifications[0],
|
||||
rpc::ActorTableData_ActorState::ActorTableData_ActorState_ALIVE);
|
||||
CheckActorData(subscribe_one_notifications[0],
|
||||
rpc::ActorTableData_ActorState::ActorTableData_ActorState_ALIVE);
|
||||
|
||||
// Restart GCS server.
|
||||
RestartGcsServer();
|
||||
|
||||
ASSERT_TRUE(UpdateActor(actor1_id, actor1_table_data));
|
||||
ASSERT_TRUE(UpdateActor(actor2_id, actor2_table_data));
|
||||
WaitPendingDone(actor1_update_count, 3);
|
||||
WaitPendingDone(actor2_update_count, 1);
|
||||
UnsubscribeActor(actor1_id);
|
||||
// We need to send a RPC to detect GCS server restart. Then GCS client will
|
||||
// reconnect to GCS server and resubscribe.
|
||||
ASSERT_TRUE(GetActor(actor_id).state() ==
|
||||
rpc::ActorTableData_ActorState::ActorTableData_ActorState_ALIVE);
|
||||
|
||||
// When GCS client detects that GCS server has restarted, but the pub-sub server
|
||||
// didn't restart, it will fetch data again from the GCS server. So we'll receive
|
||||
// another notification of ALIVE state.
|
||||
WaitPendingDone(num_subscribe_all_notifications, 2);
|
||||
WaitPendingDone(num_subscribe_one_notifications, 2);
|
||||
CheckActorData(subscribe_all_notifications[1],
|
||||
rpc::ActorTableData_ActorState::ActorTableData_ActorState_ALIVE);
|
||||
CheckActorData(subscribe_one_notifications[1],
|
||||
rpc::ActorTableData_ActorState::ActorTableData_ActorState_ALIVE);
|
||||
|
||||
// Update the actor state to DEAD.
|
||||
actor_table_data->set_state(
|
||||
rpc::ActorTableData_ActorState::ActorTableData_ActorState_DEAD);
|
||||
ASSERT_TRUE(UpdateActor(actor_id, actor_table_data));
|
||||
|
||||
// We should receive a new DEAD notification from the subscribe channel.
|
||||
WaitPendingDone(num_subscribe_all_notifications, 3);
|
||||
WaitPendingDone(num_subscribe_one_notifications, 3);
|
||||
CheckActorData(subscribe_all_notifications[2],
|
||||
rpc::ActorTableData_ActorState::ActorTableData_ActorState_DEAD);
|
||||
CheckActorData(subscribe_one_notifications[2],
|
||||
rpc::ActorTableData_ActorState::ActorTableData_ActorState_DEAD);
|
||||
}
|
||||
|
||||
TEST_F(ServiceBasedGcsClientTest, TestObjectTableReSubscribe) {
|
||||
TEST_F(ServiceBasedGcsClientTest, TestObjectTableResubscribe) {
|
||||
ObjectID object1_id = ObjectID::FromRandom();
|
||||
ObjectID object2_id = ObjectID::FromRandom();
|
||||
ClientID node_id = ClientID::FromRandom();
|
||||
|
@ -942,7 +973,7 @@ TEST_F(ServiceBasedGcsClientTest, TestObjectTableReSubscribe) {
|
|||
WaitPendingDone(object2_change_count, 2);
|
||||
}
|
||||
|
||||
TEST_F(ServiceBasedGcsClientTest, TestNodeTableReSubscribe) {
|
||||
TEST_F(ServiceBasedGcsClientTest, TestNodeTableResubscribe) {
|
||||
// Test that subscription of the node table can still work when GCS server restarts.
|
||||
// Subscribe to node addition and removal events from GCS and cache those information.
|
||||
std::atomic<int> node_change_count(0);
|
||||
|
@ -968,8 +999,6 @@ TEST_F(ServiceBasedGcsClientTest, TestNodeTableReSubscribe) {
|
|||
};
|
||||
ASSERT_TRUE(SubscribeBatchHeartbeat(batch_heartbeat_subscribe));
|
||||
|
||||
RestartGcsServer();
|
||||
|
||||
auto node_info = Mocker::GenNodeInfo(1);
|
||||
ASSERT_TRUE(RegisterNode(*node_info));
|
||||
ClientID node_id = ClientID::FromBinary(node_info->node_id());
|
||||
|
@ -978,13 +1007,23 @@ TEST_F(ServiceBasedGcsClientTest, TestNodeTableReSubscribe) {
|
|||
auto heartbeat = std::make_shared<rpc::HeartbeatTableData>();
|
||||
heartbeat->set_client_id(node_info->node_id());
|
||||
ASSERT_TRUE(ReportHeartbeat(heartbeat));
|
||||
|
||||
WaitPendingDone(node_change_count, 1);
|
||||
WaitPendingDone(resource_change_count, 1);
|
||||
WaitPendingDone(batch_heartbeat_count, 1);
|
||||
|
||||
RestartGcsServer();
|
||||
|
||||
node_info = Mocker::GenNodeInfo(1);
|
||||
ASSERT_TRUE(RegisterNode(*node_info));
|
||||
node_id = ClientID::FromBinary(node_info->node_id());
|
||||
ASSERT_TRUE(UpdateResources(node_id, key));
|
||||
heartbeat->set_client_id(node_info->node_id());
|
||||
ASSERT_TRUE(ReportHeartbeat(heartbeat));
|
||||
|
||||
WaitPendingDone(node_change_count, 2);
|
||||
WaitPendingDone(resource_change_count, 2);
|
||||
WaitPendingDone(batch_heartbeat_count, 2);
|
||||
}
|
||||
|
||||
TEST_F(ServiceBasedGcsClientTest, TestTaskTableReSubscribe) {
|
||||
TEST_F(ServiceBasedGcsClientTest, TestTaskTableResubscribe) {
|
||||
JobID job_id = JobID::FromInt(6);
|
||||
TaskID task_id = TaskID::ForDriverTask(job_id);
|
||||
auto task_table_data = Mocker::GenTaskTableData(job_id.Binary(), task_id.Binary());
|
||||
|
@ -1023,7 +1062,7 @@ TEST_F(ServiceBasedGcsClientTest, TestTaskTableReSubscribe) {
|
|||
WaitPendingDone(task_count, 1);
|
||||
}
|
||||
|
||||
TEST_F(ServiceBasedGcsClientTest, TestWorkerTableReSubscribe) {
|
||||
TEST_F(ServiceBasedGcsClientTest, TestWorkerTableResubscribe) {
|
||||
// Subscribe to all unexpected failure of workers from GCS.
|
||||
std::atomic<int> worker_failure_count(0);
|
||||
auto on_subscribe = [&worker_failure_count](const WorkerID &worker_id,
|
||||
|
|
|
@ -289,7 +289,7 @@ struct GcsServerMocker {
|
|||
return Status::NotImplemented("");
|
||||
}
|
||||
|
||||
Status AsyncReSubscribe() override { return Status::NotImplemented(""); }
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override {}
|
||||
};
|
||||
|
||||
class MockedErrorInfoAccessor : public gcs::ErrorInfoAccessor {
|
||||
|
|
|
@ -82,7 +82,7 @@ class RedisLogBasedActorInfoAccessor : public ActorInfoAccessor {
|
|||
const ActorID &actor_id,
|
||||
const OptionalItemCallback<ActorCheckpointIdData> &callback) override;
|
||||
|
||||
Status AsyncReSubscribe() override { return Status::NotImplemented(""); }
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override {}
|
||||
|
||||
protected:
|
||||
virtual std::vector<ActorID> GetAllActorID() const;
|
||||
|
@ -182,9 +182,7 @@ class RedisJobInfoAccessor : public JobInfoAccessor {
|
|||
return Status::NotImplemented("AsyncGetAll not implemented");
|
||||
}
|
||||
|
||||
Status AsyncReSubscribe() override {
|
||||
return Status::NotImplemented("AsyncReSubscribe not implemented");
|
||||
}
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override {}
|
||||
|
||||
private:
|
||||
/// Append job information to GCS asynchronously.
|
||||
|
@ -242,9 +240,7 @@ class RedisTaskInfoAccessor : public TaskInfoAccessor {
|
|||
const std::shared_ptr<TaskReconstructionData> &data_ptr,
|
||||
const StatusCallback &callback) override;
|
||||
|
||||
Status AsyncReSubscribe() override {
|
||||
return Status::NotImplemented("AsyncReSubscribe not implemented");
|
||||
}
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override {}
|
||||
|
||||
private:
|
||||
RedisGcsClient *client_impl_{nullptr};
|
||||
|
@ -295,7 +291,7 @@ class RedisObjectInfoAccessor : public ObjectInfoAccessor {
|
|||
|
||||
Status AsyncUnsubscribeToLocations(const ObjectID &object_id) override;
|
||||
|
||||
Status AsyncReSubscribe() override { return Status::NotImplemented(""); }
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override {}
|
||||
|
||||
private:
|
||||
RedisGcsClient *client_impl_{nullptr};
|
||||
|
@ -376,9 +372,7 @@ class RedisNodeInfoAccessor : public NodeInfoAccessor {
|
|||
const ItemCallback<HeartbeatBatchTableData> &subscribe,
|
||||
const StatusCallback &done) override;
|
||||
|
||||
Status AsyncReSubscribe() override {
|
||||
return Status::NotImplemented("AsyncReSubscribe not implemented");
|
||||
}
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override {}
|
||||
|
||||
private:
|
||||
RedisGcsClient *client_impl_{nullptr};
|
||||
|
@ -453,7 +447,7 @@ class RedisWorkerInfoAccessor : public WorkerInfoAccessor {
|
|||
const std::unordered_map<std::string, std::string> &worker_info,
|
||||
const StatusCallback &callback) override;
|
||||
|
||||
Status AsyncReSubscribe() override { return Status::NotImplemented(""); }
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override {}
|
||||
|
||||
private:
|
||||
RedisGcsClient *client_impl_{nullptr};
|
||||
|
|
|
@ -87,7 +87,7 @@ class GcsRpcClient {
|
|||
GcsRpcClient(const std::string &address, const int port,
|
||||
ClientCallManager &client_call_manager,
|
||||
std::function<std::pair<std::string, int>()> get_server_address = nullptr,
|
||||
std::function<void()> reconnected_callback = nullptr)
|
||||
std::function<void(bool)> reconnected_callback = nullptr)
|
||||
: client_call_manager_(client_call_manager),
|
||||
get_server_address_(std::move(get_server_address)),
|
||||
reconnected_callback_(std::move(reconnected_callback)) {
|
||||
|
@ -255,7 +255,10 @@ class GcsRpcClient {
|
|||
if (index < RayConfig::instance().ping_gcs_rpc_server_max_retries()) {
|
||||
Init(address.first, address.second, client_call_manager_);
|
||||
if (reconnected_callback_) {
|
||||
reconnected_callback_();
|
||||
// TODO(ffbin): Once we separate the pubsub server and storage addresses, we can
|
||||
// judge whether pubsub server is restarted. Currently, we only support the
|
||||
// scenario where pubsub server does not restart.
|
||||
reconnected_callback_(false);
|
||||
}
|
||||
} else {
|
||||
RAY_LOG(FATAL) << "Couldn't reconnect to GCS server. The last attempted GCS "
|
||||
|
@ -275,7 +278,7 @@ class GcsRpcClient {
|
|||
/// Note, we use ping to detect whether the reconnection is successful. If the ping
|
||||
/// succeeds but the RPC connection fails, this function might be called called again.
|
||||
/// So it needs to be idempotent.
|
||||
std::function<void()> reconnected_callback_;
|
||||
std::function<void(bool)> reconnected_callback_;
|
||||
|
||||
/// The gRPC-generated stub.
|
||||
std::unique_ptr<GrpcClient<JobInfoGcsService>> job_info_grpc_client_;
|
||||
|
|
Loading…
Add table
Reference in a new issue