mirror of
https://github.com/vale981/ray
synced 2025-03-06 18:41:40 -05:00
[GCS]Only publish fileds used by sub clients in WorkerTableData (#13508)
This commit is contained in:
parent
6c9088eb62
commit
b2a6e55289
8 changed files with 33 additions and 19 deletions
|
@ -657,13 +657,14 @@ class WorkerInfoAccessor {
|
||||||
virtual ~WorkerInfoAccessor() = default;
|
virtual ~WorkerInfoAccessor() = default;
|
||||||
|
|
||||||
/// Subscribe to all unexpected failure of workers from GCS asynchronously.
|
/// Subscribe to all unexpected failure of workers from GCS asynchronously.
|
||||||
/// Note that this does not include workers that failed due to node failure.
|
/// Note that this does not include workers that failed due to node failure
|
||||||
|
/// and only fileds in WorkerDeltaData would be published.
|
||||||
///
|
///
|
||||||
/// \param subscribe Callback that will be called each time when a worker failed.
|
/// \param subscribe Callback that will be called each time when a worker failed.
|
||||||
/// \param done Callback that will be called when subscription is complete.
|
/// \param done Callback that will be called when subscription is complete.
|
||||||
/// \return Status
|
/// \return Status
|
||||||
virtual Status AsyncSubscribeToWorkerFailures(
|
virtual Status AsyncSubscribeToWorkerFailures(
|
||||||
const ItemCallback<rpc::WorkerTableData> &subscribe,
|
const ItemCallback<rpc::WorkerDeltaData> &subscribe,
|
||||||
const StatusCallback &done) = 0;
|
const StatusCallback &done) = 0;
|
||||||
|
|
||||||
/// Report a worker failure to GCS asynchronously.
|
/// Report a worker failure to GCS asynchronously.
|
||||||
|
|
|
@ -1311,11 +1311,11 @@ ServiceBasedWorkerInfoAccessor::ServiceBasedWorkerInfoAccessor(
|
||||||
: client_impl_(client_impl) {}
|
: client_impl_(client_impl) {}
|
||||||
|
|
||||||
Status ServiceBasedWorkerInfoAccessor::AsyncSubscribeToWorkerFailures(
|
Status ServiceBasedWorkerInfoAccessor::AsyncSubscribeToWorkerFailures(
|
||||||
const ItemCallback<rpc::WorkerTableData> &subscribe, const StatusCallback &done) {
|
const ItemCallback<rpc::WorkerDeltaData> &subscribe, const StatusCallback &done) {
|
||||||
RAY_CHECK(subscribe != nullptr);
|
RAY_CHECK(subscribe != nullptr);
|
||||||
subscribe_operation_ = [this, subscribe](const StatusCallback &done) {
|
subscribe_operation_ = [this, subscribe](const StatusCallback &done) {
|
||||||
auto on_subscribe = [subscribe](const std::string &id, const std::string &data) {
|
auto on_subscribe = [subscribe](const std::string &id, const std::string &data) {
|
||||||
rpc::WorkerTableData worker_failure_data;
|
rpc::WorkerDeltaData worker_failure_data;
|
||||||
worker_failure_data.ParseFromString(data);
|
worker_failure_data.ParseFromString(data);
|
||||||
subscribe(worker_failure_data);
|
subscribe(worker_failure_data);
|
||||||
};
|
};
|
||||||
|
|
|
@ -407,7 +407,7 @@ class ServiceBasedWorkerInfoAccessor : public WorkerInfoAccessor {
|
||||||
virtual ~ServiceBasedWorkerInfoAccessor() = default;
|
virtual ~ServiceBasedWorkerInfoAccessor() = default;
|
||||||
|
|
||||||
Status AsyncSubscribeToWorkerFailures(
|
Status AsyncSubscribeToWorkerFailures(
|
||||||
const ItemCallback<rpc::WorkerTableData> &subscribe,
|
const ItemCallback<rpc::WorkerDeltaData> &subscribe,
|
||||||
const StatusCallback &done) override;
|
const StatusCallback &done) override;
|
||||||
|
|
||||||
Status AsyncReportWorkerFailure(const std::shared_ptr<rpc::WorkerTableData> &data_ptr,
|
Status AsyncReportWorkerFailure(const std::shared_ptr<rpc::WorkerTableData> &data_ptr,
|
||||||
|
|
|
@ -495,7 +495,7 @@ class ServiceBasedGcsClientTest : public ::testing::Test {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool SubscribeToWorkerFailures(
|
bool SubscribeToWorkerFailures(
|
||||||
const gcs::ItemCallback<rpc::WorkerTableData> &subscribe) {
|
const gcs::ItemCallback<rpc::WorkerDeltaData> &subscribe) {
|
||||||
std::promise<bool> promise;
|
std::promise<bool> promise;
|
||||||
RAY_CHECK_OK(gcs_client_->Workers().AsyncSubscribeToWorkerFailures(
|
RAY_CHECK_OK(gcs_client_->Workers().AsyncSubscribeToWorkerFailures(
|
||||||
subscribe, [&promise](Status status) { promise.set_value(status.ok()); }));
|
subscribe, [&promise](Status status) { promise.set_value(status.ok()); }));
|
||||||
|
@ -922,7 +922,7 @@ TEST_F(ServiceBasedGcsClientTest, TestStats) {
|
||||||
TEST_F(ServiceBasedGcsClientTest, TestWorkerInfo) {
|
TEST_F(ServiceBasedGcsClientTest, TestWorkerInfo) {
|
||||||
// Subscribe to all unexpected failure of workers from GCS.
|
// Subscribe to all unexpected failure of workers from GCS.
|
||||||
std::atomic<int> worker_failure_count(0);
|
std::atomic<int> worker_failure_count(0);
|
||||||
auto on_subscribe = [&worker_failure_count](const rpc::WorkerTableData &result) {
|
auto on_subscribe = [&worker_failure_count](const rpc::WorkerDeltaData &result) {
|
||||||
++worker_failure_count;
|
++worker_failure_count;
|
||||||
};
|
};
|
||||||
ASSERT_TRUE(SubscribeToWorkerFailures(on_subscribe));
|
ASSERT_TRUE(SubscribeToWorkerFailures(on_subscribe));
|
||||||
|
@ -1168,7 +1168,7 @@ TEST_F(ServiceBasedGcsClientTest, TestTaskTableResubscribe) {
|
||||||
TEST_F(ServiceBasedGcsClientTest, TestWorkerTableResubscribe) {
|
TEST_F(ServiceBasedGcsClientTest, TestWorkerTableResubscribe) {
|
||||||
// Subscribe to all unexpected failure of workers from GCS.
|
// Subscribe to all unexpected failure of workers from GCS.
|
||||||
std::atomic<int> worker_failure_count(0);
|
std::atomic<int> worker_failure_count(0);
|
||||||
auto on_subscribe = [&worker_failure_count](const rpc::WorkerTableData &result) {
|
auto on_subscribe = [&worker_failure_count](const rpc::WorkerDeltaData &result) {
|
||||||
++worker_failure_count;
|
++worker_failure_count;
|
||||||
};
|
};
|
||||||
ASSERT_TRUE(SubscribeToWorkerFailures(on_subscribe));
|
ASSERT_TRUE(SubscribeToWorkerFailures(on_subscribe));
|
||||||
|
|
|
@ -52,8 +52,15 @@ void GcsWorkerManager::HandleReportWorkerFailure(
|
||||||
<< ", address = " << worker_address.ip_address();
|
<< ", address = " << worker_address.ip_address();
|
||||||
} else {
|
} else {
|
||||||
stats::UnintentionalWorkerFailures.Record(1);
|
stats::UnintentionalWorkerFailures.Record(1);
|
||||||
|
// Only publish worker_id and raylet_id in address as they are the only fields used
|
||||||
|
// by sub clients.
|
||||||
|
auto worker_failure_delta = std::make_shared<rpc::WorkerDeltaData>();
|
||||||
|
worker_failure_delta->set_worker_id(
|
||||||
|
worker_failure_data->worker_address().worker_id());
|
||||||
|
worker_failure_delta->set_raylet_id(
|
||||||
|
worker_failure_data->worker_address().raylet_id());
|
||||||
RAY_CHECK_OK(gcs_pub_sub_->Publish(WORKER_CHANNEL, worker_id.Hex(),
|
RAY_CHECK_OK(gcs_pub_sub_->Publish(WORKER_CHANNEL, worker_id.Hex(),
|
||||||
worker_failure_data->SerializeAsString(),
|
worker_failure_delta->SerializeAsString(),
|
||||||
nullptr));
|
nullptr));
|
||||||
}
|
}
|
||||||
GCS_RPC_SEND_REPLY(send_reply_callback, reply, status);
|
GCS_RPC_SEND_REPLY(send_reply_callback, reply, status);
|
||||||
|
|
|
@ -393,6 +393,12 @@ message WorkerTableData {
|
||||||
map<string, bytes> worker_info = 6;
|
map<string, bytes> worker_info = 6;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fields to publish when worker fails.
|
||||||
|
message WorkerDeltaData {
|
||||||
|
bytes raylet_id = 1;
|
||||||
|
bytes worker_id = 2;
|
||||||
|
}
|
||||||
|
|
||||||
message ResourceMap {
|
message ResourceMap {
|
||||||
map<string, ResourceTableData> items = 1;
|
map<string, ResourceTableData> items = 1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -299,8 +299,8 @@ ray::Status NodeManager::RegisterGcs() {
|
||||||
// node failure. These workers can be identified by comparing the raylet_id
|
// node failure. These workers can be identified by comparing the raylet_id
|
||||||
// in their rpc::Address to the ID of a failed raylet.
|
// in their rpc::Address to the ID of a failed raylet.
|
||||||
const auto &worker_failure_handler =
|
const auto &worker_failure_handler =
|
||||||
[this](const rpc::WorkerTableData &worker_failure_data) {
|
[this](const rpc::WorkerDeltaData &worker_failure_data) {
|
||||||
HandleUnexpectedWorkerFailure(worker_failure_data.worker_address());
|
HandleUnexpectedWorkerFailure(worker_failure_data);
|
||||||
};
|
};
|
||||||
RAY_CHECK_OK(gcs_client_->Workers().AsyncSubscribeToWorkerFailures(
|
RAY_CHECK_OK(gcs_client_->Workers().AsyncSubscribeToWorkerFailures(
|
||||||
worker_failure_handler, /*done_callback=*/nullptr));
|
worker_failure_handler, /*done_callback=*/nullptr));
|
||||||
|
@ -716,14 +716,14 @@ void NodeManager::NodeRemoved(const NodeID &node_id) {
|
||||||
|
|
||||||
// Clean up workers that were owned by processes that were on the failed
|
// Clean up workers that were owned by processes that were on the failed
|
||||||
// node.
|
// node.
|
||||||
rpc::Address address;
|
rpc::WorkerDeltaData data;
|
||||||
address.set_raylet_id(node_id.Binary());
|
data.set_raylet_id(node_id.Binary());
|
||||||
HandleUnexpectedWorkerFailure(address);
|
HandleUnexpectedWorkerFailure(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
void NodeManager::HandleUnexpectedWorkerFailure(const rpc::Address &address) {
|
void NodeManager::HandleUnexpectedWorkerFailure(const rpc::WorkerDeltaData &data) {
|
||||||
const WorkerID worker_id = WorkerID::FromBinary(address.worker_id());
|
const WorkerID worker_id = WorkerID::FromBinary(data.worker_id());
|
||||||
const NodeID node_id = NodeID::FromBinary(address.raylet_id());
|
const NodeID node_id = NodeID::FromBinary(data.raylet_id());
|
||||||
if (!worker_id.IsNil()) {
|
if (!worker_id.IsNil()) {
|
||||||
RAY_LOG(DEBUG) << "Worker " << worker_id << " failed";
|
RAY_LOG(DEBUG) << "Worker " << worker_id << " failed";
|
||||||
failed_workers_cache_.insert(worker_id);
|
failed_workers_cache_.insert(worker_id);
|
||||||
|
|
|
@ -172,8 +172,8 @@ class NodeManager : public rpc::NodeManagerServiceHandler,
|
||||||
|
|
||||||
/// Handle an unexpected failure notification from GCS pubsub.
|
/// Handle an unexpected failure notification from GCS pubsub.
|
||||||
///
|
///
|
||||||
/// \param worker_address The address of the worker that died.
|
/// \param data The data of the worker that died.
|
||||||
void HandleUnexpectedWorkerFailure(const rpc::Address &worker_address);
|
void HandleUnexpectedWorkerFailure(const rpc::WorkerDeltaData &data);
|
||||||
|
|
||||||
/// Handler for the addition of a new node.
|
/// Handler for the addition of a new node.
|
||||||
///
|
///
|
||||||
|
|
Loading…
Add table
Reference in a new issue