[Core] Revert "[Core] Batch PinObjectIDs requests from Raylet client (#24322)" and "[Core] rename PinObjectIDs to PinObjectID (#24451)" (#24741)

we noticed performance regression for nightly test shuffle_1tb_5000_partitions. concretely the test previously takes 1h10m to finish but now it takes more than 2h30minutes.

after investigation we believe mostly likely 5a82640 caused the regression.

here is the run before 5a82640: https://console.anyscale.com/o/anyscale-internal/projects/prj_SVFGM5yBqK6DHCfLtRMryXHM/clusters/ses_1ejykCYq9BnkC5v8ZJjrqc2b?command-history-section=command_history
here is the run after 5a82640:
https://console.anyscale.com/o/anyscale-internal/projects/prj_SVFGM5yBqK6DHCfLtRMryXHM/clusters/ses_Lr5N8jVRdHCWJWYA2SRaUkzZ?command-history-section=command_history
This commit is contained in:
Chen Shen 2022-05-12 16:17:40 -07:00 committed by GitHub
parent 0a0c52e351
commit 02042e1305
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 212 additions and 314 deletions

View file

@ -85,7 +85,7 @@ of what the event stats look like:
CoreWorkerService.grpc_client.GetObjectLocationsOwner - 51333 total (0 active), CPU time: mean = 25.166 us, total = 1.292 s CoreWorkerService.grpc_client.GetObjectLocationsOwner - 51333 total (0 active), CPU time: mean = 25.166 us, total = 1.292 s
ObjectManager.ObjectDeleted - 43188 total (0 active), CPU time: mean = 26.017 us, total = 1.124 s ObjectManager.ObjectDeleted - 43188 total (0 active), CPU time: mean = 26.017 us, total = 1.124 s
CoreWorkerService.grpc_client.RemoveObjectLocationOwner - 43177 total (0 active), CPU time: mean = 2.368 us, total = 102.252 ms CoreWorkerService.grpc_client.RemoveObjectLocationOwner - 43177 total (0 active), CPU time: mean = 2.368 us, total = 102.252 ms
NodeManagerService.grpc_server.PinObjectID - 40000 total (0 active), CPU time: mean = 194.860 us, total = 7.794 s NodeManagerService.grpc_server.PinObjectIDs - 40000 total (0 active), CPU time: mean = 194.860 us, total = 7.794 s
Callback latency injection Callback latency injection
-------------------------- --------------------------

View file

@ -104,9 +104,9 @@ class MockNodeManager : public NodeManager {
rpc::SendReplyCallback send_reply_callback), rpc::SendReplyCallback send_reply_callback),
(override)); (override));
MOCK_METHOD(void, MOCK_METHOD(void,
HandlePinObjectID, HandlePinObjectIDs,
(const rpc::PinObjectIDRequest &request, (const rpc::PinObjectIDsRequest &request,
rpc::PinObjectIDReply *reply, rpc::PinObjectIDsReply *reply,
rpc::SendReplyCallback send_reply_callback), rpc::SendReplyCallback send_reply_callback),
(override)); (override));
MOCK_METHOD(void, MOCK_METHOD(void,

View file

@ -17,10 +17,10 @@ namespace ray {
class MockPinObjectsInterface : public PinObjectsInterface { class MockPinObjectsInterface : public PinObjectsInterface {
public: public:
MOCK_METHOD(void, MOCK_METHOD(void,
PinObjectID, PinObjectIDs,
(const rpc::Address &caller_address, (const rpc::Address &caller_address,
const ObjectID &object_id, const std::vector<ObjectID> &object_ids,
rpc::ClientCallback<rpc::PinObjectIDReply> callback), const ray::rpc::ClientCallback<ray::rpc::PinObjectIDsReply> &callback),
(override)); (override));
}; };
@ -189,10 +189,10 @@ class MockRayletClientInterface : public RayletClientInterface {
const rpc::ClientCallback<rpc::ReleaseUnusedBundlesReply> &callback), const rpc::ClientCallback<rpc::ReleaseUnusedBundlesReply> &callback),
(override)); (override));
MOCK_METHOD(void, MOCK_METHOD(void,
PinObjectID, PinObjectIDs,
(const rpc::Address &caller_address, (const rpc::Address &caller_address,
const ObjectID &object_id, const std::vector<ObjectID> &object_ids,
rpc::ClientCallback<rpc::PinObjectIDReply> callback), const ray::rpc::ClientCallback<ray::rpc::PinObjectIDsReply> &callback),
(override)); (override));
MOCK_METHOD(void, MOCK_METHOD(void,
GetSystemConfig, GetSystemConfig,

View file

@ -896,21 +896,15 @@ Status CoreWorker::PutInLocalPlasmaStore(const RayObject &object,
if (pin_object) { if (pin_object) {
// Tell the raylet to pin the object **after** it is created. // Tell the raylet to pin the object **after** it is created.
RAY_LOG(DEBUG) << "Pinning put object " << object_id; RAY_LOG(DEBUG) << "Pinning put object " << object_id;
local_raylet_client_->PinObjectID( local_raylet_client_->PinObjectIDs(
rpc_address_, rpc_address_,
object_id, {object_id},
[this, object_id](const Status &status, const rpc::PinObjectIDReply &reply) { [this, object_id](const Status &status, const rpc::PinObjectIDsReply &reply) {
if (!status.ok()) {
RAY_LOG(INFO) << "Failed to pin existing copy of the object " << object_id
<< ". This object may get evicted while there are still "
"references to it: "
<< status;
}
// Only release the object once the raylet has responded to avoid the race // Only release the object once the raylet has responded to avoid the race
// condition that the object could be evicted before the raylet pins it. // condition that the object could be evicted before the raylet pins it.
if (auto s = plasma_store_provider_->Release(object_id); !s.ok()) { if (!plasma_store_provider_->Release(object_id).ok()) {
RAY_LOG(ERROR) << "Failed to release ObjectID (" << object_id RAY_LOG(ERROR) << "Failed to release ObjectID (" << object_id
<< "), might cause a leak in plasma: " << s; << "), might cause a leak in plasma.";
} }
}); });
} else { } else {
@ -1056,21 +1050,15 @@ Status CoreWorker::SealExisting(const ObjectID &object_id,
if (pin_object) { if (pin_object) {
// Tell the raylet to pin the object **after** it is created. // Tell the raylet to pin the object **after** it is created.
RAY_LOG(DEBUG) << "Pinning sealed object " << object_id; RAY_LOG(DEBUG) << "Pinning sealed object " << object_id;
local_raylet_client_->PinObjectID( local_raylet_client_->PinObjectIDs(
owner_address != nullptr ? *owner_address : rpc_address_, owner_address != nullptr ? *owner_address : rpc_address_,
object_id, {object_id},
[this, object_id](const Status &status, const rpc::PinObjectIDReply &reply) { [this, object_id](const Status &status, const rpc::PinObjectIDsReply &reply) {
if (!status.ok()) {
RAY_LOG(INFO) << "Failed to pin existing copy of the object " << object_id
<< ". This object may get evicted while there are still "
"references to it: "
<< status;
}
// Only release the object once the raylet has responded to avoid the race // Only release the object once the raylet has responded to avoid the race
// condition that the object could be evicted before the raylet pins it. // condition that the object could be evicted before the raylet pins it.
if (auto s = plasma_store_provider_->Release(object_id); !s.ok()) { if (!plasma_store_provider_->Release(object_id).ok()) {
RAY_LOG(ERROR) << "Failed to release ObjectID (" << object_id RAY_LOG(ERROR) << "Failed to release ObjectID (" << object_id
<< "), might cause a leak in plasma: " << s; << "), might cause a leak in plasma.";
} }
}); });
} else { } else {
@ -2450,17 +2438,16 @@ bool CoreWorker::PinExistingReturnObject(const ObjectID &return_id,
// Asynchronously ask the raylet to pin the object. Note that this can fail // Asynchronously ask the raylet to pin the object. Note that this can fail
// if the raylet fails. We expect the owner of the object to handle that // if the raylet fails. We expect the owner of the object to handle that
// case (e.g., by detecting the raylet failure and storing an error). // case (e.g., by detecting the raylet failure and storing an error).
local_raylet_client_->PinObjectID( local_raylet_client_->PinObjectIDs(
owner_address, owner_address,
return_id, {return_id},
[return_id, pinned_return_object](const Status &status, [return_id, pinned_return_object](const Status &status,
const rpc::PinObjectIDReply &reply) { const rpc::PinObjectIDsReply &reply) {
if (!status.ok()) { if (!status.ok()) {
RAY_LOG(INFO) << "Failed to pin existing copy of the task return object " RAY_LOG(INFO) << "Failed to pin existing copy of the task return object "
<< return_id << return_id
<< ". This object may get evicted while there are still " << ". This object may get evicted while there are still "
"references to it: " "references to it.";
<< status;
} }
}); });
return true; return true;

View file

@ -96,7 +96,7 @@ void ObjectRecoveryManager::PinExistingObjectCopy(
const rpc::Address &raylet_address, const rpc::Address &raylet_address,
const std::vector<rpc::Address> &other_locations) { const std::vector<rpc::Address> &other_locations) {
// If a copy still exists, pin the object by sending a // If a copy still exists, pin the object by sending a
// PinObjectID RPC. // PinObjectIDs RPC.
const auto node_id = NodeID::FromBinary(raylet_address.raylet_id()); const auto node_id = NodeID::FromBinary(raylet_address.raylet_id());
RAY_LOG(DEBUG) << "Trying to pin copy of lost object " << object_id << " at node " RAY_LOG(DEBUG) << "Trying to pin copy of lost object " << object_id << " at node "
<< node_id; << node_id;
@ -118,23 +118,23 @@ void ObjectRecoveryManager::PinExistingObjectCopy(
client = client_it->second; client = client_it->second;
} }
client->PinObjectID(rpc_address_, client->PinObjectIDs(rpc_address_,
object_id, {object_id},
[this, object_id, other_locations, node_id]( [this, object_id, other_locations, node_id](
const Status &status, const rpc::PinObjectIDReply &reply) { const Status &status, const rpc::PinObjectIDsReply &reply) {
if (status.ok()) { if (status.ok()) {
// TODO(swang): Make sure that the node is still alive when // TODO(swang): Make sure that the node is still alive when
// marking the object as pinned. // marking the object as pinned.
RAY_CHECK(in_memory_store_->Put( RAY_CHECK(in_memory_store_->Put(
RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), object_id)); RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), object_id));
reference_counter_->UpdateObjectPinnedAtRaylet(object_id, reference_counter_->UpdateObjectPinnedAtRaylet(object_id,
node_id); node_id);
} else { } else {
RAY_LOG(INFO) << "Error pinning new copy of lost object " RAY_LOG(INFO) << "Error pinning new copy of lost object "
<< object_id << ", trying again"; << object_id << ", trying again";
PinOrReconstructObject(object_id, other_locations); PinOrReconstructObject(object_id, other_locations);
} }
}); });
} }
void ObjectRecoveryManager::ReconstructObject(const ObjectID &object_id) { void ObjectRecoveryManager::ReconstructObject(const ObjectID &object_id) {

View file

@ -58,23 +58,24 @@ class MockTaskResubmitter : public TaskResubmissionInterface {
class MockRayletClient : public PinObjectsInterface { class MockRayletClient : public PinObjectsInterface {
public: public:
void PinObjectID(const rpc::Address &caller_address, void PinObjectIDs(
const ObjectID &object_id, const rpc::Address &caller_address,
rpc::ClientCallback<rpc::PinObjectIDReply> callback) override { const std::vector<ObjectID> &object_ids,
RAY_LOG(INFO) << "PinObjectID " << object_id.Hex(); const rpc::ClientCallback<rpc::PinObjectIDsReply> &callback) override {
callbacks.push_back(std::move(callback)); RAY_LOG(INFO) << "PinObjectIDs " << object_ids.size();
callbacks.push_back(callback);
} }
size_t Flush() { size_t Flush() {
size_t flushed = callbacks.size(); size_t flushed = callbacks.size();
for (const auto &callback : callbacks) { for (const auto &callback : callbacks) {
callback(Status::OK(), rpc::PinObjectIDReply()); callback(Status::OK(), rpc::PinObjectIDsReply());
} }
callbacks.clear(); callbacks.clear();
return flushed; return flushed;
} }
std::list<rpc::ClientCallback<rpc::PinObjectIDReply>> callbacks = {}; std::list<rpc::ClientCallback<rpc::PinObjectIDsReply>> callbacks = {};
}; };
class MockObjectDirectory { class MockObjectDirectory {

View file

@ -459,7 +459,7 @@ class NodeResourceInfoAccessor {
/// server. /// server.
virtual void AsyncResubscribe(); virtual void AsyncResubscribe();
/// Report resource usage of a node to GCS asynchronously. Only used in tests. /// Report resource usage of a node to GCS asynchronously.
/// ///
/// \param data_ptr The data that will be reported to GCS. /// \param data_ptr The data that will be reported to GCS.
/// \param callback Callback that will be called after report finishes. /// \param callback Callback that will be called after report finishes.

View file

@ -254,9 +254,10 @@ struct GcsServerMocker {
} }
/// PinObjectsInterface /// PinObjectsInterface
void PinObjectID(const rpc::Address &caller_address, void PinObjectIDs(
const ObjectID &object_id, const rpc::Address &caller_address,
rpc::ClientCallback<rpc::PinObjectIDReply> callback) override {} const std::vector<ObjectID> &object_ids,
const ray::rpc::ClientCallback<ray::rpc::PinObjectIDsReply> &callback) override {}
/// DependencyWaiterInterface /// DependencyWaiterInterface
ray::Status WaitForDirectActorCallArgs( ray::Status WaitForDirectActorCallArgs(

View file

@ -482,8 +482,8 @@ Status PlasmaClient::Impl::GetBuffers(
// If we get here, then the objects aren't all currently in use by this // If we get here, then the objects aren't all currently in use by this
// client, so we need to send a request to the plasma store. // client, so we need to send a request to the plasma store.
RAY_RETURN_NOT_OK( RAY_RETURN_NOT_OK(SendGetRequest(
SendGetRequest(store_conn_, object_ids, num_objects, timeout_ms, is_from_worker)); store_conn_, &object_ids[0], num_objects, timeout_ms, is_from_worker));
std::vector<uint8_t> buffer; std::vector<uint8_t> buffer;
RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaGetReply, &buffer)); RAY_RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType::PlasmaGetReply, &buffer));
std::vector<ObjectID> received_object_ids(num_objects); std::vector<ObjectID> received_object_ids(num_objects);
@ -560,12 +560,8 @@ Status PlasmaClient::Impl::Get(const std::vector<ObjectID> &object_ids,
}; };
const size_t num_objects = object_ids.size(); const size_t num_objects = object_ids.size();
*out = std::vector<ObjectBuffer>(num_objects); *out = std::vector<ObjectBuffer>(num_objects);
return GetBuffers(object_ids.data(), return GetBuffers(
num_objects, &object_ids[0], num_objects, timeout_ms, wrap_buffer, &(*out)[0], is_from_worker);
timeout_ms,
wrap_buffer,
out->data(),
is_from_worker);
} }
Status PlasmaClient::Impl::MarkObjectUnused(const ObjectID &object_id) { Status PlasmaClient::Impl::MarkObjectUnused(const ObjectID &object_id) {

View file

@ -339,7 +339,7 @@ class PullManager {
int64_t num_active_bundles_ = 0; int64_t num_active_bundles_ = 0;
/// Callback to pin plasma objects. /// Callback to pin plasma objects.
std::function<std::unique_ptr<RayObject>(const ObjectID &object_id)> pin_object_; std::function<std::unique_ptr<RayObject>(const ObjectID &object_ids)> pin_object_;
/// The last time OOM was reported. Track this so we don't spam warnings when /// The last time OOM was reported. Track this so we don't spam warnings when
/// the object store is full. /// the object store is full.

View file

@ -162,14 +162,15 @@ message CancelWorkerLeaseReply {
bool success = 1; bool success = 1;
} }
message PinObjectIDRequest { message PinObjectIDsRequest {
// Address of the owner to ask when to unpin the objects. // Address of the owner to ask when to unpin the objects.
Address owner_address = 1; Address owner_address = 1;
// ObjectIDs to pin. // ObjectIDs to pin.
repeated bytes object_ids = 2; repeated bytes object_ids = 2;
} }
message PinObjectIDReply {} message PinObjectIDsReply {
}
message GetNodeStatsRequest { message GetNodeStatsRequest {
// Whether to include memory stats. This could be large since it includes // Whether to include memory stats. This could be large since it includes
@ -353,7 +354,7 @@ service NodeManagerService {
// lease request was not yet granted. // lease request was not yet granted.
rpc CancelWorkerLease(CancelWorkerLeaseRequest) returns (CancelWorkerLeaseReply); rpc CancelWorkerLease(CancelWorkerLeaseRequest) returns (CancelWorkerLeaseReply);
// Pin the provided object IDs. // Pin the provided object IDs.
rpc PinObjectID(PinObjectIDRequest) returns (PinObjectIDReply); rpc PinObjectIDs(PinObjectIDsRequest) returns (PinObjectIDsReply);
// Get the current node stats. // Get the current node stats.
rpc GetNodeStats(GetNodeStatsRequest) returns (GetNodeStatsReply); rpc GetNodeStats(GetNodeStatsRequest) returns (GetNodeStatsReply);
// Trigger garbage collection in all workers across the cluster. // Trigger garbage collection in all workers across the cluster.

View file

@ -268,7 +268,7 @@ NodeManager::NodeManager(instrumented_io_context &io_service,
std::vector<ObjectID> object_ids = {object_id}; std::vector<ObjectID> object_ids = {object_id};
std::vector<std::unique_ptr<RayObject>> results; std::vector<std::unique_ptr<RayObject>> results;
std::unique_ptr<RayObject> result; std::unique_ptr<RayObject> result;
if (GetObjectsFromPlasma(object_ids, &results).ok() && results.size() > 0) { if (GetObjectsFromPlasma(object_ids, &results) && results.size() > 0) {
result = std::move(results[0]); result = std::move(results[0]);
} }
return result; return result;
@ -387,7 +387,7 @@ NodeManager::NodeManager(instrumented_io_context &io_service,
leased_workers_, leased_workers_,
[this](const std::vector<ObjectID> &object_ids, [this](const std::vector<ObjectID> &object_ids,
std::vector<std::unique_ptr<RayObject>> *results) { std::vector<std::unique_ptr<RayObject>> *results) {
return GetObjectsFromPlasma(object_ids, results).ok(); return GetObjectsFromPlasma(object_ids, results);
}, },
max_task_args_memory); max_task_args_memory);
cluster_task_manager_ = std::make_shared<ClusterTaskManager>( cluster_task_manager_ = std::make_shared<ClusterTaskManager>(
@ -2309,21 +2309,23 @@ std::string compact_tag_string(const opencensus::stats::ViewDescriptor &view,
return result.str(); return result.str();
} }
Status NodeManager::GetObjectsFromPlasma( bool NodeManager::GetObjectsFromPlasma(const std::vector<ObjectID> &object_ids,
const std::vector<ObjectID> &object_ids, std::vector<std::unique_ptr<RayObject>> *results) {
std::vector<std::unique_ptr<RayObject>> *results) {
// Pin the objects in plasma by getting them and holding a reference to // Pin the objects in plasma by getting them and holding a reference to
// the returned buffer. // the returned buffer.
// NOTE: the caller must ensure that the objects already exist in plasma before // NOTE: the caller must ensure that the objects already exist in plasma before
// sending a PinObjectID request. // sending a PinObjectIDs request.
std::vector<plasma::ObjectBuffer> plasma_results; std::vector<plasma::ObjectBuffer> plasma_results;
// TODO(swang): This `Get` has a timeout of 0, so the plasma store will not // TODO(swang): This `Get` has a timeout of 0, so the plasma store will not
// block when serving the request. However, if the plasma store is under // block when serving the request. However, if the plasma store is under
// heavy load, then this request can still block the NodeManager event loop // heavy load, then this request can still block the NodeManager event loop
// since we must wait for the plasma store's reply. We should consider using // since we must wait for the plasma store's reply. We should consider using
// an `AsyncGet` instead. // an `AsyncGet` instead.
RAY_RETURN_NOT_OK(store_client_.Get( if (!store_client_
object_ids, /*timeout_ms=*/0, &plasma_results, /*is_from_worker=*/false)); .Get(object_ids, /*timeout_ms=*/0, &plasma_results, /*is_from_worker=*/false)
.ok()) {
return false;
}
for (const auto &plasma_result : plasma_results) { for (const auto &plasma_result : plasma_results) {
if (plasma_result.data == nullptr) { if (plasma_result.data == nullptr) {
@ -2333,12 +2335,12 @@ Status NodeManager::GetObjectsFromPlasma(
new RayObject(plasma_result.data, plasma_result.metadata, {}))); new RayObject(plasma_result.data, plasma_result.metadata, {})));
} }
} }
return Status::OK(); return true;
} }
void NodeManager::HandlePinObjectID(const rpc::PinObjectIDRequest &request, void NodeManager::HandlePinObjectIDs(const rpc::PinObjectIDsRequest &request,
rpc::PinObjectIDReply *reply, rpc::PinObjectIDsReply *reply,
rpc::SendReplyCallback send_reply_callback) { rpc::SendReplyCallback send_reply_callback) {
std::vector<ObjectID> object_ids; std::vector<ObjectID> object_ids;
object_ids.reserve(request.object_ids_size()); object_ids.reserve(request.object_ids_size());
const auto &owner_address = request.owner_address(); const auto &owner_address = request.owner_address();
@ -2346,12 +2348,12 @@ void NodeManager::HandlePinObjectID(const rpc::PinObjectIDRequest &request,
object_ids.push_back(ObjectID::FromBinary(object_id_binary)); object_ids.push_back(ObjectID::FromBinary(object_id_binary));
} }
std::vector<std::unique_ptr<RayObject>> results; std::vector<std::unique_ptr<RayObject>> results;
if (auto s = GetObjectsFromPlasma(object_ids, &results); !s.ok()) { if (!GetObjectsFromPlasma(object_ids, &results)) {
RAY_LOG(WARNING) RAY_LOG(WARNING)
<< "Failed to get objects that should have been in the object store. These " << "Failed to get objects that should have been in the object store. These "
"objects may have been evicted while there are still references in scope: " "objects may have been evicted while there are still references in scope.";
<< s; // TODO(suquark): Maybe "Status::ObjectNotFound" is more accurate here.
send_reply_callback(s, nullptr, nullptr); send_reply_callback(Status::Invalid("Failed to get objects."), nullptr, nullptr);
return; return;
} }
// Wait for the object to be freed by the owner, which keeps the ref count. // Wait for the object to be freed by the owner, which keeps the ref count.

View file

@ -175,7 +175,7 @@ class NodeManager : public rpc::NodeManagerServiceHandler,
/// Subscribe to the relevant GCS tables and set up handlers. /// Subscribe to the relevant GCS tables and set up handlers.
/// ///
/// \return Status indicating whether this was done successfully or not. /// \return Status indicating whether this was done successfully or not.
Status RegisterGcs(); ray::Status RegisterGcs();
/// Get initial node manager configuration. /// Get initial node manager configuration.
const NodeManagerConfig &GetInitialConfig() const; const NodeManagerConfig &GetInitialConfig() const;
@ -226,7 +226,7 @@ class NodeManager : public rpc::NodeManagerServiceHandler,
/// \param include_task_info If true, it requires every task metadata information /// \param include_task_info If true, it requires every task metadata information
/// from all workers. /// from all workers.
void QueryAllWorkerStates( void QueryAllWorkerStates(
const std::function<void(const Status &status, const std::function<void(const ray::Status &status,
const rpc::GetCoreWorkerStatsReply &r)> &on_replied, const rpc::GetCoreWorkerStatsReply &r)> &on_replied,
rpc::SendReplyCallback &send_reply_callback, rpc::SendReplyCallback &send_reply_callback,
bool include_memory_info, bool include_memory_info,
@ -551,10 +551,10 @@ class NodeManager : public rpc::NodeManagerServiceHandler,
rpc::CancelWorkerLeaseReply *reply, rpc::CancelWorkerLeaseReply *reply,
rpc::SendReplyCallback send_reply_callback) override; rpc::SendReplyCallback send_reply_callback) override;
/// Handle a `PinObjectID` request. /// Handle a `PinObjectIDs` request.
void HandlePinObjectID(const rpc::PinObjectIDRequest &request, void HandlePinObjectIDs(const rpc::PinObjectIDsRequest &request,
rpc::PinObjectIDReply *reply, rpc::PinObjectIDsReply *reply,
rpc::SendReplyCallback send_reply_callback) override; rpc::SendReplyCallback send_reply_callback) override;
/// Handle a `NodeStats` request. /// Handle a `NodeStats` request.
void HandleGetNodeStats(const rpc::GetNodeStatsRequest &request, void HandleGetNodeStats(const rpc::GetNodeStatsRequest &request,
@ -632,9 +632,9 @@ class NodeManager : public rpc::NodeManagerServiceHandler,
/// \param[in] object_ids The objects to get. /// \param[in] object_ids The objects to get.
/// \param[out] results The pointers to objects stored in /// \param[out] results The pointers to objects stored in
/// plasma. /// plasma.
/// \return Status of the request. /// \return Whether the request was successful.
Status GetObjectsFromPlasma(const std::vector<ObjectID> &object_ids, bool GetObjectsFromPlasma(const std::vector<ObjectID> &object_ids,
std::vector<std::unique_ptr<RayObject>> *results); std::vector<std::unique_ptr<RayObject>> *results);
/// Populate the relevant parts of the heartbeat table. This is intended for /// Populate the relevant parts of the heartbeat table. This is intended for
/// sending raylet <-> gcs heartbeats. In particular, this should fill in /// sending raylet <-> gcs heartbeats. In particular, this should fill in

View file

@ -48,12 +48,11 @@ AddressesToFlatbuffer(flatbuffers::FlatBufferBuilder &fbb,
} // namespace } // namespace
namespace ray { namespace ray {
namespace raylet {
RayletConnection::RayletConnection(instrumented_io_context &io_service, raylet::RayletConnection::RayletConnection(instrumented_io_context &io_service,
const std::string &raylet_socket, const std::string &raylet_socket,
int num_retries, int num_retries,
int64_t timeout) { int64_t timeout) {
local_stream_socket socket(io_service); local_stream_socket socket(io_service);
Status s = ConnectSocketRetry(socket, raylet_socket, num_retries, timeout); Status s = ConnectSocketRetry(socket, raylet_socket, num_retries, timeout);
// If we could not connect to the socket, exit. // If we could not connect to the socket, exit.
@ -63,8 +62,8 @@ RayletConnection::RayletConnection(instrumented_io_context &io_service,
conn_ = ServerConnection::Create(std::move(socket)); conn_ = ServerConnection::Create(std::move(socket));
} }
Status RayletConnection::WriteMessage(MessageType type, Status raylet::RayletConnection::WriteMessage(MessageType type,
flatbuffers::FlatBufferBuilder *fbb) { flatbuffers::FlatBufferBuilder *fbb) {
std::unique_lock<std::mutex> guard(write_mutex_); std::unique_lock<std::mutex> guard(write_mutex_);
int64_t length = fbb ? fbb->GetSize() : 0; int64_t length = fbb ? fbb->GetSize() : 0;
uint8_t *bytes = fbb ? fbb->GetBufferPointer() : nullptr; uint8_t *bytes = fbb ? fbb->GetBufferPointer() : nullptr;
@ -73,10 +72,10 @@ Status RayletConnection::WriteMessage(MessageType type,
return status; return status;
} }
Status RayletConnection::AtomicRequestReply(MessageType request_type, Status raylet::RayletConnection::AtomicRequestReply(MessageType request_type,
MessageType reply_type, MessageType reply_type,
std::vector<uint8_t> *reply_message, std::vector<uint8_t> *reply_message,
flatbuffers::FlatBufferBuilder *fbb) { flatbuffers::FlatBufferBuilder *fbb) {
std::unique_lock<std::mutex> guard(mutex_); std::unique_lock<std::mutex> guard(mutex_);
RAY_RETURN_NOT_OK(WriteMessage(request_type, fbb)); RAY_RETURN_NOT_OK(WriteMessage(request_type, fbb));
auto status = conn_->ReadMessage(static_cast<int64_t>(reply_type), reply_message); auto status = conn_->ReadMessage(static_cast<int64_t>(reply_type), reply_message);
@ -84,7 +83,7 @@ Status RayletConnection::AtomicRequestReply(MessageType request_type,
return status; return status;
} }
void RayletConnection::ShutdownIfLocalRayletDisconnected(const Status &status) { void raylet::RayletConnection::ShutdownIfLocalRayletDisconnected(const Status &status) {
if (!status.ok() && IsRayletFailed(RayConfig::instance().RAYLET_PID())) { if (!status.ok() && IsRayletFailed(RayConfig::instance().RAYLET_PID())) {
RAY_LOG(WARNING) << "The connection is failed because the local raylet has been " RAY_LOG(WARNING) << "The connection is failed because the local raylet has been "
"dead. Terminate the process. Status: " "dead. Terminate the process. Status: "
@ -94,28 +93,27 @@ void RayletConnection::ShutdownIfLocalRayletDisconnected(const Status &status) {
} }
} }
RayletClient::RayletClient(std::shared_ptr<rpc::NodeManagerWorkerClient> grpc_client) raylet::RayletClient::RayletClient(
: grpc_client_(std::move(grpc_client)) { std::shared_ptr<rpc::NodeManagerWorkerClient> grpc_client)
pin_batcher_ = std::make_unique<PinBatcher>(grpc_client_); : grpc_client_(std::move(grpc_client)) {}
}
RayletClient::RayletClient(instrumented_io_context &io_service, raylet::RayletClient::RayletClient(
std::shared_ptr<ray::rpc::NodeManagerWorkerClient> grpc_client, instrumented_io_context &io_service,
const std::string &raylet_socket, std::shared_ptr<ray::rpc::NodeManagerWorkerClient> grpc_client,
const WorkerID &worker_id, const std::string &raylet_socket,
rpc::WorkerType worker_type, const WorkerID &worker_id,
const JobID &job_id, rpc::WorkerType worker_type,
const int &runtime_env_hash, const JobID &job_id,
const Language &language, const int &runtime_env_hash,
const std::string &ip_address, const Language &language,
Status *status, const std::string &ip_address,
NodeID *raylet_id, Status *status,
int *port, NodeID *raylet_id,
std::string *serialized_job_config, int *port,
StartupToken startup_token) std::string *serialized_job_config,
StartupToken startup_token)
: grpc_client_(std::move(grpc_client)), worker_id_(worker_id), job_id_(job_id) { : grpc_client_(std::move(grpc_client)), worker_id_(worker_id), job_id_(job_id) {
conn_ = std::make_unique<RayletConnection>(io_service, raylet_socket, -1, -1); conn_ = std::make_unique<raylet::RayletConnection>(io_service, raylet_socket, -1, -1);
pin_batcher_ = std::make_unique<PinBatcher>(grpc_client_);
flatbuffers::FlatBufferBuilder fbb; flatbuffers::FlatBufferBuilder fbb;
// TODO(suquark): Use `WorkerType` in `common.proto` without converting to int. // TODO(suquark): Use `WorkerType` in `common.proto` without converting to int.
@ -158,9 +156,7 @@ RayletClient::RayletClient(instrumented_io_context &io_service,
*serialized_job_config = reply_message->serialized_job_config()->str(); *serialized_job_config = reply_message->serialized_job_config()->str();
} }
RayletClient::~RayletClient() {} Status raylet::RayletClient::Disconnect(
Status RayletClient::Disconnect(
rpc::WorkerExitType exit_type, rpc::WorkerExitType exit_type,
const std::shared_ptr<LocalMemoryBuffer> &creation_task_exception_pb_bytes) { const std::shared_ptr<LocalMemoryBuffer> &creation_task_exception_pb_bytes) {
RAY_LOG(INFO) << "RayletClient::Disconnect, exit_type=" RAY_LOG(INFO) << "RayletClient::Disconnect, exit_type="
@ -194,20 +190,23 @@ Status RayletClient::Disconnect(
return Status::OK(); return Status::OK();
} }
Status RayletClient::AnnounceWorkerPort(int port) { Status raylet::RayletClient::AnnounceWorkerPort(int port) {
flatbuffers::FlatBufferBuilder fbb; flatbuffers::FlatBufferBuilder fbb;
auto message = protocol::CreateAnnounceWorkerPort(fbb, port); auto message = protocol::CreateAnnounceWorkerPort(fbb, port);
fbb.Finish(message); fbb.Finish(message);
return conn_->WriteMessage(MessageType::AnnounceWorkerPort, &fbb); return conn_->WriteMessage(MessageType::AnnounceWorkerPort, &fbb);
} }
Status RayletClient::TaskDone() { return conn_->WriteMessage(MessageType::TaskDone); } Status raylet::RayletClient::TaskDone() {
return conn_->WriteMessage(MessageType::TaskDone);
}
Status RayletClient::FetchOrReconstruct(const std::vector<ObjectID> &object_ids, Status raylet::RayletClient::FetchOrReconstruct(
const std::vector<rpc::Address> &owner_addresses, const std::vector<ObjectID> &object_ids,
bool fetch_only, const std::vector<rpc::Address> &owner_addresses,
bool mark_worker_blocked, bool fetch_only,
const TaskID &current_task_id) { bool mark_worker_blocked,
const TaskID &current_task_id) {
RAY_CHECK(object_ids.size() == owner_addresses.size()); RAY_CHECK(object_ids.size() == owner_addresses.size());
flatbuffers::FlatBufferBuilder fbb; flatbuffers::FlatBufferBuilder fbb;
auto object_ids_message = to_flatbuf(fbb, object_ids); auto object_ids_message = to_flatbuf(fbb, object_ids);
@ -222,34 +221,34 @@ Status RayletClient::FetchOrReconstruct(const std::vector<ObjectID> &object_ids,
return conn_->WriteMessage(MessageType::FetchOrReconstruct, &fbb); return conn_->WriteMessage(MessageType::FetchOrReconstruct, &fbb);
} }
Status RayletClient::NotifyUnblocked(const TaskID &current_task_id) { Status raylet::RayletClient::NotifyUnblocked(const TaskID &current_task_id) {
flatbuffers::FlatBufferBuilder fbb; flatbuffers::FlatBufferBuilder fbb;
auto message = protocol::CreateNotifyUnblocked(fbb, to_flatbuf(fbb, current_task_id)); auto message = protocol::CreateNotifyUnblocked(fbb, to_flatbuf(fbb, current_task_id));
fbb.Finish(message); fbb.Finish(message);
return conn_->WriteMessage(MessageType::NotifyUnblocked, &fbb); return conn_->WriteMessage(MessageType::NotifyUnblocked, &fbb);
} }
Status RayletClient::NotifyDirectCallTaskBlocked(bool release_resources) { Status raylet::RayletClient::NotifyDirectCallTaskBlocked(bool release_resources) {
flatbuffers::FlatBufferBuilder fbb; flatbuffers::FlatBufferBuilder fbb;
auto message = protocol::CreateNotifyDirectCallTaskBlocked(fbb, release_resources); auto message = protocol::CreateNotifyDirectCallTaskBlocked(fbb, release_resources);
fbb.Finish(message); fbb.Finish(message);
return conn_->WriteMessage(MessageType::NotifyDirectCallTaskBlocked, &fbb); return conn_->WriteMessage(MessageType::NotifyDirectCallTaskBlocked, &fbb);
} }
Status RayletClient::NotifyDirectCallTaskUnblocked() { Status raylet::RayletClient::NotifyDirectCallTaskUnblocked() {
flatbuffers::FlatBufferBuilder fbb; flatbuffers::FlatBufferBuilder fbb;
auto message = protocol::CreateNotifyDirectCallTaskUnblocked(fbb); auto message = protocol::CreateNotifyDirectCallTaskUnblocked(fbb);
fbb.Finish(message); fbb.Finish(message);
return conn_->WriteMessage(MessageType::NotifyDirectCallTaskUnblocked, &fbb); return conn_->WriteMessage(MessageType::NotifyDirectCallTaskUnblocked, &fbb);
} }
Status RayletClient::Wait(const std::vector<ObjectID> &object_ids, Status raylet::RayletClient::Wait(const std::vector<ObjectID> &object_ids,
const std::vector<rpc::Address> &owner_addresses, const std::vector<rpc::Address> &owner_addresses,
int num_returns, int num_returns,
int64_t timeout_milliseconds, int64_t timeout_milliseconds,
bool mark_worker_blocked, bool mark_worker_blocked,
const TaskID &current_task_id, const TaskID &current_task_id,
WaitResultPair *result) { WaitResultPair *result) {
// Write request. // Write request.
flatbuffers::FlatBufferBuilder fbb; flatbuffers::FlatBufferBuilder fbb;
auto message = protocol::CreateWaitRequest(fbb, auto message = protocol::CreateWaitRequest(fbb,
@ -278,7 +277,7 @@ Status RayletClient::Wait(const std::vector<ObjectID> &object_ids,
return Status::OK(); return Status::OK();
} }
Status RayletClient::WaitForDirectActorCallArgs( Status raylet::RayletClient::WaitForDirectActorCallArgs(
const std::vector<rpc::ObjectReference> &references, int64_t tag) { const std::vector<rpc::ObjectReference> &references, int64_t tag) {
flatbuffers::FlatBufferBuilder fbb; flatbuffers::FlatBufferBuilder fbb;
std::vector<ObjectID> object_ids; std::vector<ObjectID> object_ids;
@ -293,10 +292,10 @@ Status RayletClient::WaitForDirectActorCallArgs(
return conn_->WriteMessage(MessageType::WaitForDirectActorCallArgsRequest, &fbb); return conn_->WriteMessage(MessageType::WaitForDirectActorCallArgsRequest, &fbb);
} }
Status RayletClient::PushError(const JobID &job_id, Status raylet::RayletClient::PushError(const JobID &job_id,
const std::string &type, const std::string &type,
const std::string &error_message, const std::string &error_message,
double timestamp) { double timestamp) {
flatbuffers::FlatBufferBuilder fbb; flatbuffers::FlatBufferBuilder fbb;
auto message = protocol::CreatePushErrorRequest(fbb, auto message = protocol::CreatePushErrorRequest(fbb,
to_flatbuf(fbb, job_id), to_flatbuf(fbb, job_id),
@ -307,8 +306,8 @@ Status RayletClient::PushError(const JobID &job_id,
return conn_->WriteMessage(MessageType::PushErrorRequest, &fbb); return conn_->WriteMessage(MessageType::PushErrorRequest, &fbb);
} }
Status RayletClient::FreeObjects(const std::vector<ObjectID> &object_ids, Status raylet::RayletClient::FreeObjects(const std::vector<ObjectID> &object_ids,
bool local_only) { bool local_only) {
flatbuffers::FlatBufferBuilder fbb; flatbuffers::FlatBufferBuilder fbb;
auto message = auto message =
protocol::CreateFreeObjectsRequest(fbb, local_only, to_flatbuf(fbb, object_ids)); protocol::CreateFreeObjectsRequest(fbb, local_only, to_flatbuf(fbb, object_ids));
@ -316,7 +315,7 @@ Status RayletClient::FreeObjects(const std::vector<ObjectID> &object_ids,
return conn_->WriteMessage(MessageType::FreeObjectsInObjectStoreRequest, &fbb); return conn_->WriteMessage(MessageType::FreeObjectsInObjectStoreRequest, &fbb);
} }
void RayletClient::RequestWorkerLease( void raylet::RayletClient::RequestWorkerLease(
const rpc::TaskSpec &task_spec, const rpc::TaskSpec &task_spec,
bool grant_or_reject, bool grant_or_reject,
const rpc::ClientCallback<rpc::RequestWorkerLeaseReply> &callback, const rpc::ClientCallback<rpc::RequestWorkerLeaseReply> &callback,
@ -338,7 +337,7 @@ void RayletClient::RequestWorkerLease(
} }
/// Spill objects to external storage. /// Spill objects to external storage.
void RayletClient::RequestObjectSpillage( void raylet::RayletClient::RequestObjectSpillage(
const ObjectID &object_id, const ObjectID &object_id,
const rpc::ClientCallback<rpc::RequestObjectSpillageReply> &callback) { const rpc::ClientCallback<rpc::RequestObjectSpillageReply> &callback) {
rpc::RequestObjectSpillageRequest request; rpc::RequestObjectSpillageRequest request;
@ -346,11 +345,11 @@ void RayletClient::RequestObjectSpillage(
grpc_client_->RequestObjectSpillage(request, callback); grpc_client_->RequestObjectSpillage(request, callback);
} }
std::shared_ptr<grpc::Channel> RayletClient::GetChannel() const { std::shared_ptr<grpc::Channel> raylet::RayletClient::GetChannel() const {
return grpc_client_->Channel(); return grpc_client_->Channel();
} }
void RayletClient::ReportWorkerBacklog( void raylet::RayletClient::ReportWorkerBacklog(
const WorkerID &worker_id, const WorkerID &worker_id,
const std::vector<rpc::WorkerBacklogReport> &backlog_reports) { const std::vector<rpc::WorkerBacklogReport> &backlog_reports) {
rpc::ReportWorkerBacklogRequest request; rpc::ReportWorkerBacklogRequest request;
@ -364,10 +363,10 @@ void RayletClient::ReportWorkerBacklog(
}); });
} }
Status RayletClient::ReturnWorker(int worker_port, Status raylet::RayletClient::ReturnWorker(int worker_port,
const WorkerID &worker_id, const WorkerID &worker_id,
bool disconnect_worker, bool disconnect_worker,
bool worker_exiting) { bool worker_exiting) {
rpc::ReturnWorkerRequest request; rpc::ReturnWorkerRequest request;
request.set_worker_port(worker_port); request.set_worker_port(worker_port);
request.set_worker_id(worker_id.Binary()); request.set_worker_id(worker_id.Binary());
@ -382,7 +381,7 @@ Status RayletClient::ReturnWorker(int worker_port,
return Status::OK(); return Status::OK();
} }
void RayletClient::ReleaseUnusedWorkers( void raylet::RayletClient::ReleaseUnusedWorkers(
const std::vector<WorkerID> &workers_in_use, const std::vector<WorkerID> &workers_in_use,
const rpc::ClientCallback<rpc::ReleaseUnusedWorkersReply> &callback) { const rpc::ClientCallback<rpc::ReleaseUnusedWorkersReply> &callback) {
rpc::ReleaseUnusedWorkersRequest request; rpc::ReleaseUnusedWorkersRequest request;
@ -401,7 +400,7 @@ void RayletClient::ReleaseUnusedWorkers(
}); });
} }
void RayletClient::CancelWorkerLease( void raylet::RayletClient::CancelWorkerLease(
const TaskID &task_id, const TaskID &task_id,
const rpc::ClientCallback<rpc::CancelWorkerLeaseReply> &callback) { const rpc::ClientCallback<rpc::CancelWorkerLeaseReply> &callback) {
rpc::CancelWorkerLeaseRequest request; rpc::CancelWorkerLeaseRequest request;
@ -409,7 +408,7 @@ void RayletClient::CancelWorkerLease(
grpc_client_->CancelWorkerLease(request, callback); grpc_client_->CancelWorkerLease(request, callback);
} }
void RayletClient::PrepareBundleResources( void raylet::RayletClient::PrepareBundleResources(
const std::vector<std::shared_ptr<const BundleSpecification>> &bundle_specs, const std::vector<std::shared_ptr<const BundleSpecification>> &bundle_specs,
const ray::rpc::ClientCallback<ray::rpc::PrepareBundleResourcesReply> &callback) { const ray::rpc::ClientCallback<ray::rpc::PrepareBundleResourcesReply> &callback) {
rpc::PrepareBundleResourcesRequest request; rpc::PrepareBundleResourcesRequest request;
@ -423,7 +422,7 @@ void RayletClient::PrepareBundleResources(
grpc_client_->PrepareBundleResources(request, callback); grpc_client_->PrepareBundleResources(request, callback);
} }
void RayletClient::CommitBundleResources( void raylet::RayletClient::CommitBundleResources(
const std::vector<std::shared_ptr<const BundleSpecification>> &bundle_specs, const std::vector<std::shared_ptr<const BundleSpecification>> &bundle_specs,
const ray::rpc::ClientCallback<ray::rpc::CommitBundleResourcesReply> &callback) { const ray::rpc::ClientCallback<ray::rpc::CommitBundleResourcesReply> &callback) {
rpc::CommitBundleResourcesRequest request; rpc::CommitBundleResourcesRequest request;
@ -437,7 +436,7 @@ void RayletClient::CommitBundleResources(
grpc_client_->CommitBundleResources(request, callback); grpc_client_->CommitBundleResources(request, callback);
} }
void RayletClient::CancelResourceReserve( void raylet::RayletClient::CancelResourceReserve(
const BundleSpecification &bundle_spec, const BundleSpecification &bundle_spec,
const ray::rpc::ClientCallback<ray::rpc::CancelResourceReserveReply> &callback) { const ray::rpc::ClientCallback<ray::rpc::CancelResourceReserveReply> &callback) {
rpc::CancelResourceReserveRequest request; rpc::CancelResourceReserveRequest request;
@ -445,7 +444,7 @@ void RayletClient::CancelResourceReserve(
grpc_client_->CancelResourceReserve(request, callback); grpc_client_->CancelResourceReserve(request, callback);
} }
void RayletClient::ReleaseUnusedBundles( void raylet::RayletClient::ReleaseUnusedBundles(
const std::vector<rpc::Bundle> &bundles_in_use, const std::vector<rpc::Bundle> &bundles_in_use,
const rpc::ClientCallback<rpc::ReleaseUnusedBundlesReply> &callback) { const rpc::ClientCallback<rpc::ReleaseUnusedBundlesReply> &callback) {
rpc::ReleaseUnusedBundlesRequest request; rpc::ReleaseUnusedBundlesRequest request;
@ -464,13 +463,25 @@ void RayletClient::ReleaseUnusedBundles(
}); });
} }
void RayletClient::PinObjectID(const rpc::Address &caller_address, void raylet::RayletClient::PinObjectIDs(
const ObjectID &object_id, const rpc::Address &caller_address,
rpc::ClientCallback<rpc::PinObjectIDReply> callback) { const std::vector<ObjectID> &object_ids,
pin_batcher_->Add(caller_address, object_id, std::move(callback)); const rpc::ClientCallback<rpc::PinObjectIDsReply> &callback) {
rpc::PinObjectIDsRequest request;
request.mutable_owner_address()->CopyFrom(caller_address);
for (const ObjectID &object_id : object_ids) {
request.add_object_ids(object_id.Binary());
}
pins_in_flight_++;
auto rpc_callback = [this, callback = std::move(callback)](
Status status, const rpc::PinObjectIDsReply &reply) {
pins_in_flight_--;
callback(status, reply);
};
grpc_client_->PinObjectIDs(request, rpc_callback);
} }
void RayletClient::ShutdownRaylet( void raylet::RayletClient::ShutdownRaylet(
const NodeID &node_id, const NodeID &node_id,
bool graceful, bool graceful,
const rpc::ClientCallback<rpc::ShutdownRayletReply> &callback) { const rpc::ClientCallback<rpc::ShutdownRayletReply> &callback) {
@ -479,12 +490,13 @@ void RayletClient::ShutdownRaylet(
grpc_client_->ShutdownRaylet(request, callback); grpc_client_->ShutdownRaylet(request, callback);
} }
void RayletClient::GlobalGC(const rpc::ClientCallback<rpc::GlobalGCReply> &callback) { void raylet::RayletClient::GlobalGC(
const rpc::ClientCallback<rpc::GlobalGCReply> &callback) {
rpc::GlobalGCRequest request; rpc::GlobalGCRequest request;
grpc_client_->GlobalGC(request, callback); grpc_client_->GlobalGC(request, callback);
} }
void RayletClient::UpdateResourceUsage( void raylet::RayletClient::UpdateResourceUsage(
std::string &serialized_resource_usage_batch, std::string &serialized_resource_usage_batch,
const rpc::ClientCallback<rpc::UpdateResourceUsageReply> &callback) { const rpc::ClientCallback<rpc::UpdateResourceUsageReply> &callback) {
@ -493,20 +505,20 @@ void RayletClient::UpdateResourceUsage(
grpc_client_->UpdateResourceUsage(request, callback); grpc_client_->UpdateResourceUsage(request, callback);
} }
void RayletClient::RequestResourceReport( void raylet::RayletClient::RequestResourceReport(
const rpc::ClientCallback<rpc::RequestResourceReportReply> &callback) { const rpc::ClientCallback<rpc::RequestResourceReportReply> &callback) {
rpc::RequestResourceReportRequest request; rpc::RequestResourceReportRequest request;
grpc_client_->RequestResourceReport(request, callback); grpc_client_->RequestResourceReport(request, callback);
} }
void RayletClient::GetResourceLoad( void raylet::RayletClient::GetResourceLoad(
const rpc::ClientCallback<rpc::GetResourceLoadReply> &callback) { const rpc::ClientCallback<rpc::GetResourceLoadReply> &callback) {
rpc::GetResourceLoadRequest request; rpc::GetResourceLoadRequest request;
grpc_client_->GetResourceLoad(request, callback); grpc_client_->GetResourceLoad(request, callback);
} }
void RayletClient::SubscribeToPlasma(const ObjectID &object_id, void raylet::RayletClient::SubscribeToPlasma(const ObjectID &object_id,
const rpc::Address &owner_address) { const rpc::Address &owner_address) {
flatbuffers::FlatBufferBuilder fbb; flatbuffers::FlatBufferBuilder fbb;
auto message = protocol::CreateSubscribePlasmaReady( auto message = protocol::CreateSubscribePlasmaReady(
fbb, to_flatbuf(fbb, object_id), to_flatbuf(fbb, owner_address)); fbb, to_flatbuf(fbb, object_id), to_flatbuf(fbb, owner_address));
@ -515,74 +527,16 @@ void RayletClient::SubscribeToPlasma(const ObjectID &object_id,
RAY_CHECK_OK(conn_->WriteMessage(MessageType::SubscribePlasmaReady, &fbb)); RAY_CHECK_OK(conn_->WriteMessage(MessageType::SubscribePlasmaReady, &fbb));
} }
void RayletClient::GetSystemConfig( void raylet::RayletClient::GetSystemConfig(
const rpc::ClientCallback<rpc::GetSystemConfigReply> &callback) { const rpc::ClientCallback<rpc::GetSystemConfigReply> &callback) {
rpc::GetSystemConfigRequest request; rpc::GetSystemConfigRequest request;
grpc_client_->GetSystemConfig(request, callback); grpc_client_->GetSystemConfig(request, callback);
} }
void RayletClient::GetGcsServerAddress( void raylet::RayletClient::GetGcsServerAddress(
const rpc::ClientCallback<rpc::GetGcsServerAddressReply> &callback) { const rpc::ClientCallback<rpc::GetGcsServerAddressReply> &callback) {
rpc::GetGcsServerAddressRequest request; rpc::GetGcsServerAddressRequest request;
grpc_client_->GetGcsServerAddress(request, callback); grpc_client_->GetGcsServerAddress(request, callback);
} }
int64_t RayletClient::GetPinsInFlight() const { return pin_batcher_->TotalPending(); }
PinBatcher::PinBatcher(std::shared_ptr<ray::rpc::NodeManagerWorkerClient> grpc_client)
: grpc_client_(std::move(grpc_client)) {}
void PinBatcher::Add(const rpc::Address &address,
const ObjectID &object_id,
rpc::ClientCallback<rpc::PinObjectIDReply> callback) {
absl::MutexLock lock(&mu_);
total_inflight_pins_++;
RayletDestination &raylet =
raylets_.try_emplace(address.raylet_id(), address).first->second;
raylet.buffered_.emplace_back(object_id, std::move(callback));
Flush(address.raylet_id());
}
int64_t PinBatcher::TotalPending() const {
absl::MutexLock lock(&mu_);
return total_inflight_pins_;
}
bool PinBatcher::Flush(const std::string &raylet_id) {
auto &raylet = raylets_.at(raylet_id);
if (raylet.buffered_.empty() || !raylet.inflight_.empty()) {
return false;
}
raylet.inflight_ = std::move(raylet.buffered_);
raylet.buffered_.clear();
rpc::PinObjectIDRequest request;
request.mutable_owner_address()->CopyFrom(raylet.raylet_address_);
for (const auto &req : raylet.inflight_) {
request.add_object_ids(req.object_id.Binary());
}
auto rpc_callback = [this, raylet_id](Status status,
const rpc::PinObjectIDReply &reply) {
std::vector<Request> inflight;
{
absl::MutexLock lock(&mu_);
auto &raylet = raylets_.at(raylet_id);
inflight = std::move(raylet.inflight_);
raylet.inflight_.clear();
total_inflight_pins_ -= inflight.size();
if (!Flush(raylet_id)) {
// No more buffered requests, so this RayletDestination can be dropped.
raylets_.erase(raylet_id);
}
}
for (auto &req : inflight) {
req.callback(status, reply);
}
};
grpc_client_->PinObjectID(request, std::move(rpc_callback));
return true;
}
} // namespace raylet
} // namespace ray } // namespace ray

View file

@ -50,9 +50,10 @@ namespace ray {
class PinObjectsInterface { class PinObjectsInterface {
public: public:
/// Request to a raylet to pin a plasma object. The callback will be sent via gRPC. /// Request to a raylet to pin a plasma object. The callback will be sent via gRPC.
virtual void PinObjectID(const rpc::Address &caller_address, virtual void PinObjectIDs(
const ObjectID &object_id, const rpc::Address &caller_address,
rpc::ClientCallback<rpc::PinObjectIDReply> callback) = 0; const std::vector<ObjectID> &object_ids,
const ray::rpc::ClientCallback<ray::rpc::PinObjectIDsReply> &callback) = 0;
virtual ~PinObjectsInterface(){}; virtual ~PinObjectsInterface(){};
}; };
@ -232,53 +233,6 @@ class RayletConnection {
std::mutex write_mutex_; std::mutex write_mutex_;
}; };
/// Batches PinObjectIDRequest so there would be only one outstanding
/// request per Raylet. This reduces the memory and CPU overhead when a
/// large number of objects need to be pinned.
class PinBatcher {
public:
PinBatcher(std::shared_ptr<rpc::NodeManagerWorkerClient> grpc_client);
/// Adds objects to be pinned at the address.
void Add(const rpc::Address &address,
const ObjectID &object_id,
rpc::ClientCallback<rpc::PinObjectIDReply> callback);
/// Total number of objects waiting to be pinned.
int64_t TotalPending() const;
private:
// Request from a single Add() call.
struct Request {
Request(ObjectID oid, rpc::ClientCallback<rpc::PinObjectIDReply> cb)
: object_id(oid), callback(std::move(cb)) {}
ObjectID object_id;
rpc::ClientCallback<rpc::PinObjectIDReply> callback;
};
// Collects buffered pin object requests intended for a raylet.
struct RayletDestination {
RayletDestination(const rpc::Address &address) : raylet_address_(address) {}
const rpc::Address raylet_address_;
std::vector<Request> inflight_;
std::vector<Request> buffered_;
};
/// Tries sending out a batched pin request with buffered object IDs.
///
/// \return true if a request is sent out, false otherwise, e.g. when
/// there is already an inflight request, or there is no buffered Object IDs.
bool Flush(const std::string &raylet_id) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
const std::shared_ptr<rpc::NodeManagerWorkerClient> grpc_client_;
mutable absl::Mutex mu_;
// Maps Raylet ID to the address and buffered messages for the Raylet.
absl::flat_hash_map<std::string, RayletDestination> raylets_ ABSL_GUARDED_BY(mu_);
int64_t total_inflight_pins_ ABSL_GUARDED_BY(mu_) = 0;
};
class RayletClient : public RayletClientInterface { class RayletClient : public RayletClientInterface {
public: public:
/// Connect to the raylet. /// Connect to the raylet.
@ -304,7 +258,7 @@ class RayletClient : public RayletClientInterface {
/// \param startup_token The startup token of the process assigned to /// \param startup_token The startup token of the process assigned to
/// it during startup as a command line argument. /// it during startup as a command line argument.
RayletClient(instrumented_io_context &io_service, RayletClient(instrumented_io_context &io_service,
std::shared_ptr<rpc::NodeManagerWorkerClient> grpc_client, std::shared_ptr<ray::rpc::NodeManagerWorkerClient> grpc_client,
const std::string &raylet_socket, const std::string &raylet_socket,
const WorkerID &worker_id, const WorkerID &worker_id,
rpc::WorkerType worker_type, rpc::WorkerType worker_type,
@ -321,9 +275,7 @@ class RayletClient : public RayletClientInterface {
/// Connect to the raylet via grpc only. /// Connect to the raylet via grpc only.
/// ///
/// \param grpc_client gRPC client to the raylet. /// \param grpc_client gRPC client to the raylet.
RayletClient(std::shared_ptr<rpc::NodeManagerWorkerClient> grpc_client); RayletClient(std::shared_ptr<ray::rpc::NodeManagerWorkerClient> grpc_client);
~RayletClient() override;
/// Notify the raylet that this client is disconnecting gracefully. This /// Notify the raylet that this client is disconnecting gracefully. This
/// is used by actors to exit gracefully so that the raylet doesn't /// is used by actors to exit gracefully so that the raylet doesn't
@ -488,9 +440,10 @@ class RayletClient : public RayletClientInterface {
const std::vector<rpc::Bundle> &bundles_in_use, const std::vector<rpc::Bundle> &bundles_in_use,
const rpc::ClientCallback<rpc::ReleaseUnusedBundlesReply> &callback) override; const rpc::ClientCallback<rpc::ReleaseUnusedBundlesReply> &callback) override;
void PinObjectID(const rpc::Address &caller_address, void PinObjectIDs(
const ObjectID &object_id, const rpc::Address &caller_address,
rpc::ClientCallback<rpc::PinObjectIDReply> callback) override; const std::vector<ObjectID> &object_ids,
const ray::rpc::ClientCallback<ray::rpc::PinObjectIDsReply> &callback) override;
void ShutdownRaylet( void ShutdownRaylet(
const NodeID &node_id, const NodeID &node_id,
@ -524,12 +477,12 @@ class RayletClient : public RayletClientInterface {
const ResourceMappingType &GetResourceIDs() const { return resource_ids_; } const ResourceMappingType &GetResourceIDs() const { return resource_ids_; }
int64_t GetPinsInFlight() const; int64_t GetPinsInFlight() const { return pins_in_flight_.load(); }
private: private:
/// gRPC client to the raylet. Right now, this is only used for a couple /// gRPC client to the raylet. Right now, this is only used for a couple
/// request types. /// request types.
std::shared_ptr<rpc::NodeManagerWorkerClient> grpc_client_; std::shared_ptr<ray::rpc::NodeManagerWorkerClient> grpc_client_;
const WorkerID worker_id_; const WorkerID worker_id_;
const JobID job_id_; const JobID job_id_;
@ -539,9 +492,12 @@ class RayletClient : public RayletClientInterface {
ResourceMappingType resource_ids_; ResourceMappingType resource_ids_;
/// The connection to the raylet server. /// The connection to the raylet server.
std::unique_ptr<RayletConnection> conn_; std::unique_ptr<RayletConnection> conn_;
/// Batches pin object ID requests to the same raylet. All PinObjectID requests
/// should go through this. /// The number of object ID pin RPCs currently in flight.
std::unique_ptr<PinBatcher> pin_batcher_; std::atomic<int64_t> pins_in_flight_{0};
protected:
RayletClient() {}
}; };
} // namespace raylet } // namespace raylet

View file

@ -153,7 +153,7 @@ class NodeManagerWorkerClient
/// Notify the raylet to pin the provided object IDs. /// Notify the raylet to pin the provided object IDs.
VOID_RPC_CLIENT_METHOD(NodeManagerService, VOID_RPC_CLIENT_METHOD(NodeManagerService,
PinObjectID, PinObjectIDs,
grpc_client_, grpc_client_,
/*method_timeout_ms*/ -1, ) /*method_timeout_ms*/ -1, )

View file

@ -33,7 +33,7 @@ namespace rpc {
RPC_SERVICE_HANDLER(NodeManagerService, ReturnWorker, -1) \ RPC_SERVICE_HANDLER(NodeManagerService, ReturnWorker, -1) \
RPC_SERVICE_HANDLER(NodeManagerService, ReleaseUnusedWorkers, -1) \ RPC_SERVICE_HANDLER(NodeManagerService, ReleaseUnusedWorkers, -1) \
RPC_SERVICE_HANDLER(NodeManagerService, CancelWorkerLease, -1) \ RPC_SERVICE_HANDLER(NodeManagerService, CancelWorkerLease, -1) \
RPC_SERVICE_HANDLER(NodeManagerService, PinObjectID, -1) \ RPC_SERVICE_HANDLER(NodeManagerService, PinObjectIDs, -1) \
RPC_SERVICE_HANDLER(NodeManagerService, GetNodeStats, -1) \ RPC_SERVICE_HANDLER(NodeManagerService, GetNodeStats, -1) \
RPC_SERVICE_HANDLER(NodeManagerService, GlobalGC, -1) \ RPC_SERVICE_HANDLER(NodeManagerService, GlobalGC, -1) \
RPC_SERVICE_HANDLER(NodeManagerService, FormatGlobalMemoryInfo, -1) \ RPC_SERVICE_HANDLER(NodeManagerService, FormatGlobalMemoryInfo, -1) \
@ -114,9 +114,9 @@ class NodeManagerServiceHandler {
rpc::CancelResourceReserveReply *reply, rpc::CancelResourceReserveReply *reply,
rpc::SendReplyCallback send_reply_callback) = 0; rpc::SendReplyCallback send_reply_callback) = 0;
virtual void HandlePinObjectID(const PinObjectIDRequest &request, virtual void HandlePinObjectIDs(const PinObjectIDsRequest &request,
PinObjectIDReply *reply, PinObjectIDsReply *reply,
SendReplyCallback send_reply_callback) = 0; SendReplyCallback send_reply_callback) = 0;
virtual void HandleGetNodeStats(const GetNodeStatsRequest &request, virtual void HandleGetNodeStats(const GetNodeStatsRequest &request,
GetNodeStatsReply *reply, GetNodeStatsReply *reply,