[Object Spilling] Fix a bug where object url is empty. (#18193)

* Fix a bug

* Addressed code review.

* Fix a test
This commit is contained in:
SangBin Cho 2021-08-31 10:10:28 -07:00 committed by GitHub
parent 2c0dcec18f
commit d240d26525
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 116 additions and 9 deletions

View file

@ -460,16 +460,18 @@ void PullManager::TryToMakeObjectLocal(const ObjectID &object_id) {
return; return;
} }
// If we can restore directly from this raylet, then try to do so. // check if we can restore the object directly in the current raylet.
std::string spilled_url = get_locally_spilled_object_url_(object_id); // first check local spilled objects
bool can_restore_directly = std::string direct_restore_url = get_locally_spilled_object_url_(object_id);
!spilled_url.empty() || // If the object is spilled locally if (direct_restore_url.empty()) {
(!request.spilled_url.empty() && if (!request.spilled_url.empty() && request.spilled_node_id.IsNil()) {
request.spilled_node_id direct_restore_url = request.spilled_url;
.IsNil()); // Or if the object is spilled on external storages. }
if (can_restore_directly) { }
if (!direct_restore_url.empty()) {
// Select an url from the object directory update
UpdateRetryTimer(request, object_id); UpdateRetryTimer(request, object_id);
restore_spilled_object_(object_id, request.spilled_url, restore_spilled_object_(object_id, direct_restore_url,
[object_id](const ray::Status &status) { [object_id](const ray::Status &status) {
if (!status.ok()) { if (!status.ok()) {
RAY_LOG(ERROR) << "Object restore for " << object_id RAY_LOG(ERROR) << "Object restore for " << object_id

View file

@ -289,6 +289,111 @@ TEST_P(PullManagerTest, TestRestoreSpilledObjectLocal) {
AssertNoLeaks(); AssertNoLeaks();
} }
TEST_P(PullManagerTest, TestRestoreSpilledObjectOnLocalStorage) {
  /// Test the scenario where the object is spilled to local storage, such as a
  /// filesystem. The restore must be issued using the locally known spilled URL even
  /// while the object directory still reports an empty URL for the object.
  auto prio = BundlePriority::TASK_ARGS;
  if (GetParam()) {
    prio = BundlePriority::GET_REQUEST;
  }
  auto refs = CreateObjectRefs(1);
  auto obj1 = ObjectRefsToIds(refs)[0];
  AssertNumActiveRequestsEquals(0);

  std::vector<rpc::ObjectReference> objects_to_locate;
  auto req_id = pull_manager_.Pull(refs, prio, &objects_to_locate);
  ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs));

  std::unordered_set<NodeID> client_ids;
  pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0);
  // client_ids is empty and nothing has been spilled yet, so there is nowhere to pull
  // or restore from.
  ASSERT_EQ(num_send_pull_request_calls_, 0);
  ASSERT_EQ(num_restore_spilled_object_calls_, 0);

  fake_time_ += 10.;
  // The object is spilled locally, but the remote object directory doesn't have the
  // URL yet (the location update below still carries an empty URL). The object should
  // still be restored.
  ObjectSpilled(obj1, "remote_url/foo/bar");
  pull_manager_.OnLocationChange(obj1, client_ids, "", self_node_id_, 0);
  // We request a local restore and never a remote pull.
  ASSERT_EQ(num_send_pull_request_calls_, 0);
  ASSERT_EQ(num_restore_spilled_object_calls_, 1);

  // The call can be retried after a delay, and the URL in the remote object directory
  // is updated now.
  fake_time_ += 10.;
  pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", self_node_id_,
                                 0);
  ASSERT_EQ(num_send_pull_request_calls_, 0);
  ASSERT_EQ(num_restore_spilled_object_calls_, 2);

  // Nothing has been aborted so far and the request is still being tracked.
  ASSERT_TRUE(num_abort_calls_.empty());
  ASSERT_TRUE(pull_manager_.PullRequestActiveOrWaitingForMetadata(req_id));

  // Cancelling the request hands back the object and aborts it exactly once.
  auto objects_to_cancel = pull_manager_.CancelPull(req_id);
  ASSERT_EQ(objects_to_cancel, ObjectRefsToIds(refs));
  ASSERT_EQ(num_abort_calls_[obj1], 1);
  AssertNoLeaks();
}
TEST_P(PullManagerTest, TestRestoreSpilledObjectOnExternalStorage) {
  /// Test the scenario where the object is spilled to external storage, such as S3.
  /// In this case there is no locally spilled URL, so a direct restore is only expected
  /// once the location update carries a non-empty URL together with a Nil node id.
  auto prio = BundlePriority::TASK_ARGS;
  if (GetParam()) {
    prio = BundlePriority::GET_REQUEST;
  }
  auto refs = CreateObjectRefs(1);
  auto obj1 = ObjectRefsToIds(refs)[0];
  AssertNumActiveRequestsEquals(0);

  std::vector<rpc::ObjectReference> objects_to_locate;
  auto req_id = pull_manager_.Pull(refs, prio, &objects_to_locate);
  ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs));

  std::unordered_set<NodeID> client_ids;
  pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0);
  // client_ids is empty and no spilled URL is known, so there is nowhere to pull or
  // restore from.
  ASSERT_EQ(num_send_pull_request_calls_, 0);
  ASSERT_EQ(num_restore_spilled_object_calls_, 0);

  fake_time_ += 10.;
  // When an object is spilled to external storage, the locally recorded spilled URL is
  // empty.
  ObjectSpilled(obj1, "");
  // For objects spilled to external storage, the spilled node id should be Nil().
  // Here it is still set to this node, so this shouldn't invoke restoration.
  pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", self_node_id_,
                                 0);
  // No restore (and no pull) is requested yet.
  ASSERT_EQ(num_send_pull_request_calls_, 0);
  ASSERT_EQ(num_restore_spilled_object_calls_, 0);

  // Now the Nil node id is properly reported alongside the URL.
  pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", NodeID::Nil(),
                                 0);
  // We request a direct restore from this node.
  ASSERT_EQ(num_send_pull_request_calls_, 0);
  ASSERT_EQ(num_restore_spilled_object_calls_, 1);

  // The call can be retried after a delay.
  fake_time_ += 10.;
  pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", NodeID::Nil(),
                                 0);
  ASSERT_EQ(num_send_pull_request_calls_, 0);
  ASSERT_EQ(num_restore_spilled_object_calls_, 2);

  // Nothing has been aborted so far and the request is still being tracked.
  ASSERT_TRUE(num_abort_calls_.empty());
  ASSERT_TRUE(pull_manager_.PullRequestActiveOrWaitingForMetadata(req_id));

  // Cancelling the request hands back the object and aborts it exactly once.
  auto objects_to_cancel = pull_manager_.CancelPull(req_id);
  ASSERT_EQ(objects_to_cancel, ObjectRefsToIds(refs));
  ASSERT_EQ(num_abort_calls_[obj1], 1);
  AssertNoLeaks();
}
TEST_P(PullManagerTest, TestLoadBalancingRestorationRequest) { TEST_P(PullManagerTest, TestLoadBalancingRestorationRequest) {
/* Make sure when the object copy is in other raylet, we pull object from there instead /* Make sure when the object copy is in other raylet, we pull object from there instead
* of requesting the owner node to restore the object. */ * of requesting the owner node to restore the object. */