mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
Work stealing! (#15475)
* work_stealing one commit squash
* using random task id to request workers
* inlining methods in direct_task_transport.h
* faster checking for presence of stealable tasks in RequestNewWorkerIfNeeded
* linting
* fixup! using random task id to request workers
* estimating number of tasks to steal based only on tasks in flight
* linting
* fixup! linting
* backup of changes
* fixed issue in scheduling queue test after merge
* linting
* redesigned work stealing. compiles but not tested
* all tests passing locally
* fixup! all tests passing locally
* fixup! fixup! all tests passing locally
* fixed big bug in StealTasksIfNeeded
* rev1
* rev2 (before removing the work_stealing param)
* removed work_stealing flag, fixed existing unit tests
* added unit tests; need to figure out how to assign distinct worker ids in GrantWorkerLease
* fixed work stealing test
* revisions, added two more unit/regression tests
* test
This commit is contained in:
parent 9249287a36
commit 3e2f608145
14 changed files with 1246 additions and 259 deletions
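The squashed commits above sketch the feature: when a leased worker drains its own pipeline of submitted tasks, the submitter asks a still-busy worker (the victim) to give up part of its queued work, and the freed tasks are re-pushed to the idle worker (the thief). The new unit tests further down exercise a "steal half" policy, where the thief receives floor(n/2) of the victim's n in-flight tasks. The following standalone sketch only illustrates that policy; the Worker and StealHalf names are invented for the example and are not the PR's API.

// Toy model of the "steal half of the victim's queue" policy exercised by the
// tests below (e.g. 10 queued tasks -> 5 stolen, then 5 -> 2). Illustrative only.
#include <cstdint>
#include <deque>
#include <iostream>

struct Worker {
  std::deque<int64_t> queued;  // tasks pushed to this worker but not yet run
};

// Move floor(victim.queued.size() / 2) tasks from the victim to the thief and
// return how many were stolen.
size_t StealHalf(Worker &thief, Worker &victim) {
  size_t to_steal = victim.queued.size() / 2;
  for (size_t i = 0; i < to_steal; i++) {
    thief.queued.push_back(victim.queued.back());
    victim.queued.pop_back();
  }
  return to_steal;
}

int main() {
  Worker thief, victim;
  for (int64_t t = 0; t < 10; t++) victim.queued.push_back(t);

  std::cout << StealHalf(thief, victim) << " tasks stolen\n";  // 5
  std::cout << StealHalf(thief, victim) << " tasks stolen\n";  // 2 (floor(5/2))
  return 0;
}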
@@ -296,6 +296,7 @@ def test_ray_options(shutdown_only):
to_check = ["CPU", "GPU", "memory", "custom1"]
for key in to_check:
print(key, without_options[key], with_options[key])
assert without_options[key] != with_options[key], key
assert without_options != with_options
@@ -629,6 +629,7 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_
reference_counter_, node_addr_factory, rpc_address_))
: std::shared_ptr<LeasePolicyInterface>(
std::make_shared<LocalLeasePolicy>(rpc_address_));

direct_task_submitter_ = std::make_unique<CoreWorkerDirectTaskSubmitter>(
rpc_address_, local_raylet_client_, core_worker_client_pool_, raylet_client_factory,
std::move(lease_policy), memory_store_, task_manager_, local_raylet_id,

@@ -643,6 +644,7 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_
future_resolver_.reset(new FutureResolver(memory_store_,
std::move(report_locality_data_callback),
core_worker_client_pool_, rpc_address_));

// Unfortunately the raylet client has to be constructed after the receivers.
if (direct_task_receiver_ != nullptr) {
task_argument_waiter_.reset(new DependencyWaiterImpl(*local_raylet_client_));

@@ -2357,6 +2359,13 @@ void CoreWorker::HandlePushTask(const rpc::PushTaskRequest &request,
}
}

void CoreWorker::HandleStealTasks(const rpc::StealTasksRequest &request,
rpc::StealTasksReply *reply,
rpc::SendReplyCallback send_reply_callback) {
RAY_LOG(DEBUG) << "Entering CoreWorker::HandleStealWork!";
direct_task_receiver_->HandleStealTasks(request, reply, send_reply_callback);
}

void CoreWorker::HandleDirectActorCallArgWaitComplete(
const rpc::DirectActorCallArgWaitCompleteRequest &request,
rpc::DirectActorCallArgWaitCompleteReply *reply,
@@ -896,6 +896,11 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler {
void HandlePushTask(const rpc::PushTaskRequest &request, rpc::PushTaskReply *reply,
rpc::SendReplyCallback send_reply_callback) override;

/// Implements gRPC server handler.
void HandleStealTasks(const rpc::StealTasksRequest &request,
rpc::StealTasksReply *reply,
rpc::SendReplyCallback send_reply_callback) override;

/// Implements gRPC server handler.
void HandleDirectActorCallArgWaitComplete(
const rpc::DirectActorCallArgWaitCompleteRequest &request,
@@ -45,6 +45,8 @@ class TaskFinisherInterface {
const TaskID &task_id, const TaskSpecification &spec, rpc::ErrorType error_type,
const std::shared_ptr<rpc::RayException> &creation_task_exception = nullptr) = 0;

virtual absl::optional<TaskSpecification> GetTaskSpec(const TaskID &task_id) const = 0;

virtual ~TaskFinisherInterface() {}
};

@@ -158,7 +160,7 @@ class TaskManager : public TaskFinisherInterface, public TaskResubmissionInterfa
bool MarkTaskCanceled(const TaskID &task_id) override;

/// Return the spec for a pending task.
absl::optional<TaskSpecification> GetTaskSpec(const TaskID &task_id) const;
absl::optional<TaskSpecification> GetTaskSpec(const TaskID &task_id) const override;

/// Return specs for pending children tasks of the given parent task.
std::vector<TaskID> GetPendingChildrenTasks(const TaskID &parent_task_id) const;
@@ -97,6 +97,9 @@ class MockTaskFinisher : public TaskFinisherInterface {
MOCK_METHOD1(MarkTaskCanceled, bool(const TaskID &task_id));

MOCK_CONST_METHOD1(GetTaskSpec,
absl::optional<TaskSpecification>(const TaskID &task_id));

MOCK_METHOD4(MarkPendingTaskFailed,
void(const TaskID &task_id, const TaskSpecification &spec,
rpc::ErrorType error_type,
@@ -28,6 +28,10 @@ namespace ray {
// be better to use a mock clock or lease manager interface, but that's high
// overhead for the very simple timeout logic we currently have.
int64_t kLongTimeout = 1024 * 1024 * 1024;
TaskSpecification BuildTaskSpec(const std::unordered_map<std::string, double> &resources,
const ray::FunctionDescriptor &function_descriptor);
// Calls BuildTaskSpec with empty resources map and empty function descriptor
TaskSpecification BuildEmptyTaskSpec();

class MockWorkerClient : public rpc::CoreWorkerClientInterface {
public:

@@ -36,7 +40,31 @@ class MockWorkerClient : public rpc::CoreWorkerClientInterface {
callbacks.push_back(callback);
}

bool ReplyPushTask(Status status = Status::OK(), bool exit = false) {
void StealTasks(std::unique_ptr<rpc::StealTasksRequest> request,
const rpc::ClientCallback<rpc::StealTasksReply> &callback) override {
steal_callbacks.push_back(callback);
}

bool ReplyStealTasks(
Status status = Status::OK(),
std::vector<TaskSpecification> tasks_stolen = std::vector<TaskSpecification>()) {
if (steal_callbacks.size() == 0) {
return false;
}
auto callback = steal_callbacks.front();
auto reply = rpc::StealTasksReply();

for (auto task_spec : tasks_stolen) {
reply.add_stolen_tasks_ids(task_spec.TaskId().Binary());
}

callback(status, reply);
steal_callbacks.pop_front();
return true;
}

bool ReplyPushTask(Status status = Status::OK(), bool exit = false,
bool stolen = false) {
if (callbacks.size() == 0) {
return false;
}

@@ -45,6 +73,9 @@ class MockWorkerClient : public rpc::CoreWorkerClientInterface {
if (exit) {
reply.set_worker_exiting(true);
}
if (stolen) {
reply.set_task_stolen(true);
}
callback(status, reply);
callbacks.pop_front();
return true;

@@ -56,6 +87,7 @@ class MockWorkerClient : public rpc::CoreWorkerClientInterface {
}

std::list<rpc::ClientCallback<rpc::PushTaskReply>> callbacks;
std::list<rpc::ClientCallback<rpc::StealTasksReply>> steal_callbacks;
std::list<rpc::CancelTaskRequest> kill_requests;
};
@@ -89,6 +121,11 @@ class MockTaskFinisher : public TaskFinisherInterface {
bool MarkTaskCanceled(const TaskID &task_id) override { return true; }

absl::optional<TaskSpecification> GetTaskSpec(const TaskID &task_id) const override {
TaskSpecification task = BuildEmptyTaskSpec();
return task;
}

int num_tasks_complete = 0;
int num_tasks_failed = 0;
int num_inlined_dependencies = 0;

@@ -128,7 +165,8 @@ class MockRayletClient : public WorkerLeaseInterface {
// Trigger reply to RequestWorkerLease.
bool GrantWorkerLease(const std::string &address, int port,
const NodeID &retry_at_raylet_id, bool cancel = false) {
const NodeID &retry_at_raylet_id, bool cancel = false,
std::string worker_id = std::string()) {
rpc::RequestWorkerLeaseReply reply;
if (cancel) {
reply.set_canceled(true);

@@ -140,6 +178,11 @@ class MockRayletClient : public WorkerLeaseInterface {
reply.mutable_worker_address()->set_ip_address(address);
reply.mutable_worker_address()->set_port(port);
reply.mutable_worker_address()->set_raylet_id(retry_at_raylet_id.Binary());
// Set the worker ID if the worker_id string is a valid, non-empty argument. A
// worker ID can only be set using a 28-character string.
if (worker_id.length() == 28) {
reply.mutable_worker_address()->set_worker_id(worker_id);
}
}
if (callbacks.size() == 0) {
return false;
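The extended GrantWorkerLease above only copies worker_id into the reply when the string is exactly 28 characters long, presumably because a serialized WorkerID is a fixed-width 28-byte value and anything shorter could not be parsed back into a valid ID. The new tests below therefore hand-build 28-character labels such as "worker1_ID_abcdefghijklmnopq" so that two leased workers are distinguishable. A small illustrative helper (PadToWorkerIdLength is not part of the PR) that produces such a test ID:

// Builds a 28-character test ID like the ones used in the new tests.
#include <cassert>
#include <iostream>
#include <string>

constexpr size_t kWorkerIdLength = 28;  // length accepted by GrantWorkerLease above

std::string PadToWorkerIdLength(std::string prefix) {
  char filler = 'a';
  while (prefix.size() < kWorkerIdLength) prefix.push_back(filler++);
  return prefix.substr(0, kWorkerIdLength);
}

int main() {
  std::string id = PadToWorkerIdLength("worker1_ID_");
  assert(id.size() == kWorkerIdLength);
  std::cout << id << "\n";  // prints worker1_ID_abcdefghijklmnopq
  return 0;
}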
@ -358,6 +401,13 @@ TaskSpecification BuildTaskSpec(const std::unordered_map<std::string, double> &r
|
|||
return builder.Build();
|
||||
}
|
||||
|
||||
TaskSpecification BuildEmptyTaskSpec() {
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
return BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
}
|
||||
|
||||
TEST(DirectTaskTransportTest, TestSubmitOneTask) {
|
||||
rpc::Address address;
|
||||
auto raylet_client = std::make_shared<MockRayletClient>();
|
||||
|
@ -372,10 +422,7 @@ TEST(DirectTaskTransportTest, TestSubmitOneTask) {
|
|||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
ASSERT_EQ(lease_policy->num_lease_policy_consults, 1);
|
||||
|
@ -414,10 +461,7 @@ TEST(DirectTaskTransportTest, TestHandleTaskFailure) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(address, raylet_client, client_pool, nullptr,
|
||||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil()));
|
||||
|
@ -449,12 +493,10 @@ TEST(DirectTaskTransportTest, TestConcurrentWorkerLeases) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(address, raylet_client, client_pool, nullptr,
|
||||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task1 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task2 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task3 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
|
||||
TaskSpecification task1 = BuildEmptyTaskSpec();
|
||||
TaskSpecification task2 = BuildEmptyTaskSpec();
|
||||
TaskSpecification task3 = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task1).ok());
|
||||
ASSERT_TRUE(submitter.SubmitTask(task2).ok());
|
||||
|
@ -509,12 +551,10 @@ TEST(DirectTaskTransportTest, TestReuseWorkerLease) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(address, raylet_client, client_pool, nullptr,
|
||||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task1 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task2 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task3 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
|
||||
TaskSpecification task1 = BuildEmptyTaskSpec();
|
||||
TaskSpecification task2 = BuildEmptyTaskSpec();
|
||||
TaskSpecification task3 = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task1).ok());
|
||||
ASSERT_TRUE(submitter.SubmitTask(task2).ok());
|
||||
|
@ -574,12 +614,9 @@ TEST(DirectTaskTransportTest, TestRetryLeaseCancellation) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(address, raylet_client, client_pool, nullptr,
|
||||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task1 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task2 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task3 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task1 = BuildEmptyTaskSpec();
|
||||
TaskSpecification task2 = BuildEmptyTaskSpec();
|
||||
TaskSpecification task3 = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task1).ok());
|
||||
ASSERT_TRUE(submitter.SubmitTask(task2).ok());
|
||||
|
@ -635,12 +672,9 @@ TEST(DirectTaskTransportTest, TestConcurrentCancellationAndSubmission) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(address, raylet_client, client_pool, nullptr,
|
||||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task1 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task2 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task3 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task1 = BuildEmptyTaskSpec();
|
||||
TaskSpecification task2 = BuildEmptyTaskSpec();
|
||||
TaskSpecification task3 = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task1).ok());
|
||||
ASSERT_TRUE(submitter.SubmitTask(task2).ok());
|
||||
|
@ -693,11 +727,8 @@ TEST(DirectTaskTransportTest, TestWorkerNotReusedOnError) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(address, raylet_client, client_pool, nullptr,
|
||||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task1 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task2 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task1 = BuildEmptyTaskSpec();
|
||||
TaskSpecification task2 = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task1).ok());
|
||||
ASSERT_TRUE(submitter.SubmitTask(task2).ok());
|
||||
|
@ -742,10 +773,7 @@ TEST(DirectTaskTransportTest, TestWorkerNotReturnedOnExit) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(address, raylet_client, client_pool, nullptr,
|
||||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task1 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task1 = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task1).ok());
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 1);
|
||||
|
@ -790,10 +818,7 @@ TEST(DirectTaskTransportTest, TestSpillback) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(
|
||||
address, raylet_client, client_pool, lease_client_factory, lease_policy, store,
|
||||
task_finisher, NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
ASSERT_EQ(lease_policy->num_lease_policy_consults, 1);
|
||||
|
@ -857,10 +882,7 @@ TEST(DirectTaskTransportTest, TestSpillbackRoundTrip) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(
|
||||
address, raylet_client, client_pool, lease_client_factory, lease_policy, store,
|
||||
task_finisher, local_raylet_id, kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 1);
|
||||
|
@ -1049,12 +1071,9 @@ TEST(DirectTaskTransportTest, TestWorkerLeaseTimeout) {
|
|||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(),
|
||||
/*lease_timeout_ms=*/5, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task1 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task2 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task3 = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task1 = BuildEmptyTaskSpec();
|
||||
TaskSpecification task2 = BuildEmptyTaskSpec();
|
||||
TaskSpecification task3 = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task1).ok());
|
||||
ASSERT_TRUE(submitter.SubmitTask(task2).ok());
|
||||
|
@ -1109,10 +1128,7 @@ TEST(DirectTaskTransportTest, TestKillExecutingTask) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(address, raylet_client, client_pool, nullptr,
|
||||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1234, NodeID::Nil()));
|
||||
|
@ -1162,10 +1178,7 @@ TEST(DirectTaskTransportTest, TestKillPendingTask) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(address, raylet_client, client_pool, nullptr,
|
||||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task = BuildEmptyTaskSpec();
|
||||
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
ASSERT_TRUE(submitter.CancelTask(task, true, false).ok());
|
||||
|
@ -1199,10 +1212,7 @@ TEST(DirectTaskTransportTest, TestKillResolvingTask) {
|
|||
CoreWorkerDirectTaskSubmitter submitter(address, raylet_client, client_pool, nullptr,
|
||||
lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator);
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
TaskSpecification task = BuildTaskSpec(empty_resources, empty_descriptor);
|
||||
TaskSpecification task = BuildEmptyTaskSpec();
|
||||
ObjectID obj1 = ObjectID::FromRandom();
|
||||
task.GetMutableMessage().add_args()->mutable_object_ref()->set_object_id(obj1.Binary());
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
|
@ -1242,12 +1252,9 @@ TEST(DirectTaskTransportTest, TestPipeliningConcurrentWorkerLeases) {
|
|||
NodeID::Nil(), kLongTimeout, actor_creator, max_tasks_in_flight_per_worker);
|
||||
|
||||
// Prepare 20 tasks and save them in a vector.
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
std::vector<TaskSpecification> tasks;
|
||||
for (int i = 1; i <= 20; i++) {
|
||||
tasks.push_back(BuildTaskSpec(empty_resources, empty_descriptor));
|
||||
tasks.push_back(BuildEmptyTaskSpec());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 20);
|
||||
|
||||
|
@ -1262,10 +1269,11 @@ TEST(DirectTaskTransportTest, TestPipeliningConcurrentWorkerLeases) {
|
|||
ASSERT_EQ(worker_client->callbacks.size(), 10);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
|
||||
// Last 10 tasks are pushed; no more workers are requested.
|
||||
// Last 10 tasks are pushed; one more worker is requested due to the Eager Worker
|
||||
// Requesting Mode.
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil()));
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 20);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
|
||||
for (int i = 1; i <= 20; i++) {
|
||||
ASSERT_FALSE(worker_client->callbacks.empty());
|
||||
|
@ -1283,14 +1291,15 @@ TEST(DirectTaskTransportTest, TestPipeliningConcurrentWorkerLeases) {
|
|||
}
|
||||
}
|
||||
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 20);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
|
||||
ASSERT_FALSE(raylet_client->ReplyCancelWorkerLease());
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 1);
|
||||
ASSERT_TRUE(raylet_client->ReplyCancelWorkerLease());
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("nil", 0, NodeID::Nil(), /*cancel=*/true));
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 1);
|
||||
|
||||
// Check that there are no entries left in the scheduling_key_entries_ hashmap. These
|
||||
// would otherwise cause a memory leak.
|
||||
|
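The updated assertions in this test reflect the "Eager Worker Requesting Mode" referenced in the comments: when a lease is granted, the submitter appears to immediately request one more worker from the raylet, provided no other lease request is already pending and there is still work the extra worker could run or steal; once nothing stealable remains, no further workers are requested. A rough model of that decision, with invented names (this is not the submitter's real code):

// Rough model of the eager lease-request decision implied by these tests:
// after a lease is granted, request one more worker only if no request is
// already in flight and there is still work the new worker could take
// (queued tasks or tasks that could be stolen from busy workers).
#include <cstdint>
#include <iostream>

struct SchedulingKeyState {
  int64_t queued_tasks = 0;          // not yet pushed to any worker
  int64_t tasks_in_flight = 0;       // pushed to leased workers, not finished
  bool lease_request_pending = false;
};

bool ShouldRequestAnotherWorker(const SchedulingKeyState &s) {
  if (s.lease_request_pending) return false;            // one request at a time
  return s.queued_tasks > 0 || s.tasks_in_flight > 1;   // something left to run or steal
}

int main() {
  SchedulingKeyState s;
  s.tasks_in_flight = 10;                              // ten tasks already pushed to one worker
  std::cout << ShouldRequestAnotherWorker(s) << "\n";  // 1: there is work to steal

  s.tasks_in_flight = 1;
  std::cout << ShouldRequestAnotherWorker(s) << "\n";  // 0: nothing left to steal
  return 0;
}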
@ -1317,12 +1326,9 @@ TEST(DirectTaskTransportTest, TestPipeliningReuseWorkerLease) {
|
|||
NodeID::Nil(), kLongTimeout, actor_creator, max_tasks_in_flight_per_worker);
|
||||
|
||||
// prepare 30 tasks and save them in a vector
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
std::vector<TaskSpecification> tasks;
|
||||
for (int i = 0; i < 30; i++) {
|
||||
tasks.push_back(BuildTaskSpec(empty_resources, empty_descriptor));
|
||||
tasks.push_back(BuildEmptyTaskSpec());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 30);
|
||||
|
||||
|
@ -1353,14 +1359,17 @@ TEST(DirectTaskTransportTest, TestPipeliningReuseWorkerLease) {
|
|||
}
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 10);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 1);
|
||||
ASSERT_TRUE(raylet_client->ReplyCancelWorkerLease());
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
|
||||
// Tasks 21-30 finish, and the worker is finally returned.
|
||||
for (int i = 21; i <= 30; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
}
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 1);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 30);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 1);
|
||||
ASSERT_TRUE(raylet_client->ReplyCancelWorkerLease());
|
||||
|
||||
// The second lease request is returned immediately.
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil()));
|
||||
|
@ -1397,12 +1406,9 @@ TEST(DirectTaskTransportTest, TestPipeliningNumberOfWorkersRequested) {
|
|||
NodeID::Nil(), kLongTimeout, actor_creator, max_tasks_in_flight_per_worker);
|
||||
|
||||
// prepare 30 tasks and save them in a vector
|
||||
std::unordered_map<std::string, double> empty_resources;
|
||||
ray::FunctionDescriptor empty_descriptor =
|
||||
ray::FunctionDescriptorBuilder::BuildPython("", "", "", "");
|
||||
std::vector<TaskSpecification> tasks;
|
||||
for (int i = 0; i < 30; i++) {
|
||||
tasks.push_back(BuildTaskSpec(empty_resources, empty_descriptor));
|
||||
tasks.push_back(BuildEmptyTaskSpec());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 30);
|
||||
|
||||
|
@ -1419,9 +1425,10 @@ TEST(DirectTaskTransportTest, TestPipeliningNumberOfWorkersRequested) {
|
|||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 0);
|
||||
|
||||
// Grant a worker lease, and check that still only 1 worker was requested.
|
||||
// Grant a worker lease, and check that one more worker was requested due to the Eager
|
||||
// Worker Requesting Mode.
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1000, NodeID::Nil()));
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 1);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
|
@ -1429,14 +1436,14 @@ TEST(DirectTaskTransportTest, TestPipeliningNumberOfWorkersRequested) {
|
|||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 4);
|
||||
|
||||
// Submit 6 more tasks, and check that still only 1 worker was requested.
|
||||
// Submit 6 more tasks, and check that still only 2 workers were requested.
|
||||
for (int i = 1; i <= 6; i++) {
|
||||
auto task = tasks.front();
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
tasks.erase(tasks.begin());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 20);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 1);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
|
@ -1444,7 +1451,8 @@ TEST(DirectTaskTransportTest, TestPipeliningNumberOfWorkersRequested) {
|
|||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 10);
|
||||
|
||||
// Submit 1 more task, and check that one more worker is requested, for a total of 2.
|
||||
// Submit 1 more task, and check that no additional worker is requested, because a
|
||||
// request is already pending (due to the Eager Worker Requesting mode)
|
||||
auto task = tasks.front();
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
tasks.erase(tasks.begin());
|
||||
|
@ -1457,9 +1465,10 @@ TEST(DirectTaskTransportTest, TestPipeliningNumberOfWorkersRequested) {
|
|||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 10);
|
||||
|
||||
// Grant a worker lease, and check that still only 2 workers were requested.
|
||||
// Grant a worker lease, and check that one more worker is requested because there are
|
||||
// stealable tasks.
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil()));
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
|
@ -1475,7 +1484,7 @@ TEST(DirectTaskTransportTest, TestPipeliningNumberOfWorkersRequested) {
|
|||
tasks.erase(tasks.begin());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 10);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
|
@ -1484,11 +1493,11 @@ TEST(DirectTaskTransportTest, TestPipeliningNumberOfWorkersRequested) {
|
|||
ASSERT_EQ(worker_client->callbacks.size(), 20);
|
||||
|
||||
// Call ReplyPushTask on a quarter of the submitted tasks (5), and check that the
|
||||
// total number of workers requested remains equal to 2.
|
||||
// total number of workers requested remains equal to 3.
|
||||
for (int i = 1; i <= 5; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
}
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 5);
|
||||
|
@ -1503,7 +1512,7 @@ TEST(DirectTaskTransportTest, TestPipeliningNumberOfWorkersRequested) {
|
|||
tasks.erase(tasks.begin());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 5);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 5);
|
||||
|
@ -1512,11 +1521,11 @@ TEST(DirectTaskTransportTest, TestPipeliningNumberOfWorkersRequested) {
|
|||
ASSERT_EQ(worker_client->callbacks.size(), 20);
|
||||
|
||||
// Call ReplyPushTask on a quarter of the submitted tasks (5), and check that the
|
||||
// total number of workers requested remains equal to 2.
|
||||
// total number of workers requested remains equal to 3.
|
||||
for (int i = 1; i <= 5; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
}
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 10);
|
||||
|
@ -1525,14 +1534,14 @@ TEST(DirectTaskTransportTest, TestPipeliningNumberOfWorkersRequested) {
|
|||
ASSERT_EQ(worker_client->callbacks.size(), 15);
|
||||
|
||||
// Submit last 5 tasks, and check that the total number of workers requested is still
|
||||
// 2
|
||||
// 3
|
||||
for (int i = 1; i <= 5; i++) {
|
||||
auto task = tasks.front();
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
tasks.erase(tasks.begin());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 10);
|
||||
|
@ -1545,19 +1554,436 @@ TEST(DirectTaskTransportTest, TestPipeliningNumberOfWorkersRequested) {
|
|||
for (int i = 1; i <= 20; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
}
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 30);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 1);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 0);
|
||||
ASSERT_TRUE(raylet_client->ReplyCancelWorkerLease());
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("nil", 0, NodeID::Nil(), /*cancel=*/true));
|
||||
ASSERT_FALSE(raylet_client->ReplyCancelWorkerLease());
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 1);
|
||||
|
||||
// Check that there are no entries left in the scheduling_key_entries_ hashmap. These
|
||||
// would otherwise cause a memory leak.
|
||||
ASSERT_TRUE(submitter.CheckNoSchedulingKeyEntriesPublic());
|
||||
}
|
||||
|
||||
TEST(DirectTaskTransportTest, TestStealingTasks) {
|
||||
rpc::Address address;
|
||||
auto raylet_client = std::make_shared<MockRayletClient>();
|
||||
auto worker_client = std::make_shared<MockWorkerClient>();
|
||||
auto store = std::make_shared<CoreWorkerMemoryStore>();
|
||||
auto client_pool = std::make_shared<rpc::CoreWorkerClientPool>(
|
||||
[&](const rpc::Address &addr) { return worker_client; });
|
||||
auto task_finisher = std::make_shared<MockTaskFinisher>();
|
||||
auto actor_creator = std::make_shared<MockActorCreator>();
|
||||
auto lease_policy = std::make_shared<MockLeasePolicy>();
|
||||
|
||||
// Set max_tasks_in_flight_per_worker to a value larger than 1 to enable the
|
||||
// pipelining of task submissions. This is done by passing a
|
||||
// max_tasks_in_flight_per_worker parameter to the CoreWorkerDirectTaskSubmitter.
|
||||
uint32_t max_tasks_in_flight_per_worker = 10;
|
||||
CoreWorkerDirectTaskSubmitter submitter(
|
||||
address, raylet_client, client_pool, nullptr, lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator, max_tasks_in_flight_per_worker);
|
||||
|
||||
// prepare 20 tasks and save them in a vector
|
||||
std::vector<TaskSpecification> tasks;
|
||||
for (int i = 0; i < 20; i++) {
|
||||
tasks.push_back(BuildEmptyTaskSpec());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 20);
|
||||
|
||||
// Submit all 20 tasks, and check that 1 worker is requested.
|
||||
for (int i = 1; i <= 20; i++) {
|
||||
auto task = tasks.front();
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
tasks.erase(tasks.begin());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 1);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 0);
|
||||
|
||||
// Grant a worker lease, and check that one more worker is requested due to the Eager
|
||||
// Worker Requesting Mode.
|
||||
std::string worker1_id = "worker1_ID_abcdefghijklmnopq";
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil(), false,
|
||||
worker1_id));
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 10);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
std::string worker2_id = "worker2_ID_abcdefghijklmnopq";
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1002, NodeID::Nil(), false,
|
||||
worker2_id));
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 20);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// First worker runs the first 10 tasks
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
}
|
||||
// First worker begins stealing from the second worker
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 10);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 10);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 1);
|
||||
|
||||
// 5 tasks get stolen!
|
||||
for (int i = 1; i <= 5; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask(Status::OK(), false, true));
|
||||
}
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 10);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 5);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 1);
|
||||
|
||||
// The 5 stolen tasks are forwarded from the victim (2nd worker) to the thief (1st
|
||||
// worker)
|
||||
std::vector<TaskSpecification> tasks_stolen;
|
||||
for (int i = 0; i < 5; i++) {
|
||||
tasks_stolen.push_back(BuildEmptyTaskSpec());
|
||||
}
|
||||
ASSERT_TRUE(worker_client->ReplyStealTasks(Status::OK(), tasks_stolen));
|
||||
tasks_stolen.clear();
|
||||
ASSERT_TRUE(tasks_stolen.empty());
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 10);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 10);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// The second worker finishes its workload of 5 tasks and begins stealing from the first
|
||||
// worker
|
||||
for (int i = 1; i <= 5; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
}
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 15);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 5);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 1);
|
||||
// The second worker steals floor(5/2)=2 tasks from the first worker
|
||||
for (int i = 1; i <= 2; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask(Status::OK(), false, true));
|
||||
}
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 15);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 3);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 1);
|
||||
ASSERT_TRUE(tasks_stolen.empty());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
tasks_stolen.push_back(BuildEmptyTaskSpec());
|
||||
}
|
||||
ASSERT_FALSE(tasks_stolen.empty());
|
||||
ASSERT_TRUE(worker_client->ReplyStealTasks(Status::OK(), tasks_stolen));
|
||||
tasks_stolen.clear();
|
||||
ASSERT_TRUE(tasks_stolen.empty());
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 15);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 5);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// The first worker executes the remaining 3 tasks (the ones not stolen) and returns
|
||||
for (int i = 1; i <= 3; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
}
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 1);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 18);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 1);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 2);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// The second worker executes the stolen 2 tasks and returns.
|
||||
for (int i = 1; i <= 2; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
}
|
||||
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 20);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 2);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 0);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
}
|
||||
|
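The bookkeeping that TestStealingTasks above steps through: 20 tasks are split 10/10 across two leased workers; the first worker finishes its 10 and steals floor(10/2) = 5 from the second; the second finishes its remaining 5 and steals floor(5/2) = 2 back, leaving 3 tasks on the first worker and 2 on the second, for 20 completions in total. A short check of that arithmetic (illustrative only):

// Reproduces the task counts asserted in TestStealingTasks above.
#include <cassert>
#include <iostream>

int main() {
  int worker1 = 10, worker2 = 10;      // 20 tasks pushed, 10 per worker
  int done = 0;

  done += worker1; worker1 = 0;        // worker 1 drains its own queue
  int stolen1 = worker2 / 2;           // floor(10/2) = 5 stolen by worker 1
  worker2 -= stolen1; worker1 += stolen1;

  done += worker2; worker2 = 0;        // worker 2 finishes its remaining 5
  int stolen2 = worker1 / 2;           // floor(5/2) = 2 stolen back by worker 2
  worker1 -= stolen2; worker2 += stolen2;

  done += worker1 + worker2;           // the remaining 3 + 2 tasks finish
  assert(stolen1 == 5 && stolen2 == 2 && done == 20);
  std::cout << "all 20 tasks accounted for\n";
  return 0;
}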
||||
TEST(DirectTaskTransportTest, TestNoStealingByExpiredWorker) {
|
||||
rpc::Address address;
|
||||
auto raylet_client = std::make_shared<MockRayletClient>();
|
||||
auto worker_client = std::make_shared<MockWorkerClient>();
|
||||
auto store = std::make_shared<CoreWorkerMemoryStore>();
|
||||
auto client_pool = std::make_shared<rpc::CoreWorkerClientPool>(
|
||||
[&](const rpc::Address &addr) { return worker_client; });
|
||||
auto task_finisher = std::make_shared<MockTaskFinisher>();
|
||||
auto actor_creator = std::make_shared<MockActorCreator>();
|
||||
auto lease_policy = std::make_shared<MockLeasePolicy>();
|
||||
|
||||
// Set max_tasks_in_flight_per_worker to a value larger than 1 to enable the
|
||||
// pipelining of task submissions. This is done by passing a
|
||||
// max_tasks_in_flight_per_worker parameter to the CoreWorkerDirectTaskSubmitter.
|
||||
uint32_t max_tasks_in_flight_per_worker = 10;
|
||||
CoreWorkerDirectTaskSubmitter submitter(
|
||||
address, raylet_client, client_pool, nullptr, lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), 1000, actor_creator, max_tasks_in_flight_per_worker);
|
||||
|
||||
// prepare 30 tasks and save them in a vector
|
||||
std::vector<TaskSpecification> tasks;
|
||||
for (int i = 0; i < 30; i++) {
|
||||
tasks.push_back(BuildEmptyTaskSpec());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 30);
|
||||
|
||||
// Submit the tasks, and check that one worker is requested.
|
||||
for (int i = 1; i <= 30; i++) {
|
||||
auto task = tasks.front();
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
tasks.erase(tasks.begin());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 1);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 0);
|
||||
|
||||
// Grant a worker lease, and check that one more worker is requested due to the Eager
|
||||
// Worker Requesting Mode.
|
||||
std::string worker1_id = "worker1_ID_abcdefghijklmnopq";
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil(), false,
|
||||
worker1_id));
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 10);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// Grant a second worker lease, and check that one more worker is requested due to the
|
||||
// Eager Worker Requesting Mode.
|
||||
std::string worker2_id = "worker2_ID_abcdefghijklmnopq";
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1002, NodeID::Nil(), false,
|
||||
worker2_id));
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 3);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 20);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// Grant a third worker lease, and check that one more worker is requested due to the
|
||||
// Eager Worker Requesting Mode.
|
||||
std::string worker3_id = "worker3_ID_abcdefghijklmnopq";
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1003, NodeID::Nil(), false,
|
||||
worker3_id));
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 4);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 30);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// First worker runs the first 9 tasks and returns an error on completion of the last
|
||||
// one (10th task).
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
bool found_error = (i == 10);
|
||||
auto status = Status::OK();
|
||||
ASSERT_TRUE(status.ok());
|
||||
if (found_error) {
|
||||
status = Status::UnknownError("Worker has experienced an unknown error!");
|
||||
ASSERT_FALSE(status.ok());
|
||||
}
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask(status));
|
||||
}
|
||||
|
||||
// Check that the first worker does not start stealing, and that it is returned to the
|
||||
// Raylet instead.
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 4);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 1);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 9);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 1);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 20);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// Second worker runs the first 9 tasks. Then we let its lease expire, and check that it
|
||||
// does not initiate stealing.
|
||||
for (int i = 1; i <= 9; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
}
|
||||
std::this_thread::sleep_for(
|
||||
std::chrono::milliseconds(2000)); // Sleep for 2s, causing the 1s lease to time out.
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
// Check that the second worker does not start stealing, and that it is returned to the
|
||||
// Raylet instead.
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 4);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 1);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 1);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 19);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 1);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 10);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// Last worker finishes its workload and returns
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
}
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 4);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 1);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 29);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 1);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 0);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
}
|
||||
|
||||
TEST(DirectTaskTransportTest, TestNoWorkerRequestedIfStealingUnavailable) {
|
||||
rpc::Address address;
|
||||
auto raylet_client = std::make_shared<MockRayletClient>();
|
||||
auto worker_client = std::make_shared<MockWorkerClient>();
|
||||
auto store = std::make_shared<CoreWorkerMemoryStore>();
|
||||
auto client_pool = std::make_shared<rpc::CoreWorkerClientPool>(
|
||||
[&](const rpc::Address &addr) { return worker_client; });
|
||||
auto task_finisher = std::make_shared<MockTaskFinisher>();
|
||||
auto actor_creator = std::make_shared<MockActorCreator>();
|
||||
auto lease_policy = std::make_shared<MockLeasePolicy>();
|
||||
|
||||
// Set max_tasks_in_flight_per_worker to a value larger than 1 to enable the
|
||||
// pipelining of task submissions. This is done by passing a
|
||||
// max_tasks_in_flight_per_worker parameter to the CoreWorkerDirectTaskSubmitter.
|
||||
uint32_t max_tasks_in_flight_per_worker = 10;
|
||||
CoreWorkerDirectTaskSubmitter submitter(
|
||||
address, raylet_client, client_pool, nullptr, lease_policy, store, task_finisher,
|
||||
NodeID::Nil(), kLongTimeout, actor_creator, max_tasks_in_flight_per_worker);
|
||||
|
||||
// prepare 10 tasks and save them in a vector
|
||||
std::vector<TaskSpecification> tasks;
|
||||
for (int i = 0; i < 10; i++) {
|
||||
tasks.push_back(BuildEmptyTaskSpec());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 10);
|
||||
|
||||
// submit all 10 tasks
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
auto task = tasks.front();
|
||||
ASSERT_TRUE(submitter.SubmitTask(task).ok());
|
||||
tasks.erase(tasks.begin());
|
||||
}
|
||||
ASSERT_EQ(tasks.size(), 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 1);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 0);
|
||||
|
||||
// Grant a worker lease, and check that one more worker is requested due to the Eager
|
||||
// Worker Requesting Mode, even if the task queue is empty.
|
||||
std::string worker1_id = "worker1_ID_abcdefghijklmnopq";
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1001, NodeID::Nil(), false,
|
||||
worker1_id));
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 10);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// Execute 9 tasks
|
||||
for (int i = 1; i <= 9; i++) {
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
}
|
||||
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 9);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 1);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// Grant a second worker, which returns immediately because there are no stealable
|
||||
// tasks.
|
||||
std::string worker2_id = "worker2_ID_abcdefghijklmnopq";
|
||||
ASSERT_TRUE(raylet_client->GrantWorkerLease("localhost", 1002, NodeID::Nil(), false,
|
||||
worker2_id));
|
||||
|
||||
// Check that no more workers are requested now that there are no more stealable tasks.
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 1);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 9);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 1);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
|
||||
// Last task runs and first worker is returned
|
||||
ASSERT_TRUE(worker_client->ReplyPushTask());
|
||||
ASSERT_EQ(raylet_client->num_workers_requested, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_returned, 2);
|
||||
ASSERT_EQ(raylet_client->num_workers_disconnected, 0);
|
||||
ASSERT_EQ(task_finisher->num_tasks_complete, 10);
|
||||
ASSERT_EQ(task_finisher->num_tasks_failed, 0);
|
||||
ASSERT_EQ(raylet_client->num_leases_canceled, 0);
|
||||
ASSERT_EQ(worker_client->callbacks.size(), 0);
|
||||
ASSERT_EQ(worker_client->steal_callbacks.size(), 0);
|
||||
}
|
||||
|
||||
} // namespace ray
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
|
|
@@ -42,15 +42,18 @@ TEST(SchedulingQueueTest, TestInOrder) {
ActorSchedulingQueue queue(io_service, waiter);
int n_ok = 0;
int n_rej = 0;
int n_steal = 0;
auto fn_ok = [&n_ok](rpc::SendReplyCallback callback) { n_ok++; };
auto fn_rej = [&n_rej](rpc::SendReplyCallback callback) { n_rej++; };
queue.Add(0, -1, fn_ok, fn_rej, nullptr);
queue.Add(1, -1, fn_ok, fn_rej, nullptr);
queue.Add(2, -1, fn_ok, fn_rej, nullptr);
queue.Add(3, -1, fn_ok, fn_rej, nullptr);
auto fn_steal = [&n_steal](rpc::SendReplyCallback callback) { n_steal++; };
queue.Add(0, -1, fn_ok, fn_rej, nullptr, fn_steal);
queue.Add(1, -1, fn_ok, fn_rej, nullptr, fn_steal);
queue.Add(2, -1, fn_ok, fn_rej, nullptr, fn_steal);
queue.Add(3, -1, fn_ok, fn_rej, nullptr, fn_steal);
io_service.run();
ASSERT_EQ(n_ok, 4);
ASSERT_EQ(n_rej, 0);
ASSERT_EQ(n_steal, 0);
}

TEST(SchedulingQueueTest, TestWaitForObjects) {
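Every Add call in these tests now passes an extra fn_steal callback, placed after the reply argument and before the optional task ID and dependency list; presumably it is the hook the queue invokes when a queued request is stolen before it runs, and the assertions check that it never fires during ordinary in-order execution. A toy queue with the same three counters makes the expected n_ok/n_rej/n_steal bookkeeping concrete (this is not the real ActorSchedulingQueue):

// Toy stand-in for the scheduling queue used in these tests: each request
// carries accept/reject/steal handlers, and draining the queue normally only
// ever fires the accept handler, so the "steal" counter stays at zero.
#include <functional>
#include <iostream>
#include <vector>

struct Request {
  std::function<void()> accept, reject, steal;
};

class ToySchedulingQueue {
 public:
  void Add(std::function<void()> accept, std::function<void()> reject,
           std::function<void()> steal) {
    requests_.push_back({accept, reject, steal});
  }
  void RunAllInOrder() {
    for (auto &r : requests_) r.accept();
    requests_.clear();
  }

 private:
  std::vector<Request> requests_;
};

int main() {
  int n_ok = 0, n_rej = 0, n_steal = 0;
  ToySchedulingQueue queue;
  for (int i = 0; i < 4; i++) {
    queue.Add([&] { n_ok++; }, [&] { n_rej++; }, [&] { n_steal++; });
  }
  queue.RunAllInOrder();
  std::cout << n_ok << " " << n_rej << " " << n_steal << "\n";  // 4 0 0
  return 0;
}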
@ -62,12 +65,19 @@ TEST(SchedulingQueueTest, TestWaitForObjects) {
|
|||
ActorSchedulingQueue queue(io_service, waiter);
|
||||
int n_ok = 0;
|
||||
int n_rej = 0;
|
||||
int n_steal = 0;
|
||||
|
||||
auto fn_ok = [&n_ok](rpc::SendReplyCallback callback) { n_ok++; };
|
||||
auto fn_rej = [&n_rej](rpc::SendReplyCallback callback) { n_rej++; };
|
||||
queue.Add(0, -1, fn_ok, fn_rej, nullptr);
|
||||
queue.Add(1, -1, fn_ok, fn_rej, nullptr, TaskID::Nil(), ObjectIdsToRefs({obj1}));
|
||||
queue.Add(2, -1, fn_ok, fn_rej, nullptr, TaskID::Nil(), ObjectIdsToRefs({obj2}));
|
||||
queue.Add(3, -1, fn_ok, fn_rej, nullptr, TaskID::Nil(), ObjectIdsToRefs({obj3}));
|
||||
auto fn_steal = [&n_steal](rpc::SendReplyCallback callback) { n_steal++; };
|
||||
queue.Add(0, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue.Add(1, -1, fn_ok, fn_rej, nullptr, fn_steal, TaskID::Nil(),
|
||||
ObjectIdsToRefs({obj1}));
|
||||
queue.Add(2, -1, fn_ok, fn_rej, nullptr, fn_steal, TaskID::Nil(),
|
||||
ObjectIdsToRefs({obj2}));
|
||||
queue.Add(3, -1, fn_ok, fn_rej, nullptr, fn_steal, TaskID::Nil(),
|
||||
ObjectIdsToRefs({obj3}));
|
||||
|
||||
ASSERT_EQ(n_ok, 1);
|
||||
|
||||
waiter.Complete(0);
|
||||
|
@ -78,6 +88,8 @@ TEST(SchedulingQueueTest, TestWaitForObjects) {
|
|||
|
||||
waiter.Complete(1);
|
||||
ASSERT_EQ(n_ok, 4);
|
||||
|
||||
ASSERT_EQ(n_steal, 0);
|
||||
}
|
||||
|
||||
TEST(SchedulingQueueTest, TestWaitForObjectsNotSubjectToSeqTimeout) {
|
||||
|
@ -87,15 +99,21 @@ TEST(SchedulingQueueTest, TestWaitForObjectsNotSubjectToSeqTimeout) {
|
|||
ActorSchedulingQueue queue(io_service, waiter);
|
||||
int n_ok = 0;
|
||||
int n_rej = 0;
|
||||
int n_steal = 0;
|
||||
|
||||
auto fn_ok = [&n_ok](rpc::SendReplyCallback callback) { n_ok++; };
|
||||
auto fn_rej = [&n_rej](rpc::SendReplyCallback callback) { n_rej++; };
|
||||
queue.Add(0, -1, fn_ok, fn_rej, nullptr);
|
||||
queue.Add(1, -1, fn_ok, fn_rej, nullptr, TaskID::Nil(), ObjectIdsToRefs({obj1}));
|
||||
auto fn_steal = [&n_steal](rpc::SendReplyCallback callback) { n_steal++; };
|
||||
queue.Add(0, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue.Add(1, -1, fn_ok, fn_rej, nullptr, fn_steal, TaskID::Nil(),
|
||||
ObjectIdsToRefs({obj1}));
|
||||
|
||||
ASSERT_EQ(n_ok, 1);
|
||||
io_service.run();
|
||||
ASSERT_EQ(n_rej, 0);
|
||||
waiter.Complete(0);
|
||||
ASSERT_EQ(n_ok, 2);
|
||||
ASSERT_EQ(n_steal, 0);
|
||||
}
|
||||
|
||||
TEST(SchedulingQueueTest, TestOutOfOrder) {
|
||||
|
@ -104,15 +122,18 @@ TEST(SchedulingQueueTest, TestOutOfOrder) {
|
|||
ActorSchedulingQueue queue(io_service, waiter);
|
||||
int n_ok = 0;
|
||||
int n_rej = 0;
|
||||
int n_steal = 0;
|
||||
auto fn_ok = [&n_ok](rpc::SendReplyCallback callback) { n_ok++; };
|
||||
auto fn_rej = [&n_rej](rpc::SendReplyCallback callback) { n_rej++; };
|
||||
queue.Add(2, -1, fn_ok, fn_rej, nullptr);
|
||||
queue.Add(0, -1, fn_ok, fn_rej, nullptr);
|
||||
queue.Add(3, -1, fn_ok, fn_rej, nullptr);
|
||||
queue.Add(1, -1, fn_ok, fn_rej, nullptr);
|
||||
auto fn_steal = [&n_steal](rpc::SendReplyCallback callback) { n_steal++; };
|
||||
queue.Add(2, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue.Add(0, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue.Add(3, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue.Add(1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
io_service.run();
|
||||
ASSERT_EQ(n_ok, 4);
|
||||
ASSERT_EQ(n_rej, 0);
|
||||
ASSERT_EQ(n_steal, 0);
|
||||
}
|
||||
|
||||
TEST(SchedulingQueueTest, TestSeqWaitTimeout) {
|
||||
|
@ -121,20 +142,23 @@ TEST(SchedulingQueueTest, TestSeqWaitTimeout) {
|
|||
ActorSchedulingQueue queue(io_service, waiter);
|
||||
int n_ok = 0;
|
||||
int n_rej = 0;
|
||||
int n_steal = 0;
|
||||
auto fn_ok = [&n_ok](rpc::SendReplyCallback callback) { n_ok++; };
|
||||
auto fn_rej = [&n_rej](rpc::SendReplyCallback callback) { n_rej++; };
|
||||
queue.Add(2, -1, fn_ok, fn_rej, nullptr);
|
||||
queue.Add(0, -1, fn_ok, fn_rej, nullptr);
|
||||
queue.Add(3, -1, fn_ok, fn_rej, nullptr);
|
||||
auto fn_steal = [&n_steal](rpc::SendReplyCallback callback) { n_steal++; };
|
||||
queue.Add(2, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue.Add(0, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue.Add(3, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
ASSERT_EQ(n_ok, 1);
|
||||
ASSERT_EQ(n_rej, 0);
|
||||
io_service.run(); // immediately triggers timeout
|
||||
ASSERT_EQ(n_ok, 1);
|
||||
ASSERT_EQ(n_rej, 2);
|
||||
queue.Add(4, -1, fn_ok, fn_rej, nullptr);
|
||||
queue.Add(5, -1, fn_ok, fn_rej, nullptr);
|
||||
queue.Add(4, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue.Add(5, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
ASSERT_EQ(n_ok, 3);
|
||||
ASSERT_EQ(n_rej, 2);
|
||||
ASSERT_EQ(n_steal, 0);
|
||||
}
|
||||
|
||||
TEST(SchedulingQueueTest, TestSkipAlreadyProcessedByClient) {
|
||||
|
@ -143,14 +167,17 @@ TEST(SchedulingQueueTest, TestSkipAlreadyProcessedByClient) {
|
|||
ActorSchedulingQueue queue(io_service, waiter);
|
||||
int n_ok = 0;
|
||||
int n_rej = 0;
|
||||
int n_steal = 0;
|
||||
auto fn_ok = [&n_ok](rpc::SendReplyCallback callback) { n_ok++; };
|
||||
auto fn_rej = [&n_rej](rpc::SendReplyCallback callback) { n_rej++; };
|
||||
queue.Add(2, 2, fn_ok, fn_rej, nullptr);
|
||||
queue.Add(3, 2, fn_ok, fn_rej, nullptr);
|
||||
queue.Add(1, 2, fn_ok, fn_rej, nullptr);
|
||||
auto fn_steal = [&n_steal](rpc::SendReplyCallback callback) { n_steal++; };
|
||||
queue.Add(2, 2, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue.Add(3, 2, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue.Add(1, 2, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
io_service.run();
|
||||
ASSERT_EQ(n_ok, 1);
|
||||
ASSERT_EQ(n_rej, 2);
|
||||
ASSERT_EQ(n_steal, 0);
|
||||
}
|
||||
|
||||
TEST(SchedulingQueueTest, TestCancelQueuedTask) {
|
||||
|
@ -158,18 +185,127 @@ TEST(SchedulingQueueTest, TestCancelQueuedTask) {
|
|||
ASSERT_TRUE(queue->TaskQueueEmpty());
|
||||
int n_ok = 0;
|
||||
int n_rej = 0;
|
||||
int n_steal = 0;
|
||||
auto fn_ok = [&n_ok](rpc::SendReplyCallback callback) { n_ok++; };
|
||||
auto fn_rej = [&n_rej](rpc::SendReplyCallback callback) { n_rej++; };
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr);
|
||||
auto fn_steal = [&n_steal](rpc::SendReplyCallback callback) { n_steal++; };
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
ASSERT_TRUE(queue->CancelTaskIfFound(TaskID::Nil()));
|
||||
ASSERT_FALSE(queue->TaskQueueEmpty());
|
||||
queue->ScheduleRequests();
|
||||
ASSERT_EQ(n_ok, 4);
|
||||
ASSERT_EQ(n_rej, 0);
|
||||
ASSERT_EQ(n_steal, 0);
|
||||
}
|
||||
|
||||
TEST(SchedulingQueueTest, TestStealingOneTask) {
|
||||
NormalSchedulingQueue *queue = new NormalSchedulingQueue();
|
||||
ASSERT_TRUE(queue->TaskQueueEmpty());
|
||||
int n_ok = 0;
|
||||
int n_rej = 0;
|
||||
int n_steal = 0;
|
||||
auto fn_ok = [&n_ok](rpc::SendReplyCallback callback) { n_ok++; };
|
||||
auto fn_rej = [&n_rej](rpc::SendReplyCallback callback) { n_rej++; };
|
||||
auto fn_steal = [&n_steal](rpc::SendReplyCallback callback) { n_steal++; };
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
|
||||
auto reply = rpc::StealTasksReply();
|
||||
size_t n_stolen = reply.stolen_tasks_ids_size();
|
||||
ASSERT_EQ(n_stolen, 0);
|
||||
|
||||
ASSERT_EQ(queue->Steal(&reply), 0);
|
||||
n_stolen = reply.stolen_tasks_ids_size();
|
||||
ASSERT_EQ(n_stolen, 0);
|
||||
ASSERT_FALSE(queue->TaskQueueEmpty());
|
||||
queue->ScheduleRequests();
|
||||
ASSERT_TRUE(queue->TaskQueueEmpty());
|
||||
ASSERT_EQ(n_ok, 1);
|
||||
ASSERT_EQ(n_rej, 0);
|
||||
ASSERT_EQ(n_steal, 0);
|
||||
}
|
||||
|
||||
TEST(SchedulingQueueTest, TestStealingEvenNumberTasks) {
|
||||
NormalSchedulingQueue *queue = new NormalSchedulingQueue();
|
||||
ASSERT_TRUE(queue->TaskQueueEmpty());
|
||||
int n_ok = 0;
|
||||
int n_rej = 0;
|
||||
int n_steal = 0;
|
||||
auto fn_ok = [&n_ok](rpc::SendReplyCallback callback) { n_ok++; };
|
||||
auto fn_rej = [&n_rej](rpc::SendReplyCallback callback) { n_rej++; };
|
||||
auto fn_steal = [&n_steal](rpc::SendReplyCallback callback) { n_steal++; };
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
|
||||
auto reply = rpc::StealTasksReply();
|
||||
size_t n_stolen = reply.stolen_tasks_ids_size();
|
||||
ASSERT_EQ(n_stolen, 0);
|
||||
|
||||
ASSERT_EQ(queue->Steal(&reply), 5);
|
||||
n_stolen = reply.stolen_tasks_ids_size();
|
||||
ASSERT_EQ(n_stolen, 5);
|
||||
ASSERT_FALSE(queue->TaskQueueEmpty());
|
||||
queue->ScheduleRequests();
|
||||
queue->ScheduleRequests();
|
||||
queue->ScheduleRequests();
|
||||
queue->ScheduleRequests();
|
||||
queue->ScheduleRequests();
|
||||
ASSERT_TRUE(queue->TaskQueueEmpty());
|
||||
ASSERT_EQ(n_ok, 5);
|
||||
ASSERT_EQ(n_rej, 0);
|
||||
ASSERT_EQ(n_steal, 5);
|
||||
}
|
||||
|
||||
TEST(SchedulingQueueTest, TestStealingOddNumberTasks) {
|
||||
NormalSchedulingQueue *queue = new NormalSchedulingQueue();
|
||||
ASSERT_TRUE(queue->TaskQueueEmpty());
|
||||
int n_ok = 0;
|
||||
int n_rej = 0;
|
||||
int n_steal = 0;
|
||||
auto fn_ok = [&n_ok](rpc::SendReplyCallback callback) { n_ok++; };
|
||||
auto fn_rej = [&n_rej](rpc::SendReplyCallback callback) { n_rej++; };
|
||||
auto fn_steal = [&n_steal](rpc::SendReplyCallback callback) { n_steal++; };
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
queue->Add(-1, -1, fn_ok, fn_rej, nullptr, fn_steal);
|
||||
|
||||
auto reply = rpc::StealTasksReply();
|
||||
size_t n_stolen = reply.stolen_tasks_ids_size();
|
||||
ASSERT_EQ(n_stolen, 0);
|
||||
|
||||
ASSERT_EQ(queue->Steal(&reply), 5);
|
||||
n_stolen = reply.stolen_tasks_ids_size();
|
||||
ASSERT_EQ(n_stolen, 5);
|
||||
ASSERT_FALSE(queue->TaskQueueEmpty());
|
||||
queue->ScheduleRequests();
|
||||
queue->ScheduleRequests();
|
||||
queue->ScheduleRequests();
|
||||
queue->ScheduleRequests();
|
||||
queue->ScheduleRequests();
|
||||
queue->ScheduleRequests();
|
||||
ASSERT_TRUE(queue->TaskQueueEmpty());
|
||||
ASSERT_EQ(n_ok, 6);
|
||||
ASSERT_EQ(n_rej, 0);
|
||||
ASSERT_EQ(n_steal, 5);
|
||||
}
|
||||
|
||||
} // namespace ray
|
||||
|
|
|
@ -530,6 +530,15 @@ void CoreWorkerDirectTaskReceiver::HandleTask(
|
|||
send_reply_callback(Status::Invalid("client cancelled stale rpc"), nullptr, nullptr);
|
||||
};
|
||||
|
||||
auto steal_callback = [this, task_spec,
|
||||
reply](rpc::SendReplyCallback send_reply_callback) {
|
||||
RAY_LOG(DEBUG) << "Task " << task_spec.TaskId() << " was stolen from "
|
||||
<< worker_context_.GetWorkerID()
|
||||
<< "'s non_actor_task_queue_! Setting reply->set_task_stolen(true)!";
|
||||
reply->set_task_stolen(true);
|
||||
send_reply_callback(Status::OK(), nullptr, nullptr);
|
||||
};
|
||||
|
||||
auto dependencies = task_spec.GetDependencies(false);
|
||||
|
||||
if (task_spec.IsActorTask()) {
|
||||
|
@ -544,13 +553,15 @@ void CoreWorkerDirectTaskReceiver::HandleTask(
|
|||
|
||||
it->second->Add(request.sequence_number(), request.client_processed_up_to(),
|
||||
std::move(accept_callback), std::move(reject_callback),
|
||||
std::move(send_reply_callback), task_spec.TaskId(), dependencies);
|
||||
std::move(send_reply_callback), nullptr, task_spec.TaskId(),
|
||||
dependencies);
|
||||
} else {
|
||||
// Add the normal task's callbacks to the non-actor scheduling queue.
|
||||
normal_scheduling_queue_->Add(
|
||||
request.sequence_number(), request.client_processed_up_to(),
|
||||
std::move(accept_callback), std::move(reject_callback),
|
||||
std::move(send_reply_callback), task_spec.TaskId(), dependencies);
|
||||
std::move(send_reply_callback), std::move(steal_callback), task_spec.TaskId(),
|
||||
dependencies);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -564,6 +575,16 @@ void CoreWorkerDirectTaskReceiver::RunNormalTasksFromQueue() {
|
|||
normal_scheduling_queue_->ScheduleRequests();
|
||||
}
|
||||
|
||||
void CoreWorkerDirectTaskReceiver::HandleStealTasks(
const rpc::StealTasksRequest &request, rpc::StealTasksReply *reply,
rpc::SendReplyCallback send_reply_callback) {
size_t n_tasks_stolen = normal_scheduling_queue_->Steal(reply);
RAY_LOG(DEBUG) << "Number of tasks stolen is " << n_tasks_stolen;

// send reply back
send_reply_callback(Status::OK(), nullptr, nullptr);
}

bool CoreWorkerDirectTaskReceiver::CancelQueuedNormalTask(TaskID task_id) {
|
||||
// Look up the task to be canceled in the queue of normal tasks. If it is found and
|
||||
// removed successfully, return true.
|
||||
|
|
|
@ -276,18 +276,28 @@ class CoreWorkerDirectActorTaskSubmitter
|
|||
class InboundRequest {
|
||||
public:
|
||||
InboundRequest(){};
|
||||
|
||||
InboundRequest(std::function<void(rpc::SendReplyCallback)> accept_callback,
|
||||
std::function<void(rpc::SendReplyCallback)> reject_callback,
|
||||
std::function<void(rpc::SendReplyCallback)> steal_callback,
|
||||
rpc::SendReplyCallback send_reply_callback, TaskID task_id,
|
||||
bool has_dependencies)
|
||||
: accept_callback_(std::move(accept_callback)),
|
||||
reject_callback_(std::move(reject_callback)),
|
||||
steal_callback_(std::move(steal_callback)),
|
||||
send_reply_callback_(std::move(send_reply_callback)),
|
||||
task_id(task_id),
|
||||
has_pending_dependencies_(has_dependencies) {}
|
||||
|
||||
void Accept() { accept_callback_(std::move(send_reply_callback_)); }
|
||||
void Cancel() { reject_callback_(std::move(send_reply_callback_)); }
|
||||
void Steal(rpc::StealTasksReply *reply) {
|
||||
reply->add_stolen_tasks_ids(task_id.Binary());
|
||||
RAY_CHECK(TaskID::FromBinary(reply->stolen_tasks_ids(reply->stolen_tasks_ids_size() -
|
||||
1)) == task_id);
|
||||
steal_callback_(std::move(send_reply_callback_));
|
||||
}
|
||||
|
||||
bool CanExecute() const { return !has_pending_dependencies_; }
|
||||
ray::TaskID TaskID() const { return task_id; }
|
||||
void MarkDependenciesSatisfied() { has_pending_dependencies_ = false; }
|
||||
|
@ -295,7 +305,9 @@ class InboundRequest {
|
|||
private:
|
||||
std::function<void(rpc::SendReplyCallback)> accept_callback_;
|
||||
std::function<void(rpc::SendReplyCallback)> reject_callback_;
|
||||
std::function<void(rpc::SendReplyCallback)> steal_callback_;
|
||||
rpc::SendReplyCallback send_reply_callback_;
|
||||
|
||||
ray::TaskID task_id;
|
||||
bool has_pending_dependencies_;
|
||||
};
|
||||
|
@ -378,10 +390,13 @@ class SchedulingQueue {
|
|||
std::function<void(rpc::SendReplyCallback)> accept_request,
|
||||
std::function<void(rpc::SendReplyCallback)> reject_request,
|
||||
rpc::SendReplyCallback send_reply_callback,
|
||||
std::function<void(rpc::SendReplyCallback)> steal_request = nullptr,
|
||||
TaskID task_id = TaskID::Nil(),
|
||||
const std::vector<rpc::ObjectReference> &dependencies = {}) = 0;
|
||||
virtual void ScheduleRequests() = 0;
|
||||
virtual bool TaskQueueEmpty() const = 0;
|
||||
virtual size_t Size() const = 0;
|
||||
virtual size_t Steal(rpc::StealTasksReply *reply) = 0;
|
||||
virtual bool CancelTaskIfFound(TaskID task_id) = 0;
|
||||
virtual ~SchedulingQueue(){};
|
||||
};
|
||||
|
@ -407,13 +422,27 @@ class ActorSchedulingQueue : public SchedulingQueue {
|
|||
}
|
||||
}
|
||||
|
||||
bool TaskQueueEmpty() const { return pending_actor_tasks_.empty(); }
|
||||
bool TaskQueueEmpty() const {
|
||||
RAY_CHECK(false) << "TaskQueueEmpty() not implemented for actor queues";
|
||||
// The return instruction will never be executed, but we need to include it
|
||||
// nonetheless because this is a non-void function.
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t Size() const {
|
||||
RAY_CHECK(false) << "Size() not implemented for actor queues";
|
||||
// The return instruction will never be executed, but we need to include it
|
||||
// nonetheless because this is a non-void function.
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Add a new actor task's callbacks to the worker queue.
|
||||
void Add(int64_t seq_no, int64_t client_processed_up_to,
|
||||
std::function<void(rpc::SendReplyCallback)> accept_request,
|
||||
std::function<void(rpc::SendReplyCallback)> reject_request,
|
||||
rpc::SendReplyCallback send_reply_callback, TaskID task_id = TaskID::Nil(),
|
||||
rpc::SendReplyCallback send_reply_callback,
|
||||
std::function<void(rpc::SendReplyCallback)> steal_request = nullptr,
|
||||
TaskID task_id = TaskID::Nil(),
|
||||
const std::vector<rpc::ObjectReference> &dependencies = {}) {
|
||||
// A seq_no of -1 means no ordering constraint. Actor tasks must be executed in order.
|
||||
RAY_CHECK(seq_no != -1);
|
||||
|
@ -425,9 +454,11 @@ class ActorSchedulingQueue : public SchedulingQueue {
|
|||
next_seq_no_ = client_processed_up_to + 1;
|
||||
}
|
||||
RAY_LOG(DEBUG) << "Enqueue " << seq_no << " cur seqno " << next_seq_no_;
|
||||
pending_actor_tasks_[seq_no] =
|
||||
InboundRequest(std::move(accept_request), std::move(reject_request),
|
||||
std::move(send_reply_callback), task_id, dependencies.size() > 0);
|
||||
|
||||
pending_actor_tasks_[seq_no] = InboundRequest(
|
||||
std::move(accept_request), std::move(reject_request), std::move(steal_request),
|
||||
std::move(send_reply_callback), task_id, dependencies.size() > 0);
|
||||
|
||||
if (dependencies.size() > 0) {
|
||||
waiter_.Wait(dependencies, [seq_no, this]() {
|
||||
RAY_CHECK(boost::this_thread::get_id() == main_thread_id_);
|
||||
|
@ -441,8 +472,15 @@ class ActorSchedulingQueue : public SchedulingQueue {
|
|||
ScheduleRequests();
|
||||
}
|
||||
|
||||
// We don't allow the cancellation of actor tasks, so invoking CancelTaskIfFound results
|
||||
// in a fatal error.
|
||||
size_t Steal(rpc::StealTasksReply *reply) {
|
||||
RAY_CHECK(false) << "Cannot steal actor tasks";
|
||||
// The return instruction will never be executed, but we need to include it
|
||||
// nonetheless because this is a non-void function.
|
||||
return 0;
|
||||
}
|
||||
|
||||
// We don't allow the cancellation of actor tasks, so invoking CancelTaskIfFound
|
||||
// results in a fatal error.
|
||||
bool CancelTaskIfFound(TaskID task_id) {
|
||||
RAY_CHECK(false) << "Cannot cancel actor tasks";
|
||||
// The return instruction will never be executed, but we need to include it
|
||||
|
@ -550,24 +588,60 @@ class NormalSchedulingQueue : public SchedulingQueue {
|
|||
return pending_normal_tasks_.empty();
|
||||
}
|
||||
|
||||
// Returns the current size of the task queue.
|
||||
size_t Size() const {
|
||||
absl::MutexLock lock(&mu_);
|
||||
return pending_normal_tasks_.size();
|
||||
}
|
||||
|
||||
/// Add a new task's callbacks to the worker queue.
|
||||
void Add(int64_t seq_no, int64_t client_processed_up_to,
|
||||
std::function<void(rpc::SendReplyCallback)> accept_request,
|
||||
std::function<void(rpc::SendReplyCallback)> reject_request,
|
||||
rpc::SendReplyCallback send_reply_callback, TaskID task_id = TaskID::Nil(),
|
||||
rpc::SendReplyCallback send_reply_callback,
|
||||
std::function<void(rpc::SendReplyCallback)> steal_request = nullptr,
|
||||
TaskID task_id = TaskID::Nil(),
|
||||
|
||||
const std::vector<rpc::ObjectReference> &dependencies = {}) {
|
||||
absl::MutexLock lock(&mu_);
|
||||
// Normal tasks should not have ordering constraints.
|
||||
RAY_CHECK(seq_no == -1);
|
||||
// Create a InboundRequest object for the new task, and add it to the queue.
|
||||
pending_normal_tasks_.push_back(
|
||||
InboundRequest(std::move(accept_request), std::move(reject_request),
|
||||
std::move(send_reply_callback), task_id, dependencies.size() > 0));
|
||||
|
||||
pending_normal_tasks_.push_back(InboundRequest(
|
||||
std::move(accept_request), std::move(reject_request), std::move(steal_request),
|
||||
std::move(send_reply_callback), task_id, dependencies.size() > 0));
|
||||
}
|
||||
|
||||
/// Steal up to half of the queued tasks by removing them from the back of the queue
/// and responding to their owner.
size_t Steal(rpc::StealTasksReply *reply) {
size_t tasks_stolen = 0;

absl::MutexLock lock(&mu_);

if (pending_normal_tasks_.size() <= 1) {
RAY_LOG(DEBUG) << "We don't have enough tasks to steal, so we return early!";
return tasks_stolen;
}

size_t half = pending_normal_tasks_.size() / 2;

for (tasks_stolen = 0; tasks_stolen < half; tasks_stolen++) {
RAY_CHECK(!pending_normal_tasks_.empty());
InboundRequest tail = pending_normal_tasks_.back();
pending_normal_tasks_.pop_back();
int stolen_task_ids = reply->stolen_tasks_ids_size();
tail.Steal(reply);
RAY_CHECK(reply->stolen_tasks_ids_size() == stolen_task_ids + 1);
}

return tasks_stolen;
}
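Editor's note: a minimal, self-contained sketch of the same steal-half policy on a plain std::deque, not part of this commit; the function and variable names below are illustrative only.

#include <cassert>
#include <deque>
#include <vector>

// Steal half (rounded down) of the queued items, taking from the back so the front of
// the queue keeps the items that the local worker will execute next.
std::vector<int> StealHalf(std::deque<int> &queue) {
  std::vector<int> stolen;
  if (queue.size() <= 1) return stolen;  // nothing to steal from a queue of 0 or 1 items
  size_t half = queue.size() / 2;
  for (size_t i = 0; i < half; i++) {
    stolen.push_back(queue.back());
    queue.pop_back();
  }
  return stolen;
}

int main() {
  std::deque<int> queue = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};  // 11 queued tasks
  std::vector<int> stolen = StealHalf(queue);
  assert(stolen.size() == 5);  // 11 / 2 == 5 tasks are stolen from the back
  assert(queue.size() == 6);   // and 6 remain with the victim
  return 0;
}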
// Search for an InboundRequest associated with the task that we are trying to cancel.
|
||||
// If found, remove the InboundRequest from the queue and return true. Otherwise, return
|
||||
// false.
|
||||
// If found, remove the InboundRequest from the queue and return true. Otherwise,
|
||||
// return false.
|
||||
bool CancelTaskIfFound(TaskID task_id) {
|
||||
absl::MutexLock lock(&mu_);
|
||||
for (std::deque<InboundRequest>::reverse_iterator it = pending_normal_tasks_.rbegin();
|
||||
|
@ -641,6 +715,15 @@ class CoreWorkerDirectTaskReceiver {
|
|||
/// Pop tasks from the queue and execute them sequentially
|
||||
void RunNormalTasksFromQueue();
|
||||
|
||||
/// Handle a `StealTask` request.
|
||||
///
|
||||
/// \param[in] request The request message.
|
||||
/// \param[out] reply The reply message.
|
||||
/// \param[in] send_reply_callback The callback to be called when the request is done.
|
||||
void HandleStealTasks(const rpc::StealTasksRequest &request,
|
||||
rpc::StealTasksReply *reply,
|
||||
rpc::SendReplyCallback send_reply_callback);
|
||||
|
||||
bool CancelQueuedNormalTask(TaskID task_id);
|
||||
|
||||
private:
|
||||
|
|
|
@ -77,6 +77,8 @@ Status CoreWorkerDirectTaskSubmitter::SubmitTask(TaskSpecification task_spec) {
|
|||
: ActorID::Nil());
|
||||
auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key];
|
||||
scheduling_key_entry.task_queue.push_back(task_spec);
|
||||
scheduling_key_entry.resource_spec = task_spec;
|
||||
|
||||
if (!scheduling_key_entry.AllPipelinesToWorkersFull(
|
||||
max_tasks_in_flight_per_worker_)) {
|
||||
// The pipelines to the current workers are not full yet, so we don't need more
|
||||
|
@ -118,8 +120,8 @@ void CoreWorkerDirectTaskSubmitter::AddWorkerLeaseClient(
|
|||
const SchedulingKey &scheduling_key) {
|
||||
client_cache_->GetOrConnect(addr.ToProto());
|
||||
int64_t expiration = current_time_ms() + lease_timeout_ms_;
|
||||
LeaseEntry new_lease_entry = LeaseEntry(std::move(lease_client), expiration, 0,
|
||||
assigned_resources, scheduling_key);
|
||||
LeaseEntry new_lease_entry =
|
||||
LeaseEntry(std::move(lease_client), expiration, assigned_resources, scheduling_key);
|
||||
worker_to_lease_entry_.emplace(addr, new_lease_entry);
|
||||
|
||||
auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key];
|
||||
|
@ -127,6 +129,197 @@ void CoreWorkerDirectTaskSubmitter::AddWorkerLeaseClient(
|
|||
RAY_CHECK(scheduling_key_entry.active_workers.size() >= 1);
|
||||
}
|
||||
|
||||
void CoreWorkerDirectTaskSubmitter::ReturnWorker(const rpc::WorkerAddress addr,
|
||||
bool was_error,
|
||||
const SchedulingKey &scheduling_key) {
|
||||
auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key];
|
||||
RAY_CHECK(scheduling_key_entry.active_workers.size() >= 1);
|
||||
auto &lease_entry = worker_to_lease_entry_[addr];
|
||||
RAY_CHECK(lease_entry.lease_client);
|
||||
RAY_CHECK(lease_entry.tasks_in_flight == 0);
|
||||
RAY_CHECK(lease_entry.WorkerIsStealing() == false);
|
||||
|
||||
// Decrement the number of active workers consuming tasks from the queue associated
|
||||
// with the current scheduling_key
|
||||
scheduling_key_entry.active_workers.erase(addr);
|
||||
if (scheduling_key_entry.CanDelete()) {
|
||||
// We can safely remove the entry keyed by scheduling_key from the
|
||||
// scheduling_key_entries_ hashmap.
|
||||
scheduling_key_entries_.erase(scheduling_key);
|
||||
}
|
||||
|
||||
auto status =
|
||||
lease_entry.lease_client->ReturnWorker(addr.port, addr.worker_id, was_error);
|
||||
if (!status.ok()) {
|
||||
RAY_LOG(ERROR) << "Error returning worker to raylet: " << status.ToString();
|
||||
}
|
||||
worker_to_lease_entry_.erase(addr);
|
||||
}
|
||||
|
||||
bool CoreWorkerDirectTaskSubmitter::FindOptimalVictimForStealing(
|
||||
const SchedulingKey &scheduling_key, rpc::WorkerAddress thief_addr,
|
||||
rpc::Address *victim_raw_addr) {
|
||||
auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key];
|
||||
|
||||
// Check that there is at least one worker (other than the thief) with the current
|
||||
// SchedulingKey and that there are stealable tasks
|
||||
if (scheduling_key_entry.active_workers.size() <= 1 ||
|
||||
!scheduling_key_entry.StealableTasks()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Iterate through the active workers with the relevant SchedulingKey, and select the
|
||||
// best one for stealing by updating the victim_raw_addr (pointing to the designated
|
||||
// victim) every time we find a candidate that is better than the incumbent. A candidate
|
||||
// is better if: (1) the incumbent victim is the thief -- because this choice would be
|
||||
// illegal (thief cannot steal from itself), so any alternative choice is better (2) the
|
||||
// candidate is not the thief (otherwise, again, it cannot be designated as the victim),
|
||||
// and it has more stealable tasks than the incumbent victim
|
||||
*victim_raw_addr = scheduling_key_entry.active_workers.begin()->ToProto();
|
||||
|
||||
for (auto candidate_it = scheduling_key_entry.active_workers.begin();
|
||||
candidate_it != scheduling_key_entry.active_workers.end(); candidate_it++) {
|
||||
const rpc::WorkerAddress &candidate_addr = *candidate_it;
|
||||
const auto &candidate_entry = worker_to_lease_entry_[candidate_addr];
|
||||
|
||||
const rpc::WorkerAddress victim_addr = rpc::WorkerAddress(*victim_raw_addr);
|
||||
RAY_CHECK(worker_to_lease_entry_.find(victim_addr) != worker_to_lease_entry_.end());
|
||||
const auto &victim_entry = worker_to_lease_entry_[victim_addr];
|
||||
|
||||
// Update the designated victim if the alternative candidate is a better choice than
|
||||
// the incumbent victim
|
||||
if (victim_addr.worker_id == thief_addr.worker_id ||
|
||||
((candidate_entry.tasks_in_flight > victim_entry.tasks_in_flight) &&
|
||||
candidate_addr.worker_id != thief_addr.worker_id)) {
|
||||
// We copy the candidate's rpc::Address (instead of its rpc::WorkerAddress) because
|
||||
// objects of type 'ray::rpc::WorkerAddress' cannot be assigned as their copy
|
||||
// assignment operator is implicitly deleted
|
||||
*victim_raw_addr = candidate_addr.ToProto();
|
||||
}
|
||||
}
|
||||
|
||||
const rpc::WorkerAddress victim_addr = rpc::WorkerAddress(*victim_raw_addr);
|
||||
// We can't steal unless we can find a thief and a victim with distinct addresses/worker
|
||||
// ids. In fact, if we allow stealing among workers with the same address/worker id, we
|
||||
// will also necessarily enable self-stealing.
|
||||
if ((victim_addr == thief_addr) || victim_addr.worker_id == thief_addr.worker_id) {
|
||||
RAY_LOG(INFO) << "No victim available with address distinct from thief!";
|
||||
RAY_LOG(INFO) << "victim_addr.worker_id: " << victim_addr.worker_id
|
||||
<< " thief_addr.worker_id: " << thief_addr.worker_id;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto &victim_entry = worker_to_lease_entry_[victim_addr];
|
||||
// Double check that the victim has the correct SchedulingKey
|
||||
RAY_CHECK(victim_entry.scheduling_key == scheduling_key);
|
||||
|
||||
RAY_LOG(DEBUG) << "Victim is worker " << victim_addr.worker_id << " and has "
|
||||
<< victim_entry.tasks_in_flight << " tasks in flight, "
|
||||
<< " among which we estimate that " << victim_entry.tasks_in_flight / 2
|
||||
<< " are available for stealing";
|
||||
RAY_CHECK(scheduling_key_entry.total_tasks_in_flight >= victim_entry.tasks_in_flight);
|
||||
|
||||
if ((victim_entry.tasks_in_flight / 2) < 1) {
RAY_LOG(DEBUG) << "The designated victim does not have enough tasks to steal.";
return false;
}

return true;
}
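Editor's sketch, not part of the commit: the victim-selection rule above in isolation. Among the workers sharing the thief's scheduling key, pick the one with the most tasks in flight, never the thief itself, and accept it only if it has at least two tasks in flight (so that tasks_in_flight / 2 >= 1 tasks can be stolen). PickVictim, tasks_in_flight, and thief are illustrative stand-ins for the rpc::WorkerAddress and LeaseEntry bookkeeping used here.

#include <cstdint>
#include <string>
#include <unordered_map>

// Returns true and fills *victim if a suitable victim exists.
bool PickVictim(const std::unordered_map<std::string, uint32_t> &tasks_in_flight,
                const std::string &thief, std::string *victim) {
  uint32_t best = 0;
  bool found = false;
  for (const auto &entry : tasks_in_flight) {
    if (entry.first == thief) continue;  // a thief may not steal from itself
    if (!found || entry.second > best) {
      found = true;
      best = entry.second;
      *victim = entry.first;
    }
  }
  // Require at least two tasks in flight, so roughly half of them can be stolen.
  return found && best / 2 >= 1;
}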
void CoreWorkerDirectTaskSubmitter::StealTasksOrReturnWorker(
|
||||
const rpc::WorkerAddress &thief_addr, bool was_error,
|
||||
const SchedulingKey &scheduling_key,
|
||||
const google::protobuf::RepeatedPtrField<rpc::ResourceMapEntry> &assigned_resources) {
|
||||
auto &thief_entry = worker_to_lease_entry_[thief_addr];
|
||||
// Check that the thief still retains its lease_client, and that it has no tasks in flight
RAY_CHECK(thief_entry.lease_client);
|
||||
RAY_CHECK(thief_entry.tasks_in_flight == 0);
|
||||
RAY_CHECK(thief_entry.WorkerIsStealing() == false);
|
||||
|
||||
// Return the worker if there was an error or the lease has expired.
|
||||
if ((was_error || current_time_ms() > thief_entry.lease_expiration_time)) {
|
||||
RAY_LOG(DEBUG) << "Returning worker " << thief_addr.worker_id
|
||||
<< " due to error or lease expiration";
|
||||
ReturnWorker(thief_addr, was_error, scheduling_key);
|
||||
return;
|
||||
}
|
||||
|
||||
RAY_LOG(DEBUG) << "Beginning to steal work now! Thief is worker: "
|
||||
<< thief_addr.worker_id;
|
||||
|
||||
// Search for a suitable victim
|
||||
rpc::Address victim_raw_addr;
|
||||
if (!FindOptimalVictimForStealing(scheduling_key, thief_addr, &victim_raw_addr)) {
|
||||
RAY_LOG(DEBUG) << "Could not find a suitable victim for stealing! Returning worker "
|
||||
<< thief_addr.worker_id;
|
||||
// If stealing was enabled, we can now cancel any pending new worker lease request,
// because stealing is not possible this time.
if (max_tasks_in_flight_per_worker_ > 1) {
|
||||
CancelWorkerLeaseIfNeeded(scheduling_key);
|
||||
}
|
||||
ReturnWorker(thief_addr, was_error, scheduling_key);
|
||||
return;
|
||||
}
|
||||
// If we get here, stealing must be enabled.
|
||||
RAY_CHECK(max_tasks_in_flight_per_worker_ > 1);
|
||||
rpc::WorkerAddress victim_addr = rpc::WorkerAddress(victim_raw_addr);
|
||||
RAY_CHECK(worker_to_lease_entry_.find(victim_addr) != worker_to_lease_entry_.end());
|
||||
|
||||
thief_entry.SetWorkerIsStealing();
|
||||
|
||||
// By this point, we have ascertained that the victim is available for stealing, so we
|
||||
// can go ahead with the RPC
|
||||
RAY_LOG(DEBUG) << "Executing StealTasks RPC!";
|
||||
auto request = std::unique_ptr<rpc::StealTasksRequest>(new rpc::StealTasksRequest);
|
||||
request->mutable_thief_addr()->CopyFrom(thief_addr.ToProto());
|
||||
auto &victim_client = *client_cache_->GetOrConnect(victim_addr.ToProto());
|
||||
auto victim_wid = victim_addr.worker_id;
|
||||
|
||||
RAY_UNUSED(victim_client.StealTasks(
|
||||
std::move(request), [this, scheduling_key, victim_wid, victim_addr, thief_addr](
|
||||
Status status, const rpc::StealTasksReply &reply) {
|
||||
absl::MutexLock lock(&mu_);
|
||||
|
||||
// Obtain the thief's lease entry (after ensuring that it still exists)
|
||||
RAY_CHECK(worker_to_lease_entry_.find(thief_addr) !=
|
||||
worker_to_lease_entry_.end());
|
||||
|
||||
auto &thief_entry = worker_to_lease_entry_[thief_addr];
|
||||
RAY_CHECK(thief_entry.WorkerIsStealing());
|
||||
|
||||
// Compute number of tasks stolen
|
||||
size_t number_of_tasks_stolen = reply.stolen_tasks_ids_size();
|
||||
RAY_LOG(DEBUG) << "We stole " << number_of_tasks_stolen << " tasks "
|
||||
<< "from worker: " << victim_wid;
|
||||
|
||||
thief_entry.SetWorkerDoneStealing();
|
||||
|
||||
// push all tasks to the front of the queue
|
||||
for (size_t i = 0; i < number_of_tasks_stolen; i++) {
|
||||
// Get the task_id of the stolen task, and obtain the corresponding task_spec
|
||||
// from the TaskManager
|
||||
TaskID stolen_task_id = TaskID::FromBinary(reply.stolen_tasks_ids(i));
|
||||
RAY_CHECK(task_finisher_->GetTaskSpec(stolen_task_id));
|
||||
auto stolen_task_spec = *(task_finisher_->GetTaskSpec(stolen_task_id));
|
||||
|
||||
// delete the stolen task from the executing_tasks map if it is still there.
|
||||
executing_tasks_.erase(stolen_task_id);
|
||||
|
||||
auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key];
|
||||
|
||||
// Add the task to the queue
|
||||
RAY_LOG(DEBUG) << "Adding stolen task " << stolen_task_spec.TaskId()
|
||||
<< " back to the queue (of current size="
|
||||
<< scheduling_key_entry.task_queue.size() << ")!";
|
||||
scheduling_key_entry.task_queue.push_front(stolen_task_spec);
|
||||
}
|
||||
// call OnWorkerIdle to ship the task to the thief
|
||||
OnWorkerIdle(thief_addr, scheduling_key, /*error=*/!status.ok(),
|
||||
thief_entry.assigned_resources);
|
||||
}));
|
||||
}
|
||||
|
||||
void CoreWorkerDirectTaskSubmitter::OnWorkerIdle(
|
||||
const rpc::WorkerAddress &addr, const SchedulingKey &scheduling_key, bool was_error,
|
||||
const google::protobuf::RepeatedPtrField<rpc::ResourceMapEntry> &assigned_resources) {
|
||||
|
@ -138,39 +331,26 @@ void CoreWorkerDirectTaskSubmitter::OnWorkerIdle(
|
|||
auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key];
|
||||
auto ¤t_queue = scheduling_key_entry.task_queue;
|
||||
// Return the worker if there was an error executing the previous task,
|
||||
// the previous task is an actor creation task,
|
||||
// there are no more applicable queued tasks, or the lease is expired.
|
||||
if (was_error || current_queue.empty() ||
|
||||
current_time_ms() > lease_entry.lease_expiration_time) {
|
||||
// the lease is expired; Steal or return the worker if there are no more applicable
|
||||
// queued tasks and the worker is not stealing.
|
||||
if ((was_error || current_time_ms() > lease_entry.lease_expiration_time) ||
|
||||
(current_queue.empty() && !lease_entry.WorkerIsStealing())) {
|
||||
RAY_CHECK(scheduling_key_entry.active_workers.size() >= 1);
|
||||
|
||||
// Return the worker only if there are no tasks in flight
|
||||
if (lease_entry.tasks_in_flight == 0) {
|
||||
// Decrement the number of active workers consuming tasks from the queue associated
|
||||
// with the current scheduling_key
|
||||
scheduling_key_entry.active_workers.erase(addr);
|
||||
if (scheduling_key_entry.CanDelete()) {
|
||||
// We can safely remove the entry keyed by scheduling_key from the
|
||||
// scheduling_key_entries_ hashmap.
|
||||
scheduling_key_entries_.erase(scheduling_key);
|
||||
}
|
||||
|
||||
auto status =
|
||||
lease_entry.lease_client->ReturnWorker(addr.port, addr.worker_id, was_error);
|
||||
if (!status.ok()) {
|
||||
RAY_LOG(ERROR) << "Error returning worker to raylet: " << status.ToString();
|
||||
}
|
||||
worker_to_lease_entry_.erase(addr);
|
||||
RAY_LOG(DEBUG)
|
||||
<< "Number of tasks in flight == 0, calling StealTasksOrReturnWorker!";
|
||||
StealTasksOrReturnWorker(addr, was_error, scheduling_key, assigned_resources);
|
||||
}
|
||||
|
||||
} else {
|
||||
auto &client = *client_cache_->GetOrConnect(addr.ToProto());
|
||||
|
||||
while (!current_queue.empty() &&
|
||||
!lease_entry.PipelineToWorkerFull(max_tasks_in_flight_per_worker_)) {
|
||||
auto task_spec = current_queue.front();
|
||||
lease_entry
|
||||
.tasks_in_flight++; // Increment the number of tasks in flight to the worker
|
||||
// Increment the number of tasks in flight to the worker
|
||||
lease_entry.tasks_in_flight++;
|
||||
|
||||
// Increment the total number of tasks in flight to any worker associated with the
|
||||
// current scheduling_key
|
||||
|
@ -182,11 +362,8 @@ void CoreWorkerDirectTaskSubmitter::OnWorkerIdle(
|
|||
PushNormalTask(addr, client, scheduling_key, task_spec, assigned_resources);
|
||||
current_queue.pop_front();
|
||||
}
|
||||
|
||||
// Delete the queue if it's now empty. Note that the queue cannot already be empty
|
||||
// because this is the only place tasks are removed from it.
|
||||
if (current_queue.empty()) {
|
||||
RAY_LOG(INFO) << "Task queue empty, canceling lease request";
|
||||
// If stealing is not an option, we can cancel the request for new worker leases
|
||||
if (max_tasks_in_flight_per_worker_ == 1) {
|
||||
CancelWorkerLeaseIfNeeded(scheduling_key);
|
||||
}
|
||||
}
|
||||
|
@ -197,11 +374,15 @@ void CoreWorkerDirectTaskSubmitter::CancelWorkerLeaseIfNeeded(
|
|||
const SchedulingKey &scheduling_key) {
|
||||
auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key];
|
||||
auto &task_queue = scheduling_key_entry.task_queue;
|
||||
if (!task_queue.empty()) {
|
||||
// There are still pending tasks, so let the worker lease request succeed.
|
||||
if (!task_queue.empty() || scheduling_key_entry.StealableTasks()) {
|
||||
// There are still pending tasks, or there are tasks that can be stolen by a new
|
||||
// worker, so let the worker lease request succeed.
|
||||
return;
|
||||
}
|
||||
|
||||
RAY_LOG(DEBUG)
|
||||
<< "Task queue is empty, and there are no stealable tasks; canceling lease request";
|
||||
|
||||
auto &pending_lease_request = scheduling_key_entry.pending_lease_request;
|
||||
if (pending_lease_request.first) {
|
||||
// There is an in-flight lease request. Cancel it.
|
||||
|
@ -237,7 +418,7 @@ CoreWorkerDirectTaskSubmitter::GetOrConnectLeaseClient(
|
|||
NodeID raylet_id = NodeID::FromBinary(raylet_address->raylet_id());
|
||||
auto it = remote_lease_clients_.find(raylet_id);
|
||||
if (it == remote_lease_clients_.end()) {
|
||||
RAY_LOG(DEBUG) << "Connecting to raylet " << raylet_id;
|
||||
RAY_LOG(INFO) << "Connecting to raylet " << raylet_id;
|
||||
it = remote_lease_clients_
|
||||
.emplace(raylet_id, lease_client_factory_(raylet_address->ip_address(),
|
||||
raylet_address->port()))
|
||||
|
@ -261,36 +442,54 @@ void CoreWorkerDirectTaskSubmitter::RequestNewWorkerIfNeeded(
|
|||
return;
|
||||
}
|
||||
|
||||
auto &task_queue = scheduling_key_entry.task_queue;
|
||||
if (task_queue.empty()) {
|
||||
// We don't have any of this type of task to run.
|
||||
if (scheduling_key_entry.CanDelete()) {
|
||||
// We can safely remove the entry keyed by scheduling_key from the
|
||||
// scheduling_key_entries_ hashmap.
|
||||
scheduling_key_entries_.erase(scheduling_key);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Check whether we really need a new worker or whether we have
|
||||
// enough room in an existing worker's pipeline to send the new tasks
|
||||
if (!scheduling_key_entry.AllPipelinesToWorkersFull(max_tasks_in_flight_per_worker_)) {
|
||||
// enough room in an existing worker's pipeline to send the new tasks. If the pipelines
|
||||
// are not full, we do not request a new worker (unless work stealing is enabled, in
|
||||
// which case we can request a worker under the Eager Worker Requesting mode)
|
||||
if (!scheduling_key_entry.AllPipelinesToWorkersFull(max_tasks_in_flight_per_worker_) &&
|
||||
max_tasks_in_flight_per_worker_ == 1) {
|
||||
// The pipelines to the current workers are not full yet, so we don't need more
// workers.
return;
}
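Editor's sketch, not part of the commit: the worker-request predicate that the rewritten condition above implements, written out directly. ShouldRequestWorker is a hypothetical name; its two inputs correspond to AllPipelinesToWorkersFull() and max_tasks_in_flight_per_worker_.

#include <cstdint>

// Request a new worker when every existing pipeline is full, or eagerly whenever work
// stealing is enabled (pipeline size greater than one), since an idle new worker can
// then steal tasks from the busy ones.
bool ShouldRequestWorker(bool all_pipelines_full,
                         uint32_t max_tasks_in_flight_per_worker) {
  return all_pipelines_full || max_tasks_in_flight_per_worker > 1;
}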
|
||||
TaskSpecification &resource_spec = task_queue.front();
|
||||
auto &task_queue = scheduling_key_entry.task_queue;
|
||||
// Check if the task queue is empty. If that is the case, it only makes sense to
// consider requesting a new worker if work stealing is enabled and there is at least
// one worker with stealable tasks. If work stealing is not enabled, or there are no
// tasks that we can steal from existing workers, we don't need a new worker because
// we don't have any tasks to execute on that worker.
if (task_queue.empty()) {
|
||||
// If any worker has more than one task in flight, then that task can be stolen.
|
||||
bool stealable_tasks = scheduling_key_entry.StealableTasks();
|
||||
if (!stealable_tasks) {
|
||||
if (scheduling_key_entry.CanDelete()) {
|
||||
// We can safely remove the entry keyed by scheduling_key from the
|
||||
// scheduling_key_entries_ hashmap.
|
||||
scheduling_key_entries_.erase(scheduling_key);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Create a TaskSpecification with an overwritten TaskID to make sure we don't reuse the
|
||||
// same TaskID to request a worker
|
||||
auto resource_spec_msg = scheduling_key_entry.resource_spec.GetMutableMessage();
|
||||
resource_spec_msg.set_task_id(TaskID::ForFakeTask().Binary());
|
||||
TaskSpecification resource_spec = TaskSpecification(resource_spec_msg);
|
||||
|
||||
rpc::Address best_node_address;
|
||||
if (raylet_address == nullptr) {
|
||||
// If no raylet address is given, find the best worker for our next lease request.
|
||||
best_node_address = lease_policy_->GetBestNodeForTask(resource_spec);
|
||||
raylet_address = &best_node_address;
|
||||
}
|
||||
|
||||
auto lease_client = GetOrConnectLeaseClient(raylet_address);
|
||||
TaskID task_id = resource_spec.TaskId();
|
||||
// Subtract 1 so we don't double count the task we are requesting for.
|
||||
int64_t queue_size = task_queue.size() - 1;
|
||||
|
||||
lease_client->RequestWorkerLease(
|
||||
resource_spec,
|
||||
[this, scheduling_key](const Status &status,
|
||||
|
@ -313,6 +512,7 @@ void CoreWorkerDirectTaskSubmitter::RequestNewWorkerIfNeeded(
|
|||
// assign work to the worker.
|
||||
RAY_LOG(DEBUG) << "Lease granted " << task_id;
|
||||
rpc::WorkerAddress addr(reply.worker_address());
|
||||
|
||||
auto resources_copy = reply.resource_mapping();
|
||||
|
||||
AddWorkerLeaseClient(addr, std::move(lease_client), resources_copy,
|
||||
|
@ -322,6 +522,7 @@ void CoreWorkerDirectTaskSubmitter::RequestNewWorkerIfNeeded(
|
|||
/*error=*/false, resources_copy);
|
||||
} else {
|
||||
// The raylet redirected us to a different raylet to retry at.
|
||||
|
||||
RequestNewWorkerIfNeeded(scheduling_key, &reply.retry_at_raylet_address());
|
||||
}
|
||||
} else if (lease_client != local_lease_client_) {
|
||||
|
@ -330,7 +531,9 @@ void CoreWorkerDirectTaskSubmitter::RequestNewWorkerIfNeeded(
|
|||
// TODO(swang): Fail after some number of retries?
|
||||
RAY_LOG(ERROR) << "Retrying attempt to schedule task at remote node. Error: "
|
||||
<< status.ToString();
|
||||
|
||||
RequestNewWorkerIfNeeded(scheduling_key);
|
||||
|
||||
} else {
|
||||
// A local request failed. This shouldn't happen if the raylet is still alive
|
||||
// and we don't currently handle raylet failures, so treat it as a fatal
|
||||
|
@ -360,56 +563,63 @@ void CoreWorkerDirectTaskSubmitter::PushNormalTask(
|
|||
request->mutable_task_spec()->CopyFrom(task_spec.GetMessage());
|
||||
request->mutable_resource_mapping()->CopyFrom(assigned_resources);
|
||||
request->set_intended_worker_id(addr.worker_id.Binary());
|
||||
client.PushNormalTask(std::move(request), [this, task_id, is_actor, is_actor_creation,
|
||||
scheduling_key, addr, assigned_resources](
|
||||
Status status,
|
||||
const rpc::PushTaskReply &reply) {
|
||||
{
|
||||
absl::MutexLock lock(&mu_);
|
||||
executing_tasks_.erase(task_id);
|
||||
client.PushNormalTask(
|
||||
std::move(request),
|
||||
[this, task_spec, task_id, is_actor, is_actor_creation, scheduling_key, addr,
|
||||
assigned_resources](Status status, const rpc::PushTaskReply &reply) {
|
||||
{
|
||||
absl::MutexLock lock(&mu_);
|
||||
executing_tasks_.erase(task_id);
|
||||
|
||||
// Decrement the number of tasks in flight to the worker
|
||||
auto &lease_entry = worker_to_lease_entry_[addr];
|
||||
RAY_CHECK(lease_entry.tasks_in_flight > 0);
|
||||
lease_entry.tasks_in_flight--;
|
||||
// Decrement the number of tasks in flight to the worker
|
||||
auto &lease_entry = worker_to_lease_entry_[addr];
|
||||
RAY_CHECK(lease_entry.tasks_in_flight > 0);
|
||||
lease_entry.tasks_in_flight--;
|
||||
|
||||
// Decrement the total number of tasks in flight to any worker with the current
|
||||
// scheduling_key.
|
||||
auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key];
|
||||
RAY_CHECK(scheduling_key_entry.active_workers.size() >= 1);
|
||||
RAY_CHECK(scheduling_key_entry.total_tasks_in_flight >= 1);
|
||||
scheduling_key_entry.total_tasks_in_flight--;
|
||||
}
|
||||
if (reply.worker_exiting()) {
|
||||
// The worker is draining and will shutdown after it is done. Don't return
|
||||
// it to the Raylet since that will kill it early.
|
||||
absl::MutexLock lock(&mu_);
|
||||
worker_to_lease_entry_.erase(addr);
|
||||
auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key];
|
||||
scheduling_key_entry.active_workers.erase(addr);
|
||||
if (scheduling_key_entry.CanDelete()) {
|
||||
// We can safely remove the entry keyed by scheduling_key from the
|
||||
// scheduling_key_entries_ hashmap.
|
||||
scheduling_key_entries_.erase(scheduling_key);
|
||||
}
|
||||
} else if (!status.ok() || !is_actor_creation) {
|
||||
// Successful actor creation leases the worker indefinitely from the raylet.
|
||||
absl::MutexLock lock(&mu_);
|
||||
OnWorkerIdle(addr, scheduling_key,
|
||||
/*error=*/!status.ok(), assigned_resources);
|
||||
}
|
||||
if (!status.ok()) {
|
||||
// TODO: It'd be nice to differentiate here between process vs node
|
||||
// failure (e.g., by contacting the raylet). If it was a process
|
||||
// failure, it may have been an application-level error and it may
|
||||
// not make sense to retry the task.
|
||||
RAY_UNUSED(task_finisher_->PendingTaskFailed(
|
||||
task_id, is_actor ? rpc::ErrorType::ACTOR_DIED : rpc::ErrorType::WORKER_DIED,
|
||||
&status));
|
||||
} else {
|
||||
task_finisher_->CompletePendingTask(task_id, reply, addr.ToProto());
|
||||
}
|
||||
});
|
||||
// Decrement the total number of tasks in flight to any worker with the current
|
||||
// scheduling_key.
|
||||
auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key];
|
||||
RAY_CHECK(scheduling_key_entry.active_workers.size() >= 1);
|
||||
RAY_CHECK(scheduling_key_entry.total_tasks_in_flight >= 1);
|
||||
scheduling_key_entry.total_tasks_in_flight--;
|
||||
|
||||
if (reply.worker_exiting()) {
|
||||
RAY_LOG(DEBUG) << "Worker " << addr.worker_id
|
||||
<< " replied that it is exiting.";
|
||||
// The worker is draining and will shutdown after it is done. Don't return
|
||||
// it to the Raylet since that will kill it early.
|
||||
worker_to_lease_entry_.erase(addr);
|
||||
auto &scheduling_key_entry = scheduling_key_entries_[scheduling_key];
|
||||
scheduling_key_entry.active_workers.erase(addr);
|
||||
if (scheduling_key_entry.CanDelete()) {
|
||||
// We can safely remove the entry keyed by scheduling_key from the
|
||||
// scheduling_key_entries_ hashmap.
|
||||
scheduling_key_entries_.erase(scheduling_key);
|
||||
}
|
||||
} else if (reply.task_stolen()) {
|
||||
// If the task was stolen, we push it to the thief worker & call OnWorkerIdle
|
||||
// in the StealTasks callback within StealTasksOrReturnWorker. So we don't
|
||||
// need to do anything here.
|
||||
return;
|
||||
} else if (!status.ok() || !is_actor_creation) {
|
||||
// Successful actor creation leases the worker indefinitely from the raylet.
|
||||
OnWorkerIdle(addr, scheduling_key,
|
||||
/*error=*/!status.ok(), assigned_resources);
|
||||
}
|
||||
}
|
||||
if (!status.ok()) {
|
||||
// TODO: It'd be nice to differentiate here between process vs node
|
||||
// failure (e.g., by contacting the raylet). If it was a process
|
||||
// failure, it may have been an application-level error and it may
|
||||
// not make sense to retry the task.
|
||||
RAY_UNUSED(task_finisher_->PendingTaskFailed(
|
||||
task_id,
|
||||
is_actor ? rpc::ErrorType::ACTOR_DIED : rpc::ErrorType::WORKER_DIED,
|
||||
&status));
|
||||
} else {
|
||||
task_finisher_->CompletePendingTask(task_id, reply, addr.ToProto());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Status CoreWorkerDirectTaskSubmitter::CancelTask(TaskSpecification task_spec,
|
||||
|
|
|
@ -136,6 +136,43 @@ class CoreWorkerDirectTaskSubmitter {
|
|||
const google::protobuf::RepeatedPtrField<rpc::ResourceMapEntry> &assigned_resources,
|
||||
const SchedulingKey &scheduling_key) EXCLUSIVE_LOCKS_REQUIRED(mu_);
|
||||
|
||||
/// This function takes care of returning a worker to the Raylet.
|
||||
/// \param[in] addr The address of the worker.
|
||||
/// \param[in] was_error Whether the task failed to be submitted.
|
||||
void ReturnWorker(const rpc::WorkerAddress addr, bool was_error,
|
||||
const SchedulingKey &scheduling_key) EXCLUSIVE_LOCKS_REQUIRED(mu_);
|
||||
|
||||
/// Check that the scheduling_key_entries_ hashmap is empty.
|
||||
inline bool CheckNoSchedulingKeyEntries() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
|
||||
return scheduling_key_entries_.empty();
|
||||
}
|
||||
|
||||
/// Find the optimal victim (if there is any) for stealing work
|
||||
///
|
||||
/// \param[in] scheduling_key The SchedulingKey of the thief.
/// \param[in] thief_addr The address of the thief.
/// \param[out] victim_raw_addr Pointer to a variable that the function fills with the
/// address of the victim, if one is found.
/// \return A boolean indicating whether we found a suitable victim or not.
bool FindOptimalVictimForStealing(const SchedulingKey &scheduling_key,
|
||||
rpc::WorkerAddress thief_addr,
|
||||
rpc::Address *victim_raw_addr)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(mu_);
|
||||
|
||||
/// Look for workers with a surplus of tasks in flight, and, if it is possible,
|
||||
/// steal some of those tasks and submit them to the current worker. If no tasks
|
||||
/// are available for stealing, return the worker to the Raylet.
|
||||
///
|
||||
/// \param[in] thief_addr The address of the worker that has finished its own work,
|
||||
/// and is ready for stealing.
|
||||
/// \param[in] was_error Whether the last task failed to be submitted to the worker.
|
||||
/// \param[in] scheduling_key The scheduling class of the worker.
|
||||
/// \param[in] assigned_resources Resource ids previously assigned to the worker.
|
||||
void StealTasksOrReturnWorker(
|
||||
const rpc::WorkerAddress &thief_addr, bool was_error,
|
||||
const SchedulingKey &scheduling_key,
|
||||
const google::protobuf::RepeatedPtrField<rpc::ResourceMapEntry> &assigned_resources)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(mu_);
|
||||
|
||||
/// Push a task to a specific worker.
|
||||
void PushNormalTask(const rpc::WorkerAddress &addr,
|
||||
rpc::CoreWorkerClientInterface &client,
|
||||
|
@ -144,11 +181,6 @@ class CoreWorkerDirectTaskSubmitter {
|
|||
const google::protobuf::RepeatedPtrField<rpc::ResourceMapEntry>
|
||||
&assigned_resources);
|
||||
|
||||
/// Check that the scheduling_key_entries_ hashmap is empty.
|
||||
bool CheckNoSchedulingKeyEntries() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
|
||||
return scheduling_key_entries_.empty();
|
||||
}
|
||||
|
||||
/// Address of our RPC server.
|
||||
rpc::Address rpc_address_;
|
||||
|
||||
|
@ -197,32 +229,53 @@ class CoreWorkerDirectTaskSubmitter {
|
|||
/// (1) The lease client through which the worker should be returned
|
||||
/// (2) The expiration time of a worker's lease.
|
||||
/// (3) The number of tasks that are currently in flight to the worker
|
||||
/// (4) The resources assigned to the worker
|
||||
/// (5) The SchedulingKey assigned to tasks that will be sent to the worker
|
||||
/// (4) A boolean that indicates whether we have launched a StealTasks request, and we
|
||||
/// are waiting for the stolen tasks (5) The resources assigned to the worker (6) The
|
||||
/// SchedulingKey assigned to tasks that will be sent to the worker
|
||||
struct LeaseEntry {
|
||||
std::shared_ptr<WorkerLeaseInterface> lease_client;
|
||||
int64_t lease_expiration_time;
|
||||
uint32_t tasks_in_flight;
|
||||
uint32_t tasks_in_flight = 0;
|
||||
bool currently_stealing = false;
|
||||
google::protobuf::RepeatedPtrField<rpc::ResourceMapEntry> assigned_resources;
|
||||
SchedulingKey scheduling_key;
|
||||
|
||||
LeaseEntry(
|
||||
std::shared_ptr<WorkerLeaseInterface> lease_client = nullptr,
|
||||
int64_t lease_expiration_time = 0, uint32_t tasks_in_flight = 0,
|
||||
int64_t lease_expiration_time = 0,
|
||||
google::protobuf::RepeatedPtrField<rpc::ResourceMapEntry> assigned_resources =
|
||||
google::protobuf::RepeatedPtrField<rpc::ResourceMapEntry>(),
|
||||
SchedulingKey scheduling_key = std::make_tuple(0, std::vector<ObjectID>(),
|
||||
ActorID::Nil()))
|
||||
: lease_client(lease_client),
|
||||
lease_expiration_time(lease_expiration_time),
|
||||
tasks_in_flight(tasks_in_flight),
|
||||
assigned_resources(assigned_resources),
|
||||
scheduling_key(scheduling_key) {}
|
||||
|
||||
// Check whether the pipeline to the worker associated with a LeaseEntry is full.
|
||||
bool PipelineToWorkerFull(uint32_t max_tasks_in_flight_per_worker) const {
|
||||
inline bool PipelineToWorkerFull(uint32_t max_tasks_in_flight_per_worker) const {
|
||||
return tasks_in_flight == max_tasks_in_flight_per_worker;
|
||||
}
|
||||
|
||||
// Check whether the worker is a thief who is in the process of stealing tasks.
|
||||
// Knowing whether a thief is currently stealing is important to prevent the thief
|
||||
// from initiating another StealTasks request or from being returned to the raylet
|
||||
// until stealing has completed.
|
||||
inline bool WorkerIsStealing() const { return currently_stealing; }
|
||||
|
||||
// Once stealing has begun, update the thief's currently_stealing flag to reflect the
// new state.
inline void SetWorkerIsStealing() {
RAY_CHECK(!currently_stealing);
currently_stealing = true;
}

// Once stealing has completed, update the thief's currently_stealing flag to reflect
// the new state.
inline void SetWorkerDoneStealing() {
RAY_CHECK(currently_stealing);
currently_stealing = false;
}
};
|
||||
|
||||
// Map from worker address to a LeaseEntry struct containing the lease's metadata.
|
||||
|
@ -233,6 +286,7 @@ class CoreWorkerDirectTaskSubmitter {
|
|||
// Keep track of pending worker lease requests to the raylet.
|
||||
std::pair<std::shared_ptr<WorkerLeaseInterface>, TaskID> pending_lease_request =
|
||||
std::make_pair(nullptr, TaskID::Nil());
|
||||
TaskSpecification resource_spec = TaskSpecification();
|
||||
// Tasks that are queued for execution. We keep an individual queue per
|
||||
// scheduling class to ensure fairness.
|
||||
std::deque<TaskSpecification> task_queue = std::deque<TaskSpecification>();
|
||||
|
@ -245,7 +299,7 @@ class CoreWorkerDirectTaskSubmitter {
|
|||
|
||||
// Check whether it's safe to delete this SchedulingKeyEntry from the
|
||||
// scheduling_key_entries_ hashmap.
|
||||
bool CanDelete() const {
|
||||
inline bool CanDelete() const {
|
||||
if (!pending_lease_request.first && task_queue.empty() &&
|
||||
active_workers.size() == 0 && total_tasks_in_flight == 0) {
|
||||
return true;
|
||||
|
@ -256,10 +310,23 @@ class CoreWorkerDirectTaskSubmitter {
|
|||
|
||||
// Check whether the pipelines to the active workers associated with a
|
||||
// SchedulingKeyEntry are all full.
|
||||
bool AllPipelinesToWorkersFull(uint32_t max_tasks_in_flight_per_worker) const {
|
||||
return total_tasks_in_flight ==
|
||||
inline bool AllPipelinesToWorkersFull(uint32_t max_tasks_in_flight_per_worker) const {
|
||||
return total_tasks_in_flight >=
|
||||
(active_workers.size() * max_tasks_in_flight_per_worker);
|
||||
}
|
||||
|
||||
// Check whether there exists at least one task that can be stolen
inline bool StealableTasks() const {
// TODO: Make this function more accurate without introducing excessive
// inefficiencies. Currently, there is one scenario where this function can return
// false even if there are stealable tasks. This happens if the number of tasks in
// flight is less than or equal to the number of active workers (so the condition
// below evaluates to FALSE), but some workers have more than one task queued, while
// others have none.

// If any worker has more than one task in flight, then that task can be stolen.
return total_tasks_in_flight > active_workers.size();
}
};
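Editor's sketch, not part of the commit: a small worked example of this heuristic and of the false negative the TODO above describes, with StealableTasks reduced to a free function over per-worker in-flight counts.

#include <cassert>
#include <cstddef>
#include <vector>

// Same condition as StealableTasks(): total tasks in flight vs. number of workers.
bool StealableTasks(const std::vector<size_t> &tasks_in_flight_per_worker) {
  size_t total = 0;
  for (size_t t : tasks_in_flight_per_worker) total += t;
  return total > tasks_in_flight_per_worker.size();
}

int main() {
  // Three workers with 3, 1, and 1 tasks in flight: 5 > 3, so stealing looks possible.
  assert(StealableTasks({3, 1, 1}));
  // Three workers with 2, 1, and 0 tasks in flight: 3 > 3 is false, so the heuristic
  // reports no stealable tasks even though the first worker has one queued task that
  // could in principle be stolen (the scenario described in the TODO above).
  assert(!StealableTasks({2, 1, 0}));
  return 0;
}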
// For each Scheduling Key, scheduling_key_entries_ contains a SchedulingKeyEntry struct
|
||||
|
|
|
@ -72,6 +72,16 @@ message ReturnObject {
|
|||
int64 size = 6;
|
||||
}
|
||||
|
||||
message StealTasksRequest {
// The address of the thief that is requesting to steal tasks.
Address thief_addr = 1;
}

message StealTasksReply {
// The TaskIDs of the tasks that were stolen
repeated bytes stolen_tasks_ids = 2;
}
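Editor's sketch, not part of the commit: how a submitter-side callback can read this reply with the generated protobuf accessors; the helper name is hypothetical, and the include for the generated core_worker.pb.h header is assumed to be provided by the build.

#include <string>
#include <vector>

// Collect the raw (binary) task IDs carried back in a StealTasksReply. The submitter
// converts these with TaskID::FromBinary, as in StealTasksOrReturnWorker above.
std::vector<std::string> CollectStolenTaskIds(const ray::rpc::StealTasksReply &reply) {
  std::vector<std::string> ids;
  for (int i = 0; i < reply.stolen_tasks_ids_size(); i++) {
    ids.push_back(reply.stolen_tasks_ids(i));
  }
  return ids;
}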
|
||||
message PushTaskRequest {
|
||||
// The ID of the worker this message is intended for.
|
||||
bytes intended_worker_id = 1;
|
||||
|
@@ -95,8 +105,10 @@ message PushTaskRequest {
message PushTaskReply {
  // The returned objects.
  repeated ReturnObject return_objects = 1;
  // Set to true if the task was stolen before its execution at the worker.
  bool task_stolen = 2;
  // Set to true if the worker will be exiting.
  bool worker_exiting = 2;
  bool worker_exiting = 3;
  // The references that the worker borrowed during the task execution. A
  // borrower is a process that is currently using the object ID, in one of 3
  // ways:
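The new task_stolen flag changes what a PushTaskReply means to the submitter: a reply with task_stolen set is not a completion, since the task will be executed by the worker that stole it rather than by the original lessee. A minimal sketch of how a submitter-side callback might branch on the flag, using a small stand-in struct instead of the generated protobuf class (the handling shown is illustrative, not the committed submitter logic):

#include <iostream>
#include <string>

// Stand-in for the generated PushTaskReply; only the fields used below.
struct PushTaskReply {
  bool task_stolen = false;
  bool worker_exiting = false;
};

// Hypothetical submitter-side callback: skip completion handling for stolen tasks,
// since the stealing worker will run the task and produce its own reply.
void OnPushTaskReply(const PushTaskReply &reply, const std::string &task_id) {
  if (reply.task_stolen) {
    std::cout << task_id << " was stolen; it will complete on another worker\n";
    return;
  }
  std::cout << task_id << " finished"
            << (reply.worker_exiting ? " (worker exiting)" : "") << "\n";
}

int main() {
  PushTaskReply stolen;
  stolen.task_stolen = true;
  OnPushTaskReply(stolen, "task-1");
  OnPushTaskReply(PushTaskReply{}, "task-2");
  return 0;
}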
@@ -111,7 +123,7 @@ message PushTaskReply {
  // counts for any IDs that were nested inside these objects that the worker
  // may now be borrowing. The reference counts also include any new borrowers
  // that the worker created by passing a borrowed ID into a nested task.
  repeated ObjectReferenceCount borrowed_refs = 3;
  repeated ObjectReferenceCount borrowed_refs = 4;
}

message DirectActorCallArgWaitCompleteRequest {
@@ -365,6 +377,8 @@ message RunOnUtilWorkerReply {
service CoreWorkerService {
  // Push a task directly to this worker from another.
  rpc PushTask(PushTaskRequest) returns (PushTaskReply);
  // Steal tasks from a worker if it has a surplus of work.
  rpc StealTasks(StealTasksRequest) returns (StealTasksReply);
  // Reply from raylet that wait for direct actor call args has completed.
  rpc DirectActorCallArgWaitComplete(DirectActorCallArgWaitCompleteRequest)
      returns (DirectActorCallArgWaitCompleteReply);
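The StealTasks exchange is intentionally thin: the thief only identifies itself with an address, and the victim answers with the IDs of whatever tasks it agreed to give up. A self-contained sketch of a victim-side handler over a plain queue of task IDs; the steal-half policy and the stand-in message structs are assumptions for illustration, not necessarily what the committed receiver does:

#include <deque>
#include <string>
#include <vector>

// Stand-ins for the generated protobuf messages; only the fields used below.
struct StealTasksRequest { std::string thief_addr; };
struct StealTasksReply { std::vector<std::string> stolen_tasks_ids; };

// Illustrative victim-side handler: pop tasks from the back of the local queue and
// report their IDs so the thief can be granted those tasks.
StealTasksReply HandleStealTasks(const StealTasksRequest &request,
                                 std::deque<std::string> &queued_task_ids) {
  (void)request;  // request.thief_addr identifies the thief; unused in this sketch.
  StealTasksReply reply;
  const size_t to_steal = queued_task_ids.size() / 2;  // assumed policy: give up half
  for (size_t i = 0; i < to_steal; i++) {
    reply.stolen_tasks_ids.push_back(queued_task_ids.back());
    queued_task_ids.pop_back();
  }
  return reply;
}

With four tasks queued on the victim, for example, this handler would hand back two IDs and leave two in place.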
@@ -119,6 +119,9 @@ class CoreWorkerClientInterface {
  virtual void PushNormalTask(std::unique_ptr<PushTaskRequest> request,
                              const ClientCallback<PushTaskReply> &callback) {}

  virtual void StealTasks(std::unique_ptr<StealTasksRequest> request,
                          const ClientCallback<StealTasksReply> &callback) {}

  /// Notify a wait has completed for direct actor call arguments.
  ///
  /// \param[in] request The request message.
@@ -292,6 +295,11 @@ class CoreWorkerClient : public std::enable_shared_from_this<CoreWorkerClient>,
    INVOKE_RPC_CALL(CoreWorkerService, PushTask, *request, callback, grpc_client_);
  }

  void StealTasks(std::unique_ptr<StealTasksRequest> request,
                  const ClientCallback<StealTasksReply> &callback) override {
    INVOKE_RPC_CALL(CoreWorkerService, StealTasks, *request, callback, grpc_client_);
  }

  /// Send as many pending tasks as possible. This method is thread-safe.
  ///
  /// The client will guarantee no more than kMaxBytesInFlight bytes of RPCs are being
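Like PushNormalTask just above it, StealTasks is asynchronous: the call returns immediately and the reply is delivered to a callback. A self-contained sketch of that calling pattern with a stubbed client in place of the gRPC-backed CoreWorkerClient (the fake client, the callback alias, and the task IDs are all illustrative):

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Minimal stand-ins for the request/reply messages and the callback alias.
struct StealTasksRequest { std::string thief_addr; };
struct StealTasksReply { std::vector<std::string> stolen_tasks_ids; };
template <typename Reply>
using ClientCallback = std::function<void(const Reply &reply)>;

// Stubbed client: a real client would serialize the request over gRPC; here the
// "victim" immediately hands back two fake task IDs so the callback path runs.
class FakeCoreWorkerClient {
 public:
  void StealTasks(std::unique_ptr<StealTasksRequest> request,
                  const ClientCallback<StealTasksReply> &callback) {
    std::cout << "steal request from " << request->thief_addr << "\n";
    StealTasksReply reply;
    reply.stolen_tasks_ids = {"task-1", "task-2"};
    callback(reply);
  }
};

int main() {
  FakeCoreWorkerClient victim_client;
  auto request = std::make_unique<StealTasksRequest>();
  request->thief_addr = "127.0.0.1:10001";
  victim_client.StealTasks(std::move(request), [](const StealTasksReply &reply) {
    // On the thief, each returned ID would be matched to its task spec and executed.
    std::cout << "stole " << reply.stolen_tasks_ids.size() << " tasks\n";
  });
  return 0;
}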
@@ -29,6 +29,7 @@ namespace rpc {
/// NOTE: See src/ray/core_worker/core_worker.h on how to add a new grpc handler.
#define RAY_CORE_WORKER_RPC_HANDLERS                                       \
  RPC_SERVICE_HANDLER(CoreWorkerService, PushTask)                         \
  RPC_SERVICE_HANDLER(CoreWorkerService, StealTasks)                       \
  RPC_SERVICE_HANDLER(CoreWorkerService, DirectActorCallArgWaitComplete)   \
  RPC_SERVICE_HANDLER(CoreWorkerService, GetObjectStatus)                  \
  RPC_SERVICE_HANDLER(CoreWorkerService, WaitForActorOutOfScope)           \
@@ -52,6 +53,7 @@ namespace rpc {

#define RAY_CORE_WORKER_DECLARE_RPC_HANDLERS                                 \
  DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(PushTask)                          \
  DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(StealTasks)                        \
  DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(DirectActorCallArgWaitComplete)    \
  DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(GetObjectStatus)                   \
  DECLARE_VOID_RPC_SERVICE_HANDLER_METHOD(WaitForActorOutOfScope)            \