Mirror of https://github.com/vale981/ray, synced 2025-03-06 02:21:39 -05:00
Merge pull request #181 from amplab/test
replace locks with synchronized data structures
commit cdc286ed70
3 changed files with 245 additions and 294 deletions
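Editorial note on the pattern in this diff: each shared structure that used to be a bare member guarded by a separate std::mutex (for example task_queue_ with task_queue_lock_) becomes a single Synchronized<T> member, and explicit std::lock_guard blocks become calls to get(), which returns a handle that holds the lock for as long as it lives. The wrapper itself is declared in the third changed file at the bottom of this page; the following is only a minimal sketch of that idea, assuming the interface visible in the diff (get(), unsafe_get(), mutex(), SynchronizedPtr) and not the project's actual implementation.

#include <mutex>
#include <utility>

// Sketch only: a handle that locks a Synchronized<T> on construction and
// releases the lock when it goes out of scope, exposing the wrapped value
// through operator-> and operator*.
template<class Synchronized>
class SynchronizedPtr {
  Synchronized& sync_;
  std::unique_lock<typename Synchronized::mutex_type> guard_;
public:
  explicit SynchronizedPtr(Synchronized& sync) : sync_(sync), guard_(sync.mutex()) {}
  typename Synchronized::element_type* operator->() { return sync_.unsafe_get(); }
  typename Synchronized::element_type& operator*() { return *sync_.unsafe_get(); }
};

// Sketch only: the wrapper bundles the protected value with its mutex, so the
// mutex can no longer be forgotten or locked against the wrong object.
template<class T, class Mutex = std::mutex>
class Synchronized {
  Mutex mutex_;
  T value_;
public:
  typedef T element_type;
  typedef Mutex mutex_type;
  template<class... U>
  Synchronized(U&&... args) : value_(std::forward<U>(args)...) {}
  SynchronizedPtr<Synchronized> get() { return SynchronizedPtr<Synchronized>(*this); } // lock and access
  element_type* unsafe_get() { return &value_; } // no locking; caller must already hold mutex()
  mutex_type& mutex() { return mutex_; }         // for explicit, ordered multi-lock acquisition
};

With this shape, task_queue_.get()->push_back(operationid); holds the queue's mutex only for that one statement, while auto workers = workers_.get(); keeps it held until the end of the enclosing scope; both forms appear throughout the new code below.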
487
src/scheduler.cc
@@ -12,9 +12,9 @@ Status SchedulerService::SubmitTask(ServerContext* context, const SubmitTaskRequ
|
|||
std::unique_ptr<Task> task(new Task(request->task())); // need to copy, because request is const
|
||||
size_t num_return_vals;
|
||||
{
|
||||
std::lock_guard<std::mutex> fntable_lock(fntable_lock_);
|
||||
FnTable::const_iterator fn = fntable_.find(task->name());
|
||||
if (fn == fntable_.end()) {
|
||||
auto fntable = fntable_.get();
|
||||
FnTable::const_iterator fn = fntable->find(task->name());
|
||||
if (fn == fntable->end()) {
|
||||
num_return_vals = 0;
|
||||
reply->set_function_registered(false);
|
||||
} else {
|
||||
|
@@ -31,27 +31,17 @@ Status SchedulerService::SubmitTask(ServerContext* context, const SubmitTaskRequ
|
|||
result_objrefs.push_back(result);
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> reference_counts_lock(reference_counts_lock_); // we grab this lock because increment_ref_count assumes it has been acquired
|
||||
auto reference_counts = reference_counts_.get(); // we grab this lock because increment_ref_count assumes it has been acquired
|
||||
increment_ref_count(result_objrefs); // We increment once so the objrefs don't go out of scope before we reply to the worker that called SubmitTask. The corresponding decrement will happen in submit_task in raylib.
|
||||
increment_ref_count(result_objrefs); // We increment once so the objrefs don't go out of scope before the task is scheduled on the worker. The corresponding decrement will happen in deserialize_task in raylib.
|
||||
}
|
||||
|
||||
auto operation = std::unique_ptr<Operation>(new Operation());
|
||||
operation->set_allocated_task(task.release());
|
||||
{
|
||||
std::lock_guard<std::mutex> workers_lock(workers_lock_);
|
||||
operation->set_creator_operationid(workers_[request->workerid()].current_task);
|
||||
}
|
||||
operation->set_creator_operationid((*workers_.get())[request->workerid()].current_task);
|
||||
|
||||
OperationId operationid;
|
||||
{
|
||||
std::lock_guard<std::mutex> computation_graph_lock(computation_graph_lock_);
|
||||
operationid = computation_graph_.add_operation(std::move(operation));
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> task_queue_lock(task_queue_lock_);
|
||||
task_queue_.push_back(operationid);
|
||||
}
|
||||
OperationId operationid = computation_graph_.get()->add_operation(std::move(operation));
|
||||
task_queue_.get()->push_back(operationid);
|
||||
schedule();
|
||||
}
|
||||
return Status::OK;
|
||||
|
@@ -66,17 +56,10 @@ Status SchedulerService::PutObj(ServerContext* context, const PutObjRequest* req
|
|||
}
|
||||
|
||||
Status SchedulerService::RequestObj(ServerContext* context, const RequestObjRequest* request, AckReply* reply) {
|
||||
size_t size;
|
||||
{
|
||||
std::lock_guard<std::mutex> objects_lock(objects_lock_);
|
||||
size = objtable_.size();
|
||||
}
|
||||
size_t size = objtable_.get()->size();
|
||||
ObjRef objref = request->objref();
|
||||
RAY_CHECK_LT(objref, size, "internal error: no object with objref " << objref << " exists");
|
||||
{
|
||||
std::lock_guard<std::mutex> get_queue_lock(get_queue_lock_);
|
||||
get_queue_.push_back(std::make_pair(request->workerid(), objref));
|
||||
}
|
||||
get_queue_.get()->push_back(std::make_pair(request->workerid(), objref));
|
||||
schedule();
|
||||
return Status::OK;
|
||||
}
|
||||
|
@@ -86,25 +69,19 @@ Status SchedulerService::AliasObjRefs(ServerContext* context, const AliasObjRefs
|
|||
ObjRef target_objref = request->target_objref();
|
||||
RAY_LOG(RAY_ALIAS, "Aliasing objref " << alias_objref << " with objref " << target_objref);
|
||||
RAY_CHECK_NEQ(alias_objref, target_objref, "internal error: attempting to alias objref " << alias_objref << " with itself.");
|
||||
size_t size;
|
||||
{
|
||||
std::lock_guard<std::mutex> objects_lock(objects_lock_);
|
||||
size = objtable_.size();
|
||||
}
|
||||
size_t size = objtable_.get()->size();
|
||||
RAY_CHECK_LT(alias_objref, size, "internal error: no object with objref " << alias_objref << " exists");
|
||||
RAY_CHECK_LT(target_objref, size, "internal error: no object with objref " << target_objref << " exists");
|
||||
{
|
||||
std::lock_guard<std::mutex> target_objrefs_lock(target_objrefs_lock_);
|
||||
RAY_CHECK_EQ(target_objrefs_[alias_objref], UNITIALIZED_ALIAS, "internal error: attempting to alias objref " << alias_objref << " with objref " << target_objref << ", but objref " << alias_objref << " has already been aliased with objref " << target_objrefs_[alias_objref]);
|
||||
target_objrefs_[alias_objref] = target_objref;
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> reverse_target_objrefs_lock(reverse_target_objrefs_lock_);
|
||||
reverse_target_objrefs_[target_objref].push_back(alias_objref);
|
||||
auto target_objrefs = target_objrefs_.get();
|
||||
RAY_CHECK_EQ((*target_objrefs)[alias_objref], UNITIALIZED_ALIAS, "internal error: attempting to alias objref " << alias_objref << " with objref " << target_objref << ", but objref " << alias_objref << " has already been aliased with objref " << (*target_objrefs)[alias_objref]);
|
||||
(*target_objrefs)[alias_objref] = target_objref;
|
||||
}
|
||||
(*reverse_target_objrefs_.get())[target_objref].push_back(alias_objref);
|
||||
{
|
||||
// The corresponding increment was done in register_new_object.
|
||||
std::lock_guard<std::mutex> reference_counts_lock(reference_counts_lock_); // we grab this lock because decrement_ref_count assumes it has been acquired
|
||||
auto reference_counts = reference_counts_.get(); // we grab this lock because decrement_ref_count assumes it has been acquired
|
||||
auto contained_objrefs = contained_objrefs_.get(); // we grab this lock because decrement_ref_count assumes it has been acquired
|
||||
decrement_ref_count(std::vector<ObjRef>({alias_objref}));
|
||||
}
|
||||
schedule();
|
||||
|
@@ -112,14 +89,14 @@ Status SchedulerService::AliasObjRefs(ServerContext* context, const AliasObjRefs
|
|||
}
|
||||
|
||||
Status SchedulerService::RegisterObjStore(ServerContext* context, const RegisterObjStoreRequest* request, RegisterObjStoreReply* reply) {
|
||||
std::lock_guard<std::mutex> objects_lock(objects_lock_); // to protect objects_in_transit_
|
||||
std::lock_guard<std::mutex> objstore_lock(objstores_lock_);
|
||||
ObjStoreId objstoreid = objstores_.size();
|
||||
auto objtable = objtable_.get(); // to protect objects_in_transit_
|
||||
auto objstores = objstores_.get();
|
||||
ObjStoreId objstoreid = objstores->size();
|
||||
auto channel = grpc::CreateChannel(request->objstore_address(), grpc::InsecureChannelCredentials());
|
||||
objstores_.push_back(ObjStoreHandle());
|
||||
objstores_[objstoreid].address = request->objstore_address();
|
||||
objstores_[objstoreid].channel = channel;
|
||||
objstores_[objstoreid].objstore_stub = ObjStore::NewStub(channel);
|
||||
objstores->push_back(ObjStoreHandle());
|
||||
(*objstores)[objstoreid].address = request->objstore_address();
|
||||
(*objstores)[objstoreid].channel = channel;
|
||||
(*objstores)[objstoreid].objstore_stub = ObjStore::NewStub(channel);
|
||||
reply->set_objstoreid(objstoreid);
|
||||
objects_in_transit_.push_back(std::vector<ObjRef>());
|
||||
return Status::OK;
|
||||
|
@@ -153,7 +130,8 @@ Status SchedulerService::ObjReady(ServerContext* context, const ObjReadyRequest*
|
|||
// the corresponding increment was done in register_new_object in the
|
||||
// scheduler. For all subsequent calls to ObjReady, the corresponding
|
||||
// increment was done in deliver_object_if_necessary in the scheduler.
|
||||
std::lock_guard<std::mutex> reference_counts_lock(reference_counts_lock_); // we grab this lock because decrement_ref_count assumes it has been acquired
|
||||
auto reference_counts = reference_counts_.get(); // we grab this lock because decrement_ref_count assumes it has been acquired
|
||||
auto contained_objrefs = contained_objrefs_.get(); // we grab this lock because decrement_ref_count assumes it has been acquired
|
||||
decrement_ref_count(std::vector<ObjRef>({objref}));
|
||||
}
|
||||
schedule();
|
||||
|
@@ -164,39 +142,28 @@ Status SchedulerService::ReadyForNewTask(ServerContext* context, const ReadyForN
|
|||
RAY_LOG(RAY_INFO, "worker " << request->workerid() << " is ready for a new task");
|
||||
if (request->has_previous_task_info()) {
|
||||
OperationId operationid;
|
||||
{
|
||||
std::lock_guard<std::mutex> workers_lock(workers_lock_);
|
||||
operationid = workers_[request->workerid()].current_task;
|
||||
}
|
||||
operationid = (*workers_.get())[request->workerid()].current_task;
|
||||
std::string task_name;
|
||||
{
|
||||
std::lock_guard<std::mutex> computation_graph_lock(computation_graph_lock_);
|
||||
task_name = computation_graph_.get_task(operationid).name();
|
||||
}
|
||||
task_name = computation_graph_.get()->get_task(operationid).name();
|
||||
TaskStatus info;
|
||||
{
|
||||
std::lock_guard<std::mutex> workers_lock(workers_lock_);
|
||||
operationid = workers_[request->workerid()].current_task;
|
||||
auto workers = workers_.get();
|
||||
operationid = (*workers)[request->workerid()].current_task;
|
||||
info.set_operationid(operationid);
|
||||
info.set_function_name(task_name);
|
||||
info.set_worker_address(workers_[request->workerid()].worker_address);
|
||||
info.set_worker_address((*workers)[request->workerid()].worker_address);
|
||||
info.set_error_message(request->previous_task_info().error_message());
|
||||
workers_[request->workerid()].current_task = NO_OPERATION; // clear operation ID
|
||||
(*workers)[request->workerid()].current_task = NO_OPERATION; // clear operation ID
|
||||
}
|
||||
if (!request->previous_task_info().task_succeeded()) {
|
||||
RAY_LOG(RAY_INFO, "Error: Task " << info.operationid() << " executing function " << info.function_name() << " on worker " << request->workerid() << " failed with error message: " << info.error_message());
|
||||
std::lock_guard<std::mutex> failed_tasks_lock(failed_tasks_lock_);
|
||||
failed_tasks_.push_back(info);
|
||||
failed_tasks_.get()->push_back(info);
|
||||
} else {
|
||||
std::lock_guard<std::mutex> successful_tasks_lock(successful_tasks_lock_);
|
||||
successful_tasks_.push_back(info.operationid());
|
||||
successful_tasks_.get()->push_back(info.operationid());
|
||||
}
|
||||
// TODO(rkn): Handle task failure
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(avail_workers_lock_);
|
||||
avail_workers_.push_back(request->workerid());
|
||||
}
|
||||
avail_workers_.get()->push_back(request->workerid());
|
||||
schedule();
|
||||
return Status::OK;
|
||||
}
|
||||
|
@@ -208,7 +175,7 @@ Status SchedulerService::IncrementRefCount(ServerContext* context, const Increme
|
|||
for (int i = 0; i < num_objrefs; ++i) {
|
||||
objrefs.push_back(request->objref(i));
|
||||
}
|
||||
std::lock_guard<std::mutex> reference_counts_lock(reference_counts_lock_); // we grab this lock because increment_ref_count assumes it has been acquired
|
||||
auto reference_counts = reference_counts_.get(); // we grab this lock because increment_ref_count assumes it has been acquired
|
||||
increment_ref_count(objrefs);
|
||||
return Status::OK;
|
||||
}
|
||||
|
@@ -220,7 +187,8 @@ Status SchedulerService::DecrementRefCount(ServerContext* context, const Decreme
|
|||
for (int i = 0; i < num_objrefs; ++i) {
|
||||
objrefs.push_back(request->objref(i));
|
||||
}
|
||||
std::lock_guard<std::mutex> reference_counts_lock(reference_counts_lock_); // we grab this lock, because decrement_ref_count assumes it has been acquired
|
||||
auto reference_counts = reference_counts_.get(); // we grab this lock, because decrement_ref_count assumes it has been acquired
|
||||
auto contained_objrefs = contained_objrefs_.get(); // we grab this lock because decrement_ref_count assumes it has been acquired
|
||||
decrement_ref_count(objrefs);
|
||||
return Status::OK;
|
||||
}
|
||||
|
@@ -231,10 +199,10 @@ Status SchedulerService::AddContainedObjRefs(ServerContext* context, const AddCo
|
|||
// TODO(rkn): Perhaps we don't need this check. It won't work because the objstore may not have called ObjReady yet.
|
||||
// RAY_LOG(RAY_FATAL, "Attempting to add contained objrefs for non-canonical objref " << objref);
|
||||
// }
|
||||
std::lock_guard<std::mutex> contained_objrefs_lock(contained_objrefs_lock_);
|
||||
RAY_CHECK_EQ(contained_objrefs_[objref].size(), 0, "Attempting to add contained objrefs for objref " << objref << ", but contained_objrefs_[objref].size() != 0.");
|
||||
auto contained_objrefs = contained_objrefs_.get();
|
||||
RAY_CHECK_EQ((*contained_objrefs)[objref].size(), 0, "Attempting to add contained objrefs for objref " << objref << ", but contained_objrefs_[objref].size() != 0.");
|
||||
for (int i = 0; i < request->contained_objref_size(); ++i) {
|
||||
contained_objrefs_[objref].push_back(request->contained_objref(i));
|
||||
(*contained_objrefs)[objref].push_back(request->contained_objref(i));
|
||||
}
|
||||
return Status::OK;
|
||||
}
|
||||
|
@@ -245,34 +213,34 @@ Status SchedulerService::SchedulerInfo(ServerContext* context, const SchedulerIn
|
|||
}
|
||||
|
||||
Status SchedulerService::TaskInfo(ServerContext* context, const TaskInfoRequest* request, TaskInfoReply* reply) {
|
||||
std::lock_guard<std::mutex> successful_tasks_lock(successful_tasks_lock_);
|
||||
std::lock_guard<std::mutex> failed_tasks_lock(failed_tasks_lock_);
|
||||
std::lock_guard<std::mutex> computation_graph_lock(computation_graph_lock_);
|
||||
std::lock_guard<std::mutex> workers_lock(workers_lock_);
|
||||
for (int i = 0; i < failed_tasks_.size(); ++i) {
|
||||
auto successful_tasks = successful_tasks_.get();
|
||||
auto failed_tasks = failed_tasks_.get();
|
||||
auto computation_graph = computation_graph_.get();
|
||||
auto workers = workers_.get();
|
||||
for (int i = 0; i < failed_tasks->size(); ++i) {
|
||||
TaskStatus* info = reply->add_failed_task();
|
||||
*info = failed_tasks_[i];
|
||||
*info = (*failed_tasks)[i];
|
||||
}
|
||||
for (int i = 0; i < workers_.size(); ++i) {
|
||||
OperationId operationid = workers_[i].current_task;
|
||||
for (int i = 0; i < workers->size(); ++i) {
|
||||
OperationId operationid = (*workers)[i].current_task;
|
||||
if (operationid != NO_OPERATION) {
|
||||
const Task& task = computation_graph_.get_task(operationid);
|
||||
const Task& task = computation_graph->get_task(operationid);
|
||||
TaskStatus* info = reply->add_running_task();
|
||||
info->set_operationid(operationid);
|
||||
info->set_function_name(task.name());
|
||||
info->set_worker_address(workers_[i].worker_address);
|
||||
info->set_worker_address((*workers)[i].worker_address);
|
||||
}
|
||||
}
|
||||
reply->set_num_succeeded(successful_tasks_.size());
|
||||
reply->set_num_succeeded(successful_tasks->size());
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
void SchedulerService::deliver_object_if_necessary(ObjRef canonical_objref, ObjStoreId from, ObjStoreId to) {
|
||||
bool object_present_or_in_transit;
|
||||
{
|
||||
std::lock_guard<std::mutex> objects_lock(objects_lock_);
|
||||
auto &objstores = objtable_[canonical_objref];
|
||||
bool object_present = std::binary_search(objstores.begin(), objstores.end(), to);
|
||||
auto objtable = objtable_.get();
|
||||
auto &locations = (*objtable)[canonical_objref];
|
||||
bool object_present = std::binary_search(locations.begin(), locations.end(), to);
|
||||
auto &objects_in_flight = objects_in_transit_[to];
|
||||
bool object_in_transit = (std::find(objects_in_flight.begin(), objects_in_flight.end(), canonical_objref) != objects_in_flight.end());
|
||||
object_present_or_in_transit = object_present || object_in_transit;
|
||||
|
@@ -299,16 +267,16 @@ void SchedulerService::deliver_object(ObjRef canonical_objref, ObjStoreId from,
|
|||
// We increment once so the objref doesn't go out of scope before the ObjReady
|
||||
// method is called. The corresponding decrement will happen in ObjReady in
|
||||
// the scheduler.
|
||||
std::lock_guard<std::mutex> reference_counts_lock(reference_counts_lock_); // we grab this lock because increment_ref_count assumes it has been acquired
|
||||
auto reference_counts = reference_counts_.get(); // we grab this lock because increment_ref_count assumes it has been acquired
|
||||
increment_ref_count(std::vector<ObjRef>({canonical_objref}));
|
||||
}
|
||||
ClientContext context;
|
||||
AckReply reply;
|
||||
StartDeliveryRequest request;
|
||||
request.set_objref(canonical_objref);
|
||||
std::lock_guard<std::mutex> lock(objstores_lock_);
|
||||
request.set_objstore_address(objstores_[from].address);
|
||||
objstores_[to].objstore_stub->StartDelivery(&context, request, &reply);
|
||||
auto objstores = objstores_.get();
|
||||
request.set_objstore_address((*objstores)[from].address);
|
||||
(*objstores)[to].objstore_stub->StartDelivery(&context, request, &reply);
|
||||
}
|
||||
|
||||
void SchedulerService::schedule() {
|
||||
|
@@ -328,7 +296,7 @@ void SchedulerService::schedule() {
|
|||
// assign_task assumes that the canonical objrefs for its arguments are all ready, that is has_canonical_objref() is true for all of the call's arguments
|
||||
void SchedulerService::assign_task(OperationId operationid, WorkerId workerid) {
|
||||
ObjStoreId objstoreid = get_store(workerid);
|
||||
const Task& task = computation_graph_.get_task(operationid);
|
||||
const Task& task = computation_graph_.unsafe_get()->get_task(operationid);
|
||||
ClientContext context;
|
||||
ExecuteTaskRequest request;
|
||||
ExecuteTaskReply reply;
|
||||
|
@@ -337,26 +305,23 @@ void SchedulerService::assign_task(OperationId operationid, WorkerId workerid) {
|
|||
if (!task.arg(i).has_obj()) {
|
||||
ObjRef objref = task.arg(i).ref();
|
||||
ObjRef canonical_objref = get_canonical_objref(objref);
|
||||
{
|
||||
// Notify the relevant objstore about potential aliasing when it's ready
|
||||
std::lock_guard<std::mutex> alias_notification_queue_lock(alias_notification_queue_lock_);
|
||||
alias_notification_queue_.push_back(std::make_pair(objstoreid, std::make_pair(objref, canonical_objref)));
|
||||
}
|
||||
// Notify the relevant objstore about potential aliasing when it's ready
|
||||
alias_notification_queue_.get()->push_back(std::make_pair(objstoreid, std::make_pair(objref, canonical_objref)));
|
||||
attempt_notify_alias(objstoreid, objref, canonical_objref);
|
||||
RAY_LOG(RAY_DEBUG, "task contains object ref " << canonical_objref);
|
||||
deliver_object_if_necessary(canonical_objref, pick_objstore(canonical_objref), objstoreid);
|
||||
}
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> workers_lock(workers_lock_);
|
||||
workers_[workerid].current_task = operationid;
|
||||
auto workers = workers_.get();
|
||||
(*workers)[workerid].current_task = operationid;
|
||||
request.mutable_task()->CopyFrom(task); // TODO(rkn): Is ownership handled properly here?
|
||||
Status status = workers_[workerid].worker_stub->ExecuteTask(&context, request, &reply);
|
||||
Status status = (*workers)[workerid].worker_stub->ExecuteTask(&context, request, &reply);
|
||||
}
|
||||
}
|
||||
|
||||
bool SchedulerService::can_run(const Task& task) {
|
||||
std::lock_guard<std::mutex> lock(objects_lock_);
|
||||
auto objtable = objtable_.get();
|
||||
for (int i = 0; i < task.arg_size(); ++i) {
|
||||
if (!task.arg(i).has_obj()) {
|
||||
ObjRef objref = task.arg(i).ref();
|
||||
|
@@ -364,7 +329,7 @@ bool SchedulerService::can_run(const Task& task) {
|
|||
return false;
|
||||
}
|
||||
ObjRef canonical_objref = get_canonical_objref(objref);
|
||||
if (canonical_objref >= objtable_.size() || objtable_[canonical_objref].size() == 0) {
|
||||
if (canonical_objref >= objtable->size() || (*objtable)[canonical_objref].size() == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@@ -377,9 +342,9 @@ std::pair<WorkerId, ObjStoreId> SchedulerService::register_worker(const std::str
|
|||
ObjStoreId objstoreid = std::numeric_limits<size_t>::max();
|
||||
// TODO: HACK: num_attempts is a hack
|
||||
for (int num_attempts = 0; num_attempts < 5; ++num_attempts) {
|
||||
std::lock_guard<std::mutex> lock(objstores_lock_);
|
||||
for (size_t i = 0; i < objstores_.size(); ++i) {
|
||||
if (objstores_[i].address == objstore_address) {
|
||||
auto objstores = objstores_.get();
|
||||
for (size_t i = 0; i < objstores->size(); ++i) {
|
||||
if ((*objstores)[i].address == objstore_address) {
|
||||
objstoreid = i;
|
||||
}
|
||||
}
|
||||
|
@@ -390,15 +355,15 @@ std::pair<WorkerId, ObjStoreId> SchedulerService::register_worker(const std::str
|
|||
RAY_CHECK_NEQ(objstoreid, std::numeric_limits<size_t>::max(), "object store with address " << objstore_address << " not yet registered");
|
||||
WorkerId workerid;
|
||||
{
|
||||
std::lock_guard<std::mutex> workers_lock(workers_lock_);
|
||||
workerid = workers_.size();
|
||||
workers_.push_back(WorkerHandle());
|
||||
auto workers = workers_.get();
|
||||
workerid = workers->size();
|
||||
workers->push_back(WorkerHandle());
|
||||
auto channel = grpc::CreateChannel(worker_address, grpc::InsecureChannelCredentials());
|
||||
workers_[workerid].channel = channel;
|
||||
workers_[workerid].objstoreid = objstoreid;
|
||||
workers_[workerid].worker_stub = WorkerService::NewStub(channel);
|
||||
workers_[workerid].worker_address = worker_address;
|
||||
workers_[workerid].current_task = NO_OPERATION;
|
||||
(*workers)[workerid].channel = channel;
|
||||
(*workers)[workerid].objstoreid = objstoreid;
|
||||
(*workers)[workerid].worker_stub = WorkerService::NewStub(channel);
|
||||
(*workers)[workerid].worker_address = worker_address;
|
||||
(*workers)[workerid].current_task = NO_OPERATION;
|
||||
}
|
||||
return std::make_pair(workerid, objstoreid);
|
||||
}
|
||||
|
@@ -406,25 +371,25 @@ std::pair<WorkerId, ObjStoreId> SchedulerService::register_worker(const std::str
|
|||
ObjRef SchedulerService::register_new_object() {
|
||||
// If we don't simultaneously lock objtable_ and target_objrefs_, we will probably get errors.
|
||||
// TODO(rkn): increment/decrement_reference_count also acquire reference_counts_lock_ and target_objrefs_lock_ (through has_canonical_objref()), which caused deadlock in the past
|
||||
std::lock_guard<std::mutex> reference_counts_lock(reference_counts_lock_);
|
||||
std::lock_guard<std::mutex> contained_objrefs_lock(contained_objrefs_lock_);
|
||||
std::lock_guard<std::mutex> objects_lock(objects_lock_);
|
||||
std::lock_guard<std::mutex> target_objrefs_lock(target_objrefs_lock_);
|
||||
std::lock_guard<std::mutex> reverse_target_objrefs_lock(reverse_target_objrefs_lock_);
|
||||
ObjRef objtable_size = objtable_.size();
|
||||
ObjRef target_objrefs_size = target_objrefs_.size();
|
||||
ObjRef reverse_target_objrefs_size = reverse_target_objrefs_.size();
|
||||
ObjRef reference_counts_size = reference_counts_.size();
|
||||
ObjRef contained_objrefs_size = contained_objrefs_.size();
|
||||
auto reference_counts = reference_counts_.get();
|
||||
auto contained_objrefs = contained_objrefs_.get();
|
||||
auto objtable = objtable_.get();
|
||||
auto target_objrefs = target_objrefs_.get();
|
||||
auto reverse_target_objrefs = reverse_target_objrefs_.get();
|
||||
ObjRef objtable_size = objtable->size();
|
||||
ObjRef target_objrefs_size = target_objrefs->size();
|
||||
ObjRef reverse_target_objrefs_size = reverse_target_objrefs->size();
|
||||
ObjRef reference_counts_size = reference_counts->size();
|
||||
ObjRef contained_objrefs_size = contained_objrefs->size();
|
||||
RAY_CHECK_EQ(objtable_size, target_objrefs_size, "objtable_ and target_objrefs_ should have the same size, but objtable_.size() = " << objtable_size << " and target_objrefs_.size() = " << target_objrefs_size);
|
||||
RAY_CHECK_EQ(objtable_size, reverse_target_objrefs_size, "objtable_ and reverse_target_objrefs_ should have the same size, but objtable_.size() = " << objtable_size << " and reverse_target_objrefs_.size() = " << reverse_target_objrefs_size);
|
||||
RAY_CHECK_EQ(objtable_size, reference_counts_size, "objtable_ and reference_counts_ should have the same size, but objtable_.size() = " << objtable_size << " and reference_counts_.size() = " << reference_counts_size);
|
||||
RAY_CHECK_EQ(objtable_size, contained_objrefs_size, "objtable_ and contained_objrefs_ should have the same size, but objtable_.size() = " << objtable_size << " and contained_objrefs_.size() = " << contained_objrefs_size);
|
||||
objtable_.push_back(std::vector<ObjStoreId>());
|
||||
target_objrefs_.push_back(UNITIALIZED_ALIAS);
|
||||
reverse_target_objrefs_.push_back(std::vector<ObjRef>());
|
||||
reference_counts_.push_back(0);
|
||||
contained_objrefs_.push_back(std::vector<ObjRef>());
|
||||
objtable->push_back(std::vector<ObjStoreId>());
|
||||
target_objrefs->push_back(UNITIALIZED_ALIAS);
|
||||
reverse_target_objrefs->push_back(std::vector<ObjRef>());
|
||||
reference_counts->push_back(0);
|
||||
contained_objrefs->push_back(std::vector<ObjRef>());
|
||||
{
|
||||
// We increment once so the objref doesn't go out of scope before the ObjReady
|
||||
// method is called. The corresponding decrement will happen either in
|
||||
|
@@ -436,89 +401,88 @@ ObjRef SchedulerService::register_new_object() {
|
|||
|
||||
void SchedulerService::add_location(ObjRef canonical_objref, ObjStoreId objstoreid) {
|
||||
// add_location must be called with a canonical objref
|
||||
{
|
||||
std::lock_guard<std::mutex> reference_counts_lock(reference_counts_lock_);
|
||||
RAY_CHECK_NEQ(reference_counts_[canonical_objref], DEALLOCATED, "Calling ObjReady with canonical_objref " << canonical_objref << ", but this objref has already been deallocated");
|
||||
}
|
||||
RAY_CHECK_NEQ((*reference_counts_.get())[canonical_objref], DEALLOCATED, "Calling ObjReady with canonical_objref " << canonical_objref << ", but this objref has already been deallocated");
|
||||
RAY_CHECK(is_canonical(canonical_objref), "Attempting to call add_location with a non-canonical objref (objref " << canonical_objref << ")");
|
||||
std::lock_guard<std::mutex> objects_lock(objects_lock_);
|
||||
RAY_CHECK_LT(canonical_objref, objtable_.size(), "trying to put an object in the object store that was not registered with the scheduler (objref " << canonical_objref << ")");
|
||||
auto objtable = objtable_.get();
|
||||
RAY_CHECK_LT(canonical_objref, objtable->size(), "trying to put an object in the object store that was not registered with the scheduler (objref " << canonical_objref << ")");
|
||||
// do a binary search
|
||||
auto &objstores = objtable_[canonical_objref];
|
||||
auto pos = std::lower_bound(objstores.begin(), objstores.end(), objstoreid);
|
||||
if (pos == objstores.end() || objstoreid < *pos) {
|
||||
objstores.insert(pos, objstoreid);
|
||||
auto &locations = (*objtable)[canonical_objref];
|
||||
auto pos = std::lower_bound(locations.begin(), locations.end(), objstoreid);
|
||||
if (pos == locations.end() || objstoreid < *pos) {
|
||||
locations.insert(pos, objstoreid);
|
||||
}
|
||||
auto &objects_in_flight = objects_in_transit_[objstoreid];
|
||||
objects_in_flight.erase(std::remove(objects_in_flight.begin(), objects_in_flight.end(), canonical_objref), objects_in_flight.end());
|
||||
}
|
||||
|
||||
void SchedulerService::add_canonical_objref(ObjRef objref) {
|
||||
std::lock_guard<std::mutex> lock(target_objrefs_lock_);
|
||||
RAY_CHECK_LT(objref, target_objrefs_.size(), "internal error: attempting to insert objref " << objref << " in target_objrefs_, but target_objrefs_.size() is " << target_objrefs_.size());
|
||||
RAY_CHECK(target_objrefs_[objref] == UNITIALIZED_ALIAS || target_objrefs_[objref] == objref, "internal error: attempting to declare objref " << objref << " as a canonical objref, but target_objrefs_[objref] is already aliased with objref " << target_objrefs_[objref]);
|
||||
target_objrefs_[objref] = objref;
|
||||
auto target_objrefs = target_objrefs_.get();
|
||||
RAY_CHECK_LT(objref, target_objrefs->size(), "internal error: attempting to insert objref " << objref << " in target_objrefs_, but target_objrefs_.size() is " << target_objrefs->size());
|
||||
RAY_CHECK((*target_objrefs)[objref] == UNITIALIZED_ALIAS || (*target_objrefs)[objref] == objref, "internal error: attempting to declare objref " << objref << " as a canonical objref, but target_objrefs_[objref] is already aliased with objref " << (*target_objrefs)[objref]);
|
||||
(*target_objrefs)[objref] = objref;
|
||||
}
|
||||
|
||||
ObjStoreId SchedulerService::get_store(WorkerId workerid) {
|
||||
std::lock_guard<std::mutex> lock(workers_lock_);
|
||||
ObjStoreId result = workers_[workerid].objstoreid;
|
||||
auto workers = workers_.get();
|
||||
ObjStoreId result = (*workers)[workerid].objstoreid;
|
||||
return result;
|
||||
}
|
||||
|
||||
void SchedulerService::register_function(const std::string& name, WorkerId workerid, size_t num_return_vals) {
|
||||
std::lock_guard<std::mutex> lock(fntable_lock_);
|
||||
FnInfo& info = fntable_[name];
|
||||
auto fntable = fntable_.get();
|
||||
FnInfo& info = (*fntable)[name];
|
||||
info.set_num_return_vals(num_return_vals);
|
||||
info.add_worker(workerid);
|
||||
}
|
||||
|
||||
void SchedulerService::get_info(const SchedulerInfoRequest& request, SchedulerInfoReply* reply) {
|
||||
acquire_all_locks();
|
||||
for (int i = 0; i < reference_counts_.size(); ++i) {
|
||||
reply->add_reference_count(reference_counts_[i]);
|
||||
auto reference_counts = reference_counts_.unsafe_get();
|
||||
for (int i = 0; i < reference_counts->size(); ++i) {
|
||||
reply->add_reference_count((*reference_counts)[i]);
|
||||
}
|
||||
for (int i = 0; i < target_objrefs_.size(); ++i) {
|
||||
reply->add_target_objref(target_objrefs_[i]);
|
||||
auto target_objrefs = target_objrefs_.unsafe_get();
|
||||
for (int i = 0; i < target_objrefs->size(); ++i) {
|
||||
reply->add_target_objref((*target_objrefs)[i]);
|
||||
}
|
||||
auto function_table = reply->mutable_function_table();
|
||||
for (const auto& entry : fntable_) {
|
||||
for (const auto& entry : *fntable_.unsafe_get()) {
|
||||
(*function_table)[entry.first].set_num_return_vals(entry.second.num_return_vals());
|
||||
for (const WorkerId& worker : entry.second.workers()) {
|
||||
(*function_table)[entry.first].add_workerid(worker);
|
||||
}
|
||||
}
|
||||
for (const auto& entry : task_queue_) {
|
||||
for (const auto& entry : *task_queue_.unsafe_get()) {
|
||||
reply->add_operationid(entry);
|
||||
}
|
||||
for (const WorkerId& entry : avail_workers_) {
|
||||
for (const WorkerId& entry : *avail_workers_.unsafe_get()) {
|
||||
reply->add_avail_worker(entry);
|
||||
}
|
||||
computation_graph_.to_protobuf(reply->mutable_computation_graph());
|
||||
computation_graph_.unsafe_get()->to_protobuf(reply->mutable_computation_graph());
|
||||
release_all_locks();
|
||||
}
|
||||
|
||||
// pick_objstore assumes that objects_lock_ has been acquired
|
||||
// pick_objstore must be called with a canonical_objref
|
||||
ObjStoreId SchedulerService::pick_objstore(ObjRef canonical_objref) {
|
||||
std::mt19937 rng;
|
||||
RAY_CHECK(is_canonical(canonical_objref), "Attempting to call pick_objstore with a non-canonical objref, (objref " << canonical_objref << ")");
|
||||
std::uniform_int_distribution<int> uni(0, objtable_[canonical_objref].size() - 1);
|
||||
ObjStoreId objstoreid = objtable_[canonical_objref][uni(rng)];
|
||||
auto objtable = objtable_.get();
|
||||
std::uniform_int_distribution<int> uni(0, (*objtable)[canonical_objref].size() - 1);
|
||||
ObjStoreId objstoreid = (*objtable)[canonical_objref][uni(rng)];
|
||||
return objstoreid;
|
||||
}
|
||||
|
||||
bool SchedulerService::is_canonical(ObjRef objref) {
|
||||
std::lock_guard<std::mutex> lock(target_objrefs_lock_);
|
||||
RAY_CHECK_NEQ(target_objrefs_[objref], UNITIALIZED_ALIAS, "Attempting to call is_canonical on an objref for which aliasing is not complete or the object is not ready, target_objrefs_[objref] == UNITIALIZED_ALIAS for objref " << objref << ".");
|
||||
return objref == target_objrefs_[objref];
|
||||
auto target_objrefs = target_objrefs_.get();
|
||||
RAY_CHECK_NEQ((*target_objrefs)[objref], UNITIALIZED_ALIAS, "Attempting to call is_canonical on an objref for which aliasing is not complete or the object is not ready, target_objrefs_[objref] == UNITIALIZED_ALIAS for objref " << objref << ".");
|
||||
return objref == (*target_objrefs)[objref];
|
||||
}
|
||||
|
||||
void SchedulerService::perform_gets() {
|
||||
std::lock_guard<std::mutex> get_queue_lock(get_queue_lock_);
|
||||
auto get_queue = get_queue_.get();
|
||||
// Complete all get tasks that can be completed.
|
||||
for (int i = 0; i < get_queue_.size(); ++i) {
|
||||
const std::pair<WorkerId, ObjRef>& get = get_queue_[i];
|
||||
for (int i = 0; i < get_queue->size(); ++i) {
|
||||
const std::pair<WorkerId, ObjRef>& get = (*get_queue)[i];
|
||||
ObjRef objref = get.second;
|
||||
WorkerId workerid = get.first;
|
||||
ObjStoreId objstoreid = get_store(workerid);
|
||||
|
@@ -528,46 +492,39 @@ void SchedulerService::perform_gets() {
|
|||
}
|
||||
ObjRef canonical_objref = get_canonical_objref(objref);
|
||||
RAY_LOG(RAY_DEBUG, "attempting to get objref " << get.second << " with canonical objref " << canonical_objref << " to objstore " << get_store(workerid));
|
||||
int num_stores;
|
||||
{
|
||||
std::lock_guard<std::mutex> objects_lock(objects_lock_);
|
||||
num_stores = objtable_[canonical_objref].size();
|
||||
}
|
||||
int num_stores = (*objtable_.get())[canonical_objref].size();
|
||||
if (num_stores > 0) {
|
||||
deliver_object_if_necessary(canonical_objref, pick_objstore(canonical_objref), objstoreid);
|
||||
{
|
||||
// Notify the relevant objstore about potential aliasing when it's ready
|
||||
std::lock_guard<std::mutex> alias_notification_queue_lock(alias_notification_queue_lock_);
|
||||
alias_notification_queue_.push_back(std::make_pair(get_store(workerid), std::make_pair(objref, canonical_objref)));
|
||||
}
|
||||
// Notify the relevant objstore about potential aliasing when it's ready
|
||||
alias_notification_queue_.get()->push_back(std::make_pair(get_store(workerid), std::make_pair(objref, canonical_objref)));
|
||||
// Remove the get task from the queue
|
||||
std::swap(get_queue_[i], get_queue_[get_queue_.size() - 1]);
|
||||
get_queue_.pop_back();
|
||||
std::swap((*get_queue)[i], (*get_queue)[get_queue->size() - 1]);
|
||||
get_queue->pop_back();
|
||||
i -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void SchedulerService::schedule_tasks_naively() {
|
||||
std::lock_guard<std::mutex> computation_graph_lock(computation_graph_lock_);
|
||||
std::lock_guard<std::mutex> fntable_lock(fntable_lock_);
|
||||
std::lock_guard<std::mutex> avail_workers_lock(avail_workers_lock_);
|
||||
std::lock_guard<std::mutex> task_queue_lock(task_queue_lock_);
|
||||
for (int i = 0; i < avail_workers_.size(); ++i) {
|
||||
auto computation_graph = computation_graph_.get();
|
||||
auto fntable = fntable_.get();
|
||||
auto avail_workers = avail_workers_.get();
|
||||
auto task_queue = task_queue_.get();
|
||||
for (int i = 0; i < avail_workers->size(); ++i) {
|
||||
// Submit all tasks whose arguments are ready.
|
||||
WorkerId workerid = avail_workers_[i];
|
||||
for (auto it = task_queue_.begin(); it != task_queue_.end(); ++it) {
|
||||
WorkerId workerid = (*avail_workers)[i];
|
||||
for (auto it = task_queue->begin(); it != task_queue->end(); ++it) {
|
||||
// The use of erase(it) below invalidates the iterator, but we
|
||||
// immediately break out of the inner loop, so the iterator is not used
|
||||
// after the erase
|
||||
const OperationId operationid = *it;
|
||||
const Task& task = computation_graph_.get_task(operationid);
|
||||
auto& workers = fntable_[task.name()].workers();
|
||||
const Task& task = computation_graph->get_task(operationid);
|
||||
auto& workers = (*fntable)[task.name()].workers();
|
||||
if (std::binary_search(workers.begin(), workers.end(), workerid) && can_run(task)) {
|
||||
assign_task(operationid, workerid);
|
||||
task_queue_.erase(it);
|
||||
std::swap(avail_workers_[i], avail_workers_[avail_workers_.size() - 1]);
|
||||
avail_workers_.pop_back();
|
||||
task_queue->erase(it);
|
||||
std::swap((*avail_workers)[i], (*avail_workers)[avail_workers->size() - 1]);
|
||||
avail_workers->pop_back();
|
||||
i -= 1;
|
||||
break;
|
||||
}
|
||||
|
@@ -576,20 +533,20 @@ void SchedulerService::schedule_tasks_naively() {
|
|||
}
|
||||
|
||||
void SchedulerService::schedule_tasks_location_aware() {
|
||||
std::lock_guard<std::mutex> computation_graph_lock(computation_graph_lock_);
|
||||
std::lock_guard<std::mutex> fntable_lock(fntable_lock_);
|
||||
std::lock_guard<std::mutex> avail_workers_lock(avail_workers_lock_);
|
||||
std::lock_guard<std::mutex> task_queue_lock(task_queue_lock_);
|
||||
for (int i = 0; i < avail_workers_.size(); ++i) {
|
||||
auto computation_graph = computation_graph_.get();
|
||||
auto fntable = fntable_.get();
|
||||
auto avail_workers = avail_workers_.get();
|
||||
auto task_queue = task_queue_.get();
|
||||
for (int i = 0; i < avail_workers->size(); ++i) {
|
||||
// Submit all tasks whose arguments are ready.
|
||||
WorkerId workerid = avail_workers_[i];
|
||||
WorkerId workerid = (*avail_workers)[i];
|
||||
ObjStoreId objstoreid = get_store(workerid);
|
||||
auto bestit = task_queue_.end(); // keep track of the task that fits the worker best so far
|
||||
auto bestit = task_queue->end(); // keep track of the task that fits the worker best so far
|
||||
size_t min_num_shipped_objects = std::numeric_limits<size_t>::max(); // number of objects that need to be transfered for this worker
|
||||
for (auto it = task_queue_.begin(); it != task_queue_.end(); ++it) {
|
||||
for (auto it = task_queue->begin(); it != task_queue->end(); ++it) {
|
||||
OperationId operationid = *it;
|
||||
const Task& task = computation_graph_.get_task(operationid);
|
||||
auto& workers = fntable_[task.name()].workers();
|
||||
const Task& task = computation_graph->get_task(operationid);
|
||||
auto& workers = (*fntable)[task.name()].workers();
|
||||
if (std::binary_search(workers.begin(), workers.end(), workerid) && can_run(task)) {
|
||||
// determine how many objects would need to be shipped
|
||||
size_t num_shipped_objects = 0;
|
||||
|
@@ -600,8 +557,8 @@ void SchedulerService::schedule_tasks_location_aware() {
|
|||
ObjRef canonical_objref = get_canonical_objref(objref);
|
||||
{
|
||||
// check if the object is already in the local object store
|
||||
std::lock_guard<std::mutex> objects_lock(objects_lock_);
|
||||
if (!std::binary_search(objtable_[canonical_objref].begin(), objtable_[canonical_objref].end(), objstoreid)) {
|
||||
auto objtable = objtable_.get();
|
||||
if (!std::binary_search((*objtable)[canonical_objref].begin(), (*objtable)[canonical_objref].end(), objstoreid)) {
|
||||
num_shipped_objects += 1;
|
||||
}
|
||||
}
|
||||
|
@@ -614,58 +571,58 @@ void SchedulerService::schedule_tasks_location_aware() {
|
|||
}
|
||||
}
|
||||
// if we found a suitable task
|
||||
if (bestit != task_queue_.end()) {
|
||||
if (bestit != task_queue->end()) {
|
||||
assign_task(*bestit, workerid);
|
||||
task_queue_.erase(bestit);
|
||||
std::swap(avail_workers_[i], avail_workers_[avail_workers_.size() - 1]);
|
||||
avail_workers_.pop_back();
|
||||
task_queue->erase(bestit);
|
||||
std::swap((*avail_workers)[i], (*avail_workers)[avail_workers->size() - 1]);
|
||||
avail_workers->pop_back();
|
||||
i -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void SchedulerService::perform_notify_aliases() {
|
||||
std::lock_guard<std::mutex> alias_notification_queue_lock(alias_notification_queue_lock_);
|
||||
for (int i = 0; i < alias_notification_queue_.size(); ++i) {
|
||||
const std::pair<WorkerId, std::pair<ObjRef, ObjRef> > alias_notification = alias_notification_queue_[i];
|
||||
auto alias_notification_queue = alias_notification_queue_.get();
|
||||
for (int i = 0; i < alias_notification_queue->size(); ++i) {
|
||||
const std::pair<WorkerId, std::pair<ObjRef, ObjRef> > alias_notification = (*alias_notification_queue)[i];
|
||||
ObjStoreId objstoreid = alias_notification.first;
|
||||
ObjRef alias_objref = alias_notification.second.first;
|
||||
ObjRef canonical_objref = alias_notification.second.second;
|
||||
if (attempt_notify_alias(objstoreid, alias_objref, canonical_objref)) { // this locks both the objstore_ and objtable_
|
||||
// the attempt to notify the objstore of the objref aliasing succeeded, so remove the notification task from the queue
|
||||
std::swap(alias_notification_queue_[i], alias_notification_queue_[alias_notification_queue_.size() - 1]);
|
||||
alias_notification_queue_.pop_back();
|
||||
std::swap((*alias_notification_queue)[i], (*alias_notification_queue)[alias_notification_queue->size() - 1]);
|
||||
alias_notification_queue->pop_back();
|
||||
i -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool SchedulerService::has_canonical_objref(ObjRef objref) {
|
||||
std::lock_guard<std::mutex> lock(target_objrefs_lock_);
|
||||
auto target_objrefs = target_objrefs_.get();
|
||||
ObjRef objref_temp = objref;
|
||||
while (true) {
|
||||
RAY_CHECK_LT(objref_temp, target_objrefs_.size(), "Attempting to index target_objrefs_ with objref " << objref_temp << ", but target_objrefs_.size() = " << target_objrefs_.size());
|
||||
if (target_objrefs_[objref_temp] == UNITIALIZED_ALIAS) {
|
||||
RAY_CHECK_LT(objref_temp, target_objrefs->size(), "Attempting to index target_objrefs_ with objref " << objref_temp << ", but target_objrefs_.size() = " << target_objrefs->size());
|
||||
if ((*target_objrefs)[objref_temp] == UNITIALIZED_ALIAS) {
|
||||
return false;
|
||||
}
|
||||
if (target_objrefs_[objref_temp] == objref_temp) {
|
||||
if ((*target_objrefs)[objref_temp] == objref_temp) {
|
||||
return true;
|
||||
}
|
||||
objref_temp = target_objrefs_[objref_temp];
|
||||
objref_temp = (*target_objrefs)[objref_temp];
|
||||
}
|
||||
}
|
||||
|
||||
ObjRef SchedulerService::get_canonical_objref(ObjRef objref) {
|
||||
// get_canonical_objref assumes that has_canonical_objref(objref) is true
|
||||
std::lock_guard<std::mutex> lock(target_objrefs_lock_);
|
||||
auto target_objrefs = target_objrefs_.get();
|
||||
ObjRef objref_temp = objref;
|
||||
while (true) {
|
||||
RAY_CHECK_LT(objref_temp, target_objrefs_.size(), "Attempting to index target_objrefs_ with objref " << objref_temp << ", but target_objrefs_.size() = " << target_objrefs_.size());
|
||||
RAY_CHECK_NEQ(target_objrefs_[objref_temp], UNITIALIZED_ALIAS, "Attempting to get canonical objref for objref " << objref << ", which aliases, objref " << objref_temp << ", but target_objrefs_[objref_temp] == UNITIALIZED_ALIAS for objref_temp = " << objref_temp << ".");
|
||||
if (target_objrefs_[objref_temp] == objref_temp) {
|
||||
RAY_CHECK_LT(objref_temp, target_objrefs->size(), "Attempting to index target_objrefs_ with objref " << objref_temp << ", but target_objrefs_.size() = " << target_objrefs->size());
|
||||
RAY_CHECK_NEQ((*target_objrefs)[objref_temp], UNITIALIZED_ALIAS, "Attempting to get canonical objref for objref " << objref << ", which aliases, objref " << objref_temp << ", but target_objrefs_[objref_temp] == UNITIALIZED_ALIAS for objref_temp = " << objref_temp << ".");
|
||||
if ((*target_objrefs)[objref_temp] == objref_temp) {
|
||||
return objref_temp;
|
||||
}
|
||||
objref_temp = target_objrefs_[objref_temp];
|
||||
objref_temp = (*target_objrefs)[objref_temp];
|
||||
RAY_LOG(RAY_ALIAS, "Looping in get_canonical_objref.");
|
||||
}
|
||||
}
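The two loops above (has_canonical_objref and get_canonical_objref) walk the same structure: target_objrefs_ stores, for every objref, the objref it aliases, and a canonical objref points at itself. A standalone sketch of that walk, with hypothetical names and plain size_t indices, not code from this commit:

#include <cstddef>
#include <vector>

// Follow the alias chain until it reaches a fixed point; that entry is the
// canonical objref. Assumes the chain is fully initialized (no
// UNITIALIZED_ALIAS entries), as get_canonical_objref does.
std::size_t canonical_objref(const std::vector<std::size_t>& target_objrefs,
                             std::size_t objref) {
  while (target_objrefs[objref] != objref) {
    objref = target_objrefs[objref];
  }
  return objref;
}

// Example: with target_objrefs = {0, 0, 1}, objref 2 aliases 1, which aliases
// the canonical objref 0, so canonical_objref(target_objrefs, 2) == 0.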
|
||||
|
@@ -677,8 +634,8 @@ bool SchedulerService::attempt_notify_alias(ObjStoreId objstoreid, ObjRef alias_
|
|||
return true;
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(objects_lock_);
|
||||
if (!std::binary_search(objtable_[canonical_objref].begin(), objtable_[canonical_objref].end(), objstoreid)) {
|
||||
auto objtable = objtable_.get();
|
||||
if (!std::binary_search((*objtable)[canonical_objref].begin(), (*objtable)[canonical_objref].end(), objstoreid)) {
|
||||
// the objstore doesn't have the object for canonical_objref yet, so it's too early to notify the objstore about the alias
|
||||
return false;
|
||||
}
|
||||
|
@@ -688,10 +645,7 @@ bool SchedulerService::attempt_notify_alias(ObjStoreId objstoreid, ObjRef alias_
|
|||
NotifyAliasRequest request;
|
||||
request.set_alias_objref(alias_objref);
|
||||
request.set_canonical_objref(canonical_objref);
|
||||
{
|
||||
std::lock_guard<std::mutex> objstores_lock(objstores_lock_);
|
||||
objstores_[objstoreid].objstore_stub->NotifyAlias(&context, request, &reply);
|
||||
}
|
||||
(*objstores_.get())[objstoreid].objstore_stub->NotifyAlias(&context, request, &reply);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -700,50 +654,56 @@ void SchedulerService::deallocate_object(ObjRef canonical_objref) {
|
|||
// deallocate_object also recursively calls decrement_ref_count). Both of
|
||||
// these methods require reference_counts_lock_ to have been acquired, and
|
||||
// so the lock must before outside of these methods (it is acquired in
|
||||
// DecrementRefCount).
|
||||
// DecrementRefCount). Because we use contained_objrefs_ in this method, we
|
||||
// also require contained_objrefs_lock_ to be acquired outside of
|
||||
// decrement_ref_count.
|
||||
RAY_LOG(RAY_REFCOUNT, "Deallocating canonical_objref " << canonical_objref << ".");
|
||||
{
|
||||
std::lock_guard<std::mutex> objects_lock(objects_lock_);
|
||||
auto &objstores = objtable_[canonical_objref];
|
||||
std::lock_guard<std::mutex> objstores_lock(objstores_lock_); // TODO(rkn): Should this be inside the for loop instead?
|
||||
for (int i = 0; i < objstores.size(); ++i) {
|
||||
auto objtable = objtable_.get();
|
||||
auto &locations = (*objtable)[canonical_objref];
|
||||
auto objstores = objstores_.get(); // TODO(rkn): Should this be inside the for loop instead?
|
||||
for (int i = 0; i < locations.size(); ++i) {
|
||||
ClientContext context;
|
||||
AckReply reply;
|
||||
DeallocateObjectRequest request;
|
||||
request.set_canonical_objref(canonical_objref);
|
||||
ObjStoreId objstoreid = objstores[i];
|
||||
ObjStoreId objstoreid = locations[i];
|
||||
RAY_LOG(RAY_REFCOUNT, "Attempting to deallocate canonical_objref " << canonical_objref << " from objstore " << objstoreid);
|
||||
objstores_[objstoreid].objstore_stub->DeallocateObject(&context, request, &reply);
|
||||
(*objstores)[objstoreid].objstore_stub->DeallocateObject(&context, request, &reply);
|
||||
}
|
||||
objtable_[canonical_objref].clear();
|
||||
locations.clear();
|
||||
}
|
||||
decrement_ref_count(contained_objrefs_[canonical_objref]);
|
||||
decrement_ref_count((*contained_objrefs_.unsafe_get())[canonical_objref]);
|
||||
}
|
||||
|
||||
void SchedulerService::increment_ref_count(const std::vector<ObjRef> &objrefs) {
|
||||
// increment_ref_count assumes that reference_counts_lock_ has been acquired already
|
||||
for (int i = 0; i < objrefs.size(); ++i) {
|
||||
ObjRef objref = objrefs[i];
|
||||
RAY_CHECK_NEQ(reference_counts_[objref], DEALLOCATED, "Attempting to increment the reference count for objref " << objref << ", but this object appears to have been deallocated already.");
|
||||
reference_counts_[objref] += 1;
|
||||
RAY_LOG(RAY_REFCOUNT, "Incremented ref count for objref " << objref <<". New reference count is " << reference_counts_[objref]);
|
||||
auto reference_counts = reference_counts_.unsafe_get();
|
||||
RAY_CHECK_NEQ((*reference_counts)[objref], DEALLOCATED, "Attempting to increment the reference count for objref " << objref << ", but this object appears to have been deallocated already.");
|
||||
(*reference_counts)[objref] += 1;
|
||||
RAY_LOG(RAY_REFCOUNT, "Incremented ref count for objref " << objref <<". New reference count is " << (*reference_counts)[objref]);
|
||||
}
|
||||
}
|
||||
|
||||
void SchedulerService::decrement_ref_count(const std::vector<ObjRef> &objrefs) {
|
||||
// decrement_ref_count assumes that reference_counts_lock_ has been acquired already
|
||||
// decrement_ref_count assumes that reference_counts_lock_ and
|
||||
// contained_objrefs_lock_ have been acquired already. contained_objrefs_lock_
|
||||
// is needed inside of deallocate_object
|
||||
for (int i = 0; i < objrefs.size(); ++i) {
|
||||
ObjRef objref = objrefs[i];
|
||||
RAY_CHECK_NEQ(reference_counts_[objref], DEALLOCATED, "Attempting to decrement the reference count for objref " << objref << ", but this object appears to have been deallocated already.");
|
||||
RAY_CHECK_NEQ(reference_counts_[objref], 0, "Attempting to decrement the reference count for objref " << objref << ", but the reference count for this object is already 0.");
|
||||
reference_counts_[objref] -= 1;
|
||||
RAY_LOG(RAY_REFCOUNT, "Decremented ref count for objref " << objref << ". New reference count is " << reference_counts_[objref]);
|
||||
auto reference_counts = reference_counts_.unsafe_get();
|
||||
RAY_CHECK_NEQ((*reference_counts)[objref], DEALLOCATED, "Attempting to decrement the reference count for objref " << objref << ", but this object appears to have been deallocated already.");
|
||||
RAY_CHECK_NEQ((*reference_counts)[objref], 0, "Attempting to decrement the reference count for objref " << objref << ", but the reference count for this object is already 0.");
|
||||
(*reference_counts)[objref] -= 1;
|
||||
RAY_LOG(RAY_REFCOUNT, "Decremented ref count for objref " << objref << ". New reference count is " << (*reference_counts)[objref]);
|
||||
// See if we can deallocate the object
|
||||
std::vector<ObjRef> equivalent_objrefs;
|
||||
get_equivalent_objrefs(objref, equivalent_objrefs);
|
||||
bool can_deallocate = true;
|
||||
for (int j = 0; j < equivalent_objrefs.size(); ++j) {
|
||||
if (reference_counts_[equivalent_objrefs[j]] != 0) {
|
||||
if ((*reference_counts)[equivalent_objrefs[j]] != 0) {
|
||||
can_deallocate = false;
|
||||
break;
|
||||
}
|
||||
|
@@ -753,7 +713,7 @@ void SchedulerService::decrement_ref_count(const std::vector<ObjRef> &objrefs) {
|
|||
RAY_CHECK(is_canonical(canonical_objref), "canonical_objref is not canonical.");
|
||||
deallocate_object(canonical_objref);
|
||||
for (int j = 0; j < equivalent_objrefs.size(); ++j) {
|
||||
reference_counts_[equivalent_objrefs[j]] = DEALLOCATED;
|
||||
(*reference_counts)[equivalent_objrefs[j]] = DEALLOCATED;
|
||||
}
|
||||
}
|
||||
}
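increment_ref_count and decrement_ref_count keep their old contract: the caller must already hold the lock protecting reference_counts_ (and, for decrements, contained_objrefs_), which callers now express by keeping a get() handle alive across the call while the helpers reach the data through unsafe_get(). A reduced, self-contained model of that convention, with hypothetical names, not code from this commit:

#include <cstddef>
#include <mutex>
#include <vector>

// Hypothetical, reduced model: the public entry point takes the lock for the
// whole batch, while the private helper assumes the lock is already held and
// touches the data directly.
struct RefCounts {
  std::mutex mutex_;
  std::vector<std::size_t> counts_;

  // Helper: caller must already hold mutex_ (mirrors increment_ref_count
  // reading through reference_counts_.unsafe_get()).
  void increment_locked(const std::vector<std::size_t>& objrefs) {
    for (std::size_t objref : objrefs) {
      counts_[objref] += 1;
    }
  }

  // Entry point: acquires the lock once (mirrors the callers that keep a
  // reference_counts_.get() handle alive across the call).
  void increment(const std::vector<std::size_t>& objrefs) {
    std::lock_guard<std::mutex> lock(mutex_);
    increment_locked(objrefs);
  }
};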
|
||||
|
@@ -762,40 +722,41 @@ void SchedulerService::decrement_ref_count(const std::vector<ObjRef> &objrefs) {
|
|||
void SchedulerService::upstream_objrefs(ObjRef objref, std::vector<ObjRef> &objrefs) {
|
||||
// upstream_objrefs assumes that the lock reverse_target_objrefs_lock_ has been acquired
|
||||
objrefs.push_back(objref);
|
||||
for (int i = 0; i < reverse_target_objrefs_[objref].size(); ++i) {
|
||||
upstream_objrefs(reverse_target_objrefs_[objref][i], objrefs);
|
||||
auto reverse_target_objrefs = reverse_target_objrefs_.unsafe_get();
|
||||
for (int i = 0; i < (*reverse_target_objrefs)[objref].size(); ++i) {
|
||||
upstream_objrefs((*reverse_target_objrefs)[objref][i], objrefs);
|
||||
}
|
||||
}
|
||||
|
||||
void SchedulerService::get_equivalent_objrefs(ObjRef objref, std::vector<ObjRef> &equivalent_objrefs) {
|
||||
std::lock_guard<std::mutex> target_objrefs_lock(target_objrefs_lock_);
|
||||
auto target_objrefs = target_objrefs_.get();
|
||||
ObjRef downstream_objref = objref;
|
||||
while (target_objrefs_[downstream_objref] != downstream_objref && target_objrefs_[downstream_objref] != UNITIALIZED_ALIAS) {
|
||||
while ((*target_objrefs)[downstream_objref] != downstream_objref && (*target_objrefs)[downstream_objref] != UNITIALIZED_ALIAS) {
|
||||
RAY_LOG(RAY_ALIAS, "Looping in get_equivalent_objrefs");
|
||||
downstream_objref = target_objrefs_[downstream_objref];
|
||||
downstream_objref = (*target_objrefs)[downstream_objref];
|
||||
}
|
||||
std::lock_guard<std::mutex> reverse_target_objrefs_lock(reverse_target_objrefs_lock_);
|
||||
auto reverse_target_objrefs = reverse_target_objrefs_.get();
|
||||
upstream_objrefs(downstream_objref, equivalent_objrefs);
|
||||
}
|
||||
|
||||
// This method defines the order in which locks should be acquired.
|
||||
void SchedulerService::do_on_locks(bool lock) {
|
||||
std::mutex *mutexes[] = {
|
||||
&successful_tasks_lock_,
|
||||
&failed_tasks_lock_,
|
||||
&get_queue_lock_,
|
||||
&computation_graph_lock_,
|
||||
&fntable_lock_,
|
||||
&avail_workers_lock_,
|
||||
&task_queue_lock_,
|
||||
&reference_counts_lock_,
|
||||
&contained_objrefs_lock_,
|
||||
&workers_lock_,
|
||||
&alias_notification_queue_lock_,
|
||||
&objects_lock_,
|
||||
&objstores_lock_,
|
||||
&target_objrefs_lock_,
|
||||
&reverse_target_objrefs_lock_
|
||||
&successful_tasks_.mutex(),
|
||||
&failed_tasks_.mutex(),
|
||||
&get_queue_.mutex(),
|
||||
&computation_graph_.mutex(),
|
||||
&fntable_.mutex(),
|
||||
&avail_workers_.mutex(),
|
||||
&task_queue_.mutex(),
|
||||
&reference_counts_.mutex(),
|
||||
&contained_objrefs_.mutex(),
|
||||
&workers_.mutex(),
|
||||
&alias_notification_queue_.mutex(),
|
||||
&objtable_.mutex(),
|
||||
&objstores_.mutex(),
|
||||
&target_objrefs_.mutex(),
|
||||
&reverse_target_objrefs_.mutex()
|
||||
};
|
||||
size_t n = sizeof(mutexes) / sizeof(*mutexes);
|
||||
for (size_t i = 0; i != n; ++i) {
|
||||
|
|
|
@@ -14,6 +14,7 @@
|
|||
#include "ray.grpc.pb.h"
|
||||
#include "types.pb.h"
|
||||
|
||||
#include "utils.h"
|
||||
#include "computation_graph.h"
|
||||
|
||||
using grpc::Server;
|
||||
|
@@ -133,32 +134,26 @@ private:
|
|||
|
||||
// The computation graph tracks the operations that have been submitted to the
|
||||
// scheduler and is mostly used for fault tolerance.
|
||||
ComputationGraph computation_graph_;
|
||||
std::mutex computation_graph_lock_;
|
||||
Synchronized<ComputationGraph> computation_graph_;
|
||||
// Vector of all workers registered in the system. Their index in this vector
|
||||
// is the workerid.
|
||||
std::vector<WorkerHandle> workers_;
|
||||
std::mutex workers_lock_;
|
||||
Synchronized<std::vector<WorkerHandle> > workers_;
|
||||
// Vector of all workers that are currently idle.
|
||||
std::vector<WorkerId> avail_workers_;
|
||||
std::mutex avail_workers_lock_;
|
||||
Synchronized<std::vector<WorkerId> > avail_workers_;
|
||||
// Vector of all object stores registered in the system. Their index in this
|
||||
// vector is the objstoreid.
|
||||
std::vector<ObjStoreHandle> objstores_;
|
||||
grpc::mutex objstores_lock_;
|
||||
Synchronized<std::vector<ObjStoreHandle> > objstores_;
|
||||
// Mapping from an aliased objref to the objref it is aliased with. If an
|
||||
// objref is a canonical objref (meaning it is not aliased), then
|
||||
// target_objrefs_[objref] == objref. For each objref, target_objrefs_[objref]
|
||||
// is initialized to UNITIALIZED_ALIAS and the correct value is filled later
|
||||
// when it is known.
|
||||
std::vector<ObjRef> target_objrefs_;
|
||||
std::mutex target_objrefs_lock_;
|
||||
Synchronized<std::vector<ObjRef> > target_objrefs_;
|
||||
// This data structure maps an objref to all of the objrefs that alias it (there could be multiple such objrefs).
|
||||
std::vector<std::vector<ObjRef> > reverse_target_objrefs_;
|
||||
std::mutex reverse_target_objrefs_lock_;
|
||||
Synchronized<std::vector<std::vector<ObjRef> > > reverse_target_objrefs_;
|
||||
// Mapping from canonical objref to list of object stores where the object is stored. Non-canonical (aliased) objrefs should not be used to index objtable_.
|
||||
ObjTable objtable_;
|
||||
std::mutex objects_lock_; // This lock protects objtable_ and objects_in_transit_
|
||||
Synchronized<ObjTable> objtable_; // This lock protects objtable_ and objects_in_transit_
|
||||
|
||||
// For each object store objstoreid, objects_in_transit_[objstoreid] is a
|
||||
// vector of the canonical object references that are being streamed to that
|
||||
// object store but are not yet present. Object references are added to this
|
||||
|
@ -166,36 +161,29 @@ private:
|
|||
// the same object to a given object store twice), and object references are
|
||||
// removed when add_location is called (from ObjReady), and they are moved to
|
||||
// the objtable_. Note that objects_in_transit_ and objtable_ share the same
|
||||
// lock (objects_lock_).
|
||||
// lock (objects_lock_). // TODO(rkn): Consider making this part of the
|
||||
// objtable data structure.
|
||||
std::vector<std::vector<ObjRef> > objects_in_transit_;
|
||||
// Hash map from function names to workers where the function is registered.
|
||||
FnTable fntable_;
|
||||
std::mutex fntable_lock_;
|
||||
Synchronized<FnTable> fntable_;
|
||||
// List of pending tasks.
|
||||
std::deque<OperationId> task_queue_;
|
||||
std::mutex task_queue_lock_;
|
||||
Synchronized<std::deque<OperationId> > task_queue_;
|
||||
// List of pending get calls.
|
||||
std::vector<std::pair<WorkerId, ObjRef> > get_queue_;
|
||||
std::mutex get_queue_lock_;
|
||||
Synchronized<std::vector<std::pair<WorkerId, ObjRef> > > get_queue_;
|
||||
// List of failed tasks
|
||||
std::vector<TaskStatus> failed_tasks_;
|
||||
std::mutex failed_tasks_lock_;
|
||||
Synchronized<std::vector<TaskStatus> > failed_tasks_;
|
||||
// List of the IDs of successful tasks
|
||||
std::vector<OperationId> successful_tasks_; // Right now, we only use this information in the TaskInfo call.
|
||||
std::mutex successful_tasks_lock_;
|
||||
Synchronized<std::vector<OperationId> > successful_tasks_; // Right now, we only use this information in the TaskInfo call.
|
||||
// List of pending alias notifications. Each element consists of (objstoreid, (alias_objref, canonical_objref)).
|
||||
std::vector<std::pair<ObjStoreId, std::pair<ObjRef, ObjRef> > > alias_notification_queue_;
|
||||
std::mutex alias_notification_queue_lock_;
|
||||
Synchronized<std::vector<std::pair<ObjStoreId, std::pair<ObjRef, ObjRef> > > > alias_notification_queue_;
|
||||
// Reference counts. Currently, reference_counts_[objref] is the number of
|
||||
// existing references held to objref. This is done for all objrefs, not just
|
||||
// canonical_objrefs. This data structure completely ignores aliasing. If the
|
||||
// object corresponding to objref has been deallocated, then
|
||||
// reference_counts[objref] will equal DEALLOCATED.
|
||||
std::vector<RefCount> reference_counts_;
|
||||
std::mutex reference_counts_lock_;
|
||||
Synchronized<std::vector<RefCount> > reference_counts_;
|
||||
// contained_objrefs_[objref] is a vector of all of the objrefs contained inside the object referred to by objref
|
||||
std::vector<std::vector<ObjRef> > contained_objrefs_;
|
||||
std::mutex contained_objrefs_lock_;
|
||||
Synchronized<std::vector<std::vector<ObjRef> > > contained_objrefs_;
|
||||
// the scheduling algorithm that will be used
|
||||
SchedulingAlgorithmType scheduling_algorithm_;
|
||||
};
|
||||
|
|
|
@@ -34,6 +34,7 @@ class Synchronized {
|
|||
T value_;
|
||||
public:
|
||||
typedef T element_type;
|
||||
typedef Mutex mutex_type;
|
||||
template<class... U>
|
||||
Synchronized(U&&... args) : value_(std::forward<T>(args)...) { }
|
||||
Synchronized(const Synchronized& other) : value_(*other) { }
|
||||
|
@@ -47,6 +48,7 @@ public:
|
|||
SynchronizedPtr<const Synchronized> get() const { return *this; }
|
||||
element_type* unsafe_get() { return &value_; }
|
||||
const element_type* unsafe_get() const { return &value_; }
|
||||
mutex_type& mutex() { return mutex_; }
|
||||
};
|
||||
|
||||
std::string::iterator split_ip_address(std::string& ip_address);
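Taken together, the header above gives three access paths that the scheduler code in this commit uses: get() for ordinary scoped locking, unsafe_get() when the caller already holds the lock (the ref-count helpers, or get_info after acquiring every lock), and mutex() for the lock-ordering table in do_on_locks. A compact, self-contained usage sketch, assuming a simplified stand-in for the wrapper rather than the project's real header:

#include <cstddef>
#include <mutex>
#include <vector>

// Simplified stand-in for the Synchronized wrapper (sketch, not the real header).
template<class T>
class Synchronized {
  std::mutex mutex_;
  T value_;
public:
  class Ptr { // lock-holding handle; the mutex is released when it goes out of scope
    std::unique_lock<std::mutex> guard_;
    T* value_;
  public:
    Ptr(std::mutex& m, T* v) : guard_(m), value_(v) {}
    T* operator->() { return value_; }
    T& operator*() { return *value_; }
  };
  Ptr get() { return Ptr(mutex_, &value_); }
  T* unsafe_get() { return &value_; }    // no locking; caller must already hold mutex()
  std::mutex& mutex() { return mutex_; }
};

Synchronized<std::vector<std::size_t> > task_queue_;

int main() {
  // 1) Scoped access: the handle keeps the mutex locked until it is destroyed.
  task_queue_.get()->push_back(42);
  {
    auto task_queue = task_queue_.get(); // held for the whole block
    task_queue->push_back(1);
    task_queue->push_back(2);
  }
  // 2) Explicit locking plus unsafe_get(), as do_on_locks and get_info do.
  task_queue_.mutex().lock();
  std::size_t n = task_queue_.unsafe_get()->size();
  task_queue_.mutex().unlock();
  return static_cast<int>(n);
}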