mirror of
https://github.com/vale981/ray
synced 2025-03-08 19:41:38 -05:00
[xray] Resubmit tasks that fail to be forwarded (#2645)
This commit is contained in:
parent
dd924a388b
commit
e3e0cfce87
1 changed file with 31 additions and 21 deletions
|
@@ -1294,26 +1294,40 @@ void NodeManager::ForwardTaskOrResubmit(const Task &task,
|
||||||
RAY_LOG(INFO) << "Failed to forward task " << task_id << " to node manager "
|
RAY_LOG(INFO) << "Failed to forward task " << task_id << " to node manager "
|
||||||
<< node_manager_id;
|
<< node_manager_id;
|
||||||
// Mark the failed task as pending to let other raylets know that we still
|
// Mark the failed task as pending to let other raylets know that we still
|
||||||
// have the task. Once the task is successfully retried, it will be
|
// have the task. TaskDependencyManager::TaskPending() is assumed to be
|
||||||
// canceled. TaskDependencyManager::TaskPending() is assumed to be
|
|
||||||
// idempotent.
|
// idempotent.
|
||||||
task_dependency_manager_.TaskPending(task);
|
task_dependency_manager_.TaskPending(task);
|
||||||
|
|
||||||
// Create a timer to resubmit the task in a little bit. TODO(rkn): Really
|
// Actor tasks can only be executed at the actor's location, so they are
|
||||||
// this should be a unique_ptr instead of a shared_ptr. However, it's a
|
// retried after a timeout. All other tasks that fail to be forwarded are
|
||||||
// little harder to move unique_ptrs into lambdas.
|
// deemed to be placeable again.
|
||||||
auto retry_timer = std::make_shared<boost::asio::deadline_timer>(io_service_);
|
if (task.GetTaskSpecification().IsActorTask()) {
|
||||||
auto retry_duration = boost::posix_time::milliseconds(
|
// The task is for an actor on another node. Create a timer to resubmit
|
||||||
RayConfig::instance().node_manager_forward_task_retry_timeout_milliseconds());
|
// the task in a little bit. TODO(rkn): Really this should be a
|
||||||
retry_timer->expires_from_now(retry_duration);
|
// unique_ptr instead of a shared_ptr. However, it's a little harder to
|
||||||
retry_timer->async_wait(
|
// move unique_ptrs into lambdas.
|
||||||
[this, task, task_id, retry_timer](const boost::system::error_code &error) {
|
auto retry_timer = std::make_shared<boost::asio::deadline_timer>(io_service_);
|
||||||
// Timer killing will receive the boost::asio::error::operation_aborted,
|
auto retry_duration = boost::posix_time::milliseconds(
|
||||||
// we only handle the timeout event.
|
RayConfig::instance().node_manager_forward_task_retry_timeout_milliseconds());
|
||||||
RAY_CHECK(!error);
|
retry_timer->expires_from_now(retry_duration);
|
||||||
RAY_LOG(INFO) << "In ForwardTask retry callback for task " << task_id;
|
retry_timer->async_wait(
|
||||||
EnqueuePlaceableTask(task);
|
[this, task, task_id, retry_timer](const boost::system::error_code &error) {
|
||||||
});
|
// Timer killing will receive the boost::asio::error::operation_aborted,
|
||||||
|
// we only handle the timeout event.
|
||||||
|
RAY_CHECK(!error);
|
||||||
|
RAY_LOG(DEBUG) << "Resubmitting task " << task_id
|
||||||
|
<< " because ForwardTask failed.";
|
||||||
|
SubmitTask(task, Lineage());
|
||||||
|
});
|
||||||
|
// Remove the task from the lineage cache. The task will get added back
|
||||||
|
// once it is resubmitted.
|
||||||
|
lineage_cache_.RemoveWaitingTask(task_id);
|
||||||
|
} else {
|
||||||
|
// The task is not for an actor and may therefore be placed on another
|
||||||
|
// node immediately. Send it to the scheduling policy to be placed again.
|
||||||
|
local_queues_.QueuePlaceableTasks({task});
|
||||||
|
ScheduleTasks();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1384,10 +1398,6 @@ ray::Status NodeManager::ForwardTask(const Task &task, const ClientID &node_id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// TODO(atumanov): caller must handle ForwardTask failure.
|
|
||||||
RAY_LOG(WARNING) << "[NodeManager][ForwardTask] failed to forward task " << task_id
|
|
||||||
<< " to node " << node_id;
|
|
||||||
}
|
}
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue