mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[core] Fix the wrong error message in gcs for worker exits (#19774)
This commit is contained in:
parent
aa5499ef0f
commit
98961d1ee2
3 changed files with 23 additions and 17 deletions
|
@ -830,7 +830,8 @@ void CoreWorker::Exit(
|
|||
task_execution_service_.post(
|
||||
[this, exit_type, creation_task_exception_pb_bytes]() {
|
||||
if (exit_type == rpc::WorkerExitType::CREATION_TASK_ERROR ||
|
||||
exit_type == rpc::WorkerExitType::INTENDED_EXIT) {
|
||||
exit_type == rpc::WorkerExitType::INTENDED_EXIT ||
|
||||
exit_type == rpc::WorkerExitType::IDLE_EXIT) {
|
||||
// Notify the raylet about this exit.
|
||||
// Only CREATION_TASK_ERROR, INTENDED_EXIT and IDLE_EXIT need to disconnect
|
||||
// manually.
|
||||
|
|
|
@ -711,13 +711,19 @@ void GcsActorManager::OnWorkerDead(
|
|||
const ray::NodeID &node_id, const ray::WorkerID &worker_id,
|
||||
const rpc::WorkerExitType disconnect_type,
|
||||
const std::shared_ptr<rpc::RayException> &creation_task_exception) {
|
||||
RAY_LOG(INFO) << "Worker " << worker_id << " on node " << node_id
|
||||
<< " exited, type=" << rpc::WorkerExitType_Name(disconnect_type)
|
||||
<< ", has creation_task_exception = "
|
||||
<< (creation_task_exception != nullptr);
|
||||
std::string message = absl::StrCat(
|
||||
"Worker ", worker_id.Hex(), " on node ", node_id.Hex(),
|
||||
" exits, type=", rpc::WorkerExitType_Name(disconnect_type),
|
||||
", has creation_task_exception = ", (creation_task_exception != nullptr));
|
||||
if (creation_task_exception != nullptr) {
|
||||
RAY_LOG(INFO) << "Formatted creation task exception: "
|
||||
<< creation_task_exception->formatted_exception_string();
|
||||
absl::StrAppend(&message, " Formatted creation task exception: ",
|
||||
creation_task_exception->formatted_exception_string());
|
||||
}
|
||||
if (disconnect_type == rpc::WorkerExitType::INTENDED_EXIT ||
|
||||
disconnect_type == rpc::WorkerExitType::IDLE_EXIT) {
|
||||
RAY_LOG(DEBUG) << message;
|
||||
} else {
|
||||
RAY_LOG(WARNING) << message;
|
||||
}
|
||||
|
||||
bool need_reconstruct = disconnect_type != rpc::WorkerExitType::INTENDED_EXIT &&
|
||||
|
|
|
@ -25,17 +25,16 @@ void GcsWorkerManager::HandleReportWorkerFailure(
|
|||
const rpc::Address worker_address = request.worker_failure().worker_address();
|
||||
const auto worker_id = WorkerID::FromBinary(worker_address.worker_id());
|
||||
const auto node_id = NodeID::FromBinary(worker_address.raylet_id());
|
||||
std::stringstream log_stream;
|
||||
log_stream << "Reporting worker failure, worker id = " << worker_id
|
||||
<< ", node id = " << node_id << ", address = " << worker_address.ip_address()
|
||||
<< ", exit_type = "
|
||||
<< rpc::WorkerExitType_Name(request.worker_failure().exit_type())
|
||||
<< ", has creation task exception = "
|
||||
<< request.worker_failure().has_creation_task_exception();
|
||||
if (request.worker_failure().exit_type() == rpc::WorkerExitType::INTENDED_EXIT) {
|
||||
RAY_LOG(INFO) << log_stream.str();
|
||||
std::string message = absl::StrCat(
|
||||
"Reporting worker exit, worker id = ", worker_id.Hex(),
|
||||
", node id = ", node_id.Hex(), ", address = ", worker_address.ip_address(),
|
||||
", exit_type = ", rpc::WorkerExitType_Name(request.worker_failure().exit_type()),
|
||||
request.worker_failure().has_creation_task_exception());
|
||||
if (request.worker_failure().exit_type() == rpc::WorkerExitType::INTENDED_EXIT ||
|
||||
request.worker_failure().exit_type() == rpc::WorkerExitType::IDLE_EXIT) {
|
||||
RAY_LOG(DEBUG) << message;
|
||||
} else {
|
||||
RAY_LOG(WARNING) << log_stream.str()
|
||||
RAY_LOG(WARNING) << message
|
||||
<< ". Unintentional worker failures have been reported. If there "
|
||||
"are lots of this logs, that might indicate there are "
|
||||
"unexpected failures in the cluster.";
|
||||
|
|
Loading…
Add table
Reference in a new issue