[core] Fix the wrong error message in gcs for worker exits (#19774)

This commit is contained in:
Yi Cheng 2021-10-27 12:55:27 -07:00 committed by GitHub
parent aa5499ef0f
commit 98961d1ee2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 23 additions and 17 deletions

View file

@ -830,7 +830,8 @@ void CoreWorker::Exit(
task_execution_service_.post(
[this, exit_type, creation_task_exception_pb_bytes]() {
if (exit_type == rpc::WorkerExitType::CREATION_TASK_ERROR ||
exit_type == rpc::WorkerExitType::INTENDED_EXIT) {
exit_type == rpc::WorkerExitType::INTENDED_EXIT ||
exit_type == rpc::WorkerExitType::IDLE_EXIT) {
// Notify the raylet about this exit.
// Only CREATION_TASK_ERROR, INTENDED_EXIT and IDLE_EXIT need to disconnect
// manually.

View file

@ -711,13 +711,19 @@ void GcsActorManager::OnWorkerDead(
const ray::NodeID &node_id, const ray::WorkerID &worker_id,
const rpc::WorkerExitType disconnect_type,
const std::shared_ptr<rpc::RayException> &creation_task_exception) {
RAY_LOG(INFO) << "Worker " << worker_id << " on node " << node_id
<< " exited, type=" << rpc::WorkerExitType_Name(disconnect_type)
<< ", has creation_task_exception = "
<< (creation_task_exception != nullptr);
std::string message = absl::StrCat(
"Worker ", worker_id.Hex(), " on node ", node_id.Hex(),
" exits, type=", rpc::WorkerExitType_Name(disconnect_type),
", has creation_task_exception = ", (creation_task_exception != nullptr));
if (creation_task_exception != nullptr) {
RAY_LOG(INFO) << "Formatted creation task exception: "
<< creation_task_exception->formatted_exception_string();
absl::StrAppend(&message, " Formatted creation task exception: ",
creation_task_exception->formatted_exception_string());
}
if (disconnect_type == rpc::WorkerExitType::INTENDED_EXIT ||
disconnect_type == rpc::WorkerExitType::IDLE_EXIT) {
RAY_LOG(DEBUG) << message;
} else {
RAY_LOG(WARNING) << message;
}
bool need_reconstruct = disconnect_type != rpc::WorkerExitType::INTENDED_EXIT &&

View file

@ -25,17 +25,16 @@ void GcsWorkerManager::HandleReportWorkerFailure(
const rpc::Address worker_address = request.worker_failure().worker_address();
const auto worker_id = WorkerID::FromBinary(worker_address.worker_id());
const auto node_id = NodeID::FromBinary(worker_address.raylet_id());
std::stringstream log_stream;
log_stream << "Reporting worker failure, worker id = " << worker_id
<< ", node id = " << node_id << ", address = " << worker_address.ip_address()
<< ", exit_type = "
<< rpc::WorkerExitType_Name(request.worker_failure().exit_type())
<< ", has creation task exception = "
<< request.worker_failure().has_creation_task_exception();
if (request.worker_failure().exit_type() == rpc::WorkerExitType::INTENDED_EXIT) {
RAY_LOG(INFO) << log_stream.str();
std::string message = absl::StrCat(
"Reporting worker exit, worker id = ", worker_id.Hex(),
", node id = ", node_id.Hex(), ", address = ", worker_address.ip_address(),
", exit_type = ", rpc::WorkerExitType_Name(request.worker_failure().exit_type()),
request.worker_failure().has_creation_task_exception());
if (request.worker_failure().exit_type() == rpc::WorkerExitType::INTENDED_EXIT ||
request.worker_failure().exit_type() == rpc::WorkerExitType::IDLE_EXIT) {
RAY_LOG(DEBUG) << message;
} else {
RAY_LOG(WARNING) << log_stream.str()
RAY_LOG(WARNING) << message
<< ". Unintentional worker failures have been reported. If there "
"are lots of this logs, that might indicate there are "
"unexpected failures in the cluster.";