Better errors on process deaths (#3252)

This commit is contained in:
Eric Liang 2018-11-07 14:08:16 -08:00 committed by GitHub
parent 1dd5d92789
commit 29e3362905
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 21 additions and 7 deletions

View file

@ -25,7 +25,7 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
feather-format lxml openpyxl xlrd py-spy setproctitle
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
sudo apt-get update
sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool unzip
@ -51,7 +51,7 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
feather-format lxml openpyxl xlrd py-spy setproctitle
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
# check that brew is installed
which -s brew

View file

@ -5,6 +5,7 @@ from __future__ import print_function
from contextlib import contextmanager
import atexit
import colorama
import faulthandler
import hashlib
import inspect
import logging
@ -404,7 +405,8 @@ class Worker(object):
invalid_error = RayTaskError(
"<unknown>", None,
"Invalid return value: likely worker died or was killed "
"while executing the task.")
"while executing the task; check previous logs or dmesg "
"for errors.")
return [invalid_error] * len(object_ids)
except pyarrow.DeserializationCallbackError:
# Wait a little bit for the import thread to import the class.
@ -1850,6 +1852,9 @@ def connect(info,
assert not worker.connected, error_message
assert worker.cached_functions_to_run is not None, error_message
# Dump the Python traceback of the current thread on fatal signals
# (SIGSEGV, SIGFPE, SIGABRT, SIGBUS, SIGILL).
faulthandler.enable(all_threads=False)
# Initialize some fields.
if mode is WORKER_MODE:
worker.worker_id = random_string()

View file

@ -151,6 +151,7 @@ setup(
"pytest",
"pyyaml",
"redis",
"faulthandler;python_version<'3'",
"setproctitle",
# The six module is required by pyarrow.
"six >= 1.0.0",

View file

@ -47,7 +47,9 @@ void ObjectStoreNotificationManager::ProcessStoreLength(
void ObjectStoreNotificationManager::ProcessStoreNotification(
const boost::system::error_code &error) {
if (error.value() != boost::system::errc::success) {
RAY_LOG(FATAL) << boost_to_ray_status(error).ToString();
RAY_LOG(FATAL)
<< "Problem communicating with the object store from raylet, check logs or "
<< "dmesg for previous errors: " << boost_to_ray_status(error).ToString();
}
const auto &object_info =

View file

@ -258,7 +258,10 @@ ray::raylet::TaskSpecification *local_scheduler_get_task_raylet(
RAY_LOG(DEBUG) << "Exiting because local scheduler closed connection.";
exit(1);
}
RAY_CHECK(type == static_cast<int64_t>(MessageType::ExecuteTask));
if (type != static_cast<int64_t>(MessageType::ExecuteTask)) {
RAY_LOG(FATAL) << "Problem communicating with raylet from worker: check logs or "
"dmesg for previous errors.";
}
// Parse the flatbuffer object.
auto reply_message = flatbuffers::GetRoot<ray::protocol::GetTaskReply>(reply);
@ -338,8 +341,11 @@ std::pair<std::vector<ObjectID>, std::vector<ObjectID>> local_scheduler_wait(
// Read result.
read_message(conn->conn, &type, &reply_size, &reply);
}
RAY_CHECK(static_cast<ray::protocol::MessageType>(type) ==
ray::protocol::MessageType::WaitReply);
if (static_cast<ray::protocol::MessageType>(type) !=
ray::protocol::MessageType::WaitReply) {
RAY_LOG(FATAL) << "Problem communicating with raylet from worker: check logs or "
"dmesg for previous errors.";
}
auto reply_message = flatbuffers::GetRoot<ray::protocol::WaitReply>(reply);
// Convert result.
std::pair<std::vector<ObjectID>, std::vector<ObjectID>> result;