mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
Better errors on process deaths (#3252)
This commit is contained in:
parent
1dd5d92789
commit
29e3362905
5 changed files with 21 additions and 7 deletions
|
@ -25,7 +25,7 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then
|
|||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler
|
||||
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool unzip
|
||||
|
@ -51,7 +51,7 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
|
|||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler
|
||||
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
|
||||
# check that brew is installed
|
||||
which -s brew
|
||||
|
|
|
@ -5,6 +5,7 @@ from __future__ import print_function
|
|||
from contextlib import contextmanager
|
||||
import atexit
|
||||
import colorama
|
||||
import faulthandler
|
||||
import hashlib
|
||||
import inspect
|
||||
import logging
|
||||
|
@ -404,7 +405,8 @@ class Worker(object):
|
|||
invalid_error = RayTaskError(
|
||||
"<unknown>", None,
|
||||
"Invalid return value: likely worker died or was killed "
|
||||
"while executing the task.")
|
||||
"while executing the task; check previous logs or dmesg "
|
||||
"for errors.")
|
||||
return [invalid_error] * len(object_ids)
|
||||
except pyarrow.DeserializationCallbackError:
|
||||
# Wait a little bit for the import thread to import the class.
|
||||
|
@ -1850,6 +1852,9 @@ def connect(info,
|
|||
assert not worker.connected, error_message
|
||||
assert worker.cached_functions_to_run is not None, error_message
|
||||
|
||||
# Enable nice stack traces on SIGSEGV etc.
|
||||
faulthandler.enable(all_threads=False)
|
||||
|
||||
# Initialize some fields.
|
||||
if mode is WORKER_MODE:
|
||||
worker.worker_id = random_string()
|
||||
|
|
|
@ -151,6 +151,7 @@ setup(
|
|||
"pytest",
|
||||
"pyyaml",
|
||||
"redis",
|
||||
"faulthandler;python_version<'3'",
|
||||
"setproctitle",
|
||||
# The six module is required by pyarrow.
|
||||
"six >= 1.0.0",
|
||||
|
|
|
@ -47,7 +47,9 @@ void ObjectStoreNotificationManager::ProcessStoreLength(
|
|||
void ObjectStoreNotificationManager::ProcessStoreNotification(
|
||||
const boost::system::error_code &error) {
|
||||
if (error.value() != boost::system::errc::success) {
|
||||
RAY_LOG(FATAL) << boost_to_ray_status(error).ToString();
|
||||
RAY_LOG(FATAL)
|
||||
<< "Problem communicating with the object store from raylet, check logs or "
|
||||
<< "dmesg for previous errors: " << boost_to_ray_status(error).ToString();
|
||||
}
|
||||
|
||||
const auto &object_info =
|
||||
|
|
|
@ -258,7 +258,10 @@ ray::raylet::TaskSpecification *local_scheduler_get_task_raylet(
|
|||
RAY_LOG(DEBUG) << "Exiting because local scheduler closed connection.";
|
||||
exit(1);
|
||||
}
|
||||
RAY_CHECK(type == static_cast<int64_t>(MessageType::ExecuteTask));
|
||||
if (type != static_cast<int64_t>(MessageType::ExecuteTask)) {
|
||||
RAY_LOG(FATAL) << "Problem communicating with raylet from worker: check logs or "
|
||||
"dmesg for previous errors.";
|
||||
}
|
||||
|
||||
// Parse the flatbuffer object.
|
||||
auto reply_message = flatbuffers::GetRoot<ray::protocol::GetTaskReply>(reply);
|
||||
|
@ -338,8 +341,11 @@ std::pair<std::vector<ObjectID>, std::vector<ObjectID>> local_scheduler_wait(
|
|||
// Read result.
|
||||
read_message(conn->conn, &type, &reply_size, &reply);
|
||||
}
|
||||
RAY_CHECK(static_cast<ray::protocol::MessageType>(type) ==
|
||||
ray::protocol::MessageType::WaitReply);
|
||||
if (static_cast<ray::protocol::MessageType>(type) !=
|
||||
ray::protocol::MessageType::WaitReply) {
|
||||
RAY_LOG(FATAL) << "Problem communicating with raylet from worker: check logs or "
|
||||
"dmesg for previous errors.";
|
||||
}
|
||||
auto reply_message = flatbuffers::GetRoot<ray::protocol::WaitReply>(reply);
|
||||
// Convert result.
|
||||
std::pair<std::vector<ObjectID>, std::vector<ObjectID>> result;
|
||||
|
|
Loading…
Add table
Reference in a new issue