mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
Tentative workaround for some forks and signals on Windows (#6362)
* Platform shims for Windows * Tentative workaround for some forks and signals on Windows * Rewrite WorkerPool::StartProcess by moving spawnvp wrapper to a separate function * Separate spawnvp the wrappers for POSIX and Windows * Fix rv use
This commit is contained in:
parent
8636d67b72
commit
74b2e871b7
9 changed files with 149 additions and 24 deletions
25
BUILD.bazel
25
BUILD.bazel
|
@ -748,6 +748,31 @@ cc_binary(
|
|||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "platform_shims",
|
||||
srcs = [] + select({
|
||||
"@bazel_tools//src/conditions:windows": glob([
|
||||
"src/shims/windows/**/*.c",
|
||||
"src/shims/windows/**/*.cc",
|
||||
]),
|
||||
"//conditions:default": [],
|
||||
}),
|
||||
hdrs = [] + select({
|
||||
"@bazel_tools//src/conditions:windows": glob([
|
||||
"src/shims/windows/**/*.h",
|
||||
]),
|
||||
"//conditions:default": [],
|
||||
}),
|
||||
copts = COPTS,
|
||||
includes = [] + select({
|
||||
"@bazel_tools//src/conditions:windows": [
|
||||
"src/shims/windows",
|
||||
],
|
||||
"//conditions:default": [],
|
||||
}),
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "ray_util",
|
||||
srcs = glob(
|
||||
|
|
|
@ -116,6 +116,7 @@ cc_library(
|
|||
":common_fbs",
|
||||
":plasma_fbs",
|
||||
"@com_github_google_glog//:glog",
|
||||
"@com_github_ray_project_ray//:platform_shims",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -210,6 +211,7 @@ cc_library(
|
|||
":ae",
|
||||
":plasma_client",
|
||||
"@com_github_google_glog//:glog",
|
||||
"@com_github_ray_project_ray//:platform_shims",
|
||||
],
|
||||
)
|
||||
|
||||
|
|
|
@ -28,5 +28,8 @@ cc_library(
|
|||
copts = COPTS,
|
||||
includes = ["deps"],
|
||||
strip_include_prefix = "deps",
|
||||
deps = [
|
||||
"@com_github_ray_project_ray//:platform_shims",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
|
|
@ -262,12 +262,16 @@ void CoreWorker::Disconnect() {
|
|||
}
|
||||
|
||||
void CoreWorker::RunIOService() {
|
||||
#ifdef _WIN32
|
||||
// TODO(mehrdadn): Is there an equivalent for Windows we need here?
|
||||
#else
|
||||
// Block SIGINT and SIGTERM so they will be handled by the main thread.
|
||||
sigset_t mask;
|
||||
sigemptyset(&mask);
|
||||
sigaddset(&mask, SIGINT);
|
||||
sigaddset(&mask, SIGTERM);
|
||||
pthread_sigmask(SIG_BLOCK, &mask, NULL);
|
||||
#endif
|
||||
|
||||
io_service_.run();
|
||||
}
|
||||
|
|
|
@ -266,8 +266,10 @@ void NodeManager::KillWorker(std::shared_ptr<Worker> worker) {
|
|||
retry_timer->expires_from_now(retry_duration);
|
||||
retry_timer->async_wait([retry_timer, worker](const boost::system::error_code &error) {
|
||||
RAY_LOG(DEBUG) << "Send SIGKILL to worker, pid=" << worker->Pid();
|
||||
// Force kill worker. TODO(rkn): Is there some small danger that the worker
|
||||
// has already died and the PID has been reassigned to a different process?
|
||||
// Force kill worker. TODO(mehrdadn, rkn): The worker may have already died
|
||||
// and had its PID reassigned to a different process, at least on Windows.
|
||||
// On Linux, this may or may not be the case, depending on e.g. whether
|
||||
// the process has been already waited on. Regardless, this must be fixed.
|
||||
kill(worker->Pid(), SIGKILL);
|
||||
});
|
||||
}
|
||||
|
|
|
@ -1,5 +1,10 @@
|
|||
#include "ray/raylet/worker_pool.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <Windows.h>
|
||||
#include <process.h>
|
||||
#endif
|
||||
|
||||
#include <sys/wait.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
@ -46,9 +51,13 @@ WorkerPool::WorkerPool(int num_workers, int maximum_startup_concurrency,
|
|||
: maximum_startup_concurrency_(maximum_startup_concurrency),
|
||||
gcs_client_(std::move(gcs_client)) {
|
||||
RAY_CHECK(maximum_startup_concurrency > 0);
|
||||
#ifdef _WIN32
|
||||
// TODO(mehrdadn): Is there an equivalent of this we need for Windows?
|
||||
#else
|
||||
// Ignore SIGCHLD signals. If we don't do this, then worker processes will
|
||||
// become zombies instead of dying gracefully.
|
||||
signal(SIGCHLD, SIG_IGN);
|
||||
#endif
|
||||
for (const auto &entry : worker_commands) {
|
||||
// Initialize the pool state for this language.
|
||||
auto &state = states_by_lang_[entry.first];
|
||||
|
@ -198,6 +207,56 @@ int WorkerPool::StartWorkerProcess(const Language &language,
|
|||
return -1;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// Fork + exec combo for Windows. Returns -1 on failure.
|
||||
// TODO(mehrdadn): This is dangerous on Windows.
|
||||
// We need to keep the actual process handle alive for the PID to stay valid.
|
||||
// Make this change as soon as possible, or the PID may refer to the wrong process.
|
||||
static pid_t spawnvp_wrapper(std::vector<std::string> const &args) {
|
||||
pid_t pid;
|
||||
std::vector<const char *> str_args;
|
||||
for (const auto &arg : args) {
|
||||
str_args.push_back(arg.c_str());
|
||||
}
|
||||
str_args.push_back(NULL);
|
||||
HANDLE handle = (HANDLE)spawnvp(P_NOWAIT, str_args[0], str_args.data());
|
||||
if (handle != INVALID_HANDLE_VALUE) {
|
||||
pid = static_cast<pid_t>(GetProcessId(handle));
|
||||
if (pid == 0) {
|
||||
pid = -1;
|
||||
}
|
||||
CloseHandle(handle);
|
||||
} else {
|
||||
pid = -1;
|
||||
errno = EINVAL;
|
||||
}
|
||||
return pid;
|
||||
}
|
||||
#else
|
||||
// Fork + exec combo for POSIX. Returns -1 on failure.
|
||||
static pid_t spawnvp_wrapper(std::vector<std::string> const &args) {
|
||||
pid_t pid;
|
||||
std::vector<const char *> str_args;
|
||||
for (const auto &arg : args) {
|
||||
str_args.push_back(arg.c_str());
|
||||
}
|
||||
str_args.push_back(NULL);
|
||||
pid = fork();
|
||||
if (pid == 0) {
|
||||
// Child process case.
|
||||
// Reset the SIGCHLD handler for the worker.
|
||||
// TODO(mehrdadn): Move any work here to the child process itself
|
||||
// so that it can also be implemented on Windows.
|
||||
signal(SIGCHLD, SIG_DFL);
|
||||
if (execvp(str_args[0], const_cast<char *const *>(str_args.data())) == -1) {
|
||||
pid = -1;
|
||||
abort(); // fork() succeeded but exec() failed, so abort the child
|
||||
}
|
||||
}
|
||||
return pid;
|
||||
}
|
||||
#endif
|
||||
|
||||
pid_t WorkerPool::StartProcess(const std::vector<std::string> &worker_command_args) {
|
||||
if (RAY_LOG_ENABLED(DEBUG)) {
|
||||
std::stringstream stream;
|
||||
|
@ -209,29 +268,12 @@ pid_t WorkerPool::StartProcess(const std::vector<std::string> &worker_command_ar
|
|||
}
|
||||
|
||||
// Launch the process to create the worker.
|
||||
pid_t pid = fork();
|
||||
|
||||
if (pid != 0) {
|
||||
return pid;
|
||||
pid_t pid = spawnvp_wrapper(worker_command_args);
|
||||
if (pid == -1) {
|
||||
RAY_LOG(FATAL) << "Failed to start worker with error " << errno << ": "
|
||||
<< strerror(errno);
|
||||
}
|
||||
|
||||
// Child process case.
|
||||
// Reset the SIGCHLD handler for the worker.
|
||||
signal(SIGCHLD, SIG_DFL);
|
||||
|
||||
// Try to execute the worker command.
|
||||
std::vector<const char *> worker_command_args_str;
|
||||
for (const auto &arg : worker_command_args) {
|
||||
worker_command_args_str.push_back(arg.c_str());
|
||||
}
|
||||
worker_command_args_str.push_back(nullptr);
|
||||
int rv = execvp(worker_command_args_str[0],
|
||||
const_cast<char *const *>(worker_command_args_str.data()));
|
||||
|
||||
// The worker failed to start. This is a fatal error.
|
||||
RAY_LOG(FATAL) << "Failed to start worker with return value " << rv << ": "
|
||||
<< strerror(errno);
|
||||
return 0;
|
||||
return pid;
|
||||
}
|
||||
|
||||
Status WorkerPool::RegisterWorker(const std::shared_ptr<Worker> &worker) {
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
|
||||
// This test just print some call stack information.
|
||||
namespace ray {
|
||||
#ifndef _WIN32
|
||||
|
||||
void Sleep() { usleep(100000); }
|
||||
|
||||
|
@ -77,6 +78,7 @@ TEST(SignalTest, SIGILL_Test) {
|
|||
}
|
||||
}
|
||||
|
||||
#endif // !_WIN32
|
||||
} // namespace ray
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
|
28
src/shims/windows/unistd.cc
Normal file
28
src/shims/windows/unistd.cc
Normal file
|
@ -0,0 +1,28 @@
|
|||
#include <unistd.h>
|
||||
|
||||
#ifndef WIN32_LEAN_AND_MEAN
|
||||
#define WIN32_LEAN_AND_MEAN 1
|
||||
#endif
|
||||
#include <Windows.h>
|
||||
|
||||
int kill(pid_t pid, int sig) {
|
||||
int result;
|
||||
if (HANDLE process = OpenProcess(PROCESS_TERMINATE, FALSE, pid)) {
|
||||
if (sig == SIGKILL) {
|
||||
if (TerminateProcess(process, ERROR_PROCESS_ABORTED)) {
|
||||
result = 0;
|
||||
} else {
|
||||
result = -1;
|
||||
errno = EPERM;
|
||||
}
|
||||
} else {
|
||||
result = -1;
|
||||
errno = EINVAL;
|
||||
}
|
||||
CloseHandle(process);
|
||||
} else {
|
||||
result = -1;
|
||||
errno = ESRCH;
|
||||
}
|
||||
return result;
|
||||
}
|
17
src/shims/windows/unistd.h
Normal file
17
src/shims/windows/unistd.h
Normal file
|
@ -0,0 +1,17 @@
|
|||
#ifndef UNISTD_H
|
||||
#define UNISTD_H
|
||||
|
||||
typedef int pid_t /* technically unsigned on Windows, but no practical concern */;
|
||||
enum { SIGKILL = 9 };
|
||||
|
||||
__declspec(
|
||||
deprecated("Killing a process by ID has an inherent race condition on Windows"
|
||||
" and is HIGHLY discouraged. "
|
||||
"Furthermore, signals other than SIGKILL are NOT portable. "
|
||||
"Please use a wrapper that keeps the process handle alive"
|
||||
" and terminates it directly as needed. "
|
||||
"For SIGTERM or other signals, a different IPC mechanism may be"
|
||||
" more appropriate (such as window messages on Windows)."
|
||||
"")) int kill(pid_t pid, int sig);
|
||||
|
||||
#endif /* UNISTD_H */
|
Loading…
Add table
Reference in a new issue