mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
Tentative workaround for some forks and signals on Windows (#6362)
* Platform shims for Windows * Tentative workaround for some forks and signals on Windows * Rewrite WorkerPool::StartProcess by moving spawnvp wrapper to a separate function * Separate spawnvp the wrappers for POSIX and Windows * Fix rv use
This commit is contained in:
parent
8636d67b72
commit
74b2e871b7
9 changed files with 149 additions and 24 deletions
25
BUILD.bazel
25
BUILD.bazel
|
@ -748,6 +748,31 @@ cc_binary(
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "platform_shims",
|
||||||
|
srcs = [] + select({
|
||||||
|
"@bazel_tools//src/conditions:windows": glob([
|
||||||
|
"src/shims/windows/**/*.c",
|
||||||
|
"src/shims/windows/**/*.cc",
|
||||||
|
]),
|
||||||
|
"//conditions:default": [],
|
||||||
|
}),
|
||||||
|
hdrs = [] + select({
|
||||||
|
"@bazel_tools//src/conditions:windows": glob([
|
||||||
|
"src/shims/windows/**/*.h",
|
||||||
|
]),
|
||||||
|
"//conditions:default": [],
|
||||||
|
}),
|
||||||
|
copts = COPTS,
|
||||||
|
includes = [] + select({
|
||||||
|
"@bazel_tools//src/conditions:windows": [
|
||||||
|
"src/shims/windows",
|
||||||
|
],
|
||||||
|
"//conditions:default": [],
|
||||||
|
}),
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "ray_util",
|
name = "ray_util",
|
||||||
srcs = glob(
|
srcs = glob(
|
||||||
|
|
|
@ -116,6 +116,7 @@ cc_library(
|
||||||
":common_fbs",
|
":common_fbs",
|
||||||
":plasma_fbs",
|
":plasma_fbs",
|
||||||
"@com_github_google_glog//:glog",
|
"@com_github_google_glog//:glog",
|
||||||
|
"@com_github_ray_project_ray//:platform_shims",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -210,6 +211,7 @@ cc_library(
|
||||||
":ae",
|
":ae",
|
||||||
":plasma_client",
|
":plasma_client",
|
||||||
"@com_github_google_glog//:glog",
|
"@com_github_google_glog//:glog",
|
||||||
|
"@com_github_ray_project_ray//:platform_shims",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -28,5 +28,8 @@ cc_library(
|
||||||
copts = COPTS,
|
copts = COPTS,
|
||||||
includes = ["deps"],
|
includes = ["deps"],
|
||||||
strip_include_prefix = "deps",
|
strip_include_prefix = "deps",
|
||||||
|
deps = [
|
||||||
|
"@com_github_ray_project_ray//:platform_shims",
|
||||||
|
],
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
)
|
)
|
||||||
|
|
|
@ -262,12 +262,16 @@ void CoreWorker::Disconnect() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void CoreWorker::RunIOService() {
|
void CoreWorker::RunIOService() {
|
||||||
|
#ifdef _WIN32
|
||||||
|
// TODO(mehrdadn): Is there an equivalent for Windows we need here?
|
||||||
|
#else
|
||||||
// Block SIGINT and SIGTERM so they will be handled by the main thread.
|
// Block SIGINT and SIGTERM so they will be handled by the main thread.
|
||||||
sigset_t mask;
|
sigset_t mask;
|
||||||
sigemptyset(&mask);
|
sigemptyset(&mask);
|
||||||
sigaddset(&mask, SIGINT);
|
sigaddset(&mask, SIGINT);
|
||||||
sigaddset(&mask, SIGTERM);
|
sigaddset(&mask, SIGTERM);
|
||||||
pthread_sigmask(SIG_BLOCK, &mask, NULL);
|
pthread_sigmask(SIG_BLOCK, &mask, NULL);
|
||||||
|
#endif
|
||||||
|
|
||||||
io_service_.run();
|
io_service_.run();
|
||||||
}
|
}
|
||||||
|
|
|
@ -266,8 +266,10 @@ void NodeManager::KillWorker(std::shared_ptr<Worker> worker) {
|
||||||
retry_timer->expires_from_now(retry_duration);
|
retry_timer->expires_from_now(retry_duration);
|
||||||
retry_timer->async_wait([retry_timer, worker](const boost::system::error_code &error) {
|
retry_timer->async_wait([retry_timer, worker](const boost::system::error_code &error) {
|
||||||
RAY_LOG(DEBUG) << "Send SIGKILL to worker, pid=" << worker->Pid();
|
RAY_LOG(DEBUG) << "Send SIGKILL to worker, pid=" << worker->Pid();
|
||||||
// Force kill worker. TODO(rkn): Is there some small danger that the worker
|
// Force kill worker. TODO(mehrdadn, rkn): The worker may have already died
|
||||||
// has already died and the PID has been reassigned to a different process?
|
// and had its PID reassigned to a different process, at least on Windows.
|
||||||
|
// On Linux, this may or may not be the case, depending on e.g. whether
|
||||||
|
// the process has been already waited on. Regardless, this must be fixed.
|
||||||
kill(worker->Pid(), SIGKILL);
|
kill(worker->Pid(), SIGKILL);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
#include "ray/raylet/worker_pool.h"
|
#include "ray/raylet/worker_pool.h"
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
#include <Windows.h>
|
||||||
|
#include <process.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <sys/wait.h>
|
#include <sys/wait.h>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
@ -46,9 +51,13 @@ WorkerPool::WorkerPool(int num_workers, int maximum_startup_concurrency,
|
||||||
: maximum_startup_concurrency_(maximum_startup_concurrency),
|
: maximum_startup_concurrency_(maximum_startup_concurrency),
|
||||||
gcs_client_(std::move(gcs_client)) {
|
gcs_client_(std::move(gcs_client)) {
|
||||||
RAY_CHECK(maximum_startup_concurrency > 0);
|
RAY_CHECK(maximum_startup_concurrency > 0);
|
||||||
|
#ifdef _WIN32
|
||||||
|
// TODO(mehrdadn): Is there an equivalent of this we need for Windows?
|
||||||
|
#else
|
||||||
// Ignore SIGCHLD signals. If we don't do this, then worker processes will
|
// Ignore SIGCHLD signals. If we don't do this, then worker processes will
|
||||||
// become zombies instead of dying gracefully.
|
// become zombies instead of dying gracefully.
|
||||||
signal(SIGCHLD, SIG_IGN);
|
signal(SIGCHLD, SIG_IGN);
|
||||||
|
#endif
|
||||||
for (const auto &entry : worker_commands) {
|
for (const auto &entry : worker_commands) {
|
||||||
// Initialize the pool state for this language.
|
// Initialize the pool state for this language.
|
||||||
auto &state = states_by_lang_[entry.first];
|
auto &state = states_by_lang_[entry.first];
|
||||||
|
@ -198,6 +207,56 @@ int WorkerPool::StartWorkerProcess(const Language &language,
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
// Fork + exec combo for Windows. Returns -1 on failure.
|
||||||
|
// TODO(mehrdadn): This is dangerous on Windows.
|
||||||
|
// We need to keep the actual process handle alive for the PID to stay valid.
|
||||||
|
// Make this change as soon as possible, or the PID may refer to the wrong process.
|
||||||
|
static pid_t spawnvp_wrapper(std::vector<std::string> const &args) {
|
||||||
|
pid_t pid;
|
||||||
|
std::vector<const char *> str_args;
|
||||||
|
for (const auto &arg : args) {
|
||||||
|
str_args.push_back(arg.c_str());
|
||||||
|
}
|
||||||
|
str_args.push_back(NULL);
|
||||||
|
HANDLE handle = (HANDLE)spawnvp(P_NOWAIT, str_args[0], str_args.data());
|
||||||
|
if (handle != INVALID_HANDLE_VALUE) {
|
||||||
|
pid = static_cast<pid_t>(GetProcessId(handle));
|
||||||
|
if (pid == 0) {
|
||||||
|
pid = -1;
|
||||||
|
}
|
||||||
|
CloseHandle(handle);
|
||||||
|
} else {
|
||||||
|
pid = -1;
|
||||||
|
errno = EINVAL;
|
||||||
|
}
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// Fork + exec combo for POSIX. Returns -1 on failure.
|
||||||
|
static pid_t spawnvp_wrapper(std::vector<std::string> const &args) {
|
||||||
|
pid_t pid;
|
||||||
|
std::vector<const char *> str_args;
|
||||||
|
for (const auto &arg : args) {
|
||||||
|
str_args.push_back(arg.c_str());
|
||||||
|
}
|
||||||
|
str_args.push_back(NULL);
|
||||||
|
pid = fork();
|
||||||
|
if (pid == 0) {
|
||||||
|
// Child process case.
|
||||||
|
// Reset the SIGCHLD handler for the worker.
|
||||||
|
// TODO(mehrdadn): Move any work here to the child process itself
|
||||||
|
// so that it can also be implemented on Windows.
|
||||||
|
signal(SIGCHLD, SIG_DFL);
|
||||||
|
if (execvp(str_args[0], const_cast<char *const *>(str_args.data())) == -1) {
|
||||||
|
pid = -1;
|
||||||
|
abort(); // fork() succeeded but exec() failed, so abort the child
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
pid_t WorkerPool::StartProcess(const std::vector<std::string> &worker_command_args) {
|
pid_t WorkerPool::StartProcess(const std::vector<std::string> &worker_command_args) {
|
||||||
if (RAY_LOG_ENABLED(DEBUG)) {
|
if (RAY_LOG_ENABLED(DEBUG)) {
|
||||||
std::stringstream stream;
|
std::stringstream stream;
|
||||||
|
@ -209,29 +268,12 @@ pid_t WorkerPool::StartProcess(const std::vector<std::string> &worker_command_ar
|
||||||
}
|
}
|
||||||
|
|
||||||
// Launch the process to create the worker.
|
// Launch the process to create the worker.
|
||||||
pid_t pid = fork();
|
pid_t pid = spawnvp_wrapper(worker_command_args);
|
||||||
|
if (pid == -1) {
|
||||||
if (pid != 0) {
|
RAY_LOG(FATAL) << "Failed to start worker with error " << errno << ": "
|
||||||
return pid;
|
<< strerror(errno);
|
||||||
}
|
}
|
||||||
|
return pid;
|
||||||
// Child process case.
|
|
||||||
// Reset the SIGCHLD handler for the worker.
|
|
||||||
signal(SIGCHLD, SIG_DFL);
|
|
||||||
|
|
||||||
// Try to execute the worker command.
|
|
||||||
std::vector<const char *> worker_command_args_str;
|
|
||||||
for (const auto &arg : worker_command_args) {
|
|
||||||
worker_command_args_str.push_back(arg.c_str());
|
|
||||||
}
|
|
||||||
worker_command_args_str.push_back(nullptr);
|
|
||||||
int rv = execvp(worker_command_args_str[0],
|
|
||||||
const_cast<char *const *>(worker_command_args_str.data()));
|
|
||||||
|
|
||||||
// The worker failed to start. This is a fatal error.
|
|
||||||
RAY_LOG(FATAL) << "Failed to start worker with return value " << rv << ": "
|
|
||||||
<< strerror(errno);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Status WorkerPool::RegisterWorker(const std::shared_ptr<Worker> &worker) {
|
Status WorkerPool::RegisterWorker(const std::shared_ptr<Worker> &worker) {
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
|
|
||||||
// This test just print some call stack information.
|
// This test just print some call stack information.
|
||||||
namespace ray {
|
namespace ray {
|
||||||
|
#ifndef _WIN32
|
||||||
|
|
||||||
void Sleep() { usleep(100000); }
|
void Sleep() { usleep(100000); }
|
||||||
|
|
||||||
|
@ -77,6 +78,7 @@ TEST(SignalTest, SIGILL_Test) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // !_WIN32
|
||||||
} // namespace ray
|
} // namespace ray
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
|
|
28
src/shims/windows/unistd.cc
Normal file
28
src/shims/windows/unistd.cc
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#ifndef WIN32_LEAN_AND_MEAN
|
||||||
|
#define WIN32_LEAN_AND_MEAN 1
|
||||||
|
#endif
|
||||||
|
#include <Windows.h>
|
||||||
|
|
||||||
|
int kill(pid_t pid, int sig) {
|
||||||
|
int result;
|
||||||
|
if (HANDLE process = OpenProcess(PROCESS_TERMINATE, FALSE, pid)) {
|
||||||
|
if (sig == SIGKILL) {
|
||||||
|
if (TerminateProcess(process, ERROR_PROCESS_ABORTED)) {
|
||||||
|
result = 0;
|
||||||
|
} else {
|
||||||
|
result = -1;
|
||||||
|
errno = EPERM;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
result = -1;
|
||||||
|
errno = EINVAL;
|
||||||
|
}
|
||||||
|
CloseHandle(process);
|
||||||
|
} else {
|
||||||
|
result = -1;
|
||||||
|
errno = ESRCH;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
17
src/shims/windows/unistd.h
Normal file
17
src/shims/windows/unistd.h
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
#ifndef UNISTD_H
|
||||||
|
#define UNISTD_H
|
||||||
|
|
||||||
|
typedef int pid_t /* technically unsigned on Windows, but no practical concern */;
|
||||||
|
enum { SIGKILL = 9 };
|
||||||
|
|
||||||
|
__declspec(
|
||||||
|
deprecated("Killing a process by ID has an inherent race condition on Windows"
|
||||||
|
" and is HIGHLY discouraged. "
|
||||||
|
"Furthermore, signals other than SIGKILL are NOT portable. "
|
||||||
|
"Please use a wrapper that keeps the process handle alive"
|
||||||
|
" and terminates it directly as needed. "
|
||||||
|
"For SIGTERM or other signals, a different IPC mechanism may be"
|
||||||
|
" more appropriate (such as window messages on Windows)."
|
||||||
|
"")) int kill(pid_t pid, int sig);
|
||||||
|
|
||||||
|
#endif /* UNISTD_H */
|
Loading…
Add table
Reference in a new issue