From a879302355f77e2d01e489ec3bf5452b167d6688 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Sat, 29 Sep 2018 22:10:57 +0800 Subject: [PATCH] Improve log message when failing to fork worker process (#2990) ## What do these changes do? ```c++ // Try to execute the worker command. int rv = execvp(worker_command_args[0], const_cast(worker_command_args.data())); // The worker failed to start. This is a fatal error. RAY_LOG(FATAL) << "Failed to start worker with return value " << rv; ``` When starting a process fails, the return value `rv` is always set to -1, so logging it alone is useless to us. The log message should show some meaningful information instead. For example, if Java is not installed, the message shown to us should be: ```shell Failed to start worker: No such file or directory. ``` This helps us locate the issue quickly. ## Related issue number N/A --- src/ray/raylet/worker_pool.cc | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc index e06743b5c..7ed4a1408 100644 --- a/src/ray/raylet/worker_pool.cc +++ b/src/ray/raylet/worker_pool.cc @@ -118,12 +118,18 @@ void WorkerPool::StartWorkerProcess(const Language &language) { // Launch the process to create the worker. pid_t pid = fork(); - if (pid != 0) { + if (pid < 0) { + // Failure case. + RAY_LOG(FATAL) << "Failed to fork worker process: " << strerror(errno); + return; + } else if (pid > 0) { + // Parent process case. RAY_LOG(DEBUG) << "Started worker process with pid " << pid; starting_worker_processes_.emplace(std::make_pair(pid, num_workers_per_process_)); return; } + // Child process case. // Reset the SIGCHLD handler for the worker. signal(SIGCHLD, SIG_DFL); @@ -138,7 +144,8 @@ void WorkerPool::StartWorkerProcess(const Language &language) { int rv = execvp(worker_command_args[0], const_cast(worker_command_args.data())); // The worker failed to start. This is a fatal error. 
- RAY_LOG(FATAL) << "Failed to start worker with return value " << rv; + RAY_LOG(FATAL) << "Failed to start worker with return value " << rv << ": " + << strerror(errno); } void WorkerPool::RegisterWorker(std::shared_ptr worker) {