[Runtime Env] Plumbing runtime env failure error message to the exception: Task [1/3] (#22032)

This PR is the first step toward a more informative runtime env exception. After all three PRs are merged, we can entirely turn off the runtime env logs streamed to drivers.

This first PR handles only the task exception; a sketch of the user-visible change follows the checklist below.

TODO
- [x] Task (this PR)
- [ ] Actor
- [ ] Turn off runtime env logs & improve error msgs
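Roughly, the user-visible effect of this first PR is that a task whose runtime env fails to be set up now raises `RuntimeEnvSetupError` carrying the agent's error message, instead of a generic one-liner. A minimal sketch mirroring the updated `test_invalid_conda_env` below; the exact text after the "Failed to setup runtime environment." header depends on the conda failure:

```python
import ray
from ray.exceptions import RuntimeEnvSetupError

ray.init()

@ray.remote
def f():
    return 1

# A conda env with a nonexistent package fails to build.
bad_env = {"conda": {"dependencies": ["this_doesnt_exist"]}}

try:
    ray.get(f.options(runtime_env=bad_env).remote())
except RuntimeEnvSetupError as e:
    # Previously: "The runtime_env failed to be set up."
    # Now the message starts with "Failed to setup runtime environment."
    # and includes the agent-side error (e.g. conda's "ResolvePackageNotFound").
    print(e)
```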
SangBin Cho 2022-02-04 09:47:04 +09:00 committed by GitHub
parent dd935874ee
commit d7fc7d2e9d
20 changed files with 227 additions and 115 deletions


@ -156,7 +156,7 @@ def test_runtime_env_setup_failure(job_sdk_client):
wait_for_condition(_check_job_failed, client=client, job_id=job_id)
status = client.get_job_status(job_id)
assert "The runtime_env failed to be set up" in status.message
assert "Failed to setup runtime environment" in status.message
def test_submit_job_with_exception_in_driver(job_sdk_client):


@ -5,6 +5,7 @@ import json
import logging
import os
import time
import traceback
from typing import Dict, Set
from ray._private.utils import import_attr
@ -261,9 +262,10 @@ class RuntimeEnvAgent(
serialized_env, request.serialized_allocated_resource_instances
)
break
except Exception as ex:
self._logger.exception("Runtime env creation failed.")
error_message = str(ex)
except Exception:
err_msg = f"Failed to create runtime env {serialized_env}."
self._logger.exception(err_msg)
error_message = f"{err_msg}\n{traceback.format_exc()}"
await asyncio.sleep(
runtime_env_consts.RUNTIME_ENV_RETRY_INTERVAL_MS / 1000
)
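For context, the agent hunk above keeps the existing retry loop but now captures the full traceback into the error message it later reports back to the raylet. A minimal sketch of that pattern in isolation (the `create_env` callable and retry constants here are placeholders, not the agent's real API):

```python
import asyncio
import traceback

async def create_with_retries(create_env, serialized_env, num_retries=3,
                              retry_interval_s=1.0):
    """Retry env creation; keep the last traceback as the reported error."""
    error_message = None
    for _ in range(num_retries):
        try:
            return await create_env(serialized_env), None
        except Exception:
            err_msg = f"Failed to create runtime env {serialized_env}."
            # The real agent also logs this via self._logger.exception(err_msg).
            error_message = f"{err_msg}\n{traceback.format_exc()}"
            await asyncio.sleep(retry_interval_s)
    # After exhausting retries, error_message is sent back in the RPC reply.
    return None, error_message
```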


@ -110,8 +110,11 @@ def create_conda_env_if_needed(
logger.info(f"Creating conda environment {prefix}")
exit_code, output = exec_cmd_stream_to_logger(create_cmd, logger)
if exit_code != 0:
shutil.rmtree(prefix)
raise RuntimeError(f"Failed to install conda environment {prefix}:\n{output}")
if os.path.exists(prefix):
shutil.rmtree(prefix)
raise RuntimeError(
f"Failed to install conda environment {prefix}:\nOutput:\n{output}"
)
def delete_conda_env(prefix: str, logger: Optional[logging.Logger] = None) -> bool:
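The conda hunk above makes two changes: it only removes the prefix directory if it actually exists, and it includes the command output in the raised error. A rough standalone sketch of the same clean-up-and-raise pattern, using plain `subprocess` rather than Ray's internal exec helper:

```python
import os
import shutil
import subprocess

def create_env_or_raise(create_cmd, prefix):
    proc = subprocess.run(create_cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        # Remove the partially created environment, if any.
        if os.path.exists(prefix):
            shutil.rmtree(prefix)
        raise RuntimeError(
            f"Failed to install conda environment {prefix}:\n"
            f"Output:\n{proc.stdout}{proc.stderr}"
        )
```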


@ -522,10 +522,21 @@ class AsyncioActorExit(RayError):
class RuntimeEnvSetupError(RayError):
"""Raised when a runtime environment fails to be set up."""
"""Raised when a runtime environment fails to be set up.
params:
error_message: The error message that explains
why runtime env setup has failed.
"""
def __init__(self, error_message: str = None):
self.error_message = error_message
def __str__(self):
return "The runtime_env failed to be set up."
msgs = ["Failed to setup runtime environment."]
if self.error_message:
msgs.append(self.error_message)
return "\n".join(msgs)
class TaskPlacementGroupRemoved(RayError):
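With the new `__str__`, the detailed message is appended below a generic header. A quick illustration using the class exactly as defined above:

```python
from ray.exceptions import RuntimeEnvSetupError

err = RuntimeEnvSetupError(
    error_message="ResolvePackageNotFound:\n  - this_doesnt_exist"
)
print(str(err))
# Failed to setup runtime environment.
# ResolvePackageNotFound:
#   - this_doesnt_exist
```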


@ -200,14 +200,19 @@ class SerializationContext:
raise DeserializationError()
return obj
def _deserialize_actor_died_error(self, data, metadata_fields):
if not data:
return RayActorError()
def _deserialize_error_info(self, data, metadata_fields):
assert data
pb_bytes = self._deserialize_msgpack_data(data, metadata_fields)
assert pb_bytes
ray_error_info = RayErrorInfo()
ray_error_info.ParseFromString(pb_bytes)
return ray_error_info
def _deserialize_actor_died_error(self, data, metadata_fields):
if not data:
return RayActorError()
ray_error_info = self._deserialize_error_info(data, metadata_fields)
assert ray_error_info.HasField("actor_died_error")
if ray_error_info.actor_died_error.HasField("creation_task_failure_context"):
return RayError.from_ray_exception(
@ -291,7 +296,12 @@ class SerializationContext:
object_ref.hex(), object_ref.owner_address(), object_ref.call_site()
)
elif error_type == ErrorType.Value("RUNTIME_ENV_SETUP_FAILED"):
return RuntimeEnvSetupError()
error_info = self._deserialize_error_info(data, metadata_fields)
# TODO(sang): Assert instead once actor also reports error messages.
error_msg = ""
if error_info.HasField("runtime_env_setup_failed_error"):
error_msg = error_info.runtime_env_setup_failed_error.error_message
return RuntimeEnvSetupError(error_message=error_msg)
elif error_type == ErrorType.Value("TASK_PLACEMENT_GROUP_REMOVED"):
return TaskPlacementGroupRemoved()
elif error_type == ErrorType.Value("ACTOR_PLACEMENT_GROUP_REMOVED"):


@ -137,7 +137,11 @@ def test_invalid_conda_env(shutdown_only):
start = time.time()
bad_env = {"conda": {"dependencies": ["this_doesnt_exist"]}}
with pytest.raises(RuntimeEnvSetupError):
with pytest.raises(
RuntimeEnvSetupError,
# The actual error message should be included in the exception.
match="ResolvePackageNotFound",
):
ray.get(f.options(runtime_env=bad_env).remote())
first_time = time.time() - start
@ -150,7 +154,7 @@ def test_invalid_conda_env(shutdown_only):
# The second time this runs it should be faster as the error is cached.
start = time.time()
with pytest.raises(RuntimeEnvSetupError):
with pytest.raises(RuntimeEnvSetupError, match="ResolvePackageNotFound"):
ray.get(f.options(runtime_env=bad_env).remote())
assert (time.time() - start) < (first_time / 2.0)
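The `match=` argument of `pytest.raises` is applied with `re.search` against the string form of the exception, so matching on the bare substring "ResolvePackageNotFound" works only because the exception now embeds the agent's output. A tiny self-contained illustration (with a stand-in exception class):

```python
import pytest

class FakeSetupError(Exception):
    pass

def test_match_searches_exception_text():
    with pytest.raises(FakeSetupError, match="ResolvePackageNotFound"):
        raise FakeSetupError(
            "Failed to setup runtime environment.\n"
            "ResolvePackageNotFound: this_doesnt_exist"
        )
```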


@ -33,7 +33,10 @@ class MockClusterTaskManagerInterface : public ClusterTaskManagerInterface {
(const, override));
MOCK_METHOD(void, TaskFinished,
(std::shared_ptr<WorkerInterface> worker, RayTask *task), (override));
MOCK_METHOD(bool, CancelTask, (const TaskID &task_id, bool runtime_env_setup_failed),
MOCK_METHOD(bool, CancelTask,
(const TaskID &task_id,
rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type,
const std::string &scheduling_failure_message),
(override));
MOCK_METHOD(void, QueueAndScheduleTask,
(const RayTask &task, rpc::RequestWorkerLeaseReply *reply,


@ -382,8 +382,12 @@ void CoreWorkerDirectTaskSubmitter::RequestNewWorkerIfNeeded(
if (reply.failure_type() ==
rpc::RequestWorkerLeaseReply::
SCHEDULING_CANCELLED_RUNTIME_ENV_SETUP_FAILED) {
rpc::RayErrorInfo error_info;
error_info.mutable_runtime_env_setup_failed_error()->set_error_message(
reply.scheduling_failure_message());
RAY_UNUSED(task_finisher_->FailPendingTask(
task_spec.TaskId(), rpc::ErrorType::RUNTIME_ENV_SETUP_FAILED));
task_spec.TaskId(), rpc::ErrorType::RUNTIME_ENV_SETUP_FAILED,
/*status*/ nullptr, &error_info));
} else {
if (task_spec.IsActorCreationTask()) {
RAY_UNUSED(task_finisher_->FailPendingTask(


@ -1047,7 +1047,7 @@ void GcsActorManager::OnActorSchedulingFailed(
case rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_RUNTIME_ENV_SETUP_FAILED:
error_msg =
"Could not create the actor because its associated runtime env failed to be "
"created..";
"created.";
break;
default:
RAY_LOG(FATAL) << "Unknown error, failure type "


@ -186,6 +186,7 @@ enum ErrorType {
message RayErrorInfo {
oneof error {
ActorDeathCause actor_died_error = 2;
RuntimeEnvFailedContext runtime_env_setup_failed_error = 3;
}
}
@ -204,13 +205,13 @@ message ActorDeathCause {
oneof context {
// Indicates that this actor is marked as DEAD due to actor creation task failure.
RayException creation_task_failure_context = 1;
ActorDeathRuntimeEnvFailedContext runtime_env_failed_context = 2;
RuntimeEnvFailedContext runtime_env_failed_context = 2;
ActorDiedErrorContext actor_died_error_context = 3;
}
}
// ---Actor death contexts start----
// Indicates that this actor is marked as DEAD due to runtime environment setup failure.
message ActorDeathRuntimeEnvFailedContext {
message RuntimeEnvFailedContext {
// TODO(sang,lixin) Get this error message from agent.
string error_message = 1;
}
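The renamed `RuntimeEnvFailedContext` is what the core worker fills into `RayErrorInfo` and what `_deserialize_error_info` parses back out. A hedged round-trip sketch with the generated Python bindings, assuming the usual `ray.core.generated.common_pb2` module path:

```python
from ray.core.generated.common_pb2 import RayErrorInfo

# Sender side: attach the runtime env setup error message.
info = RayErrorInfo()
info.runtime_env_setup_failed_error.error_message = "Failed to create runtime env ..."
payload = info.SerializeToString()

# Receiver side (mirrors _deserialize_error_info above).
parsed = RayErrorInfo()
parsed.ParseFromString(payload)
assert parsed.HasField("runtime_env_setup_failed_error")
print(parsed.runtime_env_setup_failed_error.error_message)
```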


@ -55,15 +55,17 @@ message RequestWorkerLeaseRequest {
message RequestWorkerLeaseReply {
enum SchedulingFailureType {
// The default failure type is "not failed".
NOT_FAILED = 0;
// Scheduling is failed on this node.
SCHEDULING_FAILED = 0;
SCHEDULING_FAILED = 1;
// Scheduling is cancelled because task/actor's placement is removed.
SCHEDULING_CANCELLED_PLACEMENT_GROUP_REMOVED = 1;
SCHEDULING_CANCELLED_PLACEMENT_GROUP_REMOVED = 2;
// Scheduling is cancelled because task/actor's runtime environment setup is failed
SCHEDULING_CANCELLED_RUNTIME_ENV_SETUP_FAILED = 2;
SCHEDULING_CANCELLED_RUNTIME_ENV_SETUP_FAILED = 3;
// Scheduling is cancelled because task/actor is intentionally cancelled. E.g.,
// ray.kill or ray.cancel
SCHEDULING_CANCELLED_INTENDED = 3;
SCHEDULING_CANCELLED_INTENDED = 4;
}
// Address of the leased worker. If this is empty, then the request should be
@ -73,8 +75,7 @@ message RequestWorkerLeaseReply {
Address retry_at_raylet_address = 2;
// Resource mapping ids acquired by the leased worker.
repeated ResourceMapEntry resource_mapping = 3;
// Whether this lease request was canceled. In this case, the
// client should try again if the resources are still required.
// Whether this lease request was canceled.
bool canceled = 4;
// PID of the worker process.
uint32 worker_pid = 6;
@ -83,7 +84,11 @@ message RequestWorkerLeaseReply {
// The (normal task) resources data to be carried by the Reply.
ResourcesData resources_data = 8;
// Scheduling failure type.
// Must be set only when canceled is set.
SchedulingFailureType failure_type = 9;
// The error message explaining why scheduling has failed.
// Must be an empty string if failure_type is `NOT_FAILED`.
string scheduling_failure_message = 10;
}
message PrepareBundleResourcesRequest {


@ -133,16 +133,20 @@ void AgentManager::CreateRuntimeEnv(
CreateRuntimeEnvCallback callback) {
// If the agent cannot be started, fail the request.
if (!should_start_agent_) {
RAY_LOG(ERROR) << "Not all required Ray dependencies for the runtime_env "
"feature were found. To install the required dependencies, "
<< "please run `pip install \"ray[default]\"`.";
std::stringstream str_stream;
str_stream << "Not all required Ray dependencies for the runtime_env "
"feature were found. To install the required dependencies, "
<< "please run `pip install \"ray[default]\"`.";
const auto &error_message = str_stream.str();
RAY_LOG(ERROR) << error_message;
// Execute the callback after the currently executing callback finishes. Otherwise
// the task may be erased from the dispatch queue during the queue iteration in
// ClusterTaskManager::DispatchScheduledTasksToWorkers(), invalidating the iterator
// and causing a segfault.
delay_executor_(
[callback] {
callback(/*successful=*/false, /*serialized_runtime_env_context=*/"");
[callback = std::move(callback), error_message] {
callback(/*successful=*/false, /*serialized_runtime_env_context=*/"",
/*setup_error_message*/ error_message);
},
0);
return;
@ -151,12 +155,17 @@ void AgentManager::CreateRuntimeEnv(
if (runtime_env_agent_client_ == nullptr) {
// If the agent cannot be restarted anymore, fail the request.
if (agent_restart_count_ >= RayConfig::instance().agent_max_restart_count()) {
RAY_LOG(WARNING) << "Runtime environment " << serialized_runtime_env
<< " cannot be created on this node because the agent is dead.";
std::stringstream str_stream;
str_stream << "Runtime environment " << serialized_runtime_env
<< " cannot be created on this node because the agent is dead.";
const auto &error_message = str_stream.str();
RAY_LOG(WARNING) << error_message;
delay_executor_(
[callback, serialized_runtime_env] {
[callback = std::move(callback),
serialized_runtime_env = std::move(serialized_runtime_env), error_message] {
callback(/*successful=*/false,
/*serialized_runtime_env_context=*/serialized_runtime_env);
/*serialized_runtime_env_context=*/serialized_runtime_env,
/*setup_error_message*/ error_message);
},
0);
return;
@ -167,7 +176,7 @@ void AgentManager::CreateRuntimeEnv(
<< serialized_runtime_env;
delay_executor_(
[this, job_id, serialized_runtime_env, serialized_allocated_resource_instances,
callback] {
callback = std::move(callback)] {
CreateRuntimeEnv(job_id, serialized_runtime_env,
serialized_allocated_resource_instances, callback);
},
@ -180,26 +189,29 @@ void AgentManager::CreateRuntimeEnv(
request.set_serialized_allocated_resource_instances(
serialized_allocated_resource_instances);
runtime_env_agent_client_->CreateRuntimeEnv(
request,
[this, job_id, serialized_runtime_env, serialized_allocated_resource_instances,
callback](const Status &status, const rpc::CreateRuntimeEnvReply &reply) {
request, [this, job_id, serialized_runtime_env,
serialized_allocated_resource_instances, callback = std::move(callback)](
const Status &status, const rpc::CreateRuntimeEnvReply &reply) {
if (status.ok()) {
if (reply.status() == rpc::AGENT_RPC_STATUS_OK) {
callback(true, reply.serialized_runtime_env_context());
callback(true, reply.serialized_runtime_env_context(),
/*setup_error_message*/ "");
} else {
RAY_LOG(ERROR) << "Failed to create runtime env: " << serialized_runtime_env
<< ", error message: " << reply.error_message();
callback(false, reply.serialized_runtime_env_context());
RAY_LOG(INFO) << "Failed to create runtime env: " << serialized_runtime_env
<< ", error message: " << reply.error_message();
callback(false, reply.serialized_runtime_env_context(),
/*setup_error_message*/ reply.error_message());
}
} else {
RAY_LOG(ERROR)
// TODO(sang): Invoke a callback if it fails more than X times.
RAY_LOG(INFO)
<< "Failed to create the runtime env: " << serialized_runtime_env
<< ", status = " << status
<< ", maybe there are some network problems, will retry it later.";
delay_executor_(
[this, job_id, serialized_runtime_env,
serialized_allocated_resource_instances, callback] {
serialized_allocated_resource_instances, callback = std::move(callback)] {
CreateRuntimeEnv(job_id, serialized_runtime_env,
serialized_allocated_resource_instances, callback);
},
@ -228,6 +240,7 @@ void AgentManager::DeleteURIs(const std::vector<std::string> &uris,
if (reply.status() == rpc::AGENT_RPC_STATUS_OK) {
callback(true);
} else {
// TODO(sang): Find a better way to deliver error messages in this case.
RAY_LOG(ERROR) << "Failed to delete URIs"
<< ", error message: " << reply.error_message();
callback(false);


@ -35,8 +35,14 @@ typedef std::function<std::shared_ptr<rpc::RuntimeEnvAgentClientInterface>(
const std::string &ip_address, int port)>
RuntimeEnvAgentClientFactoryFn;
/// Callback that's called after the runtime env is created.
/// \param[in] successful Whether or not the creation was successful.
/// \param[in] serialized_runtime_env_context Serialized context.
/// \param[in] setup_error_message The error message if runtime env creation fails.
/// It must only be set when successful == false.
typedef std::function<void(bool successful,
const std::string &serialized_runtime_env_context)>
const std::string &serialized_runtime_env_context,
const std::string &setup_error_message)>
CreateRuntimeEnvCallback;
typedef std::function<void(bool successful)> DeleteURIsCallback;


@ -154,7 +154,8 @@ bool ClusterTaskManager::PoppedWorkerHandler(
const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status,
const TaskID &task_id, SchedulingClass scheduling_class,
const std::shared_ptr<internal::Work> &work, bool is_detached_actor,
const rpc::Address &owner_address) {
const rpc::Address &owner_address,
const std::string &runtime_env_setup_error_message) {
const auto &reply = work->reply;
const auto &callback = work->callback;
bool canceled = work->GetState() == internal::WorkStatus::CANCELLED;
@ -246,7 +247,8 @@ bool ClusterTaskManager::PoppedWorkerHandler(
// `CancelTask`.
CancelTask(
task_id,
rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_RUNTIME_ENV_SETUP_FAILED);
rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_RUNTIME_ENV_SETUP_FAILED,
/*scheduling_failure_message*/ runtime_env_setup_error_message);
} else {
// In other cases, set the work status `WAITING` to make this task
// could be re-dispatched.
@ -452,10 +454,11 @@ void ClusterTaskManager::DispatchScheduledTasksToWorkers(
worker_pool_.PopWorker(
spec,
[this, task_id, scheduling_class, work, is_detached_actor, owner_address](
const std::shared_ptr<WorkerInterface> worker,
PopWorkerStatus status) -> bool {
const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status,
const std::string &runtime_env_setup_error_message) -> bool {
return PoppedWorkerHandler(worker, status, task_id, scheduling_class, work,
is_detached_actor, owner_address);
is_detached_actor, owner_address,
runtime_env_setup_error_message);
},
allocated_instances_serialized_json);
work_it++;
@ -663,17 +666,20 @@ void ClusterTaskManager::ReleaseTaskArgs(const TaskID &task_id) {
}
void ReplyCancelled(std::shared_ptr<internal::Work> &work,
rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type) {
rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type,
const std::string &scheduling_failure_message) {
auto reply = work->reply;
auto callback = work->callback;
reply->set_canceled(true);
reply->set_failure_type(failure_type);
reply->set_scheduling_failure_message(scheduling_failure_message);
callback();
}
bool ClusterTaskManager::CancelTask(
const TaskID &task_id,
rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type) {
rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type,
const std::string &scheduling_failure_message) {
// TODO(sang): There are lots of repetitive code around task backlogs. We should
// refactor them.
for (auto shapes_it = tasks_to_schedule_.begin(); shapes_it != tasks_to_schedule_.end();
@ -683,7 +689,7 @@ bool ClusterTaskManager::CancelTask(
const auto &task = (*work_it)->task;
if (task.GetTaskSpecification().TaskId() == task_id) {
RAY_LOG(DEBUG) << "Canceling task " << task_id << " from schedule queue.";
ReplyCancelled(*work_it, failure_type);
ReplyCancelled(*work_it, failure_type, scheduling_failure_message);
work_queue.erase(work_it);
if (work_queue.empty()) {
tasks_to_schedule_.erase(shapes_it);
@ -699,7 +705,7 @@ bool ClusterTaskManager::CancelTask(
const auto &task = (*work_it)->task;
if (task.GetTaskSpecification().TaskId() == task_id) {
RAY_LOG(DEBUG) << "Canceling task " << task_id << " from dispatch queue.";
ReplyCancelled(*work_it, failure_type);
ReplyCancelled(*work_it, failure_type, scheduling_failure_message);
if ((*work_it)->GetState() == internal::WorkStatus::WAITING_FOR_WORKER) {
// We've already acquired resources so we need to release them.
cluster_resource_scheduler_->GetLocalResourceManager().ReleaseWorkerResources(
@ -729,7 +735,7 @@ bool ClusterTaskManager::CancelTask(
const auto &task = (*work_it)->task;
if (task.GetTaskSpecification().TaskId() == task_id) {
RAY_LOG(DEBUG) << "Canceling task " << task_id << " from infeasible queue.";
ReplyCancelled(*work_it, failure_type);
ReplyCancelled(*work_it, failure_type, scheduling_failure_message);
work_queue.erase(work_it);
if (work_queue.empty()) {
infeasible_tasks_.erase(shapes_it);
@ -742,7 +748,7 @@ bool ClusterTaskManager::CancelTask(
auto iter = waiting_tasks_index_.find(task_id);
if (iter != waiting_tasks_index_.end()) {
const auto &task = (*iter->second)->task;
ReplyCancelled(*iter->second, failure_type);
ReplyCancelled(*iter->second, failure_type, scheduling_failure_message);
if (!task.GetTaskSpecification().GetDependencies().empty()) {
task_dependency_manager_.RemoveTaskDependencies(
task.GetTaskSpecification().TaskId());


@ -203,10 +203,10 @@ class ClusterTaskManager : public ClusterTaskManagerInterface {
///
/// \return True if task was successfully removed. This function will return
/// false if the task is already running.
bool CancelTask(
const TaskID &task_id,
rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type =
rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED) override;
bool CancelTask(const TaskID &task_id,
rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type =
rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED,
const std::string &scheduling_failure_message = "") override;
/// Populate the list of pending or infeasible actor tasks for node stats.
///
@ -294,7 +294,8 @@ class ClusterTaskManager : public ClusterTaskManagerInterface {
PopWorkerStatus status, const TaskID &task_id,
SchedulingClass scheduling_class,
const std::shared_ptr<internal::Work> &work,
bool is_detached_actor, const rpc::Address &owner_address);
bool is_detached_actor, const rpc::Address &owner_address,
const std::string &runtime_env_setup_error_message);
/// (Step 3) Attempts to dispatch all tasks which are ready to run. A task
/// will be dispatched if it is on `tasks_to_dispatch_` and there are still


@ -88,7 +88,8 @@ class ClusterTaskManagerInterface {
virtual bool CancelTask(
const TaskID &task_id,
rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type =
rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED) = 0;
rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_INTENDED,
const std::string &scheduling_failure_message = "") = 0;
/// Set the worker backlog size for a particular scheduling class.
///


@ -66,12 +66,15 @@ class MockWorkerPool : public WorkerPoolInterface {
return {};
}
void TriggerCallbacksWithNotOKStatus(PopWorkerStatus status) {
void TriggerCallbacksWithNotOKStatus(
PopWorkerStatus status, const std::string &runtime_env_setup_error_msg = "") {
RAY_CHECK(status != PopWorkerStatus::OK);
for (const auto &pair : callbacks) {
for (const auto &callback : pair.second) {
// No task should be dispatched.
ASSERT_FALSE(callback(nullptr, status));
ASSERT_FALSE(
callback(nullptr, status,
/*runtime_env_setup_error_msg*/ runtime_env_setup_error_msg));
}
}
callbacks.clear();
@ -88,7 +91,7 @@ class MockWorkerPool : public WorkerPoolInterface {
RAY_CHECK(!list.empty());
for (auto list_it = list.begin(); list_it != list.end();) {
auto &callback = *list_it;
dispatched = callback(worker, PopWorkerStatus::OK);
dispatched = callback(worker, PopWorkerStatus::OK, "");
list_it = list.erase(list_it);
if (dispatched) {
break;
@ -914,12 +917,15 @@ TEST_F(ClusterTaskManagerTest, NotOKPopWorkerTest) {
ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING), 0);
ASSERT_EQ(NumRunningTasks(), 1);
// The task should be cancelled.
pool_.TriggerCallbacksWithNotOKStatus(PopWorkerStatus::RuntimeEnvCreationFailed);
const auto runtime_env_error_msg = "Runtime env error message";
pool_.TriggerCallbacksWithNotOKStatus(PopWorkerStatus::RuntimeEnvCreationFailed,
runtime_env_error_msg);
ASSERT_TRUE(callback_called);
ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING_FOR_WORKER), 0);
ASSERT_EQ(NumTasksToDispatchWithStatus(internal::WorkStatus::WAITING), 0);
ASSERT_EQ(NumRunningTasks(), 0);
ASSERT_TRUE(reply.canceled());
ASSERT_EQ(reply.scheduling_failure_message(), runtime_env_error_msg);
AssertNoLeaks();
}


@ -172,6 +172,10 @@ void WorkerPool::SetAgentManager(std::shared_ptr<AgentManager> agent_manager) {
void WorkerPool::PopWorkerCallbackAsync(const PopWorkerCallback &callback,
std::shared_ptr<WorkerInterface> worker,
PopWorkerStatus status) {
// This method shouldn't be invoked when runtime env creation has failed because
// in that case the callback is invoked immediately, not through this path.
RAY_CHECK(status != PopWorkerStatus::RuntimeEnvCreationFailed);
// Call back this function asynchronously to make sure executed in different stack.
io_service_->post([this, callback, worker,
status]() { PopWorkerCallbackInternal(callback, worker, status); },
@ -182,7 +186,7 @@ void WorkerPool::PopWorkerCallbackInternal(const PopWorkerCallback &callback,
std::shared_ptr<WorkerInterface> worker,
PopWorkerStatus status) {
RAY_CHECK(callback);
auto used = callback(worker, status);
auto used = callback(worker, status, /*runtime_env_setup_error_message*/ "");
if (worker && !used) {
// The invalid worker not used, restore it to worker pool.
PushWorker(worker);
@ -613,14 +617,15 @@ void WorkerPool::HandleJobStarted(const JobID &job_id, const rpc::JobConfig &job
<< ". The runtime environment was " << runtime_env << ".";
CreateRuntimeEnv(
runtime_env, job_id,
[job_id](bool successful, const std::string &serialized_runtime_env_context) {
[job_id](bool successful, const std::string &serialized_runtime_env_context,
const std::string &setup_error_message) {
if (successful) {
RAY_LOG(INFO) << "[Eagerly] Create runtime env successful for job " << job_id
<< ". The result context was " << serialized_runtime_env_context
<< ".";
} else {
RAY_LOG(ERROR) << "[Eagerly] Couldn't create a runtime environment for job "
<< job_id << ".";
<< job_id << ". Error message: " << setup_error_message;
}
});
}
@ -882,7 +887,11 @@ void WorkerPool::InvokePopWorkerCallbackForProcess(
*task_id = it->second.task_id;
const auto &callback = it->second.callback;
RAY_CHECK(callback);
*worker_used = callback(worker, status);
// This method shouldn't be invoked when runtime env creation has failed because
// in that case the callback is invoked immediately, not through this path.
RAY_CHECK(status != PopWorkerStatus::RuntimeEnvCreationFailed);
*worker_used = callback(worker, status, /*runtime_env_setup_error_message*/ "");
starting_workers_to_tasks.erase(it);
}
}
@ -1135,14 +1144,16 @@ void WorkerPool::PopWorker(const TaskSpecification &task_spec,
CreateRuntimeEnv(
task_spec.SerializedRuntimeEnv(), task_spec.JobId(),
[this, start_worker_process_fn, callback, &state, task_spec, dynamic_options](
bool successful, const std::string &serialized_runtime_env_context) {
bool successful, const std::string &serialized_runtime_env_context,
const std::string &setup_error_message) {
if (successful) {
start_worker_process_fn(task_spec, state, dynamic_options, true,
task_spec.SerializedRuntimeEnv(),
serialized_runtime_env_context, callback);
} else {
process_failed_runtime_env_setup_failed_++;
callback(nullptr, PopWorkerStatus::RuntimeEnvCreationFailed);
callback(nullptr, PopWorkerStatus::RuntimeEnvCreationFailed,
/*runtime_env_setup_error_message*/ setup_error_message);
RAY_LOG(WARNING)
<< "Create runtime env failed for task " << task_spec.TaskId()
<< " and couldn't create the dedicated worker.";
@ -1194,14 +1205,16 @@ void WorkerPool::PopWorker(const TaskSpecification &task_spec,
CreateRuntimeEnv(
task_spec.SerializedRuntimeEnv(), task_spec.JobId(),
[this, start_worker_process_fn, callback, &state, task_spec](
bool successful, const std::string &serialized_runtime_env_context) {
bool successful, const std::string &serialized_runtime_env_context,
const std::string &setup_error_message) {
if (successful) {
start_worker_process_fn(task_spec, state, {}, false,
task_spec.SerializedRuntimeEnv(),
serialized_runtime_env_context, callback);
} else {
process_failed_runtime_env_setup_failed_++;
callback(nullptr, PopWorkerStatus::RuntimeEnvCreationFailed);
callback(nullptr, PopWorkerStatus::RuntimeEnvCreationFailed,
/*runtime_env_setup_error_message*/ setup_error_message);
RAY_LOG(WARNING)
<< "Create runtime env failed for task " << task_spec.TaskId()
<< " and couldn't create the worker.";
@ -1493,20 +1506,21 @@ WorkerPool::IOWorkerState &WorkerPool::GetIOWorkerStateFromWorkerType(
void WorkerPool::CreateRuntimeEnv(
const std::string &serialized_runtime_env, const JobID &job_id,
const std::function<void(bool, const std::string &)> &callback,
const CreateRuntimeEnvCallback &callback,
const std::string &serialized_allocated_resource_instances) {
// create runtime env.
agent_manager_->CreateRuntimeEnv(
job_id, serialized_runtime_env, serialized_allocated_resource_instances,
[job_id, serialized_runtime_env, callback](
bool successful, const std::string &serialized_runtime_env_context) {
[job_id, serialized_runtime_env = std::move(serialized_runtime_env), callback](
bool successful, const std::string &serialized_runtime_env_context,
const std::string &setup_error_message) {
if (successful) {
callback(true, serialized_runtime_env_context);
callback(true, serialized_runtime_env_context, "");
} else {
RAY_LOG(WARNING) << "Couldn't create a runtime environment for job " << job_id
<< ". The runtime environment was " << serialized_runtime_env
<< ".";
callback(false, "");
callback(false, "", setup_error_message);
}
});
}


@ -61,10 +61,17 @@ enum PopWorkerStatus {
RuntimeEnvCreationFailed = 4,
};
/// \Return true if the worker was used. Otherwise, return false and the worker will be
/// returned to the worker pool.
/// \param[in] worker The started worker instance. Nullptr if worker is not started.
/// \param[in] status The pop worker status. OK if things go well. Otherwise, it will
/// contain the error status.
/// \param[in] runtime_env_setup_error_message The error message
/// when runtime env setup fails. This should be empty unless status ==
/// RuntimeEnvCreationFailed.
/// \return true if the worker was used. Otherwise, return false
/// and the worker will be returned to the worker pool.
using PopWorkerCallback = std::function<bool(
const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status)>;
const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status,
const std::string &runtime_env_setup_error_message)>;
/// \class WorkerPoolInterface
///
@ -621,7 +628,7 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
/// Create runtime env asynchronously by runtime env agent.
void CreateRuntimeEnv(
const std::string &serialized_runtime_env, const JobID &job_id,
const std::function<void(bool, const std::string &)> &callback,
const CreateRuntimeEnvCallback &callback,
const std::string &serialized_allocated_resource_instances = "{}");
void AddStartingWorkerProcess(


@ -33,6 +33,7 @@ int POOL_SIZE_SOFT_LIMIT = 5;
int WORKER_REGISTER_TIMEOUT_SECONDS = 3;
JobID JOB_ID = JobID::FromInt(1);
std::string BAD_RUNTIME_ENV = "bad runtime env";
const std::string BAD_RUNTIME_ENV_ERROR_MSG = "bad runtime env";
std::vector<Language> LANGUAGES = {Language::PYTHON, Language::JAVA};
@ -86,6 +87,7 @@ class MockRuntimeEnvAgentClient : public rpc::RuntimeEnvAgentClientInterface {
rpc::CreateRuntimeEnvReply reply;
if (request.serialized_runtime_env() == BAD_RUNTIME_ENV) {
reply.set_status(rpc::AGENT_RPC_STATUS_FAILED);
reply.set_error_message(BAD_RUNTIME_ENV_ERROR_MSG);
} else {
rpc::RuntimeEnv runtime_env;
if (google::protobuf::util::JsonStringToMessage(request.serialized_runtime_env(),
@ -314,23 +316,27 @@ class WorkerPoolMock : public WorkerPool {
// worker synchronously.
// \param[in] push_workers If true, tries to push the workers from the started
// processes.
std::shared_ptr<WorkerInterface> PopWorkerSync(const TaskSpecification &task_spec,
bool push_workers = true,
PopWorkerStatus *worker_status = nullptr,
int timeout_worker_number = 0) {
std::shared_ptr<WorkerInterface> PopWorkerSync(
const TaskSpecification &task_spec, bool push_workers = true,
PopWorkerStatus *worker_status = nullptr, int timeout_worker_number = 0,
std::string *runtime_env_error_msg = nullptr) {
std::shared_ptr<WorkerInterface> popped_worker = nullptr;
std::promise<bool> promise;
this->PopWorker(task_spec,
[&popped_worker, worker_status, &promise](
const std::shared_ptr<WorkerInterface> worker,
PopWorkerStatus status) -> bool {
popped_worker = worker;
if (worker_status != nullptr) {
*worker_status = status;
}
promise.set_value(true);
return true;
});
this->PopWorker(
task_spec,
[&popped_worker, worker_status, &promise, runtime_env_error_msg](
const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status,
const std::string &runtime_env_setup_error_message) -> bool {
popped_worker = worker;
if (worker_status != nullptr) {
*worker_status = status;
}
if (runtime_env_error_msg) {
*runtime_env_error_msg = runtime_env_setup_error_message;
}
promise.set_value(true);
return true;
});
if (push_workers) {
PushWorkers(timeout_worker_number);
}
@ -773,9 +779,10 @@ TEST_F(WorkerPoolTest, MaximumStartupConcurrency) {
// Try to pop some workers. Some worker processes will be started.
for (int i = 0; i < MAXIMUM_STARTUP_CONCURRENCY; i++) {
worker_pool_->PopWorker(task_spec,
[](const std::shared_ptr<WorkerInterface> worker,
PopWorkerStatus status) -> bool { return true; });
worker_pool_->PopWorker(
task_spec,
[](const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status,
const std::string &runtime_env_setup_error_message) -> bool { return true; });
auto last_process = worker_pool_->LastStartedWorkerProcess();
RAY_CHECK(last_process.IsValid());
started_processes.push_back(last_process);
@ -783,9 +790,10 @@ TEST_F(WorkerPoolTest, MaximumStartupConcurrency) {
// Can't start a new worker process at this point.
ASSERT_EQ(MAXIMUM_STARTUP_CONCURRENCY, worker_pool_->NumWorkerProcessesStarting());
worker_pool_->PopWorker(task_spec,
[](const std::shared_ptr<WorkerInterface> worker,
PopWorkerStatus status) -> bool { return true; });
worker_pool_->PopWorker(
task_spec,
[](const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status,
const std::string &runtime_env_setup_error_message) -> bool { return true; });
ASSERT_EQ(MAXIMUM_STARTUP_CONCURRENCY, worker_pool_->NumWorkerProcessesStarting());
std::vector<std::shared_ptr<WorkerInterface>> workers;
@ -803,9 +811,10 @@ TEST_F(WorkerPoolTest, MaximumStartupConcurrency) {
// Can't start a new worker process at this point.
ASSERT_EQ(MAXIMUM_STARTUP_CONCURRENCY, worker_pool_->NumWorkerProcessesStarting());
worker_pool_->PopWorker(task_spec,
[](const std::shared_ptr<WorkerInterface> worker,
PopWorkerStatus status) -> bool { return true; });
worker_pool_->PopWorker(
task_spec,
[](const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status,
const std::string &runtime_env_setup_error_message) -> bool { return true; });
ASSERT_EQ(MAXIMUM_STARTUP_CONCURRENCY, worker_pool_->NumWorkerProcessesStarting());
// Call `OnWorkerStarted` to emulate worker port announcement.
@ -1622,7 +1631,8 @@ TEST_F(WorkerPoolTest, WorkerNoLeaks) {
// Pop a worker and don't dispatch.
worker_pool_->PopWorker(
task_spec,
[](const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status) -> bool {
[](const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status,
const std::string &runtime_env_setup_error_message) -> bool {
// Don't dispatch this worker.
return false;
});
@ -1637,7 +1647,8 @@ TEST_F(WorkerPoolTest, WorkerNoLeaks) {
// Pop a worker and don't dispatch.
worker_pool_->PopWorker(
task_spec,
[](const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status) -> bool {
[](const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status,
const std::string &runtime_env_setup_error_message) -> bool {
// Don't dispatch this worker.
return false;
});
@ -1647,7 +1658,8 @@ TEST_F(WorkerPoolTest, WorkerNoLeaks) {
// Pop a worker and dispatch.
worker_pool_->PopWorker(
task_spec,
[](const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status) -> bool {
[](const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status,
const std::string &runtime_env_setup_error_message) -> bool {
// Dispatch this worker.
return true;
});
@ -1665,9 +1677,10 @@ TEST_F(WorkerPoolTest, PopWorkerStatus) {
// Startup worker processes to maximum.
for (int i = 0; i < MAXIMUM_STARTUP_CONCURRENCY; i++) {
auto task_spec = ExampleTaskSpec();
worker_pool_->PopWorker(task_spec,
[](const std::shared_ptr<WorkerInterface> worker,
PopWorkerStatus status) -> bool { return true; });
worker_pool_->PopWorker(
task_spec,
[](const std::shared_ptr<WorkerInterface> worker, PopWorkerStatus status,
const std::string &runtime_env_setup_error_message) -> bool { return true; });
}
ASSERT_EQ(MAXIMUM_STARTUP_CONCURRENCY, worker_pool_->NumWorkerProcessesStarting());
@ -1705,11 +1718,13 @@ TEST_F(WorkerPoolTest, PopWorkerStatus) {
const auto task_spec_with_bad_runtime_env = ExampleTaskSpec(
ActorID::Nil(), Language::PYTHON, job_id, ActorID::Nil(), {"XXX=YYY"},
TaskID::FromRandom(JobID::Nil()), ExampleRuntimeEnvInfoFromString(BAD_RUNTIME_ENV));
popped_worker =
worker_pool_->PopWorkerSync(task_spec_with_bad_runtime_env, true, &status);
std::string error_msg;
popped_worker = worker_pool_->PopWorkerSync(task_spec_with_bad_runtime_env, true,
&status, 0, &error_msg);
// PopWorker failed and the status is `RuntimeEnvCreationFailed`.
ASSERT_EQ(popped_worker, nullptr);
ASSERT_EQ(status, PopWorkerStatus::RuntimeEnvCreationFailed);
ASSERT_EQ(error_msg, BAD_RUNTIME_ENV_ERROR_MSG);
// Create a task with available runtime env.
const auto task_spec_with_runtime_env = ExampleTaskSpec(