[core] Fix bugs in worker cleanup on driver exit (#17049)

* unit test

* cleanup test

* Don't kill workers when job finishes

* better test

* lint

* lint

* comment

* check
Stephanie Wang 2021-07-15 12:53:51 -07:00 committed by GitHub
parent 02f58a5c6b
commit bdaa96bf43
6 changed files with 99 additions and 35 deletions
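
The gist of the fix: when a driver exits, its workers are no longer killed eagerly in HandleJobFinished; instead they disconnect on their own, and the idle-worker culling path always reclaims workers belonging to finished jobs, even when the pool is under the soft limit. The new regression test below verifies this by scanning live processes for idle Ray workers. A minimal standalone sketch of that style of check (the "ray::IDLE" name marker, the timeout, and the local wait_for_condition helper are stand-ins I am assuming here, not taken from this diff):

import time

import psutil

# Assumed process-name marker for idle Ray workers; the real test reads it
# from ray.ray_constants.WORKER_PROCESS_TYPE_IDLE_WORKER.
IDLE_WORKER_NAME = "ray::IDLE"


def all_workers_exited():
    """Return True once no idle Ray worker processes remain on this node."""
    for proc in psutil.process_iter():
        try:
            if IDLE_WORKER_NAME in proc.name():
                return False
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    return True


def wait_for_condition(condition, timeout_s=30, interval_s=0.5):
    """Minimal stand-in for ray.test_utils.wait_for_condition."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        if condition():
            return
        time.sleep(interval_s)
    raise TimeoutError("Idle workers were not cleaned up in time.")


if __name__ == "__main__":
    # Run this after a Ray driver has exited; with the fix, any workers the
    # driver left behind (even ones started late) should eventually be reaped.
    wait_for_condition(all_workers_exited)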

@@ -7,7 +7,8 @@ import pytest
 from ray.cluster_utils import Cluster
 import ray.ray_constants as ray_constants
-from ray.test_utils import (init_error_pubsub, get_error_message)
+from ray.test_utils import (init_error_pubsub, get_error_message,
+                            run_string_as_driver)
 
 
 def test_fill_object_store_exception(shutdown_only):
@@ -79,5 +80,56 @@ def test_connect_with_disconnected_node(shutdown_only):
     p.close()
+
+
+def test_detached_actor_ref(call_ray_start):
+    address = call_ray_start
+
+    driver_script = """
+import ray
+import time
+
+
+@ray.remote
+def foo(x):
+    return ray.put(42)
+
+
+@ray.remote
+class Actor:
+    def __init__(self):
+        self.ref = None
+
+    def invoke(self):
+        self.ref = foo.remote(0)
+        # Wait for the task to finish before exiting the driver.
+        ray.get(self.ref)
+
+    def get(self):
+        print("get", self.ref)
+        return self.ref
+
+
+if __name__ == "__main__":
+    ray.init(address="{}", namespace="default")
+    a = Actor.options(name="holder", lifetime="detached").remote()
+    # Wait for the task to finish before exiting the driver.
+    ray.get(a.invoke.remote())
+    print("success")
+""".format(address)
+
+    out = run_string_as_driver(driver_script)
+    assert "success" in out
+
+    import time
+    time.sleep(5)
+
+    # connect to the cluster
+    ray.init(address=address, namespace="default")
+    actor = ray.get_actor("holder")
+    x = actor.get.remote()
+    while isinstance(x, ray.ObjectRef):
+        x = ray.get(x)
+    assert x == 42
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", __file__]))

@@ -1,13 +1,15 @@
 import os
 import pytest
+import psutil
 import sys
 import time
 
 import ray
+from ray import ray_constants
 from ray.test_utils import (RayTestTimeoutException, run_string_as_driver,
                             run_string_as_driver_nonblocking,
                             init_error_pubsub, get_error_message,
-                            object_memory_usage)
+                            object_memory_usage, wait_for_condition)
 
 
 def test_error_isolation(call_ray_start):
@@ -183,6 +185,9 @@ import time
 import ray
 import numpy as np
 from ray.test_utils import object_memory_usage
+import os
 
 ray.init(address="{}")
 object_refs = [ray.put(np.zeros(200 * 1024, dtype=np.uint8))
                for i in range(1000)]
@@ -192,10 +197,20 @@ while time.time() - start_time < 30:
         break
 else:
     raise Exception("Objects did not appear in object table.")
+
+
+@ray.remote
+def f():
+    time.sleep(1)
+
+
 print("success")
+
+# Submit some tasks without waiting for them to finish. Their workers should
+# still get cleaned up eventually, even if they get started after the driver
+# exits.
+[f.remote() for _ in range(10)]
 """.format(address)
-    run_string_as_driver(driver_script)
+
+    out = run_string_as_driver(driver_script)
+    assert "success" in out
 
     # Make sure the objects are removed from the object table.
     start_time = time.time()
@@ -205,6 +220,15 @@ print("success")
     else:
         raise Exception("Objects were not all removed from object table.")
 
+    def all_workers_exited():
+        for proc in psutil.process_iter():
+            if ray_constants.WORKER_PROCESS_TYPE_IDLE_WORKER in proc.name():
+                return False
+        return True
+
+    # Check that workers are eventually cleaned up.
+    wait_for_condition(all_workers_exited)
+
 
 def test_drivers_named_actors(call_ray_start):
     # This test will create some drivers that submit some tasks to the same

@@ -234,7 +234,7 @@ ray.shutdown()
     # wait for a while to let workers register
     time.sleep(2)
-    wait_for_condition(lambda: len(get_workers()) == before)
+    wait_for_condition(lambda: len(get_workers()) <= before)
 
 
 def test_not_killing_workers_that_own_objects(shutdown_only):

@@ -538,21 +538,6 @@ void NodeManager::HandleJobFinished(const JobID &job_id, const JobTableData &job
   RAY_LOG(DEBUG) << "HandleJobFinished " << job_id;
   RAY_CHECK(job_data.is_dead());
   worker_pool_.HandleJobFinished(job_id);
-
-  auto workers = worker_pool_.GetWorkersRunningTasksForJob(job_id);
-  // Kill all the workers. The actual cleanup for these workers is done
-  // later when we receive the DisconnectClient message from them.
-  for (const auto &worker : workers) {
-    if (!worker->IsDetachedActor()) {
-      // Clean up any open ray.wait calls that the worker made.
-      dependency_manager_.CancelWaitRequest(worker->WorkerId());
-      // Mark the worker as dead so further messages from it are ignored
-      // (except DisconnectClient).
-      worker->MarkDead();
-      // Then kill the worker process.
-      KillWorker(worker);
-    }
-  }
   runtime_env_manager_.RemoveURIReference(job_id.Hex());
 }
@@ -1243,19 +1228,9 @@ void NodeManager::DisconnectClient(
     }
   }
   RAY_CHECK(!(is_worker && is_driver));
-  // If the client has any blocked tasks, mark them as unblocked. In
-  // particular, we are no longer waiting for their dependencies.
-  if (is_worker && worker->IsDead()) {
-    // If the worker was killed by us because the driver exited,
-    // treat it as intentionally disconnected.
-    // Don't need to unblock the client if it's a worker and is already dead.
-    // Because in this case, its task is already cleaned up.
-    RAY_LOG(DEBUG) << "Skip unblocking worker because it's already dead.";
-  } else {
-    // Clean up any open ray.wait calls that the worker made.
-    dependency_manager_.CancelGetRequest(worker->WorkerId());
-    dependency_manager_.CancelWaitRequest(worker->WorkerId());
-  }
+  // Clean up any open ray.get or ray.wait calls that the worker made.
+  dependency_manager_.CancelGetRequest(worker->WorkerId());
+  dependency_manager_.CancelWaitRequest(worker->WorkerId());
 
   // Erase any lease metadata.
   leased_workers_.erase(worker->WorkerId());

@@ -443,6 +443,7 @@ void WorkerPool::HandleJobFinished(const JobID &job_id) {
   // Currently we don't erase the job from `all_jobs_` , as a workaround for
   // https://github.com/ray-project/ray/issues/11437.
   // unfinished_jobs_.erase(job_id);
+  finished_jobs_.insert(job_id);
 }
 
 boost::optional<const rpc::JobConfig &> WorkerPool::GetJobConfig(
@@ -715,11 +716,16 @@ void WorkerPool::TryKillingIdleWorkers() {
   running_size -= pending_exit_idle_workers_.size();
 
   // Kill idle workers in FIFO order.
   for (const auto &idle_pair : idle_of_all_languages_) {
+    const auto &idle_worker = idle_pair.first;
+    const auto &job_id = idle_worker->GetAssignedJobId();
     if (running_size <= static_cast<size_t>(num_workers_soft_limit_)) {
-      break;
+      if (!finished_jobs_.count(job_id)) {
+        // Ignore the soft limit for jobs that have already finished, as we
+        // should always clean up these workers.
+        break;
+      }
     }
-    const auto &idle_worker = idle_pair.first;
     if (pending_exit_idle_workers_.count(idle_worker->WorkerId())) {
       // If the worker is pending exit, just skip it.
       continue;
@@ -768,7 +774,11 @@ void WorkerPool::TryKillingIdleWorkers() {
           static_cast<size_t>(num_workers_soft_limit_)) {
       // A Java worker process may contain multiple workers. Killing more workers than we
       // expect may slow the job.
-      return;
+      if (!finished_jobs_.count(job_id)) {
+        // Ignore the soft limit for jobs that have already finished, as we
+        // should always clean up these workers.
+        return;
+      }
     }
 
     for (const auto &worker : workers_in_the_same_process) {
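
To restate the control-flow change in this hunk and the one above it: the soft limit on live workers no longer protects idle workers whose job has already finished; culling still stops at the soft limit for live jobs, but workers of finished jobs are always reclaimed. A simplified Python model of that rule (the names and the FIFO list below are illustrative stand-ins for the C++ structures, not Ray APIs):

from dataclasses import dataclass


@dataclass
class IdleWorker:
    worker_id: str
    job_id: str


def workers_to_kill(idle_fifo, running_size, soft_limit, finished_jobs):
    """Scan idle workers in FIFO order and pick the ones to reclaim."""
    victims = []
    for worker in idle_fifo:
        if (running_size <= soft_limit
                and worker.job_id not in finished_jobs):
            # Under the soft limit and the job is still alive: stop culling
            # here (mirrors the `break` in TryKillingIdleWorkers).
            break
        victims.append(worker)
        running_size -= 1
    return victims


if __name__ == "__main__":
    idle = [IdleWorker("w1", "job_b"), IdleWorker("w2", "job_a")]
    # job_b's driver has exited, so w1 is reclaimed even though the pool is
    # already under the soft limit; w2 belongs to a live job and is kept warm.
    print(workers_to_kill(idle, running_size=2, soft_limit=10,
                          finished_jobs={"job_b"}))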

@@ -553,6 +553,9 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
   /// This map tracks the latest infos of unfinished jobs.
   absl::flat_hash_map<JobID, rpc::JobConfig> all_jobs_;
 
+  /// Set of jobs whose drivers have exited.
+  absl::flat_hash_set<JobID> finished_jobs_;
+
   /// This map stores the same data as `idle_of_all_languages_`, but in a map structure
   /// for lookup performance.
   std::unordered_map<std::shared_ptr<WorkerInterface>, int64_t>