[Docker] Run docker stop in parallel (#14901)

* first pass at parallel docker stop * real impl * use env var variable * lint fix
2025-03-07 02:51:39 -05:00 · 2021-03-31 08:41:52 -07:00 · 2021-03-31 08:41:52 -07:00 · 32e50b8c67
commit 32e50b8c67
parent 107effb370
2 changed files with 27 additions and 4 deletions
--- a/python/ray/autoscaler/_private/commands.py
+++ b/python/ray/autoscaler/_private/commands.py
@ -1,4 +1,5 @@
 import copy
+from concurrent.futures import ThreadPoolExecutor
 import datetime
 import hashlib
 import json
@ -25,7 +26,8 @@ from ray.experimental.internal_kv import _internal_kv_put
 import ray._private.services as services
 from ray.autoscaler.node_provider import NodeProvider
 from ray.autoscaler._private.constants import \
-    AUTOSCALER_RESOURCE_REQUEST_CHANNEL
+    AUTOSCALER_RESOURCE_REQUEST_CHANNEL, \
+    MAX_PARALLEL_SHUTDOWN_WORKERS
 from ray.autoscaler._private.util import validate_config, hash_runtime_conf, \
    hash_launch_conf, prepare_config
 from ray.autoscaler._private.providers import _get_node_provider, \
@ -406,7 +408,12 @@ def teardown_cluster(config_file: str, yes: bool, workers_only: bool,
                file_mounts_contents_hash="",
                is_head_node=False,
                docker_config=config.get("docker"))
-            _exec(updater, cmd=f"docker stop {container_name}", run_env="host")
+
+            _exec(
+                updater,
+                f"docker stop {container_name}",
+                with_output=False,
+                run_env="host")
        except Exception:
            cli_logger.warning(f"Docker stop failed on {node}")

@ -416,9 +423,21 @@ def teardown_cluster(config_file: str, yes: bool, workers_only: bool,

    container_name = config.get("docker", {}).get("container_name")
    if container_name:
-        for node in A:
-            run_docker_stop(node, container_name)

+        # This is to ensure that the parallel SSH calls below do not mess with
+        # the user's terminal.
+        output_redir = cmd_output_util.is_output_redirected()
+        cmd_output_util.set_output_redirected(True)
+        allow_interactive = cmd_output_util.does_allow_interactive()
+        cmd_output_util.set_allow_interactive(False)
+
+        with ThreadPoolExecutor(
+                max_workers=MAX_PARALLEL_SHUTDOWN_WORKERS) as executor:
+            for node in A:
+                executor.submit(
+                    run_docker_stop, node=node, container_name=container_name)
+        cmd_output_util.set_output_redirected(output_redir)
+        cmd_output_util.set_allow_interactive(allow_interactive)
    with LogTimer("teardown_cluster: done."):
        while A:
            provider.terminate_nodes(A)
--- a/python/ray/autoscaler/_private/constants.py
+++ b/python/ray/autoscaler/_private/constants.py
@ -89,3 +89,7 @@ RAY_PROCESSES = [
    ["new_dashboard/agent.py", False],
    ["ray_process_reaper.py", False],
 ]
+
+# Maximum number of concurrent SSH calls used to stop Docker containers.
+MAX_PARALLEL_SHUTDOWN_WORKERS = env_integer("MAX_PARALLEL_SHUTDOWN_WORKERS",
+                                            50)