[Autoscaler][Docker] Make disable_shm_size_detection more usable (#14913)

2025-03-06 02:21:39 -05:00 · 2021-03-30 18:10:09 -07:00 · 2021-03-30 18:10:09 -07:00 · 73fb5d6022
commit 73fb5d6022
parent 3aa39142db
3 changed files with 56 additions and 14 deletions
--- a/doc/source/cluster/config.rst
+++ b/doc/source/cluster/config.rst
@ -658,6 +658,7 @@ If enabled, Ray will not try to use the NVIDIA Container Runtime if GPUs are pre
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 If enabled, Ray will not automatically specify the size of ``/dev/shm`` for the started container and the runtime's default value (64MiB for Docker) will be used.
+If ``--shm-size=<>`` is manually added to ``run_options``, this option is *automatically* treated as ``True``, meaning that Ray will skip SHM size detection and defer to the user-provided value.

 * **Required:** No
 * **Importance:** Low
--- a/python/ray/autoscaler/_private/command_runner.py
+++ b/python/ray/autoscaler/_private/command_runner.py
@ -1,6 +1,6 @@
 from getpass import getuser
 from shlex import quote
-from typing import Dict
+from typing import Dict, List
 import click
 import hashlib
 import json
@ -831,13 +831,14 @@ class DockerCommandRunner(CommandRunnerInterface):
                    home_directory = env_var.split("HOME=")[1]
                    break

+            user_docker_run_options = self.docker_config.get(
+                "run_options", []) + self.docker_config.get(
+                    f"{'head' if as_head else 'worker'}_run_options", [])
            start_command = docker_start_cmds(
                self.ssh_command_runner.ssh_user, specific_image,
                cleaned_bind_mounts, self.container_name,
-                self.docker_config.get(
-                    "run_options", []) + self.docker_config.get(
-                        f"{'head' if as_head else 'worker'}_run_options", []) +
-                self._configure_runtime() + self._auto_configure_shm(),
+                self._configure_runtime(
+                    self._auto_configure_shm(user_docker_run_options)),
                self.ssh_command_runner.cluster_name, home_directory,
                self.docker_cmd)
            self.run(start_command, run_env="host")
@ -887,9 +888,9 @@ class DockerCommandRunner(CommandRunnerInterface):
        self.initialized = True
        return docker_run_executed

-    def _configure_runtime(self):
+    def _configure_runtime(self, run_options: List[str]) -> List[str]:
        if self.docker_config.get("disable_automatic_runtime_detection"):
-            return []
+            return run_options

        runtime_output = self.ssh_command_runner.run(
            f"{self.docker_cmd} " + "info -f '{{.Runtimes}}' ",
@ -897,18 +898,23 @@ class DockerCommandRunner(CommandRunnerInterface):
        if "nvidia-container-runtime" in runtime_output:
            try:
                self.ssh_command_runner.run("nvidia-smi", with_output=False)
-                return ["--runtime=nvidia"]
+                return run_options + ["--runtime=nvidia"]
            except Exception as e:
                logger.warning(
                    "Nvidia Container Runtime is present, but no GPUs found.")
                logger.debug(f"nvidia-smi error: {e}")
-                return []
+                return run_options

-        return []
+        return run_options

-    def _auto_configure_shm(self):
+    def _auto_configure_shm(self, run_options: List[str]) -> List[str]:
        if self.docker_config.get("disable_shm_size_detection"):
-            return []
+            return run_options
+        for run_opt in run_options:
+            if "--shm-size" in run_opt:
+                logger.info("Bypassing automatic SHM-Detection because of "
+                            f"`run_option`: {run_opt}")
+                return run_options
        try:
            shm_output = self.ssh_command_runner.run(
                "cat /proc/meminfo || true",
@ -921,11 +927,11 @@ class DockerCommandRunner(CommandRunnerInterface):
            shm_size = min((available_memory_bytes *
                            DEFAULT_OBJECT_STORE_MEMORY_PROPORTION * 1.1),
                           DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES)
-            return [f"--shm-size='{shm_size}b'"]
+            return run_options + [f"--shm-size='{shm_size}b'"]
        except Exception as e:
            logger.warning(
                f"Received error while trying to auto-compute SHM size {e}")
-            return []
+            return run_options

    def _get_docker_host_mount_location(self, cluster_name: str) -> str:
        """Return the docker host mount directory location."""
--- a/python/ray/tests/test_command_runner.py
+++ b/python/ray/tests/test_command_runner.py
@ -370,6 +370,41 @@ def test_rsync_without_exclude_and_filter():
        "1.2.3.4", pattern=f"--filter dir-merge,- .ignore")


+@pytest.mark.parametrize("run_option_type",
+                         ["run_options", "head_run_options"])
+def test_docker_shm_override(run_option_type):
+    process_runner = MockProcessRunner()
+    provider = MockProvider()
+    provider.create_node({}, {}, 1)
+    cluster_name = "cluster"
+
+    docker_config = {
+        "container_name": "container",
+        "image": "rayproject/ray:latest",
+        run_option_type: ["--shm-size=80g"]
+    }
+    args = {
+        "log_prefix": "prefix",
+        "node_id": 0,
+        "provider": provider,
+        "auth_config": auth_config,
+        "cluster_name": cluster_name,
+        "process_runner": process_runner,
+        "use_internal_ip": False,
+        "docker_config": docker_config,
+    }
+    cmd_runner = DockerCommandRunner(**args)
+
+    process_runner.respond_to_call("json .Config.Env", 2 * ["[]"])
+    cmd_runner.run_init(as_head=True, file_mounts={}, sync_run_yet=True)
+
+    # Ensure the user-provided SHM size is used.
+    process_runner.assert_has_call("1.2.3.4", pattern="--shm-size=80g")
+
+    # Ensure that SHM auto detection is bypassed
+    process_runner.assert_not_has_call("1.2.3.4", pattern="/proc/meminfo")
+
+
 if __name__ == "__main__":
    import sys
    sys.exit(pytest.main(["-v", __file__]))