[Autoscaler][Docker] Make disable_shm_size_detection more usable (#14913)

This commit is contained in:
Ian Rodney 2021-03-30 18:10:09 -07:00 committed by GitHub
parent 3aa39142db
commit 73fb5d6022
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 56 additions and 14 deletions

View file

@ -658,6 +658,7 @@ If enabled, Ray will not try to use the NVIDIA Container Runtime if GPUs are pre
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If enabled, Ray will not automatically specify the size of ``/dev/shm`` for the started container, and the runtime's default value (64MiB for Docker) will be used.
If ``--shm-size=<size>`` is manually added to ``run_options`` (or to ``head_run_options`` / ``worker_run_options``), this is *automatically* treated as ``True``, meaning that Ray will skip its own detection and defer to the user-provided value.
* **Required:** No
* **Importance:** Low

View file

@ -1,6 +1,6 @@
from getpass import getuser
from shlex import quote
from typing import Dict
from typing import Dict, List
import click
import hashlib
import json
@ -831,13 +831,14 @@ class DockerCommandRunner(CommandRunnerInterface):
home_directory = env_var.split("HOME=")[1]
break
user_docker_run_options = self.docker_config.get(
"run_options", []) + self.docker_config.get(
f"{'head' if as_head else 'worker'}_run_options", [])
start_command = docker_start_cmds(
self.ssh_command_runner.ssh_user, specific_image,
cleaned_bind_mounts, self.container_name,
self.docker_config.get(
"run_options", []) + self.docker_config.get(
f"{'head' if as_head else 'worker'}_run_options", []) +
self._configure_runtime() + self._auto_configure_shm(),
self._configure_runtime(
self._auto_configure_shm(user_docker_run_options)),
self.ssh_command_runner.cluster_name, home_directory,
self.docker_cmd)
self.run(start_command, run_env="host")
@ -887,9 +888,9 @@ class DockerCommandRunner(CommandRunnerInterface):
self.initialized = True
return docker_run_executed
def _configure_runtime(self):
def _configure_runtime(self, run_options: List[str]) -> List[str]:
if self.docker_config.get("disable_automatic_runtime_detection"):
return []
return run_options
runtime_output = self.ssh_command_runner.run(
f"{self.docker_cmd} " + "info -f '{{.Runtimes}}' ",
@ -897,18 +898,23 @@ class DockerCommandRunner(CommandRunnerInterface):
if "nvidia-container-runtime" in runtime_output:
try:
self.ssh_command_runner.run("nvidia-smi", with_output=False)
return ["--runtime=nvidia"]
return run_options + ["--runtime=nvidia"]
except Exception as e:
logger.warning(
"Nvidia Container Runtime is present, but no GPUs found.")
logger.debug(f"nvidia-smi error: {e}")
return []
return run_options
return []
return run_options
def _auto_configure_shm(self):
def _auto_configure_shm(self, run_options: List[str]) -> List[str]:
if self.docker_config.get("disable_shm_size_detection"):
return []
return run_options
for run_opt in run_options:
if "--shm-size" in run_opt:
logger.info("Bypassing automatic SHM-Detection because of "
f"`run_option`: {run_opt}")
return run_options
try:
shm_output = self.ssh_command_runner.run(
"cat /proc/meminfo || true",
@ -921,11 +927,11 @@ class DockerCommandRunner(CommandRunnerInterface):
shm_size = min((available_memory_bytes *
DEFAULT_OBJECT_STORE_MEMORY_PROPORTION * 1.1),
DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES)
return [f"--shm-size='{shm_size}b'"]
return run_options + [f"--shm-size='{shm_size}b'"]
except Exception as e:
logger.warning(
f"Received error while trying to auto-compute SHM size {e}")
return []
return run_options
def _get_docker_host_mount_location(self, cluster_name: str) -> str:
"""Return the docker host mount directory location."""

View file

@ -370,6 +370,41 @@ def test_rsync_without_exclude_and_filter():
"1.2.3.4", pattern=f"--filter dir-merge,- .ignore")
@pytest.mark.parametrize("run_option_type",
                         ["run_options", "head_run_options"])
def test_docker_shm_override(run_option_type):
    """Check that a user-supplied ``--shm-size`` wins over auto-detection.

    The Docker config carries ``--shm-size=80g`` under the parametrized key
    (``run_options`` or ``head_run_options``). After ``run_init`` on a head
    node, the recorded calls must contain that flag and must not contain any
    read of ``/proc/meminfo``.
    """
    runner = MockProcessRunner()
    mock_provider = MockProvider()
    mock_provider.create_node({}, {}, 1)

    shm_flag = "--shm-size=80g"
    docker_settings = {
        "container_name": "container",
        "image": "rayproject/ray:latest",
    }
    docker_settings[run_option_type] = [shm_flag]

    docker_runner = DockerCommandRunner(
        log_prefix="prefix",
        node_id=0,
        provider=mock_provider,
        auth_config=auth_config,
        cluster_name="cluster",
        process_runner=runner,
        use_internal_ip=False,
        docker_config=docker_settings,
    )

    # Two canned "[]" replies for the container env inspection; presumably
    # one per `json .Config.Env` query issued during init -- verify against
    # MockProcessRunner.
    runner.respond_to_call("json .Config.Env", ["[]", "[]"])
    docker_runner.run_init(as_head=True, file_mounts={}, sync_run_yet=True)

    # The user-provided SHM size must reach the docker command line.
    runner.assert_has_call("1.2.3.4", pattern=shm_flag)
    # SHM auto-detection must have been bypassed entirely.
    runner.assert_not_has_call("1.2.3.4", pattern="/proc/meminfo")
if __name__ == "__main__":
    # Running this file directly hands it to pytest in verbose mode and
    # uses pytest's return value as the process exit status.
    import sys

    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)