mirror of
https://github.com/vale981/ray
synced 2025-03-05 18:11:42 -05:00
[docker] auto-populate shared memory size (#11953)
This commit is contained in:
parent
59bc1e6c09
commit
3b56a1a522
7 changed files with 71 additions and 5 deletions
|
@ -278,7 +278,8 @@ Start out by launching the deployment container.
|
|||
docker run --shm-size=<shm-size> -t -i rayproject/ray
|
||||
|
||||
Replace ``<shm-size>`` with a limit appropriate for your system, for example
|
||||
``512M`` or ``2G``. The ``-t`` and ``-i`` options here are required to support
|
||||
``512M`` or ``2G``. A good estimate for this is to use roughly 30% of your available memory (this is
|
||||
what Ray uses internally for its Object Store). The ``-t`` and ``-i`` options here are required to support
|
||||
interactive use of the container.
|
||||
|
||||
**Note:** Ray requires a **large** amount of shared memory because each object
|
||||
|
|
|
@ -12,6 +12,9 @@ import time
|
|||
import warnings
|
||||
|
||||
from ray.autoscaler.command_runner import CommandRunnerInterface
|
||||
from ray.autoscaler._private.constants import \
|
||||
DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES,\
|
||||
DEFAULT_OBJECT_STORE_MEMORY_PROPORTION
|
||||
from ray.autoscaler._private.docker import check_bind_mounts_cmd, \
|
||||
check_docker_running_cmd, \
|
||||
check_docker_image, \
|
||||
|
@ -716,8 +719,8 @@ class DockerCommandRunner(CommandRunnerInterface):
|
|||
self.container_name,
|
||||
self.docker_config.get(
|
||||
"run_options", []) + self.docker_config.get(
|
||||
f"{'head' if as_head else 'worker'}_run_options",
|
||||
[]) + self._configure_runtime(),
|
||||
f"{'head' if as_head else 'worker'}_run_options", []) +
|
||||
self._configure_runtime() + self._auto_configure_shm(),
|
||||
self.ssh_command_runner.cluster_name, home_directory)
|
||||
self.run(start_command, run_env="host")
|
||||
else:
|
||||
|
@ -781,6 +784,27 @@ class DockerCommandRunner(CommandRunnerInterface):
|
|||
|
||||
return []
|
||||
|
||||
def _auto_configure_shm(self):
    """Build a ``--shm-size`` docker run option from the host's free memory.

    Reads ``MemAvailable`` from ``/proc/meminfo`` on the remote host and
    sizes ``/dev/shm`` to the default object-store proportion of that
    memory plus 10% headroom, capped at the object-store maximum. This is
    best-effort: any failure (missing ``MemAvailable`` line, command error,
    decode error) is logged and an empty option list is returned so the
    container still starts with docker's default shm size.

    Returns:
        list: ``["--shm-size='<bytes>b'"]`` on success, otherwise ``[]``
        (also ``[]`` when ``disable_shm_size_detection`` is set).
    """
    if self.docker_config.get("disable_shm_size_detection"):
        return []
    try:
        # `|| true` keeps the remote command from failing the SSH call on
        # hosts without /proc/meminfo; parsing below raises instead.
        shm_output = self.ssh_command_runner.run(
            "cat /proc/meminfo || true",
            with_output=True).decode().strip()
        # /proc/meminfo reports values in kB.
        available_memory = int([
            ln for ln in shm_output.split("\n") if "MemAvailable" in ln
        ][0].split()[1])
        available_memory_bytes = available_memory * 1024
        # Reserve the object-store proportion plus 10% headroom, but no
        # more than the object-store cap. Truncate to an integer so docker
        # is handed a whole number of bytes rather than a fractional size
        # like "354334801.92b".
        shm_size = int(
            min((available_memory_bytes *
                 DEFAULT_OBJECT_STORE_MEMORY_PROPORTION * 1.1),
                DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES))
        return [f"--shm-size='{shm_size}b'"]
    except Exception as e:
        logger.warning(
            f"Received error while trying to auto-compute SHM size {e}")
        return []
|
||||
|
||||
def _get_docker_host_mount_location(self, cluster_name: str) -> str:
|
||||
"""Return the docker host mount directory location."""
|
||||
# Imported here due to circular dependency in imports.
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import os
|
||||
|
||||
from ray.ray_constants import ( # noqa F401
|
||||
AUTOSCALER_RESOURCE_REQUEST_CHANNEL, LOGGER_FORMAT,
|
||||
AUTOSCALER_RESOURCE_REQUEST_CHANNEL, DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES,
|
||||
DEFAULT_OBJECT_STORE_MEMORY_PROPORTION, LOGGER_FORMAT,
|
||||
MEMORY_RESOURCE_UNIT_BYTES, RESOURCES_ENVIRONMENT_VARIABLE)
|
||||
|
||||
|
||||
|
|
|
@ -242,6 +242,11 @@
|
|||
"type": "boolean",
|
||||
"description": "disable Ray from automatically using the NVIDIA runtime if available",
|
||||
"default": false
|
||||
},
|
||||
"disable_shm_size_detection" : {
|
||||
"type": "boolean",
|
||||
"description": "disable Ray from automatically detecting /dev/shm size for the container",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
|
|
|
@ -24,6 +24,8 @@ ID_SIZE = 20
|
|||
# The default maximum number of bytes to allocate to the object store unless
# overridden by the user (200 GB).
DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = 200 * 10**9
# The default proportion of available memory allocated to the object store.
DEFAULT_OBJECT_STORE_MEMORY_PROPORTION = 0.3
# The smallest cap on the memory used by the object store that we allow.
# This must be greater than MEMORY_RESOURCE_UNIT_BYTES * 0.7.
OBJECT_STORE_MINIMUM_MEMORY_BYTES = 75 * 1024 * 1024
|
||||
|
|
|
@ -179,7 +179,9 @@ class ResourceSpec(
|
|||
avail_memory = ray.utils.estimate_available_memory()
|
||||
object_store_memory = self.object_store_memory
|
||||
if object_store_memory is None:
|
||||
object_store_memory = int(avail_memory * 0.3)
|
||||
object_store_memory = int(
|
||||
avail_memory *
|
||||
ray_constants.DEFAULT_OBJECT_STORE_MEMORY_PROPORTION)
|
||||
# Cap memory to avoid memory waste and perf issues on large nodes
|
||||
if (object_store_memory >
|
||||
ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES):
|
||||
|
|
|
@ -1516,6 +1516,37 @@ class AutoscalingTest(unittest.TestCase):
|
|||
f"{file_mount_dir}/ ubuntu@172.0.0.{i}:"
|
||||
f"{docker_mount_prefix}/home/test-folder/")
|
||||
|
||||
def testAutodetectResources(self):
    """End-to-end check that docker containers are launched with an
    auto-detected ``--shm-size`` and the NVIDIA runtime when available."""
    self.provider = MockProvider()
    cluster_config = SMALL_CLUSTER.copy()
    config_path = self.write_config(cluster_config)
    mock_runner = MockProcessRunner()
    # Canned /proc/meminfo output so SHM auto-detection has a
    # MemAvailable line to parse.
    proc_meminfo = """
MemTotal: 16396056 kB
MemFree: 12869528 kB
MemAvailable: 33000000 kB
"""
    mock_runner.respond_to_call("cat /proc/meminfo", [proc_meminfo])
    # Pretend the NVIDIA container runtime is installed and functional.
    mock_runner.respond_to_call(".Runtimes", ["nvidia-container-runtime"])
    mock_runner.respond_to_call("nvidia-smi", ["works"])
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=mock_runner,
        update_interval_s=0)

    # Bring the cluster up and let the nodes reach steady state.
    autoscaler.update()
    self.waitForNodes(2)
    self.provider.finish_starting_nodes()
    autoscaler.update()
    self.waitForNodes(
        2, tag_filters={TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE})
    autoscaler.update()
    # Both docker flags must appear in the run command issued to the head.
    mock_runner.assert_has_call("172.0.0.0", pattern="--shm-size")
    mock_runner.assert_has_call("172.0.0.0", pattern="--runtime=nvidia")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
|
Loading…
Add table
Reference in a new issue