Use GPUtil for gpu detection when available (#18938)

In environments with K8S and SELinux enabled there is a bug: "/proc/nvidia/" is not allowed to be mounted in the container. So I reworked GPU detection to be based on the GPUtil package. ## Checks - [x] I've run `scripts/format.sh` to lint the changes in this PR. - [x] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [x] Release tests Co-authored-by: Mopga <a14415641@cab-wsm-0010669.sigma.sbrf.ru> Co-authored-by: Julius <juliustfrost@gmail.com>
2025-03-06 02:21:39 -05:00 · 2022-02-28 01:54:35 +03:00 · 2022-02-28 01:54:35 +03:00 · 6f68c74a5d
commit 6f68c74a5d
parent 372c620f58
1 changed files with 35 additions and 8 deletions
--- a/python/ray/_private/resource_spec.py
+++ b/python/ray/_private/resource_spec.py
@ -1,3 +1,4 @@
 import importlib.util
 from collections import namedtuple
 import logging
 import os
@ -8,6 +9,11 @@ import sys
 import ray
 import ray.ray_constants as ray_constants
 try:
    import GPUtil
 except ImportError:
    pass
 logger = logging.getLogger(__name__)
 # Prefix for the node id resource that is automatically added to each node.
@ -175,6 +181,12 @@ class ResourceSpec(
                num_gpus = min(num_gpus, len(gpu_ids))
        try:
            if (
                sys.platform.startswith("linux")
                and importlib.util.find_spec("GPUtil") is not None
            ):
                gpu_types = _get_gpu_types_gputil()
            else:
                info_string = _get_gpu_info_string()
                gpu_types = _constraints_from_gpu_info(info_string)
            resources.update(gpu_types)
@ -261,16 +273,17 @@ def _autodetect_num_gpus():
    """Attempt to detect the number of GPUs on this machine.
    TODO(rkn): This currently assumes NVIDIA GPUs on Linux.
    TODO(mehrdadn): This currently does not work on macOS.
    TODO(mehrdadn): Use a better mechanism for Windows.
    Possibly useful: tensorflow.config.list_physical_devices()
    Returns:
        The number of GPUs if any were detected, otherwise 0.
    """
    result = 0
    if sys.platform.startswith("linux"):
        if importlib.util.find_spec("GPUtil"):
            gpu_list = GPUtil.getGPUs()
            result = len(gpu_list)
        else:
            proc_gpus_path = "/proc/driver/nvidia/gpus"
            if os.path.isdir(proc_gpus_path):
                result = len(os.listdir(proc_gpus_path))
@ -282,6 +295,20 @@ def _autodetect_num_gpus():
    return result
 def _get_gpu_types_gputil():
    """Build the GPU-type resource constraint from the GPUs GPUtil reports.

    Returns:
        A dict with one entry mapping the constraint name
        (``ray_constants.RESOURCE_CONSTRAINT_PREFIX`` followed by the pretty
        GPU name) to 1. An empty dict is returned when GPUtil reports no
        GPUs or when ``_pretty_gpu_name`` yields a falsy value.
    """
    detected = GPUtil.getGPUs()
    if len(detected) == 0:
        return {}

    # Only the name of the last GPU in GPUtil's list is used for the
    # constraint.
    names = [device.name for device in detected]
    pretty = _pretty_gpu_name(names[-1])
    if not pretty:
        return {}

    prefix = ray_constants.RESOURCE_CONSTRAINT_PREFIX
    return {f"{prefix}{pretty}": 1}
 def _constraints_from_gpu_info(info_str):
    """Parse the contents of a /proc/driver/nvidia/gpus/*/information to get the
    gpu model type.