Use GPUtil for gpu detection when available (#18938)

In environments with K8s and SELinux enabled, there is a bug: "/proc/nvidia/" is not allowed to be mounted in the container. So, I reworked GPU detection to be based on the GPUtil package. ## Checks - [x] I've run `scripts/format.sh` to lint the changes in this PR. - [x] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [x] Release tests Co-authored-by: Mopga <a14415641@cab-wsm-0010669.sigma.sbrf.ru> Co-authored-by: Julius <juliustfrost@gmail.com>
2025-03-06 02:21:39 -05:00 · 2022-02-28 01:54:35 +03:00 · 2022-02-28 01:54:35 +03:00 · 6f68c74a5d
commit 6f68c74a5d
parent 372c620f58
1 changed files with 35 additions and 8 deletions
--- a/python/ray/_private/resource_spec.py
+++ b/python/ray/_private/resource_spec.py
@ -1,3 +1,4 @@
+import importlib.util
 from collections import namedtuple
 import logging
 import os
@ -8,6 +9,11 @@ import sys
 import ray
 import ray.ray_constants as ray_constants

+try:
+    import GPUtil
+except ImportError:
+    pass
+
 logger = logging.getLogger(__name__)

 # Prefix for the node id resource that is automatically added to each node.
@ -175,8 +181,14 @@ class ResourceSpec(
                num_gpus = min(num_gpus, len(gpu_ids))

        try:
-            info_string = _get_gpu_info_string()
-            gpu_types = _constraints_from_gpu_info(info_string)
+            if (
+                sys.platform.startswith("linux")
+                and importlib.util.find_spec("GPUtil") is not None
+            ):
+                gpu_types = _get_gpu_types_gputil()
+            else:
+                info_string = _get_gpu_info_string()
+                gpu_types = _constraints_from_gpu_info(info_string)
            resources.update(gpu_types)
        except Exception:
            logger.exception("Could not parse gpu information.")
@ -261,19 +273,20 @@ def _autodetect_num_gpus():
    """Attempt to detect the number of GPUs on this machine.

    TODO(rkn): This currently assumes NVIDIA GPUs on Linux.
-    TODO(mehrdadn): This currently does not work on macOS.
    TODO(mehrdadn): Use a better mechanism for Windows.

-    Possibly useful: tensorflow.config.list_physical_devices()
-
    Returns:
        The number of GPUs if any were detected, otherwise 0.
    """
    result = 0
    if sys.platform.startswith("linux"):
-        proc_gpus_path = "/proc/driver/nvidia/gpus"
-        if os.path.isdir(proc_gpus_path):
-            result = len(os.listdir(proc_gpus_path))
+        if importlib.util.find_spec("GPUtil"):
+            gpu_list = GPUtil.getGPUs()
+            result = len(gpu_list)
+        else:
+            proc_gpus_path = "/proc/driver/nvidia/gpus"
+            if os.path.isdir(proc_gpus_path):
+                result = len(os.listdir(proc_gpus_path))
    elif sys.platform == "win32":
        props = "AdapterCompatibility"
        cmdargs = ["WMIC", "PATH", "Win32_VideoController", "GET", props]
@ -282,6 +295,20 @@ def _autodetect_num_gpus():
    return result


+def _get_gpu_types_gputil():  # GPU-type constraint resource dict via GPUtil
+    gpu_list = GPUtil.getGPUs()
+    if len(gpu_list) > 0:
+        gpu_list_names = [gpu.name for gpu in gpu_list]
+        info_str = gpu_list_names.pop()  # only the last listed GPU's name is used
+        pretty_name = _pretty_gpu_name(info_str)
+        if pretty_name:
+            constraint_name = (
+                f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}" f"{pretty_name}"
+            )
+            return {constraint_name: 1}
+    return {}  # no GPUs listed, or no pretty name for the inspected one
+
+
 def _constraints_from_gpu_info(info_str):
    """Parse the contents of a /proc/driver/nvidia/gpus/*/information to get the
    gpu model type.