mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
Use GPUtil for gpu detection when available (#18938)
In environments with K8s and SELinux enabled there is a bug: "/proc/nvidia/" is not allowed to be mounted in the container. So, I reworked GPU detection to be based on the GPUtil package. ## Checks - [x] I've run `scripts/format.sh` to lint the changes in this PR. - [x] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [x] Release tests Co-authored-by: Mopga <a14415641@cab-wsm-0010669.sigma.sbrf.ru> Co-authored-by: Julius <juliustfrost@gmail.com>
This commit is contained in:
parent
372c620f58
commit
6f68c74a5d
1 changed files with 35 additions and 8 deletions
|
@ -1,3 +1,4 @@
|
|||
import importlib.util
|
||||
from collections import namedtuple
|
||||
import logging
|
||||
import os
|
||||
|
@ -8,6 +9,11 @@ import sys
|
|||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
|
||||
try:
|
||||
import GPUtil
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Prefix for the node id resource that is automatically added to each node.
|
||||
|
@ -175,8 +181,14 @@ class ResourceSpec(
|
|||
num_gpus = min(num_gpus, len(gpu_ids))
|
||||
|
||||
try:
|
||||
info_string = _get_gpu_info_string()
|
||||
gpu_types = _constraints_from_gpu_info(info_string)
|
||||
if (
|
||||
sys.platform.startswith("linux")
|
||||
and importlib.util.find_spec("GPUtil") is not None
|
||||
):
|
||||
gpu_types = _get_gpu_types_gputil()
|
||||
else:
|
||||
info_string = _get_gpu_info_string()
|
||||
gpu_types = _constraints_from_gpu_info(info_string)
|
||||
resources.update(gpu_types)
|
||||
except Exception:
|
||||
logger.exception("Could not parse gpu information.")
|
||||
|
@ -261,19 +273,20 @@ def _autodetect_num_gpus():
|
|||
"""Attempt to detect the number of GPUs on this machine.
|
||||
|
||||
TODO(rkn): This currently assumes NVIDIA GPUs on Linux.
|
||||
TODO(mehrdadn): This currently does not work on macOS.
|
||||
TODO(mehrdadn): Use a better mechanism for Windows.
|
||||
|
||||
Possibly useful: tensorflow.config.list_physical_devices()
|
||||
|
||||
Returns:
|
||||
The number of GPUs if any were detected, otherwise 0.
|
||||
"""
|
||||
result = 0
|
||||
if sys.platform.startswith("linux"):
|
||||
proc_gpus_path = "/proc/driver/nvidia/gpus"
|
||||
if os.path.isdir(proc_gpus_path):
|
||||
result = len(os.listdir(proc_gpus_path))
|
||||
if importlib.util.find_spec("GPUtil"):
|
||||
gpu_list = GPUtil.getGPUs()
|
||||
result = len(gpu_list)
|
||||
else:
|
||||
proc_gpus_path = "/proc/driver/nvidia/gpus"
|
||||
if os.path.isdir(proc_gpus_path):
|
||||
result = len(os.listdir(proc_gpus_path))
|
||||
elif sys.platform == "win32":
|
||||
props = "AdapterCompatibility"
|
||||
cmdargs = ["WMIC", "PATH", "Win32_VideoController", "GET", props]
|
||||
|
@ -282,6 +295,20 @@ def _autodetect_num_gpus():
|
|||
return result
|
||||
|
||||
|
||||
def _get_gpu_types_gputil():
    """Derive a GPU-type resource constraint from the GPUs GPUtil reports.

    Queries ``GPUtil.getGPUs()``, takes the name of the last GPU in the
    returned list and normalizes it with ``_pretty_gpu_name``.

    Returns:
        A single-entry dict mapping
        ``RESOURCE_CONSTRAINT_PREFIX + <pretty name>`` to 1, or an empty
        dict when no GPU is reported or the name cannot be prettified.
    """
    devices = GPUtil.getGPUs()
    if not devices:
        return {}

    # Only one GPU name is used for the constraint: the last one reported.
    device_names = [device.name for device in devices]
    pretty_name = _pretty_gpu_name(device_names[-1])
    if not pretty_name:
        return {}

    constraint_key = f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}{pretty_name}"
    return {constraint_key: 1}
|
||||
|
||||
|
||||
def _constraints_from_gpu_info(info_str):
|
||||
"""Parse the contents of a /proc/driver/nvidia/gpus/*/information to get the
|
||||
gpu model type.
|
||||
|
|
Loading…
Add table
Reference in a new issue