Use GPUtil for gpu detection when available (#18938)

In environments running Kubernetes with SELinux enabled there is a bug:
"/proc/nvidia/" is not allowed to be mounted in the container.
So I reworked GPU detection to use the GPUtil package.



## Checks

- [x] I've run `scripts/format.sh` to lint the changes in this PR.
- [x] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
   - [x] Release tests

Co-authored-by: Mopga <a14415641@cab-wsm-0010669.sigma.sbrf.ru>
Co-authored-by: Julius <juliustfrost@gmail.com>
This commit is contained in:
mopga 2022-02-28 01:54:35 +03:00 committed by GitHub
parent 372c620f58
commit 6f68c74a5d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -1,3 +1,4 @@
import importlib.util
from collections import namedtuple from collections import namedtuple
import logging import logging
import os import os
@ -8,6 +9,11 @@ import sys
import ray import ray
import ray.ray_constants as ray_constants import ray.ray_constants as ray_constants
try:
import GPUtil
except ImportError:
pass
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Prefix for the node id resource that is automatically added to each node. # Prefix for the node id resource that is automatically added to each node.
@ -175,6 +181,12 @@ class ResourceSpec(
num_gpus = min(num_gpus, len(gpu_ids)) num_gpus = min(num_gpus, len(gpu_ids))
try: try:
if (
sys.platform.startswith("linux")
and importlib.util.find_spec("GPUtil") is not None
):
gpu_types = _get_gpu_types_gputil()
else:
info_string = _get_gpu_info_string() info_string = _get_gpu_info_string()
gpu_types = _constraints_from_gpu_info(info_string) gpu_types = _constraints_from_gpu_info(info_string)
resources.update(gpu_types) resources.update(gpu_types)
@ -261,16 +273,17 @@ def _autodetect_num_gpus():
"""Attempt to detect the number of GPUs on this machine. """Attempt to detect the number of GPUs on this machine.
TODO(rkn): This currently assumes NVIDIA GPUs on Linux. TODO(rkn): This currently assumes NVIDIA GPUs on Linux.
TODO(mehrdadn): This currently does not work on macOS.
TODO(mehrdadn): Use a better mechanism for Windows. TODO(mehrdadn): Use a better mechanism for Windows.
Possibly useful: tensorflow.config.list_physical_devices()
Returns: Returns:
The number of GPUs if any were detected, otherwise 0. The number of GPUs if any were detected, otherwise 0.
""" """
result = 0 result = 0
if sys.platform.startswith("linux"): if sys.platform.startswith("linux"):
if importlib.util.find_spec("GPUtil"):
gpu_list = GPUtil.getGPUs()
result = len(gpu_list)
else:
proc_gpus_path = "/proc/driver/nvidia/gpus" proc_gpus_path = "/proc/driver/nvidia/gpus"
if os.path.isdir(proc_gpus_path): if os.path.isdir(proc_gpus_path):
result = len(os.listdir(proc_gpus_path)) result = len(os.listdir(proc_gpus_path))
@ -282,6 +295,20 @@ def _autodetect_num_gpus():
return result return result
def _get_gpu_types_gputil():
    """Build a GPU-type constraint resource using GPUtil.

    Asks GPUtil for the GPUs on this machine and derives the constraint
    from the name of the last GPU in the returned list.

    Returns:
        A dict mapping the constraint resource name (the
        RESOURCE_CONSTRAINT_PREFIX followed by the prettified GPU name)
        to 1, or an empty dict if GPUtil reports no GPUs or no pretty
        name could be derived from the GPU name.
    """
    gpus = GPUtil.getGPUs()
    if not gpus:
        return {}

    # Only one GPU name is used for the constraint: the last one listed.
    last_gpu_name = gpus[-1].name
    pretty = _pretty_gpu_name(last_gpu_name)
    if not pretty:
        return {}

    resource_key = f"{ray_constants.RESOURCE_CONSTRAINT_PREFIX}{pretty}"
    return {resource_key: 1}
def _constraints_from_gpu_info(info_str): def _constraints_from_gpu_info(info_str):
"""Parse the contents of a /proc/driver/nvidia/gpus/*/information to get the """Parse the contents of a /proc/driver/nvidia/gpus/*/information to get the
gpu model type. gpu model type.