mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
Use GPUtil for gpu detection when available (#18938)
In environments running Kubernetes with SELinux enabled there is a bug: "/proc/nvidia/" is not allowed to be mounted in the container. So, I reworked GPU detection to be based on the GPUtil package. ## Checks - [x] I've run `scripts/format.sh` to lint the changes in this PR. - [x] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [x] Release tests Co-authored-by: Mopga <a14415641@cab-wsm-0010669.sigma.sbrf.ru> Co-authored-by: Julius <juliustfrost@gmail.com>
This commit is contained in:
parent
372c620f58
commit
6f68c74a5d
1 changed files with 35 additions and 8 deletions
|
@ -1,3 +1,4 @@
|
||||||
|
import importlib.util
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
@ -8,6 +9,11 @@ import sys
|
||||||
import ray
|
import ray
|
||||||
import ray.ray_constants as ray_constants
|
import ray.ray_constants as ray_constants
|
||||||
|
|
||||||
|
try:
|
||||||
|
import GPUtil
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Prefix for the node id resource that is automatically added to each node.
|
# Prefix for the node id resource that is automatically added to each node.
|
||||||
|
@ -175,6 +181,12 @@ class ResourceSpec(
|
||||||
num_gpus = min(num_gpus, len(gpu_ids))
|
num_gpus = min(num_gpus, len(gpu_ids))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
if (
|
||||||
|
sys.platform.startswith("linux")
|
||||||
|
and importlib.util.find_spec("GPUtil") is not None
|
||||||
|
):
|
||||||
|
gpu_types = _get_gpu_types_gputil()
|
||||||
|
else:
|
||||||
info_string = _get_gpu_info_string()
|
info_string = _get_gpu_info_string()
|
||||||
gpu_types = _constraints_from_gpu_info(info_string)
|
gpu_types = _constraints_from_gpu_info(info_string)
|
||||||
resources.update(gpu_types)
|
resources.update(gpu_types)
|
||||||
|
@ -261,16 +273,17 @@ def _autodetect_num_gpus():
|
||||||
"""Attempt to detect the number of GPUs on this machine.
|
"""Attempt to detect the number of GPUs on this machine.
|
||||||
|
|
||||||
TODO(rkn): This currently assumes NVIDIA GPUs on Linux.
|
TODO(rkn): This currently assumes NVIDIA GPUs on Linux.
|
||||||
TODO(mehrdadn): This currently does not work on macOS.
|
|
||||||
TODO(mehrdadn): Use a better mechanism for Windows.
|
TODO(mehrdadn): Use a better mechanism for Windows.
|
||||||
|
|
||||||
Possibly useful: tensorflow.config.list_physical_devices()
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The number of GPUs if any were detected, otherwise 0.
|
The number of GPUs if any were detected, otherwise 0.
|
||||||
"""
|
"""
|
||||||
result = 0
|
result = 0
|
||||||
if sys.platform.startswith("linux"):
|
if sys.platform.startswith("linux"):
|
||||||
|
if importlib.util.find_spec("GPUtil"):
|
||||||
|
gpu_list = GPUtil.getGPUs()
|
||||||
|
result = len(gpu_list)
|
||||||
|
else:
|
||||||
proc_gpus_path = "/proc/driver/nvidia/gpus"
|
proc_gpus_path = "/proc/driver/nvidia/gpus"
|
||||||
if os.path.isdir(proc_gpus_path):
|
if os.path.isdir(proc_gpus_path):
|
||||||
result = len(os.listdir(proc_gpus_path))
|
result = len(os.listdir(proc_gpus_path))
|
||||||
|
@ -282,6 +295,20 @@ def _autodetect_num_gpus():
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _get_gpu_types_gputil():
    """Build the GPU-type resource constraint from the GPUs GPUtil reports.

    Only the name of the last GPU in the list returned by
    ``GPUtil.getGPUs()`` is considered.

    Returns:
        A dict mapping the constraint resource name
        (``RESOURCE_CONSTRAINT_PREFIX`` followed by the pretty GPU name) to 1,
        or an empty dict when no GPUs are reported or when
        ``_pretty_gpu_name`` yields a falsy value for the GPU name.
    """
    detected_gpus = GPUtil.getGPUs()
    if len(detected_gpus) == 0:
        return {}

    names = [device.name for device in detected_gpus]
    # The name of the last reported device determines the constraint.
    last_name = names[-1]
    pretty_name = _pretty_gpu_name(last_name)
    if not pretty_name:
        return {}

    prefix = ray_constants.RESOURCE_CONSTRAINT_PREFIX
    return {f"{prefix}{pretty_name}": 1}
|
||||||
|
|
||||||
|
|
||||||
def _constraints_from_gpu_info(info_str):
|
def _constraints_from_gpu_info(info_str):
|
||||||
"""Parse the contents of a /proc/driver/nvidia/gpus/*/information to get the
|
"""Parse the contents of a /proc/driver/nvidia/gpus/*/information to get the
|
||||||
gpu model type.
|
gpu model type.
|
||||||
|
|
Loading…
Add table
Reference in a new issue