[Core] Do not convert gpu id to int (#9744)

Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
yncxcw 2020-08-11 13:09:46 -06:00 committed by GitHub
parent d6226b80bb
commit 32cd94b750
9 changed files with 54 additions and 20 deletions


@@ -72,8 +72,8 @@ Resources with Actors
You can specify that an actor requires CPUs or GPUs in the decorator. While Ray has built-in support for CPUs and GPUs, Ray can also handle custom resources.
When using GPUs, Ray will automatically set the environment variable ``CUDA_VISIBLE_DEVICES`` for the actor after it is instantiated. The actor will have access to a list of the IDs of the GPUs
-that it is allowed to use via ``ray.get_gpu_ids()``. This is a list of integers,
-like ``[]``, or ``[1]``, or ``[2, 5, 6]``.
+that it is allowed to use via ``ray.get_gpu_ids(as_str=True)``. This is a list of strings,
+like ``[]``, or ``['1']``, or ``['2', '5', '6']``. Under some circumstances, the IDs of GPUs could be given as UUID strings instead of indices (see the `CUDA programming guide <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`__).
.. code-block:: python
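
For illustration only (outside this diff), a minimal sketch of an actor reading its assigned GPU IDs as strings with the new ``as_str`` flag; the actor name is hypothetical and a single GPU is assumed to be declared to Ray:

.. code-block:: python

    import os
    import ray

    ray.init(num_gpus=1)

    @ray.remote(num_gpus=1)
    class GPUActor:
        def __init__(self):
            # Ray has already set CUDA_VISIBLE_DEVICES for this actor process,
            # so the assigned IDs come back as strings such as ["0"].
            self.gpu_ids = ray.get_gpu_ids(as_str=True)

        def ids(self):
            return self.gpu_ids, os.environ["CUDA_VISIBLE_DEVICES"]

    actor = GPUActor.remote()
    print(ray.get(actor.ids.remote()))  # e.g. (['0'], '0')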


@@ -33,8 +33,8 @@ remote decorator.
print("ray.get_gpu_ids(): {}".format(ray.get_gpu_ids()))
print("CUDA_VISIBLE_DEVICES: {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))
-Inside of the remote function, a call to ``ray.get_gpu_ids()`` will return a
-list of integers indicating which GPUs the remote function is allowed to use.
+Inside of the remote function, a call to ``ray.get_gpu_ids(as_str=True)`` will return a
+list of strings indicating which GPUs the remote function is allowed to use.
Typically, it is not necessary to call ``ray.get_gpu_ids()`` because Ray will
automatically set the ``CUDA_VISIBLE_DEVICES`` environment variable.
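
As an illustration (outside this diff), a hedged sketch of a remote function relying on the automatically set ``CUDA_VISIBLE_DEVICES``; the function name is hypothetical and two GPUs are assumed to be declared to Ray:

.. code-block:: python

    import os
    import ray

    ray.init(num_gpus=2)

    @ray.remote(num_gpus=1)
    def use_gpu():
        # Libraries such as TensorFlow or PyTorch only see the devices listed
        # in CUDA_VISIBLE_DEVICES, which Ray restricts to this task's share.
        visible = os.environ["CUDA_VISIBLE_DEVICES"]
        assigned = ray.get_gpu_ids(as_str=True)
        return visible, assigned

    print(ray.get([use_gpu.remote() for _ in range(2)]))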


@@ -353,7 +353,7 @@ cdef execute_task(
CFiberEvent task_done_event
# Automatically restrict the GPUs available to this task.
-ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
+ray.utils.set_cuda_visible_devices(ray.get_gpu_ids(as_str=True))
function_descriptor = CFunctionDescriptorToPython(
ray_function.GetFunctionDescriptor())


@@ -95,10 +95,10 @@ def test_actor_gpus(ray_start_cluster):
@ray.remote(num_gpus=1)
class Actor1:
def __init__(self):
-self.gpu_ids = ray.get_gpu_ids()
+self.gpu_ids = ray.get_gpu_ids(as_str=True)
def get_location_and_ids(self):
-assert ray.get_gpu_ids() == self.gpu_ids
+assert ray.get_gpu_ids(as_str=True) == self.gpu_ids
return (ray.worker.global_worker.node.unique_id,
tuple(self.gpu_ids))


@@ -633,6 +633,25 @@ def save_gpu_ids_shutdown_only():
del os.environ["CUDA_VISIBLE_DEVICES"]
+@pytest.mark.parametrize("as_str", [False, True])
+def test_gpu_ids_as_str(save_gpu_ids_shutdown_only, as_str):
+allowed_gpu_ids = [4, 5, 6]
+os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
+str(i) for i in allowed_gpu_ids)
+ray.init()
+@ray.remote
+def get_gpu_ids(as_str):
+gpu_ids = ray.get_gpu_ids(as_str)
+for gpu_id in gpu_ids:
+if as_str:
+assert isinstance(gpu_id, str)
+else:
+assert isinstance(gpu_id, int)
+ray.get([get_gpu_ids.remote(as_str) for _ in range(10)])
def test_specific_gpus(save_gpu_ids_shutdown_only):
allowed_gpu_ids = [4, 5, 6]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
@@ -643,14 +662,14 @@ def test_specific_gpus(save_gpu_ids_shutdown_only):
def f():
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 1
-assert gpu_ids[0] in allowed_gpu_ids
+assert int(gpu_ids[0]) in allowed_gpu_ids
@ray.remote(num_gpus=2)
def g():
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 2
-assert gpu_ids[0] in allowed_gpu_ids
-assert gpu_ids[1] in allowed_gpu_ids
+assert int(gpu_ids[0]) in allowed_gpu_ids
+assert int(gpu_ids[1]) in allowed_gpu_ids
ray.get([f.remote() for _ in range(100)])
ray.get([g.remote() for _ in range(100)])
@@ -671,7 +690,7 @@ def test_local_mode_gpus(save_gpu_ids_shutdown_only):
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 3
for gpu in gpu_ids:
-assert gpu in allowed_gpu_ids
+assert int(gpu) in allowed_gpu_ids
ray.get([f.remote() for _ in range(100)])
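
The tests above adapt to the string IDs by wrapping comparisons in int(). As an illustration (outside this diff), a minimal sketch of the same adjustment in user code; the task name is hypothetical and numeric device indices (not UUIDs) are assumed:

    import ray

    ray.init(num_gpus=1)

    @ray.remote(num_gpus=1)
    def train_step():
        gpu_ids = ray.get_gpu_ids(as_str=True)
        # IDs are strings now; cast only plain indices, since UUID-style
        # identifiers such as "GPU-..." cannot be converted to int.
        numeric_ids = [int(i) for i in gpu_ids if i.isdigit()]
        return gpu_ids, numeric_ids

    print(ray.get(train_step.remote()))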


@@ -271,9 +271,9 @@ def get_cuda_visible_devices():
"""Get the device IDs in the CUDA_VISIBLE_DEVICES environment variable.
Returns:
-if CUDA_VISIBLE_DEVICES is set, this returns a list of integers with
-the IDs of the GPUs. If it is not set or is set to NoDevFiles,
-this returns None.
+devices (List[str]): If CUDA_VISIBLE_DEVICES is set, returns a
+list of strings representing the IDs of the visible GPUs.
+If it is not set or is set to NoDevFiles, returns empty list.
"""
gpu_ids_str = os.environ.get("CUDA_VISIBLE_DEVICES", None)
@@ -286,7 +286,8 @@ def get_cuda_visible_devices():
if gpu_ids_str == "NoDevFiles":
return []
-return [int(i) for i in gpu_ids_str.split(",")]
+# GPU identifiers are given as strings representing integers or UUIDs.
+return list(gpu_ids_str.split(","))
last_set_gpu_ids = None
@@ -296,7 +297,7 @@ def set_cuda_visible_devices(gpu_ids):
"""Set the CUDA_VISIBLE_DEVICES environment variable.
Args:
-gpu_ids: This is a list of integers representing GPU IDs.
+gpu_ids (List[str]): List of strings representing GPU IDs.
"""
global last_set_gpu_ids
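
As an illustration (outside this diff), a standalone sketch of the parsing behavior described in the docstrings above. The helper name is hypothetical, it is not Ray's internal function, and the UUID value is an arbitrary example:

    import os

    def parse_cuda_visible_devices():
        # Keep IDs as strings so UUID-style identifiers survive unchanged.
        raw = os.environ.get("CUDA_VISIBLE_DEVICES")
        if raw is None or raw == "" or raw == "NoDevFiles":
            return []
        return raw.split(",")

    os.environ["CUDA_VISIBLE_DEVICES"] = "0,GPU-3b7cbf38-0c19-4c57-a1b0-1234567890ab"
    print(parse_cuda_visible_devices())
    # ['0', 'GPU-3b7cbf38-0c19-4c57-a1b0-1234567890ab']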


@@ -373,7 +373,7 @@ class Worker:
sys.exit(0)
-def get_gpu_ids():
+def get_gpu_ids(as_str=False):
"""Get the IDs of the GPUs that are available to the worker.
If the CUDA_VISIBLE_DEVICES environment variable was set when the worker
@@ -381,6 +381,10 @@ def get_gpu_ids():
IDs in CUDA_VISIBLE_DEVICES. If not, the IDs will fall in the range
[0, NUM_GPUS - 1], where NUM_GPUS is the number of GPUs that the node has.
+Args:
+as_str (Boolean): If true, return gpu ids in string format. By default,
+it is False. This will change to default to True in the future.
Returns:
A list of GPU IDs.
"""
@@ -400,7 +404,17 @@ def get_gpu_ids():
# Give all GPUs in local_mode.
if global_worker.mode == LOCAL_MODE:
max_gpus = global_worker.node.get_resource_spec().num_gpus
-return global_worker.original_gpu_ids[:max_gpus]
+assigned_ids = global_worker.original_gpu_ids[:max_gpus]
+if not as_str:
+from ray.util.debug import log_once
+if log_once("ray.get_gpu_ids.as_str"):
+logger.warning(
+"ray.get_gpu_ids() will return a list of strings by default"
+" in a future version of Ray for compatibility with CUDA. "
+"To enable the forward-compatible behavior, use "
+"`ray.get_gpu_ids(as_str=True)`.")
+assigned_ids = [int(assigned_id) for assigned_id in assigned_ids]
+return assigned_ids
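
As an illustration (outside this diff), a hedged sketch of the two call forms during the transition, assuming Ray is told about a single GPU with a numeric index; the task name is hypothetical:

    import ray

    ray.init(num_gpus=1)

    @ray.remote(num_gpus=1)
    def show_ids():
        legacy = ray.get_gpu_ids()               # e.g. [0]; ints, logs a one-time warning
        forward = ray.get_gpu_ids(as_str=True)   # e.g. ["0"]; strings, the future default
        return legacy, forward

    print(ray.get(show_ids.remote()))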


@@ -413,7 +413,7 @@ class RolloutWorker(ParallelIteratorWorker):
if (ray.is_initialized()
and ray.worker._mode() != ray.worker.LOCAL_MODE):
# Check available number of GPUs
-if not ray.get_gpu_ids():
+if not ray.get_gpu_ids(as_str=True):
logger.debug("Creating policy evaluation worker {}".format(
worker_index) +
" on CPU (please ignore any CUDA init errors)")


@@ -98,7 +98,7 @@ class TorchPolicy(Policy):
"""
self.framework = "torch"
super().__init__(observation_space, action_space, config)
-if torch.cuda.is_available() and ray.get_gpu_ids():
+if torch.cuda.is_available() and ray.get_gpu_ids(as_str=True):
self.device = torch.device("cuda")
else:
self.device = torch.device("cpu")
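
The same check can be reused in user code. A minimal sketch (outside this diff, assuming PyTorch is installed and Ray has been initialized) that falls back to CPU whenever Ray assigned no GPUs to the current process; the helper name is hypothetical:

    import ray
    import torch

    ray.init()

    def select_device():
        # Use CUDA only when this Ray process was actually assigned GPUs.
        if torch.cuda.is_available() and ray.get_gpu_ids(as_str=True):
            return torch.device("cuda")
        return torch.device("cpu")

    print(select_device())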