mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[Core] Do not convert gpu id to int (#9744)
Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
This commit is contained in:
parent
d6226b80bb
commit
32cd94b750
9 changed files with 54 additions and 20 deletions
|
@ -72,8 +72,8 @@ Resources with Actors
|
|||
You can specify that an actor requires CPUs or GPUs in the decorator. While Ray has built-in support for CPUs and GPUs, Ray can also handle custom resources.
|
||||
|
||||
When using GPUs, Ray will automatically set the environment variable ``CUDA_VISIBLE_DEVICES`` for the actor after it is instantiated. The actor will have access to a list of the IDs of the GPUs
|
||||
that it is allowed to use via ``ray.get_gpu_ids()``. This is a list of integers,
|
||||
like ``[]``, or ``[1]``, or ``[2, 5, 6]``.
|
||||
that it is allowed to use via ``ray.get_gpu_ids(as_str=True)``. This is a list of strings,
|
||||
like ``[]``, or ``['1']``, or ``['2', '5', '6']``. Under some circumstances, the IDs of GPUs could be given as UUID strings instead of indices (see the `CUDA programming guide <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`__).
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
|
|
@ -33,8 +33,8 @@ remote decorator.
|
|||
print("ray.get_gpu_ids(): {}".format(ray.get_gpu_ids()))
|
||||
print("CUDA_VISIBLE_DEVICES: {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))
|
||||
|
||||
Inside of the remote function, a call to ``ray.get_gpu_ids()`` will return a
|
||||
list of integers indicating which GPUs the remote function is allowed to use.
|
||||
Inside of the remote function, a call to ``ray.get_gpu_ids(as_str=True)`` will return a
|
||||
list of strings indicating which GPUs the remote function is allowed to use.
|
||||
Typically, it is not necessary to call ``ray.get_gpu_ids()`` because Ray will
|
||||
automatically set the ``CUDA_VISIBLE_DEVICES`` environment variable.
|
||||
|
||||
|
|
|
@ -353,7 +353,7 @@ cdef execute_task(
|
|||
CFiberEvent task_done_event
|
||||
|
||||
# Automatically restrict the GPUs available to this task.
|
||||
ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
|
||||
ray.utils.set_cuda_visible_devices(ray.get_gpu_ids(as_str=True))
|
||||
|
||||
function_descriptor = CFunctionDescriptorToPython(
|
||||
ray_function.GetFunctionDescriptor())
|
||||
|
|
|
@ -95,10 +95,10 @@ def test_actor_gpus(ray_start_cluster):
|
|||
@ray.remote(num_gpus=1)
|
||||
class Actor1:
|
||||
def __init__(self):
|
||||
self.gpu_ids = ray.get_gpu_ids()
|
||||
self.gpu_ids = ray.get_gpu_ids(as_str=True)
|
||||
|
||||
def get_location_and_ids(self):
|
||||
assert ray.get_gpu_ids() == self.gpu_ids
|
||||
assert ray.get_gpu_ids(as_str=True) == self.gpu_ids
|
||||
return (ray.worker.global_worker.node.unique_id,
|
||||
tuple(self.gpu_ids))
|
||||
|
||||
|
|
|
@ -633,6 +633,25 @@ def save_gpu_ids_shutdown_only():
|
|||
del os.environ["CUDA_VISIBLE_DEVICES"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("as_str", [False, True])
def test_gpu_ids_as_str(save_gpu_ids_shutdown_only, as_str):
    """Check the element type of ``ray.get_gpu_ids`` for both ``as_str`` modes.

    With ``as_str=True`` every returned ID must be a ``str``; with
    ``as_str=False`` every returned ID must be an ``int``.

    Args:
        save_gpu_ids_shutdown_only: Fixture used by the GPU tests in this
            file; it is requested here because the test overwrites
            ``CUDA_VISIBLE_DEVICES``.
        as_str (bool): Value forwarded to ``ray.get_gpu_ids``.
    """
    allowed_gpu_ids = [4, 5, 6]
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
        str(i) for i in allowed_gpu_ids)
    # Declare the GPU count explicitly so that tasks requesting a GPU can be
    # scheduled regardless of what hardware auto-detection reports.
    ray.init(num_gpus=len(allowed_gpu_ids))

    # The task must reserve a GPU: a task that reserves none is assigned no
    # GPU IDs, which would leave the type checks below with nothing to check.
    @ray.remote(num_gpus=1)
    def get_gpu_ids(as_str):
        gpu_ids = ray.get_gpu_ids(as_str)
        # Guard against the checks passing vacuously on an empty list.
        assert len(gpu_ids) == 1
        expected_type = str if as_str else int
        for gpu_id in gpu_ids:
            assert isinstance(gpu_id, expected_type)
            assert int(gpu_id) in allowed_gpu_ids

    ray.get([get_gpu_ids.remote(as_str) for _ in range(10)])
|
||||
|
||||
|
||||
def test_specific_gpus(save_gpu_ids_shutdown_only):
|
||||
allowed_gpu_ids = [4, 5, 6]
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
|
||||
|
@ -643,14 +662,14 @@ def test_specific_gpus(save_gpu_ids_shutdown_only):
|
|||
def f():
|
||||
gpu_ids = ray.get_gpu_ids()
|
||||
assert len(gpu_ids) == 1
|
||||
assert gpu_ids[0] in allowed_gpu_ids
|
||||
assert int(gpu_ids[0]) in allowed_gpu_ids
|
||||
|
||||
@ray.remote(num_gpus=2)
|
||||
def g():
|
||||
gpu_ids = ray.get_gpu_ids()
|
||||
assert len(gpu_ids) == 2
|
||||
assert gpu_ids[0] in allowed_gpu_ids
|
||||
assert gpu_ids[1] in allowed_gpu_ids
|
||||
assert int(gpu_ids[0]) in allowed_gpu_ids
|
||||
assert int(gpu_ids[1]) in allowed_gpu_ids
|
||||
|
||||
ray.get([f.remote() for _ in range(100)])
|
||||
ray.get([g.remote() for _ in range(100)])
|
||||
|
@ -671,7 +690,7 @@ def test_local_mode_gpus(save_gpu_ids_shutdown_only):
|
|||
gpu_ids = ray.get_gpu_ids()
|
||||
assert len(gpu_ids) == 3
|
||||
for gpu in gpu_ids:
|
||||
assert gpu in allowed_gpu_ids
|
||||
assert int(gpu) in allowed_gpu_ids
|
||||
|
||||
ray.get([f.remote() for _ in range(100)])
|
||||
|
||||
|
|
|
@ -271,9 +271,9 @@ def get_cuda_visible_devices():
|
|||
"""Get the device IDs in the CUDA_VISIBLE_DEVICES environment variable.
|
||||
|
||||
Returns:
|
||||
if CUDA_VISIBLE_DEVICES is set, this returns a list of integers with
|
||||
the IDs of the GPUs. If it is not set or is set to NoDevFiles,
|
||||
this returns None.
|
||||
devices (List[str]): If CUDA_VISIBLE_DEVICES is set, returns a
|
||||
list of strings representing the IDs of the visible GPUs.
|
||||
If it is not set or is set to NoDevFiles, returns empty list.
|
||||
"""
|
||||
gpu_ids_str = os.environ.get("CUDA_VISIBLE_DEVICES", None)
|
||||
|
||||
|
@ -286,7 +286,8 @@ def get_cuda_visible_devices():
|
|||
if gpu_ids_str == "NoDevFiles":
|
||||
return []
|
||||
|
||||
return [int(i) for i in gpu_ids_str.split(",")]
|
||||
# GPU identifiers are given as strings representing integers or UUIDs.
|
||||
return list(gpu_ids_str.split(","))
|
||||
|
||||
|
||||
last_set_gpu_ids = None
|
||||
|
@ -296,7 +297,7 @@ def set_cuda_visible_devices(gpu_ids):
|
|||
"""Set the CUDA_VISIBLE_DEVICES environment variable.
|
||||
|
||||
Args:
|
||||
gpu_ids: This is a list of integers representing GPU IDs.
|
||||
gpu_ids (List[str]): List of strings representing GPU IDs.
|
||||
"""
|
||||
|
||||
global last_set_gpu_ids
|
||||
|
|
|
@ -373,7 +373,7 @@ class Worker:
|
|||
sys.exit(0)
|
||||
|
||||
|
||||
def get_gpu_ids():
|
||||
def get_gpu_ids(as_str=False):
|
||||
"""Get the IDs of the GPUs that are available to the worker.
|
||||
|
||||
If the CUDA_VISIBLE_DEVICES environment variable was set when the worker
|
||||
|
@ -381,6 +381,10 @@ def get_gpu_ids():
|
|||
IDs in CUDA_VISIBLE_DEVICES. If not, the IDs will fall in the range
|
||||
[0, NUM_GPUS - 1], where NUM_GPUS is the number of GPUs that the node has.
|
||||
|
||||
Args:
|
||||
as_str (Boolean): If true, return gpu ids in string format. By default,
|
||||
it is False. This will change to default to True in the future.
|
||||
|
||||
Returns:
|
||||
A list of GPU IDs.
|
||||
"""
|
||||
|
@ -400,7 +404,17 @@ def get_gpu_ids():
|
|||
# Give all GPUs in local_mode.
|
||||
if global_worker.mode == LOCAL_MODE:
|
||||
max_gpus = global_worker.node.get_resource_spec().num_gpus
|
||||
return global_worker.original_gpu_ids[:max_gpus]
|
||||
assigned_ids = global_worker.original_gpu_ids[:max_gpus]
|
||||
|
||||
if not as_str:
|
||||
from ray.util.debug import log_once
|
||||
if log_once("ray.get_gpu_ids.as_str"):
|
||||
logger.warning(
|
||||
"ray.get_gpu_ids() will return a list of strings by default"
|
||||
" in a future version of Ray for compatibility with CUDA. "
|
||||
"To enable the forward-compatible behavior, use "
|
||||
"`ray.get_gpu_ids(as_str=True)`.")
|
||||
assigned_ids = [int(assigned_id) for assigned_id in assigned_ids]
|
||||
|
||||
return assigned_ids
|
||||
|
||||
|
|
|
@ -413,7 +413,7 @@ class RolloutWorker(ParallelIteratorWorker):
|
|||
if (ray.is_initialized()
|
||||
and ray.worker._mode() != ray.worker.LOCAL_MODE):
|
||||
# Check available number of GPUs
|
||||
if not ray.get_gpu_ids():
|
||||
if not ray.get_gpu_ids(as_str=True):
|
||||
logger.debug("Creating policy evaluation worker {}".format(
|
||||
worker_index) +
|
||||
" on CPU (please ignore any CUDA init errors)")
|
||||
|
|
|
@ -98,7 +98,7 @@ class TorchPolicy(Policy):
|
|||
"""
|
||||
self.framework = "torch"
|
||||
super().__init__(observation_space, action_space, config)
|
||||
if torch.cuda.is_available() and ray.get_gpu_ids():
|
||||
if torch.cuda.is_available() and ray.get_gpu_ids(as_str=True):
|
||||
self.device = torch.device("cuda")
|
||||
else:
|
||||
self.device = torch.device("cpu")
|
||||
|
|
Loading…
Add table
Reference in a new issue