[train] set auto_transfer cuda device (#26819)

This sets the CUDA Stream on the correct device (and not the default one) when calling train.torch.prepare_data_loader(auto_transfer=True).

Signed-off-by: Matthew Deng <matt@anyscale.com>
commit 728e2b36d6
parent 4da78c489a
2 changed files with 30 additions and 3 deletions
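For context, the sketch below shows how auto_transfer is switched on from user code. It is not part of this commit, and the dataset, model, and hyperparameters are placeholder assumptions; it also assumes the function is launched as the training function of a Ray Train TorchTrainer. The only call the commit affects is prepare_data_loader(..., auto_transfer=True), whose host-to-device copy stream is now created on the worker's assigned CUDA device.

import torch
from torch.utils.data import DataLoader, TensorDataset

import ray.train.torch


def train_loop_per_worker(config):
    # Placeholder data and model; only the prepare_data_loader call matters here.
    dataset = TensorDataset(torch.randn(256, 16), torch.randn(256, 1))
    data_loader = DataLoader(dataset, batch_size=32)

    # With auto_transfer=True, batches are copied host -> GPU on a side CUDA
    # stream while the default stream keeps running the training step.
    data_loader = ray.train.torch.prepare_data_loader(data_loader, auto_transfer=True)

    model = ray.train.torch.prepare_model(torch.nn.Linear(16, 1))
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    for X, y in data_loader:
        optimizer.zero_grad()
        loss = loss_fn(model(X), y)
        loss.backward()
        optimizer.step()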
@@ -477,6 +477,33 @@ def test_auto_transfer_data_from_host_to_device(
     assert compute_average_runtime(host_to_device) >= with_auto_transfer
 
 
+def test_auto_transfer_correct_device(ray_start_4_cpus_2_gpus):
+    """Tests that auto_transfer uses the right device for the cuda stream."""
+    import nvidia_smi
+
+    nvidia_smi.nvmlInit()
+
+    def get_gpu_used_mem(i):
+        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
+        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+        return info.used
+
+    start_gpu_memory = get_gpu_used_mem(1)
+
+    device = torch.device("cuda:1")
+    small_dataloader = [(torch.randn((1024 * 4, 1024 * 4)),) for _ in range(10)]
+    wrapped_dataloader = (  # noqa: F841
+        ray.train.torch.train_loop_utils._WrappedDataLoader(
+            small_dataloader, device, True
+        )
+    )
+
+    end_gpu_memory = get_gpu_used_mem(1)
+
+    # Verify GPU memory usage increases on the right cuda device
+    assert end_gpu_memory > start_gpu_memory
+
+
 if __name__ == "__main__":
     import sys
@@ -354,7 +354,7 @@ class _TorchAccelerator(Accelerator):
         data_loader: torch.utils.data.DataLoader,
         add_dist_sampler: bool = True,
         move_to_device: bool = True,
-        auto_transfer: bool = True,
+        auto_transfer: bool = False,
     ) -> torch.utils.data.DataLoader:
         """Prepares DataLoader for distributed execution.
 
@@ -368,7 +368,7 @@ class _TorchAccelerator(Accelerator):
                 the provided DataLoader.
             move_to_device: If set, automatically move the data
                 returned by the data loader to the correct device.
-            auto_transfer: If set and device is GPU, another CUDA stream
+            auto_transfer: (Experimental) If set and device is GPU, another CUDA stream
                 is created to automatically copy data from host (CPU) memory
                 to device (GPU) memory (the default CUDA stream still runs the
                 training procedure). If device is CPU, it will be disabled
@@ -567,7 +567,7 @@ class _WrappedDataLoader(DataLoader):
         self._auto_transfer = auto_transfer if device.type == "cuda" else False
         # create a new CUDA stream to move data from host to device concurrently
         self._memcpy_stream = (
-            torch.cuda.Stream()
+            torch.cuda.Stream(device)
             if device.type == "cuda" and self._auto_transfer
             else None
         )
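The core of the fix is the last hunk: torch.cuda.Stream() binds to the current CUDA device, which is not necessarily the device the wrapped DataLoader is moving data to. The standalone sketch below is not from this commit and assumes a machine with at least two GPUs; it illustrates the difference and roughly mirrors what _WrappedDataLoader does with its copy stream when auto_transfer is enabled.

import torch

device = torch.device("cuda:1")

# Without an explicit device, the stream is created on the current device
# (cuda:0 by default), not on the device the data is headed to.
stream_on_default = torch.cuda.Stream()

# Passing the device, as this commit does, puts the stream on cuda:1.
memcpy_stream = torch.cuda.Stream(device)
assert memcpy_stream.device == device

# Host-to-device copy issued on the side stream, roughly one batch's worth
# of work in the wrapped data loader.
batch = torch.randn(1024, 1024).pin_memory()
with torch.cuda.stream(memcpy_stream):
    batch_on_gpu = batch.to(device, non_blocking=True)

# The consumer stream on `device` must wait for the copy before using the batch.
torch.cuda.current_stream(device).wait_stream(memcpy_stream)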