[train] set auto_transfer cuda device (#26819)

This creates the CUDA stream on the correct device (rather than on the current default device) when calling train.torch.prepare_data_loader(auto_transfer=True). Previously the stream was always created on the default device (cuda:0), even when the data loader targeted a different GPU.
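
For context, torch.cuda.Stream() binds to whichever device is current when the stream is constructed (cuda:0 unless changed), so on a multi-GPU worker the copy stream could land on the wrong GPU even though the data loader targets another device. A minimal sketch of the difference, assuming a machine with at least two CUDA devices:

    import torch

    device = torch.device("cuda:1")

    # Bound to the *current* device (cuda:0 by default), ignoring `device`.
    implicit_stream = torch.cuda.Stream()

    # Bound explicitly to cuda:1, matching where the data will be copied.
    explicit_stream = torch.cuda.Stream(device)

    print(implicit_stream.device)  # cuda:0
    print(explicit_stream.device)  # cuda:1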

Signed-off-by: Matthew Deng <matt@anyscale.com>
Author: matthewdeng, 2022-07-21 09:50:32 -07:00 (committed by GitHub)
parent 4da78c489a
commit 728e2b36d6
2 changed files with 30 additions and 3 deletions


@@ -477,6 +477,33 @@ def test_auto_transfer_data_from_host_to_device(
     assert compute_average_runtime(host_to_device) >= with_auto_transfer


+def test_auto_transfer_correct_device(ray_start_4_cpus_2_gpus):
+    """Tests that auto_transfer uses the right device for the cuda stream."""
+    import nvidia_smi
+
+    nvidia_smi.nvmlInit()
+
+    def get_gpu_used_mem(i):
+        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
+        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+        return info.used
+
+    start_gpu_memory = get_gpu_used_mem(1)
+    device = torch.device("cuda:1")
+
+    small_dataloader = [(torch.randn((1024 * 4, 1024 * 4)),) for _ in range(10)]
+    wrapped_dataloader = (  # noqa: F841
+        ray.train.torch.train_loop_utils._WrappedDataLoader(
+            small_dataloader, device, True
+        )
+    )
+
+    end_gpu_memory = get_gpu_used_mem(1)
+
+    # Verify GPU memory usage increases on the right cuda device
+    assert end_gpu_memory > start_gpu_memory
+
+
 if __name__ == "__main__":
     import sys


@@ -354,7 +354,7 @@ class _TorchAccelerator(Accelerator):
         data_loader: torch.utils.data.DataLoader,
         add_dist_sampler: bool = True,
         move_to_device: bool = True,
-        auto_transfer: bool = True,
+        auto_transfer: bool = False,
     ) -> torch.utils.data.DataLoader:
         """Prepares DataLoader for distributed execution.
"""Prepares DataLoader for distributed execution.
@@ -368,7 +368,7 @@ class _TorchAccelerator(Accelerator):
                 the provided DataLoader.
             move_to_device: If set, automatically move the data
                 returned by the data loader to the correct device.
-            auto_transfer: If set and device is GPU, another CUDA stream
+            auto_transfer: (Experimental) If set and device is GPU, another CUDA stream
                 is created to automatically copy data from host (CPU) memory
                 to device (GPU) memory (the default CUDA stream still runs the
                 training procedure). If device is CPU, it will be disabled
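
The next hunk is the actual fix. For background, the pattern the docstring describes, copying batches host-to-device on a side stream while the default stream runs training, looks roughly like this (a simplified sketch, not Ray's exact implementation; `batch` is assumed to be a pinned-memory CPU tensor so the copy can actually overlap with compute):

    import torch

    device = torch.device("cuda:1")
    memcpy_stream = torch.cuda.Stream(device)  # must be created on the target device

    def transfer(batch):
        with torch.cuda.stream(memcpy_stream):
            # non_blocking=True lets the copy overlap compute on the default stream
            return batch.to(device, non_blocking=True)

    # Before the default stream consumes the batch, wait for the copy to finish:
    torch.cuda.current_stream(device).wait_stream(memcpy_stream)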
@@ -567,7 +567,7 @@ class _WrappedDataLoader(DataLoader):
         self._auto_transfer = auto_transfer if device.type == "cuda" else False
         # create a new CUDA stream to move data from host to device concurrently
         self._memcpy_stream = (
-            torch.cuda.Stream()
+            torch.cuda.Stream(device)
             if device.type == "cuda" and self._auto_transfer
             else None
         )
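
With the stream now created on the target device, opting into the experimental fast path looks like this inside a Ray Train training function (a usage sketch; `dataset` is a placeholder, and note that auto_transfer now defaults to False):

    from ray import train
    from torch.utils.data import DataLoader

    def train_loop_per_worker(config):
        data_loader = DataLoader(dataset, batch_size=64)
        data_loader = train.torch.prepare_data_loader(data_loader, auto_transfer=True)
        for epoch in range(config["epochs"]):
            for (batch,) in data_loader:
                ...  # batches arrive already on the worker's assigned GPU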