[train] set auto_transfer cuda device (#26819)

This creates the CUDA stream on the correct device (rather than on the current default device) when calling train.torch.prepare_data_loader(auto_transfer=True). Previously the stream was always created on the default device (cuda:0), even when the data loader targeted a different GPU.
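
For context, torch.cuda.Stream() binds to whichever device is current when the stream is constructed (cuda:0 unless changed), so on a multi-GPU worker the copy stream could land on the wrong GPU even though the data loader targets another device. A minimal sketch of the difference, assuming a machine with at least two CUDA devices:

    import torch

    device = torch.device("cuda:1")

    # Bound to the *current* device (cuda:0 by default), ignoring `device`.
    implicit_stream = torch.cuda.Stream()

    # Bound explicitly to cuda:1, matching where the data will be copied.
    explicit_stream = torch.cuda.Stream(device)

    print(implicit_stream.device)  # cuda:0
    print(explicit_stream.device)  # cuda:1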

Signed-off-by: Matthew Deng <matt@anyscale.com>
Author: matthewdeng, 2022-07-21 09:50:32 -07:00 (committed by GitHub)
parent 4da78c489a
commit 728e2b36d6
2 changed files with 30 additions and 3 deletions


@@ -477,6 +477,33 @@ def test_auto_transfer_data_from_host_to_device(
     assert compute_average_runtime(host_to_device) >= with_auto_transfer


+def test_auto_transfer_correct_device(ray_start_4_cpus_2_gpus):
+    """Tests that auto_transfer uses the right device for the cuda stream."""
+    import nvidia_smi
+
+    nvidia_smi.nvmlInit()
+
+    def get_gpu_used_mem(i):
+        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
+        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+        return info.used
+
+    start_gpu_memory = get_gpu_used_mem(1)
+    device = torch.device("cuda:1")
+
+    small_dataloader = [(torch.randn((1024 * 4, 1024 * 4)),) for _ in range(10)]
+    wrapped_dataloader = (  # noqa: F841
+        ray.train.torch.train_loop_utils._WrappedDataLoader(
+            small_dataloader, device, True
+        )
+    )
+
+    end_gpu_memory = get_gpu_used_mem(1)
+
+    # Verify GPU memory usage increases on the right cuda device
+    assert end_gpu_memory > start_gpu_memory
+
+
 if __name__ == "__main__":
     import sys


@@ -354,7 +354,7 @@ class _TorchAccelerator(Accelerator):
         data_loader: torch.utils.data.DataLoader,
         add_dist_sampler: bool = True,
         move_to_device: bool = True,
-        auto_transfer: bool = True,
+        auto_transfer: bool = False,
     ) -> torch.utils.data.DataLoader:
         """Prepares DataLoader for distributed execution.
"""Prepares DataLoader for distributed execution.
@@ -368,7 +368,7 @@ class _TorchAccelerator(Accelerator):
                 the provided DataLoader.
             move_to_device: If set, automatically move the data
                 returned by the data loader to the correct device.
-            auto_transfer: If set and device is GPU, another CUDA stream
+            auto_transfer: (Experimental) If set and device is GPU, another CUDA stream
                 is created to automatically copy data from host (CPU) memory
                 to device (GPU) memory (the default CUDA stream still runs the
                 training procedure). If device is CPU, it will be disabled
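
The next hunk is the actual fix. For background, the pattern the docstring describes, copying batches host-to-device on a side stream while the default stream runs training, looks roughly like this (a simplified sketch, not Ray's exact implementation; `batch` is assumed to be a pinned-memory CPU tensor so the copy can actually overlap with compute):

    import torch

    device = torch.device("cuda:1")
    memcpy_stream = torch.cuda.Stream(device)  # must be created on the target device

    def transfer(batch):
        with torch.cuda.stream(memcpy_stream):
            # non_blocking=True lets the copy overlap compute on the default stream
            return batch.to(device, non_blocking=True)

    # Before the default stream consumes the batch, wait for the copy to finish:
    torch.cuda.current_stream(device).wait_stream(memcpy_stream)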
@@ -567,7 +567,7 @@ class _WrappedDataLoader(DataLoader):
         self._auto_transfer = auto_transfer if device.type == "cuda" else False
         # create a new CUDA stream to move data from host to device concurrently
         self._memcpy_stream = (
-            torch.cuda.Stream()
+            torch.cuda.Stream(device)
             if device.type == "cuda" and self._auto_transfer
             else None
         )
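
With the stream now created on the target device, opting into the experimental fast path looks like this inside a Ray Train training function (a usage sketch; `dataset` is a placeholder, and note that auto_transfer now defaults to False):

    from ray import train
    from torch.utils.data import DataLoader

    def train_loop_per_worker(config):
        data_loader = DataLoader(dataset, batch_size=64)
        data_loader = train.torch.prepare_data_loader(data_loader, auto_transfer=True)
        for epoch in range(config["epochs"]):
            for (batch,) in data_loader:
                ...  # batches arrive already on the worker's assigned GPU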