[sgd] simplify cuda visible device setting (#8775)

This commit is contained in:
parent 07637e5b5b
commit 58efec0f2b

2 changed files with 32 additions and 27 deletions
@@ -198,8 +198,10 @@ class TestLocalDistributedRunner(unittest.TestCase):
         self.assertEquals(len(env_set_device), 1)

         if preset_devices:
-            self.assertIn(env_set_device, preset_devices.split(","))
-            self.assertEquals(local_device, "0")
+            visible_devices = preset_devices.split(",")
+            self.assertIn(env_set_device, visible_devices)
+            device_int = int(local_device)
+            self.assertLess(device_int, len(visible_devices))
         else:
             self.assertEquals(local_device, env_set_device)

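The updated assertions encode the mapping between physical GPU IDs and the worker-local index: the reserved device recorded in the env var must be one of the preset physical IDs, while the torch-local device index only has to fall within the length of that list. A minimal sketch of that relationship, reusing the test's variable names for illustration only:

    # Suppose the driver presets two physical GPUs.
    preset_devices = "2,3"
    visible_devices = preset_devices.split(",")   # ["2", "3"]

    # The worker reserves one physical device, e.g. "3" ...
    env_set_device = "3"
    assert env_set_device in visible_devices

    # ... but torch inside the worker addresses it by its scoped position,
    # so the local device index is always < len(visible_devices).
    local_device = str(visible_devices.index(env_set_device))   # "1"
    assert int(local_device) < len(visible_devices)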
@@ -220,29 +222,26 @@ class TestLocalDistributedRunner(unittest.TestCase):
             init_mock.return_value = True
             self._testWithInitialized(init_mock)

-    def _testNotInitialized(self, init_mock):
-        mock_runner = MagicMock()
-        mock_runner._set_cuda_device = MagicMock()
-        LocalDistributedRunner._try_reserve_and_set_cuda(mock_runner)
-        mock_runner._set_cuda_device.assert_called_with("0")
-        self.assertEquals(len(os.environ["CUDA_VISIBLE_DEVICES"]), 1)
-
-    def testNoVisibleNotInitialized(self):
-        with patch("torch.cuda.is_initialized") as init_mock:
-            init_mock.return_value = False
-            self._testNotInitialized(init_mock)
-
     def test2VisibleNotInitialized(self):
         os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
         with patch("torch.cuda.is_initialized") as init_mock:
             init_mock.return_value = False
-            self._testNotInitialized(init_mock)
+            mock_runner = MagicMock()
+            mock_runner._set_cuda_device = MagicMock()
+            LocalDistributedRunner._try_reserve_and_set_cuda(mock_runner)
+            args, _ = mock_runner._set_cuda_device.call_args
+            self.assertTrue(("1" in args) or "0" in args)
+            self.assertEquals(len(os.environ["CUDA_VISIBLE_DEVICES"]), 1)

     def test1VisibleNotInitialized(self):
         os.environ["CUDA_VISIBLE_DEVICES"] = "0"
         with patch("torch.cuda.is_initialized") as init_mock:
             init_mock.return_value = False
-            self._testNotInitialized(init_mock)
+            mock_runner = MagicMock()
+            mock_runner._set_cuda_device = MagicMock()
+            LocalDistributedRunner._try_reserve_and_set_cuda(mock_runner)
+            mock_runner._set_cuda_device.assert_called_with("0")
+            self.assertEquals(len(os.environ["CUDA_VISIBLE_DEVICES"]), 1)

     @patch("torch.cuda.set_device")
     def testSetDevice(self, set_mock):
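With CUDA_VISIBLE_DEVICES preset to "2,3", the inlined test no longer pins the expected device to "0"; it reads the positional arguments of the mocked _set_cuda_device call and accepts either scoped index. A small standalone sketch of how MagicMock.call_args is read here (illustrative names, not part of the test file):

    from unittest.mock import MagicMock

    mock_set = MagicMock()
    mock_set("1")                      # pretend the runner picked scoped index 1
    args, kwargs = mock_set.call_args  # call_args is an (args, kwargs) pair
    assert ("1" in args) or ("0" in args)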
@@ -276,22 +276,28 @@ class LocalDistributedRunner(DistributedTorchRunner):
         super(LocalDistributedRunner, self).__init__(*args, **kwargs)

     def _try_reserve_and_set_cuda(self):
-        use_found_device = os.environ.get("CUDA_VISIBLE_DEVICES") is None \
-            and torch.cuda.is_initialized()
-        device = reserve_cuda_device()
+        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
+        reserved_device = reserve_cuda_device()
         # This needs to be set even if torch.cuda is already
         # initialized because the env var is used later when
         # starting the DDP setup.
-        os.environ["CUDA_VISIBLE_DEVICES"] = device
-        if use_found_device:
+        os.environ["CUDA_VISIBLE_DEVICES"] = reserved_device
+        if visible_devices:
+            # We want to set the index on the visible devices list.
+            if reserved_device not in visible_devices:
+                raise RuntimeError(
+                    "TorchTrainer reserved a device {} that was not in the "
+                    "CUDA_VISIBLE_DEVICES {}. This may be because the "
+                    "Ray cluster is not set with the right env vars. "
+                    "If that is not the issue, please raise a "
+                    "Github issue.".format(reserved_device, visible_devices))
+            devices = visible_devices.split(",")
+            scoped_index = devices.index(reserved_device)
+            self._set_cuda_device(str(scoped_index))
+        else:
             # Once cuda is initialized, torch.device ignores the os.env
             # so we have to set the right actual device.
-            self._set_cuda_device(device)
-        else:
-            # if CUDA is not initialized, we can set the os.env.
-            # Even if initialized, we want to set the device to use BatchNorm.
-            # and make Torch think it only sees 1 GPU.
-            self._set_cuda_device("0")
+            self._set_cuda_device(reserved_device)

     def _set_cuda_device(self, device_str):
         """Sets the CUDA device for this current local worker."""
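The new _try_reserve_and_set_cuda keeps two concerns separate: the environment variable always records the reserved physical device, while torch is pointed at that device's position within the previously visible list (or at the reserved ID directly when nothing was restricted before). A rough standalone sketch of that mapping, with the reservation passed in as a parameter since reserve_cuda_device lives elsewhere in the SGD code; this is illustrative only, not the committed implementation:

    import os

    def scoped_cuda_index(reserved_device):
        """Illustrative only: map a reserved physical GPU ID to the index
        torch should use, mirroring the branch added in this commit."""
        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
        # The env var is always overwritten with the reserved physical ID,
        # because it is consulted later during DDP setup.
        os.environ["CUDA_VISIBLE_DEVICES"] = reserved_device
        if visible_devices:
            devices = visible_devices.split(",")
            if reserved_device not in devices:
                raise RuntimeError("reserved device not in CUDA_VISIBLE_DEVICES")
            # torch addresses GPUs by their position in the visible list.
            return str(devices.index(reserved_device))
        # No prior restriction: use the physical ID as-is.
        return reserved_device

    # e.g. with CUDA_VISIBLE_DEVICES="2,3" and a reservation of "3",
    # torch should be told to use device index "1".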