[SGD] Retry sgd.local_rank() (#18824)

* finish * fix * wip * address comment * update * fix test * fix failing test * address comments * fix test * fix
commit 00dd190df9
parent 73c3cff18b

14 changed files with 224 additions and 56 deletions
@@ -153,6 +153,10 @@ system. Let's take following simple examples:
             results = trainer.run(train_func_distributed)
             trainer.shutdown()
 
+        See :ref:`sgd-porting-code` for a more comprehensive example.
+
+
     .. group-tab:: TensorFlow
 
         This example shows how you can use RaySGD to set up `Multi-worker training
@@ -250,4 +254,7 @@ system. Let's take following simple examples:
             trainer.shutdown()
 
+        See :ref:`sgd-porting-code` for a more comprehensive example.
+
+
 **Next steps:** Check out the :ref:`User Guide <sgd-user-guide>`!
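The quick-start snippets above only show fragments of the workflow. For orientation, here is a minimal, self-contained sketch of the full v2 Trainer lifecycle they refer to; it assumes a CPU-only Ray cluster with PyTorch installed, and the body of the training function is illustrative rather than taken from this diff:

# Minimal sketch of the Trainer lifecycle referenced in the docs above
# (illustrative; assumes ray and torch are installed).
import ray
from ray.util.sgd.v2 import Trainer, local_rank, world_rank


def train_func_distributed():
    # Each worker can report both its global rank and its rank on its node.
    return {"world_rank": world_rank(), "local_rank": local_rank()}


if __name__ == "__main__":
    ray.init()
    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    results = trainer.run(train_func_distributed)  # one result per worker
    print(results)
    trainer.shutdown()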
@@ -83,6 +83,30 @@ training.
             sampler=DistributedSampler(dataset))
 
+**Step 3:** Set the proper CUDA device if you are using GPUs.
+
+If you are using GPUs, you need to make sure that the CUDA devices are properly set up inside your training function.
+
+This involves 3 steps:
+
+1. Use the local rank to set the default CUDA device for the worker.
+2. Move the model to the default CUDA device (or a specific CUDA device).
+3. Specify ``device_ids`` when wrapping in ``DistributedDataParallel``.
+
+.. code-block:: python
+
+    def train_func():
+        device = torch.device(f"cuda:{sgd.local_rank()}" if
+                              torch.cuda.is_available() else "cpu")
+        torch.cuda.set_device(device)
+
+        # Create model.
+        model = NeuralNetwork()
+        model = model.to(device)
+        model = DistributedDataParallel(
+            model,
+            device_ids=[sgd.local_rank()] if torch.cuda.is_available() else None)
+
+
 .. group-tab:: TensorFlow
 
     .. note::
@@ -73,7 +73,7 @@ py_test(
 
 py_test(
     name = "test_worker_group",
-    size = "small",
+    size = "medium",
     srcs = ["tests/test_worker_group.py"],
     tags = ["team:ml", "exclusive"],
     deps = [":sgd_v2_lib"]
@@ -3,11 +3,11 @@ from ray.util.sgd.v2.backends import (BackendConfig, HorovodConfig,
 from ray.util.sgd.v2.callbacks import SGDCallback
 from ray.util.sgd.v2.checkpoint import CheckpointStrategy
 from ray.util.sgd.v2.session import (load_checkpoint, save_checkpoint, report,
-                                     world_rank)
+                                     world_rank, local_rank)
 from ray.util.sgd.v2.trainer import Trainer, SGDIterator
 
 __all__ = [
     "BackendConfig", "CheckpointStrategy", "HorovodConfig", "load_checkpoint",
-    "report", "save_checkpoint", "SGDCallback", "SGDIterator",
+    "local_rank", "report", "save_checkpoint", "SGDCallback", "SGDIterator",
     "TensorflowConfig", "TorchConfig", "Trainer", "world_rank"
 ]
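Because ``local_rank`` now appears in ``__all__``, it can be imported directly from the v2 package next to ``world_rank``; a one-line sketch of the import this enables:

# After this change, local_rank is importable from the package root,
# alongside the existing session helpers.
from ray.util.sgd.v2 import local_rank, world_rank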
@@ -15,8 +15,7 @@ from ray.util.sgd.v2.constants import ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, \
     TUNE_CHECKPOINT_ID
 from ray.util.sgd.v2.session import TrainingResultType, TrainingResult
 from ray.util.sgd.v2.session import init_session, get_session, shutdown_session
-from ray.util.sgd.v2.utils import construct_path, get_node_id, get_gpu_ids, \
-    check_for_failure
+from ray.util.sgd.v2.utils import construct_path, check_for_failure
 from ray.util.sgd.v2.worker_group import WorkerGroup
 
 if TUNE_INSTALLED:
@@ -309,12 +308,8 @@ class BackendExecutor:
 
         """
 
-        def get_node_id_and_gpu():
-            node_id = get_node_id()
-            gpu_ids = get_gpu_ids()
-            return node_id, gpu_ids
-
-        node_ids_and_gpu_ids = self.worker_group.execute(get_node_id_and_gpu)
+        node_ids_and_gpu_ids = [(w.metadata.node_id, w.metadata.gpu_ids)
+                                for w in self.worker_group.workers]
 
         node_id_to_worker_id = defaultdict(set)
         node_id_to_gpu_ids = defaultdict(set)
@@ -336,6 +331,37 @@ class BackendExecutor:
                         worker_id, set_gpu_ids))
         ray.get(futures)
 
+    def _create_local_rank_map(self) -> Dict:
+        """Create mapping from worker world_rank to local_rank.
+
+        Example:
+            Worker 0: 0.0.0.0
+            Worker 1: 0.0.0.0
+            Worker 2: 0.0.0.1
+            Worker 3: 0.0.0.0
+            Worker 4: 0.0.0.1
+
+            Workers 0, 1, 3 are on 0.0.0.0.
+            Workers 2, 4 are on 0.0.0.1.
+
+            Expected Output:
+            {
+                0 -> 0,
+                1 -> 1,
+                2 -> 0,
+                3 -> 2,
+                4 -> 1
+            }
+        """
+        rank_mapping = {}
+        ip_dict = defaultdict(int)
+        for world_rank in range(len(self.worker_group)):
+            worker = self.worker_group.workers[world_rank]
+            node_ip = worker.metadata.node_ip
+            rank_mapping[world_rank] = ip_dict[node_ip]
+            ip_dict[node_ip] += 1
+        return rank_mapping
+
     def start_training(
         self,
         train_func: Callable[[], T],
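The docstring example above can be checked with a few lines of plain Python; a standalone sketch of the same counting scheme, with the worker group replaced by a hard-coded list of node IPs (the addresses are the illustrative ones from the docstring):

from collections import defaultdict

# Node IPs indexed by world rank, mirroring the docstring example above.
worker_ips = ["0.0.0.0", "0.0.0.0", "0.0.0.1", "0.0.0.0", "0.0.0.1"]

rank_mapping = {}
ip_dict = defaultdict(int)
for world_rank, node_ip in enumerate(worker_ips):
    # Each node hands out local ranks 0, 1, 2, ... in world-rank order.
    rank_mapping[world_rank] = ip_dict[node_ip]
    ip_dict[node_ip] += 1

assert rank_mapping == {0: 0, 1: 1, 2: 0, 3: 2, 4: 1}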
@@ -371,11 +397,12 @@ class BackendExecutor:
             ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)
 
         # First initialize the session.
-        def initialize_session(world_rank, train_func, checkpoint):
+        def initialize_session(world_rank, local_rank, train_func, checkpoint):
             try:
                 init_session(
                     training_func=train_func,
                     world_rank=world_rank,
+                    local_rank=local_rank,
                     checkpoint=checkpoint,
                     detailed_autofilled_metrics=use_detailed_autofilled_metrics
                 )
@@ -388,6 +415,8 @@ class BackendExecutor:
 
         checkpoint_dict = self.checkpoint_manager._load_checkpoint(checkpoint)
 
+        local_rank_map = self._create_local_rank_map()
+
         futures = []
         for world_rank in range(len(self.worker_group)):
             futures.append(
@@ -395,6 +424,7 @@ class BackendExecutor:
                     world_rank,
                     initialize_session,
                     world_rank=world_rank,
+                    local_rank=local_rank_map[world_rank],
                     train_func=train_func,
                     checkpoint=checkpoint_dict))
 
@@ -5,7 +5,7 @@ from typing import Optional, Set
 
 import ray
 from ray.util.sgd.v2.backends.backend import BackendConfig, Backend
-from ray.util.sgd.v2.utils import get_node_id, get_hostname, update_env_vars
+from ray.util.sgd.v2.utils import update_env_vars
 from ray.util.sgd.v2.worker_group import WorkerGroup
 
 try:
@@ -44,9 +44,9 @@ class HorovodConfig(BackendConfig):
         return HorovodBackend
 
 
-def init_env_vars(world_rank: int, world_size: int):
+def init_env_vars(world_rank: int, world_size: int, node_id: str):
     """Initialize Horovod environment variables."""
-    os.environ["HOROVOD_HOSTNAME"] = get_node_id()
+    os.environ["HOROVOD_HOSTNAME"] = node_id
     os.environ["HOROVOD_RANK"] = str(world_rank)
     os.environ["HOROVOD_SIZE"] = str(world_size)
 
@@ -60,9 +60,11 @@ class HorovodBackend(Backend):
         # Initialize workers with Horovod environment variables
         setup_futures = []
         for rank in range(len(worker_group)):
+            worker_node_id = worker_group.workers[rank].metadata.node_id
             setup_futures.append(
                 worker_group.execute_single_async(rank, init_env_vars, rank,
-                                                  len(worker_group)))
+                                                  len(worker_group),
+                                                  worker_node_id))
         ray.get(setup_futures)
 
         # Use Horovod Ray Coordinator
@@ -70,8 +72,8 @@ class HorovodBackend(Backend):
         self.coordinator = Coordinator(backend_config)
 
         # Get all the hostnames of all workers
-        node_ids = worker_group.execute(get_node_id)
-        hostnames = worker_group.execute(get_hostname)
+        node_ids = [w.metadata.node_id for w in worker_group.workers]
+        hostnames = [w.metadata.hostname for w in worker_group.workers]
         # Register each hostname to the coordinator. assumes the hostname
         # ordering is the same.
         for rank, (hostname, node_id) in enumerate(zip(hostnames, node_ids)):
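Taken together with the previous hunk, Horovod workers now get their ``HOROVOD_*`` variables from actor metadata passed in by the driver instead of calling back into Ray utilities. A minimal sketch of what ``init_env_vars`` leaves in a worker's environment for rank 1 of a two-worker job (the node ID string is a made-up placeholder; in practice it comes from ``Worker.metadata.node_id``):

import os


# Same assignments as init_env_vars in the diff above; the node ID here is a
# hypothetical placeholder.
def init_env_vars(world_rank: int, world_size: int, node_id: str) -> None:
    os.environ["HOROVOD_HOSTNAME"] = node_id
    os.environ["HOROVOD_RANK"] = str(world_rank)
    os.environ["HOROVOD_SIZE"] = str(world_size)


init_env_vars(world_rank=1, world_size=2, node_id="fake-node-id-abc123")
assert os.environ["HOROVOD_RANK"] == "1"
assert os.environ["HOROVOD_SIZE"] == "2"
assert os.environ["HOROVOD_HOSTNAME"] == "fake-node-id-abc123"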
@@ -86,6 +86,9 @@ def train_func(config: Dict):
     lr = config["lr"]
     epochs = config["epochs"]
 
+    device = torch.device(f"cuda:{sgd.local_rank()}"
+                          if torch.cuda.is_available() else "cpu")
+
     # Create data loaders.
     train_dataloader = DataLoader(
         training_data,
@@ -97,10 +100,11 @@ def train_func(config: Dict):
         sampler=DistributedSampler(test_data))
 
     # Create model.
-    device = "cuda" if torch.cuda.is_available() else "cpu"
     model = NeuralNetwork()
     model = model.to(device)
-    model = DistributedDataParallel(model)
+    model = DistributedDataParallel(
+        model,
+        device_ids=[device.index] if torch.cuda.is_available() else None)
 
     loss_fn = nn.CrossEntropyLoss()
     optimizer = torch.optim.SGD(model.parameters(), lr=lr)
@@ -32,12 +32,14 @@ class Session:
     def __init__(self,
                  training_func: Callable,
                  world_rank: int,
+                 local_rank: int,
                  checkpoint: Optional[Dict] = None,
                  detailed_autofilled_metrics: bool = False):
         # The Thread object that is running the training function.
         self.training_thread = PropagatingThread(
             target=training_func, daemon=True)
         self.world_rank = world_rank
+        self.local_rank = local_rank
         self.loaded_checkpoint = checkpoint
 
         # This lock is used to control the execution of the training thread.
@@ -263,6 +265,29 @@ def world_rank() -> int:
     return session.world_rank
 
 
+def local_rank() -> int:
+    """Get the local rank of this worker (rank of the worker on its node).
+
+    .. code-block:: python
+
+        import time
+        from ray.util import sgd
+
+        def train_func():
+            if torch.cuda.is_available():
+                torch.cuda.set_device(sgd.local_rank())
+            ...
+
+        trainer = Trainer(backend="torch", use_gpu=True)
+        trainer.start()
+        trainer.run(train_func)
+        trainer.shutdown()
+
+    """
+    session = get_session()
+    return session.local_rank
+
+
 def load_checkpoint() -> Optional[Dict]:
     """Loads checkpoint data onto the worker.
 
@@ -54,9 +54,10 @@ def ray_2_node_4_gpu():
 def gen_execute_special(special_f):
     def execute_async_special(self, f):
         """Runs f on worker 0, special_f on other workers."""
-        futures = [self.workers[0]._BaseWorkerMixin__execute.remote(f)]
+        futures = [self.workers[0].actor._BaseWorkerMixin__execute.remote(f)]
         for worker in self.workers[1:]:
-            futures.append(worker._BaseWorkerMixin__execute.remote(special_f))
+            futures.append(
+                worker.actor._BaseWorkerMixin__execute.remote(special_f))
         return futures
 
     return execute_async_special
@@ -123,6 +124,18 @@ def test_train(ray_start_2_cpus, tmp_path):
     assert e.finish_training() == [1, 1]
 
 
+def test_local_ranks(ray_start_2_cpus, tmp_path):
+    config = TestConfig()
+    e = BackendExecutor(config, num_workers=2)
+    e.start()
+
+    def train():
+        return sgd.local_rank()
+
+    e.start_training(train, run_dir=tmp_path)
+    assert set(e.finish_training()) == {0, 1}
+
+
 def test_train_failure(ray_start_2_cpus, tmp_path):
     config = TestConfig()
     e = BackendExecutor(config, num_workers=2)
 
@@ -2,7 +2,8 @@ import time
 
 import pytest
 from ray.util.sgd.v2.session import init_session, shutdown_session, \
-    get_session, world_rank, report, save_checkpoint, TrainingResultType, \
+    get_session, world_rank, local_rank, report, save_checkpoint, \
+    TrainingResultType, \
     load_checkpoint
 
 
@@ -11,7 +12,7 @@ def session():
     def f():
         return 1
 
-    init_session(training_func=f, world_rank=0)
+    init_session(training_func=f, world_rank=0, local_rank=0)
     yield get_session()
     shutdown_session()
 
@@ -34,6 +35,13 @@ def test_world_rank(session):
         world_rank()
 
 
+def test_local_rank(session):
+    assert local_rank() == 0
+    shutdown_session()
+
+    with pytest.raises(ValueError):
+        local_rank()
+
+
 def test_train(session):
     session.start()
     output = session.finish()
@@ -45,7 +53,7 @@ def test_report():
         for i in range(2):
             report(loss=i)
 
-    init_session(training_func=train, world_rank=0)
+    init_session(training_func=train, world_rank=0, local_rank=0)
     session = get_session()
     session.start()
     assert session.get_next().data["loss"] == 0
@@ -62,7 +70,7 @@ def test_report_fail():
             report(i)
         return 1
 
-    init_session(training_func=train, world_rank=0)
+    init_session(training_func=train, world_rank=0, local_rank=0)
     session = get_session()
     session.start()
     assert session.get_next() is None
@@ -96,7 +104,7 @@ def test_checkpoint():
         assert next.type == TrainingResultType.CHECKPOINT
         assert next.data["epoch"] == expected
 
-    init_session(training_func=train, world_rank=0)
+    init_session(training_func=train, world_rank=0, local_rank=0)
     session = get_session()
     session.start()
     validate_zero(0)
@@ -110,7 +118,7 @@ def test_checkpoint():
         assert next.type == TrainingResultType.CHECKPOINT
         assert next.data == {}
 
-    init_session(training_func=train, world_rank=1)
+    init_session(training_func=train, world_rank=1, local_rank=1)
     session = get_session()
     session.start()
     validate_nonzero()
@@ -129,7 +137,7 @@ def test_load_checkpoint_after_save():
             checkpoint = load_checkpoint()
             assert checkpoint["epoch"] == i
 
-    init_session(training_func=train, world_rank=0)
+    init_session(training_func=train, world_rank=0, local_rank=0)
     session = get_session()
     session.start()
     for i in range(2):
@@ -145,7 +153,7 @@ def test_locking():
         import _thread
         _thread.interrupt_main()
 
-    init_session(training_func=train_1, world_rank=0)
+    init_session(training_func=train_1, world_rank=0, local_rank=0)
     session = get_session()
     with pytest.raises(KeyboardInterrupt):
         session.start()
@@ -156,7 +164,7 @@ def test_locking():
             report(loss=i)
         train_1()
 
-    init_session(training_func=train_2, world_rank=0)
+    init_session(training_func=train_2, world_rank=0, local_rank=0)
     session = get_session()
     session.start()
     time.sleep(3)
@@ -88,7 +88,7 @@ def gen_execute_single_async_special(special_f):
         assert len(self.workers) == 2
         if i == 0 and hasattr(self, "should_fail") and self.should_fail:
             kwargs["train_func"] = special_f
-        return self.workers[i]._BaseWorkerMixin__execute.remote(
+        return self.workers[i].actor._BaseWorkerMixin__execute.remote(
             f, *args, **kwargs)
 
     return execute_single_async_special
@@ -126,7 +126,7 @@ class KillCallback(SGDCallback):
         print(results)
         assert all(r["loss"] == 1 for r in results)
         if self.counter == self.fail_on:
-            ray.kill(self.worker_group.workers[0])
+            ray.kill(self.worker_group.workers[0].actor)
             time.sleep(3)
         self.counter += 1
 
@@ -752,6 +752,27 @@ def test_worker_failure_2(ray_start_2_cpus):
     assert results == [1, 1]
 
 
+def test_worker_failure_local_rank(ray_start_2_cpus):
+    test_config = TestConfig()
+
+    def train():
+        return sgd.local_rank()
+
+    def train_actor_failure():
+        import sys
+        sys.exit(0)
+        return sgd.local_rank()
+
+    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)
+
+    with patch.object(ray.util.sgd.v2.trainer, "BackendExecutor",
+                      new_backend_executor_cls):
+        trainer = Trainer(test_config, num_workers=2)
+        trainer.start()
+        results = trainer.run(train)
+        assert set(results) == {0, 1}
+
+
 def test_worker_start_failure(ray_start_2_cpus):
     test_config = TestConfig()
 
@@ -480,7 +480,7 @@ class SGDWorkerGroup:
         self._worker_group = worker_group
 
     def __getitem__(self, item) -> ActorHandle:
-        return self._worker_group.workers[item]
+        return self._worker_group.workers[item].actor
 
     def shutdown(self, patience_s: float = 5):
         """Shutdown all the workers.
@@ -87,21 +87,6 @@ class PropagatingThread(Thread):
         return self.ret
 
 
-def get_node_id() -> str:
-    """Returns the ID of the node that this worker is on."""
-    return ray.get_runtime_context().node_id.hex()
-
-
-def get_hostname() -> str:
-    """Returns the hostname that this worker is on."""
-    return socket.gethostname()
-
-
-def get_gpu_ids() -> List[int]:
-    """Return list of CUDA device IDs available to this worker."""
-    return ray.get_gpu_ids()
-
-
 def update_env_vars(env_vars: Dict[str, Any]):
     """Updates the environment variables on this worker process.
 
@@ -1,7 +1,10 @@
+import socket
+from dataclasses import dataclass
 import logging
 from typing import Callable, List, TypeVar, Optional, Dict, Type, Tuple
 
 import ray
+from ray.actor import ActorHandle
 from ray.types import ObjectRef
 
 T = TypeVar("T")
@@ -22,6 +25,32 @@ class BaseWorkerMixin:
         return func(*args, **kwargs)
 
 
+@dataclass
+class WorkerMetadata:
+    """Metadata for each worker/actor.
+
+    This information is expected to stay the same throughout the lifetime of
+    actor.
+
+    Args:
+        node_id (str): ID of the node this worker is on.
+        node_ip (str): IP address of the node this worker is on.
+        hostname (str): Hostname that this worker is on.
+        gpu_ids (List[int]): List of CUDA IDs available to this worker.
+    """
+    node_id: str
+    node_ip: str
+    hostname: str
+    gpu_ids: Optional[List[int]]
+
+
+@dataclass
+class Worker:
+    """Class representing a Worker."""
+    actor: ActorHandle
+    metadata: WorkerMetadata
+
+
 def create_executable_class(executable_cls: Optional[Type] = None) -> Type:
     """Create the executable class to use as the Ray actors."""
     if not executable_cls:
@@ -37,6 +66,20 @@ def create_executable_class(executable_cls: Optional[Type] = None) -> Type:
     return _WrappedExecutable
 
 
+def construct_metadata() -> WorkerMetadata:
+    """Creates metadata for this worker.
+
+    This function is expected to be run on the actor.
+    """
+    node_id = ray.get_runtime_context().node_id.hex()
+    node_ip = ray.util.get_node_ip_address()
+    hostname = socket.gethostname()
+    gpu_ids = ray.get_gpu_ids()
+
+    return WorkerMetadata(
+        node_id=node_id, node_ip=node_ip, hostname=hostname, gpu_ids=gpu_ids)
+
+
 class WorkerGroup:
     """Group of Ray Actors that can execute arbitrary functions.
 
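With workers now represented as a ``Worker`` dataclass, call sites reach the actor handle through ``.actor`` and read placement information from ``.metadata`` without another remote call. A small standalone sketch of that access pattern (the actor handle is stubbed out and the metadata values are invented for illustration):

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class WorkerMetadata:
    node_id: str
    node_ip: str
    hostname: str
    gpu_ids: Optional[List[int]]


@dataclass
class Worker:
    actor: object  # would be a ray.actor.ActorHandle in the real code
    metadata: WorkerMetadata


worker = Worker(
    actor=None,  # placeholder for the Ray actor handle
    metadata=WorkerMetadata(
        node_id="deadbeef", node_ip="10.0.0.2", hostname="node-2",
        gpu_ids=[0, 1]))

# Callers now read placement info directly from the driver-side metadata:
print(worker.metadata.node_ip, worker.metadata.gpu_ids)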
@@ -118,8 +161,11 @@ class WorkerGroup:
             self.start()
 
     def _create_worker(self):
-        return self._remote_cls.remote(*self._actor_cls_args,
-                                       **self._actor_cls_kwargs)
+        actor = self._remote_cls.remote(*self._actor_cls_args,
+                                        **self._actor_cls_kwargs)
+        actor_metadata = ray.get(
+            actor._BaseWorkerMixin__execute.remote(construct_metadata))
+        return Worker(actor=actor, metadata=actor_metadata)
 
     def start(self):
         """Starts all the workers in this worker group."""
@@ -145,9 +191,11 @@ class WorkerGroup:
         logger.debug(f"Shutting down {len(self.workers)} workers.")
         if patience_s <= 0:
             for worker in self.workers:
-                ray.kill(worker)
+                ray.kill(worker.actor)
         else:
-            done_refs = [w.__ray_terminate__.remote() for w in self.workers]
+            done_refs = [
+                w.actor.__ray_terminate__.remote() for w in self.workers
+            ]
             # Wait for actors to die gracefully.
             done, not_done = ray.wait(done_refs, timeout=patience_s)
             if not_done:
@@ -155,7 +203,7 @@ class WorkerGroup:
                     "force kill.")
                 # If all actors are not able to die gracefully, then kill them.
                 for worker in self.workers:
-                    ray.kill(worker)
+                    ray.kill(worker.actor)
 
         logger.debug("Shutdown successful.")
         self.workers = []
@@ -180,7 +228,7 @@ class WorkerGroup:
                 "create a new WorkerGroup or restart this one.")
 
         return [
-            w._BaseWorkerMixin__execute.remote(func, *args, **kwargs)
+            w.actor._BaseWorkerMixin__execute.remote(func, *args, **kwargs)
             for w in self.workers
         ]
 
@@ -214,7 +262,8 @@ class WorkerGroup:
         if worker_index >= len(self.workers):
             raise ValueError(f"The provided worker_index {worker_index} is "
                              f"not valid for {self.num_workers} workers.")
-        return self.workers[worker_index]._BaseWorkerMixin__execute.remote(
+        return self.workers[worker_index].actor._BaseWorkerMixin__execute\
+            .remote(
             func, *args, **kwargs)
 
     def execute_single(self, worker_index: int, func: Callable[..., T], *args,