From d354161528612b2d430e47e0aced5b3b0a22628a Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Wed, 22 Sep 2021 18:49:41 -0700 Subject: [PATCH] [SGD] Link `ray.sgd` namespace to `ray.util.sgd.v2` (#18732) * wip * add symlink * update * remove from init * no require tune * try fix * change * * import * fix docs * address comment --- doc/source/raysgd/raysgd.rst | 4 +-- doc/source/raysgd/v2/api.rst | 29 ++++++++++---------- doc/source/raysgd/v2/raysgd.rst | 4 +-- doc/source/raysgd/v2/user_guide.rst | 32 +++++++++++----------- python/ray/setup-dev.py | 1 + python/ray/sgd/__init__.py | 2 ++ python/ray/util/sgd/torch/torch_trainer.py | 19 +++++++++++-- 7 files changed, 55 insertions(+), 36 deletions(-) create mode 100644 python/ray/sgd/__init__.py diff --git a/doc/source/raysgd/raysgd.rst b/doc/source/raysgd/raysgd.rst index 245179623..87696e68d 100644 --- a/doc/source/raysgd/raysgd.rst +++ b/doc/source/raysgd/raysgd.rst @@ -5,8 +5,8 @@ RaySGD: Distributed Training Wrappers ===================================== -.. tip:: We are rolling out a lighter-weight version of RaySGD. See the - documentation :ref:`here `. +.. warning:: This is an older version of Ray SGD. A newer, more light-weight version of Ray SGD is in alpha as of Ray 1.7. + See the documentation :ref:`here `. RaySGD is a lightweight library for distributed deep learning, providing thin wrappers around PyTorch and TensorFlow native modules for data parallel training. diff --git a/doc/source/raysgd/v2/api.rst b/doc/source/raysgd/v2/api.rst index 382a8d8fc..bb4a871cb 100644 --- a/doc/source/raysgd/v2/api.rst +++ b/doc/source/raysgd/v2/api.rst @@ -1,3 +1,4 @@ + .. _sgd-api: RaySGD API @@ -8,7 +9,7 @@ RaySGD API Trainer ------- -.. autoclass:: ray.util.sgd.v2.Trainer +.. autoclass:: ray.sgd.Trainer :members: .. _sgd-api-iterator: @@ -16,7 +17,7 @@ Trainer SGDIterator ~~~~~~~~~~~ -.. autoclass:: ray.util.sgd.v2.SGDIterator +.. autoclass:: ray.sgd.SGDIterator :members: .. 
_sgd-api-backend-config: @@ -24,35 +25,35 @@ SGDIterator BackendConfig ------------- -.. autoclass:: ray.util.sgd.v2.BackendConfig +.. autoclass:: ray.sgd.BackendConfig .. _sgd-api-torch-config: TorchConfig ~~~~~~~~~~~ -.. autoclass:: ray.util.sgd.v2.TorchConfig +.. autoclass:: ray.sgd.TorchConfig .. _sgd-api-tensorflow-config: TensorflowConfig ~~~~~~~~~~~~~~~~ -.. autoclass:: ray.util.sgd.v2.TensorflowConfig +.. autoclass:: ray.sgd.TensorflowConfig .. _sgd-api-horovod-config: HorovodConfig ~~~~~~~~~~~~~ -.. autoclass:: ray.util.sgd.v2.HorovodConfig +.. autoclass:: ray.sgd.HorovodConfig .. _sgd-api-callback: SGDCallback ----------- -.. autoclass:: ray.util.sgd.v2.SGDCallback +.. autoclass:: ray.sgd.SGDCallback :members: .. _sgd-api-json-logger-callback: @@ -60,21 +61,21 @@ SGDCallback JsonLoggerCallback ~~~~~~~~~~~~~~~~~~ -.. autoclass:: ray.util.sgd.v2.callbacks.JsonLoggerCallback +.. autoclass:: ray.sgd.JsonLoggerCallback .. _sgd-api-tbx-logger-callback: TBXLoggerCallback ~~~~~~~~~~~~~~~~~ -.. autoclass:: ray.util.sgd.v2.callbacks.TBXLoggerCallback +.. autoclass:: ray.sgd.TBXLoggerCallback .. _sgd-api-checkpoint-strategy: CheckpointStrategy ------------------ -.. autoclass:: ray.util.sgd.v2.CheckpointStrategy +.. autoclass:: ray.sgd.CheckpointStrategy Training Function Utilities --------------------------- @@ -82,19 +83,19 @@ Training Function Utilities sgd.report ~~~~~~~~~~ -.. autofunction:: ray.util.sgd.v2.report +.. autofunction:: ray.sgd.report sgd.load_checkpoint ~~~~~~~~~~~~~~~~~~~ -.. autofunction:: ray.util.sgd.v2.load_checkpoint +.. autofunction:: ray.sgd.load_checkpoint sgd.save_checkpoint ~~~~~~~~~~~~~~~~~~~ -.. autofunction:: ray.util.sgd.v2.save_checkpoint +.. autofunction:: ray.sgd.save_checkpoint sgd.world_rank ~~~~~~~~~~~~~~ -.. autofunction:: ray.util.sgd.v2.world_rank \ No newline at end of file +.. 
autofunction:: ray.sgd.world_rank \ No newline at end of file diff --git a/doc/source/raysgd/v2/raysgd.rst b/doc/source/raysgd/v2/raysgd.rst index 374603134..02111cdae 100644 --- a/doc/source/raysgd/v2/raysgd.rst +++ b/doc/source/raysgd/v2/raysgd.rst @@ -146,7 +146,7 @@ system. Let's take following simple examples: .. code-block:: python - from ray.util.sgd.v2 import Trainer + from ray.sgd import Trainer trainer = Trainer(backend="torch", num_workers=4) trainer.start() @@ -246,7 +246,7 @@ system. Let's take following simple examples: .. code-block:: python - from ray.util.sgd.v2 import Trainer + from ray.sgd import Trainer trainer = Trainer(backend="tensorflow", num_workers=4) trainer.start() diff --git a/doc/source/raysgd/v2/user_guide.rst b/doc/source/raysgd/v2/user_guide.rst index 88b528390..9b2d223f3 100644 --- a/doc/source/raysgd/v2/user_guide.rst +++ b/doc/source/raysgd/v2/user_guide.rst @@ -213,7 +213,7 @@ configurations. As an example: .. code-block:: python - from ray.util.sgd.v2 import Trainer + from ray.sgd import Trainer def train_func(config): results = [] @@ -340,8 +340,8 @@ You can plug all of these into RaySGD with the following interface: .. code-block:: python - from ray.util.sgd import v2 as sgd - from ray.util.sgd.v2 import SGDCallback, Trainer + from ray import sgd + from ray.sgd import SGDCallback, Trainer from typing import List, Dict class PrintingCallback(SGDCallback): @@ -395,7 +395,7 @@ A simple example for creating a callback that will print out results: .. code-block:: python - from ray.util.sgd.v2 import SGDCallback + from ray.sgd import SGDCallback class PrintingCallback(SGDCallback): def handle_result(self, results: List[Dict], **info): @@ -422,8 +422,8 @@ Here is an example: ..
code-block:: python - from ray.util.sgd import v2 as sgd - from ray.util.sgd.v2 import SGDCallback, Trainer + from ray import sgd + from ray.sgd import SGDCallback, Trainer from typing import List, Dict import torch @@ -477,8 +477,8 @@ The latest saved checkpoint can be accessed through the ``Trainer``'s .. code-block:: python - from ray.util.sgd import v2 as sgd - from ray.util.sgd.v2 import Trainer + from ray import sgd + from ray.sgd import Trainer def train_func(config): model = 0 # This should be replaced with a real model. @@ -519,8 +519,8 @@ As an example, to disable writing checkpoints to disk: .. code-block:: python :emphasize-lines: 8,12 - from ray.util.sgd import v2 as sgd - from ray.util.sgd.v2 import CheckpointStrategy, Trainer + from ray import sgd + from ray.sgd import CheckpointStrategy, Trainer def train_func(): for epoch in range(3): @@ -550,8 +550,8 @@ Checkpoints can be loaded into the training function in 2 steps: .. code-block:: python - from ray.util.sgd import v2 as sgd - from ray.util.sgd.v2 import Trainer + from ray import sgd + from ray.sgd import Trainer def train_func(config): checkpoint = sgd.load_checkpoint() or {} @@ -662,8 +662,8 @@ produce an object ("Trainable") that will be passed to Ray Tune. .. code-block:: python - from ray.util.sgd import v2 as sgd - from ray.util.sgd.v2 import Trainer + from ray import sgd + from ray.sgd import Trainer def train_func(config): # In this example, nothing is expected to change over epochs, @@ -704,8 +704,8 @@ A couple caveats: ..
code-block:: python from ray import tune - from ray.util.sgd import v2 as sgd - from ray.util.sgd.v2 import Trainer + from ray import sgd + from ray.sgd import Trainer def train_func(config): # In this example, nothing is expected to change over epochs, diff --git a/python/ray/setup-dev.py b/python/ray/setup-dev.py index 9873a2cc6..81a0a0aca 100755 --- a/python/ray/setup-dev.py +++ b/python/ray/setup-dev.py @@ -74,6 +74,7 @@ if __name__ == "__main__": do_link("rllib", force=args.yes, local_path="../../../rllib") do_link("tune", force=args.yes) + do_link("sgd", force=args.yes) do_link("autoscaler", force=args.yes) do_link("ray_operator", force=args.yes) do_link("cloudpickle", force=args.yes) diff --git a/python/ray/sgd/__init__.py b/python/ray/sgd/__init__.py new file mode 100644 index 000000000..c5d4677aa --- /dev/null +++ b/python/ray/sgd/__init__.py @@ -0,0 +1,2 @@ +from ray.util.sgd.v2 import * # noqa: F401, F403 +from ray.util.sgd.v2.callbacks import JsonLoggerCallback, TBXLoggerCallback # noqa: E501, F401, F403 diff --git a/python/ray/util/sgd/torch/torch_trainer.py b/python/ray/util/sgd/torch/torch_trainer.py index 24258290f..d5bda476e 100644 --- a/python/ray/util/sgd/torch/torch_trainer.py +++ b/python/ray/util/sgd/torch/torch_trainer.py @@ -10,8 +10,6 @@ import torch import torch.distributed as dist import ray -from ray.tune import PlacementGroupFactory, Trainable -from ray.tune.utils.util import merge_dicts from ray.util import log_once from ray.util.annotations import PublicAPI from ray.util.sgd.torch.worker_group import LocalWorkerGroup, \ @@ -20,6 +18,20 @@ from ray.util.sgd.utils import NUM_SAMPLES, BATCH_SIZE from ray.util.sgd.torch.constants import VALID_SCHEDULER_STEP, NCCL_TIMEOUT_S from ray.util.sgd.data import Dataset +try: + from ray.tune import Trainable + from ray.tune import PlacementGroupFactory + from ray.tune.utils.util import merge_dicts + TUNE_INSTALLED = True +except ImportError: + TUNE_INSTALLED = False + Trainable = PlacementGroupFactory
= object + + def noop(): + return + + merge_dicts = noop + logger = logging.getLogger(__name__) @@ -652,6 +664,9 @@ class TorchTrainer: training epoch for each tune iteration. """ + if not TUNE_INSTALLED: + raise RuntimeError("Please install `ray[tune]` to use the Tune " + "integration.") if override_tune_step is not None: callback_args = inspect.signature(override_tune_step) if not len(callback_args.parameters) == 2: