[SGD] Link ray.sgd namespace to ray.util.sgd.v2 (#18732)

* wip

* add symlink

* update

* remove from init

* no require tune

* try fix

* change

* import

* fix docs

* address comment
Amog Kamsetty 2021-09-22 18:49:41 -07:00 committed by GitHub
parent e41109a5e7
commit d354161528
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 55 additions and 36 deletions

View file

@@ -5,8 +5,8 @@ RaySGD: Distributed Training Wrappers
 =====================================
 
-.. tip:: We are rolling out a lighter-weight version of RaySGD. See the
-    documentation :ref:`here <sgd-v2-docs>`.
+.. warning:: This is an older version of Ray SGD. A newer, more light-weight version of Ray SGD is in alpha as of Ray 1.7.
+    See the documentation :ref:`here <sgd-v2-docs>`.
 
 RaySGD is a lightweight library for distributed deep learning, providing thin wrappers around PyTorch and TensorFlow native modules for data parallel training.

View file

@@ -1,3 +1,4 @@
+.. _sgd-api:
 
 RaySGD API
@@ -8,7 +9,7 @@ RaySGD API
 Trainer
 -------
 
-.. autoclass:: ray.util.sgd.v2.Trainer
+.. autoclass:: ray.sgd.Trainer
     :members:
 
 .. _sgd-api-iterator:
@@ -16,7 +17,7 @@ Trainer
 SGDIterator
 ~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.SGDIterator
+.. autoclass:: ray.sgd.SGDIterator
     :members:
 
 .. _sgd-api-backend-config:
@@ -24,35 +25,35 @@ SGDIterator
 BackendConfig
 -------------
 
-.. autoclass:: ray.util.sgd.v2.BackendConfig
+.. autoclass:: ray.sgd.BackendConfig
 
 .. _sgd-api-torch-config:
 
 TorchConfig
 ~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.TorchConfig
+.. autoclass:: ray.sgd.TorchConfig
 
 .. _sgd-api-tensorflow-config:
 
 TensorflowConfig
 ~~~~~~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.TensorflowConfig
+.. autoclass:: ray.sgd.TensorflowConfig
 
 .. _sgd-api-horovod-config:
 
 HorovodConfig
 ~~~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.HorovodConfig
+.. autoclass:: ray.sgd.HorovodConfig
 
 .. _sgd-api-callback:
 
 SGDCallback
 -----------
 
-.. autoclass:: ray.util.sgd.v2.SGDCallback
+.. autoclass:: ray.sgd.SGDCallback
     :members:
 
 .. _sgd-api-json-logger-callback:
@@ -60,21 +61,21 @@ SGDCallback
 JsonLoggerCallback
 ~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.callbacks.JsonLoggerCallback
+.. autoclass:: ray.sgd.JsonLoggerCallback
 
 .. _sgd-api-tbx-logger-callback:
 
 TBXLoggerCallback
 ~~~~~~~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.callbacks.TBXLoggerCallback
+.. autoclass:: ray.sgd.TBXLoggerCallback
 
 .. _sgd-api-checkpoint-strategy:
 
 CheckpointStrategy
 ------------------
 
-.. autoclass:: ray.util.sgd.v2.CheckpointStrategy
+.. autoclass:: ray.sgd.CheckpointStrategy
 
 Training Function Utilities
 ---------------------------
@@ -82,19 +83,19 @@ Training Function Utilities
 sgd.report
 ~~~~~~~~~~
 
-.. autofunction:: ray.util.sgd.v2.report
+.. autofunction:: ray.sgd.report
 
 sgd.load_checkpoint
 ~~~~~~~~~~~~~~~~~~~
 
-.. autofunction:: ray.util.sgd.v2.load_checkpoint
+.. autofunction:: ray.sgd.load_checkpoint
 
 sgd.save_checkpoint
 ~~~~~~~~~~~~~~~~~~~
 
-.. autofunction:: ray.util.sgd.v2.save_checkpoint
+.. autofunction:: ray.sgd.save_checkpoint
 
 sgd.world_rank
 ~~~~~~~~~~~~~~
 
-.. autofunction:: ray.util.sgd.v2.world_rank
+.. autofunction:: ray.sgd.world_rank

View file

@@ -146,7 +146,7 @@ system. Let's take following simple examples:
 
 .. code-block:: python
 
-    from ray.util.sgd.v2 import Trainer
+    from ray.sgd import Trainer
 
     trainer = Trainer(backend="torch", num_workers=4)
     trainer.start()
@@ -246,7 +246,7 @@ system. Let's take following simple examples:
 
 .. code-block:: python
 
-    from ray.util.sgd.v2 import Trainer
+    from ray.sgd import Trainer
 
     trainer = Trainer(backend="tensorflow", num_workers=4)
    trainer.start()
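The snippets in this file only construct and start the Trainer. For completeness, a hedged sketch of the full lifecycle under the new import path: run() executes the training function once per worker and returns the per-worker return values.

from ray.sgd import Trainer


def train_func():
    return 1  # placeholder for real training logic


trainer = Trainer(backend="torch", num_workers=4)
trainer.start()
results = trainer.run(train_func)  # expected: [1, 1, 1, 1], one per worker
trainer.shutdown()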

View file

@@ -213,7 +213,7 @@ configurations. As an example:
 
 .. code-block:: python
 
-    from ray.util.sgd.v2 import Trainer
+    from ray.sgd import Trainer
 
     def train_func(config):
         results = []
@@ -340,8 +340,8 @@ You can plug all of these into RaySGD with the following interface:
 
 .. code-block:: python
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import SGDCallback, Trainer
+    from ray import sgd
+    from ray.sgd import SGDCallback, Trainer
     from typing import List, Dict
 
     class PrintingCallback(SGDCallback):
@@ -395,7 +395,7 @@ A simple example for creating a callback that will print out results:
 
 .. code-block:: python
 
-    from ray.util.sgd.v2 import SGDCallback
+    from ray.sgd import SGDCallback
 
     class PrintingCallback(SGDCallback):
         def handle_result(self, results: List[Dict], **info):
@@ -422,8 +422,8 @@ Here is an example:
 
 .. code-block:: python
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import SGDCallback, Trainer
+    from ray import sgd
+    from ray.sgd import SGDCallback, Trainer
     from typing import List, Dict
 
     import torch
@@ -477,8 +477,8 @@ The latest saved checkpoint can be accessed through the ``Trainer``'s
 
 .. code-block:: python
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import Trainer
+    from ray import sgd
+    from ray.sgd import Trainer
 
     def train_func(config):
         model = 0 # This should be replaced with a real model.
@@ -519,8 +519,8 @@ As an example, to disable writing checkpoints to disk:
 
 .. code-block:: python
     :emphasize-lines: 8,12
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import CheckpointStrategy, Trainer
+    from ray import sgd
+    from ray.sgd import CheckpointStrategy, Trainer
 
     def train_func():
         for epoch in range(3):
@@ -550,8 +550,8 @@ Checkpoints can be loaded into the training function in 2 steps:
 
 .. code-block:: python
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import Trainer
+    from ray import sgd
+    from ray.sgd import Trainer
 
     def train_func(config):
         checkpoint = sgd.load_checkpoint() or {}
@@ -662,8 +662,8 @@ produce an object ("Trainable") that will be passed to Ray Tune.
 
 .. code-block:: python
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import Trainer
+    from ray import sgd
+    from ray.sgd import Trainer
 
     def train_func(config):
         # In this example, nothing is expected to change over epochs,
@@ -704,8 +704,8 @@ A couple caveats:
 
 .. code-block:: python
 
     from ray import tune
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import Trainer
+    from ray import sgd
+    from ray.sgd import Trainer
 
     def train_func(config):
         # In this example, nothing is expected to change over epochs,
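All of this guide's hunks apply the same import swap, so here is one consolidated, hedged sketch of the renamed imports in use. The run() keyword names and CheckpointStrategy(num_to_keep=0), which follows the guide's "disable writing checkpoints to disk" example, are as I understand the SGD v2 API of this era; treat them as assumptions.

from typing import Dict, List

from ray import sgd
from ray.sgd import CheckpointStrategy, SGDCallback, Trainer


class PrintingCallback(SGDCallback):
    def handle_result(self, results: List[Dict], **info):
        # Called with one result dict per worker for every sgd.report().
        print(results)


def train_func():
    for epoch in range(3):
        sgd.save_checkpoint(epoch=epoch)
        sgd.report(epoch=epoch)


trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
trainer.run(
    train_func,
    callbacks=[PrintingCallback()],
    # num_to_keep=0 keeps checkpoints in memory only, never on disk.
    checkpoint_strategy=CheckpointStrategy(num_to_keep=0),
)
print(trainer.latest_checkpoint)  # the guide's latest-checkpoint accessor
trainer.shutdown()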

View file

@@ -74,6 +74,7 @@ if __name__ == "__main__":
     do_link("rllib", force=args.yes, local_path="../../../rllib")
     do_link("tune", force=args.yes)
+    do_link("sgd", force=args.yes)
     do_link("autoscaler", force=args.yes)
     do_link("ray_operator", force=args.yes)
     do_link("cloudpickle", force=args.yes)

View file

@@ -0,0 +1,2 @@
+from ray.util.sgd.v2 import *  # noqa: F401, F403
+from ray.util.sgd.v2.callbacks import JsonLoggerCallback, TBXLoggerCallback  # noqa: E501, F401, F403
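This two-line shim is what makes the doc changes above work: ray.sgd becomes an alias of ray.util.sgd.v2, with the two logger callbacks lifted to the top level. An illustrative check of the resulting aliasing:

import ray.sgd
import ray.util.sgd.v2

# Star-import re-export: both paths resolve to the same class objects.
assert ray.sgd.Trainer is ray.util.sgd.v2.Trainer

# Callbacks move up a level: ray.sgd.JsonLoggerCallback replaces
# ray.util.sgd.v2.callbacks.JsonLoggerCallback.
from ray.sgd import JsonLoggerCallback, TBXLoggerCallback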

View file

@@ -10,8 +10,6 @@ import torch
 import torch.distributed as dist
 
 import ray
-from ray.tune import PlacementGroupFactory, Trainable
-from ray.tune.utils.util import merge_dicts
 from ray.util import log_once
 from ray.util.annotations import PublicAPI
 from ray.util.sgd.torch.worker_group import LocalWorkerGroup, \
@@ -20,6 +18,20 @@ from ray.util.sgd.utils import NUM_SAMPLES, BATCH_SIZE
 from ray.util.sgd.torch.constants import VALID_SCHEDULER_STEP, NCCL_TIMEOUT_S
 from ray.util.sgd.data import Dataset
 
+try:
+    from ray.tune import Trainable
+    from ray.tune import PlacementGroupFactory
+    from ray.tune.utils.util import merge_dicts
+    TUNE_INSTALLED = True
+except ImportError:
+    TUNE_INSTALLED = False
+    Trainable = PlacementGroupFactory = object
+
+    def noop():
+        return
+
+    merge_dicts = noop
+
 logger = logging.getLogger(__name__)
@@ -652,6 +664,9 @@ class TorchTrainer:
             training epoch for each tune iteration.
         """
+        if not TUNE_INSTALLED:
+            raise RuntimeError("Please install `ray[tune]` to use the Tune "
+                               "integration.")
         if override_tune_step is not None:
            callback_args = inspect.signature(override_tune_step)
            if not len(callback_args.parameters) == 2:
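The torch_trainer change is a standard soft-dependency guard: import Tune once at module load, record whether it succeeded, and raise an actionable error only when the Tune integration is actually used. The same pattern in isolation (an illustrative sketch, not the commit's code):

try:
    import ray.tune  # soft dependency; may be absent in a minimal install
    TUNE_INSTALLED = True
except ImportError:
    TUNE_INSTALLED = False


def as_trainable():
    # Fail at use time, not import time, so the rest of the module still
    # works without Tune installed.
    if not TUNE_INSTALLED:
        raise RuntimeError(
            "Please install `ray[tune]` to use the Tune integration.")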