mirror of https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[SGD] Link ray.sgd namespace to ray.util.sgd.v2 (#18732)
* wip
* add symlink
* update
* remove from init
* no require tune
* try fix
* change
* import
* fix docs
* address comment
parent e41109a5e7
commit d354161528
7 changed files with 55 additions and 36 deletions
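
In practice, the change makes ``ray.sgd`` a thin alias for ``ray.util.sgd.v2``: the new ``python/ray/sgd/__init__.py`` below re-exports everything from the v2 package, and the docs are updated to use the shorter path. A minimal sketch of the resulting equivalence (assuming a Ray build containing this commit):

.. code-block:: python

    # Both paths resolve to the same class object, because ray.sgd
    # star-imports from ray.util.sgd.v2 (see the new __init__.py below).
    from ray.util.sgd.v2 import Trainer as V2Trainer
    from ray.sgd import Trainer

    assert Trainer is V2Trainer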
@@ -5,8 +5,8 @@ RaySGD: Distributed Training Wrappers
 =====================================
 
-.. tip:: We are rolling out a lighter-weight version of RaySGD. See the
-         documentation :ref:`here <sgd-v2-docs>`.
+.. warning:: This is an older version of Ray SGD. A newer, more light-weight version of Ray SGD is in alpha as of Ray 1.7.
+             See the documentation :ref:`here <sgd-v2-docs>`.
 
 RaySGD is a lightweight library for distributed deep learning, providing thin wrappers around PyTorch and TensorFlow native modules for data parallel training.

@@ -1,3 +1,4 @@
 .. _sgd-api:
 
 RaySGD API
@@ -8,7 +9,7 @@ RaySGD API
 Trainer
 -------
 
-.. autoclass:: ray.util.sgd.v2.Trainer
+.. autoclass:: ray.sgd.Trainer
     :members:
 
 .. _sgd-api-iterator:
@@ -16,7 +17,7 @@ Trainer
 SGDIterator
 ~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.SGDIterator
+.. autoclass:: ray.sgd.SGDIterator
     :members:
 
 .. _sgd-api-backend-config:
@@ -24,35 +25,35 @@ SGDIterator
 BackendConfig
 -------------
 
-.. autoclass:: ray.util.sgd.v2.BackendConfig
+.. autoclass:: ray.sgd.BackendConfig
 
 .. _sgd-api-torch-config:
 
 TorchConfig
 ~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.TorchConfig
+.. autoclass:: ray.sgd.TorchConfig
 
 .. _sgd-api-tensorflow-config:
 
 TensorflowConfig
 ~~~~~~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.TensorflowConfig
+.. autoclass:: ray.sgd.TensorflowConfig
 
 .. _sgd-api-horovod-config:
 
 HorovodConfig
 ~~~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.HorovodConfig
+.. autoclass:: ray.sgd.HorovodConfig
 
 .. _sgd-api-callback:
 
 SGDCallback
 -----------
 
-.. autoclass:: ray.util.sgd.v2.SGDCallback
+.. autoclass:: ray.sgd.SGDCallback
     :members:
 
 .. _sgd-api-json-logger-callback:
@@ -60,21 +61,21 @@ SGDCallback
 JsonLoggerCallback
 ~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.callbacks.JsonLoggerCallback
+.. autoclass:: ray.sgd.JsonLoggerCallback
 
 .. _sgd-api-tbx-logger-callback:
 
 TBXLoggerCallback
 ~~~~~~~~~~~~~~~~~
 
-.. autoclass:: ray.util.sgd.v2.callbacks.TBXLoggerCallback
+.. autoclass:: ray.sgd.TBXLoggerCallback
 
 .. _sgd-api-checkpoint-strategy:
 
 CheckpointStrategy
 ------------------
 
-.. autoclass:: ray.util.sgd.v2.CheckpointStrategy
+.. autoclass:: ray.sgd.CheckpointStrategy
 
 Training Function Utilities
 ---------------------------
@@ -82,19 +83,19 @@ Training Function Utilities
 sgd.report
 ~~~~~~~~~~
 
-.. autofunction:: ray.util.sgd.v2.report
+.. autofunction:: ray.sgd.report
 
 sgd.load_checkpoint
 ~~~~~~~~~~~~~~~~~~~
 
-.. autofunction:: ray.util.sgd.v2.load_checkpoint
+.. autofunction:: ray.sgd.load_checkpoint
 
 sgd.save_checkpoint
 ~~~~~~~~~~~~~~~~~~~
 
-.. autofunction:: ray.util.sgd.v2.save_checkpoint
+.. autofunction:: ray.sgd.save_checkpoint
 
 sgd.world_rank
 ~~~~~~~~~~~~~~
 
-.. autofunction:: ray.util.sgd.v2.world_rank
+.. autofunction:: ray.sgd.world_rank

@@ -146,7 +146,7 @@ system. Let's take following simple examples:
 
 .. code-block:: python
 
-    from ray.util.sgd.v2 import Trainer
+    from ray.sgd import Trainer
 
     trainer = Trainer(backend="torch", num_workers=4)
     trainer.start()
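
For context, a complete runnable version of this quickstart could look as follows; the training function and its body are assumptions for illustration, not part of the diff:

.. code-block:: python

    from ray.sgd import Trainer

    def train_func():
        # Placeholder training logic; returns one value per worker.
        return 1

    trainer = Trainer(backend="torch", num_workers=4)
    trainer.start()
    results = trainer.run(train_func)  # one result per worker: [1, 1, 1, 1]
    trainer.shutdown()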
@@ -246,7 +246,7 @@ system. Let's take following simple examples:
 
 .. code-block:: python
 
-    from ray.util.sgd.v2 import Trainer
+    from ray.sgd import Trainer
 
     trainer = Trainer(backend="tensorflow", num_workers=4)
     trainer.start()

@@ -213,7 +213,7 @@ configurations. As an example:
 
 .. code-block:: python
 
-    from ray.util.sgd.v2 import Trainer
+    from ray.sgd import Trainer
 
     def train_func(config):
         results = []
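
The hunk cuts the example off right after ``results = []``; a sketch of how it plausibly continues, with ``config`` driving the loop (the loop body and config key are assumed):

.. code-block:: python

    from ray.sgd import Trainer

    def train_func(config):
        results = []
        for i in range(config["num_epochs"]):  # assumed key
            results.append(i)
        return results

    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    # config is forwarded to each worker's copy of train_func.
    print(trainer.run(train_func, config={"num_epochs": 3}))
    trainer.shutdown()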
@@ -340,8 +340,8 @@ You can plug all of these into RaySGD with the following interface:
 
 .. code-block:: python
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import SGDCallback, Trainer
+    from ray import sgd
+    from ray.sgd import SGDCallback, Trainer
     from typing import List, Dict
 
     class PrintingCallback(SGDCallback):
@@ -395,7 +395,7 @@ A simple example for creating a callback that will print out results:
 
 .. code-block:: python
 
-    from ray.util.sgd.v2 import SGDCallback
+    from ray.sgd import SGDCallback
 
     class PrintingCallback(SGDCallback):
         def handle_result(self, results: List[Dict], **info):
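
A self-contained version of the callback example under the new namespace; the printing behavior follows the surrounding docs, while the training function and its reported metric are assumed:

.. code-block:: python

    from typing import Dict, List

    from ray import sgd
    from ray.sgd import SGDCallback, Trainer

    class PrintingCallback(SGDCallback):
        def handle_result(self, results: List[Dict], **info):
            # One result dict per worker, delivered after each sgd.report().
            print(results)

    def train_func():
        sgd.report(loss=0.5)  # assumed metric

    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    trainer.run(train_func, callbacks=[PrintingCallback()])
    trainer.shutdown()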
@@ -422,8 +422,8 @@ Here is an example:
 
 .. code-block:: python
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import SGDCallback, Trainer
+    from ray import sgd
+    from ray.sgd import SGDCallback, Trainer
     from typing import List, Dict
 
     import torch
@@ -477,8 +477,8 @@ The latest saved checkpoint can be accessed through the ``Trainer``'s
 
 .. code-block:: python
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import Trainer
+    from ray import sgd
+    from ray.sgd import Trainer
 
     def train_func(config):
         model = 0 # This should be replaced with a real model.
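
A minimal sketch of the full flow: the training function writes state with ``sgd.save_checkpoint``, and the driver reads it back through the ``Trainer``'s ``latest_checkpoint`` attribute (the checkpointed fields are assumed):

.. code-block:: python

    from ray import sgd
    from ray.sgd import Trainer

    def train_func():
        for epoch in range(3):
            # Persist whatever state is needed to resume later.
            sgd.save_checkpoint(epoch=epoch)

    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    trainer.run(train_func)
    print(trainer.latest_checkpoint)
    trainer.shutdown()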
@@ -519,8 +519,8 @@ As an example, to disable writing checkpoints to disk:
 .. code-block:: python
     :emphasize-lines: 8,12
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import CheckpointStrategy, Trainer
+    from ray import sgd
+    from ray.sgd import CheckpointStrategy, Trainer
 
     def train_func():
         for epoch in range(3):
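
To match the surrounding text ("disable writing checkpoints to disk"), the strategy is presumably handed to ``Trainer.run``; a sketch assuming ``CheckpointStrategy`` accepts a ``num_to_keep`` argument:

.. code-block:: python

    from ray import sgd
    from ray.sgd import CheckpointStrategy, Trainer

    def train_func():
        for epoch in range(3):
            sgd.save_checkpoint(epoch=epoch)

    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    # num_to_keep=0: keep checkpoints in memory only (assumed semantics).
    trainer.run(train_func,
                checkpoint_strategy=CheckpointStrategy(num_to_keep=0))
    trainer.shutdown()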
@@ -550,8 +550,8 @@ Checkpoints can be loaded into the training function in 2 steps:
 
 .. code-block:: python
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import Trainer
+    from ray import sgd
+    from ray.sgd import Trainer
 
     def train_func(config):
         checkpoint = sgd.load_checkpoint() or {}
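
End to end, the two loading steps referenced above plausibly look like this; the checkpoint contents (a bare epoch counter) are an illustration:

.. code-block:: python

    from ray import sgd
    from ray.sgd import Trainer

    def train_func():
        # Step 1: read the checkpoint inside the training function.
        checkpoint = sgd.load_checkpoint() or {}
        start_epoch = checkpoint.get("epoch", 0)
        for epoch in range(start_epoch, 3):
            sgd.save_checkpoint(epoch=epoch)

    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    # Step 2: pass a checkpoint into run() to resume from it.
    trainer.run(train_func, checkpoint={"epoch": 1})
    trainer.shutdown()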
@@ -662,8 +662,8 @@ produce an object ("Trainable") that will be passed to Ray Tune.
 
 .. code-block:: python
 
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import Trainer
+    from ray import sgd
+    from ray.sgd import Trainer
 
     def train_func(config):
         # In this example, nothing is expected to change over epochs,
@@ -704,8 +704,8 @@ A couple caveats:
 .. code-block:: python
 
     from ray import tune
-    from ray.util.sgd import v2 as sgd
-    from ray.util.sgd.v2 import Trainer
+    from ray import sgd
+    from ray.sgd import Trainer
 
     def train_func(config):
         # In this example, nothing is expected to change over epochs,
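
Pulling the two Tune-related hunks together, a runnable sketch of handing a trainer to Tune; the objective and the ``to_tune_trainable`` conversion step are assumptions consistent with the surrounding description of producing a "Trainable":

.. code-block:: python

    from ray import tune
    from ray import sgd
    from ray.sgd import Trainer

    def train_func(config):
        # Report a metric so Tune has something to optimize.
        sgd.report(loss=config["lr"])

    trainer = Trainer(backend="torch", num_workers=2)
    # Assumed API: wrap the trainer and function into a Tune Trainable.
    trainable = trainer.to_tune_trainable(train_func)
    analysis = tune.run(trainable,
                        config={"lr": tune.grid_search([0.1, 0.01])})
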
@@ -74,6 +74,7 @@ if __name__ == "__main__":
 
     do_link("rllib", force=args.yes, local_path="../../../rllib")
     do_link("tune", force=args.yes)
+    do_link("sgd", force=args.yes)
     do_link("autoscaler", force=args.yes)
     do_link("ray_operator", force=args.yes)
     do_link("cloudpickle", force=args.yes)

python/ray/sgd/__init__.py (new file, 2 additions)

@@ -0,0 +1,2 @@
+from ray.util.sgd.v2 import * # noqa: F401, F403
+from ray.util.sgd.v2.callbacks import JsonLoggerCallback, TBXLoggerCallback # noqa: E501, F401, F403
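
The explicit second import exists because the logger callbacks live in the ``callbacks`` submodule rather than at the ``v2`` top level, so the star import alone would not expose them under ``ray.sgd``. A quick check (assuming this commit is installed):

.. code-block:: python

    import ray.sgd
    from ray.util.sgd.v2.callbacks import JsonLoggerCallback

    # Re-exported by the explicit import in the new __init__.py.
    assert ray.sgd.JsonLoggerCallback is JsonLoggerCallback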
|
@ -10,8 +10,6 @@ import torch
|
|||
import torch.distributed as dist
|
||||
|
||||
import ray
|
||||
from ray.tune import PlacementGroupFactory, Trainable
|
||||
from ray.tune.utils.util import merge_dicts
|
||||
from ray.util import log_once
|
||||
from ray.util.annotations import PublicAPI
|
||||
from ray.util.sgd.torch.worker_group import LocalWorkerGroup, \
|
||||
|
@ -20,6 +18,20 @@ from ray.util.sgd.utils import NUM_SAMPLES, BATCH_SIZE
|
|||
from ray.util.sgd.torch.constants import VALID_SCHEDULER_STEP, NCCL_TIMEOUT_S
|
||||
from ray.util.sgd.data import Dataset
|
||||
|
||||
try:
|
||||
from ray.tune import Trainable
|
||||
from ray.tune import PlacementGroupFactory
|
||||
from ray.tune.utils.util import merge_dicts
|
||||
TUNE_INSTALLED = True
|
||||
except ImportError:
|
||||
TUNE_INSTALLED = False
|
||||
Trainable = PlacementGroupFactory = object
|
||||
|
||||
def noop():
|
||||
return
|
||||
|
||||
merge_dicts = noop
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
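
The guard above is a standard optional-dependency pattern: attempt the import at module load, record whether it succeeded, and bind inert placeholders so module-level references still resolve. The same shape in isolation, with a made-up dependency name:

.. code-block:: python

    try:
        import some_optional_dep  # hypothetical optional package
        DEP_INSTALLED = True
    except ImportError:
        DEP_INSTALLED = False
        some_optional_dep = None  # placeholder keeps references valid

    def feature_needing_dep():
        if not DEP_INSTALLED:
            raise RuntimeError(
                "Please install `some_optional_dep` to use this feature.")
        return some_optional_dep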
@@ -652,6 +664,9 @@ class TorchTrainer:
                 training epoch for each tune iteration.
 
         """
+        if not TUNE_INSTALLED:
+            raise RuntimeError("Please install `ray[tune]` to use the Tune "
+                               "integration.")
         if override_tune_step is not None:
             callback_args = inspect.signature(override_tune_step)
             if not len(callback_args.parameters) == 2:
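
Per the signature check above, a valid ``override_tune_step`` must take exactly two parameters; a conforming callback might look like this (parameter names are assumed, since the hunk is truncated before the error message):

.. code-block:: python

    import inspect

    def custom_tune_step(trainer, info):
        # Run one training epoch per Tune iteration.
        return trainer.train()

    # Mirrors the guard in the diff: exactly two parameters are required.
    assert len(inspect.signature(custom_tune_step).parameters) == 2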