Mirror of https://github.com/vale981/ray, synced 2025-03-06 02:21:39 -05:00
[RaySGD] Rename PyTorch API endpoints to start with Torch (#7425)
* Start renaming pytorch to torch
* Rename PyTorchTrainer to TorchTrainer
* Rename PyTorch runners to Torch runners
* Finish renaming API
* Rename to torch in tests
* Finish renaming docs + tests
* Run format + fix DeprecationWarning
* fix
* move tests up
* rename

Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
This commit is contained in:
parent f6883bf725
commit 3a134c7224
22 changed files with 222 additions and 218 deletions
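For RaySGD users, the change is a rename at the import and class level; the creator-function API documented in the diff below is otherwise unchanged. The following is a minimal usage sketch of the renamed API, not taken from this commit: the toy linear model, dataset, and hyperparameters are illustrative only.

```python
import ray
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset

from ray.util.sgd import TorchTrainer  # previously: from ray.util.sgd import PyTorchTrainer


def model_creator(config):
    # Returns one or more torch.nn.Module objects.
    return nn.Linear(1, 1)


def optimizer_creator(model, config):
    # Returns one or more torch optimizers for the model(s) above.
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-2))


def data_creator(config):
    # Returns one or two Dataset objects (train, and optionally validation).
    x = torch.randn(256, 1)
    return TensorDataset(x, 2 * x)


ray.init()

# Previously: trainer = PyTorchTrainer(...)
trainer = TorchTrainer(
    model_creator,
    data_creator,
    optimizer_creator,
    loss_creator=nn.MSELoss,  # a Torch loss constructor can be passed directly
    num_replicas=2)

stats = trainer.train()
print(stats)
trainer.shutdown()
```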
|
@ -31,6 +31,58 @@ fi
|
|||
|
||||
echo "Using Docker image" $DOCKER_SHA
|
||||
|
||||
|
||||
######################## SGD TESTS #################################
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python -m pytest /ray/python/ray/util/sgd/tests
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/doc/examples/doc_code/raysgd_torch_signatures.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/train_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/train_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/tune_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/tune_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py --smoke-test --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py --smoke-test --tune
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/dcgan.py --smoke-test --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --tune
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test --augment-data
|
||||
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
pytest /ray/python/ray/tune/tests/test_cluster.py
|
||||
|
||||
|
@ -139,54 +191,3 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE}
|
|||
# $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
# python /ray/python/ray/tune/examples/bohb_example.py \
|
||||
# --smoke-test
|
||||
|
||||
|
||||
######################## SGD TESTS #################################
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python -m pytest /ray/python/ray/util/sgd/tests
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/doc/examples/doc_code/raysgd_torch_signatures.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/train_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/train_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/tune_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/tune_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py --smoke-test --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py --smoke-test --tune
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/dcgan.py --smoke-test --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --tune
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test --augment-data
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# flake8: noqa
|
||||
"""
|
||||
This file holds code for the Pytorch Trainer creator signatures.
|
||||
This file holds code for the torch Trainer creator signatures.
|
||||
|
||||
It ignores yapf because yapf doesn't allow comments right after code blocks,
|
||||
but we put comments right after code blocks to prevent large white spaces
|
||||
|
@ -18,7 +18,7 @@ def model_creator(config):
|
|||
function to specify the optimization procedure for multiple models.
|
||||
|
||||
Args:
|
||||
config (dict): Configuration dictionary passed into ``PyTorchTrainer``.
|
||||
config (dict): Configuration dictionary passed into ``TorchTrainer``.
|
||||
|
||||
Returns:
|
||||
One or more torch.nn.Module objects.
|
||||
|
@ -36,7 +36,7 @@ def optimizer_creator(model, config):
|
|||
Args:
|
||||
models: The return values from ``model_creator``. This can be one
|
||||
or more torch nn modules.
|
||||
config (dict): Configuration dictionary passed into ``PyTorchTrainer``.
|
||||
config (dict): Configuration dictionary passed into ``TorchTrainer``.
|
||||
|
||||
Returns:
|
||||
One or more Torch optimizer objects.
|
||||
|
@ -46,7 +46,7 @@ def optimizer_creator(model, config):
|
|||
|
||||
|
||||
# __torch_data_start__
|
||||
from ray.util.sgd.pytorch.examples.train_example import LinearDataset
|
||||
from ray.util.sgd.torch.examples.train_example import LinearDataset
|
||||
|
||||
def data_creator(config):
|
||||
"""Constructs torch.utils.data.Dataset objects.
|
||||
|
@ -55,7 +55,7 @@ def data_creator(config):
|
|||
only one dataset will be used for training.
|
||||
|
||||
Args:
|
||||
config: Configuration dictionary passed into ``PyTorchTrainer``
|
||||
config: Configuration dictionary passed into ``TorchTrainer``
|
||||
|
||||
Returns:
|
||||
One or Two Dataset objects. If only one Dataset object is provided,
|
||||
|
@ -71,10 +71,10 @@ def loss_creator(config):
|
|||
"""Constructs the Torch Loss object.
|
||||
|
||||
Note that optionally, you can pass in a Torch Loss constructor directly
|
||||
into the PyTorchTrainer (i.e., ``PyTorchTrainer(loss_creator=nn.BCELoss, ...)``).
|
||||
into the TorchTrainer (i.e., ``TorchTrainer(loss_creator=nn.BCELoss, ...)``).
|
||||
|
||||
Args:
|
||||
config: Configuration dictionary passed into ``PyTorchTrainer``
|
||||
config: Configuration dictionary passed into ``TorchTrainer``
|
||||
|
||||
Returns:
|
||||
Torch Loss object.
|
||||
|
@ -91,7 +91,7 @@ def scheduler_creator(optimizer, config):
|
|||
Args:
|
||||
optimizers: The return values from ``optimizer_creator``.
|
||||
This can be one or more torch optimizer objects.
|
||||
config: Configuration dictionary passed into ``PyTorchTrainer``
|
||||
config: Configuration dictionary passed into ``TorchTrainer``
|
||||
|
||||
Returns:
|
||||
One or more Torch scheduler objects.
|
||||
|
@ -108,9 +108,9 @@ ray.init()
|
|||
# __torch_ray_end__
|
||||
|
||||
# __torch_trainer_start__
|
||||
from ray.util.sgd import PyTorchTrainer
|
||||
from ray.util.sgd import TorchTrainer
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
|
|
@ -7,7 +7,7 @@ RaySGD is a lightweight library for distributed deep learning, providing thin wr
|
|||
|
||||
The main features are:
|
||||
|
||||
- **Ease of use**: Scale Pytorch's native ``DistributedDataParallel`` and TensorFlow's ``tf.distribute.MirroredStrategy`` without needing to monitor individual nodes.
|
||||
- **Ease of use**: Scale PyTorch's native ``DistributedDataParallel`` and TensorFlow's ``tf.distribute.MirroredStrategy`` without needing to monitor individual nodes.
|
||||
- **Composability**: RaySGD is built on top of the Ray Actor API, enabling seamless integration with existing Ray applications such as RLlib, Tune, and Ray.Serve.
|
||||
- **Scale up and down**: Start on single CPU. Scale up to multi-node, multi-CPU, or multi-GPU clusters by changing 2 lines of code.
|
||||
|
||||
|
@ -20,7 +20,7 @@ The main features are:
|
|||
Getting Started
|
||||
---------------
|
||||
|
||||
You can start a ``PyTorchTrainer`` with the following:
|
||||
You can start a ``TorchTrainer`` with the following:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -29,7 +29,7 @@ You can start a ``PyTorchTrainer`` with the following:
|
|||
import torch.nn as nn
|
||||
from torch import distributed
|
||||
|
||||
from ray.util.sgd import PyTorchTrainer
|
||||
from ray.util.sgd import TorchTrainer
|
||||
from ray.util.sgd.examples.train_example import LinearDataset
|
||||
|
||||
|
||||
|
@ -48,7 +48,7 @@ You can start a ``PyTorchTrainer`` with the following:
|
|||
|
||||
ray.init()
|
||||
|
||||
trainer1 = PyTorchTrainer(
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
Distributed PyTorch
|
||||
===================
|
||||
|
||||
The RaySGD ``PyTorchTrainer`` simplifies distributed model training for PyTorch. The ``PyTorchTrainer`` is a wrapper around ``torch.distributed.launch`` with a Python API to easily incorporate distributed training into a larger Python application, as opposed to needing to wrap your training code in bash scripts.
|
||||
The RaySGD ``TorchTrainer`` simplifies distributed model training for PyTorch. The ``TorchTrainer`` is a wrapper around ``torch.distributed.launch`` with a Python API to easily incorporate distributed training into a larger Python application, as opposed to needing to wrap your training code in bash scripts.
|
||||
|
||||
Under the hood, ``PytorchTrainer`` will create *replicas* of your model (controlled by ``num_replicas``), each of which is managed by a Ray actor.
|
||||
Under the hood, ``TorchTrainer`` will create *replicas* of your model (controlled by ``num_replicas``), each of which is managed by a Ray actor.
|
||||
|
||||
.. image:: raysgd-actors.svg
|
||||
:align: center
|
||||
|
||||
For end to end examples leveraging RaySGD PyTorchTrainer, jump to :ref:`raysgd-pytorch-examples`.
|
||||
For end to end examples leveraging RaySGD TorchTrainer, jump to :ref:`raysgd-torch-examples`.
|
||||
|
||||
.. contents:: :local:
|
||||
|
||||
|
@ -17,19 +17,19 @@ Setting up training
|
|||
|
||||
.. tip:: Get in touch with us if you're using or considering using `RaySGD <https://forms.gle/26EMwdahdgm7Lscy9>`_!
|
||||
|
||||
The ``PyTorchTrainer`` can be constructed with functions that wrap components of the training script. Specifically, it requires constructors for the Model, Data, Optimizer, Loss, and ``lr_scheduler`` to create replicated copies across different devices and machines.
|
||||
The ``TorchTrainer`` can be constructed with functions that wrap components of the training script. Specifically, it requires constructors for the Model, Data, Optimizer, Loss, and ``lr_scheduler`` to create replicated copies across different devices and machines.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
:start-after: __torch_trainer_start__
|
||||
:end-before: __torch_trainer_end__
|
||||
|
||||
The below section covers the expected signatures of creator functions. Jump to :ref:`starting-pytorch-trainer`.
|
||||
The below section covers the expected signatures of creator functions. Jump to :ref:`starting-torch-trainer`.
|
||||
|
||||
Model Creator
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
This is the signature needed for ``PyTorchTrainer(model_creator=...)``.
|
||||
This is the signature needed for ``TorchTrainer(model_creator=...)``.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
|
@ -40,7 +40,7 @@ This is the signature needed for ``PyTorchTrainer(model_creator=...)``.
|
|||
Optimizer Creator
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
This is the signature needed for ``PyTorchTrainer(optimizer_creator=...)``.
|
||||
This is the signature needed for ``TorchTrainer(optimizer_creator=...)``.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
|
@ -52,7 +52,7 @@ This is the signature needed for ``PyTorchTrainer(optimizer_creator=...)``.
|
|||
Data Creator
|
||||
~~~~~~~~~~~~
|
||||
|
||||
This is the signature needed for ``PyTorchTrainer(data_creator=...)``.
|
||||
This is the signature needed for ``TorchTrainer(data_creator=...)``.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
|
@ -64,7 +64,7 @@ This is the signature needed for ``PyTorchTrainer(data_creator=...)``.
|
|||
Loss Creator
|
||||
~~~~~~~~~~~~
|
||||
|
||||
This is the signature needed for ``PyTorchTrainer(loss_creator=...)``.
|
||||
This is the signature needed for ``TorchTrainer(loss_creator=...)``.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
|
@ -76,7 +76,7 @@ Scheduler Creator
|
|||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
Optionally, you can provide a creator function for the learning rate scheduler. This is the signature needed
|
||||
for ``PyTorchTrainer(scheduler_creator=...)``.
|
||||
for ``TorchTrainer(scheduler_creator=...)``.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
|
@ -84,7 +84,7 @@ for ``PyTorchTrainer(scheduler_creator=...)``.
|
|||
:end-before: __torch_scheduler_end__
|
||||
|
||||
|
||||
.. _starting-pytorch-trainer:
|
||||
.. _starting-torch-trainer:
|
||||
|
||||
Putting things together
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -108,7 +108,7 @@ You can also set the number of workers and whether the workers will use GPUs:
|
|||
.. code-block:: python
|
||||
:emphasize-lines: 8,9
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -138,7 +138,7 @@ After training, you may want to reappropriate the Ray cluster. To release Ray re
|
|||
|
||||
.. note:: Be sure to call ``trainer.save()`` or ``trainer.get_model()`` before shutting down.
|
||||
|
||||
See the documentation on the PyTorchTrainer here: :ref:`ref-pytorch-trainer`.
|
||||
See the documentation on the TorchTrainer here: :ref:`ref-torch-trainer`.
|
||||
|
||||
|
||||
.. _raysgd-custom-training:
|
||||
|
@ -146,8 +146,8 @@ See the documentation on the PyTorchTrainer here: :ref:`ref-pytorch-trainer`.
|
|||
Custom Training and Validation (Operators)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
``PyTorchTrainer`` allows you to run a custom training and validation loops in parallel on each worker, providing a flexible interface similar to using PyTorch natively.
|
||||
This is done via the :ref:`ref-pytorch-operator` interface.
|
||||
``TorchTrainer`` allows you to run a custom training and validation loops in parallel on each worker, providing a flexible interface similar to using PyTorch natively.
|
||||
This is done via the :ref:`ref-torch-operator` interface.
|
||||
|
||||
For both training and validation, there are two granularities that you can provide customization - per epoch and per batch. These correspond to ``train_batch``,
|
||||
``train_epoch``, ``validate``, and ``validate_batch``. Other useful methods to override include ``setup``, ``save`` and ``restore``. You can use these
|
||||
|
@ -160,7 +160,7 @@ Below is a partial example of a custom ``TrainingOperator`` that provides a ``tr
|
|||
.. code-block:: python
|
||||
|
||||
import torch
|
||||
from ray.util.sgd.pytorch import TrainingOperator
|
||||
from ray.util.sgd.torch import TrainingOperator
|
||||
|
||||
class GANOperator(TrainingOperator):
|
||||
def setup(self, config):
|
||||
|
@ -237,7 +237,7 @@ Below is a partial example of a custom ``TrainingOperator`` that provides a ``tr
|
|||
"num_samples": imgs.shape[0]
|
||||
}
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -252,7 +252,7 @@ Below is a partial example of a custom ``TrainingOperator`` that provides a ``tr
|
|||
stats = trainer.train()
|
||||
print(stats)
|
||||
|
||||
See the `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/dcgan.py>`__ for an end to end example. It constructs two models and two optimizers and uses a custom training operator to provide a non-standard training loop.
|
||||
See the `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/dcgan.py>`__ for an end to end example. It constructs two models and two optimizers and uses a custom training operator to provide a non-standard training loop.
|
||||
|
||||
|
||||
Initialization Functions
|
||||
|
@ -269,7 +269,7 @@ Use the ``initialization_hook`` parameter to initialize state on each worker pro
|
|||
os.environ["NCCL_LL_THRESHOLD"] = "0"
|
||||
os.environ["NCCL_DEBUG"] = "INFO"
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -290,7 +290,7 @@ and ``trainer.load``, which wraps the relevant ``torch.save`` and ``torch.load``
|
|||
checkpoint_path = os.path.join(tempfile.mkdtemp(), "checkpoint")
|
||||
trainer_1.save(checkpoint_path)
|
||||
|
||||
trainer_2 = PyTorchTrainer(
|
||||
trainer_2 = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -317,7 +317,7 @@ You can enable mixed precision training for PyTorch with the ``use_fp16`` flag.
|
|||
.. code-block:: python
|
||||
:emphasize-lines: 7
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -329,12 +329,12 @@ You can enable mixed precision training for PyTorch with the ``use_fp16`` flag.
|
|||
``Apex`` is a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training. When ``use_fp16=True``,
|
||||
you should not manually cast your model or data to ``.half()``. The flag informs the Trainer to call ``amp.initialize`` on the created models and optimizers and optimize using the scaled loss: ``amp.scale_loss(loss, optimizer)``.
|
||||
|
||||
To specify particular parameters for ``amp.initialize``, you can use the ``apex_args`` field for the PyTorchTrainer constructor. Valid arguments can be found on the `Apex documentation <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_:
|
||||
To specify particular parameters for ``amp.initialize``, you can use the ``apex_args`` field for the TorchTrainer constructor. Valid arguments can be found on the `Apex documentation <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_:
|
||||
|
||||
.. code-block:: python
|
||||
:emphasize-lines: 7-12
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -368,7 +368,7 @@ After connecting, you can scale up the number of workers seamlessly across multi
|
|||
|
||||
.. code-block:: python
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -410,9 +410,9 @@ Users can set ``checkpoint="auto"`` to always checkpoint the current model befor
|
|||
Advanced: Hyperparameter Tuning
|
||||
-------------------------------
|
||||
|
||||
``PyTorchTrainer`` naturally integrates with Tune via the ``PyTorchTrainable`` interface. The same arguments to ``PyTorchTrainer`` should be passed into the ``tune.run(config=...)`` as shown below.
|
||||
``TorchTrainer`` naturally integrates with Tune via the ``TorchTrainable`` interface. The same arguments to ``TorchTrainer`` should be passed into the ``tune.run(config=...)`` as shown below.
|
||||
|
||||
.. literalinclude:: ../../../python/ray/util/sgd/pytorch/examples/tune_example.py
|
||||
.. literalinclude:: ../../../python/ray/util/sgd/torch/examples/tune_example.py
|
||||
:language: python
|
||||
:start-after: __torch_tune_example__
|
||||
|
||||
|
@ -420,13 +420,13 @@ Advanced: Hyperparameter Tuning
|
|||
Simultaneous Multi-model Training
|
||||
---------------------------------
|
||||
|
||||
In certain scenarios, such as training GANs, you may want to use multiple models in the training loop. You can do this in the ``PyTorchTrainer`` by allowing the ``model_creator``, ``optimizer_creator``, and ``scheduler_creator`` to return multiple values. Provide a custom TrainingOperator (:ref:`raysgd-custom-training`) to train across multiple models.
|
||||
In certain scenarios, such as training GANs, you may want to use multiple models in the training loop. You can do this in the ``TorchTrainer`` by allowing the ``model_creator``, ``optimizer_creator``, and ``scheduler_creator`` to return multiple values. Provide a custom TrainingOperator (:ref:`raysgd-custom-training`) to train across multiple models.
|
||||
|
||||
You can see the `DCGAN script <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/dcgan.py>`_ for an end-to-end example.
|
||||
You can see the `DCGAN script <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/dcgan.py>`_ for an end-to-end example.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray.util.sgd.pytorch import PyTorchTrainer, TrainingOperator
|
||||
from ray.util.sgd.torch import TorchTrainer, TrainingOperator
|
||||
|
||||
def train(*, model=None, criterion=None, optimizer=None, dataloader=None):
|
||||
model.train()
|
||||
|
@ -472,7 +472,7 @@ You can see the `DCGAN script <https://github.com/ray-project/ray/blob/master/py
|
|||
dataloader=dataloader)
|
||||
return result
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -487,19 +487,19 @@ Feature Requests
|
|||
|
||||
Have features that you'd really like to see in RaySGD? Feel free to `open an issue <https://github.com/ray-project/ray>`_.
|
||||
|
||||
.. _raysgd-pytorch-examples:
|
||||
.. _raysgd-torch-examples:
|
||||
|
||||
PyTorchTrainer Examples
|
||||
TorchTrainer Examples
|
||||
-----------------------
|
||||
|
||||
Here are some examples of using RaySGD for training PyTorch models. If you'd like
|
||||
to contribute an example, feel free to create a `pull request here <https://github.com/ray-project/ray/>`_.
|
||||
|
||||
- `PyTorch training example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/train_example.py>`__:
|
||||
Simple example of using Ray's PyTorchTrainer.
|
||||
- `Torch training example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/train_example.py>`__:
|
||||
Simple example of using Ray's TorchTrainer.
|
||||
|
||||
- `CIFAR10 example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py>`__:
|
||||
- `CIFAR10 example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py>`__:
|
||||
Training a ResNet18 model on CIFAR10.
|
||||
|
||||
- `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/dcgan.py>`__:
|
||||
- `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/dcgan.py>`__:
|
||||
Training a Deep Convolutional GAN on MNIST. It constructs two models and two optimizers and uses a custom training operator.
|
||||
|
|
|
@ -1,29 +1,29 @@
|
|||
Package Reference
|
||||
=================
|
||||
|
||||
.. _ref-pytorch-trainer:
|
||||
.. _ref-torch-trainer:
|
||||
|
||||
PyTorchTrainer
|
||||
--------------
|
||||
TorchTrainer
|
||||
------------
|
||||
|
||||
.. autoclass:: ray.util.sgd.pytorch.PyTorchTrainer
|
||||
.. autoclass:: ray.util.sgd.torch.TorchTrainer
|
||||
:members:
|
||||
|
||||
.. automethod:: __init__
|
||||
|
||||
.. _ref-pytorch-operator:
|
||||
.. _ref-torch-operator:
|
||||
|
||||
PyTorch TrainingOperator
|
||||
------------------------
|
||||
|
||||
.. autoclass:: ray.util.sgd.pytorch.TrainingOperator
|
||||
.. autoclass:: ray.util.sgd.torch.TrainingOperator
|
||||
:members:
|
||||
|
||||
|
||||
PyTorchTrainable
|
||||
----------------
|
||||
TorchTrainable
|
||||
--------------
|
||||
|
||||
.. autoclass:: ray.util.sgd.pytorch.PyTorchTrainable
|
||||
.. autoclass:: ray.util.sgd.torch.TorchTrainable
|
||||
:members:
|
||||
|
||||
TFTrainer
|
||||
|
|
|
@ -1,4 +1,9 @@
from ray.util.sgd.pytorch import PyTorchTrainer
from ray.util.sgd.torch import TorchTrainer
from ray.util.sgd.tf import TFTrainer

__all__ = ["PyTorchTrainer", "TFTrainer"]
__all__ = ["TorchTrainer", "TFTrainer"]


def PyTorchTrainer(**kwargs):
    raise DeprecationWarning("ray.util.sgd.pytorch.PyTorchTrainer has been "
                             "renamed to ray.util.sgd.torch.TorchTrainer")
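The hunk above keeps the old ``PyTorchTrainer`` name importable from ``ray.util.sgd`` only as a stub that raises, so downstream code fails loudly instead of silently running a removed code path. A hypothetical caller would now see something like the sketch below; this is illustrative and not part of the diff:

```python
from ray.util.sgd import PyTorchTrainer  # old name now resolves to the stub above

try:
    # The stub accepts only keyword arguments, so any keyword construction
    # attempt reaches the raise statement (positional calls fail even earlier
    # with a TypeError).
    PyTorchTrainer(model_creator=None)
except DeprecationWarning as exc:
    print(exc)  # ray.util.sgd.pytorch.PyTorchTrainer has been renamed to ...
```

Raising ``DeprecationWarning`` (rather than emitting a warning) makes the rename an immediate, visible break for existing callers.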
@ -1,18 +0,0 @@
import logging
logger = logging.getLogger(__name__)

PyTorchTrainer = None
PyTorchTrainable = None
TrainingOperator = None

try:
    import torch  # noqa: F401

    from ray.util.sgd.pytorch.pytorch_trainer import (PyTorchTrainer,
                                                      PyTorchTrainable)

    from ray.util.sgd.pytorch.training_operator import TrainingOperator

    __all__ = ["PyTorchTrainer", "PyTorchTrainable", "TrainingOperator"]
except ImportError:
    logger.warning("PyTorch not found. PyTorchTrainer will not be available")
@ -10,12 +10,12 @@ import torch.distributed as dist
|
|||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.util.sgd.pytorch import PyTorchTrainer, PyTorchTrainable
|
||||
from ray.util.sgd.pytorch.training_operator import _TestingOperator
|
||||
from ray.util.sgd.pytorch.constants import BATCH_COUNT, SCHEDULER_STEP
|
||||
from ray.util.sgd.torch import TorchTrainer, TorchTrainable
|
||||
from ray.util.sgd.torch.training_operator import _TestingOperator
|
||||
from ray.util.sgd.torch.constants import BATCH_COUNT, SCHEDULER_STEP
|
||||
from ray.util.sgd.utils import check_for_failure
|
||||
|
||||
from ray.util.sgd.pytorch.examples.train_example import (
|
||||
from ray.util.sgd.torch.examples.train_example import (
|
||||
model_creator, optimizer_creator, data_creator, LinearDataset)
|
||||
|
||||
|
||||
|
@ -28,7 +28,7 @@ def ray_start_2_cpus():
|
|||
|
||||
|
||||
def test_single_step(ray_start_2_cpus): # noqa: F811
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -44,7 +44,7 @@ def test_single_step(ray_start_2_cpus): # noqa: F811
|
|||
@pytest.mark.parametrize("num_replicas", [1, 2]
|
||||
if dist.is_available() else [1])
|
||||
def test_train(ray_start_2_cpus, num_replicas): # noqa: F811
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -107,7 +107,7 @@ def test_multi_model(ray_start_2_cpus, num_replicas):
|
|||
]
|
||||
return opts[0], opts[1]
|
||||
|
||||
trainer1 = PyTorchTrainer(
|
||||
trainer1 = TorchTrainer(
|
||||
multi_model_creator,
|
||||
data_creator,
|
||||
multi_optimizer_creator,
|
||||
|
@ -124,7 +124,7 @@ def test_multi_model(ray_start_2_cpus, num_replicas):
|
|||
|
||||
trainer1.shutdown()
|
||||
|
||||
trainer2 = PyTorchTrainer(
|
||||
trainer2 = TorchTrainer(
|
||||
multi_model_creator,
|
||||
data_creator,
|
||||
multi_optimizer_creator,
|
||||
|
@ -193,7 +193,7 @@ def test_multi_model_matrix(ray_start_2_cpus, num_replicas): # noqa: F811
|
|||
for model_count in range(1, 3):
|
||||
for optimizer_count in range(1, 3):
|
||||
for scheduler_count in range(1, 3):
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
multi_model_creator,
|
||||
data_creator,
|
||||
multi_optimizer_creator,
|
||||
|
@ -221,7 +221,7 @@ def test_scheduler_freq(ray_start_2_cpus, scheduler_freq): # noqa: F811
|
|||
return torch.optim.lr_scheduler.StepLR(
|
||||
optimizer, step_size=30, gamma=0.1)
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -239,7 +239,7 @@ def test_scheduler_freq(ray_start_2_cpus, scheduler_freq): # noqa: F811
|
|||
def test_scheduler_validate(ray_start_2_cpus): # noqa: F811
|
||||
from torch.optim.lr_scheduler import ReduceLROnPlateau
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -273,7 +273,7 @@ def test_tune_train(ray_start_2_cpus, num_replicas): # noqa: F811
|
|||
}
|
||||
|
||||
analysis = tune.run(
|
||||
PyTorchTrainable,
|
||||
TorchTrainable,
|
||||
num_samples=2,
|
||||
config=config,
|
||||
stop={"training_iteration": 2},
|
||||
|
@ -293,7 +293,7 @@ def test_tune_train(ray_start_2_cpus, num_replicas): # noqa: F811
|
|||
@pytest.mark.parametrize("num_replicas", [1, 2]
|
||||
if dist.is_available() else [1])
|
||||
def test_save_and_restore(ray_start_2_cpus, num_replicas): # noqa: F811
|
||||
trainer1 = PyTorchTrainer(
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -308,7 +308,7 @@ def test_save_and_restore(ray_start_2_cpus, num_replicas): # noqa: F811
|
|||
|
||||
trainer1.shutdown()
|
||||
|
||||
trainer2 = PyTorchTrainer(
|
||||
trainer2 = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -346,8 +346,8 @@ def test_fail_with_recover(ray_start_2_cpus): # noqa: F811
|
|||
success = check_for_failure(worker_stats)
|
||||
return success, worker_stats
|
||||
|
||||
with patch.object(PyTorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = PyTorchTrainer(
|
||||
with patch.object(TorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
single_loader,
|
||||
optimizer_creator,
|
||||
|
@ -376,8 +376,8 @@ def test_resize(ray_start_2_cpus): # noqa: F811
|
|||
success = check_for_failure(worker_stats)
|
||||
return success, worker_stats
|
||||
|
||||
with patch.object(PyTorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = PyTorchTrainer(
|
||||
with patch.object(TorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
single_loader,
|
||||
optimizer_creator,
|
||||
|
@ -412,8 +412,8 @@ def test_fail_twice(ray_start_2_cpus): # noqa: F811
|
|||
success = check_for_failure(worker_stats)
|
||||
return success, worker_stats
|
||||
|
||||
with patch.object(PyTorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = PyTorchTrainer(
|
||||
with patch.object(TorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
single_loader,
|
||||
optimizer_creator,
|
|
@ -4,8 +4,8 @@ import torch.nn as nn
|
|||
import unittest
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from ray.util.sgd.pytorch.training_operator import TrainingOperator
|
||||
from ray.util.sgd.pytorch.pytorch_runner import PyTorchRunner
|
||||
from ray.util.sgd.torch.training_operator import TrainingOperator
|
||||
from ray.util.sgd.torch.torch_runner import TorchRunner
|
||||
|
||||
|
||||
class LinearDataset(torch.utils.data.Dataset):
|
||||
|
@ -45,14 +45,14 @@ def create_dataloaders(config):
|
|||
return LinearDataset(2, 5), LinearDataset(2, 5, size=400)
|
||||
|
||||
|
||||
class TestPyTorchRunner(unittest.TestCase):
|
||||
class TestTorchRunner(unittest.TestCase):
|
||||
def testValidate(self):
|
||||
class MockOperator(TrainingOperator):
|
||||
def setup(self, config):
|
||||
self.train_epoch = MagicMock(returns=dict(mean_accuracy=10))
|
||||
self.validate = MagicMock(returns=dict(mean_accuracy=10))
|
||||
|
||||
runner = PyTorchRunner(
|
||||
runner = TorchRunner(
|
||||
model_creator,
|
||||
create_dataloaders,
|
||||
optimizer_creator,
|
||||
|
@ -76,7 +76,7 @@ class TestPyTorchRunner(unittest.TestCase):
|
|||
self.count += 1
|
||||
return {"count": self.count}
|
||||
|
||||
runner = PyTorchRunner(
|
||||
runner = TorchRunner(
|
||||
model_creator,
|
||||
create_dataloaders,
|
||||
optimizer_creator,
|
||||
|
@ -105,7 +105,7 @@ class TestPyTorchRunner(unittest.TestCase):
|
|||
]
|
||||
return opts[0], opts[1], opts[2]
|
||||
|
||||
runner = PyTorchRunner(
|
||||
runner = TorchRunner(
|
||||
three_model_creator,
|
||||
single_loader,
|
||||
three_optimizer_creator,
|
||||
|
@ -116,8 +116,8 @@ class TestPyTorchRunner(unittest.TestCase):
|
|||
self.assertEqual(len(runner.given_models), 3)
|
||||
self.assertEqual(len(runner.given_optimizers), 3)
|
||||
|
||||
runner2 = PyTorchRunner(model_creator, single_loader,
|
||||
optimizer_creator, loss_creator)
|
||||
runner2 = TorchRunner(model_creator, single_loader, optimizer_creator,
|
||||
loss_creator)
|
||||
runner2.setup()
|
||||
|
||||
self.assertNotEqual(runner2.given_models, runner2.models)
|
||||
|
@ -128,26 +128,26 @@ class TestPyTorchRunner(unittest.TestCase):
|
|||
return (LinearDataset(2, 5), LinearDataset(2, 5, size=400),
|
||||
LinearDataset(2, 5, size=400))
|
||||
|
||||
runner = PyTorchRunner(model_creator, three_data_loader,
|
||||
optimizer_creator, loss_creator)
|
||||
runner = TorchRunner(model_creator, three_data_loader,
|
||||
optimizer_creator, loss_creator)
|
||||
with self.assertRaises(ValueError):
|
||||
runner.setup()
|
||||
|
||||
runner2 = PyTorchRunner(model_creator, three_data_loader,
|
||||
optimizer_creator, loss_creator)
|
||||
runner2 = TorchRunner(model_creator, three_data_loader,
|
||||
optimizer_creator, loss_creator)
|
||||
with self.assertRaises(ValueError):
|
||||
runner2.setup()
|
||||
|
||||
def testSingleLoader(self):
|
||||
runner = PyTorchRunner(model_creator, single_loader, optimizer_creator,
|
||||
loss_creator)
|
||||
runner = TorchRunner(model_creator, single_loader, optimizer_creator,
|
||||
loss_creator)
|
||||
runner.setup()
|
||||
runner.train_epoch()
|
||||
with self.assertRaises(ValueError):
|
||||
runner.validate()
|
||||
|
||||
def testNativeLoss(self):
|
||||
runner = PyTorchRunner(
|
||||
runner = TorchRunner(
|
||||
model_creator,
|
||||
single_loader,
|
||||
optimizer_creator,
|
||||
|
@ -165,8 +165,8 @@ class TestPyTorchRunner(unittest.TestCase):
|
|||
]
|
||||
return opts[0], opts[1], opts[2]
|
||||
|
||||
runner = PyTorchRunner(multi_model_creator, single_loader,
|
||||
multi_optimizer_creator, loss_creator)
|
||||
runner = TorchRunner(multi_model_creator, single_loader,
|
||||
multi_optimizer_creator, loss_creator)
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
runner.setup()
|
python/ray/util/sgd/torch/__init__.py (new file, 17 lines)

@ -0,0 +1,17 @@
import logging
logger = logging.getLogger(__name__)

TorchTrainer = None
TorchTrainable = None
TrainingOperator = None

try:
    import torch  # noqa: F401

    from ray.util.sgd.torch.torch_trainer import (TorchTrainer, TorchTrainable)

    from ray.util.sgd.torch.training_operator import TrainingOperator

    __all__ = ["TorchTrainer", "TorchTrainable", "TrainingOperator"]
except ImportError:
    logger.warning("PyTorch not found. TorchTrainer will not be available")
@ -7,24 +7,24 @@ import torch.distributed as dist
|
|||
import torch.utils.data
|
||||
from torch.nn.parallel import DistributedDataParallel
|
||||
|
||||
from ray.util.sgd.pytorch.pytorch_runner import PyTorchRunner
|
||||
from ray.util.sgd.torch.torch_runner import TorchRunner
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DistributedPyTorchRunner(PyTorchRunner):
|
||||
class DistributedTorchRunner(TorchRunner):
|
||||
"""Manages a distributed PyTorch model replica.
|
||||
|
||||
|
||||
Args:
|
||||
args: Arguments for PyTorchRunner.
|
||||
args: Arguments for TorchRunner.
|
||||
backend (string): backend used by distributed PyTorch.
|
||||
kwargs: Keyword arguments for PyTorchRunner.
|
||||
kwargs: Keyword arguments for TorchRunner.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *args, backend="gloo", **kwargs):
|
||||
super(DistributedPyTorchRunner, self).__init__(*args, **kwargs)
|
||||
super(DistributedTorchRunner, self).__init__(*args, **kwargs)
|
||||
self.backend = backend
|
||||
|
||||
def setup(self, url, world_rank, world_size):
|
||||
|
@ -110,7 +110,7 @@ class DistributedPyTorchRunner(PyTorchRunner):
|
|||
"""
|
||||
if hasattr(self.train_loader.sampler, "set_epoch"):
|
||||
self.train_loader.sampler.set_epoch(self.epochs)
|
||||
return super(DistributedPyTorchRunner, self).train_epoch(**kwargs)
|
||||
return super(DistributedTorchRunner, self).train_epoch(**kwargs)
|
||||
|
||||
def _get_model_state_dicts(self):
|
||||
"""Fetch state from ``model.module`` instead of ``model``.
|
||||
|
@ -132,7 +132,7 @@ class DistributedPyTorchRunner(PyTorchRunner):
|
|||
|
||||
# def shutdown(self):
|
||||
"""Attempts to shut down the worker."""
|
||||
# super(DistributedPyTorchRunner, self).shutdown()
|
||||
# super(DistributedTorchRunner, self).shutdown()
|
||||
# TODO: Temporarily removing since it causes hangs on MacOSX.
|
||||
# However, it seems to be harmless to remove permanently
|
||||
# since the processes are shutdown anyways. This comment can be
|
|
@ -8,8 +8,8 @@ import torchvision
|
|||
import torchvision.transforms as transforms
|
||||
|
||||
import ray
|
||||
from ray.util.sgd.pytorch import (PyTorchTrainer, PyTorchTrainable)
|
||||
from ray.util.sgd.pytorch.resnet import ResNet18
|
||||
from ray.util.sgd.torch import (TorchTrainer, TorchTrainable)
|
||||
from ray.util.sgd.torch.resnet import ResNet18
|
||||
|
||||
|
||||
def initialization_hook():
|
||||
|
@ -62,7 +62,7 @@ def train_example(num_replicas=1,
|
|||
use_gpu=False,
|
||||
use_fp16=False,
|
||||
test_mode=False):
|
||||
trainer1 = PyTorchTrainer(
|
||||
trainer1 = TorchTrainer(
|
||||
ResNet18,
|
||||
cifar_creator,
|
||||
optimizer_creator,
|
||||
|
@ -107,7 +107,7 @@ def tune_example(num_replicas=1, use_gpu=False, test_mode=False):
|
|||
}
|
||||
|
||||
analysis = tune.run(
|
||||
PyTorchTrainable,
|
||||
TorchTrainable,
|
||||
num_samples=2,
|
||||
config=config,
|
||||
stop={"training_iteration": 2},
|
|
@ -15,9 +15,9 @@ from torch.nn import functional as F
|
|||
from scipy.stats import entropy
|
||||
|
||||
import ray
|
||||
from ray.util.sgd import PyTorchTrainer
|
||||
from ray.util.sgd import TorchTrainer
|
||||
from ray.util.sgd.utils import override
|
||||
from ray.util.sgd.pytorch import TrainingOperator
|
||||
from ray.util.sgd.torch import TrainingOperator
|
||||
|
||||
|
||||
def data_creator(config):
|
||||
|
@ -223,9 +223,9 @@ def train_example(num_replicas=1, use_gpu=False, test_mode=False):
|
|||
"test_mode": test_mode,
|
||||
"classification_model_path": os.path.join(
|
||||
os.path.dirname(ray.__file__),
|
||||
"util/sgd/pytorch/examples/mnist_cnn.pt")
|
||||
"util/sgd/torch/examples/mnist_cnn.pt")
|
||||
}
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
|
@ -13,7 +13,7 @@ import numpy as np
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from ray.util.sgd import PyTorchTrainer
|
||||
from ray.util.sgd import TorchTrainer
|
||||
|
||||
|
||||
class LinearDataset(torch.utils.data.Dataset):
|
||||
|
@ -44,7 +44,7 @@ def optimizer_creator(model, config):
|
|||
def scheduler_creator(optimizer, config):
|
||||
"""Returns a learning rate scheduler wrapping the optimizer.
|
||||
|
||||
You will need to set ``PyTorchTrainer(scheduler_step_freq="epoch")``
|
||||
You will need to set ``TorchTrainer(scheduler_step_freq="epoch")``
|
||||
for the scheduler to be incremented correctly.
|
||||
|
||||
If using a scheduler for validation loss, be sure to call
|
||||
|
@ -59,7 +59,7 @@ def data_creator(config):
|
|||
|
||||
|
||||
def train_example(num_replicas=1, use_gpu=False):
|
||||
trainer1 = PyTorchTrainer(
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
|
@ -14,7 +14,7 @@ import torch.nn as nn
|
|||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.util.sgd.pytorch.pytorch_trainer import PyTorchTrainable
|
||||
from ray.util.sgd.torch.torch_trainer import TorchTrainable
|
||||
|
||||
|
||||
class LinearDataset(torch.utils.data.Dataset):
|
||||
|
@ -60,7 +60,7 @@ def tune_example(num_replicas=1, use_gpu=False):
|
|||
}
|
||||
|
||||
analysis = tune.run(
|
||||
PyTorchTrainable,
|
||||
TorchTrainable,
|
||||
num_samples=12,
|
||||
config=config,
|
||||
stop={"training_iteration": 2},
|
|
@ -9,8 +9,8 @@ import torch.utils.data
|
|||
from torch.utils.data import Dataset
|
||||
|
||||
import ray
|
||||
from ray.util.sgd.pytorch.constants import USE_FP16, SCHEDULER_STEP
|
||||
from ray.util.sgd.pytorch.training_operator import TrainingOperator
|
||||
from ray.util.sgd.torch.constants import USE_FP16, SCHEDULER_STEP
|
||||
from ray.util.sgd.torch.training_operator import TrainingOperator
|
||||
from ray.util.sgd import utils
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -23,23 +23,23 @@ except ImportError:
|
|||
pass
|
||||
|
||||
|
||||
class PyTorchRunner:
|
||||
class TorchRunner:
|
||||
"""Manages a PyTorch model for training.
|
||||
|
||||
Args:
|
||||
model_creator (dict -> *): see pytorch_trainer.py
|
||||
data_creator (dict -> Dataset, Dataset): see pytorch_trainer.py.
|
||||
optimizer_creator (models, dict -> optimizers): see pytorch_trainer.py.
|
||||
loss_creator (dict -> loss | Loss class): see pytorch_trainer.py.
|
||||
model_creator (dict -> *): see torch_trainer.py
|
||||
data_creator (dict -> Dataset, Dataset): see torch_trainer.py.
|
||||
optimizer_creator (models, dict -> optimizers): see torch_trainer.py.
|
||||
loss_creator (dict -> loss | Loss class): see torch_trainer.py.
|
||||
scheduler_creator (optimizers, dict -> schedulers): see
|
||||
pytorch_trainer.py.
|
||||
training_operator_cls: see pytorch_trainer.py
|
||||
config (dict): see pytorch_trainer.py.
|
||||
dataloader_config (dict): See pytorch_trainer.py.
|
||||
batch_size (int): see pytorch_trainer.py.
|
||||
use_fp16 (bool): see pytorch_trainer.py.
|
||||
apex_args (dict|None): see pytorch_trainer.py.
|
||||
scheduler_step_freq (str): see pytorch_trainer.py.
|
||||
torch_trainer.py.
|
||||
training_operator_cls: see torch_trainer.py
|
||||
config (dict): see torch_trainer.py.
|
||||
dataloader_config (dict): See torch_trainer.py.
|
||||
batch_size (int): see torch_trainer.py.
|
||||
use_fp16 (bool): see torch_trainer.py.
|
||||
apex_args (dict|None): see torch_trainer.py.
|
||||
scheduler_step_freq (str): see torch_trainer.py.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
|
@ -11,11 +11,11 @@ import ray
|
|||
|
||||
from ray.tune import Trainable
|
||||
from ray.tune.trial import Resources
|
||||
from ray.util.sgd.pytorch.distributed_pytorch_runner import (
|
||||
DistributedPyTorchRunner)
|
||||
from ray.util.sgd.torch.distributed_torch_runner import (
|
||||
DistributedTorchRunner)
|
||||
from ray.util.sgd import utils
|
||||
from ray.util.sgd.pytorch.pytorch_runner import PyTorchRunner
|
||||
from ray.util.sgd.pytorch.constants import VALID_SCHEDULER_STEP
|
||||
from ray.util.sgd.torch.torch_runner import TorchRunner
|
||||
from ray.util.sgd.torch.constants import VALID_SCHEDULER_STEP
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
RESIZE_COOLDOWN_S = 10
|
||||
|
@ -29,7 +29,7 @@ def _validate_scheduler_step_freq(scheduler_step_freq):
|
|||
VALID_SCHEDULER_STEP, scheduler_step_freq))
|
||||
|
||||
|
||||
class PyTorchTrainer:
|
||||
class TorchTrainer:
|
||||
"""Train a PyTorch model using distributed PyTorch.
|
||||
|
||||
Launches a set of actors which connect via distributed PyTorch and
|
||||
|
@ -49,7 +49,7 @@ class PyTorchTrainer:
|
|||
def data_creator(config):
|
||||
return LinearDataset(2, 5), LinearDataset(2, 5, size=400)
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -195,7 +195,7 @@ class PyTorchTrainer:
|
|||
if num_replicas == 1:
|
||||
# Generate actor class
|
||||
Runner = ray.remote(
|
||||
num_cpus=1, num_gpus=int(self.use_gpu))(PyTorchRunner)
|
||||
num_cpus=1, num_gpus=int(self.use_gpu))(TorchRunner)
|
||||
# Start workers
|
||||
self.workers = [
|
||||
Runner.remote(
|
||||
|
@ -220,8 +220,7 @@ class PyTorchTrainer:
|
|||
else:
|
||||
# Generate actor class
|
||||
Runner = ray.remote(
|
||||
num_cpus=1,
|
||||
num_gpus=int(self.use_gpu))(DistributedPyTorchRunner)
|
||||
num_cpus=1, num_gpus=int(self.use_gpu))(DistributedTorchRunner)
|
||||
# Compute batch size per replica
|
||||
batch_size_per_replica = self.batch_size // num_replicas
|
||||
if self.batch_size % num_replicas > 0:
|
||||
|
@ -285,7 +284,7 @@ class PyTorchTrainer:
|
|||
in case of shared cluster usage.
|
||||
checkpoint (str): Path to checkpoint to restore from if retrying.
|
||||
If max_retries is set and ``checkpoint == "auto"``,
|
||||
PyTorchTrainer will save a checkpoint before starting to train.
|
||||
TorchTrainer will save a checkpoint before starting to train.
|
||||
info (dict): Optional dictionary passed to the training
|
||||
operator for ``train_epoch`` and ``train_batch``.
|
||||
|
||||
|
@ -487,7 +486,7 @@ class PyTorchTrainer:
|
|||
return False
|
||||
|
||||
|
||||
class PyTorchTrainable(Trainable):
|
||||
class TorchTrainable(Trainable):
|
||||
@classmethod
|
||||
def default_resource_request(cls, config):
|
||||
return Resources(
|
||||
|
@ -497,7 +496,7 @@ class PyTorchTrainable(Trainable):
|
|||
extra_gpu=int(config["use_gpu"]) * config["num_replicas"])
|
||||
|
||||
def _setup(self, config):
|
||||
self._trainer = PyTorchTrainer(**config)
|
||||
self._trainer = TorchTrainer(**config)
|
||||
|
||||
def _train(self):
|
||||
train_stats = self._trainer.train()
|
|
@ -2,7 +2,7 @@ import collections
|
|||
import torch
|
||||
|
||||
from ray.util.sgd.utils import TimerStat, AverageMeter
|
||||
from ray.util.sgd.pytorch.constants import (
|
||||
from ray.util.sgd.torch.constants import (
|
||||
SCHEDULER_STEP_EPOCH, SCHEDULER_STEP_BATCH, SCHEDULER_STEP, BATCH_COUNT)
|
||||
|
||||
amp = None
|
||||
|
@ -11,7 +11,7 @@ try:
|
|||
from apex import amp
|
||||
except ImportError:
|
||||
# Apex library is not installed, so we cannot enable mixed precision.
|
||||
# We don't log here because logging happens in the pytorch_runner,
|
||||
# We don't log here because logging happens in the torch_runner,
|
||||
# where amp is initialized.
|
||||
pass
|
||||
|
||||
|
@ -26,7 +26,7 @@ class TrainingOperator:
|
|||
|
||||
The scheduler will only be called at a batch or epoch frequency, depending
|
||||
on the user parameter. Be sure to set ``scheduler_step_freq`` in
|
||||
``PyTorchTrainer`` to either "batch" or "epoch" to increment the scheduler
|
||||
``TorchTrainer`` to either "batch" or "epoch" to increment the scheduler
|
||||
correctly during training. If using a learning rate scheduler
|
||||
that depends on validation loss, you can use ``trainer.update_scheduler``.
|
||||
|
||||
|
@ -290,7 +290,7 @@ class TrainingOperator:
|
|||
|
||||
@property
|
||||
def config(self):
|
||||
"""Dictionary as provided into PyTorchTrainer."""
|
||||
"""Dictionary as provided into TorchTrainer."""
|
||||
return self._config
|
||||
|
||||
@property
|