[RaySGD] Rename PyTorch API endpoints to start with Torch (#7425)

* Start renaming pytorch to torch

* Rename PyTorchTrainer to TorchTrainer

* Rename PyTorch runners to Torch runners

* Finish renaming API

* Rename to torch in tests

* Finish renaming docs + tests

* Run format + fix DeprecationWarning

* fix

* move tests up

* rename

Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
Maksim Smolin 2020-03-03 16:44:42 -08:00, committed by GitHub
parent f6883bf725
commit 3a134c7224
22 changed files with 222 additions and 218 deletions
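
In practical terms, the rename drops the ``PyTorch`` prefix from the public classes and moves the PyTorch-specific module from ``ray.util.sgd.pytorch`` to ``ray.util.sgd.torch``. A rough before/after sketch of the import changes for user code (paths taken from the diffs below; the surrounding script is hypothetical):

    # Before this commit (old names, now deprecated):
    # from ray.util.sgd import PyTorchTrainer
    # from ray.util.sgd.pytorch import PyTorchTrainable, TrainingOperator

    # After this commit (renamed API):
    from ray.util.sgd import TorchTrainer
    from ray.util.sgd.torch import TorchTrainable, TrainingOperator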

@ -31,6 +31,58 @@ fi
echo "Using Docker image" $DOCKER_SHA
######################## SGD TESTS #################################
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python -m pytest /ray/python/ray/util/sgd/tests
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/doc/examples/doc_code/raysgd_torch_signatures.py
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/torch/examples/train_example.py
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/torch/examples/train_example.py --num-replicas=2
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/torch/examples/tune_example.py
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/torch/examples/tune_example.py --num-replicas=2
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py --smoke-test
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py --smoke-test --num-replicas=2
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py --smoke-test --tune
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/torch/examples/dcgan.py --smoke-test --num-replicas=2
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --num-replicas=2
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --tune
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --smoke-test
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test --augment-data
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
pytest /ray/python/ray/tune/tests/test_cluster.py
@ -139,54 +191,3 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE}
# $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
# python /ray/python/ray/tune/examples/bohb_example.py \
# --smoke-test
######################## SGD TESTS #################################
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python -m pytest /ray/python/ray/util/sgd/tests
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/doc/examples/doc_code/raysgd_torch_signatures.py
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/pytorch/examples/train_example.py
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/pytorch/examples/train_example.py --num-replicas=2
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/pytorch/examples/tune_example.py
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/pytorch/examples/tune_example.py --num-replicas=2
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py --smoke-test
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py --smoke-test --num-replicas=2
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py --smoke-test --tune
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/pytorch/examples/dcgan.py --smoke-test --num-replicas=2
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --num-replicas=2
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --tune
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --smoke-test
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test --augment-data

@ -1,6 +1,6 @@
# flake8: noqa
"""
This file holds code for the Pytorch Trainer creator signatures.
This file holds code for the TorchTrainer creator signatures.
It ignores yapf because yapf doesn't allow comments right after code blocks,
but we put comments right after code blocks to prevent large white spaces
@ -18,7 +18,7 @@ def model_creator(config):
function to specify the optimization procedure for multiple models.
Args:
config (dict): Configuration dictionary passed into ``PyTorchTrainer``.
config (dict): Configuration dictionary passed into ``TorchTrainer``.
Returns:
One or more torch.nn.Module objects.
@ -36,7 +36,7 @@ def optimizer_creator(model, config):
Args:
models: The return values from ``model_creator``. This can be one
or more torch nn modules.
config (dict): Configuration dictionary passed into ``PyTorchTrainer``.
config (dict): Configuration dictionary passed into ``TorchTrainer``.
Returns:
One or more Torch optimizer objects.
@ -46,7 +46,7 @@ def optimizer_creator(model, config):
# __torch_data_start__
from ray.util.sgd.pytorch.examples.train_example import LinearDataset
from ray.util.sgd.torch.examples.train_example import LinearDataset
def data_creator(config):
"""Constructs torch.utils.data.Dataset objects.
@ -55,7 +55,7 @@ def data_creator(config):
only one dataset will be used for training.
Args:
config: Configuration dictionary passed into ``PyTorchTrainer``
config: Configuration dictionary passed into ``TorchTrainer``
Returns:
One or Two Dataset objects. If only one Dataset object is provided,
@ -71,10 +71,10 @@ def loss_creator(config):
"""Constructs the Torch Loss object.
Note that optionally, you can pass in a Torch Loss constructor directly
into the PyTorchTrainer (i.e., ``PyTorchTrainer(loss_creator=nn.BCELoss, ...)``).
into the TorchTrainer (i.e., ``TorchTrainer(loss_creator=nn.BCELoss, ...)``).
Args:
config: Configuration dictionary passed into ``PyTorchTrainer``
config: Configuration dictionary passed into ``TorchTrainer``
Returns:
Torch Loss object.
@ -91,7 +91,7 @@ def scheduler_creator(optimizer, config):
Args:
optimizers: The return values from ``optimizer_creator``.
This can be one or more torch optimizer objects.
config: Configuration dictionary passed into ``PyTorchTrainer``
config: Configuration dictionary passed into ``TorchTrainer``
Returns:
One or more Torch scheduler objects.
@ -108,9 +108,9 @@ ray.init()
# __torch_ray_end__
# __torch_trainer_start__
from ray.util.sgd import PyTorchTrainer
from ray.util.sgd import TorchTrainer
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,

@ -7,7 +7,7 @@ RaySGD is a lightweight library for distributed deep learning, providing thin wr
The main features are:
- **Ease of use**: Scale Pytorch's native ``DistributedDataParallel`` and TensorFlow's ``tf.distribute.MirroredStrategy`` without needing to monitor individual nodes.
- **Ease of use**: Scale PyTorch's native ``DistributedDataParallel`` and TensorFlow's ``tf.distribute.MirroredStrategy`` without needing to monitor individual nodes.
- **Composability**: RaySGD is built on top of the Ray Actor API, enabling seamless integration with existing Ray applications such as RLlib, Tune, and Ray.Serve.
- **Scale up and down**: Start on a single CPU. Scale up to multi-node, multi-CPU, or multi-GPU clusters by changing 2 lines of code.
@ -20,7 +20,7 @@ The main features are:
Getting Started
---------------
You can start a ``PyTorchTrainer`` with the following:
You can start a ``TorchTrainer`` with the following:
.. code-block:: python
@ -29,7 +29,7 @@ You can start a ``PyTorchTrainer`` with the following:
import torch.nn as nn
from torch import distributed
from ray.util.sgd import PyTorchTrainer
from ray.util.sgd import TorchTrainer
from ray.util.sgd.examples.train_example import LinearDataset
@ -48,7 +48,7 @@ You can start a ``PyTorchTrainer`` with the following:
ray.init()
trainer1 = PyTorchTrainer(
trainer1 = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
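
The hunk above cuts the snippet off mid-constructor; a minimal sketch of how the renamed quick-start flow finishes (assuming the ``model_creator``, ``data_creator``, and ``optimizer_creator`` functions defined in the elided part of the snippet; the loss class and replica count are illustrative):

    import torch.nn as nn

    # Construct the renamed trainer, run one epoch, and release the Ray actors.
    trainer1 = TorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=nn.MSELoss,
        num_replicas=2)

    stats = trainer1.train()   # one pass over the training set
    print(stats)
    trainer1.shutdown()        # release Ray resources when done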

@ -1,14 +1,14 @@
Distributed PyTorch
===================
The RaySGD ``PyTorchTrainer`` simplifies distributed model training for PyTorch. The ``PyTorchTrainer`` is a wrapper around ``torch.distributed.launch`` with a Python API to easily incorporate distributed training into a larger Python application, as opposed to needing to wrap your training code in bash scripts.
The RaySGD ``TorchTrainer`` simplifies distributed model training for PyTorch. The ``TorchTrainer`` is a wrapper around ``torch.distributed.launch`` with a Python API to easily incorporate distributed training into a larger Python application, as opposed to needing to wrap your training code in bash scripts.
Under the hood, ``PytorchTrainer`` will create *replicas* of your model (controlled by ``num_replicas``), each of which is managed by a Ray actor.
Under the hood, ``TorchTrainer`` will create *replicas* of your model (controlled by ``num_replicas``), each of which is managed by a Ray actor.
.. image:: raysgd-actors.svg
:align: center
For end to end examples leveraging RaySGD PyTorchTrainer, jump to :ref:`raysgd-pytorch-examples`.
For end to end examples leveraging RaySGD TorchTrainer, jump to :ref:`raysgd-torch-examples`.
.. contents:: :local:
@ -17,19 +17,19 @@ Setting up training
.. tip:: Get in touch with us if you're using or considering using `RaySGD <https://forms.gle/26EMwdahdgm7Lscy9>`_!
The ``PyTorchTrainer`` can be constructed with functions that wrap components of the training script. Specifically, it requires constructors for the Model, Data, Optimizer, Loss, and ``lr_scheduler`` to create replicated copies across different devices and machines.
The ``TorchTrainer`` can be constructed with functions that wrap components of the training script. Specifically, it requires constructors for the Model, Data, Optimizer, Loss, and ``lr_scheduler`` to create replicated copies across different devices and machines.
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
:language: python
:start-after: __torch_trainer_start__
:end-before: __torch_trainer_end__
The below section covers the expected signatures of creator functions. Jump to :ref:`starting-pytorch-trainer`.
The below section covers the expected signatures of creator functions. Jump to :ref:`starting-torch-trainer`.
Model Creator
~~~~~~~~~~~~~
This is the signature needed for ``PyTorchTrainer(model_creator=...)``.
This is the signature needed for ``TorchTrainer(model_creator=...)``.
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
:language: python
@ -40,7 +40,7 @@ This is the signature needed for ``PyTorchTrainer(model_creator=...)``.
Optimizer Creator
~~~~~~~~~~~~~~~~~
This is the signature needed for ``PyTorchTrainer(optimizer_creator=...)``.
This is the signature needed for ``TorchTrainer(optimizer_creator=...)``.
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
:language: python
@ -52,7 +52,7 @@ This is the signature needed for ``PyTorchTrainer(optimizer_creator=...)``.
Data Creator
~~~~~~~~~~~~
This is the signature needed for ``PyTorchTrainer(data_creator=...)``.
This is the signature needed for ``TorchTrainer(data_creator=...)``.
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
:language: python
@ -64,7 +64,7 @@ This is the signature needed for ``PyTorchTrainer(data_creator=...)``.
Loss Creator
~~~~~~~~~~~~
This is the signature needed for ``PyTorchTrainer(loss_creator=...)``.
This is the signature needed for ``TorchTrainer(loss_creator=...)``.
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
:language: python
@ -76,7 +76,7 @@ Scheduler Creator
~~~~~~~~~~~~~~~~~
Optionally, you can provide a creator function for the learning rate scheduler. This is the signature needed
for ``PyTorchTrainer(scheduler_creator=...)``.
for ``TorchTrainer(scheduler_creator=...)``.
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
:language: python
@ -84,7 +84,7 @@ for ``PyTorchTrainer(scheduler_creator=...)``.
:end-before: __torch_scheduler_end__
.. _starting-pytorch-trainer:
.. _starting-torch-trainer:
Putting things together
~~~~~~~~~~~~~~~~~~~~~~~
@ -108,7 +108,7 @@ You can also set the number of workers and whether the workers will use GPUs:
.. code-block:: python
:emphasize-lines: 8,9
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -138,7 +138,7 @@ After training, you may want to reappropriate the Ray cluster. To release Ray re
.. note:: Be sure to call ``trainer.save()`` or ``trainer.get_model()`` before shutting down.
See the documentation on the PyTorchTrainer here: :ref:`ref-pytorch-trainer`.
See the documentation on the TorchTrainer here: :ref:`ref-torch-trainer`.
.. _raysgd-custom-training:
@ -146,8 +146,8 @@ See the documentation on the PyTorchTrainer here: :ref:`ref-pytorch-trainer`.
Custom Training and Validation (Operators)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``PyTorchTrainer`` allows you to run a custom training and validation loops in parallel on each worker, providing a flexible interface similar to using PyTorch natively.
This is done via the :ref:`ref-pytorch-operator` interface.
``TorchTrainer`` allows you to run custom training and validation loops in parallel on each worker, providing a flexible interface similar to using PyTorch natively.
This is done via the :ref:`ref-torch-operator` interface.
For both training and validation, there are two granularities at which you can provide customization: per epoch and per batch. These correspond to ``train_batch``,
``train_epoch``, ``validate``, and ``validate_batch``. Other useful methods to override include ``setup``, ``save``, and ``restore``. You can use these
@ -160,7 +160,7 @@ Below is a partial example of a custom ``TrainingOperator`` that provides a ``tr
.. code-block:: python
import torch
from ray.util.sgd.pytorch import TrainingOperator
from ray.util.sgd.torch import TrainingOperator
class GANOperator(TrainingOperator):
def setup(self, config):
@ -237,7 +237,7 @@ Below is a partial example of a custom ``TrainingOperator`` that provides a ``tr
"num_samples": imgs.shape[0]
}
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -252,7 +252,7 @@ Below is a partial example of a custom ``TrainingOperator`` that provides a ``tr
stats = trainer.train()
print(stats)
See the `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/dcgan.py>`__ for an end to end example. It constructs two models and two optimizers and uses a custom training operator to provide a non-standard training loop.
See the `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/dcgan.py>`__ for an end to end example. It constructs two models and two optimizers and uses a custom training operator to provide a non-standard training loop.
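
The partial example above omits most of the operator body; a compact, hypothetical sketch of the same pattern, using the creator functions from the bundled train_example module, might look like the following (the ``train_batch`` signature and the ``self.model``/``self.optimizer``/``self.criterion`` attribute names are assumptions for illustration, not taken from this diff):

    import ray
    import torch.nn as nn
    from ray.util.sgd import TorchTrainer
    from ray.util.sgd.torch import TrainingOperator
    from ray.util.sgd.torch.examples.train_example import (
        model_creator, optimizer_creator, data_creator)

    class MyOperator(TrainingOperator):
        def train_batch(self, batch, batch_info):
            # Custom per-batch logic; return a dict of metrics for this batch.
            features, target = batch
            output = self.model(features)
            loss = self.criterion(output, target)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            return {"loss": loss.item(), "num_samples": features.size(0)}

    ray.init()
    trainer = TorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=nn.MSELoss,
        training_operator_cls=MyOperator)
    print(trainer.train())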
Initialization Functions
@ -269,7 +269,7 @@ Use the ``initialization_hook`` parameter to initialize state on each worker pro
os.environ["NCCL_LL_THRESHOLD"] = "0"
os.environ["NCCL_DEBUG"] = "INFO"
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -290,7 +290,7 @@ and ``trainer.load``, which wraps the relevant ``torch.save`` and ``torch.load``
checkpoint_path = os.path.join(tempfile.mkdtemp(), "checkpoint")
trainer_1.save(checkpoint_path)
trainer_2 = PyTorchTrainer(
trainer_2 = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
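
The snippet above stops before the restore step; a hedged completion (``checkpoint_path`` and the creator functions are assumed from the surrounding example):

    # Load the state saved by trainer_1 into the freshly constructed trainer_2,
    # then continue training and shut down when finished.
    trainer_2.load(checkpoint_path)
    stats = trainer_2.train()
    trainer_2.shutdown()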
@ -317,7 +317,7 @@ You can enable mixed precision training for PyTorch with the ``use_fp16`` flag.
.. code-block:: python
:emphasize-lines: 7
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -329,12 +329,12 @@ You can enable mixed precision training for PyTorch with the ``use_fp16`` flag.
``Apex`` is a PyTorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training. When ``use_fp16=True``,
you should not manually cast your model or data to ``.half()``. The flag informs the Trainer to call ``amp.initialize`` on the created models and optimizers and optimize using the scaled loss: ``amp.scale_loss(loss, optimizer)``.
To specify particular parameters for ``amp.initialize``, you can use the ``apex_args`` field for the PyTorchTrainer constructor. Valid arguments can be found on the `Apex documentation <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_:
To specify particular parameters for ``amp.initialize``, you can use the ``apex_args`` field for the TorchTrainer constructor. Valid arguments can be found on the `Apex documentation <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_:
.. code-block:: python
:emphasize-lines: 7-12
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -368,7 +368,7 @@ After connecting, you can scale up the number of workers seamlessly across multi
.. code-block:: python
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -410,9 +410,9 @@ Users can set ``checkpoint="auto"`` to always checkpoint the current model befor
Advanced: Hyperparameter Tuning
-------------------------------
``PyTorchTrainer`` naturally integrates with Tune via the ``PyTorchTrainable`` interface. The same arguments to ``PyTorchTrainer`` should be passed into the ``tune.run(config=...)`` as shown below.
``TorchTrainer`` naturally integrates with Tune via the ``TorchTrainable`` interface. The same arguments to ``TorchTrainer`` should be passed into the ``tune.run(config=...)`` as shown below.
.. literalinclude:: ../../../python/ray/util/sgd/pytorch/examples/tune_example.py
.. literalinclude:: ../../../python/ray/util/sgd/torch/examples/tune_example.py
:language: python
:start-after: __torch_tune_example__
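
The ``literalinclude`` above pulls the example in from the repo; a rough sketch of that integration (config keys mirror the ``TorchTrainer`` constructor arguments, since ``TorchTrainable._setup`` further down in this diff calls ``TorchTrainer(**config)``; the sample count and stopping criterion here are illustrative):

    import torch.nn as nn
    from ray import tune
    from ray.util.sgd.torch import TorchTrainable
    from ray.util.sgd.torch.examples.train_example import (
        model_creator, optimizer_creator, data_creator)

    # Every key in `config` is forwarded to TorchTrainer(**config) by the Trainable.
    config = {
        "model_creator": model_creator,
        "data_creator": data_creator,
        "optimizer_creator": optimizer_creator,
        "loss_creator": nn.MSELoss,
        "num_replicas": 2,
        "use_gpu": False,
    }

    analysis = tune.run(
        TorchTrainable,
        num_samples=2,
        config=config,
        stop={"training_iteration": 2})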
@ -420,13 +420,13 @@ Advanced: Hyperparameter Tuning
Simultaneous Multi-model Training
---------------------------------
In certain scenarios, such as training GANs, you may want to use multiple models in the training loop. You can do this in the ``PyTorchTrainer`` by allowing the ``model_creator``, ``optimizer_creator``, and ``scheduler_creator`` to return multiple values. Provide a custom TrainingOperator (:ref:`raysgd-custom-training`) to train across multiple models.
In certain scenarios, such as training GANs, you may want to use multiple models in the training loop. You can do this in the ``TorchTrainer`` by allowing the ``model_creator``, ``optimizer_creator``, and ``scheduler_creator`` to return multiple values. Provide a custom TrainingOperator (:ref:`raysgd-custom-training`) to train across multiple models.
You can see the `DCGAN script <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/dcgan.py>`_ for an end-to-end example.
You can see the `DCGAN script <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/dcgan.py>`_ for an end-to-end example.
.. code-block:: python
from ray.util.sgd.pytorch import PyTorchTrainer, TrainingOperator
from ray.util.sgd.torch import TorchTrainer, TrainingOperator
def train(*, model=None, criterion=None, optimizer=None, dataloader=None):
model.train()
@ -472,7 +472,7 @@ You can see the `DCGAN script <https://github.com/ray-project/ray/blob/master/py
dataloader=dataloader)
return result
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -487,19 +487,19 @@ Feature Requests
Have features that you'd really like to see in RaySGD? Feel free to `open an issue <https://github.com/ray-project/ray>`_.
.. _raysgd-pytorch-examples:
.. _raysgd-torch-examples:
PyTorchTrainer Examples
TorchTrainer Examples
-----------------------
Here are some examples of using RaySGD for training PyTorch models. If you'd like
to contribute an example, feel free to create a `pull request here <https://github.com/ray-project/ray/>`_.
- `PyTorch training example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/train_example.py>`__:
Simple example of using Ray's PyTorchTrainer.
- `Torch training example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/train_example.py>`__:
Simple example of using Ray's TorchTrainer.
- `CIFAR10 example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py>`__:
- `CIFAR10 example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py>`__:
Training a ResNet18 model on CIFAR10.
- `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/dcgan.py>`__:
- `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/dcgan.py>`__:
Training a Deep Convolutional GAN on MNIST. It constructs two models and two optimizers and uses a custom training operator.

@ -1,29 +1,29 @@
Package Reference
=================
.. _ref-pytorch-trainer:
.. _ref-torch-trainer:
PyTorchTrainer
--------------
TorchTrainer
------------
.. autoclass:: ray.util.sgd.pytorch.PyTorchTrainer
.. autoclass:: ray.util.sgd.torch.TorchTrainer
:members:
.. automethod:: __init__
.. _ref-pytorch-operator:
.. _ref-torch-operator:
PyTorch TrainingOperator
------------------------
.. autoclass:: ray.util.sgd.pytorch.TrainingOperator
.. autoclass:: ray.util.sgd.torch.TrainingOperator
:members:
PyTorchTrainable
----------------
TorchTrainable
--------------
.. autoclass:: ray.util.sgd.pytorch.PyTorchTrainable
.. autoclass:: ray.util.sgd.torch.TorchTrainable
:members:
TFTrainer

@ -1,4 +1,9 @@
from ray.util.sgd.pytorch import PyTorchTrainer
from ray.util.sgd.torch import TorchTrainer
from ray.util.sgd.tf import TFTrainer
__all__ = ["PyTorchTrainer", "TFTrainer"]
__all__ = ["TorchTrainer", "TFTrainer"]
def PyTorchTrainer(**kwargs):
raise DeprecationWarning("ray.util.sgd.pytorch.PyTorchTrainer has been "
"renamed to ray.util.sgd.torch.TorchTrainer")

@ -1,18 +0,0 @@
import logging
logger = logging.getLogger(__name__)
PyTorchTrainer = None
PyTorchTrainable = None
TrainingOperator = None
try:
import torch # noqa: F401
from ray.util.sgd.pytorch.pytorch_trainer import (PyTorchTrainer,
PyTorchTrainable)
from ray.util.sgd.pytorch.training_operator import TrainingOperator
__all__ = ["PyTorchTrainer", "PyTorchTrainable", "TrainingOperator"]
except ImportError:
logger.warning("PyTorch not found. PyTorchTrainer will not be available")

@ -10,12 +10,12 @@ import torch.distributed as dist
import ray
from ray import tune
from ray.util.sgd.pytorch import PyTorchTrainer, PyTorchTrainable
from ray.util.sgd.pytorch.training_operator import _TestingOperator
from ray.util.sgd.pytorch.constants import BATCH_COUNT, SCHEDULER_STEP
from ray.util.sgd.torch import TorchTrainer, TorchTrainable
from ray.util.sgd.torch.training_operator import _TestingOperator
from ray.util.sgd.torch.constants import BATCH_COUNT, SCHEDULER_STEP
from ray.util.sgd.utils import check_for_failure
from ray.util.sgd.pytorch.examples.train_example import (
from ray.util.sgd.torch.examples.train_example import (
model_creator, optimizer_creator, data_creator, LinearDataset)
@ -28,7 +28,7 @@ def ray_start_2_cpus():
def test_single_step(ray_start_2_cpus): # noqa: F811
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -44,7 +44,7 @@ def test_single_step(ray_start_2_cpus): # noqa: F811
@pytest.mark.parametrize("num_replicas", [1, 2]
if dist.is_available() else [1])
def test_train(ray_start_2_cpus, num_replicas): # noqa: F811
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -107,7 +107,7 @@ def test_multi_model(ray_start_2_cpus, num_replicas):
]
return opts[0], opts[1]
trainer1 = PyTorchTrainer(
trainer1 = TorchTrainer(
multi_model_creator,
data_creator,
multi_optimizer_creator,
@ -124,7 +124,7 @@ def test_multi_model(ray_start_2_cpus, num_replicas):
trainer1.shutdown()
trainer2 = PyTorchTrainer(
trainer2 = TorchTrainer(
multi_model_creator,
data_creator,
multi_optimizer_creator,
@ -193,7 +193,7 @@ def test_multi_model_matrix(ray_start_2_cpus, num_replicas): # noqa: F811
for model_count in range(1, 3):
for optimizer_count in range(1, 3):
for scheduler_count in range(1, 3):
trainer = PyTorchTrainer(
trainer = TorchTrainer(
multi_model_creator,
data_creator,
multi_optimizer_creator,
@ -221,7 +221,7 @@ def test_scheduler_freq(ray_start_2_cpus, scheduler_freq): # noqa: F811
return torch.optim.lr_scheduler.StepLR(
optimizer, step_size=30, gamma=0.1)
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -239,7 +239,7 @@ def test_scheduler_freq(ray_start_2_cpus, scheduler_freq): # noqa: F811
def test_scheduler_validate(ray_start_2_cpus): # noqa: F811
from torch.optim.lr_scheduler import ReduceLROnPlateau
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -273,7 +273,7 @@ def test_tune_train(ray_start_2_cpus, num_replicas): # noqa: F811
}
analysis = tune.run(
PyTorchTrainable,
TorchTrainable,
num_samples=2,
config=config,
stop={"training_iteration": 2},
@ -293,7 +293,7 @@ def test_tune_train(ray_start_2_cpus, num_replicas): # noqa: F811
@pytest.mark.parametrize("num_replicas", [1, 2]
if dist.is_available() else [1])
def test_save_and_restore(ray_start_2_cpus, num_replicas): # noqa: F811
trainer1 = PyTorchTrainer(
trainer1 = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -308,7 +308,7 @@ def test_save_and_restore(ray_start_2_cpus, num_replicas): # noqa: F811
trainer1.shutdown()
trainer2 = PyTorchTrainer(
trainer2 = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -346,8 +346,8 @@ def test_fail_with_recover(ray_start_2_cpus): # noqa: F811
success = check_for_failure(worker_stats)
return success, worker_stats
with patch.object(PyTorchTrainer, "_train_epoch", step_with_fail):
trainer1 = PyTorchTrainer(
with patch.object(TorchTrainer, "_train_epoch", step_with_fail):
trainer1 = TorchTrainer(
model_creator,
single_loader,
optimizer_creator,
@ -376,8 +376,8 @@ def test_resize(ray_start_2_cpus): # noqa: F811
success = check_for_failure(worker_stats)
return success, worker_stats
with patch.object(PyTorchTrainer, "_train_epoch", step_with_fail):
trainer1 = PyTorchTrainer(
with patch.object(TorchTrainer, "_train_epoch", step_with_fail):
trainer1 = TorchTrainer(
model_creator,
single_loader,
optimizer_creator,
@ -412,8 +412,8 @@ def test_fail_twice(ray_start_2_cpus): # noqa: F811
success = check_for_failure(worker_stats)
return success, worker_stats
with patch.object(PyTorchTrainer, "_train_epoch", step_with_fail):
trainer1 = PyTorchTrainer(
with patch.object(TorchTrainer, "_train_epoch", step_with_fail):
trainer1 = TorchTrainer(
model_creator,
single_loader,
optimizer_creator,

@ -4,8 +4,8 @@ import torch.nn as nn
import unittest
from unittest.mock import MagicMock
from ray.util.sgd.pytorch.training_operator import TrainingOperator
from ray.util.sgd.pytorch.pytorch_runner import PyTorchRunner
from ray.util.sgd.torch.training_operator import TrainingOperator
from ray.util.sgd.torch.torch_runner import TorchRunner
class LinearDataset(torch.utils.data.Dataset):
@ -45,14 +45,14 @@ def create_dataloaders(config):
return LinearDataset(2, 5), LinearDataset(2, 5, size=400)
class TestPyTorchRunner(unittest.TestCase):
class TestTorchRunner(unittest.TestCase):
def testValidate(self):
class MockOperator(TrainingOperator):
def setup(self, config):
self.train_epoch = MagicMock(returns=dict(mean_accuracy=10))
self.validate = MagicMock(returns=dict(mean_accuracy=10))
runner = PyTorchRunner(
runner = TorchRunner(
model_creator,
create_dataloaders,
optimizer_creator,
@ -76,7 +76,7 @@ class TestPyTorchRunner(unittest.TestCase):
self.count += 1
return {"count": self.count}
runner = PyTorchRunner(
runner = TorchRunner(
model_creator,
create_dataloaders,
optimizer_creator,
@ -105,7 +105,7 @@ class TestPyTorchRunner(unittest.TestCase):
]
return opts[0], opts[1], opts[2]
runner = PyTorchRunner(
runner = TorchRunner(
three_model_creator,
single_loader,
three_optimizer_creator,
@ -116,8 +116,8 @@ class TestPyTorchRunner(unittest.TestCase):
self.assertEqual(len(runner.given_models), 3)
self.assertEqual(len(runner.given_optimizers), 3)
runner2 = PyTorchRunner(model_creator, single_loader,
optimizer_creator, loss_creator)
runner2 = TorchRunner(model_creator, single_loader, optimizer_creator,
loss_creator)
runner2.setup()
self.assertNotEqual(runner2.given_models, runner2.models)
@ -128,26 +128,26 @@ class TestPyTorchRunner(unittest.TestCase):
return (LinearDataset(2, 5), LinearDataset(2, 5, size=400),
LinearDataset(2, 5, size=400))
runner = PyTorchRunner(model_creator, three_data_loader,
optimizer_creator, loss_creator)
runner = TorchRunner(model_creator, three_data_loader,
optimizer_creator, loss_creator)
with self.assertRaises(ValueError):
runner.setup()
runner2 = PyTorchRunner(model_creator, three_data_loader,
optimizer_creator, loss_creator)
runner2 = TorchRunner(model_creator, three_data_loader,
optimizer_creator, loss_creator)
with self.assertRaises(ValueError):
runner2.setup()
def testSingleLoader(self):
runner = PyTorchRunner(model_creator, single_loader, optimizer_creator,
loss_creator)
runner = TorchRunner(model_creator, single_loader, optimizer_creator,
loss_creator)
runner.setup()
runner.train_epoch()
with self.assertRaises(ValueError):
runner.validate()
def testNativeLoss(self):
runner = PyTorchRunner(
runner = TorchRunner(
model_creator,
single_loader,
optimizer_creator,
@ -165,8 +165,8 @@ class TestPyTorchRunner(unittest.TestCase):
]
return opts[0], opts[1], opts[2]
runner = PyTorchRunner(multi_model_creator, single_loader,
multi_optimizer_creator, loss_creator)
runner = TorchRunner(multi_model_creator, single_loader,
multi_optimizer_creator, loss_creator)
with self.assertRaises(ValueError):
runner.setup()

@ -0,0 +1,17 @@
import logging
logger = logging.getLogger(__name__)
TorchTrainer = None
TorchTrainable = None
TrainingOperator = None
try:
import torch # noqa: F401
from ray.util.sgd.torch.torch_trainer import (TorchTrainer, TorchTrainable)
from ray.util.sgd.torch.training_operator import TrainingOperator
__all__ = ["TorchTrainer", "TorchTrainable", "TrainingOperator"]
except ImportError:
logger.warning("PyTorch not found. TorchTrainer will not be available")

@ -7,24 +7,24 @@ import torch.distributed as dist
import torch.utils.data
from torch.nn.parallel import DistributedDataParallel
from ray.util.sgd.pytorch.pytorch_runner import PyTorchRunner
from ray.util.sgd.torch.torch_runner import TorchRunner
logger = logging.getLogger(__name__)
class DistributedPyTorchRunner(PyTorchRunner):
class DistributedTorchRunner(TorchRunner):
"""Manages a distributed PyTorch model replica.
Args:
args: Arguments for PyTorchRunner.
args: Arguments for TorchRunner.
backend (string): backend used by distributed PyTorch.
kwargs: Keyword arguments for PyTorchRunner.
kwargs: Keyword arguments for TorchRunner.
"""
def __init__(self, *args, backend="gloo", **kwargs):
super(DistributedPyTorchRunner, self).__init__(*args, **kwargs)
super(DistributedTorchRunner, self).__init__(*args, **kwargs)
self.backend = backend
def setup(self, url, world_rank, world_size):
@ -110,7 +110,7 @@ class DistributedPyTorchRunner(PyTorchRunner):
"""
if hasattr(self.train_loader.sampler, "set_epoch"):
self.train_loader.sampler.set_epoch(self.epochs)
return super(DistributedPyTorchRunner, self).train_epoch(**kwargs)
return super(DistributedTorchRunner, self).train_epoch(**kwargs)
def _get_model_state_dicts(self):
"""Fetch state from ``model.module`` instead of ``model``.
@ -132,7 +132,7 @@ class DistributedPyTorchRunner(PyTorchRunner):
# def shutdown(self):
"""Attempts to shut down the worker."""
# super(DistributedPyTorchRunner, self).shutdown()
# super(DistributedTorchRunner, self).shutdown()
# TODO: Temporarily removing since it causes hangs on MacOSX.
# However, it seems to be harmless to remove permanently
# since the processes are shutdown anyways. This comment can be

@ -8,8 +8,8 @@ import torchvision
import torchvision.transforms as transforms
import ray
from ray.util.sgd.pytorch import (PyTorchTrainer, PyTorchTrainable)
from ray.util.sgd.pytorch.resnet import ResNet18
from ray.util.sgd.torch import (TorchTrainer, TorchTrainable)
from ray.util.sgd.torch.resnet import ResNet18
def initialization_hook():
@ -62,7 +62,7 @@ def train_example(num_replicas=1,
use_gpu=False,
use_fp16=False,
test_mode=False):
trainer1 = PyTorchTrainer(
trainer1 = TorchTrainer(
ResNet18,
cifar_creator,
optimizer_creator,
@ -107,7 +107,7 @@ def tune_example(num_replicas=1, use_gpu=False, test_mode=False):
}
analysis = tune.run(
PyTorchTrainable,
TorchTrainable,
num_samples=2,
config=config,
stop={"training_iteration": 2},

@ -15,9 +15,9 @@ from torch.nn import functional as F
from scipy.stats import entropy
import ray
from ray.util.sgd import PyTorchTrainer
from ray.util.sgd import TorchTrainer
from ray.util.sgd.utils import override
from ray.util.sgd.pytorch import TrainingOperator
from ray.util.sgd.torch import TrainingOperator
def data_creator(config):
@ -223,9 +223,9 @@ def train_example(num_replicas=1, use_gpu=False, test_mode=False):
"test_mode": test_mode,
"classification_model_path": os.path.join(
os.path.dirname(ray.__file__),
"util/sgd/pytorch/examples/mnist_cnn.pt")
"util/sgd/torch/examples/mnist_cnn.pt")
}
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,

@ -13,7 +13,7 @@ import numpy as np
import torch
import torch.nn as nn
from ray.util.sgd import PyTorchTrainer
from ray.util.sgd import TorchTrainer
class LinearDataset(torch.utils.data.Dataset):
@ -44,7 +44,7 @@ def optimizer_creator(model, config):
def scheduler_creator(optimizer, config):
"""Returns a learning rate scheduler wrapping the optimizer.
You will need to set ``PyTorchTrainer(scheduler_step_freq="epoch")``
You will need to set ``TorchTrainer(scheduler_step_freq="epoch")``
for the scheduler to be incremented correctly.
If using a scheduler for validation loss, be sure to call
@ -59,7 +59,7 @@ def data_creator(config):
def train_example(num_replicas=1, use_gpu=False):
trainer1 = PyTorchTrainer(
trainer1 = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
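
Since the docstring above notes that ``scheduler_step_freq="epoch"`` must be set for the scheduler to be stepped correctly, here is a hedged sketch of wiring a scheduler into the renamed trainer (creator functions as defined earlier in this file; the example file already defines its own ``scheduler_creator``, and the values below are illustrative):

    import torch
    import torch.nn as nn
    from ray.util.sgd import TorchTrainer

    def scheduler_creator(optimizer, config):
        # Decay the learning rate by 10x every 30 epochs (illustrative values).
        return torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

    trainer1 = TorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator,
        scheduler_step_freq="epoch")  # step the scheduler once per epoch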

@ -14,7 +14,7 @@ import torch.nn as nn
import ray
from ray import tune
from ray.util.sgd.pytorch.pytorch_trainer import PyTorchTrainable
from ray.util.sgd.torch.torch_trainer import TorchTrainable
class LinearDataset(torch.utils.data.Dataset):
@ -60,7 +60,7 @@ def tune_example(num_replicas=1, use_gpu=False):
}
analysis = tune.run(
PyTorchTrainable,
TorchTrainable,
num_samples=12,
config=config,
stop={"training_iteration": 2},

@ -9,8 +9,8 @@ import torch.utils.data
from torch.utils.data import Dataset
import ray
from ray.util.sgd.pytorch.constants import USE_FP16, SCHEDULER_STEP
from ray.util.sgd.pytorch.training_operator import TrainingOperator
from ray.util.sgd.torch.constants import USE_FP16, SCHEDULER_STEP
from ray.util.sgd.torch.training_operator import TrainingOperator
from ray.util.sgd import utils
logger = logging.getLogger(__name__)
@ -23,23 +23,23 @@ except ImportError:
pass
class PyTorchRunner:
class TorchRunner:
"""Manages a PyTorch model for training.
Args:
model_creator (dict -> *): see pytorch_trainer.py
data_creator (dict -> Dataset, Dataset): see pytorch_trainer.py.
optimizer_creator (models, dict -> optimizers): see pytorch_trainer.py.
loss_creator (dict -> loss | Loss class): see pytorch_trainer.py.
model_creator (dict -> *): see torch_trainer.py
data_creator (dict -> Dataset, Dataset): see torch_trainer.py.
optimizer_creator (models, dict -> optimizers): see torch_trainer.py.
loss_creator (dict -> loss | Loss class): see torch_trainer.py.
scheduler_creator (optimizers, dict -> schedulers): see
pytorch_trainer.py.
training_operator_cls: see pytorch_trainer.py
config (dict): see pytorch_trainer.py.
dataloader_config (dict): See pytorch_trainer.py.
batch_size (int): see pytorch_trainer.py.
use_fp16 (bool): see pytorch_trainer.py.
apex_args (dict|None): see pytorch_trainer.py.
scheduler_step_freq (str): see pytorch_trainer.py.
torch_trainer.py.
training_operator_cls: see torch_trainer.py
config (dict): see torch_trainer.py.
dataloader_config (dict): See torch_trainer.py.
batch_size (int): see torch_trainer.py.
use_fp16 (bool): see torch_trainer.py.
apex_args (dict|None): see torch_trainer.py.
scheduler_step_freq (str): see torch_trainer.py.
"""
def __init__(self,

@ -11,11 +11,11 @@ import ray
from ray.tune import Trainable
from ray.tune.trial import Resources
from ray.util.sgd.pytorch.distributed_pytorch_runner import (
DistributedPyTorchRunner)
from ray.util.sgd.torch.distributed_torch_runner import (
DistributedTorchRunner)
from ray.util.sgd import utils
from ray.util.sgd.pytorch.pytorch_runner import PyTorchRunner
from ray.util.sgd.pytorch.constants import VALID_SCHEDULER_STEP
from ray.util.sgd.torch.torch_runner import TorchRunner
from ray.util.sgd.torch.constants import VALID_SCHEDULER_STEP
logger = logging.getLogger(__name__)
RESIZE_COOLDOWN_S = 10
@ -29,7 +29,7 @@ def _validate_scheduler_step_freq(scheduler_step_freq):
VALID_SCHEDULER_STEP, scheduler_step_freq))
class PyTorchTrainer:
class TorchTrainer:
"""Train a PyTorch model using distributed PyTorch.
Launches a set of actors which connect via distributed PyTorch and
@ -49,7 +49,7 @@ class PyTorchTrainer:
def data_creator(config):
return LinearDataset(2, 5), LinearDataset(2, 5, size=400)
trainer = PyTorchTrainer(
trainer = TorchTrainer(
model_creator,
data_creator,
optimizer_creator,
@ -195,7 +195,7 @@ class PyTorchTrainer:
if num_replicas == 1:
# Generate actor class
Runner = ray.remote(
num_cpus=1, num_gpus=int(self.use_gpu))(PyTorchRunner)
num_cpus=1, num_gpus=int(self.use_gpu))(TorchRunner)
# Start workers
self.workers = [
Runner.remote(
@ -220,8 +220,7 @@ class PyTorchTrainer:
else:
# Generate actor class
Runner = ray.remote(
num_cpus=1,
num_gpus=int(self.use_gpu))(DistributedPyTorchRunner)
num_cpus=1, num_gpus=int(self.use_gpu))(DistributedTorchRunner)
# Compute batch size per replica
batch_size_per_replica = self.batch_size // num_replicas
if self.batch_size % num_replicas > 0:
@ -285,7 +284,7 @@ class PyTorchTrainer:
in case of shared cluster usage.
checkpoint (str): Path to checkpoint to restore from if retrying.
If max_retries is set and ``checkpoint == "auto"``,
PyTorchTrainer will save a checkpoint before starting to train.
TorchTrainer will save a checkpoint before starting to train.
info (dict): Optional dictionary passed to the training
operator for ``train_epoch`` and ``train_batch``.
@ -487,7 +486,7 @@ class PyTorchTrainer:
return False
class PyTorchTrainable(Trainable):
class TorchTrainable(Trainable):
@classmethod
def default_resource_request(cls, config):
return Resources(
@ -497,7 +496,7 @@ class PyTorchTrainable(Trainable):
extra_gpu=int(config["use_gpu"]) * config["num_replicas"])
def _setup(self, config):
self._trainer = PyTorchTrainer(**config)
self._trainer = TorchTrainer(**config)
def _train(self):
train_stats = self._trainer.train()

@ -2,7 +2,7 @@ import collections
import torch
from ray.util.sgd.utils import TimerStat, AverageMeter
from ray.util.sgd.pytorch.constants import (
from ray.util.sgd.torch.constants import (
SCHEDULER_STEP_EPOCH, SCHEDULER_STEP_BATCH, SCHEDULER_STEP, BATCH_COUNT)
amp = None
@ -11,7 +11,7 @@ try:
from apex import amp
except ImportError:
# Apex library is not installed, so we cannot enable mixed precision.
# We don't log here because logging happens in the pytorch_runner,
# We don't log here because logging happens in the torch_runner,
# where amp is initialized.
pass
@ -26,7 +26,7 @@ class TrainingOperator:
The scheduler will only be called at a batch or epoch frequency, depending
on the user parameter. Be sure to set ``scheduler_step_freq`` in
``PyTorchTrainer`` to either "batch" or "epoch" to increment the scheduler
``TorchTrainer`` to either "batch" or "epoch" to increment the scheduler
correctly during training. If using a learning rate scheduler
that depends on validation loss, you can use ``trainer.update_scheduler``.
@ -290,7 +290,7 @@ class TrainingOperator:
@property
def config(self):
"""Dictionary as provided into PyTorchTrainer."""
"""Dictionary as provided into TorchTrainer."""
return self._config
@property