Mirror of https://github.com/vale981/ray, synced 2025-03-06 02:21:39 -05:00
[RaySGD] Rename PyTorch API endpoints to start with Torch (#7425)
* Start renaming pytorch to torch
* Rename PyTorchTrainer to TorchTrainer
* Rename PyTorch runners to Torch runners
* Finish renaming API
* Rename to torch in tests
* Finish renaming docs + tests
* Run format + fix DeprecationWarning
* fix
* move tests up
* rename

Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
This commit is contained in:
parent f6883bf725
commit 3a134c7224
22 changed files with 222 additions and 218 deletions
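For RaySGD users, the change is a rename at the import and class level; the creator-function API documented in the diff below is otherwise unchanged. The following is a minimal usage sketch of the renamed API, not taken from this commit: the toy linear model, dataset, and hyperparameters are illustrative only.

```python
import ray
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset

from ray.util.sgd import TorchTrainer  # previously: from ray.util.sgd import PyTorchTrainer


def model_creator(config):
    # Returns one or more torch.nn.Module objects.
    return nn.Linear(1, 1)


def optimizer_creator(model, config):
    # Returns one or more torch optimizers for the model(s) above.
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-2))


def data_creator(config):
    # Returns one or two Dataset objects (train, and optionally validation).
    x = torch.randn(256, 1)
    return TensorDataset(x, 2 * x)


ray.init()

# Previously: trainer = PyTorchTrainer(...)
trainer = TorchTrainer(
    model_creator,
    data_creator,
    optimizer_creator,
    loss_creator=nn.MSELoss,  # a Torch loss constructor can be passed directly
    num_replicas=2)

stats = trainer.train()
print(stats)
trainer.shutdown()
```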
|
@ -31,6 +31,58 @@ fi
|
|||
|
||||
echo "Using Docker image" $DOCKER_SHA
|
||||
|
||||
|
||||
######################## SGD TESTS #################################
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python -m pytest /ray/python/ray/util/sgd/tests
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/doc/examples/doc_code/raysgd_torch_signatures.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/train_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/train_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/tune_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/tune_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py --smoke-test --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py --smoke-test --tune
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/torch/examples/dcgan.py --smoke-test --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --tune
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test --augment-data
|
||||
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
pytest /ray/python/ray/tune/tests/test_cluster.py
|
||||
|
||||
|
@ -139,54 +191,3 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE}
|
|||
# $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
# python /ray/python/ray/tune/examples/bohb_example.py \
|
||||
# --smoke-test
|
||||
|
||||
|
||||
######################## SGD TESTS #################################
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python -m pytest /ray/python/ray/util/sgd/tests
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/doc/examples/doc_code/raysgd_torch_signatures.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/train_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/train_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/tune_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/tune_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py --smoke-test --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py --smoke-test --tune
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/pytorch/examples/dcgan.py --smoke-test --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --num-replicas=2
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/tensorflow_train_example.py --tune
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test
|
||||
|
||||
$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
|
||||
python /ray/python/ray/util/sgd/tf/examples/cifar_tf_example.py --num-replicas 2 --smoke-test --augment-data
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# flake8: noqa
|
||||
"""
|
||||
This file holds code for the Pytorch Trainer creator signatures.
|
||||
This file holds code for the torch Trainer creator signatures.
|
||||
|
||||
It ignores yapf because yapf doesn't allow comments right after code blocks,
|
||||
but we put comments right after code blocks to prevent large white spaces
|
||||
|
@ -18,7 +18,7 @@ def model_creator(config):
|
|||
function to specify the optimization procedure for multiple models.
|
||||
|
||||
Args:
|
||||
config (dict): Configuration dictionary passed into ``PyTorchTrainer``.
|
||||
config (dict): Configuration dictionary passed into ``TorchTrainer``.
|
||||
|
||||
Returns:
|
||||
One or more torch.nn.Module objects.
|
||||
|
@ -36,7 +36,7 @@ def optimizer_creator(model, config):
|
|||
Args:
|
||||
models: The return values from ``model_creator``. This can be one
|
||||
or more torch nn modules.
|
||||
config (dict): Configuration dictionary passed into ``PyTorchTrainer``.
|
||||
config (dict): Configuration dictionary passed into ``TorchTrainer``.
|
||||
|
||||
Returns:
|
||||
One or more Torch optimizer objects.
|
||||
|
@ -46,7 +46,7 @@ def optimizer_creator(model, config):
|
|||
|
||||
|
||||
# __torch_data_start__
|
||||
from ray.util.sgd.pytorch.examples.train_example import LinearDataset
|
||||
from ray.util.sgd.torch.examples.train_example import LinearDataset
|
||||
|
||||
def data_creator(config):
|
||||
"""Constructs torch.utils.data.Dataset objects.
|
||||
|
@ -55,7 +55,7 @@ def data_creator(config):
|
|||
only one dataset will be used for training.
|
||||
|
||||
Args:
|
||||
config: Configuration dictionary passed into ``PyTorchTrainer``
|
||||
config: Configuration dictionary passed into ``TorchTrainer``
|
||||
|
||||
Returns:
|
||||
One or Two Dataset objects. If only one Dataset object is provided,
|
||||
|
@ -71,10 +71,10 @@ def loss_creator(config):
|
|||
"""Constructs the Torch Loss object.
|
||||
|
||||
Note that optionally, you can pass in a Torch Loss constructor directly
|
||||
into the PyTorchTrainer (i.e., ``PyTorchTrainer(loss_creator=nn.BCELoss, ...)``).
|
||||
into the TorchTrainer (i.e., ``TorchTrainer(loss_creator=nn.BCELoss, ...)``).
|
||||
|
||||
Args:
|
||||
config: Configuration dictionary passed into ``PyTorchTrainer``
|
||||
config: Configuration dictionary passed into ``TorchTrainer``
|
||||
|
||||
Returns:
|
||||
Torch Loss object.
|
||||
|
@ -91,7 +91,7 @@ def scheduler_creator(optimizer, config):
|
|||
Args:
|
||||
optimizers: The return values from ``optimizer_creator``.
|
||||
This can be one or more torch optimizer objects.
|
||||
config: Configuration dictionary passed into ``PyTorchTrainer``
|
||||
config: Configuration dictionary passed into ``TorchTrainer``
|
||||
|
||||
Returns:
|
||||
One or more Torch scheduler objects.
|
||||
|
@ -108,9 +108,9 @@ ray.init()
|
|||
# __torch_ray_end__
|
||||
|
||||
# __torch_trainer_start__
|
||||
from ray.util.sgd import PyTorchTrainer
|
||||
from ray.util.sgd import TorchTrainer
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
|
|
@ -7,7 +7,7 @@ RaySGD is a lightweight library for distributed deep learning, providing thin wr
|
|||
|
||||
The main features are:
|
||||
|
||||
- **Ease of use**: Scale Pytorch's native ``DistributedDataParallel`` and TensorFlow's ``tf.distribute.MirroredStrategy`` without needing to monitor individual nodes.
|
||||
- **Ease of use**: Scale PyTorch's native ``DistributedDataParallel`` and TensorFlow's ``tf.distribute.MirroredStrategy`` without needing to monitor individual nodes.
|
||||
- **Composability**: RaySGD is built on top of the Ray Actor API, enabling seamless integration with existing Ray applications such as RLlib, Tune, and Ray.Serve.
|
||||
- **Scale up and down**: Start on single CPU. Scale up to multi-node, multi-CPU, or multi-GPU clusters by changing 2 lines of code.
|
||||
|
||||
|
@ -20,7 +20,7 @@ The main features are:
|
|||
Getting Started
|
||||
---------------
|
||||
|
||||
You can start a ``PyTorchTrainer`` with the following:
|
||||
You can start a ``TorchTrainer`` with the following:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -29,7 +29,7 @@ You can start a ``PyTorchTrainer`` with the following:
|
|||
import torch.nn as nn
|
||||
from torch import distributed
|
||||
|
||||
from ray.util.sgd import PyTorchTrainer
|
||||
from ray.util.sgd import TorchTrainer
|
||||
from ray.util.sgd.examples.train_example import LinearDataset
|
||||
|
||||
|
||||
|
@ -48,7 +48,7 @@ You can start a ``PyTorchTrainer`` with the following:
|
|||
|
||||
ray.init()
|
||||
|
||||
trainer1 = PyTorchTrainer(
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
Distributed PyTorch
|
||||
===================
|
||||
|
||||
The RaySGD ``PyTorchTrainer`` simplifies distributed model training for PyTorch. The ``PyTorchTrainer`` is a wrapper around ``torch.distributed.launch`` with a Python API to easily incorporate distributed training into a larger Python application, as opposed to needing to wrap your training code in bash scripts.
|
||||
The RaySGD ``TorchTrainer`` simplifies distributed model training for PyTorch. The ``TorchTrainer`` is a wrapper around ``torch.distributed.launch`` with a Python API to easily incorporate distributed training into a larger Python application, as opposed to needing to wrap your training code in bash scripts.
|
||||
|
||||
Under the hood, ``PytorchTrainer`` will create *replicas* of your model (controlled by ``num_replicas``), each of which is managed by a Ray actor.
|
||||
Under the hood, ``TorchTrainer`` will create *replicas* of your model (controlled by ``num_replicas``), each of which is managed by a Ray actor.
|
||||
|
||||
.. image:: raysgd-actors.svg
|
||||
:align: center
|
||||
|
||||
For end to end examples leveraging RaySGD PyTorchTrainer, jump to :ref:`raysgd-pytorch-examples`.
|
||||
For end to end examples leveraging RaySGD TorchTrainer, jump to :ref:`raysgd-torch-examples`.
|
||||
|
||||
.. contents:: :local:
|
||||
|
||||
|
@ -17,19 +17,19 @@ Setting up training
|
|||
|
||||
.. tip:: Get in touch with us if you're using or considering using `RaySGD <https://forms.gle/26EMwdahdgm7Lscy9>`_!
|
||||
|
||||
The ``PyTorchTrainer`` can be constructed with functions that wrap components of the training script. Specifically, it requires constructors for the Model, Data, Optimizer, Loss, and ``lr_scheduler`` to create replicated copies across different devices and machines.
|
||||
The ``TorchTrainer`` can be constructed with functions that wrap components of the training script. Specifically, it requires constructors for the Model, Data, Optimizer, Loss, and ``lr_scheduler`` to create replicated copies across different devices and machines.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
:start-after: __torch_trainer_start__
|
||||
:end-before: __torch_trainer_end__
|
||||
|
||||
The below section covers the expected signatures of creator functions. Jump to :ref:`starting-pytorch-trainer`.
|
||||
The below section covers the expected signatures of creator functions. Jump to :ref:`starting-torch-trainer`.
|
||||
|
||||
Model Creator
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
This is the signature needed for ``PyTorchTrainer(model_creator=...)``.
|
||||
This is the signature needed for ``TorchTrainer(model_creator=...)``.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
|
@ -40,7 +40,7 @@ This is the signature needed for ``PyTorchTrainer(model_creator=...)``.
|
|||
Optimizer Creator
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
This is the signature needed for ``PyTorchTrainer(optimizer_creator=...)``.
|
||||
This is the signature needed for ``TorchTrainer(optimizer_creator=...)``.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
|
@ -52,7 +52,7 @@ This is the signature needed for ``PyTorchTrainer(optimizer_creator=...)``.
|
|||
Data Creator
|
||||
~~~~~~~~~~~~
|
||||
|
||||
This is the signature needed for ``PyTorchTrainer(data_creator=...)``.
|
||||
This is the signature needed for ``TorchTrainer(data_creator=...)``.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
|
@ -64,7 +64,7 @@ This is the signature needed for ``PyTorchTrainer(data_creator=...)``.
|
|||
Loss Creator
|
||||
~~~~~~~~~~~~
|
||||
|
||||
This is the signature needed for ``PyTorchTrainer(loss_creator=...)``.
|
||||
This is the signature needed for ``TorchTrainer(loss_creator=...)``.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
|
@ -76,7 +76,7 @@ Scheduler Creator
|
|||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
Optionally, you can provide a creator function for the learning rate scheduler. This is the signature needed
|
||||
for ``PyTorchTrainer(scheduler_creator=...)``.
|
||||
for ``TorchTrainer(scheduler_creator=...)``.
|
||||
|
||||
.. literalinclude:: ../../examples/doc_code/raysgd_torch_signatures.py
|
||||
:language: python
|
||||
|
@ -84,7 +84,7 @@ for ``PyTorchTrainer(scheduler_creator=...)``.
|
|||
:end-before: __torch_scheduler_end__
|
||||
|
||||
|
||||
.. _starting-pytorch-trainer:
|
||||
.. _starting-torch-trainer:
|
||||
|
||||
Putting things together
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -108,7 +108,7 @@ You can also set the number of workers and whether the workers will use GPUs:
|
|||
.. code-block:: python
|
||||
:emphasize-lines: 8,9
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -138,7 +138,7 @@ After training, you may want to reappropriate the Ray cluster. To release Ray re
|
|||
|
||||
.. note:: Be sure to call ``trainer.save()`` or ``trainer.get_model()`` before shutting down.
|
||||
|
||||
See the documentation on the PyTorchTrainer here: :ref:`ref-pytorch-trainer`.
|
||||
See the documentation on the TorchTrainer here: :ref:`ref-torch-trainer`.
|
||||
|
||||
|
||||
.. _raysgd-custom-training:
|
||||
|
@ -146,8 +146,8 @@ See the documentation on the PyTorchTrainer here: :ref:`ref-pytorch-trainer`.
|
|||
Custom Training and Validation (Operators)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
``PyTorchTrainer`` allows you to run a custom training and validation loops in parallel on each worker, providing a flexible interface similar to using PyTorch natively.
|
||||
This is done via the :ref:`ref-pytorch-operator` interface.
|
||||
``TorchTrainer`` allows you to run a custom training and validation loops in parallel on each worker, providing a flexible interface similar to using PyTorch natively.
|
||||
This is done via the :ref:`ref-torch-operator` interface.
|
||||
|
||||
For both training and validation, there are two granularities that you can provide customization - per epoch and per batch. These correspond to ``train_batch``,
|
||||
``train_epoch``, ``validate``, and ``validate_batch``. Other useful methods to override include ``setup``, ``save`` and ``restore``. You can use these
|
||||
|
@ -160,7 +160,7 @@ Below is a partial example of a custom ``TrainingOperator`` that provides a ``tr
|
|||
.. code-block:: python
|
||||
|
||||
import torch
|
||||
from ray.util.sgd.pytorch import TrainingOperator
|
||||
from ray.util.sgd.torch import TrainingOperator
|
||||
|
||||
class GANOperator(TrainingOperator):
|
||||
def setup(self, config):
|
||||
|
@ -237,7 +237,7 @@ Below is a partial example of a custom ``TrainingOperator`` that provides a ``tr
|
|||
"num_samples": imgs.shape[0]
|
||||
}
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -252,7 +252,7 @@ Below is a partial example of a custom ``TrainingOperator`` that provides a ``tr
|
|||
stats = trainer.train()
|
||||
print(stats)
|
||||
|
||||
See the `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/dcgan.py>`__ for an end to end example. It constructs two models and two optimizers and uses a custom training operator to provide a non-standard training loop.
|
||||
See the `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/dcgan.py>`__ for an end to end example. It constructs two models and two optimizers and uses a custom training operator to provide a non-standard training loop.
|
||||
|
||||
|
||||
Initialization Functions
|
||||
|
@ -269,7 +269,7 @@ Use the ``initialization_hook`` parameter to initialize state on each worker pro
|
|||
os.environ["NCCL_LL_THRESHOLD"] = "0"
|
||||
os.environ["NCCL_DEBUG"] = "INFO"
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -290,7 +290,7 @@ and ``trainer.load``, which wraps the relevant ``torch.save`` and ``torch.load``
|
|||
checkpoint_path = os.path.join(tempfile.mkdtemp(), "checkpoint")
|
||||
trainer_1.save(checkpoint_path)
|
||||
|
||||
trainer_2 = PyTorchTrainer(
|
||||
trainer_2 = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -317,7 +317,7 @@ You can enable mixed precision training for PyTorch with the ``use_fp16`` flag.
|
|||
.. code-block:: python
|
||||
:emphasize-lines: 7
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -329,12 +329,12 @@ You can enable mixed precision training for PyTorch with the ``use_fp16`` flag.
|
|||
``Apex`` is a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training. When ``use_fp16=True``,
|
||||
you should not manually cast your model or data to ``.half()``. The flag informs the Trainer to call ``amp.initialize`` on the created models and optimizers and optimize using the scaled loss: ``amp.scale_loss(loss, optimizer)``.
|
||||
|
||||
To specify particular parameters for ``amp.initialize``, you can use the ``apex_args`` field for the PyTorchTrainer constructor. Valid arguments can be found on the `Apex documentation <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_:
|
||||
To specify particular parameters for ``amp.initialize``, you can use the ``apex_args`` field for the TorchTrainer constructor. Valid arguments can be found on the `Apex documentation <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_:
|
||||
|
||||
.. code-block:: python
|
||||
:emphasize-lines: 7-12
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -368,7 +368,7 @@ After connecting, you can scale up the number of workers seamlessly across multi
|
|||
|
||||
.. code-block:: python
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -410,9 +410,9 @@ Users can set ``checkpoint="auto"`` to always checkpoint the current model befor
|
|||
Advanced: Hyperparameter Tuning
|
||||
-------------------------------
|
||||
|
||||
``PyTorchTrainer`` naturally integrates with Tune via the ``PyTorchTrainable`` interface. The same arguments to ``PyTorchTrainer`` should be passed into the ``tune.run(config=...)`` as shown below.
|
||||
``TorchTrainer`` naturally integrates with Tune via the ``TorchTrainable`` interface. The same arguments to ``TorchTrainer`` should be passed into the ``tune.run(config=...)`` as shown below.
|
||||
|
||||
.. literalinclude:: ../../../python/ray/util/sgd/pytorch/examples/tune_example.py
|
||||
.. literalinclude:: ../../../python/ray/util/sgd/torch/examples/tune_example.py
|
||||
:language: python
|
||||
:start-after: __torch_tune_example__
|
||||
|
||||
|
@ -420,13 +420,13 @@ Advanced: Hyperparameter Tuning
|
|||
Simultaneous Multi-model Training
|
||||
---------------------------------
|
||||
|
||||
In certain scenarios, such as training GANs, you may want to use multiple models in the training loop. You can do this in the ``PyTorchTrainer`` by allowing the ``model_creator``, ``optimizer_creator``, and ``scheduler_creator`` to return multiple values. Provide a custom TrainingOperator (:ref:`raysgd-custom-training`) to train across multiple models.
|
||||
In certain scenarios, such as training GANs, you may want to use multiple models in the training loop. You can do this in the ``TorchTrainer`` by allowing the ``model_creator``, ``optimizer_creator``, and ``scheduler_creator`` to return multiple values. Provide a custom TrainingOperator (:ref:`raysgd-custom-training`) to train across multiple models.
|
||||
|
||||
You can see the `DCGAN script <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/dcgan.py>`_ for an end-to-end example.
|
||||
You can see the `DCGAN script <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/dcgan.py>`_ for an end-to-end example.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray.util.sgd.pytorch import PyTorchTrainer, TrainingOperator
|
||||
from ray.util.sgd.torch import TorchTrainer, TrainingOperator
|
||||
|
||||
def train(*, model=None, criterion=None, optimizer=None, dataloader=None):
|
||||
model.train()
|
||||
|
@ -472,7 +472,7 @@ You can see the `DCGAN script <https://github.com/ray-project/ray/blob/master/py
|
|||
dataloader=dataloader)
|
||||
return result
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -487,19 +487,19 @@ Feature Requests
|
|||
|
||||
Have features that you'd really like to see in RaySGD? Feel free to `open an issue <https://github.com/ray-project/ray>`_.
|
||||
|
||||
.. _raysgd-pytorch-examples:
|
||||
.. _raysgd-torch-examples:
|
||||
|
||||
PyTorchTrainer Examples
|
||||
TorchTrainer Examples
|
||||
-----------------------
|
||||
|
||||
Here are some examples of using RaySGD for training PyTorch models. If you'd like
|
||||
to contribute an example, feel free to create a `pull request here <https://github.com/ray-project/ray/>`_.
|
||||
|
||||
- `PyTorch training example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/train_example.py>`__:
|
||||
Simple example of using Ray's PyTorchTrainer.
|
||||
- `Torch training example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/train_example.py>`__:
|
||||
Simple example of using Ray's TorchTrainer.
|
||||
|
||||
- `CIFAR10 example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/cifar_pytorch_example.py>`__:
|
||||
- `CIFAR10 example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/cifar_pytorch_example.py>`__:
|
||||
Training a ResNet18 model on CIFAR10.
|
||||
|
||||
- `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/pytorch/examples/dcgan.py>`__:
|
||||
- `DCGAN example <https://github.com/ray-project/ray/blob/master/python/ray/util/sgd/torch/examples/dcgan.py>`__:
|
||||
Training a Deep Convolutional GAN on MNIST. It constructs two models and two optimizers and uses a custom training operator.
|
||||
|
|
|
@ -1,29 +1,29 @@
|
|||
Package Reference
|
||||
=================
|
||||
|
||||
.. _ref-pytorch-trainer:
|
||||
.. _ref-torch-trainer:
|
||||
|
||||
PyTorchTrainer
|
||||
--------------
|
||||
TorchTrainer
|
||||
------------
|
||||
|
||||
.. autoclass:: ray.util.sgd.pytorch.PyTorchTrainer
|
||||
.. autoclass:: ray.util.sgd.torch.TorchTrainer
|
||||
:members:
|
||||
|
||||
.. automethod:: __init__
|
||||
|
||||
.. _ref-pytorch-operator:
|
||||
.. _ref-torch-operator:
|
||||
|
||||
PyTorch TrainingOperator
|
||||
------------------------
|
||||
|
||||
.. autoclass:: ray.util.sgd.pytorch.TrainingOperator
|
||||
.. autoclass:: ray.util.sgd.torch.TrainingOperator
|
||||
:members:
|
||||
|
||||
|
||||
PyTorchTrainable
|
||||
----------------
|
||||
TorchTrainable
|
||||
--------------
|
||||
|
||||
.. autoclass:: ray.util.sgd.pytorch.PyTorchTrainable
|
||||
.. autoclass:: ray.util.sgd.torch.TorchTrainable
|
||||
:members:
|
||||
|
||||
TFTrainer
|
||||
|
|
|
@ -1,4 +1,9 @@
from ray.util.sgd.pytorch import PyTorchTrainer
from ray.util.sgd.torch import TorchTrainer
from ray.util.sgd.tf import TFTrainer

__all__ = ["PyTorchTrainer", "TFTrainer"]
__all__ = ["TorchTrainer", "TFTrainer"]


def PyTorchTrainer(**kwargs):
    raise DeprecationWarning("ray.util.sgd.pytorch.PyTorchTrainer has been "
                             "renamed to ray.util.sgd.torch.TorchTrainer")
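The hunk above keeps the old ``PyTorchTrainer`` name importable from ``ray.util.sgd`` only as a stub that raises, so downstream code fails loudly instead of silently running a removed code path. A hypothetical caller would now see something like the sketch below; this is illustrative and not part of the diff:

```python
from ray.util.sgd import PyTorchTrainer  # old name now resolves to the stub above

try:
    # The stub accepts only keyword arguments, so any keyword construction
    # attempt reaches the raise statement (positional calls fail even earlier
    # with a TypeError).
    PyTorchTrainer(model_creator=None)
except DeprecationWarning as exc:
    print(exc)  # ray.util.sgd.pytorch.PyTorchTrainer has been renamed to ...
```

Raising ``DeprecationWarning`` (rather than emitting a warning) makes the rename an immediate, visible break for existing callers.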
@ -1,18 +0,0 @@
import logging
logger = logging.getLogger(__name__)

PyTorchTrainer = None
PyTorchTrainable = None
TrainingOperator = None

try:
    import torch  # noqa: F401

    from ray.util.sgd.pytorch.pytorch_trainer import (PyTorchTrainer,
                                                      PyTorchTrainable)

    from ray.util.sgd.pytorch.training_operator import TrainingOperator

    __all__ = ["PyTorchTrainer", "PyTorchTrainable", "TrainingOperator"]
except ImportError:
    logger.warning("PyTorch not found. PyTorchTrainer will not be available")
@ -10,12 +10,12 @@ import torch.distributed as dist
|
|||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.util.sgd.pytorch import PyTorchTrainer, PyTorchTrainable
|
||||
from ray.util.sgd.pytorch.training_operator import _TestingOperator
|
||||
from ray.util.sgd.pytorch.constants import BATCH_COUNT, SCHEDULER_STEP
|
||||
from ray.util.sgd.torch import TorchTrainer, TorchTrainable
|
||||
from ray.util.sgd.torch.training_operator import _TestingOperator
|
||||
from ray.util.sgd.torch.constants import BATCH_COUNT, SCHEDULER_STEP
|
||||
from ray.util.sgd.utils import check_for_failure
|
||||
|
||||
from ray.util.sgd.pytorch.examples.train_example import (
|
||||
from ray.util.sgd.torch.examples.train_example import (
|
||||
model_creator, optimizer_creator, data_creator, LinearDataset)
|
||||
|
||||
|
||||
|
@ -28,7 +28,7 @@ def ray_start_2_cpus():
|
|||
|
||||
|
||||
def test_single_step(ray_start_2_cpus): # noqa: F811
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -44,7 +44,7 @@ def test_single_step(ray_start_2_cpus): # noqa: F811
|
|||
@pytest.mark.parametrize("num_replicas", [1, 2]
|
||||
if dist.is_available() else [1])
|
||||
def test_train(ray_start_2_cpus, num_replicas): # noqa: F811
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -107,7 +107,7 @@ def test_multi_model(ray_start_2_cpus, num_replicas):
|
|||
]
|
||||
return opts[0], opts[1]
|
||||
|
||||
trainer1 = PyTorchTrainer(
|
||||
trainer1 = TorchTrainer(
|
||||
multi_model_creator,
|
||||
data_creator,
|
||||
multi_optimizer_creator,
|
||||
|
@ -124,7 +124,7 @@ def test_multi_model(ray_start_2_cpus, num_replicas):
|
|||
|
||||
trainer1.shutdown()
|
||||
|
||||
trainer2 = PyTorchTrainer(
|
||||
trainer2 = TorchTrainer(
|
||||
multi_model_creator,
|
||||
data_creator,
|
||||
multi_optimizer_creator,
|
||||
|
@ -193,7 +193,7 @@ def test_multi_model_matrix(ray_start_2_cpus, num_replicas): # noqa: F811
|
|||
for model_count in range(1, 3):
|
||||
for optimizer_count in range(1, 3):
|
||||
for scheduler_count in range(1, 3):
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
multi_model_creator,
|
||||
data_creator,
|
||||
multi_optimizer_creator,
|
||||
|
@ -221,7 +221,7 @@ def test_scheduler_freq(ray_start_2_cpus, scheduler_freq): # noqa: F811
|
|||
return torch.optim.lr_scheduler.StepLR(
|
||||
optimizer, step_size=30, gamma=0.1)
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -239,7 +239,7 @@ def test_scheduler_freq(ray_start_2_cpus, scheduler_freq): # noqa: F811
|
|||
def test_scheduler_validate(ray_start_2_cpus): # noqa: F811
|
||||
from torch.optim.lr_scheduler import ReduceLROnPlateau
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -273,7 +273,7 @@ def test_tune_train(ray_start_2_cpus, num_replicas): # noqa: F811
|
|||
}
|
||||
|
||||
analysis = tune.run(
|
||||
PyTorchTrainable,
|
||||
TorchTrainable,
|
||||
num_samples=2,
|
||||
config=config,
|
||||
stop={"training_iteration": 2},
|
||||
|
@ -293,7 +293,7 @@ def test_tune_train(ray_start_2_cpus, num_replicas): # noqa: F811
|
|||
@pytest.mark.parametrize("num_replicas", [1, 2]
|
||||
if dist.is_available() else [1])
|
||||
def test_save_and_restore(ray_start_2_cpus, num_replicas): # noqa: F811
|
||||
trainer1 = PyTorchTrainer(
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -308,7 +308,7 @@ def test_save_and_restore(ray_start_2_cpus, num_replicas): # noqa: F811
|
|||
|
||||
trainer1.shutdown()
|
||||
|
||||
trainer2 = PyTorchTrainer(
|
||||
trainer2 = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -346,8 +346,8 @@ def test_fail_with_recover(ray_start_2_cpus): # noqa: F811
|
|||
success = check_for_failure(worker_stats)
|
||||
return success, worker_stats
|
||||
|
||||
with patch.object(PyTorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = PyTorchTrainer(
|
||||
with patch.object(TorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
single_loader,
|
||||
optimizer_creator,
|
||||
|
@ -376,8 +376,8 @@ def test_resize(ray_start_2_cpus): # noqa: F811
|
|||
success = check_for_failure(worker_stats)
|
||||
return success, worker_stats
|
||||
|
||||
with patch.object(PyTorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = PyTorchTrainer(
|
||||
with patch.object(TorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
single_loader,
|
||||
optimizer_creator,
|
||||
|
@ -412,8 +412,8 @@ def test_fail_twice(ray_start_2_cpus): # noqa: F811
|
|||
success = check_for_failure(worker_stats)
|
||||
return success, worker_stats
|
||||
|
||||
with patch.object(PyTorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = PyTorchTrainer(
|
||||
with patch.object(TorchTrainer, "_train_epoch", step_with_fail):
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
single_loader,
|
||||
optimizer_creator,
|
|
@ -4,8 +4,8 @@ import torch.nn as nn
|
|||
import unittest
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from ray.util.sgd.pytorch.training_operator import TrainingOperator
|
||||
from ray.util.sgd.pytorch.pytorch_runner import PyTorchRunner
|
||||
from ray.util.sgd.torch.training_operator import TrainingOperator
|
||||
from ray.util.sgd.torch.torch_runner import TorchRunner
|
||||
|
||||
|
||||
class LinearDataset(torch.utils.data.Dataset):
|
||||
|
@ -45,14 +45,14 @@ def create_dataloaders(config):
|
|||
return LinearDataset(2, 5), LinearDataset(2, 5, size=400)
|
||||
|
||||
|
||||
class TestPyTorchRunner(unittest.TestCase):
|
||||
class TestTorchRunner(unittest.TestCase):
|
||||
def testValidate(self):
|
||||
class MockOperator(TrainingOperator):
|
||||
def setup(self, config):
|
||||
self.train_epoch = MagicMock(returns=dict(mean_accuracy=10))
|
||||
self.validate = MagicMock(returns=dict(mean_accuracy=10))
|
||||
|
||||
runner = PyTorchRunner(
|
||||
runner = TorchRunner(
|
||||
model_creator,
|
||||
create_dataloaders,
|
||||
optimizer_creator,
|
||||
|
@ -76,7 +76,7 @@ class TestPyTorchRunner(unittest.TestCase):
|
|||
self.count += 1
|
||||
return {"count": self.count}
|
||||
|
||||
runner = PyTorchRunner(
|
||||
runner = TorchRunner(
|
||||
model_creator,
|
||||
create_dataloaders,
|
||||
optimizer_creator,
|
||||
|
@ -105,7 +105,7 @@ class TestPyTorchRunner(unittest.TestCase):
|
|||
]
|
||||
return opts[0], opts[1], opts[2]
|
||||
|
||||
runner = PyTorchRunner(
|
||||
runner = TorchRunner(
|
||||
three_model_creator,
|
||||
single_loader,
|
||||
three_optimizer_creator,
|
||||
|
@ -116,8 +116,8 @@ class TestPyTorchRunner(unittest.TestCase):
|
|||
self.assertEqual(len(runner.given_models), 3)
|
||||
self.assertEqual(len(runner.given_optimizers), 3)
|
||||
|
||||
runner2 = PyTorchRunner(model_creator, single_loader,
|
||||
optimizer_creator, loss_creator)
|
||||
runner2 = TorchRunner(model_creator, single_loader, optimizer_creator,
|
||||
loss_creator)
|
||||
runner2.setup()
|
||||
|
||||
self.assertNotEqual(runner2.given_models, runner2.models)
|
||||
|
@ -128,26 +128,26 @@ class TestPyTorchRunner(unittest.TestCase):
|
|||
return (LinearDataset(2, 5), LinearDataset(2, 5, size=400),
|
||||
LinearDataset(2, 5, size=400))
|
||||
|
||||
runner = PyTorchRunner(model_creator, three_data_loader,
|
||||
optimizer_creator, loss_creator)
|
||||
runner = TorchRunner(model_creator, three_data_loader,
|
||||
optimizer_creator, loss_creator)
|
||||
with self.assertRaises(ValueError):
|
||||
runner.setup()
|
||||
|
||||
runner2 = PyTorchRunner(model_creator, three_data_loader,
|
||||
optimizer_creator, loss_creator)
|
||||
runner2 = TorchRunner(model_creator, three_data_loader,
|
||||
optimizer_creator, loss_creator)
|
||||
with self.assertRaises(ValueError):
|
||||
runner2.setup()
|
||||
|
||||
def testSingleLoader(self):
|
||||
runner = PyTorchRunner(model_creator, single_loader, optimizer_creator,
|
||||
loss_creator)
|
||||
runner = TorchRunner(model_creator, single_loader, optimizer_creator,
|
||||
loss_creator)
|
||||
runner.setup()
|
||||
runner.train_epoch()
|
||||
with self.assertRaises(ValueError):
|
||||
runner.validate()
|
||||
|
||||
def testNativeLoss(self):
|
||||
runner = PyTorchRunner(
|
||||
runner = TorchRunner(
|
||||
model_creator,
|
||||
single_loader,
|
||||
optimizer_creator,
|
||||
|
@ -165,8 +165,8 @@ class TestPyTorchRunner(unittest.TestCase):
|
|||
]
|
||||
return opts[0], opts[1], opts[2]
|
||||
|
||||
runner = PyTorchRunner(multi_model_creator, single_loader,
|
||||
multi_optimizer_creator, loss_creator)
|
||||
runner = TorchRunner(multi_model_creator, single_loader,
|
||||
multi_optimizer_creator, loss_creator)
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
runner.setup()
|
python/ray/util/sgd/torch/__init__.py (new file, 17 lines)

@ -0,0 +1,17 @@
import logging
logger = logging.getLogger(__name__)

TorchTrainer = None
TorchTrainable = None
TrainingOperator = None

try:
    import torch  # noqa: F401

    from ray.util.sgd.torch.torch_trainer import (TorchTrainer, TorchTrainable)

    from ray.util.sgd.torch.training_operator import TrainingOperator

    __all__ = ["TorchTrainer", "TorchTrainable", "TrainingOperator"]
except ImportError:
    logger.warning("PyTorch not found. TorchTrainer will not be available")
@ -7,24 +7,24 @@ import torch.distributed as dist
|
|||
import torch.utils.data
|
||||
from torch.nn.parallel import DistributedDataParallel
|
||||
|
||||
from ray.util.sgd.pytorch.pytorch_runner import PyTorchRunner
|
||||
from ray.util.sgd.torch.torch_runner import TorchRunner
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DistributedPyTorchRunner(PyTorchRunner):
|
||||
class DistributedTorchRunner(TorchRunner):
|
||||
"""Manages a distributed PyTorch model replica.
|
||||
|
||||
|
||||
Args:
|
||||
args: Arguments for PyTorchRunner.
|
||||
args: Arguments for TorchRunner.
|
||||
backend (string): backend used by distributed PyTorch.
|
||||
kwargs: Keyword arguments for PyTorchRunner.
|
||||
kwargs: Keyword arguments for TorchRunner.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *args, backend="gloo", **kwargs):
|
||||
super(DistributedPyTorchRunner, self).__init__(*args, **kwargs)
|
||||
super(DistributedTorchRunner, self).__init__(*args, **kwargs)
|
||||
self.backend = backend
|
||||
|
||||
def setup(self, url, world_rank, world_size):
|
||||
|
@ -110,7 +110,7 @@ class DistributedPyTorchRunner(PyTorchRunner):
|
|||
"""
|
||||
if hasattr(self.train_loader.sampler, "set_epoch"):
|
||||
self.train_loader.sampler.set_epoch(self.epochs)
|
||||
return super(DistributedPyTorchRunner, self).train_epoch(**kwargs)
|
||||
return super(DistributedTorchRunner, self).train_epoch(**kwargs)
|
||||
|
||||
def _get_model_state_dicts(self):
|
||||
"""Fetch state from ``model.module`` instead of ``model``.
|
||||
|
@ -132,7 +132,7 @@ class DistributedPyTorchRunner(PyTorchRunner):
|
|||
|
||||
# def shutdown(self):
|
||||
"""Attempts to shut down the worker."""
|
||||
# super(DistributedPyTorchRunner, self).shutdown()
|
||||
# super(DistributedTorchRunner, self).shutdown()
|
||||
# TODO: Temporarily removing since it causes hangs on MacOSX.
|
||||
# However, it seems to be harmless to remove permanently
|
||||
# since the processes are shutdown anyways. This comment can be
|
|
@ -8,8 +8,8 @@ import torchvision
|
|||
import torchvision.transforms as transforms
|
||||
|
||||
import ray
|
||||
from ray.util.sgd.pytorch import (PyTorchTrainer, PyTorchTrainable)
|
||||
from ray.util.sgd.pytorch.resnet import ResNet18
|
||||
from ray.util.sgd.torch import (TorchTrainer, TorchTrainable)
|
||||
from ray.util.sgd.torch.resnet import ResNet18
|
||||
|
||||
|
||||
def initialization_hook():
|
||||
|
@ -62,7 +62,7 @@ def train_example(num_replicas=1,
|
|||
use_gpu=False,
|
||||
use_fp16=False,
|
||||
test_mode=False):
|
||||
trainer1 = PyTorchTrainer(
|
||||
trainer1 = TorchTrainer(
|
||||
ResNet18,
|
||||
cifar_creator,
|
||||
optimizer_creator,
|
||||
|
@ -107,7 +107,7 @@ def tune_example(num_replicas=1, use_gpu=False, test_mode=False):
|
|||
}
|
||||
|
||||
analysis = tune.run(
|
||||
PyTorchTrainable,
|
||||
TorchTrainable,
|
||||
num_samples=2,
|
||||
config=config,
|
||||
stop={"training_iteration": 2},
|
|
@ -15,9 +15,9 @@ from torch.nn import functional as F
|
|||
from scipy.stats import entropy
|
||||
|
||||
import ray
|
||||
from ray.util.sgd import PyTorchTrainer
|
||||
from ray.util.sgd import TorchTrainer
|
||||
from ray.util.sgd.utils import override
|
||||
from ray.util.sgd.pytorch import TrainingOperator
|
||||
from ray.util.sgd.torch import TrainingOperator
|
||||
|
||||
|
||||
def data_creator(config):
|
||||
|
@ -223,9 +223,9 @@ def train_example(num_replicas=1, use_gpu=False, test_mode=False):
|
|||
"test_mode": test_mode,
|
||||
"classification_model_path": os.path.join(
|
||||
os.path.dirname(ray.__file__),
|
||||
"util/sgd/pytorch/examples/mnist_cnn.pt")
|
||||
"util/sgd/torch/examples/mnist_cnn.pt")
|
||||
}
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
|
@ -13,7 +13,7 @@ import numpy as np
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from ray.util.sgd import PyTorchTrainer
|
||||
from ray.util.sgd import TorchTrainer
|
||||
|
||||
|
||||
class LinearDataset(torch.utils.data.Dataset):
|
||||
|
@ -44,7 +44,7 @@ def optimizer_creator(model, config):
|
|||
def scheduler_creator(optimizer, config):
|
||||
"""Returns a learning rate scheduler wrapping the optimizer.
|
||||
|
||||
You will need to set ``PyTorchTrainer(scheduler_step_freq="epoch")``
|
||||
You will need to set ``TorchTrainer(scheduler_step_freq="epoch")``
|
||||
for the scheduler to be incremented correctly.
|
||||
|
||||
If using a scheduler for validation loss, be sure to call
|
||||
|
@ -59,7 +59,7 @@ def data_creator(config):
|
|||
|
||||
|
||||
def train_example(num_replicas=1, use_gpu=False):
|
||||
trainer1 = PyTorchTrainer(
|
||||
trainer1 = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
|
@ -14,7 +14,7 @@ import torch.nn as nn
|
|||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.util.sgd.pytorch.pytorch_trainer import PyTorchTrainable
|
||||
from ray.util.sgd.torch.torch_trainer import TorchTrainable
|
||||
|
||||
|
||||
class LinearDataset(torch.utils.data.Dataset):
|
||||
|
@ -60,7 +60,7 @@ def tune_example(num_replicas=1, use_gpu=False):
|
|||
}
|
||||
|
||||
analysis = tune.run(
|
||||
PyTorchTrainable,
|
||||
TorchTrainable,
|
||||
num_samples=12,
|
||||
config=config,
|
||||
stop={"training_iteration": 2},
|
|
@ -9,8 +9,8 @@ import torch.utils.data
|
|||
from torch.utils.data import Dataset
|
||||
|
||||
import ray
|
||||
from ray.util.sgd.pytorch.constants import USE_FP16, SCHEDULER_STEP
|
||||
from ray.util.sgd.pytorch.training_operator import TrainingOperator
|
||||
from ray.util.sgd.torch.constants import USE_FP16, SCHEDULER_STEP
|
||||
from ray.util.sgd.torch.training_operator import TrainingOperator
|
||||
from ray.util.sgd import utils
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -23,23 +23,23 @@ except ImportError:
|
|||
pass
|
||||
|
||||
|
||||
class PyTorchRunner:
|
||||
class TorchRunner:
|
||||
"""Manages a PyTorch model for training.
|
||||
|
||||
Args:
|
||||
model_creator (dict -> *): see pytorch_trainer.py
|
||||
data_creator (dict -> Dataset, Dataset): see pytorch_trainer.py.
|
||||
optimizer_creator (models, dict -> optimizers): see pytorch_trainer.py.
|
||||
loss_creator (dict -> loss | Loss class): see pytorch_trainer.py.
|
||||
model_creator (dict -> *): see torch_trainer.py
|
||||
data_creator (dict -> Dataset, Dataset): see torch_trainer.py.
|
||||
optimizer_creator (models, dict -> optimizers): see torch_trainer.py.
|
||||
loss_creator (dict -> loss | Loss class): see torch_trainer.py.
|
||||
scheduler_creator (optimizers, dict -> schedulers): see
|
||||
pytorch_trainer.py.
|
||||
training_operator_cls: see pytorch_trainer.py
|
||||
config (dict): see pytorch_trainer.py.
|
||||
dataloader_config (dict): See pytorch_trainer.py.
|
||||
batch_size (int): see pytorch_trainer.py.
|
||||
use_fp16 (bool): see pytorch_trainer.py.
|
||||
apex_args (dict|None): see pytorch_trainer.py.
|
||||
scheduler_step_freq (str): see pytorch_trainer.py.
|
||||
torch_trainer.py.
|
||||
training_operator_cls: see torch_trainer.py
|
||||
config (dict): see torch_trainer.py.
|
||||
dataloader_config (dict): See torch_trainer.py.
|
||||
batch_size (int): see torch_trainer.py.
|
||||
use_fp16 (bool): see torch_trainer.py.
|
||||
apex_args (dict|None): see torch_trainer.py.
|
||||
scheduler_step_freq (str): see torch_trainer.py.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
|
@ -11,11 +11,11 @@ import ray
|
|||
|
||||
from ray.tune import Trainable
|
||||
from ray.tune.trial import Resources
|
||||
from ray.util.sgd.pytorch.distributed_pytorch_runner import (
|
||||
DistributedPyTorchRunner)
|
||||
from ray.util.sgd.torch.distributed_torch_runner import (
|
||||
DistributedTorchRunner)
|
||||
from ray.util.sgd import utils
|
||||
from ray.util.sgd.pytorch.pytorch_runner import PyTorchRunner
|
||||
from ray.util.sgd.pytorch.constants import VALID_SCHEDULER_STEP
|
||||
from ray.util.sgd.torch.torch_runner import TorchRunner
|
||||
from ray.util.sgd.torch.constants import VALID_SCHEDULER_STEP
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
RESIZE_COOLDOWN_S = 10
|
||||
|
@ -29,7 +29,7 @@ def _validate_scheduler_step_freq(scheduler_step_freq):
|
|||
VALID_SCHEDULER_STEP, scheduler_step_freq))
|
||||
|
||||
|
||||
class PyTorchTrainer:
|
||||
class TorchTrainer:
|
||||
"""Train a PyTorch model using distributed PyTorch.
|
||||
|
||||
Launches a set of actors which connect via distributed PyTorch and
|
||||
|
@ -49,7 +49,7 @@ class PyTorchTrainer:
|
|||
def data_creator(config):
|
||||
return LinearDataset(2, 5), LinearDataset(2, 5, size=400)
|
||||
|
||||
trainer = PyTorchTrainer(
|
||||
trainer = TorchTrainer(
|
||||
model_creator,
|
||||
data_creator,
|
||||
optimizer_creator,
|
||||
|
@ -195,7 +195,7 @@ class PyTorchTrainer:
|
|||
if num_replicas == 1:
|
||||
# Generate actor class
|
||||
Runner = ray.remote(
|
||||
num_cpus=1, num_gpus=int(self.use_gpu))(PyTorchRunner)
|
||||
num_cpus=1, num_gpus=int(self.use_gpu))(TorchRunner)
|
||||
# Start workers
|
||||
self.workers = [
|
||||
Runner.remote(
|
||||
|
@ -220,8 +220,7 @@ class PyTorchTrainer:
|
|||
else:
|
||||
# Generate actor class
|
||||
Runner = ray.remote(
|
||||
num_cpus=1,
|
||||
num_gpus=int(self.use_gpu))(DistributedPyTorchRunner)
|
||||
num_cpus=1, num_gpus=int(self.use_gpu))(DistributedTorchRunner)
|
||||
# Compute batch size per replica
|
||||
batch_size_per_replica = self.batch_size // num_replicas
|
||||
if self.batch_size % num_replicas > 0:
|
||||
|
@ -285,7 +284,7 @@ class PyTorchTrainer:
|
|||
in case of shared cluster usage.
|
||||
checkpoint (str): Path to checkpoint to restore from if retrying.
|
||||
If max_retries is set and ``checkpoint == "auto"``,
|
||||
PyTorchTrainer will save a checkpoint before starting to train.
|
||||
TorchTrainer will save a checkpoint before starting to train.
|
||||
info (dict): Optional dictionary passed to the training
|
||||
operator for ``train_epoch`` and ``train_batch``.
|
||||
|
||||
|
@ -487,7 +486,7 @@ class PyTorchTrainer:
|
|||
return False
|
||||
|
||||
|
||||
class PyTorchTrainable(Trainable):
|
||||
class TorchTrainable(Trainable):
|
||||
@classmethod
|
||||
def default_resource_request(cls, config):
|
||||
return Resources(
|
||||
|
@ -497,7 +496,7 @@ class PyTorchTrainable(Trainable):
|
|||
extra_gpu=int(config["use_gpu"]) * config["num_replicas"])
|
||||
|
||||
def _setup(self, config):
|
||||
self._trainer = PyTorchTrainer(**config)
|
||||
self._trainer = TorchTrainer(**config)
|
||||
|
||||
def _train(self):
|
||||
train_stats = self._trainer.train()
|
|
@ -2,7 +2,7 @@ import collections
|
|||
import torch
|
||||
|
||||
from ray.util.sgd.utils import TimerStat, AverageMeter
|
||||
from ray.util.sgd.pytorch.constants import (
|
||||
from ray.util.sgd.torch.constants import (
|
||||
SCHEDULER_STEP_EPOCH, SCHEDULER_STEP_BATCH, SCHEDULER_STEP, BATCH_COUNT)
|
||||
|
||||
amp = None
|
||||
|
@ -11,7 +11,7 @@ try:
|
|||
from apex import amp
|
||||
except ImportError:
|
||||
# Apex library is not installed, so we cannot enable mixed precision.
|
||||
# We don't log here because logging happens in the pytorch_runner,
|
||||
# We don't log here because logging happens in the torch_runner,
|
||||
# where amp is initialized.
|
||||
pass
|
||||
|
||||
|
@ -26,7 +26,7 @@ class TrainingOperator:
|
|||
|
||||
The scheduler will only be called at a batch or epoch frequency, depending
|
||||
on the user parameter. Be sure to set ``scheduler_step_freq`` in
|
||||
``PyTorchTrainer`` to either "batch" or "epoch" to increment the scheduler
|
||||
``TorchTrainer`` to either "batch" or "epoch" to increment the scheduler
|
||||
correctly during training. If using a learning rate scheduler
|
||||
that depends on validation loss, you can use ``trainer.update_scheduler``.
|
||||
|
||||
|
@ -290,7 +290,7 @@ class TrainingOperator:
|
|||
|
||||
@property
|
||||
def config(self):
|
||||
"""Dictionary as provided into PyTorchTrainer."""
|
||||
"""Dictionary as provided into TorchTrainer."""
|
||||
return self._config
|
||||
|
||||
@property
|