[tune] extend PTL template (GPU, typing fixes, tensorboard) (#9451)

Co-authored-by: Kai Fricke <kai@anyscale.com>
krfricke 2020-07-15 19:30:20 +02:00 committed by GitHub
parent aa8928fac2
commit 5a40299d42
3 changed files with 119 additions and 39 deletions


@ -102,14 +102,22 @@ The callback just reports some metrics back to Tune after each validation epoch:
:start-after: __tune_callback_begin__
:end-before: __tune_callback_end__
Note that we have to explicitly convert the metrics from a tensor to a Python value.
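A condensed sketch of that callback, taken from the example file changed further down in this commit:

.. code-block:: python

    from pytorch_lightning.callbacks import Callback
    from ray import tune

    class TuneReportCallback(Callback):
        def on_validation_end(self, trainer, pl_module):
            # callback_metrics holds tensors, so .item() converts them to
            # plain Python numbers before they are reported to Tune.
            tune.report(
                loss=trainer.callback_metrics["avg_val_loss"].item(),
                mean_accuracy=trainer.callback_metrics[
                    "avg_val_accuracy"].item())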
Adding the Tune training function
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Then we specify our training function. Note that we added the ``data_dir`` as a config
parameter here, even though it should not be tuned. We just need to specify it to avoid
Then we specify our training function. Note that we added the ``data_dir`` as a
parameter here so that each training run does not have to download the full MNIST
dataset. Instead, we want all trials to access a shared data location.
We are also able to specify the number of epochs to train each model, and the number
of GPUs we want to use for training. We also create a TensorBoard logger that writes
logfiles directly into Tune's root trial directory: if we didn't do this, PyTorch
Lightning would create its own subdirectories, and each trial would then show up twice
in TensorBoard, once for Tune's logs and once for PyTorch Lightning's logs.
.. literalinclude:: /../../python/ray/tune/examples/mnist_pytorch_lightning.py
:language: python
:start-after: __tune_train_begin__
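Condensed from the example file changed in this commit, the training function boils down to the following (``LightningMNISTClassifier`` and ``TuneReportCallback`` are defined earlier in the example):

.. code-block:: python

    import pytorch_lightning as pl
    from pytorch_lightning.loggers import TensorBoardLogger
    from ray import tune

    def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
        model = LightningMNISTClassifier(config, data_dir)
        trainer = pl.Trainer(
            max_epochs=num_epochs,
            gpus=num_gpus,
            # Log directly into Tune's trial directory so that each trial
            # shows up only once in TensorBoard.
            logger=TensorBoardLogger(
                save_dir=tune.get_trial_dir(), name="", version="."),
            progress_bar_refresh_rate=0,
            callbacks=[TuneReportCallback()])
        trainer.fit(model)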
@ -134,7 +142,7 @@ We also delete this data after training to avoid filling up our disk or memory s
:language: python
:start-after: __tune_asha_begin__
:end-before: __tune_asha_end__
:lines: 27
:lines: 36
:dedent: 4
Configuring the search space
@ -150,7 +158,7 @@ we are able to also sample small values.
:language: python
:start-after: __tune_asha_begin__
:end-before: __tune_asha_end__
:lines: 4-10
:lines: 5-10
:dedent: 4
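For reference, this is the search space defined in the example (also visible in the Python file changed in this commit):

.. code-block:: python

    from ray import tune

    # Layer sizes and batch size are sampled from discrete choices, the
    # learning rate from a log-uniform distribution so that small values
    # are sampled as often as large ones.
    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }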
Selecting a scheduler
@ -165,7 +173,7 @@ configurations.
:language: python
:start-after: __tune_asha_begin__
:end-before: __tune_asha_end__
:lines: 11-16
:lines: 12-17
:dedent: 4
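The scheduler is configured as follows in the example; ``num_epochs`` is the argument passed to ``tune_mnist_asha``, and the import path is the usual ``ray.tune.schedulers`` one:

.. code-block:: python

    from ray.tune.schedulers import ASHAScheduler

    # Stop underperforming trials early; each trial trains for at most
    # ``num_epochs`` iterations.
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)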
@ -173,17 +181,53 @@ Changing the CLI output
~~~~~~~~~~~~~~~~~~~~~~~
We instantiate a ``CLIReporter`` to specify which metrics we would like to see in our
output tables in the command line. If we didn't specify this, Tune would print all
hyperparameters by default, but since ``data_dir`` is not a real hyperparameter, we
can avoid printing it by omitting it in the ``parameter_columns`` parameter.
output tables in the command line. This is optional, but it ensures that these tables
only contain the information we would actually like to see.
.. literalinclude:: /../../python/ray/tune/examples/mnist_pytorch_lightning.py
:language: python
:start-after: __tune_asha_begin__
:end-before: __tune_asha_end__
:lines: 17-19
:lines: 19-21
:dedent: 4
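In the example, the reporter is configured like this:

.. code-block:: python

    from ray.tune import CLIReporter

    # Show only the tuned hyperparameters and the reported metrics in the
    # command line output tables.
    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])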
Passing constants to the train function
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The ``data_dir``, ``num_epochs`` and ``num_gpus`` we pass to the training function
are constants. To avoid including them as non-configurable parameters in the ``config``
specification, we can use ``functools.partial`` to wrap the training function.
.. literalinclude:: /../../python/ray/tune/examples/mnist_pytorch_lightning.py
:language: python
:start-after: __tune_asha_begin__
:end-before: __tune_asha_end__
:lines: 24-28
:dedent: 8
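A minimal sketch of this wrapping, using the names from the example; the ``trainable`` variable is only introduced here for illustration:

.. code-block:: python

    from functools import partial

    # Bind the constants to the training function instead of adding them to
    # ``config``. The result still only takes ``config`` when Tune calls it.
    trainable = partial(
        train_mnist_tune,
        data_dir=data_dir,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial)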
Training with GPUs
~~~~~~~~~~~~~~~~~~
We can specify how many resources Tune should request for each trial,
including GPUs.
PyTorch Lightning takes care of moving the training to the GPUs. Since we
already made sure that our code is compatible with that, there is
nothing more to do here other than to specify the number of GPUs
we would like to use:
.. literalinclude:: /../../python/ray/tune/examples/mnist_pytorch_lightning.py
:language: python
:start-after: __tune_asha_begin__
:end-before: __tune_asha_end__
:lines: 29
:dedent: 4
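Putting the pieces together, the ``tune.run`` call from the example looks roughly like this (``trainable`` refers to the wrapped function sketched above):

.. code-block:: python

    from ray import tune

    tune.run(
        trainable,
        # Reserve 1 CPU and ``gpus_per_trial`` GPUs per trial. With
        # gpus_per_trial=0 the example runs on CPU only.
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")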
Please note that in the current state of PyTorch Lightning, training
on :doc:`fractional GPUs </using-ray-with-gpus>` or
multiple GPUs requires some workarounds. We will address these in a
separate tutorial; for now, this example works with either no GPU
or exactly one GPU.
Putting it together
~~~~~~~~~~~~~~~~~~~


@ -13,8 +13,10 @@ import os
# __import_tune_begin__
import shutil
from functools import partial
from tempfile import mkdtemp
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.utilities.cloud_io import load as pl_load
from ray import tune
from ray.tune import CLIReporter
@ -74,7 +76,7 @@ class LightningMNISTClassifier(pl.LightningModule):
loss = self.cross_entropy_loss(logits, y)
accuracy = self.accuracy(logits, y)
logs = {"train_loss": loss, "train_accuracy": accuracy}
logs = {"ptl/train_loss": loss, "ptl/train_accuracy": accuracy}
return {"loss": loss, "log": logs}
def validation_step(self, val_batch, batch_idx):
@ -88,12 +90,12 @@ class LightningMNISTClassifier(pl.LightningModule):
def validation_epoch_end(self, outputs):
avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean()
tensorboard_logs = {"val_loss": avg_loss, "val_accuracy": avg_acc}
logs = {"ptl/val_loss": avg_loss, "ptl/val_accuracy": avg_acc}
return {
"avg_val_loss": avg_loss,
"avg_val_accuracy": avg_acc,
"log": tensorboard_logs
"log": logs
}
@staticmethod
@ -133,16 +135,19 @@ def train_mnist(config):
class TuneReportCallback(Callback):
def on_validation_end(self, trainer, pl_module):
tune.report(
loss=trainer.callback_metrics["avg_val_loss"],
mean_accuracy=trainer.callback_metrics["avg_val_accuracy"])
loss=trainer.callback_metrics["avg_val_loss"].item(),
mean_accuracy=trainer.callback_metrics["avg_val_accuracy"].item())
# __tune_callback_end__
# __tune_train_begin__
def train_mnist_tune(config):
model = LightningMNISTClassifier(config, config["data_dir"])
def train_mnist_tune(config, data_dir=None, num_epochs=10, num_gpus=0):
model = LightningMNISTClassifier(config, data_dir)
trainer = pl.Trainer(
max_epochs=10,
max_epochs=num_epochs,
gpus=num_gpus,
logger=TensorBoardLogger(
save_dir=tune.get_trial_dir(), name="", version="."),
progress_bar_refresh_rate=0,
callbacks=[TuneReportCallback()])
@ -160,9 +165,17 @@ class CheckpointCallback(Callback):
# __tune_train_checkpoint_begin__
def train_mnist_tune_checkpoint(config, checkpoint=None):
def train_mnist_tune_checkpoint(
config,
checkpoint=None,
data_dir=None,
num_epochs=10,
num_gpus=0):
trainer = pl.Trainer(
max_epochs=10,
max_epochs=num_epochs,
gpus=num_gpus,
logger=TensorBoardLogger(
save_dir=tune.get_trial_dir(), name="", version="."),
progress_bar_refresh_rate=0,
callbacks=[CheckpointCallback(),
TuneReportCallback()])
@ -178,54 +191,64 @@ def train_mnist_tune_checkpoint(config, checkpoint=None):
trainer.current_epoch = ckpt["epoch"]
else:
model = LightningMNISTClassifier(
config=config, data_dir=config["data_dir"])
config=config, data_dir=data_dir)
trainer.fit(model)
# __tune_train_checkpoint_end__
# __tune_asha_begin__
def tune_mnist_asha(num_samples=10, max_num_epochs=10):
def tune_mnist_asha(num_samples=10, num_epochs=10, gpus_per_trial=0):
data_dir = mkdtemp(prefix="mnist_data_")
LightningMNISTClassifier.download_data(data_dir)
config = {
"layer_1_size": tune.choice([32, 64, 128]),
"layer_2_size": tune.choice([64, 128, 256]),
"lr": tune.loguniform(1e-4, 1e-1),
"batch_size": tune.choice([32, 64, 128]),
"data_dir": data_dir
}
scheduler = ASHAScheduler(
metric="loss",
mode="min",
max_t=max_num_epochs,
max_t=num_epochs,
grace_period=1,
reduction_factor=2)
reporter = CLIReporter(
parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
metric_columns=["loss", "mean_accuracy", "training_iteration"])
tune.run(
train_mnist_tune,
resources_per_trial={"cpu": 1},
partial(
train_mnist_tune,
data_dir=data_dir,
num_epochs=num_epochs,
num_gpus=gpus_per_trial),
resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
config=config,
num_samples=num_samples,
scheduler=scheduler,
progress_reporter=reporter)
progress_reporter=reporter,
name="tune_mnist_asha")
shutil.rmtree(data_dir)
# __tune_asha_end__
# __tune_pbt_begin__
def tune_mnist_pbt():
def tune_mnist_pbt(num_samples=10, num_epochs=10, gpus_per_trial=0):
data_dir = mkdtemp(prefix="mnist_data_")
LightningMNISTClassifier.download_data(data_dir)
config = {
"layer_1_size": tune.choice([32, 64, 128]),
"layer_2_size": tune.choice([64, 128, 256]),
"lr": 1e-3,
"batch_size": 64,
"data_dir": data_dir
}
scheduler = PopulationBasedTraining(
time_attr="training_iteration",
metric="loss",
@ -235,16 +258,24 @@ def tune_mnist_pbt():
"lr": lambda: tune.loguniform(1e-4, 1e-1).func(None),
"batch_size": [32, 64, 128]
})
reporter = CLIReporter(
parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
metric_columns=["loss", "mean_accuracy", "training_iteration"])
tune.run(
train_mnist_tune_checkpoint,
resources_per_trial={"cpu": 1},
partial(
train_mnist_tune_checkpoint,
data_dir=data_dir,
num_epochs=num_epochs,
num_gpus=gpus_per_trial),
resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
config=config,
num_samples=10,
num_samples=num_samples,
scheduler=scheduler,
progress_reporter=reporter)
progress_reporter=reporter,
name="tune_mnist_pbt")
shutil.rmtree(data_dir)
# __tune_pbt_end__
@ -258,7 +289,10 @@ if __name__ == "__main__":
args, _ = parser.parse_known_args()
if args.smoke_test:
tune_mnist_asha(1, 1)
tune_mnist_asha(num_samples=1, num_epochs=1, gpus_per_trial=0)
tune_mnist_pbt(num_samples=1, num_epochs=1, gpus_per_trial=0)
else:
tune_mnist_asha() # ASHA scheduler
tune_mnist_pbt() # population based training
# ASHA scheduler
tune_mnist_asha(num_samples=10, num_epochs=10, gpus_per_trial=0)
# Population based training
tune_mnist_pbt(num_samples=10, num_epochs=10, gpus_per_trial=0)


@ -1,4 +1,6 @@
import logging
import random
import numpy as np
logger = logging.getLogger(__name__)
@ -56,13 +58,13 @@ def loguniform(min_bound, max_bound, base=10):
def choice(*args, **kwargs):
"""Wraps tune.sample_from around ``np.random.choice``.
"""Wraps tune.sample_from around ``random.choice``.
``tune.choice(10)`` is equivalent to
``tune.sample_from(lambda _: np.random.choice(10))``
``tune.choice([1, 2])`` is equivalent to
``tune.sample_from(lambda _: random.choice([1, 2]))``
"""
return sample_from(lambda _: np.random.choice(*args, **kwargs))
return sample_from(lambda _: random.choice(*args, **kwargs))
def randint(*args, **kwargs):