mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
[docs] Tune overhaul part II (#22656)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
This commit is contained in:
parent
25d60d9cc9
commit
372c620f58
113 changed files with 6191 additions and 2995 deletions
|
@ -22,3 +22,16 @@ def py_test_run_all_subdirectory(include, exclude, extra_srcs, **kwargs):
|
|||
srcs = extra_srcs + [file],
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Runs all included notebooks as py_test targets, by first converting them to .py files with "test_myst_doc.py".
|
||||
def py_test_run_all_notebooks(include, exclude, **kwargs):
|
||||
for file in native.glob(include = include, exclude = exclude):
|
||||
print(file)
|
||||
basename = file.rpartition("/")[-1]
|
||||
native.py_test(
|
||||
name = basename[:-3],
|
||||
main = "test_myst_doc.py",
|
||||
srcs = ["test_myst_doc.py"],
|
||||
args = ["--path", file],
|
||||
**kwargs
|
||||
)
|
||||
|
|
46
doc/BUILD
46
doc/BUILD
|
@ -1,3 +1,6 @@
|
|||
load("//bazel:python.bzl", "py_test_run_all_subdirectory")
|
||||
load("//bazel:python.bzl", "py_test_run_all_notebooks")
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Tests from the doc directory.
|
||||
# Please keep these sorted alphabetically, but start with the
|
||||
|
@ -132,36 +135,25 @@ py_test(
|
|||
)
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Tests from the doc/source/tune/tutorials directory.
|
||||
# Please keep these sorted alphabetically.
|
||||
# Test all doc/source/tune/examples notebooks.
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
py_test(
|
||||
name = "tune_sklearn",
|
||||
py_test_run_all_notebooks(
|
||||
size = "medium",
|
||||
main = "test_myst_doc.py",
|
||||
srcs = ["test_myst_doc.py"],
|
||||
args = ["--path", "doc/source/tune/tutorials/tune-sklearn.ipynb"],
|
||||
data = ["//doc/source/tune/tutorials:tune_tutorials"],
|
||||
tags = ["exclusive", "team:ml"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "tune_serve_integration_mnist",
|
||||
size = "medium",
|
||||
main = "test_myst_doc.py",
|
||||
srcs = ["test_myst_doc.py"],
|
||||
args = ["--path", "doc/source/tune/tutorials/tune-serve-integration-mnist.ipynb", "--smoke-test", "--from_scratch", "--day 0"],
|
||||
data = ["//doc/source/tune/tutorials:tune_tutorials"],
|
||||
tags = ["exclusive", "team:ml"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "hyperopt_example",
|
||||
size = "small",
|
||||
main = "test_myst_doc.py",
|
||||
srcs = ["test_myst_doc.py"],
|
||||
args = ["--path", "doc/source/tune/examples/hyperopt_example.ipynb"],
|
||||
include = ["doc/source/tune/examples/*.ipynb"],
|
||||
exclude = [],
|
||||
data = ["//doc/source/tune/examples:tune_examples"],
|
||||
tags = ["exclusive", "team:ml"],
|
||||
)
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Test all doc/source/tune/doc_code code included in rst/md files.
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
py_test_run_all_subdirectory(
|
||||
size = "medium",
|
||||
include = ["source/tune/doc_code/*.py"],
|
||||
exclude = [],
|
||||
extra_srcs = [],
|
||||
tags = ["exclusive", "team:ml"],
|
||||
)
|
|
@ -34,22 +34,26 @@ parts:
|
|||
- file: tune/key-concepts
|
||||
- file: tune/tutorials/overview
|
||||
sections:
|
||||
- file: tune/tutorials/tune-sklearn
|
||||
- file: tune/tutorials/tune-pytorch-cifar
|
||||
- file: tune/tutorials/tune-pytorch-lightning
|
||||
- file: tune/tutorials/tune-serve-integration-mnist
|
||||
- file: tune/tutorials/tune-xgboost
|
||||
- file: tune/tutorials/tune-wandb
|
||||
- file: tune/tutorials/tune-mlflow
|
||||
- file: tune/tutorials/tune-comet
|
||||
- file: tune/tutorials/tune-stopping
|
||||
- file: tune/tutorials/tune-metrics
|
||||
- file: tune/tutorials/tune-output
|
||||
- file: tune/tutorials/tune-resources
|
||||
- file: tune/tutorials/tune-checkpoints
|
||||
- file: tune/tutorials/tune-lifecycle
|
||||
- file: tune/tutorials/tune-advanced-tutorial
|
||||
title: "How Tune Works"
|
||||
- file: tune/tutorials/tune-stopping
|
||||
title: "How to Stop and Resume"
|
||||
- file: tune/tutorials/tune-metrics
|
||||
title: "Using Callbacks and Metrics"
|
||||
- file: tune/tutorials/tune-distributed
|
||||
title: "Distributed Tuning"
|
||||
- file: tune/tutorials/tune-output
|
||||
title: "Logging Tune Runs"
|
||||
- file: tune/tutorials/tune-resources
|
||||
title: "Managing Resources"
|
||||
- file: tune/tutorials/tune-checkpoints
|
||||
title: "Working with Checkpoints"
|
||||
- file: tune/tutorials/tune-search-spaces
|
||||
title: "Using Search Spaces"
|
||||
- file: tune/tutorials/tune-advanced-tutorial
|
||||
title: "Understanding PBT"
|
||||
- file: tune/tutorials/tune-scalability
|
||||
title: "Scalability Benchmarks"
|
||||
- file: tune/examples/index
|
||||
- file: tune/faq
|
||||
- file: tune/api_docs/overview.rst
|
||||
|
|
BIN
doc/source/images/keras.png
Normal file
BIN
doc/source/images/keras.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 4.8 KiB |
BIN
doc/source/images/mxnet_logo.png
Normal file
BIN
doc/source/images/mxnet_logo.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 36 KiB |
BIN
doc/source/images/tf_keras_logo.jpeg
Normal file
BIN
doc/source/images/tf_keras_logo.jpeg
Normal file
Binary file not shown.
After Width: | Height: | Size: 33 KiB |
|
@ -1,7 +1,7 @@
|
|||
Best Practices: Ray with Tensorflow
|
||||
===================================
|
||||
|
||||
This document describes best practices for using the Ray core APIs with TensorFlow. Ray also provides higher-level utilities for working with Tensorflow, such as distributed training APIs (`training tensorflow example`_), Tune for hyperparameter search (:doc:`/tune/examples/tf_mnist_example`), RLlib for reinforcement learning (`RLlib tensorflow example`_).
|
||||
This document describes best practices for using the Ray core APIs with TensorFlow. Ray also provides higher-level utilities for working with Tensorflow, such as distributed training APIs (`training tensorflow example`_), Tune for hyperparameter search (:doc:`/tune/examples/includes/tf_mnist_example`), RLlib for reinforcement learning (`RLlib tensorflow example`_).
|
||||
|
||||
.. _`training tensorflow example`: tf_distributed_training.html
|
||||
.. _`RLlib tensorflow example`: rllib-models.html#tensorflow-models
|
||||
|
|
|
@ -3,7 +3,8 @@
|
|||
Analysis (tune.analysis)
|
||||
========================
|
||||
|
||||
You can use the ``ExperimentAnalysis`` object for analyzing results. It is returned automatically when calling ``tune.run``.
|
||||
You can use the ``ExperimentAnalysis`` object for analyzing results.
|
||||
It is returned automatically when calling ``tune.run``.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -29,7 +30,8 @@ Here are some example operations for obtaining a summary of your experiment:
|
|||
# Get a list of trials
|
||||
trials = analysis.trials
|
||||
|
||||
You may want to get a summary of multiple experiments that point to the same ``local_dir``. This is also supported by the ``ExperimentAnalysis`` class.
|
||||
You may want to get a summary of multiple experiments that point to the same ``local_dir``.
|
||||
This is also supported by the ``ExperimentAnalysis`` class.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
|
|
@ -1,15 +1,20 @@
|
|||
Tune CLI (Experimental)
|
||||
=======================
|
||||
|
||||
``tune`` has an easy-to-use command line interface (CLI) to manage and monitor your experiments on Ray. To do this, verify that you have the ``tabulate`` library installed:
|
||||
``tune`` has an easy-to-use command line interface (CLI) to manage and monitor your experiments on Ray.
|
||||
To do this, verify that you have the ``tabulate`` library installed:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ pip install tabulate
|
||||
|
||||
Here are a few examples of command line calls.
|
||||
Here is an example command line call:
|
||||
|
||||
- ``tune list-trials``: List tabular information about trials within an experiment. Empty columns will be dropped by default. Add the ``--sort`` flag to sort the output by specific columns. Add the ``--filter`` flag to filter the output in the format ``"<column> <operator> <value>"``. Add the ``--output`` flag to write the trial information to a specific file (CSV or Pickle). Add the ``--columns`` and ``--result-columns`` flags to select specific columns to display.
|
||||
``tune list-trials``: List tabular information about trials within an experiment.
|
||||
Empty columns will be dropped by default. Add the ``--sort`` flag to sort the output by specific columns.
|
||||
Add the ``--filter`` flag to filter the output in the format ``"<column> <operator> <value>"``.
|
||||
Add the ``--output`` flag to write the trial information to a specific file (CSV or Pickle).
|
||||
Add the ``--columns`` and ``--result-columns`` flags to select specific columns to display.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
Tune Client API
|
||||
===============
|
||||
|
||||
You can interact with an ongoing experiment with the Tune Client API. The Tune Client API is organized around REST, which includes resource-oriented URLs, accepts form-encoded requests, returns JSON-encoded responses, and uses standard HTTP protocol.
|
||||
You can interact with an ongoing experiment with the Tune Client API. The Tune Client API is organized around REST,
|
||||
which includes resource-oriented URLs, accepts form-encoded requests, returns JSON-encoded responses,
|
||||
and uses standard HTTP protocol.
|
||||
|
||||
To allow Tune to receive and respond to your API calls, you have to start your experiment with ``tune.run(server_port)``:
|
||||
|
||||
|
@ -9,18 +11,21 @@ To allow Tune to receive and respond to your API calls, you have to start your e
|
|||
|
||||
tune.run(..., server_port=4321)
|
||||
|
||||
The easiest way to use the Tune Client API is with the built-in TuneClient. To use TuneClient, verify that you have the ``requests`` library installed:
|
||||
The easiest way to use the Tune Client API is with the built-in TuneClient. To use TuneClient,
|
||||
verify that you have the ``requests`` library installed:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ pip install requests
|
||||
|
||||
Then, on the client side, you can use the following class. If on a cluster, you may want to forward this port (e.g. ``ssh -L <local_port>:localhost:<remote_port> <address>``) so that you can use the Client on your local machine.
|
||||
Then, on the client side, you can use the following class. If on a cluster, you may want to forward this port
|
||||
(e.g. ``ssh -L <local_port>:localhost:<remote_port> <address>``) so that you can use the Client on your local machine.
|
||||
|
||||
.. autoclass:: ray.tune.web_server.TuneClient
|
||||
:members:
|
||||
|
||||
For an example notebook for using the Client API, see the `Client API Example <https://github.com/ray-project/ray/tree/master/python/ray/tune/TuneClient.ipynb>`__.
|
||||
For an example notebook for using the Client API, see the
|
||||
`Client API Example <https://github.com/ray-project/ray/tree/master/python/ray/tune/TuneClient.ipynb>`__.
|
||||
|
||||
The API also supports curl. Here are the examples for getting trials (``GET /trials/[:id]``):
|
||||
|
||||
|
|
|
@ -45,7 +45,8 @@ These are the environment variables Ray Tune currently considers:
|
|||
* **TUNE_MAX_LEN_IDENTIFIER**: Maximum length of trial subdirectory names (those
|
||||
with the parameter values in them)
|
||||
* **TUNE_MAX_PENDING_TRIALS_PG**: Maximum number of pending trials when placement groups are used. Defaults
|
||||
to ``auto``, which will be updated to ``max(16, cluster_cpus * 1.1)`` for random/grid search and ``1`` for any other search algorithms.
|
||||
to ``auto``, which will be updated to ``max(16, cluster_cpus * 1.1)`` for random/grid search and ``1``
|
||||
for any other search algorithms.
|
||||
* **TUNE_PLACEMENT_GROUP_CLEANUP_DISABLED**: Ray Tune cleans up existing placement groups
|
||||
with the ``_tune__`` prefix in their name before starting a run. This is used to make sure
|
||||
that scheduled placement groups are removed when multiple calls to ``tune.run()`` are
|
||||
|
|
|
@ -1,95 +1,6 @@
|
|||
Tune Internals
|
||||
==============
|
||||
|
||||
This page overviews the design and architectures of Tune and provides docstrings for internal components.
|
||||
|
||||
.. image:: ../../images/tune-arch.png
|
||||
|
||||
The blue boxes refer to internal components, and green boxes are public-facing.
|
||||
|
||||
Main Components
|
||||
---------------
|
||||
|
||||
Tune's main components consist of TrialRunner, Trial objects, TrialExecutor, SearchAlg, TrialScheduler, and Trainable.
|
||||
|
||||
.. _trial-runner-flow:
|
||||
|
||||
This is an illustration of the high-level training flow and how some of the components interact:
|
||||
|
||||
*Note: This figure is horizontally scrollable*
|
||||
|
||||
.. figure:: ../../images/tune-trial-runner-flow-horizontal.png
|
||||
:class: horizontal-scroll
|
||||
|
||||
|
||||
TrialRunner
|
||||
~~~~~~~~~~~
|
||||
[`source code <https://github.com/ray-project/ray/blob/master/python/ray/tune/trial_runner.py>`__]
|
||||
This is the main driver of the training loop. This component
|
||||
uses the TrialScheduler to prioritize and execute trials,
|
||||
queries the SearchAlgorithm for new
|
||||
configurations to evaluate, and handles the fault tolerance logic.
|
||||
|
||||
**Fault Tolerance**: The TrialRunner executes checkpointing if ``checkpoint_freq``
|
||||
is set, along with automatic trial restarting in case of trial failures (if ``max_failures`` is set).
|
||||
For example, if a node is lost while a trial (specifically, the corresponding
|
||||
Trainable of the trial) is still executing on that node and checkpointing
|
||||
is enabled, the trial will then be reverted to a ``"PENDING"`` state and resumed
|
||||
from the last available checkpoint when it is run.
|
||||
The TrialRunner is also in charge of checkpointing the entire experiment execution state
|
||||
upon each loop iteration. This allows users to restart their experiment
|
||||
in case of machine failure.
|
||||
|
||||
See the docstring at :ref:`trialrunner-docstring`.
|
||||
|
||||
Trial objects
|
||||
~~~~~~~~~~~~~
|
||||
[`source code <https://github.com/ray-project/ray/blob/master/python/ray/tune/trial.py>`__]
|
||||
This is an internal data structure that contains metadata about each training run. Each Trial
|
||||
object is mapped one-to-one with a Trainable object but are not themselves
|
||||
distributed/remote. Trial objects transition among
|
||||
the following states: ``"PENDING"``, ``"RUNNING"``, ``"PAUSED"``, ``"ERRORED"``, and
|
||||
``"TERMINATED"``.
|
||||
|
||||
See the docstring at :ref:`trial-docstring`.
|
||||
|
||||
TrialExecutor
|
||||
~~~~~~~~~~~~~
|
||||
[`source code <https://github.com/ray-project/ray/blob/master/python/ray/tune/trial_executor.py>`__]
|
||||
The TrialExecutor is a component that interacts with the underlying execution framework.
|
||||
It also manages resources to ensure the cluster isn't overloaded. By default, the TrialExecutor uses Ray to execute trials.
|
||||
|
||||
See the docstring at :ref:`raytrialexecutor-docstring`.
|
||||
|
||||
|
||||
SearchAlg
|
||||
~~~~~~~~~
|
||||
[`source code <https://github.com/ray-project/ray/tree/master/python/ray/tune/suggest>`__] The SearchAlgorithm is a user-provided object
|
||||
that is used for querying new hyperparameter configurations to evaluate.
|
||||
|
||||
SearchAlgorithms will be notified every time a trial finishes
|
||||
executing one training step (of ``train()``), every time a trial
|
||||
errors, and every time a trial completes.
|
||||
|
||||
TrialScheduler
|
||||
~~~~~~~~~~~~~~
|
||||
[`source code <https://github.com/ray-project/ray/blob/master/python/ray/tune/schedulers>`__] TrialSchedulers operate over a set of possible trials to run,
|
||||
prioritizing trial execution given available cluster resources.
|
||||
|
||||
TrialSchedulers are given the ability to kill or pause trials,
|
||||
and also are given the ability to reorder/prioritize incoming trials.
|
||||
|
||||
Trainables
|
||||
~~~~~~~~~~
|
||||
[`source code <https://github.com/ray-project/ray/blob/master/python/ray/tune/trainable.py>`__]
|
||||
These are user-provided objects that are used for
|
||||
the training process. If a class is provided, it is expected to conform to the
|
||||
Trainable interface. If a function is provided. it is wrapped into a
|
||||
Trainable class, and the function itself is executed on a separate thread.
|
||||
|
||||
Trainables will execute one step of ``train()`` before notifying the TrialRunner.
|
||||
|
||||
|
||||
.. _raytrialexecutor-docstring:
|
||||
|
||||
RayTrialExecutor
|
||||
|
|
|
@ -3,143 +3,30 @@
|
|||
Loggers (tune.logger)
|
||||
=====================
|
||||
|
||||
Tune has default loggers for TensorBoard, CSV, and JSON formats. By default, Tune only logs the returned result dictionaries from the training function.
|
||||
Tune automatically uses loggers for TensorBoard, CSV, and JSON formats.
|
||||
By default, Tune only logs the returned result dictionaries from the training function.
|
||||
|
||||
If you need to log something lower level like model weights or gradients, see :ref:`Trainable Logging <trainable-logging>`.
|
||||
If you need to log something lower level like model weights or gradients,
|
||||
see :ref:`Trainable Logging <trainable-logging>`.
|
||||
|
||||
.. note::
|
||||
Tune's per-trial ``Logger`` classes have been deprecated. They can still be used, but we encourage you
|
||||
to use our new interface with the ``LoggerCallback`` class instead.
|
||||
|
||||
Custom Loggers
|
||||
--------------
|
||||
|
||||
You can create a custom logger by inheriting the LoggerCallback interface (:ref:`logger-interface`):
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from typing import Dict, List
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
from ray.tune.logger import LoggerCallback
|
||||
|
||||
|
||||
class CustomLoggerCallback(LoggerCallback):
|
||||
"""Custom logger interface"""
|
||||
|
||||
def __init__(self, filename: str = "log.txt):
|
||||
self._trial_files = {}
|
||||
self._filename = filename
|
||||
|
||||
def log_trial_start(self, trial: "Trial"):
|
||||
trial_logfile = os.path.join(trial.logdir, self._filename)
|
||||
self._trial_files[trial] = open(trial_logfile, "at")
|
||||
|
||||
def log_trial_result(self, iteration: int, trial: "Trial", result: Dict):
|
||||
if trial in self._trial_files:
|
||||
self._trial_files[trial].write(json.dumps(result))
|
||||
|
||||
def on_trial_complete(self, iteration: int, trials: List["Trial"],
|
||||
trial: "Trial", **info):
|
||||
if trial in self._trial_files:
|
||||
self._trial_files[trial].close()
|
||||
del self._trial_files[trial]
|
||||
|
||||
|
||||
You can then pass in your own logger as follows:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray import tune
|
||||
|
||||
tune.run(
|
||||
MyTrainableClass,
|
||||
name="experiment_name",
|
||||
callbacks=[CustomLoggerCallback("log_test.txt")]
|
||||
)
|
||||
|
||||
Per default, Ray Tune creates JSON, CSV and TensorBoardX logger callbacks if you don't pass them yourself.
|
||||
You can disable this behavior by setting the ``TUNE_DISABLE_AUTO_CALLBACK_LOGGERS`` environment variable to ``"1"``.
|
||||
|
||||
An example of creating a custom logger can be found in :doc:`/tune/examples/logging_example`.
|
||||
|
||||
.. _trainable-logging:
|
||||
|
||||
Trainable Logging
|
||||
-----------------
|
||||
|
||||
By default, Tune only logs the *training result dictionaries* from your Trainable. However, you may want to visualize the model weights, model graph, or use a custom logging library that requires multi-process logging. For example, you may want to do this if you're trying to log images to TensorBoard.
|
||||
|
||||
You can do this in the trainable, as shown below:
|
||||
|
||||
.. tip:: Make sure that any logging calls or objects stay within scope of the Trainable. You may see Pickling/serialization errors or inconsistent logs otherwise.
|
||||
|
||||
**Function API**:
|
||||
|
||||
``library`` refers to whatever 3rd party logging library you are using.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def trainable(config):
|
||||
library.init(
|
||||
name=trial_id,
|
||||
id=trial_id,
|
||||
resume=trial_id,
|
||||
reinit=True,
|
||||
allow_val_change=True)
|
||||
library.set_log_path(tune.get_trial_dir())
|
||||
|
||||
for step in range(100):
|
||||
library.log_model(...)
|
||||
library.log(results, step=step)
|
||||
tune.report(results)
|
||||
|
||||
|
||||
**Class API**:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
class CustomLogging(tune.Trainable)
|
||||
def setup(self, config):
|
||||
trial_id = self.trial_id
|
||||
library.init(
|
||||
name=trial_id,
|
||||
id=trial_id,
|
||||
resume=trial_id,
|
||||
reinit=True,
|
||||
allow_val_change=True)
|
||||
library.set_log_path(self.logdir)
|
||||
|
||||
def step(self):
|
||||
library.log_model(...)
|
||||
|
||||
def log_result(self, result):
|
||||
res_dict = {
|
||||
str(k): v
|
||||
for k, v in result.items()
|
||||
if (v and "config" not in k and not isinstance(v, str))
|
||||
}
|
||||
step = result["training_iteration"]
|
||||
library.log(res_dict, step=step)
|
||||
|
||||
Use ``self.logdir`` (only for Class API) or ``tune.get_trial_dir()`` (only for Function API) for the trial log directory.
|
||||
|
||||
In the distributed case, these logs will be sync'ed back to the driver under your logger path. This will allow you to visualize and analyze logs of all distributed training workers on a single machine.
|
||||
|
||||
|
||||
Viskit
|
||||
------
|
||||
|
||||
Tune automatically integrates with `Viskit <https://github.com/vitchyr/viskit>`_ via the ``CSVLoggerCallback`` outputs. To use VisKit (you may have to install some dependencies), run:
|
||||
Tune automatically integrates with `Viskit <https://github.com/vitchyr/viskit>`_ via the ``CSVLoggerCallback`` outputs.
|
||||
To use VisKit (you may have to install some dependencies), run:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ git clone https://github.com/rll/rllab.git
|
||||
$ python rllab/rllab/viskit/frontend.py ~/ray_results/my_experiment
|
||||
|
||||
The nonrelevant metrics (like timing stats) can be disabled on the left to show only the relevant ones (like accuracy, loss, etc.).
|
||||
The non-relevant metrics (like timing stats) can be disabled on the left to show only the
|
||||
relevant ones (like accuracy, loss, etc.).
|
||||
|
||||
.. image:: ../images/ray-tune-viskit.png
|
||||
|
||||
|
@ -162,14 +49,16 @@ CSVLogger
|
|||
MLFlowLogger
|
||||
------------
|
||||
|
||||
Tune also provides a default logger for `MLflow <https://mlflow.org>`_. You can install MLflow via ``pip install mlflow``.
|
||||
You can see the :doc:`tutorial here </tune/tutorials/tune-mlflow>`.
|
||||
Tune also provides a logger for `MLflow <https://mlflow.org>`_.
|
||||
You can install MLflow via ``pip install mlflow``.
|
||||
You can see the :doc:`tutorial here </tune/examples/tune-mlflow>`.
|
||||
|
||||
WandbLogger
|
||||
-----------
|
||||
|
||||
Tune also provides a default logger for `Weights & Biases <https://www.wandb.ai/>`_. You can install Wandb via ``pip install wandb``.
|
||||
You can see the :doc:`tutorial here </tune/tutorials/tune-wandb>`
|
||||
Tune also provides a logger for `Weights & Biases <https://www.wandb.ai/>`_.
|
||||
You can install Wandb via ``pip install wandb``.
|
||||
You can see the :doc:`tutorial here </tune/examples/tune-wandb>`
|
||||
|
||||
|
||||
.. _logger-interface:
|
||||
|
|
|
@ -14,18 +14,17 @@ on `Github`_.
|
|||
:maxdepth: 2
|
||||
|
||||
execution.rst
|
||||
env.rst
|
||||
trainable.rst
|
||||
search_space.rst
|
||||
suggestion.rst
|
||||
schedulers.rst
|
||||
stoppers.rst
|
||||
reporters.rst
|
||||
analysis.rst
|
||||
sklearn.rst
|
||||
reporters.rst
|
||||
logging.rst
|
||||
env.rst
|
||||
sklearn.rst
|
||||
integration.rst
|
||||
internals.rst
|
||||
client.rst
|
||||
cli.rst
|
||||
scalability.rst
|
||||
cli.rst
|
|
@ -1,5 +1,6 @@
|
|||
.. _tune-reporter-doc:
|
||||
|
||||
|
||||
Console Output (Reporters)
|
||||
==========================
|
||||
|
||||
|
@ -22,7 +23,11 @@ By default, Tune reports experiment progress periodically to the command-line as
|
|||
| MyTrainable_a826b7bc | RUNNING | 10.234.98.164:31112 | 0.729127 | 0.0748 | 0.1784 | 0.1797 | 1.7161 | 7.05715 | 14 |
|
||||
+----------------------+----------+---------------------+-----------+--------+--------+--------+--------+------------------+-------+
|
||||
|
||||
Note that columns will be hidden if they are completely empty. The output can be configured in various ways by instantiating a ``CLIReporter`` instance (or ``JupyterNotebookReporter`` if you're using jupyter notebook). Here's an example:
|
||||
Note that columns will be hidden if they are completely empty. The output can be configured in various ways by
|
||||
instantiating a ``CLIReporter`` instance (or ``JupyterNotebookReporter`` if you're using jupyter notebook).
|
||||
Here's an example:
|
||||
|
||||
.. TODO: test these snippets
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
|
|
@ -3,65 +3,25 @@
|
|||
Trial Schedulers (tune.schedulers)
|
||||
==================================
|
||||
|
||||
In Tune, some hyperparameter optimization algorithms are written as "scheduling algorithms". These Trial Schedulers can early terminate bad trials, pause trials, clone trials, and alter hyperparameters of a running trial.
|
||||
In Tune, some hyperparameter optimization algorithms are written as "scheduling algorithms".
|
||||
These Trial Schedulers can early terminate bad trials, pause trials, clone trials,
|
||||
and alter hyperparameters of a running trial.
|
||||
|
||||
All Trial Schedulers take in a ``metric``, which is a value returned in the result dict of your Trainable and is maximized or minimized according to ``mode``.
|
||||
All Trial Schedulers take in a ``metric``, which is a value returned in the result dict of your
|
||||
Trainable and is maximized or minimized according to ``mode``.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
tune.run( ... , scheduler=Scheduler(metric="accuracy", mode="max"))
|
||||
|
||||
.. _schedulers-ref:
|
||||
|
||||
Summary
|
||||
-------
|
||||
|
||||
Tune includes distributed implementations of early stopping algorithms such as `Median Stopping Rule <https://research.google.com/pubs/pub46180.html>`__, `HyperBand <https://arxiv.org/abs/1603.06560>`__, and `ASHA <https://openreview.net/forum?id=S1Y7OOlRZ>`__. Tune also includes a distributed implementation of `Population Based Training (PBT) <https://deepmind.com/blog/population-based-training-neural-networks>`__ and `Population Based Bandits (PB2) <https://arxiv.org/abs/2002.02518>`__.
|
||||
|
||||
.. tip:: The easiest scheduler to start with is the ``ASHAScheduler`` which will aggressively terminate low-performing trials.
|
||||
|
||||
When using schedulers, you may face compatibility issues, as shown in the below compatibility matrix. Certain schedulers cannot be used with Search Algorithms, and certain schedulers require :ref:`checkpointing to be implemented <tune-checkpoint-syncing>`.
|
||||
|
||||
Schedulers can dynamically change trial resource requirements during tuning. This is currently implemented in ``ResourceChangingScheduler``, which can wrap around any other scheduler.
|
||||
|
||||
.. list-table:: TrialScheduler Feature Compatibility Matrix
|
||||
:header-rows: 1
|
||||
|
||||
* - Scheduler
|
||||
- Need Checkpointing?
|
||||
- SearchAlg Compatible?
|
||||
- Example
|
||||
* - :ref:`ASHA <tune-scheduler-hyperband>`
|
||||
- No
|
||||
- Yes
|
||||
- :doc:`Link </tune/examples/async_hyperband_example>`
|
||||
* - :ref:`Median Stopping Rule <tune-scheduler-msr>`
|
||||
- No
|
||||
- Yes
|
||||
- :ref:`Link <tune-scheduler-msr>`
|
||||
* - :ref:`HyperBand <tune-original-hyperband>`
|
||||
- Yes
|
||||
- Yes
|
||||
- :doc:`Link </tune/examples/hyperband_example>`
|
||||
* - :ref:`BOHB <tune-scheduler-bohb>`
|
||||
- Yes
|
||||
- Only TuneBOHB
|
||||
- :doc:`Link </tune/examples/bohb_example>`
|
||||
* - :ref:`Population Based Training <tune-scheduler-pbt>`
|
||||
- Yes
|
||||
- Not Compatible
|
||||
- :doc:`Link </tune/examples/pbt_function>`
|
||||
* - :ref:`Population Based Bandits <tune-scheduler-pb2>`
|
||||
- Yes
|
||||
- Not Compatible
|
||||
- :doc:`Basic Example </tune/examples/pb2_example>`, :doc:`PPO example </tune/examples/pb2_ppo_example>`
|
||||
|
||||
.. _tune-scheduler-hyperband:
|
||||
|
||||
ASHA (tune.schedulers.ASHAScheduler)
|
||||
------------------------------------
|
||||
|
||||
The `ASHA <https://openreview.net/forum?id=S1Y7OOlRZ>`__ scheduler can be used by setting the ``scheduler`` parameter of ``tune.run``, e.g.
|
||||
The `ASHA <https://openreview.net/forum?id=S1Y7OOlRZ>`__ scheduler can be used by
|
||||
setting the ``scheduler`` parameter of ``tune.run``, e.g.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -75,9 +35,14 @@ The `ASHA <https://openreview.net/forum?id=S1Y7OOlRZ>`__ scheduler can be used b
|
|||
brackets=1)
|
||||
tune.run( ... , scheduler=asha_scheduler)
|
||||
|
||||
Compared to the original version of HyperBand, this implementation provides better parallelism and avoids straggler issues during eliminations. **We recommend using this over the standard HyperBand scheduler.** An example of this can be found here: :doc:`/tune/examples/async_hyperband_example`.
|
||||
Compared to the original version of HyperBand, this implementation provides better
|
||||
parallelism and avoids straggler issues during eliminations.
|
||||
**We recommend using this over the standard HyperBand scheduler.**
|
||||
An example of this can be found here: :doc:`/tune/examples/includes/async_hyperband_example`.
|
||||
|
||||
Even though the original paper mentions a bracket count of 3, discussions with the authors concluded that the value should be left to 1 bracket. This is the default used if no value is provided for the ``brackets`` argument.
|
||||
Even though the original paper mentions a bracket count of 3, discussions with the authors concluded
|
||||
that the value should be left to 1 bracket.
|
||||
This is the default used if no value is provided for the ``brackets`` argument.
|
||||
|
||||
.. autoclass:: ray.tune.schedulers.AsyncHyperBandScheduler
|
||||
|
||||
|
@ -88,7 +53,8 @@ Even though the original paper mentions a bracket count of 3, discussions with t
|
|||
HyperBand (tune.schedulers.HyperBandScheduler)
|
||||
----------------------------------------------
|
||||
|
||||
Tune implements the `standard version of HyperBand <https://arxiv.org/abs/1603.06560>`__. **We recommend using the ASHA Scheduler over the standard HyperBand scheduler.**
|
||||
Tune implements the `standard version of HyperBand <https://arxiv.org/abs/1603.06560>`__.
|
||||
**We recommend using the ASHA Scheduler over the standard HyperBand scheduler.**
|
||||
|
||||
.. autoclass:: ray.tune.schedulers.HyperBandScheduler
|
||||
|
||||
|
@ -96,31 +62,44 @@ Tune implements the `standard version of HyperBand <https://arxiv.org/abs/1603.0
|
|||
HyperBand Implementation Details
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Implementation details may deviate slightly from theory but are focused on increasing usability. Note: ``R``, ``s_max``, and ``eta`` are parameters of HyperBand given by the paper. See `this post <https://homes.cs.washington.edu/~jamieson/hyperband.html>`_ for context.
|
||||
Implementation details may deviate slightly from theory but are focused on increasing usability.
|
||||
Note: ``R``, ``s_max``, and ``eta`` are parameters of HyperBand given by the paper.
|
||||
See `this post <https://homes.cs.washington.edu/~jamieson/hyperband.html>`_ for context.
|
||||
|
||||
1. Both ``s_max`` (representing the ``number of brackets - 1``) and ``eta``, representing the downsampling rate, are fixed. In many practical settings, ``R``, which represents some resource unit and often the number of training iterations, can be set reasonably large, like ``R >= 200``. For simplicity, assume ``eta = 3``. Varying ``R`` between ``R = 200`` and ``R = 1000`` creates a huge range of the number of trials needed to fill up all brackets.
|
||||
1. Both ``s_max`` (representing the ``number of brackets - 1``) and ``eta``, representing the downsampling rate, are fixed.
|
||||
In many practical settings, ``R``, which represents some resource unit and often the number of training iterations,
|
||||
can be set reasonably large, like ``R >= 200``.
|
||||
For simplicity, assume ``eta = 3``. Varying ``R`` between ``R = 200`` and ``R = 1000``
|
||||
creates a huge range of the number of trials needed to fill up all brackets.
|
||||
|
||||
.. image:: /images/hyperband_bracket.png
|
||||
|
||||
On the other hand, holding ``R`` constant at ``R = 300`` and varying ``eta`` also leads to HyperBand configurations that are not very intuitive:
|
||||
On the other hand, holding ``R`` constant at ``R = 300`` and varying ``eta`` also leads to
|
||||
HyperBand configurations that are not very intuitive:
|
||||
|
||||
.. image:: /images/hyperband_eta.png
|
||||
|
||||
The implementation takes the same configuration as the example given in the paper and exposes ``max_t``, which is not a parameter in the paper.
|
||||
The implementation takes the same configuration as the example given in the paper
|
||||
and exposes ``max_t``, which is not a parameter in the paper.
|
||||
|
||||
2. The example in the `post <https://homes.cs.washington.edu/~jamieson/hyperband.html>`_ to calculate ``n_0`` is actually a little different than the algorithm given in the paper. In this implementation, we implement ``n_0`` according to the paper (which is `n` in the below example):
|
||||
2. The example in the `post <https://homes.cs.washington.edu/~jamieson/hyperband.html>`_ to calculate ``n_0``
|
||||
is actually a little different than the algorithm given in the paper.
|
||||
In this implementation, we implement ``n_0`` according to the paper (which is `n` in the below example):
|
||||
|
||||
.. image:: /images/hyperband_allocation.png
|
||||
|
||||
|
||||
3. There are also implementation specific details like how trials are placed into brackets which are not covered in the paper. This implementation places trials within brackets according to smaller bracket first - meaning that with low number of trials, there will be less early stopping.
|
||||
3. There are also implementation specific details like how trials are placed into brackets which are not covered in the paper.
|
||||
This implementation places trials within brackets according to smaller bracket first - meaning
|
||||
that with low number of trials, there will be less early stopping.
|
||||
|
||||
.. _tune-scheduler-msr:
|
||||
|
||||
Median Stopping Rule (tune.schedulers.MedianStoppingRule)
|
||||
---------------------------------------------------------
|
||||
|
||||
The Median Stopping Rule implements the simple strategy of stopping a trial if its performance falls below the median of other trials at similar points in time.
|
||||
The Median Stopping Rule implements the simple strategy of stopping a trial if its performance falls
|
||||
below the median of other trials at similar points in time.
|
||||
|
||||
.. autoclass:: ray.tune.schedulers.MedianStoppingRule
|
||||
|
||||
|
@ -129,7 +108,8 @@ The Median Stopping Rule implements the simple strategy of stopping a trial if i
|
|||
Population Based Training (tune.schedulers.PopulationBasedTraining)
|
||||
-------------------------------------------------------------------
|
||||
|
||||
Tune includes a distributed implementation of `Population Based Training (PBT) <https://deepmind.com/blog/population-based-training-neural-networks>`__. This can be enabled by setting the ``scheduler`` parameter of ``tune.run``, e.g.
|
||||
Tune includes a distributed implementation of `Population Based Training (PBT) <https://deepmind.com/blog/population-based-training-neural-networks>`__.
|
||||
This can be enabled by setting the ``scheduler`` parameter of ``tune.run``, e.g.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -145,9 +125,17 @@ Tune includes a distributed implementation of `Population Based Training (PBT) <
|
|||
})
|
||||
tune.run( ... , scheduler=pbt_scheduler)
|
||||
|
||||
When the PBT scheduler is enabled, each trial variant is treated as a member of the population. Periodically, top-performing trials are checkpointed (this requires your Trainable to support :ref:`save and restore <tune-checkpoint-syncing>`). Low-performing trials clone the checkpoints of top performers and perturb the configurations in the hope of discovering an even better variation.
|
||||
When the PBT scheduler is enabled, each trial variant is treated as a member of the population.
|
||||
Periodically, top-performing trials are checkpointed
|
||||
(this requires your Trainable to support :ref:`save and restore <tune-checkpoint-syncing>`).
|
||||
Low-performing trials clone the checkpoints of top performers and perturb the configurations
|
||||
in the hope of discovering an even better variation.
|
||||
|
||||
You can run this :doc:`toy PBT example </tune/examples/pbt_function>` to get an idea of how PBT operates. When training in PBT mode, a single trial may see many different hyperparameters over its lifetime, which is recorded in its ``result.json`` file. The following figure generated by the example shows PBT with optimizing a LR schedule over the course of a single experiment:
|
||||
You can run this :doc:`toy PBT example </tune/examples/includes/pbt_function>` to get an idea of how PBT operates.
|
||||
When training in PBT mode, a single trial may see many different hyperparameters over its lifetime,
|
||||
which is recorded in its ``result.json`` file.
|
||||
The following figure generated by the example shows PBT with optimizing a LR schedule over
|
||||
the course of a single experiment:
|
||||
|
||||
.. image:: ../images/pbt.png
|
||||
|
||||
|
@ -184,7 +172,9 @@ replay utility in practice.
|
|||
Population Based Bandits (PB2) (tune.schedulers.pb2.PB2)
|
||||
--------------------------------------------------------
|
||||
|
||||
Tune includes a distributed implementation of `Population Based Bandits (PB2) <https://arxiv.org/abs/2002.02518>`__. This algorithm builds upon PBT, with the main difference being that instead of using random perturbations, PB2 selects new hyperparameter configurations using a Gaussian Process model.
|
||||
Tune includes a distributed implementation of `Population Based Bandits (PB2) <https://arxiv.org/abs/2002.02518>`__.
|
||||
This algorithm builds upon PBT, with the main difference being that instead of using random perturbations,
|
||||
PB2 selects new hyperparameter configurations using a Gaussian Process model.
|
||||
|
||||
The Tune implementation of PB2 requires GPy and sklearn to be installed:
|
||||
|
||||
|
@ -212,9 +202,16 @@ PB2 can be enabled by setting the ``scheduler`` parameter of ``tune.run``, e.g.:
|
|||
tune.run( ... , scheduler=pb2_scheduler)
|
||||
|
||||
|
||||
When the PB2 scheduler is enabled, each trial variant is treated as a member of the population. Periodically, top-performing trials are checkpointed (this requires your Trainable to support :ref:`save and restore <tune-checkpoint-syncing>`). Low-performing trials clone the checkpoints of top performers and perturb the configurations in the hope of discovering an even better variation.
|
||||
When the PB2 scheduler is enabled, each trial variant is treated as a member of the population.
|
||||
Periodically, top-performing trials are checkpointed (this requires your Trainable to
|
||||
support :ref:`save and restore <tune-checkpoint-syncing>`).
|
||||
Low-performing trials clone the checkpoints of top performers and perturb the configurations
|
||||
in the hope of discovering an even better variation.
|
||||
|
||||
The primary motivation for PB2 is the ability to find promising hyperparameters with only a small population size. With that in mind, you can run this :doc:`PB2 PPO example </tune/examples/pb2_ppo_example>` to compare PB2 vs. PBT, with a population size of ``4`` (as in the paper). The example uses the ``BipedalWalker`` environment so does not require any additional licenses.
|
||||
The primary motivation for PB2 is the ability to find promising hyperparameters with only a small population size.
|
||||
With that in mind, you can run this :doc:`PB2 PPO example </tune/examples/includes/pb2_ppo_example>` to compare PB2 vs. PBT,
|
||||
with a population size of ``4`` (as in the paper).
|
||||
The example uses the ``BipedalWalker`` environment so does not require any additional licenses.
|
||||
|
||||
.. autoclass:: ray.tune.schedulers.pb2.PB2
|
||||
|
||||
|
@ -224,24 +221,35 @@ The primary motivation for PB2 is the ability to find promising hyperparamters w
|
|||
BOHB (tune.schedulers.HyperBandForBOHB)
|
||||
---------------------------------------
|
||||
|
||||
This class is a variant of HyperBand that enables the `BOHB Algorithm <https://arxiv.org/abs/1807.01774>`_. This implementation is true to the original HyperBand implementation and does not implement pipelining nor straggler mitigation.
|
||||
This class is a variant of HyperBand that enables the `BOHB Algorithm <https://arxiv.org/abs/1807.01774>`_.
|
||||
This implementation is true to the original HyperBand implementation and does not implement pipelining nor
|
||||
straggler mitigation.
|
||||
|
||||
This is to be used in conjunction with the Tune BOHB search algorithm. See :ref:`TuneBOHB <suggest-TuneBOHB>` for package requirements, examples, and details.
|
||||
This is to be used in conjunction with the Tune BOHB search algorithm.
|
||||
See :ref:`TuneBOHB <suggest-TuneBOHB>` for package requirements, examples, and details.
|
||||
|
||||
An example of this in use can be found here: :doc:`/tune/examples/bohb_example`.
|
||||
An example of this in use can be found here: :doc:`/tune/examples/includes/bohb_example`.
|
||||
|
||||
.. autoclass:: ray.tune.schedulers.HyperBandForBOHB
|
||||
|
||||
.. _tune-resource-changing-scheduler:
|
||||
|
||||
ResourceChangingScheduler
|
||||
-------------------------
|
||||
|
||||
This class is a utility scheduler, allowing for trial resource requirements to be changed during tuning. It wraps around another scheduler and uses its decisions.
|
||||
This class is a utility scheduler, allowing for trial resource requirements to be changed during tuning.
|
||||
It wraps around another scheduler and uses its decisions.
|
||||
|
||||
* If you are using the Trainable (class) API for tuning, you can obtain the current trial resources through the ``Trainable.trial_resources`` property.
|
||||
* If you are using the Trainable (class) API for tuning, your Trainable must implement ``Trainable.update_resources``,
|
||||
which will let your model know about the new resources assigned. You can also obtain the current trial resources
|
||||
by calling ``Trainable.trial_resources``.
|
||||
|
||||
* If you are using the functional API for tuning, the current trial resources can be obtained by calling `tune.get_trial_resources()` inside the training function. The function should be able to :ref:`load and save checkpoints <tune-checkpoint-syncing>` (the latter preferably every iteration).
|
||||
* If you are using the functional API for tuning, the current trial resources can be
|
||||
obtained by calling `tune.get_trial_resources()` inside the training function.
|
||||
The function should be able to :ref:`load and save checkpoints <tune-checkpoint-syncing>`
|
||||
(the latter preferably every iteration).
|
||||
|
||||
An example of this in use can be found here: :doc:`/tune/examples/xgboost_dynamic_resources_example`.
|
||||
An example of this in use can be found here: :doc:`/tune/examples/includes/xgboost_dynamic_resources_example`.
|
||||
|
||||
.. autoclass:: ray.tune.schedulers.ResourceChangingScheduler
|
||||
|
||||
|
@ -268,6 +276,9 @@ TrialScheduler
|
|||
|
||||
Shim Instantiation (tune.create_scheduler)
|
||||
------------------------------------------
|
||||
There is also a shim function that constructs the scheduler based on the provided string. This can be useful if the scheduler you want to use changes often (e.g., specifying the scheduler via a CLI option or config file).
|
||||
|
||||
There is also a shim function that constructs the scheduler based on the provided string.
|
||||
This can be useful if the scheduler you want to use changes often (e.g., specifying the scheduler
|
||||
via a CLI option or config file).
|
||||
|
||||
.. automethod:: ray.tune.create_scheduler
|
||||
|
|
|
@ -2,163 +2,6 @@
|
|||
|
||||
Search Space API
|
||||
================
|
||||
Overview
|
||||
--------
|
||||
|
||||
Tune has a native interface for specifying search spaces. You can specify the search space via ``tune.run(config=...)``.
|
||||
|
||||
Thereby, you can either use the ``tune.grid_search`` primitive to specify an axis of a grid search...
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
tune.run(
|
||||
trainable,
|
||||
config={"bar": tune.grid_search([True, False])})
|
||||
|
||||
|
||||
... or one of the random sampling primitives to specify distributions (:ref:`tune-sample-docs`):
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
tune.run(
|
||||
trainable,
|
||||
config={
|
||||
"param1": tune.choice([True, False]),
|
||||
"bar": tune.uniform(0, 10),
|
||||
"alpha": tune.sample_from(lambda _: np.random.uniform(100) ** 2),
|
||||
"const": "hello" # It is also ok to specify constant values.
|
||||
})
|
||||
|
||||
|
||||
|
||||
.. caution:: If you use a SearchAlgorithm, you may not be able to specify lambdas or grid search with this
|
||||
interface, as some search algorithms may not be compatible.
|
||||
|
||||
|
||||
To sample multiple times/run multiple trials, specify ``tune.run(num_samples=N)``. If ``grid_search`` is provided as an argument, the *same* grid will be repeated ``N`` times.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# 13 different configs.
|
||||
tune.run(trainable, num_samples=13, config={
|
||||
"x": tune.choice([0, 1, 2]),
|
||||
}
|
||||
)
|
||||
|
||||
# 13 different configs.
|
||||
tune.run(trainable, num_samples=13, config={
|
||||
"x": tune.choice([0, 1, 2]),
|
||||
"y": tune.randn([0, 1, 2]),
|
||||
}
|
||||
)
|
||||
|
||||
# 4 different configs.
|
||||
tune.run(trainable, config={"x": tune.grid_search([1, 2, 3, 4])}, num_samples=1)
|
||||
|
||||
# 3 different configs.
|
||||
tune.run(trainable, config={"x": grid_search([1, 2, 3])}, num_samples=1)
|
||||
|
||||
# 6 different configs.
|
||||
tune.run(trainable, config={"x": tune.grid_search([1, 2, 3])}, num_samples=2)
|
||||
|
||||
# 9 different configs.
|
||||
tune.run(trainable, num_samples=1, config={
|
||||
"x": tune.grid_search([1, 2, 3]),
|
||||
"y": tune.grid_search([a, b, c])}
|
||||
)
|
||||
|
||||
# 18 different configs.
|
||||
tune.run(trainable, num_samples=2, config={
|
||||
"x": tune.grid_search([1, 2, 3]),
|
||||
"y": tune.grid_search([a, b, c])}
|
||||
)
|
||||
|
||||
# 45 different configs.
|
||||
tune.run(trainable, num_samples=5, config={
|
||||
"x": tune.grid_search([1, 2, 3]),
|
||||
"y": tune.grid_search([a, b, c])}
|
||||
)
|
||||
|
||||
|
||||
|
||||
Note that grid search and random search primitives are inter-operable. Each can be used independently or in combination with each other.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# 6 different configs.
|
||||
tune.run(trainable, num_samples=2, config={
|
||||
"x": tune.sample_from(...),
|
||||
"y": tune.grid_search([a, b, c])
|
||||
}
|
||||
)
|
||||
|
||||
In the below example, ``num_samples=10`` repeats the 3x3 grid search 10 times, for a total of 90 trials, each with randomly sampled values of ``alpha`` and ``beta``.
|
||||
|
||||
.. code-block:: python
|
||||
:emphasize-lines: 12
|
||||
|
||||
tune.run(
|
||||
my_trainable,
|
||||
name="my_trainable",
|
||||
# num_samples will repeat the entire config 10 times.
|
||||
num_samples=10,
|
||||
config={
|
||||
# ``sample_from`` creates a generator to call the lambda once per trial.
|
||||
"alpha": tune.sample_from(lambda spec: np.random.uniform(100)),
|
||||
# ``sample_from`` also supports "conditional search spaces"
|
||||
"beta": tune.sample_from(lambda spec: spec.config.alpha * np.random.normal()),
|
||||
"nn_layers": [
|
||||
# tune.grid_search will make it so that all values are evaluated.
|
||||
tune.grid_search([16, 64, 256]),
|
||||
tune.grid_search([16, 64, 256]),
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
.. _tune_custom-search:
|
||||
|
||||
Custom/Conditional Search Spaces
|
||||
--------------------------------
|
||||
|
||||
You'll often run into awkward search spaces (i.e., when one hyperparameter depends on another). Use ``tune.sample_from(func)`` to provide a **custom** callable function for generating a search space.
|
||||
|
||||
The parameter ``func`` should take in a ``spec`` object, which has a ``config`` namespace from which you can access other hyperparameters. This is useful for conditional distributions:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
tune.run(
|
||||
...,
|
||||
config={
|
||||
# A random function
|
||||
"alpha": tune.sample_from(lambda _: np.random.uniform(100)),
|
||||
# Use the `spec.config` namespace to access other hyperparameters
|
||||
"beta": tune.sample_from(lambda spec: spec.config.alpha * np.random.normal())
|
||||
}
|
||||
)
|
||||
|
||||
Here's an example showing a grid search over two nested parameters combined with random sampling from two lambda functions, generating 9 different trials. Note that the value of ``beta`` depends on the value of ``alpha``, which is represented by referencing ``spec.config.alpha`` in the lambda function. This lets you specify conditional parameter distributions.
|
||||
|
||||
.. code-block:: python
|
||||
:emphasize-lines: 4-11
|
||||
|
||||
tune.run(
|
||||
my_trainable,
|
||||
name="my_trainable",
|
||||
config={
|
||||
"alpha": tune.sample_from(lambda spec: np.random.uniform(100)),
|
||||
"beta": tune.sample_from(lambda spec: spec.config.alpha * np.random.normal()),
|
||||
"nn_layers": [
|
||||
tune.grid_search([16, 64, 256]),
|
||||
tune.grid_search([16, 64, 256]),
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
.. note::
|
||||
|
||||
This format is not supported by every SearchAlgorithm, and only some SearchAlgorithms, like :ref:`HyperOpt <tune-hyperopt>` and :ref:`Optuna <tune-optuna>`, handle conditional search spaces at all.
|
||||
|
||||
In order to use conditional search spaces with :ref:`HyperOpt <tune-hyperopt>`, a `Hyperopt search space <http://hyperopt.github.io/hyperopt/getting-started/search_spaces/>`_ is necessary. :ref:`Optuna <tune-optuna>` supports conditional search spaces through its define-by-run interface (:doc:`/tune/examples/optuna_define_by_run_example`).
|
||||
|
||||
.. _tune-sample-docs:
|
||||
|
||||
|
@ -169,11 +12,14 @@ This section covers the functions you can use to define your search spaces.
|
|||
|
||||
.. caution::
|
||||
|
||||
Not all SearchAlgorithms support all distributions. In particular, ``tune.sample_from`` and ``tune.grid_search`` are often unsupported.
|
||||
Not all Search Algorithms support all distributions. In particular,
|
||||
``tune.sample_from`` and ``tune.grid_search`` are often unsupported.
|
||||
The default :ref:`tune-basicvariant` supports all distributions.
|
||||
|
||||
For a high-level overview, see this example:
|
||||
|
||||
.. TODO: test this
|
||||
|
||||
.. code-block :: python
|
||||
|
||||
config = {
|
||||
|
|
|
@ -4,7 +4,8 @@ Stopping mechanisms (tune.stopper)
|
|||
==================================
|
||||
|
||||
In addition to Trial Schedulers like :ref:`ASHA <tune-scheduler-hyperband>`, where a number of
|
||||
trials are stopped if they perform subpar, Ray Tune also supports custom stopping mechanisms to stop trials early. For instance, stopping mechanisms can specify to stop trials when they reached a plateau and the metric
|
||||
trials are stopped if they perform subpar, Ray Tune also supports custom stopping mechanisms to stop trials early. They can also stop the entire experiment after a condition is met.
|
||||
For instance, stopping mechanisms can specify to stop trials when they reached a plateau and the metric
|
||||
doesn't change anymore.
|
||||
|
||||
Ray Tune comes with several stopping mechanisms out of the box. For custom stopping behavior, you can
|
||||
|
|
|
@ -3,7 +3,9 @@
|
|||
Search Algorithms (tune.suggest)
|
||||
================================
|
||||
|
||||
Tune's Search Algorithms are wrappers around open-source optimization libraries for efficient hyperparameter selection. Each library has a specific way of defining the search space - please refer to their documentation for more details.
|
||||
Tune's Search Algorithms are wrappers around open-source optimization libraries for efficient hyperparameter selection.
|
||||
Each library has a specific way of defining the search space - please refer to their documentation for more details.
|
||||
Tune will automatically convert search spaces passed to ``tune.run`` to the library format in most cases.
|
||||
|
||||
You can utilize these search algorithms as follows:
|
||||
|
||||
|
@ -12,88 +14,13 @@ You can utilize these search algorithms as follows:
|
|||
from ray.tune.suggest.hyperopt import HyperOptSearch
|
||||
tune.run(my_function, search_alg=HyperOptSearch(...))
|
||||
|
||||
Summary
|
||||
-------
|
||||
|
||||
.. list-table::
|
||||
:widths: 5 5 2 10
|
||||
:header-rows: 1
|
||||
|
||||
* - SearchAlgorithm
|
||||
- Summary
|
||||
- Website
|
||||
- Code Example
|
||||
* - :ref:`Random search/grid search <tune-basicvariant>`
|
||||
- Random search/grid search
|
||||
-
|
||||
- :doc:`/tune/examples/tune_basic_example`
|
||||
* - :ref:`AxSearch <tune-ax>`
|
||||
- Bayesian/Bandit Optimization
|
||||
- [`Ax <https://ax.dev/>`__]
|
||||
- :doc:`/tune/examples/ax_example`
|
||||
* - :ref:`BlendSearch <BlendSearch>`
|
||||
- Blended Search
|
||||
- [`Bs <https://github.com/microsoft/FLAML/tree/main/flaml/tune>`__]
|
||||
- :doc:`/tune/examples/blendsearch_example`
|
||||
* - :ref:`CFO <CFO>`
|
||||
- Cost-Frugal hyperparameter Optimization
|
||||
- [`Cfo <https://github.com/microsoft/FLAML/tree/main/flaml/tune>`__]
|
||||
- :doc:`/tune/examples/cfo_example`
|
||||
* - :ref:`DragonflySearch <Dragonfly>`
|
||||
- Scalable Bayesian Optimization
|
||||
- [`Dragonfly <https://dragonfly-opt.readthedocs.io/>`__]
|
||||
- :doc:`/tune/examples/dragonfly_example`
|
||||
* - :ref:`SkoptSearch <skopt>`
|
||||
- Bayesian Optimization
|
||||
- [`Scikit-Optimize <https://scikit-optimize.github.io>`__]
|
||||
- :doc:`/tune/examples/skopt_example`
|
||||
* - :ref:`HyperOptSearch <tune-hyperopt>`
|
||||
- Tree-Parzen Estimators
|
||||
- [`HyperOpt <http://hyperopt.github.io/hyperopt>`__]
|
||||
- :doc:`/tune/examples/hyperopt_example`
|
||||
* - :ref:`BayesOptSearch <bayesopt>`
|
||||
- Bayesian Optimization
|
||||
- [`BayesianOptimization <https://github.com/fmfn/BayesianOptimization>`__]
|
||||
- :doc:`/tune/examples/bayesopt_example`
|
||||
* - :ref:`TuneBOHB <suggest-TuneBOHB>`
|
||||
- Bayesian Opt/HyperBand
|
||||
- [`BOHB <https://github.com/automl/HpBandSter>`__]
|
||||
- :doc:`/tune/examples/bohb_example`
|
||||
* - :ref:`NevergradSearch <nevergrad>`
|
||||
- Gradient-free Optimization
|
||||
- [`Nevergrad <https://github.com/facebookresearch/nevergrad>`__]
|
||||
- :doc:`/tune/examples/nevergrad_example`
|
||||
* - :ref:`OptunaSearch <tune-optuna>`
|
||||
- Optuna search algorithms
|
||||
- [`Optuna <https://optuna.org/>`__]
|
||||
- :doc:`/tune/examples/optuna_example`
|
||||
* - :ref:`ZOOptSearch <zoopt>`
|
||||
- Zeroth-order Optimization
|
||||
- [`ZOOpt <https://github.com/polixir/ZOOpt>`__]
|
||||
- :doc:`/tune/examples/zoopt_example`
|
||||
* - :ref:`SigOptSearch <sigopt>`
|
||||
- Closed source
|
||||
- [`SigOpt <https://sigopt.com/>`__]
|
||||
- :doc:`/tune/examples/sigopt_example`
|
||||
* - :ref:`HEBOSearch <tune-hebo>`
|
||||
- Heteroscedastic Evolutionary Bayesian Optimization
|
||||
- [`HEBO <https://github.com/huawei-noah/HEBO/tree/master/HEBO>`__]
|
||||
- :doc:`/tune/examples/hebo_example`
|
||||
|
||||
.. note:: Unlike :ref:`Tune's Trial Schedulers <tune-schedulers>`, Tune SearchAlgorithms cannot affect or stop training processes. However, you can use them together to **early stop the evaluation of bad trials**.
|
||||
|
||||
**Want to use your own algorithm?** The interface is easy to implement. :ref:`Read instructions here <byo-algo>`.
|
||||
|
||||
|
||||
Tune also provides helpful utilities to use with Search Algorithms:
|
||||
|
||||
* :ref:`repeater`: Support for running each *sampled hyperparameter* with multiple random seeds.
|
||||
* :ref:`limiter`: Limits the amount of concurrent trials when running optimization.
|
||||
* :ref:`shim`: Allows creation of the search algorithm object given a string.
|
||||
|
||||
Saving and Restoring
|
||||
--------------------
|
||||
|
||||
.. TODO: what to do about this section? It doesn't really belong here and is not worth its own guide.
|
||||
.. TODO: at least check that this pseudo-code runs.
|
||||
|
||||
Certain search algorithms have ``save/restore`` implemented,
|
||||
allowing reuse of learnings across multiple tuning runs.
|
||||
|
||||
|
@ -176,11 +103,13 @@ Bayesian Optimization (tune.suggest.bayesopt.BayesOptSearch)
|
|||
BOHB (tune.suggest.bohb.TuneBOHB)
|
||||
---------------------------------
|
||||
|
||||
BOHB (Bayesian Optimization HyperBand) is an algorithm that both terminates bad trials and also uses Bayesian Optimization to improve the hyperparameter search. It is available from the `HpBandSter library <https://github.com/automl/HpBandSter>`_.
|
||||
BOHB (Bayesian Optimization HyperBand) is an algorithm that both terminates bad trials
|
||||
and also uses Bayesian Optimization to improve the hyperparameter search.
|
||||
It is available from the `HpBandSter library <https://github.com/automl/HpBandSter>`_.
|
||||
|
||||
Importantly, BOHB is intended to be paired with a specific scheduler class: :ref:`HyperBandForBOHB <tune-scheduler-bohb>`.
|
||||
|
||||
This algorithm requires using the `ConfigSpace search space specification <https://automl.github.io/HpBandSter/build/html/quickstart.html#searchspace>`_. In order to use this search algorithm, you will need to install ``HpBandSter`` and ``ConfigSpace``:
|
||||
In order to use this search algorithm, you will need to install ``HpBandSter`` and ``ConfigSpace``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
|
@ -195,7 +124,8 @@ See the `BOHB paper <https://arxiv.org/abs/1807.01774>`_ for more details.
|
|||
BlendSearch (tune.suggest.flaml.BlendSearch)
|
||||
--------------------------------------------
|
||||
|
||||
BlendSearch is an economical hyperparameter optimization algorithm that combines local search with global search. It is backed by the `FLAML library <https://github.com/microsoft/FLAML>`_.
|
||||
BlendSearch is an economical hyperparameter optimization algorithm that combines local search with global search.
|
||||
It is backed by the `FLAML library <https://github.com/microsoft/FLAML>`_.
|
||||
It allows the users to specify a low-cost initial point as input if such point exists.
|
||||
|
||||
In order to use this search algorithm, you will need to install ``flaml``:
|
||||
|
@ -213,7 +143,8 @@ See the `BlendSearch paper <https://openreview.net/pdf?id=VbLH04pRA3>`_ and docu
|
|||
CFO (tune.suggest.flaml.CFO)
|
||||
----------------------------
|
||||
|
||||
CFO (Cost-Frugal hyperparameter Optimization) is a hyperparameter search algorithm based on randomized local search. It is backed by the `FLAML library <https://github.com/microsoft/FLAML>`_.
|
||||
CFO (Cost-Frugal hyperparameter Optimization) is a hyperparameter search algorithm based on randomized local search.
|
||||
It is backed by the `FLAML library <https://github.com/microsoft/FLAML>`_.
|
||||
It allows the users to specify a low-cost initial point as input if such point exists.
|
||||
|
||||
In order to use this search algorithm, you will need to install ``flaml``:
|
||||
|
@ -222,7 +153,8 @@ In order to use this search algorithm, you will need to install ``flaml``:
|
|||
|
||||
$ pip install flaml
|
||||
|
||||
See the `CFO paper <https://arxiv.org/pdf/2005.01571.pdf>`_ and documentation in FLAML `CFO documentation <https://github.com/microsoft/FLAML/tree/main/flaml/tune>`_ for more details.
|
||||
See the `CFO paper <https://arxiv.org/pdf/2005.01571.pdf>`_ and documentation in
|
||||
FLAML `CFO documentation <https://github.com/microsoft/FLAML/tree/main/flaml/tune>`_ for more details.
|
||||
|
||||
.. autoclass:: ray.tune.suggest.flaml.CFO
|
||||
|
||||
|
@ -274,7 +206,8 @@ Optuna (tune.suggest.optuna.OptunaSearch)
|
|||
SigOpt (tune.suggest.sigopt.SigOptSearch)
|
||||
-----------------------------------------
|
||||
|
||||
You will need to use the `SigOpt experiment and space specification <https://app.sigopt.com/docs/overview/create>`__ to specify your search space.
|
||||
You will need to use the `SigOpt experiment and space specification <https://app.sigopt.com/docs/overview/create>`__
|
||||
to specify your search space.
|
||||
|
||||
.. autoclass:: ray.tune.suggest.sigopt.SigOptSearch
|
||||
|
||||
|
@ -321,7 +254,8 @@ will run ``repeat`` trials of the configuration. It will then average the
|
|||
ConcurrencyLimiter (tune.suggest.ConcurrencyLimiter)
|
||||
----------------------------------------------------
|
||||
|
||||
Use ``ray.tune.suggest.ConcurrencyLimiter`` to limit the amount of concurrency when using a search algorithm. This is useful when a given optimization algorithm does not parallelize very well (like a naive Bayesian Optimization).
|
||||
Use ``ray.tune.suggest.ConcurrencyLimiter`` to limit the amount of concurrency when using a search algorithm.
|
||||
This is useful when a given optimization algorithm does not parallelize very well (like a naive Bayesian Optimization).
|
||||
|
||||
.. autoclass:: ray.tune.suggest.ConcurrencyLimiter
|
||||
|
||||
|
@ -344,6 +278,8 @@ If contributing, make sure to add test cases and an entry in the function descri
|
|||
|
||||
Shim Instantiation (tune.create_searcher)
|
||||
-----------------------------------------
|
||||
There is also a shim function that constructs the search algorithm based on the provided string. This can be useful if the search algorithm you want to use changes often (e.g., specifying the search algorithm via a CLI option or config file).
|
||||
There is also a shim function that constructs the search algorithm based on the provided string.
|
||||
This can be useful if the search algorithm you want to use changes often
|
||||
(e.g., specifying the search algorithm via a CLI option or config file).
|
||||
|
||||
.. automethod:: ray.tune.create_searcher
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
.. _trainable-docs:
|
||||
|
||||
.. TODO: these "basic" sections before the actual API docs start don't really belong here. Then again, the function
|
||||
API does not really have a signature to just describe.
|
||||
.. TODO: Reusing actors and advanced resources allocation seem ill-placed.
|
||||
|
||||
Training (tune.Trainable, tune.report)
|
||||
======================================
|
||||
|
||||
|
@ -40,9 +44,12 @@ With the Function API, you can report intermediate metrics by simply calling ``t
|
|||
|
||||
Tune will run this function on a separate thread in a Ray actor process.
|
||||
|
||||
You'll notice that Ray Tune will output extra values in addition to the user reported metrics, such as ``iterations_since_restore``. See :ref:`tune-autofilled-metrics` for an explanation/glossary of these values.
|
||||
You'll notice that Ray Tune will output extra values in addition to the user reported metrics,
|
||||
such as ``iterations_since_restore``. See :ref:`tune-autofilled-metrics` for an explanation/glossary of these values.
|
||||
|
||||
.. tip:: If you want to leverage multi-node data parallel training with PyTorch while using parallel hyperparameter tuning, check out our :ref:`PyTorch <tune-pytorch-cifar-ref>` user guide and Tune's :ref:`distributed pytorch integrations <tune-integration-torch>`.
|
||||
.. tip:: If you want to leverage multi-node data parallel training with PyTorch while using parallel
|
||||
hyperparameter tuning, check out our :ref:`PyTorch <tune-pytorch-cifar-ref>` user guide and
|
||||
Tune's :ref:`distributed pytorch integrations <tune-integration-torch>`.
|
||||
|
||||
Function API return and yield values
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -98,7 +105,9 @@ report metrics at the end of each run:
|
|||
Function API Checkpointing
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Many Tune features rely on checkpointing, including the usage of certain Trial Schedulers and fault tolerance. To use Tune's checkpointing features, you must expose a ``checkpoint_dir`` argument in the function signature, and call ``tune.checkpoint_dir`` :
|
||||
Many Tune features rely on checkpointing, including the usage of certain Trial Schedulers and fault tolerance.
|
||||
To use Tune's checkpointing features, you must expose a ``checkpoint_dir`` argument in the function signature,
|
||||
and call ``tune.checkpoint_dir`` :
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -126,7 +135,8 @@ Many Tune features rely on checkpointing, including the usage of certain Trial S
|
|||
|
||||
.. note:: ``checkpoint_freq`` and ``checkpoint_at_end`` will not work with Function API checkpointing.
|
||||
|
||||
In this example, checkpoints will be saved by training iteration to ``local_dir/exp_name/trial_name/checkpoint_<step>``. You can restore a single trial checkpoint by using ``tune.run(restore=<checkpoint_dir>)``:
|
||||
In this example, checkpoints will be saved by training iteration to ``local_dir/exp_name/trial_name/checkpoint_<step>``.
|
||||
You can restore a single trial checkpoint by using ``tune.run(restore=<checkpoint_dir>)``:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -139,7 +149,8 @@ In this example, checkpoints will be saved by training iteration to ``local_dir/
|
|||
last_ckpt = trial.checkpoint.value
|
||||
analysis = tune.run(train, config={"max_iter": 10}, restore=last_ckpt)
|
||||
|
||||
Tune also may copy or move checkpoints during the course of tuning. For this purpose, it is important not to depend on absolute paths in the implementation of ``save``.
|
||||
Tune also may copy or move checkpoints during the course of tuning. For this purpose,
|
||||
it is important not to depend on absolute paths in the implementation of ``save``.
|
||||
|
||||
.. _tune-class-api:
|
||||
|
||||
|
@ -176,15 +187,21 @@ The Trainable **class API** will require users to subclass ``ray.tune.Trainable`
|
|||
|
||||
print('best config: ', analysis.get_best_config(metric="score", mode="max"))
|
||||
|
||||
As a subclass of ``tune.Trainable``, Tune will create a ``Trainable`` object on a separate process (using the :ref:`Ray Actor API <actor-guide>`).
|
||||
As a subclass of ``tune.Trainable``, Tune will create a ``Trainable`` object on a
|
||||
separate process (using the :ref:`Ray Actor API <actor-guide>`).
|
||||
|
||||
1. ``setup`` function is invoked once training starts.
|
||||
2. ``step`` is invoked **multiple times**. Each time, the Trainable object executes one logical iteration of training in the tuning process, which may include one or more iterations of actual training.
|
||||
2. ``step`` is invoked **multiple times**.
|
||||
Each time, the Trainable object executes one logical iteration of training in the tuning process,
|
||||
which may include one or more iterations of actual training.
|
||||
3. ``cleanup`` is invoked when training is finished.
|
||||
|
||||
.. tip:: As a rule of thumb, the execution time of ``step`` should be large enough to avoid overheads (i.e. more than a few seconds), but short enough to report progress periodically (i.e. at most a few minutes).
|
||||
.. tip:: As a rule of thumb, the execution time of ``step`` should be large enough to avoid overheads
|
||||
(i.e. more than a few seconds), but short enough to report progress periodically (i.e. at most a few minutes).
|
||||
|
||||
You'll notice that Ray Tune will output extra values in addition to the user reported metrics, such as ``iterations_since_restore``. See :ref:`tune-autofilled-metrics` for an explanation/glossary of these values.
|
||||
You'll notice that Ray Tune will output extra values in addition to the user reported metrics,
|
||||
such as ``iterations_since_restore``.
|
||||
See :ref:`tune-autofilled-metrics` for an explanation/glossary of these values.
|
||||
|
||||
.. _tune-trainable-save-restore:
|
||||
|
||||
|
@ -209,7 +226,9 @@ You can also implement checkpoint/restore using the Trainable Class API:
|
|||
|
||||
You can checkpoint with three different mechanisms: manually, periodically, and at termination.
|
||||
|
||||
**Manual Checkpointing**: A custom Trainable can manually trigger checkpointing by returning ``should_checkpoint: True`` (or ``tune.result.SHOULD_CHECKPOINT: True``) in the result dictionary of `step`. This can be especially helpful in spot instances:
|
||||
**Manual Checkpointing**: A custom Trainable can manually trigger checkpointing by returning ``should_checkpoint: True``
|
||||
(or ``tune.result.SHOULD_CHECKPOINT: True``) in the result dictionary of `step`.
|
||||
This can be especially helpful in spot instances:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -221,7 +240,9 @@ You can checkpoint with three different mechanisms: manually, periodically, and
|
|||
return result
|
||||
|
||||
|
||||
**Periodic Checkpointing**: periodic checkpointing can be used to provide fault-tolerance for experiments. This can be enabled by setting ``checkpoint_freq=<int>`` and ``max_failures=<int>`` to checkpoint trials every *N* iterations and recover from up to *M* crashes per trial, e.g.:
|
||||
**Periodic Checkpointing**: periodic checkpointing can be used to provide fault-tolerance for experiments.
|
||||
This can be enabled by setting ``checkpoint_freq=<int>`` and ``max_failures=<int>`` to checkpoint trials
|
||||
every *N* iterations and recover from up to *M* crashes per trial, e.g.:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -231,8 +252,8 @@ You can checkpoint with three different mechanisms: manually, periodically, and
|
|||
max_failures=5,
|
||||
)
|
||||
|
||||
**Checkpointing at Termination**: The checkpoint_freq may not coincide with the exact end of an experiment. If you want a checkpoint to be created at the end
|
||||
of a trial, you can additionally set the ``checkpoint_at_end=True``:
|
||||
**Checkpointing at Termination**: The checkpoint_freq may not coincide with the exact end of an experiment.
|
||||
If you want a checkpoint to be created at the end of a trial, you can additionally set the ``checkpoint_at_end=True``:
|
||||
|
||||
.. code-block:: python
|
||||
:emphasize-lines: 5
|
||||
|
@ -262,9 +283,12 @@ Advanced: Reusing Actors
|
|||
|
||||
.. note:: This feature is only for the Trainable Class API.
|
||||
|
||||
Your Trainable can often take a long time to start. To avoid this, you can do ``tune.run(reuse_actors=True)`` to reuse the same Trainable Python process and object for multiple hyperparameters.
|
||||
Your Trainable can often take a long time to start.
|
||||
To avoid this, you can do ``tune.run(reuse_actors=True)`` to reuse the same Trainable Python process and
|
||||
object for multiple hyperparameters.
|
||||
|
||||
This requires you to implement ``Trainable.reset_config``, which provides a new set of hyperparameters. It is up to the user to correctly update the hyperparameters of your trainable.
|
||||
This requires you to implement ``Trainable.reset_config``, which provides a new set of hyperparameters.
|
||||
It is up to the user to correctly update the hyperparameters of your trainable.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
@ -294,7 +318,11 @@ This requires you to implement ``Trainable.reset_config``, which provides a new
|
|||
Advanced Resource Allocation
|
||||
----------------------------
|
||||
|
||||
Trainables can themselves be distributed. If your trainable function / class creates further Ray actors or tasks that also consume CPU / GPU resources, you will want to add more bundles to the :class:`PlacementGroupFactory` to reserve extra resource slots. For example, if a trainable class requires 1 GPU itself, but also launches 4 actors, each using another GPU, then you should use this:
|
||||
Trainables can themselves be distributed. If your trainable function / class creates further Ray actors or tasks
|
||||
that also consume CPU / GPU resources, you will want to add more bundles to the :class:`PlacementGroupFactory`
|
||||
to reserve extra resource slots.
|
||||
For example, if a trainable class requires 1 GPU itself, but also launches 4 actors, each using another GPU,
|
||||
then you should use this:
|
||||
|
||||
.. code-block:: python
|
||||
:emphasize-lines: 4-10
|
||||
|
@ -311,7 +339,8 @@ Trainables can themselves be distributed. If your trainable function / class cre
|
|||
])
|
||||
)
|
||||
|
||||
The ``Trainable`` also provides the ``default_resource_requests`` interface to automatically declare the ``resources_per_trial`` based on the given configuration.
|
||||
The ``Trainable`` also provides the ``default_resource_requests`` interface to automatically
|
||||
declare the ``resources_per_trial`` based on the given configuration.
|
||||
|
||||
It is also possible to specify memory (``"memory"``, in bytes) and custom resource requirements.
|
||||
|
||||
|
|
311
doc/source/tune/doc_code/faq.py
Normal file
311
doc/source/tune/doc_code/faq.py
Normal file
|
@ -0,0 +1,311 @@
|
|||
# flake8: noqa
|
||||
|
||||
# __reproducible_start__
|
||||
import numpy as np
|
||||
from ray import tune
|
||||
|
||||
|
||||
def train(config):
|
||||
# Set seed for trainable random result.
|
||||
# If you remove this line, you will get different results
|
||||
# each time you run the trial, even if the configuration
|
||||
# is the same.
|
||||
np.random.seed(config["seed"])
|
||||
random_result = np.random.uniform(0, 100, size=1).item()
|
||||
tune.report(result=random_result)
|
||||
|
||||
|
||||
# Set seed for Ray Tune's random search.
|
||||
# If you remove this line, you will get different configurations
|
||||
# each time you run the script.
|
||||
np.random.seed(1234)
|
||||
tune.run(
|
||||
train,
|
||||
config={"seed": tune.randint(0, 1000)},
|
||||
search_alg=tune.suggest.BasicVariantGenerator(),
|
||||
num_samples=10,
|
||||
)
|
||||
# __reproducible_end__
|
||||
|
||||
# __basic_config_start__
|
||||
config = {"a": {"x": tune.uniform(0, 10)}, "b": tune.choice([1, 2, 3])}
|
||||
# __basic_config_end__
|
||||
|
||||
# __conditional_spaces_start__
|
||||
config = {
|
||||
"a": tune.randint(5, 10),
|
||||
"b": tune.sample_from(lambda spec: np.random.randint(0, spec.config.a)),
|
||||
}
|
||||
# __conditional_spaces_end__
|
||||
|
||||
|
||||
# __iter_start__
|
||||
def _iter():
|
||||
for a in range(5, 10):
|
||||
for b in range(a):
|
||||
yield a, b
|
||||
|
||||
|
||||
config = {
|
||||
"ab": tune.grid_search(list(_iter())),
|
||||
}
|
||||
# __iter_end__
|
||||
|
||||
|
||||
def train(config):
|
||||
random_result = np.random.uniform(0, 100, size=1).item()
|
||||
tune.report(result=random_result)
|
||||
|
||||
|
||||
train_fn = train
|
||||
MOCK = True
|
||||
# Note we put this check here to make sure at least the syntax of
|
||||
# the code is correct. Some of these snippets simply can't be run on the nose.
|
||||
|
||||
if not MOCK:
|
||||
# __resources_start__
|
||||
tune.run(
|
||||
train_fn,
|
||||
resources_per_trial={"cpu": 2, "gpu": 0.5, "custom_resources": {"hdd": 80}},
|
||||
)
|
||||
# __resources_end__
|
||||
|
||||
# __resources_pgf_start__
|
||||
tune.run(
|
||||
train_fn,
|
||||
resources_per_trial=tune.PlacementGroupFactory(
|
||||
[
|
||||
{"CPU": 2, "GPU": 0.5, "hdd": 80},
|
||||
{"CPU": 1},
|
||||
{"CPU": 1},
|
||||
],
|
||||
strategy="PACK",
|
||||
),
|
||||
)
|
||||
# __resources_pgf_end__
|
||||
|
||||
metric = None
|
||||
|
||||
# __modin_start__
|
||||
def train_fn(config, checkpoint_dir=None):
|
||||
# some Modin operations here
|
||||
# import modin.pandas as pd
|
||||
tune.report(metric=metric)
|
||||
|
||||
tune.run(
|
||||
train_fn,
|
||||
resources_per_trial=tune.PlacementGroupFactory(
|
||||
[
|
||||
{"CPU": 1}, # this bundle will be used by the trainable itself
|
||||
{"CPU": 1}, # this bundle will be used by Modin
|
||||
],
|
||||
strategy="PACK",
|
||||
),
|
||||
)
|
||||
# __modin_end__
|
||||
|
||||
# __huge_data_start__
|
||||
from ray import tune
|
||||
import numpy as np
|
||||
|
||||
|
||||
def train(config, checkpoint_dir=None, num_epochs=5, data=None):
|
||||
for i in range(num_epochs):
|
||||
for sample in data:
|
||||
# ... train on sample
|
||||
pass
|
||||
|
||||
|
||||
# Some huge dataset
|
||||
data = np.random.random(size=100000000)
|
||||
|
||||
tune.run(tune.with_parameters(train, num_epochs=5, data=data))
|
||||
# __huge_data_end__
|
||||
|
||||
|
||||
# __seeded_1_start__
|
||||
import random
|
||||
|
||||
random.seed(1234)
|
||||
output = [random.randint(0, 100) for _ in range(10)]
|
||||
|
||||
# The output will always be the same.
|
||||
assert output == [99, 56, 14, 0, 11, 74, 4, 85, 88, 10]
|
||||
# __seeded_1_end__
|
||||
|
||||
|
||||
# __seeded_2_start__
|
||||
# This should suffice to initialize the RNGs for most Python-based libraries
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
random.seed(1234)
|
||||
np.random.seed(5678)
|
||||
# __seeded_2_end__
|
||||
|
||||
|
||||
# __torch_tf_seeds_start__
|
||||
import torch
|
||||
|
||||
torch.manual_seed(0)
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
tf.random.set_seed(0)
|
||||
# __torch_tf_seeds_end__
|
||||
|
||||
# __torch_seed_example_start__
|
||||
import random
|
||||
import numpy as np
|
||||
from ray import tune
|
||||
|
||||
|
||||
def trainable(config):
|
||||
# config["seed"] is set deterministically, but differs between training runs
|
||||
random.seed(config["seed"])
|
||||
np.random.seed(config["seed"])
|
||||
# torch.manual_seed(config["seed"])
|
||||
# ... training code
|
||||
|
||||
|
||||
config = {
|
||||
"seed": tune.randint(0, 10000),
|
||||
# ...
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Set seed for the search algorithms/schedulers
|
||||
random.seed(1234)
|
||||
np.random.seed(1234)
|
||||
# Don't forget to check if the search alg has a `seed` parameter
|
||||
tune.run(trainable, config=config)
|
||||
# __torch_seed_example_end__
|
||||
|
||||
# __large_data_start__
|
||||
from ray import tune
|
||||
import numpy as np
|
||||
|
||||
|
||||
def f(config, data=None):
|
||||
pass
|
||||
# use data
|
||||
|
||||
|
||||
data = np.random.random(size=100000000)
|
||||
|
||||
tune.run(tune.with_parameters(f, data=data))
|
||||
# __large_data_end__
|
||||
|
||||
MyTrainableClass = None
|
||||
custom_sync_str_or_func = ""
|
||||
|
||||
if not MOCK:
|
||||
# __log_1_start__
|
||||
tune.run(
|
||||
MyTrainableClass,
|
||||
local_dir="~/ray_results",
|
||||
sync_config=tune.SyncConfig(upload_dir="s3://my-log-dir"),
|
||||
)
|
||||
# __log_1_end__
|
||||
|
||||
# __log_2_start__
|
||||
tune.run(
|
||||
MyTrainableClass,
|
||||
sync_config=tune.SyncConfig(
|
||||
upload_dir="s3://my-log-dir", syncer=custom_sync_str_or_func
|
||||
),
|
||||
)
|
||||
# __log_2_end__
|
||||
|
||||
# __sync_start__
|
||||
import subprocess
|
||||
|
||||
|
||||
def custom_sync_func(source, target):
|
||||
# run other workload here
|
||||
sync_cmd = "s3 {source} {target}".format(source=source, target=target)
|
||||
sync_process = subprocess.Popen(sync_cmd, shell=True)
|
||||
sync_process.wait()
|
||||
|
||||
|
||||
# __sync_end__
|
||||
|
||||
if not MOCK:
|
||||
# __docker_start__
|
||||
from ray import tune
|
||||
from ray.tune.integration.docker import DockerSyncer
|
||||
|
||||
sync_config = tune.SyncConfig(syncer=DockerSyncer)
|
||||
|
||||
tune.run(train, sync_config=sync_config)
|
||||
# __docker_end__
|
||||
|
||||
# __s3_start__
|
||||
from ray import tune
|
||||
|
||||
tune.run(
|
||||
tune.durable(train_fn),
|
||||
# ...,
|
||||
sync_config=tune.SyncConfig(upload_dir="s3://your-s3-bucket/durable-trial/"),
|
||||
)
|
||||
# __s3_end__
|
||||
|
||||
# __sync_config_start__
|
||||
from ray import tune
|
||||
|
||||
tune.run(
|
||||
train_fn,
|
||||
# ...,
|
||||
local_dir="/path/to/shared/storage",
|
||||
sync_config=tune.SyncConfig(
|
||||
# Do not sync because we are on shared storage
|
||||
syncer=None
|
||||
),
|
||||
)
|
||||
# __sync_config_end__
|
||||
|
||||
# __k8s_start__
|
||||
from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer
|
||||
|
||||
sync_config = tune.SyncConfig(syncer=NamespacedKubernetesSyncer("ray"))
|
||||
|
||||
tune.run(train, sync_config=sync_config)
|
||||
# __k8s_end__
|
||||
|
||||
import ray
|
||||
|
||||
ray.shutdown()
|
||||
|
||||
# __local_start__
|
||||
import ray
|
||||
|
||||
ray.init(local_mode=True)
|
||||
# __local_end__
|
||||
|
||||
# __grid_search_start__
|
||||
parameters = {
|
||||
"qux": tune.sample_from(lambda spec: 2 + 2),
|
||||
"bar": tune.grid_search([True, False]),
|
||||
"foo": tune.grid_search([1, 2, 3]),
|
||||
"baz": "asd", # a constant value
|
||||
}
|
||||
|
||||
tune.run(train_fn, config=parameters)
|
||||
# __grid_search_end__
|
||||
|
||||
# __grid_search_2_start__
|
||||
# num_samples=10 repeats the 3x3 grid search 10 times, for a total of 90 trials
|
||||
tune.run(
|
||||
train_fn,
|
||||
name="my_trainable",
|
||||
config={
|
||||
"alpha": tune.uniform(100, 200),
|
||||
"beta": tune.sample_from(lambda spec: spec.config.alpha * np.random.normal()),
|
||||
"nn_layers": [
|
||||
tune.grid_search([16, 64, 256]),
|
||||
tune.grid_search([16, 64, 256]),
|
||||
],
|
||||
},
|
||||
num_samples=10,
|
||||
)
|
||||
# __grid_search_2_end__
|
31
doc/source/tune/doc_code/keras_hyperopt.py
Normal file
31
doc/source/tune/doc_code/keras_hyperopt.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
# flake8: noqa
|
||||
|
||||
accuracy = 42
|
||||
|
||||
# __keras_hyperopt_start__
|
||||
from ray import tune
|
||||
from ray.tune.suggest.hyperopt import HyperOptSearch
|
||||
import keras
|
||||
|
||||
|
||||
# 1. Wrap a Keras model in an objective function.
|
||||
def objective(config):
|
||||
model = keras.models.Sequential()
|
||||
model.add(keras.layers.Dense(784, activation=config["activation"]))
|
||||
model.add(keras.layers.Dense(10, activation="softmax"))
|
||||
|
||||
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
|
||||
# model.fit(...)
|
||||
# loss, accuracy = model.evaluate(...)
|
||||
return {"accuracy": accuracy}
|
||||
|
||||
|
||||
# 2. Define a search space and initialize the search algorithm.
|
||||
search_space = {"activation": tune.choice(["relu", "tanh"])}
|
||||
algo = HyperOptSearch()
|
||||
|
||||
# 3. Start a Tune run that maximizes accuracy.
|
||||
analysis = tune.run(
|
||||
objective, search_alg=algo, config=search_space, metric="accuracy", mode="max"
|
||||
)
|
||||
# __keras_hyperopt_end__
|
139
doc/source/tune/doc_code/key_concepts.py
Normal file
139
doc/source/tune/doc_code/key_concepts.py
Normal file
|
@ -0,0 +1,139 @@
|
|||
# flake8: noqa
|
||||
|
||||
# __function_api_start__
|
||||
from ray import tune
|
||||
|
||||
|
||||
def objective(x, a, b): # Define an objective function.
|
||||
return a * (x ** 0.5) + b
|
||||
|
||||
|
||||
def trainable(config): # Pass a "config" dictionary into your trainable.
|
||||
|
||||
for x in range(20): # "Train" for 20 iterations and compute intermediate scores.
|
||||
score = objective(x, config["a"], config["b"])
|
||||
|
||||
tune.report(score=score) # Send the score to Tune.
|
||||
|
||||
|
||||
# __function_api_end__
|
||||
|
||||
|
||||
# __class_api_start__
|
||||
from ray import tune
|
||||
|
||||
|
||||
def objective(x, a, b):
|
||||
return a * (x ** 2) + b
|
||||
|
||||
|
||||
class Trainable(tune.Trainable):
|
||||
def setup(self, config):
|
||||
# config (dict): A dict of hyperparameters
|
||||
self.x = 0
|
||||
self.a = config["a"]
|
||||
self.b = config["b"]
|
||||
|
||||
def step(self): # This is called iteratively.
|
||||
score = objective(self.x, self.a, self.b)
|
||||
self.x += 1
|
||||
return {"score": score}
|
||||
|
||||
# __class_api_end__
|
||||
|
||||
# TODO: this example does not work as advertised. Errors out.
|
||||
def save_checkpoint(self, checkpoint_dir):
|
||||
pass
|
||||
|
||||
def load_checkpoint(self, checkpoint_dir):
|
||||
pass
|
||||
|
||||
|
||||
# __run_tunable_start__
|
||||
# Pass in a Trainable class or function, along with a search space "config".
|
||||
tune.run(trainable, config={"a": 2, "b": 4})
|
||||
# __run_tunable_end__
|
||||
|
||||
# __run_tunable_samples_start__
|
||||
tune.run(trainable, config={"a": 2, "b": 4}, num_samples=10)
|
||||
# __run_tunable_samples_end__
|
||||
|
||||
# __search_space_start__
|
||||
space = {"a": tune.uniform(0, 1), "b": tune.uniform(0, 1)}
|
||||
tune.run(trainable, config=space, num_samples=10)
|
||||
# __search_space_end__
|
||||
|
||||
# __config_start__
|
||||
config = {
|
||||
"uniform": tune.uniform(-5, -1), # Uniform float between -5 and -1
|
||||
"quniform": tune.quniform(3.2, 5.4, 0.2), # Round to increments of 0.2
|
||||
"loguniform": tune.loguniform(1e-4, 1e-1), # Uniform float in log space
|
||||
"qloguniform": tune.qloguniform(1e-4, 1e-1, 5e-5), # Round to increments of 0.00005
|
||||
"randn": tune.randn(10, 2), # Normal distribution with mean 10 and sd 2
|
||||
"qrandn": tune.qrandn(10, 2, 0.2), # Round to increments of 0.2
|
||||
"randint": tune.randint(-9, 15), # Random integer between -9 and 15
|
||||
"qrandint": tune.qrandint(-21, 12, 3), # Round to increments of 3 (includes 12)
|
||||
"lograndint": tune.lograndint(1, 10), # Random integer in log space
|
||||
"qlograndint": tune.qlograndint(1, 10, 2), # Round to increments of 2
|
||||
"choice": tune.choice(["a", "b", "c"]), # Choose one of these options uniformly
|
||||
"func": tune.sample_from(
|
||||
lambda spec: spec.config.uniform * 0.01
|
||||
), # Depends on other value
|
||||
"grid": tune.grid_search([32, 64, 128]), # Search over all these values
|
||||
}
|
||||
# __config_end__
|
||||
|
||||
# __bayes_start__
|
||||
from ray.tune.suggest.bayesopt import BayesOptSearch
|
||||
|
||||
# Define the search space
|
||||
search_space = {"a": tune.uniform(0, 1), "b": tune.uniform(0, 20)}
|
||||
|
||||
algo = BayesOptSearch(random_search_steps=4)
|
||||
|
||||
tune.run(
|
||||
trainable,
|
||||
config=search_space,
|
||||
metric="score",
|
||||
mode="min",
|
||||
search_alg=algo,
|
||||
stop={"training_iteration": 20},
|
||||
)
|
||||
# __bayes_end__
|
||||
|
||||
# __hyperband_start__
|
||||
from ray.tune.schedulers import HyperBandScheduler
|
||||
|
||||
# Create HyperBand scheduler and minimize the score
|
||||
hyperband = HyperBandScheduler(metric="score", mode="max")
|
||||
|
||||
config = {"a": tune.uniform(0, 1), "b": tune.uniform(0, 1)}
|
||||
|
||||
tune.run(trainable, config=config, num_samples=20, scheduler=hyperband)
|
||||
# __hyperband_end__
|
||||
|
||||
# __analysis_start__
|
||||
analysis = tune.run(
|
||||
trainable,
|
||||
config=config,
|
||||
metric="score",
|
||||
mode="min",
|
||||
search_alg=BayesOptSearch(random_search_steps=4),
|
||||
stop={"training_iteration": 20},
|
||||
)
|
||||
|
||||
best_trial = analysis.best_trial # Get best trial
|
||||
best_config = analysis.best_config # Get best trial's hyperparameters
|
||||
best_logdir = analysis.best_logdir # Get best trial's logdir
|
||||
best_checkpoint = analysis.best_checkpoint # Get best trial's best checkpoint
|
||||
best_result = analysis.best_result # Get best trial's last results
|
||||
best_result_df = analysis.best_result_df # Get best result as pandas dataframe
|
||||
# __analysis_end__
|
||||
|
||||
# __results_start__
|
||||
# Get a dataframe with the last results for each trial
|
||||
df_results = analysis.results_df
|
||||
|
||||
# Get a dataframe of results for a specific score or mode
|
||||
df = analysis.dataframe(metric="score", mode="max")
|
||||
# __results_end__
|
115
doc/source/tune/doc_code/pytorch_optuna.py
Normal file
115
doc/source/tune/doc_code/pytorch_optuna.py
Normal file
|
@ -0,0 +1,115 @@
|
|||
# flake8: noqa
|
||||
|
||||
import os
|
||||
from filelock import FileLock
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torchvision import datasets, transforms
|
||||
|
||||
EPOCH_SIZE = 512
|
||||
TEST_SIZE = 256
|
||||
|
||||
|
||||
def train(model, optimizer, train_loader, device=None):
|
||||
device = device or torch.device("cpu")
|
||||
model.train()
|
||||
for batch_idx, (data, target) in enumerate(train_loader):
|
||||
if batch_idx * len(data) > EPOCH_SIZE:
|
||||
return
|
||||
data, target = data.to(device), target.to(device)
|
||||
optimizer.zero_grad()
|
||||
output = model(data)
|
||||
loss = F.nll_loss(output, target)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
|
||||
def test(model, data_loader, device=None):
|
||||
device = device or torch.device("cpu")
|
||||
model.eval()
|
||||
correct = 0
|
||||
total = 0
|
||||
with torch.no_grad():
|
||||
for batch_idx, (data, target) in enumerate(data_loader):
|
||||
if batch_idx * len(data) > TEST_SIZE:
|
||||
break
|
||||
data, target = data.to(device), target.to(device)
|
||||
outputs = model(data)
|
||||
_, predicted = torch.max(outputs.data, 1)
|
||||
total += target.size(0)
|
||||
correct += (predicted == target).sum().item()
|
||||
|
||||
return correct / total
|
||||
|
||||
|
||||
def load_data():
|
||||
mnist_transforms = transforms.Compose(
|
||||
[transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
|
||||
)
|
||||
with FileLock(os.path.expanduser("~/data.lock")):
|
||||
train_loader = torch.utils.data.DataLoader(
|
||||
datasets.MNIST(
|
||||
"~/data", train=True, download=True, transform=mnist_transforms
|
||||
),
|
||||
batch_size=64,
|
||||
shuffle=True,
|
||||
)
|
||||
test_loader = torch.utils.data.DataLoader(
|
||||
datasets.MNIST(
|
||||
"~/data", train=False, download=True, transform=mnist_transforms
|
||||
),
|
||||
batch_size=64,
|
||||
shuffle=True,
|
||||
)
|
||||
return train_loader, test_loader
|
||||
|
||||
|
||||
class ConvNet(nn.Module):
|
||||
def __init__(self):
|
||||
super(ConvNet, self).__init__()
|
||||
self.conv1 = nn.Conv2d(1, 3, kernel_size=3)
|
||||
self.fc = nn.Linear(192, 10)
|
||||
|
||||
def forward(self, x):
|
||||
x = F.relu(F.max_pool2d(self.conv1(x), 3))
|
||||
x = x.view(-1, 192)
|
||||
x = self.fc(x)
|
||||
return F.log_softmax(x, dim=1)
|
||||
|
||||
|
||||
# __pytorch_optuna_start__
|
||||
# 1. Wrap your PyTorch model in an objective function.
|
||||
import torch
|
||||
from ray import tune
|
||||
from ray.tune.suggest.optuna import OptunaSearch
|
||||
|
||||
|
||||
# 1. Wrap a PyTorch model in an objective function.
|
||||
def objective(config):
|
||||
train_loader, test_loader = load_data() # Load some data
|
||||
model = ConvNet().to("cpu") # Create a PyTorch conv net
|
||||
optimizer = torch.optim.SGD( # Tune the optimizer
|
||||
model.parameters(), lr=config["lr"], momentum=config["momentum"]
|
||||
)
|
||||
|
||||
while True:
|
||||
train(model, optimizer, train_loader) # Train the model
|
||||
acc = test(model, test_loader) # Compute test accuracy
|
||||
tune.report(mean_accuracy=acc) # Report to Tune
|
||||
|
||||
|
||||
# 2. Define a search space and initialize the search algorithm.
|
||||
search_space = {"lr": tune.loguniform(1e-4, 1e-2), "momentum": tune.uniform(0.1, 0.9)}
|
||||
algo = OptunaSearch()
|
||||
|
||||
# 3. Start a Tune run that maximizes mean accuracy and stops after 5 iterations.
|
||||
analysis = tune.run(
|
||||
objective,
|
||||
metric="mean_accuracy",
|
||||
mode="max",
|
||||
search_alg=algo,
|
||||
stop={"training_iteration": 5},
|
||||
config=search_space,
|
||||
)
|
||||
print("Best config is:", analysis.best_config)
|
||||
# __pytorch_optuna_end__
|
|
@ -1,6 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
cifar10_pytorch
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/cifar10_pytorch.py
|
|
@ -1,6 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
comet_example
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/tune_comet_example.py
|
202
doc/source/tune/examples/horovod_simple.ipynb
Normal file
202
doc/source/tune/examples/horovod_simple.ipynb
Normal file
|
@ -0,0 +1,202 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"(tune-horovod-example)=\n",
|
||||
"\n",
|
||||
"# Using Horovod with Tune\n",
|
||||
"\n",
|
||||
"```{image} /images/horovod.png\n",
|
||||
":align: center\n",
|
||||
":alt: Horovod Logo\n",
|
||||
":height: 120px\n",
|
||||
":target: https://horovod.ai/\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## Example"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"import ray\n",
|
||||
"from ray import tune\n",
|
||||
"from ray.tune.integration.horovod import DistributedTrainableCreator\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def sq(x):\n",
|
||||
" m2 = 1.0\n",
|
||||
" m1 = -20.0\n",
|
||||
" m0 = 50.0\n",
|
||||
" return m2 * x * x + m1 * x + m0\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def qu(x):\n",
|
||||
" m3 = 10.0\n",
|
||||
" m2 = 5.0\n",
|
||||
" m1 = -20.0\n",
|
||||
" m0 = -5.0\n",
|
||||
" return m3 * x * x * x + m2 * x * x + m1 * x + m0\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class Net(torch.nn.Module):\n",
|
||||
" def __init__(self, mode=\"sq\"):\n",
|
||||
" super(Net, self).__init__()\n",
|
||||
"\n",
|
||||
" if mode == \"square\":\n",
|
||||
" self.mode = 0\n",
|
||||
" self.param = torch.nn.Parameter(torch.FloatTensor([1.0, -1.0]))\n",
|
||||
" else:\n",
|
||||
" self.mode = 1\n",
|
||||
" self.param = torch.nn.Parameter(torch.FloatTensor([1.0, -1.0, 1.0]))\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" if ~self.mode:\n",
|
||||
" return x * x + self.param[0] * x + self.param[1]\n",
|
||||
" else:\n",
|
||||
" return_val = 10 * x * x * x\n",
|
||||
" return_val += self.param[0] * x * x\n",
|
||||
" return_val += self.param[1] * x + self.param[2]\n",
|
||||
" return return_val\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def train(config):\n",
|
||||
" import torch\n",
|
||||
" import horovod.torch as hvd\n",
|
||||
"\n",
|
||||
" hvd.init()\n",
|
||||
" device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
||||
" mode = config[\"mode\"]\n",
|
||||
" net = Net(mode).to(device)\n",
|
||||
" optimizer = torch.optim.SGD(\n",
|
||||
" net.parameters(),\n",
|
||||
" lr=config[\"lr\"],\n",
|
||||
" )\n",
|
||||
" optimizer = hvd.DistributedOptimizer(optimizer)\n",
|
||||
"\n",
|
||||
" num_steps = 5\n",
|
||||
" print(hvd.size())\n",
|
||||
" np.random.seed(1 + hvd.rank())\n",
|
||||
" torch.manual_seed(1234)\n",
|
||||
" # To ensure consistent initialization across slots,\n",
|
||||
" hvd.broadcast_parameters(net.state_dict(), root_rank=0)\n",
|
||||
" hvd.broadcast_optimizer_state(optimizer, root_rank=0)\n",
|
||||
"\n",
|
||||
" start = time.time()\n",
|
||||
" x_max = config[\"x_max\"]\n",
|
||||
" for step in range(1, num_steps + 1):\n",
|
||||
" features = torch.Tensor(np.random.rand(1) * 2 * x_max - x_max).to(device)\n",
|
||||
" if mode == \"square\":\n",
|
||||
" labels = sq(features)\n",
|
||||
" else:\n",
|
||||
" labels = qu(features)\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" outputs = net(features)\n",
|
||||
" loss = torch.nn.MSELoss()(outputs, labels)\n",
|
||||
" loss.backward()\n",
|
||||
"\n",
|
||||
" optimizer.step()\n",
|
||||
" time.sleep(0.1)\n",
|
||||
" tune.report(loss=loss.item())\n",
|
||||
" total = time.time() - start\n",
|
||||
" print(f\"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def tune_horovod(\n",
|
||||
" hosts_per_trial, slots_per_host, num_samples, use_gpu, mode=\"square\", x_max=1.0\n",
|
||||
"):\n",
|
||||
" horovod_trainable = DistributedTrainableCreator(\n",
|
||||
" train,\n",
|
||||
" use_gpu=use_gpu,\n",
|
||||
" num_hosts=hosts_per_trial,\n",
|
||||
" num_slots=slots_per_host,\n",
|
||||
" replicate_pem=False,\n",
|
||||
" )\n",
|
||||
" analysis = tune.run(\n",
|
||||
" horovod_trainable,\n",
|
||||
" metric=\"loss\",\n",
|
||||
" mode=\"min\",\n",
|
||||
" config={\"lr\": tune.uniform(0.1, 1), \"mode\": mode, \"x_max\": x_max},\n",
|
||||
" num_samples=num_samples,\n",
|
||||
" fail_fast=True,\n",
|
||||
" )\n",
|
||||
" print(\"Best hyperparameters found were: \", analysis.best_config)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" import argparse\n",
|
||||
"\n",
|
||||
" parser = argparse.ArgumentParser()\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--mode\", type=str, default=\"square\", choices=[\"square\", \"cubic\"]\n",
|
||||
" )\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--learning_rate\", type=float, default=0.1, dest=\"learning_rate\"\n",
|
||||
" )\n",
|
||||
" parser.add_argument(\"--x_max\", type=float, default=1.0, dest=\"x_max\")\n",
|
||||
" parser.add_argument(\"--gpu\", default=False, action=\"store_true\")\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--smoke-test\", default=True, action=\"store_true\", help=(\"Finish quickly for testing.\")\n",
|
||||
" )\n",
|
||||
" parser.add_argument(\"--hosts-per-trial\", type=int, default=1)\n",
|
||||
" parser.add_argument(\"--slots-per-host\", type=int, default=2)\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--server-address\",\n",
|
||||
" type=str,\n",
|
||||
" default=None,\n",
|
||||
" required=False,\n",
|
||||
" help=\"The address of server to connect to if using \" \"Ray Client.\",\n",
|
||||
" )\n",
|
||||
" args, _ = parser.parse_known_args()\n",
|
||||
"\n",
|
||||
" if args.smoke_test:\n",
|
||||
" ray.init(num_cpus=2)\n",
|
||||
" elif args.server_address:\n",
|
||||
" ray.init(f\"ray://{args.server_address}\")\n",
|
||||
"\n",
|
||||
" # import ray\n",
|
||||
" # ray.init(address=\"auto\") # assumes ray is started with ray up\n",
|
||||
"\n",
|
||||
" tune_horovod(\n",
|
||||
" hosts_per_trial=args.hosts_per_trial,\n",
|
||||
" slots_per_host=args.slots_per_host,\n",
|
||||
" num_samples=2 if args.smoke_test else 10,\n",
|
||||
" use_gpu=args.gpu,\n",
|
||||
" mode=args.mode,\n",
|
||||
" x_max=args.x_max,\n",
|
||||
" )"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -1,6 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
horovod_simple
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/horovod_simple.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
async_hyperband_example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Asynchronous HyperBand Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/async_hyperband_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
ax_example
|
||||
AX Example
|
||||
~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/ax_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
bayesopt_example
|
||||
BayesOpt Example
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/bayesopt_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
blendsearch_example
|
||||
Blendsearch Example
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/blendsearch_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
bohb_example
|
||||
~~~~~~~~~~~~~~~
|
||||
BOHB Example
|
||||
~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/bohb_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
cfo_example
|
||||
CFO Example
|
||||
~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/cfo_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
custom_func_checkpointing
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Custom Checkpointing Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/custom_func_checkpointing.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
ddp_mnist_torch
|
||||
~~~~~~~~~~~~~~~
|
||||
DDP Mnist Torch Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/ddp_mnist_torch.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
dragonfly_example
|
||||
Dragonfly Example
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/dragonfly_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
durable_trainable_example
|
||||
Durable Trainable Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/durable_trainable_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
genetic_example
|
||||
~~~~~~~~~~~~~~~
|
||||
Genetic Search Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/genetic_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
hebo_example
|
||||
~~~~~~~~~~~~~~~
|
||||
HEBO Example
|
||||
~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/hebo_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
hyperband_example
|
||||
HyperBand Example
|
||||
=================
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/hyperband_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
hyperband_function_example
|
||||
HyperBand Function Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/hyperband_function_example.py
|
|
@ -1,8 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
hyperopt_conditional_search_space_example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/hyperopt_conditional_search_space_example.py
|
||||
:orphan:
|
||||
|
||||
Hyperopt Conditional Search Space Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/hyperopt_conditional_search_space_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
logging_example
|
||||
Logging Example
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/logging_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
mlflow_example
|
||||
MLflow Example
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/mlflow_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
mlflow_ptl_example
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
MLflow PyTorch Lightning Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/mlflow_ptl.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
mnist_ptl_mini
|
||||
~~~~~~~~~~~~~~
|
||||
MNIST PyTorch Lightning Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/mnist_ptl_mini.py
|
85
doc/source/tune/examples/includes/mnist_pytorch.rst
Normal file
85
doc/source/tune/examples/includes/mnist_pytorch.rst
Normal file
|
@ -0,0 +1,85 @@
|
|||
:orphan:
|
||||
|
||||
MNIST PyTorch Example
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/mnist_pytorch.py
|
||||
|
||||
.. TODO: test this code snippet below
|
||||
|
||||
.. _tune-torch-ddp:
|
||||
|
||||
Advanced: Distributed training with DistributedDataParallel
|
||||
-----------------------------------------------------------
|
||||
|
||||
Some models require multiple nodes to train in a short amount of time.
|
||||
Ray Tune allows you to easily do distributed data parallel training in addition to distributed hyperparameter tuning.
|
||||
|
||||
You can wrap your model in ``torch.nn.parallel.DistributedDataParallel`` to support distributed data parallel training:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray.util.sgd.torch import is_distributed_trainable
|
||||
from torch.nn.parallel import DistributedDataParallel
|
||||
|
||||
def train_cifar(config, checkpoint_dir=None, data_dir=None):
|
||||
net = Net(config["l1"], config["l2"])
|
||||
|
||||
device = "cpu"
|
||||
|
||||
#### Using distributed data parallel training
|
||||
if is_distributed_trainable():
|
||||
net = DistributedDataParallel(net)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
|
||||
net.to(device)
|
||||
|
||||
|
||||
If using checkpointing, be sure to use a :ref:`special checkpoint context manager <tune-ddp-doc>`,
|
||||
``distributed_checkpoint_dir`` that avoids redundant checkpointing across multiple processes:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray.util.sgd.torch import distributed_checkpoint_dir
|
||||
|
||||
#### Using distributed data parallel training
|
||||
# Inside `def train_cifar(...)`,
|
||||
# replace tune.checkpoint_dir() with the following
|
||||
# Avoids redundant checkpointing on different processes.
|
||||
with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
|
||||
path = os.path.join(checkpoint_dir, "checkpoint")
|
||||
torch.save((net.state_dict(), optimizer.state_dict()), path)
|
||||
|
||||
|
||||
Finally, we need to tell Ray Tune to start multiple distributed processes at once by using
|
||||
``ray.tune.integration.torch.DistributedTrainableCreator`` (:ref:`docs <tune-ddp-doc>`).
|
||||
This is essentially equivalent to running ``torch.distributed.launch`` for each hyperparameter trial:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# You'll probably want to be running on a distributed Ray cluster.
|
||||
# ray.init(address="auto")
|
||||
|
||||
from ray.util.sgd.integration.torch import DistributedTrainableCreator
|
||||
|
||||
distributed_train_cifar = DistributedTrainableCreator(
|
||||
partial(train_cifar, data_dir=data_dir),
|
||||
use_gpu=True,
|
||||
num_workers=2, # number of parallel workers to use
|
||||
num_cpus_per_worker=8
|
||||
)
|
||||
tune.run(
|
||||
distributed_train_cifar,
|
||||
resources_per_trial=None,
|
||||
config=config,
|
||||
num_samples=num_samples,
|
||||
...
|
||||
)
|
||||
|
||||
See an :doc:`end-to-end example here </tune/examples/includes/ddp_mnist_torch>`.
|
||||
|
||||
If you consider switching to PyTorch Lightning to get rid of some of your boilerplate
|
||||
training code, please know that we also have a walkthrough on :doc:`how to use Tune with
|
||||
PyTorch Lightning models </tune/examples/tune-pytorch-lightning>`.
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
mnist_pytorch_trainable
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
MNIST PyTorch Trainable Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/mnist_pytorch_trainable.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
nevergrad_example
|
||||
Nevergrad Example
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/nevergrad_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
optuna_define_by_run_example
|
||||
Optuna Define-By-Run Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/optuna_define_by_run_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
optuna_example
|
||||
Optuna Example
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/optuna_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
optuna_multiobjective_example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Optuna Multi-Objective Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/optuna_multiobjective_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
pb2_example
|
||||
PB2 Example
|
||||
~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/pb2_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
pb2_ppo_example
|
||||
PB2 PPO Example
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/pb2_ppo_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
pbt_convnet_function_example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
PBT ConvNet Example
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/pbt_convnet_function_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
pbt_example
|
||||
PBT Example
|
||||
~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/pbt_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
pbt_function
|
||||
~~~~~~~~~~~~
|
||||
PBT Function Example
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/pbt_function.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
pbt_memnn_example
|
||||
Memory NN Example
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/pbt_memnn_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
pbt_tune_cifar10_with_keras
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Keras Cifar10 Example
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/pbt_tune_cifar10_with_keras.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
sigopt_example
|
||||
SigOpt Example
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/sigopt_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
sigopt_multi_objective_example
|
||||
SigOpt Multi-Objective Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/sigopt_multi_objective_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
sigopt_prior_beliefs_example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
SigOpt Prior Belief Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/sigopt_prior_beliefs_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
skopt_example
|
||||
SkOpt Example
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/skopt_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
tf_mnist_example
|
||||
~~~~~~~~~~~~~~~~
|
||||
TensorFlow MNIST Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/tf_mnist_example.py
|
|
@ -1,7 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
xgboost_dynamic_resources_example
|
||||
XGBoost Dynamic Resources Example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/xgboost_dynamic_resources_example.py
|
|
@ -1,6 +1,6 @@
|
|||
:orphan:
|
||||
|
||||
zoopt_example
|
||||
ZOOpt Example
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/zoopt_example.py
|
|
@ -4,156 +4,276 @@
|
|||
Examples
|
||||
========
|
||||
|
||||
.. Keep this in sync with ray/python/ray/tune/examples/README.rst
|
||||
.. tip:: Check out :ref:`the Tune User Guides <tune-guides>` To learn more about Tune's features in depth.
|
||||
|
||||
If any example is broken, or if you'd like to add an example to this page, feel free to raise an issue on our Github repository.
|
||||
.. _tune-recipes:
|
||||
|
||||
.. tip:: Check out :ref:`the Tune tutorials page <tune-guides>` for guides on how to use Tune with your preferred machine learning library.
|
||||
Practical How-To Guides
|
||||
-----------------------
|
||||
|
||||
.. _tune-general-examples:
|
||||
Ray Tune integrates with many popular machine learning frameworks.
|
||||
Here you find a few practical examples showing you how to tune your models.
|
||||
At the end of these guides you will often find links to even more examples.
|
||||
|
||||
General Examples
|
||||
----------------
|
||||
.. panels::
|
||||
:container: container pb-4
|
||||
:column: col-md-4 px-2 py-2
|
||||
:img-top-cls: pt-5 w-75 d-block mx-auto
|
||||
|
||||
- :doc:`/tune/examples/tune_basic_example`: Simple example for doing a basic random and grid search.
|
||||
- :doc:`/tune/examples/async_hyperband_example`: Example of using a simple tuning function with AsyncHyperBandScheduler.
|
||||
- :doc:`/tune/examples/hyperband_function_example`: Example of using a Trainable function with HyperBandScheduler. Also uses the AsyncHyperBandScheduler.
|
||||
- :doc:`/tune/examples/pbt_function`: Example of using the function API with a PopulationBasedTraining scheduler.
|
||||
- :doc:`/tune/examples/pb2_example`: Example of using the Population-based Bandits (PB2) scheduler.
|
||||
- :doc:`/tune/examples/logging_example`: Example of custom loggers and custom trial directory naming.
|
||||
---
|
||||
:img-top: /images/tune-sklearn.png
|
||||
|
||||
**Trainable Class Examples**
|
||||
+++
|
||||
.. link-button:: tune-sklearn
|
||||
:type: ref
|
||||
:text: How To Use Tune's Scikit-Learn Adapters?
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
Though it is preferable to use the Function API, Tune also supports a Class-based API for training.
|
||||
---
|
||||
:img-top: /images/keras.png
|
||||
|
||||
- :doc:`/tune/examples/hyperband_example`: Example of using a Trainable class with HyperBandScheduler. Also uses the AsyncHyperBandScheduler.
|
||||
- :doc:`/tune/examples/pbt_example`: Example of using a Trainable class with PopulationBasedTraining scheduler.
|
||||
+++
|
||||
.. link-button:: tune-mnist-keras
|
||||
:type: ref
|
||||
:text: How To Use Tune With Keras & TF Models
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
.. - :doc:`/tune/examples/durable_trainable_example`: Example using a durable storage mechanism in the Trainable.
|
||||
---
|
||||
:img-top: /images/pytorch_logo.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-pytorch-cifar-ref
|
||||
:type: ref
|
||||
:text: How To Use Tune With PyTorch Models
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/pytorch_lightning_small.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-pytorch-lightning-ref
|
||||
:type: ref
|
||||
:text: How To Tune PyTorch Lightning Models
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/mxnet_logo.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-mxnet-example
|
||||
:type: ref
|
||||
:text: How To Tune MXNet Models
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/serve.svg
|
||||
|
||||
+++
|
||||
.. link-button:: tune-serve-integration-mnist
|
||||
:type: ref
|
||||
:text: Model Selection & Serving With Ray Serve
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /rllib/images/rllib-logo.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-rllib-example
|
||||
:type: ref
|
||||
:text: Tuning RL Experiments With Ray Tune & Ray Serve
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/xgboost_logo.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-xgboost-ref
|
||||
:type: ref
|
||||
:text: A Guide To Tuning XGBoost Parameters With Tune
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/lightgbm_logo.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-lightgbm-example
|
||||
:type: ref
|
||||
:text: A Guide To Tuning LightGBM Parameters With Tune
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/horovod.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-horovod-example
|
||||
:type: ref
|
||||
:text: A Guide To Tuning Horovod Parameters With Tune
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/hugging.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-huggingface-example
|
||||
:type: ref
|
||||
:text: A Guide To Tuning Huggingface Transformers With Tune
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
|
||||
---
|
||||
:img-top: /images/wandb_logo.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-wandb-ref
|
||||
:type: ref
|
||||
:text: Tracking Your Experiment Process Weights & Biases
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/mlflow.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-mlflow-ref
|
||||
:type: ref
|
||||
:text: Using MLflow Tracking & AutoLogging with Tune
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/comet_logo_full.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-comet-ref
|
||||
:type: ref
|
||||
:text: Using Comet with Ray Tune For Experiment Management
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
|
||||
Search Algorithm Examples
|
||||
-------------------------
|
||||
|
||||
- :doc:`/tune/examples/ax_example`: Example script showing usage of :ref:`AxSearch <tune-ax>` [`Ax website <https://ax.dev/>`__]
|
||||
- :doc:`/tune/examples/dragonfly_example`: Example script showing usage of :ref:`DragonflySearch <Dragonfly>` [`Dragonfly website <https://dragonfly-opt.readthedocs.io/>`__]
|
||||
- :doc:`/tune/examples/skopt_example`: Example script showing usage of :ref:`SkoptSearch <skopt>` [`Scikit-Optimize website <https://scikit-optimize.github.io>`__]
|
||||
- :doc:`/tune/examples/hyperopt_example`: Example script showing usage of :ref:`HyperOptSearch <tune-hyperopt>` [`HyperOpt website <http://hyperopt.github.io/hyperopt>`__]
|
||||
- :doc:`/tune/examples/hyperopt_conditional_search_space_example`: Example script showing usage of :ref:`HyperOptSearch <tune-hyperopt>` [`HyperOpt website <http://hyperopt.github.io/hyperopt>`__] with a conditional search space
|
||||
- :doc:`/tune/examples/bayesopt_example`: Example script showing usage of :ref:`BayesOptSearch <bayesopt>` [`BayesianOptimization website <https://github.com/fmfn/BayesianOptimization>`__]
|
||||
- :doc:`/tune/examples/blendsearch_example`: Example script showing usage of :ref:`BlendSearch <BlendSearch>` [`BlendSearch website <https://github.com/microsoft/FLAML/tree/main/flaml/tune>`__]
|
||||
- :doc:`/tune/examples/cfo_example`: Example script showing usage of :ref:`CFO <CFO>` [`CFO website <https://github.com/microsoft/FLAML/tree/main/flaml/tune>`__]
|
||||
- :doc:`/tune/examples/bohb_example`: Example script showing usage of :ref:`TuneBOHB <suggest-TuneBOHB>` [`BOHB website <https://github.com/automl/HpBandSter>`__]
|
||||
- :doc:`/tune/examples/nevergrad_example`: Example script showing usage of :ref:`NevergradSearch <nevergrad>` [`Nevergrad website <https://github.com/facebookresearch/nevergrad>`__]
|
||||
- :doc:`/tune/examples/optuna_example`: Example script showing usage of :ref:`OptunaSearch <tune-optuna>` [`Optuna website <https://optuna.org/>`__]
|
||||
- :doc:`/tune/examples/optuna_define_by_run_example`: Example script showing usage of :ref:`OptunaSearch <tune-optuna>` [`Optuna website <https://optuna.org/>`__] with a define-by-run function
|
||||
- :doc:`/tune/examples/optuna_multiobjective_example`: Example script showing usage of :ref:`OptunaSearch <tune-optuna>` [`Optuna website <https://optuna.org/>`__] for multi-objective optimization
|
||||
- :doc:`/tune/examples/zoopt_example`: Example script showing usage of :ref:`ZOOptSearch <zoopt>` [`ZOOpt website <https://github.com/polixir/ZOOpt>`__]
|
||||
- :doc:`/tune/examples/sigopt_example`: Example script showing usage of :ref:`SigOptSearch <sigopt>` [`SigOpt website <https://sigopt.com/>`__]
|
||||
- :doc:`/tune/examples/hebo_example`: Example script showing usage of :ref:`HEBOSearch <tune-hebo>` [`HEBO website <https://github.com/huawei-noah/HEBO/tree/master/HEBO>`__]
|
||||
.. TODO: make these panels with logos!
|
||||
|
||||
- :doc:`/tune/examples/includes/ax_example`:
|
||||
Example script showing usage of :ref:`AxSearch <tune-ax>` [`Ax website <https://ax.dev/>`__]
|
||||
- :doc:`/tune/examples/includes/dragonfly_example`:
|
||||
Example script showing usage of :ref:`DragonflySearch <Dragonfly>` [`Dragonfly website <https://dragonfly-opt.readthedocs.io/>`__]
|
||||
- :doc:`/tune/examples/includes/skopt_example`:
|
||||
Example script showing usage of :ref:`SkoptSearch <skopt>` [`Scikit-Optimize website <https://scikit-optimize.github.io>`__]
|
||||
- :doc:`/tune/examples/hyperopt_example`:
|
||||
Example script showing usage of :ref:`HyperOptSearch <tune-hyperopt>` [`HyperOpt website <http://hyperopt.github.io/hyperopt>`__]
|
||||
- :doc:`/tune/examples/includes/hyperopt_conditional_search_space_example`:
|
||||
Example script showing usage of :ref:`HyperOptSearch <tune-hyperopt>` [`HyperOpt website <http://hyperopt.github.io/hyperopt>`__] with a conditional search space
|
||||
- :doc:`/tune/examples/includes/bayesopt_example`:
|
||||
Example script showing usage of :ref:`BayesOptSearch <bayesopt>` [`BayesianOptimization website <https://github.com/fmfn/BayesianOptimization>`__]
|
||||
- :doc:`/tune/examples/includes/blendsearch_example`:
|
||||
Example script showing usage of :ref:`BlendSearch <BlendSearch>` [`BlendSearch website <https://github.com/microsoft/FLAML/tree/main/flaml/tune>`__]
|
||||
- :doc:`/tune/examples/includes/cfo_example`:
|
||||
Example script showing usage of :ref:`CFO <CFO>` [`CFO website <https://github.com/microsoft/FLAML/tree/main/flaml/tune>`__]
|
||||
- :doc:`/tune/examples/includes/bohb_example`:
|
||||
Example script showing usage of :ref:`TuneBOHB <suggest-TuneBOHB>` [`BOHB website <https://github.com/automl/HpBandSter>`__]
|
||||
- :doc:`/tune/examples/includes/nevergrad_example`:
|
||||
Example script showing usage of :ref:`NevergradSearch <nevergrad>` [`Nevergrad website <https://github.com/facebookresearch/nevergrad>`__]
|
||||
- :doc:`/tune/examples/includes/optuna_example`:
|
||||
Example script showing usage of :ref:`OptunaSearch <tune-optuna>` [`Optuna website <https://optuna.org/>`__]
|
||||
- :doc:`/tune/examples/includes/optuna_define_by_run_example`:
|
||||
Example script showing usage of :ref:`OptunaSearch <tune-optuna>` [`Optuna website <https://optuna.org/>`__] with a define-by-run function
|
||||
- :doc:`/tune/examples/includes/optuna_multiobjective_example`:
|
||||
Example script showing usage of :ref:`OptunaSearch <tune-optuna>` [`Optuna website <https://optuna.org/>`__] for multi-objective optimization
|
||||
- :doc:`/tune/examples/includes/zoopt_example`:
|
||||
Example script showing usage of :ref:`ZOOptSearch <zoopt>` [`ZOOpt website <https://github.com/polixir/ZOOpt>`__]
|
||||
- :doc:`/tune/examples/includes/sigopt_example`:
|
||||
Example script showing usage of :ref:`SigOptSearch <sigopt>` [`SigOpt website <https://sigopt.com/>`__]
|
||||
- :doc:`/tune/examples/includes/hebo_example`:
|
||||
Example script showing usage of :ref:`HEBOSearch <tune-hebo>` [`HEBO website <https://github.com/huawei-noah/HEBO/tree/master/HEBO>`__]
|
||||
- :doc:`/tune/examples/includes/sigopt_multi_objective_example`:
|
||||
Example using Sigopt's multi-objective functionality (contributed).
|
||||
- :doc:`/tune/examples/includes/sigopt_prior_beliefs_example`:
|
||||
Example using Sigopt's support for prior beliefs (contributed).
|
||||
|
||||
|
||||
**Sigopt (Contributed)**
|
||||
.. _tune-general-examples:
|
||||
|
||||
- :doc:`/tune/examples/sigopt_multi_objective_example`: Example using Sigopt's multi-objective functionality.
|
||||
- :doc:`/tune/examples/sigopt_prior_beliefs_example`: Example using Sigopt's support for prior beliefs.
|
||||
Other Examples
|
||||
--------------
|
||||
|
||||
- :doc:`/tune/examples/includes/tune_basic_example`: Simple example for doing a basic random and grid search.
|
||||
- :doc:`/tune/examples/includes/async_hyperband_example`: Example of using a simple tuning function with
|
||||
AsyncHyperBandScheduler.
|
||||
- :doc:`/tune/examples/includes/hyperband_function_example`:
|
||||
Example of using a Trainable function with HyperBandScheduler.
|
||||
Also uses the AsyncHyperBandScheduler.
|
||||
- :doc:`/tune/examples/includes/pbt_function`:
|
||||
Example of using the function API with a PopulationBasedTraining scheduler.
|
||||
- :doc:`/tune/examples/includes/pb2_example`: Example of using the Population-based Bandits (PB2) scheduler.
|
||||
- :doc:`/tune/examples/includes/logging_example`: Example of custom loggers and custom trial directory naming.
|
||||
- :doc:`/tune/examples/includes/genetic_example`: Optimizing the michalewicz function using the contributed
|
||||
GeneticSearch algorithm with AsyncHyperBandScheduler.
|
||||
|
||||
|
||||
tune-sklearn examples
|
||||
---------------------
|
||||
.. _tune-exercises:
|
||||
|
||||
See the `ray-project/tune-sklearn examples <https://github.com/ray-project/tune-sklearn/tree/master/examples>`__ for a comprehensive list of examples leveraging Tune's sklearn interface.
|
||||
Exercises
|
||||
---------
|
||||
|
||||
- `tune-sklearn with xgboost <https://github.com/ray-project/tune-sklearn/blob/master/examples/xgbclassifier.py>`__
|
||||
- `tune-sklearn with sklearn pipelines <https://github.com/ray-project/tune-sklearn/blob/master/examples/sklearn_pipeline.py>`__
|
||||
- `tune-sklearn with Bayesian Optimization <https://github.com/ray-project/tune-sklearn/blob/master/examples/hyperopt_sgd.py>`__
|
||||
Learn how to use Tune in your browser with the following Colab-based exercises.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
Framework-specific Examples
|
||||
---------------------------
|
||||
<table>
|
||||
<tr>
|
||||
<th class="tune-colab">Exercise Description</th>
|
||||
<th class="tune-colab">Library</th>
|
||||
<th class="tune-colab">Colab Link</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="tune-colab">Basics of using Tune.</td>
|
||||
<td class="tune-colab">TF/Keras</td>
|
||||
<td class="tune-colab">
|
||||
<a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_1_basics.ipynb" target="_parent">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
PyTorch
|
||||
~~~~~~~
|
||||
<tr>
|
||||
<td class="tune-colab">Using Search algorithms and Trial Schedulers to optimize your model.</td>
|
||||
<td class="tune-colab">Pytorch</td>
|
||||
<td class="tune-colab">
|
||||
<a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_2_optimize.ipynb" target="_parent">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
- :doc:`/tune/examples/mnist_pytorch`: Converts the PyTorch MNIST example to use Tune with the function-based API. Also shows how to easily convert something relying on argparse to use Tune.
|
||||
- :doc:`/tune/examples/ddp_mnist_torch`: An example showing how to use DistributedDataParallel with Ray Tune. This enables both distributed training and distributed hyperparameter tuning.
|
||||
- :doc:`/tune/examples/cifar10_pytorch`: Uses Pytorch to tune a simple model on CIFAR10.
|
||||
- :doc:`/tune/examples/pbt_convnet_function_example`: Example training a ConvNet with checkpointing in function API.
|
||||
<tr>
|
||||
<td class="tune-colab">Using Population-Based Training (PBT).</td>
|
||||
<td class="tune-colab">Pytorch</td>
|
||||
<td class="tune-colab">
|
||||
<a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_3_pbt.ipynb" target="_parent">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
.. - :doc:`/tune/examples/pbt_convnet_example`: Example of training a Memory NN on bAbI with Keras using PBT.
|
||||
.. - :doc:`/tune/examples/mnist_pytorch_trainable`: Converts the PyTorch MNIST example to use Tune with Trainable API. Also uses the HyperBandScheduler and checkpoints the model at the end.
|
||||
<tr>
|
||||
<td class="tune-colab">Fine-tuning Huggingface Transformers with PBT.</td>
|
||||
<td class="tune-colab">Huggingface Transformers/Pytorch</td>
|
||||
<td class="tune-colab">
|
||||
<a href="https://colab.research.google.com/drive/1tQgAKgcKQzheoh503OzhS4N9NtfFgmjF?usp=sharing" target="_parent">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
Pytorch Lightning
|
||||
~~~~~~~~~~~~~~~~~
|
||||
<tr>
|
||||
<td class="tune-colab">Logging Tune Runs to Comet ML.</td>
|
||||
<td class="tune-colab">Comet</td>
|
||||
<td class="tune-colab">
|
||||
<a href="https://colab.research.google.com/drive/1dp3VwVoAH1acn_kG7RuT62mICnOqxU1z?usp=sharing" target="_parent">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
- :doc:`/tune/examples/mnist_ptl_mini`: A minimal example of using `Pytorch Lightning <https://github.com/PyTorchLightning/pytorch-lightning>`_ to train a MNIST model. This example utilizes the Ray Tune-provided :ref:`PyTorch Lightning callbacks <tune-integration-pytorch-lightning>`. See also :ref:`this tutorial for a full walkthrough <tune-pytorch-lightning-ref>`.
|
||||
- :doc:`/tune/examples/mnist_pytorch_lightning`: A comprehensive example using `Pytorch Lightning <https://github.com/PyTorchLightning/pytorch-lightning>`_ to train a MNIST model. This example showcases how to use various search optimization techniques. It utilizes the Ray Tune-provided :ref:`PyTorch Lightning callbacks <tune-integration-pytorch-lightning>`.
|
||||
- :ref:`A walkthrough tutorial for using Ray Tune with Pytorch-Lightning <tune-pytorch-lightning-ref>`.
|
||||
|
||||
Wandb, MLflow
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
- :ref:`Tutorial <tune-wandb-ref>` for using `wandb <https://www.wandb.ai/>`__ with Ray Tune
|
||||
- :doc:`/tune/examples/wandb_example`: Example for using `Weights and Biases <https://www.wandb.ai/>`__ with Ray Tune.
|
||||
- :doc:`/tune/examples/mlflow_example`: Example for using `MLflow <https://github.com/mlflow/mlflow/>`__ with Ray Tune.
|
||||
- :doc:`/tune/examples/mlflow_ptl_example`: Example for using `MLflow <https://github.com/mlflow/mlflow/>`__ and `Pytorch Lightning <https://github.com/PyTorchLightning/pytorch-lightning>`_ with Ray Tune.
|
||||
|
||||
Tensorflow/Keras
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
- :doc:`/tune/examples/tune_mnist_keras`: Converts the Keras MNIST example to use Tune with the function-based API and a Keras callback. Also shows how to easily convert something relying on argparse to use Tune.
|
||||
- :doc:`/tune/examples/pbt_memnn_example`: Example of training a Memory NN on bAbI with Keras using PBT.
|
||||
- :doc:`/tune/examples/tf_mnist_example`: Converts the Advanced TF2.0 MNIST example to use Tune with the Trainable. This uses `tf.function`. Original code from tensorflow: https://www.tensorflow.org/tutorials/quickstart/advanced
|
||||
|
||||
MXNet
|
||||
~~~~~
|
||||
|
||||
- :doc:`/tune/examples/mxnet_example`: Simple example for using MXNet with Tune.
|
||||
- :doc:`/tune/examples/tune_cifar10_gluon`: MXNet Gluon example to use Tune with the function-based API on CIFAR-10 dataset.
|
||||
|
||||
|
||||
Horovod
|
||||
~~~~~~~
|
||||
|
||||
- :doc:`/tune/examples/horovod_simple`: Leverages the :ref:`Horovod-Tune <tune-integration-horovod>` integration to launch a distributed training + tuning job.
|
||||
|
||||
XGBoost, LightGBM
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
- :ref:`XGBoost tutorial <tune-xgboost-ref>`: A guide to tuning XGBoost parameters with Tune.
|
||||
- :doc:`/tune/examples/xgboost_example`: Trains a basic XGBoost model with Tune with the function-based API and an XGBoost callback.
|
||||
- :doc:`/tune/examples/xgboost_dynamic_resources_example`: Trains a basic XGBoost model with Tune with the class-based API and a ResourceChangingScheduler, ensuring all resources are being used at all time.
|
||||
- :doc:`/tune/examples/lightgbm_example`: Trains a basic LightGBM model with Tune with the function-based API and a LightGBM callback.
|
||||
|
||||
RLlib
|
||||
~~~~~
|
||||
|
||||
- :doc:`/tune/examples/pbt_ppo_example`: Example of optimizing a distributed RLlib algorithm (PPO) with the PopulationBasedTraining scheduler.
|
||||
- :doc:`/tune/examples/pb2_ppo_example`: Example of optimizing a distributed RLlib algorithm (PPO) with the PB2 scheduler. Uses a small population size of 4, so can train on a laptop.
|
||||
|
||||
|
||||
|:hugging_face:| Huggingface Transformers
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- :doc:`/tune/examples/pbt_transformers`: Fine-tunes a Huggingface transformer with Tune Population Based Training.
|
||||
|
||||
|
||||
Contributed Examples
|
||||
--------------------
|
||||
|
||||
- :doc:`/tune/examples/pbt_tune_cifar10_with_keras`: A contributed example of tuning a Keras model on CIFAR10 with the PopulationBasedTraining scheduler.
|
||||
- :doc:`/tune/examples/genetic_example`: Optimizing the michalewicz function using the contributed GeneticSearch algorithm with AsyncHyperBandScheduler.
|
||||
|
||||
|
||||
Open Source Projects using Tune
|
||||
-------------------------------
|
||||
|
||||
Here are some of the popular open source repositories and research projects that leverage Tune. Feel free to submit a pull request to add a project (or to request the removal of a listed one!).
|
||||
|
||||
- `Softlearning <https://github.com/rail-berkeley/softlearning>`_: Softlearning is a reinforcement learning framework for training maximum entropy policies in continuous domains. Includes the official implementation of the Soft Actor-Critic algorithm.
|
||||
- `Flambe <https://github.com/asappresearch/flambe>`_: An ML framework to accelerate research and its path to production. See `flambe.ai <https://flambe.ai>`_.
|
||||
- `Population Based Augmentation <https://github.com/arcelien/pba>`_: Population Based Augmentation (PBA) is an algorithm that quickly and efficiently learns data augmentation functions for neural network training. PBA matches state-of-the-art results on CIFAR with one thousand times less compute.
|
||||
- `Fast AutoAugment by Kakao <https://github.com/kakaobrain/fast-autoaugment>`_: Fast AutoAugment (Accepted at NeurIPS 2019) learns augmentation policies using a more efficient search strategy based on density matching.
|
||||
- `Allentune <https://github.com/allenai/allentune>`_: Hyperparameter Search for AllenNLP from AllenAI.
|
||||
- `machinable <https://github.com/frthjf/machinable>`_: A modular configuration system for machine learning research. See `machinable.org <https://machinable.org>`_.
|
||||
- `NeuroCard <https://github.com/neurocard/neurocard>`_: NeuroCard (Accepted at VLDB 2021) is a neural cardinality estimator for multi-table join queries. It uses state of the art deep density models to learn correlations across relational database tables.
|
||||
Tutorial source files `can be found here <https://github.com/ray-project/tutorial>`_.
|
||||
|
|
128
doc/source/tune/examples/lightgbm_example.ipynb
Normal file
128
doc/source/tune/examples/lightgbm_example.ipynb
Normal file
|
@ -0,0 +1,128 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"(tune-lightgbm-example)=\n",
|
||||
"\n",
|
||||
"# Using LightGBM with Tune\n",
|
||||
"\n",
|
||||
"```{image} /images/lightgbm_logo.png\n",
|
||||
":align: center\n",
|
||||
":alt: LightGBM Logo\n",
|
||||
":height: 120px\n",
|
||||
":target: https://lightgbm.readthedocs.io\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## Example"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import lightgbm as lgb\n",
|
||||
"import numpy as np\n",
|
||||
"import sklearn.datasets\n",
|
||||
"import sklearn.metrics\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"from ray import tune\n",
|
||||
"from ray.tune.schedulers import ASHAScheduler\n",
|
||||
"from ray.tune.integration.lightgbm import TuneReportCheckpointCallback\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def train_breast_cancer(config):\n",
|
||||
"\n",
|
||||
" data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)\n",
|
||||
" train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25)\n",
|
||||
" train_set = lgb.Dataset(train_x, label=train_y)\n",
|
||||
" test_set = lgb.Dataset(test_x, label=test_y)\n",
|
||||
" gbm = lgb.train(\n",
|
||||
" config,\n",
|
||||
" train_set,\n",
|
||||
" valid_sets=[test_set],\n",
|
||||
" valid_names=[\"eval\"],\n",
|
||||
" verbose_eval=False,\n",
|
||||
" callbacks=[\n",
|
||||
" TuneReportCheckpointCallback(\n",
|
||||
" {\n",
|
||||
" \"binary_error\": \"eval-binary_error\",\n",
|
||||
" \"binary_logloss\": \"eval-binary_logloss\",\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
" ],\n",
|
||||
" )\n",
|
||||
" preds = gbm.predict(test_x)\n",
|
||||
" pred_labels = np.rint(preds)\n",
|
||||
" tune.report(\n",
|
||||
" mean_accuracy=sklearn.metrics.accuracy_score(test_y, pred_labels), done=True\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" import argparse\n",
|
||||
"\n",
|
||||
" parser = argparse.ArgumentParser()\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--server-address\",\n",
|
||||
" type=str,\n",
|
||||
" default=None,\n",
|
||||
" required=False,\n",
|
||||
" help=\"The address of server to connect to if using \" \"Ray Client.\",\n",
|
||||
" )\n",
|
||||
" args, _ = parser.parse_known_args()\n",
|
||||
"\n",
|
||||
" if args.server_address:\n",
|
||||
" import ray\n",
|
||||
"\n",
|
||||
" ray.init(f\"ray://{args.server_address}\")\n",
|
||||
"\n",
|
||||
" config = {\n",
|
||||
" \"objective\": \"binary\",\n",
|
||||
" \"metric\": [\"binary_error\", \"binary_logloss\"],\n",
|
||||
" \"verbose\": -1,\n",
|
||||
" \"boosting_type\": tune.grid_search([\"gbdt\", \"dart\"]),\n",
|
||||
" \"num_leaves\": tune.randint(10, 1000),\n",
|
||||
" \"learning_rate\": tune.loguniform(1e-8, 1e-1),\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" analysis = tune.run(\n",
|
||||
" train_breast_cancer,\n",
|
||||
" metric=\"binary_error\",\n",
|
||||
" mode=\"min\",\n",
|
||||
" config=config,\n",
|
||||
" num_samples=2,\n",
|
||||
" scheduler=ASHAScheduler(),\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" print(\"Best hyperparameters found were: \", analysis.best_config)\n"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -1,6 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
lightgbm_example
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/lightgbm_example.py
|
|
@ -1,7 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
mnist_pytorch
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/mnist_pytorch.py
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
mnist_pytorch_lightning
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/mnist_pytorch_lightning.py
|
166
doc/source/tune/examples/mxnet_example.ipynb
Normal file
166
doc/source/tune/examples/mxnet_example.ipynb
Normal file
|
@ -0,0 +1,166 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"(tune-mxnet-example)=\n",
|
||||
"\n",
|
||||
"# Using MXNet with Tune\n",
|
||||
"\n",
|
||||
"```{image} /images/mxnet_logo.png\n",
|
||||
":align: center\n",
|
||||
":alt: MXNet Logo\n",
|
||||
":height: 120px\n",
|
||||
":target: https://mxnet.apache.org/\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## Example"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import mxnet as mx\n",
|
||||
"\n",
|
||||
"from ray import tune, logger\n",
|
||||
"from ray.tune.integration.mxnet import TuneCheckpointCallback, TuneReportCallback\n",
|
||||
"from ray.tune.schedulers import ASHAScheduler\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def train_mnist_mxnet(config, mnist, num_epochs=10):\n",
|
||||
" batch_size = config[\"batch_size\"]\n",
|
||||
" train_iter = mx.io.NDArrayIter(\n",
|
||||
" mnist[\"train_data\"], mnist[\"train_label\"], batch_size, shuffle=True\n",
|
||||
" )\n",
|
||||
" val_iter = mx.io.NDArrayIter(mnist[\"test_data\"], mnist[\"test_label\"], batch_size)\n",
|
||||
"\n",
|
||||
" data = mx.sym.var(\"data\")\n",
|
||||
" data = mx.sym.flatten(data=data)\n",
|
||||
"\n",
|
||||
" fc1 = mx.sym.FullyConnected(data=data, num_hidden=config[\"layer_1_size\"])\n",
|
||||
" act1 = mx.sym.Activation(data=fc1, act_type=\"relu\")\n",
|
||||
"\n",
|
||||
" fc2 = mx.sym.FullyConnected(data=act1, num_hidden=config[\"layer_2_size\"])\n",
|
||||
" act2 = mx.sym.Activation(data=fc2, act_type=\"relu\")\n",
|
||||
"\n",
|
||||
" # MNIST has 10 classes\n",
|
||||
" fc3 = mx.sym.FullyConnected(data=act2, num_hidden=10)\n",
|
||||
" # Softmax with cross entropy loss\n",
|
||||
" mlp = mx.sym.SoftmaxOutput(data=fc3, name=\"softmax\")\n",
|
||||
"\n",
|
||||
" # create a trainable module on CPU\n",
|
||||
" mlp_model = mx.mod.Module(symbol=mlp, context=mx.cpu())\n",
|
||||
" mlp_model.fit(\n",
|
||||
" train_iter,\n",
|
||||
" eval_data=val_iter,\n",
|
||||
" optimizer=\"sgd\",\n",
|
||||
" optimizer_params={\"learning_rate\": config[\"lr\"]},\n",
|
||||
" eval_metric=\"acc\",\n",
|
||||
" batch_end_callback=mx.callback.Speedometer(batch_size, 100),\n",
|
||||
" eval_end_callback=TuneReportCallback({\"mean_accuracy\": \"accuracy\"}),\n",
|
||||
" epoch_end_callback=TuneCheckpointCallback(filename=\"mxnet_cp\", frequency=3),\n",
|
||||
" num_epoch=num_epochs,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def tune_mnist_mxnet(num_samples=10, num_epochs=10):\n",
|
||||
" logger.info(\"Downloading MNIST data...\")\n",
|
||||
" mnist_data = mx.test_utils.get_mnist()\n",
|
||||
" logger.info(\"Got MNIST data, starting Ray Tune.\")\n",
|
||||
"\n",
|
||||
" config = {\n",
|
||||
" \"layer_1_size\": tune.choice([32, 64, 128]),\n",
|
||||
" \"layer_2_size\": tune.choice([64, 128, 256]),\n",
|
||||
" \"lr\": tune.loguniform(1e-3, 1e-1),\n",
|
||||
" \"batch_size\": tune.choice([32, 64, 128]),\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)\n",
|
||||
"\n",
|
||||
" analysis = tune.run(\n",
|
||||
" tune.with_parameters(\n",
|
||||
" train_mnist_mxnet, mnist=mnist_data, num_epochs=num_epochs\n",
|
||||
" ),\n",
|
||||
" resources_per_trial={\n",
|
||||
" \"cpu\": 1,\n",
|
||||
" },\n",
|
||||
" metric=\"mean_accuracy\",\n",
|
||||
" mode=\"max\",\n",
|
||||
" config=config,\n",
|
||||
" num_samples=num_samples,\n",
|
||||
" scheduler=scheduler,\n",
|
||||
" name=\"tune_mnist_mxnet\",\n",
|
||||
" )\n",
|
||||
" return analysis\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" import argparse\n",
|
||||
"\n",
|
||||
" parser = argparse.ArgumentParser()\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--smoke-test\", action=\"store_true\", help=\"Finish quickly for testing\"\n",
|
||||
" )\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--server-address\",\n",
|
||||
" type=str,\n",
|
||||
" default=None,\n",
|
||||
" required=False,\n",
|
||||
" help=\"The address of server to connect to if using \" \"Ray Client.\",\n",
|
||||
" )\n",
|
||||
" args, _ = parser.parse_known_args()\n",
|
||||
"\n",
|
||||
" if args.server_address and not args.smoke_test:\n",
|
||||
" import ray\n",
|
||||
"\n",
|
||||
" ray.init(f\"ray://{args.server_address}\")\n",
|
||||
"\n",
|
||||
" if args.smoke_test:\n",
|
||||
" analysis = tune_mnist_mxnet(num_samples=1, num_epochs=1)\n",
|
||||
" else:\n",
|
||||
" analysis = tune_mnist_mxnet(num_samples=10, num_epochs=10)\n",
|
||||
"\n",
|
||||
" print(\"Best hyperparameters found were: \", analysis.best_config)\n"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## More MXNet Examples\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"- {doc}`/tune/examples/includes/tune_cifar10_gluon`:\n",
|
||||
" MXNet Gluon example to use Tune with the function-based API on CIFAR-10 dataset.\n"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -1,6 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
mxnet_example
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/mxnet_example.py
|
130
doc/source/tune/examples/pbt_ppo_example.ipynb
Normal file
130
doc/source/tune/examples/pbt_ppo_example.ipynb
Normal file
|
@ -0,0 +1,130 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3b05af3b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"(tune-rllib-example)=\n",
|
||||
"\n",
|
||||
"# Using RLlib with Tune\n",
|
||||
"\n",
|
||||
"```{image} /rllib/images/rllib-logo.png\n",
|
||||
":align: center\n",
|
||||
":alt: RLlib Logo\n",
|
||||
":height: 120px\n",
|
||||
":target: https://docs.ray.io\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## Example\n",
|
||||
"\n",
|
||||
"Example of using PBT with RLlib.\n",
|
||||
"\n",
|
||||
"Note that this requires a cluster with at least 8 GPUs in order for all trials\n",
|
||||
"to run concurrently, otherwise PBT will round-robin train the trials which\n",
|
||||
"is less efficient (or you can set {\"gpu\": 0} to use CPUs for SGD instead).\n",
|
||||
"\n",
|
||||
"Note that Tune in general does not need 8 GPUs, and this is just a more\n",
|
||||
"computationally demanding example."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "19e3c389",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import random\n",
|
||||
"\n",
|
||||
"from ray import tune\n",
|
||||
"from ray.tune.schedulers import PopulationBasedTraining\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
"\n",
|
||||
" # Postprocess the perturbed config to ensure it's still valid\n",
|
||||
" def explore(config):\n",
|
||||
" # ensure we collect enough timesteps to do sgd\n",
|
||||
" if config[\"train_batch_size\"] < config[\"sgd_minibatch_size\"] * 2:\n",
|
||||
" config[\"train_batch_size\"] = config[\"sgd_minibatch_size\"] * 2\n",
|
||||
" # ensure we run at least one sgd iter\n",
|
||||
" if config[\"num_sgd_iter\"] < 1:\n",
|
||||
" config[\"num_sgd_iter\"] = 1\n",
|
||||
" return config\n",
|
||||
"\n",
|
||||
" pbt = PopulationBasedTraining(\n",
|
||||
" time_attr=\"time_total_s\",\n",
|
||||
" perturbation_interval=120,\n",
|
||||
" resample_probability=0.25,\n",
|
||||
" # Specifies the mutations of these hyperparams\n",
|
||||
" hyperparam_mutations={\n",
|
||||
" \"lambda\": lambda: random.uniform(0.9, 1.0),\n",
|
||||
" \"clip_param\": lambda: random.uniform(0.01, 0.5),\n",
|
||||
" \"lr\": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],\n",
|
||||
" \"num_sgd_iter\": lambda: random.randint(1, 30),\n",
|
||||
" \"sgd_minibatch_size\": lambda: random.randint(128, 16384),\n",
|
||||
" \"train_batch_size\": lambda: random.randint(2000, 160000),\n",
|
||||
" },\n",
|
||||
" custom_explore_fn=explore,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" analysis = tune.run(\n",
|
||||
" \"PPO\",\n",
|
||||
" name=\"pbt_humanoid_test\",\n",
|
||||
" scheduler=pbt,\n",
|
||||
" num_samples=1,\n",
|
||||
" metric=\"episode_reward_mean\",\n",
|
||||
" mode=\"max\",\n",
|
||||
" config={\n",
|
||||
" \"env\": \"Humanoid-v1\",\n",
|
||||
" \"kl_coeff\": 1.0,\n",
|
||||
" \"num_workers\": 8,\n",
|
||||
" \"num_gpus\": 0, # number of GPUs to use\n",
|
||||
" \"model\": {\"free_log_std\": True},\n",
|
||||
" # These params are tuned from a fixed starting value.\n",
|
||||
" \"lambda\": 0.95,\n",
|
||||
" \"clip_param\": 0.2,\n",
|
||||
" \"lr\": 1e-4,\n",
|
||||
" # These params start off randomly drawn from a set.\n",
|
||||
" \"num_sgd_iter\": tune.choice([10, 20, 30]),\n",
|
||||
" \"sgd_minibatch_size\": tune.choice([128, 512, 2048]),\n",
|
||||
" \"train_batch_size\": tune.choice([10000, 20000, 40000]),\n",
|
||||
" },\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" print(\"best hyperparameters: \", analysis.best_config)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6fb69a24",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## More RLlib Examples\n",
|
||||
"\n",
|
||||
"- {doc}`/tune/examples/includes/pb2_ppo_example`:\n",
|
||||
" Example of optimizing a distributed RLlib algorithm (PPO) with the PB2 scheduler.\n",
|
||||
" Uses a small population size of 4, so can train on a laptop."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -1,6 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
pbt_ppo_example
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/pbt_ppo_example.py
|
291
doc/source/tune/examples/pbt_transformers.ipynb
Normal file
291
doc/source/tune/examples/pbt_transformers.ipynb
Normal file
|
@ -0,0 +1,291 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3b05af3b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"(tune-huggingface-example)=\n",
|
||||
"\n",
|
||||
"# Using |:hugging_face:| Huggingface Transformers with Tune\n",
|
||||
"\n",
|
||||
"```{image} /images/hugging.png\n",
|
||||
":align: center\n",
|
||||
":alt: Huggingface Logo\n",
|
||||
":height: 120px\n",
|
||||
":target: https://huggingface.co\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## Example"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "19e3c389",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"This example uses the official\n",
|
||||
"huggingface transformers `hyperparameter_search` API.\n",
|
||||
"\"\"\"\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"import ray\n",
|
||||
"from ray import tune\n",
|
||||
"from ray.tune import CLIReporter\n",
|
||||
"from ray.tune.examples.pbt_transformers.utils import (\n",
|
||||
" download_data,\n",
|
||||
" build_compute_metrics_fn,\n",
|
||||
")\n",
|
||||
"from ray.tune.schedulers import PopulationBasedTraining\n",
|
||||
"from transformers import (\n",
|
||||
" glue_tasks_num_labels,\n",
|
||||
" AutoConfig,\n",
|
||||
" AutoModelForSequenceClassification,\n",
|
||||
" AutoTokenizer,\n",
|
||||
" Trainer,\n",
|
||||
" GlueDataset,\n",
|
||||
" GlueDataTrainingArguments,\n",
|
||||
" TrainingArguments,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def tune_transformer(num_samples=8, gpus_per_trial=0, smoke_test=False):\n",
|
||||
" data_dir_name = \"./data\" if not smoke_test else \"./test_data\"\n",
|
||||
" data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))\n",
|
||||
" if not os.path.exists(data_dir):\n",
|
||||
" os.mkdir(data_dir, 0o755)\n",
|
||||
"\n",
|
||||
" # Change these as needed.\n",
|
||||
" model_name = (\n",
|
||||
" \"bert-base-uncased\" if not smoke_test else \"sshleifer/tiny-distilroberta-base\"\n",
|
||||
" )\n",
|
||||
" task_name = \"rte\"\n",
|
||||
"\n",
|
||||
" task_data_dir = os.path.join(data_dir, task_name.upper())\n",
|
||||
"\n",
|
||||
" num_labels = glue_tasks_num_labels[task_name]\n",
|
||||
"\n",
|
||||
" config = AutoConfig.from_pretrained(\n",
|
||||
" model_name, num_labels=num_labels, finetuning_task=task_name\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Download and cache tokenizer, model, and features\n",
|
||||
" print(\"Downloading and caching Tokenizer\")\n",
|
||||
" tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
||||
"\n",
|
||||
" # Triggers tokenizer download to cache\n",
|
||||
" print(\"Downloading and caching pre-trained model\")\n",
|
||||
" AutoModelForSequenceClassification.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" config=config,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" def get_model():\n",
|
||||
" return AutoModelForSequenceClassification.from_pretrained(\n",
|
||||
" model_name,\n",
|
||||
" config=config,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Download data.\n",
|
||||
" download_data(task_name, data_dir)\n",
|
||||
"\n",
|
||||
" data_args = GlueDataTrainingArguments(task_name=task_name, data_dir=task_data_dir)\n",
|
||||
"\n",
|
||||
" train_dataset = GlueDataset(\n",
|
||||
" data_args, tokenizer=tokenizer, mode=\"train\", cache_dir=task_data_dir\n",
|
||||
" )\n",
|
||||
" eval_dataset = GlueDataset(\n",
|
||||
" data_args, tokenizer=tokenizer, mode=\"dev\", cache_dir=task_data_dir\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" training_args = TrainingArguments(\n",
|
||||
" output_dir=\".\",\n",
|
||||
" learning_rate=1e-5, # config\n",
|
||||
" do_train=True,\n",
|
||||
" do_eval=True,\n",
|
||||
" no_cuda=gpus_per_trial <= 0,\n",
|
||||
" evaluation_strategy=\"epoch\",\n",
|
||||
" save_strategy=\"epoch\",\n",
|
||||
" load_best_model_at_end=True,\n",
|
||||
" num_train_epochs=2, # config\n",
|
||||
" max_steps=-1,\n",
|
||||
" per_device_train_batch_size=16, # config\n",
|
||||
" per_device_eval_batch_size=16, # config\n",
|
||||
" warmup_steps=0,\n",
|
||||
" weight_decay=0.1, # config\n",
|
||||
" logging_dir=\"./logs\",\n",
|
||||
" skip_memory_metrics=True,\n",
|
||||
" report_to=\"none\",\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" trainer = Trainer(\n",
|
||||
" model_init=get_model,\n",
|
||||
" args=training_args,\n",
|
||||
" train_dataset=train_dataset,\n",
|
||||
" eval_dataset=eval_dataset,\n",
|
||||
" compute_metrics=build_compute_metrics_fn(task_name),\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" tune_config = {\n",
|
||||
" \"per_device_train_batch_size\": 32,\n",
|
||||
" \"per_device_eval_batch_size\": 32,\n",
|
||||
" \"num_train_epochs\": tune.choice([2, 3, 4, 5]),\n",
|
||||
" \"max_steps\": 1 if smoke_test else -1, # Used for smoke test.\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" scheduler = PopulationBasedTraining(\n",
|
||||
" time_attr=\"training_iteration\",\n",
|
||||
" metric=\"eval_acc\",\n",
|
||||
" mode=\"max\",\n",
|
||||
" perturbation_interval=1,\n",
|
||||
" hyperparam_mutations={\n",
|
||||
" \"weight_decay\": tune.uniform(0.0, 0.3),\n",
|
||||
" \"learning_rate\": tune.uniform(1e-5, 5e-5),\n",
|
||||
" \"per_device_train_batch_size\": [16, 32, 64],\n",
|
||||
" },\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" reporter = CLIReporter(\n",
|
||||
" parameter_columns={\n",
|
||||
" \"weight_decay\": \"w_decay\",\n",
|
||||
" \"learning_rate\": \"lr\",\n",
|
||||
" \"per_device_train_batch_size\": \"train_bs/gpu\",\n",
|
||||
" \"num_train_epochs\": \"num_epochs\",\n",
|
||||
" },\n",
|
||||
" metric_columns=[\"eval_acc\", \"eval_loss\", \"epoch\", \"training_iteration\"],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" trainer.hyperparameter_search(\n",
|
||||
" hp_space=lambda _: tune_config,\n",
|
||||
" backend=\"ray\",\n",
|
||||
" n_trials=num_samples,\n",
|
||||
" resources_per_trial={\"cpu\": 1, \"gpu\": gpus_per_trial},\n",
|
||||
" scheduler=scheduler,\n",
|
||||
" keep_checkpoints_num=1,\n",
|
||||
" checkpoint_score_attr=\"training_iteration\",\n",
|
||||
" stop={\"training_iteration\": 1} if smoke_test else None,\n",
|
||||
" progress_reporter=reporter,\n",
|
||||
" local_dir=\"~/ray_results/\",\n",
|
||||
" name=\"tune_transformer_pbt\",\n",
|
||||
" log_to_file=True,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" import argparse\n",
|
||||
"\n",
|
||||
" parser = argparse.ArgumentParser()\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--smoke-test\", default=True, action=\"store_true\", help=\"Finish quickly for testing\"\n",
|
||||
" )\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--ray-address\",\n",
|
||||
" type=str,\n",
|
||||
" default=None,\n",
|
||||
" help=\"Address to use for Ray. \"\n",
|
||||
" 'Use \"auto\" for cluster. '\n",
|
||||
" \"Defaults to None for local.\",\n",
|
||||
" )\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--server-address\",\n",
|
||||
" type=str,\n",
|
||||
" default=None,\n",
|
||||
" required=False,\n",
|
||||
" help=\"The address of server to connect to if using \" \"Ray Client.\",\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" args, _ = parser.parse_known_args()\n",
|
||||
"\n",
|
||||
" if args.smoke_test:\n",
|
||||
" ray.init()\n",
|
||||
" elif args.server_address:\n",
|
||||
" ray.init(f\"ray://{args.server_address}\")\n",
|
||||
" else:\n",
|
||||
" ray.init(args.ray_address)\n",
|
||||
"\n",
|
||||
" if args.smoke_test:\n",
|
||||
" tune_transformer(num_samples=1, gpus_per_trial=0, smoke_test=True)\n",
|
||||
" else:\n",
|
||||
" # You can change the number of GPUs here:\n",
|
||||
" tune_transformer(num_samples=8, gpus_per_trial=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\"\"\"Utilities to load and cache data.\"\"\"\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from typing import Callable, Dict\n",
|
||||
"import numpy as np\n",
|
||||
"from transformers import EvalPrediction\n",
|
||||
"from transformers import glue_compute_metrics, glue_output_modes\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:\n",
|
||||
" \"\"\"Function from transformers/examples/text-classification/run_glue.py\"\"\"\n",
|
||||
" output_mode = glue_output_modes[task_name]\n",
|
||||
"\n",
|
||||
" def compute_metrics_fn(p: EvalPrediction):\n",
|
||||
" if output_mode == \"classification\":\n",
|
||||
" preds = np.argmax(p.predictions, axis=1)\n",
|
||||
" elif output_mode == \"regression\":\n",
|
||||
" preds = np.squeeze(p.predictions)\n",
|
||||
" metrics = glue_compute_metrics(task_name, preds, p.label_ids)\n",
|
||||
" return metrics\n",
|
||||
"\n",
|
||||
" return compute_metrics_fn\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def download_data(task_name, data_dir=\"./data\"):\n",
|
||||
" # Download RTE training data\n",
|
||||
" print(\"Downloading dataset.\")\n",
|
||||
" import urllib\n",
|
||||
" import zipfile\n",
|
||||
"\n",
|
||||
" if task_name == \"rte\":\n",
|
||||
" url = \"https://dl.fbaipublicfiles.com/glue/data/RTE.zip\"\n",
|
||||
" else:\n",
|
||||
" raise ValueError(\"Unknown task: {}\".format(task_name))\n",
|
||||
" data_file = os.path.join(data_dir, \"{}.zip\".format(task_name))\n",
|
||||
" if not os.path.exists(data_file):\n",
|
||||
" urllib.request.urlretrieve(url, data_file)\n",
|
||||
" with zipfile.ZipFile(data_file) as zip_ref:\n",
|
||||
" zip_ref.extractall(data_dir)\n",
|
||||
" print(\"Downloaded data for task {} to {}\".format(task_name, data_dir))\n",
|
||||
" else:\n",
|
||||
" print(\n",
|
||||
" \"Data already exists. Using downloaded data for task {} from {}\".format(\n",
|
||||
" task_name, data_dir\n",
|
||||
" )\n",
|
||||
" )"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -1,7 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
pbt_transformers_example
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/pbt_transformers/pbt_transformers.py
|
||||
.. literalinclude:: /../../python/ray/tune/examples/pbt_transformers/utils.py
|
152
doc/source/tune/examples/tune-comet.ipynb
Normal file
152
doc/source/tune/examples/tune-comet.ipynb
Normal file
|
@ -0,0 +1,152 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3b05af3b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"(tune-comet-ref)=\n",
|
||||
"\n",
|
||||
"# Using Comet with Tune\n",
|
||||
"\n",
|
||||
"[Comet](https://www.comet.ml/site/) is a tool to manage and optimize the\n",
|
||||
"entire ML lifecycle, from experiment tracking, model optimization and dataset\n",
|
||||
"versioning to model production monitoring.\n",
|
||||
"\n",
|
||||
"```{image} /images/comet_logo_full.png\n",
|
||||
":align: center\n",
|
||||
":alt: Comet\n",
|
||||
":height: 120px\n",
|
||||
":target: https://www.comet.ml/site/\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## Example\n",
|
||||
"\n",
|
||||
"To illustrate logging your trial results to Comet, we'll define a simple training function\n",
|
||||
"that simulates a `loss` metric:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "19e3c389",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"from ray import tune\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def train_function(config, checkpoint_dir=None):\n",
|
||||
" for i in range(30):\n",
|
||||
" loss = config[\"mean\"] + config[\"sd\"] * np.random.randn()\n",
|
||||
" tune.report(loss=loss)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6fb69a24",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now, given that you provide your Comet API key and your project name like so:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "993d5be6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"api_key = \"YOUR_COMET_API_KEY\"\n",
|
||||
"project_name = \"YOUR_COMET_PROJECT_NAME\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e9ce0d76",
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"remove-cell"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# This cell is hidden from the rendered notebook. It mocks the Comet logger so the example can run without a real Comet API key. \n",
|
||||
"from unittest.mock import MagicMock\n",
|
||||
"from ray.tune.integration.comet import CometLoggerCallback\n",
|
||||
"\n",
|
||||
"CometLoggerCallback._logger_process_cls = MagicMock\n",
|
||||
"api_key = \"abc\"\n",
|
||||
"project_name = \"test\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"You can add a Comet logger by specifying the `callbacks` argument in your `tune.run` accordingly:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dbb761e7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from ray.tune.integration.comet import CometLoggerCallback\n",
|
||||
"\n",
|
||||
"analysis = tune.run(\n",
|
||||
" train_function,\n",
|
||||
" name=\"comet\",\n",
|
||||
" metric=\"loss\",\n",
|
||||
" mode=\"min\",\n",
|
||||
" callbacks=[\n",
|
||||
" CometLoggerCallback(\n",
|
||||
" api_key=api_key, project_name=project_name, tags=[\"comet_example\"]\n",
|
||||
" )\n",
|
||||
" ],\n",
|
||||
" config={\"mean\": tune.grid_search([1, 2, 3]), \"sd\": tune.uniform(0.2, 0.8)},\n",
|
||||
")\n",
|
||||
"print(analysis.best_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d7e46189",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Tune Comet Logger\n",
|
||||
"\n",
|
||||
"Ray Tune offers an integration with Comet through the `CometLoggerCallback`,\n",
|
||||
"which automatically logs metrics and parameters reported to Tune to the Comet UI.\n",
|
||||
"\n",
|
||||
"Click on the following dropdown to see this callback API in detail:\n",
|
||||
"\n",
|
||||
"```{eval-rst}\n",
|
||||
".. autoclass:: ray.tune.integration.comet.CometLoggerCallback\n",
|
||||
" :noindex:\n",
|
||||
"```"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
320
doc/source/tune/examples/tune-mlflow.ipynb
Normal file
320
doc/source/tune/examples/tune-mlflow.ipynb
Normal file
|
@ -0,0 +1,320 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6df76a1f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Using MLflow with Tune\n",
|
||||
"\n",
|
||||
"(tune-mlflow-ref)=\n",
|
||||
"\n",
|
||||
":::{warning}\n",
|
||||
"If you are using these MLflow integrations with {ref}`ray-client`, it is recommended that you setup a\n",
|
||||
"remote Mlflow tracking server instead of one that is backed by the local filesystem.\n",
|
||||
":::\n",
|
||||
"\n",
|
||||
"[MLflow](https://mlflow.org/) is an open source platform to manage the ML lifecycle, including experimentation,\n",
|
||||
"reproducibility, deployment, and a central model registry. It currently offers four components, including\n",
|
||||
"MLflow Tracking to record and query experiments, including code, data, config, and results.\n",
|
||||
"\n",
|
||||
"```{image} /images/mlflow.png\n",
|
||||
":align: center\n",
|
||||
":alt: MLflow\n",
|
||||
":height: 80px\n",
|
||||
":target: https://www.mlflow.org/\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Ray Tune currently offers two lightweight integrations for MLflow Tracking.\n",
|
||||
"One is the {ref}`MLflowLoggerCallback <tune-mlflow-logger>`, which automatically logs\n",
|
||||
"metrics reported to Tune to the MLflow Tracking API.\n",
|
||||
"\n",
|
||||
"The other one is the {ref}`@mlflow_mixin <tune-mlflow-mixin>` decorator, which can be\n",
|
||||
"used with the function API. It automatically\n",
|
||||
"initializes the MLflow API with Tune's training information and creates a run for each Tune trial.\n",
|
||||
"Then within your training function, you can just use the\n",
|
||||
"MLflow like you would normally do, e.g. using `mlflow.log_metrics()` or even `mlflow.autolog()`\n",
|
||||
"to log to your training process.\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## Running an MLflow Example\n",
|
||||
"\n",
|
||||
"In the following example we're going to use both of the above methods, namely the `MLflowLoggerCallback` and\n",
|
||||
"the `mlflow_mixin` decorator to log metrics.\n",
|
||||
"Let's start with a few crucial imports:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b0e47339",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import tempfile\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"import mlflow\n",
|
||||
"\n",
|
||||
"from ray import tune\n",
|
||||
"from ray.tune.integration.mlflow import MLflowLoggerCallback, mlflow_mixin"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Next, let's define an easy objective function (a Tune `Trainable`) that iteratively computes steps and evaluates\n",
|
||||
"intermediate scores that we report to Tune."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def evaluation_fn(step, width, height):\n",
|
||||
" return (0.1 + width * step / 100) ** (-1) + height * 0.1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def easy_objective(config):\n",
|
||||
" width, height = config[\"width\"], config[\"height\"]\n",
|
||||
"\n",
|
||||
" for step in range(config.get(\"steps\", 100)):\n",
|
||||
" # Iterative training function - can be any arbitrary training procedure\n",
|
||||
" intermediate_score = evaluation_fn(step, width, height)\n",
|
||||
" # Feed the score back to Tune.\n",
|
||||
" tune.report(iterations=step, mean_loss=intermediate_score)\n",
|
||||
" time.sleep(0.1)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Given an MLflow tracking URI, you can now simply use the `MLflowLoggerCallback` as a `callback` argument to\n",
|
||||
"your `tune.run()` call:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tune_function(mlflow_tracking_uri, finish_fast=False):\n",
|
||||
" tune.run(\n",
|
||||
" easy_objective,\n",
|
||||
" name=\"mlflow\",\n",
|
||||
" num_samples=5,\n",
|
||||
" callbacks=[\n",
|
||||
" MLflowLoggerCallback(\n",
|
||||
" tracking_uri=mlflow_tracking_uri,\n",
|
||||
" experiment_name=\"example\",\n",
|
||||
" save_artifact=True,\n",
|
||||
" )\n",
|
||||
" ],\n",
|
||||
" config={\n",
|
||||
" \"width\": tune.randint(10, 100),\n",
|
||||
" \"height\": tune.randint(0, 100),\n",
|
||||
" \"steps\": 5 if finish_fast else 100,\n",
|
||||
" },\n",
|
||||
" )"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"To use the `mlflow_mixin` decorator, you can simply decorate the objective function from earlier.\n",
|
||||
"Note that we also use `mlflow.log_metrics(...)` to log metrics to MLflow.\n",
|
||||
"Otherwise, the decorated version of our objective is identical to its original."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@mlflow_mixin\n",
|
||||
"def decorated_easy_objective(config):\n",
|
||||
" # Hyperparameters\n",
|
||||
" width, height = config[\"width\"], config[\"height\"]\n",
|
||||
"\n",
|
||||
" for step in range(config.get(\"steps\", 100)):\n",
|
||||
" # Iterative training function - can be any arbitrary training procedure\n",
|
||||
" intermediate_score = evaluation_fn(step, width, height)\n",
|
||||
" # Log the metrics to mlflow\n",
|
||||
" mlflow.log_metrics(dict(mean_loss=intermediate_score), step=step)\n",
|
||||
" # Feed the score back to Tune.\n",
|
||||
" tune.report(iterations=step, mean_loss=intermediate_score)\n",
|
||||
" time.sleep(0.1)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"With this new objective function ready, you can now create a Tune run with it as follows:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tune_decorated(mlflow_tracking_uri, finish_fast=False):\n",
|
||||
" # Set the experiment, or create a new one if does not exist yet.\n",
|
||||
" mlflow.set_tracking_uri(mlflow_tracking_uri)\n",
|
||||
" mlflow.set_experiment(experiment_name=\"mixin_example\")\n",
|
||||
" tune.run(\n",
|
||||
" decorated_easy_objective,\n",
|
||||
" name=\"mlflow\",\n",
|
||||
" num_samples=5,\n",
|
||||
" config={\n",
|
||||
" \"width\": tune.randint(10, 100),\n",
|
||||
" \"height\": tune.randint(0, 100),\n",
|
||||
" \"steps\": 5 if finish_fast else 100,\n",
|
||||
" \"mlflow\": {\n",
|
||||
" \"experiment_name\": \"mixin_example\",\n",
|
||||
" \"tracking_uri\": mlflow.get_tracking_uri(),\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" )"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"If you happen to have an MLflow tracking URI, you can set it below in the `mlflow_tracking_uri` variable and set\n",
|
||||
"`smoke_test=False`.\n",
|
||||
"Otherwise, you can just run a quick test of the `tune_function` and `tune_decorated` functions without using MLflow."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"smoke_test = True\n",
|
||||
"\n",
|
||||
"if smoke_test:\n",
|
||||
" mlflow_tracking_uri = os.path.join(tempfile.gettempdir(), \"mlruns\")\n",
|
||||
"else:\n",
|
||||
" mlflow_tracking_uri = \"<MLFLOW_TRACKING_URI>\"\n",
|
||||
"\n",
|
||||
"tune_function(mlflow_tracking_uri, finish_fast=smoke_test)\n",
|
||||
"if not smoke_test:\n",
|
||||
" df = mlflow.search_runs(\n",
|
||||
" [mlflow.get_experiment_by_name(\"example\").experiment_id]\n",
|
||||
" )\n",
|
||||
" print(df)\n",
|
||||
"\n",
|
||||
"tune_decorated(mlflow_tracking_uri, finish_fast=smoke_test)\n",
|
||||
"if not smoke_test:\n",
|
||||
" df = mlflow.search_runs(\n",
|
||||
" [mlflow.get_experiment_by_name(\"mixin_example\").experiment_id]\n",
|
||||
" )\n",
|
||||
" print(df)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f0df0817",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This completes our Tune and MLflow walk-through.\n",
|
||||
"In the following sections you can find more details on the API of the Tune-MLflow integration.\n",
|
||||
"\n",
|
||||
"## MLflow AutoLogging\n",
|
||||
"\n",
|
||||
"You can also check out {doc}`here </tune/examples/includes/mlflow_ptl_example>` for an example on how you can\n",
|
||||
"leverage MLflow auto-logging, in this case with Pytorch Lightning\n",
|
||||
"\n",
|
||||
"## MLflow Logger API\n",
|
||||
"\n",
|
||||
"(tune-mlflow-logger)=\n",
|
||||
"\n",
|
||||
"```{eval-rst}\n",
|
||||
".. autoclass:: ray.tune.integration.mlflow.MLflowLoggerCallback\n",
|
||||
" :noindex:\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## MLflow Mixin API\n",
|
||||
"\n",
|
||||
"(tune-mlflow-mixin)=\n",
|
||||
"\n",
|
||||
"```{eval-rst}\n",
|
||||
".. autofunction:: ray.tune.integration.mlflow.mlflow_mixin\n",
|
||||
" :noindex:\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## More MLflow Examples\n",
|
||||
"\n",
|
||||
"- {doc}`/tune/examples/includes/mlflow_ptl_example`: Example for using [MLflow](https://github.com/mlflow/mlflow/)\n",
|
||||
" and [Pytorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning) with Ray Tune."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
501
doc/source/tune/examples/tune-pytorch-cifar.ipynb
Normal file
501
doc/source/tune/examples/tune-pytorch-cifar.ipynb
Normal file
|
@ -0,0 +1,501 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "586737af",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# How to use Tune with PyTorch\n",
|
||||
"\n",
|
||||
"(tune-pytorch-cifar-ref)=\n",
|
||||
"\n",
|
||||
"In this walkthrough, we will show you how to integrate Tune into your PyTorch\n",
|
||||
"training workflow. We will follow [this tutorial from the PyTorch documentation](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html)\n",
|
||||
"for training a CIFAR10 image classifier.\n",
|
||||
"\n",
|
||||
"```{image} /images/pytorch_logo.png\n",
|
||||
":align: center\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Hyperparameter tuning can make the difference between an average model and a highly\n",
|
||||
"accurate one. Often simple things like choosing a different learning rate or changing\n",
|
||||
"a network layer size can have a dramatic impact on your model performance. Fortunately,\n",
|
||||
"Tune makes exploring these optimal parameter combinations easy - and works nicely\n",
|
||||
"together with PyTorch.\n",
|
||||
"\n",
|
||||
"As you will see, we only need to add some slight modifications. In particular, we\n",
|
||||
"need to\n",
|
||||
"\n",
|
||||
"1. wrap data loading and training in functions,\n",
|
||||
"2. make some network parameters configurable,\n",
|
||||
"3. add checkpointing (optional),\n",
|
||||
"4. and define the search space for the model tuning\n",
|
||||
"\n",
|
||||
"Optionally, you can seamlessly leverage {ref}`DistributedDataParallel training <tune-torch-ddp>`\n",
|
||||
"for each individual Pytorch model within Tune.\n",
|
||||
"\n",
|
||||
":::{note}\n",
|
||||
"To run this example, you will need to install the following:\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"$ pip install ray torch torchvision\n",
|
||||
"```\n",
|
||||
":::\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Setup / Imports\n",
|
||||
"\n",
|
||||
"Let's start with the imports:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "55529285",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import os\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import torch.nn.functional as F\n",
|
||||
"import torch.optim as optim\n",
|
||||
"from filelock import FileLock\n",
|
||||
"from torch.utils.data import random_split\n",
|
||||
"import torchvision\n",
|
||||
"import torchvision.transforms as transforms\n",
|
||||
"import ray\n",
|
||||
"from ray import tune\n",
|
||||
"from ray.tune.schedulers import ASHAScheduler"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f59e551d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Most of the imports are needed for building the PyTorch model. Only the last three\n",
|
||||
"imports are for Ray Tune.\n",
|
||||
"\n",
|
||||
"## Data loaders\n",
|
||||
"\n",
|
||||
"We wrap the data loaders in their own function and pass a global data directory.\n",
|
||||
"This way we can share a data directory between different trials."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "01471556",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def load_data(data_dir=\"./data\"):\n",
|
||||
" transform = transforms.Compose([\n",
|
||||
" transforms.ToTensor(),\n",
|
||||
" transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n",
|
||||
" ])\n",
|
||||
"\n",
|
||||
" # We add FileLock here because multiple workers will want to\n",
|
||||
" # download data, and this may cause overwrites since\n",
|
||||
" # DataLoader is not threadsafe.\n",
|
||||
" with FileLock(os.path.expanduser(\"~/.data.lock\")):\n",
|
||||
" trainset = torchvision.datasets.CIFAR10(\n",
|
||||
" root=data_dir, train=True, download=True, transform=transform)\n",
|
||||
"\n",
|
||||
" testset = torchvision.datasets.CIFAR10(\n",
|
||||
" root=data_dir, train=False, download=True, transform=transform)\n",
|
||||
"\n",
|
||||
" return trainset, testset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "80958cf3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configurable neural network\n",
|
||||
"\n",
|
||||
"We can only tune those parameters that are configurable. In this example, we can specify\n",
|
||||
"the layer sizes of the fully connected layers:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fff6bd0d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class Net(nn.Module):\n",
|
||||
" def __init__(self, l1=120, l2=84):\n",
|
||||
" super(Net, self).__init__()\n",
|
||||
" self.conv1 = nn.Conv2d(3, 6, 5)\n",
|
||||
" self.pool = nn.MaxPool2d(2, 2)\n",
|
||||
" self.conv2 = nn.Conv2d(6, 16, 5)\n",
|
||||
" self.fc1 = nn.Linear(16 * 5 * 5, l1)\n",
|
||||
" self.fc2 = nn.Linear(l1, l2)\n",
|
||||
" self.fc3 = nn.Linear(l2, 10)\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" x = self.pool(F.relu(self.conv1(x)))\n",
|
||||
" x = self.pool(F.relu(self.conv2(x)))\n",
|
||||
" x = x.view(-1, 16 * 5 * 5)\n",
|
||||
" x = F.relu(self.fc1(x))\n",
|
||||
" x = F.relu(self.fc2(x))\n",
|
||||
" x = self.fc3(x)\n",
|
||||
" return x"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fb619875",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## The train function\n",
|
||||
"\n",
|
||||
"Now it gets interesting, because we introduce some changes to the example [from the PyTorch\n",
|
||||
"documentation](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html).\n",
|
||||
"\n",
|
||||
"(communicating-with-ray-tune)=\n",
|
||||
"\n",
|
||||
"The full code example looks like this:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fa0bdae0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train_cifar(config, checkpoint_dir=None):\n",
|
||||
" net = Net(config[\"l1\"], config[\"l2\"])\n",
|
||||
"\n",
|
||||
" device = \"cpu\"\n",
|
||||
" if torch.cuda.is_available():\n",
|
||||
" device = \"cuda:0\"\n",
|
||||
" if torch.cuda.device_count() > 1:\n",
|
||||
" net = nn.DataParallel(net)\n",
|
||||
" net.to(device)\n",
|
||||
"\n",
|
||||
" criterion = nn.CrossEntropyLoss()\n",
|
||||
" optimizer = optim.SGD(net.parameters(), lr=config[\"lr\"], momentum=0.9)\n",
|
||||
"\n",
|
||||
" # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint\n",
|
||||
" # should be restored.\n",
|
||||
" if checkpoint_dir:\n",
|
||||
" checkpoint = os.path.join(checkpoint_dir, \"checkpoint\")\n",
|
||||
" model_state, optimizer_state = torch.load(checkpoint)\n",
|
||||
" net.load_state_dict(model_state)\n",
|
||||
" optimizer.load_state_dict(optimizer_state)\n",
|
||||
"\n",
|
||||
" data_dir = os.path.abspath(\"./data\")\n",
|
||||
" trainset, testset = load_data(data_dir)\n",
|
||||
"\n",
|
||||
" test_abs = int(len(trainset) * 0.8)\n",
|
||||
" train_subset, val_subset = random_split(\n",
|
||||
" trainset, [test_abs, len(trainset) - test_abs])\n",
|
||||
"\n",
|
||||
" trainloader = torch.utils.data.DataLoader(\n",
|
||||
" train_subset,\n",
|
||||
" batch_size=int(config[\"batch_size\"]),\n",
|
||||
" shuffle=True,\n",
|
||||
" num_workers=8)\n",
|
||||
" valloader = torch.utils.data.DataLoader(\n",
|
||||
" val_subset,\n",
|
||||
" batch_size=int(config[\"batch_size\"]),\n",
|
||||
" shuffle=True,\n",
|
||||
" num_workers=8)\n",
|
||||
"\n",
|
||||
" for epoch in range(10): # loop over the dataset multiple times\n",
|
||||
" running_loss = 0.0\n",
|
||||
" epoch_steps = 0\n",
|
||||
" for i, data in enumerate(trainloader, 0):\n",
|
||||
" # get the inputs; data is a list of [inputs, labels]\n",
|
||||
" inputs, labels = data\n",
|
||||
" inputs, labels = inputs.to(device), labels.to(device)\n",
|
||||
"\n",
|
||||
" # zero the parameter gradients\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
"\n",
|
||||
" # forward + backward + optimize\n",
|
||||
" outputs = net(inputs)\n",
|
||||
" loss = criterion(outputs, labels)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
"\n",
|
||||
" # print statistics\n",
|
||||
" running_loss += loss.item()\n",
|
||||
" epoch_steps += 1\n",
|
||||
" if i % 2000 == 1999: # print every 2000 mini-batches\n",
|
||||
" print(\"[%d, %5d] loss: %.3f\" % (epoch + 1, i + 1,\n",
|
||||
" running_loss / epoch_steps))\n",
|
||||
" running_loss = 0.0\n",
|
||||
"\n",
|
||||
" # Validation loss\n",
|
||||
" val_loss = 0.0\n",
|
||||
" val_steps = 0\n",
|
||||
" total = 0\n",
|
||||
" correct = 0\n",
|
||||
" for i, data in enumerate(valloader, 0):\n",
|
||||
" with torch.no_grad():\n",
|
||||
" inputs, labels = data\n",
|
||||
" inputs, labels = inputs.to(device), labels.to(device)\n",
|
||||
"\n",
|
||||
" outputs = net(inputs)\n",
|
||||
" _, predicted = torch.max(outputs.data, 1)\n",
|
||||
" total += labels.size(0)\n",
|
||||
" correct += (predicted == labels).sum().item()\n",
|
||||
"\n",
|
||||
" loss = criterion(outputs, labels)\n",
|
||||
" val_loss += loss.cpu().numpy()\n",
|
||||
" val_steps += 1\n",
|
||||
"\n",
|
||||
" # Here we save a checkpoint. It is automatically registered with\n",
|
||||
" # Ray Tune and will potentially be passed as the `checkpoint_dir`\n",
|
||||
" # parameter in future iterations.\n",
|
||||
" with tune.checkpoint_dir(step=epoch) as checkpoint_dir:\n",
|
||||
" path = os.path.join(checkpoint_dir, \"checkpoint\")\n",
|
||||
" torch.save(\n",
|
||||
" (net.state_dict(), optimizer.state_dict()), path)\n",
|
||||
"\n",
|
||||
" tune.report(loss=(val_loss / val_steps), accuracy=correct / total)\n",
|
||||
" print(\"Finished Training\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "918d8baf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As you can see, most of the code is adapted directly from the example.\n",
|
||||
"\n",
|
||||
"## Test set accuracy\n",
|
||||
"\n",
|
||||
"Commonly the performance of a machine learning model is tested on a hold-out test\n",
|
||||
"set with data that has not been used for training the model. We also wrap this in a\n",
|
||||
"function:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "93b5b4af",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def test_best_model(best_trial):\n",
|
||||
" best_trained_model = Net(best_trial.config[\"l1\"], best_trial.config[\"l2\"])\n",
|
||||
" device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
" best_trained_model.to(device)\n",
|
||||
"\n",
|
||||
" checkpoint_path = os.path.join(best_trial.checkpoint.value, \"checkpoint\")\n",
|
||||
"\n",
|
||||
" model_state, optimizer_state = torch.load(checkpoint_path)\n",
|
||||
" best_trained_model.load_state_dict(model_state)\n",
|
||||
"\n",
|
||||
" trainset, testset = load_data()\n",
|
||||
"\n",
|
||||
" testloader = torch.utils.data.DataLoader(\n",
|
||||
" testset, batch_size=4, shuffle=False, num_workers=2)\n",
|
||||
"\n",
|
||||
" correct = 0\n",
|
||||
" total = 0\n",
|
||||
" with torch.no_grad():\n",
|
||||
" for data in testloader:\n",
|
||||
" images, labels = data\n",
|
||||
" images, labels = images.to(device), labels.to(device)\n",
|
||||
" outputs = best_trained_model(images)\n",
|
||||
" _, predicted = torch.max(outputs.data, 1)\n",
|
||||
" total += labels.size(0)\n",
|
||||
" correct += (predicted == labels).sum().item()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" print(\"Best trial test set accuracy: {}\".format(correct / total))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "85f8230e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As you can see, the function also expects a `device` parameter, so we can do the\n",
|
||||
"test set validation on a GPU.\n",
|
||||
"\n",
|
||||
"## Configuring the search space\n",
|
||||
"\n",
|
||||
"Lastly, we need to define Tune's search space. Here is an example:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5416cece",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"config = {\n",
|
||||
" \"l1\": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),\n",
|
||||
" \"l2\": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),\n",
|
||||
" \"lr\": tune.loguniform(1e-4, 1e-1),\n",
|
||||
" \"batch_size\": tune.choice([2, 4, 8, 16]),\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "20af95cc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The `tune.sample_from()` function makes it possible to define your own sample\n",
|
||||
"methods to obtain hyperparameters. In this example, the `l1` and `l2` parameters\n",
|
||||
"should be powers of 2 between 4 and 256, so either 4, 8, 16, 32, 64, 128, or 256.\n",
|
||||
"The `lr` (learning rate) should be uniformly sampled between 0.0001 and 0.1. Lastly,\n",
|
||||
"the batch size is a choice between 2, 4, 8, and 16.\n",
|
||||
"\n",
|
||||
"At each trial, Tune will now randomly sample a combination of parameters from these\n",
|
||||
"search spaces. It will then train a number of models in parallel and find the best\n",
|
||||
"performing one among these. We also use the `ASHAScheduler` which will terminate bad\n",
|
||||
"performing trials early.\n",
|
||||
"\n",
|
||||
"You can specify the number of CPUs, which are then available e.g.\n",
|
||||
"to increase the `num_workers` of the PyTorch `DataLoader` instances. The selected\n",
|
||||
"number of GPUs are made visible to PyTorch in each trial. Trials do not have access to\n",
|
||||
"GPUs that haven't been requested for them - so you don't have to care about two trials\n",
|
||||
"using the same set of resources.\n",
|
||||
"\n",
|
||||
"Here we can also specify fractional GPUs, so something like `gpus_per_trial=0.5` is\n",
|
||||
"completely valid. The trials will then share GPUs among each other.\n",
|
||||
"You just have to make sure that the models still fit in the GPU memory.\n",
|
||||
"\n",
|
||||
"After training the models, we will find the best performing one and load the trained\n",
|
||||
"network from the checkpoint file. We then obtain the test set accuracy and report\n",
|
||||
"everything by printing.\n",
|
||||
"\n",
|
||||
"The full main function looks like this:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "91d83380",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):\n",
|
||||
" config = {\n",
|
||||
" \"l1\": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),\n",
|
||||
" \"l2\": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),\n",
|
||||
" \"lr\": tune.loguniform(1e-4, 1e-1),\n",
|
||||
" \"batch_size\": tune.choice([2, 4, 8, 16])\n",
|
||||
" }\n",
|
||||
" scheduler = ASHAScheduler(\n",
|
||||
" max_t=max_num_epochs,\n",
|
||||
" grace_period=1,\n",
|
||||
" reduction_factor=2)\n",
|
||||
" result = tune.run(\n",
|
||||
" tune.with_parameters(train_cifar),\n",
|
||||
" resources_per_trial={\"cpu\": 2, \"gpu\": gpus_per_trial},\n",
|
||||
" config=config,\n",
|
||||
" metric=\"loss\",\n",
|
||||
" mode=\"min\",\n",
|
||||
" num_samples=num_samples,\n",
|
||||
" scheduler=scheduler\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" best_trial = result.get_best_trial(\"loss\", \"min\", \"last\")\n",
|
||||
" print(\"Best trial config: {}\".format(best_trial.config))\n",
|
||||
" print(\"Best trial final validation loss: {}\".format(\n",
|
||||
" best_trial.last_result[\"loss\"]))\n",
|
||||
" print(\"Best trial final validation accuracy: {}\".format(\n",
|
||||
" best_trial.last_result[\"accuracy\"]))\n",
|
||||
"\n",
|
||||
" if ray.util.client.ray.is_connected():\n",
|
||||
" # If using Ray Client, we want to make sure checkpoint access\n",
|
||||
" # happens on the server. So we wrap `test_best_model` in a Ray task.\n",
|
||||
" # We have to make sure it gets executed on the same node that\n",
|
||||
" # ``tune.run`` is called on.\n",
|
||||
" from ray.util.ml_utils.node import force_on_current_node\n",
|
||||
" remote_fn = force_on_current_node(ray.remote(test_best_model))\n",
|
||||
" ray.get(remote_fn.remote(best_trial))\n",
|
||||
" else:\n",
|
||||
" test_best_model(best_trial)\n",
|
||||
"\n",
|
||||
"main(num_samples=2, max_num_epochs=2, gpus_per_trial=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b702b4ce",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you run the code, an example output could look like this:\n",
|
||||
"\n",
|
||||
"```{code-block} bash\n",
|
||||
":emphasize-lines: 7\n",
|
||||
"\n",
|
||||
" Number of trials: 10 (10 TERMINATED)\n",
|
||||
" +-------------------------+------------+-------+------+------+-------------+--------------+---------+------------+----------------------+\n",
|
||||
" | Trial name | status | loc | l1 | l2 | lr | batch_size | loss | accuracy | training_iteration |\n",
|
||||
" |-------------------------+------------+-------+------+------+-------------+--------------+---------+------------+----------------------|\n",
|
||||
" | train_cifar_87d1f_00000 | TERMINATED | | 64 | 4 | 0.00011629 | 2 | 1.87273 | 0.244 | 2 |\n",
|
||||
" | train_cifar_87d1f_00001 | TERMINATED | | 32 | 64 | 0.000339763 | 8 | 1.23603 | 0.567 | 8 |\n",
|
||||
" | train_cifar_87d1f_00002 | TERMINATED | | 8 | 16 | 0.00276249 | 16 | 1.1815 | 0.5836 | 10 |\n",
|
||||
" | train_cifar_87d1f_00003 | TERMINATED | | 4 | 64 | 0.000648721 | 4 | 1.31131 | 0.5224 | 8 |\n",
|
||||
" | train_cifar_87d1f_00004 | TERMINATED | | 32 | 16 | 0.000340753 | 8 | 1.26454 | 0.5444 | 8 |\n",
|
||||
" | train_cifar_87d1f_00005 | TERMINATED | | 8 | 4 | 0.000699775 | 8 | 1.99594 | 0.1983 | 2 |\n",
|
||||
" | train_cifar_87d1f_00006 | TERMINATED | | 256 | 8 | 0.0839654 | 16 | 2.3119 | 0.0993 | 1 |\n",
|
||||
" | train_cifar_87d1f_00007 | TERMINATED | | 16 | 128 | 0.0758154 | 16 | 2.33575 | 0.1327 | 1 |\n",
|
||||
" | train_cifar_87d1f_00008 | TERMINATED | | 16 | 8 | 0.0763312 | 16 | 2.31129 | 0.1042 | 4 |\n",
|
||||
" | train_cifar_87d1f_00009 | TERMINATED | | 128 | 16 | 0.000124903 | 4 | 2.26917 | 0.1945 | 1 |\n",
|
||||
" +-------------------------+------------+-------+------+------+-------------+--------------+---------+------------+----------------------+\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" Best trial config: {'l1': 8, 'l2': 16, 'lr': 0.0027624906698231976, 'batch_size': 16, 'data_dir': '...'}\n",
|
||||
" Best trial final validation loss: 1.1815014744281769\n",
|
||||
" Best trial final validation accuracy: 0.5836\n",
|
||||
" Best trial test set accuracy: 0.5806\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"As you can see, most trials have been stopped early in order to avoid wasting resources.\n",
|
||||
"The best performing trial achieved a validation accuracy of about 58%, which could\n",
|
||||
"be confirmed on the test set.\n",
|
||||
"\n",
|
||||
"So that's it! You can now tune the parameters of your PyTorch models.\n",
|
||||
"\n",
|
||||
"## See More PyTorch Examples\n",
|
||||
"\n",
|
||||
"- {doc}`/tune/examples/includes/mnist_pytorch`: Converts the PyTorch MNIST example to use Tune with the function-based API.\n",
|
||||
" Also shows how to easily convert something relying on argparse to use Tune.\n",
|
||||
"- {doc}`/tune/examples/includes/ddp_mnist_torch`: An example showing how to use DistributedDataParallel with Ray Tune.\n",
|
||||
" This enables both distributed training and distributed hyperparameter tuning.\n",
|
||||
"- {doc}`/tune/examples/includes/pbt_convnet_function_example`: Example training a ConvNet with checkpointing in function API.\n",
|
||||
"- {doc}`/tune/examples/includes/mnist_pytorch_trainable`: Converts the PyTorch MNIST example to use Tune with Trainable API.\n",
|
||||
" Also uses the HyperBandScheduler and checkpoints the model at the end."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
812
doc/source/tune/examples/tune-pytorch-lightning.ipynb
Normal file
812
doc/source/tune/examples/tune-pytorch-lightning.ipynb
Normal file
|
@ -0,0 +1,812 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "aa6af4d3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Using PyTorch Lightning with Tune\n",
|
||||
"\n",
|
||||
"(tune-pytorch-lightning-ref)=\n",
|
||||
"\n",
|
||||
"PyTorch Lightning is a framework which brings structure into training PyTorch models. It\n",
|
||||
"aims to avoid boilerplate code, so you don't have to write the same training\n",
|
||||
"loops all over again when building a new model.\n",
|
||||
"\n",
|
||||
"```{image} /images/pytorch_lightning_full.png\n",
|
||||
":align: center\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"The main abstraction of PyTorch Lightning is the `LightningModule` class, which\n",
|
||||
"should be extended by your application. There is [a great post on how to transfer your models from vanilla PyTorch to Lightning](https://towardsdatascience.com/from-pytorch-to-pytorch-lightning-a-gentle-introduction-b371b7caaf09).\n",
|
||||
"\n",
|
||||
"The class structure of PyTorch Lightning makes it very easy to define and tune model\n",
|
||||
"parameters. This tutorial will show you how to use Tune to find the best set of\n",
|
||||
"parameters for your application on the example of training a MNIST classifier. Notably,\n",
|
||||
"the `LightningModule` does not have to be altered at all for this - so you can\n",
|
||||
"use it plug and play for your existing models, assuming their parameters are configurable!\n",
|
||||
"\n",
|
||||
":::{note}\n",
|
||||
"To run this example, you will need to install the following:\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"$ pip install \"ray[tune]\" torch torchvision pytorch-lightning\n",
|
||||
"```\n",
|
||||
":::\n",
|
||||
"\n",
|
||||
":::{tip}\n",
|
||||
"If you want distributed PyTorch Lightning Training on Ray in addition to hyperparameter tuning with Tune,\n",
|
||||
"check out the [Ray Lightning Library](https://github.com/ray-project/ray_lightning)\n",
|
||||
":::\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## PyTorch Lightning classifier for MNIST\n",
|
||||
"\n",
|
||||
"Let's first start with the basic PyTorch Lightning implementation of an MNIST classifier.\n",
|
||||
"This classifier does not include any tuning code at this point.\n",
|
||||
"\n",
|
||||
"Our example builds on the MNIST example from the [blog post we talked about\n",
|
||||
"earlier](https://towardsdatascience.com/from-pytorch-to-pytorch-lightning-a-gentle-introduction-b371b7caaf09).\n",
|
||||
"\n",
|
||||
"First, we run some imports:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e6e77570",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import math\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import pytorch_lightning as pl\n",
|
||||
"from filelock import FileLock\n",
|
||||
"from torch.utils.data import DataLoader, random_split\n",
|
||||
"from torch.nn import functional as F\n",
|
||||
"from torchvision.datasets import MNIST\n",
|
||||
"from torchvision import transforms\n",
|
||||
"import os"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3c442e73",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And then there is the Lightning model adapted from the blog post.\n",
|
||||
"Note that we left out the test set validation and made the model parameters\n",
|
||||
"configurable through a `config` dict that is passed on initialization.\n",
|
||||
"Also, we specify a `data_dir` where the MNIST data will be stored. Note that\n",
|
||||
"we use a `FileLock` for downloading data so that the dataset is only downloaded\n",
|
||||
"once per node.\n",
|
||||
"Lastly, we added a new metric, the validation accuracy, to the logs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "48b20f48",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class LightningMNISTClassifier(pl.LightningModule):\n",
|
||||
" \"\"\"\n",
|
||||
" This has been adapted from\n",
|
||||
" https://towardsdatascience.com/from-pytorch-to-pytorch-lightning-a-gentle-introduction-b371b7caaf09\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self, config, data_dir=None):\n",
|
||||
" super(LightningMNISTClassifier, self).__init__()\n",
|
||||
"\n",
|
||||
" self.data_dir = data_dir or os.getcwd()\n",
|
||||
"\n",
|
||||
" self.layer_1_size = config[\"layer_1_size\"]\n",
|
||||
" self.layer_2_size = config[\"layer_2_size\"]\n",
|
||||
" self.lr = config[\"lr\"]\n",
|
||||
" self.batch_size = config[\"batch_size\"]\n",
|
||||
"\n",
|
||||
" # mnist images are (1, 28, 28) (channels, width, height)\n",
|
||||
" self.layer_1 = torch.nn.Linear(28 * 28, self.layer_1_size)\n",
|
||||
" self.layer_2 = torch.nn.Linear(self.layer_1_size, self.layer_2_size)\n",
|
||||
" self.layer_3 = torch.nn.Linear(self.layer_2_size, 10)\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" batch_size, channels, width, height = x.size()\n",
|
||||
" x = x.view(batch_size, -1)\n",
|
||||
"\n",
|
||||
" x = self.layer_1(x)\n",
|
||||
" x = torch.relu(x)\n",
|
||||
"\n",
|
||||
" x = self.layer_2(x)\n",
|
||||
" x = torch.relu(x)\n",
|
||||
"\n",
|
||||
" x = self.layer_3(x)\n",
|
||||
" x = torch.log_softmax(x, dim=1)\n",
|
||||
"\n",
|
||||
" return x\n",
|
||||
"\n",
|
||||
" def cross_entropy_loss(self, logits, labels):\n",
|
||||
" return F.nll_loss(logits, labels)\n",
|
||||
"\n",
|
||||
" def accuracy(self, logits, labels):\n",
|
||||
" _, predicted = torch.max(logits.data, 1)\n",
|
||||
" correct = (predicted == labels).sum().item()\n",
|
||||
" accuracy = correct / len(labels)\n",
|
||||
" return torch.tensor(accuracy)\n",
|
||||
"\n",
|
||||
" def training_step(self, train_batch, batch_idx):\n",
|
||||
" x, y = train_batch\n",
|
||||
" logits = self.forward(x)\n",
|
||||
" loss = self.cross_entropy_loss(logits, y)\n",
|
||||
" accuracy = self.accuracy(logits, y)\n",
|
||||
"\n",
|
||||
" self.log(\"ptl/train_loss\", loss)\n",
|
||||
" self.log(\"ptl/train_accuracy\", accuracy)\n",
|
||||
" return loss\n",
|
||||
"\n",
|
||||
" def validation_step(self, val_batch, batch_idx):\n",
|
||||
" x, y = val_batch\n",
|
||||
" logits = self.forward(x)\n",
|
||||
" loss = self.cross_entropy_loss(logits, y)\n",
|
||||
" accuracy = self.accuracy(logits, y)\n",
|
||||
" return {\"val_loss\": loss, \"val_accuracy\": accuracy}\n",
|
||||
"\n",
|
||||
" def validation_epoch_end(self, outputs):\n",
|
||||
" avg_loss = torch.stack([x[\"val_loss\"] for x in outputs]).mean()\n",
|
||||
" avg_acc = torch.stack([x[\"val_accuracy\"] for x in outputs]).mean()\n",
|
||||
" self.log(\"ptl/val_loss\", avg_loss)\n",
|
||||
" self.log(\"ptl/val_accuracy\", avg_acc)\n",
|
||||
"\n",
|
||||
" @staticmethod\n",
|
||||
" def download_data(data_dir):\n",
|
||||
" transform = transforms.Compose([\n",
|
||||
" transforms.ToTensor(),\n",
|
||||
" transforms.Normalize((0.1307, ), (0.3081, ))\n",
|
||||
" ])\n",
|
||||
" with FileLock(os.path.expanduser(\"~/.data.lock\")):\n",
|
||||
" return MNIST(data_dir, train=True, download=True, transform=transform)\n",
|
||||
"\n",
|
||||
" def prepare_data(self):\n",
|
||||
" mnist_train = self.download_data(self.data_dir)\n",
|
||||
"\n",
|
||||
" self.mnist_train, self.mnist_val = random_split(\n",
|
||||
" mnist_train, [55000, 5000])\n",
|
||||
"\n",
|
||||
" def train_dataloader(self):\n",
|
||||
" return DataLoader(self.mnist_train, batch_size=int(self.batch_size))\n",
|
||||
"\n",
|
||||
" def val_dataloader(self):\n",
|
||||
" return DataLoader(self.mnist_val, batch_size=int(self.batch_size))\n",
|
||||
"\n",
|
||||
" def configure_optimizers(self):\n",
|
||||
" optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)\n",
|
||||
" return optimizer\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def train_mnist(config):\n",
|
||||
" model = LightningMNISTClassifier(config)\n",
|
||||
" trainer = pl.Trainer(max_epochs=10, enable_progress_bar=False)\n",
|
||||
"\n",
|
||||
" trainer.fit(model)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "da1c3632",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And that's it! You can now run `train_mnist(config)` to train the classifier, e.g.\n",
|
||||
"like so:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "86df3d39",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train_mnist_no_tune():\n",
|
||||
" config = {\n",
|
||||
" \"layer_1_size\": 128,\n",
|
||||
" \"layer_2_size\": 256,\n",
|
||||
" \"lr\": 1e-3,\n",
|
||||
" \"batch_size\": 64\n",
|
||||
" }\n",
|
||||
" train_mnist(config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "edcc0991",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Tuning the model parameters\n",
|
||||
"\n",
|
||||
"The parameters above should give you a good accuracy of over 90% already. However,\n",
|
||||
"we might improve on this simply by changing some of the hyperparameters. For instance,\n",
|
||||
"maybe we get an even higher accuracy if we used a larger batch size.\n",
|
||||
"\n",
|
||||
"Instead of guessing the parameter values, let's use Tune to systematically try out\n",
|
||||
"parameter combinations and find the best performing set.\n",
|
||||
"\n",
|
||||
"First, we need some additional imports:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "34faeb3b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pytorch_lightning.loggers import TensorBoardLogger\n",
|
||||
"from ray import tune\n",
|
||||
"from ray.tune import CLIReporter\n",
|
||||
"from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining\n",
|
||||
"from ray.tune.integration.pytorch_lightning import TuneReportCallback, \\\n",
|
||||
" TuneReportCheckpointCallback"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f65b9c5f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Talking to Tune with a PyTorch Lightning callback\n",
|
||||
"\n",
|
||||
"PyTorch Lightning introduced [Callbacks](https://pytorch-lightning.readthedocs.io/en/latest/extensions/callbacks.html)\n",
|
||||
"that can be used to plug custom functions into the training loop. This way the original\n",
|
||||
"`LightningModule` does not have to be altered at all. Also, we could use the same\n",
|
||||
"callback for multiple modules.\n",
|
||||
"\n",
|
||||
"Ray Tune comes with ready-to-use PyTorch Lightning callbacks. To report metrics\n",
|
||||
"back to Tune after each validation epoch, we will use the `TuneReportCallback`:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4bab80bc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TuneReportCallback(\n",
|
||||
" {\n",
|
||||
" \"loss\": \"ptl/val_loss\",\n",
|
||||
" \"mean_accuracy\": \"ptl/val_accuracy\"\n",
|
||||
" },\n",
|
||||
" on=\"validation_end\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "286a1070",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This callback will take the `val_loss` and `val_accuracy` values\n",
|
||||
"from the PyTorch Lightning trainer and report them to Tune as the `loss`\n",
|
||||
"and `mean_accuracy`, respectively.\n",
|
||||
"\n",
|
||||
"### Adding the Tune training function\n",
|
||||
"\n",
|
||||
"Then we specify our training function. Note that we added the `data_dir` as a\n",
|
||||
"parameter here to avoid\n",
|
||||
"that each training run downloads the full MNIST dataset. Instead, we want to access\n",
|
||||
"a shared data location.\n",
|
||||
"\n",
|
||||
"We are also able to specify the number of epochs to train each model, and the number\n",
|
||||
"of GPUs we want to use for training. We also create a TensorBoard logger that writes\n",
|
||||
"logfiles directly into Tune's root trial directory - if we didn't do that PyTorch\n",
|
||||
"Lightning would create subdirectories, and each trial would thus be shown twice in\n",
|
||||
"TensorBoard, one time for Tune's logs, and another time for PyTorch Lightning's logs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "74e7d1c2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train_mnist_tune(config, num_epochs=10, num_gpus=0, data_dir=\"~/data\"):\n",
|
||||
" data_dir = os.path.expanduser(data_dir)\n",
|
||||
" model = LightningMNISTClassifier(config, data_dir)\n",
|
||||
" trainer = pl.Trainer(\n",
|
||||
" max_epochs=num_epochs,\n",
|
||||
" # If fractional GPUs passed in, convert to int.\n",
|
||||
" gpus=math.ceil(num_gpus),\n",
|
||||
" logger=TensorBoardLogger(\n",
|
||||
" save_dir=tune.get_trial_dir(), name=\"\", version=\".\"),\n",
|
||||
" enable_progress_bar=False,\n",
|
||||
" callbacks=[\n",
|
||||
" TuneReportCallback(\n",
|
||||
" {\n",
|
||||
" \"loss\": \"ptl/val_loss\",\n",
|
||||
" \"mean_accuracy\": \"ptl/val_accuracy\"\n",
|
||||
" },\n",
|
||||
" on=\"validation_end\")\n",
|
||||
" ])\n",
|
||||
" trainer.fit(model)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cf0f6d6e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configuring the search space\n",
|
||||
"\n",
|
||||
"Now we configure the parameter search space. We would like to choose between three\n",
|
||||
"different layer and batch sizes. The learning rate should be sampled uniformly between\n",
|
||||
"`0.0001` and `0.1`. The `tune.loguniform()` function is syntactic sugar to make\n",
|
||||
"sampling between these different orders of magnitude easier, specifically\n",
|
||||
"we are able to also sample small values."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a50645e9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"config = {\n",
|
||||
" \"layer_1_size\": tune.choice([32, 64, 128]),\n",
|
||||
" \"layer_2_size\": tune.choice([64, 128, 256]),\n",
|
||||
" \"lr\": tune.loguniform(1e-4, 1e-1),\n",
|
||||
" \"batch_size\": tune.choice([32, 64, 128]),\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b1fb9ecd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Selecting a scheduler\n",
|
||||
"\n",
|
||||
"In this example, we use an [Asynchronous Hyperband](https://blog.ml.cmu.edu/2018/12/12/massively-parallel-hyperparameter-optimization/)\n",
|
||||
"scheduler. This scheduler decides at each iteration which trials are likely to perform\n",
|
||||
"badly, and stops these trials. This way we don't waste any resources on bad hyperparameter\n",
|
||||
"configurations."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a2596b01",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"num_epochs = 10\n",
|
||||
"\n",
|
||||
"scheduler = ASHAScheduler(\n",
|
||||
" max_t=num_epochs,\n",
|
||||
" grace_period=1,\n",
|
||||
" reduction_factor=2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9a49ae58",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Changing the CLI output\n",
|
||||
"\n",
|
||||
"We instantiate a `CLIReporter` to specify which metrics we would like to see in our\n",
|
||||
"output tables in the command line. This is optional, but can be used to make sure our\n",
|
||||
"output tables only include information we would like to see."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cd605a16",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"reporter = CLIReporter(\n",
|
||||
" parameter_columns=[\"layer_1_size\", \"layer_2_size\", \"lr\", \"batch_size\"],\n",
|
||||
" metric_columns=[\"loss\", \"mean_accuracy\", \"training_iteration\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5ec9a305",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Passing constants to the train function\n",
|
||||
"\n",
|
||||
"The `data_dir`, `num_epochs` and `num_gpus` we pass to the training function\n",
|
||||
"are constants. To avoid including them as non-configurable parameters in the `config`\n",
|
||||
"specification, we can use `tune.with_parameters` to wrap around the training function."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "332668dc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gpus_per_trial = 0\n",
|
||||
"data_dir = \"~/data\"\n",
|
||||
"\n",
|
||||
"train_fn_with_parameters = tune.with_parameters(train_mnist_tune,\n",
|
||||
" num_epochs=num_epochs,\n",
|
||||
" num_gpus=gpus_per_trial,\n",
|
||||
" data_dir=data_dir)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "feef8c39",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Training with GPUs\n",
|
||||
"\n",
|
||||
"We can specify how many resources Tune should request for each trial.\n",
|
||||
"This also includes GPUs.\n",
|
||||
"\n",
|
||||
"PyTorch Lightning takes care of moving the training to the GPUs. We\n",
|
||||
"already made sure that our code is compatible with that, so there's\n",
|
||||
"nothing more to do here other than to specify the number of GPUs\n",
|
||||
"we would like to use:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dc402716",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"resources_per_trial = {\"cpu\": 1, \"gpu\": gpus_per_trial}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ca050dfa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also specify {doc}`fractional GPUs for Tune <../../ray-core/using-ray-with-gpus>`,\n",
|
||||
"allowing multiple trials to share GPUs and thus increase concurrency under resource constraints.\n",
|
||||
"While the `gpus_per_trial` passed into\n",
|
||||
"Tune is a decimal value, the `gpus` passed into the `pl.Trainer` should still be an integer.\n",
|
||||
"Please note that if using fractional GPUs, it is the user's responsibility to\n",
|
||||
"make sure multiple trials can share GPUs and there is enough memory to do so.\n",
|
||||
"Ray does not automatically handle this for you.\n",
|
||||
"\n",
|
||||
"If you want to use multiple GPUs per trial, you should check out the\n",
|
||||
"[Ray Lightning Library](https://github.com/ray-project/ray_lightning).\n",
|
||||
"This library makes it easy to run multiple concurrent trials with Ray Tune, with each trial also running\n",
|
||||
"in a distributed fashion using Ray.\n",
|
||||
"\n",
|
||||
"### Putting it together\n",
|
||||
"\n",
|
||||
"Lastly, we need to start Tune with `tune.run()`.\n",
|
||||
"\n",
|
||||
"The full code looks like this:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ea182330",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tune_mnist_asha(num_samples=10, num_epochs=10, gpus_per_trial=0, data_dir=\"~/data\"):\n",
|
||||
" config = {\n",
|
||||
" \"layer_1_size\": tune.choice([32, 64, 128]),\n",
|
||||
" \"layer_2_size\": tune.choice([64, 128, 256]),\n",
|
||||
" \"lr\": tune.loguniform(1e-4, 1e-1),\n",
|
||||
" \"batch_size\": tune.choice([32, 64, 128]),\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" scheduler = ASHAScheduler(\n",
|
||||
" max_t=num_epochs,\n",
|
||||
" grace_period=1,\n",
|
||||
" reduction_factor=2)\n",
|
||||
"\n",
|
||||
" reporter = CLIReporter(\n",
|
||||
" parameter_columns=[\"layer_1_size\", \"layer_2_size\", \"lr\", \"batch_size\"],\n",
|
||||
" metric_columns=[\"loss\", \"mean_accuracy\", \"training_iteration\"])\n",
|
||||
"\n",
|
||||
" train_fn_with_parameters = tune.with_parameters(train_mnist_tune,\n",
|
||||
" num_epochs=num_epochs,\n",
|
||||
" num_gpus=gpus_per_trial,\n",
|
||||
" data_dir=data_dir)\n",
|
||||
" resources_per_trial = {\"cpu\": 1, \"gpu\": gpus_per_trial}\n",
|
||||
"\n",
|
||||
" analysis = tune.run(train_fn_with_parameters,\n",
|
||||
" resources_per_trial=resources_per_trial,\n",
|
||||
" metric=\"loss\",\n",
|
||||
" mode=\"min\",\n",
|
||||
" config=config,\n",
|
||||
" num_samples=num_samples,\n",
|
||||
" scheduler=scheduler,\n",
|
||||
" progress_reporter=reporter,\n",
|
||||
" name=\"tune_mnist_asha\")\n",
|
||||
"\n",
|
||||
" print(\"Best hyperparameters found were: \", analysis.best_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1fb96b6c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In the example above, Tune runs 10 trials with different hyperparameter configurations.\n",
|
||||
"An example output could look like so:\n",
|
||||
"\n",
|
||||
"```{code-block} bash\n",
|
||||
":emphasize-lines: 12\n",
|
||||
"\n",
|
||||
" +------------------------------+------------+-------+----------------+----------------+-------------+--------------+----------+-----------------+----------------------+\n",
|
||||
" | Trial name | status | loc | layer_1_size | layer_2_size | lr | batch_size | loss | mean_accuracy | training_iteration |\n",
|
||||
" |------------------------------+------------+-------+----------------+----------------+-------------+--------------+----------+-----------------+----------------------|\n",
|
||||
" | train_mnist_tune_63ecc_00000 | TERMINATED | | 128 | 64 | 0.00121197 | 128 | 0.120173 | 0.972461 | 10 |\n",
|
||||
" | train_mnist_tune_63ecc_00001 | TERMINATED | | 64 | 128 | 0.0301395 | 128 | 0.454836 | 0.868164 | 4 |\n",
|
||||
" | train_mnist_tune_63ecc_00002 | TERMINATED | | 64 | 128 | 0.0432097 | 128 | 0.718396 | 0.718359 | 1 |\n",
|
||||
" | train_mnist_tune_63ecc_00003 | TERMINATED | | 32 | 128 | 0.000294669 | 32 | 0.111475 | 0.965764 | 10 |\n",
|
||||
" | train_mnist_tune_63ecc_00004 | TERMINATED | | 32 | 256 | 0.000386664 | 64 | 0.133538 | 0.960839 | 8 |\n",
|
||||
" | train_mnist_tune_63ecc_00005 | TERMINATED | | 128 | 128 | 0.0837395 | 32 | 2.32628 | 0.0991242 | 1 |\n",
|
||||
" | train_mnist_tune_63ecc_00006 | TERMINATED | | 64 | 128 | 0.000158761 | 128 | 0.134595 | 0.959766 | 10 |\n",
|
||||
" | train_mnist_tune_63ecc_00007 | TERMINATED | | 64 | 64 | 0.000672126 | 64 | 0.118182 | 0.972903 | 10 |\n",
|
||||
" | train_mnist_tune_63ecc_00008 | TERMINATED | | 128 | 64 | 0.000502428 | 32 | 0.11082 | 0.975518 | 10 |\n",
|
||||
" | train_mnist_tune_63ecc_00009 | TERMINATED | | 64 | 256 | 0.00112894 | 32 | 0.13472 | 0.971935 | 8 |\n",
|
||||
" +------------------------------+------------+-------+----------------+----------------+-------------+--------------+----------+-----------------+----------------------+\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"As you can see in the `training_iteration` column, trials with a high loss\n",
|
||||
"(and low accuracy) have been terminated early. The best performing trial used\n",
|
||||
"`layer_1_size=128`, `layer_2_size=64`, `lr=0.000502428` and\n",
|
||||
"`batch_size=32`.\n",
|
||||
"\n",
|
||||
"## Using Population Based Training to find the best parameters\n",
|
||||
"\n",
|
||||
"The `ASHAScheduler` terminates those trials early that show bad performance.\n",
|
||||
"Sometimes, this stops trials that would get better after more training steps,\n",
|
||||
"and which might eventually even show better performance than other configurations.\n",
|
||||
"\n",
|
||||
"Another popular method for hyperparameter tuning, called\n",
|
||||
"[Population Based Training](https://deepmind.com/blog/article/population-based-training-neural-networks),\n",
|
||||
"instead perturbs hyperparameters during the training run. Tune implements PBT, and\n",
|
||||
"we only need to make some slight adjustments to our code.\n",
|
||||
"\n",
|
||||
"### Adding checkpoints to the PyTorch Lightning module\n",
|
||||
"\n",
|
||||
"First, we need to introduce\n",
|
||||
"another callback to save model checkpoints. Since Tune requires a call to\n",
|
||||
"`tune.report()` after creating a new checkpoint to register it, we will use\n",
|
||||
"a combined reporting and checkpointing callback:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7f86e4d8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TuneReportCheckpointCallback(\n",
|
||||
" metrics={\n",
|
||||
" \"loss\": \"ptl/val_loss\",\n",
|
||||
" \"mean_accuracy\": \"ptl/val_accuracy\"\n",
|
||||
" },\n",
|
||||
" filename=\"checkpoint\",\n",
|
||||
" on=\"validation_end\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "33a76d5b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The `checkpoint` value is the name of the checkpoint file within the\n",
|
||||
"checkpoint directory.\n",
|
||||
"\n",
|
||||
"We also include checkpoint loading in our training function:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "746e962a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train_mnist_tune_checkpoint(config,\n",
|
||||
" checkpoint_dir=None,\n",
|
||||
" num_epochs=10,\n",
|
||||
" num_gpus=0,\n",
|
||||
" data_dir=\"~/data\"):\n",
|
||||
" data_dir = os.path.expanduser(data_dir)\n",
|
||||
" kwargs = {\n",
|
||||
" \"max_epochs\": num_epochs,\n",
|
||||
" # If fractional GPUs passed in, convert to int.\n",
|
||||
" \"gpus\": math.ceil(num_gpus),\n",
|
||||
" \"logger\": TensorBoardLogger(\n",
|
||||
" save_dir=tune.get_trial_dir(), name=\"\", version=\".\"),\n",
|
||||
" \"enable_progress_bar\": False,\n",
|
||||
" \"callbacks\": [\n",
|
||||
" TuneReportCheckpointCallback(\n",
|
||||
" metrics={\n",
|
||||
" \"loss\": \"ptl/val_loss\",\n",
|
||||
" \"mean_accuracy\": \"ptl/val_accuracy\"\n",
|
||||
" },\n",
|
||||
" filename=\"checkpoint\",\n",
|
||||
" on=\"validation_end\")\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" if checkpoint_dir:\n",
|
||||
" kwargs[\"resume_from_checkpoint\"] = os.path.join(\n",
|
||||
" checkpoint_dir, \"checkpoint\")\n",
|
||||
"\n",
|
||||
" model = LightningMNISTClassifier(config=config, data_dir=data_dir)\n",
|
||||
" trainer = pl.Trainer(**kwargs)\n",
|
||||
"\n",
|
||||
" trainer.fit(model)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "39dc7b46",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configuring and running Population Based Training\n",
|
||||
"\n",
|
||||
"We need to call Tune slightly differently:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e12a1bd5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tune_mnist_pbt(num_samples=10, num_epochs=10, gpus_per_trial=0, data_dir=\"~/data\"):\n",
|
||||
" config = {\n",
|
||||
" \"layer_1_size\": tune.choice([32, 64, 128]),\n",
|
||||
" \"layer_2_size\": tune.choice([64, 128, 256]),\n",
|
||||
" \"lr\": 1e-3,\n",
|
||||
" \"batch_size\": 64,\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" scheduler = PopulationBasedTraining(\n",
|
||||
" perturbation_interval=4,\n",
|
||||
" hyperparam_mutations={\n",
|
||||
" \"lr\": tune.loguniform(1e-4, 1e-1),\n",
|
||||
" \"batch_size\": [32, 64, 128]\n",
|
||||
" })\n",
|
||||
"\n",
|
||||
" reporter = CLIReporter(\n",
|
||||
" parameter_columns=[\"layer_1_size\", \"layer_2_size\", \"lr\", \"batch_size\"],\n",
|
||||
" metric_columns=[\"loss\", \"mean_accuracy\", \"training_iteration\"])\n",
|
||||
"\n",
|
||||
" analysis = tune.run(\n",
|
||||
" tune.with_parameters(\n",
|
||||
" train_mnist_tune_checkpoint,\n",
|
||||
" num_epochs=num_epochs,\n",
|
||||
" num_gpus=gpus_per_trial,\n",
|
||||
" data_dir=data_dir),\n",
|
||||
" resources_per_trial={\n",
|
||||
" \"cpu\": 1,\n",
|
||||
" \"gpu\": gpus_per_trial\n",
|
||||
" },\n",
|
||||
" metric=\"loss\",\n",
|
||||
" mode=\"min\",\n",
|
||||
" config=config,\n",
|
||||
" num_samples=num_samples,\n",
|
||||
" scheduler=scheduler,\n",
|
||||
" progress_reporter=reporter,\n",
|
||||
" name=\"tune_mnist_pbt\")\n",
|
||||
"\n",
|
||||
" print(\"Best hyperparameters found were: \", analysis.best_config)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6087f807",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Instead of passing tune parameters to the `config` dict, we start\n",
|
||||
"with fixed values, though we are also able to sample some of them, like the\n",
|
||||
"layer sizes. Additionally, we have to tell PBT how to perturb the hyperparameters.\n",
|
||||
"Note that the layer sizes are not tuned right here. This is because we cannot simply\n",
|
||||
"change layer sizes during a training run - which is what would happen in PBT.\n",
|
||||
"\n",
|
||||
"To test running both of our main scripts (`tune_mnist_asha` and `tune_mnist_pbt`), all you have to do is specify\n",
|
||||
"a `data_dir` folder and run the scripts with reasonable parameters:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_dir = \"~/data/\"\n",
|
||||
"\n",
|
||||
"tune_mnist_asha(num_samples=1, num_epochs=6, gpus_per_trial=0, data_dir=data_dir)\n",
|
||||
"tune_mnist_pbt(num_samples=1, num_epochs=6, gpus_per_trial=0, data_dir=data_dir)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"If you have more resources available (e.g. a GPU), you can modify the above parameters accordingly.\n",
|
||||
"\n",
|
||||
"An example output of a run could look like this:\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"+-----------------------------------------+------------+-------+----------------+----------------+-----------+--------------+-----------+-----------------+----------------------+\n",
|
||||
"| Trial name | status | loc | layer_1_size | layer_2_size | lr | batch_size | loss | mean_accuracy | training_iteration |\n",
|
||||
"|-----------------------------------------+------------+-------+----------------+----------------+-----------+--------------+-----------+-----------------+----------------------|\n",
|
||||
"| train_mnist_tune_checkpoint_85489_00000 | TERMINATED | | 128 | 128 | 0.001 | 64 | 0.108734 | 0.973101 | 10 |\n",
|
||||
"| train_mnist_tune_checkpoint_85489_00001 | TERMINATED | | 128 | 128 | 0.001 | 64 | 0.093577 | 0.978639 | 10 |\n",
|
||||
"| train_mnist_tune_checkpoint_85489_00002 | TERMINATED | | 128 | 256 | 0.0008 | 32 | 0.0922348 | 0.979299 | 10 |\n",
|
||||
"| train_mnist_tune_checkpoint_85489_00003 | TERMINATED | | 64 | 256 | 0.001 | 64 | 0.124648 | 0.973892 | 10 |\n",
|
||||
"| train_mnist_tune_checkpoint_85489_00004 | TERMINATED | | 128 | 64 | 0.001 | 64 | 0.101717 | 0.975079 | 10 |\n",
|
||||
"| train_mnist_tune_checkpoint_85489_00005 | TERMINATED | | 64 | 64 | 0.001 | 64 | 0.121467 | 0.969146 | 10 |\n",
|
||||
"| train_mnist_tune_checkpoint_85489_00006 | TERMINATED | | 128 | 256 | 0.00064 | 32 | 0.053446 | 0.987062 | 10 |\n",
|
||||
"| train_mnist_tune_checkpoint_85489_00007 | TERMINATED | | 128 | 256 | 0.001 | 64 | 0.129804 | 0.973497 | 10 |\n",
|
||||
"| train_mnist_tune_checkpoint_85489_00008 | TERMINATED | | 64 | 256 | 0.0285125 | 128 | 0.363236 | 0.913867 | 10 |\n",
|
||||
"| train_mnist_tune_checkpoint_85489_00009 | TERMINATED | | 32 | 256 | 0.001 | 64 | 0.150946 | 0.964201 | 10 |\n",
|
||||
"+-----------------------------------------+------------+-------+----------------+----------------+-----------+--------------+-----------+-----------------+----------------------+\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"As you can see, each sample ran the full number of 10 iterations.\n",
|
||||
"All trials ended with quite good parameter combinations and showed relatively good performances.\n",
|
||||
"In some runs, the parameters have been perturbed. And the best configuration even reached a\n",
|
||||
"mean validation accuracy of `0.987062`!\n",
|
||||
"\n",
|
||||
"In summary, PyTorch Lightning Modules are easy to extend to use with Tune. It just took\n",
|
||||
"us importing one or two callbacks and a small wrapper function to get great performing\n",
|
||||
"parameter configurations.\n",
|
||||
"\n",
|
||||
"## More PyTorch Lightning Examples\n",
|
||||
"\n",
|
||||
"- {doc}`/tune/examples/includes/mnist_ptl_mini`:\n",
|
||||
" A minimal example of using [Pytorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n",
|
||||
" to train a MNIST model. This example utilizes the Ray Tune-provided\n",
|
||||
" {ref}`PyTorch Lightning callbacks <tune-integration-pytorch-lightning>`.\n",
|
||||
" See also {ref}`this tutorial for a full walkthrough <tune-pytorch-lightning-ref>`.\n",
|
||||
"- {ref}`A walkthrough tutorial for using Ray Tune with Pytorch-Lightning <tune-pytorch-lightning-ref>`.\n",
|
||||
"- {doc}`/tune/examples/includes/mlflow_ptl_example`: Example for using [MLflow](https://github.com/mlflow/mlflow/)\n",
|
||||
" and [Pytorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning) with Ray Tune."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -3,7 +3,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a9980775",
|
||||
"id": "3fb2a049",
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"remove-cell"
|
||||
|
@ -17,7 +17,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ba46c952",
|
||||
"id": "24af9556",
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"remove-cell"
|
||||
|
@ -30,11 +30,20 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0a05d46b",
|
||||
"id": "70ac1fe8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Model selection and serving with Ray Tune and Ray Serve\n",
|
||||
"\n",
|
||||
"```{image} /images/serve.svg\n",
|
||||
":align: center\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"This tutorial will show you an end-to-end example how to train a\n",
|
||||
"model using Ray Tune on incrementally arriving data and deploy\n",
|
||||
"the model using Ray Serve.\n",
|
||||
|
@ -123,7 +132,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0ca7cff7",
|
||||
"id": "0376c9c4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -152,7 +161,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2b1f869f",
|
||||
"id": "58eaafa1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data interface\n",
|
||||
|
@ -169,7 +178,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3bb602ec",
|
||||
"id": "fcf4de94",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -216,14 +225,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cb15d421",
|
||||
"id": "13612bb2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## PyTorch neural network classifier\n",
|
||||
"\n",
|
||||
"Next, we will introduce our PyTorch neural network model and the\n",
|
||||
"train and test function. These are adapted directly from\n",
|
||||
"our {doc}`PyTorch MNIST example </tune/examples/mnist_pytorch>`.\n",
|
||||
"our {doc}`PyTorch MNIST example </tune/examples/includes/mnist_pytorch>`.\n",
|
||||
"We only introduced an additional neural network layer with a configurable\n",
|
||||
"layer size. This is not strictly needed for learning good performance on\n",
|
||||
"MNIST, but it is useful to demonstrate scenarios where your hyperparameter\n",
|
||||
|
@ -233,7 +242,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7303568b",
|
||||
"id": "c2c21aa0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -283,7 +292,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bdebee8a",
|
||||
"id": "677ded46",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Tune trainable for model selection\n",
|
||||
|
@ -299,7 +308,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d344b960",
|
||||
"id": "4c29de4c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -362,7 +371,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8f8bc5ba",
|
||||
"id": "513f8db0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configuring the search space and starting Ray Tune\n",
|
||||
|
@ -381,7 +390,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7e3f5874",
|
||||
"id": "82fcbf6e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -437,7 +446,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2f3f7d0d",
|
||||
"id": "f051b634",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To continue training from an existing model, we can use this function\n",
|
||||
|
@ -453,7 +462,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fb127561",
|
||||
"id": "56b26451",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -515,7 +524,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7f9c7b87",
|
||||
"id": "a25629c1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Serving tuned models with Ray Serve\n",
|
||||
|
@ -534,7 +543,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "07f67a40",
|
||||
"id": "a0d6a4ca",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -566,7 +575,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "26101b5c",
|
||||
"id": "2ba14c4a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We would like to have a fixed location where we store the currently\n",
|
||||
|
@ -578,7 +587,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "36a78d9a",
|
||||
"id": "bba77923",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -615,7 +624,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a4102f33",
|
||||
"id": "f779c7bc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Since we would like to continue training from the current existing\n",
|
||||
|
@ -627,7 +636,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2ec6daa2",
|
||||
"id": "005f2787",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -646,7 +655,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4d636cd5",
|
||||
"id": "5c55a5d3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Putting everything together\n",
|
||||
|
@ -667,7 +676,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1d7b2fb8",
|
||||
"id": "053bbbfe",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -723,7 +732,7 @@
|
|||
" \"--from_scratch\",\n",
|
||||
" action=\"store_true\",\n",
|
||||
" help=\"Train and select best model from scratch\",\n",
|
||||
" default=False,\n",
|
||||
" default=True,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" parser.add_argument(\n",
|
||||
|
@ -748,7 +757,7 @@
|
|||
" \"--smoke-test\",\n",
|
||||
" action=\"store_true\",\n",
|
||||
" help=\"Finish quickly for testing\",\n",
|
||||
" default=False,\n",
|
||||
" default=True,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" args = parser.parse_args()\n",
|
||||
|
@ -831,7 +840,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ae6b2542",
|
||||
"id": "7c8be26a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"That's it! We now have an end-to-end workflow to train and update a\n",
|
||||
|
@ -860,8 +869,9 @@
|
|||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
}
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
}
|
|
@ -3,7 +3,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ff4fd80e",
|
||||
"id": "9cf0e5ac",
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"remove-cell"
|
||||
|
@ -17,7 +17,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "24024bba",
|
||||
"id": "29c1356a",
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"remove-cell"
|
||||
|
@ -30,7 +30,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0694daf8",
|
||||
"id": "a97b7427",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Tune's Scikit Learn Adapters\n",
|
||||
|
@ -44,8 +44,13 @@
|
|||
":width: 50%\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Scikit-Learn [has an existing module for model selection](https://scikit-learn.org/stable/modules/grid_search.html),\n",
|
||||
"but the algorithms offered (Grid Search via``GridSearchCV`` and Random Search via``RandomizedSearchCV``)\n",
|
||||
"but the algorithms offered (Grid Search via ``GridSearchCV`` and Random Search via ``RandomizedSearchCV``)\n",
|
||||
"are often considered inefficient.\n",
|
||||
"In this tutorial, we'll cover ``tune-sklearn``, a drop-in replacement for Scikit-Learn's model selection module\n",
|
||||
"with state-of-the-art optimization features such as early stopping and Bayesian Optimization.\n",
|
||||
|
@ -70,7 +75,8 @@
|
|||
"## Walkthrough\n",
|
||||
"\n",
|
||||
"Let's compare Tune's Scikit-Learn APIs to the standard scikit-learn GridSearchCV. For this example, we'll be using\n",
|
||||
"``TuneGridSearchCV`` with a [SGDClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html).\n",
|
||||
"``TuneGridSearchCV`` with a\n",
|
||||
"[SGDClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html).\n",
|
||||
"\n",
|
||||
"To start out, change the import statement to get tune-scikit-learn’s grid search cross validation interface:"
|
||||
]
|
||||
|
@ -78,7 +84,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7f6b2190",
|
||||
"id": "5a0cc1d8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -91,19 +97,20 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ab2e6677",
|
||||
"id": "3a8c2610",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And from there, we would proceed just like how we would in Scikit-Learn’s interface!\n",
|
||||
"\n",
|
||||
"The `SGDClassifier` has a ``partial_fit`` API, which enables it to stop fitting to the data for a certain hyperparameter configuration.\n",
|
||||
"The `SGDClassifier` has a ``partial_fit`` API, which enables it to stop fitting to the data for a certain\n",
|
||||
"hyperparameter configuration.\n",
|
||||
"If the estimator does not support early stopping, we would fall back to a parallel grid search."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "30320f82",
|
||||
"id": "712b215e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -130,7 +137,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "abc84e87",
|
||||
"id": "79870ffb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As you can see, the setup here is exactly how you would do it for Scikit-Learn.\n",
|
||||
|
@ -140,7 +147,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4f81facf",
|
||||
"id": "7f2541b0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -159,7 +166,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4c675139",
|
||||
"id": "831d6609",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note the slight differences we introduced above:\n",
|
||||
|
@ -169,7 +176,8 @@
|
|||
"\n",
|
||||
"The ``early_stopping`` parameter allows us to terminate unpromising configurations. If ``early_stopping=True``,\n",
|
||||
"TuneGridSearchCV will default to using Tune's ASHAScheduler.\n",
|
||||
"You can pass in a custom algorithm - see {ref}`Tune's documentation on schedulers <tune-schedulers>` here for a full list to choose from.\n",
|
||||
"You can pass in a custom algorithm - see {ref}`Tune's documentation on schedulers <tune-schedulers>`\n",
|
||||
"here for a full list to choose from.\n",
|
||||
"``max_iters`` is the maximum number of iterations a given hyperparameter set could run for;\n",
|
||||
"it may run for fewer iterations if it is early stopped.\n",
|
||||
"\n",
|
||||
|
@ -179,7 +187,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d43b1ed3",
|
||||
"id": "bad624d5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -197,14 +205,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f31efdbf",
|
||||
"id": "328accb8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Bayesian Optimization\n",
|
||||
"\n",
|
||||
"In addition to the grid search interface, tune-sklearn also provides an interface,\n",
|
||||
"TuneSearchCV, for sampling from **distributions of hyperparameters**.\n",
|
||||
"In the following example we'll be using the [digits dataset from scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html)\n",
|
||||
"In the following example we'll be using the\n",
|
||||
"[digits dataset from scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html)\n",
|
||||
"\n",
|
||||
"In addition, you can easily enable Bayesian optimization over the distributions in only 2 lines of code:"
|
||||
]
|
||||
|
@ -212,7 +221,7 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5016e858",
|
||||
"id": "21ccda8d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -246,22 +255,25 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ac30adce",
|
||||
"id": "0fb1dc0d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As you can see, it’s very simple to integrate tune-sklearn into existing code.\n",
|
||||
"Distributed execution is also easy - you can simply run ``ray.init(address=\"auto\")`` before\n",
|
||||
"TuneSearchCV to connect to the Ray cluster and parallelize tuning across multiple nodes, as you would in any other Ray Tune script.\n",
|
||||
"TuneSearchCV to connect to the Ray cluster and parallelize tuning across multiple nodes,\n",
|
||||
"as you would in any other Ray Tune script.\n",
|
||||
"\n",
|
||||
"## Code Examples\n",
|
||||
"## More Scikit-Learn Examples\n",
|
||||
"\n",
|
||||
"See the [ray-project/tune-sklearn examples](https://github.com/ray-project/tune-sklearn/tree/master/examples)\n",
|
||||
"for a comprehensive list of examples leveraging Tune's sklearn interface.\n",
|
||||
"Check out more detailed examples and get started with tune-sklearn!\n",
|
||||
"\n",
|
||||
"* [Skorch with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/torch_nn.py>)\n",
|
||||
"* [Scikit-Learn Pipelines with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/sklearn_pipeline.py>)\n",
|
||||
"* [XGBoost with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/xgbclassifier.py>)\n",
|
||||
"* [KerasClassifier with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/keras_example.py>)\n",
|
||||
"* [LightGBM with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/lgbm.py>)\n",
|
||||
"* [Skorch with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/torch_nn.py)\n",
|
||||
"* [Scikit-Learn Pipelines with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/sklearn_pipeline.py)\n",
|
||||
"* [XGBoost with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/xgbclassifier.py)\n",
|
||||
"* [KerasClassifier with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/keras_example.py)\n",
|
||||
"* [LightGBM with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/lgbm.py)\n",
|
||||
"\n",
|
||||
"## Further Reading\n",
|
||||
"\n",
|
||||
|
@ -275,8 +287,9 @@
|
|||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
}
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
}
|
349
doc/source/tune/examples/tune-wandb.ipynb
Normal file
349
doc/source/tune/examples/tune-wandb.ipynb
Normal file
|
@ -0,0 +1,349 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ecad719c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Using Weights & Biases with Tune\n",
|
||||
"\n",
|
||||
"(tune-wandb-ref)=\n",
|
||||
"\n",
|
||||
"[Weights & Biases](https://www.wandb.ai/) (Wandb) is a tool for experiment\n",
|
||||
"tracking, model optimization, and dataset versioning. It is very popular\n",
|
||||
"in the machine learning and data science community for its superb visualization\n",
|
||||
"tools.\n",
|
||||
"\n",
|
||||
"```{image} /images/wandb_logo_full.png\n",
|
||||
":align: center\n",
|
||||
":alt: Weights & Biases\n",
|
||||
":height: 80px\n",
|
||||
":target: https://www.wandb.ai/\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Ray Tune currently offers two lightweight integrations for Weights & Biases.\n",
|
||||
"One is the {ref}`WandbLoggerCallback <tune-wandb-logger>`, which automatically logs\n",
|
||||
"metrics reported to Tune to the Wandb API.\n",
|
||||
"\n",
|
||||
"The other one is the {ref}`@wandb_mixin <tune-wandb-mixin>` decorator, which can be\n",
|
||||
"used with the function API. It automatically\n",
|
||||
"initializes the Wandb API with Tune's training information. You can just use the\n",
|
||||
"Wandb API like you would normally do, e.g. using `wandb.log()` to log your training\n",
|
||||
"process.\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## Running A Weights & Biases Example\n",
|
||||
"\n",
|
||||
"In the following example we're going to use both of the above methods, namely the `WandbLoggerCallback` and\n",
|
||||
"the `wandb_mixin` decorator to log metrics.\n",
|
||||
"Let's start with a few crucial imports:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "100bcf8a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import wandb\n",
|
||||
"\n",
|
||||
"from ray import tune\n",
|
||||
"from ray.tune import Trainable\n",
|
||||
"from ray.tune.integration.wandb import (\n",
|
||||
" WandbLoggerCallback,\n",
|
||||
" WandbTrainableMixin,\n",
|
||||
" wandb_mixin,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Next, let's define an easy `objective` function (a Tune `Trainable`) that reports a random loss to Tune.\n",
|
||||
"The objective function itself is not important for this example, since we want to focus on the Weights & Biases\n",
|
||||
"integration primarily."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def objective(config, checkpoint_dir=None):\n",
|
||||
" for i in range(30):\n",
|
||||
" loss = config[\"mean\"] + config[\"sd\"] * np.random.randn()\n",
|
||||
" tune.report(loss=loss)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Given that you provide an `api_key_file` pointing to your Weights & Biases API key, you can define a\n",
|
||||
"simple grid-search Tune run using the `WandbLoggerCallback` as follows:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tune_function(api_key_file):\n",
|
||||
" \"\"\"Example for using a WandbLoggerCallback with the function API\"\"\"\n",
|
||||
" analysis = tune.run(\n",
|
||||
" objective,\n",
|
||||
" metric=\"loss\",\n",
|
||||
" mode=\"min\",\n",
|
||||
" config={\n",
|
||||
" \"mean\": tune.grid_search([1, 2, 3, 4, 5]),\n",
|
||||
" \"sd\": tune.uniform(0.2, 0.8),\n",
|
||||
" },\n",
|
||||
" callbacks=[\n",
|
||||
" WandbLoggerCallback(api_key_file=api_key_file, project=\"Wandb_example\")\n",
|
||||
" ],\n",
|
||||
" )\n",
|
||||
" return analysis.best_config"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"To use the `wandb_mixin` decorator, you can simply decorate the objective function from earlier.\n",
|
||||
"Note that we also use `wandb.log(...)` to log the `loss` to Weights & Biases as a dictionary.\n",
|
||||
"Otherwise, the decorated version of our objective is identical to its original."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@wandb_mixin\n",
|
||||
"def decorated_objective(config, checkpoint_dir=None):\n",
|
||||
" for i in range(30):\n",
|
||||
" loss = config[\"mean\"] + config[\"sd\"] * np.random.randn()\n",
|
||||
" tune.report(loss=loss)\n",
|
||||
" wandb.log(dict(loss=loss))"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"With the `decorated_objective` defined, running a Tune experiment is as simple as providing this objective and\n",
|
||||
"passing the `api_key_file` to the `wandb` key of your Tune `config`:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tune_decorated(api_key_file):\n",
|
||||
" \"\"\"Example for using the @wandb_mixin decorator with the function API\"\"\"\n",
|
||||
" analysis = tune.run(\n",
|
||||
" decorated_objective,\n",
|
||||
" metric=\"loss\",\n",
|
||||
" mode=\"min\",\n",
|
||||
" config={\n",
|
||||
" \"mean\": tune.grid_search([1, 2, 3, 4, 5]),\n",
|
||||
" \"sd\": tune.uniform(0.2, 0.8),\n",
|
||||
" \"wandb\": {\"api_key_file\": api_key_file, \"project\": \"Wandb_example\"},\n",
|
||||
" },\n",
|
||||
" )\n",
|
||||
" return analysis.best_config"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Finally, you can also define a class-based Tune `Trainable` by using the `WandbTrainableMixin` to define your objective:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class WandbTrainable(WandbTrainableMixin, Trainable):\n",
|
||||
" def step(self):\n",
|
||||
" for i in range(30):\n",
|
||||
" loss = self.config[\"mean\"] + self.config[\"sd\"] * np.random.randn()\n",
|
||||
" wandb.log({\"loss\": loss})\n",
|
||||
" return {\"loss\": loss, \"done\": True}"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Running Tune with this `WandbTrainable` works exactly the same as with the function API.\n",
|
||||
"The below `tune_trainable` function differs from `tune_decorated` above only in the first argument we pass to\n",
|
||||
"`tune.run()`:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tune_trainable(api_key_file):\n",
|
||||
" \"\"\"Example for using a WandTrainableMixin with the class API\"\"\"\n",
|
||||
" analysis = tune.run(\n",
|
||||
" WandbTrainable,\n",
|
||||
" metric=\"loss\",\n",
|
||||
" mode=\"min\",\n",
|
||||
" config={\n",
|
||||
" \"mean\": tune.grid_search([1, 2, 3, 4, 5]),\n",
|
||||
" \"sd\": tune.uniform(0.2, 0.8),\n",
|
||||
" \"wandb\": {\"api_key_file\": api_key_file, \"project\": \"Wandb_example\"},\n",
|
||||
" },\n",
|
||||
" )\n",
|
||||
" return analysis.best_config"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Since you may not have an API key for Wandb, we can _mock_ the Wandb logger and test all three of our training\n",
|
||||
"functions as follows.\n",
|
||||
"If you do have an API key file, make sure to set `mock_api` to `False` and pass in the right `api_key_file` below."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import tempfile\n",
|
||||
"from unittest.mock import MagicMock\n",
|
||||
"\n",
|
||||
"mock_api = True\n",
|
||||
"\n",
|
||||
"api_key_file = \"~/.wandb_api_key\"\n",
|
||||
"\n",
|
||||
"if mock_api:\n",
|
||||
" WandbLoggerCallback._logger_process_cls = MagicMock\n",
|
||||
" decorated_objective.__mixins__ = tuple()\n",
|
||||
" WandbTrainable._wandb = MagicMock()\n",
|
||||
" wandb = MagicMock() # noqa: F811\n",
|
||||
" temp_file = tempfile.NamedTemporaryFile()\n",
|
||||
" temp_file.write(b\"1234\")\n",
|
||||
" temp_file.flush()\n",
|
||||
" api_key_file = temp_file.name\n",
|
||||
"\n",
|
||||
"tune_function(api_key_file)\n",
|
||||
"tune_decorated(api_key_file)\n",
|
||||
"tune_trainable(api_key_file)\n",
|
||||
"\n",
|
||||
"if mock_api:\n",
|
||||
" temp_file.close()"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2f6e9138",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This completes our Tune and Wandb walk-through.\n",
|
||||
"In the following sections you can find more details on the API of the Tune-Wandb integration.\n",
|
||||
"\n",
|
||||
"## Tune Wandb API Reference\n",
|
||||
"\n",
|
||||
"### WandbLoggerCallback\n",
|
||||
"\n",
|
||||
"(tune-wandb-logger)=\n",
|
||||
"\n",
|
||||
"```{eval-rst}\n",
|
||||
".. autoclass:: ray.tune.integration.wandb.WandbLoggerCallback\n",
|
||||
" :noindex:\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"### Wandb-Mixin\n",
|
||||
"\n",
|
||||
"(tune-wandb-mixin)=\n",
|
||||
"\n",
|
||||
"```{eval-rst}\n",
|
||||
".. autofunction:: ray.tune.integration.wandb.wandb_mixin\n",
|
||||
" :noindex:\n",
|
||||
"```"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
718
doc/source/tune/examples/tune-xgboost.ipynb
Normal file
718
doc/source/tune/examples/tune-xgboost.ipynb
Normal file
|
@ -0,0 +1,718 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "edce67b9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Tuning XGBoost parameters\n",
|
||||
"\n",
|
||||
"(tune-xgboost-ref)=\n",
|
||||
"\n",
|
||||
"XGBoost is currently one of the most popular machine learning algorithms. It performs\n",
|
||||
"very well on a large selection of tasks, and was the key to success in many Kaggle\n",
|
||||
"competitions.\n",
|
||||
"\n",
|
||||
"```{image} /images/xgboost_logo.png\n",
|
||||
":align: center\n",
|
||||
":alt: XGBoost\n",
|
||||
":target: https://xgboost.readthedocs.io/en/latest/\n",
|
||||
":width: 200px\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"This tutorial will give you a quick introduction to XGBoost, show you how\n",
|
||||
"to train an XGBoost model, and then guide you on how to optimize XGBoost\n",
|
||||
"parameters using Tune to get the best performance. We tackle the following topics:\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":depth: 2\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
":::{note}\n",
|
||||
"To run this tutorial, you will need to install the following:\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"$ pip install xgboost\n",
|
||||
"```\n",
|
||||
":::\n",
|
||||
"\n",
|
||||
"## What is XGBoost\n",
|
||||
"\n",
|
||||
"XGBoost is an acronym for e**X**treme **G**radient **Boost**ing. Internally,\n",
|
||||
"XGBoost uses [decision trees](https://en.wikipedia.org/wiki/Decision_tree). Instead\n",
|
||||
"of training just one large decision tree, XGBoost and other related algorithms train\n",
|
||||
"many small decision trees. The intuition behind this is that even though single\n",
|
||||
"decision trees can be inaccurate and suffer from high variance,\n",
|
||||
"combining the output of a large number of these weak learners can actually lead to\n",
|
||||
"a strong learner, resulting in better predictions and less variance.\n",
|
||||
"\n",
|
||||
":::{figure} /images/tune-xgboost-ensemble.svg\n",
|
||||
":alt: Single vs. ensemble learning\n",
|
||||
"\n",
|
||||
"A single decision tree (left) might be able to get to an accuracy of 70%\n",
|
||||
"for a binary classification task. By combining the output of several small\n",
|
||||
"decision trees, an ensemble learner (right) might end up with a higher accuracy\n",
|
||||
"of 90%.\n",
|
||||
":::\n",
|
||||
"\n",
|
||||
"Boosting algorithms start with a single small decision tree and evaluate how well\n",
|
||||
"it predicts the given examples. When building the next tree, those samples that have\n",
|
||||
"been misclassified before have a higher chance of being used to generate the tree.\n",
|
||||
"This is useful because it avoids overfitting to samples that can be easily classified\n",
|
||||
"and instead tries to come up with models that are able to classify hard examples, too.\n",
|
||||
"Please see [here for a more thorough introduction to bagging and boosting algorithms](https://towardsdatascience.com/ensemble-methods-bagging-boosting-and-stacking-c9214a10a205).\n",
|
||||
"\n",
|
||||
"There are many boosting algorithms. In their core, they are all very similar. XGBoost\n",
|
||||
"uses second-level derivatives to find splits that maximize the *gain* (the inverse of\n",
|
||||
"the *loss*) - hence the name. In practice, there really is no drawback in using\n",
|
||||
"XGBoost over other boosting algorithms - in fact, it usually shows the best performance.\n",
|
||||
"\n",
|
||||
"## Training a simple XGBoost classifier\n",
|
||||
"\n",
|
||||
"Let's first see how a simple XGBoost classifier can be trained. We'll use the\n",
|
||||
"`breast_cancer`-Dataset included in the `sklearn` dataset collection. This is\n",
|
||||
"a binary classification dataset. Given 30 different input features, our task is to\n",
|
||||
"learn to identify subjects with breast cancer and those without.\n",
|
||||
"\n",
|
||||
"Here is the full code to train a simple XGBoost model:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sklearn.datasets\n",
|
||||
"import sklearn.metrics\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"import xgboost as xgb\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def train_breast_cancer(config):\n",
|
||||
" # Load dataset\n",
|
||||
" data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)\n",
|
||||
" # Split into train and test set\n",
|
||||
" train_x, test_x, train_y, test_y = train_test_split(\n",
|
||||
" data, labels, test_size=0.25)\n",
|
||||
" # Build input matrices for XGBoost\n",
|
||||
" train_set = xgb.DMatrix(train_x, label=train_y)\n",
|
||||
" test_set = xgb.DMatrix(test_x, label=test_y)\n",
|
||||
" # Train the classifier\n",
|
||||
" results = {}\n",
|
||||
" bst = xgb.train(\n",
|
||||
" config,\n",
|
||||
" train_set,\n",
|
||||
" evals=[(test_set, \"eval\")],\n",
|
||||
" evals_result=results,\n",
|
||||
" verbose_eval=False)\n",
|
||||
" return results\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" results = train_breast_cancer({\n",
|
||||
" \"objective\": \"binary:logistic\",\n",
|
||||
" \"eval_metric\": [\"logloss\", \"error\"]\n",
|
||||
" })\n",
|
||||
" accuracy = 1. - results[\"eval\"][\"error\"][-1]\n",
|
||||
" print(f\"Accuracy: {accuracy:.4f}\")"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"As you can see, the code is quite simple. First, the dataset is loaded and split\n",
|
||||
"into a `test` and `train` set. The XGBoost model is trained with `xgb.train()`.\n",
|
||||
"XGBoost automatically evaluates metrics we specified on the test set. In our case\n",
|
||||
"it calculates the *logloss* and the prediction *error*, which is the percentage of\n",
|
||||
"misclassified examples. To calculate the accuracy, we just have to subtract the error\n",
|
||||
"from `1.0`. Even in this simple example, most runs result\n",
|
||||
"in a good accuracy of over `0.90`.\n",
|
||||
"\n",
|
||||
"Maybe you have noticed the `config` parameter we pass to the XGBoost algorithm. This\n",
|
||||
"is a {class}`dict` in which you can specify parameters for the XGBoost algorithm. In this\n",
|
||||
"simple example, the only parameters we passed are the `objective` and `eval_metric` parameters.\n",
|
||||
"The value `binary:logistic` tells XGBoost that we aim to train a logistic regression model for\n",
|
||||
"a binary classification task. You can find an overview over all valid objectives\n",
|
||||
"[here in the XGBoost documentation](https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters).\n",
|
||||
"\n",
|
||||
"## XGBoost Hyperparameters\n",
|
||||
"\n",
|
||||
"Even with the default settings, XGBoost was able to get to a good accuracy on the\n",
|
||||
"breast cancer dataset. However, as in many machine learning algorithms, there are\n",
|
||||
"many knobs to tune which might lead to even better performance. Let's explore some of\n",
|
||||
"them below.\n",
|
||||
"\n",
|
||||
"### Maximum tree depth\n",
|
||||
"\n",
|
||||
"Remember that XGBoost internally uses many decision tree models to come up with\n",
|
||||
"predictions. When training a decision tree, we need to tell the algorithm how\n",
|
||||
"large the tree may get. The parameter for this is called the tree *depth*.\n",
|
||||
"\n",
|
||||
":::{figure} /images/tune-xgboost-depth.svg\n",
|
||||
":align: center\n",
|
||||
":alt: Decision tree depth\n",
|
||||
"\n",
|
||||
"In this image, the left tree has a depth of 2, and the right tree a depth of 3.\n",
|
||||
"Note that with each level, $2^{(d-1)}$ splits are added, where *d* is the depth\n",
|
||||
"of the tree.\n",
|
||||
":::\n",
|
||||
"\n",
|
||||
"Tree depth is a property that concerns the model complexity. If you only allow short\n",
|
||||
"trees, the models are likely not very precise - they underfit the data. If you allow\n",
|
||||
"very large trees, the single models are likely to overfit to the data. In practice,\n",
|
||||
"a number between `2` and `6` is often a good starting point for this parameter.\n",
|
||||
"\n",
|
||||
"XGBoost's default value is `3`.\n",
|
||||
"\n",
|
||||
"### Minimum child weight\n",
|
||||
"\n",
|
||||
"When a decision tree creates new leaves, it splits up the remaining data at one node\n",
|
||||
"into two groups. If there are only few samples in one of these groups, it often\n",
|
||||
"doesn't make sense to split it further. One of the reasons for this is that the\n",
|
||||
"model is harder to train when we have fewer samples.\n",
|
||||
"\n",
|
||||
":::{figure} /images/tune-xgboost-weight.svg\n",
|
||||
":align: center\n",
|
||||
":alt: Minimum child weight\n",
|
||||
"\n",
|
||||
"In this example, we start with 100 examples. At the first node, they are split\n",
|
||||
"into 4 and 96 samples, respectively. In the next step, our model might find\n",
|
||||
"that it doesn't make sense to split the 4 examples more. It thus only continues\n",
|
||||
"to add leaves on the right side.\n",
|
||||
":::\n",
|
||||
"\n",
|
||||
"The parameter used by the model to decide if it makes sense to split a node is called\n",
|
||||
"the *minimum child weight*. In the case of linear regression, this is just the absolute\n",
|
||||
" number of nodes required in each child. In other objectives, this value is determined\n",
|
||||
"using the weights of the examples, hence the name.\n",
|
||||
"\n",
|
||||
"The larger the value, the more constrained the trees are and the less deep they will be.\n",
|
||||
"This parameter thus also affects the model complexity. Values can range between 0\n",
|
||||
"and infinity and are dependent on the sample size. For our ca. 500 examples in the\n",
|
||||
"breast cancer dataset, values between `0` and `10` should be sensible.\n",
|
||||
"\n",
|
||||
"XGBoost's default value is `1`.\n",
|
||||
"\n",
|
||||
"### Subsample size\n",
|
||||
"\n",
|
||||
"Each decision tree we add is trained on a subsample of the total training dataset.\n",
|
||||
"The probabilities for the samples are weighted according to the XGBoost algorithm,\n",
|
||||
"but we can decide on which fraction of the samples we want to train each decision\n",
|
||||
"tree on.\n",
|
||||
"\n",
|
||||
"Setting this value to `0.7` would mean that we randomly sample `70%` of the\n",
|
||||
"training dataset before each training iteration.\n",
|
||||
"\n",
|
||||
"XGBoost's default value is `1`.\n",
|
||||
"\n",
|
||||
"### Learning rate / Eta\n",
|
||||
"\n",
|
||||
"Remember that XGBoost sequentially trains many decision trees, and that later trees\n",
|
||||
"are more likely trained on data that has been misclassified by prior trees. In effect\n",
|
||||
"this means that earlier trees make decisions for easy samples (i.e. those samples that\n",
|
||||
"can easily be classified) and later trees make decisions for harder samples. It is then\n",
|
||||
"sensible to assume that the later trees are less accurate than earlier trees.\n",
|
||||
"\n",
|
||||
"To address this fact, XGBoost uses a parameter called *Eta*, which is sometimes called\n",
|
||||
"the *learning rate*. Don't confuse this with learning rates from gradient descent!\n",
|
||||
"The original [paper on stochastic gradient boosting](https://www.sciencedirect.com/science/article/abs/pii/S0167947301000652)\n",
|
||||
"introduces this parameter like so:\n",
|
||||
"\n",
|
||||
"$$\n",
|
||||
"F_m(x) = F_{m-1}(x) + \\eta \\cdot \\gamma_{lm} \\textbf{1}(x \\in R_{lm})\n",
|
||||
"$$\n",
|
||||
"\n",
|
||||
"This is just a complicated way to say that when we train a new decision tree,\n",
|
||||
"represented by $\\gamma_{lm} \\textbf{1}(x \\in R_{lm})$, we want to dampen\n",
|
||||
"its effect on the previous prediction $F_{m-1}(x)$ with a factor\n",
|
||||
"$\\eta$.\n",
|
||||
"\n",
|
||||
"Typical values for this parameter are between `0.01` and `0.3`.\n",
|
||||
"\n",
|
||||
"XGBoost's default value is `0.3`.\n",
|
||||
"\n",
|
||||
"### Number of boost rounds\n",
|
||||
"\n",
|
||||
"Lastly, we can decide on how many boosting rounds we perform, which means how\n",
|
||||
"many decision trees we ultimately train. When we do heavy subsampling or use small\n",
|
||||
"learning rate, it might make sense to increase the number of boosting rounds.\n",
|
||||
"\n",
|
||||
"XGBoost's default value is `10`.\n",
|
||||
"\n",
|
||||
"### Putting it together\n",
|
||||
"\n",
|
||||
"Let's see how this looks like in code! We just need to adjust our `config` dict:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" config = {\n",
|
||||
" \"objective\": \"binary:logistic\",\n",
|
||||
" \"eval_metric\": [\"logloss\", \"error\"],\n",
|
||||
" \"max_depth\": 2,\n",
|
||||
" \"min_child_weight\": 0,\n",
|
||||
" \"subsample\": 0.8,\n",
|
||||
" \"eta\": 0.2\n",
|
||||
" }\n",
|
||||
" results = train_breast_cancer(config)\n",
|
||||
" accuracy = 1. - results[\"eval\"][\"error\"][-1]\n",
|
||||
" print(f\"Accuracy: {accuracy:.4f}\")"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"The rest stays the same. Please note that we do not adjust the `num_boost_rounds` here.\n",
|
||||
"The result should also show a high accuracy of over 90%.\n",
|
||||
"\n",
|
||||
"## Tuning the configuration parameters\n",
|
||||
"\n",
|
||||
"XGBoost's default parameters already lead to a good accuracy, and even our guesses in the\n",
|
||||
"last section should result in accuracies well above 90%. However, our guesses were\n",
|
||||
"just that: guesses. Often we do not know what combination of parameters would actually\n",
|
||||
"lead to the best results on a machine learning task.\n",
|
||||
"\n",
|
||||
"Unfortunately, there are infinitely many combinations of hyperparameters we could try\n",
|
||||
"out. Should we combine `max_depth=3` with `subsample=0.8` or with `subsample=0.9`?\n",
|
||||
"What about the other parameters?\n",
|
||||
"\n",
|
||||
"This is where hyperparameter tuning comes into play. By using tuning libraries such as\n",
|
||||
"Ray Tune we can try out combinations of hyperparameters. Using sophisticated search\n",
|
||||
"strategies, these parameters can be selected so that they are likely to lead to good\n",
|
||||
"results (avoiding an expensive *exhaustive search*). Also, trials that do not perform\n",
|
||||
"well can be preemptively stopped to reduce waste of computing resources. Lastly, Ray Tune\n",
|
||||
"also takes care of training these runs in parallel, greatly increasing search speed.\n",
|
||||
"\n",
|
||||
"Let's start with a basic example on how to use Tune for this. We just need to make\n",
|
||||
"a few changes to our code-block:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sklearn.datasets\n",
|
||||
"import sklearn.metrics\n",
|
||||
"\n",
|
||||
"from ray import tune\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def train_breast_cancer(config):\n",
|
||||
" # Load dataset\n",
|
||||
" data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)\n",
|
||||
" # Split into train and test set\n",
|
||||
" train_x, test_x, train_y, test_y = train_test_split(\n",
|
||||
" data, labels, test_size=0.25)\n",
|
||||
" # Build input matrices for XGBoost\n",
|
||||
" train_set = xgb.DMatrix(train_x, label=train_y)\n",
|
||||
" test_set = xgb.DMatrix(test_x, label=test_y)\n",
|
||||
" # Train the classifier\n",
|
||||
" results = {}\n",
|
||||
" xgb.train(\n",
|
||||
" config,\n",
|
||||
" train_set,\n",
|
||||
" evals=[(test_set, \"eval\")],\n",
|
||||
" evals_result=results,\n",
|
||||
" verbose_eval=False)\n",
|
||||
" # Return prediction accuracy\n",
|
||||
" accuracy = 1. - results[\"eval\"][\"error\"][-1]\n",
|
||||
" tune.report(mean_accuracy=accuracy, done=True)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" config = {\n",
|
||||
" \"objective\": \"binary:logistic\",\n",
|
||||
" \"eval_metric\": [\"logloss\", \"error\"],\n",
|
||||
" \"max_depth\": tune.randint(1, 9),\n",
|
||||
" \"min_child_weight\": tune.choice([1, 2, 3]),\n",
|
||||
" \"subsample\": tune.uniform(0.5, 1.0),\n",
|
||||
" \"eta\": tune.loguniform(1e-4, 1e-1)\n",
|
||||
" }\n",
|
||||
" analysis = tune.run(\n",
|
||||
" train_breast_cancer,\n",
|
||||
" resources_per_trial={\"cpu\": 1},\n",
|
||||
" config=config,\n",
|
||||
" num_samples=10)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"As you can see, the changes in the actual training function are minimal. Instead of\n",
|
||||
"returning the accuracy value, we report it back to Tune using `tune.report()`.\n",
|
||||
"Our `config` dictionary only changed slightly. Instead of passing hard-coded\n",
|
||||
"parameters, we tell Tune to choose values from a range of valid options. There are\n",
|
||||
"a number of options we have here, all of which are explained in\n",
|
||||
"{ref}`the Tune docs <tune-sample-docs>`.\n",
|
||||
"\n",
|
||||
"For a brief explanation, this is what they do:\n",
|
||||
"\n",
|
||||
"- `tune.randint(min, max)` chooses a random integer value between *min* and *max*.\n",
|
||||
" Note that *max* is exclusive, so it will not be sampled.\n",
|
||||
"- `tune.choice([a, b, c])` chooses one of the items of the list at random. Each item\n",
|
||||
" has the same chance to be sampled.\n",
|
||||
"- `tune.uniform(min, max)` samples a floating point number between *min* and *max*.\n",
|
||||
" Note that *max* is exclusive here, too.\n",
|
||||
"- `tune.loguniform(min, max, base=10)` samples a floating point number between *min* and *max*,\n",
|
||||
" but applies a logarithmic transformation to these boundaries first. Thus, this makes\n",
|
||||
" it easy to sample values from different orders of magnitude.\n",
|
||||
"\n",
|
||||
"The `num_samples=10` option we pass to `tune.run()` means that we sample 10 different\n",
|
||||
"hyperparameter configurations from this search space.\n",
|
||||
"\n",
|
||||
"The output of our training run coud look like this:\n",
|
||||
"\n",
|
||||
"```{code-block} bash\n",
|
||||
":emphasize-lines: 14\n",
|
||||
"\n",
|
||||
" Number of trials: 10/10 (10 TERMINATED)\n",
|
||||
" +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+\n",
|
||||
" | Trial name | status | loc | eta | max_depth | min_child_weight | subsample | acc | iter | total time (s) |\n",
|
||||
" |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------|\n",
|
||||
" | train_breast_cancer_b63aa_00000 | TERMINATED | | 0.000117625 | 2 | 2 | 0.616347 | 0.916084 | 1 | 0.0306492 |\n",
|
||||
" | train_breast_cancer_b63aa_00001 | TERMINATED | | 0.0382954 | 8 | 2 | 0.581549 | 0.937063 | 1 | 0.0357082 |\n",
|
||||
" | train_breast_cancer_b63aa_00002 | TERMINATED | | 0.000217926 | 1 | 3 | 0.528428 | 0.874126 | 1 | 0.0264609 |\n",
|
||||
" | train_breast_cancer_b63aa_00003 | TERMINATED | | 0.000120929 | 8 | 1 | 0.634508 | 0.958042 | 1 | 0.036406 |\n",
|
||||
" | train_breast_cancer_b63aa_00004 | TERMINATED | | 0.00839715 | 5 | 1 | 0.730624 | 0.958042 | 1 | 0.0389378 |\n",
|
||||
" | train_breast_cancer_b63aa_00005 | TERMINATED | | 0.000732948 | 8 | 2 | 0.915863 | 0.958042 | 1 | 0.0382841 |\n",
|
||||
" | train_breast_cancer_b63aa_00006 | TERMINATED | | 0.000856226 | 4 | 1 | 0.645209 | 0.916084 | 1 | 0.0357089 |\n",
|
||||
" | train_breast_cancer_b63aa_00007 | TERMINATED | | 0.00769908 | 7 | 1 | 0.729443 | 0.909091 | 1 | 0.0390737 |\n",
|
||||
" | train_breast_cancer_b63aa_00008 | TERMINATED | | 0.00186339 | 5 | 3 | 0.595744 | 0.944056 | 1 | 0.0343912 |\n",
|
||||
" | train_breast_cancer_b63aa_00009 | TERMINATED | | 0.000950272 | 3 | 2 | 0.835504 | 0.965035 | 1 | 0.0348201 |\n",
|
||||
" +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"The best configuration we found used `eta=0.000950272`, `max_depth=3`,\n",
|
||||
"`min_child_weight=2`, `subsample=0.835504` and reached an accuracy of\n",
|
||||
"`0.965035`.\n",
|
||||
"\n",
|
||||
"## Early stopping\n",
|
||||
"\n",
|
||||
"Currently, Tune samples 10 different hyperparameter configurations and trains a full\n",
|
||||
"XGBoost on all of them. In our small example, training is very fast. However,\n",
|
||||
"if training takes longer, a significant amount of computer resources is spent on trials\n",
|
||||
"that will eventually show a bad performance, e.g. a low accuracy. It would be good\n",
|
||||
"if we could identify these trials early and stop them, so we don't waste any resources.\n",
|
||||
"\n",
|
||||
"This is where Tune's *Schedulers* shine. A Tune `TrialScheduler` is responsible\n",
|
||||
"for starting and stopping trials. Tune implements a number of different schedulers, each\n",
|
||||
"described {ref}`in the Tune documentation <tune-schedulers>`.\n",
|
||||
"For our example, we will use the `AsyncHyperBandScheduler` or `ASHAScheduler`.\n",
|
||||
"\n",
|
||||
"The basic idea of this scheduler: We sample a number of hyperparameter configurations.\n",
|
||||
"Each of these configurations is trained for a specific number of iterations.\n",
|
||||
"After these iterations, only the best performing hyperparameters are retained. These\n",
|
||||
"are selected according to some loss metric, usually an evaluation loss. This cycle is\n",
|
||||
"repeated until we end up with the best configuration.\n",
|
||||
"\n",
|
||||
"The `ASHAScheduler` needs to know three things:\n",
|
||||
"\n",
|
||||
"1. Which metric should be used to identify badly performing trials?\n",
|
||||
"2. Should this metric be maximized or minimized?\n",
|
||||
"3. How many iterations does each trial train for?\n",
|
||||
"\n",
|
||||
"There are more parameters, which are explained in the\n",
|
||||
"{ref}`documentation <tune-scheduler-hyperband>`.\n",
|
||||
"\n",
|
||||
"Lastly, we have to report the loss metric to Tune. We do this with a `Callback` that\n",
|
||||
"XGBoost accepts and calls after each evaluation round. Ray Tune comes\n",
|
||||
"with {ref}`two XGBoost callbacks <tune-integration-xgboost>`\n",
|
||||
"we can use for this. The `TuneReportCallback` just reports the evaluation\n",
|
||||
"metrics back to Tune. The `TuneReportCheckpointCallback` also saves\n",
|
||||
"checkpoints after each evaluation round. We will just use the latter in this\n",
|
||||
"example so that we can retrieve the saved model later.\n",
|
||||
"\n",
|
||||
"These parameters from the `eval_metrics` configuration setting are then automatically\n",
|
||||
"reported to Tune via the callback. Here, the raw error will be reported, not the accuracy.\n",
|
||||
"To display the best reached accuracy, we will inverse it later.\n",
|
||||
"\n",
|
||||
"We will also load the best checkpointed model so that we can use it for predictions.\n",
|
||||
"The best model is selected with respect to the `metric` and `mode` parameters we\n",
|
||||
"pass to `tune.run()`."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sklearn.datasets\n",
|
||||
"import sklearn.metrics\n",
|
||||
"import os\n",
|
||||
"from ray.tune.schedulers import ASHAScheduler\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"import xgboost as xgb\n",
|
||||
"\n",
|
||||
"from ray import tune\n",
|
||||
"from ray.tune.integration.xgboost import TuneReportCheckpointCallback\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def train_breast_cancer(config: dict):\n",
|
||||
" # This is a simple training function to be passed into Tune\n",
|
||||
" # Load dataset\n",
|
||||
" data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)\n",
|
||||
" # Split into train and test set\n",
|
||||
" train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25)\n",
|
||||
" # Build input matrices for XGBoost\n",
|
||||
" train_set = xgb.DMatrix(train_x, label=train_y)\n",
|
||||
" test_set = xgb.DMatrix(test_x, label=test_y)\n",
|
||||
" # Train the classifier, using the Tune callback\n",
|
||||
" xgb.train(\n",
|
||||
" config,\n",
|
||||
" train_set,\n",
|
||||
" evals=[(test_set, \"eval\")],\n",
|
||||
" verbose_eval=False,\n",
|
||||
" callbacks=[TuneReportCheckpointCallback(filename=\"model.xgb\")],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_best_model_checkpoint(analysis):\n",
|
||||
" best_bst = xgb.Booster()\n",
|
||||
" best_bst.load_model(os.path.join(analysis.best_checkpoint, \"model.xgb\"))\n",
|
||||
" accuracy = 1.0 - analysis.best_result[\"eval-error\"]\n",
|
||||
" print(f\"Best model parameters: {analysis.best_config}\")\n",
|
||||
" print(f\"Best model total accuracy: {accuracy:.4f}\")\n",
|
||||
" return best_bst\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def tune_xgboost():\n",
|
||||
" search_space = {\n",
|
||||
" # You can mix constants with search space objects.\n",
|
||||
" \"objective\": \"binary:logistic\",\n",
|
||||
" \"eval_metric\": [\"logloss\", \"error\"],\n",
|
||||
" \"max_depth\": tune.randint(1, 9),\n",
|
||||
" \"min_child_weight\": tune.choice([1, 2, 3]),\n",
|
||||
" \"subsample\": tune.uniform(0.5, 1.0),\n",
|
||||
" \"eta\": tune.loguniform(1e-4, 1e-1),\n",
|
||||
" }\n",
|
||||
" # This will enable aggressive early stopping of bad trials.\n",
|
||||
" scheduler = ASHAScheduler(\n",
|
||||
" max_t=10, grace_period=1, reduction_factor=2 # 10 training iterations\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" analysis = tune.run(\n",
|
||||
" train_breast_cancer,\n",
|
||||
" metric=\"eval-logloss\",\n",
|
||||
" mode=\"min\",\n",
|
||||
" # You can add \"gpu\": 0.1 to allocate GPUs\n",
|
||||
" resources_per_trial={\"cpu\": 1},\n",
|
||||
" config=search_space,\n",
|
||||
" num_samples=10,\n",
|
||||
" scheduler=scheduler,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return analysis\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" import argparse\n",
|
||||
"\n",
|
||||
" parser = argparse.ArgumentParser()\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--server-address\",\n",
|
||||
" type=str,\n",
|
||||
" default=None,\n",
|
||||
" required=False,\n",
|
||||
" help=\"The address of server to connect to if using \" \"Ray Client.\",\n",
|
||||
" )\n",
|
||||
" args, _ = parser.parse_known_args()\n",
|
||||
"\n",
|
||||
" if args.server_address:\n",
|
||||
" import ray\n",
|
||||
"\n",
|
||||
" ray.init(f\"ray://{args.server_address}\")\n",
|
||||
"\n",
|
||||
" analysis = tune_xgboost()\n",
|
||||
"\n",
|
||||
" # Load the best model checkpoint.\n",
|
||||
" if args.server_address:\n",
|
||||
" # If connecting to a remote server with Ray Client, checkpoint loading\n",
|
||||
" # should be wrapped in a task so it will execute on the server.\n",
|
||||
" # We have to make sure it gets executed on the same node that\n",
|
||||
" # ``tune.run`` is called on.\n",
|
||||
" from ray.util.ml_utils.node import force_on_current_node\n",
|
||||
"\n",
|
||||
" remote_fn = force_on_current_node(ray.remote(get_best_model_checkpoint))\n",
|
||||
" best_bst = ray.get(remote_fn.remote(analysis))\n",
|
||||
" else:\n",
|
||||
" best_bst = get_best_model_checkpoint(analysis)\n",
|
||||
"\n",
|
||||
" # You could now do further predictions with\n",
|
||||
" # best_bst.predict(...)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"The output of our run could look like this:\n",
|
||||
"\n",
|
||||
"```{code-block} bash\n",
|
||||
":emphasize-lines: 7\n",
|
||||
"\n",
|
||||
" Number of trials: 10/10 (10 TERMINATED)\n",
|
||||
" +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------+\n",
|
||||
" | Trial name | status | loc | eta | max_depth | min_child_weight | subsample | iter | total time (s) | eval-logloss | eval-error |\n",
|
||||
" |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------|\n",
|
||||
" | train_breast_cancer_ba275_00000 | TERMINATED | | 0.00205087 | 2 | 1 | 0.898391 | 10 | 0.380619 | 0.678039 | 0.090909 |\n",
|
||||
" | train_breast_cancer_ba275_00001 | TERMINATED | | 0.000183834 | 4 | 3 | 0.924939 | 1 | 0.0228798 | 0.693009 | 0.111888 |\n",
|
||||
" | train_breast_cancer_ba275_00002 | TERMINATED | | 0.0242721 | 7 | 2 | 0.501551 | 10 | 0.376154 | 0.54472 | 0.06993 |\n",
|
||||
" | train_breast_cancer_ba275_00003 | TERMINATED | | 0.000449692 | 5 | 3 | 0.890212 | 1 | 0.0234981 | 0.692811 | 0.090909 |\n",
|
||||
" | train_breast_cancer_ba275_00004 | TERMINATED | | 0.000376393 | 7 | 2 | 0.883609 | 1 | 0.0231569 | 0.692847 | 0.062937 |\n",
|
||||
" | train_breast_cancer_ba275_00005 | TERMINATED | | 0.00231942 | 3 | 3 | 0.877464 | 2 | 0.104867 | 0.689541 | 0.083916 |\n",
|
||||
" | train_breast_cancer_ba275_00006 | TERMINATED | | 0.000542326 | 1 | 2 | 0.578584 | 1 | 0.0213971 | 0.692765 | 0.083916 |\n",
|
||||
" | train_breast_cancer_ba275_00007 | TERMINATED | | 0.0016801 | 1 | 2 | 0.975302 | 1 | 0.02226 | 0.691999 | 0.083916 |\n",
|
||||
" | train_breast_cancer_ba275_00008 | TERMINATED | | 0.000595756 | 8 | 3 | 0.58429 | 1 | 0.0221152 | 0.692657 | 0.06993 |\n",
|
||||
" | train_breast_cancer_ba275_00009 | TERMINATED | | 0.000357845 | 8 | 1 | 0.637776 | 1 | 0.022635 | 0.692859 | 0.090909 |\n",
|
||||
" +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------+\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" Best model parameters: {'objective': 'binary:logistic', 'eval_metric': ['logloss', 'error'], 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.5015513240240503, 'eta': 0.024272050872920895}\n",
|
||||
" Best model total accuracy: 0.9301\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"As you can see, most trials have been stopped only after a few iterations. Only the\n",
|
||||
"two most promising trials were run for the full 10 iterations.\n",
|
||||
"\n",
|
||||
"You can also ensure that all available resources are being used as the scheduler\n",
|
||||
"terminates trials, freeing them up. This can be done through the\n",
|
||||
"`ResourceChangingScheduler`. An example of this can be found here:\n",
|
||||
"{doc}`/tune/examples/includes/xgboost_dynamic_resources_example`.\n",
|
||||
"\n",
|
||||
"## Using fractional GPUs\n",
|
||||
"\n",
|
||||
"You can often accelerate your training by using GPUs in addition to CPUs. However,\n",
|
||||
"you usually don't have as many GPUs as you have trials to run. For instance, if you\n",
|
||||
"run 10 Tune trials in parallel, you usually don't have access to 10 separate GPUs.\n",
|
||||
"\n",
|
||||
"Tune supports *fractional GPUs*. This means that each task is assigned a fraction\n",
|
||||
"of the GPU memory for training. For 10 tasks, this could look like this:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"config = {\n",
|
||||
" \"objective\": \"binary:logistic\",\n",
|
||||
" \"eval_metric\": [\"logloss\", \"error\"],\n",
|
||||
" \"tree_method\": \"gpu_hist\",\n",
|
||||
" \"max_depth\": tune.randint(1, 9),\n",
|
||||
" \"min_child_weight\": tune.choice([1, 2, 3]),\n",
|
||||
" \"subsample\": tune.uniform(0.5, 1.0),\n",
|
||||
" \"eta\": tune.loguniform(1e-4, 1e-1)\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"tune.run(\n",
|
||||
" train_breast_cancer,\n",
|
||||
" resources_per_trial={\"cpu\": 1, \"gpu\": 0.1},\n",
|
||||
" config=config,\n",
|
||||
" num_samples=10,\n",
|
||||
")"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Each task thus works with 10% of the available GPU memory. You also have to tell\n",
|
||||
"XGBoost to use the `gpu_hist` tree method, so it knows it should use the GPU.\n",
|
||||
"\n",
|
||||
"## Conclusion\n",
|
||||
"\n",
|
||||
"You should now have a basic understanding on how to train XGBoost models and on how\n",
|
||||
"to tune the hyperparameters to yield the best results. In our simple example,\n",
|
||||
"Tuning the parameters didn't make a huge difference for the accuracy.\n",
|
||||
"But in larger applications, intelligent hyperparameter tuning can make the\n",
|
||||
"difference between a model that doesn't seem to learn at all, and a model\n",
|
||||
"that outperforms all the other ones.\n",
|
||||
"\n",
|
||||
"## More XGBoost Examples\n",
|
||||
"\n",
|
||||
"- {doc}`/tune/examples/includes/xgboost_dynamic_resources_example`:\n",
|
||||
" Trains a basic XGBoost model with Tune with the class-based API and a ResourceChangingScheduler, ensuring all resources are being used at all time.\n",
|
||||
"\n",
|
||||
"## Learn More\n",
|
||||
"\n",
|
||||
"- [XGBoost Hyperparameter Tuning - A Visual Guide](https://kevinvecmanis.io/machine%20learning/hyperparameter%20tuning/dataviz/python/2019/05/11/XGBoost-Tuning-Visual-Guide.html)\n",
|
||||
"- [Notes on XGBoost Parameter Tuning](https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html)\n",
|
||||
"- [Doing XGBoost Hyperparameter Tuning the smart way](https://towardsdatascience.com/doing-xgboost-hyper-parameter-tuning-the-smart-way-part-1-of-2-f6d255a45dde)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
154
doc/source/tune/examples/tune_mnist_keras.ipynb
Normal file
154
doc/source/tune/examples/tune_mnist_keras.ipynb
Normal file
|
@ -0,0 +1,154 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3b05af3b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"(tune-mnist-keras)=\n",
|
||||
"\n",
|
||||
"# Using Keras & TensorFlow with Tune\n",
|
||||
"\n",
|
||||
"```{image} /images/tf_keras_logo.jpeg\n",
|
||||
":align: center\n",
|
||||
":alt: Keras & TensorFlow Logo\n",
|
||||
":height: 120px\n",
|
||||
":target: https://www.keras.io\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"```{contents}\n",
|
||||
":backlinks: none\n",
|
||||
":local: true\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"## Example"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "19e3c389",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import argparse\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"from filelock import FileLock\n",
|
||||
"from tensorflow.keras.datasets import mnist\n",
|
||||
"\n",
|
||||
"import ray\n",
|
||||
"from ray import tune\n",
|
||||
"from ray.tune.schedulers import AsyncHyperBandScheduler\n",
|
||||
"from ray.tune.integration.keras import TuneReportCallback\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def train_mnist(config):\n",
|
||||
" # https://github.com/tensorflow/tensorflow/issues/32159\n",
|
||||
" import tensorflow as tf\n",
|
||||
"\n",
|
||||
" batch_size = 128\n",
|
||||
" num_classes = 10\n",
|
||||
" epochs = 12\n",
|
||||
"\n",
|
||||
" with FileLock(os.path.expanduser(\"~/.data.lock\")):\n",
|
||||
" (x_train, y_train), (x_test, y_test) = mnist.load_data()\n",
|
||||
" x_train, x_test = x_train / 255.0, x_test / 255.0\n",
|
||||
" model = tf.keras.models.Sequential(\n",
|
||||
" [\n",
|
||||
" tf.keras.layers.Flatten(input_shape=(28, 28)),\n",
|
||||
" tf.keras.layers.Dense(config[\"hidden\"], activation=\"relu\"),\n",
|
||||
" tf.keras.layers.Dropout(0.2),\n",
|
||||
" tf.keras.layers.Dense(num_classes, activation=\"softmax\"),\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" model.compile(\n",
|
||||
" loss=\"sparse_categorical_crossentropy\",\n",
|
||||
" optimizer=tf.keras.optimizers.SGD(lr=config[\"lr\"], momentum=config[\"momentum\"]),\n",
|
||||
" metrics=[\"accuracy\"],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" model.fit(\n",
|
||||
" x_train,\n",
|
||||
" y_train,\n",
|
||||
" batch_size=batch_size,\n",
|
||||
" epochs=epochs,\n",
|
||||
" verbose=0,\n",
|
||||
" validation_data=(x_test, y_test),\n",
|
||||
" callbacks=[TuneReportCallback({\"mean_accuracy\": \"accuracy\"})],\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def tune_mnist(num_training_iterations):\n",
|
||||
" sched = AsyncHyperBandScheduler(\n",
|
||||
" time_attr=\"training_iteration\", max_t=400, grace_period=20\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" analysis = tune.run(\n",
|
||||
" train_mnist,\n",
|
||||
" name=\"exp\",\n",
|
||||
" scheduler=sched,\n",
|
||||
" metric=\"mean_accuracy\",\n",
|
||||
" mode=\"max\",\n",
|
||||
" stop={\"mean_accuracy\": 0.99, \"training_iteration\": num_training_iterations},\n",
|
||||
" num_samples=10,\n",
|
||||
" resources_per_trial={\"cpu\": 2, \"gpu\": 0},\n",
|
||||
" config={\n",
|
||||
" \"threads\": 2,\n",
|
||||
" \"lr\": tune.uniform(0.001, 0.1),\n",
|
||||
" \"momentum\": tune.uniform(0.1, 0.9),\n",
|
||||
" \"hidden\": tune.randint(32, 512),\n",
|
||||
" },\n",
|
||||
" )\n",
|
||||
" print(\"Best hyperparameters found were: \", analysis.best_config)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" parser = argparse.ArgumentParser()\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--smoke-test\", action=\"store_true\", help=\"Finish quickly for testing\"\n",
|
||||
" )\n",
|
||||
" parser.add_argument(\n",
|
||||
" \"--server-address\",\n",
|
||||
" type=str,\n",
|
||||
" default=None,\n",
|
||||
" required=False,\n",
|
||||
" help=\"The address of server to connect to if using \" \"Ray Client.\",\n",
|
||||
" )\n",
|
||||
" args, _ = parser.parse_known_args()\n",
|
||||
" if args.smoke_test:\n",
|
||||
" ray.init(num_cpus=4)\n",
|
||||
" elif args.server_address:\n",
|
||||
" ray.init(f\"ray://{args.server_address}\")\n",
|
||||
"\n",
|
||||
" tune_mnist(num_training_iterations=5 if args.smoke_test else 300)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d7e46189",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## More Keras and TensorFlow Examples\n",
|
||||
"\n",
|
||||
"- {doc}`/tune/examples/includes/pbt_memnn_example`: Example of training a Memory NN on bAbI with Keras using PBT.\n",
|
||||
"- {doc}`/tune/examples/includes/tf_mnist_example`: Converts the Advanced TF2.0 MNIST example to use Tune\n",
|
||||
" with the Trainable. This uses `tf.function`.\n",
|
||||
" Original code from tensorflow: https://www.tensorflow.org/tutorials/quickstart/advanced\n",
|
||||
"- {doc}`/tune/examples/includes/pbt_tune_cifar10_with_keras`:\n",
|
||||
" A contributed example of tuning a Keras model on CIFAR10 with the PopulationBasedTraining scheduler.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"orphan": true
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -1,6 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
tune_mnist_keras
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/tune_mnist_keras.py
|
|
@ -1,7 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
wandb_example
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/wandb_example.py
|
|
@ -1,7 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
xgboost_example
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/examples/xgboost_example.py
|
|
@ -10,6 +10,36 @@ If you still have questions after reading this FAQ, let us know!
|
|||
:local:
|
||||
:depth: 1
|
||||
|
||||
|
||||
What are Hyperparameters?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
What are *hyperparameters?* And how are they different from *model parameters*?
|
||||
|
||||
In supervised learning, we train a model with labeled data so the model can properly identify new data values.
|
||||
Everything about the model is defined by a set of parameters, such as the weights in a linear regression. These
|
||||
are *model parameters*; they are learned during training.
|
||||
|
||||
.. image:: /images/hyper-model-parameters.png
|
||||
|
||||
In contrast, the *hyperparameters* define structural details about the kind of model itself, like whether or not
|
||||
we are using a linear regression or classification, what architecture is best for a neural network,
|
||||
how many layers, what kind of filters, etc. They are defined before training, not learned.
|
||||
|
||||
.. image:: /images/hyper-network-params.png
|
||||
|
||||
Other quantities considered *hyperparameters* include learning rates, discount rates, etc. If we want our training
|
||||
process and resulting model to work well, we first need to determine the optimal or near-optimal set of *hyperparameters*.
|
||||
|
||||
How do we determine the optimal *hyperparameters*? The most direct approach is to perform a loop where we pick
|
||||
a candidate set of values from some reasonably inclusive list of possible values, train a model, compare the results
|
||||
achieved with previous loop iterations, and pick the set that performed best. This process is called
|
||||
*Hyperparameter Tuning* or *Optimization* (HPO). And *hyperparameters* are specified over a configured and confined
|
||||
search space, collectively defined for each *hyperparameter* in a ``config`` dictionary.
|
||||
|
||||
|
||||
.. TODO: We *really* need to improve this section.
|
||||
|
||||
Which search algorithm/scheduler should I choose?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Ray Tune offers :ref:`many different search algorithms <tune-search-alg>`
|
||||
|
@ -56,10 +86,12 @@ work, but less good with an increasing number of categories.
|
|||
or a TPE-based Bayesian Optimization algorithm such as :ref:`Optuna <tune-optuna>` or
|
||||
:ref:`HyperOpt <tune-hyperopt>`.
|
||||
|
||||
**Our go-to solution** is usually to use **random search** with :ref:`ASHA for early stopping <tune-scheduler-hyperband>`
|
||||
for smaller problems. Use :ref:`BOHB <tune-scheduler-bohb>` for **larger problems** with a **small number of hyperparameters**
|
||||
and :ref:`Population Based Training <tune-scheduler-pbt>` for **larger problems** with a **large number of hyperparameters**
|
||||
if a learning schedule is acceptable.
|
||||
**Our go-to solution** is usually to use **random search** with
|
||||
:ref:`ASHA for early stopping <tune-scheduler-hyperband>` for smaller problems.
|
||||
Use :ref:`BOHB <tune-scheduler-bohb>` for **larger problems** with a **small number of hyperparameters**
|
||||
and :ref:`Population Based Training <tune-scheduler-pbt>` for **larger problems** with a
|
||||
**large number of hyperparameters** if a learning schedule is acceptable.
|
||||
|
||||
|
||||
How do I choose hyperparameter ranges?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -86,6 +118,7 @@ For **discount factors** in reinforcement learning we suggest sampling uniformly
|
|||
between 0.9 and 1.0. Depending on the problem, a much stricter range above 0.97
|
||||
or even above 0.99 can make sense (e.g. for Atari).
|
||||
|
||||
|
||||
How can I use nested/conditional search spaces?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Sometimes you might need to define parameters whose value depend on the value
|
||||
|
@ -95,29 +128,25 @@ Nested spaces
|
|||
'''''''''''''
|
||||
You can nest hyperparameter definition in sub dictionaries:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
config = {
|
||||
"a": {
|
||||
"x": tune.uniform(0, 10)
|
||||
},
|
||||
"b": tune.choice([1, 2, 3])
|
||||
}
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:language: python
|
||||
:start-after: __basic_config_start__
|
||||
:end-before: __basic_config_end__
|
||||
|
||||
The trial config will be nested exactly like the input config.
|
||||
|
||||
|
||||
Conditional spaces
|
||||
''''''''''''''''''
|
||||
:ref:`Custom and conditional search spaces are explained in detail here <tune_custom-search>`.
|
||||
In short, you can pass custom functions to ``tune.sample_from()`` that can
|
||||
return values that depend on other values:
|
||||
|
||||
.. code-block:: python
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:language: python
|
||||
:start-after: __conditional_spaces_start__
|
||||
:end-before: __conditional_spaces_end__
|
||||
|
||||
config = {
|
||||
"a": tune.randint(5, 10)
|
||||
"b": tune.sample_from(lambda spec: np.random.randint(0, spec.config.a))
|
||||
}
|
||||
|
||||
Conditional grid search
|
||||
'''''''''''''''''''''''
|
||||
|
@ -129,20 +158,16 @@ cannot use ``tune.sample_from`` because it doesn't support grid searching.
|
|||
The solution here is to create a list of valid *tuples* with the help of a
|
||||
helper function, like this:
|
||||
|
||||
.. code-block:: python
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:language: python
|
||||
:start-after: __iter_start__
|
||||
:end-before: __iter_end__
|
||||
|
||||
def _iter():
|
||||
for a in range(5, 10):
|
||||
for b in range(a):
|
||||
yield a, b
|
||||
|
||||
config = {
|
||||
"ab": tune.grid_search(list(_iter())),
|
||||
}
|
||||
|
||||
Your trainable then can do something like ``a, b = config["ab"]`` to split
|
||||
the a and b variables and use them afterwards.
|
||||
|
||||
|
||||
How does early termination (e.g. Hyperband/ASHA) work?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Early termination algorithms look at the intermediately reported values,
|
||||
|
@ -157,6 +182,7 @@ In ASHA, you can decide how many trials are early terminated.
|
|||
time they are reduced. With ``grace_period=n`` you can force ASHA to
|
||||
train each trial at least for ``n`` epochs.
|
||||
|
||||
|
||||
Why are all my trials returning "1" iteration?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
@ -172,6 +198,7 @@ instance, if you train your algorithm for 1000 timesteps, consider reporting
|
|||
intermediate performance values every 100 steps. That way, schedulers
|
||||
like Hyperband/ASHA can terminate bad performing trials early.
|
||||
|
||||
|
||||
What are all these extra outputs?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
@ -209,17 +236,11 @@ If you want to allocate specific resources to a trial, you can use the
|
|||
``resources_per_trial`` parameter of ``tune.run()``, to which you can pass
|
||||
a dict or a :class:`PlacementGroupFactory <ray.tune.utils.placement_groups.PlacementGroupFactory>` object:
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
tune.run(
|
||||
train_fn,
|
||||
resources_per_trial={
|
||||
"cpu": 2,
|
||||
"gpu": 0.5,
|
||||
"custom_resources": {"hdd": 80}
|
||||
}
|
||||
)
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:dedent:
|
||||
:language: python
|
||||
:start-after: __resources_start__
|
||||
:end-before: __resources_end__
|
||||
|
||||
The example above showcases three things:
|
||||
|
||||
|
@ -245,15 +266,11 @@ In some cases your trainable might want to start other remote actors, for instan
|
|||
leveraging distributed training via Ray Train. In these cases, you can use
|
||||
:ref:`placement groups <ray-placement-group-doc-ref>` to request additional resources:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
tune.run(
|
||||
train_fn,
|
||||
resources_per_trial=tune.PlacementGroupFactory([
|
||||
{"CPU": 2, "GPU": 0.5, "hdd": 80},
|
||||
{"CPU": 1},
|
||||
{"CPU": 1},
|
||||
], strategy="PACK")
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:dedent:
|
||||
:language: python
|
||||
:start-after: __resources_pgf_start__
|
||||
:end-before: __resources_pgf_end__
|
||||
|
||||
Here, you're requesting 2 additional CPUs for remote tasks. These two additional
|
||||
actors do not necessarily have to live on the same node as your main trainable.
|
||||
|
@ -277,20 +294,12 @@ For example, if your trainable is using Modin dataframes, operations on those wi
|
|||
Ray tasks. By allocating an additional CPU bundle to the trial, those tasks will be able
|
||||
to run without being starved of resources.
|
||||
|
||||
.. code-block:: python
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:dedent:
|
||||
:language: python
|
||||
:start-after: __modin_start__
|
||||
:end-before: __modin_end__
|
||||
|
||||
import modin.pandas as pd
|
||||
|
||||
def train_fn(config, checkpoint_dir=None):
|
||||
# some Modin operations here
|
||||
tune.report(metric=metric)
|
||||
|
||||
tune.run(
|
||||
train_fn,
|
||||
resources_per_trial=tune.PlacementGroupFactory([
|
||||
{"CPU": 1}, # this bundle will be used by the trainable itself
|
||||
{"CPU": 1}, # this bundle will be used by Modin
|
||||
], strategy="PACK")
|
||||
|
||||
How can I pass further parameter values to my trainable?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -301,22 +310,10 @@ you want to pass constant arguments, like the number of epochs to run,
|
|||
or a dataset to train on. Ray Tune offers a wrapper function to achieve
|
||||
just that, called :func:`tune.with_parameters() <ray.tune.with_parameters>`:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray import tune
|
||||
|
||||
import numpy as np
|
||||
|
||||
def train(config, checkpoint_dir=None, num_epochs=10, data=None):
|
||||
for i in range(num_epochs):
|
||||
for sample in data:
|
||||
# ... train on sample
|
||||
|
||||
# Some huge dataset
|
||||
data = np.random.random(size=100000000)
|
||||
|
||||
tune.run(
|
||||
tune.with_parameters(train, num_epochs=10, data=data))
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:language: python
|
||||
:start-after: __huge_data_start__
|
||||
:end-before: __huge_data_end__
|
||||
|
||||
|
||||
This function works similarly to ``functools.partial``, but it stores
|
||||
|
@ -344,28 +341,20 @@ there are sophisticated algorithms that generate numbers that *seem* to be rando
|
|||
fulfill all properties of a random distribution. These algorithms can be *seeded* with
|
||||
an initial state, after which the generated random numbers are always the same.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import random
|
||||
random.seed(1234)
|
||||
print([random.randint(0, 100) for _ in range(10)])
|
||||
|
||||
# The output of this will always be
|
||||
# [99, 56, 14, 0, 11, 74, 4, 85, 88, 10]
|
||||
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:language: python
|
||||
:start-after: __seeded_1_start__
|
||||
:end-before: __seeded_1_end__
|
||||
|
||||
The most commonly used random number generators from Python libraries are those in the
|
||||
native ``random`` submodule and the ``numpy.random`` module.
|
||||
|
||||
.. code-block:: python
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:language: python
|
||||
:start-after: __seeded_2_start__
|
||||
:end-before: __seeded_2_end__
|
||||
|
||||
# This should suffice to initialize the RNGs for most Python-based libraries
|
||||
import random
|
||||
import numpy as np
|
||||
random.seed(1234)
|
||||
np.random.seed(5678)
|
||||
|
||||
In your tuning and training run, there are several places where randomness occurrs, and
|
||||
In your tuning and training run, there are several places where randomness occurs, and
|
||||
at all these places we will have to introduce seeds to make sure we get the same behavior.
|
||||
|
||||
* **Search algorithm**: Search algorithms have to be seeded to generate the same
|
||||
|
@ -381,13 +370,10 @@ at all these places we will have to introduce seeds to make sure we get the same
|
|||
|
||||
PyTorch and TensorFlow use their own RNGs, which have to be initialized, too:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import torch
|
||||
torch.manual_seed(0)
|
||||
|
||||
import tensorflow as tf
|
||||
tf.random.set_seed(0)
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:language: python
|
||||
:start-after: __torch_tf_seeds_start__
|
||||
:end-before: __torch_tf_seeds_end__
|
||||
|
||||
You should thus seed both Ray Tune's schedulers and search algorithms, and the
|
||||
training code. The schedulers and search algorithms should always be seeded with the
|
||||
|
@ -396,35 +382,11 @@ the seeds differ *between different training runs*.
|
|||
|
||||
Here's a blueprint on how to do all this in your training code:
|
||||
|
||||
.. code-block:: python
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:language: python
|
||||
:start-after: __torch_seed_example_start__
|
||||
:end-before: __torch_seed_example_end__
|
||||
|
||||
import random
|
||||
import numpy as np
|
||||
from ray import tune
|
||||
|
||||
|
||||
def trainable(config):
|
||||
# config["seed"] is set deterministically, but differs between training runs
|
||||
random.seed(config["seed"])
|
||||
np.random.seed(config["seed"])
|
||||
# torch.manual_seed(config["seed"])
|
||||
# ... training code
|
||||
|
||||
|
||||
config = {
|
||||
"seed": tune.randint(0, 10000),
|
||||
# ...
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Set seed for the search algorithms/schedulers
|
||||
random.seed(1234)
|
||||
np.random.seed(1234)
|
||||
# Don't forget to check if the search alg has a `seed` parameter
|
||||
tune.run(
|
||||
trainable,
|
||||
config=config
|
||||
)
|
||||
|
||||
**Please note** that it is not always possible to control all sources of non-determinism.
|
||||
For instance, if you use schedulers like ASHA or PBT, some trials might finish earlier
|
||||
|
@ -537,18 +499,18 @@ How can I get started contributing to Tune?
|
|||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
We use Github to track issues, feature requests, and bugs. Take a look at the
|
||||
ones labeled `"good first issue" <https://github.com/ray-project/ray/issues?utf8=%E2%9C%93&q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22>`__ and `"help wanted" <https://github.com/ray-project/ray/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22>`__ for a place to start. Look for issues with "[tune]" in the title.
|
||||
ones labeled `"good first issue" <https://github.com/ray-project/ray/issues?utf8=%E2%9C%93&q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22>`__ and `"help wanted" <https://github.com/ray-project/ray/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22>`__ for a place to start.
|
||||
Look for issues with "[tune]" in the title.
|
||||
|
||||
.. note::
|
||||
|
||||
If raising a new issue or PR related to Tune, be sure to include "[tune]" in the title and add a ``tune`` label.
|
||||
If raising a new issue or PR related to Tune, be sure to include "[tune]" in the title and add a ``tune`` label.
|
||||
|
||||
For project organization, Tune maintains a relatively up-to-date organization of
|
||||
issues on the `Tune Github Project Board <https://github.com/ray-project/ray/projects/4>`__.
|
||||
Here, you can track and identify how issues are organized.
|
||||
|
||||
|
||||
|
||||
.. _tune-reproducible:
|
||||
|
||||
How can I make my Tune experiments reproducible?
|
||||
|
@ -576,33 +538,11 @@ places where you'll have to set random seeds:
|
|||
Here is an example that will always produce the same result (except for trial
|
||||
runtimes).
|
||||
|
||||
.. code-block:: python
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:language: python
|
||||
:start-after: __reproducible_start__
|
||||
:end-before: __reproducible_end__
|
||||
|
||||
import numpy as np
|
||||
from ray import tune
|
||||
|
||||
|
||||
def train(config):
|
||||
# Set seed for trainable random result.
|
||||
# If you remove this line, you will get different results
|
||||
# each time you run the trial, even if the configuration
|
||||
# is the same.
|
||||
np.random.seed(config["seed"])
|
||||
random_result = np.random.uniform(0, 100, size=1).item()
|
||||
tune.report(result=random_result)
|
||||
|
||||
|
||||
# Set seed for Ray Tune's random search.
|
||||
# If you remove this line, you will get different configurations
|
||||
# each time you run the script.
|
||||
np.random.seed(1234)
|
||||
tune.run(
|
||||
train,
|
||||
config={
|
||||
"seed": tune.randint(0, 1000)
|
||||
},
|
||||
search_alg=tune.suggest.BasicVariantGenerator(),
|
||||
num_samples=10)
|
||||
|
||||
Some searchers use their own random states to sample new configurations.
|
||||
These searchers usually accept a ``seed`` parameter that can be passed on
|
||||
|
@ -627,19 +567,10 @@ be automatically fetched and passed to your trainable as a parameter.
|
|||
|
||||
.. tip:: If the objects are small in size or already exist in the :ref:`Ray Object Store <objects-in-ray>`, there's no need to use ``tune.with_parameters()``. You can use `partials <https://docs.python.org/3/library/functools.html#functools.partial>`__ or pass in directly to ``config`` instead.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray import tune
|
||||
|
||||
import numpy as np
|
||||
|
||||
def f(config, data=None):
|
||||
pass
|
||||
# use data
|
||||
|
||||
data = np.random.random(size=100000000)
|
||||
|
||||
tune.run(tune.with_parameters(f, data=data))
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:language: python
|
||||
:start-after: __large_data_start__
|
||||
:end-before: __large_data_end__
|
||||
|
||||
|
||||
How can I upload my Tune results to cloud storage?
|
||||
|
@ -649,39 +580,28 @@ If an upload directory is provided, Tune will automatically sync results from th
|
|||
natively supporting standard URIs for systems like S3, gsutil or HDFS.
|
||||
Here is an example of uploading to S3, using a bucket called ``my-log-dir``:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
tune.run(
|
||||
MyTrainableClass,
|
||||
local_dir="~/ray_results",
|
||||
sync_config=tune.SyncConfig(upload_dir="s3://my-log-dir")
|
||||
)
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:dedent:
|
||||
:language: python
|
||||
:start-after: __log_1_start__
|
||||
:end-before: __log_1_end__
|
||||
|
||||
You can customize this to specify arbitrary storages with the ``syncer`` argument in ``tune.SyncConfig``.
|
||||
This argument supports either strings with the same replacement fields OR arbitrary functions.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
tune.run(
|
||||
MyTrainableClass,
|
||||
sync_config=tune.SyncConfig(
|
||||
upload_dir="s3://my-log-dir",
|
||||
syncer=custom_sync_str_or_func
|
||||
)
|
||||
)
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:dedent:
|
||||
:language: python
|
||||
:start-after: __log_2_start__
|
||||
:end-before: __log_2_end__
|
||||
|
||||
If a string is provided, then it must include replacement fields ``{source}`` and ``{target}``, like
|
||||
``s3 sync {source} {target}``. Alternatively, a function can be provided with the following signature:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def custom_sync_func(source, target):
|
||||
# do arbitrary things inside
|
||||
sync_cmd = "s3 {source} {target}".format(
|
||||
source=source,
|
||||
target=target)
|
||||
sync_process = subprocess.Popen(sync_cmd, shell=True)
|
||||
sync_process.wait()
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:language: python
|
||||
:start-after: __sync_start__
|
||||
:end-before: __sync_end__
|
||||
|
||||
By default, syncing occurs every 300 seconds.
|
||||
To change the frequency of syncing, set the ``sync_period`` attribute of the sync config to the desired syncing period.
|
||||
|
@ -707,14 +627,11 @@ To make this work in your Docker cluster, e.g. when you are using the Ray autosc
|
|||
with docker containers, you will need to pass a
|
||||
``DockerSyncer`` to the ``syncer`` argument of ``tune.SyncConfig``.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray.tune.integration.docker import DockerSyncer
|
||||
sync_config = tune.SyncConfig(
|
||||
syncer=DockerSyncer)
|
||||
|
||||
tune.run(train, sync_config=sync_config)
|
||||
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:dedent:
|
||||
:language: python
|
||||
:start-after: __docker_start__
|
||||
:end-before: __docker_end__
|
||||
|
||||
.. _tune-kubernetes:
|
||||
|
||||
|
@ -731,49 +648,30 @@ is necessary. There are two main options.
|
|||
First, you can use the :ref:`SyncConfig <tune-sync-config>` to store your
|
||||
logs and checkpoints on cloud storage, such as AWS S3 or Google Cloud Storage:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray import tune
|
||||
|
||||
tune.run(
|
||||
tune.durable(train_fn),
|
||||
# ...,
|
||||
sync_config=tune.SyncConfig(
|
||||
upload_dir="s3://your-s3-bucket/durable-trial/"
|
||||
)
|
||||
)
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:dedent:
|
||||
:language: python
|
||||
:start-after: __s3_start__
|
||||
:end-before: __s3_end__
|
||||
|
||||
Second, you can set up a shared file system like NFS. If you do this, disable automatic trial syncing:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray import tune
|
||||
|
||||
tune.run(
|
||||
train_fn,
|
||||
# ...,
|
||||
local_dir="/path/to/shared/storage",
|
||||
sync_config=tune.SyncConfig(
|
||||
# Do not sync because we are on shared storage
|
||||
syncer=None
|
||||
)
|
||||
)
|
||||
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:dedent:
|
||||
:language: python
|
||||
:start-after: __sync_config_start__
|
||||
:end-before: __sync_config_end__
|
||||
|
||||
Lastly, if you still want to use SSH for trial synchronization, but are not running
|
||||
on the Ray cluster launcher, you might need to pass a
|
||||
``KubernetesSyncer`` to the ``syncer`` argument of ``tune.SyncConfig``.
|
||||
You have to specify your Kubernetes namespace explicitly:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer
|
||||
sync_config = tune.SyncConfig(
|
||||
syncer=NamespacedKubernetesSyncer("ray")
|
||||
)
|
||||
|
||||
tune.run(train, sync_config=sync_config)
|
||||
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:dedent:
|
||||
:language: python
|
||||
:start-after: __k8s_start__
|
||||
:end-before: __k8s_end__
|
||||
|
||||
Please note that we strongly encourage you to use one of the other two options instead, as they will
|
||||
result in less overhead and don't require pods to SSH into each other.
|
||||
|
@ -789,9 +687,11 @@ However, if you need to debug your training process, it may be easier to do ever
|
|||
You can force all Ray functions to occur on a single process with ``local_mode`` by calling the following
|
||||
before ``tune.run``.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
ray.init(local_mode=True)
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:dedent:
|
||||
:language: python
|
||||
:start-after: __local_start__
|
||||
:end-before: __local_end__
|
||||
|
||||
Local mode with multiple configuration evaluations will interleave computation,
|
||||
so it is most naturally used when running a single configuration evaluation.
|
||||
|
@ -799,7 +699,6 @@ so it is most naturally used when running a single configuration evaluation.
|
|||
Note that ``local_mode`` has some known issues, so please read :ref:`these tips <local-mode-tips>` for more info.
|
||||
|
||||
|
||||
|
||||
.. _tune-default-search-space:
|
||||
|
||||
How do I configure search spaces?
|
||||
|
@ -807,38 +706,21 @@ How do I configure search spaces?
|
|||
|
||||
You can specify a grid search or sampling distribution via the dict passed into ``tune.run(config=...)``.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
parameters = {
|
||||
"qux": tune.sample_from(lambda spec: 2 + 2),
|
||||
"bar": tune.grid_search([True, False]),
|
||||
"foo": tune.grid_search([1, 2, 3]),
|
||||
"baz": "asd", # a constant value
|
||||
}
|
||||
|
||||
tune.run(trainable, config=parameters)
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:dedent:
|
||||
:language: python
|
||||
:start-after: __grid_search_start__
|
||||
:end-before: __grid_search_end__
|
||||
|
||||
By default, each random variable and grid search point is sampled once.
|
||||
To take multiple random samples, add ``num_samples: N`` to the experiment config.
|
||||
If `grid_search` is provided as an argument, the grid will be repeated ``num_samples`` of times.
|
||||
|
||||
.. code-block:: python
|
||||
:emphasize-lines: 13
|
||||
|
||||
# num_samples=10 repeats the 3x3 grid search 10 times, for a total of 90 trials
|
||||
tune.run(
|
||||
my_trainable,
|
||||
name="my_trainable",
|
||||
config={
|
||||
"alpha": tune.uniform(100),
|
||||
"beta": tune.sample_from(lambda spec: spec.config.alpha * np.random.normal()),
|
||||
"nn_layers": [
|
||||
tune.grid_search([16, 64, 256]),
|
||||
tune.grid_search([16, 64, 256]),
|
||||
],
|
||||
},
|
||||
num_samples=10
|
||||
)
|
||||
.. literalinclude:: doc_code/faq.py
|
||||
:emphasize-lines: 13
|
||||
:language: python
|
||||
:start-after: __grid_search_2_start__
|
||||
:end-before: __grid_search_2_end__
|
||||
|
||||
Note that search spaces may not be interoperable across different search algorithms.
|
||||
For example, for many search algorithms, you will not be able to use a ``grid_search`` or ``sample_from`` parameters.
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
.. _tune-tutorial:
|
||||
|
||||
.. TODO: make this an executable notebook later on.
|
||||
|
||||
Getting Started
|
||||
===============
|
||||
|
||||
|
@ -7,7 +9,8 @@ This tutorial will walk you through the process of setting up a Tune experiment.
|
|||
We'll start with a PyTorch model and show you how to leverage Ray Tune to optimize the hyperparameters of this model.
|
||||
Specifically, we'll leverage early stopping and Bayesian Optimization via HyperOpt to do so.
|
||||
|
||||
.. tip:: If you have suggestions as to how to improve this tutorial, please `let us know <https://github.com/ray-project/ray/issues/new/choose>`_!
|
||||
.. tip:: If you have sugges tions as to how to improve this tutorial,
|
||||
please `let us know <https://github.com/ray-project/ray/issues/new/choose>`_!
|
||||
|
||||
To run this example, you will need to install the following:
|
||||
|
||||
|
@ -85,7 +88,9 @@ You can use this to plot the performance of this trial.
|
|||
:start-after: __plot_begin__
|
||||
:end-before: __plot_end__
|
||||
|
||||
.. note:: Tune will automatically run parallel trials across all available cores/GPUs on your machine or cluster. To limit the number of cores that Tune uses, you can call ``ray.init(num_cpus=<int>, num_gpus=<int>)`` before ``tune.run``. If you're using a Search Algorithm like Bayesian Optimization, you'll want to use the :ref:`ConcurrencyLimiter <limiter>`.
|
||||
.. note:: Tune will automatically run parallel trials across all available cores/GPUs on your machine or cluster.
|
||||
To limit the number of cores that Tune uses, you can call ``ray.init(num_cpus=<int>, num_gpus=<int>)`` before ``tune.run``.
|
||||
If you're using a Search Algorithm like Bayesian Optimization, you'll want to use the :ref:`ConcurrencyLimiter <limiter>`.
|
||||
|
||||
|
||||
Early Stopping with ASHA
|
||||
|
@ -95,9 +100,12 @@ Let's integrate early stopping into our optimization process. Let's use :ref:`AS
|
|||
|
||||
.. _`principled early stopping`: https://blog.ml.cmu.edu/2018/12/12/massively-parallel-hyperparameter-optimization/
|
||||
|
||||
On a high level, ASHA terminates trials that are less promising and allocates more time and resources to more promising trials. As our optimization process becomes more efficient, we can afford to **increase the search space by 5x**, by adjusting the parameter ``num_samples``.
|
||||
On a high level, ASHA terminates trials that are less promising and allocates more time and resources to more promising trials.
|
||||
As our optimization process becomes more efficient, we can afford to **increase the search space by 5x**, by adjusting the parameter ``num_samples``.
|
||||
|
||||
ASHA is implemented in Tune as a "Trial Scheduler". These Trial Schedulers can early terminate bad trials, pause trials, clone trials, and alter hyperparameters of a running trial. See :ref:`the TrialScheduler documentation <tune-schedulers>` for more details of available schedulers and library integrations.
|
||||
ASHA is implemented in Tune as a "Trial Scheduler".
|
||||
These Trial Schedulers can early terminate bad trials, pause trials, clone trials, and alter hyperparameters of a running trial.
|
||||
See :ref:`the TrialScheduler documentation <tune-schedulers>` for more details of available schedulers and library integrations.
|
||||
|
||||
.. literalinclude:: /../../python/ray/tune/tests/tutorial.py
|
||||
:language: python
|
||||
|
@ -125,7 +133,10 @@ You can also use :ref:`TensorBoard <tensorboard>` for visualizing results.
|
|||
Search Algorithms in Tune
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
In addition to :ref:`TrialSchedulers <tune-schedulers>`, you can further optimize your hyperparameters by using an intelligent search technique like Bayesian Optimization. To do this, you can use a Tune :ref:`Search Algorithm <tune-search-alg>`. Search Algorithms leverage optimization algorithms to intelligently navigate the given hyperparameter space.
|
||||
In addition to :ref:`TrialSchedulers <tune-schedulers>`, you can further optimize your hyperparameters
|
||||
by using an intelligent search technique like Bayesian Optimization.
|
||||
To do this, you can use a Tune :ref:`Search Algorithm <tune-search-alg>`.
|
||||
Search Algorithms leverage optimization algorithms to intelligently navigate the given hyperparameter space.
|
||||
|
||||
Note that each library has a specific way of defining the search space.
|
||||
|
||||
|
|
BIN
doc/source/tune/images/tune_flow.png
Normal file
BIN
doc/source/tune/images/tune_flow.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 40 KiB |
|
@ -44,35 +44,26 @@ Tune integrates with a wide range of hyperparameter optimization tools, like
|
|||
After defining the search space, you can simply initialize the ``HyperOptSearch`` object and pass it to ``run``.
|
||||
It's important to tell Ray Tune which metric you want to optimize and whether you want to maximize or minimize it.
|
||||
|
||||
.. code-block:: python
|
||||
.. literalinclude:: doc_code/keras_hyperopt.py
|
||||
:language: python
|
||||
:start-after: __keras_hyperopt_start__
|
||||
:end-before: __keras_hyperopt_end__
|
||||
|
||||
from ray import tune
|
||||
from ray.tune.suggest.hyperopt import HyperOptSearch
|
||||
import keras
|
||||
.. tabbed:: PyTorch+Optuna
|
||||
|
||||
# 1. Wrap a Keras model in an objective function.
|
||||
def objective(config):
|
||||
model = keras.models.Sequential()
|
||||
model.add(keras.layers.Dense(784, activation=config["activation"]))
|
||||
model.add(keras.layers.Dense(10, activation="softmax"))
|
||||
To tune your PyTorch models with Optuna, you wrap your model in an objective function whose ``config`` you
|
||||
can access for selecting hyperparameters.
|
||||
In the example below we only tune the ``momentum`` and learning rate (``lr``) parameters of the model's optimizer,
|
||||
but you can tune any other model parameter you want.
|
||||
After defining the search space, you can simply initialize the ``OptunaSearch`` object and pass it to ``run``.
|
||||
It's important to tell Ray Tune which metric you want to optimize and whether you want to maximize or minimize it.
|
||||
We stop tuning this training run after ``5`` iterations, but you can easily define other stopping rules as well.
|
||||
|
||||
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
|
||||
model.fit(...)
|
||||
loss, accuracy = model.evaluate(...)
|
||||
return {"accuracy": accuracy}
|
||||
.. literalinclude:: doc_code/pytorch_optuna.py
|
||||
:language: python
|
||||
:start-after: __pytorch_optuna_start__
|
||||
:end-before: __pytorch_optuna_end__
|
||||
|
||||
# 2. Define a search space and initialize the search algorithm.
|
||||
search_space = {"activation": tune.choice(["relu", "tanh"])}
|
||||
algo = HyperOptSearch()
|
||||
|
||||
# 3. Start a Tune run that maximizes accuracy.
|
||||
analysis = tune.run(
|
||||
objective, search_alg=algo, config=search_space, metric="accuracy", mode="max"
|
||||
)
|
||||
|
||||
.. TODO add .. tabbed:: PyTorch+Optuna
|
||||
|
||||
.. TODO add .. tabbed:: Scikit+PBT
|
||||
|
||||
With Tune you can also launch a multi-node :ref:`distributed hyperparameter sweep <tune-distributed-ref>`
|
||||
in less than 10 lines of code.
|
||||
|
@ -116,8 +107,9 @@ And you can move your models from training to serving on the same infrastructure
|
|||
**User Guides**
|
||||
^^^
|
||||
|
||||
Our guides teach you about key features of Tune, such as distributed training or early stopping.
|
||||
You can also find practical tutorials for scikit-learn, PyTorch, mlflow, and many more.
|
||||
Our guides teach you about key features of Tune,
|
||||
such as distributed training or early stopping.
|
||||
|
||||
|
||||
+++
|
||||
.. link-button:: tune-guides
|
||||
|
@ -129,7 +121,8 @@ And you can move your models from training to serving on the same infrastructure
|
|||
**Examples**
|
||||
^^^
|
||||
|
||||
Check out some of our many examples on Ray Tune.
|
||||
In our examples you can find practical tutorials for
|
||||
scikit-learn, Keras, TensorFlow, PyTorch, mlflow, and many more.
|
||||
|
||||
+++
|
||||
.. link-button:: tune-examples-ref
|
||||
|
@ -212,10 +205,25 @@ If you're new to Tune, you're probably wondering, "what makes Tune different?"
|
|||
libraries (such as Nevergrad or HyperOpt) and allow you to seamlessly scale up your optimization
|
||||
process - without sacrificing performance.
|
||||
|
||||
|
||||
Reference Materials
|
||||
Projects using Tune
|
||||
-------------------
|
||||
|
||||
Here are some of the popular open source repositories and research projects that leverage Tune.
|
||||
Feel free to submit a pull-request adding (or requesting a removal!) of a listed project.
|
||||
|
||||
- `Softlearning <https://github.com/rail-berkeley/softlearning>`_: Softlearning is a reinforcement learning framework for training maximum entropy policies in continuous domains. Includes the official implementation of the Soft Actor-Critic algorithm.
|
||||
- `Flambe <https://github.com/asappresearch/flambe>`_: An ML framework to accelerate research and its path to production. See `flambe.ai <https://flambe.ai>`_.
|
||||
- `Population Based Augmentation <https://github.com/arcelien/pba>`_: Population Based Augmentation (PBA) is a algorithm that quickly and efficiently learns data augmentation functions for neural network training. PBA matches state-of-the-art results on CIFAR with one thousand times less compute.
|
||||
- `Fast AutoAugment by Kakao <https://github.com/kakaobrain/fast-autoaugment>`_: Fast AutoAugment (Accepted at NeurIPS 2019) learns augmentation policies using a more efficient search strategy based on density matching.
|
||||
- `Allentune <https://github.com/allenai/allentune>`_: Hyperparameter Search for AllenNLP from AllenAI.
|
||||
- `machinable <https://github.com/frthjf/machinable>`_: A modular configuration system for machine learning research. See `machinable.org <https://machinable.org>`_.
|
||||
- `NeuroCard <https://github.com/neurocard/neurocard>`_: NeuroCard (Accepted at VLDB 2021) is a neural cardinality estimator for multi-table join queries. It uses state of the art deep density models to learn correlations across relational database tables.
|
||||
|
||||
|
||||
|
||||
Learn More
|
||||
----------
|
||||
|
||||
Below you can find blog posts and talks about Ray Tune:
|
||||
|
||||
- [blog] `Tune: a Python library for fast hyperparameter tuning at any scale <https://towardsdatascience.com/fast-hyperparameter-tuning-at-scale-d428223b081c>`_
|
||||
|
|
|
@ -4,112 +4,117 @@
|
|||
Key Concepts
|
||||
============
|
||||
|
||||
Let's quickly walk through the key concepts you need to know to use Tune. In this guide, we'll be covering the following:
|
||||
.. TODO: should we introduce checkpoints as well?
|
||||
.. TODO: should we at least mention "Stopper" classes here?
|
||||
|
||||
.. contents::
|
||||
:local:
|
||||
:depth: 1
|
||||
Let's quickly walk through the key concepts you need to know to use Tune.
|
||||
If you want to see practical tutorials right away, go visit our :ref:`user guides<tune-guides>`.
|
||||
In essence, Tune has six crucial components that you need to understand.
|
||||
|
||||
.. image:: /images/tune-workflow.png
|
||||
First, you define the hyperparameters you want to tune in a `search space` and pass them into a `trainable`
|
||||
that specifies the objective you want to tune.
|
||||
Then you select a `search algorithm` to effectively optimize your parameters and optionally use a
|
||||
`scheduler` to stop searches early and speed up your experiments.
|
||||
Together with other configuration, your `trainable`, algorithm, and scheduler are passed into ``tune.run()``,
|
||||
which runs your experiments and creates `trials`.
|
||||
These trials can then be used in `analyses` to inspect your experiment results.
|
||||
The following figure shows an overview of these components, which we cover in detail in the next sections.
|
||||
|
||||
.. image:: images/tune_flow.png
|
||||
|
||||
Trainables
|
||||
----------
|
||||
|
||||
To start, let's try to maximize this objective function:
|
||||
In short, a :ref:`Trainable<trainable-docs>` is an object that you can pass into a Tune run.
|
||||
Ray Tune has two ways of defining a `trainable`, namely the :ref:`Function API <tune-function-api>`
|
||||
and the :ref:`Class API<tune-class-api>`.
|
||||
Both are valid ways of defining a `trainable`, but the Function API is generally recommended and is used
|
||||
throughout the rest of this guide.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def objective(x, a, b):
|
||||
return a * (x ** 0.5) + b
|
||||
|
||||
To use Tune, you will need to wrap this function in a lightweight :ref:`trainable API <trainable-docs>`. You can either use a :ref:`function-based version <tune-function-api>` or a :ref:`class-based version <tune-class-api>`.
|
||||
Let's say we want to optimize a simple objective function like ``a (x ** 2) + b`` in which ``a`` and ``b`` are the
|
||||
hyperparameters we want to tune to `minimize` the objective.
|
||||
Since the objective also has a variable ``x``, we need to test for different values of ``x``.
|
||||
Given concrete choices for ``a``, ``b`` and ``x`` we can evaluate the objective function and get a `score` to minimize.
|
||||
|
||||
.. tabbed:: Function API
|
||||
|
||||
Here's an example of specifying the objective function using :ref:`the function-based Trainable API <tune-function-api>`:
|
||||
With the :ref:`the function-based API <tune-function-api>` you create a function (here called ``trainable``) that
|
||||
takes in a dictionary of hyperparameters.
|
||||
This function computes a ``score`` in a "training loop" and `reports` this score back to Tune:
|
||||
|
||||
.. code-block:: python
|
||||
.. literalinclude:: doc_code/key_concepts.py
|
||||
:language: python
|
||||
:start-after: __function_api_start__
|
||||
:end-before: __function_api_end__
|
||||
|
||||
def trainable(config):
|
||||
# config (dict): A dict of hyperparameters.
|
||||
|
||||
for x in range(20):
|
||||
score = objective(x, config["a"], config["b"])
|
||||
|
||||
tune.report(score=score) # This sends the score to Tune.
|
||||
Note that we use ``tune.report(...)`` to report the intermediate ``score`` in the training loop, which can be useful
|
||||
in many machine learning tasks.
|
||||
If you just want to report the final ``score`` outside of this loop, you can simply return the score at the
|
||||
end of the ``trainable`` function with ``return {"score": score}``.
|
||||
You can also use ``yield {"score": score}`` instead of ``tune.report()``.
|
||||
|
||||
.. tabbed:: Class API
|
||||
|
||||
Here's an example of specifying the objective function using the :ref:`class-based API <tune-class-api>`:
|
||||
|
||||
.. code-block:: python
|
||||
.. literalinclude:: doc_code/key_concepts.py
|
||||
:language: python
|
||||
:start-after: __class_api_start__
|
||||
:end-before: __class_api_end__
|
||||
|
||||
from ray import tune
|
||||
.. tip:: ``tune.report`` can't be used within a ``Trainable`` class.
|
||||
|
||||
class Trainable(tune.Trainable):
|
||||
def setup(self, config):
|
||||
# config (dict): A dict of hyperparameters
|
||||
self.x = 0
|
||||
self.a = config["a"]
|
||||
self.b = config["b"]
|
||||
Learn more about the details of :ref:`Trainables here<trainable-docs>`
|
||||
and :ref:`have a look at our examples <tune-general-examples>`.
|
||||
Next, let's have a closer look at what the ``config`` dictionary is that you pass into your trainables.
|
||||
|
||||
def step(self): # This is called iteratively.
|
||||
score = objective(self.x, self.a, self.b)
|
||||
self.x += 1
|
||||
return {"score": score}
|
||||
Search Spaces
|
||||
-------------
|
||||
|
||||
.. tip:: Do not use ``tune.report`` within a ``Trainable`` class.
|
||||
To optimize your *hyperparameters*, you have to define a *search space*.
|
||||
A search space defines valid values for your hyperparameters and can specify
|
||||
how these values are sampled (e.g. from a uniform distribution or a normal
|
||||
distribution).
|
||||
|
||||
See the documentation: :ref:`trainable-docs` and :ref:`examples <tune-general-examples>`.
|
||||
Tune offers various functions to define search spaces and sampling methods.
|
||||
:ref:`You can find the documentation of these search space definitions here <tune-sample-docs>`.
|
||||
|
||||
Hyperparameters
|
||||
---------------
|
||||
Here's an example covering all search space functions. Again,
|
||||
:ref:`here is the full explanation of all these functions <tune-sample-docs>`.
|
||||
|
||||
What are *hyperparameters?* And how are they different from *model parameters*?
|
||||
.. literalinclude:: doc_code/key_concepts.py
|
||||
:language: python
|
||||
:start-after: __config_start__
|
||||
:end-before: __config_end__
|
||||
|
||||
In supervised learning, we train a model with labeled data so the model can properly identify new data values.
|
||||
Everything about the model is defined by a set of parameters, such as the weights in a linear regression. These
|
||||
are *model parameters*; they are learned during training.
|
||||
Trials
|
||||
------
|
||||
|
||||
.. image:: /images/hyper-model-parameters.png
|
||||
You use :ref:`tune.run <tune-run-ref>` to execute and manage hyperparameter tuning and generate your `trials`.
|
||||
At a minimum, your ``tune.run()`` call takes in a trainable as first argument, and a ``config`` dictionary
|
||||
to define your search space.
|
||||
|
||||
In contrast, the *hyperparameters* define structural details about the kind of model itself, like whether or not
|
||||
we are using a linear regression or classification, what architecture is best for a neural network, how many layers, what kind
|
||||
of filters, etc. They are defined before training, not learned.
|
||||
The ``tune.run()`` function also provides many features such as :ref:`logging <tune-logging>`,
|
||||
:ref:`checkpointing <tune-checkpoint-syncing>`, and :ref:`early stopping <tune-stopping-ref>`.
|
||||
Continuing with the example defined earlier (minimizing ``a (x ** 2) + b``), a simple Tune run with a simplistic
|
||||
search space for ``a`` and ``b`` would look like this:
|
||||
|
||||
.. image:: /images/hyper-network-params.png
|
||||
.. literalinclude:: doc_code/key_concepts.py
|
||||
:language: python
|
||||
:start-after: __run_tunable_start__
|
||||
:end-before: __run_tunable_end__
|
||||
|
||||
Other quantities considered *hyperparameters* include learning rates, discount rates, etc. If we want our training
|
||||
process and resulting model to work well, we first need to determine the optimal or near-optimal set of *hyperparameters*.
|
||||
``tune.run`` will generate a couple of hyperparameter configurations from its arguments,
|
||||
wrapping them into :ref:`Trial objects <trial-docstring>`.
|
||||
|
||||
How do we determine the optimal *hyperparameters*? The most direct approach is to perform a loop where we pick
|
||||
a candidate set of values from some reasonably inclusive list of possible values, train a model, compare the results
|
||||
achieved with previous loop iterations, and pick the set that performed best. This process is called
|
||||
*Hyperparameter Tuning* or *Optimization* (HPO). And *hyperparameters* are specified over a configured and confined
|
||||
search space, collectively defined for each *hyperparameter* in a ``config`` dictionary.
|
||||
Trials contain a lot of information.
|
||||
For instance, you can get the hyperparameter configuration used (``trial.config``), the trial ID (``trial.trial_id``),
|
||||
the trial's resource specification (``resources_per_trial`` or ``trial.placement_group_factory``) and many other values.
|
||||
|
||||
tune.run and Trials
|
||||
-------------------
|
||||
|
||||
Use :ref:`tune.run <tune-run-ref>` to execute hyperparameter tuning. This function manages your experiment and provides many features such as :ref:`logging <tune-logging>`, :ref:`checkpointing <tune-checkpoint-syncing>`, and :ref:`early stopping <tune-stopping-ref>`.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Pass in a Trainable class or function to tune.run, along with configs
|
||||
tune.run(trainable, config={"a": 2, "b": 4})
|
||||
|
||||
``tune.run`` will generate a couple of hyperparameter configurations from its arguments, wrapping them into :ref:`Trial objects <trial-docstring>`.
|
||||
|
||||
Each trial has
|
||||
|
||||
- a hyperparameter configuration (``trial.config``), id (``trial.trial_id``)
|
||||
- a resource specification (``resources_per_trial`` or ``trial.placement_group_factory``)
|
||||
- And other configuration values.
|
||||
|
||||
Each trial is also associated with one instance of a :ref:`Trainable <trainable-docs>`. You can access trial objects through the :ref:`ExperimentAnalysis object <tune-concepts-analysis>` provided after ``tune.run`` finishes.
|
||||
|
||||
``tune.run`` will execute until all trials stop or error:
|
||||
By default ``tune.run`` will execute until all trials stop or error.
|
||||
Here's an example output of a trial run:
|
||||
|
||||
.. TODO: how to make sure this doesn't get outdated?
|
||||
.. code-block:: bash
|
||||
|
||||
== Status ==
|
||||
|
@ -121,158 +126,240 @@ Each trial is also associated with one instance of a :ref:`Trainable <trainable-
|
|||
+----------------------+----------+---------------------+-----------+--------+--------+----------------+-------+
|
||||
| Trial name | status | loc | a | b | score | total time (s) | iter |
|
||||
|----------------------+----------+---------------------+-----------+--------+--------+----------------+-------|
|
||||
| MyTrainable_a826033a | RUNNING | 10.234.98.164:31115 | 0.303706 | 0.0761 | 0.1289 | 7.54952 | 15 |
|
||||
| Trainable_a826033a | RUNNING | 10.234.98.164:31115 | 0.303706 | 0.0761 | 0.1289 | 7.54952 | 15 |
|
||||
+----------------------+----------+---------------------+-----------+--------+--------+----------------+-------+
|
||||
|
||||
|
||||
You can also easily run 10 trials. Tune automatically :ref:`determines how many trials will run in parallel <tune-parallelism>`.
|
||||
You can also easily run just 10 trials by specifying the number of samples (``num_samples``).
|
||||
Tune automatically :ref:`determines how many trials will run in parallel <tune-parallelism>`.
|
||||
Note that instead of the number of samples, you can also specify a time budget in seconds through ``time_budget_s``,
|
||||
if you set ``num_samples=-1``.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
tune.run(trainable, config={"a": 2, "b": 4}, num_samples=10)
|
||||
|
||||
Finally, you can randomly sample or grid search hyperparameters via Tune's :ref:`search space API <tune-default-search-space>`:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
space = {"x": tune.uniform(0, 1)}
|
||||
tune.run(my_trainable, config=space, num_samples=10)
|
||||
|
||||
See more documentation: :ref:`tune-run-ref`.
|
||||
.. literalinclude:: doc_code/key_concepts.py
|
||||
:language: python
|
||||
:start-after: __run_tunable_samples_start__
|
||||
:end-before: __run_tunable_samples_end__
|
||||
|
||||
|
||||
Search spaces
|
||||
-------------
|
||||
Finally, you can use more interesting search spaces to optimize your hyperparameters
|
||||
via Tune's :ref:`search space API <tune-default-search-space>`, like using random samples or grid search.
|
||||
Here's an example of uniformly sampling between ``[0, 1]`` for ``a`` and ``b``:
|
||||
|
||||
To optimize your *hyperparameters*, you have to define a *search space*.
|
||||
A search space defines valid values for your hyperparameters and can specify
|
||||
how these values are sampled (e.g. from a uniform distribution or a normal
|
||||
distribution).
|
||||
.. literalinclude:: doc_code/key_concepts.py
|
||||
:language: python
|
||||
:start-after: __search_space_start__
|
||||
:end-before: __search_space_end__
|
||||
|
||||
Tune offers various functions to define search spaces and sampling methods.
|
||||
:ref:`You can find the documentation of these search space definitions here <tune-sample-docs>`.
|
||||
|
||||
Usually you pass your search space definition in the `config` parameter of
|
||||
``tune.run()``.
|
||||
|
||||
Here's an example covering all search space functions. Again,
|
||||
:ref:`here is the full explanation of all these functions <tune-sample-docs>`.
|
||||
|
||||
|
||||
.. code-block :: python
|
||||
|
||||
config = {
|
||||
"uniform": tune.uniform(-5, -1), # Uniform float between -5 and -1
|
||||
"quniform": tune.quniform(3.2, 5.4, 0.2), # Round to increments of 0.2
|
||||
"loguniform": tune.loguniform(1e-4, 1e-1), # Uniform float in log space
|
||||
"qloguniform": tune.qloguniform(1e-4, 1e-1, 5e-5), # Round to increments of 0.00005
|
||||
"randn": tune.randn(10, 2), # Normal distribution with mean 10 and sd 2
|
||||
"qrandn": tune.qrandn(10, 2, 0.2), # Round to increments of 0.2
|
||||
"randint": tune.randint(-9, 15), # Random integer between -9 and 15
|
||||
"qrandint": tune.qrandint(-21, 12, 3), # Round to increments of 3 (includes 12)
|
||||
"lograndint": tune.lograndint(1, 10), # Random integer in log space
|
||||
"qlograndint": tune.qlograndint(1, 10, 2), # Round to increments of 2
|
||||
"choice": tune.choice(["a", "b", "c"]), # Choose one of these options uniformly
|
||||
"func": tune.sample_from(lambda spec: spec.config.uniform * 0.01), # Depends on other value
|
||||
"grid": tune.grid_search([32, 64, 128]) # Search over all these values
|
||||
}
|
||||
To learn more about the various ways of configuring your Tune runs,
|
||||
check out the :ref:`tune.run() API reference<tune-run-ref>`.
|
||||
|
||||
Search Algorithms
|
||||
-----------------
|
||||
|
||||
To optimize the hyperparameters of your training process, you will want to use a :ref:`Search Algorithm <tune-search-alg>` which will help suggest better hyperparameters.
|
||||
To optimize the hyperparameters of your training process, you use
|
||||
a :ref:`Search Algorithm <tune-search-alg>` which suggests hyperparameter configurations.
|
||||
If you don't specify a search algorithm, Tune will use random search by default, which can provide you
|
||||
with a good starting point for your hyperparameter optimization.
|
||||
|
||||
.. code-block:: python
|
||||
For instance, to use Tune with simple Bayesian optimization through the ``bayesian-optimization`` package
|
||||
(make sure to first run ``pip install bayesian-optimization``), we can define an ``algo`` using ``BayesOptSearch``.
|
||||
Simply pass in a ``search_alg`` argument to ``tune.run``:
|
||||
|
||||
# Be sure to first run `pip install bayesian-optimization`
|
||||
.. literalinclude:: doc_code/key_concepts.py
|
||||
:language: python
|
||||
:start-after: __bayes_start__
|
||||
:end-before: __bayes_end__
|
||||
|
||||
from ray.tune.suggest import ConcurrencyLimiter
|
||||
from ray.tune.suggest.bayesopt import BayesOptSearch
|
||||
Tune has Search Algorithms that integrate with many popular **optimization** libraries,
|
||||
such as :ref:`Nevergrad <nevergrad>`, :ref:`HyperOpt <tune-hyperopt>`, or :ref:`Optuna <tune-optuna>`.
|
||||
Tune automatically converts the provided search space into the search
|
||||
spaces the search algorithms and underlying libraries expect.
|
||||
See the :ref:`Search Algorithm API documentation <tune-search-alg>` for more details.
|
||||
|
||||
# Define the search space
|
||||
config = {
|
||||
"a": tune.uniform(0, 1),
|
||||
"b": tune.uniform(0, 20)
|
||||
}
|
||||
Here's an overview of all available search algorithms in Tune:
|
||||
|
||||
# Execute 20 trials using BayesOpt and stop after 20 iterations
|
||||
tune.run(
|
||||
trainable,
|
||||
config=config,
|
||||
metric="score",
|
||||
mode="max",
|
||||
# Limit to two concurrent trials (otherwise we end up with random search)
|
||||
search_alg=ConcurrencyLimiter(
|
||||
BayesOptSearch(random_search_steps=4),
|
||||
max_concurrent=2),
|
||||
num_samples=20,
|
||||
stop={"training_iteration": 20},
|
||||
verbose=2)
|
||||
.. list-table::
|
||||
:widths: 5 5 2 10
|
||||
:header-rows: 1
|
||||
|
||||
* - SearchAlgorithm
|
||||
- Summary
|
||||
- Website
|
||||
- Code Example
|
||||
* - :ref:`Random search/grid search <tune-basicvariant>`
|
||||
- Random search/grid search
|
||||
-
|
||||
- :doc:`/tune/examples/includes/tune_basic_example`
|
||||
* - :ref:`AxSearch <tune-ax>`
|
||||
- Bayesian/Bandit Optimization
|
||||
- [`Ax <https://ax.dev/>`__]
|
||||
- :doc:`/tune/examples/includes/ax_example`
|
||||
* - :ref:`BlendSearch <BlendSearch>`
|
||||
- Blended Search
|
||||
- [`Bs <https://github.com/microsoft/FLAML/tree/main/flaml/tune>`__]
|
||||
- :doc:`/tune/examples/includes/blendsearch_example`
|
||||
* - :ref:`CFO <CFO>`
|
||||
- Cost-Frugal hyperparameter Optimization
|
||||
- [`Cfo <https://github.com/microsoft/FLAML/tree/main/flaml/tune>`__]
|
||||
- :doc:`/tune/examples/includes/cfo_example`
|
||||
* - :ref:`DragonflySearch <Dragonfly>`
|
||||
- Scalable Bayesian Optimization
|
||||
- [`Dragonfly <https://dragonfly-opt.readthedocs.io/>`__]
|
||||
- :doc:`/tune/examples/includes/dragonfly_example`
|
||||
* - :ref:`SkoptSearch <skopt>`
|
||||
- Bayesian Optimization
|
||||
- [`Scikit-Optimize <https://scikit-optimize.github.io>`__]
|
||||
- :doc:`/tune/examples/includes/skopt_example`
|
||||
* - :ref:`HyperOptSearch <tune-hyperopt>`
|
||||
- Tree-Parzen Estimators
|
||||
- [`HyperOpt <http://hyperopt.github.io/hyperopt>`__]
|
||||
- :doc:`/tune/examples/hyperopt_example`
|
||||
* - :ref:`BayesOptSearch <bayesopt>`
|
||||
- Bayesian Optimization
|
||||
- [`BayesianOptimization <https://github.com/fmfn/BayesianOptimization>`__]
|
||||
- :doc:`/tune/examples/includes/bayesopt_example`
|
||||
* - :ref:`TuneBOHB <suggest-TuneBOHB>`
|
||||
- Bayesian Opt/HyperBand
|
||||
- [`BOHB <https://github.com/automl/HpBandSter>`__]
|
||||
- :doc:`/tune/examples/includes/bohb_example`
|
||||
* - :ref:`NevergradSearch <nevergrad>`
|
||||
- Gradient-free Optimization
|
||||
- [`Nevergrad <https://github.com/facebookresearch/nevergrad>`__]
|
||||
- :doc:`/tune/examples/includes/nevergrad_example`
|
||||
* - :ref:`OptunaSearch <tune-optuna>`
|
||||
- Optuna search algorithms
|
||||
- [`Optuna <https://optuna.org/>`__]
|
||||
- :doc:`/tune/examples/includes/optuna_example`
|
||||
* - :ref:`ZOOptSearch <zoopt>`
|
||||
- Zeroth-order Optimization
|
||||
- [`ZOOpt <https://github.com/polixir/ZOOpt>`__]
|
||||
- :doc:`/tune/examples/includes/zoopt_example`
|
||||
* - :ref:`SigOptSearch <sigopt>`
|
||||
- Closed source
|
||||
- [`SigOpt <https://sigopt.com/>`__]
|
||||
- :doc:`/tune/examples/includes/sigopt_example`
|
||||
* - :ref:`HEBOSearch <tune-hebo>`
|
||||
- Heteroscedastic Evolutionary Bayesian Optimization
|
||||
- [`HEBO <https://github.com/huawei-noah/HEBO/tree/master/HEBO>`__]
|
||||
- :doc:`/tune/examples/includes/hebo_example`
|
||||
|
||||
.. note:: Unlike :ref:`Tune's Trial Schedulers <tune-schedulers>`,
|
||||
Tune Search Algorithms cannot affect or stop training processes.
|
||||
However, you can use them together to stop the evaluation of bad trials early.
|
||||
|
||||
If you want to implement your own search algorithm, the interface is easy to implement;
|
||||
you can :ref:`read the instructions here <byo-algo>`.
|
||||
|
||||
Tune also provides helpful utilities to use with Search Algorithms:
|
||||
|
||||
* :ref:`repeater`: Support for running each *sampled hyperparameter* with multiple random seeds.
|
||||
* :ref:`limiter`: Limits the amount of concurrent trials when running optimization.
|
||||
* :ref:`shim`: Allows creation of the search algorithm object given a string.
|
||||
|
||||
Note that in the example above we tell Tune to ``stop`` after ``20`` training iterations.
|
||||
This way of stopping trials with explicit rules is useful, but in many cases we can do even better with
|
||||
`schedulers`.
|
||||
|
||||
.. _schedulers-ref:
|
||||
|
||||
Schedulers
|
||||
----------
|
||||
|
||||
To make your training process more efficient, you can use a :ref:`Trial Scheduler <tune-schedulers>`.
|
||||
For instance, in our ``trainable`` example minimizing a function in a training loop, we used ``tune.report()``.
|
||||
This reported `incremental` results, given a hyperparameter configuration selected by a search algorithm.
|
||||
Based on these reported results, a Tune scheduler can decide whether to stop the trial early or not.
|
||||
If you don't specify a scheduler, Tune will use a first-in-first-out (FIFO) scheduler by default, which simply
|
||||
passes through the trials selected by your search algorithm in the order they were picked and does not perform any early stopping.
|
||||
|
||||
In short, schedulers can stop, pause, or tweak the
|
||||
hyperparameters of running trials, potentially making your hyperparameter tuning process much faster.
|
||||
Unlike search algorithms, :ref:`Trial Schedulers <tune-schedulers>` do not select which hyperparameter
|
||||
configurations to evaluate.
|
||||
|
||||
Here's a quick example of using the so-called ``HyperBand`` scheduler to tune an experiment.
|
||||
All schedulers take in a ``metric``, which is the value reported by your trainable.
|
||||
The ``metric`` is then maximized or minimized according to the ``mode`` you provide.
|
||||
To use a scheduler, just pass in a ``scheduler`` argument to ``tune.run()``:
|
||||
|
||||
.. literalinclude:: doc_code/key_concepts.py
|
||||
:language: python
|
||||
:start-after: __hyperband_start__
|
||||
:end-before: __hyperband_end__
|
||||
|
||||
|
||||
Tune has SearchAlgorithms that integrate with many popular **optimization** libraries, such as :ref:`Nevergrad <nevergrad>` and :ref:`HyperOpt <tune-hyperopt>`. Tune automatically converts the provided search space into the search
|
||||
spaces the search algorithms/underlying library expect.
|
||||
Tune includes distributed implementations of early stopping algorithms such as
|
||||
`Median Stopping Rule <https://research.google.com/pubs/pub46180.html>`__, `HyperBand <https://arxiv.org/abs/1603.06560>`__,
|
||||
and `ASHA <https://openreview.net/forum?id=S1Y7OOlRZ>`__.
|
||||
Tune also includes a distributed implementation of `Population Based Training (PBT) <https://deepmind.com/blog/population-based-training-neural-networks>`__
|
||||
and `Population Based Bandits (PB2) <https://arxiv.org/abs/2002.02518>`__.
|
||||
|
||||
See the documentation: :ref:`tune-search-alg`.
|
||||
.. tip:: The easiest scheduler to start with is the ``ASHAScheduler`` which will aggressively terminate low-performing trials.
|
||||
|
||||
Trial Schedulers
|
||||
----------------
|
||||
When using schedulers, you may face compatibility issues, as shown in the below compatibility matrix.
|
||||
Certain schedulers cannot be used with search algorithms,
|
||||
and certain schedulers require :ref:`checkpointing to be implemented <tune-checkpoint-syncing>`.
|
||||
|
||||
In addition, you can make your training process more efficient by using a :ref:`Trial Scheduler <tune-schedulers>`.
|
||||
Schedulers can dynamically change trial resource requirements during tuning.
|
||||
This is currently implemented in :ref:`ResourceChangingScheduler<tune-resource-changing-scheduler>`,
|
||||
which can wrap around any other scheduler.
|
||||
|
||||
Trial Schedulers can stop/pause/tweak the hyperparameters of running trials, making your hyperparameter tuning process much faster.
|
||||
.. list-table:: Scheduler Compatibility Matrix
|
||||
:header-rows: 1
|
||||
|
||||
.. code-block:: python
|
||||
* - Scheduler
|
||||
- Need Checkpointing?
|
||||
- SearchAlg Compatible?
|
||||
- Example
|
||||
* - :ref:`ASHA <tune-scheduler-hyperband>`
|
||||
- No
|
||||
- Yes
|
||||
- :doc:`Link </tune/examples/includes/async_hyperband_example>`
|
||||
* - :ref:`Median Stopping Rule <tune-scheduler-msr>`
|
||||
- No
|
||||
- Yes
|
||||
- :ref:`Link <tune-scheduler-msr>`
|
||||
* - :ref:`HyperBand <tune-original-hyperband>`
|
||||
- Yes
|
||||
- Yes
|
||||
- :doc:`Link </tune/examples/includes/hyperband_example>`
|
||||
* - :ref:`BOHB <tune-scheduler-bohb>`
|
||||
- Yes
|
||||
- Only TuneBOHB
|
||||
- :doc:`Link </tune/examples/includes/bohb_example>`
|
||||
* - :ref:`Population Based Training <tune-scheduler-pbt>`
|
||||
- Yes
|
||||
- Not Compatible
|
||||
- :doc:`Link </tune/examples/includes/pbt_function>`
|
||||
* - :ref:`Population Based Bandits <tune-scheduler-pb2>`
|
||||
- Yes
|
||||
- Not Compatible
|
||||
- :doc:`Basic Example </tune/examples/includes/pb2_example>`, :doc:`PPO example </tune/examples/includes/pb2_ppo_example>`
|
||||
|
||||
from ray.tune.schedulers import HyperBandScheduler
|
||||
|
||||
# Create HyperBand scheduler and maximize score
|
||||
hyperband = HyperBandScheduler(metric="score", mode="max")
|
||||
|
||||
# Execute 20 trials using HyperBand using a search space
|
||||
configs = {"a": tune.uniform(0, 1), "b": tune.uniform(0, 1)}
|
||||
|
||||
tune.run(
|
||||
MyTrainableClass,
|
||||
config=configs,
|
||||
num_samples=20,
|
||||
scheduler=hyperband
|
||||
)
|
||||
|
||||
:ref:`Population-based Training <tune-scheduler-pbt>` and :ref:`HyperBand <tune-scheduler-hyperband>` are examples of popular optimization algorithms implemented as Trial Schedulers.
|
||||
|
||||
Unlike **Search Algorithms**, :ref:`Trial Schedulers <tune-schedulers>` do not select which hyperparameter configurations to evaluate. However, you can use them together.
|
||||
|
||||
See the documentation: :ref:`schedulers-ref`.
|
||||
Learn more about trial schedulers in :ref:`the scheduler API documentation<schedulers-ref>`.
|
||||
|
||||
.. _tune-concepts-analysis:
|
||||
|
||||
Analysis
|
||||
Analyses
|
||||
--------
|
||||
|
||||
``tune.run`` returns an :ref:`ExperimentAnalysis <tune-analysis-docs>` object which has methods you can use for analyzing your training.
|
||||
``tune.run`` returns an :ref:`ExperimentAnalysis <tune-analysis-docs>` object which has methods you can use for
|
||||
analyzing your training.
|
||||
The following example shows you how to access various metrics from an ``analysis`` object, like the best available
|
||||
trial, or the best hyperparameter configuration for that trial:
|
||||
|
||||
.. code-block:: python
|
||||
.. literalinclude:: doc_code/key_concepts.py
|
||||
:language: python
|
||||
:start-after: __analysis_start__
|
||||
:end-before: __analysis_end__
|
||||
|
||||
analysis = tune.run(trainable, search_alg=algo, stop={"training_iteration": 20})
|
||||
|
||||
best_trial = analysis.best_trial # Get best trial
|
||||
best_config = analysis.best_config # Get best trial's hyperparameters
|
||||
best_logdir = analysis.best_logdir # Get best trial's logdir
|
||||
best_checkpoint = analysis.best_checkpoint # Get best trial's best checkpoint
|
||||
best_result = analysis.best_result # Get best trial's last results
|
||||
best_result_df = analysis.best_result_df # Get best result as pandas dataframe
|
||||
|
||||
This object can also retrieve all training runs as dataframes, allowing you to do ad-hoc data analysis over your results.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Get a dataframe with the last results for each trial
|
||||
df_results = analysis.results_df
|
||||
|
||||
# Get a dataframe of results for a specific score or mode
|
||||
df = analysis.dataframe(metric="score", mode="max")
|
||||
This object can also retrieve all training runs as dataframes,
|
||||
allowing you to do ad-hoc data analysis over your results.
|
||||
|
||||
.. literalinclude:: doc_code/key_concepts.py
|
||||
:language: python
|
||||
:start-after: __results_start__
|
||||
:end-before: __results_end__
|
||||
|
||||
What's Next?
|
||||
-------------
|
||||
|
@ -281,7 +368,7 @@ Now that you have a working understanding of Tune, check out:
|
|||
|
||||
* :ref:`tune-guides`: Tutorials for using Tune with your preferred machine learning library.
|
||||
* :doc:`/tune/examples/index`: End-to-end examples and templates for using Tune with your preferred machine learning library.
|
||||
* :ref:`tune-tutorial`: A simple tutorial that walks you through the process of setting up a Tune experiment.
|
||||
* :doc:`/tune/getting-started`: A simple tutorial that walks you through the process of setting up a Tune experiment.
|
||||
|
||||
|
||||
Further Questions or Issues?
|
||||
|
|
|
@ -7,92 +7,8 @@ User Guides
|
|||
.. tip:: We'd love to hear your feedback on using Tune - `get in touch <https://forms.gle/PTRvGLbKRdUfuzQo9>`_!
|
||||
|
||||
In this section, you can find material on how to use Tune and its various features.
|
||||
You can follow our :ref:`How-To Guides<tune-recipes>`, :ref:`Tune Feature Guides<tune-feature-guides>`, or
|
||||
go through some :ref:`Exercises<tune-exercises>`, to get started.
|
||||
|
||||
|
||||
.. _tune-recipes:
|
||||
|
||||
Practical How-To Guides
|
||||
-----------------------
|
||||
|
||||
.. panels::
|
||||
:container: container pb-4 full-width
|
||||
:column: col-md-3 px-2 py-2
|
||||
:img-top-cls: pt-5 w-75 d-block mx-auto
|
||||
|
||||
---
|
||||
:img-top: /images/tune-sklearn.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-sklearn
|
||||
:type: ref
|
||||
:text: How To Use Tune's Scikit-Learn Adapters?
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/pytorch_logo.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-pytorch-cifar-ref
|
||||
:type: ref
|
||||
:text: How To Use Tune With PyTorch Models?
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/pytorch_lightning_small.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-pytorch-lightning-ref
|
||||
:type: ref
|
||||
:text: How To Tune PyTorch Lightning Models
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/serve.svg
|
||||
|
||||
+++
|
||||
.. link-button:: tune-serve-integration-mnist
|
||||
:type: ref
|
||||
:text: Model Selection & Serving With Ray Serve
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/xgboost_logo.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-xgboost-ref
|
||||
:type: ref
|
||||
:text: A Guide To Tuning XGBoost Parameters With Tune
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/wandb_logo.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-wandb-ref
|
||||
:type: ref
|
||||
:text: Tracking Your Experiment Process With Weights & Biases
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/mlflow.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-mlflow-ref
|
||||
:type: ref
|
||||
:text: Using MLflow Tracking & AutoLogging with Tune
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/comet_logo_full.png
|
||||
|
||||
+++
|
||||
.. link-button:: tune-comet-ref
|
||||
:type: ref
|
||||
:text: Using Comet with Ray Tune For Experiment Management
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
You can follow our :ref:`Tune Feature Guides<tune-feature-guides>`, but can also look into our
|
||||
:ref:`Practical Examples<tune-recipes>`, or go through some :ref:`Exercises<tune-exercises>` to get started.
|
||||
|
||||
.. _tune-feature-guides:
|
||||
|
||||
|
@ -100,13 +16,21 @@ Tune Feature Guides
|
|||
-------------------
|
||||
|
||||
.. panels::
|
||||
:container: container pb-4 full-width
|
||||
:column: col-md-3 px-2 py-2
|
||||
:container: container pb-4
|
||||
:column: col-md-4 px-2 py-2
|
||||
:img-top-cls: pt-5 w-50 d-block mx-auto
|
||||
|
||||
---
|
||||
:img-top: /images/tune.png
|
||||
|
||||
.. link-button:: tune-lifecycle
|
||||
:type: ref
|
||||
:text: How does Tune work?
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/tune.png
|
||||
|
||||
.. link-button:: tune-stopping
|
||||
:type: ref
|
||||
:text: A Guide To Stopping and Resuming Tune Experiments
|
||||
|
@ -123,6 +47,14 @@ Tune Feature Guides
|
|||
---
|
||||
:img-top: /images/tune.png
|
||||
|
||||
.. link-button:: tune-distributed
|
||||
:type: ref
|
||||
:text: A Guide To Distributed Hyperparameter Tuning
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
:img-top: /images/tune.png
|
||||
|
||||
.. link-button:: tune-output
|
||||
:type: ref
|
||||
:text: How To Log Tune Runs
|
||||
|
@ -147,9 +79,9 @@ Tune Feature Guides
|
|||
---
|
||||
:img-top: /images/tune.png
|
||||
|
||||
.. link-button:: tune-lifecycle
|
||||
.. link-button:: tune-search-space-tutorial
|
||||
:type: ref
|
||||
:text: How does Tune work?
|
||||
:text: A Guide To Working with Advanced Search Spaces
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
---
|
||||
|
@ -163,77 +95,8 @@ Tune Feature Guides
|
|||
---
|
||||
:img-top: /images/tune.png
|
||||
|
||||
.. link-button:: tune-distributed
|
||||
.. link-button:: tune-scalability
|
||||
:type: ref
|
||||
:text: A Guide To Distributed Hyperparameter Tuning
|
||||
:text: Tune Scalability and Overhead Benchmarks
|
||||
:classes: btn-link btn-block stretched-link
|
||||
|
||||
|
||||
.. _tune-exercises:
|
||||
|
||||
Exercises
|
||||
---------
|
||||
|
||||
Learn how to use Tune in your browser with the following Colab-based exercises.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th class="tune-colab">Exercise Description</th>
|
||||
<th class="tune-colab">Library</th>
|
||||
<th class="tune-colab">Colab Link</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="tune-colab">Basics of using Tune.</td>
|
||||
<td class="tune-colab">TF/Keras</td>
|
||||
<td class="tune-colab">
|
||||
<a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_1_basics.ipynb" target="_parent">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td class="tune-colab">Using Search algorithms and Trial Schedulers to optimize your model.</td>
|
||||
<td class="tune-colab">Pytorch</td>
|
||||
<td class="tune-colab">
|
||||
<a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_2_optimize.ipynb" target="_parent">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td class="tune-colab">Using Population-Based Training (PBT).</td>
|
||||
<td class="tune-colab">Pytorch</td>
|
||||
<td class="tune-colab">
|
||||
<a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_3_pbt.ipynb" target="_parent">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td class="tune-colab">Fine-tuning Huggingface Transformers with PBT.</td>
|
||||
<td class="tune-colab">Huggingface Transformers/Pytorch</td>
|
||||
<td class="tune-colab">
|
||||
<a href="https://colab.research.google.com/drive/1tQgAKgcKQzheoh503OzhS4N9NtfFgmjF?usp=sharing" target="_parent">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td class="tune-colab">Logging Tune Runs to Comet ML.</td>
|
||||
<td class="tune-colab">Comet</td>
|
||||
<td class="tune-colab">
|
||||
<a href="https://colab.research.google.com/drive/1dp3VwVoAH1acn_kG7RuT62mICnOqxU1z?usp=sharing" target="_parent">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
Tutorial source files `can be found here <https://github.com/ray-project/tutorial>`_.
|
||||
|
||||
|
|
|
@ -191,7 +191,7 @@ Your ``my_trainable`` is either a:
|
|||
2. **Custom training function**
|
||||
|
||||
* All this means is that your function has to expose a ``checkpoint_dir`` argument in the function signature,
|
||||
and call ``tune.checkpoint_dir``. See :doc:`this example </tune/examples/custom_func_checkpointing>`,
|
||||
and call ``tune.checkpoint_dir``. See :doc:`this example </tune/examples/includes/custom_func_checkpointing>`,
|
||||
it's quite simple to do.
|
||||
|
||||
Let's assume for this example you're running this script from your laptop, and connecting to your remote Ray cluster
|
||||
|
@ -322,7 +322,7 @@ requires rsync to be installed.
|
|||
|
||||
Note that you must use the ``tune.checkpoint_dir`` API to trigger syncing
|
||||
(or use a model type with a built-in Ray Tune integration as described here).
|
||||
See :doc:`/tune/examples/custom_func_checkpointing` for an example.
|
||||
See :doc:`/tune/examples/includes/custom_func_checkpointing` for an example.
|
||||
|
||||
If you are running Ray Tune on Kubernetes, you should usually use a
|
||||
:ref:`cloud checkpointing <tune-sync-config>` or a shared filesystem for checkpoint sharing.
|
||||
|
|
|
@ -1,24 +0,0 @@
|
|||
.. _tune-comet-ref:
|
||||
|
||||
Using Comet with Tune
|
||||
================================
|
||||
|
||||
`Comet <https://www.comet.ml/site/>`_ is a tool to manage and optimize the
|
||||
entire ML lifecycle, from experiment tracking, model optimization and dataset
|
||||
versioning to model production monitoring.
|
||||
|
||||
.. image:: /images/comet_logo_full.png
|
||||
:height: 80px
|
||||
:alt: Comet
|
||||
:align: center
|
||||
:target: https://www.comet.ml/site/
|
||||
|
||||
Ray Tune offers an integration with Comet through the :ref:`CometLoggerCallback <tune-comet-logger>`, which automatically logs
|
||||
metrics and parameters reported to Tune to the Comet UI.
|
||||
|
||||
Please :doc:`see here for a full example </tune/examples/comet_example>`.
|
||||
|
||||
.. _tune-comet-logger:
|
||||
|
||||
.. autoclass:: ray.tune.integration.comet.CometLoggerCallback
|
||||
:noindex:
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue