Mirror of https://github.com/vale981/ray, synced 2025-03-05 10:01:43 -05:00
[Docs ] Tune docs overhaul (first part) (#22112)
Continuing docs overhaul, tune now has:

- [x] better landing page
- [x] a getting started guide
- [x] user guide was cut down, partially merged with FAQ, and partially integrated with tutorials
- [x] the new user guide contains guides to tune features and practical integrations
- [x] we rewrote some of the feature guides for clarity
- [x] we got rid of sphinx-gallery for this sub-project (only data and core left), as it looks bad and is unnecessarily complicated anyway (plus, makes the build slower)
- [x] sphinx-gallery examples are now moved to markdown notebooks, as started in #22030
- [x] examples are tested in the new framework, of course

There's still a lot one can do, but this is already getting too large. Will follow up with more fine-tuning next week.

Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
Co-authored-by: Kai Fricke <krfricke@users.noreply.github.com>
parent 13819304d4
commit 5cc9355303
55 changed files with 2273 additions and 1959 deletions
2 doc/.gitignore vendored
@@ -3,6 +3,4 @@ _build
 source/_static/thumbs

 source/ray-core/examples/
-source/tune/tutorials/
-source/tune/generated_guides/
 source/data/examples/
21 doc/BUILD
@@ -114,24 +114,25 @@ py_test(
 )

 # --------------------------------------------------------------------
-# Tests from the doc/source/tune/_tutorials directory.
+# Tests from the doc/source/tune/tutorials directory.
 # Please keep these sorted alphabetically.
 # --------------------------------------------------------------------

 py_test(
     name = "tune_sklearn",
     size = "medium",
-    main = "source/tune/_tutorials/tune-sklearn.py",
-    srcs = ["source/tune/_tutorials/tune-sklearn.py"],
-    tags = ["exclusive", "example", "team:ml"],
-    args = ["--smoke-test"]
+    main = "test_myst_doc.py",
+    srcs = ["test_myst_doc.py"],
+    args = ["--path", "doc/source/tune/tutorials/tune-sklearn.md"],
+    data = ["//doc/source/tune/tutorials:tune_tutorials"],
+    tags = ["exclusive", "team:ml"],
 )

 py_test(
     name = "tune_serve_integration_mnist",
     size = "medium",
-    main = "source/tune/_tutorials/tune-serve-integration-mnist.py",
-    srcs = ["source/tune/_tutorials/tune-serve-integration-mnist.py"],
-    tags = ["exclusive", "example", "team:ml"],
-    args = ["--smoke-test", "--from_scratch", "--day 0"]
+    main = "test_myst_doc.py",
+    srcs = ["test_myst_doc.py"],
+    args = ["--path", "doc/source/tune/tutorials/tune-serve-integration-mnist.md", "--smoke-test", "--from_scratch", "--day 0"],
+    data = ["//doc/source/tune/tutorials:tune_tutorials"],
+    tags = ["exclusive", "team:ml"],
 )

@@ -7,8 +7,6 @@ SPHINXBUILD = sphinx-build
 PAPER =
 BUILDDIR = _build
 AUTOGALLERYDIR= source/ray-core/examples\
-		source/tune/tutorials\
-		source/tune/generated_guides\
 		source/data/examples

 # User-friendly check for sphinx-build

@@ -30,27 +30,33 @@ parts:
   - file: tune/index
     title: Ray Tune
     sections:
+      - file: tune/getting-started
       - file: tune/key-concepts
-      - file: tune/user-guide
       - file: tune/tutorials/overview
        sections:
-          - file: tune/tutorials/tune-tutorial.rst
-          - file: tune/tutorials/tune-advanced-tutorial.rst
-          - file: tune/tutorials/tune-distributed.rst
-          - file: tune/tutorials/tune-lifecycle.rst
-          - file: tune/tutorials/tune-mlflow.rst
-          - file: tune/tutorials/tune-pytorch-cifar.rst
-          - file: tune/tutorials/tune-pytorch-lightning.rst
-          - file: tune/tutorials/tune-serve-integration-mnist.rst
-          - file: tune/tutorials/tune-sklearn.rst
-          - file: tune/tutorials/tune-xgboost.rst
-          - file: tune/tutorials/tune-wandb.rst
+          - file: tune/tutorials/tune-sklearn
+          - file: tune/tutorials/tune-pytorch-cifar
+          - file: tune/tutorials/tune-pytorch-lightning
+          - file: tune/tutorials/tune-serve-integration-mnist
+          - file: tune/tutorials/tune-xgboost
+          - file: tune/tutorials/tune-wandb
+          - file: tune/tutorials/tune-mlflow
+          - file: tune/tutorials/tune-comet
+          - file: tune/tutorials/tune-stopping
+          - file: tune/tutorials/tune-metrics
+          - file: tune/tutorials/tune-output
+          - file: tune/tutorials/tune-resources
+          - file: tune/tutorials/tune-checkpoints
+          - file: tune/tutorials/tune-lifecycle
+          - file: tune/tutorials/tune-advanced-tutorial
+          - file: tune/tutorials/tune-distributed
       - file: tune/examples/index
-      - file: tune/contrib
       - file: tune/faq
       - file: tune/api_docs/overview.rst
   - file: serve/index
     title: Ray Serve
     sections:
-      - file: serve/end_to_end_tutorial.rst
+      - file: serve/end_to_end_tutorial
       - file: serve/core-apis
       - file: serve/http-servehandle
       - file: serve/deployment

@@ -106,7 +112,6 @@ parts:
   - caption: References
     chapters:
     - file: ray-references/api
-    - file: ray-references/faq

   - caption: Developer Guides
     chapters:

@@ -121,11 +121,10 @@ sphinx_gallery_conf = {
     # Example sources are taken from these folders:
     "examples_dirs": [
         "ray-core/_examples",
-        "tune/_tutorials",
         "data/_examples",
     ],
     # and then generated into these respective target folders:
-    "gallery_dirs": ["ray-core/examples", "tune/tutorials", "data/examples"],
+    "gallery_dirs": ["ray-core/examples", "data/examples"],
     "ignore_pattern": "ray-core/examples/doc_code/",
     "plot_gallery": "False",
     "min_reported_time": sys.maxsize,

@@ -614,7 +614,7 @@ if __name__ == "__main__":
         "num_features": num_features,
     }

-    # Create 2 callbacks: one for Tensorboard Logging and one for MLflow
+    # Create 2 callbacks: one for TensorBoard Logging and one for MLflow
     # logging. Pass these into Trainer, and all results that are
     # reported by ``train.report()`` will be logged to these 2 places.
     # TODO: TBXLoggerCallback should create nonexistent logdir

@@ -17,7 +17,7 @@ Ray provides 2 integration points with Pytorch Lightning.

 1. `Ray Lightning Library <https://github.com/ray-project/ray_lightning>`_ for distributed Pytorch Lightning training with Ray

-2. :ref:`Ray Tune with Pytorch Lightning <tune-pytorch-lightning>` for distributed hyperparameter tuning of your PTL models.
+2. :ref:`Ray Tune with Pytorch Lightning <tune-pytorch-lightning-ref>` for distributed hyperparameter tuning of your PTL models.


 Distributed Training with ``Ray Lightning``

@@ -137,7 +137,7 @@ And if you want to add periodic checkpointing as well, you can use the ``TuneRep
                     on="validation_end")


-Check out the :ref:`Pytorch Lightning with Ray Tune tutorial<tune-pytorch-lightning>` for a full example on how you can use these callbacks and run a tuning experiment for your Pytorch Lightning model.
+Check out the :ref:`Pytorch Lightning with Ray Tune tutorial<tune-pytorch-lightning-ref>` for a full example on how you can use these callbacks and run a tuning experiment for your Pytorch Lightning model.


 Hyperparameter Tuning with distributed training

@@ -1,3 +1,5 @@
+:orphan:
+
 FAQ
 ==============

@@ -26,7 +26,7 @@ Basic Usage
 Setting up training
 ~~~~~~~~~~~~~~~~~~~

-.. tip:: If you want to leverage multi-node data parallel training with PyTorch while using RayTune *without* using RaySGD, check out the :ref:`Tune PyTorch user guide <tune-pytorch-cifar>` and Tune's :ref:`distributed pytorch integrations <tune-ddp-doc>`.
+.. tip:: If you want to leverage multi-node data parallel training with PyTorch while using RayTune *without* using RaySGD, check out the :ref:`Tune PyTorch user guide <tune-pytorch-cifar-ref>` and Tune's :ref:`distributed pytorch integrations <tune-ddp-doc>`.

 The :ref:`ref-torch-trainer` can be constructed from a custom :ref:`ref-torch-operator` subclass that defines training components like the model, data, optimizer, loss, and ``lr_scheduler``. These components are all automatically replicated across different machines and devices so that training can be executed in parallel.

@@ -11,7 +11,7 @@ RaySGD integrates with :ref:`Ray Tune <tune-60-seconds>` to easily run distribut
 PyTorch
 -------

-.. tip:: If you want to leverage multi-node data parallel training with PyTorch while using RayTune *without* using RaySGD, check out the :ref:`Tune PyTorch user guide <tune-pytorch-cifar>` and Tune's lightweight :ref:`distributed pytorch integrations <tune-ddp-doc>`.
+.. tip:: If you want to leverage multi-node data parallel training with PyTorch while using RayTune *without* using RaySGD, check out the :ref:`Tune PyTorch user guide <tune-pytorch-cifar-ref>` and Tune's lightweight :ref:`distributed pytorch integrations <tune-ddp-doc>`.

 ``TorchTrainer`` naturally integrates with Tune via the ``BaseTorchTrainable`` interface. Without changing any arguments, you can call ``TorchTrainer.as_trainable(...)`` to create a Tune-compatible class.
 Then, you can simply pass the returned Trainable class to ``tune.run``. The ``config`` used for each ``Trainable`` in tune will automatically be passed down to the ``TorchTrainer``.

@@ -520,7 +520,7 @@ Dreamer

 Dreamer is an image-only model-based RL method that learns by imagining trajectories in the future and is evaluated on the DeepMind Control Suite `environments <https://github.com/ray-project/ray/blob/master/rllib/examples/env/dm_control_suite.py>`__. RLlib's Dreamer is adapted from the `official Google research repo <https://github.com/google-research/dreamer>`__.

-To visualize learning, RLLib Dreamer's imagined trajectories are logged as gifs in Tensorboard. Examples of such can be seen `here <https://github.com/ray-project/rl-experiments>`__.
+To visualize learning, RLLib Dreamer's imagined trajectories are logged as gifs in TensorBoard. Examples of such can be seen `here <https://github.com/ray-project/rl-experiments>`__.

 Tuned examples: `Deepmind Control Environments <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dreamer/dreamer-deepmind-control.yaml>`__

@@ -38,7 +38,7 @@ Ray Train is a library that aims to simplify distributed deep learning.

 * Callbacks for early stopping
 * Checkpointing
-* Integration with Tensorboard, Weights/Biases, and MLflow
+* Integration with TensorBoard, Weights/Biases, and MLflow
 * Jupyter notebooks

 **Integration with Ray Ecosystem**: Distributed deep learning often comes with a lot of complexity.

@@ -430,7 +430,7 @@ The following ``TrainingCallback``\s are available and will log the intermediate
 3. :ref:`train-api-tbx-logger-callback`
 4. :ref:`train-api-mlflow-logger-callback`

-Example: Logging to MLflow and Tensorboard
+Example: Logging to MLflow and TensorBoard
 ++++++++++++++++++++++++++++++++++++++++++

 **Step 1: Install the necessary packages**

1 doc/source/tune/.gitignore vendored
@@ -1 +0,0 @@
-tutorials/
@@ -1 +0,0 @@
-:orphan:
@@ -1,177 +0,0 @@
.. _tune-guides:

===============
Tutorials & FAQ
===============

.. tip:: We'd love to hear your feedback on using Tune - `get in touch <https://forms.gle/PTRvGLbKRdUfuzQo9>`_!

In this section, you can find material on how to use Tune and its various features. If any of the materials is out of date or broken, or if you'd like to add an example to this page, feel free to raise an issue on our Github repository.

Take a look at any of the below tutorials to get started with Tune.

.. raw:: html

    <div class="sphx-glr-bigcontainer">

.. customgalleryitem::
    :tooltip: Key concepts in 60 seconds.
    :figure: /images/tune-workflow.png
    :description: :doc:`Key concepts in 60 seconds </tune/key-concepts>`

.. customgalleryitem::
    :tooltip: A simple Tune walkthrough.
    :figure: /images/tune.png
    :description: :doc:`A walkthrough to setup your first Tune experiment <tune-tutorial>`

.. customgalleryitem::
    :tooltip: A deep dive into Tune's workings.
    :figure: /images/tune.png
    :description: :doc:`How does Tune work? <tune-lifecycle>`

.. customgalleryitem::
    :tooltip: A simple guide to Population-based Training
    :figure: /images/tune-pbt-small.png
    :description: :doc:`A simple guide to Population-based Training <tune-advanced-tutorial>`

.. customgalleryitem::
    :tooltip: A guide to distributed hyperparameter tuning
    :figure: /images/tune.png
    :description: :doc:`A guide to distributed hyperparameter tuning <tune-distributed>`

.. customgalleryitem::
    :tooltip: Tune's Scikit-Learn Adapters
    :figure: /images/tune-sklearn.png
    :description: :doc:`Tune's Scikit-Learn Adapters <tune-sklearn>`

.. customgalleryitem::
    :tooltip: How to use Tune with PyTorch
    :figure: /images/pytorch_logo.png
    :description: :doc:`How to use Tune with PyTorch <tune-pytorch-cifar>`

.. customgalleryitem::
    :tooltip: Tuning PyTorch Lightning modules
    :figure: /images/pytorch_lightning_small.png
    :description: :doc:`Tuning PyTorch Lightning modules <tune-pytorch-lightning>`

.. customgalleryitem::
    :tooltip: Model selection and serving with Ray Tune and Ray Serve
    :figure: /images/serve.png
    :description: :doc:`Model selection and serving with Ray Tune and Ray Serve <tune-serve-integration-mnist>`

.. customgalleryitem::
    :tooltip: Tuning XGBoost parameters.
    :figure: /images/xgboost_logo.png
    :description: :doc:`A guide to tuning XGBoost parameters with Tune <tune-xgboost>`

.. customgalleryitem::
    :tooltip: Use Weights & Biases within Tune.
    :figure: /images/wandb_logo.png
    :description: :doc:`Track your experiment process with the Weights & Biases tools <tune-wandb>`

.. customgalleryitem::
    :tooltip: Use MLflow with Ray Tune.
    :figure: /images/mlflow.png
    :description: :doc:`Log and track your hyperparameter sweep with MLflow Tracking & AutoLogging <tune-mlflow>`

.. customgalleryitem::
    :tooltip: Use Comet with Ray Tune.
    :figure: /images/comet_logo_full.png
    :description: :doc:`Log and analyze your Tune trial runs with Comet's Experiment Management Tools <tune-comet>`


.. raw:: html

    </div>


.. toctree::
    :hidden:

    tune-tutorial.rst
    tune-advanced-tutorial.rst
    tune-distributed.rst
    tune-lifecycle.rst
    tune-mlflow.rst
    tune-pytorch-cifar.rst
    tune-pytorch-lightning.rst
    tune-serve-integration-mnist.rst
    tune-sklearn.rst
    tune-xgboost.rst
    tune-wandb.rst
    tune-comet.rst

Colab Exercises
---------------

Learn how to use Tune in your browser with the following Colab-based exercises.

.. raw:: html

    <table>
      <tr>
        <th class="tune-colab">Exercise Description</th>
        <th class="tune-colab">Library</th>
        <th class="tune-colab">Colab Link</th>
      </tr>
      <tr>
        <td class="tune-colab">Basics of using Tune.</td>
        <td class="tune-colab">TF/Keras</td>
        <td class="tune-colab">
          <a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_1_basics.ipynb" target="_parent">
            <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
          </a>
        </td>
      </tr>

      <tr>
        <td class="tune-colab">Using Search algorithms and Trial Schedulers to optimize your model.</td>
        <td class="tune-colab">Pytorch</td>
        <td class="tune-colab">
          <a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_2_optimize.ipynb" target="_parent">
            <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
          </a>
        </td>
      </tr>

      <tr>
        <td class="tune-colab">Using Population-Based Training (PBT).</td>
        <td class="tune-colab">Pytorch</td>
        <td class="tune-colab">
          <a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_3_pbt.ipynb" target="_parent">
            <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
          </a>
        </td>
      </tr>

      <tr>
        <td class="tune-colab">Fine-tuning Huggingface Transformers with PBT.</td>
        <td class="tune-colab">Huggingface Transformers/Pytorch</td>
        <td class="tune-colab">
          <a href="https://colab.research.google.com/drive/1tQgAKgcKQzheoh503OzhS4N9NtfFgmjF?usp=sharing" target="_parent">
            <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
          </a>
        </td>
      </tr>

      <tr>
        <td class="tune-colab">Logging Tune Runs to Comet ML.</td>
        <td class="tune-colab">Comet</td>
        <td class="tune-colab">
          <a href="https://colab.research.google.com/drive/1dp3VwVoAH1acn_kG7RuT62mICnOqxU1z?usp=sharing" target="_parent">
            <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
          </a>
        </td>
      </tr>
    </table>

Tutorial source files `can be found here <https://github.com/ray-project/tutorial>`_.

What's Next?
-------------

Check out:

* :doc:`/tune/user-guide`: A comprehensive overview of Tune's features.
* :doc:`/tune/examples/index`: End-to-end examples and templates for using Tune with your preferred machine learning library.
@@ -1,165 +0,0 @@
# flake8: noqa
"""
Tune's Scikit Learn Adapters
============================

Scikit-Learn is one of the most widely used tools in the ML community for working with data, offering dozens of easy-to-use machine learning algorithms. However, to achieve high performance for these algorithms, you often need to perform **model selection**.


.. image:: /images/tune-sklearn.png
    :align: center
    :width: 50%

Scikit-Learn `has an existing module for model selection <https://scikit-learn.org/stable/modules/grid_search.html>`_, but the algorithms offered (Grid Search/``GridSearchCV`` and Random Search/``RandomizedSearchCV``) are often considered inefficient. In this tutorial, we'll cover ``tune-sklearn``, a drop-in replacement for Scikit-Learn's model selection module with state-of-the-art optimization features such as early stopping and Bayesian Optimization.

.. tip:: Check out the `tune-sklearn code`_ and :ref:`documentation <tune-sklearn-docs>`.

.. _`tune-sklearn code`: https://github.com/ray-project/tune-sklearn

Overview
--------

``tune-sklearn`` is a module that integrates Ray Tune's hyperparameter tuning and scikit-learn's Classifier API. ``tune-sklearn`` has two APIs: :ref:`TuneSearchCV <tunesearchcv-docs>`, and :ref:`TuneGridSearchCV <tunegridsearchcv-docs>`. They are drop-in replacements for Scikit-learn's RandomizedSearchCV and GridSearchCV, so you only need to change less than 5 lines in a standard Scikit-Learn script to use the API.

Ray Tune's Scikit-learn APIs allows you to easily leverage Bayesian Optimization, HyperBand, and other cutting edge tuning techniques by simply toggling a few parameters. It also supports and provides examples for many other frameworks with Scikit-Learn wrappers such as Skorch (Pytorch), KerasClassifiers (Keras), and XGBoostClassifiers (XGBoost).

Run ``pip install "ray[tune]" tune-sklearn`` to get started.

Walkthrough
-----------

Let's compare Tune's Scikit-Learn APIs to the standard scikit-learn GridSearchCV. For this example, we'll be using ``TuneGridSearchCV`` with a `SGDClassifier`_.

.. _`digits dataset`: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html
.. _`SGDClassifier`: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html

To start out, change the import statement to get tune-scikit-learn’s grid search cross validation interface:

"""
# Keep this here for https://github.com/ray-project/ray/issues/11547
from sklearn.model_selection import GridSearchCV

# Replace above line with:
from ray.tune.sklearn import TuneGridSearchCV

#######################################################################
# And from there, we would proceed just like how we would in Scikit-Learn’s interface!
#
# The `SGDClassifier`_ has a ``partial_fit`` API, which enables it to stop fitting to the data for a certain hyperparameter configuration.
# If the estimator does not support early stopping, we would fall back to a parallel grid search.

# Other imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification
import numpy as np

# Create dataset
X, y = make_classification(
    n_samples=11000,
    n_features=1000,
    n_informative=50,
    n_redundant=0,
    n_classes=10,
    class_sep=2.5,
)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=1000)

# Example parameters to tune from SGDClassifier
parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}

#######################################################################
# As you can see, the setup here is exactly how you would do it for Scikit-Learn. Now, let's try fitting a model.

tune_search = TuneGridSearchCV(
    SGDClassifier(), parameter_grid, early_stopping=True, max_iters=10
)

import time  # Just to compare fit times

start = time.time()
tune_search.fit(x_train, y_train)
end = time.time()
print("Tune GridSearch Fit Time:", end - start)
# Tune GridSearch Fit Time: 15.436315774917603 (for an 8 core laptop)

#######################################################################
# Note the slight differences we introduced above:
#
# * a `early_stopping`, and
# * a specification of `max_iters` parameter
#
# The ``early_stopping`` parameter allows us to terminate unpromising configurations. If ``early_stopping=True``,
# TuneGridSearchCV will default to using Tune's ASHAScheduler. You can pass in a custom
# algorithm - see :ref:`Tune's documentation on schedulers <tune-schedulers>` here for a full list to choose from.
# ``max_iters`` is the maximum number of iterations a given hyperparameter set could run for; it may run for fewer iterations if it is early stopped.
#
# Try running this compared to the GridSearchCV equivalent, and see the speedup for yourself!

from sklearn.model_selection import GridSearchCV

# n_jobs=-1 enables use of all cores like Tune does
sklearn_search = GridSearchCV(SGDClassifier(), parameter_grid, n_jobs=-1)

start = time.time()
sklearn_search.fit(x_train, y_train)
end = time.time()
print("Sklearn Fit Time:", end - start)
# Sklearn Fit Time: 47.48055911064148 (for an 8 core laptop)

###################################################################
# Using Bayesian Optimization
# ---------------------------
#
# In addition to the grid search interface, tune-sklearn also provides an interface, TuneSearchCV, for sampling from **distributions of hyperparameters**.
#
# In addition, you can easily enable Bayesian optimization over the distributions in only 2 lines of code:

# First run `pip install bayesian-optimization`
from ray.tune.sklearn import TuneSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

digits = datasets.load_digits()
x = digits.data
y = digits.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = SGDClassifier()
parameter_grid = {"alpha": (1e-4, 1), "epsilon": (0.01, 0.1)}

tune_search = TuneSearchCV(
    clf,
    parameter_grid,
    search_optimization="bayesian",
    n_trials=3,
    early_stopping=True,
    max_iters=10,
)
tune_search.fit(x_train, y_train)
print(tune_search.best_params_)
# {'alpha': 0.37460266483547777, 'epsilon': 0.09556428757689246}

################################################################
# As you can see, it’s very simple to integrate tune-sklearn into existing code. Distributed execution is also easy - you can simply run ``ray.init(address="auto")`` before
# TuneSearchCV to connect to the Ray cluster and parallelize tuning across multiple nodes, as you would in any other Ray Tune script.
#
#
# Code Examples
# -------------
#
# Check out more detailed examples and get started with tune-sklearn!
#
# * `Skorch with tune-sklearn <https://github.com/ray-project/tune-sklearn/blob/master/examples/torch_nn.py>`_
# * `Scikit-Learn Pipelines with tune-sklearn <https://github.com/ray-project/tune-sklearn/blob/master/examples/sklearn_pipeline.py>`_
# * `XGBoost with tune-sklearn <https://github.com/ray-project/tune-sklearn/blob/master/examples/xgbclassifier.py>`_
# * `KerasClassifier with tune-sklearn <https://github.com/ray-project/tune-sklearn/blob/master/examples/keras_example.py>`_
# * `LightGBM with tune-sklearn <https://github.com/ray-project/tune-sklearn/blob/master/examples/lgbm.py>`_
#
#
# Further Reading
# ---------------
#
# If you're using scikit-learn for other tasks, take a look at Ray’s :ref:`replacement for joblib <ray-joblib>`, which allows users to parallelize scikit learn jobs over multiple nodes.
98 doc/source/tune/api_docs/env.rst Normal file
@@ -0,0 +1,98 @@
.. _tune-env-vars:

Environment variables
---------------------

Some of Ray Tune's behavior can be configured using environment variables.
These are the environment variables Ray Tune currently considers:

* **TUNE_CLUSTER_SSH_KEY**: SSH key used by the Tune driver process to connect
  to remote cluster machines for checkpoint syncing. If this is not set,
  ``~/ray_bootstrap_key.pem`` will be used.
* **TUNE_DISABLE_AUTO_CALLBACK_LOGGERS**: Ray Tune automatically adds a CSV and
  JSON logger callback if they haven't been passed. Setting this variable to
  ``1`` disables this automatic creation. Please note that this will most likely
  affect analyzing your results after the tuning run.
* **TUNE_DISABLE_AUTO_CALLBACK_SYNCER**: Ray Tune automatically adds a
  Syncer callback to sync logs and checkpoints between different nodes if none
  has been passed. Setting this variable to ``1`` disables this automatic creation.
  Please note that this will most likely affect advanced scheduling algorithms
  like PopulationBasedTraining.
* **TUNE_DISABLE_AUTO_INIT**: Disable automatically calling ``ray.init()`` if
  not attached to a Ray session.
* **TUNE_DISABLE_DATED_SUBDIR**: Ray Tune automatically adds a date string to experiment
  directories when the name is not specified explicitly or the trainable isn't passed
  as a string. Setting this environment variable to ``1`` disables adding these date strings.
* **TUNE_DISABLE_STRICT_METRIC_CHECKING**: When you report metrics to Tune via
  ``tune.report()`` and passed a ``metric`` parameter to ``tune.run()``, a scheduler,
  or a search algorithm, Tune will error
  if the metric was not reported in the result. Setting this environment variable
  to ``1`` will disable this check.
* **TUNE_DISABLE_SIGINT_HANDLER**: Ray Tune catches SIGINT signals (e.g. sent by
  Ctrl+C) to gracefully shut down and do a final checkpoint. Setting this variable
  to ``1`` will disable signal handling and stop execution right away. Defaults to
  ``0``.
* **TUNE_FORCE_TRIAL_CLEANUP_S**: By default, Ray Tune will gracefully terminate trials,
  letting them finish the current training step and any user-defined cleanup.
  Setting this variable to a non-zero, positive integer will cause trials to be forcefully
  terminated after a grace period of that many seconds. Defaults to ``0``.
* **TUNE_FUNCTION_THREAD_TIMEOUT_S**: Time in seconds the function API waits
  for threads to finish after instructing them to complete. Defaults to ``2``.
* **TUNE_GLOBAL_CHECKPOINT_S**: Time in seconds that limits how often Tune's
  experiment state is checkpointed. If not set this will default to ``10``.
* **TUNE_MAX_LEN_IDENTIFIER**: Maximum length of trial subdirectory names (those
  with the parameter values in them).
* **TUNE_MAX_PENDING_TRIALS_PG**: Maximum number of pending trials when placement groups are used. Defaults
  to ``auto``, which will be updated to ``max(16, cluster_cpus * 1.1)`` for random/grid search and ``1`` for any other search algorithms.
* **TUNE_PLACEMENT_GROUP_CLEANUP_DISABLED**: Ray Tune cleans up existing placement groups
  with the ``_tune__`` prefix in their name before starting a run. This is used to make sure
  that scheduled placement groups are removed when multiple calls to ``tune.run()`` are
  done in the same script. You might want to disable this if you run multiple Tune runs in
  parallel from different scripts. Set to ``1`` to disable.
* **TUNE_PLACEMENT_GROUP_PREFIX**: Prefix for placement groups created by Ray Tune. This prefix is used
  e.g. to identify placement groups that should be cleaned up on start/stop of the tuning run. This is
  initialized to a unique name at the start of the first run.
* **TUNE_PLACEMENT_GROUP_RECON_INTERVAL**: How often to reconcile placement groups. Reconciliation is
  used to make sure that the number of requested placement groups and pending/running trials are in sync.
  In normal circumstances these shouldn't differ anyway, but reconciliation makes sure to capture cases when
  placement groups are manually destroyed. Reconciliation doesn't take much time, but it can add up when
  running a large number of short trials. Defaults to every ``5`` (seconds).
* **TUNE_PLACEMENT_GROUP_WAIT_S**: Default time the trial executor waits for placement
  groups to be placed before continuing the tuning loop. Setting this to a float
  will block for that many seconds. This is mostly used for testing purposes. Defaults
  to ``-1``, which disables blocking.
* **TUNE_RESULT_DIR**: Directory where Ray Tune trial results are stored. If this
  is not set, ``~/ray_results`` will be used.
* **TUNE_RESULT_BUFFER_LENGTH**: Ray Tune can buffer results from trainables before they are passed
  to the driver. Enabling this might delay scheduling decisions, as trainables are speculatively
  continued. Setting this to ``1`` disables result buffering. Cannot be used with ``checkpoint_at_end``.
  Defaults to disabled.
* **TUNE_RESULT_DELIM**: Delimiter used for nested entries in
  :class:`ExperimentAnalysis <ray.tune.ExperimentAnalysis>` dataframes. Defaults to ``.`` (but will be
  changed to ``/`` in future versions of Ray).
* **TUNE_RESULT_BUFFER_MAX_TIME_S**: Similarly, Ray Tune buffers results up to ``number_of_trial/10`` seconds,
  but never longer than this value. Defaults to 100 (seconds).
* **TUNE_RESULT_BUFFER_MIN_TIME_S**: Additionally, you can specify a minimum time to buffer results. Defaults to 0.
* **TUNE_SYNCER_VERBOSITY**: Amount of command output when using Tune with Docker Syncer. Defaults to 0.
* **TUNE_TRIAL_RESULT_WAIT_TIME_S**: Amount of time Ray Tune will block until a result from a running trial is received.
  Defaults to 1 (second).
* **TUNE_TRIAL_STARTUP_GRACE_PERIOD**: Amount of time after starting a trial that Ray Tune checks for successful
  trial startups. After the grace period, Tune will block for up to ``TUNE_TRIAL_RESULT_WAIT_TIME_S`` seconds
  until a result from a running trial is received. Can be disabled by setting this to lower or equal to 0.
* **TUNE_WARN_THRESHOLD_S**: Threshold for logging if a Tune event loop operation takes too long. Defaults to 0.5 (seconds).
* **TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S**: Threshold for throwing a warning if no active trials are in ``RUNNING`` state
  for this amount of seconds. If the Ray Tune job is stuck in this state (most likely due to insufficient resources),
  the warning message is printed repeatedly at this interval. Defaults to 60 (seconds).
* **TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER**: Threshold for throwing a warning, when the autoscaler is enabled,
  if no active trials are in ``RUNNING`` state for this amount of seconds.
  If the Ray Tune job is stuck in this state (most likely due to insufficient resources), the warning message is printed
  repeatedly at this interval. Defaults to 60 (seconds).
* **TUNE_STATE_REFRESH_PERIOD**: Frequency of updating the resource tracking from Ray. Defaults to 10 (seconds).
* **TUNE_SYNC_DISABLE_BOOTSTRAP**: Disable bootstrapping the autoscaler config for Docker syncing.


There are some environment variables that are mostly relevant for integrated libraries:

* **SIGOPT_KEY**: SigOpt API access key.
* **WANDB_API_KEY**: Weights and Biases API key. You can also use ``wandb login``
  instead.
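To make the reference above concrete, here is a minimal, hypothetical sketch of how such variables are typically consumed: the variable names come from the list above, but the values and the toy trainable are illustrative only, and the variables must be set before the tuning run starts.

.. code-block:: python

    import os

    # Illustrative values; see the list above for each variable's semantics.
    os.environ["TUNE_RESULT_DIR"] = os.path.expanduser("~/my_tune_results")
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"  # no auto CSV/JSON loggers
    os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "30"  # checkpoint experiment state every 30s

    from ray import tune

    def trainable(config):
        tune.report(score=config["x"] ** 2)

    tune.run(trainable, config={"x": tune.uniform(0, 1)}, num_samples=4)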
@@ -37,7 +37,7 @@ Kubernetes (tune.integration.kubernetes)
 MLflow (tune.integration.mlflow)
 --------------------------------

-:ref:`See also here <tune-mlflow>`.
+:ref:`See also here <tune-mlflow-ref>`.

 .. autoclass:: ray.tune.integration.mlflow.MLflowLoggerCallback

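For context, a hedged sketch of attaching the ``MLflowLoggerCallback`` documented above to a run; the trainable and the experiment name are hypothetical stand-ins.

.. code-block:: python

    from ray import tune
    from ray.tune.integration.mlflow import MLflowLoggerCallback

    def trainable(config):  # hypothetical stand-in trainable
        tune.report(loss=config["lr"])

    tune.run(
        trainable,
        config={"lr": tune.loguniform(1e-4, 1e-1)},
        callbacks=[MLflowLoggerCallback(experiment_name="my_experiment")],
    )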
@@ -87,7 +87,7 @@ Horovod (tune.integration.horovod)
 Weights and Biases (tune.integration.wandb)
 -------------------------------------------

-:ref:`See also here <tune-wandb>`.
+:ref:`See also here <tune-wandb-ref>`.

 .. autoclass:: ray.tune.integration.wandb.WandbLoggerCallback

@@ -3,7 +3,7 @@
 Loggers (tune.logger)
 =====================

-Tune has default loggers for Tensorboard, CSV, and JSON formats. By default, Tune only logs the returned result dictionaries from the training function.
+Tune has default loggers for TensorBoard, CSV, and JSON formats. By default, Tune only logs the returned result dictionaries from the training function.

 If you need to log something lower level like model weights or gradients, see :ref:`Trainable Logging <trainable-logging>`.

@@ -60,7 +60,7 @@ You can then pass in your own logger as follows:
         callbacks=[CustomLoggerCallback("log_test.txt")]
     )

-Per default, Ray Tune creates JSON, CSV and TensorboardX logger callbacks if you don't pass them yourself.
+Per default, Ray Tune creates JSON, CSV and TensorBoardX logger callbacks if you don't pass them yourself.
 You can disable this behavior by setting the ``TUNE_DISABLE_AUTO_CALLBACK_LOGGERS`` environment variable to ``"1"``.

 An example of creating a custom logger can be found in :doc:`/tune/examples/logging_example`.
@@ -70,7 +70,7 @@ An example of creating a custom logger can be found in :doc:`/tune/examples/logg
 Trainable Logging
 -----------------

-By default, Tune only logs the *training result dictionaries* from your Trainable. However, you may want to visualize the model weights, model graph, or use a custom logging library that requires multi-process logging. For example, you may want to do this if you're trying to log images to Tensorboard.
+By default, Tune only logs the *training result dictionaries* from your Trainable. However, you may want to visualize the model weights, model graph, or use a custom logging library that requires multi-process logging. For example, you may want to do this if you're trying to log images to TensorBoard.

 You can do this in the trainable, as shown below:

@@ -11,20 +11,21 @@ on `Github`_.
 .. _`GitHub`: https://github.com/ray-project/ray/issues

 .. toctree::
-    :maxdepth: 2
+    :maxdepth: 2

-    execution.rst
-    trainable.rst
-    reporters.rst
-    analysis.rst
-    search_space.rst
-    suggestion.rst
-    schedulers.rst
-    sklearn.rst
-    stoppers.rst
-    logging.rst
-    integration.rst
-    internals.rst
-    client.rst
-    cli.rst
-    scalability.rst
+    execution.rst
+    env.rst
+    trainable.rst
+    search_space.rst
+    suggestion.rst
+    schedulers.rst
+    stoppers.rst
+    reporters.rst
+    analysis.rst
+    sklearn.rst
+    logging.rst
+    integration.rst
+    internals.rst
+    client.rst
+    cli.rst
+    scalability.rst

@@ -107,7 +107,7 @@ Below we present more detailed results on the result throughput performance.

 Many concurrent trials
 """"""""""""""""""""""
-In this setup, loggers (CSV, JSON, and TensorboardX) and trial synchronization are disabled, except when
+In this setup, loggers (CSV, JSON, and TensorBoardX) and trial synchronization are disabled, except when
 explicitly noted.

 In this experiment, we're running many concurrent trials (up to 1,000) on a cluster. We then adjust the
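As a rough illustration of the benchmark setup described above (and assuming the ``TUNE_DISABLE_AUTO_CALLBACK_*`` variables documented in ``env.rst`` earlier in this diff), loggers and trial syncing can be disabled like this; the trainable is a placeholder.

.. code-block:: python

    import os

    # Disable the automatic CSV/JSON/TensorBoardX loggers and the auto syncer,
    # mirroring the benchmark setup described above.
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_SYNCER"] = "1"

    from ray import tune

    def trainable(config):
        tune.report(metric=1.0)  # placeholder workload

    tune.run(trainable, num_samples=1000)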
@@ -143,7 +143,7 @@ should be considered.

 Many results on a single node
 """""""""""""""""""""""""""""
-In this setup, loggers (CSV, JSON, and TensorboardX) are disabled, except when
+In this setup, loggers (CSV, JSON, and TensorBoardX) are disabled, except when
 explicitly noted.

 In this experiment, we're running 96 concurrent trials on a single node. We then adjust the
@@ -10,7 +10,7 @@ doesn't change anymore.
 Ray Tune comes with several stopping mechanisms out of the box. For custom stopping behavior, you can
 inherit from the :class:`Stopper <ray.tune.Stopper>` class.

-Other stopping behaviors are described :ref:`in the user guide <tune-stopping>`.
+Other stopping behaviors are described :ref:`in the user guide <tune-stopping-ref>`.

 .. contents::
     :local:

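To illustrate the ``Stopper`` subclassing mentioned above, a minimal sketch (the metric name and threshold are made up); an instance would be passed via the ``stop`` argument of ``tune.run``.

.. code-block:: python

    from ray.tune import Stopper

    class LossThresholdStopper(Stopper):
        """Stop a trial once its reported "loss" falls below a threshold."""

        def __init__(self, threshold: float):
            self._threshold = threshold

        def __call__(self, trial_id, result):
            # Called for every reported result; returning True stops this trial.
            return result["loss"] < self._threshold

        def stop_all(self):
            # Returning True would end the whole experiment.
            return False

    # Usage: tune.run(trainable, stop=LossThresholdStopper(0.05))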
@@ -42,7 +42,7 @@ Tune will run this function on a separate thread in a Ray actor process.

 You'll notice that Ray Tune will output extra values in addition to the user reported metrics, such as ``iterations_since_restore``. See :ref:`tune-autofilled-metrics` for an explanation/glossary of these values.

-.. tip:: If you want to leverage multi-node data parallel training with PyTorch while using parallel hyperparameter tuning, check out our :ref:`PyTorch <tune-pytorch-cifar>` user guide and Tune's :ref:`distributed pytorch integrations <tune-integration-torch>`.
+.. tip:: If you want to leverage multi-node data parallel training with PyTorch while using parallel hyperparameter tuning, check out our :ref:`PyTorch <tune-pytorch-cifar-ref>` user guide and Tune's :ref:`distributed pytorch integrations <tune-integration-torch>`.

 Function API return and yield values
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

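For reference, a minimal function-API trainable of the kind described above - a hedged sketch, not taken from the diff; the metric name is arbitrary.

.. code-block:: python

    from ray import tune

    def trainable(config):
        # Tune runs this body on a separate thread inside a Ray actor and
        # auto-fills values such as ``iterations_since_restore``.
        for step in range(10):
            tune.report(mean_loss=1.0 / (step + 1))

    analysis = tune.run(trainable, metric="mean_loss", mode="min", num_samples=2)
    print(analysis.best_config)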
@@ -1,97 +0,0 @@
.. _tune-contrib:

Contributing to Tune
====================

We welcome (and encourage!) all forms of contributions to Tune, including and not limited to:

- Code reviewing of patches and PRs.
- Pushing patches.
- Documentation and examples.
- Community participation in forums and issues.
- Code readability and code comments to improve readability.
- Test cases to make the codebase more robust.
- Tutorials, blog posts, talks that promote the project.

Developing Tune
---------------

First, following the instructions in :ref:`python-develop` to develop Tune without compiling Ray.

After Ray is set up, run ``pip install -r ray/python/ray/tune/requirements-dev.txt`` to install all packages required for Tune development.

Submitting and Merging a Contribution
-------------------------------------

There are a couple steps to merge a contribution.

1. First rebase your development branch on the most recent version of master.

   .. code:: bash

       git remote add upstream https://github.com/ray-project/ray.git
       git fetch upstream
       git rebase upstream/master # or git pull . upstream/master

2. Make sure all existing tests `pass <contrib.html#testing>`__.
3. If introducing a new feature or patching a bug, be sure to add new test cases
   in the relevant file in ``tune/tests/``.
4. Document the code. Public functions need to be documented, and remember to provide a usage
   example if applicable.
5. Request code reviews from other contributors and address their comments. One fast way to get reviews is
   to help review others' code so that they return the favor. You should aim to improve the code as much as
   possible before the review. We highly value patches that can get in without extensive reviews.
6. Reviewers will merge and approve the pull request; be sure to ping them if
   the pull request is getting stale.


Testing
-------

Even though we have hooks to run unit tests automatically for each pull request,
we recommend you to run unit tests locally beforehand to reduce reviewers’
burden and speedup review process.


.. code-block:: shell

    pytest ray/python/ray/tune/tests/

Documentation should be documented in `Google style <https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html>`__ format.

We also have tests for code formatting and linting that need to pass before merge. You can run the following locally:

.. code-block:: shell

    ray/scripts/format.sh


What can I work on?
-------------------

We use Github to track issues, feature requests, and bugs. Take a look at the
ones labeled `"good first issue" <https://github.com/ray-project/ray/issues?utf8=%E2%9C%93&q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22>`__ and `"help wanted" <https://github.com/ray-project/ray/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22>`__ for a place to start. Look for issues with "[tune]" in the title.

.. note::

    If raising a new issue or PR related to Tune, be sure to include "[tune]" in the title and add a ``tune`` label.

For project organization, Tune maintains a relatively up-to-date organization of
issues on the `Tune Github Project Board <https://github.com/ray-project/ray/projects/4>`__.
Here, you can track and identify how issues are organized.


Becoming a Reviewer
-------------------

We identify reviewers from active contributors. Reviewers are individuals who
not only actively contribute to the project but are also willing
to participate in the code review of new contributions.
A pull request to the project has to be reviewed by at least one reviewer in order to be merged.
There is currently no formal process, but active contributors to Tune will be
solicited by current reviewers.


.. note::

    These tips are based off of the TVM `contributor guide <https://github.com/dmlc/tvm>`__.
@@ -1,3 +1,5 @@
+.. _tune-examples-ref:
+
 ========
 Examples
 ========
@@ -84,14 +86,14 @@ PyTorch
 Pytorch Lightning
 ~~~~~~~~~~~~~~~~~

-- :doc:`/tune/examples/mnist_ptl_mini`: A minimal example of using `Pytorch Lightning <https://github.com/PyTorchLightning/pytorch-lightning>`_ to train a MNIST model. This example utilizes the Ray Tune-provided :ref:`PyTorch Lightning callbacks <tune-integration-pytorch-lightning>`. See also :ref:`this tutorial for a full walkthrough <tune-pytorch-lightning>`.
+- :doc:`/tune/examples/mnist_ptl_mini`: A minimal example of using `Pytorch Lightning <https://github.com/PyTorchLightning/pytorch-lightning>`_ to train a MNIST model. This example utilizes the Ray Tune-provided :ref:`PyTorch Lightning callbacks <tune-integration-pytorch-lightning>`. See also :ref:`this tutorial for a full walkthrough <tune-pytorch-lightning-ref>`.
 - :doc:`/tune/examples/mnist_pytorch_lightning`: A comprehensive example using `Pytorch Lightning <https://github.com/PyTorchLightning/pytorch-lightning>`_ to train a MNIST model. This example showcases how to use various search optimization techniques. It utilizes the Ray Tune-provided :ref:`PyTorch Lightning callbacks <tune-integration-pytorch-lightning>`.
-- :ref:`A walkthrough tutorial for using Ray Tune with Pytorch-Lightning <tune-pytorch-lightning>`.
+- :ref:`A walkthrough tutorial for using Ray Tune with Pytorch-Lightning <tune-pytorch-lightning-ref>`.

 Wandb, MLflow
 ~~~~~~~~~~~~~

-- :ref:`Tutorial <tune-wandb>` for using `wandb <https://www.wandb.ai/>`__ with Ray Tune
+- :ref:`Tutorial <tune-wandb-ref>` for using `wandb <https://www.wandb.ai/>`__ with Ray Tune
 - :doc:`/tune/examples/wandb_example`: Example for using `Weights and Biases <https://www.wandb.ai/>`__ with Ray Tune.
 - :doc:`/tune/examples/mlflow_example`: Example for using `MLflow <https://github.com/mlflow/mlflow/>`__ with Ray Tune.
 - :doc:`/tune/examples/mlflow_ptl_example`: Example for using `MLflow <https://github.com/mlflow/mlflow/>`__ and `Pytorch Lightning <https://github.com/PyTorchLightning/pytorch-lightning>`_ with Ray Tune.
@@ -118,7 +120,7 @@ Horovod
 XGBoost, LightGBM
 ~~~~~~~~~~~~~~~~~

-- :ref:`XGBoost tutorial <tune-xgboost>`: A guide to tuning XGBoost parameters with Tune.
+- :ref:`XGBoost tutorial <tune-xgboost-ref>`: A guide to tuning XGBoost parameters with Tune.
 - :doc:`/tune/examples/xgboost_example`: Trains a basic XGBoost model with Tune with the function-based API and an XGBoost callback.
 - :doc:`/tune/examples/xgboost_dynamic_resources_example`: Trains a basic XGBoost model with Tune with the class-based API and a ResourceChangingScheduler, ensuring all resources are being used at all time.
 - :doc:`/tune/examples/lightgbm_example`: Trains a basic LightGBM model with Tune with the function-based API and a LightGBM callback.

@@ -3,8 +3,8 @@
 Ray Tune FAQ
 ------------

-Here we try to answer questions that come up often. If you still have questions
-after reading this, let us know!
+Here we try to answer questions that come up often.
+If you still have questions after reading this FAQ, let us know!

 .. contents::
     :local:

@@ -516,3 +516,330 @@ should maybe process a larger chunk of data. In function trainables, you can rep
 of the training loop. Try to balance the number of results you really need to make scheduling or searching
 decisions. If you need more fine grained metrics for logging or tracking, consider using a separate logging
 mechanism for this instead of the Ray Tune-provided progress logging of results.

How can I develop and test Tune locally?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

First, follow the instructions in :ref:`python-develop` to develop Tune without compiling Ray.
After Ray is set up, run ``pip install -r ray/python/ray/tune/requirements-dev.txt`` to install all packages
required for Tune development. Now, to run all Tune tests, simply run:

.. code-block:: shell

    pytest ray/python/ray/tune/tests/

If you plan to submit a pull request, we recommend running unit tests locally beforehand to speed up the review process.
Even though we have hooks to run unit tests automatically for each pull request, it's usually quicker to run them
on your machine first to avoid any obvious mistakes.


How can I get started contributing to Tune?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We use GitHub to track issues, feature requests, and bugs. Take a look at the
ones labeled `"good first issue" <https://github.com/ray-project/ray/issues?utf8=%E2%9C%93&q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22>`__ and `"help wanted" <https://github.com/ray-project/ray/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22>`__ for a place to start. Look for issues with "[tune]" in the title.

.. note::

    If raising a new issue or PR related to Tune, be sure to include "[tune]" in the title and add a ``tune`` label.

For project organization, Tune maintains a relatively up-to-date organization of
issues on the `Tune GitHub Project Board <https://github.com/ray-project/ray/projects/4>`__.
Here, you can track and identify how issues are organized.


.. _tune-reproducible:

How can I make my Tune experiments reproducible?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Exact reproducibility of machine learning runs is hard to achieve. This
is even more true in a distributed setting, as more non-determinism is
introduced. For instance, if two trials finish at the same time, the
convergence of the search algorithm might be influenced by which trial
result is processed first. This depends on the searcher - for random search,
this shouldn't make a difference, but for most other searchers it will.

If you want to achieve some amount of reproducibility, there are two
places where you'll have to set random seeds:

1. On the driver program, e.g. for the search algorithm. This will ensure
   that at least the initial configurations suggested by the search
   algorithms are the same.

2. In the trainable (if required). Neural networks are usually initialized
   with random numbers, and many classical ML algorithms, like GBDTs, make use of
   randomness. Thus you'll want to make sure to set a seed here
   so that the initialization is always the same.

Here is an example that will always produce the same result (except for trial
runtimes).

.. code-block:: python

    import numpy as np
    from ray import tune


    def train(config):
        # Set seed for trainable random result.
        # If you remove this line, you will get different results
        # each time you run the trial, even if the configuration
        # is the same.
        np.random.seed(config["seed"])
        random_result = np.random.uniform(0, 100, size=1).item()
        tune.report(result=random_result)


    # Set seed for Ray Tune's random search.
    # If you remove this line, you will get different configurations
    # each time you run the script.
    np.random.seed(1234)
    tune.run(
        train,
        config={
            "seed": tune.randint(0, 1000)
        },
        search_alg=tune.suggest.BasicVariantGenerator(),
        num_samples=10)

Some searchers use their own random states to sample new configurations.
These searchers usually accept a ``seed`` parameter that can be passed on
initialization. Other searchers use Numpy's ``np.random`` interface -
these seeds can then be set with ``np.random.seed()``. We don't offer an
interface to do this in the searcher classes, as setting a random seed
globally could have side effects. For instance, it could influence the
way your dataset is split. Thus, we leave it up to the user to make
these global configuration changes.
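As an illustration of seeding a searcher that manages its own random state, here is a hedged sketch - it assumes ``BayesOptSearch`` (which requires the ``bayesian-optimization`` package) and its ``random_state`` argument; the objective is a toy function.

.. code-block:: python

    from ray import tune
    from ray.tune.suggest.bayesopt import BayesOptSearch

    def objective(config):
        tune.report(result=config["x"] ** 2)

    searcher = BayesOptSearch(metric="result", mode="min", random_state=1234)
    tune.run(
        objective,
        config={"x": tune.uniform(0, 1)},
        search_alg=searcher,
        num_samples=10,
    )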
How can I use large datasets in Tune?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You often will want to compute a large object (e.g., training data, model weights) on the driver and use that
|
||||
object within each trial.
|
||||
|
||||
Tune provides a wrapper function ``tune.with_parameters()`` that allows you to broadcast large objects to your trainable.
|
||||
Objects passed with this wrapper will be stored on the :ref:`Ray object store <objects-in-ray>` and will
|
||||
be automatically fetched and passed to your trainable as a parameter.
|
||||
|
||||
|
||||
.. tip:: If the objects are small in size or already exist in the :ref:`Ray Object Store <objects-in-ray>`, there's no need to use ``tune.with_parameters()``. You can use `partials <https://docs.python.org/3/library/functools.html#functools.partial>`__ or pass in directly to ``config`` instead.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray import tune
|
||||
|
||||
import numpy as np
|
||||
|
||||
def f(config, data=None):
|
||||
pass
|
||||
# use data
|
||||
|
||||
data = np.random.random(size=100000000)
|
||||
|
||||
tune.run(tune.with_parameters(f, data=data))
|
||||
|
||||
|
||||
How can I upload my Tune results to cloud storage?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If an upload directory is provided, Tune will automatically sync results from the ``local_dir`` to the given directory,
|
||||
natively supporting standard URIs for systems like S3, gsutil or HDFS.
|
||||
Here is an example of uploading to S3, using a bucket called ``my-log-dir``:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
tune.run(
|
||||
MyTrainableClass,
|
||||
local_dir="~/ray_results",
|
||||
sync_config=tune.SyncConfig(upload_dir="s3://my-log-dir")
|
||||
)
|
||||
|
||||
You can customize this to specify arbitrary storages with the ``syncer`` argument in ``tune.SyncConfig``.
|
||||
This argument supports either strings with the same replacement fields OR arbitrary functions.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
tune.run(
|
||||
MyTrainableClass,
|
||||
sync_config=tune.SyncConfig(
|
||||
upload_dir="s3://my-log-dir",
|
||||
syncer=custom_sync_str_or_func
|
||||
)
|
||||
)
|
||||
|
||||
If a string is provided, then it must include replacement fields ``{source}`` and ``{target}``, like
|
||||
``s3 sync {source} {target}``. Alternatively, a function can be provided with the following signature:
|
||||
|
||||

.. code-block:: python

    import subprocess

    def custom_sync_func(source, target):
        # do arbitrary things inside
        sync_cmd = "aws s3 sync {source} {target}".format(
            source=source,
            target=target)
        sync_process = subprocess.Popen(sync_cmd, shell=True)
        sync_process.wait()

By default, syncing occurs every 300 seconds.
To change the frequency of syncing, set the ``sync_period`` attribute of the sync config to the desired syncing period.

Note that uploading only happens when global experiment state is collected, and the frequency of this is
determined by the sync period. So the true upload period is given by ``max(sync_period, TUNE_GLOBAL_CHECKPOINT_S)``.
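
For example, a sketch that lowers the sync period to one minute:

.. code-block:: python

    tune.run(
        MyTrainableClass,
        sync_config=tune.SyncConfig(
            upload_dir="s3://my-log-dir",
            sync_period=60,  # sync every 60 seconds instead of the 300-second default
        )
    )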

Make sure that worker nodes have write access to the cloud storage.
Failing to do so would cause error messages like ``Error message (1): fatal error: Unable to locate credentials``.
For AWS setups, this involves adding an ``IamInstanceProfile`` configuration for worker nodes.
Please :ref:`see here for more tips <aws-cluster-s3>`.

.. _tune-docker:

How can I use Tune with Docker?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Tune automatically syncs files and checkpoints between different remote
containers as needed.

To make this work in your Docker cluster, e.g. when you are using the Ray autoscaler
with Docker containers, you will need to pass a
``DockerSyncer`` to the ``syncer`` argument of ``tune.SyncConfig``.

.. code-block:: python

    from ray.tune.integration.docker import DockerSyncer

    sync_config = tune.SyncConfig(
        syncer=DockerSyncer)

    tune.run(train, sync_config=sync_config)

.. _tune-kubernetes:

How can I use Tune with Kubernetes?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Ray Tune automatically synchronizes files and checkpoints between different remote nodes as needed.
This usually happens via SSH, which can become a :ref:`performance bottleneck <tune-bottlenecks>`,
especially when running many trials in parallel.

Instead, you should use shared storage for checkpoints so that no additional synchronization across nodes
is necessary. There are two main options.

First, you can use the :ref:`SyncConfig <tune-sync-config>` to store your
logs and checkpoints on cloud storage, such as AWS S3 or Google Cloud Storage:

.. code-block:: python

    from ray import tune

    tune.run(
        tune.durable(train_fn),
        # ...,
        sync_config=tune.SyncConfig(
            upload_dir="s3://your-s3-bucket/durable-trial/"
        )
    )

Second, you can set up a shared file system like NFS. If you do this, disable automatic trial syncing:

.. code-block:: python

    from ray import tune

    tune.run(
        train_fn,
        # ...,
        local_dir="/path/to/shared/storage",
        sync_config=tune.SyncConfig(
            # Do not sync because we are on shared storage
            syncer=None
        )
    )

Lastly, if you still want to use SSH for trial synchronization, but are not running
on the Ray cluster launcher, you might need to pass a
``KubernetesSyncer`` to the ``syncer`` argument of ``tune.SyncConfig``.
You have to specify your Kubernetes namespace explicitly:

.. code-block:: python

    from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer

    sync_config = tune.SyncConfig(
        syncer=NamespacedKubernetesSyncer("ray")
    )

    tune.run(train, sync_config=sync_config)

Please note that we strongly encourage you to use one of the other two options instead, as they will
result in less overhead and don't require pods to SSH into each other.

.. _tune-debugging:

How can I debug Tune experiments locally?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

By default, Tune will run hyperparameter evaluations on multiple processes.
However, if you need to debug your training process, it may be easier to do everything on a single process.
You can force all Ray functions to occur on a single process with ``local_mode`` by calling the following
before ``tune.run``:

.. code-block:: python

    ray.init(local_mode=True)

Local mode with multiple configuration evaluations will interleave computation,
so it is most naturally used when running a single configuration evaluation.

Note that ``local_mode`` has some known issues, so please read :ref:`these tips <local-mode-tips>` for more info.

.. _tune-default-search-space:

How do I configure search spaces?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You can specify a grid search or sampling distribution via the dict passed into ``tune.run(config=...)``:

.. code-block:: python

    parameters = {
        "qux": tune.sample_from(lambda spec: 2 + 2),
        "bar": tune.grid_search([True, False]),
        "foo": tune.grid_search([1, 2, 3]),
        "baz": "asd",  # a constant value
    }

    tune.run(trainable, config=parameters)

By default, each random variable and grid search point is sampled once.
To take multiple random samples, add ``num_samples: N`` to the experiment config.
If ``grid_search`` is provided as an argument, the grid will be repeated ``num_samples`` times.

.. code-block:: python
    :emphasize-lines: 13

    # num_samples=10 repeats the 3x3 grid search 10 times, for a total of 90 trials
    tune.run(
        my_trainable,
        name="my_trainable",
        config={
            "alpha": tune.uniform(0, 100),
            "beta": tune.sample_from(lambda spec: spec.config.alpha * np.random.normal()),
            "nn_layers": [
                tune.grid_search([16, 64, 256]),
                tune.grid_search([16, 64, 256]),
            ],
        },
        num_samples=10
    )

Note that search spaces may not be interoperable across different search algorithms.
For example, for many search algorithms, you will not be able to use ``grid_search`` or ``sample_from`` parameters.
Read about this in the :ref:`Search Space API <tune-search-space>` page.
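
For example, a search space built only from distribution and ``choice`` primitives
(a sketch, with illustrative parameter names) remains compatible with most search algorithms:

.. code-block:: python

    config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "momentum": tune.uniform(0.1, 0.9),
        "batch_size": tune.choice([32, 64, 128]),
    }

    tune.run(trainable, config=config)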

@ -1,10 +1,11 @@
.. _tune-tutorial:

Getting Started
===============

This tutorial will walk you through the process of setting up a Tune experiment.
We'll start with a PyTorch model and show you how to leverage Ray Tune to optimize the hyperparameters of this model.
Specifically, we'll leverage early stopping and Bayesian Optimization via HyperOpt to do so.

.. tip:: If you have suggestions as to how to improve this tutorial, please `let us know <https://github.com/ray-project/ray/issues/new/choose>`_!

@ -14,55 +15,70 @@ To run this example, you will need to install the following:

    $ pip install ray torch torchvision

Setting Up a Pytorch Model to Tune
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To start off, let's first import some dependencies.
We import some PyTorch and TorchVision modules to help us create a model and train it.
Also, we'll import Ray Tune to help us optimize the model.
As you can see, we use a so-called scheduler, in this case the ``ASHAScheduler``, which we will use for tuning the model
later in this tutorial.

.. literalinclude:: /../../python/ray/tune/tests/tutorial.py
    :language: python
    :start-after: __tutorial_imports_begin__
    :end-before: __tutorial_imports_end__

Then, let's define a simple PyTorch model that we'll be training.
If you're not familiar with PyTorch, the simplest way to define a model is to implement a ``nn.Module``.
This requires you to set up your model with ``__init__`` and then implement a ``forward`` pass.
In this example we're using a small convolutional neural network consisting of one 2D convolutional layer, a fully
connected layer, and a softmax function.

.. literalinclude:: /../../python/ray/tune/tests/tutorial.py
    :language: python
    :start-after: __model_def_begin__
    :end-before: __model_def_end__
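
For reference, such a model roughly corresponds to this sketch (the layer sizes here
are illustrative; see the included tutorial source for the exact definition):

.. code-block:: python

    import torch.nn as nn
    import torch.nn.functional as F

    class ConvNet(nn.Module):
        def __init__(self):
            super(ConvNet, self).__init__()
            # one 2D convolutional layer followed by a fully connected layer
            self.conv1 = nn.Conv2d(1, 3, kernel_size=3)
            self.fc = nn.Linear(192, 10)

        def forward(self, x):
            x = F.relu(F.max_pool2d(self.conv1(x), 3))
            x = x.view(-1, 192)
            x = self.fc(x)
            return F.log_softmax(x, dim=1)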

Below, we have implemented functions for training and evaluating your Pytorch model.
We define a ``train`` and a ``test`` function for that purpose.
If you know how to do this, :ref:`skip ahead to the Tune usage <tutorial-tune-setup>`.

.. dropdown:: Training and evaluating the model

    .. literalinclude:: /../../python/ray/tune/tests/tutorial.py
        :language: python
        :start-after: __train_def_begin__
        :end-before: __train_def_end__
.. _tutorial-tune-setup:

Setting up Tune
~~~~~~~~~~~~~~~

Below, we define a function that trains the Pytorch model for multiple epochs.
This function will be executed on a separate :ref:`Ray Actor (process) <actor-guide>` underneath the hood,
so we need to communicate the performance of the model back to Tune (which is on the main Python process).

To do this, we call :ref:`tune.report <tune-function-docstring>` in our training function,
which sends the performance value back to Tune. Since the function is executed on a separate process,
make sure that the function is :ref:`serializable by Ray <serialization-guide>`.

.. literalinclude:: /../../python/ray/tune/tests/tutorial.py
    :language: python
    :start-after: __train_func_begin__
    :end-before: __train_func_end__
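
The included training function follows this general pattern (a minimal sketch; the
exact hyperparameter names are defined in the tutorial source):

.. code-block:: python

    def train_mnist(config):
        model = ConvNet()
        optimizer = torch.optim.SGD(
            model.parameters(), lr=config["lr"], momentum=config["momentum"])
        for i in range(10):
            train(model, optimizer, train_loader)   # one epoch, using the train function above
            acc = test(model, test_loader)          # evaluate on the test set
            tune.report(mean_accuracy=acc)          # send the metric back to Tune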

Let's run one trial by calling :ref:`tune.run <tune-run-ref>` and :ref:`randomly sample <tune-sample-docs>`
from a uniform distribution for learning rate and momentum.

.. literalinclude:: /../../python/ray/tune/tests/tutorial.py
    :language: python
    :start-after: __eval_func_begin__
    :end-before: __eval_func_end__

``tune.run`` returns an :ref:`ExperimentAnalysis object <tune-analysis-docs>`.
You can use this to plot the performance of this trial.

.. literalinclude:: /../../python/ray/tune/tests/tutorial.py
    :language: python

@ -99,7 +115,7 @@ You can run the below in a Jupyter notebook to visualize trial progress.
    :scale: 50%
    :align: center

You can also use :ref:`TensorBoard <tensorboard>` for visualizing results.

.. code:: bash

@ -134,7 +150,6 @@ You can evaluate best trained model using the :ref:`ExperimentAnalysis object <t

Next Steps
----------

* Take a look at the :doc:`/tune/user-guide` for a more comprehensive overview of Tune's features.
* Check out the :ref:`Tune tutorials <tune-guides>` for guides on using Tune with your preferred machine learning library.
* Browse our :ref:`gallery of examples <tune-general-examples>` to see how to use Tune with PyTorch, XGBoost, Tensorflow, etc.
* `Let us know <https://github.com/ray-project/ray/issues>`__ if you ran into issues or have any questions by opening an issue on our GitHub.

BIN
doc/source/tune/images/serve_thumb.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 26 KiB

BIN
doc/source/tune/images/sklearn_thumb.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 26 KiB

BIN
doc/source/tune/images/tune_overview.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 112 KiB

@ -3,101 +3,220 @@

Tune: Scalable Hyperparameter Tuning
====================================

.. tip:: We'd love to hear your feedback on using Tune - `get in touch <https://forms.gle/PTRvGLbKRdUfuzQo9>`_!

.. image:: images/tune_overview.png
    :scale: 50%
    :align: center

Tune is a Python library for experiment execution and hyperparameter tuning at any scale.
You can tune your favorite machine learning framework, :ref:`including PyTorch, XGBoost, TensorFlow and Keras <tune-guides>`,
and choose among state-of-the-art algorithms such as :ref:`Population Based Training (PBT) <tune-scheduler-pbt>`,
:ref:`BayesOptSearch <bayesopt>`, or :ref:`HyperBand/ASHA <tune-scheduler-hyperband>`.
Tune integrates with a wide range of hyperparameter optimization tools, like
:ref:`Optuna, Hyperopt, Ax, and Nevergrad <tune-search-alg>`, to name a few.

.. tabbed:: Examples

    Learn how to use Ray Tune for various machine learning frameworks in just a few steps.
    **Click on the tabs to see code examples**.

.. tabbed:: Quickstart

    .. tip:: We'd love to hear your feedback on using Tune - `get in touch <https://forms.gle/PTRvGLbKRdUfuzQo9>`_!

    To run this example, install the following: ``pip install "ray[tune]"``.

    In this quick-start example you *minimize* a simple function of the form ``f(x) = a**2 + b``, our `objective` function.
    The closer ``a`` is to zero and the smaller ``b`` is, the smaller the total value of ``f(x)``.
    We will define a so-called `search space` for ``a`` and ``b`` and let Ray Tune explore the space for good values.

    .. literalinclude:: ../../../python/ray/tune/tests/example.py
        :language: python
        :start-after: __quick_start_begin__
        :end-before: __quick_start_end__
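
    The included example boils down to something like this minimal sketch
    (the parameter names here are illustrative; see the linked source for the exact code):

    .. code-block:: python

        from ray import tune

        def objective(config):
            # evaluate f(x) = a**2 + b for one sampled configuration
            score = config["a"] ** 2 + config["b"]
            tune.report(score=score)

        analysis = tune.run(
            objective,
            config={
                "a": tune.grid_search([0.001, 0.01, 0.1, 1.0]),
                "b": tune.choice([1, 2, 3]),
            },
            metric="score",
            mode="min",
        )
        print("Best config:", analysis.best_config)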

.. tabbed:: Keras+Hyperopt

    To tune your Keras models with Hyperopt, you wrap your model in an objective function whose ``config`` you
    can access for selecting hyperparameters.
    In the example below we only tune the ``activation`` parameter of the first layer of the model, but you can
    tune any parameter of the model you want.
    After defining the search space, you can simply initialize the ``HyperOptSearch`` object and pass it to ``run``.
    It's important to tell Ray Tune which metric you want to optimize and whether you want to maximize or minimize it.

    .. code-block:: python

        from ray import tune
        from ray.tune.suggest.hyperopt import HyperOptSearch
        import keras

        # 1. Wrap a Keras model in an objective function.
        def objective(config):
            model = keras.models.Sequential()
            model.add(keras.layers.Dense(784, activation=config["activation"]))
            model.add(keras.layers.Dense(10, activation="softmax"))

            model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
            model.fit(...)
            loss, accuracy = model.evaluate(...)
            return {"accuracy": accuracy}

        # 2. Define a search space and initialize the search algorithm.
        search_space = {"activation": tune.choice(["relu", "tanh"])}
        algo = HyperOptSearch()

        # 3. Start a Tune run that maximizes accuracy.
        analysis = tune.run(
            objective, search_alg=algo, config=search_space, metric="accuracy", mode="max"
        )

.. TODO add .. tabbed:: PyTorch+Optuna

.. TODO add .. tabbed:: Scikit+PBT

With Tune you can also launch a multi-node :ref:`distributed hyperparameter sweep <tune-distributed-ref>`
in less than 10 lines of code.
It automatically manages :ref:`checkpoints <tune-checkpoint-syncing>` and logging to :ref:`TensorBoard <tune-logging>`.
And you can move your models from training to serving on the same infrastructure with `Ray Serve`_.

.. _`Ray Serve`: ../serve/index.html

**Want to get started?** Head over to the :doc:`Key Concepts page </tune/key-concepts>`.

Quick Start
-----------

To run this example, install the following: ``pip install "ray[tune]"``.

This example runs a parallel grid search to optimize an example objective function.

.. literalinclude:: ../../../python/ray/tune/tests/example.py
    :language: python
    :start-after: __quick_start_begin__
    :end-before: __quick_start_end__

If TensorBoard is installed, automatically visualize all trial results:

.. code-block:: bash

    tensorboard --logdir ~/ray_results

.. image:: /images/tune-start-tb.png
    :scale: 30%
    :align: center

If using TF2 and TensorBoard, Tune will also automatically generate TensorBoard HParams output:

.. image:: /images/tune-hparams-coord.png
    :scale: 20%
    :align: center

.. panels::
    :container: text-center
    :column: col-md-4 px-2 py-2
    :card:

    **Getting Started**
    ^^^

    In our getting started tutorial you will learn how to tune a PyTorch model
    effectively with Tune.

    +++
    .. link-button:: tune-tutorial
        :type: ref
        :text: Get Started with Tune
        :classes: btn-outline-info btn-block
    ---

    **Key Concepts**
    ^^^

    Understand the key concepts behind Ray Tune.
    Learn about tune runs, search algorithms, schedulers and other features.

    +++
    .. link-button:: tune-60-seconds
        :type: ref
        :text: Tune's Key Concepts
        :classes: btn-outline-info btn-block
    ---

    **User Guides**
    ^^^

    Our guides teach you about key features of Tune, such as distributed training or early stopping.
    You can also find practical tutorials for scikit-learn, PyTorch, mlflow, and many more.

    +++
    .. link-button:: tune-guides
        :type: ref
        :text: Learn How To Use Tune
        :classes: btn-outline-info btn-block
    ---

    **Examples**
    ^^^

    Check out some of our many examples on Ray Tune.

    +++
    .. link-button:: tune-examples-ref
        :type: ref
        :text: Ray Tune Examples
        :classes: btn-outline-info btn-block
    ---

    **Ray Tune FAQ**
    ^^^

    Find answers to commonly asked questions in our detailed FAQ.

    +++
    .. link-button:: tune-faq
        :type: ref
        :text: Ray Tune FAQ
        :classes: btn-outline-info btn-block
    ---

    **Ray Tune API**
    ^^^

    Get more in-depth information about the Ray Tune API, including all about search spaces,
    algorithms and training configurations.

    +++
    .. link-button:: tune-api-ref
        :type: ref
        :text: Read the API Reference
        :classes: btn-outline-info btn-block

Why choose Tune?
----------------

There are many other hyperparameter optimization libraries out there.
If you're new to Tune, you're probably wondering, "what makes Tune different?"

.. dropdown:: Cutting-Edge Optimization Algorithms
    :animate: fade-in-slide-down

    As a user, you're probably looking into hyperparameter optimization because you want to quickly increase your
    model performance.

    Tune enables you to leverage a variety of these cutting edge optimization algorithms, reducing the cost of tuning
    by `terminating bad runs early <tune-scheduler-hyperband>`_,
    :ref:`choosing better parameters to evaluate <tune-search-alg>`, or even
    :ref:`changing the hyperparameters during training <tune-scheduler-pbt>` to optimize schedules.

.. dropdown:: First-class Developer Productivity
    :animate: fade-in-slide-down

    A key problem with many hyperparameter optimization frameworks is the need to restructure
    your code to fit the framework.

    With Tune, you can optimize your model just by :ref:`adding a few code snippets <tune-tutorial>`.
    Also, Tune removes boilerplate from your code training workflow,
    automatically :ref:`manages checkpoints <tune-checkpoint-syncing>` and
    :ref:`logs results to tools <tune-logging>` such as MLflow and TensorBoard, while also being highly customizable.

.. dropdown:: Multi-GPU & Distributed Training Out Of The Box
    :animate: fade-in-slide-down

    Hyperparameter tuning is known to be highly time-consuming, so it is often necessary to parallelize this process.
    Most other tuning frameworks require you to implement your own multi-process framework or build your own
    distributed system to speed up hyperparameter tuning.

    However, Tune allows you to transparently :ref:`parallelize across multiple GPUs and multiple nodes <tune-parallelism>`.
    Tune even has seamless :ref:`fault tolerance and cloud support <tune-distributed-ref>`, allowing you to scale up
    your hyperparameter search by 100x while reducing costs by up to 10x by using cheap preemptible instances.

.. dropdown:: Coming From Another Hyperparameter Optimization Tool?
    :animate: fade-in-slide-down

    You might already be using an existing hyperparameter tuning tool such as HyperOpt or Bayesian Optimization.

    In this situation, Tune actually allows you to power up your existing workflow.
    Tune's :ref:`Search Algorithms <tune-search-alg>` integrate with a variety of popular hyperparameter tuning
    libraries (such as Nevergrad or HyperOpt) and allow you to seamlessly scale up your optimization
    process - without sacrificing performance.

Reference Materials
-------------------

Here are some reference materials for Tune:

* :doc:`/tune/user-guide`
* :ref:`Frequently asked questions <tune-faq>`
* `Code <https://github.com/ray-project/ray/tree/master/python/ray/tune>`__: GitHub repository for Tune

Below you can find blog posts and talks about Ray Tune:

- [blog] `Tune: a Python library for fast hyperparameter tuning at any scale <https://towardsdatascience.com/fast-hyperparameter-tuning-at-scale-d428223b081c>`_
- [blog] `Cutting edge hyperparameter tuning with Ray Tune <https://medium.com/riselab/cutting-edge-hyperparameter-tuning-with-ray-tune-be6c0447afdf>`_

@ -109,7 +228,8 @@ Below are some blog posts and talks about Tune:

Citing Tune
-----------

If Tune helps you in your academic research, you are encouraged to cite `our paper <https://arxiv.org/abs/1807.05118>`__.
Here is an example bibtex:

.. code-block:: tex

@ -91,7 +91,7 @@ search space, collectively defined for each *hyperparameter* in a ``config`` dic

tune.run and Trials
-------------------

Use :ref:`tune.run <tune-run-ref>` to execute hyperparameter tuning. This function manages your experiment and provides many features such as :ref:`logging <tune-logging>`, :ref:`checkpointing <tune-checkpoint-syncing>`, and :ref:`early stopping <tune-stopping-ref>`.

.. code-block:: python

@ -279,7 +279,6 @@ What's Next?

Now that you have a working understanding of Tune, check out:

* :doc:`/tune/user-guide`: A comprehensive overview of Tune's features.
* :ref:`tune-guides`: Tutorials for using Tune with your preferred machine learning library.
* :doc:`/tune/examples/index`: End-to-end examples and templates for using Tune with your preferred machine learning library.
* :ref:`tune-tutorial`: A simple tutorial that walks you through the process of setting up a Tune experiment.

5
doc/source/tune/tutorials/BUILD
Normal file

@ -0,0 +1,5 @@
filegroup(
    name = "tune_tutorials",
    srcs = glob(["*.md"]),
    visibility = ["//doc:__subpackages__"]
)

239
doc/source/tune/tutorials/overview.rst
Normal file

@ -0,0 +1,239 @@
.. _tune-guides:

===========
User Guides
===========

.. tip:: We'd love to hear your feedback on using Tune - `get in touch <https://forms.gle/PTRvGLbKRdUfuzQo9>`_!

In this section, you can find material on how to use Tune and its various features.
You can follow our :ref:`How-To Guides <tune-recipes>`, :ref:`Tune Feature Guides <tune-feature-guides>`, or
go through some :ref:`Exercises <tune-exercises>` to get started.

.. _tune-recipes:

Practical How-To Guides
-----------------------

.. panels::
    :container: container pb-4 full-width
    :column: col-md-3 px-2 py-2
    :img-top-cls: pt-5 w-75 d-block mx-auto

    ---
    :img-top: /images/tune-sklearn.png

    +++
    .. link-button:: tune-sklearn
        :type: ref
        :text: How To Use Tune's Scikit-Learn Adapters?
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/pytorch_logo.png

    +++
    .. link-button:: tune-pytorch-cifar-ref
        :type: ref
        :text: How To Use Tune With PyTorch Models?
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/pytorch_lightning_small.png

    +++
    .. link-button:: tune-pytorch-lightning-ref
        :type: ref
        :text: How To Tune PyTorch Lightning Models
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/serve.png

    +++
    .. link-button:: tune-serve-integration-mnist
        :type: ref
        :text: Model Selection & Serving With Ray Serve
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/xgboost_logo.png

    +++
    .. link-button:: tune-xgboost-ref
        :type: ref
        :text: A Guide To Tuning XGBoost Parameters With Tune
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/wandb_logo.png

    +++
    .. link-button:: tune-wandb-ref
        :type: ref
        :text: Tracking Your Experiment Process With Weights & Biases
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/mlflow.png

    +++
    .. link-button:: tune-mlflow-ref
        :type: ref
        :text: Using MLflow Tracking & AutoLogging with Tune
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/comet_logo_full.png

    +++
    .. link-button:: tune-comet-ref
        :type: ref
        :text: Using Comet with Ray Tune For Experiment Management
        :classes: btn-link btn-block stretched-link

.. _tune-feature-guides:

Tune Feature Guides
-------------------

.. panels::
    :container: container pb-4 full-width
    :column: col-md-3 px-2 py-2
    :img-top-cls: pt-5 w-50 d-block mx-auto

    ---
    :img-top: /images/tune.png

    .. link-button:: tune-stopping
        :type: ref
        :text: A Guide To Stopping and Resuming Tune Experiments
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/tune.png

    .. link-button:: tune-metrics
        :type: ref
        :text: Using Callbacks and Metrics in Tune
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/tune.png

    .. link-button:: tune-output
        :type: ref
        :text: How To Log Tune Runs
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/tune.png

    .. link-button:: tune-resources
        :type: ref
        :text: Using Resources (GPUs, Parallel & Distributed Runs)
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/tune.png

    .. link-button:: tune-checkpoints
        :type: ref
        :text: Using Checkpoints For Your Experiments
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/tune.png

    .. link-button:: tune-lifecycle
        :type: ref
        :text: How does Tune work?
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/tune.png

    .. link-button:: tune-advanced-tutorial
        :type: ref
        :text: A simple guide to Population-based Training
        :classes: btn-link btn-block stretched-link

    ---
    :img-top: /images/tune.png

    .. link-button:: tune-distributed
        :type: ref
        :text: A Guide To Distributed Hyperparameter Tuning
        :classes: btn-link btn-block stretched-link

.. _tune-exercises:

Exercises
---------

Learn how to use Tune in your browser with the following Colab-based exercises.

.. raw:: html

    <table>
      <tr>
        <th class="tune-colab">Exercise Description</th>
        <th class="tune-colab">Library</th>
        <th class="tune-colab">Colab Link</th>
      </tr>
      <tr>
        <td class="tune-colab">Basics of using Tune.</td>
        <td class="tune-colab">TF/Keras</td>
        <td class="tune-colab">
          <a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_1_basics.ipynb" target="_parent">
            <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
          </a>
        </td>
      </tr>
      <tr>
        <td class="tune-colab">Using Search algorithms and Trial Schedulers to optimize your model.</td>
        <td class="tune-colab">Pytorch</td>
        <td class="tune-colab">
          <a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_2_optimize.ipynb" target="_parent">
            <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
          </a>
        </td>
      </tr>
      <tr>
        <td class="tune-colab">Using Population-Based Training (PBT).</td>
        <td class="tune-colab">Pytorch</td>
        <td class="tune-colab">
          <a href="https://colab.research.google.com/github/ray-project/tutorial/blob/master/tune_exercises/exercise_3_pbt.ipynb" target="_parent">
            <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
          </a>
        </td>
      </tr>
      <tr>
        <td class="tune-colab">Fine-tuning Huggingface Transformers with PBT.</td>
        <td class="tune-colab">Huggingface Transformers/Pytorch</td>
        <td class="tune-colab">
          <a href="https://colab.research.google.com/drive/1tQgAKgcKQzheoh503OzhS4N9NtfFgmjF?usp=sharing" target="_parent">
            <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
          </a>
        </td>
      </tr>
      <tr>
        <td class="tune-colab">Logging Tune Runs to Comet ML.</td>
        <td class="tune-colab">Comet</td>
        <td class="tune-colab">
          <a href="https://colab.research.google.com/drive/1dp3VwVoAH1acn_kG7RuT62mICnOqxU1z?usp=sharing" target="_parent">
            <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Tune Tutorial"/>
          </a>
        </td>
      </tr>
    </table>

Tutorial source files `can be found here <https://github.com/ray-project/tutorial>`_.

@ -1,5 +1,5 @@
Guide to Population Based Training (PBT)
========================================
A Guide to Population Based Training
====================================

Tune includes a distributed implementation of `Population Based Training (PBT) <https://deepmind.com/blog/population-based-training-neural-networks>`__ as
a :ref:`scheduler <tune-scheduler-pbt>`.

337
doc/source/tune/tutorials/tune-checkpoints.rst
Normal file

@ -0,0 +1,337 @@
A Guide To Using Checkpoints
============================

.. _tune-checkpoint-syncing:

Checkpointing and synchronization
---------------------------------

When running a hyperparameter search, Tune can automatically and periodically save/checkpoint your model.
This allows you to:

* save intermediate models throughout training
* use pre-emptible machines (by automatically restoring from the last checkpoint)
* pause trials when using Trial Schedulers such as HyperBand and PBT

Tune stores checkpoints on the node where the trials are executed. If you are training on more than one node,
this means that some trial checkpoints may be on the head node and others are not.

When trials are restored (e.g. after a failure or when the experiment was paused), they may be scheduled on
different nodes, but still would need access to the latest checkpoint. To make sure this works, Ray Tune
comes with facilities to synchronize trial checkpoints between nodes.

Generally we consider three cases:

1. When using a shared directory (e.g. via NFS)
2. When using cloud storage (e.g. S3 or GS)
3. When using neither

The default option here is 3, which will be automatically used if nothing else is configured.

Using a shared directory
~~~~~~~~~~~~~~~~~~~~~~~~

If all Ray nodes have access to a shared filesystem, e.g. via NFS, they can all write to this directory.
In this case, we don't need any synchronization at all, as it is implicitly done by the operating system.

For this case, we only need to tell Ray Tune not to do any syncing (syncing is enabled by default):

.. code-block:: python

    from ray import tune

    tune.run(
        trainable,
        name="experiment_name",
        local_dir="/path/to/shared/storage/",
        sync_config=tune.SyncConfig(
            syncer=None  # Disable syncing
        )
    )

Note that the driver (on the head node) will have access to all checkpoints locally (in the
shared directory) for further processing.

.. _tune-cloud-checkpointing:

Using cloud storage
~~~~~~~~~~~~~~~~~~~

If all nodes have access to cloud storage, e.g. S3 or GS, the remote trials can automatically synchronize their
checkpoints. For the filesystem, we end up with a similar situation as in the first case,
only that the consolidated directory including all logs and checkpoints lives on cloud storage.

This approach is especially useful when training a large number of distributed trials,
as logs and checkpoints are otherwise synchronized via SSH, which quickly can become a performance bottleneck.

For this case, we tell Ray Tune to use an ``upload_dir`` to store checkpoints at.
This will automatically store both the experiment state and the trial checkpoints at that directory:

.. code-block:: python

    from ray import tune

    tune.run(
        trainable,
        name="experiment_name",
        sync_config=tune.SyncConfig(
            upload_dir="s3://bucket-name/sub-path/"
        )
    )

We don't have to provide a ``syncer`` here as it will be automatically detected. However, you can provide
a string if you want to use a custom command:

.. code-block:: python

    from ray import tune

    tune.run(
        trainable,
        name="experiment_name",
        sync_config=tune.SyncConfig(
            upload_dir="s3://bucket-name/sub-path/",
            syncer="aws s3 sync {source} {target}",  # Custom sync command
        )
    )

If a string is provided, then it must include replacement fields ``{source}`` and ``{target}``,
as demonstrated in the example above.

The consolidated data will be available in the cloud bucket. This means that the driver
(on the head node) will not have access to all checkpoints locally. If you want to process
e.g. the best checkpoint further, you will first have to fetch it from the cloud storage.

Default syncing (no shared/cloud storage)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If you're using neither a shared filesystem nor cloud storage, Ray Tune will resort to the
default syncing mechanism, which utilizes ``rsync`` (via SSH) to synchronize checkpoints across
nodes.

Please note that this approach is likely the least efficient one - you should always try to use
shared or cloud storage if possible when training on a multi-node cluster.

For the syncing to work, the head node must be able to SSH into the worker nodes. If you are using
the Ray cluster launcher, this is usually the case (note that Kubernetes is an exception, but
:ref:`see here for more details <tune-kubernetes>`).

If you don't provide a ``tune.SyncConfig`` at all, rsync-based syncing will be used.

If you want to customize syncing behavior, you can again specify a custom sync template:

.. code-block:: python

    from ray import tune

    tune.run(
        trainable,
        name="experiment_name",
        sync_config=tune.SyncConfig(
            # Do not specify an upload dir here
            syncer="rsync -savz -e 'ssh -i ssh_key.pem' {source} {target}",  # Custom sync command
        )
    )

Alternatively, a function can be provided with the following signature:

.. code-block:: python

    import subprocess

    def custom_sync_func(source, target):
        sync_cmd = "rsync {source} {target}".format(
            source=source,
            target=target)
        sync_process = subprocess.Popen(sync_cmd, shell=True)
        sync_process.wait()

    tune.run(
        trainable,
        name="experiment_name",
        sync_config=tune.SyncConfig(
            syncer=custom_sync_func,
            sync_period=60  # Synchronize more often
        )
    )

When syncing results back to the driver, the source would be a path similar to
``ubuntu@192.0.0.1:/home/ubuntu/ray_results/trial1``, and the target would be a local path.

Note that we adjusted the sync period in the example above. Setting this to a lower number will pull
checkpoints from remote nodes more often. This will lead to more robust trial recovery,
but it will also lead to more synchronization overhead (as SSH is usually slow).

As in the first case, the driver (on the head node) will have access to all checkpoints locally
for further processing.

Checkpointing examples
----------------------

Let's cover how to configure your checkpoint storage location, checkpointing frequency, and how to resume from a previous run.

A simple (cloud) checkpointing example
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Cloud storage-backed Tune checkpointing is the recommended best practice for both performance and reliability reasons.
It also enables checkpointing if using Ray on Kubernetes, which does not work out of the box with rsync-based sync,
as rsync relies on SSH. If you'd rather checkpoint locally or use rsync-based checkpointing, see :ref:`here <rsync-checkpointing>`.

Prerequisites to use cloud checkpointing in Ray Tune for the example below:

Your ``my_trainable`` is either a:

1. **Model with an existing Ray integration**

   * XGBoost (:ref:`example <xgboost-ray-tuning>`)
   * Pytorch (:ref:`example <tune-pytorch-lightning-ref>`)
   * Pytorch Lightning (:ref:`example <ray-lightning-tuning>`)
   * Keras (:doc:`example </tune/examples/tune_mnist_keras>`)
   * Tensorflow (:ref:`example <ray-train-tftrainer-example>`)
   * LightGBM (:ref:`example <lightgbm-ray-tuning>`)

2. **Custom training function**

   * All this means is that your function has to expose a ``checkpoint_dir`` argument in the function signature,
     and call ``tune.checkpoint_dir``. See :doc:`this example </tune/examples/custom_func_checkpointing>`;
     it's quite simple to do.

Let's assume for this example that you're running this script from your laptop, and connecting to your remote Ray cluster
via ``ray.init()``, making your script on your laptop the "driver".

.. code-block:: python

    import ray
    from ray import tune
    from your_module import my_trainable

    ray.init(address="<cluster-IP>:<port>")  # set `address=None` to train on laptop

    # configure how checkpoints are sync'd to the scheduler/sampler
    # we recommend cloud storage checkpointing as it survives the cluster when
    # instances are terminated, and has better performance
    sync_config = tune.SyncConfig(
        upload_dir="s3://my-checkpoints-bucket/path/",  # requires AWS credentials
    )

    # this starts the run!
    tune.run(
        my_trainable,

        # name of your experiment
        name="my-tune-exp",

        # a directory where results are stored before being
        # sync'd to head node/cloud storage
        local_dir="/tmp/mypath",

        # see above! we will sync our checkpoints to S3 directory
        sync_config=sync_config,

        # we'll keep the best five checkpoints at all times
        # (by AUC score, reported by the trainable, descending)
        checkpoint_score_attr="max-auc",
        keep_checkpoints_num=5,

        # a very useful trick! this will resume from the last run specified by
        # sync_config (if one exists), otherwise it will start a new tuning run
        resume="AUTO",
    )

In this example, checkpoints will be saved:

* **Locally**: not saved! Nothing will be sync'd to the driver (your laptop) automatically, because cloud syncing is enabled
* **S3**: ``s3://my-checkpoints-bucket/path/my-tune-exp/<trial_name>/checkpoint_<step>``
* **On the head node**: ``~/ray-results/my-tune-exp/<trial_name>/checkpoint_<step>`` (but only for trials run on that node)
* **On worker nodes**: ``~/ray-results/my-tune-exp/<trial_name>/checkpoint_<step>`` (but only for trials run on that node)

If your run stopped for any reason (finished, errored, user CTRL+C), you can restart it any time by running the script above again - note that with ``resume="AUTO"``, it will detect the previous run as long as the ``sync_config`` points to the same location.

If, however, you prefer not to use ``resume="AUTO"`` (or are on an older version of Ray), you can resume manually:

.. code-block:: python

    # Restore a previous run from one of its checkpoints
    tune.run(
        # our same trainable as before
        my_trainable,

        # The name can be different from your original name
        name="my-tune-exp-restart",

        # `restore` expects the path to a specific trial checkpoint,
        # e.g. one of the checkpoints written to cloud storage by the run above
        restore="s3://my-checkpoints-bucket/path/my-tune-exp/<trial_name>/checkpoint_<step>",
    )
.. _rsync-checkpointing:

A simple local/rsync checkpointing example
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Local or rsync checkpointing can be a good option if:

1. You want to tune on a single laptop Ray cluster
2. You aren't using Ray on Kubernetes (rsync doesn't work with Ray on Kubernetes)
3. You don't want to use S3

Let's take a look at an example:

.. code-block:: python

    import ray
    from ray import tune
    from your_module import my_trainable

    ray.init(address="<cluster-IP>:<port>")  # set `address=None` to train on laptop

    # configure how checkpoints are sync'd to the scheduler/sampler
    sync_config = tune.SyncConfig()  # the default mode is to use rsync

    # this starts the run!
    tune.run(
        my_trainable,

        # name of your experiment
        name="my-tune-exp",

        # a directory where results are stored before being
        # sync'd to head node/cloud storage
        local_dir="/tmp/mypath",

        # sync our checkpoints via rsync
        # you don't have to pass an empty sync config - but we
        # do it here for clarity and comparison
        sync_config=sync_config,

        # we'll keep the best five checkpoints at all times
        # (by AUC score, reported by the trainable, descending)
        checkpoint_score_attr="max-auc",
        keep_checkpoints_num=5,

        # a very useful trick! this will resume from the last run specified by
        # sync_config (if one exists), otherwise it will start a new tuning run
        resume="AUTO",
    )

.. _tune-distributed-checkpointing:

Distributed Checkpointing
~~~~~~~~~~~~~~~~~~~~~~~~~

On a multi-node cluster, Tune automatically creates a copy of all trial checkpoints on the head node.
This requires the Ray cluster to be started with the :ref:`cluster launcher <cluster-cloud>` and also
requires rsync to be installed.

Note that you must use the ``tune.checkpoint_dir`` API to trigger syncing
(or use a model type with a built-in Ray Tune integration, as described above).
See :doc:`/tune/examples/custom_func_checkpointing` for an example.

If you are running Ray Tune on Kubernetes, you should usually use
:ref:`cloud checkpointing <tune-sync-config>` or a shared filesystem for checkpoint sharing.
Please :ref:`see here for best practices for running Tune on Kubernetes <tune-kubernetes>`.

If you do not use the cluster launcher, you should set up an NFS or global file system and
disable cross-node syncing:

.. code-block:: python

    sync_config = tune.SyncConfig(syncer=None)
    tune.run(func, sync_config=sync_config)

@ -1,4 +1,4 @@
.. _tune-comet:
.. _tune-comet-ref:

Using Comet with Tune
=====================

@ -1,4 +1,4 @@
.. _tune-distributed:
.. _tune-distributed-ref:

Tune Distributed Experiments
============================

@ -307,7 +307,7 @@ Below are some commonly used commands for submitting experiments. Please see the

    # Shut-down all instances of your cluster:
    $ ray down CLUSTER.YAML [-y]

    # Run TensorBoard and forward the port to your own machine.
    $ ray exec CLUSTER.YAML 'tensorboard --logdir ~/ray_results/ --port 6006' --port-forward 6006

    # Run Jupyter Lab and forward the port to your own machine.

@ -1,5 +1,3 @@
.. _tune-lifecycle:

How does Tune work?
===================

@ -68,11 +66,12 @@ If the trainable is a callable or a function, it will be executed on the Ray act

Resource Management in Tune
~~~~~~~~~~~~~~~~~~~~~~~~~~~

Before running a trial, the Ray Tune driver will check whether there are available resources on the cluster (see :ref:`resource-requirements`).
It will compare the available resources with the resources required by the trial.

If there is space on the cluster, then the Tune Driver will start a Ray actor (worker).
This actor will be scheduled and executed on some node where the resources are available.
See :doc:`tune-resources` for more information.

.. _trial-lifecycle:

91
doc/source/tune/tutorials/tune-metrics.rst
Normal file

@ -0,0 +1,91 @@
A Guide To Callbacks & Metrics in Tune
======================================

.. _tune-callbacks:

How to work with Callbacks?
---------------------------

Ray Tune supports callbacks that are called during various times of the training process.
Callbacks can be passed as a parameter to ``tune.run()``, and the sub-method you provide will be invoked automatically.

This simple callback just prints a metric each time a result is received:

.. code-block:: python

    from ray import tune
    from ray.tune import Callback


    class MyCallback(Callback):
        def on_trial_result(self, iteration, trials, trial, result, **info):
            print(f"Got result: {result['metric']}")


    def train(config):
        for i in range(10):
            tune.report(metric=i)


    tune.run(
        train,
        callbacks=[MyCallback()])

For more details and available hooks, please :ref:`see the API docs for Ray Tune callbacks <tune-callbacks-docs>`.
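
Other hooks follow the same pattern. For instance, a callback that reacts when a trial
finishes (``on_trial_complete`` is part of the same ``Callback`` interface):

.. code-block:: python

    class FinishedTrialCallback(Callback):
        def on_trial_complete(self, iteration, trials, trial, **info):
            # called once per trial, after it has reported its final result
            print(f"Trial {trial} finished at iteration {iteration}.")

    tune.run(train, callbacks=[FinishedTrialCallback()])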

.. _tune-autofilled-metrics:

How to log metrics in Tune?
---------------------------

You can log arbitrary values and metrics in both Function and Class training APIs:

.. code-block:: python

    def trainable(config):
        for i in range(num_epochs):
            ...
            tune.report(acc=accuracy, metric_foo=random_metric_1, bar=metric_2)

    class Trainable(tune.Trainable):
        def step(self):
            ...
            # don't call report here!
            return dict(acc=accuracy, metric_foo=random_metric_1, bar=metric_2)

.. tip::
    Note that ``tune.report()`` is not meant to transfer large amounts of data, like models or datasets.
    Doing so can incur large overheads and slow down your Tune run significantly.

Which metrics get automatically filled in?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Tune has the concept of auto-filled metrics.
During training, Tune will automatically log the below metrics in addition to any user-provided values.
All of these can be used as stopping conditions or passed as a parameter to Trial Schedulers/Search Algorithms.

* ``config``: The hyperparameter configuration
* ``date``: String-formatted date and time when the result was processed
* ``done``: True if the trial has been finished, False otherwise
* ``episodes_total``: Total number of episodes (for RLlib trainables)
* ``experiment_id``: Unique experiment ID
* ``experiment_tag``: Unique experiment tag (includes parameter values)
* ``hostname``: Hostname of the worker
* ``iterations_since_restore``: The number of times ``tune.report()/trainable.train()`` has been
  called after restoring the worker from a checkpoint
* ``node_ip``: Host IP of the worker
* ``pid``: Process ID (PID) of the worker process
* ``time_since_restore``: Time in seconds since restoring from a checkpoint
* ``time_this_iter_s``: Runtime of the current training iteration in seconds (i.e.
  one call to the trainable function or to ``_train()`` in the class API)
* ``time_total_s``: Total runtime in seconds
* ``timestamp``: Timestamp when the result was processed
* ``timesteps_since_restore``: Number of timesteps since restoring from a checkpoint
* ``timesteps_total``: Total number of timesteps
* ``training_iteration``: The number of times ``tune.report()`` has been called
* ``trial_id``: Unique trial ID

All of these metrics can be seen in the ``Trial.last_result`` dictionary.
|
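Since auto-filled metrics live in the same result dictionary as your own values, you can mix them freely. As a quick sketch (``my_trainable`` is a placeholder for your own trainable, and ``acc`` for a metric it reports), this stops each trial after ten iterations and then reads the auto-filled fields back from the best trial:

.. code-block:: python

    # training_iteration and time_total_s are auto-filled by Tune.
    analysis = tune.run(my_trainable, stop={"training_iteration": 10})

    best_trial = analysis.get_best_trial(metric="acc", mode="max")
    print(best_trial.last_result["training_iteration"])  # 10
    print(best_trial.last_result["time_total_s"])  # total runtime of that trial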
@ -1,4 +1,4 @@
.. _tune-mlflow:
.. _tune-mlflow-ref:

Using MLflow with Tune
======================

164
doc/source/tune/tutorials/tune-output.rst
Normal file

@ -0,0 +1,164 @@
A Guide To Logging & Outputs in Tune
====================================

Tune by default will log results for TensorBoard, CSV, and JSON formats.
If you need to log something lower level like model weights or gradients, see :ref:`Trainable Logging <trainable-logging>`.
You can learn more about logging and customizations here: :ref:`loggers-docstring`.


.. _tune-logging:

How to configure logging in Tune?
---------------------------------

Tune will log the results of each trial to a sub-folder under a specified local dir, which defaults to ``~/ray_results``.

.. code-block:: python

    # This logs to two different trial folders:
    # ~/ray_results/trainable_name/trial_name_1 and ~/ray_results/trainable_name/trial_name_2
    # trainable_name and trial_name are autogenerated.
    tune.run(trainable, num_samples=2)

You can specify the ``local_dir`` and ``trainable_name``:

.. code-block:: python

    # This logs to 2 different trial folders:
    # ./results/test_experiment/trial_name_1 and ./results/test_experiment/trial_name_2
    # Only trial_name is autogenerated.
    tune.run(trainable, num_samples=2, local_dir="./results", name="test_experiment")

To specify custom trial folder names, you can use the ``trial_name_creator`` argument to ``tune.run``.
This takes a function with the following signature:

.. code-block:: python

    def trial_name_string(trial):
        """
        Args:
            trial (Trial): A generated trial object.

        Returns:
            trial_name (str): String representation of Trial.
        """
        return str(trial)

    tune.run(
        MyTrainableClass,
        name="example-experiment",
        num_samples=1,
        trial_name_creator=trial_name_string
    )

To learn more about Trials, see the detailed API documentation: :ref:`trial-docstring`.
.. _tensorboard:

How to log to TensorBoard?
--------------------------

Tune automatically outputs TensorBoard files during ``tune.run``.
To visualize learning in TensorBoard, install tensorboardX:

.. code-block:: bash

    $ pip install tensorboardX

Then, after you run an experiment, you can visualize your experiment with TensorBoard by specifying
the output directory of your results.

.. code-block:: bash

    $ tensorboard --logdir=~/ray_results/my_experiment

If you are running Ray on a remote multi-user cluster where you do not have sudo access,
you can run the following commands to make sure tensorboard is able to write to the tmp directory:

.. code-block:: bash

    $ export TMPDIR=/tmp/$USER; mkdir -p $TMPDIR; tensorboard --logdir=~/ray_results

.. image:: ../images/ray-tune-tensorboard.png

If using TensorFlow ``2.x``, Tune also automatically generates TensorBoard HParams output, as shown below:

.. code-block:: python

    tune.run(
        ...,
        config={
            "lr": tune.grid_search([1e-5, 1e-4]),
            "momentum": tune.grid_search([0, 0.9])
        }
    )

.. image:: ../../images/tune-hparams.png
.. _tune-console-output:

How to control console output?
------------------------------

User-provided fields will be output automatically on a best-effort basis.
You can use a :ref:`Reporter <tune-reporter-doc>` object to customize the console output.

.. code-block:: bash

    == Status ==
    Memory usage on this node: 11.4/16.0 GiB
    Using FIFO scheduling algorithm.
    Resources requested: 4/12 CPUs, 0/0 GPUs, 0.0/3.17 GiB heap, 0.0/1.07 GiB objects
    Result logdir: /Users/foo/ray_results/myexp
    Number of trials: 4 (4 RUNNING)
    +----------------------+----------+---------------------+-----------+--------+--------+----------------+-------+
    | Trial name           | status   | loc                 |    param1 | param2 |    acc | total time (s) |  iter |
    |----------------------+----------+---------------------+-----------+--------+--------+----------------+-------|
    | MyTrainable_a826033a | RUNNING  | 10.234.98.164:31115 |  0.303706 | 0.0761 | 0.1289 |        7.54952 |    15 |
    | MyTrainable_a8263fc6 | RUNNING  | 10.234.98.164:31117 |  0.929276 | 0.158  | 0.4865 |        7.0501  |    14 |
    | MyTrainable_a8267914 | RUNNING  | 10.234.98.164:31111 |  0.068426 | 0.0319 | 0.9585 |        7.0477  |    14 |
    | MyTrainable_a826b7bc | RUNNING  | 10.234.98.164:31112 |  0.729127 | 0.0748 | 0.1797 |        7.05715 |    14 |
    +----------------------+----------+---------------------+-----------+--------+--------+----------------+-------+
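For example, a ``CLIReporter`` lets you choose which metric columns appear in this table. A minimal sketch, assuming your trainable reports a ``loss`` metric:

.. code-block:: python

    from ray.tune import CLIReporter

    # Only show the loss and the iteration count in the status table.
    reporter = CLIReporter(metric_columns=["loss", "training_iteration"])
    tune.run(my_trainable, progress_reporter=reporter)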
.. _tune-log_to_file:

How to redirect stdout and stderr to files?
-------------------------------------------

The stdout and stderr streams are usually printed to the console.
For remote actors, Ray collects these logs and prints them to the head process.

However, if you would like to collect the stream outputs in files for later
analysis or troubleshooting, Tune offers a utility parameter, ``log_to_file``,
for this.

By passing ``log_to_file=True`` to ``tune.run()``, stdout and stderr will be logged
to ``trial_logdir/stdout`` and ``trial_logdir/stderr``, respectively:

.. code-block:: python

    tune.run(
        trainable,
        log_to_file=True)

If you would like to specify the output files, you can either pass one filename,
where the combined output will be stored, or two filenames, for stdout and stderr,
respectively:

.. code-block:: python

    tune.run(
        trainable,
        log_to_file="std_combined.log")

    tune.run(
        trainable,
        log_to_file=("my_stdout.log", "my_stderr.log"))

The file names are relative to the trial's logdir. You can pass absolute paths,
too.

If ``log_to_file`` is set, Tune will automatically register a new logging handler
for Ray's base logger and log the output to the specified stderr output file.
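As a sketch of what that means in practice (assuming, per the paragraph above, that the handler is attached to the ``"ray"`` logger):

.. code-block:: python

    import logging

    def trainable(config):
        # With log_to_file set, messages routed through Ray's base logger
        # end up in the trial's stderr file.
        logging.getLogger("ray").info("This line lands in my_stderr.log")
        ...

    tune.run(trainable, log_to_file=("my_stdout.log", "my_stderr.log"))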
@ -1,4 +1,4 @@
.. _tune-pytorch-cifar:
.. _tune-pytorch-cifar-ref:

How to use Tune with PyTorch
============================

@ -1,4 +1,4 @@
.. _tune-pytorch-lightning:
.. _tune-pytorch-lightning-ref:

Using PyTorch Lightning with Tune
=================================

137
doc/source/tune/tutorials/tune-resources.rst
Normal file

@ -0,0 +1,137 @@
.. _tune-parallelism:

A Guide To Parallelism and Resources
------------------------------------

Parallelism is determined by ``resources_per_trial`` (defaulting to 1 CPU, 0 GPU per trial)
and the resources available to Tune (``ray.cluster_resources()``).

By default, Tune automatically runs N concurrent trials, where N is the number of CPUs (cores) on your machine.

.. code-block:: python

    # If you have 4 CPUs on your machine, this will run 4 concurrent trials at a time.
    tune.run(trainable, num_samples=10)

.. tip:: To run your code sequentially, use :ref:`Ray Local Mode <tune-debugging>`.

You can override this parallelism with ``resources_per_trial``. Here you can
specify your resource requests using either a dictionary or a
:class:`PlacementGroupFactory <ray.tune.utils.placement_groups.PlacementGroupFactory>`
object. In any case, Ray Tune will try to start a placement group for each trial.

.. code-block:: python

    # If you have 4 CPUs on your machine, this will run 2 concurrent trials at a time.
    tune.run(trainable, num_samples=10, resources_per_trial={"cpu": 2})

    # If you have 4 CPUs on your machine, this will run 1 trial at a time.
    tune.run(trainable, num_samples=10, resources_per_trial={"cpu": 4})

    # Fractional values are also supported (e.g., {"cpu": 0.5}).
    tune.run(trainable, num_samples=10, resources_per_trial={"cpu": 0.5})
Tune will allocate the specified GPU and CPU from ``resources_per_trial`` to each individual trial.
Even if the trial cannot be scheduled right now, Ray Tune will still try to start
the respective placement group. If not enough resources are available, this will trigger
:ref:`autoscaling behavior <cluster-index>` if you're using the Ray cluster launcher.

It is also possible to specify memory (``"memory"``, in bytes) and custom resource requirements.

If your trainable function starts more remote workers, you will need to pass so-called placement group
factory objects to request these resources.
See the :class:`PlacementGroupFactory documentation <ray.tune.utils.placement_groups.PlacementGroupFactory>`
for further information.
This also applies if you are using other libraries making use of Ray, such as Modin.
Failure to set resources correctly may result in a deadlock, "hanging" the cluster.
.. note::
    The resources specified this way will only be allocated for scheduling Tune trials.
    These resources will not be enforced on your objective function (Tune trainable) automatically.
    You will have to make sure your trainable has enough resources to run (e.g. by setting ``n_jobs`` for a
    scikit-learn model accordingly).
How to leverage GPUs?
~~~~~~~~~~~~~~~~~~~~~

To leverage GPUs, you must set ``gpu`` in ``tune.run(resources_per_trial)``.
This will automatically set ``CUDA_VISIBLE_DEVICES`` for each trial.

.. code-block:: python

    # If you have 8 GPUs, this will run 8 trials at once.
    tune.run(trainable, num_samples=10, resources_per_trial={"gpu": 1})

    # If you have 4 CPUs on your machine and 1 GPU, this will run 1 trial at a time.
    tune.run(trainable, num_samples=10, resources_per_trial={"cpu": 2, "gpu": 1})

You can find an example of this in the :doc:`Keras MNIST example </tune/examples/tune_mnist_keras>`.

.. warning:: If ``gpu`` is not set, the ``CUDA_VISIBLE_DEVICES`` environment variable will be set as empty, disallowing GPU access.

**Troubleshooting**: Occasionally, you may run into GPU memory issues when running a new trial. This may be
due to the previous trial not cleaning up its GPU state fast enough. To avoid this,
you can use ``tune.utils.wait_for_gpu`` - see the :ref:`docstring <tune-util-ref>`.
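A minimal sketch of that pattern (``target_util`` is the GPU utilization fraction to wait for; check the docstring linked above for the exact arguments):

.. code-block:: python

    from ray import tune
    from ray.tune.utils import wait_for_gpu

    def trainable(config):
        # Block until the GPU assigned to this trial is (mostly) free
        # before allocating memory on it.
        wait_for_gpu(target_util=0.1)
        ...  # regular training code

    tune.run(trainable, num_samples=10, resources_per_trial={"gpu": 1})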
How to run distributed tuning on a cluster?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To attach to an existing Ray cluster, simply run ``ray.init`` before ``tune.run``.
See :ref:`start-ray-cli` for more information about ``ray.init``:

.. code-block:: python

    # Connect to an existing distributed Ray cluster
    ray.init(address=<ray_address>)
    tune.run(trainable, num_samples=100, resources_per_trial=tune.PlacementGroupFactory([{"CPU": 2, "GPU": 1}]))

Read more in the Tune :ref:`distributed experiments guide <tune-distributed-ref>`.


.. _tune-dist-training:
How to run distributed training with Tune?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To tune distributed training jobs, Tune provides a set of ``DistributedTrainableCreator`` functions for different training frameworks.
Below is an example for tuning distributed TensorFlow jobs:

.. code-block:: python

    # Please refer to the full example in tf_distributed_keras_example.py
    # (train_mnist and args are defined there).
    from ray.tune.integration.tensorflow import DistributedTrainableCreator

    tf_trainable = DistributedTrainableCreator(
        train_mnist,
        use_gpu=args.use_gpu,
        num_workers=2)
    tune.run(tf_trainable,
             num_samples=1)

Read more about tuning :ref:`distributed PyTorch <tune-ddp-doc>`,
:ref:`TensorFlow <tune-dist-tf-doc>` and :ref:`Horovod <tune-integration-horovod>` jobs.
How to limit concurrency?
~~~~~~~~~~~~~~~~~~~~~~~~~

If using a :ref:`search algorithm <tune-search-alg>`, you may want to limit the number of trials that are being evaluated.
For example, you may want to serialize the evaluation of trials to do sequential optimization.

In this case, use ``ray.tune.suggest.ConcurrencyLimiter`` to limit the amount of concurrency:

.. code-block:: python

    from ray.tune.schedulers import AsyncHyperBandScheduler
    from ray.tune.suggest import ConcurrencyLimiter
    from ray.tune.suggest.bayesopt import BayesOptSearch

    algo = BayesOptSearch(utility_kwargs={
        "kind": "ucb",
        "kappa": 2.5,
        "xi": 0.0
    })
    algo = ConcurrencyLimiter(algo, max_concurrent=4)
    scheduler = AsyncHyperBandScheduler()
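Wiring the limited searcher into a run then looks like this (a sketch; the metric name is a placeholder for whatever your trainable reports):

.. code-block:: python

    tune.run(
        trainable,
        search_alg=algo,
        scheduler=scheduler,
        metric="mean_loss",
        mode="min",
        num_samples=20)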
.. note::

    It is also possible to directly use ``tune.run(max_concurrent_trials=4, ...)``, which automatically wraps
    the underlying search algorithm in a ``ConcurrencyLimiter`` for you.

To understand concurrency limiting in depth, please see :ref:`limiter` for more details.
331
doc/source/tune/_tutorials/tune-serve-integration-mnist.py → doc/source/tune/tutorials/tune-serve-integration-mnist.md
Executable file → Normal file
@ -1,7 +1,26 @@
---
jupytext:
  text_representation:
    extension: .md
    format_name: myst
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

```{code-cell}
:tags: [remove-cell]
%matplotlib inline
```

```{code-cell}
:tags: [remove-cell]
# flake8: noqa
```

# Model selection and serving with Ray Tune and Ray Serve

This tutorial will show you an end-to-end example of how to train a
model using Ray Tune on incrementally arriving data and deploy
the model using Ray Serve.

@ -37,8 +56,8 @@ By the end of this tutorial you will be able to
   newly arriving data
3. Automatically create and serve data deployments with Ray Serve

## Roadmap and desired functionality

The general idea of this example is that we simulate newly arriving
data each day. So at day 0 we might have some initial data available
already, but at each day, new data arrives.

@ -48,43 +67,45 @@ from an existing model. Maybe you would like to train and select models
from scratch each week with all data available until then, e.g. each
Sunday, like this:

```{code-block} bash
# Train with all data available at day 0
python tune-serve-integration-mnist.py --from_scratch --day 0
```

During the other days you might want to improve your model, but
not train everything from scratch, saving some cluster resources.

```{code-block} bash
# Train with data arriving between day 0 and day 1
python tune-serve-integration-mnist.py --from_existing --day 1
# Train with incremental data on the other days, too
python tune-serve-integration-mnist.py --from_existing --day 2
python tune-serve-integration-mnist.py --from_existing --day 3
python tune-serve-integration-mnist.py --from_existing --day 4
python tune-serve-integration-mnist.py --from_existing --day 5
python tune-serve-integration-mnist.py --from_existing --day 6
# Retrain from scratch every 7th day:
python tune-serve-integration-mnist.py --from_scratch --day 7
```

This example will support both modes. After each model selection run,
we will tell Ray Serve to serve an updated model. We also include a
small utility to query our served model to see if it works as it should.

```{code-block} bash
$ python tune-serve-integration-mnist.py --query 6
Querying model with example #6. Label = 1, Response = 1, Correct = True
```

## Imports

Let's start with our dependencies. Most of these should be familiar
if you worked with PyTorch before. The most notable import for Ray
is the ``from ray import tune, serve`` import statement - which
includes almost all the things we need from the Ray side.

```{code-cell}
import argparse
import json
import os

@ -106,18 +127,19 @@ from ray.tune.schedulers import ASHAScheduler
from torch.utils.data import random_split, Subset
from torchvision.datasets import MNIST
from torchvision.transforms import transforms
```

## Data interface

Let's start with a simulated data interface. This class acts as the
interface between your training code and your database. We simulate
that new data arrives each day with a ``day`` parameter. So, calling
``get_data(day=3)`` would return all data we received until day 3.
We also implement an incremental data method, so calling
``get_incremental_data(day=3)`` would return all data collected
between day 2 and day 3.

```{code-cell}
class MNISTDataInterface(object):
    """Data interface. Simulates that new data arrives every day."""

@ -157,18 +179,19 @@ class MNISTDataInterface(object):
        train_n = int(0.8 * (end - start))  # 80% train data, 20% validation data

        return random_split(available_data, [train_n, end - start - train_n])
```

## PyTorch neural network classifier

Next, we will introduce our PyTorch neural network model and the
train and test function. These are adapted directly from
our {doc}`PyTorch MNIST example </tune/examples/mnist_pytorch>`.
We only introduced an additional neural network layer with a configurable
layer size. This is not strictly needed for learning good performance on
MNIST, but it is useful to demonstrate scenarios where your hyperparameter
search space affects the model complexity.

```{code-cell}
class ConvNet(nn.Module):
    def __init__(self, layer_size=192):
        super(ConvNet, self).__init__()

@ -211,17 +234,18 @@ def test(model, data_loader, device=None):
            correct += (predicted == target).sum().item()

    return correct / total
```

## Tune trainable for model selection

We'll now define our Tune trainable function. This function takes
a ``config`` parameter containing the hyperparameters we should train
the model on, and will start a full training run. This means it
will take care of creating the model and optimizer and repeatedly
call the ``train`` function to train the model. Also, this function
will report the training progress back to Tune.

```{code-cell}
def train_mnist(
    config,
    start_model=None,

@ -277,20 +301,21 @@ def train_mnist(
        tune.report(mean_accuracy=acc, done=True)
    else:
        tune.report(mean_accuracy=acc)
```

## Configuring the search space and starting Ray Tune

We would like to support two modes of training the model: Training
a model from scratch, and continuing to train a model from an
existing one.

This is our function to train a number of models with different
hyperparameters from scratch, i.e. from all data that is available
until the given day. Our search space can thus also contain parameters
that affect the model complexity (such as the layer size), since it
does not have to be compatible with an existing model.

```{code-cell}
def tune_from_scratch(num_samples=10, num_epochs=10, gpus_per_trial=0.0, day=0):
    data_interface = MNISTDataInterface("~/data", max_days=10)
    num_examples = data_interface._get_day_slice(day)

@ -339,17 +364,18 @@ def tune_from_scratch(num_samples=10, num_epochs=10, gpus_per_trial=0.0, day=0):
    best_checkpoint = best_trial.checkpoint.value

    return best_accuracy, best_trial_config, best_checkpoint, num_examples
```

To continue training from an existing model, we can use this function
instead. It takes a starting model (a checkpoint) as a parameter and
the old config.

Note that this time the search space does _not_ contain the
layer size parameter. Since we continue to train an existing model,
we cannot change the layer size mid training, so we just continue
to use the existing one.

```{code-cell}
def tune_from_existing(
    start_model, start_config, num_samples=10, num_epochs=10, gpus_per_trial=0.0, day=0
):

@ -404,20 +430,21 @@ def tune_from_existing(
    best_checkpoint = best_trial.checkpoint.value

    return best_accuracy, best_trial_config, best_checkpoint, num_examples
```

## Serving tuned models with Ray Serve

Let's now turn to the model serving part with Ray Serve. Serve allows
you to deploy your models as multiple _deployments_. Broadly speaking,
a deployment handles incoming requests and replies with a result. For
instance, our MNIST deployment takes an image as input and outputs the
digit it recognized from it. This deployment can be exposed over HTTP.

First, we will define our deployment. This loads our PyTorch
MNIST model from a checkpoint, takes an image as an input and
outputs our digit prediction according to our trained model:

```{code-cell}
@serve.deployment(name="mnist", route_prefix="/mnist")
class MNISTDeployment:
    def __init__(self, checkpoint_dir, config, metrics, use_gpu=False):

@ -442,13 +469,14 @@ class MNISTDeployment:
            outputs = self.model(images)
            predicted = torch.max(outputs.data, 1)[1]
            return {"result": predicted.numpy().tolist()}
```

We would like to have a fixed location where we store the currently
active model. We call this directory ``model_dir``. Every time we
would like to update our model, we copy the checkpoint of the new
model to this directory. We then update the deployment to the new version.

```{code-cell}
def serve_new_model(model_dir, checkpoint, config, metrics, day, use_gpu=False):
    print("Serving checkpoint: {}".format(checkpoint))

@ -478,13 +506,14 @@ def _move_checkpoint_to_model_dir(model_dir, checkpoint, config, metrics):
        json.dump(dict(config=config, metrics=metrics), fp)

    return checkpoint_path
```

Since we would like to continue training from the current existing
model, we introduce a utility function that fetches the currently
served checkpoint as well as the hyperparameter config and achieved
accuracy.

```{code-cell}
def get_current_model(model_dir):
    checkpoint_path = os.path.join(model_dir, "checkpoint")
    meta_path = os.path.join(model_dir, "meta.json")

@ -496,23 +525,23 @@ def get_current_model(model_dir):
        meta = json.load(fp)

    return checkpoint_path, meta["config"], meta["metrics"]
```

## Putting everything together

Now we only need to glue this code together. This is the main
entrypoint of the script, and we will define three methods:

1. Train new model from scratch with all data
2. Continue training from existing model with new data only
3. Query the model with test data

Internally, this will just call the ``tune_from_scratch()`` and
``tune_from_existing()`` functions.
Both training functions will then call ``serve_new_model()`` to serve
the newly trained or updated model.

```{code-cell}
# The query function will send an HTTP request to Serve with some
# test data obtained from the MNIST dataset.
if __name__ == "__main__":

@ -529,33 +558,33 @@ if __name__ == "__main__":

    First, we might train a model with all data available at this day:

    ```{code-block} bash
    python tune-serve-integration-mnist.py --from_scratch --day 0
    ```

    On the coming days, we want to continue to train this model with
    newly available data:

    ```{code-block} bash
    python tune-serve-integration-mnist.py --from_existing --day 1
    python tune-serve-integration-mnist.py --from_existing --day 2
    python tune-serve-integration-mnist.py --from_existing --day 3
    python tune-serve-integration-mnist.py --from_existing --day 4
    python tune-serve-integration-mnist.py --from_existing --day 5
    python tune-serve-integration-mnist.py --from_existing --day 6
    # Retrain from scratch every 7th day:
    python tune-serve-integration-mnist.py --from_scratch --day 7
    ```

    We can also use this script to query our served model
    with some test data:

    ```{code-block} bash
    python tune-serve-integration-mnist.py --query 6
    Querying model with example #6. Label = 1, Response = 1, Correct = T
    python tune-serve-integration-mnist.py --query 28
    Querying model with example #28. Label = 2, Response = 7, Correct = F
    ```
    """
    parser = argparse.ArgumentParser(description="MNIST Tune/Serve example")

@ -669,23 +698,23 @@ if __name__ == "__main__":
        serve_new_model(
            model_dir, best_checkpoint, config, acc, args.day, use_gpu=serve_gpu
        )
```

That's it! We now have an end-to-end workflow to train and update a
model every day with newly arrived data. Every week we might retrain
the whole model. At every point in time we make sure to serve the
model that achieved the best validation set accuracy.

There are some ways we might extend this example. For instance, right
now we only serve the latest trained model. We could also choose to
route only a certain percentage of users to the new model, maybe to
see if the new model really does its job right. Such
deployments are called canary deployments.
They would also require us to keep more than one
model in our ``model_dir`` - which should be quite easy: we could just
create subdirectories for each training day.

Still, this example should show you how easy it is to integrate the
Ray libraries Ray Tune and Ray Serve in your workflow. While both tools
also work independently of each other, they complement each other
nicely and support a large number of use cases.
195
doc/source/tune/tutorials/tune-sklearn.md
Normal file

@ -0,0 +1,195 @@
---
jupytext:
  text_representation:
    extension: .md
    format_name: myst
kernelspec:
  display_name: Python 3
  language: python
  name: python3
---

```{code-cell}
:tags: [remove-cell]
%matplotlib inline
```

```{code-cell}
:tags: [remove-cell]
# flake8: noqa
```

# Tune's Scikit Learn Adapters

Scikit-Learn is one of the most widely used tools in the ML community for working with data,
offering dozens of easy-to-use machine learning algorithms.
However, to achieve high performance for these algorithms, you often need to perform **model selection**.

```{image} /images/tune-sklearn.png
:align: center
:width: 50%
```

Scikit-Learn [has an existing module for model selection](https://scikit-learn.org/stable/modules/grid_search.html),
but the algorithms offered (Grid Search via ``GridSearchCV`` and Random Search via ``RandomizedSearchCV``)
are often considered inefficient.
In this tutorial, we'll cover ``tune-sklearn``, a drop-in replacement for Scikit-Learn's model selection module
with state-of-the-art optimization features such as early stopping and Bayesian Optimization.

```{tip}
Check out the [tune-sklearn code](https://github.com/ray-project/tune-sklearn) and {ref}`documentation <tune-sklearn-docs>`.
```

## Overview

``tune-sklearn`` is a module that integrates Ray Tune's hyperparameter tuning and scikit-learn's Classifier API.
``tune-sklearn`` has two APIs: {ref}`TuneSearchCV <tunesearchcv-docs>` and {ref}`TuneGridSearchCV <tunegridsearchcv-docs>`.
They are drop-in replacements for Scikit-learn's RandomizedSearchCV and GridSearchCV, so you only need to change
less than 5 lines in a standard Scikit-Learn script to use the API.

Ray Tune's Scikit-learn APIs allow you to easily leverage Bayesian Optimization, HyperBand, and other cutting-edge
tuning techniques by simply toggling a few parameters. They also support and provide examples for many other
frameworks with Scikit-Learn wrappers, such as Skorch (PyTorch), KerasClassifier (Keras), and XGBoostClassifier (XGBoost).

Run ``pip install "ray[tune]" tune-sklearn`` to get started.

## Walkthrough

Let's compare Tune's Scikit-Learn APIs to the standard scikit-learn GridSearchCV. For this example, we'll be using
``TuneGridSearchCV`` with an [SGDClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html).

To start out, change the import statement to get tune-sklearn's grid search cross validation interface:

```{code-cell}
# Keep this here for https://github.com/ray-project/ray/issues/11547
from sklearn.model_selection import GridSearchCV

# Replace above line with:
from ray.tune.sklearn import TuneGridSearchCV
```

And from there, we would proceed just like how we would in Scikit-Learn's interface!

The `SGDClassifier` has a ``partial_fit`` API, which enables it to stop fitting to the data for a certain hyperparameter configuration.
If the estimator does not support early stopping, we would fall back to a parallel grid search.

```{code-cell}
# Other imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification
import numpy as np

# Create dataset
X, y = make_classification(
    n_samples=11000,
    n_features=1000,
    n_informative=50,
    n_redundant=0,
    n_classes=10,
    class_sep=2.5,
)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=1000)

# Example parameters to tune from SGDClassifier
parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}
```

As you can see, the setup here is exactly how you would do it for Scikit-Learn.
Now, let's try fitting a model.

```{code-cell}
tune_search = TuneGridSearchCV(
    SGDClassifier(), parameter_grid, early_stopping=True, max_iters=10
)

import time  # Just to compare fit times

start = time.time()
tune_search.fit(x_train, y_train)
end = time.time()
print("Tune GridSearch Fit Time:", end - start)
# Tune GridSearch Fit Time: 15.436315774917603 (for an 8 core laptop)
```

Note the slight differences we introduced above:

* an `early_stopping` parameter, and
* a specification of the `max_iters` parameter

The ``early_stopping`` parameter allows us to terminate unpromising configurations. If ``early_stopping=True``,
TuneGridSearchCV will default to using Tune's ASHAScheduler.
You can pass in a custom algorithm - see {ref}`Tune's documentation on schedulers <tune-schedulers>` for a full list to choose from.
``max_iters`` is the maximum number of iterations a given hyperparameter set could run for;
it may run for fewer iterations if it is early stopped.

Try running this compared to the GridSearchCV equivalent, and see the speedup for yourself!

```{code-cell}
from sklearn.model_selection import GridSearchCV

# n_jobs=-1 enables use of all cores like Tune does
sklearn_search = GridSearchCV(SGDClassifier(), parameter_grid, n_jobs=-1)

start = time.time()
sklearn_search.fit(x_train, y_train)
end = time.time()
print("Sklearn Fit Time:", end - start)
# Sklearn Fit Time: 47.48055911064148 (for an 8 core laptop)
```

## Using Bayesian Optimization

In addition to the grid search interface, tune-sklearn also provides an interface,
TuneSearchCV, for sampling from **distributions of hyperparameters**.
In the following example we'll be using the [digits dataset from scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html).

In addition, you can easily enable Bayesian optimization over the distributions in only 2 lines of code:

```{code-cell}
# First run `pip install bayesian-optimization`
from ray.tune.sklearn import TuneSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

digits = datasets.load_digits()
x = digits.data
y = digits.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = SGDClassifier()
parameter_grid = {"alpha": (1e-4, 1), "epsilon": (0.01, 0.1)}

tune_search = TuneSearchCV(
    clf,
    parameter_grid,
    search_optimization="bayesian",
    n_trials=3,
    early_stopping=True,
    max_iters=10,
)
tune_search.fit(x_train, y_train)
print(tune_search.best_params_)
# {'alpha': 0.37460266483547777, 'epsilon': 0.09556428757689246}
```

As you can see, it's very simple to integrate tune-sklearn into existing code.
Distributed execution is also easy - you can simply run ``ray.init(address="auto")`` before
TuneSearchCV to connect to the Ray cluster and parallelize tuning across multiple nodes, as you would in any other Ray Tune script.

## Code Examples

Check out more detailed examples and get started with tune-sklearn!

* [Skorch with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/torch_nn.py)
* [Scikit-Learn Pipelines with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/sklearn_pipeline.py)
* [XGBoost with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/xgbclassifier.py)
* [KerasClassifier with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/keras_example.py)
* [LightGBM with tune-sklearn](https://github.com/ray-project/tune-sklearn/blob/master/examples/lgbm.py)

## Further Reading

If you're using scikit-learn for other tasks, take a look at Ray's {ref}`replacement for joblib <ray-joblib>`,
which allows users to parallelize scikit-learn jobs over multiple nodes.
135
doc/source/tune/tutorials/tune-stopping.rst
Normal file

@ -0,0 +1,135 @@
Stopping and Resuming Tune Trials
=================================

Ray Tune periodically checkpoints the experiment state so that it can be restarted when it fails or stops.
The checkpointing period is dynamically adjusted so that at least 95% of the time is used for handling
training results and scheduling.

If you send a SIGINT signal to the process running ``tune.run()`` (which is
usually what happens when you press Ctrl+C in the console), Ray Tune shuts
down training gracefully and saves a final experiment-level checkpoint.

How to resume a Tune run?
-------------------------

If you've stopped a run and want to resume from where you left off,
you can then call ``tune.run()`` with ``resume=True`` like this:

.. code-block:: python
    :emphasize-lines: 5

    tune.run(
        train,
        # other configuration
        name="my_experiment",
        resume=True
    )

You will have to pass a ``name`` if you are using ``resume=True`` so that Ray Tune can detect the experiment
folder (which is usually stored at e.g. ``~/ray_results/my_experiment``).
If you forgot to pass a name in the first call, you can still pass the name when you resume the run.
Please note that in this case it is likely that your experiment name has a date suffix, so if you
ran ``tune.run(my_trainable)``, the ``name`` might look something like this:
``my_trainable_2021-01-29_10-16-44``.

You can see which name you need to pass by taking a look at the results table
of your original tuning run:

.. code-block::
    :emphasize-lines: 5

    == Status ==
    Memory usage on this node: 11.0/16.0 GiB
    Using FIFO scheduling algorithm.
    Resources requested: 1/16 CPUs, 0/0 GPUs, 0.0/4.69 GiB heap, 0.0/1.61 GiB objects
    Result logdir: /Users/ray/ray_results/my_trainable_2021-01-29_10-16-44
    Number of trials: 1/1 (1 RUNNING)

Another useful option to know about is ``resume="AUTO"``, which will attempt to resume the experiment if possible,
and otherwise will start a new experiment.
For more details and other options for ``resume``, see the :ref:`Tune run API documentation <tune-run-ref>`.
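A quick sketch of the ``"AUTO"`` variant (the experiment name is again just a placeholder):

.. code-block:: python

    tune.run(
        train,
        name="my_experiment",
        resume="AUTO"
    )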
.. _tune-stopping-ref:

How to stop Tune runs programmatically?
---------------------------------------

We've just covered the case in which you manually interrupt a Tune run.
But you can also control when trials are stopped early by passing the ``stop`` argument to ``tune.run``.
This argument takes a dictionary, a function, or a :class:`Stopper <ray.tune.stopper.Stopper>` class as an argument.

If a dictionary is passed in, the keys may be any field in the return result of ``tune.report`` in the
Function API or ``step()`` (including the results from ``step`` and auto-filled metrics).

Stopping with a dictionary
~~~~~~~~~~~~~~~~~~~~~~~~~~

In the example below, each trial will be stopped either when it completes ``10`` iterations or when it
reaches a mean accuracy of ``0.98``.
These metrics are assumed to be **increasing**.

.. code-block:: python

    # training_iteration is an auto-filled metric by Tune.
    tune.run(
        my_trainable,
        stop={"training_iteration": 10, "mean_accuracy": 0.98}
    )

Stopping with a function
~~~~~~~~~~~~~~~~~~~~~~~~

For more flexibility, you can pass in a function instead.
If a function is passed in, it must take ``(trial_id, result)`` as arguments and return a boolean
(``True`` if the trial should be stopped and ``False`` otherwise).

.. code-block:: python

    def stopper(trial_id, result):
        return result["mean_accuracy"] / result["training_iteration"] > 5

    tune.run(my_trainable, stop=stopper)

Stopping with a class
~~~~~~~~~~~~~~~~~~~~~

Finally, you can implement the :class:`Stopper <ray.tune.stopper.Stopper>` abstract class for stopping entire experiments. For example, the following example stops all trials after the criterion is fulfilled by any individual trial, and prevents new ones from starting:

.. code-block:: python

    from ray.tune import Stopper

    class CustomStopper(Stopper):
        def __init__(self):
            self.should_stop = False

        def __call__(self, trial_id, result):
            if not self.should_stop and result['foo'] > 10:
                self.should_stop = True
            return self.should_stop

        def stop_all(self):
            """Returns whether to stop trials and prevent new ones from starting."""
            return self.should_stop

    stopper = CustomStopper()
    tune.run(my_trainable, stop=stopper)


Note that in the above example the currently running trials will not stop immediately but will do so
once their current iterations are complete.

Ray Tune comes with a set of out-of-the-box stopper classes. See the :ref:`Stopper <tune-stoppers>` documentation.


Stopping after the first failure
--------------------------------

By default, ``tune.run`` will continue executing until all trials have terminated or errored.
To stop the entire Tune run as soon as **any** trial errors:

.. code-block:: python

    tune.run(trainable, fail_fast=True)

This is useful when you are trying to set up a large hyperparameter experiment.
@ -1,4 +1,4 @@
.. _tune-wandb:
.. _tune-wandb-ref:

Using Weights & Biases with Tune
================================

@ -1,4 +1,4 @@
.. _tune-xgboost:
.. _tune-xgboost-ref:

Tuning XGBoost parameters
=========================
32
doc/test_myst_doc.py
Normal file

@ -0,0 +1,32 @@
"""Execute a jupytext markdown notebook."""
|
||||
|
||||
import subprocess
|
||||
import argparse
|
||||
import tempfile
|
||||
import sys
|
||||
|
||||
import jupytext
|
||||
|
||||
parser = argparse.ArgumentParser(description="Run a jupytext parsable markdown file.")
|
||||
parser.add_argument(
|
||||
"--path",
|
||||
help="path to the markdown file",
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
args, remainder = parser.parse_known_args()
|
||||
|
||||
with open(args.path, "r") as f:
|
||||
notebook = jupytext.read(f)
|
||||
|
||||
name = ""
|
||||
with tempfile.NamedTemporaryFile("w", delete=False) as f:
|
||||
jupytext.write(notebook, f, fmt="py:percent")
|
||||
name = f.name
|
||||
|
||||
remainder.insert(0, name)
|
||||
remainder.insert(0, sys.executable)
|
||||
|
||||
# Run the notebook
|
||||
subprocess.run(remainder)
|
@ -13,7 +13,24 @@
# __quick_start_begin__
from ray import tune


# 1. Define an objective function.
def objective(config):
    score = config["a"] ** 2 + config["b"]
    return {"score": score}


# 2. Define a search space.
search_space = {
    "a": tune.grid_search([0.001, 0.01, 0.1, 1.0]),
    "b": tune.choice([1, 2, 3]),
}

# 3. Start a Tune run and print the best result.
analysis = tune.run(objective, config=search_space)
print(analysis.get_best_config(metric="score", mode="min"))
# __quick_start_end__


# __ml_quick_start_begin__
def objective(step, alpha, beta):
    return (0.1 + alpha * step / 100) ** (-1) + beta * 0.1

@ -40,4 +57,4 @@ print("Best config: ", analysis.get_best_config(metric="mean_loss", mode="min"))

# Get a dataframe for analyzing trial results.
df = analysis.results_df
# __quick_start_end__
# __ml_quick_start_end__
@ -254,15 +254,18 @@ def run(
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY", "AUTO",
            or bool. "LOCAL"/True restores the checkpoint from the
            local experiment directory, determined
            by ``name`` and ``local_dir``. REMOTE restores the checkpoint
            by ``name`` and ``local_dir``. "REMOTE" restores the checkpoint
            from ``upload_dir`` (as passed to ``sync_config``).
            PROMPT provides CLI feedback.
            False forces a new experiment. ERRORED_ONLY resets and reruns
            ERRORED trials upon resume - previous trial artifacts will
            be left untouched. If resume is set but checkpoint does not exist,
            "PROMPT" provides the CLI feedback.
            False forces a new experiment. "ERRORED_ONLY" resets and reruns
            errored trials upon resume - previous trial artifacts will
            be left untouched.
            "AUTO" will attempt to resume from a checkpoint and otherwise
            start a new experiment.
            If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start