[docs] Add xgboost_ray to docs (#12184)

Co-authored-by: Amog Kamsetty <amogkamsetty@yahoo.com>
Richard Liaw 2020-11-27 11:36:56 -08:00 committed by GitHub
parent 0a505ca83d
commit 7c009d22cf
11 changed files with 333 additions and 1 deletion

View file

@ -432,6 +432,7 @@ matrix:
# - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,-pytorch,-py37 python/ray/util/sgd/...
- ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 python/ray/util/sgd/...
- ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 python/ray/util/sgd/...
- ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/...
# Docs: Tests and examples.
- os: linux

View file

@ -29,4 +29,5 @@ tabulate
uvicorn
werkzeug
git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
git+git://github.com/ray-project/xgboost_ray@master#xgboost_ray
scikit-optimize

View file

@ -74,7 +74,6 @@ MOCK_MODULES = [
"torch.utils.data",
"torch.utils.data.distributed",
"wandb",
"xgboost",
"zoopt",
]
import scipy.stats

View file

@ -285,6 +285,7 @@ Papers
multiprocessing.rst
joblib.rst
iter.rst
xgboost-ray.rst
dask-on-ray.rst
mars-on-ray.rst

View file

@ -1,3 +1,5 @@
.. _tune-main:
Tune: Scalable Hyperparameter Tuning
====================================

doc/source/xgboost-ray.rst Normal file
View file

@ -0,0 +1,161 @@
.. _xgboost-ray:
XGBoost on Ray
==============
This library adds a new backend for XGBoost utilizing Ray.
Please note that this is an early version and both the API and
the behavior can change without prior notice.
Installation
------------
You can install XGBoost on Ray (``xgboost_ray``) like this:
.. code-block:: bash
git clone https://github.com/ray-project/xgboost_ray.git
cd xgboost_ray
pip install -e .
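Alternatively, you can install it directly from GitHub (this is the command suggested by Ray's fallback message when ``xgboost_ray`` is missing):

.. code-block:: bash

    pip install git+https://github.com/ray-project/xgboost_ray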
Usage
-----
After installation, you can import XGBoost on Ray in two ways:

.. code-block:: python

    import xgboost_ray
    # or
    import ray.util.xgboost
``xgboost_ray`` provides a drop-in replacement for XGBoost's ``train``
function. To pass data, instead of using ``xgb.DMatrix`` you will
have to use ``ray.util.xgboost.RayDMatrix``.
Here is a simplified example:
.. literalinclude:: /../../python/ray/util/xgboost/simple_example.py
:language: python
:start-after: __xgboost_begin__
:end-before: __xgboost_end__
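As a condensed sketch of the same drop-in usage (the in-memory data here is made up for illustration; ``num_actors`` is covered in the sections below):

.. code-block:: python

    import numpy as np
    from ray.util.xgboost import RayDMatrix, train

    # Hypothetical toy data, for illustration only
    train_x = np.random.rand(100, 4)
    train_y = np.random.randint(0, 2, 100)

    # RayDMatrix replaces xgb.DMatrix; train() mirrors xgb.train()
    dtrain = RayDMatrix(train_x, train_y)
    bst = train(
        {"objective": "binary:logistic"},
        dtrain,
        num_actors=2)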
Data loading
------------
Data is passed to ``xgboost_ray`` via a ``RayDMatrix`` object.
The ``RayDMatrix`` lazily loads data and stores it sharded in the
Ray object store. The Ray XGBoost actors then access these
shards to run their training.
A ``RayDMatrix`` supports various data and file types, such as
Pandas DataFrames, NumPy arrays, CSV files, and Parquet files.
Example: loading multiple Parquet files:
.. code-block:: python
import glob
from ray.util.xgboost import RayDMatrix, RayFileType
# We can also pass a list of files
path = list(sorted(glob.glob("/data/nyc-taxi/*/*/*.parquet")))
# This argument will be passed to pd.read_parquet()
columns = [
"passenger_count",
"trip_distance", "pickup_longitude", "pickup_latitude",
"dropoff_longitude", "dropoff_latitude",
"fare_amount", "extra", "mta_tax", "tip_amount",
"tolls_amount", "total_amount"
]
dtrain = RayDMatrix(
path,
label="passenger_count", # Will select this column as the label
columns=columns,
filetype=RayFileType.PARQUET)
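In-memory data can be passed in the same way. Here is a minimal sketch with a Pandas DataFrame, assuming the ``label`` argument also accepts a column name for DataFrames as it does for files (the column names are made up):

.. code-block:: python

    import pandas as pd
    from ray.util.xgboost import RayDMatrix

    # Hypothetical in-memory dataset
    df = pd.DataFrame({
        "trip_distance": [1.2, 3.4, 0.5],
        "fare_amount": [6.5, 14.0, 4.0],
        "passenger_count": [1, 2, 1],
    })

    # Select the label column by name, as in the Parquet example
    dtrain = RayDMatrix(df, label="passenger_count")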
Hyperparameter Tuning
---------------------
``xgboost_ray`` integrates with Ray Tune (:ref:`tune-main`) to provide distributed hyperparameter tuning for your
distributed XGBoost models. You can run multiple ``xgboost_ray`` training runs in parallel, each with a different
hyperparameter configuration, with each individual training run parallelized.
First, move your training code into a function. This function should take in a ``config`` argument, which specifies the
hyperparameters for the XGBoost model.
.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
:language: python
:start-after: __train_begin__
:end-before: __train_end__
Then, you import Ray Tune and use Tune's search primitives to define a hyperparameter search space.
.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
:language: python
:start-after: __tune_begin__
:end-before: __tune_end__
Finally, you call ``tune.run``, passing in the training function and the ``config``. Internally, Tune will resolve the
hyperparameter search space and invoke the training function multiple times, each time with different hyperparameters.
.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
:language: python
:start-after: __tune_run_begin__
:end-before: __tune_run_end__
Make sure you set the ``extra_cpu`` field appropriately so Tune is aware of the total number of resources each
trial requires.
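For example, mirroring the ``tune.run`` call in the example above, each trial reserves one CPU for the trial driver plus the CPUs used by the training actors:

.. code-block:: python

    resources_per_trial = {
        "cpu": 1,  # CPU for the trial itself
        "extra_cpu": num_actors * num_cpus_per_actor,  # CPUs for the XGBoost actors
    }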
Resources
---------
By default, ``xgboost_ray`` tries to determine the number of CPUs
available and distributes them evenly across actors.
In the case of very large clusters or clusters with many different
machine sizes, it makes sense to limit the number of CPUs per actor
by setting the ``cpus_per_actor`` argument. Consider always
setting this explicitly.
The number of XGBoost actors always has to be set manually with
the ``num_actors`` argument.
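For example, here is a sketch of a training call that sets both values explicitly (the data is made up; the argument names match the tuning example above):

.. code-block:: python

    import numpy as np
    from ray.util.xgboost import RayDMatrix, train

    dtrain = RayDMatrix(
        np.random.rand(1000, 4), np.random.randint(0, 2, 1000))

    bst = train(
        {"objective": "binary:logistic"},
        dtrain,
        num_actors=4,       # always set the number of actors explicitly
        cpus_per_actor=2)   # cap CPUs per actor on heterogeneous clusters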
More examples
-------------
For complete end-to-end examples, please have a look at
the `examples folder <https://github.com/ray-project/xgboost_ray/tree/master/examples/>`__:

* `Simple sklearn breast cancer dataset example <https://github.com/ray-project/xgboost_ray/tree/master/examples/simple.py>`__ (requires ``sklearn``)
* `Simple sklearn breast cancer dataset example with Ray Tune <https://github.com/ray-project/xgboost_ray/tree/master/examples/simple_tune.py>`__ (requires ``sklearn``)
* `HIGGS classification example <https://github.com/ray-project/xgboost_ray/tree/master/examples/higgs.py>`__
* `[download dataset (2.6 GB)] <https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz>`__
* `HIGGS classification example with Parquet <https://github.com/ray-project/xgboost_ray/tree/master/examples/higgs_parquet.py>`__ (uses the same dataset)
* `Test data classification <https://github.com/ray-project/xgboost_ray/tree/master/examples/train_on_test_data.py>`__ (uses a self-generated dataset)
Package Reference
-----------------
Training/Validation
~~~~~~~~~~~~~~~~~~~
.. autofunction:: ray.util.xgboost.train
.. autofunction:: ray.util.xgboost.predict
RayDMatrix
~~~~~~~~~~
.. autoclass:: ray.util.xgboost.RayDMatrix

View file

@ -0,0 +1,26 @@
# --------------------------------------------------------------------
# Tests from the python/ray/util/xgboost directory.
# Please keep these sorted alphabetically.
# --------------------------------------------------------------------
py_test(
name = "simple_example",
size = "small",
srcs = ["simple_example.py"],
deps = [":xgb_lib"],
tags = ["exclusive"],
)
py_test(
name = "simple_tune",
size="small",
srcs = ["simple_tune.py"],
deps = [":xgb_lib"],
tags = ["exlcusive"]
)
# This is a dummy test dependency that causes the above tests to be
# re-run if any of these files changes.
py_library(
name = "xgb_lib",
srcs = glob(["**/*.py"], exclude=["tests/*.py"]),
)

View file

@ -0,0 +1,16 @@
import logging

logger = logging.getLogger(__name__)

# Placeholders so these names exist even when xgboost_ray is not installed.
train = None
predict = None
RayDMatrix = None
RayFileType = None

try:
    from xgboost_ray import train, predict, RayDMatrix, RayFileType
except ImportError:
    logger.info(
        "xgboost_ray is not installed. Please run "
        "`pip install git+https://github.com/ray-project/xgboost_ray`.")

__all__ = ["train", "predict", "RayDMatrix", "RayFileType"]

View file

@ -0,0 +1,46 @@
from sklearn import datasets
from sklearn.model_selection import train_test_split
from ray.util.xgboost import RayDMatrix, train
# __xgboost_begin__
def main():
# Load dataset
data, labels = datasets.load_breast_cancer(return_X_y=True)
# Split into train and test set
train_x, test_x, train_y, test_y = train_test_split(
data, labels, test_size=0.25)
train_set = RayDMatrix(train_x, train_y)
test_set = RayDMatrix(test_x, test_y)
# Set config
config = {
"tree_method": "approx",
"objective": "binary:logistic",
"eval_metric": ["logloss", "error"],
"max_depth": 3,
}
evals_result = {}
# Train the classifier
bst = train(
config,
train_set,
evals=[(test_set, "eval")],
evals_result=evals_result,
max_actor_restarts=1,
checkpoint_path="/tmp/checkpoint/",
verbose_eval=False)
bst.save_model("simple.xgb")
print("Final validation error: {:.4f}".format(
evals_result["eval"]["error"][-1]))
# __xgboost_end__
if __name__ == "__main__":
main()

View file

@ -0,0 +1,78 @@
from sklearn import datasets
from sklearn.model_selection import train_test_split
from ray.util.xgboost import RayDMatrix, train
# __train_begin__
num_cpus_per_actor = 1
num_actors = 1
def train_model(config):
# Load dataset
data, labels = datasets.load_breast_cancer(return_X_y=True)
# Split into train and test set
train_x, test_x, train_y, test_y = train_test_split(
data, labels, test_size=0.25)
train_set = RayDMatrix(train_x, train_y)
test_set = RayDMatrix(test_x, test_y)
evals_result = {}
bst = train(
params=config,
dtrain=train_set,
evals=[(test_set, "eval")],
evals_result=evals_result,
verbose_eval=False,
num_actors=num_actors,
cpus_per_actor=num_cpus_per_actor)
bst.save_model("model.xgb")
# __train_end__
def main():
# __tune_begin__
from ray import tune
# Set config
config = {
"tree_method": "approx",
"objective": "binary:logistic",
"eval_metric": ["logloss", "error"],
"eta": tune.loguniform(1e-4, 1e-1),
"subsample": tune.uniform(0.5, 1.0),
"max_depth": tune.randint(1, 9)
}
# __tune_end__
# __tune_run_begin__
analysis = tune.run(
train_model,
config=config,
metric="eval-error",
mode="min",
num_samples=4,
resources_per_trial={
"cpu": 1,
"extra_cpu": num_actors * num_cpus_per_actor
})
    # Load in the best-performing model checkpoint
    import os
    import xgboost as xgb
    best_bst = xgb.Booster()
    best_bst.load_model(os.path.join(analysis.best_logdir, "model.xgb"))
accuracy = 1. - analysis.best_result["eval-error"]
print(f"Best model parameters: {analysis.best_config}")
print(f"Best model total accuracy: {accuracy:.4f}")
# __tune_run_end__
if __name__ == "__main__":
main()

View file

@ -31,6 +31,7 @@ torchvision>=0.6.0
# transformers
git+git://github.com/huggingface/transformers.git@bdcc4b78a27775d1ec8f3fd297cb679c257289db#transformers
git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
git+git://github.com/ray-project/xgboost_ray@master#xgboost_ray
wandb
xgboost
zoopt>=0.4.1