[docs] Add xgboost_ray to docs (#12184)
Co-authored-by: Amog Kamsetty <amogkamsetty@yahoo.com>
parent 0a505ca83d
commit 7c009d22cf
11 changed files with 333 additions and 1 deletion
@@ -432,6 +432,7 @@ matrix:
# - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,-pytorch,-py37 python/ray/util/sgd/...
- ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 python/ray/util/sgd/...
- ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 python/ray/util/sgd/...
- ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/...

# Docs: Tests and examples.
- os: linux
@@ -29,4 +29,5 @@ tabulate
uvicorn
werkzeug
git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
git+git://github.com/ray-project/xgboost_ray@master#xgboost_ray
scikit-optimize
@@ -74,7 +74,6 @@ MOCK_MODULES = [
    "torch.utils.data",
    "torch.utils.data.distributed",
    "wandb",
    "xgboost",
    "zoopt",
]
import scipy.stats
@@ -285,6 +285,7 @@ Papers
   multiprocessing.rst
   joblib.rst
   iter.rst
   xgboost-ray.rst
   dask-on-ray.rst
   mars-on-ray.rst
@@ -1,3 +1,5 @@
.. _tune-main:

Tune: Scalable Hyperparameter Tuning
====================================
doc/source/xgboost-ray.rst (new file, 161 lines)
@@ -0,0 +1,161 @@
.. _xgboost-ray:

XGBoost on Ray
==============

This library adds a new backend for XGBoost utilizing Ray.

Please note that this is an early version and both the API and
the behavior can change without prior notice.

Installation
------------

You can install XGBoost on Ray (``xgboost_ray``) like this:

.. code-block:: bash

    git clone https://github.com/ray-project/xgboost_ray.git
    cd xgboost_ray
    pip install -e .
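
Alternatively, installing straight from the repository with pip should also work (this is the command suggested by the ``ray.util.xgboost`` import fallback added in this commit):

.. code-block:: bash

    pip install git+https://github.com/ray-project/xgboost_ray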
Usage
-----

After installation, you can import XGBoost on Ray in two ways:

.. code-block:: python

    import xgboost_ray
    # or
    import ray.util.xgboost

``xgboost_ray`` provides a drop-in replacement for XGBoost's ``train``
function. To pass data, use ``ray.util.xgboost.RayDMatrix`` instead of
``xgb.DMatrix``.
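
In practice, a minimal call looks like this (a sketch; the argument names follow the examples added in this commit, and ``train_x``/``train_y`` stand in for your NumPy training data):

.. code-block:: python

    from ray.util.xgboost import RayDMatrix, train

    # RayDMatrix takes the place of xgb.DMatrix
    dtrain = RayDMatrix(train_x, train_y)
    bst = train(params, dtrain, num_actors=2)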

Here is a simplified example:

.. literalinclude:: /../../python/ray/util/xgboost/simple_example.py
    :language: python
    :start-after: __xgboost_begin__
    :end-before: __xgboost_end__

Data loading
------------

Data is passed to ``xgboost_ray`` via a ``RayDMatrix`` object.

The ``RayDMatrix`` lazily loads data and stores it sharded in the
Ray object store. The Ray XGBoost actors then access these
shards to run their training on.

A ``RayDMatrix`` supports various data and file types, such as
Pandas DataFrames, NumPy arrays, CSV files, and Parquet files.
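
For in-memory data, you can pass arrays directly, as in this short sketch (mirroring the constructor calls in this commit's simple example; the random data is only for illustration):

.. code-block:: python

    import numpy as np
    from ray.util.xgboost import RayDMatrix

    # In-memory data is sharded and stored in the Ray object store.
    data = np.random.uniform(size=(100, 4))
    labels = np.random.randint(2, size=100)
    dtrain = RayDMatrix(data, labels)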

Example: loading multiple Parquet files:

.. code-block:: python

    import glob
    from ray.util.xgboost import RayDMatrix, RayFileType

    # We can also pass a list of files
    path = list(sorted(glob.glob("/data/nyc-taxi/*/*/*.parquet")))

    # This argument will be passed to pd.read_parquet()
    columns = [
        "passenger_count",
        "trip_distance", "pickup_longitude", "pickup_latitude",
        "dropoff_longitude", "dropoff_latitude",
        "fare_amount", "extra", "mta_tax", "tip_amount",
        "tolls_amount", "total_amount"
    ]

    dtrain = RayDMatrix(
        path,
        label="passenger_count",  # Will select this column as the label
        columns=columns,
        filetype=RayFileType.PARQUET)

Hyperparameter Tuning
---------------------

``xgboost_ray`` integrates with Ray Tune (:ref:`tune-main`) to provide distributed hyperparameter tuning for your
distributed XGBoost models. You can run multiple ``xgboost_ray`` training runs in parallel, each with a different
hyperparameter configuration, while each individual training run is itself parallelized.

First, move your training code into a function. This function should take in a ``config`` argument which specifies the
hyperparameters for the XGBoost model.

.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
    :language: python
    :start-after: __train_begin__
    :end-before: __train_end__

Then, import Tune and use its search primitives to define a hyperparameter search space.

.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
    :language: python
    :start-after: __tune_begin__
    :end-before: __tune_end__

Finally, call ``tune.run``, passing in the training function and the ``config``. Internally, Tune will resolve the
hyperparameter search space and invoke the training function multiple times, each with different hyperparameters.

.. literalinclude:: /../../python/ray/util/xgboost/simple_tune.py
    :language: python
    :start-after: __tune_run_begin__
    :end-before: __tune_run_end__

Make sure you set the ``extra_cpu`` field appropriately so Tune is aware of the total number of resources each trial
requires.
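
Concretely, with the variables from the example above, the per-trial resource request looks like this (one CPU for the trial itself plus ``extra_cpu`` for the XGBoost actors it spawns):

.. code-block:: python

    resources_per_trial = {
        "cpu": 1,  # reserved for the trial's own process
        "extra_cpu": num_actors * num_cpus_per_actor,  # for the XGBoost actors
    }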
Resources
---------

By default, ``xgboost_ray`` tries to determine the number of CPUs
available and distributes them evenly across actors.

In the case of very large clusters or clusters with many different
machine sizes, it makes sense to limit the number of CPUs per actor
by setting the ``cpus_per_actor`` argument. Consider always
setting this explicitly.

The number of XGBoost actors always has to be set manually with
the ``num_actors`` argument.
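
For example, a run with four actors and two CPUs each might be requested like this (a sketch; the keyword arguments match the ``train`` calls in this commit's examples):

.. code-block:: python

    bst = train(
        config,
        dtrain,
        num_actors=4,  # always has to be set manually
        cpus_per_actor=2)  # consider always setting this explicitly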
More examples
-------------

For complete end-to-end examples, please have a look at
the `examples folder <https://github.com/ray-project/xgboost_ray/tree/master/examples/>`__:

* `Simple sklearn breast cancer dataset example <https://github.com/ray-project/xgboost_ray/tree/master/examples/simple.py>`__ (requires ``sklearn``)
* `Simple sklearn breast cancer dataset example with Ray Tune <https://github.com/ray-project/xgboost_ray/tree/master/examples/simple_tune.py>`__ (requires ``sklearn``)
* `HIGGS classification example <https://github.com/ray-project/xgboost_ray/tree/master/examples/higgs.py>`__

  * `[download dataset (2.6 GB)] <https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz>`__

* `HIGGS classification example with Parquet <https://github.com/ray-project/xgboost_ray/tree/master/examples/higgs_parquet.py>`__ (uses the same dataset)
* `Test data classification <https://github.com/ray-project/xgboost_ray/tree/master/examples/train_on_test_data.py>`__ (uses a self-generated dataset)

Package Reference
-----------------

Training/Validation
~~~~~~~~~~~~~~~~~~~

.. autofunction:: ray.util.xgboost.train

.. autofunction:: ray.util.xgboost.predict

RayDMatrix
~~~~~~~~~~

.. autoclass:: ray.util.xgboost.RayDMatrix
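
As a rough end-to-end sketch of these entry points (hedged: the ``predict`` signature is not shown in this commit; it is assumed here to take the trained booster and a ``RayDMatrix``):

.. code-block:: python

    from ray.util.xgboost import RayDMatrix, train, predict

    bst = train(config, dtrain, num_actors=2)

    # Assumed usage: distributed inference over a sharded RayDMatrix
    dpred = RayDMatrix(new_data)
    predictions = predict(bst, dpred)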
python/ray/util/xgboost/BUILD (new file, 26 lines)
@@ -0,0 +1,26 @@
# --------------------------------------------------------------------
# Tests from the python/ray/util/xgboost/tests directory.
# Please keep these sorted alphabetically.
# --------------------------------------------------------------------
py_test(
    name = "simple_example",
    size = "small",
    srcs = ["simple_example.py"],
    deps = [":xgb_lib"],
    tags = ["exclusive"],
)

py_test(
    name = "simple_tune",
    size = "small",
    srcs = ["simple_tune.py"],
    deps = [":xgb_lib"],
    tags = ["exclusive"],
)

# This is a dummy test dependency that causes the above tests to be
# re-run if any of these files changes.
py_library(
    name = "xgb_lib",
    srcs = glob(["**/*.py"], exclude=["tests/*.py"]),
)
python/ray/util/xgboost/__init__.py (new file, 16 lines)
@@ -0,0 +1,16 @@
import logging

logger = logging.getLogger(__name__)

# Fallbacks, overwritten by the import below if xgboost_ray is installed.
train = None
predict = None
RayDMatrix = None
RayFileType = None

try:
    from xgboost_ray import train, predict, RayDMatrix, RayFileType
except ImportError:
    logger.info(
        "xgboost_ray is not installed. Please run "
        "`pip install git+https://github.com/ray-project/xgboost_ray`.")

__all__ = ["train", "predict", "RayDMatrix", "RayFileType"]
python/ray/util/xgboost/simple_example.py (new file, 46 lines)
@@ -0,0 +1,46 @@
from sklearn import datasets
from sklearn.model_selection import train_test_split

from ray.util.xgboost import RayDMatrix, train


# __xgboost_begin__
def main():
    # Load dataset
    data, labels = datasets.load_breast_cancer(return_X_y=True)
    # Split into train and test set
    train_x, test_x, train_y, test_y = train_test_split(
        data, labels, test_size=0.25)

    train_set = RayDMatrix(train_x, train_y)
    test_set = RayDMatrix(test_x, test_y)

    # Set config
    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": 3,
    }

    evals_result = {}

    # Train the classifier
    bst = train(
        config,
        train_set,
        evals=[(test_set, "eval")],
        evals_result=evals_result,
        max_actor_restarts=1,
        checkpoint_path="/tmp/checkpoint/",
        verbose_eval=False)

    bst.save_model("simple.xgb")
    print("Final validation error: {:.4f}".format(
        evals_result["eval"]["error"][-1]))


# __xgboost_end__

if __name__ == "__main__":
    main()
python/ray/util/xgboost/simple_tune.py (new file, 78 lines)
@@ -0,0 +1,78 @@
from sklearn import datasets
from sklearn.model_selection import train_test_split

from ray.util.xgboost import RayDMatrix, train

# __train_begin__
num_cpus_per_actor = 1
num_actors = 1


def train_model(config):
    # Load dataset
    data, labels = datasets.load_breast_cancer(return_X_y=True)
    # Split into train and test set
    train_x, test_x, train_y, test_y = train_test_split(
        data, labels, test_size=0.25)

    train_set = RayDMatrix(train_x, train_y)
    test_set = RayDMatrix(test_x, test_y)

    evals_result = {}
    bst = train(
        params=config,
        dtrain=train_set,
        evals=[(test_set, "eval")],
        evals_result=evals_result,
        verbose_eval=False,
        num_actors=num_actors,
        cpus_per_actor=num_cpus_per_actor)
    bst.save_model("model.xgb")


# __train_end__


def main():
    # __tune_begin__
    from ray import tune

    # Set config
    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }
    # __tune_end__

    # __tune_run_begin__
    analysis = tune.run(
        train_model,
        config=config,
        metric="eval-error",
        mode="min",
        num_samples=4,
        resources_per_trial={
            "cpu": 1,
            "extra_cpu": num_actors * num_cpus_per_actor
        })

    # Load in the best performing model checkpoint
    import xgboost as xgb
    import os

    best_bst = xgb.Booster()
    best_bst.load_model(os.path.join(analysis.best_logdir, "model.xgb"))

    accuracy = 1. - analysis.best_result["eval-error"]
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")
    # __tune_run_end__


if __name__ == "__main__":
    main()
@@ -31,6 +31,7 @@ torchvision>=0.6.0
# transformers
git+git://github.com/huggingface/transformers.git@bdcc4b78a27775d1ec8f3fd297cb679c257289db#transformers
git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
git+git://github.com/ray-project/xgboost_ray@master#xgboost_ray
wandb
xgboost
zoopt>=0.4.1