[docs] Add LightGBM-Ray docs, update XGBoost-Ray docs (#17188)
This commit is contained in:
parent 8efc04a8a6
commit 5e9b680e39
11 changed files with 726 additions and 9 deletions
|
@@ -33,4 +33,5 @@ uvicorn
|
|||
werkzeug
|
||||
git+git://github.com/ray-project/tune-sklearn@master#tune-sklearn
|
||||
git+git://github.com/ray-project/xgboost_ray@master#xgboost_ray
|
||||
git+git://github.com/ray-project/lightgbm_ray@main#lightgbm_ray
|
||||
scikit-optimize
|
||||
|
|
|
@@ -37,6 +37,7 @@ MOCK_MODULES = [
|
|||
"ax.service.ax_client",
|
||||
"blist",
|
||||
"ConfigSpace",
|
||||
"dask.distributed",
|
||||
"gym",
|
||||
"gym.spaces",
|
||||
"horovod",
|
||||
|
@@ -107,9 +108,49 @@ def import_or_mock(module):
|
|||
sys.modules[module] = ChildClassMock()
|
||||
|
||||
|
||||
xgb_modules = ["xgboost", "xgboost.core", "xgboost.callback", "xgboost_ray"]
|
||||
for xgb_mod in xgb_modules:
|
||||
import_or_mock(xgb_mod)
|
||||
xgb_lgbm_modules = [
|
||||
"xgboost", "xgboost.core", "xgboost.callback", "xgboost.sklearn",
|
||||
"xgboost_ray", "lightgbm.main", "lightgbm.callback", "lightgbm.compat",
|
||||
"lightgbm_ray"
|
||||
]
|
||||
for module in xgb_lgbm_modules:
|
||||
import_or_mock(module)
|
||||
|
||||
# replace docstring refs in the XGBoost documentation that XGBoost-Ray inherits from
|
||||
# if this is not done, an error will be thrown during make, as there is no
|
||||
# such ref in ray docs
|
||||
replaces_callback_api = [
|
||||
sys.modules["xgboost_ray"].RayXGBClassifier.fit,
|
||||
sys.modules["xgboost_ray"].RayXGBRegressor.fit
|
||||
]
|
||||
for m in replaces_callback_api:
|
||||
m.__doc__ = m.__doc__.replace(":ref:`callback_api`", "Callback API")
|
||||
|
||||
# fix "more than one target found for cross-reference 'RayParams'"
|
||||
# as both lightgbm-ray and xgboost-ray have classes with the same
|
||||
# names, we need to refer to them explicitly in ray docs, otherwise
|
||||
# make will throw an error
|
||||
replaces_ray_params = [
|
||||
sys.modules["xgboost_ray"].train, sys.modules["xgboost_ray"].predict,
|
||||
sys.modules["xgboost_ray"].RayXGBClassifier.fit,
|
||||
sys.modules["xgboost_ray"].RayXGBClassifier.predict,
|
||||
sys.modules["xgboost_ray"].RayXGBClassifier.predict_proba,
|
||||
sys.modules["xgboost_ray"].RayXGBRegressor.fit,
|
||||
sys.modules["xgboost_ray"].RayXGBRegressor.predict
|
||||
]
|
||||
for m in replaces_ray_params:
|
||||
m.__doc__ = m.__doc__.replace("RayParams", "xgboost_ray.RayParams")
|
||||
|
||||
replaces_ray_params = [
|
||||
sys.modules["lightgbm_ray"].train, sys.modules["lightgbm_ray"].predict,
|
||||
sys.modules["lightgbm_ray"].RayLGBMClassifier.fit,
|
||||
sys.modules["lightgbm_ray"].RayLGBMClassifier.predict,
|
||||
sys.modules["lightgbm_ray"].RayLGBMClassifier.predict_proba,
|
||||
sys.modules["lightgbm_ray"].RayLGBMRegressor.fit,
|
||||
sys.modules["lightgbm_ray"].RayLGBMRegressor.predict
|
||||
]
|
||||
for m in replaces_ray_params:
|
||||
m.__doc__ = m.__doc__.replace("RayParams", "lightgbm_ray.RayParams")
|
||||
|
||||
|
||||
class SimpleClass(object):
|
||||
|
|
BIN
doc/source/images/lightgbm_logo.png
Normal file
Binary file not shown.
Size: 120 KiB
|
@@ -337,6 +337,7 @@ Papers
|
|||
multiprocessing.rst
|
||||
joblib.rst
|
||||
xgboost-ray.rst
|
||||
lightgbm-ray.rst
|
||||
ray-collective.rst
|
||||
|
||||
.. toctree::
|
||||
|
|
552
doc/source/lightgbm-ray.rst
Normal file
|
@@ -0,0 +1,552 @@
|
|||
.. _lightgbm-ray:
|
||||
|
||||
..
|
||||
This part of the docs is generated from the LightGBM-Ray readme using m2r
|
||||
To update:
|
||||
- run `m2r /path/to/_lightgbm_ray/README.md`
|
||||
- copy the contents of README.rst here
|
||||
- Get rid of the badges at the top
|
||||
- Get rid of the references section at the bottom
|
||||
- Be sure not to delete the API reference section at the bottom of this file.
|
||||
- Adjust some link targets (e.g. for "Ray Tune") to anonymous references
|
||||
by adding a second underscore (use `target <link>`__)
|
||||
- Search for `\ **` and delete this from the links (bold links are not supported)
|
||||
|
||||
Distributed LightGBM on Ray
|
||||
===========================
|
||||
|
||||
LightGBM-Ray is a distributed backend for
|
||||
`LightGBM <https://lightgbm.readthedocs.io/>`_\ , built
|
||||
on top of the
|
||||
`distributed computing framework Ray <https://ray.io>`_.
|
||||
|
||||
LightGBM-Ray
|
||||
|
||||
|
||||
* enables `multi-node <#usage>`_ and `multi-GPU <#multi-gpu-training>`_ training
|
||||
* integrates seamlessly with distributed `hyperparameter optimization <#hyperparameter-tuning>`_ library `Ray Tune <http://tune.io>`__
|
||||
* comes with `fault tolerance handling <#fault-tolerance>`_ mechanisms, and
|
||||
* supports `distributed dataframes and distributed data loading <#distributed-data-loading>`_
|
||||
|
||||
All releases are tested on large clusters and workloads.
|
||||
|
||||
This package is based on :ref:`XGBoost-Ray <xgboost-ray>`. As of now, XGBoost-Ray is a dependency for LightGBM-Ray.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
You can install the latest LightGBM-Ray release from PIP:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install lightgbm_ray
|
||||
|
||||
If you'd like to install the latest master, use this command instead:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install git+https://github.com/ray-project/lightgbm_ray.git#lightgbm_ray
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
LightGBM-Ray provides a drop-in replacement for LightGBM's ``train``
|
||||
function. To pass data, a ``RayDMatrix`` object is required; this is
the same data structure used by XGBoost-Ray.
|
||||
|
||||
Distributed training parameters are configured with a
|
||||
``lightgbm_ray.RayParams`` object. For instance, you can set
|
||||
the ``num_actors`` property to specify how many distributed actors
|
||||
you would like to use.
|
||||
|
||||
Here is a simplified example (which requires ``sklearn``\ ):
|
||||
|
||||
**Training:**
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from lightgbm_ray import RayDMatrix, RayParams, train
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
|
||||
train_x, train_y = load_breast_cancer(return_X_y=True)
|
||||
train_set = RayDMatrix(train_x, train_y)
|
||||
|
||||
evals_result = {}
|
||||
bst = train(
|
||||
{
|
||||
"objective": "binary",
|
||||
"metric": ["binary_logloss", "binary_error"],
|
||||
},
|
||||
train_set,
|
||||
evals_result=evals_result,
|
||||
valid_sets=[train_set],
|
||||
valid_names=["train"],
|
||||
verbose_eval=False,
|
||||
ray_params=RayParams(num_actors=2, cpus_per_actor=2))
|
||||
|
||||
bst.booster_.save_model("model.lgbm")
|
||||
print("Final training error: {:.4f}".format(
|
||||
evals_result["train"]["binary_error"][-1]))
|
||||
|
||||
**Prediction:**
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from lightgbm_ray import RayDMatrix, RayParams, predict
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
import lightgbm as lgbm
|
||||
|
||||
data, labels = load_breast_cancer(return_X_y=True)
|
||||
|
||||
dpred = RayDMatrix(data, labels)
|
||||
|
||||
bst = lgbm.Booster(model_file="model.lgbm")
|
||||
pred_ray = predict(bst, dpred, ray_params=RayParams(num_actors=2))
|
||||
|
||||
print(pred_ray)
|
||||
|
||||
scikit-learn API
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
LightGBM-Ray also features a scikit-learn API fully mirroring the pure
|
||||
LightGBM scikit-learn API, providing a complete drop-in
|
||||
replacement. The following estimators are available:
|
||||
|
||||
|
||||
* ``RayLGBMClassifier``
|
||||
* ``RayLGBMRegressor``
|
||||
|
||||
Example usage of ``RayLGBMClassifier``\ :
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from lightgbm_ray import RayLGBMClassifier, RayParams
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
seed = 42
|
||||
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X, y, train_size=0.25, random_state=42)
|
||||
|
||||
clf = RayLGBMClassifier(
|
||||
n_jobs=2, # In LightGBM-Ray, n_jobs sets the number of actors
|
||||
random_state=seed)
|
||||
|
||||
# scikit-learn API will automatically convert the data
|
||||
# to RayDMatrix format as needed.
|
||||
# You can also pass X as a RayDMatrix, in which case
|
||||
# y will be ignored.
|
||||
|
||||
clf.fit(X_train, y_train)
|
||||
|
||||
pred_ray = clf.predict(X_test)
|
||||
print(pred_ray)
|
||||
|
||||
pred_proba_ray = clf.predict_proba(X_test)
|
||||
print(pred_proba_ray)
|
||||
|
||||
# It is also possible to pass a RayParams object
|
||||
# to fit/predict/predict_proba methods - will override
|
||||
# n_jobs set during initialization
|
||||
|
||||
clf.fit(X_train, y_train, ray_params=RayParams(num_actors=2))
|
||||
|
||||
pred_ray = clf.predict(X_test, ray_params=RayParams(num_actors=2))
|
||||
print(pred_ray)
|
||||
|
||||
Things to keep in mind:
|
||||
|
||||
|
||||
* ``n_jobs`` parameter controls the number of actors spawned.
|
||||
You can pass a ``RayParams`` object to the
|
||||
``fit``\ /\ ``predict``\ /\ ``predict_proba`` methods as the ``ray_params`` argument
|
||||
for greater control over resource allocation. Doing
|
||||
so will override the value of ``n_jobs`` with the value of
|
||||
``ray_params.num_actors`` attribute. For more information, refer
|
||||
to the `Resources <#resources>`_ section below.
|
||||
* By default ``n_jobs`` is set to ``1``\ , which means the training
|
||||
will **not** be distributed. Make sure to either set ``n_jobs``
|
||||
to a higher value or pass a ``RayParams`` object as outlined above
|
||||
in order to take advantage of LightGBM-Ray's functionality.
|
||||
* After calling ``fit``\ , additional evaluation results (e.g. training time,
|
||||
number of rows, callback results) will be available under
|
||||
the ``additional_results_`` attribute (see the sketch after this list).
|
||||
* ``eval_`` arguments are supported, but early stopping is not.
|
||||
* LightGBM-Ray's scikit-learn API is based on LightGBM 3.2.1.
|
||||
While we try to support older LightGBM versions, please note that
|
||||
this library is only fully tested and supported for LightGBM >= 3.2.1.
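A minimal sketch of reading those extra results after ``fit``, continuing the
``RayLGBMClassifier`` example above (the exact contents of the dict depend on
the training run):

.. code-block:: python

    clf.fit(X_train, y_train)

    # Extra information collected by LightGBM-Ray during training,
    # e.g. training time and number of rows
    print(clf.additional_results_)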
|
||||
|
||||
For more information on the scikit-learn API, refer to the `LightGBM documentation <https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api>`_.
|
||||
|
||||
Data loading
|
||||
------------
|
||||
|
||||
Data is passed to LightGBM-Ray via a ``RayDMatrix`` object.
|
||||
|
||||
The ``RayDMatrix`` lazy loads data and stores it sharded in the
|
||||
Ray object store. The Ray LightGBM actors then access these
|
||||
shards to run their training on.
|
||||
|
||||
A ``RayDMatrix`` supports various data and file types, like
|
||||
Pandas DataFrames, Numpy Arrays, CSV files and Parquet files.
|
||||
|
||||
Example loading multiple parquet files:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import glob
|
||||
from lightgbm_ray import RayDMatrix, RayFileType
|
||||
|
||||
# We can also pass a list of files
|
||||
path = list(sorted(glob.glob("/data/nyc-taxi/*/*/*.parquet")))
|
||||
|
||||
# This argument will be passed to `pd.read_parquet()`
|
||||
columns = [
|
||||
"passenger_count",
|
||||
"trip_distance", "pickup_longitude", "pickup_latitude",
|
||||
"dropoff_longitude", "dropoff_latitude",
|
||||
"fare_amount", "extra", "mta_tax", "tip_amount",
|
||||
"tolls_amount", "total_amount"
|
||||
]
|
||||
|
||||
dtrain = RayDMatrix(
|
||||
path,
|
||||
label="passenger_count", # Will select this column as the label
|
||||
columns=columns,
|
||||
filetype=RayFileType.PARQUET)
|
||||
|
||||
Hyperparameter Tuning
|
||||
---------------------
|
||||
|
||||
LightGBM-Ray integrates with `Ray Tune <https://tune.io>`_ to provide distributed hyperparameter tuning for your
|
||||
distributed LightGBM models. You can run multiple LightGBM-Ray training runs in parallel, each with a different
|
||||
hyperparameter configuration, and each training run parallelized by itself. All you have to do is move your training
|
||||
code to a function, and pass the function to ``tune.run``. Internally, ``train`` will detect if ``tune`` is being used and will
|
||||
automatically report results to tune.
|
||||
|
||||
Example using LightGBM-Ray with Ray Tune:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from lightgbm_ray import RayDMatrix, RayParams, train
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
|
||||
num_actors = 2
|
||||
num_cpus_per_actor = 2
|
||||
|
||||
ray_params = RayParams(
|
||||
num_actors=num_actors, cpus_per_actor=num_cpus_per_actor)
|
||||
|
||||
def train_model(config):
|
||||
train_x, train_y = load_breast_cancer(return_X_y=True)
|
||||
train_set = RayDMatrix(train_x, train_y)
|
||||
|
||||
evals_result = {}
|
||||
bst = train(
|
||||
params=config,
|
||||
dtrain=train_set,
|
||||
evals_result=evals_result,
|
||||
valid_sets=[train_set],
|
||||
valid_names=["train"],
|
||||
verbose_eval=False,
|
||||
ray_params=ray_params)
|
||||
bst.booster_.save_model("model.lgbm")
|
||||
|
||||
from ray import tune
|
||||
|
||||
# Specify the hyperparameter search space.
|
||||
config = {
|
||||
"objective": "binary",
|
||||
"metric": ["binary_logloss", "binary_error"],
|
||||
"eta": tune.loguniform(1e-4, 1e-1),
|
||||
"subsample": tune.uniform(0.5, 1.0),
|
||||
"max_depth": tune.randint(1, 9)
|
||||
}
|
||||
|
||||
# Make sure to use the `get_tune_resources` method to set the `resources_per_trial`
|
||||
analysis = tune.run(
|
||||
train_model,
|
||||
config=config,
|
||||
metric="train-binary_error",
|
||||
mode="min",
|
||||
num_samples=4,
|
||||
resources_per_trial=ray_params.get_tune_resources())
|
||||
print("Best hyperparameters", analysis.best_config)
|
||||
|
||||
Also see examples/simple_tune.py for another example.
|
||||
|
||||
Fault tolerance
|
||||
---------------
|
||||
|
||||
LightGBM-Ray leverages the stateful Ray actor model to
|
||||
enable fault tolerant training. Currently, only non-elastic
|
||||
training is supported.
|
||||
|
||||
Non-elastic training (warm restart)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
When an actor or node dies, LightGBM-Ray will retain the
|
||||
state of the remaining actors. In non-elastic training,
|
||||
the failed actors will be replaced as soon as resources
|
||||
are available again. Only these actors will reload their
|
||||
parts of the data. Training will resume once all actors
|
||||
are ready for training again.
|
||||
|
||||
You can configure this mode in the ``RayParams``\ :
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from lightgbm_ray import RayParams
|
||||
|
||||
ray_params = RayParams(
|
||||
max_actor_restarts=2, # How often are actors allowed to fail, Default = 0
|
||||
)
|
||||
|
||||
Resources
|
||||
---------
|
||||
|
||||
By default, LightGBM-Ray tries to determine the number of CPUs
|
||||
available and distributes them evenly across actors.
|
||||
|
||||
It is important to note that distributed LightGBM needs at least
|
||||
two CPUs per actor to function efficiently (without blocking).
|
||||
Therefore, by default, at least two CPUs will be assigned to each actor,
|
||||
and an exception will be raised if an actor has less than two CPUs.
|
||||
It is possible to override this check by setting the
|
||||
``allow_less_than_two_cpus`` argument to ``True``\ , though it is not
|
||||
recommended, as it will negatively impact training performance.
|
||||
|
||||
In the case of very large clusters or clusters with many different
|
||||
machine sizes, it makes sense to limit the number of CPUs per actor
|
||||
by setting the ``cpus_per_actor`` argument. Consider always
|
||||
setting this explicitly.
|
||||
|
||||
The number of LightGBM actors always has to be set manually with
|
||||
the ``num_actors`` argument.
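As a sketch, an explicit CPU-only configuration on a hypothetical 16-CPU
cluster could look like this (the numbers are illustrative, not a
recommendation for your workload):

.. code-block:: python

    from lightgbm_ray import RayParams

    # 4 actors with 4 CPUs each on a hypothetical 16-CPU cluster;
    # num_actors always has to be set explicitly, and each actor
    # should get at least two CPUs
    ray_params = RayParams(num_actors=4, cpus_per_actor=4)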
|
||||
|
||||
Multi GPU training
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
LightGBM-Ray enables multi GPU training. The LightGBM core backend
|
||||
will automatically handle communication.
|
||||
All you have to do is to start one actor per GPU.
|
||||
|
||||
For instance, if you have 2 machines with 4 GPUs each, you will want
|
||||
to start 8 remote actors, and set ``gpus_per_actor=1``. There is usually
|
||||
no benefit in allocating less (e.g. 0.5) or more than one GPU per actor.
|
||||
|
||||
You should divide the CPUs evenly across actors per machine, so if your
|
||||
machines have 16 CPUs in addition to the 4 GPUs, each actor should have
|
||||
4 CPUs to use.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from lightgbm_ray import RayParams
|
||||
|
||||
ray_params = RayParams(
|
||||
num_actors=8,
|
||||
gpus_per_actor=1,
|
||||
cpus_per_actor=4, # Divide evenly across actors per machine
|
||||
)
|
||||
|
||||
How many remote actors should I use?
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
This depends on your workload and your cluster setup.
|
||||
Generally, there is no inherent benefit in running more than
|
||||
one remote actor per node for CPU-only training. This is because
|
||||
LightGBM core can already leverage multiple CPUs via threading.
|
||||
|
||||
However, there are some cases when you should consider starting
|
||||
more than one actor per node:
|
||||
|
||||
|
||||
* For `multi GPU training <#multi-gpu-training>`_\ , each GPU should have a separate
|
||||
remote actor. Thus, if your machine has 24 CPUs and 4 GPUs,
|
||||
you will want to start 4 remote actors with 6 CPUs and 1 GPU
|
||||
each
|
||||
* In a **heterogeneous cluster**\ , you might want to find the
|
||||
`greatest common divisor <https://en.wikipedia.org/wiki/Greatest_common_divisor>`_
|
||||
for the number of CPUs.
|
||||
E.g. for a cluster with three nodes of 4, 8, and 12 CPUs, respectively,
|
||||
you should set the number of actors to 6 and the CPUs per
|
||||
actor to 4, as sketched below.
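A minimal sketch of that heterogeneous-cluster setup, using the hypothetical
node sizes from the example above:

.. code-block:: python

    from lightgbm_ray import RayParams

    # Nodes with 4, 8 and 12 CPUs -> 24 CPUs in total,
    # greatest common divisor 4 -> 24 / 4 = 6 actors
    ray_params = RayParams(num_actors=6, cpus_per_actor=4)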
|
||||
|
||||
Distributed data loading
|
||||
------------------------
|
||||
|
||||
LightGBM-Ray can leverage both centralized and distributed data loading.
|
||||
|
||||
In **centralized data loading**\ , the data is partitioned by the head node
|
||||
and stored in the object store. Each remote actor then retrieves their
|
||||
partitions by querying the Ray object store. Centralized loading is used
|
||||
when you pass centralized in-memory dataframes, such as Pandas dataframes
|
||||
or Numpy arrays, or when you pass a single source file, such as a single CSV
|
||||
or Parquet file.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from lightgbm_ray import RayDMatrix
|
||||
|
||||
# This will use centralized data loading, as only one source file is specified
|
||||
# `label_col` is a column in the CSV, used as the target label
|
||||
ray_params = RayDMatrix("./source_file.csv", label="label_col")
|
||||
|
||||
In **distributed data loading**\ , each remote actor loads their data directly from
|
||||
the source (e.g. local hard disk, NFS, HDFS, S3),
|
||||
without a central bottleneck. The data is still stored in the
|
||||
object store, but locally to each actor. This mode is used automatically
|
||||
when loading data from multiple CSV or Parquet files. Please note that
|
||||
we do not check or enforce partition sizes in this case - it is your job
|
||||
to make sure the data is evenly distributed across the source files.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from lightgbm_ray import RayDMatrix
|
||||
|
||||
# This will use distributed data loading, as four source files are specified
|
||||
# Please note that you cannot schedule more than four actors in this case.
|
||||
# `label_col` is a column in the Parquet files, used as the target label
|
||||
ray_params = RayDMatrix([
|
||||
"hdfs:///tmp/part1.parquet",
|
||||
"hdfs:///tmp/part2.parquet",
|
||||
"hdfs:///tmp/part3.parquet",
|
||||
"hdfs:///tmp/part4.parquet",
|
||||
], label="label_col")
|
||||
|
||||
Lastly, LightGBM-Ray supports **distributed dataframe** representations, such
|
||||
as `Modin <https://modin.readthedocs.io/en/latest/>`_ and
|
||||
`Dask dataframes <https://docs.dask.org/en/latest/dataframe.html>`_
|
||||
(used with `Dask on Ray <https://docs.ray.io/en/master/dask-on-ray.html>`_\ ).
|
||||
Here, LightGBM-Ray will check on which nodes the distributed partitions
|
||||
are currently located, and will assign partitions to actors in order to
|
||||
minimize cross-node data transfer. Please note that we also assume here
|
||||
that partition sizes are uniform.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from lightgbm_ray import RayDMatrix
|
||||
|
||||
# This will try to allocate the existing Modin partitions
|
||||
# to co-located Ray actors. If this is not possible, data will
|
||||
# be transferred across nodes
|
||||
ray_params = RayDMatrix(existing_modin_df)
|
||||
|
||||
|
||||
Data sources
|
||||
^^^^^^^^^^^^
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
|
||||
* - Type
|
||||
- Centralized loading
|
||||
- Distributed loading
|
||||
* - Numpy array
|
||||
- Yes
|
||||
- No
|
||||
* - Pandas dataframe
|
||||
- Yes
|
||||
- No
|
||||
* - Single CSV
|
||||
- Yes
|
||||
- No
|
||||
* - Multi CSV
|
||||
- Yes
|
||||
- Yes
|
||||
* - Single Parquet
|
||||
- Yes
|
||||
- No
|
||||
* - Multi Parquet
|
||||
- Yes
|
||||
- Yes
|
||||
* - `Petastorm <https://github.com/uber/petastorm>`__
|
||||
- Yes
|
||||
- Yes
|
||||
* - `Ray MLDataset <https://docs.ray.io/en/master/iter.html>`__
|
||||
- Yes
|
||||
- Yes
|
||||
* - `Dask dataframe <https://docs.dask.org/en/latest/dataframe.html>`__
|
||||
- Yes
|
||||
- Yes
|
||||
* - `Modin dataframe <https://modin.readthedocs.io/en/latest/>`__
|
||||
- Yes
|
||||
- Yes
|
||||
|
||||
|
||||
Memory usage
|
||||
------------
|
||||
|
||||
Details coming soon.
|
||||
|
||||
**Best practices**
|
||||
|
||||
In order to reduce peak memory usage, consider the following
|
||||
suggestions:
|
||||
|
||||
|
||||
* Store data as ``float32`` or less. More precision is often
|
||||
not needed, and keeping data in a smaller format will
|
||||
help reduce peak memory usage for initial data loading.
|
||||
* Pass the ``dtype`` when loading data from CSV. Otherwise,
|
||||
floating point values will be loaded as ``np.float64``
|
||||
by default, increasing peak memory usage by 33% (see the sketch below).
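A short sketch of the ``dtype`` suggestion, assuming a hypothetical
all-numeric CSV file ``data.csv`` with a ``label`` column:

.. code-block:: python

    import numpy as np
    import pandas as pd
    from lightgbm_ray import RayDMatrix

    # Load the columns as float32 instead of the default float64
    # to reduce peak memory usage, then wrap them in a RayDMatrix
    df = pd.read_csv("data.csv", dtype=np.float32)
    dtrain = RayDMatrix(df, label="label")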
|
||||
|
||||
Placement Strategies
|
||||
--------------------
|
||||
|
||||
LightGBM-Ray leverages Ray's Placement Group API (https://docs.ray.io/en/master/placement-group.html)
|
||||
to implement placement strategies for better fault tolerance.
|
||||
|
||||
By default, a SPREAD strategy is used for training, which attempts to spread all of the training workers
|
||||
across the nodes in a cluster on a best-effort basis. This improves fault tolerance since it minimizes the
|
||||
number of worker failures when a node goes down, but comes at a cost of increased inter-node communication.
|
||||
To disable this strategy, set the ``USE_SPREAD_STRATEGY`` environment variable to 0. If disabled, no
|
||||
particular placement strategy will be used.
|
||||
|
||||
|
||||
When LightGBM-Ray is used with Ray Tune for hyperparameter tuning, a PACK strategy is used. This strategy
|
||||
attempts to place all workers for each trial on the same node on a best-effort basis. This means that if a node
|
||||
goes down, it will be less likely to impact multiple trials.
|
||||
|
||||
When placement strategies are used, LightGBM-Ray will wait for 100 seconds for the required resources
|
||||
to become available, and will fail if the required resources cannot be reserved and the cluster cannot autoscale
|
||||
to increase the number of resources. You can change the ``PLACEMENT_GROUP_TIMEOUT_S`` environment variable to modify
|
||||
how long this timeout should be.
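For instance, assuming these environment variables are read when training
starts, you could set them from Python in the driver process before calling
``train`` (a sketch; exporting them in your shell works just as well):

.. code-block:: python

    import os

    # Disable the SPREAD placement strategy for training runs
    os.environ["USE_SPREAD_STRATEGY"] = "0"

    # Allow up to 10 minutes for placement group resources to become available
    os.environ["PLACEMENT_GROUP_TIMEOUT_S"] = "600"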
|
||||
|
||||
More examples
|
||||
-------------
|
||||
|
||||
For complete end to end examples, please have a look at
|
||||
the `examples folder <https://github.com/ray-project/lightgbm_ray/tree/master/examples/>`_\ :
|
||||
|
||||
* `Simple sklearn breastcancer dataset example <https://github.com/ray-project/lightgbm_ray/tree/master/examples/simple.py>`_ (requires ``sklearn``\ )
|
||||
* `HIGGS classification example <https://github.com/ray-project/lightgbm_ray/tree/master/examples/higgs.py>`_
|
||||
(\ `download dataset (2.6 GB) <https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz>`_\ )
|
||||
* `HIGGS classification example with Parquet <https://github.com/ray-project/lightgbm_ray/tree/master/examples/higgs_parquet.py>`_ (uses the same dataset)
|
||||
* `Test data classification <https://github.com/ray-project/lightgbm_ray/tree/master/examples/train_on_test_data.py>`_ (uses a self-generated dataset)
|
||||
|
||||
API reference
|
||||
-------------
|
||||
|
||||
.. autoclass:: lightgbm_ray.RayParams
|
||||
:members:
|
||||
|
||||
.. autoclass:: xgboost_ray.RayDMatrix
|
||||
:members:
|
||||
:noindex:
|
||||
|
||||
.. autofunction:: lightgbm_ray.train
|
||||
|
||||
.. autofunction:: lightgbm_ray.predict
|
||||
|
||||
scikit-learn API
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autoclass:: lightgbm_ray.RayLGBMClassifier
|
||||
:members:
|
||||
|
||||
.. autoclass:: lightgbm_ray.RayLGBMRegressor
|
||||
:members:
|
|
@@ -143,6 +143,12 @@ XGBoost is a popular gradient boosting library for classification and regression
|
|||
|
||||
[`Link to integration <https://github.com/ray-project/xgboost_ray>`__]
|
||||
|
||||
LightGBM |lightgbm|
|
||||
-------------------
|
||||
LightGBM is a high-performance gradient boosting library for classification and regression. It is designed to be distributed and efficient.
|
||||
|
||||
[`Link to integration <https://github.com/ray-project/lightgbm_ray>`__]
|
||||
|
||||
|
||||
.. |airflow| image:: images/airflow.png
|
||||
:class: inline-figure
|
||||
|
@@ -212,6 +218,10 @@ XGBoost is a popular gradient boosting library for classification and regression
|
|||
:class: inline-figure
|
||||
:height: 30
|
||||
|
||||
.. |lightgbm| image:: images/lightgbm_logo.png
|
||||
:class: inline-figure
|
||||
:height: 30
|
||||
|
||||
.. |nlu| image:: images/nlu.png
|
||||
:class: inline-figure
|
||||
:height: 30
|
||||
|
|
|
@@ -104,6 +104,85 @@ Here is a simplified example (which requires ``sklearn``\ ):
|
|||
|
||||
print(pred_ray)
|
||||
|
||||
scikit-learn API
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
XGBoost-Ray also features a scikit-learn API fully mirroring the pure
|
||||
XGBoost scikit-learn API, providing a complete drop-in
|
||||
replacement. The following estimators are available:
|
||||
|
||||
|
||||
* ``RayXGBClassifier``
|
||||
* ``RayXGBRegressor``
|
||||
* ``RayXGBRFClassifier``
|
||||
* ``RayXGBRFRegressor``
|
||||
* ``RayXGBRanker``
|
||||
|
||||
Example usage of ``RayXGBClassifier``\ :
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from xgboost_ray import RayXGBClassifier, RayParams
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
seed = 42
|
||||
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X, y, train_size=0.25, random_state=42
|
||||
)
|
||||
|
||||
clf = RayXGBClassifier(
|
||||
n_jobs=4, # In XGBoost-Ray, n_jobs sets the number of actors
|
||||
random_state=seed
|
||||
)
|
||||
|
||||
# scikit-learn API will automatically convert the data
|
||||
# to RayDMatrix format as needed.
|
||||
# You can also pass X as a RayDMatrix, in which case
|
||||
# y will be ignored.
|
||||
|
||||
clf.fit(X_train, y_train)
|
||||
|
||||
pred_ray = clf.predict(X_test)
|
||||
print(pred_ray)
|
||||
|
||||
pred_proba_ray = clf.predict_proba(X_test)
|
||||
print(pred_proba_ray)
|
||||
|
||||
# It is also possible to pass a RayParams object
|
||||
# to fit/predict/predict_proba methods - will override
|
||||
# n_jobs set during initialization
|
||||
|
||||
clf.fit(X_train, y_train, ray_params=RayParams(num_actors=2))
|
||||
|
||||
pred_ray = clf.predict(X_test, ray_params=RayParams(num_actors=2))
|
||||
print(pred_ray)
|
||||
|
||||
Things to keep in mind:
|
||||
|
||||
|
||||
* ``n_jobs`` parameter controls the number of actors spawned.
|
||||
You can pass a ``RayParams`` object to the
|
||||
``fit``\ /\ ``predict``\ /\ ``predict_proba`` methods as the ``ray_params`` argument
|
||||
for greater control over resource allocation. Doing
|
||||
so will override the value of ``n_jobs`` with the value of
|
||||
``ray_params.num_actors`` attribute. For more information, refer
|
||||
to the `Resources <#resources>`_ section below.
|
||||
* By default ``n_jobs`` is set to ``1``\ , which means the training
|
||||
will **not** be distributed. Make sure to either set ``n_jobs``
|
||||
to a higher value or pass a ``RayParams`` object as outlined above
|
||||
in order to take advantage of XGBoost-Ray's functionality.
|
||||
* After calling ``fit``\ , additional evaluation results (e.g. training time,
|
||||
number of rows, callback results) will be available under
|
||||
the ``additional_results_`` attribute.
|
||||
* XGBoost-Ray's scikit-learn API is based on XGBoost 1.4.
|
||||
While we try to support older XGBoost versions, please note that
|
||||
this library is only fully tested and supported for XGBoost >= 1.4.
|
||||
|
||||
For more information on the scikit-learn API, refer to the `XGBoost documentation <https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn>`_.
|
||||
|
||||
Data loading
|
||||
------------
|
||||
|
||||
|
@@ -506,7 +585,7 @@ how long this timeout should be.
|
|||
More examples
|
||||
-------------
|
||||
|
||||
Fore complete end to end examples, please have a look at
|
||||
For complete end to end examples, please have a look at
|
||||
the `examples folder <https://github.com/ray-project/xgboost_ray/tree/master/examples/>`_\ :
|
||||
|
||||
|
||||
|
@@ -528,3 +607,18 @@ API reference
|
|||
.. autofunction:: xgboost_ray.train
|
||||
|
||||
.. autofunction:: xgboost_ray.predict
|
||||
|
||||
scikit-learn API
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
.. autoclass:: xgboost_ray.RayXGBClassifier
|
||||
:members:
|
||||
|
||||
.. autoclass:: xgboost_ray.RayXGBRegressor
|
||||
:members:
|
||||
|
||||
.. autoclass:: xgboost_ray.RayXGBRFClassifier
|
||||
:members:
|
||||
|
||||
.. autoclass:: xgboost_ray.RayXGBRFRegressor
|
||||
:members:
|
|
@@ -7,12 +7,18 @@ predict = None
|
|||
RayParams = None
|
||||
RayDMatrix = None
|
||||
RayFileType = None
|
||||
RayLGBMClassifier = None
|
||||
RayLGBMRegressor = None
|
||||
|
||||
try:
|
||||
from lightgbm_ray import train, predict, RayParams, RayDMatrix, RayFileType
|
||||
from lightgbm_ray import (train, predict, RayParams, RayDMatrix,
|
||||
RayFileType, RayLGBMClassifier, RayLGBMRegressor)
|
||||
except ImportError:
|
||||
logger.info(
|
||||
"lightgbm_ray is not installed. Please run "
|
||||
"`pip install git+https://github.com/ray-project/lightgbm_ray`.")
|
||||
|
||||
__all__ = ["train", "predict", "RayParams", "RayDMatrix", "RayFileType"]
|
||||
__all__ = [
|
||||
"train", "predict", "RayParams", "RayDMatrix", "RayFileType",
|
||||
"RayLGBMClassifier", "RayLGBMRegressor"
|
||||
]
|
||||
|
|
|
@@ -7,12 +7,22 @@ predict = None
|
|||
RayParams = None
|
||||
RayDMatrix = None
|
||||
RayFileType = None
|
||||
RayXGBClassifier = None
|
||||
RayXGBRegressor = None
|
||||
RayXGBRFClassifier = None
|
||||
RayXGBRFRegressor = None
|
||||
|
||||
try:
|
||||
from xgboost_ray import train, predict, RayParams, RayDMatrix, RayFileType
|
||||
from xgboost_ray import (train, predict, RayParams, RayDMatrix,
|
||||
RayFileType, RayXGBClassifier, RayXGBRegressor,
|
||||
RayXGBRFClassifier, RayXGBRFRegressor)
|
||||
except ImportError:
|
||||
logger.info(
|
||||
"xgboost_ray is not installed. Please run "
|
||||
"`pip install git+https://github.com/ray-project/xgboost_ray`.")
|
||||
|
||||
__all__ = ["train", "predict", "RayParams", "RayDMatrix", "RayFileType"]
|
||||
__all__ = [
|
||||
"train", "predict", "RayParams", "RayDMatrix", "RayFileType",
|
||||
"RayXGBClassifier", "RayXGBRegressor", "RayXGBRFClassifier",
|
||||
"RayXGBRFRegressor"
|
||||
]
|
||||
|
|
|
@@ -3,5 +3,6 @@
|
|||
# So we separate its own requirements file.
|
||||
|
||||
tune-sklearn==0.4.0
|
||||
xgboost_ray==0.1.1
|
||||
xgboost_ray==0.1.2
|
||||
lightgbm_ray==0.0.2
|
||||
modin>=0.10.0; python_version >= '3.7'
|
|
@@ -13,3 +13,4 @@ expiringdict
|
|||
requests
|
||||
pytz
|
||||
git+https://github.com/ray-project/xgboost_ray.git#xgboost_ray
|
||||
git+https://github.com/ray-project/lightgbm_ray.git#lightgbm_ray
|