From 15e86581bd9924bdf4f30b0d1393af5f94cece76 Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Mon, 4 Jan 2021 17:21:04 -0800
Subject: [PATCH] [XGboost] Update Documentation (#13017)

Co-authored-by: Richard Liaw
---
 doc/source/conf.py                    |  2 +-
 doc/source/xgboost-ray.rst            | 78 +++++++++++++++++++++++++++
 python/ray/tune/integration/mlflow.py |  4 +-
 3 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 3a0ef1551..bdff928f7 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -25,7 +25,7 @@ from datetime import datetime
 import mock
-class ChildClassMock(mock.MagicMock):
+class ChildClassMock(mock.Mock):
     @classmethod
     def __getattr__(cls, name):
         return mock.Mock

diff --git a/doc/source/xgboost-ray.rst b/doc/source/xgboost-ray.rst
index 578f6b7a7..caa882539 100644
--- a/doc/source/xgboost-ray.rst
+++ b/doc/source/xgboost-ray.rst
@@ -8,6 +8,9 @@ This library adds a new backend for XGBoost utilizing Ray. Please note that this is an early version and both the API and the behavior can change without prior notice.
+We'll switch to a release-based development process once the
+implementation has all the features for the first real-world use cases.
+
 Installation
 ------------
@@ -131,6 +134,81 @@ setting this explicitly. The number of XGBoost actors always has to be set manually with the ``num_actors`` argument.

Memory usage
-------------
XGBoost uses a compute-optimized data structure, the ``DMatrix``,
to hold training data. When converting a dataset to a ``DMatrix``,
XGBoost creates intermediate copies and ends up
holding a complete copy of the full data. The data will be converted
into the local data format (on a 64-bit system these are 64-bit floats).
Depending on the system and the original dataset dtype, this matrix can
thus occupy more memory than the original dataset.

The **peak memory usage** for CPU-based training is at least
**3x** the dataset size (assuming dtype ``float32`` on a 64-bit system)
plus about **400,000 KiB** for other resources,
such as operating system requirements and storage of intermediate
results.

**Example**

- Machine type: AWS m5.xlarge (4 vCPUs, 16 GiB RAM)
- Usable RAM: ~15,350,000 KiB
- Dataset: 1,250,000 rows with 1024 features, dtype float32.
  Total size: 5,000,000 KiB
- XGBoost DMatrix size: ~10,000,000 KiB

This dataset will just fit on this node for training.

Note that the DMatrix size might be lower on a 32-bit system.

**GPUs**

Generally, the same memory requirements exist for GPU-based
training. Additionally, the GPU must have enough memory
to hold the dataset.

In the example above, the GPU must have at least
10,000,000 KiB (about 9.6 GiB) of memory. However,
we have empirically found that using a ``DeviceQuantileDMatrix``
seems to result in higher peak GPU memory usage, possibly due to
intermediate storage when loading data (about 10% more).

**Best practices**

To reduce peak memory usage, consider the following
suggestions:

- Store data as ``float32`` or less. More precision is often
  not needed, and keeping data in a smaller format will
  help reduce peak memory usage for initial data loading.
- Pass the ``dtype`` when loading data from CSV. Otherwise,
  floating point values will be loaded as ``np.float64``
  by default, increasing peak memory usage by 33%
  (see the sketch after this list).
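As an illustration of the second suggestion, here is a minimal sketch of loading a CSV with an explicit
``float32`` dtype before handing it to ``xgboost_ray``. The file name ``data.csv`` and the ``label`` column
are hypothetical placeholders; the ``RayDMatrix`` and ``train`` calls follow the basic usage shown earlier
in this document, and exact keyword arguments may vary between ``xgboost_ray`` versions.

.. code-block:: python

    import numpy as np
    import pandas as pd

    from xgboost_ray import RayDMatrix, train

    # Read the CSV with an explicit dtype. Without this, pandas infers
    # float64 for floating point columns, roughly doubling the memory
    # needed to hold the raw data. (Assumes all columns are numeric.)
    df = pd.read_csv("data.csv", dtype=np.float32)

    # "label" is a hypothetical target column in data.csv.
    train_x = df.drop("label", axis=1)
    train_y = df["label"]

    # The RayDMatrix distributes the data to the XGBoost actors.
    train_set = RayDMatrix(train_x, train_y)

    bst = train(
        {"objective": "binary:logistic", "eval_metric": ["logloss"]},
        train_set,
        num_boost_round=10,
        num_actors=2)
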
Placement Strategies
--------------------
``xgboost_ray`` leverages :ref:`Ray's Placement Group API ` to implement placement strategies
for better fault tolerance.

By default, a SPREAD strategy is used for training, which attempts to spread all of the training workers
across the nodes in a cluster on a best-effort basis. This improves fault tolerance since it minimizes the
number of worker failures when a node goes down, but comes at the cost of increased inter-node communication.
To disable this strategy, set the ``USE_SPREAD_STRATEGY`` environment variable to 0. If disabled, no
particular placement strategy will be used.

Note that this strategy is used only when ``elastic_training`` is not used. If ``elastic_training`` is set to ``True``,
no placement strategy is used.

When ``xgboost_ray`` is used with Ray Tune for hyperparameter tuning, a PACK strategy is used. This strategy
attempts to place all workers for each trial on the same node on a best-effort basis. This means that if a node
goes down, it is less likely to impact multiple trials.

When placement strategies are used, ``xgboost_ray`` will wait for 100 seconds for the required resources
to become available, and will fail if the required resources cannot be reserved and the cluster cannot autoscale
to increase the number of resources. You can change the ``PLACEMENT_GROUP_TIMEOUT_S`` environment variable to
adjust how long this timeout should be.

More examples
-------------

diff --git a/python/ray/tune/integration/mlflow.py b/python/ray/tune/integration/mlflow.py
index 1d1e01d48..e1d3de250 100644
--- a/python/ray/tune/integration/mlflow.py
+++ b/python/ray/tune/integration/mlflow.py
@@ -222,8 +222,8 @@ def mlflow_mixin(func: Callable):
     You can also use MlFlow's autologging feature if using a training
     framework like Pytorch Lightning, XGBoost, etc. More information can be
-    found here (https://mlflow.org/docs/latest/tracking.html#automatic
-    -logging).
+    found here
+    (https://mlflow.org/docs/latest/tracking.html#automatic-logging).

     .. code-block:: python