From 15e86581bd9924bdf4f30b0d1393af5f94cece76 Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Mon, 4 Jan 2021 17:21:04 -0800
Subject: [PATCH] [XGboost] Update Documentation (#13017)

Co-authored-by: Richard Liaw
---
 doc/source/conf.py                    |  2 +-
 doc/source/xgboost-ray.rst            | 78 +++++++++++++++++++++++++++
 python/ray/tune/integration/mlflow.py |  4 +-
 3 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 3a0ef1551..bdff928f7 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -25,7 +25,7 @@ from datetime import datetime
 import mock
-class ChildClassMock(mock.MagicMock):
+class ChildClassMock(mock.Mock):
     @classmethod
     def __getattr__(cls, name):
         return mock.Mock

diff --git a/doc/source/xgboost-ray.rst b/doc/source/xgboost-ray.rst
index 578f6b7a7..caa882539 100644
--- a/doc/source/xgboost-ray.rst
+++ b/doc/source/xgboost-ray.rst
@@ -8,6 +8,9 @@ This library adds a new backend for XGBoost utilizing Ray. Please note that this is an early version and both the API and the behavior can change without prior notice.
+We'll switch to a release-based development process once the
+implementation has all the features for the first real-world use cases.
+
 Installation
 ------------
@@ -131,6 +134,81 @@ setting this explicitly. The number of XGBoost actors always has to be set manually with the ``num_actors`` argument.

Memory usage
-------------
XGBoost uses a compute-optimized data structure, the ``DMatrix``,
to hold training data. When converting a dataset to a ``DMatrix``,
XGBoost creates intermediate copies and ends up
holding a complete copy of the full data. The data will be converted
into the local data format (on a 64-bit system these are 64-bit floats).
Depending on the system and the original dataset dtype, this matrix can
thus occupy more memory than the original dataset.

The **peak memory usage** for CPU-based training is at least
**3x** the dataset size (assuming dtype ``float32`` on a 64-bit system)
plus about **400,000 KiB** for other resources,
such as operating system requirements and storage of intermediate
results.

**Example**

- Machine type: AWS m5.xlarge (4 vCPUs, 16 GiB RAM)
- Usable RAM: ~15,350,000 KiB
- Dataset: 1,250,000 rows with 1024 features, dtype float32.
  Total size: 5,000,000 KiB
- XGBoost DMatrix size: ~10,000,000 KiB

This dataset will just fit on this node for training.

Note that the DMatrix size might be lower on a 32-bit system.

**GPUs**

Generally, the same memory requirements exist for GPU-based
training. Additionally, the GPU must have enough memory
to hold the dataset.

In the example above, the GPU must have at least
10,000,000 KiB (about 9.6 GiB) of memory. However,
we have empirically found that using a ``DeviceQuantileDMatrix``
seems to result in higher peak GPU memory usage, possibly due to
intermediate storage when loading data (about 10% more).

**Best practices**

To reduce peak memory usage, consider the following
suggestions:

- Store data as ``float32`` or less. More precision is often
  not needed, and keeping data in a smaller format will
  help reduce peak memory usage for initial data loading.
- Pass the ``dtype`` when loading data from CSV. Otherwise,
  floating point values will be loaded as ``np.float64``
  by default, increasing peak memory usage by 33%
  (see the sketch after this list).
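As an illustration of the second suggestion, here is a minimal sketch of loading a CSV with an explicit
``float32`` dtype before handing it to ``xgboost_ray``. The file name ``data.csv`` and the ``label`` column
are hypothetical placeholders; the ``RayDMatrix`` and ``train`` calls follow the basic usage shown earlier
in this document, and exact keyword arguments may vary between ``xgboost_ray`` versions.

.. code-block:: python

    import numpy as np
    import pandas as pd

    from xgboost_ray import RayDMatrix, train

    # Read the CSV with an explicit dtype. Without this, pandas infers
    # float64 for floating point columns, roughly doubling the memory
    # needed to hold the raw data. (Assumes all columns are numeric.)
    df = pd.read_csv("data.csv", dtype=np.float32)

    # "label" is a hypothetical target column in data.csv.
    train_x = df.drop("label", axis=1)
    train_y = df["label"]

    # The RayDMatrix distributes the data to the XGBoost actors.
    train_set = RayDMatrix(train_x, train_y)

    bst = train(
        {"objective": "binary:logistic", "eval_metric": ["logloss"]},
        train_set,
        num_boost_round=10,
        num_actors=2)
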
Placement Strategies
--------------------
``xgboost_ray`` leverages :ref:`Ray's Placement Group API ` to implement placement strategies
for better fault tolerance.

By default, a SPREAD strategy is used for training, which attempts to spread all of the training workers
across the nodes in a cluster on a best-effort basis. This improves fault tolerance since it minimizes the
number of worker failures when a node goes down, but comes at the cost of increased inter-node communication.
To disable this strategy, set the ``USE_SPREAD_STRATEGY`` environment variable to 0. If disabled, no
particular placement strategy will be used.

Note that this strategy is used only when ``elastic_training`` is not used. If ``elastic_training`` is set to ``True``,
no placement strategy is used.

When ``xgboost_ray`` is used with Ray Tune for hyperparameter tuning, a PACK strategy is used. This strategy
attempts to place all workers for each trial on the same node on a best-effort basis. This means that if a node
goes down, it is less likely to impact multiple trials.

When placement strategies are used, ``xgboost_ray`` will wait for 100 seconds for the required resources
to become available, and will fail if the required resources cannot be reserved and the cluster cannot autoscale
to increase the number of resources. You can change the ``PLACEMENT_GROUP_TIMEOUT_S`` environment variable to
adjust how long this timeout should be.

More examples
-------------

diff --git a/python/ray/tune/integration/mlflow.py b/python/ray/tune/integration/mlflow.py
index 1d1e01d48..e1d3de250 100644
--- a/python/ray/tune/integration/mlflow.py
+++ b/python/ray/tune/integration/mlflow.py
@@ -222,8 +222,8 @@ def mlflow_mixin(func: Callable):
     You can also use MlFlow's autologging feature if using a training
     framework like Pytorch Lightning, XGBoost, etc. More information can be
-    found here (https://mlflow.org/docs/latest/tracking.html#automatic
-    -logging).
+    found here
+    (https://mlflow.org/docs/latest/tracking.html#automatic-logging).

     .. code-block:: python