# Training a model with distributed XGBoost
In this example we will train a model in Ray AIR using distributed XGBoost.

Let's start by installing our dependencies:

In [None]:
!pip install -qU "ray[tune]" xgboost_ray

Then we need some imports:

In [1]:
from typing import Tuple

import ray
from ray.train.batch_predictor import BatchPredictor
from ray.train.xgboost import XGBoostPredictor
from ray.train.xgboost import XGBoostTrainer
from ray.data.dataset import Dataset
from ray.air.result import Result
from ray.air.util.datasets import train_test_split
from ray.data.preprocessors import StandardScaler

  from pandas import MultiIndex, Int64Index


Next we define a function to load our train, validation, and test datasets. The test dataset is the validation dataset with the `target` label column dropped.

In [2]:
def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:
    """Load the breast cancer data and return train, validation, and test datasets.

    The CSV is read with pandas and converted to a Ray dataset, then split
    with ``test_size=0.3``. The test dataset is the validation dataset with
    the "target" column dropped.

    Returns:
        A ``(train_dataset, valid_dataset, test_dataset)`` tuple.
    """
    import pandas as pd

    def _drop_target(batch):
        # Strip the label column so the batch only holds features.
        return batch.drop("target", axis=1)

    csv_url = "https://air-example-data.s3.us-east-2.amazonaws.com/breast_cancer.csv"
    full_dataset = ray.data.from_pandas(pd.read_csv(csv_url))
    # Optionally, read directly from s3
    # full_dataset = ray.data.read_csv("s3://air-example-data/breast_cancer.csv")
    train_dataset, valid_dataset = train_test_split(full_dataset, test_size=0.3)
    test_dataset = valid_dataset.map_batches(_drop_target, batch_format="pandas")
    return train_dataset, valid_dataset, test_dataset

The following function will create an XGBoost trainer, train it, and return the result.

In [3]:
def train_xgboost(num_workers: int, use_gpu: bool = False) -> Result:
    """Create an ``XGBoostTrainer``, fit it, and return the training result.

    Args:
        num_workers: Value passed as ``num_workers`` in the trainer's
            scaling config.
        use_gpu: Value passed as ``use_gpu`` in the trainer's scaling config.
            When True, the GPU histogram tree method is selected so the
            requested GPUs are actually used for tree construction.

    Returns:
        The ``Result`` returned by ``trainer.fit()``. Its metrics are also
        printed.
    """
    train_dataset, valid_dataset, _ = prepare_data()

    # Standardize two of the feature columns to show how a preprocessor is
    # attached to the trainer.
    columns_to_scale = ["mean radius", "mean texture"]
    preprocessor = StandardScaler(columns=columns_to_scale)

    # XGBoost specific params
    params = {
        # "approx" is a CPU tree method; switch to "gpu_hist" when GPUs are
        # requested, otherwise the allocated GPUs would sit idle.
        "tree_method": "gpu_hist" if use_gpu else "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }

    trainer = XGBoostTrainer(
        scaling_config={
            "num_workers": num_workers,
            "use_gpu": use_gpu,
        },
        label_column="target",
        params=params,
        datasets={"train": train_dataset, "valid": valid_dataset},
        preprocessor=preprocessor,
        num_boost_round=100,
    )
    result = trainer.fit()
    print(result.metrics)

    return result

Once we have the result, we can do batch inference on the obtained model. Let's define a utility function for this.

In [4]:
def predict_xgboost(result: Result) -> None:
    """Run batch inference from the result's checkpoint and print the outputs.

    Two predictions are made on the test dataset and shown: binary labels
    (predictions greater than 0.5 become 1, otherwise 0), and the output of
    predicting with ``pred_contribs=True``, printed under "SHAP VALUES".

    Args:
        result: Training result whose ``checkpoint`` is used to build the
            batch predictor.
    """
    _, _, test_dataset = prepare_data()

    batch_predictor = BatchPredictor.from_checkpoint(
        result.checkpoint, XGBoostPredictor
    )

    # Threshold the raw predictions at 0.5 to obtain 0/1 labels.
    predicted_labels = (
        batch_predictor.predict(test_dataset)
        .map_batches(lambda df: (df > 0.5).astype(int), batch_format="pandas")
    )
    print("PREDICTED LABELS")
    predicted_labels.show()

    # pred_contribs=True is forwarded to the predictor to request
    # contribution values instead of plain predictions.
    shap_values = batch_predictor.predict(test_dataset, pred_contribs=True)
    print("SHAP VALUES")
    shap_values.show()


Now we can run the training:

In [5]:
# Train with two workers and no GPUs; the final metrics are printed by train_xgboost.
result = train_xgboost(num_workers=2, use_gpu=False)

2022-06-22 17:28:55,841	INFO services.py:1477 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8270[39m[22m
Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 40.28it/s]


Trial name,status,loc,iter,total time (s),train-logloss,train-error,valid-logloss
XGBoostTrainer_cc863_00000,TERMINATED,172.31.43.110:1493910,100,12.5164,0.005874,0,0.078188


[2m[36m(pid=1493910)[0m   from pandas import MultiIndex, Int64Index
[2m[36m(XGBoostTrainer pid=1493910)[0m 2022-06-22 17:29:04,073	INFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.
[2m[36m(pid=1494007)[0m   from pandas import MultiIndex, Int64Index
[2m[36m(pid=1494008)[0m   from pandas import MultiIndex, Int64Index
[2m[36m(pid=1494009)[0m   from pandas import MultiIndex, Int64Index
[2m[36m(XGBoostTrainer pid=1493910)[0m 2022-06-22 17:29:07,874	INFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.
[2m[36m(_RemoteRayXGBoostActor pid=1494008)[0m [17:29:07] task [xgboost.ray]:139731353900128 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=1494009)[0m [17:29:07] task [xgboost.ray]:140076138558608 got new rank 1
[2m[36m(_QueueActor pid=1494006)[0m   from pandas import MultiIndex, Int64Index


Result for XGBoostTrainer_cc863_00000:
  date: 2022-06-22_17-29-09
  done: false
  experiment_id: dc3dac01a34043cfb5751907e2bc648e
  hostname: ip-172-31-43-110
  iterations_since_restore: 1
  node_ip: 172.31.43.110
  pid: 1493910
  should_checkpoint: true
  time_since_restore: 7.967940330505371
  time_this_iter_s: 7.967940330505371
  time_total_s: 7.967940330505371
  timestamp: 1655918949
  timesteps_since_restore: 0
  train-error: 0.017588
  train-logloss: 0.464648
  training_iteration: 1
  trial_id: cc863_00000
  valid-error: 0.081871
  valid-logloss: 0.496374
  warmup_time: 0.004768848419189453
  


[2m[36m(XGBoostTrainer pid=1493910)[0m 2022-06-22 17:29:14,546	INFO main.py:1516 -- [RayXGBoost] Finished XGBoost training on training data with total N=398 in 10.52 seconds (6.66 pure XGBoost training time).


Result for XGBoostTrainer_cc863_00000:
  date: 2022-06-22_17-29-14
  done: true
  experiment_id: dc3dac01a34043cfb5751907e2bc648e
  experiment_tag: '0'
  hostname: ip-172-31-43-110
  iterations_since_restore: 100
  node_ip: 172.31.43.110
  pid: 1493910
  should_checkpoint: true
  time_since_restore: 12.516392230987549
  time_this_iter_s: 0.03008890151977539
  time_total_s: 12.516392230987549
  timestamp: 1655918954
  timesteps_since_restore: 0
  train-error: 0.0
  train-logloss: 0.005874
  training_iteration: 100
  trial_id: cc863_00000
  valid-error: 0.040936
  valid-logloss: 0.078188
  warmup_time: 0.004768848419189453
  


2022-06-22 17:29:15,362	INFO tune.py:734 -- Total run time: 16.94 seconds (16.08 seconds for the tuning loop).


{'train-logloss': 0.005874, 'train-error': 0.0, 'valid-logloss': 0.078188, 'valid-error': 0.040936, 'time_this_iter_s': 0.03008890151977539, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 100, 'trial_id': 'cc863_00000', 'experiment_id': 'dc3dac01a34043cfb5751907e2bc648e', 'date': '2022-06-22_17-29-14', 'timestamp': 1655918954, 'time_total_s': 12.516392230987549, 'pid': 1493910, 'hostname': 'ip-172-31-43-110', 'node_ip': '172.31.43.110', 'config': {}, 'time_since_restore': 12.516392230987549, 'timesteps_since_restore': 0, 'iterations_since_restore': 100, 'warmup_time': 0.004768848419189453, 'experiment_tag': '0'}


And perform inference on the obtained model:

In [6]:
# Print predicted labels and SHAP values using the checkpoint from the training result.
predict_xgboost(result)

Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 46.14it/s]
[2m[36m(pid=1494373)[0m   from pandas import MultiIndex, Int64Index
Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00,  1.90s/it]
Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 75.10it/s]


PREDICTED LABELS
{'predictions': 1}
{'predictions': 1}
{'predictions': 0}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 0}
{'predictions': 1}
{'predictions': 0}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 1}
{'predictions': 0}
{'predictions': 0}
{'predictions': 1}
{'predictions': 1}
{'predictions': 0}


[2m[36m(pid=1494403)[0m   from pandas import MultiIndex, Int64Index
[2m[36m(pid=1494413)[0m   from pandas import MultiIndex, Int64Index
Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00,  1.88s/it]

SHAP VALUES
{'predictions_0': 0.009930070489645004, 'predictions_1': 0.546318531036377, 'predictions_2': -0.006533853709697723, 'predictions_3': 0.022934239357709885, 'predictions_4': 0.32817941904067993, 'predictions_5': 0.004407345782965422, 'predictions_6': 0.013881205581128597, 'predictions_7': 0.568859875202179, 'predictions_8': -0.27460771799087524, 'predictions_9': 0.013218197971582413, 'predictions_10': 0.009325551800429821, 'predictions_11': 0.04015672579407692, 'predictions_12': 0.11667086184024811, 'predictions_13': 0.9853533506393433, 'predictions_14': 0.05529181659221649, 'predictions_15': -0.005734208971261978, 'predictions_16': -0.0008497871458530426, 'predictions_17': 0.16138489544391632, 'predictions_18': -0.36162295937538147, 'predictions_19': 0.003658014815300703, 'predictions_20': 0.393682062625885, 'predictions_21': 0.6647266149520874, 'predictions_22': 1.7201099395751953, 'predictions_23': 0.35084351897239685, 'predictions_24': 0.4841834604740143, 'predictions_25'




[2m[36m(pid=1494469)[0m   from pandas import MultiIndex, Int64Index
