# Training a model with distributed LightGBM
In this example we will train a model in Ray AIR using distributed LightGBM.

Let's start by installing our dependencies:

In [1]:
!pip install -qU "ray[tune]" lightgbm_ray

Then we need some imports:

In [2]:
import argparse
import math
from typing import Tuple

import pandas as pd

import ray
from ray.air.batch_predictor import BatchPredictor
from ray.air.predictors.integrations.lightgbm import LightGBMPredictor
from ray.air.preprocessors.chain import Chain
from ray.air.preprocessors.encoder import Categorizer
from ray.train.lightgbm import LightGBMTrainer
from ray.data.dataset import Dataset
from ray.air.result import Result
from ray.air.preprocessors import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

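Next we define a function to load our train, validation, and test datasets. Note that the validation and test datasets are built from the same held-out split; the test dataset is simply the validation data with the `target` column dropped.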

In [3]:
def prepare_data(random_state: int = 42) -> Tuple[Dataset, Dataset, Dataset]:
    """Load the breast cancer dataset and split it into Ray Datasets.

    Args:
        random_state: Seed forwarded to ``train_test_split``. The split has to
            be reproducible because this function is called once for training
            and again for prediction; with an unseeded split the second call
            would shuffle differently, so rows that were used for training
            could end up in the dataset used for prediction.

    Returns:
        A ``(train_dataset, valid_dataset, test_dataset)`` tuple. The
        validation and test datasets are both built from the same 30% hold-out
        split; the test dataset has the ``"target"`` column dropped.
    """
    data_raw = load_breast_cancer()
    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
    dataset_df["target"] = data_raw["target"]
    # Add a synthetic categorical column that alternates "A", "B", "A", ...
    # (truncated to the number of rows, so odd lengths are handled too).
    num_samples = len(dataset_df)
    dataset_df["categorical_column"] = pd.Series(
        (["A", "B"] * math.ceil(num_samples / 2))[:num_samples]
    )
    # Seed the split so every call returns the same partition.
    train_df, test_df = train_test_split(
        dataset_df, test_size=0.3, random_state=random_state
    )
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    # The test dataset is the hold-out data without the label column.
    test_dataset = ray.data.from_pandas(test_df.drop("target", axis=1))
    return train_dataset, valid_dataset, test_dataset

The following function will create a LightGBM trainer, train it, and return the result.

In [4]:
def train_lightgbm(num_workers: int, use_gpu: bool = False) -> Result:
    """Fit a LightGBM binary classifier with ``LightGBMTrainer``.

    Args:
        num_workers: Value placed under ``"num_workers"`` in the trainer's
            scaling config.
        use_gpu: Value placed under ``"use_gpu"`` in the trainer's scaling
            config.

    Returns:
        The ``Result`` returned by ``trainer.fit()``. Its metrics are printed
        before it is returned.
    """
    train_ds, valid_ds, _ = prepare_data()

    # Preprocessing pipeline: first categorize the categorical column (so
    # LightGBM can use its built-in categorical feature support), then
    # standard-scale two arbitrarily chosen numeric columns.
    categorizer = Categorizer(["categorical_column"])
    scaler = StandardScaler(columns=["mean radius", "mean texture"])
    pipeline = Chain(categorizer, scaler)

    # Parameters handed to LightGBM itself.
    lightgbm_params = {
        "objective": "binary",
        "metric": ["binary_logloss", "binary_error"],
    }
    scaling = {"num_workers": num_workers, "use_gpu": use_gpu}
    datasets = {"train": train_ds, "valid": valid_ds}

    trainer = LightGBMTrainer(
        scaling_config=scaling,
        label_column="target",
        params=lightgbm_params,
        datasets=datasets,
        preprocessor=pipeline,
        num_boost_round=100,
    )
    fit_result = trainer.fit()
    print(fit_result.metrics)
    return fit_result

Once we have the result, we can do batch inference on the obtained model. Let's define a utility function for this.

In [5]:
def predict_lightgbm(result: Result):
    """Run batch inference from a training result and print the outputs.

    Builds a ``BatchPredictor`` from ``result.checkpoint``, then prints two
    tables for the test dataset: the predictions thresholded at 0.5 into 0/1
    labels, and the output of predicting with ``pred_contrib=True`` (printed
    under the "SHAP VALUES" heading).

    Args:
        result: Training result whose ``checkpoint`` holds the fitted model.
    """
    _, _, test_dataset = prepare_data()
    predictor = BatchPredictor.from_checkpoint(result.checkpoint, LightGBMPredictor)

    # Passed as ``limit`` so ``to_pandas`` is not capped at a finite row count.
    no_limit = float("inf")

    # Step 1: raw predictions -> 0/1 labels -> pandas.
    raw_predictions = predictor.predict(test_dataset)
    label_dataset = raw_predictions.map_batches(
        lambda df: (df > 0.5).astype(int), batch_format="pandas"
    )
    predicted_labels = label_dataset.to_pandas(limit=no_limit)
    print(f"PREDICTED LABELS\n{predicted_labels}")

    # Step 2: predict again, this time requesting prediction contributions.
    contribution_dataset = predictor.predict(test_dataset, pred_contrib=True)
    shap_values = contribution_dataset.to_pandas(limit=no_limit)
    print(f"SHAP VALUES\n{shap_values}")

Now we can run the training:

In [6]:
# Train with two workers on CPU only (use_gpu=False); keep the result so the
# trained checkpoint can be used for inference below.
result = train_lightgbm(num_workers=2, use_gpu=False)

2022-05-19 11:18:27,652	INFO services.py:1483 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Trial name,status,loc,iter,total time (s),train-binary_logloss,train-binary_error,valid-binary_logloss
LightGBMTrainer_07bf3_00000,TERMINATED,127.0.0.1:9219,100,10.4622,0.000197893,0,0.289033


[2m[33m(raylet)[0m 2022-05-19 11:18:32,940	INFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=51840 --object-store-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-18-25_114449_9132/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=56443 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58688 --redis-password=5241590000000000 --startup-token=16 --runtime-env-hash=-2010331134
[2m[33m(raylet)[0m 2022-05-19 11:18:36,664	INFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=51840 --object-store-name=/tmp/ray/session_2022-05

[2m[36m(_RemoteRayLightGBMActor pid=9242)[0m [LightGBM] [Info] Trying to bind port 52127...
[2m[36m(_RemoteRayLightGBMActor pid=9242)[0m [LightGBM] [Info] Binding port 52127 succeeded
[2m[36m(_RemoteRayLightGBMActor pid=9242)[0m [LightGBM] [Info] Listening...
[2m[36m(_RemoteRayLightGBMActor pid=9243)[0m [LightGBM] [Info] Trying to bind port 52128...
[2m[36m(_RemoteRayLightGBMActor pid=9243)[0m [LightGBM] [Info] Binding port 52128 succeeded
[2m[36m(_RemoteRayLightGBMActor pid=9243)[0m [LightGBM] [Info] Listening...
[2m[36m(_RemoteRayLightGBMActor pid=9242)[0m [LightGBM] [Info] Connected to rank 1
[2m[36m(_RemoteRayLightGBMActor pid=9242)[0m [LightGBM] [Info] Local rank: 0, total number of machines: 2
[2m[36m(_RemoteRayLightGBMActor pid=9243)[0m [LightGBM] [Info] Connected to rank 0
[2m[36m(_RemoteRayLightGBMActor pid=9243)[0m [LightGBM] [Info] Local rank: 1, total number of machines: 2




Result for LightGBMTrainer_07bf3_00000:
  date: 2022-05-19_11-18-44
  done: false
  experiment_id: 1d3640d1c3a743aeae7274a0ce253107
  hostname: Kais-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 9219
  should_checkpoint: true
  time_since_restore: 8.41084909439087
  time_this_iter_s: 8.41084909439087
  time_total_s: 8.41084909439087
  timestamp: 1652955524
  timesteps_since_restore: 0
  train-binary_error: 0.36683417085427134
  train-binary_logloss: 0.5804693664919086
  training_iteration: 1
  trial_id: 07bf3_00000
  valid-binary_error: 0.36470588235294116
  valid-binary_logloss: 0.5868466345817073
  warmup_time: 0.004106044769287109
  
Result for LightGBMTrainer_07bf3_00000:
  date: 2022-05-19_11-18-46
  done: true
  experiment_id: 1d3640d1c3a743aeae7274a0ce253107
  experiment_tag: '0'
  hostname: Kais-MacBook-Pro.local
  iterations_since_restore: 100
  node_ip: 127.0.0.1
  pid: 9219
  should_checkpoint: true
  time_since_restore: 10.46218204498291
  time

2022-05-19 11:18:47,218	INFO tune.py:753 -- Total run time: 16.87 seconds (15.17 seconds for the tuning loop).


{'train-binary_logloss': 0.00019789273681613937, 'train-binary_error': 0.0, 'valid-binary_logloss': 0.2890328865004496, 'valid-binary_error': 0.058823529411764705, 'time_this_iter_s': 0.025421857833862305, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 100, 'trial_id': '07bf3_00000', 'experiment_id': '1d3640d1c3a743aeae7274a0ce253107', 'date': '2022-05-19_11-18-46', 'timestamp': 1652955526, 'time_total_s': 10.46218204498291, 'pid': 9219, 'hostname': 'Kais-MacBook-Pro.local', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 10.46218204498291, 'timesteps_since_restore': 0, 'iterations_since_restore': 100, 'warmup_time': 0.004106044769287109, 'experiment_tag': '0'}


And perform inference on the obtained model:

In [7]:
# Prints the thresholded 0/1 predicted labels, followed by the per-row
# prediction contributions (shown under the "SHAP VALUES" heading).
predict_lightgbm(result)

Map Progress (1 actors 1 pending): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.21s/it]
Map_Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 93.04it/s]


PREDICTED LABELS
     predictions
0              1
1              1
2              1
3              1
4              1
..           ...
166            1
167            0
168            1
169            1
170            1

[171 rows x 1 columns]


Map Progress (1 actors 1 pending): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.20s/it]

SHAP VALUES
     predictions_0  predictions_1  predictions_2  predictions_3  \
0         0.089310      -0.909119       0.042819       0.002084   
1         0.080590      -0.961430       0.043946      -0.014364   
2         0.080606      -0.778903       0.067703       0.005561   
3         0.095129      -0.281614       0.083395       0.005946   
4         0.085822       0.470362       0.106340      -0.014601   
..             ...            ...            ...            ...   
166       0.095845       0.217879       0.131259       0.045455   
167      -0.369657      -1.825973      -0.270361      -0.005203   
168       0.078703       0.142254       0.069414       0.002620   
169       0.069391       0.226548       0.035343      -0.014900   
170       0.088893       0.216221       0.068534      -0.015672   

     predictions_4  predictions_5  predictions_6  predictions_7  \
0        -0.243459       0.536332      -1.275628       0.490998   
1         0.888434      -0.666081       0.541587 


