# Training a model with Sklearn
In this example, we will train a model with Ray AIR using a scikit-learn classifier.

Let's start by installing our dependencies:

In [1]:
!pip install -qU "ray[tune]" sklearn

Then we need some imports:

In [2]:
import argparse
import math
from typing import Tuple

import pandas as pd

import ray
from ray.data.dataset import Dataset
from ray.ml.batch_predictor import BatchPredictor
from ray.ml.predictors.integrations.sklearn import SklearnPredictor
from ray.ml.preprocessors import Chain, OrdinalEncoder, StandardScaler
from ray.ml.result import Result
from ray.ml.train.integrations.sklearn import SklearnTrainer


from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# cuML is an optional dependency that is only used for GPU training.
# When it cannot be imported, the name is bound to None so that
# `train_sklearn` can detect its absence and raise a clear error.
try:
    from cuml.ensemble import RandomForestClassifier as cuMLRandomForestClassifier
except ImportError:
    cuMLRandomForestClassifier = None

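Next, we define a function to load our train, validation, and test datasets. The validation and test datasets contain the same held-out rows; the test dataset simply has the `target` column removed so that it can be used for inference.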

In [3]:
def prepare_data(random_state: int = 42) -> Tuple[Dataset, Dataset, Dataset]:
    """Load the breast cancer dataset and split it into Ray Datasets.

    Args:
        random_state: Seed forwarded to ``train_test_split``. The fixed
            default makes every call produce the same split, so separate
            calls (one for training, one for prediction) agree on which rows
            are held out and the test rows never overlap the training rows.

    Returns:
        A ``(train, valid, test)`` tuple of Ray Datasets. ``valid`` and
        ``test`` contain the same held-out rows (30% of the data); ``test``
        has the ``target`` column dropped so it can be used for inference.
    """
    data_raw = load_breast_cancer()
    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
    dataset_df["target"] = data_raw["target"]
    # Add a synthetic categorical column that alternates "A", "B", "A", ...
    # (it carries no signal; it only exists to exercise the OrdinalEncoder).
    num_samples = len(dataset_df)
    dataset_df["categorical_column"] = pd.Series(
        (["A", "B"] * math.ceil(num_samples / 2))[:num_samples]
    )
    # Seed the split so that repeated calls return identical partitions.
    train_df, test_df = train_test_split(
        dataset_df, test_size=0.3, random_state=random_state
    )
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    test_dataset = ray.data.from_pandas(test_df.drop("target", axis=1))
    return train_dataset, valid_dataset, test_dataset

The following function creates a `SklearnTrainer`, fits it, and returns the result.

In [4]:
def train_sklearn(num_cpus: int, use_gpu: bool = False) -> Result:
    """Fit a random forest classifier with ``SklearnTrainer``.

    Args:
        num_cpus: Number of CPUs requested for the trainer when running on
            CPU. Not used when ``use_gpu`` is True.
        use_gpu: If True, train a cuML random forest with 1 CPU and 1 GPU
            instead of the scikit-learn estimator.

    Returns:
        The ``Result`` returned by ``trainer.fit()``. Its metrics are also
        printed.

    Raises:
        RuntimeError: If ``use_gpu`` is True but cuML could not be imported.
    """
    if use_gpu:
        if not cuMLRandomForestClassifier:
            raise RuntimeError(
                "cuML must be installed for GPU enabled sklearn estimators."
            )

    train_ds, valid_ds, _unused_test_ds = prepare_data()

    # Encode the categorical column, then standardize two of the numeric ones.
    encode_categoricals = OrdinalEncoder(["categorical_column"])
    scale_numericals = StandardScaler(columns=["mean radius", "mean texture"])
    preprocessing_steps = Chain(encode_categoricals, scale_numericals)

    # Pick the estimator class and the matching resource request together.
    estimator_cls = cuMLRandomForestClassifier if use_gpu else RandomForestClassifier
    resources = {"CPU": 1, "GPU": 1} if use_gpu else {"CPU": num_cpus}

    sklearn_trainer = SklearnTrainer(
        estimator=estimator_cls(),
        label_column="target",
        datasets={"train": train_ds, "valid": valid_ds},
        preprocessor=preprocessing_steps,
        cv=5,
        scaling_config={"trainer_resources": resources},
    )
    fit_result = sklearn_trainer.fit()
    print(fit_result.metrics)
    return fit_result

Once we have the result, we can do batch inference on the obtained model. Let's define a utility function for this.

In [5]:
def predict_sklearn(result: Result, use_gpu: bool = False):
    """Run batch inference with a trained model and print the predicted labels.

    Args:
        result: Training result whose ``checkpoint`` is loaded into a
            ``BatchPredictor``.
        use_gpu: If True, request one GPU per prediction worker; otherwise
            request none.
    """
    # Only the third dataset (the one without the target column) is needed.
    unlabeled_dataset = prepare_data()[2]

    predictor = BatchPredictor.from_checkpoint(result.checkpoint, SklearnPredictor)
    raw_predictions = predictor.predict(
        unlabeled_dataset,
        num_gpus_per_worker=int(use_gpu),
    )
    # Binarize: values above 0.5 become 1, everything else becomes 0.
    thresholded = raw_predictions.map_batches(
        lambda df: (df > 0.5).astype(int), batch_format="pandas"
    )
    predicted_labels = thresholded.to_pandas(limit=float("inf"))
    print(f"PREDICTED LABELS\n{predicted_labels}")

Now we can run the training:

In [6]:
# Train on CPU, requesting 2 CPUs for the trainer.
result = train_sklearn(num_cpus=2, use_gpu=False)

2022-05-19 11:56:26,664	INFO services.py:1483 -- View the Ray dashboard at http://127.0.0.1:8266


Trial name,status,loc,iter,total time (s),fit_time
SklearnTrainer_564d9_00000,TERMINATED,127.0.0.1:12221,1,17.1905,2.48662


[2m[33m(raylet)[0m 2022-05-19 11:56:31,837	INFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=16 --runtime-env-hash=-2010331134
[2m[33m(raylet)[0m 2022-05-19 11:56:34,848	INFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-

[2m[33m(raylet)[0m 2022-05-19 11:56:49,612	INFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_11-56-23_998044_12148/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=59341 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:58305 --redis-password=5241590000000000 --startup-token=28 --runtime-env-hash=-2010331134
[2m[33m(raylet)[0m 2022-05-19 11:56:49,612	INFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=55845 --object-store-name=/tmp/ray/session_2022-

Result for SklearnTrainer_564d9_00000:
  cv:
    fit_time:
    - 2.402121067047119
    - 2.312839984893799
    - 2.3265390396118164
    - 2.325679063796997
    - 2.3602960109710693
    fit_time_mean: 2.34549503326416
    fit_time_std: 0.032384969255539235
    score_time:
    - 0.10820889472961426
    - 0.10829401016235352
    - 0.10703587532043457
    - 0.10512709617614746
    - 0.10840892791748047
    score_time_mean: 0.10741496086120605
    score_time_std: 0.0012465199424455708
    test_score:
    - 0.9625
    - 0.8875
    - 1.0
    - 0.9493670886075949
    - 0.9240506329113924
    test_score_mean: 0.9446835443037976
    test_score_std: 0.03766947497186954
  date: 2022-05-19_11-56-51
  done: false
  experiment_id: 200cbc1e2b84434882732d2053ec45c2
  fit_time: 2.4866180419921875
  hostname: Kais-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 12221
  should_checkpoint: true
  time_since_restore: 17.19045615196228
  time_this_iter_s: 17.19045615196228
  time_

[2m[36m(TrainTrainable pid=12221)[0m   (len(rtype_registry), rtype))
2022-05-19 11:56:51,305	INFO tune.py:753 -- Total run time: 21.67 seconds (20.55 seconds for the tuning loop).


{'valid': {'score_time': 0.10993409156799316, 'test_score': 0.9473684210526315}, 'cv': {'fit_time': array([2.40212107, 2.31283998, 2.32653904, 2.32567906, 2.36029601]), 'score_time': array([0.10820889, 0.10829401, 0.10703588, 0.1051271 , 0.10840893]), 'test_score': array([0.9625    , 0.8875    , 1.        , 0.94936709, 0.92405063]), 'fit_time_mean': 2.34549503326416, 'fit_time_std': 0.032384969255539235, 'score_time_mean': 0.10741496086120605, 'score_time_std': 0.0012465199424455708, 'test_score_mean': 0.9446835443037976, 'test_score_std': 0.03766947497186954}, 'fit_time': 2.4866180419921875, 'time_this_iter_s': 17.19045615196228, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '564d9_00000', 'experiment_id': '200cbc1e2b84434882732d2053ec45c2', 'date': '2022-05-19_11-56-51', 'timestamp': 1652957811, 'time_total_s': 17.19045615196228, 'pid': 12221, 'hostname': 'Kais-MacBook-Pro.local', 'node_ip': '127.0.0.1',

And perform inference on the obtained model:

In [7]:
# Run CPU-only batch inference using the checkpoint from the training result.
predict_sklearn(result, use_gpu=False)

Map Progress (1 actors 1 pending): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.59s/it]
Map_Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 95.33it/s]

PREDICTED LABELS
     predictions
0              1
1              1
2              1
3              1
4              1
..           ...
166            1
167            1
168            0
169            0
170            1

[171 rows x 1 columns]



