{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c3192ac4",
   "metadata": {},
   "source": [
    "# Training a model with Sklearn\n",
    "In this example we will train a model in Ray AIR using a Sklearn classifier."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5a4823bf",
   "metadata": {},
   "source": [
    "Let's start by installing our dependencies:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88f4bb39",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# The scikit-learn distribution provides the `sklearn` module; %pip installs into the kernel's environment.\n",
    "%pip install -qU \"ray[tune]\" scikit-learn"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c049c692",
   "metadata": {},
   "source": [
    "Then we need some imports:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c02eb5cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Tuple\n",
    "\n",
    "\n",
    "import ray\n",
    "from ray.data.dataset import Dataset\n",
    "from ray.train.batch_predictor import BatchPredictor\n",
    "from ray.train.sklearn import SklearnPredictor\n",
    "from ray.data.preprocessors import Chain, OrdinalEncoder, StandardScaler\n",
    "from ray.air.result import Result\n",
    "from ray.train.sklearn import SklearnTrainer\n",
    "from ray.air.config import ScalingConfig\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "try:\n",
    "    from cuml.ensemble import RandomForestClassifier as cuMLRandomForestClassifier\n",
    "except ImportError:\n",
    "    cuMLRandomForestClassifier = None"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52e017f1",
   "metadata": {},
   "source": [
    "Next we define a function to load our train, validation, and test datasets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3631ed1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n",
    "    \"\"\"Build the train, validation, and test datasets from the example CSV.\n",
    "\n",
    "    Returns:\n",
    "        A ``(train, valid, test)`` tuple. The CSV is split with\n",
    "        ``test_size=0.3`` into train and validation sets; the test set is the\n",
    "        validation set with the ``target`` column dropped.\n",
    "    \"\"\"\n",
    "    csv_uri = \"s3://anonymous@air-example-data/breast_cancer_with_categorical.csv\"\n",
    "    full_dataset = ray.data.read_csv(csv_uri)\n",
    "    train_split, valid_split = full_dataset.train_test_split(test_size=0.3)\n",
    "    # The test set reuses the validation rows with the label column removed.\n",
    "    unlabeled_split = valid_split.drop_columns([\"target\"])\n",
    "    return train_split, valid_split, unlabeled_split"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8d6c6d17",
   "metadata": {},
   "source": [
    "The following function will create a `SklearnTrainer`, fit it, and return the result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0fd39e42",
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_sklearn(num_cpus: int, use_gpu: bool = False) -> Result:\n",
    "    \"\"\"Fit a random forest classifier with ``SklearnTrainer`` and return the result.\n",
    "\n",
    "    Args:\n",
    "        num_cpus: Number of CPUs requested for the trainer in CPU mode.\n",
    "        use_gpu: When true, use the cuML estimator and request one CPU and one\n",
    "            GPU for the trainer instead.\n",
    "\n",
    "    Returns:\n",
    "        The ``Result`` returned by ``trainer.fit()``.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: If ``use_gpu`` is set but the cuML estimator is unavailable.\n",
    "    \"\"\"\n",
    "    gpu_estimator_missing = use_gpu and not cuMLRandomForestClassifier\n",
    "    if gpu_estimator_missing:\n",
    "        raise RuntimeError(\"cuML must be installed for GPU enabled sklearn estimators.\")\n",
    "\n",
    "    train_split, valid_split, _ = prepare_data()\n",
    "\n",
    "    # Ordinal-encode the categorical column, then standardize two numeric columns.\n",
    "    encoder = OrdinalEncoder([\"categorical_column\"])\n",
    "    scaler = StandardScaler(columns=[\"mean radius\", \"mean texture\"])\n",
    "    preprocessor = Chain(encoder, scaler)\n",
    "\n",
    "    # Pick the trainer resources and estimator class for the requested device.\n",
    "    trainer_resources = {\"CPU\": 1, \"GPU\": 1} if use_gpu else {\"CPU\": num_cpus}\n",
    "    estimator_cls = cuMLRandomForestClassifier if use_gpu else RandomForestClassifier\n",
    "\n",
    "    trainer = SklearnTrainer(\n",
    "        estimator=estimator_cls(),\n",
    "        label_column=\"target\",\n",
    "        datasets={\"train\": train_split, \"valid\": valid_split},\n",
    "        preprocessor=preprocessor,\n",
    "        cv=5,\n",
    "        scaling_config=ScalingConfig(trainer_resources=trainer_resources),\n",
    "    )\n",
    "    result = trainer.fit()\n",
    "    print(result.metrics)\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7a2efb9d",
   "metadata": {},
   "source": [
    "Once we have the result, we can do batch inference on the obtained model. Let's define a utility function for this."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "59eeadd8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_sklearn(result: Result, use_gpu: bool = False) -> None:\n",
    "    \"\"\"Run batch inference with the trained model and print the predicted labels.\n",
    "\n",
    "    Args:\n",
    "        result: Training result whose ``checkpoint`` is loaded into a\n",
    "            ``BatchPredictor``.\n",
    "        use_gpu: When true, request one GPU per prediction worker.\n",
    "    \"\"\"\n",
    "    _, _, test_dataset = prepare_data()\n",
    "\n",
    "    batch_predictor = BatchPredictor.from_checkpoint(\n",
    "        result.checkpoint, SklearnPredictor\n",
    "    )\n",
    "\n",
    "    predictions = batch_predictor.predict(\n",
    "        test_dataset,\n",
    "        # int(use_gpu) is 1 when a GPU is requested and 0 otherwise.\n",
    "        num_gpus_per_worker=int(use_gpu),\n",
    "    )\n",
    "    # Threshold at 0.5 and cast so the output holds integer labels.\n",
    "    predicted_labels = predictions.map_batches(\n",
    "        lambda df: (df > 0.5).astype(int), batch_format=\"pandas\"\n",
    "    )\n",
    "    print(\"PREDICTED LABELS\")\n",
    "    predicted_labels.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d073994",
   "metadata": {},
   "source": [
    "Now we can run the training:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "43f9170a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-06-22 17:27:37,741\tINFO services.py:1477 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8269\u001b[39m\u001b[22m\n",
      "2022-06-22 17:27:39,822\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n",
      "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 44.05it/s]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "== Status ==<br>Current time: 2022-06-22 17:27:59 (running for 00:00:18.31)<br>Memory usage on this node: 10.7/31.0 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/12.9 GiB heap, 0.0/6.45 GiB objects<br>Result logdir: /home/ubuntu/ray_results/SklearnTrainer_2022-06-22_17-27-40<br>Number of trials: 1/1 (1 TERMINATED)<br><table>\n",
       "<thead>\n",
       "<tr><th>Trial name                </th><th>status    </th><th>loc                  </th><th style=\"text-align: right;\">  iter</th><th style=\"text-align: right;\">  total time (s)</th><th style=\"text-align: right;\">  fit_time</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td>SklearnTrainer_9dec8_00000</td><td>TERMINATED</td><td>172.31.43.110:1492629</td><td style=\"text-align: right;\">     1</td><td style=\"text-align: right;\">         15.6842</td><td style=\"text-align: right;\">   2.31571</td></tr>\n",
       "</tbody>\n",
       "</table><br><br>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[36m(SklearnTrainer pid=1492629)\u001b[0m 2022-06-22 17:27:45,647\tWARNING pool.py:591 -- The 'context' argument is not supported using ray. Please refer to the documentation for how to control ray initialization.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Result for SklearnTrainer_9dec8_00000:\n",
      "  cv:\n",
      "    fit_time:\n",
      "    - 2.221003770828247\n",
      "    - 2.215489387512207\n",
      "    - 2.2075674533843994\n",
      "    - 2.222351312637329\n",
      "    - 2.312389612197876\n",
      "    fit_time_mean: 2.235760307312012\n",
      "    fit_time_std: 0.03866614559685742\n",
      "    score_time:\n",
      "    - 0.022464990615844727\n",
      "    - 0.0230865478515625\n",
      "    - 0.02564835548400879\n",
      "    - 0.029137849807739258\n",
      "    - 0.021221637725830078\n",
      "    score_time_mean: 0.02431187629699707\n",
      "    score_time_std: 0.0028120522003997595\n",
      "    test_score:\n",
      "    - 0.9625\n",
      "    - 0.9125\n",
      "    - 0.9875\n",
      "    - 1.0\n",
      "    - 0.9367088607594937\n",
      "    test_score_mean: 0.9598417721518986\n",
      "    test_score_std: 0.032128186960552516\n",
      "  date: 2022-06-22_17-27-59\n",
      "  done: false\n",
      "  experiment_id: f8215019c10e4a81ba2187c38e875365\n",
      "  fit_time: 2.3157050609588623\n",
      "  hostname: ip-172-31-43-110\n",
      "  iterations_since_restore: 1\n",
      "  node_ip: 172.31.43.110\n",
      "  pid: 1492629\n",
      "  should_checkpoint: true\n",
      "  time_since_restore: 15.684244871139526\n",
      "  time_this_iter_s: 15.684244871139526\n",
      "  time_total_s: 15.684244871139526\n",
      "  timestamp: 1655918879\n",
      "  timesteps_since_restore: 0\n",
      "  training_iteration: 1\n",
      "  trial_id: 9dec8_00000\n",
      "  valid:\n",
      "    score_time: 0.03549623489379883\n",
      "    test_score: 0.9532163742690059\n",
      "  warmup_time: 0.0057866573333740234\n",
      "  \n",
      "Result for SklearnTrainer_9dec8_00000:\n",
      "  cv:\n",
      "    fit_time:\n",
      "    - 2.221003770828247\n",
      "    - 2.215489387512207\n",
      "    - 2.2075674533843994\n",
      "    - 2.222351312637329\n",
      "    - 2.312389612197876\n",
      "    fit_time_mean: 2.235760307312012\n",
      "    fit_time_std: 0.03866614559685742\n",
      "    score_time:\n",
      "    - 0.022464990615844727\n",
      "    - 0.0230865478515625\n",
      "    - 0.02564835548400879\n",
      "    - 0.029137849807739258\n",
      "    - 0.021221637725830078\n",
      "    score_time_mean: 0.02431187629699707\n",
      "    score_time_std: 0.0028120522003997595\n",
      "    test_score:\n",
      "    - 0.9625\n",
      "    - 0.9125\n",
      "    - 0.9875\n",
      "    - 1.0\n",
      "    - 0.9367088607594937\n",
      "    test_score_mean: 0.9598417721518986\n",
      "    test_score_std: 0.032128186960552516\n",
      "  date: 2022-06-22_17-27-59\n",
      "  done: true\n",
      "  experiment_id: f8215019c10e4a81ba2187c38e875365\n",
      "  experiment_tag: '0'\n",
      "  fit_time: 2.3157050609588623\n",
      "  hostname: ip-172-31-43-110\n",
      "  iterations_since_restore: 1\n",
      "  node_ip: 172.31.43.110\n",
      "  pid: 1492629\n",
      "  should_checkpoint: true\n",
      "  time_since_restore: 15.684244871139526\n",
      "  time_this_iter_s: 15.684244871139526\n",
      "  time_total_s: 15.684244871139526\n",
      "  timestamp: 1655918879\n",
      "  timesteps_since_restore: 0\n",
      "  training_iteration: 1\n",
      "  trial_id: 9dec8_00000\n",
      "  valid:\n",
      "    score_time: 0.03549623489379883\n",
      "    test_score: 0.9532163742690059\n",
      "  warmup_time: 0.0057866573333740234\n",
      "  \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-06-22 17:27:59,333\tINFO tune.py:734 -- Total run time: 19.09 seconds (18.31 seconds for the tuning loop).\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'valid': {'score_time': 0.03549623489379883, 'test_score': 0.9532163742690059}, 'cv': {'fit_time': array([2.22100377, 2.21548939, 2.20756745, 2.22235131, 2.31238961]), 'score_time': array([0.02246499, 0.02308655, 0.02564836, 0.02913785, 0.02122164]), 'test_score': array([0.9625    , 0.9125    , 0.9875    , 1.        , 0.93670886]), 'fit_time_mean': 2.235760307312012, 'fit_time_std': 0.03866614559685742, 'score_time_mean': 0.02431187629699707, 'score_time_std': 0.0028120522003997595, 'test_score_mean': 0.9598417721518986, 'test_score_std': 0.032128186960552516}, 'fit_time': 2.3157050609588623, 'time_this_iter_s': 15.684244871139526, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '9dec8_00000', 'experiment_id': 'f8215019c10e4a81ba2187c38e875365', 'date': '2022-06-22_17-27-59', 'timestamp': 1655918879, 'time_total_s': 15.684244871139526, 'pid': 1492629, 'hostname': 'ip-172-31-43-110', 'node_ip': '172.31.43.110', 'config': {}, 'time_since_restore': 15.684244871139526, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.0057866573333740234, 'experiment_tag': '0'}\n"
     ]
    }
   ],
   "source": [
    "result = train_sklearn(num_cpus=2, use_gpu=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0daba603",
   "metadata": {},
   "source": [
    "And perform inference on the obtained model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "24b16ede",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-06-22 17:27:59,658\tWARNING read_api.py:260 -- The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n",
      "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 64.73it/s]\n",
      "Map Progress (1 actors 1 pending): 100%|██████████| 1/1 [00:01<00:00,  1.60s/it]\n",
      "Map_Batches: 100%|██████████| 1/1 [00:00<00:00, 71.41it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PREDICTED LABELS\n",
      "{'predictions': 1}\n",
      "{'predictions': 1}\n",
      "{'predictions': 0}\n",
      "{'predictions': 1}\n",
      "{'predictions': 1}\n",
      "{'predictions': 1}\n",
      "{'predictions': 1}\n",
      "{'predictions': 1}\n",
      "{'predictions': 0}\n",
      "{'predictions': 1}\n",
      "{'predictions': 0}\n",
      "{'predictions': 1}\n",
      "{'predictions': 1}\n",
      "{'predictions': 1}\n",
      "{'predictions': 1}\n",
      "{'predictions': 0}\n",
      "{'predictions': 1}\n",
      "{'predictions': 1}\n",
      "{'predictions': 1}\n",
      "{'predictions': 0}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "predict_sklearn(result, use_gpu=False)"
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "cell_metadata_filter": "-all",
   "main_language": "python",
   "notebook_metadata_filter": "-all"
  },
  "kernelspec": {
   "display_name": "Python 3.8.10 ('venv': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "vscode": {
   "interpreter": {
    "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}