From 77e2ef2eb630cb7caa7ca31b3c406b45f8643192 Mon Sep 17 00:00:00 2001
From: Jiao
Date: Sat, 16 Jul 2022 17:58:21 -0700
Subject: [PATCH] [AIR] Update Torch benchmarks with documentation (#26631)

Co-authored-by: Richard Liaw
---
 doc/source/ray-air/benchmarks.rst      | 59 +++++++++++++++++++
 .../workloads/gpu_batch_prediction.py  | 33 ++---------
 .../workloads/pytorch_training_e2e.py  | 48 +++++----------
 3 files changed, 79 insertions(+), 61 deletions(-)

diff --git a/doc/source/ray-air/benchmarks.rst b/doc/source/ray-air/benchmarks.rst
index 788713357..3ced20ed5 100644
--- a/doc/source/ray-air/benchmarks.rst
+++ b/doc/source/ray-air/benchmarks.rst
@@ -103,8 +103,67 @@ XGBoost parameters were kept as defaults for xgboost==1.6.1
 this task.
 
 - `python xgboost_benchmark.py --size 100GB`
 
+GPU image batch prediction
+----------------------------------------------------
+
+This task uses the BatchPredictor module to process different amounts of data
+using a PyTorch pre-trained ResNet model.
+
+We test the performance across different cluster sizes and data sizes.
+
+- `GPU image batch prediction script`_
+
+.. list-table::
+
+   * - **Cluster Setup**
+     - **Data Size**
+     - **Performance**
+     - **Command**
+   * - 1 g3.8xlarge node
+     - 1 GB (1623 images)
+     - 72.59 s (22.3 images/sec)
+     - `python gpu_batch_prediction.py --data-size-gb=1`
+   * - 1 g3.8xlarge node
+     - 20 GB (32460 images)
+     - 1213.48 s (26.76 images/sec)
+     - `python gpu_batch_prediction.py --data-size-gb=20`
+   * - 8 g3.8xlarge nodes
+     - 100 GB (162300 images)
+     - 784.91 s (206.78 images/sec)
+     - `python gpu_batch_prediction.py --data-size-gb=100`
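+
+Up to rounding, the throughput column is simply the image count divided by the
+wall-clock time; for example, for the 100 GB row:
+
+.. code-block:: python
+
+    # 162300 images processed in 784.91 s
+    print(round(162300 / 784.91, 2))  # 206.78 images/sec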
+
+
+GPU image training
+------------------------
+
+This task uses the TorchTrainer module to train a PyTorch ResNet model
+on different amounts of data.
+
+We test the performance across different cluster sizes and data sizes.
+
+- `GPU image training script`_
+
+
+.. list-table::
+
+   * - **Cluster Setup**
+     - **Data Size**
+     - **Performance**
+     - **Command**
+   * - 1 g3.8xlarge node (1 worker)
+     - 1 GB (1623 images)
+     - 79.76 s (2 epochs, 40.7 images/sec)
+     - `python pytorch_training_e2e.py --data-size-gb=1`
+   * - 1 g3.8xlarge node (1 worker)
+     - 20 GB (32460 images)
+     - 1388.33 s (2 epochs, 46.76 images/sec)
+     - `python pytorch_training_e2e.py --data-size-gb=20`
+
 
 .. _`Bulk Ingest Script`: https://github.com/ray-project/ray/blob/a30bdf9ef34a45f973b589993f7707a763df6ebf/release/air_tests/air_benchmarks/workloads/data_benchmark.py#L25-L40
 .. _`Bulk Ingest Cluster Configuration`: https://github.com/ray-project/ray/blob/a30bdf9ef34a45f973b589993f7707a763df6ebf/release/air_tests/air_benchmarks/data_20_nodes.yaml#L6-L15
 .. _`XGBoost Training Script`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py#L40-L58
 .. _`XGBoost Prediction Script`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py#L63-L71
 .. _`XGBoost Cluster Configuration`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/xgboost_compute_tpl.yaml#L6-L24
+.. _`GPU image batch prediction script`: https://github.com/ray-project/ray/blob/cec82a1ced631525a4d115e4dc0c283fa4275a7f/release/air_tests/air_benchmarks/workloads/gpu_batch_prediction.py#L18-L49
+.. _`GPU image training script`: https://github.com/ray-project/ray/blob/cec82a1ced631525a4d115e4dc0c283fa4275a7f/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py#L95-L106
\ No newline at end of file
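Condensed, the post-patch prediction pipeline in the diff below reduces to the
following sketch. It uses only calls that appear in the diff; the dataset path
is the 1 GB dataset from the tables above, a GPU cluster is assumed, and the
real `preprocess()` (torchvision transforms into a TensorArray column) is only
stubbed here:

.. code-block:: python

    import ray
    from torchvision.models import resnet18

    from ray.data.datasource import ImageFolderDatasource
    from ray.data.preprocessors import BatchMapper
    from ray.train.batch_predictor import BatchPredictor
    from ray.train.torch import TorchPredictor, to_air_checkpoint

    def preprocess(df):
        # Stand-in: the real preprocess() is defined in the diff below.
        return df

    # ImageFolderDatasource replaces the hand-rolled bytes -> DataFrame
    # conversion that this patch deletes.
    dataset = ray.data.read_datasource(
        ImageFolderDatasource(),
        paths=["s3://air-example-data-2/1G-image-data-synthetic-raw"],
    )

    model = resnet18(pretrained=True)
    preprocessor = BatchMapper(preprocess)
    ckpt = to_air_checkpoint(model=model, preprocessor=preprocessor)

    predictor = BatchPredictor.from_checkpoint(ckpt, TorchPredictor)
    # feature_columns=["image"] restricts the model input to the image column.
    predictor.predict(dataset, num_gpus_per_worker=1, feature_columns=["image"])

Swapping in ImageFolderDatasource removes the PIL/BytesIO glue code and lets
Ray Data produce the image column (plus a label column, hence the new
`feature_columns` argument) natively.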
""" preprocess = transforms.Compose( [ @@ -45,9 +28,7 @@ def preprocess(df: pd.DataFrame) -> pd.DataFrame: transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ] ) - df["image"] = df["image"].map(preprocess) - df["image"] = df["image"].map(lambda x: x.numpy()) - df["image"] = TensorArray(df["image"]) + df["image"] = TensorArray([preprocess(image.to_numpy()) for image in df["image"]]) return df @@ -57,9 +38,7 @@ def main(data_size_gb: int): data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw" print(f"Running GPU batch prediction with {data_size_gb}GB data from {data_url}") start = time.time() - dataset = ray.data.read_binary_files(paths=data_url) - # TODO(jiaodong): Remove this once ImageFolder #24641 merges - dataset = dataset.map_batches(convert_to_pandas) + dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[data_url]) model = resnet18(pretrained=True) @@ -67,7 +46,7 @@ def main(data_size_gb: int): ckpt = to_air_checkpoint(model=model, preprocessor=preprocessor) predictor = BatchPredictor.from_checkpoint(ckpt, TorchPredictor) - predictor.predict(dataset, num_gpus_per_worker=1) + predictor.predict(dataset, num_gpus_per_worker=1, feature_columns=["image"]) total_time_s = round(time.time() - start, 2) # For structured output integration with internal tooling diff --git a/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py b/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py index 0d44ddfc5..5cd616489 100644 --- a/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py +++ b/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py @@ -2,12 +2,8 @@ import click import time import json import os -import numpy as np import pandas as pd -from io import BytesIO -from typing import List -from PIL import Image from torchvision import transforms from torchvision.models import resnet18 import torch @@ -21,28 +17,13 @@ from ray.data.preprocessors import BatchMapper from ray import train from ray.air import session from ray.train.torch import TorchTrainer +from ray.data.datasource import ImageFolderDatasource -# TODO(jiaodong): Remove this once ImageFolder #24641 merges -def convert_to_pandas(byte_item_list: List[bytes]) -> pd.DataFrame: +def preprocess_image_with_label(df: pd.DataFrame) -> pd.DataFrame: """ - Convert input bytes into pandas DataFrame with image column and value of - TensorArray to prevent serializing ndarray image data. - """ - images = [ - Image.open(BytesIO(byte_item)).convert("RGB") for byte_item in byte_item_list - ] - images = [np.asarray(image) for image in images] - # Dummy label since we're only testing training throughput - labels = [1 for _ in range(len(images))] - - return pd.DataFrame({"image": TensorArray(images), "label": labels}) - - -def preprocess(df: pd.DataFrame) -> pd.DataFrame: - """ - User Pytorch code to transform user image. Note we still use pandas as - intermediate format to hold images as shorthand of python dictionary. + User Pytorch code to transform user image. Note we still use TensorArray as + intermediate format to hold images for now. 
""" preprocess = transforms.Compose( [ @@ -52,9 +33,9 @@ def preprocess(df: pd.DataFrame) -> pd.DataFrame: transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ] ) - df["image"] = df["image"].map(preprocess) - df["image"] = df["image"].map(lambda x: x.numpy()) - df["image"] = TensorArray(df["image"]) + df["image"] = TensorArray([preprocess(image.to_numpy()) for image in df["image"]]) + # Fix fixed synthetic value for perf benchmark purpose + df["label"] = df["label"].map(lambda _: 1) return df @@ -101,27 +82,26 @@ def train_loop_per_worker(config): @click.command(help="Run Batch prediction on Pytorch ResNet models.") @click.option("--data-size-gb", type=int, default=1) -@click.option("--num-epochs", type=int, default=10) -def main(data_size_gb: int, num_epochs=10): +@click.option("--num-epochs", type=int, default=2) +@click.option("--num-workers", type=int, default=1) +def main(data_size_gb: int, num_epochs=2, num_workers=1): data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw" print( "Running Pytorch image model training with " f"{data_size_gb}GB data from {data_url}" ) - print(f"Training for {num_epochs} epochs.") + print(f"Training for {num_epochs} epochs with {num_workers} workers.") start = time.time() - dataset = ray.data.read_binary_files(paths=data_url) - # TODO(jiaodong): Remove this once ImageFolder #24641 merges - dataset = dataset.map_batches(convert_to_pandas) + dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[data_url]) - preprocessor = BatchMapper(preprocess) + preprocessor = BatchMapper(preprocess_image_with_label) trainer = TorchTrainer( train_loop_per_worker=train_loop_per_worker, train_loop_config={"batch_size": 64, "num_epochs": num_epochs}, datasets={"train": dataset}, preprocessor=preprocessor, - scaling_config={"num_workers": 1, "use_gpu": True}, + scaling_config={"num_workers": num_workers, "use_gpu": True}, ) trainer.fit()