From c2acb7ffe21059fae8fdd35b6542f0a821afd9af Mon Sep 17 00:00:00 2001
From: Maksim Smolin <maximsmol@gmail.com>
Date: Sat, 2 May 2020 16:48:35 -0700
Subject: [PATCH] [SGD] Add imagenet example CI (#8150)

---
 ci/jenkins_tests/run_sgd_tests.sh             |  4 ++
 docker/tune_test/requirements.txt             |  1 +
 .../sgd/torch/examples/image_models/args.py   | 17 +++++---
 .../sgd/torch/examples/image_models/train.py  | 16 ++++++--
 .../sgd/torch/examples/image_models/util.py   | 40 +++++++++++++++++++
 5 files changed, 70 insertions(+), 8 deletions(-)
 create mode 100644 python/ray/util/sgd/torch/examples/image_models/util.py

diff --git a/ci/jenkins_tests/run_sgd_tests.sh b/ci/jenkins_tests/run_sgd_tests.sh
index d0eb03450..444b510b0 100755
--- a/ci/jenkins_tests/run_sgd_tests.sh
+++ b/ci/jenkins_tests/run_sgd_tests.sh
@@ -40,6 +40,10 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE}
 $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
     python /ray/python/ray/util/sgd/torch/examples/raysgd_torch_signatures.py
 
+
+$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
+python /ray/python/ray/util/sgd/torch/examples/image_models/train.py --no-gpu --mock-data --smoke-test --ray-num-workers=2 --model mobilenetv3_small_075 data
+
 $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
     python /ray/python/ray/util/sgd/torch/examples/train_example.py
 
diff --git a/docker/tune_test/requirements.txt b/docker/tune_test/requirements.txt
index 5c34a5499..afad51a7e 100644
--- a/docker/tune_test/requirements.txt
+++ b/docker/tune_test/requirements.txt
@@ -28,3 +28,4 @@ torch
 torchvision
 xgboost
 zoopt>=0.4.0
+timm
diff --git a/python/ray/util/sgd/torch/examples/image_models/args.py b/python/ray/util/sgd/torch/examples/image_models/args.py
index c6fe0fca0..415040870 100644
--- a/python/ray/util/sgd/torch/examples/image_models/args.py
+++ b/python/ray/util/sgd/torch/examples/image_models/args.py
@@ -427,10 +427,7 @@ parser.add_argument("--local_rank", default=0, type=int)
 
 # ray
 parser.add_argument(
-    "--ray-address",
-    default="auto",
-    metavar="ADDR",
-    help="Ray cluster address. [default=auto]")
+    "--ray-address", metavar="ADDR", help="Ray cluster address.")
 parser.add_argument(
     "-n",
     "--ray-num-workers",
@@ -438,6 +435,16 @@ parser.add_argument(
     default=1,
     metavar="N",
     help="Number of Ray replicas to use. [default=1]")
+parser.add_argument(
+    "--mock-data",
+    action="store_true",
+    default=False,
+    help="Use mocked data for testing. [default=False]")
+parser.add_argument(
+    "--smoke-test",
+    action="store_true",
+    default=False,
+    help="Only run one step for testing. [default=False]")
 
 
 def parse_args():
@@ -460,7 +467,7 @@ def parse_args():
     args.distributed = False  # ray SGD handles this (DistributedSampler)
     args.device = "cuda"  # ray should handle this
 
-    if args.no_gpu == 0 and args.prefetcher:
+    if args.no_gpu and args.prefetcher:
         logging.warning("Prefetcher needs CUDA currently "
                         "(might be a bug in timm). "
                         "Disabling it.")
diff --git a/python/ray/util/sgd/torch/examples/image_models/train.py b/python/ray/util/sgd/torch/examples/image_models/train.py
index 26d070dd9..b8fbd218b 100644
--- a/python/ray/util/sgd/torch/examples/image_models/train.py
+++ b/python/ray/util/sgd/torch/examples/image_models/train.py
@@ -26,13 +26,14 @@ from ray.util.sgd import TorchTrainer
 # from ray.util.sgd.torch import TrainingOperator
 
 from ray.util.sgd.torch.examples.image_models.args import parse_args
+import ray.util.sgd.torch.examples.image_models.util as util
 
 
 def model_creator(config):
     args = config["args"]
 
     model = create_model(
-        "resnet101",  # args.model,
+        args.model,
         pretrained=args.pretrained,
         num_classes=args.num_classes,
         drop_rate=args.drop,
@@ -58,6 +59,12 @@ def data_creator(config):
 
     args = config["args"]
 
+    train_dir = join(args.data, "train")
+    val_dir = join(args.data, "val")
+
+    if args.mock_data:
+        util.mock_data(train_dir, val_dir)
+
     # todo: verbose should depend on rank
     data_config = resolve_data_config(vars(args), verbose=True)
 
@@ -137,11 +144,14 @@ def main():
         },
         num_workers=args.ray_num_workers)
 
+    if args.smoke_test:
+        args.epochs = 1
+
     pbar = trange(args.epochs, unit="epoch")
     for i in pbar:
-        trainer.train()
+        trainer.train(num_steps=1 if args.smoke_test else None)
 
-        val_stats = trainer.validate()
+        val_stats = trainer.validate(num_steps=1 if args.smoke_test else None)
         pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))
 
     trainer.shutdown()
diff --git a/python/ray/util/sgd/torch/examples/image_models/util.py b/python/ray/util/sgd/torch/examples/image_models/util.py
new file mode 100644
index 000000000..f06962784
--- /dev/null
+++ b/python/ray/util/sgd/torch/examples/image_models/util.py
@@ -0,0 +1,40 @@
+import random
+import os
+from os.path import join
+
+import numpy as np
+import PIL
+
+
+def mock_data(train_dir, val_dir):
+    """Fill train_dir and val_dir with a tiny fake ImageNet-style dataset.
+
+    Each directory is topped up to 3 class sub-directories ("nXXXXXXXX"),
+    each holding 3 all-black 500x375 JPEGs. Entries already present in a
+    directory count as classes, so a repeated call adds nothing.
+    """
+    # `import PIL` alone does not load the PIL.Image submodule.
+    from PIL import Image
+
+    max_cls_n = 99999999
+    total_classes = 3
+    per_cls = max_cls_n // total_classes
+    max_img_n = 99999999
+    total_imgs = 3
+    per_img = max_img_n // total_imgs
+    blank = Image.fromarray(np.zeros((375, 500, 3), dtype=np.uint8))
+
+    def mock_class(base, n):
+        # Half-open buckets: adjacent indices can never draw the same id.
+        random_cls = random.randrange(per_cls * n, per_cls * (n + 1))
+        sub_dir = join(base, "n{:08d}".format(random_cls))
+        os.makedirs(sub_dir, exist_ok=True)
+        for i in range(total_imgs):
+            random_img = random.randrange(per_img * i, per_img * (i + 1))
+            blank.save(
+                join(sub_dir, "ILSVRC2012_val_{:08d}.JPEG".format(random_img)))
+
+    for base in (train_dir, val_dir):
+        os.makedirs(base, exist_ok=True)
+        for i in range(len(os.listdir(base)), total_classes):
+            mock_class(base, i)