From c2acb7ffe21059fae8fdd35b6542f0a821afd9af Mon Sep 17 00:00:00 2001
From: Maksim Smolin
Date: Sat, 2 May 2020 16:48:35 -0700
Subject: [PATCH] [SGD] Add imagenet example CI (#8150)

---
Reviewer notes (text between the three-dash line and the diffstat is not
part of the commit message):
- The line structure of this patch was restored from a whitespace-collapsed
  copy; indentation and blank context lines follow the hunk counts and the
  usual formatting, and should be checked against the target tree.
- util.py: import Image from PIL explicitly. A bare `import PIL` does not
  load the PIL.Image submodule.
- util.py: the random id buckets no longer share an endpoint, and
  mock_data gained a docstring.
- The `index` blob ids below are carried over unchanged and were not
  recomputed for the revised util.py.

 ci/jenkins_tests/run_sgd_tests.sh             |  4 ++
 docker/tune_test/requirements.txt             |  1 +
 .../sgd/torch/examples/image_models/args.py   | 17 +++++---
 .../sgd/torch/examples/image_models/train.py  | 16 ++++++--
 .../sgd/torch/examples/image_models/util.py   | 49 +++++++++++++++++++++++
 5 files changed, 79 insertions(+), 8 deletions(-)
 create mode 100644 python/ray/util/sgd/torch/examples/image_models/util.py

diff --git a/ci/jenkins_tests/run_sgd_tests.sh b/ci/jenkins_tests/run_sgd_tests.sh
index d0eb03450..444b510b0 100755
--- a/ci/jenkins_tests/run_sgd_tests.sh
+++ b/ci/jenkins_tests/run_sgd_tests.sh
@@ -40,6 +40,10 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE}
 $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
     python /ray/python/ray/util/sgd/torch/examples/raysgd_torch_signatures.py
 
+
+$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
+python /ray/python/ray/util/sgd/torch/examples/image_models/train.py --no-gpu --mock-data --smoke-test --ray-num-workers=2 --model mobilenetv3_small_075 data
+
 $SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 $DOCKER_SHA \
     python /ray/python/ray/util/sgd/torch/examples/train_example.py
 
diff --git a/docker/tune_test/requirements.txt b/docker/tune_test/requirements.txt
index 5c34a5499..afad51a7e 100644
--- a/docker/tune_test/requirements.txt
+++ b/docker/tune_test/requirements.txt
@@ -28,3 +28,4 @@ torch
 torchvision
 xgboost
 zoopt>=0.4.0
+timm
diff --git a/python/ray/util/sgd/torch/examples/image_models/args.py b/python/ray/util/sgd/torch/examples/image_models/args.py
index c6fe0fca0..415040870 100644
--- a/python/ray/util/sgd/torch/examples/image_models/args.py
+++ b/python/ray/util/sgd/torch/examples/image_models/args.py
@@ -427,10 +427,7 @@ parser.add_argument("--local_rank", default=0, type=int)
 
 # ray
 parser.add_argument(
-    "--ray-address",
-    default="auto",
-    metavar="ADDR",
-    help="Ray cluster address. [default=auto]")
+    "--ray-address", metavar="ADDR", help="Ray cluster address.")
 parser.add_argument(
     "-n",
     "--ray-num-workers",
@@ -438,6 +435,16 @@ parser.add_argument(
     default=1,
     metavar="N",
     help="Number of Ray replicas to use. [default=1]")
+parser.add_argument(
+    "--mock-data",
+    action="store_true",
+    default=False,
+    help="Use mocked data for testing. [default=False]")
+parser.add_argument(
+    "--smoke-test",
+    action="store_true",
+    default=False,
+    help="Only run one step for testing. [default=False]")
 
 
 def parse_args():
@@ -460,7 +467,7 @@ def parse_args():
     args.distributed = False  # ray SGD handles this (DistributedSampler)
     args.device = "cuda"  # ray should handle this
 
-    if args.no_gpu == 0 and args.prefetcher:
+    if args.no_gpu and args.prefetcher:
         logging.warning("Prefetcher needs CUDA currently "
                         "(might be a bug in timm). "
                         "Disabling it.")
diff --git a/python/ray/util/sgd/torch/examples/image_models/train.py b/python/ray/util/sgd/torch/examples/image_models/train.py
index 26d070dd9..b8fbd218b 100644
--- a/python/ray/util/sgd/torch/examples/image_models/train.py
+++ b/python/ray/util/sgd/torch/examples/image_models/train.py
@@ -26,13 +26,14 @@ from ray.util.sgd import TorchTrainer
 # from ray.util.sgd.torch import TrainingOperator
 
 from ray.util.sgd.torch.examples.image_models.args import parse_args
+import ray.util.sgd.torch.examples.image_models.util as util
 
 
 def model_creator(config):
     args = config["args"]
 
     model = create_model(
-        "resnet101",  # args.model,
+        args.model,
         pretrained=args.pretrained,
         num_classes=args.num_classes,
         drop_rate=args.drop,
@@ -58,6 +59,12 @@ def data_creator(config):
 
     args = config["args"]
 
+    train_dir = join(args.data, "train")
+    val_dir = join(args.data, "val")
+
+    if args.mock_data:
+        util.mock_data(train_dir, val_dir)
+
     # todo: verbose should depend on rank
     data_config = resolve_data_config(vars(args), verbose=True)
 
@@ -137,11 +144,14 @@ def main():
         },
         num_workers=args.ray_num_workers)
 
+    if args.smoke_test:
+        args.epochs = 1
+
     pbar = trange(args.epochs, unit="epoch")
     for i in pbar:
-        trainer.train()
+        trainer.train(num_steps=1 if args.smoke_test else None)
 
-        val_stats = trainer.validate()
+        val_stats = trainer.validate(num_steps=1 if args.smoke_test else None)
         pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))
 
     trainer.shutdown()
diff --git a/python/ray/util/sgd/torch/examples/image_models/util.py b/python/ray/util/sgd/torch/examples/image_models/util.py
new file mode 100644
index 000000000..f06962784
--- /dev/null
+++ b/python/ray/util/sgd/torch/examples/image_models/util.py
@@ -0,0 +1,49 @@
+import random
+import os
+from os.path import join
+
+import numpy as np
+from PIL import Image
+
+
+def mock_data(train_dir, val_dir):
+    """Create a tiny ImageNet-style mock dataset for smoke tests.
+
+    Both directories are created if missing and then topped up to three
+    entries each. Every entry added is a class sub-directory named
+    ``n<8-digit id>`` holding three all-black JPEG images. Entries that
+    already exist are only counted, never modified.
+    """
+    os.makedirs(train_dir, exist_ok=True)
+    os.makedirs(val_dir, exist_ok=True)
+
+    max_cls_n = 99999999
+    total_classes = 3
+    per_cls = max_cls_n // total_classes
+
+    max_img_n = 99999999
+    total_imgs = 3
+    per_img = max_img_n // total_imgs
+
+    def mock_class(base, n):
+        # Each index draws its id from a disjoint bucket (hence the - 1),
+        # so two siblings never end up with the same name.
+        random_cls = random.randint(per_cls * n, per_cls * n + per_cls - 1)
+        sub_dir = join(base, "n{:08d}".format(random_cls))
+        os.makedirs(sub_dir, exist_ok=True)
+
+        for i in range(total_imgs):
+            random_img = random.randint(per_img * i, per_img * i + per_img - 1)
+            file = join(sub_dir,
+                        "ILSVRC2012_val_{:08d}.JPEG".format(random_img))
+
+            Image.fromarray(np.zeros((375, 500, 3),
+                                     dtype=np.uint8)).save(file)
+
+    existing_train_cls = len(os.listdir(train_dir))
+    for i in range(existing_train_cls, total_classes):
+        mock_class(train_dir, i)
+
+    existing_val_cls = len(os.listdir(val_dir))
+    for i in range(existing_val_cls, total_classes):
+        mock_class(val_dir, i)