[tune] added MXNet integration callbacks (#10533)

2025-03-06 10:31:39 -05:00 · 2020-09-04 02:06:44 +01:00 · 2020-09-04 02:06:44 +01:00 · 5c3d4a6670
commit 5c3d4a6670
parent ead30ca655
5 changed files with 234 additions and 0 deletions
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@ -39,6 +39,7 @@ MOCK_MODULES = [
    "horovod",
    "horovod.ray",
    "kubernetes",
    "mxnet.model",
    "psutil",
    "ray._raylet",
    "ray.core.generated",
--- a/doc/source/tune/api_docs/integration.rst
+++ b/doc/source/tune/api_docs/integration.rst
@ -23,6 +23,16 @@ Kubernetes (tune.integration.kubernetes)
 .. autofunction:: ray.tune.integration.kubernetes.NamespacedKubernetesSyncer
 .. _tune-integration-mxnet:
 MXNet (tune.integration.mxnet)
 ------------------------------
 .. autoclass:: ray.tune.integration.mxnet.TuneReportCallback
 .. autoclass:: ray.tune.integration.mxnet.TuneCheckpointCallback
 .. _tune-integration-pytorch-lightning:
 PyTorch Lightning (tune.integration.pytorch_lightning)
--- a/python/ray/tune/BUILD
+++ b/python/ray/tune/BUILD
@ -460,6 +460,15 @@ py_test(
    args = ["--smoke-test"]
 )
 # Smoke test for the MXNet Tune example: runs examples/mxnet_example.py
 # with the --smoke-test flag.
 py_test(
    name = "mxnet_example",
    size = "small",
    srcs = ["examples/mxnet_example.py"],
    deps = [":tune_lib"],
    tags = ["exclusive", "example"],
    args = ["--smoke-test"]
 )
 py_test(
    name = "nevergrad_example",
    size = "medium",
--- a/python/ray/tune/examples/mxnet_example.py
+++ b/python/ray/tune/examples/mxnet_example.py
@ -0,0 +1,95 @@
 from functools import partial
 import mxnet as mx
 from ray import tune, logger
 from ray.tune import CLIReporter
 from ray.tune.integration.mxnet import TuneCheckpointCallback, \
    TuneReportCallback
 from ray.tune.schedulers import ASHAScheduler
 def train_mnist_mxnet(config, mnist, num_epochs=10):
    """Fit a small MLP on MNIST with MXNet and hook it up to Ray Tune.

    Args:
        config (dict): Trial hyperparameters. The keys ``batch_size``,
            ``layer_1_size``, ``layer_2_size`` and ``lr`` are read.
        mnist (dict): Dataset mapping providing the ``train_data``,
            ``train_label``, ``test_data`` and ``test_label`` entries.
        num_epochs (int): Value passed to ``fit`` as ``num_epoch``.
    """
    bs = config["batch_size"]
    training_batches = mx.io.NDArrayIter(
        mnist["train_data"], mnist["train_label"], bs, shuffle=True)
    validation_batches = mx.io.NDArrayIter(
        mnist["test_data"], mnist["test_label"], bs)

    # Network: flatten -> FC/ReLU -> FC/ReLU -> FC(10) -> softmax output.
    net = mx.sym.flatten(data=mx.sym.var("data"))
    net = mx.sym.FullyConnected(data=net, num_hidden=config["layer_1_size"])
    net = mx.sym.Activation(data=net, act_type="relu")
    net = mx.sym.FullyConnected(data=net, num_hidden=config["layer_2_size"])
    net = mx.sym.Activation(data=net, act_type="relu")
    # MNIST has 10 classes, hence 10 output units.
    net = mx.sym.FullyConnected(data=net, num_hidden=10)
    # Softmax with cross entropy loss
    net = mx.sym.SoftmaxOutput(data=net, name="softmax")

    # The module is bound to the CPU context.
    module = mx.mod.Module(symbol=net, context=mx.cpu())

    sgd_settings = {"learning_rate": config["lr"]}
    speedometer = mx.callback.Speedometer(bs, 100)
    # Tune receives MXNet's "accuracy" metric under the name "mean_accuracy".
    report_to_tune = TuneReportCallback({"mean_accuracy": "accuracy"})
    checkpoint_to_tune = TuneCheckpointCallback(
        filename="mxnet_cp", frequency=3)

    module.fit(
        training_batches,
        eval_data=validation_batches,
        optimizer="sgd",
        optimizer_params=sgd_settings,
        eval_metric="acc",
        batch_end_callback=speedometer,
        eval_end_callback=report_to_tune,
        epoch_end_callback=checkpoint_to_tune,
        num_epoch=num_epochs)
 def tune_mnist_mxnet(num_samples=10, num_epochs=10):
    """Download MNIST and launch a Ray Tune search over the MXNet MLP.

    Args:
        num_samples (int): Forwarded to ``tune.run`` as ``num_samples``.
        num_epochs (int): Epochs given to each training run; also used as
            the ASHA scheduler's ``max_t``.
    """
    logger.info("Downloading MNIST data...")
    mnist_data = mx.test_utils.get_mnist()
    logger.info("Got MNIST data, starting Ray Tune.")

    # Search space sampled by Tune for every trial.
    search_space = dict(
        layer_1_size=tune.choice([32, 64, 128]),
        layer_2_size=tune.choice([64, 128, 256]),
        lr=tune.loguniform(1e-3, 1e-1),
        batch_size=tune.choice([32, 64, 128]),
    )

    asha = ASHAScheduler(
        metric="mean_accuracy",
        mode="max",
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    cli_reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    # Bind the dataset and epoch count so the trainable only takes ``config``.
    trainable = partial(
        train_mnist_mxnet, mnist=mnist_data, num_epochs=num_epochs)

    tune.run(
        trainable,
        resources_per_trial=dict(cpu=1),
        config=search_space,
        num_samples=num_samples,
        scheduler=asha,
        progress_reporter=cli_reporter,
        name="tune_mnist_mxnet")
 if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    # parse_known_args: arguments this script does not define are ignored.
    known_args, _ignored = cli.parse_known_args()

    # A smoke test runs one sample for one epoch; otherwise ten of each.
    budget = 1 if known_args.smoke_test else 10
    tune_mnist_mxnet(num_samples=budget, num_epochs=budget)
--- a/python/ray/tune/integration/mxnet.py
+++ b/python/ray/tune/integration/mxnet.py
@ -0,0 +1,119 @@
 from typing import Dict, List, Union
 from ray import tune
 from mxnet.model import save_checkpoint
 import os
 class TuneCallback:
    """Common parent type for the MXNet callbacks that talk to Ray Tune.

    It defines no behavior of its own.
    """
 class TuneReportCallback(TuneCallback):
    """MXNet callback that forwards evaluation metrics to Ray Tune.

    Pass an instance to MXNet as the ``eval_end_callback``.

    Args:
        metrics (str|list|dict): Metrics to report to Tune. A single string
            is treated as a one-element list. If this is a list, each item
            is the metric key reported by MXNet, and it will be reported
            under the same name to Tune. If this is a dict, each key is the
            name reported to Tune and the respective value is the metric
            key reported by MXNet. If empty or ``None``, every metric
            MXNet provides is reported under its own name.

    Example:

    .. code-block:: python

        from ray.tune.integration.mxnet import TuneReportCallback

        # mlp_model is a MXNet model
        mlp_model.fit(
            train_iter,
            # ...
            eval_metric="acc",
            eval_end_callback=TuneReportCallback({
                "mean_accuracy": "accuracy"
            }))

    """

    def __init__(self,
                 metrics: Union[None, str, List[str], Dict[str, str]] = None):
        # Normalize a lone metric name into a list so __call__ can iterate.
        self._metrics = [metrics] if isinstance(metrics, str) else metrics

    def __call__(self, param):
        """Report the metrics found on ``param.eval_metric`` to Tune.

        Does nothing when ``param.eval_metric`` is falsy. A requested metric
        that MXNet did not provide raises ``KeyError``.
        """
        eval_metric = param.eval_metric
        if not eval_metric:
            return

        available = dict(eval_metric.get_name_value())
        wanted = self._metrics
        if not wanted:
            # No selection configured: pass everything through unchanged.
            payload = available
        elif isinstance(wanted, dict):
            # Mapping of Tune name -> MXNet name.
            payload = {
                tune_name: available[mx_name]
                for tune_name, mx_name in wanted.items()
            }
        else:
            payload = {name: available[name] for name in wanted}
        tune.report(**payload)
 class TuneCheckpointCallback(TuneCallback):
    """MXNet callback that writes checkpoints into Tune checkpoint dirs.

    Pass an instance to the ``epoch_end_callback`` of the MXNet model. A
    checkpoint is written whenever the epoch number handed to the callback
    is a multiple of ``frequency``.

    Checkpoints are currently not registered if no ``tune.report()`` call
    is made afterwards. You have to use this in conjunction with the
    ``TuneReportCallback`` to work!

    Args:
        filename (str): Filename of the checkpoint within the checkpoint
            directory. Defaults to "checkpoint".
        frequency (int): Integer indicating how often checkpoints should be
            saved. Defaults to 1.

    Example:

    .. code-block:: python

        from ray.tune.integration.mxnet import (
            TuneCheckpointCallback, TuneReportCallback)

        # mlp_model is a MXNet model
        mlp_model.fit(
            train_iter,
            # ...
            eval_metric="acc",
            eval_end_callback=TuneReportCallback({
                "mean_accuracy": "accuracy"
            }),
            epoch_end_callback=TuneCheckpointCallback(
                filename="mxnet_cp",
                frequency=3
            ))

    """

    def __init__(self, filename: str = "checkpoint", frequency: int = 1):
        self._frequency = frequency
        self._filename = filename

    def __call__(self, epoch, sym, arg, aux):
        """Save a checkpoint for ``epoch`` if it is a multiple of frequency.

        The arguments are handed to ``save_checkpoint`` unchanged, with the
        checkpoint prefix placed inside the Tune checkpoint directory.
        """
        is_due = epoch % self._frequency == 0
        if is_due:
            with tune.checkpoint_dir(step=epoch) as target_dir:
                prefix = os.path.join(target_dir, self._filename)
                save_checkpoint(prefix, epoch, sym, arg, aux)