mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00

This PR updates the Ray AIR/Tune ipynb examples to use the Tuner() API instead of tune.run(). Signed-off-by: Kai Fricke <kai@anyscale.com> Signed-off-by: Richard Liaw <rliaw@berkeley.edu> Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Signed-off-by: Kai Fricke <coding@kaifricke.com> Co-authored-by: Richard Liaw <rliaw@berkeley.edu> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com>
211 lines
6.9 KiB
Text
211 lines
6.9 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8b66fbef",
|
|
"metadata": {},
|
|
"source": [
|
|
"(tune-horovod-example)=\n",
|
|
"\n",
|
|
"# Using Horovod with Tune\n",
|
|
"\n",
|
|
"```{image} /images/horovod.png\n",
|
|
":align: center\n",
|
|
":alt: Horovod Logo\n",
|
|
":height: 120px\n",
|
|
":target: https://horovod.ai/\n",
|
|
"```\n",
|
|
"\n",
|
|
"```{contents}\n",
|
|
":backlinks: none\n",
|
|
":local: true\n",
|
|
"```\n",
|
|
"\n",
|
|
"## Example"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "82188b4b",
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import time\n",
|
|
"import torch\n",
|
|
"\n",
|
|
"import ray\n",
|
|
"from ray import tune\n",
|
|
"from ray.air import session\n",
|
|
"from ray.train.horovod import HorovodTrainer\n",
|
|
"from ray.air.config import ScalingConfig\n",
|
|
"from ray.tune.tune_config import TuneConfig\n",
|
|
"from ray.tune.tuner import Tuner\n",
|
|
"\n",
|
|
"def sq(x):\n",
|
|
" m2 = 1.0\n",
|
|
" m1 = -20.0\n",
|
|
" m0 = 50.0\n",
|
|
" return m2 * x * x + m1 * x + m0\n",
|
|
"\n",
|
|
"\n",
|
|
"def qu(x):\n",
|
|
" m3 = 10.0\n",
|
|
" m2 = 5.0\n",
|
|
" m1 = -20.0\n",
|
|
" m0 = -5.0\n",
|
|
" return m3 * x * x * x + m2 * x * x + m1 * x + m0\n",
|
|
"\n",
|
|
"\n",
|
|
"class Net(torch.nn.Module):\n",
|
|
" def __init__(self, mode=\"sq\"):\n",
|
|
" super(Net, self).__init__()\n",
|
|
"\n",
|
|
" if mode == \"square\":\n",
|
|
" self.mode = 0\n",
|
|
" self.param = torch.nn.Parameter(torch.FloatTensor([1.0, -1.0]))\n",
|
|
" else:\n",
|
|
" self.mode = 1\n",
|
|
" self.param = torch.nn.Parameter(torch.FloatTensor([1.0, -1.0, 1.0]))\n",
|
|
"\n",
|
|
" def forward(self, x):\n",
|
|
" if not self.mode:\n",
|
|
" return x * x + self.param[0] * x + self.param[1]\n",
|
|
" else:\n",
|
|
" return_val = 10 * x * x * x\n",
|
|
" return_val += self.param[0] * x * x\n",
|
|
" return_val += self.param[1] * x + self.param[2]\n",
|
|
" return return_val\n",
|
|
"\n",
|
|
"\n",
|
|
"def train_loop_per_worker(config):\n",
|
|
" import torch\n",
|
|
" import horovod.torch as hvd\n",
|
|
"\n",
|
|
" hvd.init()\n",
|
|
" device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
|
" mode = config[\"mode\"]\n",
|
|
" net = Net(mode).to(device)\n",
|
|
" optimizer = torch.optim.SGD(\n",
|
|
" net.parameters(),\n",
|
|
" lr=config[\"lr\"],\n",
|
|
" )\n",
|
|
" optimizer = hvd.DistributedOptimizer(optimizer)\n",
|
|
"\n",
|
|
" num_steps = 5\n",
|
|
" print(hvd.size())\n",
|
|
" np.random.seed(1 + hvd.rank())\n",
|
|
" torch.manual_seed(1234)\n",
|
|
" # Broadcast the model parameters and optimizer state from rank 0 to ensure consistent initialization across workers.\n",
|
|
" hvd.broadcast_parameters(net.state_dict(), root_rank=0)\n",
|
|
" hvd.broadcast_optimizer_state(optimizer, root_rank=0)\n",
|
|
"\n",
|
|
" start = time.time()\n",
|
|
" x_max = config[\"x_max\"]\n",
|
|
" for step in range(1, num_steps + 1):\n",
|
|
" features = torch.Tensor(np.random.rand(1) * 2 * x_max - x_max).to(device)\n",
|
|
" if mode == \"square\":\n",
|
|
" labels = sq(features)\n",
|
|
" else:\n",
|
|
" labels = qu(features)\n",
|
|
" optimizer.zero_grad()\n",
|
|
" outputs = net(features)\n",
|
|
" loss = torch.nn.MSELoss()(outputs, labels)\n",
|
|
" loss.backward()\n",
|
|
"\n",
|
|
" optimizer.step()\n",
|
|
" time.sleep(0.1)\n",
|
|
" session.report(dict(loss=loss.item()))\n",
|
|
" total = time.time() - start\n",
|
|
" print(f\"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.\")\n",
|
|
"\n",
|
|
"\n",
|
|
"def tune_horovod(num_workers, num_samples, use_gpu, mode=\"square\", x_max=1.0):\n",
|
|
" horovod_trainer = HorovodTrainer(train_loop_per_worker=train_loop_per_worker,\n",
|
|
" scaling_config=ScalingConfig(\n",
|
|
" num_workers=num_workers,\n",
|
|
" use_gpu=use_gpu),\n",
|
|
" train_loop_config={\n",
|
|
" \"mode\": mode,\n",
|
|
" \"x_max\": x_max})\n",
|
|
"\n",
|
|
" tuner = Tuner(horovod_trainer, param_space={\"train_loop_config\": {\"lr\": tune.uniform(\n",
|
|
" 0.1, 1)}}, tune_config=TuneConfig(mode=\"min\", metric=\"loss\",\n",
|
|
" num_samples=num_samples))\n",
|
|
"\n",
|
|
" result_grid = tuner.fit()\n",
|
|
"\n",
|
|
" print(\"Best hyperparameters found were: \", result_grid.get_best_result().config)\n",
|
|
"\n",
|
|
"\n",
|
|
"if __name__ == \"__main__\":\n",
|
|
" import argparse\n",
|
|
"\n",
|
|
" parser = argparse.ArgumentParser()\n",
|
|
" parser.add_argument(\n",
|
|
" \"--mode\", type=str, default=\"square\", choices=[\"square\", \"cubic\"]\n",
|
|
" )\n",
|
|
" parser.add_argument(\n",
|
|
" \"--learning_rate\", type=float, default=0.1, dest=\"learning_rate\"\n",
|
|
" )\n",
|
|
" parser.add_argument(\"--x_max\", type=float, default=1.0, dest=\"x_max\")\n",
|
|
" parser.add_argument(\"--gpu\", action=\"store_true\")\n",
|
|
" parser.add_argument(\n",
|
|
" \"--smoke-test\", action=\"store_true\", help=(\"Finish quickly for testing.\")\n",
|
|
" )\n",
|
|
" parser.add_argument(\"--num-workers\", type=int, default=2)\n",
|
|
" parser.add_argument(\n",
|
|
" \"--server-address\",\n",
|
|
" type=str,\n",
|
|
" default=None,\n",
|
|
" required=False,\n",
|
|
" help=\"The address of server to connect to if using Ray Client.\",\n",
|
|
" )\n",
|
|
" args, _ = parser.parse_known_args()\n",
|
|
"\n",
|
|
" if args.smoke_test:\n",
|
|
" ray.init(num_cpus=2)\n",
|
|
" elif args.server_address:\n",
|
|
" ray.init(f\"ray://{args.server_address}\")\n",
|
|
"\n",
|
|
" # import ray\n",
|
|
" # ray.init(address=\"auto\") # assumes ray is started with ray up\n",
|
|
"\n",
|
|
" tune_horovod(\n",
|
|
" num_workers=args.num_workers,\n",
|
|
" num_samples=2 if args.smoke_test else 10,\n",
|
|
" use_gpu=args.gpu,\n",
|
|
" mode=args.mode,\n",
|
|
" x_max=args.x_max,\n",
|
|
" )"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.7"
|
|
},
|
|
"orphan": true
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|