ray/doc/source/ray-air/doc_code/hvd_trainer.py
xwjiang2010 8d5c07b781
[air/train/docs] Add trainer user guide and update trainer docs (#27389)
This PR adds a user guide to AIR for using Ray Train. It provides a high level overview of the trainers and removes redundant sections.

The main file to review is here: doc/source/ray-air/trainer.rst.

Signed-off-by: xwjiang2010 <xwjiang2010@gmail.com>
Signed-off-by: Richard Liaw <rliaw@berkeley.edu>
Signed-off-by: Kai Fricke <kai@anyscale.com>

Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
Co-authored-by: Kai Fricke <kai@anyscale.com>
2022-08-04 13:59:50 +01:00

76 lines
2.3 KiB
Python

import ray
import ray.train as train
import ray.train.torch # Need this to use `train.torch.get_device()`
import horovod.torch as hvd
import torch
import torch.nn as nn
from ray.air import session, Checkpoint
from ray.train.horovod import HorovodTrainer
from ray.air.config import ScalingConfig
input_size = 1
layer_size = 15
output_size = 1
num_epochs = 3
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.layer1 = nn.Linear(input_size, layer_size)
self.relu = nn.ReLU()
self.layer2 = nn.Linear(layer_size, output_size)
def forward(self, input):
return self.layer2(self.relu(self.layer1(input)))
def train_loop_per_worker():
hvd.init()
dataset_shard = session.get_dataset_shard("train")
model = NeuralNetwork()
device = train.torch.get_device()
model.to(device)
loss_fn = nn.MSELoss()
lr_scaler = 1
optimizer = torch.optim.SGD(model.parameters(), lr=0.1 * lr_scaler)
# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(
optimizer,
named_parameters=model.named_parameters(),
op=hvd.Average,
)
for epoch in range(num_epochs):
model.train()
for inputs, labels in iter(
dataset_shard.to_torch(
label_column="y",
label_column_dtype=torch.float,
feature_column_dtypes=torch.float,
batch_size=32,
)
):
inputs.to(device)
labels.to(device)
outputs = model(inputs)
loss = loss_fn(outputs, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(f"epoch: {epoch}, loss: {loss.item()}")
session.report(
{},
checkpoint=Checkpoint.from_dict(dict(model=model.state_dict())),
)
train_dataset = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
scaling_config = ScalingConfig(num_workers=3)
# If using GPUs, use the below scaling config instead.
# scaling_config = ScalingConfig(num_workers=3, use_gpu=True)
trainer = HorovodTrainer(
train_loop_per_worker=train_loop_per_worker,
scaling_config=scaling_config,
datasets={"train": train_dataset},
)
result = trainer.fit()