-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
130 lines (107 loc) · 4.2 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import torch
from torchvision import datasets
from torch import nn
from torch.utils.data import DataLoader
import zipfile
import valohai
# Valohai: Define parameters
# Default hyperparameter values for the "train-model" step. The values a run
# actually uses are read back below through valohai.parameters(...).
my_parameters = {
    "batch_size": 64,
    "learning_rate": 0.001,
    "epochs": 5
}

# Valohai: Define inputs
# Names of the data inputs of the step. Both defaults are empty strings, so no
# default file location is declared here.
my_inputs = {
    "train": "",
    "test": ""
}

# Valohai: Define a step with parameters and inputs
# Registers the step name, the Docker image and the defaults declared above.
# NOTE(review): presumably these literals are also what ends up in valohai.yaml,
# so keep them as plain dict literals -- confirm against the valohai-utils docs.
valohai.prepare(step="train-model",
                image="python:3.9",
                default_parameters=my_parameters,
                default_inputs=my_inputs)
# Valohai: read the parameter values that apply to this run.
# valohai.parameters(...) returns the actual runtime values; the numbers in
# my_parameters are only the defaults for valohai.yaml.
learning_rate = valohai.parameters("learning_rate").value
epochs = valohai.parameters("epochs").value
batch_size = valohai.parameters("batch_size").value

# Valohai: resolve the paths of the "train" and "test" input files.
train_data_path = valohai.inputs("train").path()
test_data_path = valohai.inputs("test").path()

# Valohai: load the previously generated train and test data.
# NOTE(review): torch.load unpickles the file, so these inputs must come from a
# trusted source.
train_data = torch.load(train_data_path)
test_data = torch.load(test_data_path)

# Wrap both datasets in data loaders that share the configured batch size.
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size)

# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")
# Define model
# ref: https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html#creating-models
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, two hidden ReLU layers, linear head.

    With the default arguments the layer sizes are 28*28 -> 512 -> 512 -> 10,
    i.e. the network from the PyTorch quickstart tutorial.

    Args:
        in_features: Number of values per sample once the input is flattened.
        hidden_features: Width of each of the two hidden layers.
        num_classes: Number of output scores produced per sample.
    """

    def __init__(
        self,
        in_features: int = 28 * 28,
        hidden_features: int = 512,
        num_classes: int = 10,
    ) -> None:
        super().__init__()
        self.flatten = nn.Flatten()
        # The attribute names and the position of every layer inside the
        # Sequential are kept fixed so the keys of state_dict() do not change.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Flatten ``x`` and return the raw (unnormalised) class scores."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
# Build the network, then move its weights onto the selected device.
model = NeuralNetwork()
model = model.to(device)

# Optimize the model
# Valohai: the learning_rate parameter value is handed to the optimizer
# ref: https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html#optimizing-the-model-parameters
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)
# Define a single training loop
# In a single training loop, the model makes predictions on the training dataset (fed to it in batches),
# and backpropagates the prediction error to adjust the model’s parameters.
def train(dataloader, model, loss_fn, optimizer):
    """Run one pass over ``dataloader`` and update ``model`` in place.

    For every batch the inputs and targets are moved to the module-level
    ``device``, the model makes its predictions, ``loss_fn`` scores them and
    the resulting gradients are applied through ``optimizer``.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches.
        model: Module to train; put into training mode with ``model.train()``.
        loss_fn: Callable taking ``(pred, y)`` and returning a loss on which
            ``backward()`` can be called.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.

    Returns:
        None.
    """
    model.train()
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation: clear stale gradients, compute new ones, apply them.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
# We also check the model’s performance against the test dataset to ensure it is learning.
def test(dataloader, model, loss_fn, epoch):
    """Evaluate ``model`` on ``dataloader`` and log the result as Valohai metadata.

    The loss is averaged over the number of batches and the accuracy is the
    number of correct predictions divided by ``len(dataloader.dataset)``.
    Both are logged, together with ``epoch``, through
    ``valohai.metadata.logger()``.

    Args:
        dataloader: Loader yielding ``(inputs, targets)`` batches; must expose
            ``dataset`` and support ``len()``.
        model: Module to evaluate; put into evaluation mode with ``model.eval()``.
        loss_fn: Callable taking ``(predictions, targets)`` and returning a
            value with ``item()``.
        epoch: Value logged under the ``"epoch"`` key.

    Returns:
        None.
    """
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    model.eval()

    test_loss = 0
    correct = 0
    # No gradients are needed while evaluating.
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            scores = model(inputs)
            batch_loss = loss_fn(scores, targets)
            test_loss += batch_loss.item()
            # A prediction is a hit when the highest-scoring class is the target.
            hits = scores.argmax(1) == targets
            correct += hits.type(torch.float).sum().item()

    test_loss = test_loss / n_batches
    correct = correct / n_samples

    # Valohai: Print out the accuracy and test_loss after each epoch
    with valohai.metadata.logger() as logger:
        logger.log("epoch", epoch)
        logger.log("accuracy", f"{correct:>2f}")
        logger.log("test_loss", f"{test_loss:>8f}")
# The training process is conducted over several iterations (epochs).
# During each epoch, the model learns parameters to make better predictions.
# We print the model’s accuracy and loss at each epoch as Valohai metadata
# One training pass followed by one evaluation pass per epoch.
for epoch in range(epochs):
    train(
        dataloader=train_dataloader,
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
    )
    test(
        dataloader=test_dataloader,
        model=model,
        loss_fn=loss_fn,
        epoch=epoch,
    )

# Valohai: write the trained weights to the execution outputs as model.pth.
model_path = valohai.outputs().path("model.pth")
trained_weights = model.state_dict()
torch.save(trained_weights, model_path)