# Compute the output of a network with two hidden layers
# (with tanh activation function), when given 'x' as input.
# Parameters are in the list 'L'.
nnet <- function (x, L)
{
  h1 <- tanh (L$b1 + x %*% L$W1)
  h2 <- tanh (L$b2 + h1 %*% L$W2)
  as.vector (L$b3 + h2 %*% L$W3)
}
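
# Quick illustrative check of the shapes 'nnet' expects: 'x' is a vector of
# inputs, b1/W1 map it to the first hidden layer, b2/W2 map the first hidden
# layer to the second, and b3/W3 give a single scalar output.  ('Ltiny',
# with 3 and 2 hidden units, is just an example list for this check.)

Ltiny <- list (b1 = numeric(3), W1 = matrix(0,2,3),
               b2 = numeric(2), W2 = matrix(0,3,2),
               b3 = 0,          W3 = numeric(2))

stopifnot (length (nnet (c(0.5,0.5), Ltiny)) == 1)
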
# Train network with hidden layers of n1 and n2 units on
# data in 'X' and 'y'. Initialize with std. dev. 'sd'. Do
# 'iters' gradient descent iterations with stepsize 'step'.
train <- function (X, y, n1, n2, iters, step, sd=0.1)
{
  # Initialize parameters randomly.

  L <- list()
  n0 <- ncol(X)

  L$b1 <- rnorm (n1,sd=sd)
  L$W1 <- matrix (rnorm(n0*n1,sd=sd), n0, n1)
  L$b2 <- rnorm (n2,sd=sd)
  L$W2 <- matrix (rnorm(n1*n2,sd=sd), n1, n2)
  L$b3 <- rnorm (1,sd=sd)
  L$W3 <- rnorm (n2,sd=sd)

  # Train for 'iters' iterations to minimize squared
  # error predicting 'y'.
  for (i in 1:iters) {

    # Find gradient of squared error (summed over all
    # training examples) with respect to the parameters.

    r <- with gradient (L) {
      e <- 0
      for (j in 1:nrow(X)) {
        o <- nnet (X[j,], L)
        e <- e + (y[j]-o)^2
      }
      e
    }

    g <- attr(r,"gradient")

    if (i %% 100 == 0)
      cat ("Iteration", i, ": Error", round(r,4), "\n")
    # Update parameters to reduce squared error.

    L$b1 <- L$b1 - step * as.vector (g$b1)
    L$W1 <- L$W1 - step * as.vector (g$W1)
    L$b2 <- L$b2 - step * as.vector (g$b2)
    L$W2 <- L$W2 - step * as.vector (g$W2)
    L$b3 <- L$b3 - step * as.vector (g$b3)
    L$W3 <- L$W3 - step * as.vector (g$W3)
  }

  L
}
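
# A rough sketch of how the automatic gradients could be sanity-checked (not
# used by 'train'): compare the derivative of one case's squared error with
# respect to the scalar bias b3 against a central finite difference.  The
# name 'check_b3_grad' and the value of 'eps' are just illustrative choices.

check_b3_grad <- function (L, x, y1, eps=1e-6)
{
  r <- with gradient (L) { (y1 - nnet(x,L))^2 }
  g <- attr(r,"gradient")

  Lp <- L; Lp$b3 <- Lp$b3 + eps
  Lm <- L; Lm$b3 <- Lm$b3 - eps
  fd <- ((y1 - nnet(x,Lp))^2 - (y1 - nnet(x,Lm))^2) / (2*eps)

  c (autodiff=as.vector(g$b3), finite.diff=fd)   # the two should nearly agree
}

# For example, after training below one could run
#   check_b3_grad (L, X[1,], y[1])
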
# Example of learning a 2D function.
set.seed(1)
pdf("smnn.pdf",width=8,height=4.6)
par(mfrow=c(1,2))
truef <- function (X) cos (2*sqrt(X[,1]*X[,2])) - 2*(0.4-X[,1])^2
grid <- seq(0,1,length=101)
Xgrid <- cbind (rep(grid,times=101), rep(grid,each=101))
contour (matrix(truef(Xgrid),101,101), levels=seq(-1,1,by=0.1))
title("True function")
N <- 100
X <- cbind (runif(N), runif(N))
y <- truef (X) + rnorm(N,sd=0.01)
print (system.time (L <- train (X, y, 10, 10, 30000, 0.001)))
print(L)
contour (
  matrix (apply (Xgrid, 1, function (x) nnet (x, L)), 101, 101),
  levels=seq(-1,1,by=0.1))
title("Learned function")
# Confirm that there's no allocation of large (eg, 10 x 100)
# intermediate Jacobian matrices when computing gradients of
# network parameters.
Rprofmemt(nelem=8)
with gradient (L) nnet (c(0.3,0.6), L)