sparse_autoencoder.py
#!/usr/bin/env python
import numpy as np
import neurolib
from neurolib import T, sigmoid, binary_KL_divergence
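
# Note: `neurolib` is a project-local helper module that is not shown here.
# Based on how it is used below, it is assumed to provide roughly:
#   sigmoid(x): the elementwise logistic function 1 / (1 + exp(-x))
#   T(v): reshape a 1-D vector of length n into an (n, 1) column so it
#       broadcasts across the columns of a matrix
#   binary_KL_divergence(p, q): elementwise p*log(p/q) + (1-p)*log((1-p)/(1-q))
#   flatten_params(W1, W2, b1, b2): concatenate the arrays into one flat
#       vector (the inverse of unflatten_params below)
# These signatures are inferred from usage and may differ from the real module.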

def cost(theta, visible_size, hidden_size,
         weight_decay, sparsity_param, beta, data):
    """Compute the sparse autoencoder cost and gradient.

    visible_size: the number of input units (e.g. 64)
    hidden_size: the number of hidden units (e.g. 25)
    weight_decay: weight decay parameter (lambda in the lecture notes)
    sparsity_param: the desired average activation for the hidden units
        (denoted in the lecture notes by the Greek letter rho, which looks
        like a lower-case "p")
    beta: weight of the sparsity penalty term
    data: a visible_size x num_examples matrix (e.g. 64x10000) containing the
        training data, so data[:, i] is the i-th training example

    The input theta is a flat vector (because an optimizer such as minFunc
    expects the parameters as a vector). We first convert theta to the
    (W1, W2, b1, b2) matrix/vector format so that the code follows the
    notation convention of the lecture notes.
    """
    sparsity_param = float(sparsity_param)
    W1, W2, b1, b2 = unflatten_params(theta, hidden_size, visible_size)
    num_data = data.shape[1]

    # feed-forward pass
    a2 = sigmoid(np.dot(W1, data) + T(b1))
    a3 = sigmoid(np.dot(W2, a2) + T(b2))
    assert a2.shape == (hidden_size, num_data)
    assert a3.shape == (visible_size, num_data)

    # mean squared reconstruction error
    cost = 1.0 / num_data * (0.5 * np.sum((a3 - data)**2))
    # add the weight decay term
    cost += (0.5 * weight_decay) * (np.sum(W1**2) + np.sum(W2**2))

    # add the sparsity penalty: KL divergence between the desired and the
    # actual average hidden activation
    sparsity = np.sum(a2, axis=1) / float(num_data)
    assert sparsity.shape == (hidden_size,)
    s = np.sum(binary_KL_divergence(sparsity_param, sparsity))
    cost += beta * s

    # delta3: backprop error at the output layer (chain rule through the
    # squared error and the sigmoid)
    delta3 = -(data - a3) * a3 * (1 - a3)
    assert delta3.shape == (visible_size, num_data)

    # delta2: backprop error at the hidden layer
    # 1. propagate delta3 back through W2
    delta2 = np.dot(W2.T, delta3)
    # 2. add the derivative of the sparsity penalty
    delta2 += T(beta * ((-sparsity_param / sparsity) +
                        ((1 - sparsity_param) / (1 - sparsity))))
    # 3. multiply by the derivative of the sigmoid
    delta2 *= a2 * (1 - a2)
    assert delta2.shape == (hidden_size, num_data)

    # compute the final gradients
    W1grad = np.dot(delta2, data.T) / float(num_data)
    W2grad = np.dot(delta3, a2.T) / float(num_data)
    # add the weight decay term
    W1grad += weight_decay * W1
    W2grad += weight_decay * W2
    b1grad = np.sum(delta2, axis=1) / float(num_data)
    b2grad = np.sum(delta3, axis=1) / float(num_data)
    assert W1grad.shape == W1.shape
    assert W2grad.shape == W2.shape
    assert b1grad.shape == b1.shape
    assert b2grad.shape == b2.shape

    grad = neurolib.flatten_params(W1grad, W2grad, b1grad, b2grad)
    return cost, grad
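

# A minimal sketch (not part of the original module) of a finite-difference
# gradient check for cost() above. It compares a few randomly chosen entries
# of the analytic gradient against a centered numerical estimate; the helper
# name and the epsilon/num_checks defaults are this sketch's own choices.
def check_gradient(theta, visible_size, hidden_size,
                   weight_decay, sparsity_param, beta, data,
                   epsilon=1e-4, num_checks=20):
    _, analytic = cost(theta, visible_size, hidden_size,
                       weight_decay, sparsity_param, beta, data)
    indices = np.random.choice(theta.size, min(num_checks, theta.size),
                               replace=False)
    for i in indices:
        theta_plus, theta_minus = theta.copy(), theta.copy()
        theta_plus[i] += epsilon
        theta_minus[i] -= epsilon
        cost_plus, _ = cost(theta_plus, visible_size, hidden_size,
                            weight_decay, sparsity_param, beta, data)
        cost_minus, _ = cost(theta_minus, visible_size, hidden_size,
                             weight_decay, sparsity_param, beta, data)
        numeric = (cost_plus - cost_minus) / (2.0 * epsilon)
        print('param %d: analytic %g, numeric %g, diff %g'
              % (i, analytic[i], numeric, abs(analytic[i] - numeric)))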


def initialize_params(hidden_size, visible_size):
    """Accepts the number of hidden units and the number of visible (input)
    units in the sparse autoencoder.
    Initializes the parameters randomly based on the layer sizes.
    Returns a new flat array of size
    2 * hidden_size * visible_size + hidden_size + visible_size.
    """
    assert hidden_size <= visible_size
    # choose weights uniformly from the interval [-r, r]
    r = np.sqrt(6) / np.sqrt(hidden_size + visible_size + 1)
    W1 = np.random.rand(hidden_size, visible_size) * 2 * r - r
    W2 = np.random.rand(visible_size, hidden_size) * 2 * r - r
    b1 = np.zeros(hidden_size)
    b2 = np.zeros(visible_size)

    # Convert the weights and biases to vector form. This "unrolls" (flattens
    # and concatenates) all of the parameters into a single vector, which can
    # then be used with an off-the-shelf optimizer such as minFunc.
    #TODO: jperla: make this a function
    return neurolib.flatten_params(W1, W2, b1, b2)
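

# A small round-trip sanity check (not part of the original module): a flat
# parameter vector produced by initialize_params / neurolib.flatten_params
# should come back unchanged through unflatten_params (defined below).
def check_round_trip(hidden_size=25, visible_size=64):
    theta = initialize_params(hidden_size, visible_size)
    W1, W2, b1, b2 = unflatten_params(theta, hidden_size, visible_size)
    assert np.allclose(neurolib.flatten_params(W1, W2, b1, b2), theta)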


def unflatten_params(theta, hidden_size, visible_size):
    """Accepts a flat 1-D parameter vector theta.
    Pulls out the weight matrices and bias vectors and returns them as
    (W1, W2, b1, b2) for the sparse autoencoder.
    """
    #TODO: jperla: generalize
    hv = hidden_size * visible_size
    W1 = theta[:hv].reshape(hidden_size, visible_size)
    W2 = theta[hv:2*hv].reshape(visible_size, hidden_size)
    b1 = theta[2*hv:2*hv+hidden_size]
    b2 = theta[2*hv+hidden_size:]
    return W1, W2, b1, b2
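

# Usage sketch (not part of the original module): training the autoencoder
# with scipy.optimize.minimize standing in for minFunc. The hyperparameter
# values and the random stand-in data below are illustrative assumptions,
# not values from the original project.
if __name__ == '__main__':
    import scipy.optimize

    visible_size, hidden_size = 64, 25
    weight_decay, sparsity_param, beta = 1e-4, 0.01, 3.0
    data = np.random.rand(visible_size, 10000)  # stand-in for real patches

    theta0 = initialize_params(hidden_size, visible_size)
    # cost() returns (cost, grad), which minimize() accepts when jac=True
    result = scipy.optimize.minimize(
        cost, theta0,
        args=(visible_size, hidden_size, weight_decay, sparsity_param, beta, data),
        method='L-BFGS-B', jac=True, options={'maxiter': 400})
    W1, W2, b1, b2 = unflatten_params(result.x, hidden_size, visible_size)
    print('final cost: %g' % result.fun)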