%DEMEV2 Demonstrate Bayesian classification for the MLP.
%
% Description
% A synthetic two-class two-dimensional dataset X is sampled from a
% mixture of four Gaussians. Each class is associated with two of the
% Gaussians so that the optimal decision boundary is non-linear. A 2-
% layer network with logistic outputs is trained by minimizing the
% cross-entropy error function with an isotropic Gaussian regularizer (one
% hyperparameter for each of the four standard weight groups), using
% the scaled conjugate gradient optimizer. The hyperparameter vector
% ALPHA is re-estimated using the function EVIDENCE. A graph
% is plotted of the optimal, regularised, and unregularised decision
% boundaries. A further plot of the moderated versus unmoderated
% contours is generated.
%
% See also
% EVIDENCE, MLP, SCG, DEMARD, DEMMLP2
%
% Copyright (c) Ian T Nabney (1996-2001)
clc;
disp('This program demonstrates the use of the evidence procedure on')
disp('a two-class problem. It also shows the improved generalisation')
disp('performance that can be achieved with moderated outputs; that is')
disp('predictions where an approximate integration over the true')
disp('posterior distribution is carried out.')
disp(' ')
disp('First we generate a synthetic dataset with two-dimensional input')
disp('sampled from a mixture of four Gaussians. Each class is')
disp('associated with two of the Gaussians so that the optimal decision')
disp('boundary is non-linear.')
disp(' ')
disp('Press any key to see a plot of the data.')
pause;
% Generate the matrix of inputs x and targets t.
rand('state', 423);
randn('state', 423);
ClassSymbol1 = 'r.';
ClassSymbol2 = 'y.';
PointSize = 12;
titleSize = 10;
fh1 = figure;
set(fh1, 'Name', 'True Data Distribution');
whitebg(fh1, 'k');
%
% Generate the data
%
n=200;
% Set up mixture model: 2d data with four centres
% Class 1 is first two centres, class 2 from the other two
mix = gmm(2, 4, 'full');
mix.priors = [0.25 0.25 0.25 0.25];
mix.centres = [0 -0.1; 1.5 0; 1 1; 1 -1];
mix.covars(:,:,1) = [0.625 -0.2165; -0.2165 0.875];
mix.covars(:,:,2) = [0.25 0; 0 0.25];
mix.covars(:,:,3) = [0.2241 -0.1368; -0.1368 0.9759];
mix.covars(:,:,4) = [0.2375 0.1516; 0.1516 0.4125];
[data, label] = gmmsamp(mix, n);
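% gmmsamp also returns the index of the mixture component that generated
% each point, so labels 1 and 2 correspond to class 1 and labels 3 and 4
% to class 2.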
%
% Calculate some useful axis limits
%
x0 = min(data(:,1));
x1 = max(data(:,1));
y0 = min(data(:,2));
y1 = max(data(:,2));
dx = x1-x0;
dy = y1-y0;
expand = 5/100; % Add on 5 percent each way
x0 = x0 - dx*expand;
x1 = x1 + dx*expand;
y0 = y0 - dy*expand;
y1 = y1 + dy*expand;
resolution = 100;
step = dx/resolution;
xrange = [x0:step:x1];
yrange = [y0:step:y1];
%
% Generate the grid
%
[X, Y] = meshgrid(xrange, yrange);
%
% Calculate the class conditional densities, the unconditional densities and
% the posterior probabilities
%
px_j = gmmactiv(mix, [X(:) Y(:)]);
px = reshape(px_j*(mix.priors)',size(X));
post = gmmpost(mix, [X(:) Y(:)]);
p1_x = reshape(post(:, 1) + post(:, 2), size(X));
p2_x = reshape(post(:, 3) + post(:, 4), size(X));
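% By construction p1_x + p2_x = 1, so the 0.5 level of p1_x gives the
% Bayes-optimal decision boundary for this data model.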
plot(data((label<=2),1),data(label<=2,2),ClassSymbol1, 'MarkerSize', ...
PointSize)
hold on
axis([x0 x1 y0 y1])
plot(data((label>2),1),data(label>2,2),ClassSymbol2, 'MarkerSize', ...
PointSize)
% Convert targets to 0-1 encoding
target=[label<=2];
disp(' ')
disp('Press any key to continue')
pause; clc;
disp('Next we create a two-layer MLP network with 6 hidden units and')
disp('one logistic output. We use a separate inverse variance')
disp('hyperparameter for each group of weights (inputs, input bias,')
disp('outputs, output bias) and the weights are optimised with the')
disp('scaled conjugate gradient algorithm. After each 100 iterations')
disp('the hyperparameters are re-estimated twice. There are eight')
disp('cycles of the whole algorithm.')
disp(' ')
disp('Press any key to train the network and determine the hyperparameters.')
pause;
% Set up network parameters.
nin = 2; % Number of inputs.
nhidden = 6; % Number of hidden units.
nout = 1; % Number of outputs.
alpha = 0.01; % Initial prior hyperparameter.
aw1 = 0.01;   % Hyperparameter for first-layer weights.
ab1 = 0.01;   % Hyperparameter for first-layer (hidden unit) biases.
aw2 = 0.01;   % Hyperparameter for second-layer weights.
ab2 = 0.01;   % Hyperparameter for output unit biases.
% Create and initialize network weight vector.
prior = mlpprior(nin, nhidden, nout, aw1, ab1, aw2, ab2);
net = mlp(nin, nhidden, nout, 'logistic', prior);
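% The prior structure from mlpprior holds a separate inverse variance
% (alpha) for each of the four weight groups; with a logistic output
% Netlab minimises the cross-entropy error plus this weight prior term.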
% Set up vector of options for the optimiser.
nouter = 8; % Number of outer loops.
ninner = 2; % Number of inner loops.
options = foptions; % Default options vector.
options(1) = 1; % This provides display of error values.
options(2) = 1.0e-5; % Absolute precision for weights.
options(3) = 1.0e-5; % Precision for objective function.
options(14) = 100; % Number of training cycles in inner loop.
% Train using scaled conjugate gradients, re-estimating the alpha hyperparameters.
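% Each outer cycle first optimises the weights with scg and then calls
% evidence, which re-estimates the alphas from a Gaussian approximation
% to the weight posterior (roughly alpha = gamma/(2*E_W) per group,
% where gamma counts the well-determined parameters).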
for k = 1:nouter
  net = netopt(net, options, data, target, 'scg');
  [net, gamma] = evidence(net, data, target, ninner);
  fprintf(1, '\nRe-estimation cycle %d:\n', k);
  disp([' alpha = ', num2str(net.alpha')]);
  fprintf(1, ' gamma = %8.5f\n\n', gamma);
  disp(' ')
  disp('Press any key to continue.')
  pause;
end
disp(' ')
disp('Network training and hyperparameter re-estimation are now complete.')
disp('Notice that the final error value is close to the number of data')
disp(['points (', num2str(n), ') divided by two.'])
disp('Also, the hyperparameter values differ, which suggests that a single')
disp('hyperparameter would not be so effective.')
disp(' ')
disp('Now we train an MLP without Bayesian regularisation on the')
disp('same dataset, using 400 iterations of scaled conjugate gradient.')
disp(' ')
disp('Press any key to train the network by maximum likelihood.')
pause;
% Train standard network
net2 = mlp(nin, nhidden, nout, 'logistic');
options(14) = 400;
net2 = netopt(net2, options, data, target, 'scg');
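% net2 has no prior, so this is plain maximum-likelihood training of the
% cross-entropy error with no weight-decay term.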
y2g = mlpfwd(net2, [X(:), Y(:)]);
y2g = reshape(y2g(:, 1), size(X));
disp(' ')
disp('We can now plot the function represented by the trained networks.')
disp('We show the decision boundaries (output = 0.5) and the optimal')
disp('decision boundary given by applying Bayes'' theorem to the true')
disp('data model.')
disp(' ')
disp('Press any key to add the boundaries to the plot.')
pause;
% Evaluate predictions.
[yg, ymodg] = mlpevfwd(net, data, target, [X(:) Y(:)]);
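% mlpevfwd returns the usual network outputs (yg) together with the
% moderated outputs (ymodg), which approximately integrate the logistic
% output over the Gaussian posterior on the weights.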
yg = reshape(yg(:,1),size(X));
ymodg = reshape(ymodg(:,1),size(X));
% Bayesian decision boundary
[cB, hB] = contour(xrange,yrange,p1_x,[0.5 0.5],'b-');
[cNb, hNb] = contour(xrange,yrange,yg,[0.5 0.5],'r-');
[cN, hN] = contour(xrange,yrange,y2g,[0.5 0.5],'g-');
set(hB, 'LineWidth', 2);
set(hNb, 'LineWidth', 2);
set(hN, 'LineWidth', 2);
Chandles = [hB(1) hNb(1) hN(1)];
legend(Chandles, 'Bayes', ...
'Reg. Network', 'Network', 3);
disp(' ')
disp('Note how the regularised network predictions are closer to the')
disp('optimal decision boundary, while the unregularised network is')
disp('overtrained.')
disp(' ')
disp('We will now compare moderated and unmoderated outputs for the');
disp('regularised network by showing the contour plot of the posterior')
disp('probability estimates.')
disp(' ')
disp('The first plot shows the moderated predictions and the second')
disp('shows the standard (unmoderated) predictions from the same network.')
disp('These agree at the level 0.5.')
disp('Press any key to continue')
pause
levels = 0:0.1:1;
fh4 = figure;
set(fh4, 'Name', 'Moderated outputs');
hold on
plot(data((label<=2),1),data(label<=2,2),'r.', 'MarkerSize', PointSize)
plot(data((label>2),1),data(label>2,2),'y.', 'MarkerSize', PointSize)
[cNby, hNby] = contour(xrange, yrange, ymodg, levels, 'k-');
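% Moderation pulls the predicted probabilities towards 0.5 away from the
% training data, so these contours are more widely spaced than the
% unmoderated ones plotted next.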
set(hNby, 'LineWidth', 1);
fh5 = figure;
set(fh5, 'Name', 'Unmoderated outputs');
hold on
plot(data((label<=2),1),data(label<=2,2),'r.', 'MarkerSize', PointSize)
plot(data((label>2),1),data(label>2,2),'y.', 'MarkerSize', PointSize)
[cNbm, hNbm] = contour(xrange, yrange, yg, levels, 'k-');
set(hNbm, 'LineWidth', 1);
disp(' ')
disp('Note how the moderated contours are more widely spaced. This shows')
disp('that there is a larger region where the outputs are close to 0.5')
disp('and a smaller region where the outputs are close to 0 or 1.')
disp(' ')
disp('Press any key to exit')
pause
close(fh1);
close(fh4);
close(fh5);