-
Notifications
You must be signed in to change notification settings - Fork 1
/
sct4.m
369 lines (307 loc) · 12.7 KB
/
sct4.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
function [positions, time] = sct4(video_path, img_files, pos, target_sz, show_visualization)
%SCT4 Track a target through an image sequence with four attention-weighted
%   kernelized correlation filter modules.
%
%   [positions, time] = SCT4(video_path, img_files, pos, target_sz, show_visualization)
%
%   Inputs:
%     video_path         - string prepended verbatim to each file name
%                          (no separator is inserted here).
%     img_files          - cell array of image file names, one per frame.
%     pos                - initial target centre; used as [vertical, horizontal]
%                          (it is shifted by [vert_delta, horiz_delta] below).
%     target_sz          - target size, indexed in the same order as pos
%                          (presumably [height, width] - confirm with caller).
%     show_visualization - when true, frames are displayed through show_video
%                          and tracking stops early if the callback returns true.
%
%   Outputs:
%     positions - numel(img_files)-by-4 matrix; each row is
%                 [pos([2,1]) - target_sz([2,1])/2, pos([2,1]) + target_sz([2,1])/2].
%                 Rows of frames that were never processed (early stop) stay zero.
%                 Values are multiplied by 2 when the sequence was tracked at
%                 half resolution.
%     time      - accumulated tic/toc time of the per-frame tracking work; image
%                 loading, grayscale conversion, resizing and visualization are
%                 outside the timed section.
%
%   Per frame (after the first) the function localises the target with the
%   ranked module set built in the previous frame, then updates the attention
%   map estimators, retrains every module and re-ranks the modules by their
%   weighted training error.
%
%   Note: variables ending with 'f' are in the Fourier domain.

    %% Parameter setting
    % Feature & Kernel parameters setting
    interp_factor = 0.02;           % interpolation rate for model / attention map updates
    kernel.sigma = 0.5;
    kernel.poly_a = 1;
    kernel.poly_b = 9;
    features.gray = false;
    features.hog = true;
    features.hog_orientations = 9;
    cell_size = 4;

    % KCF parameters setting
    padding = 1.5;                  % extra area surrounding the target
    lambda = 1e-4;                  % regularization
    output_sigma_factor = 0.1;      % spatial bandwidth (proportional to target)

    % Attention map parameters setting
    Nfo = 10;                       % frame-count threshold used by the attention weight logic
    boundary_ratio = 1/3;
    salWeight = [0.3 0.3];          % blend weights: (1) colour/gray map, (2) HOG map
    bSal = [1 1];                   % attention map state: 1 = active, 0 = suspended, -1 = disabled

    % multiple module trackers type initialization
    filterPool(1).kernelType = 'gaussian';
    filterPool(1).featureType = 'color';
    filterPool(2).kernelType = 'polynomial';
    filterPool(2).featureType = 'color';
    filterPool(3).kernelType = 'gaussian';
    filterPool(3).featureType = 'hog';
    filterPool(4).kernelType = 'polynomial';
    filterPool(4).featureType = 'hog';

    % etc.
    time = 0;                                   % to calculate FPS
    positions = zeros(numel(img_files), 4);     % to calculate precision

    %% Tracker initialization
    % if the target is large, lower the resolution
    resize_image = (sqrt(prod(target_sz)) >= 100);
    if resize_image
        pos = floor(pos / 2);
        target_sz = floor(target_sz / 2);
    end
    window_sz = floor(target_sz * (1 + padding));

    % Initialize the constant values & maps
    output_sigma = sqrt(prod(target_sz)) * output_sigma_factor / cell_size;
    yf = fft2(gaussian_shaped_labels(output_sigma, floor(window_sz / cell_size)));
    cos_window = (hann(size(yf,1)) * hann(size(yf,2))');

    % Initial mask for strong saliency map: ones on a border band, zeros in
    % the central region.
    mask = ones(size(yf,1), size(yf,2));
    depthBoundaryX = max(round(size(yf,2)*boundary_ratio), 3);
    depthBoundaryY = max(round(size(yf,1)*boundary_ratio), 3);
    mask( depthBoundaryY:(end-depthBoundaryY+1), depthBoundaryX:(end-depthBoundaryX+1) ) = 0;

    % visualization initialize
    if show_visualization   % create video interface
        update_visualization = show_video(img_files, video_path, resize_image);
    end

    %% Tracking Start
    for frame = 1:numel(img_files)
        % load image
        im = imread([video_path img_files{frame}]);    % gray image for HOG feature
        im2 = im;                                       % color or gray image
        if size(im,3) > 1
            im = rgb2gray(im);
        end
        if resize_image
            im = imresize(im, 0.5);
            im2 = imresize(im2, 0.5);
        end

        tic()

        if frame > 1
            % ---- Localisation with the modules ranked in the previous frame ----
            % HOG feature extraction
            patch = get_subwindow(im, pos, window_sz);
            z = get_features(patch, features, cell_size, []);

            % Color/Gray intensity feature extraction, resized to the HOG grid
            patch2 = get_subwindow(im2, pos, window_sz);
            feature = double(imresize(patch2, [size(z,1), size(z,2)]))/255;
            if(size(feature,3) > 1)     % for color image, concatenate 'LAB' space
                feature = cat(3, feature, rgb2lab(feature) / 255 + 0.5);
            end
            z2 = feature;

            % Attention map from color/gray feature
            if(bSal(1)==1)
                stS{1,1} = evaluate_stSaliency(z2, rf{1,1});
                stS{1,1} = (1-salWeight(1))*cos_window + salWeight(1)*stS{1,1};
            else
                stS{1,1} = cos_window;
            end

            % Attention map from HOG feature
            if(bSal(2)==1)
                stS{1,2} = evaluate_stSaliency(z, rf{1,2});
                stS{1,2} = (1-salWeight(2))*cos_window + salWeight(2)*stS{1,2};
            else
                stS{1,2} = cos_window;
            end

            % Attention map multiplication
            zs = bsxfun(@times, z, stS{1,2});
            zs2 = bsxfun(@times, z2, stS{1,1});
            zf = fft2(zs);
            zf2 = fft2(zs2);

            % response map zero setting
            response = zeros(size(yf));

            % Module-wise correlation filter response estimation
            for ii = 1:4
                switch multiFilters(ii).kernelType
                    case 'gaussian'
                        if strcmp(multiFilters(ii).featureType, 'hog')
                            kzf = gaussian_correlation(zf, multiFilters(ii).model_xf, kernel.sigma);
                        else
                            kzf = gaussian_correlation(zf2, multiFilters(ii).model_xf, kernel.sigma);
                        end
                    case 'polynomial'
                        if strcmp(multiFilters(ii).featureType, 'hog')
                            kzf = polynomial_correlation(zf, multiFilters(ii).model_xf, kernel.poly_a, kernel.poly_b);
                        else
                            kzf = polynomial_correlation(zf2, multiFilters(ii).model_xf, kernel.poly_a, kernel.poly_b);
                        end
                    case 'linear'
                        if strcmp(multiFilters(ii).featureType, 'hog')
                            kzf = linear_correlation(zf, multiFilters(ii).model_xf);
                        else
                            kzf = linear_correlation(zf2, multiFilters(ii).model_xf);
                        end
                end
                aa = multiFilters(ii).model_alphaf .* kzf;
                % Each module contributes with its reliability weight, damped
                % by its rank ii (1/ii^1.5).
                response = response + multiFilters(ii).weight*(1/ii.^1.5)*aa;
            end
            response = real(ifft2(response));

            % determine the target location
            [vert_delta, horiz_delta] = find(response == max(response(:)), 1);
            if vert_delta > size(zf,1) / 2      % wrap around to negative half-space of vertical axis
                vert_delta = vert_delta - size(zf,1);
            end
            if horiz_delta > size(zf,2) / 2     % same for horizontal axis
                horiz_delta = horiz_delta - size(zf,2);
            end
            pos = pos + cell_size * [vert_delta - 1, horiz_delta - 1];
        end

        % ---- Training sample at the (updated) position ----
        % HOG feature extraction
        patch = get_subwindow(im, pos, window_sz);
        x = get_features(patch, features, cell_size, []);

        % Color/gray intensity feature extraction
        patch2 = get_subwindow(im2, pos, window_sz);
        feature = double(imresize(patch2, [size(x,1), size(x,2)]))/255;
        if(size(feature,3) > 1)
            feature = cat(3, feature, rgb2lab(feature) / 255 + 0.5);
        end
        x2 = feature;

        % attentional map estimator initialization & update
        if(frame==1)
            [rf{1,1}, stS{1,1}] = init_stSaliency(x2, mask);
            saliencyMap{1,1} = cos_window;
            [rf{1,2}, stS{1,2}] = init_stSaliency(x, mask);
            saliencyMap{1,2} = cos_window;
        else
            if(frame==2)
                bSal = [1, 1];
            end
            if(bSal(1)==1)
                [rf{1,1}, stS{1,1}] = update_stSaliency(x2, mask, rf{1,1});
            end
            if(bSal(2)==1)
                [rf{1,2}, stS{1,2}] = update_stSaliency(x, mask, rf{1,2});
            end

            % Blend weight shrinks as the estimated map departs from ~mask.
            if(bSal(1)==1)
                salWeight(1) = exp( -3* mean(mean( (~mask - stS{1,1}).^2 )) );
            end
            if(bSal(2)==1)
                salWeight(2) = exp( -3* mean(mean( (~mask - stS{1,2}).^2 )) );
            end
            stS2{1,1} = (1-salWeight(1))*cos_window + salWeight(1)*stS{1,1};
            stS2{1,2} = (1-salWeight(2))*cos_window + salWeight(2)*stS{1,2};

            % initial attention map weight
            if(frame < Nfo)
                salWeight = [0.5 0.5];
            end

            % exception case for attention map weight: at frame Nfo, if both
            % blended maps are far from 1 inside the central (mask==0) region,
            % both attention maps are disabled.
            if( frame==Nfo && mean(mean( (1-stS2{1,1}(mask==0)).^2 )) > 0.35 ...
                    && mean(mean( (1-stS2{1,2}(mask==0)).^2 )) > 0.35)
                bSal(1) = -1;
                bSal(2) = -1;
                salWeight(1) = 0;
                salWeight(2) = 0;
            end

            % attention map weight for full occlusion case: suspend the map
            % and (re)start its counter.
            if( frame>Nfo && mean(mean( (1-stS2{1,1}(mask==0)).^2 )) > 0.4)
                nn = 1;
                bSal(1) = 0;
            end
            if( frame>Nfo && mean(mean( (1-stS2{1,2}(mask==0)).^2 )) > 0.4)
                nn2 = 1;
                bSal(2) = 0;
            end

            % A suspended map is re-enabled once its counter exceeds Nfo.
            if(bSal(1)==0)
                nn = nn + 1;
                if(nn > Nfo)
                    bSal(1) = 1;
                end
            end
            if(bSal(2)==0)
                nn2 = nn2 + 1;
                if(nn2 > Nfo)
                    bSal(2) = 1;
                end
            end

            % update the attention map & its estimator
            stS{1,1} = (1-salWeight(1))*cos_window + salWeight(1)*stS{1,1};
            stS{1,2} = (1-salWeight(2))*cos_window + salWeight(2)*stS{1,2};
            saliencyMap{1,1} = saliencyMap{1,1}*(1 - interp_factor) + stS{1,1}*interp_factor;
            saliencyMap{1,2} = saliencyMap{1,2}*(1 - interp_factor) + stS{1,2}*interp_factor;
        end
        stS{1,1} = saliencyMap{1,1};
        stS{1,2} = saliencyMap{1,2};

        % attention map multiplication for tracking
        x = bsxfun(@times, stS{1,2}, x);
        x2 = bsxfun(@times, stS{1,1}, x2);
        xf = fft2(x);
        xf2 = fft2(x2);

        % Module-wise training
        for ii = 1:4
            switch filterPool(ii).kernelType
                case 'gaussian'
                    if strcmp(filterPool(ii).featureType, 'hog')
                        filterPool(ii).kf = gaussian_correlation(xf, xf, kernel.sigma);
                    else
                        filterPool(ii).kf = gaussian_correlation(xf2, xf2, kernel.sigma);
                    end
                case 'polynomial'
                    if strcmp(filterPool(ii).featureType, 'hog')
                        filterPool(ii).kf = polynomial_correlation(xf, xf, kernel.poly_a, kernel.poly_b);
                    else
                        filterPool(ii).kf = polynomial_correlation(xf2, xf2, kernel.poly_a, kernel.poly_b);
                    end
                case 'linear'
                    if strcmp(filterPool(ii).featureType, 'hog')
                        filterPool(ii).kf = linear_correlation(xf, xf);
                    else
                        filterPool(ii).kf = linear_correlation(xf2, xf2);
                    end
            end
            % equation for fast training; yf is multiplied in later, when the
            % ranked module set is assembled.
            filterPool(ii).dalphaf = 1 ./ (filterPool(ii).kf + lambda);
        end

        % Temporal association (filter update)
        if frame == 1   % first frame, train with a single image
            for ii = 1:4
                filterPool(ii).model_dalphaf = filterPool(ii).dalphaf;
                if strcmp(filterPool(ii).featureType, 'hog')
                    filterPool(ii).model_xf = xf;
                else
                    filterPool(ii).model_xf = xf2;
                end
            end
        else            % subsequent frames, interpolate model
            for ii = 1:4
                filterPool(ii).model_dalphaf = ...
                    (1 - interp_factor) * filterPool(ii).model_dalphaf + interp_factor * filterPool(ii).dalphaf;
                if strcmp(filterPool(ii).featureType, 'hog')
                    filterPool(ii).model_xf = ...
                        (1 - interp_factor) * filterPool(ii).model_xf + interp_factor * xf;
                else
                    filterPool(ii).model_xf = ...
                        (1 - interp_factor) * filterPool(ii).model_xf + interp_factor * xf2;
                end
            end
        end

        % Estimate the priority & reliability for each module
        errs = zeros(4,4);              % errs(jj,ii): weighted error of module ii at ranking step jj
        errWeight = ones(size(yf));
        bin = ones(1,4);                % bin(ii)==1 while module ii has not been ranked yet
        errMaps = zeros(size(yf,1), size(yf,2) ,4);
        for ii = 1:4
            % Squared spatial-domain residual of the module on the current sample.
            errMaps(:,:,ii) = (real(ifft2(filterPool(ii).kf .* yf .* filterPool(ii).model_dalphaf - yf))).^2;
        end
        for jj = 1:4
            if(jj < 4)
                % estimate the module-wise error
                for ii = 1:4
                    errs(jj,ii) = sqrt(sum( vec(errWeight.*errMaps(:,:,ii)) ));
                end

                % Find the next best module. The search is restricted to the
                % modules that are still unranked: comparing the minimum
                % against every column could re-select an already ranked
                % module whose error ties it, leaving two modules for the
                % last slot and breaking the assignments below.
                candidates = find(bin == 1);
                [~, best] = min(errs(jj, candidates));
                idx = candidates(best);
                bin(idx) = 0;

                % For reliability, the error weight is estimated
                errWeight = errWeight .* exp((errMaps(:,:,idx) / max(vec(errMaps(:,:,idx)))));
                errWeight = errWeight / max(vec(errWeight));
            else
                % Only one module is left for the last rank.
                idx = find(bin == 1, 1);
            end

            % For priority, the order of the modules changes
            multiFilters(jj).kernelType = filterPool(idx).kernelType;
            multiFilters(jj).featureType = filterPool(idx).featureType;
            multiFilters(jj).model_alphaf = yf .* filterPool(idx).model_dalphaf;
            multiFilters(jj).model_xf = filterPool(idx).model_xf;
            % The weight always uses the first-step (uniformly weighted) error.
            multiFilters(jj).weight = exp(-0.01*errs(1,idx));
        end

        % save position and timing
        positions(frame,:) = [pos([2,1]) - target_sz([2,1])/2, pos([2,1]) + target_sz([2,1])/2];
        time = time + toc();

        % visualization
        if show_visualization
            box = [pos([2,1]) - target_sz([2,1])/2, target_sz([2,1])];
            stop = update_visualization(frame, box);
            if stop, break, end     % user pressed Esc, stop early
            drawnow
        end
    end

    % Map the stored boxes back to the original image resolution.
    if resize_image
        positions = positions * 2;
    end
end