forked from gronlund/kmeans1d
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans_linear.cpp
246 lines (227 loc) · 7.61 KB
/
kmeans_linear.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#include "kmeans.h"
#include <cassert>
#include <limits>
/**
* This code implements the matrix searching algorithm
* for computing k-means.
*/
static double oo = std::numeric_limits<double>::max();
kmeans_linear::kmeans_linear(const std::vector<double> &points) : kmeans_dp(points) { }
std::string kmeans_linear::name() { return std::string("linear"); }
std::unique_ptr<kmeans_dp> kmeans_linear::get_instance(std::vector<double> &points) {
return std::unique_ptr<kmeans_dp>(new kmeans_linear(points));
}
std::unique_ptr<kmeans_result> kmeans_linear::compute(size_t k) {
std::unique_ptr<kmeans_result> res(new kmeans_result);
res->cost = 0.0;
base_case(k);
for (size_t c = 2; c <= k; ++c) {
std::swap(row_prev, row);
fill_row(c);
}
res->cost = row[n-1];
return res;
}
/**
* @param i is the number of clusters / row of the matrix.
* @param m is the last point of the clustering.
* @param j is the first point of the last cluster.
* @return C_i[m][j] in the the 1D kmeans paper.
*/
double kmeans_linear::cimj(size_t i, size_t m, size_t j) {
assert(i > 0);
if (m < j) {
return row_prev[m];
} else {
if (j == 0) {
return is.cost_interval_l2(0, m);
}
double best_before = row_prev[j-1];
double last_cluster_cost = is.cost_interval_l2(j, m);
return last_cluster_cost + best_before;
}
}
void kmeans_linear::reduce(size_t row_multiplier, std::vector<size_t> &cols, size_t n, size_t m,
std::vector<size_t> &cols_output, size_t reduce_i) {
// n rows, m columns.
// output is n rows and n columns.
#ifdef DEBUG_KMEANS
printf("[reduce] called with n=%d, m=%d, reduce_i=%d\n", n, m, reduce_i);
#endif
std::vector<size_t> prev_col(m, 0);
std::vector<size_t> next_col(m, 0);
for (size_t i = 1; i < m; ++i) {
prev_col[i] = i-1;
next_col[i] = i+1;
}
next_col[0] = 1;
prev_col[0] = m;
size_t remaining_columns = m;
size_t rowk = 0; // index in rows
size_t colk = 0; // index in cols.
while (remaining_columns > n) {
double val = -cimj(reduce_i, row_multiplier * rowk, cols[colk]);
//printf("rowk=%ld\n", rowk);
double next_val = -cimj(reduce_i, row_multiplier * rowk,
cols[next_col[colk]]);
if (val >= next_val && rowk < n-1) {
rowk += 1;
colk = next_col[colk];
assert(colk < m);
} else if (val >= next_val && rowk == n-1) {
// delete column next_col[colk].
// i.e. update the pointers.
size_t to_delete = next_col[colk];
//assert(to_delete < m);
size_t next = next_col[to_delete];
size_t prev = prev_col[to_delete];
assert(prev == colk);
if (next != m) {
prev_col[next] = prev;
}
if (prev != m) {
next_col[prev] = next;
}
next_col[to_delete] = m;
prev_col[to_delete] = m;
--remaining_columns;
} else if (val < next_val) {
// First adjust pointers. Need to use old pointers later.
size_t old_colk = colk;
if (rowk > 0) {
--rowk;
colk = prev_col[colk];
} else {
colk = next_col[colk];
}
// delete column colk, which means update the pointers.
size_t prev = prev_col[old_colk];
size_t next = next_col[old_colk];
if (prev != m) { // meaning the previous exists.
assert(next_col[prev] == old_colk);
next_col[prev] = next_col[old_colk];
}
if (next != m) { // meaning next exists.
assert(prev_col[next] == old_colk);
prev_col[next] = prev_col[old_colk];
}
prev_col[old_colk] = m;
next_col[old_colk] = m;
--remaining_columns;
}
}
// generate output.
size_t j = 0;
for (size_t i = 0; i < m; ++i) {
if (prev_col[i] != m || next_col[i] != m) {
cols_output[j] = cols[i];
++j;
}
}
}
void kmeans_linear::mincompute(size_t row_multiplier, std::vector<size_t> &cols, size_t n, size_t m,
size_t reduce_i, std::vector<size_t> &cols_output) {
#ifdef DEBUG_KMEANS
printf("[mincompute] Called with n=%d, m=%d, reduce_i=%d\n", n, m, reduce_i);
#endif
if (n == 1) {
size_t r = 0; // r = rows[0]
size_t idx = 0;
double best = cimj(reduce_i, r, cols[0]);
for (size_t i = 1; i < m; ++i) {
size_t c = cols[i];
double val = cimj(reduce_i, r, c);
if (val < best) {
best = val;
idx = i;
}
}
cols_output[0] = cols[idx];
#ifdef DEBUG_KMEANS
printf("[mincompute] returning\n", n, m, reduce_i);
#endif
return;
}
std::vector<size_t> cols_output_reduce(n, 0);
reduce(row_multiplier, cols, n, m, cols_output_reduce, reduce_i);
size_t n_rec = (n + 1) / 2;
std::vector<size_t> output(n_rec, 0);
mincompute(row_multiplier * 2, cols_output_reduce, n_rec, n, reduce_i, output);
#ifdef DEBUG_KMEANS
printf("[mincompute] n = %d\n", n);
for (size_t i = 0; i < n_rec; ++i) {
printf("[mincompute] output[%ld] = %ld\n", i, output[i]);
}
#endif
{ std::vector<size_t> empty; std::swap(empty, cols_output_reduce); }
size_t first = 0; // index into cols.
while (output[0] != cols[first]) ++first;
// iterate odd rows.
for (size_t i = 1; i < n; i+=2) {
size_t current = first; // index into cols
size_t end = current; // index into cols
if (i + 1 < n) {
while (output[(i+1)/2] != cols[end]) ++end;
} else {
end = m-1;
}
size_t best_idx = current; // index into cols.
double best = oo;
for (size_t z = current; z <= end; ++z) {
size_t rowsi = row_multiplier * i;
double val = cimj(reduce_i, rowsi, cols[z]);
if (val < best) {
best = val;
best_idx = z;
}
}
#ifdef DEBUG_KMEANS
printf("[mincompute] %d: cols[best_idx] = %d\n", i, cols[best_idx]);
#endif
cols_output[i] = cols[best_idx];
first = end;
}
for (size_t i = 0; i < n; i+=2) cols_output[i] = output[i/2];
{ std::vector<size_t> empty; std::swap(empty, output); };
#ifdef DEBUG_KMEANS
printf("[mincompute] reduce_i = %d n = %d\n", reduce_i, n);
for (size_t i = 0; i < n; ++i) {
printf("[mincompute] cols_output[%d] = %d\n", i, cols_output[i]);
}
printf("[mincompute] returning\n");
#endif
return;
}
void kmeans_linear::fill_row(size_t k) {
std::vector<size_t> cols(n, 0);
std::vector<size_t> output(n, 0);
for (size_t i = 0; i < n; ++i) {
cols[i] = i;
}
#ifdef DEBUG_KMEANS
printf("Matrix cimj:\n");
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
printf("%.2f ", cimj(k, i, j));
}
printf("\n");
}
printf(" ------ fill_row2 k = %ld ---------\n", k);
#endif
mincompute(1, cols, n, n, k, output);
for (size_t i = 0; i < n; ++i) {
#ifdef DEBUG_KMEANS
printf("output[%d] = %d\n", i, output[i]);
#endif
size_t row = i;
size_t col = output[i];
this->row[i] = cimj(k, row, col);
}
}
void kmeans_linear::base_case(size_t k) {
for (size_t i = 0; i < n; ++i) {
double cost = is.cost_interval_l2(0, i);
row[i] = cost;
}
row[0] = 0.0;
}