// bplus-pointers.cc
#include "binsearch.hh"
#include <x86intrin.h>
#include <sys/mman.h>
typedef __m256i reg;
const int B = 16, INF = std::numeric_limits<int>::max();
// number of 16-key blocks needed to hold n keys
constexpr int blocks(int n) {
    return (n + B - 1) / B;
}

// number of keys on the layer above a layer of n keys
constexpr int prev_keys(int n) {
    return (blocks(n) + B) / (B + 1) * B;
}

// number of layers in a tree over n keys
constexpr int height(int n) {
    return (n <= B ? 1 : height(prev_keys(n)) + 1);
}

// where layer h starts in the btree array (layer 0 is the leaf layer)
constexpr int offset(int h) {
    int k = 0, n = N;
    while (h--) {
        k += blocks(n) * B;
        n = prev_keys(n);
    }
    return k;
}

// total number of layers and total (padded) number of keys stored
const int H = height(N), S = offset(H);
// explicit child block indices for every block (B + 1 = 17 children per node);
// potentially faster with hugepages;
// also not needed for the lower layers, but it's hard to change the code
unsigned pointers[offset(H) / B][B + 1];

int *btree;
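// Internal nodes are stored in "permuted" order: the two 4-element halves of
// the middle eight keys (positions 4..11) are swapped. _mm256_packs_epi32
// interleaves its 128-bit lanes, so with this layout the comparison mask
// produced in permuted_rank() comes out in sorted-key order and a single
// tzcnt yields the rank. Leaf nodes are left in plain sorted order.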
void permute(int *node) {
    const reg perm_mask = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
    reg* middle = (reg*) (node + 4);
    reg x = _mm256_loadu_si256(middle);
    x = _mm256_permutevar8x32_epi32(x, perm_mask);
    _mm256_storeu_si256(middle, x);
}
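// Builds the implicit B+ (S+) tree layout over the sorted input array:
// the leaf layer is a padded copy of a, each key slot of an internal layer
// stores the first key of the leftmost leaf in the subtree to its right,
// internal nodes are permuted, and the child block indices are precomputed
// into the pointers table.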
void prepare(int *a, int n) {
    // allocate the tree on 2 MB huge pages, rounded up to S keys
    const int P = 1 << 21, T = (4 * S + P - 1) / P * P;
    btree = (int*) std::aligned_alloc(P, T);
    madvise(btree, T, MADV_HUGEPAGE);

    for (int i = N; i < S; i++)
        btree[i] = INF;
    memcpy(btree, a, 4 * N);

    for (int h = 1; h < H; h++) {
        for (int i = 0; i < offset(h + 1) - offset(h); i++) {
            int k = i / B,
                j = i - k * B;
            k = k * (B + 1) + j + 1; // the child subtree to the right of key j
            // ...and then always descend to the leftmost leaf
            for (int l = 0; l < h - 1; l++)
                k *= (B + 1);
            btree[offset(h) + i] = (k * B < N ? btree[k * B] : INF);
        }
    }

    for (int i = offset(1); i < S; i += B)
        permute(btree + i);

    // precompute child block indices: block (offset(h) / B + k) on layer h
    // has children (offset(h - 1) / B + k * (B + 1) + i) on layer h - 1
    for (int h = H - 1; h > 0; h--)
        for (int k = 0; k < (offset(h + 1) - offset(h)) / B; k++)
            for (int i = 0; i <= B; i++)
                pointers[offset(h) / B + k][i] = offset(h - 1) / B + k * (B + 1) + i;
}
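// Rank of x within a node stored in plain (sorted) order: the index of the
// first of the 16 keys that is greater than x, or 16 if there is none
// (bit 16 of the mask acts as a sentinel). Used on the leaf layer only.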
unsigned direct_rank(reg x, int* y) {
    reg a = _mm256_load_si256((reg*) y);
    reg b = _mm256_load_si256((reg*) (y + 8));

    reg ca = _mm256_cmpgt_epi32(a, x);
    reg cb = _mm256_cmpgt_epi32(b, x);

    int mb = _mm256_movemask_ps((__m256) cb);
    int ma = _mm256_movemask_ps((__m256) ca);

    unsigned mask = (1 << 16);
    mask |= mb << 8;
    mask |= ma;

    return __tzcnt_u32(mask);
}
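// Same rank computation for a permuted internal node: packing the two
// comparison results into 16-bit lanes and taking a byte-level movemask
// yields the mask bits already in sorted-key order thanks to permute().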
unsigned permuted_rank(reg x, int* y) {
    reg a = _mm256_load_si256((reg*) y);
    reg b = _mm256_load_si256((reg*) (y + 8));

    reg ca = _mm256_cmpgt_epi32(a, x);
    reg cb = _mm256_cmpgt_epi32(b, x);

    reg c = _mm256_packs_epi32(ca, cb);
    unsigned mask = _mm256_movemask_epi8(c);

    return __tzcnt_u32(mask) >> 1;
}
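// Returns the smallest element >= _x (or INF if there is none): descend from
// the root block, reading the next block index from the precomputed pointers
// table (its 17-entry row may span two cache lines, hence the two prefetches),
// then finish with a direct rank in the leaf. Broadcasting _x - 1 turns the
// "greater than" comparisons into ">= _x".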
int lower_bound(int _x) {
    unsigned k = offset(H - 1) / B;
    reg x = _mm256_set1_epi32(_x - 1);

    for (int h = H - 1; h > 0; h--) {
        unsigned i = permuted_rank(x, btree + k * B);
        __builtin_prefetch(&pointers[k][0]);
        __builtin_prefetch(&pointers[k][B]);
        k = pointers[k][i];
    }

    unsigned i = direct_rank(x, btree + k * B);
    return btree[k * B + i];
}
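
// A minimal usage sketch (not part of the original file): it assumes that
// binsearch.hh defines the compile-time constant N and that prepare() is
// given a sorted array of N ints. Guarded by a hypothetical macro so it does
// not clash with the benchmark harness that normally drives
// prepare()/lower_bound().
#ifdef BPLUS_POINTERS_EXAMPLE
#include <cstdio>
#include <algorithm>
#include <random>

int main() {
    static int a[N];
    std::mt19937 rng(42);
    for (int i = 0; i < N; i++)
        a[i] = rng() % 1000000000;
    std::sort(a, a + N);                     // prepare() expects sorted input
    prepare(a, N);
    printf("%d\n", lower_bound(500000000));  // smallest element >= 500000000
    return 0;
}
#endif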