-
Notifications
You must be signed in to change notification settings - Fork 7
/
caisson.go
404 lines (341 loc) · 10.5 KB
/
caisson.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
// caisson contains the code used to index
package main
import (
"errors"
"fmt"
"hash/fnv"
"sort"
"strings"
"unicode/utf8"
)
var currentBlockDocumentCount = 0
var bloomFilter []uint64
var currentDocumentCount = 0
var currentBlockStartDocumentCount = 0
// Search the results we need to look at very quickly using only bit operations
// mostly limited by memory access
func Search(queryBits []uint64) []uint32 {
var results []uint32
var res uint64
if len(queryBits) == 0 {
return results
}
// we want to go through the index, stepping though each "shard"
for i := 0; i < len(bloomFilter); i += BloomSize {
// preload the res with the result of the first queryBit and if it's not 0 then we continue
// if it is 0 it means nothing can be a match so we don't need to do anything
res = bloomFilter[queryBits[0]+uint64(i)]
// we don't need to look at the first queryBit anymore so start at one
// then go through each long looking to see if we keep a match anywhere
for j := 1; j < len(queryBits); j++ {
res = res & bloomFilter[queryBits[j]+uint64(i)]
// if we have 0 meaning no bits set we should bail out because there is nothing more to do here
// as we cannot have a match even if further queryBits have something set
if res == 0 {
break
}
}
// if we have a non 0 value that means at least one bit is set indicating a match
// so now we need to go through each bit and work out which document it is
if res != 0 {
for j := 0; j < DocumentsPerBlock; j++ {
// determine which bits are still set indicating they have all the bits
// set for this query which means we have a potential match
if res&(1<<j) > 0 {
results = append(results, uint32(DocumentsPerBlock*(i/BloomSize)+j))
}
}
}
}
return results
}
// Tokenize returns a slice of tokens for the given text.
func Tokenize(text string) []string {
res := strings.Fields(strings.ToLower(text))
var cres []string
for _, v := range res {
if len(v) >= 3 {
cres = append(cres, v)
}
}
// now we have clean tokens trigram them
var trigrams []string
for _, r := range cres {
switch trigramMethod {
case "merovius":
trigrams = append(trigrams, TrigramsMerovius(r)...)
case "dancantos":
trigrams = append(trigrams, TrigramsDancantos(r)...)
case "ffmiruz":
trigrams = append(trigrams, TrigramsFfmiruz(r)...)
case "jamesrom":
for _, t := range TrigramsJamesrom(r) {
trigrams = append(trigrams, string(t.Bytes()))
}
default:
trigrams = append(trigrams, Trigrams(r)...)
}
}
return trigrams
}
// Itemise given some content will turn it into tokens
// and then use those to create the bit positions we need to
// set for our bloomFilter filter index
func Itemise(tokens []string) []bool {
docBool := make([]bool, BloomSize)
for _, token := range tokens {
for _, i := range HashBloom([]byte(token)) {
docBool[i] = true
}
}
return docBool
}
// Trigrams takes in text and returns its trigrams
// Attempts to be as efficient as possible
func Trigrams(text string) []string {
var runes = []rune(text)
// if we have less than or 2 runes we cannot do anything so bail out
if len(runes) <= 2 {
return []string{}
}
// we always need this many ngrams, so preallocate to avoid expanding the slice
// which is the most expensive thing in here according to profiles
ngrams := make([]string, len(runes)-2)
for i := 0; i < len(runes); i++ {
if i+3 < len(runes)+1 {
ngram := runes[i : i+3]
ngrams[i] = string(ngram)
}
}
return ngrams
}
func TrigramsMerovius(text string) []string {
var offsets [3]int
for i := 0; i < 2; i++ {
if len(text) <= offsets[i] {
return nil
}
r, sz := utf8.DecodeRuneInString(text[offsets[i]:])
if r == utf8.RuneError {
return nil
}
offsets[i+1] = offsets[i] + sz
}
out := make([]string, 0, len(text))
for offsets[2] < len(text) {
r, sz := utf8.DecodeRuneInString(text)
if r == utf8.RuneError {
return nil
}
out = append(out, text[offsets[0]:offsets[2]+sz])
copy(offsets[:2], offsets[1:])
offsets[2] += sz
}
return out
}
// Trigrams takes in text and returns its trigrams
func TrigramsDancantos(text string) []string {
if len(text) < 3 {
return []string{}
}
result := make([]string, len(text)-2)
trigramsDancantos(text, result, 0)
return result
}
// trigramsDancantos takes in text and inserts its trigrams to the result slice starting at location.
// Attempts to be as efficient as possible
func trigramsDancantos(text string, result []string, location int) int {
l := len(text)
if l < 3 {
return 0
}
// set up vars to track locations in the text as we walk
st, mid, end, tmp := 0, 0, 0, 0
// mid = index of second rune
// end = index of third rune
_, mid = utf8.DecodeRuneInString(text)
_, tmp = utf8.DecodeRuneInString(text[mid:])
end = mid + tmp
for end < l {
result[location] = text[st : end+1]
_, tmp = utf8.DecodeRuneInString(text[end:])
// update start, mid, end = old+mid, old_end, new_end=old_end+tmp
st, mid, end = mid, end, end+tmp
location++
}
return len(text) - 2
}
type Trigram [3]rune
// Bytes is the simplest way to turn an array of runes into a slice of bytes.
// There is a faster way to do this, but not needed for this demo.
// See: https://stackoverflow.com/questions/29255746/how-encode-rune-into-byte-using-utf8
func (t Trigram) Bytes() []byte {
return []byte(string(t[:]))
}
// Trigrams takes in text and returns its trigrams.
func TrigramsJamesrom(text string) []Trigram {
runes := []rune(text)
// if we have less than or 2 runes we cannot do anything so bail out
if len(runes) < 3 {
return []Trigram{}
}
// allocate all trigrams
ngrams := make([]Trigram, len(runes)-2)
// create the trigrams
for i := 0; i < len(runes)-2; i++ {
ngrams[i] = Trigram(runes[i : i+3])
}
return ngrams
}
func TrigramsFfmiruz(text string) []string {
var gram [3]int
for i := 0; i < 2; i++ {
size := runeSize((text[gram[i]]))
gram[i+1] = gram[i] + size
}
list := make([]string, 0, len(text))
for gram[2] < len(text) {
size := runeSize((text[gram[2]]))
list = append(list, text[gram[0]:gram[2]+size])
gram[0], gram[1], gram[2] = gram[1], gram[2], gram[2]+size
}
return list
}
// runeSize returns rune size given its first byte.
// Returns 1 if invalid first byte.
func runeSize(b byte) int {
if b < 0xC2 {
return 1
}
if b < 0xE0 {
return 2
}
if b < 0xF0 {
return 3
}
if b < 0xF5 {
return 4
}
return 1
}
// Queryise given some content will turn it into tokens
// and then hash them and store the resulting values into
// a slice which we can use to query the bloom filter
func Queryise(query string) []uint64 {
var queryBits []uint64
for _, w := range Tokenize(query) {
queryBits = append(queryBits, HashBloom([]byte(w))...)
}
// removing duplicates and sorting should in theory improve RAM access
// and hence performance
queryBits = RemoveUInt64Duplicates(queryBits)
sort.Slice(queryBits, func(i, j int) bool {
return queryBits[i] < queryBits[j]
})
return queryBits
}
// RemoveUInt64Duplicates removes duplicate values from uint64 slice
func RemoveUInt64Duplicates(s []uint64) []uint64 {
if len(s) < 2 {
return s
}
sort.Slice(s, func(x, y int) bool { return s[x] > s[y] })
var e = 1
for i := 1; i < len(s); i++ {
if s[i] == s[i-1] {
continue
}
s[e] = s[i]
e++
}
return s[:e]
}
// HashBloom hashes a single token/word 3 times to give us the entry
// locations we need for our bloomFilter filter
func HashBloom(word []byte) []uint64 {
var hashes []uint64
h1 := fnv.New64a()
h2 := fnv.New64()
// 3 hashes is probably OK for our purposes
// but to be really like Bing it should change this
// based on how common/rare the term is where
// rarer terms are hashes more
_, _ = h1.Write(word)
hashes = append(hashes, h1.Sum64()%BloomSize)
h1.Reset()
_, _ = h2.Write(word)
hashes = append(hashes, h2.Sum64()%BloomSize)
_, _ = h1.Write(word)
_, _ = h1.Write([]byte("salt")) // anything works here
hashes = append(hashes, h1.Sum64()%BloomSize)
h1.Reset()
return hashes
}
// Add adds items into the internal bloomFilter used later for pre-screening documents
// note that it fills the filter from right to left, which might not be what you expect
func Add(item []bool) error {
// bailout if we ever get something that will break the index
// because it does not match the size we expect
if len(item) != BloomSize {
return errors.New(fmt.Sprintf("expected to match size %d", BloomSize))
}
// we need to know if we need to add another batch to this index...
// which should only be called if we are building from the start
// or if we need to reset
if currentBlockDocumentCount == 0 || currentBlockDocumentCount == DocumentsPerBlock {
bloomFilter = append(bloomFilter, make([]uint64, BloomSize)...)
currentBlockDocumentCount = 0
// We don't want to do this for the first document, but everything after
// we want to know the offset, so in short trail by 1 BloomSize
if currentDocumentCount != 0 {
currentBlockStartDocumentCount += BloomSize
}
}
// we need to go through each item and set the correct bit
for i, bit := range item {
// if bit is set then we need to flip that bit from its default state, remember this fills from right to left
// which is not what you expect possibly... anyway it does not matter which way it goes
if bit {
bloomFilter[currentBlockStartDocumentCount+i] |= 1 << currentBlockDocumentCount // 0 in this case is the bit we want to flip so it would be 1 if we added document 2 to this block
}
}
// now we increment where we are and where the current block counts are so we can continue to add
currentBlockDocumentCount++
currentDocumentCount++
return nil
}
// PrintIndex prints out the index which can be useful from time
// to time to ensure that bits are being set correctly.
func PrintIndex() {
// display what the bloomFilter filter looks like broken into chunks
for j, i := range bloomFilter {
if j%BloomSize == 0 {
fmt.Println("")
}
fmt.Printf("%064b\n", i)
}
}
// Ngrams given input splits it according the requested size
// such that you can get trigrams or whatever else is required
func Ngrams(text string, size int) []string {
var runes = []rune(text)
var ngrams []string
for i := 0; i < len(runes); i++ {
if i+size < len(runes)+1 {
ngram := runes[i : i+size]
ngrams = append(ngrams, string(ngram))
}
}
return ngrams
}
// GetFill returns the % value of how much this doc was filled, allowing for
// determining if the index will be overfilled for this document
func GetFill(doc []bool) float64 {
count := 0
for _, i := range doc {
if i {
count++
}
}
return float64(count) / float64(BloomSize) * 100
}