-
Notifications
You must be signed in to change notification settings - Fork 12
/
lshforest.go
164 lines (148 loc) · 4.33 KB
/
lshforest.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
package lshensemble
import (
"math"
"sort"
)
const (
integrationPrecision = 0.01
)
// NewLshForest default constructor uses 32 bit hash value
var NewLshForest = NewLshForest32
// entry contains the hash key (from minhash signature) and the indexed key
type entry struct {
hashKey string
key interface{}
}
// hashTable is a look-up table implemented as a slice sorted by hash keys.
// Look-up operation is implemented using binary search.
type hashTable []entry
func (h hashTable) Len() int { return len(h) }
func (h hashTable) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h hashTable) Less(i, j int) bool { return h[i].hashKey < h[j].hashKey }
// LshForest represents a MinHash LSH implemented using LSH Forest
// (http://ilpubs.stanford.edu:8090/678/1/2005-14.pdf).
// It supports query-time setting of the MinHash LSH parameters
// L (number of bands) and
// K (number of hash functions per band).
type LshForest struct {
k int
l int
hashTables []hashTable
hashKeyFunc hashKeyFunc
hashValueSize int
numIndexedKeys int
}
func newLshForest(k, l, hashValueSize, initSize int) *LshForest {
if k < 0 || l < 0 {
panic("k and l must be positive")
}
hashTables := make([]hashTable, l)
for i := range hashTables {
hashTables[i] = make(hashTable, 0, initSize)
}
return &LshForest{
k: k,
l: l,
hashValueSize: hashValueSize,
hashTables: hashTables,
hashKeyFunc: hashKeyFuncGen(hashValueSize),
numIndexedKeys: 0,
}
}
// NewLshForest64 uses 64-bit hash values.
func NewLshForest64(k, l, initSize int) *LshForest {
return newLshForest(k, l, 8, initSize)
}
// NewLshForest32 uses 32-bit hash values.
// MinHash signatures with 64 bit hash values will have
// their hash values trimed.
func NewLshForest32(k, l, initSize int) *LshForest {
return newLshForest(k, l, 4, initSize)
}
// NewLshForest16 uses 16-bit hash values.
// MinHash signatures with 64 or 32 bit hash values will have
// their hash values trimed.
func NewLshForest16(k, l, initSize int) *LshForest {
return newLshForest(k, l, 2, initSize)
}
func (f *LshForest) hashKeys(sig []uint64, K int) []string {
hs := make([]string, f.l)
for i := 0; i < f.l; i++ {
hs[i] = f.hashKeyFunc(sig[i*f.k : i*f.k+K])
}
return hs
}
// Add a key with MinHash signature into the index.
// The key won't be searchable until Index() is called.
func (f *LshForest) Add(key interface{}, sig []uint64) {
// Generate hash keys
hs := f.hashKeys(sig, f.k)
// Insert keys into the hash tables by appending.
for i := range f.hashTables {
f.hashTables[i] = append(f.hashTables[i], entry{hs[i], key})
}
}
// Index makes all the keys added searchable.
func (f *LshForest) Index() {
for i := range f.hashTables {
sort.Sort(f.hashTables[i])
}
f.numIndexedKeys = len(f.hashTables[0])
}
// Query returns candidate keys given the query signature and parameters.
func (f *LshForest) Query(sig []uint64, K, L int, out chan<- interface{}, done <-chan struct{}) {
if K == -1 {
K = f.k
}
if L == -1 {
L = f.l
}
prefixSize := f.hashValueSize * K
// Generate hash keys
hashKeys := f.hashKeys(sig, K)
seens := make(map[interface{}]bool)
for i := 0; i < L; i++ {
// Only search over indexed keys.
ht := f.hashTables[i][:f.numIndexedKeys]
hk := hashKeys[i]
k := sort.Search(len(ht), func(x int) bool {
return ht[x].hashKey[:prefixSize] >= hk
})
if k < len(ht) && ht[k].hashKey[:prefixSize] == hk {
for j := k; j < len(ht) && ht[j].hashKey[:prefixSize] == hk; j++ {
key := ht[j].key
if _, seen := seens[key]; seen {
continue
}
seens[key] = true
select {
case out <- key:
case <-done:
return
}
}
}
}
}
// OptimalKL returns the optimal K and L for containment search,
// and the false positive and negative probabilities.
// where x is the indexed domain size, q is the query domain size,
// and t is the containment threshold.
func (f *LshForest) OptimalKL(x, q int, t float64) (optK, optL int, fp, fn float64) {
minError := math.MaxFloat64
for l := 1; l <= f.l; l++ {
for k := 1; k <= f.k; k++ {
currFp := probFalsePositive(x, q, l, k, t, integrationPrecision)
currFn := probFalseNegative(x, q, l, k, t, integrationPrecision)
currErr := currFn + currFp
if minError > currErr {
minError = currErr
optK = k
optL = l
fp = currFp
fn = currFn
}
}
}
return
}