package weakand

import (
	"bufio"
	"crypto/md5"
	"encoding/binary"
	"log"
	"os"
	"sort"

	"github.com/huichen/sego"
)
type DocId uint64 // MD5 hash of document content, truncated to 64 bits.
type TermId int   // Term ids are assigned by Vocab.

type InvertedIndex map[TermId]PostingList
type PostingList []Posting
type Posting struct {
	DocId
	TF int // Term frequency in the document.
}

type ForwardIndex map[DocId]*Document
type Document struct {
	Terms   map[TermId]int // A map makes it fast to compute Σ_{t∈q∩d} U_t.
	Len     int            // Sum of term frequencies over Terms.
	Literal string
}

type SearchIndex struct {
	Fwd   ForwardIndex
	Ivt   InvertedIndex
	Vocab *Vocab
	Sgmt  *sego.Segmenter
}
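
// Illustration (assuming the segmenter splits "a b a" into the tokens
// "a", "b", "a"): indexing the single document "a b a" produces
//
//	Fwd: {documentHash("a b a") → {Terms: {a: 2, b: 1}, Len: 3, Literal: "a b a"}}
//	Ivt: {a → [{DocId, TF: 2}], b → [{DocId, TF: 1}]}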
// NewDocument tokenizes literal into a Document. Words not yet in vocab
// are added to it.
func NewDocument(literal string, vocab *Vocab, sgmt *sego.Segmenter) *Document {
	d := &Document{
		Terms:   make(map[TermId]int),
		Len:     0,
		Literal: literal}
	for _, term := range tokenize(literal, sgmt) {
		d.Terms[vocab.IdOrAdd(term)]++
		d.Len++
	}
	return d
}
// Posting lists in InvertedIndex are sorted in ascending DocId order.
func (p PostingList) Len() int           { return len(p) }
func (p PostingList) Less(i, j int) bool { return p[i].DocId < p[j].DocId }
func (p PostingList) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
// NewIndex returns an empty index. If vocab is nil, a fresh Vocab is created.
func NewIndex(vocab *Vocab, sgmt *sego.Segmenter) *SearchIndex {
	if vocab == nil {
		vocab = NewVocab(nil)
	}
	return &SearchIndex{
		Ivt:   make(map[TermId]PostingList),
		Fwd:   make(map[DocId]*Document),
		Vocab: vocab,
		Sgmt:  sgmt}
}
// add inserts a document into the index without sorting posting lists.
// changed records which terms' posting lists were modified.
func (idx *SearchIndex) add(doc string, changed map[TermId]int) {
	did := documentHash(doc)
	if _, ok := idx.Fwd[did]; !ok { // De-duplicate by content hash.
		d := NewDocument(doc, idx.Vocab, idx.Sgmt)
		idx.Fwd[did] = d
		for term, tf := range d.Terms {
			idx.Ivt[term] = append(idx.Ivt[term], Posting{DocId: did, TF: tf})
			changed[term]++
		}
	}
}
// Add inserts a single document into the index and re-sorts the posting
// lists it touched. It returns the index to allow chaining.
func (idx *SearchIndex) Add(doc string) *SearchIndex {
	changed := make(map[TermId]int)
	idx.add(doc, changed)
	for termId := range changed {
		sort.Sort(idx.Ivt[termId])
	}
	return idx
}
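
// Usage sketch: Add returns the index, so calls can be chained.
//
//	idx.Add("first document").Add("second document")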
// BatchAdd adds all documents read from channel corpus into the index,
// sorting each affected posting list once at the end.
func (idx *SearchIndex) BatchAdd(corpus chan string) *SearchIndex {
	changed := make(map[TermId]int)
	for doc := range corpus {
		idx.add(doc, changed)
	}
	for termId := range changed {
		sort.Sort(idx.Ivt[termId])
	}
	return idx
}
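
// Usage sketch for BatchAdd: the producer goroutine must close the channel,
// or BatchAdd's range loop blocks forever.
//
//	ch := make(chan string)
//	go func() {
//		defer close(ch)
//		for _, doc := range docs { // docs is a hypothetical []string
//			ch <- doc
//		}
//	}()
//	idx.BatchAdd(ch)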
// documentHash derives a DocId from the first 8 bytes of the MD5 hash of
// the document content.
func documentHash(literal string) DocId {
	md5Bytes := md5.Sum([]byte(literal))
	return DocId(binary.BigEndian.Uint64(md5Bytes[:]))
}
// NewIndexFromFile builds an index from corpusFile, treating each line as
// one document. If dump is non-empty, the index is written to that file
// as a CSV table.
func NewIndexFromFile(corpusFile string, sgmt *sego.Segmenter, dump string) *SearchIndex {
	idx := NewIndex(NewVocab(nil), sgmt)
	if len(corpusFile) > 0 {
		ch := make(chan string)
		go func() {
			WithFile(corpusFile,
				func(f *os.File) {
					scanner := bufio.NewScanner(f)
					for scanner.Scan() {
						ch <- scanner.Text()
					}
					if e := scanner.Err(); e != nil {
						log.Panicf("Scanning corpus error: %v", e)
					}
				})
			close(ch)
		}()
		idx.BatchAdd(ch)
	}
	if len(dump) > 0 {
		idx.Pretty(NewCSVTable(CreateOrDie(dump)), nil, nil, 0)
	}
	return idx
}
// tokenize segments doc with sego and drops tokens that consist entirely
// of punctuation or whitespace.
func tokenize(doc string, sgmt *sego.Segmenter) []string {
	var terms []string
	for _, seg := range sgmt.Segment([]byte(doc)) {
		term := seg.Token().Text()
		if !AllPunctOrSpace(term) {
			terms = append(terms, term)
		}
	}
	return terms
}
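
// End-to-end sketch (file names are hypothetical; sego needs a dictionary
// file, e.g. the data/dictionary.txt shipped with github.com/huichen/sego):
//
//	var sgmt sego.Segmenter
//	sgmt.LoadDictionary("data/dictionary.txt")
//	idx := NewIndexFromFile("corpus.txt", &sgmt, "")
//	d := idx.Fwd[documentHash("some document text")] // nil if not indexed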