-
Notifications
You must be signed in to change notification settings - Fork 5
/
tokenizer.go
67 lines (55 loc) · 1.39 KB
/
tokenizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
package stalefish
import (
"strings"
"unicode"
"github.com/kotaroooo0/stalefish/morphology"
)
// Tokenizer is the common interface of the tokenizers in this package:
// anything that can turn an input string into a TokenStream.
type Tokenizer interface {
// Tokenize takes the text to tokenize and returns the resulting TokenStream.
Tokenize(string) TokenStream
}
// StandardTokenizer is a tokenizer without any fields or configuration.
type StandardTokenizer struct{}

// NewStandardTokenizer returns a StandardTokenizer value.
func NewStandardTokenizer() StandardTokenizer {
	var tokenizer StandardTokenizer
	return tokenizer
}
// Tokenize splits s into maximal runs of Unicode letters and numbers and
// returns one token per run, in order of appearance. Every other rune acts
// as a separator and is dropped from the output.
func (t StandardTokenizer) Tokenize(s string) TokenStream {
	// A rune separates terms when it is neither a letter nor a number.
	isSeparator := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsNumber(r))
	}

	words := strings.FieldsFunc(s, isSeparator)
	tokens := make([]Token, 0, len(words))
	for _, word := range words {
		tokens = append(tokens, NewToken(word))
	}
	return NewTokenStream(tokens)
}
// MorphologicalTokenizer tokenizes text by delegating the analysis to a
// morphology.Morphology implementation.
type MorphologicalTokenizer struct {
	// morphology is the analyzer whose Analyze method produces the raw tokens.
	morphology morphology.Morphology
}

// NewMorphologicalTokenizer returns a MorphologicalTokenizer backed by the
// given analyzer.
func NewMorphologicalTokenizer(morphology morphology.Morphology) MorphologicalTokenizer {
	var tokenizer MorphologicalTokenizer
	tokenizer.morphology = morphology
	return tokenizer
}
// Tokenize runs the configured analyzer over s and converts each analyzed
// element into a Token built from its Term, with its Kana attached through
// the setKana option. The output preserves the analyzer's ordering.
func (t MorphologicalTokenizer) Tokenize(s string) TokenStream {
	analyzed := t.morphology.Analyze(s)

	tokens := make([]Token, 0, len(analyzed))
	for _, m := range analyzed {
		tokens = append(tokens, NewToken(m.Term, setKana(m.Kana)))
	}
	return NewTokenStream(tokens)
}
// NgramTokenizer tokenizes text into fixed-size windows of runes.
type NgramTokenizer struct {
	// n is the window size, counted in runes.
	n int
}

// NewNgramTokenizer returns an NgramTokenizer whose window size is n.
func NewNgramTokenizer(n int) NgramTokenizer {
	var tokenizer NgramTokenizer
	tokenizer.n = n
	return tokenizer
}
// Tokenize returns every contiguous window of n runes in s as a token, in
// left-to-right order. A string of r runes yields r-n+1 tokens.
//
// Windows are measured in runes rather than bytes, so multi-byte characters
// are never split. If n is not positive, or s holds fewer than n runes, no
// window exists and an empty TokenStream is returned instead of panicking.
func (t NgramTokenizer) Tokenize(s string) TokenStream {
	// Convert once up front: re-converting inside the loop would make the
	// whole pass quadratic in the length of s.
	runes := []rune(s)

	// Guard the degenerate sizes. Without this, make() would receive a
	// negative length (n > len(runes)) or the slice expression would have
	// inverted bounds (n < 0), both of which panic at run time.
	if t.n <= 0 || t.n > len(runes) {
		return NewTokenStream(make([]Token, 0))
	}

	tokens := make([]Token, len(runes)-t.n+1)
	for i := range tokens {
		tokens[i] = NewToken(string(runes[i : i+t.n]))
	}
	return NewTokenStream(tokens)
}