forked from smarty-archives/mafsa
-
Notifications
You must be signed in to change notification settings - Fork 2
/
encoder.go
159 lines (134 loc) · 4.36 KB
/
encoder.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
package mafsa
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"sort"
"unicode/utf8"
)
// Encoder is a type which can encode a BuildTree into a byte slice
// which can be written to a file.
type Encoder struct {
queue []*BuildTreeNode
curOffset int
}
// Encode serializes a BuildTree t into a byte slice.
func (e *Encoder) Encode(t *BuildTree) ([]byte, error) {
e.queue = []*BuildTreeNode{}
// First entry (== fixed-length entry to which a node can be
// serialized) is a null entry that specifies the file format:
// First byte indicates the flag scheme (basically a file format verison number)
// Second byte is the pointer length in bytes
// Note: entry length can be calculated as follows:
// Second byte + 1 (flags) + char length in bytes (1-4, encoded in the higher flag bits)
// Any leftover bytes in this first entry are zero
data := []byte{0x02, 0x04}
for i := len(data); i < int(data[1]+1+1); i++ {
data = append(data, 0x00)
}
e.curOffset = e.getChildNodesSizeTotal(data, t.Root.Edges) + int(data[1]+1+1)
data, err := e.encodeEdges(t.Root, data)
if err != nil {
return nil, err
}
for len(e.queue) > 0 {
// Pop first item off the queue
top := e.queue[0]
e.queue = e.queue[1:]
// Recursively marshal child nodes
data, err = e.encodeEdges(top, data)
if err != nil {
return nil, err
}
}
return data, nil
}
// WriteTo encodes and saves the BuildTree to a io.Writer.
func (e *Encoder) WriteTo(wr io.Writer, t *BuildTree) error {
bs, err := e.Encode(t)
if err != nil {
return err
}
_, err = io.Copy(wr, bytes.NewReader(bs))
if err != nil {
return err
}
return nil
}
// getChildNodesSizeTotal returns the size in bytes of all encoded child nodes.
func (e *Encoder) getChildNodesSizeTotal(data []byte, childNodes map[rune]*BuildTreeNode) int {
// get rune size in bytes first
var runelentotal int
for r := range childNodes {
runelentotal += utf8.RuneLen(r)
}
// get the non-variable size of the entries which is:
// child node count * (pointer size + flag byte)
statictotal := len(childNodes) * (int(data[1]) + 1)
return statictotal + runelentotal
}
// encodeEdges encodes the edges going out of node into bytes which are appended
// to data. The modified byte slice is returned.
func (e *Encoder) encodeEdges(node *BuildTreeNode, data []byte) ([]byte, error) {
// We want deterministic output for testing purposes,
// so we need to order the keys of the edges map.
edgeKeys := sortEdgeKeys(node)
for i := 0; i < len(edgeKeys); i++ {
currune := edgeKeys[i]
child := node.Edges[currune]
runelen := utf8.RuneLen(currune)
var flags byte
if child.final {
flags |= endOfWord
}
if i == len(edgeKeys)-1 {
flags |= endOfNode // end of node (last child outgoing from this node)
}
// encode rune length in the bits after the first two
// least significant ones
lenmask := byte(runelen) << 2
flags |= lenmask
// the whole entry fits in here. The size is: flag +
// length of rune + pointer size
entryBytes := make([]byte, runelen+1+int(data[1]))
ret := copy(entryBytes, append([]byte{flags}, []byte(string(currune))...))
if ret != runelen+1 {
return nil, fmt.Errorf("could not copy UTF-8 bytes + flag. Wanted to copy %d, but only copied %d\n", runelen, ret)
}
// If bytePos is 0, we haven't encoded this edge yet
if child.bytePos == 0 {
if len(child.Edges) > 0 {
child.bytePos = e.curOffset
childrenlength := e.getChildNodesSizeTotal(data, child.Edges)
e.curOffset += childrenlength
}
e.queue = append(e.queue, child)
}
pointer := child.bytePos
switch int(data[1]) {
case 2:
binary.BigEndian.PutUint16(entryBytes[runelen+1:], uint16(pointer))
case 4:
binary.BigEndian.PutUint32(entryBytes[runelen+1:], uint32(pointer))
case 8:
binary.BigEndian.PutUint64(entryBytes[runelen+1:], uint64(pointer))
}
data = append(data, entryBytes...)
}
return data, nil
}
// sortEdgeKeys returns a sorted list of the keys
// of the map containing outgoing edges.
func sortEdgeKeys(node *BuildTreeNode) []rune {
edgeKeys := make(runeSlice, 0, len(node.Edges))
for char := range node.Edges {
edgeKeys = append(edgeKeys, char)
}
sort.Sort(edgeKeys)
return []rune(edgeKeys)
}
type runeSlice []rune
func (s runeSlice) Len() int { return len(s) }
func (s runeSlice) Less(i, j int) bool { return s[i] < s[j] }
func (s runeSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] }