-
Notifications
You must be signed in to change notification settings - Fork 3
/
parser.go
388 lines (346 loc) · 10.3 KB
/
parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
package pdflex
import (
"bytes"
"fmt"
"strconv"
)
// parseState records where the parser currently is relative to the xref
// sections of the input.
type parseState int
const (
// outside means no xref section is being parsed. It is the zero value and
// the only state in which MaybeFindXref will scan for the next xref token.
outside parseState = iota
// inside means an xref token has been seen and its section is being parsed.
inside
// parseEOF means the lexer has produced ItemEOF. ResetToHere leaves this
// state untouched so that callers can detect it and stop.
parseEOF
)
// Spec: 7.5.5 - 7.5.6
// Section Header rows are Offset, count
// 'f' is free 'n' is live object
// Object entries are specified as exactly 20 bytes.
// [0-9]{10} [0-9]{5} [fn]\r\n
// 8 1 <- one entry expected, Offset 8
// 0000037413 00000 n <- object 8
// 10 1
// 0000037503 00000 n <- object 10
// 12 2
// 0000037629 00000 n
// 0000037791 00000 n <- object 13
// 15 1
// 0000037931 00000 n
//
// Multiple xref sections can appear in one file, covering from the last %%EOF
// to the %%EOF after the end of the trailer
// Parser represents the state of the input parser. Tokens read from the
// lexer are copied (and, inside xref sections, rewritten) into Scratch, which
// FixXrefs returns as the repaired output.
type Parser struct {
// From is the index into Scratch at which the object search for the current
// xref section starts. ResetToHere moves it to the last byte written so far.
From int // start of the Scratch range this xref covers
// LastXref is the index in Scratch of the most recent xref token. It is the
// end of the object search range and the value written after startxref.
LastXref int // -1 after ResetToHere
// Idx is set to Offset by MaybeFindHeader; FixXrefs adds the row index to it
// to get the object number of each entry.
Idx int // Object Index of the current object
Offset int // Header Section Offset (first number of the header row)
Entries int // Number of object entries for this section (second number of the header row)
// NOTE(review): presumably the embedded Lexer supplies NextItem - confirm
// against the lexer source.
*Lexer
State parseState // current position relative to xref sections
Scratch bytes.Buffer // output buffer; FixXrefs returns its bytes
}
// Row represents one object entry in an xrefs section, as parsed by FindRow.
type Row struct {
// Offset is the value of the 10 digit first field of the entry.
Offset int
// Generation is the value of the 5 digit second field of the entry.
Generation int
// Active is true when the entry type is 'n' and false when it is 'f'.
Active bool
}
// MaybeFindXref scans forward for the next xref token, copying every token it
// passes (including the xref token itself) to scratch. When one is found it
// records the token's start position within scratch in LastXref, switches
// State to inside and returns true. When the input runs out first, State
// becomes parseEOF and false is returned; the EOF token is not written.
//
// Calling it at parseEOF is a no-op that returns false. Calling it while
// State is inside is a programming error and panics.
func (p *Parser) MaybeFindXref() bool {
	switch p.State {
	case parseEOF:
		return false
	case outside:
		// The only state from which a search may start.
	default:
		panic("[BUG] MaybeFindXref() called while still in an xref")
	}

	for {
		tok := p.NextItem()
		if tok.Typ == ItemEOF {
			p.State = parseEOF
			return false
		}
		p.Scratch.WriteString(tok.Val)
		if tok.Typ != ItemXref {
			continue
		}
		// Record the position of the xref keyword in Scratch rather than in
		// the input: once a "startxref\rNNNNNNN" value has been rewritten
		// with a different width the two diverge in files that contain
		// several xref sections.
		p.LastXref = p.Scratch.Len() - len(tok.Val)
		p.State = inside
		return true
	}
}
// FindRow parses and consumes one object entry in an xref section: a 10 digit
// offset, one space, a 5 digit generation, one space and the type letter 'n'
// or 'f'. It does NOT consume the trailing EOL marker.
//
// The tokens are read without being emitted to scratch, because on success
// the caller writes the (possibly corrected) row itself. If the row cannot be
// parsed, the raw text of every token consumed so far is emitted to scratch
// before the error is returned, so no input is dropped from the output. The
// returned Row is the zero value whenever the error is non-nil.
func (p *Parser) FindRow() (r Row, e error) {
	// Raw text of all tokens consumed so far. The deferred function replays
	// it to scratch on every failure path, including the Atoi failures.
	seen := ""
	defer func() {
		if e != nil {
			p.Scratch.WriteString(seen)
		}
	}()

	tok, ok := p.Accept(ItemNumber, false)
	seen += tok.Val
	if !ok || len(tok.Val) != 10 {
		return Row{}, fmt.Errorf("corrupt row - want 10 digit offset, got %#v", tok)
	}
	offset, err := strconv.Atoi(tok.Val)
	if err != nil {
		// Still need to handle errors - something like +12.5 will pass the
		// lexer, but not Atoi.
		return Row{}, fmt.Errorf("corrupt row - want 10 digit offset, got %#v", tok)
	}

	tok, ok = p.Accept(ItemSpace, false)
	seen += tok.Val
	if !ok || len(tok.Val) != 1 {
		return Row{}, fmt.Errorf("corrupt row - want ItemSpace, got %#v", tok)
	}

	tok, ok = p.Accept(ItemNumber, false)
	seen += tok.Val
	if !ok || len(tok.Val) != 5 {
		return Row{}, fmt.Errorf("corrupt row - want 5 digit generation, got %#v", tok)
	}
	generation, err := strconv.Atoi(tok.Val)
	if err != nil {
		return Row{}, fmt.Errorf("corrupt row - want 5 digit generation, got %#v", tok)
	}

	tok, ok = p.Accept(ItemSpace, false)
	seen += tok.Val
	if !ok || len(tok.Val) != 1 {
		return Row{}, fmt.Errorf("corrupt row - want ItemSpace, got %#v", tok)
	}

	tok, ok = p.Accept(ItemWord, false)
	seen += tok.Val
	if !ok || (tok.Val != "n" && tok.Val != "f") {
		return Row{}, fmt.Errorf("corrupt row - want [nf], got %#v", tok)
	}

	return Row{Offset: offset, Generation: generation, Active: tok.Val == "n"}, nil
}
// Accept pulls the next token from the lexer and reports whether its type is
// t. The token is returned either way. When write is true the token's text is
// appended to scratch regardless of whether it matched. Reading an ItemEOF
// token moves the parser into the parseEOF state.
func (p *Parser) Accept(t ItemType, write bool) (Item, bool) {
	tok := p.NextItem()
	matched := tok.Typ == t
	if tok.Typ == ItemEOF {
		p.State = parseEOF
	}
	if write {
		p.Scratch.WriteString(tok.Val)
	}
	return tok, matched
}
// ResetToHere abandons whatever xref parsing is in progress. LastXref, Idx,
// Offset and Entries are set to -1, and From is moved to the last byte
// written to scratch so far. That way, if another (possibly intact) xref
// shows up later, the object search for it starts where the previous xref
// parse was abandoned.
func (p *Parser) ResetToHere() {
	p.LastXref = -1
	p.Idx = -1
	p.Offset = -1
	p.Entries = -1
	p.From = p.Scratch.Len() - 1

	// Once EOF has been reached the state is left alone so that other
	// functions can detect it and abort.
	if p.State == parseEOF {
		return
	}
	p.State = outside
}
// SeemsLegit reports whether all of the xref-related state fields (LastXref,
// Idx, Offset and Entries) are non-negative, i.e. none of them still holds
// the -1 that ResetToHere assigns.
func (p *Parser) SeemsLegit() bool {
	for _, v := range [...]int{p.LastXref, p.Idx, p.Offset, p.Entries} {
		if v < 0 {
			return false
		}
	}
	return true
}
// MaybeFindHeader is called directly after an xref token, or after the end of
// a section inside an xref. It tries to find a header row (offset count EOL).
// On success it stores the two numbers in Offset and Entries, sets Idx to
// Offset and returns true.
//
// If the next token is the trailer keyword instead, it will:
//   - advance to the next startxref token, emitting everything it passes
//   - replace the number after startxref with LastXref
//   - reset the state variables ready to find the next xref ( if any )
//   - then return false.
//
// Every other outcome also returns false: it is called outside an xref
// section, the input ends, or the header row is malformed. A malformed row
// includes a negative offset or count, which is treated as a corrupt section
// instead of tripping the internal sanity panic. Except at end of input, the
// parser is reset via ResetToHere before returning false. All tokens consumed
// are emitted to scratch, apart from the replaced startxref number.
func (p *Parser) MaybeFindHeader() bool {
	if p.State != inside {
		p.ResetToHere()
		return false
	}

	first := p.NextItem()
	p.Scratch.WriteString(first.Val)

	switch first.Typ {
	case ItemTrailer:
		// No more headers in this section. Try to find and fix the startxref
		// entry, and then reset to the outside state. Even if there is a
		// missing %%EOF token we're not going to abort or anything...
		for {
			tok, atEOF := p.Accept(ItemEOF, true)
			if atEOF {
				p.State = parseEOF
				return false
			}
			if tok.Typ != ItemStartXref {
				continue
			}
			if _, ok := p.Accept(ItemEOL, true); !ok {
				p.ResetToHere()
				return false
			}
			// Don't write in this call to Accept: on success we emit our own
			// number, and only echo the token when it is not a number.
			if num, ok := p.Accept(ItemNumber, false); !ok {
				p.Scratch.WriteString(num.Val)
				p.ResetToHere()
				return false
			}
			p.Scratch.WriteString(strconv.Itoa(p.LastXref))
			// Next tokens should be ItemEOL then ItemComment "%%EOF", but
			// we don't actually care, let the general parsing loop emit
			// them.
			p.ResetToHere()
			return false
		}

	case ItemNumber:
		offset, err := strconv.Atoi(first.Val)
		if err != nil || offset < 0 {
			// Not an integer, or a signed value that can never be a valid
			// object number: treat the section as corrupt.
			p.ResetToHere()
			return false
		}
		if _, ok := p.Accept(ItemSpace, true); !ok {
			p.ResetToHere()
			return false
		}
		count, ok := p.Accept(ItemNumber, true)
		if !ok {
			p.ResetToHere()
			return false
		}
		entries, err := strconv.Atoi(count.Val)
		if err != nil || entries < 0 {
			p.ResetToHere()
			return false
		}
		// Accept both 1 and 2 byte <EOL> as well as <SP><EOL>. Don't know if
		// this is strictly per-spec, but it's common.
		end, ok := p.Accept(ItemEOL, true)
		if !ok && end.Typ != ItemSpace {
			p.ResetToHere()
			return false
		}
		if end.Typ == ItemSpace {
			// not an EOL, but it was SP ...we must get <EOL> now
			if _, ok := p.Accept(ItemEOL, true); !ok {
				p.ResetToHere()
				return false
			}
		}

		p.Offset, p.Entries, p.Idx = offset, entries, offset
		// Offset, Entries and Idx are known to be non-negative here, so this
		// can only fire if LastXref was never set, which is a caller bug.
		if !p.SeemsLegit() {
			panic("BUG: logic broken in MaybeFindHeader")
		}
		return true

	case ItemEOF:
		p.State = parseEOF
		return false

	default:
		// We assume that this was a truncated xref section or something, so
		// we report no header row found, but still set the "from" index.
		// That means that if there's another xref later the search scope will
		// be (hopefully correctly) from the end of this truncated / corrupt
		// xref section to the start of the next one.
		p.ResetToHere()
		return false
	}
}
// FixXrefs is the main parsing loop. It seeks to an xref token, then walks
// the section's header rows and object entry rows, re-emitting each entry to
// scratch. For in-use ('n') entries the offset is replaced with the position
// at which locateObj finds the object in the output written so far; when the
// object cannot be found, and for free ('f') entries, the original offset is
// kept. Any malformed row or line ending abandons the current section and the
// search resumes at the next xref token. When no more xref tokens are found
// the rest of the input is copied through and the accumulated output is
// returned. This consumes the supplied lexer, so it can only be used once.
func (p *Parser) FixXrefs() []byte {
scan:
	for {
		if !p.MaybeFindXref() {
			if p.State == parseEOF {
				return p.Scratch.Bytes()
			}
			// just checking...
			panic("[BUG] No xref found but not at EOF!")
		}
		if _, sawEOL := p.Accept(ItemEOL, true); !sawEOL {
			p.ResetToHere()
			continue scan
		}

		// Inside a new xref section: one pass per subsection header.
		for p.MaybeFindHeader() {
			if !p.SeemsLegit() {
				panic("BUG: SeemsLegit() failed after we found a header!")
			}

			for n := 0; n < p.Entries; n++ {
				row, err := p.FindRow()
				if err != nil {
					p.ResetToHere()
					continue scan
				}

				// Free entries, and live entries whose object cannot be
				// located, are emitted with their original offset.
				offset, kind := row.Offset, 'f'
				if row.Active {
					kind = 'n'
					if found := locateObj(p.Scratch.Bytes()[p.From:p.LastXref], p.Idx+n); found >= 0 {
						// The search ran on a subslice, so add the from
						// index to get the index from the start of the output.
						offset = found + p.From
					}
				}
				fmt.Fprintf(&p.Scratch, "%.10d %.5d %c", offset, row.Generation, kind)

				// Correct line terminators are: SP CR, SP LF, or CRLF
				// This makes a correct line exactly 20 bytes.
				// Spec section 7.5.4 p 41
				term, isEOL := p.Accept(ItemEOL, true)
				if isEOL && len(term.Val) == 2 {
					// CRLF - done with this line
					continue
				}
				if term.Typ == ItemSpace && len(term.Val) == 1 {
					// Not CRLF, but it was SP ...still OK if a single
					// linebreak follows. Strictly speaking we should only
					// accept \r, not \n. Meh.
					if nl, isEOL := p.Accept(ItemEOL, true); isEOL && len(nl.Val) == 1 {
						continue
					}
				}

				// line is invalid, bail.
				p.ResetToHere()
				continue scan
			}
		}
		p.ResetToHere() // probably not necessary, but idempotent
	}
}
// locateObj looks in in for the text "<i> 0 obj" immediately preceded by a
// line break and returns the index of its first digit. A "\n" line break is
// tried first and a "\r" only when that finds nothing. Only generation 0
// objects are matched. A negative value is returned when nothing matches.
func locateObj(in []byte, i int) int {
	marker := fmt.Sprintf("%d 0 obj", i)
	for _, eol := range []string{"\n", "\r"} {
		if at := bytes.Index(in, []byte(eol+marker)); at >= 0 {
			// Step past the line break so the result points at the object
			// number itself.
			return at + 1
		}
	}
	return -1
}