-
Notifications
You must be signed in to change notification settings - Fork 254
/
text_mark.go
139 lines (124 loc) · 4.13 KB
/
text_mark.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/
package extractor
import (
"fmt"
"math"
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/internal/transform"
"github.com/unidoc/unipdf/v3/model"
)
// textMark represents a piece of text drawn on a page together with its position.
// All dimensions are in device coordinates.
// Values are created by newTextMark and converted to the public TextMark by ToTextMark.
type textMark struct {
serial int // Sequence number for debugging; newTextMark takes it from serial.mark.
model.PdfRectangle // Bounding box (embedded); newTextMark stores the box returned by rectIntersection with the media box.
text string // The text (decoded via ToUnicode).
original string // Original text (decoded). Not set by newTextMark; presumably filled in by its caller - TODO confirm.
font *model.PdfFont // The font the mark was drawn with.
fontsize float64 // The font size the mark was drawn with; newTextMark stores a TRM scaling factor here.
charspacing float64 // TODO(peterwilliams97): Should this be exposed in TextMark?
trm transform.Matrix // The text rendering matrix (TRM) the mark was rendered with.
end transform.Point // The end of character device coordinates.
}
// newTextMark builds the textMark for `text` rendered with text rendering matrix (TRM) `trm`.
// `end` is the end of character device coordinates; `font` and `charspacing` are stored on the
// mark unchanged. `spaceWidth` is our best guess at the width of a space, in device coordinates,
// in the font the text is rendered in; the body of this function does not read it.
// The bounding box spans from the translation of `trm` to `end`, is extended by the text height
// in a direction chosen from the orientation, and is then replaced by the rectangle that
// rectIntersection returns for it and the page's media box.
// The second result is the on-page flag reported by rectIntersection.
// Every call consumes one sequence number from serial.mark.
func (to *textObject) newTextMark(text string, trm transform.Matrix, end transform.Point,
	spaceWidth float64, font *model.PdfFont, charspacing float64) (textMark, bool) {
	// Orientation is the TRM angle rounded to the nearest multiple of 10.
	orientation := nearestMultiple(trm.Angle(), 10)

	// Orientations of 90 (mod 180) take the height from the X scaling factor, all others from Y.
	var size float64
	switch orientation % 180 {
	case 90:
		size = trm.ScalingFactorX()
	default:
		size = trm.ScalingFactorY()
	}

	// Start with the corners at the text origin and at the end of the text.
	origin := translation(trm)
	llx, lly := origin.X, origin.Y
	urx, ury := end.X, end.Y

	// Extend one corner by the text height. Only 90, 180 and 270 are special-cased; every other
	// orientation adds the height to the upper y coordinate.
	switch orientation % 360 {
	case 90:
		urx -= size
	case 180:
		ury -= size
	case 270:
		urx += size
	default:
		ury += size
	}

	// Order the corners so that (llx, lly) is not greater than (urx, ury).
	if llx > urx {
		llx, urx = urx, llx
	}
	if lly > ury {
		lly, ury = ury, lly
	}
	box := model.PdfRectangle{Llx: llx, Lly: lly, Urx: urx, Ury: ury}

	// The unclipped box is what gets logged; the clipped one is what gets stored.
	visible, onPage := rectIntersection(box, to.e.mediaBox)
	if !onPage {
		common.Log.Debug("Text mark outside page. bbox=%g mediaBox=%g text=%q",
			box, to.e.mediaBox, text)
	}

	tm := textMark{
		serial:       serial.mark,
		PdfRectangle: visible,
		text:         text,
		font:         font,
		fontsize:     size,
		charspacing:  charspacing,
		trm:          trm,
		end:          end,
	}
	serial.mark++

	if !isTextSpace(tm.text) && tm.Width() == 0.0 {
		common.Log.Debug("ERROR: Zero width text. tm=%s", tm.String())
	}
	if verboseGeom {
		common.Log.Info("newTextMark: start=%.2f end=%.2f %s", origin, end, tm.String())
	}
	return tm, onPage
}
// String returns a one-line description of `tm`: its serial number, bounding box, font size and
// text.
func (tm *textMark) String() string {
	const layout = "serial=%d %.2f fontsize=%.2f \"%s\""
	return fmt.Sprintf(layout, tm.serial, tm.PdfRectangle, tm.fontsize, tm.text)
}
// bbox returns the bounding box of `tm`. It makes textMark implement the `bounded` interface.
func (tm *textMark) bbox() model.PdfRectangle {
	box := tm.PdfRectangle
	return box
}
// ToTextMark returns the public view of `tm`: a TextMark carrying the serial number, the text and
// original text, the bounding box, the font and the font size. All other TextMark fields keep
// their zero values.
func (tm *textMark) ToTextMark() TextMark {
	var public TextMark
	public.count = int64(tm.serial)
	public.Text = tm.text
	public.Original = tm.original
	public.BBox = tm.PdfRectangle
	public.Font = tm.font
	public.FontSize = tm.fontsize
	return public
}
// appendTextMark appends `mark` to `marks` and returns the extended slice. `mark.Offset` is set
// to the current value of `*offset`, the offset of `mark` in the extracted text, and `*offset` is
// then advanced by the length in bytes of `mark.Text`.
func appendTextMark(marks []TextMark, offset *int, mark TextMark) []TextMark {
	start := *offset
	mark.Offset = start
	*offset = start + len(mark.Text)
	return append(marks, mark)
}
// appendSpaceMark appends a copy of spaceMark whose text is `spaceChar` to `marks` and updates
// `offset`, the offset of the appended mark in the extracted text.
func appendSpaceMark(marks []TextMark, offset *int, spaceChar string) []TextMark {
	// Work on a copy so that the package-level spaceMark keeps its own Text.
	sp := spaceMark
	sp.Text = spaceChar
	return appendTextMark(marks, offset, sp)
}
// nearestMultiple returns the integer multiple of `m` that is closest to `x`.
// Halfway cases round away from zero (math.Round). An `m` of 0 is treated as 1.
func nearestMultiple(x float64, m int) int {
	step := float64(m)
	if m == 0 {
		step = 1
	}
	count := math.Round(x / step)
	return int(count * step)
}