-
Notifications
You must be signed in to change notification settings - Fork 79
/
hocr-extract-g1000
executable file
·189 lines (148 loc) · 5.3 KB
/
hocr-extract-g1000
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/env python
# extract lines from Google 1000 book sample
import glob
import os
import re
import sys
import xml.sax
from PIL import Image
# Help text written to stderr for -h/--help and on a wrong argument count.
# The synopsis uses "output_pattern", the same name as the argument
# description below and the variable that receives sys.argv[3].
usage = """
... hocr image_pattern output_pattern
Process Google 1000 books volumes and prepares line or word images
for alignment using OCRopus.
Run ocroscript align-... Volume_0000/0000/0000.{png,txt}
Arguments:
hocr: hOCR source file
image_pattern: either a glob pattern that results in a list
of image files in order, or @filename for a file containing
a list of image files in order; DON'T FORGET TO QUOTE THIS
output_pattern: output images are of the form
output_pattern%(pageno,lineno)
Environment Variables:
element="ocr_line": which element to extract; ocrx_word and
ocr_cinfo are also useful
regex=".": the text for any transcription must match this pattern
dict=None: a dictionary; if provided, all the words in any line
that's output by the program must occur in the dictionary
min_len=20: minimum length of text for which lines are output
max_len=50: maximum length of text for which lines are output
max_lines=1000000: maximum number of lines output
pad=2: pad the bounding box by this many pixels prior to extraction
output_format=png: format for line image files
"""
# --- Command line -----------------------------------------------------------
# Help on request exits with status 0; a wrong argument count prints the help
# plus the arguments actually received and exits with status 1.
if len(sys.argv) > 1 and sys.argv[1] in ("-h", "--help"):
    sys.stderr.write(usage)
    sys.exit(0)
if len(sys.argv) != 4:
    sys.stderr.write(usage)
    print("args:", sys.argv)
    sys.exit(1)
exe, hocr, image_pattern, output_pattern = sys.argv

# --- Page images ------------------------------------------------------------
# image_list[i] is the image opened for the i-th ocr_page element of the hOCR
# file, so its order must be deterministic.
if image_pattern.startswith("@"):
    # "@filename": one image path per line.  Strip only the line terminator
    # (a final line without a newline must not lose its last character) and
    # ignore blank lines, which would otherwise become empty file names.
    with open(image_pattern[1:]) as list_file:
        image_list = [line.rstrip("\r\n") for line in list_file]
    image_list = [path for path in image_list if path]
    image_list.sort()
else:
    # glob.glob() returns names in arbitrary directory order; sort them so
    # that pages and images line up.
    image_list = sorted(glob.glob(image_pattern))

if not os.path.exists(hocr):
    sys.stderr.write(hocr + ": not found\n")
    sys.exit(1)

# --- Configuration from environment variables -------------------------------
element = os.getenv("element", "ocr_line")
regex = os.getenv("regex", ".")
min_len = int(os.getenv("min_len", "20"))
max_len = int(os.getenv("max_len", "50"))
# NOTE: this global intentionally keeps the name "dict" (shadowing the
# builtin) because other parts of this file refer to it by that name.
dict = None
dictfile = os.getenv("dict")
max_lines = int(os.getenv("max_lines", "1000000"))
pad = int(os.getenv("pad", "2"))
output_format = os.getenv("output_format", "png")

if dictfile:
    # Whitespace-separated word list; words are stored lower-cased so that
    # lookups can be case-insensitive.
    with open(dictfile, "r") as stream:
        words = stream.read().split()
    dict = {word.lower(): 1 for word in words}
def check_dict(dict, s):
    """Tell whether every word of ``s`` is present in the dictionary.

    ``dict`` maps lower-cased words to a truthy value; when it is ``None``
    or empty no filtering is applied.  ``s`` is split on runs of non-word
    characters and each non-empty piece is looked up lower-cased.

    Returns 1 if all words were found (or no dictionary is in use),
    otherwise 0.
    """
    if dict:
        for token in re.split(r'\W+', s):
            # Splitting yields empty strings at the edges of the text;
            # those are not words and are ignored.
            if token != "" and not dict.get(token.lower()):
                return 0
    return 1
def write_string(file, text):
    """Write ``text`` to the path ``file`` as UTF-8, replacing any content.

    ``text`` is normally a text string; UTF-8 encoded bytes are accepted as
    well and decoded first.  The file is opened in text mode with an explicit
    UTF-8 encoding: handing already-encoded bytes to a text-mode file raises
    TypeError on Python 3.  The ``with`` block closes the file even if the
    write fails.
    """
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    with open(file, "w", encoding="utf-8") as stream:
        stream.write(text)
def get_prop(title, name):
    """Look up one property in a ``title`` attribute string.

    ``title`` is a ``;``-separated list of properties, each of the form
    ``key args...``.  Returns the argument string of the first property whose
    key equals ``name`` (``""`` if that property has no arguments), or
    ``None`` if no such property exists.

    Empty segments (an empty title, a trailing ``;``) and properties that
    consist of a key only are tolerated instead of raising ValueError.
    """
    for prop in title.split(';'):
        parts = prop.split(None, 1)
        if not parts or parts[0] != name:
            continue
        return parts[1] if len(parts) > 1 else ""
    return None
class docHandler(xml.sax.handler.ContentHandler):
    """SAX handler that cuts text elements out of page images.

    For every element whose ``class`` attribute equals the configured element
    class, the text inside the element is collected.  When the element closes
    and its text passes the length, regex and dictionary filters, the
    element's bounding box (taken from the ``bbox`` property of its ``title``
    attribute) is cropped from the current page image and written, together
    with ``.txt`` and ``.bbox`` side files, under
    ``output_pattern % (pageno, lineno)``.

    Each ``ocr_page`` element advances to the next entry of ``image_list``.

    The handler reads these module-level names: ``element``, ``regex``,
    ``image_list``, ``min_len``, ``max_len``, ``max_lines``, ``pad``,
    ``dict``, ``output_pattern``, ``output_format``, ``check_dict``,
    ``get_prop`` and ``write_string``.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self.element = element
        self.regex = regex

    def startDocument(self):
        """Reset all per-document state."""
        self.total = 0       # number of elements written so far
        self.pageno = -1     # index of the current page, -1 before the first
        self.lineno = -1     # index of the current element within the page
        self.page = None     # file name of the current page image
        self.image = None    # opened image of the current page
        self.bbox = None     # bbox property of the element being collected
        self.text = None     # collected text, None while outside an element
        self.depth = 0       # current element nesting depth
        self.start = -1      # depth at which the collected element opened
        self.copied = {}

    def endDocument(self):
        pass

    def startElement(self, name, attrs):
        """Track depth, switch page images and start collecting text."""
        self.depth += 1
        css_class = attrs.get("class", "")
        if css_class == "ocr_page":
            self.lineno = -1
            self.pageno += 1
            if self.pageno >= len(image_list):
                raise IndexError(
                    "hOCR page %d has no matching image (%d image files given)"
                    % (self.pageno, len(image_list)))
            self.page = image_list[self.pageno]
            self.image = Image.open(self.page)
        if css_class == self.element:
            self.lineno += 1
            self.bbox = get_prop(attrs.get("title", ""), "bbox")
            self.start = self.depth
            self.text = u""

    def endElement(self, name):
        """Finish the collected element, if this tag closes it."""
        if self.depth == self.start:
            if len(self.text) >= min_len and \
                    len(self.text) <= max_len and \
                    re.match(self.regex, self.text) and \
                    check_dict(dict, self.text):
                self._write_element()
            # Stop collecting until the next matching element opens.
            self.text = None
            self.start = -1
        self.depth -= 1

    def _write_element(self):
        """Crop the current element from the page image and save its files."""
        if self.image is None or self.bbox is None:
            # Nothing can be cropped without a page image and a bounding
            # box; report it and carry on with the next element.
            sys.stderr.write("skipping element %d on page %d: no %s\n" % (
                self.lineno, self.pageno,
                "page image" if self.image is None else "bbox"))
            return
        print(self.page, self.bbox, self.text.encode("utf-8"))
        w, h = self.image.size
        x0, y0, x1, y1 = [int(s) for s in self.bbox.split()]
        # Explicit check rather than assert, so it also runs under "python -O".
        if not (y0 < y1 and x0 < x1 and x1 <= w and y1 <= h):
            raise ValueError("%s: bad bbox %r for image size %dx%d"
                             % (self.page, self.bbox, w, h))
        # Pad the box, clamped to the image borders.
        x0 = max(0, x0 - pad)
        y0 = max(0, y0 - pad)
        x1 = min(w, x1 + pad)
        y1 = min(h, y1 + pad)
        limage = self.image.crop((x0, y0, x1, y1))
        base = output_pattern % (self.pageno, self.lineno)
        basedir = os.path.dirname(base)
        # dirname is "" when the pattern has no directory part; there is
        # nothing to create then.  makedirs also handles nested directories.
        if basedir and not os.path.exists(basedir):
            os.makedirs(basedir)
        limage.save(base + "." + output_format)
        write_string(base + ".txt", self.text)
        write_string(base + ".bbox", self.bbox)
        self.total += 1
        if self.total >= max_lines:
            sys.exit(0)

    def characters(self, str, start=0, end=None):
        """Append character data to the text being collected.

        The standard SAX driver calls this with the content only; ``start``
        and ``end`` stay optional so that callers passing an explicit slice
        keep working.
        """
        if self.text is not None:
            self.text += str[start:end]
# Run the hOCR file through HTML Tidy to obtain well-formed XHTML and feed the
# result to the SAX handler.
import subprocess

parser = xml.sax.make_parser()
# The standard XMLReader takes its handler via setContentHandler() and is
# started with parse(); it has no parseFile() method.
parser.setContentHandler(docHandler())
# tidy is started with an argument list and the hOCR file as its stdin, so the
# file name never passes through a shell (spaces or shell metacharacters in
# the name are harmless).  Tidy's diagnostics still go to /tmp/tidy_errs.
with open(hocr, "rb") as hocr_file, open("/tmp/tidy_errs", "wb") as tidy_errs:
    tidy = subprocess.Popen(
        ["tidy", "-q", "-wrap", "9999", "-asxhtml"],
        stdin=hocr_file, stdout=subprocess.PIPE, stderr=tidy_errs)
    stream = tidy.stdout
    try:
        parser.parse(stream)
    finally:
        # Also reached when the handler stops early through sys.exit().
        stream.close()
        tidy.wait()