-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
104 lines (79 loc) · 2.77 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#coding: utf-8
'''
Takes a list of files, parses them as TXT or XML, extracts all words and finds
out how many of these words match the German dictionary.
The purpose is to estimate the OCR engine quality when processing different
types of documents in German.
@author: mkroutikov
'''
import io
import os
import re
import glob
import collections
import csv
import lxml.etree as et
def parse_dictionary(fname):
    '''Yield each non-empty, stripped word from the dictionary file *fname*.

    :param fname: path to a word list, one word per line, cp1252-encoded
        (typical for German word lists with umlauts/ß).

    Fix: the original ignored *fname* and always opened
    'dictionary/german.dic'; the parameter is now honored. The only caller
    in this file passes that same path, so behavior there is unchanged.
    '''
    with io.open(fname, 'r', encoding='cp1252') as f:
        for line in f:
            word = line.strip()
            if word:  # skip blank lines
                yield word
def words_from_txt(fname):
    '''Yield every word (maximal ``\\w+`` run) found in a UTF-8 text file.

    :param fname: path to the plain-text file to scan.
    '''
    with io.open(fname, 'r', encoding='utf-8') as fobj:
        contents = fobj.read()
    for token in re.findall(r'\w+', contents):
        yield token
def words_from_xml(fname):
    '''Yield every word (``\\w+`` run) from the text of ``<word>`` elements.

    :param fname: path to the OCR XML output file.

    Fix: ``elt.text`` is ``None`` for empty or self-closing ``<word/>``
    elements, and ``re.findall(pattern, None)`` raises TypeError; such
    elements are now skipped instead of crashing the whole run.
    '''
    with io.open(fname, 'rb') as f:
        root = et.fromstring(f.read())
    for elt in root.findall('.//word'):
        if elt.text:
            yield from re.findall(r'\w+', elt.text)
def stats(dictionary, words):
    '''Classify every word into one bucket and count per bucket.

    :param dictionary: set of lowercase known words.
    :param words: iterable of word strings.
    :returns: defaultdict(int) with keys 'numeric', 'short', 'dictionary',
        'non-dictionary' and a running 'total'.
    '''
    counts = collections.defaultdict(int)
    # compiled once outside the loop; $ anchors so only all-digit tokens match
    all_digits = re.compile(r'\d+$')
    for raw in words:
        token = raw.lower()
        if all_digits.match(token):
            bucket = 'numeric'
        elif len(token) < 3:
            bucket = 'short'
        elif token in dictionary:
            bucket = 'dictionary'
        else:
            bucket = 'non-dictionary'
        counts[bucket] += 1
        counts['total'] += 1
    return counts
def merge(d1, *av):
    '''Copy the entries of each additional mapping into *d1* in place.

    Later mappings overwrite earlier keys; the mutated *d1* itself is
    returned so the call can be used as an expression.
    '''
    for other in av:
        for key, value in other.items():
            d1[key] = value
    return d1
def file_stats(files, dictionary, output_fname):
    '''Write a CSV report with per-file word statistics plus a Totals row.

    :param files: iterable of (name, words) pairs, where words is an
        iterable of word strings for that file.
    :param dictionary: set of lowercase known words.
    :param output_fname: path of the CSV report to create.

    Fix: the csv module requires the file be opened with ``newline=''``;
    without it the writer emits a blank line between rows on Windows.
    '''
    totals = collections.defaultdict(int)
    with io.open(output_fname, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['name', 'numeric', 'short', 'dictionary', 'non-dictionary', 'total'])
        writer.writeheader()
        for name, words in files:
            print('Processing:', name)
            per_file = stats(dictionary, words)
            # accumulate into the grand totals before writing the row
            for key, val in per_file.items():
                totals[key] += val
            writer.writerow(merge({
                'name': name
            }, per_file))
        writer.writerow(merge({
            'name': 'Totals'
        }, totals))
if __name__ == '__main__':
    print('Loading German dictionary...')
    # one lowercase entry per distinct word for O(1) membership tests
    dictionary = {word.lower() for word in parse_dictionary('dictionary/german.dic')}
    print('Loaded', len(dictionary), 'unique lowercase words')

    def txt_files():
        # (basename, lazy word iterator) for every HLSL OCR text output
        for path in sorted(glob.glob('ocr-hlsl/*.txt')):
            yield os.path.basename(path), words_from_txt(path)

    def xml_files():
        # (basename, lazy word iterator) for every Inno OCR XML output
        for path in sorted(glob.glob('ocr-inno/*.xml')):
            yield os.path.basename(path), words_from_xml(path)

    file_stats(txt_files(), dictionary, output_fname='ocr-hlsl.csv')
    file_stats(xml_files(), dictionary, output_fname='ocr-inno.csv')