-
Notifications
You must be signed in to change notification settings - Fork 0
/
german.py
52 lines (35 loc) · 1.02 KB
/
german.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#coding: utf-8
import io
import re
import pickle
def parse_corpus(fname, encoding='cp1252'):
    """Read a word list from *fname*, one word per line.

    Leading and trailing whitespace is stripped from every line, and lines
    that are empty after stripping are skipped.

    Args:
        fname: Path of the dictionary file to read.
        encoding: Text encoding used to decode the file (default ``cp1252``).

    Returns:
        A list of the non-empty words, in file order.
    """
    # Open the file the caller asked for rather than a hard-coded path.
    with io.open(fname, 'r', encoding=encoding) as f:
        stripped = (line.strip() for line in f)
        return [word for word in stripped if word]
def save_corpus(corpus):
    """Pickle *corpus* into ``corpus.pickle`` in the current working directory.

    The file is overwritten if it already exists, and the highest pickle
    protocol available to the running interpreter is used.
    """
    with io.open('corpus.pickle', 'wb') as sink:
        pickle.dump(corpus, sink, protocol=pickle.HIGHEST_PROTOCOL)
def load_corpus():
    """Return the object stored in ``corpus.pickle`` in the working directory.

    NOTE: unpickling can execute arbitrary code, so only load a
    ``corpus.pickle`` that comes from a trusted source.
    """
    with io.open('corpus.pickle', 'rb') as source:
        restored = pickle.load(source)
    return restored
def search_corpus(corpus, regex):
    """Yield each word in *corpus* that *regex* matches, ignoring case.

    Matching uses ``match`` semantics: the pattern is anchored at the start
    of the word, but not at the end unless the pattern itself says so
    (for example with a trailing ``$``).

    Args:
        corpus: Iterable of words (strings) to test.
        regex: Regular-expression pattern string.

    Yields:
        The matching words, in corpus order.
    """
    # Compile once up front instead of handing the pattern and flags to
    # re.match for every single word.
    matches = re.compile(regex, re.IGNORECASE).match
    for word in corpus:
        if matches(word):
            yield word
# Corpus build step: parse the dictionary and pickle the result.
# The guard compares against '__main__x', a value __name__ does not take when
# the file is run as a script, so this block is skipped in normal use.
# NOTE(review): presumably a manual on/off toggle - confirm before removing.
if __name__ == '__main__x':
    corpus = parse_corpus('data/german.dic')
    word_count = len(corpus)
    print('loaded corpus of', word_count, 'words')
    save_corpus(corpus)
# Script entry point: load the pickled corpus and print every matching word.
if __name__ == '__main__':
    corpus = load_corpus()
    print('unpickled corpus of', len(corpus), 'words')
    # Pattern: two or three arbitrary characters, then m or n, g, f or t,
    # and a final "en" at the end of the word.
    hits = search_corpus(corpus, r'...?(m|n)g(f|t)en$')
    for hit in hits:
        print(hit)
'''
Solgen:
'''