# corpora-parse.py
import gzip
import json
from typing import Dict
import meta
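# NOTE: `meta` is a local configuration module that is not included in this
# file. A minimal sketch of the interface this script assumes, with purely
# illustrative values (the project's real settings will differ):
#
#     # meta.py (assumed)
#     languages = ["en", "de", "hi"]        # language codes to process
#     corpora_dir = "corpora"               # holds one <language>.txt.gz per language
#     output_dir = "results"                # wordlist-*.txt and meta-*.txt are written here
#     data = {"en": {"source": "enwiki"}}   # per-language metadata; "source" is copied
#                                           # into meta-<language>.txt as "corpus"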
for language in meta.languages:
    wordlist: Dict[str, int] = {}
    print("Reading " + language)
    errorcount = 0
    tokencount = 0
    corpusfile = meta.corpora_dir + "/" + language + ".txt.gz"
    linecount = 0
    try:
        # Read the corpus as bytes; each word is decoded individually below.
        fh = gzip.open(corpusfile, "rb")
    except Exception:
        print("Can't read {}".format(corpusfile))
        continue
    for line in fh:
        linecount += 1
        if linecount % 100000 == 0:
            print("{:,} articles processed".format(linecount))
        # Replace punctuation bytes with spaces so that split() yields bare word forms.
        for c in [
            # ASCII punctuation characters, except for & ' - /
            b"!", b"\"", b"#", b"$", b"%", b"(", b")", b"*", b"+",
            b",", b".", b":", b";", b"<", b"=", b">", b"?", b"@",
            b"[", b"\\", b"]", b"^", b"_", b"`", b"{", b"|", b"}", b"~",
            # Various Unicode quotation marks - “ ” „ ‟
            b"\xe2\x80\x9c", b"\xe2\x80\x9d", b"\xe2\x80\x9e", b"\xe2\x80\x9f",
            # Devanagari danda and double danda
            b"\xe0\xa5\xa4", b"\xe0\xa5\xa5",
        ]:
            line = line.replace(c, b" ")
        words = line.split()
        for word in words:
            tokencount += 1
            try:
                word = word.decode("utf-8")
            except UnicodeDecodeError:
                errorcount += 1
                continue
            if word in ["", "NEWLINE"]:
                continue
            if word.isdigit():
                continue
            word = word.lower()
            if word not in wordlist:
                wordlist[word] = 0
            wordlist[word] += 1
    fh.close()
    # Write the frequency list, most frequent form first.
    output = open(meta.output_dir + "/wordlist-" + language + ".txt", "w")
    tencount = 0
    tentokencount = 0
    for word, count in sorted(wordlist.items(), reverse=True, key=lambda x: x[1]):
        if count > 10:
            tencount += 1
            tentokencount += count
        output.write(word + " " + str(count) + "\n")
    output.close()
    with open(meta.output_dir + "/meta-" + language + ".txt", "w") as output:
        json.dump({
            "corpus": meta.data[language]["source"],
            "numberOfFormsInWiki": len(wordlist),
            "numberOfFormsInWikiTen": tencount,
            "numberOfTokens": tokencount,
            "numberOfTokensTen": tentokencount,
            "unicodeErrors": errorcount
        }, output, indent=4)
    print(
        "Read {} with {:,} different word forms,"
        " {:,} with more than 10 occurrences in {:,} words ({:,} errors)"
        .format(language, len(wordlist), tencount, tokencount, errorcount)
    )
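# Per-language output (file names follow directly from the code above):
#
#     wordlist-<language>.txt   one "form count" pair per line, sorted by descending count
#     meta-<language>.txt       JSON summary with the keys corpus, numberOfFormsInWiki,
#                               numberOfFormsInWikiTen, numberOfTokens,
#                               numberOfTokensTen, unicodeErrors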