-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfreq_analizer.py
103 lines (91 loc) · 3.91 KB
/
freq_analizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import json
from bktree import BKTree
class PriorGenerator:
    """Collect word-bigram statistics from text files and persist them as JSON.

    Typical use: load stop symbols, call ``analize_freq`` on one or more
    corpus files, call ``finalize`` once, then ``serialize``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, zero_to_alpha):
        """Create an empty generator.

        Args:
            zero_to_alpha: Value returned by ``get_word_successor`` for a
                word pair that was never observed.
        """
        # NOTE: character-level frequency / successor tables (``freq`` and
        # ``successor``) used to be collected as well; that code is disabled.
        # word -> {following word -> count (probability after finalize())}
        self.word_successor = {}
        # Every distinct lower-cased word seen so far.
        self.dictionary = set()
        # Substrings that are replaced by a space before tokenising.
        self.stopSymbols = []
        self.zero_to_alpha = zero_to_alpha
        self.bk_tree = BKTree()

    def get_word_successor(self, w1, w2):
        """Return the stored value for ``w2`` following ``w1``.

        Returns ``self.zero_to_alpha`` when the pair was never recorded.
        """
        try:
            return self.word_successor[w1][w2]
        except KeyError:
            # Either w1 was never seen, or w2 never followed it.
            return self.zero_to_alpha

    def load_stop_symbols_from_file(self, path):
        """Append the stop symbols listed in ``path`` (one per line).

        Only the line terminator is stripped, so the last line is read
        correctly even without a trailing newline. Blank lines are ignored:
        an empty stop symbol would make ``str.replace`` insert a space
        between every character.
        """
        with open(path) as file:
            for raw_line in file:
                symbol = raw_line.rstrip("\n")
                if symbol:
                    self.stopSymbols.append(symbol)

    def remove_stop_symbols(self, line):
        """Return ``line`` with every stop symbol replaced by one space."""
        for symbol in self.stopSymbols:
            # Guard against an empty symbol appended directly to the list.
            if symbol:
                line = line.replace(symbol, " ")
        return line

    def analize_freq(self, path):
        """Update the dictionary and bigram counts from the text file ``path``.

        Each line is stripped of its newline, cleaned of stop symbols,
        lower-cased and split on whitespace. Bigrams never span two lines.
        """
        with open(path) as file:
            for raw_line in file:
                # Stop symbols are removed before lower-casing, so they are
                # matched case-sensitively against the original text.
                cleaned = self.remove_stop_symbols(raw_line.rstrip("\n"))
                # TODO: restrict to English letters only?
                # If the input is not only English, apostrophes and similar
                # characters must be taken into account as well.
                words = cleaned.lower().split()
                self.dictionary.update(words)
                for current, following in zip(words, words[1:]):
                    successors = self.word_successor.setdefault(current, {})
                    successors[following] = successors.get(following, 0) + 1

    def finalize(self):
        """Turn bigram counts into relative frequencies and fill the BK-tree.

        For every word, each successor count is divided in place by the sum
        of that word's successor counts. Every dictionary word is then passed
        to ``self.bk_tree.addWord``. Meant to be called after all
        ``analize_freq`` calls.
        """
        for successors in self.word_successor.values():
            total = sum(successors.values())
            for word in successors:
                successors[word] /= total
        for word in self.dictionary:
            self.bk_tree.addWord(word)

    def serialize(self, path):
        """Write the dictionary, successor table and BK-tree root to ``path``.

        The dictionary is written sorted so the output is deterministic.
        NOTE(review): assumes ``self.bk_tree.root`` is JSON-serialisable;
        confirm against the ``bktree`` module.
        """
        payload = {
            "Dictionary": sorted(self.dictionary),
            "Word_successor": self.word_successor,
            "BK_Tree": self.bk_tree.root,
        }
        with open(path, "w") as out_file:
            json.dump(payload, out_file)

    def deserialize(self, path):
        """Restore the state previously written by ``serialize``."""
        with open(path) as in_file:
            payload = json.load(in_file)
        # JSON has no set type; convert back so ``analize_freq`` can keep
        # adding words after a reload.
        self.dictionary = set(payload["Dictionary"])
        self.word_successor = payload["Word_successor"]
        self.bk_tree = BKTree()
        self.bk_tree.root = payload["BK_Tree"]