-
Notifications
You must be signed in to change notification settings - Fork 0
/
autocorrect.py
167 lines (125 loc) · 3.84 KB
/
autocorrect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import minEditDist as med
import pandas as pd
from collections import defaultdict
import numpy as np
import time
def main():
    """Run the autocorrect demo on a hard-coded sentence and print timing.

    Reads the trigram counts and the word list from the user's home
    directory, prints the sentence returned by ``process``, and then prints
    the elapsed wall-clock time in seconds.
    """
    t0 = time.time()
    sentence = "This class is thouhght prrovoking and chalengin"

    # datasets
    trigrams = pd.read_csv(
        "~/w3_.txt",
        encoding="ISO-8859-1",
        sep='\t',
        names=["count", "first", "second", "third"],
    )
    # The word list has one word per line and no header row.
    # header=None keeps the first word as data instead of consuming it as
    # the column name; na_filter=False keeps words such as "null" as text
    # rather than turning them into NaN. Passing sep='\n' is rejected by
    # current pandas, so the default separator is used instead.
    allwords = pd.read_csv(
        "~/english-words/google-10000-english-usa.txt",
        header=None,
        names=["word"],
        na_filter=False,
    )
    print("built dataframes")

    print(process(sentence, allwords, trigrams))
    t1 = time.time()
    print('time elapsed: ', t1 - t0)
def process(sentence, allwords, trigrams, accuracy=0.95):
    """Return ``sentence`` with every word from the third onward autocorrected.

    The first two words are copied through unchanged because they serve as
    the trigram context for the third word. For each later word, candidate
    spellings are collected with ``getCandidateWords`` and scored as
    ``log(likelihood) + log(prior)``:

    * likelihood of a candidate at edit distance ``d`` is ``1/d`` normalised
      by the total returned from ``calcTotalSize``;
    * the word exactly as typed (distance 0) gets likelihood ``accuracy``;
    * the prior comes from ``calcPrior`` using the two preceding words.

    The highest-scoring spelling is kept. Progress and debug information is
    printed to stdout.

    Args:
        sentence: Whitespace-separated text to correct.
        allwords: DataFrame whose first column holds the known words.
        trigrams: DataFrame of trigram counts as expected by ``calcPrior``.
        accuracy: Probability that a typed dictionary word was intended.

    Returns:
        ``sentence`` itself when it has fewer than three words; otherwise the
        string built by ``output`` from the chosen words.
    """
    words = sentence.split()
    # work on this after college: sentences too short for a trigram context
    # are returned untouched.
    if len(words) < 3:
        return sentence

    potential = words[:2]

    # loop thru sentence (starting at the third word)
    for index in range(2, len(words)):
        word = words[index]
        # index is 0-based, so the 1-based position is index + 1.
        print("word #", index + 1, ":", word)

        possibles = getCandidateWords(word, allwords, upper_bound=3)
        print("got candidates")
        print(possibles)
        total_size_L = calcTotalSize(possibles)  # this is a log

        # Re-key the candidates by the log-likelihood of their edit distance.
        probabilities_med = defaultdict(list)
        for key in possibles:
            if key != 0:
                probabilities_med[np.log(1/key) - total_size_L] = possibles[key]

        # NOTE(review): the context uses the words as typed, not the
        # corrected ones already placed in ``potential`` - confirm intent.
        prev_two = (words[index - 2], words[index - 1])

        # Fall back to the typed word so it is never dropped from the output
        # when no candidate is found at all.
        guess = word
        prevProb = float('-inf')
        allGuesses = defaultdict(list)

        # log of prob of the typed word
        if 0 in possibles:
            prior_of_typed = calcPrior(word, prev_two, trigrams)
            prevProb = np.log(accuracy) + prior_of_typed
            allGuesses[prevProb].append(word)

        # pick the most likely word of the candidates
        for likelihood in probabilities_med:
            for candidate in probabilities_med[likelihood]:
                prior = calcPrior(candidate, prev_two, trigrams)
                prob = likelihood + prior
                # append so candidates with equal scores are all recorded
                allGuesses[prob].append(candidate)
                if prob > prevProb:
                    guess = candidate
                    prevProb = prob

        potential.append(guess)
        print(allGuesses)

    return output(potential)
def calcPrior(word, prev_two, trigrams, unseen_prob=0.0001):
    """Return the log-probability of ``word`` following the pair ``prev_two``.

    The probability is estimated by counting: the summed ``count`` of rows
    matching all three words divided by the summed ``count`` of rows matching
    the first two words.

    Args:
        word: Candidate third word of the trigram.
        prev_two: Pair ``(first, second)`` of preceding words.
        trigrams: DataFrame with columns ``count``, ``first``, ``second`` and
            ``third``.
        unseen_prob: Probability assigned when the trigram has no recorded
            occurrences.

    Returns:
        Natural log of the estimated probability, or ``log(unseen_prob)``
        when the trigram count is zero.
    """
    # all 3-grams starting with the first two words
    context = trigrams[
        (trigrams['first'] == prev_two[0]) & (trigrams['second'] == prev_two[1])
    ]
    # Sum the count column explicitly rather than calling int() on the
    # Series returned by DataFrame.sum().
    prev_two_count = int(context['count'].sum())
    # count of our specific 3-gram
    three_gram_count = int(context.loc[context['third'] == word, 'count'].sum())

    # A non-zero trigram count implies a non-zero context count, so the
    # division below cannot divide by zero.
    if three_gram_count != 0:
        return np.log(three_gram_count / prev_two_count)
    return np.log(unseen_prob)
#def getProbOfMED(key, total_size_L):
#return np.log(1/key) - total_size_L
def calcTotalSize(possibles):
    """Return the log of the total inverse-distance weight of all candidates.

    ``possibles`` maps an edit distance to the list of words at that
    distance. Each word contributes ``1/distance``; entries at distance 0
    are left out.
    """
    weight_sum = 0
    for distance, words in possibles.items():
        if distance == 0:
            # exact matches are scored separately by the caller
            continue
        weight_sum += len(words) / distance
    return np.log(weight_sum)
def getCandidateWords(word, df, upper_bound=4):
    """Group the words of ``df`` by their edit distance from ``word``.

    The first column of every row in ``df`` is converted to ``str`` and
    compared with ``word`` through ``calcMinEditDist``. Words for which that
    helper returns -1, or whose distance is above ``upper_bound``, are
    skipped.

    Returns:
        A ``defaultdict(list)`` mapping distance -> list of words.
    """
    by_distance = defaultdict(list)
    for record in df.itertuples(index=False):
        candidate = str(record[0])
        distance = calcMinEditDist(word, candidate)
        if distance == -1:
            # helper rejected the candidate outright
            continue
        if distance <= upper_bound:
            by_distance[distance].append(candidate)
    return by_distance
def calcMinEditDist(w: str, c: str):
    """Return the edit distance between ``w`` and ``c``, or -1 if not comparable.

    The distance is computed by ``med.MED`` only when the length of ``w`` is
    between half and twice the length of ``c``; otherwise the candidate is
    considered too short or too long and -1 is returned.

    Args:
        w: The typed word.
        c: The candidate word.

    Returns:
        The value of ``med.MED(w, c)``, 0 when both strings are empty, or
        -1 when the lengths are too far apart.
    """
    if not w or not c:
        # Guard the length ratio below against division by zero when the
        # candidate is empty; two empty strings are identical.
        return 0 if w == c else -1
    if 0.5 <= len(w) / len(c) <= 2:
        return med.MED(w, c)
    # too short/long
    return -1
def weighAdjKeys(w: str):
    """Placeholder with no implementation yet; always returns ``None``."""
    return None
#def formatWord(word):
def output(l):
    """Concatenate the words in ``l``, each followed by a single space.

    The result therefore ends with a trailing space whenever ``l`` is
    non-empty, and is the empty string for an empty list.
    """
    return ''.join(token + ' ' for token in l)
# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()