-
Notifications
You must be signed in to change notification settings - Fork 0
/
first_word.py
135 lines (110 loc) · 4.23 KB
/
first_word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from collections import defaultdict
from queue import PriorityQueue
def get_five_letter_words():
filename = "five_letter_words.txt"
with open(filename) as f:
words = f.read().splitlines()
return words
# Approach 1
# in: words, a list of all 5-letter words in English
# out: a dictionary mapping from each letter (a, b, c) to the number of times that letter appears across all words
def letter_histogram(words):
histogram = defaultdict(int)
for word in words:
for letter in word:
histogram[letter] += 1
return histogram
def score_word_by_letter_frequency(word, histogram):
score = 0
for letter in word:
score += histogram[letter]
# Multiply score by -1 so that higher frequencies come first in PriorityQueue
return score * -1
def rank_words_by_letter_frequency(words, histogram):
ranked_words = PriorityQueue()
for word in words:
score = score_word_by_letter_frequency(word, histogram)
ranked_words.put((score, word))
return ranked_words
def find_best_words_by_letter_frequency():
words = get_five_letter_words()
histogram = letter_histogram(words)
ranked_words = rank_words_by_letter_frequency(words, histogram)
return ranked_words
# Approach 2
def letter_frequency_histogram(words):
# This histogram has entries like
# ('a', 1) => the number words that have at least 1 a
# ('a', 2) => the number of words that have at least 2 a's
histogram = defaultdict(int)
for word in words:
# This histogram is a simple tally of the number of times each letter appears in the word.
# For the word 'esses':
# 'e' => 2
# 's' => 3
word_histogram = defaultdict(int)
for letter in word:
word_histogram[letter] += 1
count_so_far = word_histogram[letter]
histogram[(letter, count_so_far)] += 1
return histogram
def score_word_by_letter_in_word_frequency(word, histogram):
score = 0
# Kind of redundant - once again keeping track of the histogram per word
word_histogram = defaultdict(int)
for letter in word:
word_histogram[letter] += 1
count_so_far = word_histogram[letter]
score += histogram[(letter, count_so_far)]
# Multiply score by -1 so that higher frequencies come first in PriorityQueue
return score * -1
def rank_words_by_letter_in_word_frequency(words, histogram):
ranked_words = PriorityQueue()
for word in words:
score = score_word_by_letter_in_word_frequency(word, histogram)
ranked_words.put((score, word))
return ranked_words
def find_best_words_by_letter_in_word_frequency():
words = get_five_letter_words()
histogram = letter_frequency_histogram(words)
ranked_words = rank_words_by_letter_in_word_frequency(words, histogram)
return ranked_words
# Approach 3
def letter_position_histogram(words):
histogram = defaultdict(int)
for word in words:
for (index, letter) in enumerate(word):
histogram[(index, letter)] += 1
return histogram
def score_word_by_letter_position_frequency(word, histogram):
score = 0
for (index, letter) in enumerate(word):
score += histogram[(index, letter)]
# Multiply score by -1 so that higher frequencies come first in PriorityQueue
return score * -1
def rank_words_by_letter_position_frequency(words, histogram):
ranked_words = PriorityQueue()
for word in words:
score = score_word_by_letter_position_frequency(word, histogram)
ranked_words.put((score, word))
return ranked_words
def find_best_words_by_letter_position_frequency():
words = get_five_letter_words()
histogram = letter_position_histogram(words)
ranked_words = rank_words_by_letter_position_frequency(words, histogram)
return ranked_words
def main():
print('Approach 1: Find best words by simple letter frequency')
ranked_words = find_best_words_by_letter_frequency()
for i in range(0,10):
print(ranked_words.get())
print('Approach 2: Find best words by letter frequency, adjusting for letter repetition')
ranked_words = find_best_words_by_letter_in_word_frequency()
for i in range(0, 10):
print(ranked_words.get())
print('Approach 3: Find best words by letter-position frequency')
ranked_words = find_best_words_by_letter_position_frequency()
for i in range(0,10):
print(ranked_words.get())
if __name__ == "__main__":
main()