-
Notifications
You must be signed in to change notification settings - Fork 126
/
vocab.py
46 lines (39 loc) · 1.08 KB
/
vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""
Constructing and loading dictionaries
"""
import cPickle as pkl
import numpy
from collections import OrderedDict
def build_dictionary(text):
    """
    Build a word-to-index dictionary ordered by descending frequency.

    text: list of sentences (pre-tokenized); each sentence is a string
          that is split on whitespace

    Returns (worddict, wordcount):
      worddict:  OrderedDict mapping word -> integer id, inserted from the
                 most frequent word to the least frequent. Ids start at 2;
                 0 and 1 are left free for <eos> and <unk>. The relative
                 order of words with equal counts is whatever
                 numpy.argsort produces.
      wordcount: OrderedDict mapping word -> number of occurrences, in
                 order of first appearance.
    """
    wordcount = OrderedDict()
    for sentence in text:
        for w in sentence.split():
            wordcount[w] = wordcount.get(w, 0) + 1

    # Materialize keys/values as real lists: on Python 3 they are dict
    # views, which cannot be indexed and are not array-like for numpy.
    words = list(wordcount.keys())
    freqs = list(wordcount.values())
    sorted_idx = numpy.argsort(freqs)[::-1]  # highest count first

    worddict = OrderedDict()
    for idx, sidx in enumerate(sorted_idx):
        worddict[words[sidx]] = idx + 2  # 0: <eos>, 1: <unk>
    return worddict, wordcount
def load_dictionary(loc='/ais/gobi3/u/rkiros/bookgen/book_dictionary_large.pkl'):
    """
    Read a pickled dictionary from disk and return it.

    loc: path of the pickle file to open (binary mode)

    Only the first object stored in the file is unpickled; anything
    pickled after it is ignored.
    """
    # NOTE: unpickling can execute arbitrary code, so only point this at
    # files from a trusted source.
    with open(loc, 'rb') as handle:
        return pkl.load(handle)
def save_dictionary(worddict, wordcount, loc):
    """
    Pickle a dictionary and its word counts into a single file.

    worddict:  object written first
    wordcount: object written second, directly after worddict
    loc:       path of the output file (opened in binary write mode,
               so an existing file is overwritten)
    """
    with open(loc, 'wb') as handle:
        # Two consecutive pickles in one file; readers must call load
        # twice to get both back.
        for obj in (worddict, wordcount):
            pkl.dump(obj, handle)