-
Notifications
You must be signed in to change notification settings - Fork 10
/
run.py
105 lines (80 loc) · 2.58 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import zipfile
import logging
import pickle
import torch
from glove import GloVeModel
from tools import SpacyTokenizer, Dictionary
# Configure root logging once for the whole script: timestamp, level, message.
logging.basicConfig(
format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# --- Data and artifact locations -------------------------------------------
# Raw corpus archive; passed to preprocess(), which currently does not read it.
FILE_PATH = './data/text8.zip'
# Destination of the trained model's state dict (name keeps its original
# spelling because other code refers to it).
MODLE_PATH = './model/glove.pt'
# Pickled, already-tokenized document that preprocess() loads.
DOC_PATH = './data/corpus.pickle'
# Destination of the pickled co-occurrence matrix.
COMATRIX_PATH = './data/comat.pickle'
# spaCy model name; only referenced by the commented-out tokenizer setup in
# the visible code.
LANG = 'en_core_web_sm'
# --- Hyperparameters --------------------------------------------------------
# Passed to GloVeModel(...) as its first two arguments.
EMBEDDING_SIZE = 128
CONTEXT_SIZE = 3
# Passed to model.train(...) as the epoch count.
NUM_EPOCH = 100
# NOTE(review): not referenced anywhere in the visible code (and the name is
# a misspelling of BATCH_SIZE) - confirm whether training should receive it.
BATHC_SIZE = 512
# Passed to model.train(...) as `learning_rate`.
LEARNING_RATE = 0.01
def read_data(file_path, type='file'):
    """ Read data into a string

    Args:
        file_path (str): path for the data file
        type (str): 'file' to read a plain UTF-8 text file, or 'zip' to read
            the first member of a zip archive and decode it as UTF-8

    Returns:
        text (str): content of the file, or None when `type` is neither
            'file' nor 'zip'
    """
    # Compare by value, not identity: `is` on strings only works by accident
    # of interning and triggers a SyntaxWarning on Python 3.8+.
    if type == 'file':
        with open(file_path, mode='r', encoding='utf-8') as fp:
            return fp.read()

    if type == 'zip':
        with zipfile.ZipFile(file_path) as archive:
            # Only the first member of the archive is read.
            first_member = archive.namelist()[0]
            return archive.read(first_member).decode('utf-8')

    # Unknown type: keep the original contract of returning None.
    return None
def preprocess(file_path):
    """ Build the indexed corpus and vocabulary size from the cached tokens

    The tokenized document is loaded from the pickle at DOC_PATH rather than
    being produced here. That pickle is expected to have been generated
    beforehand by reading the raw text (read_data on the zip archive),
    tokenizing it with SpacyTokenizer(LANG) and pickling the result to
    DOC_PATH.

    Args:
        file_path (str): raw file path; accepted for interface compatibility
            but not read by the current implementation
    Returns:
        corpus (list): list of idx words
        vocab_size (int): vocabulary size
    """
    # init base model
    vocab = Dictionary()

    # load the previously tokenized document
    with open(DOC_PATH, 'rb') as cached_doc:
        tokens = pickle.load(cached_doc)

    # register every token in the dictionary
    vocab.update(tokens)
    logging.info("after generate dictionary")

    # map the document through the dictionary, then report the vocabulary size
    indexed_corpus = vocab.corpus(tokens)
    return indexed_corpus, vocab.vocab_size
def train_glove_model():
    """ Train a GloVe model on the preprocessed corpus and save the results

    Steps: build the indexed corpus, count co-occurrences, pickle the
    co-occurrence matrix to COMATRIX_PATH, train for NUM_EPOCH epochs with
    LEARNING_RATE, then save the model's state dict to MODLE_PATH.
    """
    import os  # local import so this block does not depend on file-level edits

    # preprocess
    corpus, vocab_size = preprocess(FILE_PATH)

    # Create the output directories up front: otherwise a missing folder is
    # only discovered when writing, i.e. after the whole training run.
    for output_path in (COMATRIX_PATH, MODLE_PATH):
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # specify device type
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # init vector model
    logging.info("init model hyperparameter")
    model = GloVeModel(EMBEDDING_SIZE, CONTEXT_SIZE, vocab_size)
    model.to(device)

    # fit corpus to count the co-occurrence matrix
    model.fit(corpus)
    cooccurance_matrix = model.get_coocurrance_matrix()

    # save the co-occurrence matrix
    with open(COMATRIX_PATH, mode='wb') as fp:
        pickle.dump(cooccurance_matrix, fp)

    model.train(NUM_EPOCH, device, learning_rate=LEARNING_RATE)

    # save model for evaluation
    torch.save(model.state_dict(), MODLE_PATH)
# Script entry point: train only when executed directly, not when imported.
if __name__ == '__main__':
    train_glove_model()