vocab.py
# Copyright 2018 Stanford University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file contains a function to read the GloVe vectors from file,
and return them as an embedding matrix"""
from __future__ import absolute_import
from __future__ import division
from tqdm import tqdm
import numpy as np
_PAD = b"<pad>"
_UNK = b"<unk>"
_START_VOCAB = [_PAD, _UNK]
PAD_ID = 0
UNK_ID = 1
def get_glove(glove_path, glove_dim):
    """Reads from the original GloVe .txt file and returns the embedding matrix
    and mappings from words to word ids.

    Input:
      glove_path: path to glove.6B.{glove_dim}d.txt
      glove_dim: integer; needs to match the dimension in glove_path

    Returns:
      emb_matrix: Numpy array of shape (400002, glove_dim) containing the GloVe embeddings
        (plus PAD and UNK embeddings in the first two rows).
        The rows of emb_matrix correspond to the word ids given in word2id and id2word.
      word2id: dictionary mapping word (string) to word id (int)
      id2word: dictionary mapping word id (int) to word (string)
    """
print "Loading GLoVE vectors from file: %s" % glove_path
    vocab_size = int(4e5)  # this is the vocab size of the corpus we've downloaded
    emb_matrix = np.zeros((vocab_size + len(_START_VOCAB), glove_dim))
    word2id = {}
    id2word = {}

    random_init = True
    # randomly initialize the special tokens
    if random_init:
        emb_matrix[:len(_START_VOCAB), :] = np.random.randn(len(_START_VOCAB), glove_dim)

    # put start tokens in the dictionaries
    idx = 0
    for word in _START_VOCAB:
        word2id[word] = idx
        id2word[idx] = word
        idx += 1

    # go through glove vecs
    with open(glove_path, 'r') as fh:
        for line in tqdm(fh, total=vocab_size):
            line = line.lstrip().rstrip().split(" ")
            word = line[0]
            vector = list(map(float, line[1:]))
            if glove_dim != len(vector):
                raise Exception("You set --glove_path=%s but --embedding_size=%i. If you set --glove_path yourself then make sure that --embedding_size matches!" % (glove_path, glove_dim))
            emb_matrix[idx, :] = vector
            word2id[word] = idx
            id2word[idx] = word
            idx += 1

    final_vocab_size = vocab_size + len(_START_VOCAB)
    assert len(word2id) == final_vocab_size
    assert len(id2word) == final_vocab_size
    assert idx == final_vocab_size

    return emb_matrix, word2id, id2word
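

# A minimal usage sketch, assuming a local copy of glove.6B.100d.txt; the
# filename and dimension here are illustrative and must match the file you
# actually downloaded.
if __name__ == "__main__":
    emb_matrix, word2id, id2word = get_glove("glove.6B.100d.txt", 100)

    # The special tokens occupy the first two rows of emb_matrix.
    assert word2id[_PAD] == PAD_ID
    assert word2id[_UNK] == UNK_ID

    # Look up a word id, falling back to UNK for out-of-vocabulary words.
    word_id = word2id.get("the", UNK_ID)
    print("embedding for 'the' has shape %s" % (emb_matrix[word_id].shape,))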