
Merge pull request #1 from rutum/rm
Rm
Rutu Mulkar-Mehta committed Jun 22, 2015
2 parents ac9beec + 765d16c commit 34afd95
Showing 1 changed file with 96 additions and 24 deletions.
120 changes: 96 additions & 24 deletions gensim/models/word2vec.py
@@ -164,15 +164,14 @@ def train_sentence_cbow(model, sentence, alpha, work=None, neu1=None):
def train_sg_pair(model, word, word2, alpha, labels, train_w1=True, train_w2=True):
    l1 = model.syn0[word2.index]
    neu1e = zeros(l1.shape)

    if model.hs:
        # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance)
        l2a = deepcopy(model.syn1[word.point])  # 2d matrix, codelen x layer1_size
        fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
        ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
-       if train_w1:
        model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output
        neu1e += dot(ga, l2a)  # save error

    if model.negative:
        # use this word (label = 1) + `negative` other random words not from this sentence (label = 0)
@@ -181,27 +180,25 @@ def train_sg_pair(model, word, word2, alpha, labels, train_w1=True, train_w2=True):
            w = model.table[random.randint(model.table.shape[0])]
            if w != word.index:
                word_indices.append(w)
        l2b = model.syn1neg[word_indices]  # 2d matrix, k+1 x layer1_size
        fb = 1. / (1. + exp(-dot(l1, l2b.T)))  # propagate hidden -> output
        gb = (labels - fb) * alpha  # vector of error gradients multiplied by the learning rate
-       if train_w1:
        model.syn1neg[word_indices] += outer(gb, l1)  # learn hidden -> output
        neu1e += dot(gb, l2b)  # save error

-   if train_w2:
+   if train_w2 and model.syn0lock[word2.index] == 1:
        model.syn0[word2.index] += neu1e  # learn input -> hidden
    return neu1e

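The `model.syn0lock[word2.index] == 1` test introduced above is what freezes previously trained words: every vocabulary index carries a flag (1 = still trainable, 0 = frozen), and the accumulated error `neu1e` is folded back into the input vector only when the flag is 1. A minimal standalone sketch of that gating, with toy sizes and hypothetical values (not part of the commit):

import numpy as np

layer1_size = 4
syn0 = np.zeros((3, layer1_size))      # toy input vectors for a 3-word vocabulary
syn0lock = [0, 1, 1]                   # word 0 is frozen, words 1 and 2 may still move
neu1e = np.full(layer1_size, 0.01)     # pretend error accumulated for one (word, context) pair

for word_index in range(3):
    if syn0lock[word_index] == 1:      # same test as in train_sg_pair above
        syn0[word_index] += neu1e      # the frozen row (index 0) is left untouched

print(syn0[0], syn0[1])                # [0. 0. 0. 0.]  vs.  [0.01 0.01 0.01 0.01]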

def train_cbow_pair(model, word, word2_indices, l1, alpha, labels, train_w1=True, train_w2=True):
    neu1e = zeros(l1.shape)

    if model.hs:
        l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
        fa = 1. / (1. + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
        ga = (1. - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
-       if train_w1:
        model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output
        neu1e += dot(ga, l2a)  # save error

    if model.negative:
        # use this word (label = 1) + `negative` other random words not from this sentence (label = 0)
@@ -210,14 +207,14 @@ def train_cbow_pair(model, word, word2_indices, l1, alpha, labels, train_w1=True, train_w2=True):
            w = model.table[random.randint(model.table.shape[0])]
            if w != word.index:
                word_indices.append(w)
        l2b = model.syn1neg[word_indices]  # 2d matrix, k+1 x layer1_size
        fb = 1. / (1. + exp(-dot(l1, l2b.T)))  # propagate hidden -> output
        gb = (labels - fb) * alpha  # vector of error gradients multiplied by the learning rate
-       if train_w1:
        model.syn1neg[word_indices] += outer(gb, l1)  # learn hidden -> output
        neu1e += dot(gb, l2b)  # save error

-   if train_w2:
-       model.syn0[word2_indices] += neu1e  # learn input -> hidden, here for all words in the window separately
+   if train_w2:
+       # learn input -> hidden, here for all words in the window separately,
+       # but only for window words whose syn0lock flag is still 1 (unlocked)
+       unlocked = [idx for idx in word2_indices if model.syn0lock[idx] == 1]
+       model.syn0[unlocked] += neu1e
    return neu1e

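For reference, both the hierarchical-softmax and the negative-sampling branches above take a single stochastic-gradient step. In the code's own symbols (with \(\sigma\) the logistic function, \(\alpha\) the learning rate, and the label equal to \(1 - \mathrm{code}\) for hierarchical softmax or the 1/0 `labels` vector for negative sampling), the update is roughly

\[
f = \sigma(\ell_1 L_2^{\top}), \qquad
g = (\mathrm{label} - f)\,\alpha, \qquad
L_2 \leftarrow L_2 + g^{\top} \ell_1, \qquad
\mathrm{neu1e} \leftarrow \mathrm{neu1e} + g\,L_2,
\]

where \(\ell_1\) is the input vector `l1`, \(L_2\) stands for the selected rows of `syn1` or `syn1neg` (as read before the update), and `neu1e` is the error that the `syn0lock` test then applies only to input vectors that are still unlocked.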

@@ -290,6 +287,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
        `iter` = number of iterations (epochs) over the corpus.
        """
+       self.syn0lock = []  # per-word lock flags: 1 = trainable, 0 = frozen (filled in build_vocab / update_vocab)
        self.vocab = {}  # mapping from a word (string) to a Vocab object
        self.index2word = []  # map from a word's matrix index (int) to word (string)
        self.sg = int(sg)
@@ -388,6 +386,33 @@ def precalc_sampling(self):
            prob = (sqrt(v.count / threshold_count) + 1) * (threshold_count / v.count) if self.sample else 1.0
            v.sample_probability = min(prob, 1.0)

+   def update_vocab(self, sentences):
+       logger.info("Adding new words and their counts")
+       vocab = self._vocab_from(sentences)
+
+       # lock all previously seen words (syn0lock = 0) so their vectors are no longer updated
+       for id in xrange(0, len(self.index2word)):
+           self.syn0lock[id] = 0
+
+       # add new words to the vocabulary, unlocked for training
+       for word, v in iteritems(vocab):
+           if v.count >= self.min_count and word not in self.vocab:
+               v.index = len(self.vocab)
+               self.index2word.append(word)
+               self.vocab[word] = v
+               self.syn0lock.append(1)
+       logger.info("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))
+
+       if self.hs:
+           # add info about each word's Huffman encoding
+           self.create_binary_tree()
+       if self.negative:
+           # build the table for drawing random words (for negative sampling)
+           self.make_table()
+       self.precalc_sampling()
+       self.update_weights()

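Together with `build_vocab`, this enables training in rounds: build and train on an initial corpus, then fold in new text without disturbing the already-learned vectors. A hedged usage sketch (the toy corpora and sizes are hypothetical; `train` is called with its defaults):

from gensim.models.word2vec import Word2Vec  # assuming this fork is on the path

old_sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
new_sentences = [["graph", "minors", "survey"], ["graph", "trees"]]

model = Word2Vec(size=50, min_count=1)   # no corpus passed, so nothing is trained at construction
model.build_vocab(old_sentences)         # initial vocabulary; every word starts unlocked (syn0lock == 1)
model.train(old_sentences)

model.update_vocab(new_sentences)        # lock the old words, append and unlock the new ones
model.train(new_sentences)               # only unlocked (new) words receive syn0 updates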

    def build_vocab(self, sentences):
        """
        Build vocabulary from a sequence of sentences (can be a once-only generator stream).
@@ -403,6 +428,8 @@ def build_vocab(self, sentences):
                v.index = len(self.vocab)
                self.index2word.append(word)
                self.vocab[word] = v
+               # All the words are unlocked for initial training
+               self.syn0lock.append(1)
        logger.info("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))

        if self.hs:
@@ -515,6 +542,38 @@ def worker_train():
        self.syn0norm = None
        return word_count[0]

+   def update_weights(self):
+       """
+       Initialize weights for the newly added vocabulary words, and leave the
+       previously trained weights unchanged.
+       """
+       logger.info("resetting new vocab weights")
+       newsyn0 = empty((len(self.vocab), self.layer1_size), dtype=REAL)
+
+       # copy the weights that are already learned
+       for i in xrange(0, len(self.syn0)):
+           newsyn0[i] = self.syn0[i]
+
+       # randomize the remaining (new) words
+       for i in xrange(len(self.syn0), len(newsyn0)):
+           random.seed(uint32(self.hashfxn(self.index2word[i] + str(self.seed))))
+           newsyn0[i] = (random.rand(self.layer1_size) - 0.5) / self.layer1_size
+
+       self.syn0 = newsyn0
+
+       # is the following correct? (new syn1/syn1neg rows start at zero, which is also how reset_weights initializes them)
+       if self.hs:
+           oldsyn1 = self.syn1
+           self.syn1 = zeros((len(self.vocab), self.layer1_size), dtype=REAL)
+           for i in xrange(0, len(oldsyn1)):
+               self.syn1[i] = oldsyn1[i]
+       if self.negative:
+           oldneg = self.syn1neg
+           self.syn1neg = zeros((len(self.vocab), self.layer1_size), dtype=REAL)
+           for i in xrange(0, len(oldneg)):
+               self.syn1neg[i] = oldneg[i]
+       self.syn0norm = None

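A small sanity sketch of the grow-and-preserve pattern used by `update_weights` (plain NumPy, toy sizes, not part of the commit): rows for previously trained words are copied over unchanged and only the appended rows are initialized:

import numpy as np

layer1_size, old_n, new_n = 4, 2, 3                  # 2 trained words, 1 newly added word
syn0 = np.random.rand(old_n, layer1_size)            # stands in for the already-trained vectors

new_syn0 = np.empty((new_n, layer1_size))
new_syn0[:old_n] = syn0                              # keep the learned rows as they are
new_syn0[old_n:] = (np.random.rand(new_n - old_n, layer1_size) - 0.5) / layer1_size  # small random init

assert np.allclose(new_syn0[:old_n], syn0)           # previously trained vectors are untouched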
    def reset_weights(self):
        """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
        logger.info("resetting layer weights")
@@ -634,7 +693,7 @@ def most_similar(self, positive=[], negative=[], topn=10):
        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given words and the vectors for each word in the model.
        The method corresponds to the `word-analogy` and `distance` scripts in the
        original word2vec implementation.

        If topn is False, most_similar returns the vector of similarity scores.

        Example::
@@ -918,6 +977,19 @@ def save(self, *args, **kwargs):
        super(Word2Vec, self).save(*args, **kwargs)
    save.__doc__ = utils.SaveLoad.save.__doc__

+class Sentences(object):
+   """Iterate over a plain-text file that holds one whitespace-separated sentence per line."""
+
+   def __init__(self, filename):
+       self.filename = filename
+
+   def __iter__(self):
+       for line in utils.smart_open(self.filename):
+           line = utils.to_unicode(line)
+           words = [token.lower() for token in line.split()]  # lowercase, split on whitespace
+           if not words:  # don't bother sending out empty sentences
+               continue
+           yield words

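A hedged usage sketch of the `Sentences` helper above (the path is hypothetical; the file is expected to contain one whitespace-separated sentence per line):

from gensim.models.word2vec import Sentences, Word2Vec  # assuming this fork is installed

sentences = Sentences('/tmp/corpus.txt')             # hypothetical file
model = Word2Vec(sentences, size=100, min_count=5)   # builds the vocabulary and trains in one pass
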
class BrownCorpus(object):
"""Iterate over sentences from the Brown corpus (part of NLTK data)."""
