From e9f86ba739abc513ffddf9d827544944d774b567 Mon Sep 17 00:00:00 2001
From: Rutu Mulkar-Mehta
Date: Mon, 15 Jun 2015 09:58:07 -0700
Subject: [PATCH 1/3] recovering lost work

Squashed commit messages:

- updating bug in sentences iterator
- vector freeze after each training iteration
- cbow update
- clean up code
- update summarization tutorial image
- resolving merge conflicts
- update_weights function change
- updating the numpy copy code
---
 gensim/models/doc2vec.py     |   2 +-
 gensim/models/word2vec.py    | 122 ++++++++++++++++++++++++++---------
 gensim/test/test_word2vec.py |  16 +++++
 3 files changed, 108 insertions(+), 32 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 960cd96c49..b8afede209 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -628,7 +628,7 @@ def reset_from(self, other_model):
         self.docvecs.borrow_from(other_model.docvecs)
         super(Doc2Vec, self).reset_from(other_model)
 
-    def scan_vocab(self, documents, progress_per=10000, trim_rule=None):
+    def scan_vocab(self, documents, update=False, progress_per=10000, trim_rule=None):
         logger.info("collecting all words and their counts")
         document_no = -1
         total_words = 0
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 3fd96fa33c..e409b03f26 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -506,14 +506,14 @@ class Word2Vec(utils.SaveLoad):
-    def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000):
+    def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False):
         """
         Build vocabulary from a sequence of sentences (can be a once-only generator stream).
         Each sentence must be a list of unicode strings.
 
         """
-        self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule)  # initial survey
-        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule)  # trim by min_count & precalculate downsampling
-        self.finalize_vocab()  # build tables & arrays
+        self.scan_vocab(sentences, update, progress_per=progress_per, trim_rule=trim_rule)  # initial survey
+        self.scale_vocab(update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule)  # trim by min_count & precalculate downsampling
+        self.finalize_vocab(update)  # build tables & arrays
 
-    def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
+    def scan_vocab(self, sentences, update, progress_per=10000, trim_rule=None):
         """Do an initial scan of all words appearing in sentences."""
         logger.info("collecting all words and their counts")
         sentence_no = -1
@@ -527,9 +527,10 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
             for word in sentence:
                 vocab[word] += 1
 
-            if self.max_vocab_size and len(vocab) > self.max_vocab_size:
-                total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
-                min_reduce += 1
+            if not update:
+                if self.max_vocab_size and len(vocab) > self.max_vocab_size:
+                    total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
+                    min_reduce += 1
 
         total_words += sum(itervalues(vocab))
         logger.info("collected %i word types from a corpus of %i raw words and %i sentences",
@@ -537,7 +538,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         self.corpus_count = sentence_no + 1
         self.raw_vocab = vocab
 
-    def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None):
+    def scale_vocab(self, update, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None):
         """
         Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample`
         (controlling the downsampling of more-frequent words).
@@ -553,28 +554,51 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None):
         """
         min_count = min_count or self.min_count
         sample = sample or self.sample
-
-        # Discard words less-frequent than min_count
-        if not dry_run:
-            self.index2word = []
-            # make stored settings match these applied settings
-            self.min_count = min_count
-            self.sample = sample
-            self.vocab = {}
         drop_unique, drop_total, retain_total, original_total = 0, 0, 0, 0
         retain_words = []
-        for word, v in iteritems(self.raw_vocab):
-            if keep_vocab_item(word, v, min_count, trim_rule=trim_rule):
-                retain_words.append(word)
-                retain_total += v
-                original_total += v
-                if not dry_run:
-                    self.vocab[word] = Vocab(count=v, index=len(self.index2word))
-                    self.index2word.append(word)
-            else:
-                drop_unique += 1
-                drop_total += v
-                original_total += v
+
+        if not update:
+            logger.info("Loading a fresh vocabulary")
+            # Discard words less-frequent than min_count
+            if not dry_run:
+                self.index2word = []
+                # make stored settings match these applied settings
+                self.min_count = min_count
+                self.sample = sample
+                self.vocab = {}
+
+            for word, v in iteritems(self.raw_vocab):
+                if keep_vocab_item(word, v, min_count, trim_rule=trim_rule):
+                    retain_words.append(word)
+                    retain_total += v
+                    original_total += v
+                    if not dry_run:
+                        self.vocab[word] = Vocab(count=v,
+                                                 index=len(self.index2word))
+                        self.index2word.append(word)
+                else:
+                    drop_unique += 1
+                    drop_total += v
+                    original_total += v
+        else:
+            logger.info("Updating model with new vocabulary")
+            for word, v in iteritems(self.raw_vocab):
+                # add only words that do not already exist in the vocab
+                if word not in self.vocab:
+                    if keep_vocab_item(word, v, min_count,
+                                       trim_rule=trim_rule):
+                        retain_words.append(word)
+                        retain_total += v
+                        original_total += v
+                        if not dry_run:
+                            self.vocab[word] = Vocab(count=v,
+                                                     index=len(self.index2word))
+                            self.index2word.append(word)
+                    else:
+                        drop_unique += 1
+                        drop_total += v
+                        original_total += v
+
         logger.info("min_count=%d retains %i unique words (drops %i)",
                     min_count, len(retain_words), drop_unique)
         logger.info("min_count leaves %i word corpus (%i%% of original %i)",
@@ -621,10 +645,10 @@ def scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None):
 
         return report_values
 
-    def finalize_vocab(self):
+    def finalize_vocab(self, update):
         """Build tables and model weights based on final vocabulary settings."""
         if not self.index2word:
-            self.scale_vocab()
+            self.scale_vocab(False)
         if self.sorted_vocab:
             self.sort_vocab()
         if self.hs:
@@ -641,7 +665,10 @@ def finalize_vocab(self):
                 self.index2word.append(word)
                 self.vocab[word] = v
         # set initial input/projection and hidden weights
-        self.reset_weights()
+        if not update:
+            self.reset_weights()
+        else:
+            self.update_weights()
 
     def sort_vocab(self):
         """Sort the vocabulary so the most frequent words have the lowest indexes."""
@@ -983,6 +1010,39 @@ def worker_loop():
     def clear_sims(self):
         self.syn0norm = None
 
+    def update_weights(self):
+        """
+        Copy all the existing weights, and reset the weights for the newly
+        added vocabulary.
+        """
+        logger.info("updating layer weights")
+        newsyn0 = empty((len(self.vocab), self.vector_size), dtype=REAL)
+
+        # copy the weights that are already learned
+        newsyn0[:len(self.syn0)] = self.syn0
+
+        # randomize only the vectors for the newly added words
+        for i in xrange(len(self.syn0), len(newsyn0)):
+            # construct deterministic seed from word AND seed argument
+            newsyn0[i] = self.seeded_vector(self.index2word[i] + str(self.seed))
+        self.syn0 = newsyn0
+
+        if self.hs:
+            # keep the learned hidden-layer weights, zero the rows for new words
+            oldsyn1 = self.syn1
+            self.syn1 = zeros((len(self.vocab), self.layer1_size), dtype=REAL)
+            self.syn1[:len(oldsyn1)] = oldsyn1
+
+        if self.negative:
+            oldneg = self.syn1neg
+            self.syn1neg = zeros((len(self.vocab), self.layer1_size), dtype=REAL)
+            self.syn1neg[:len(oldneg)] = oldneg
+
+        self.syn0norm = None
+
+        # do not suppress learning for already learned words
+        self.syn0_lockf = ones(len(self.vocab), dtype=REAL)  # zeros suppress learning
+
     def reset_weights(self):
         """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
         logger.info("resetting layer weights")
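The update_weights hunk above is the heart of the series: grow syn0 without losing trained rows, then zero-extend syn1/syn1neg to match. Stripped of gensim internals, the pattern reduces to the following numpy idiom. This is a standalone illustration, not gensim API; grow_rows and the seed_vector callable are hypothetical stand-ins for the method and Word2Vec.seeded_vector.

    import numpy as np

    def grow_rows(old, new_size, seed_vector):
        """Keep the learned rows of `old`; initialize only the newly added rows."""
        grown = np.empty((new_size, old.shape[1]), dtype=old.dtype)
        grown[:len(old)] = old                    # trained vectors survive untouched
        for i in range(len(old), new_size):
            grown[i] = seed_vector(i)             # deterministic init for new words
        return grown

    old = np.ones((12, 100), dtype=np.float32)    # 12 trained word vectors
    new = grow_rows(old, 14, lambda i: np.random.rand(100).astype(np.float32))
    assert new.shape == (14, 100) and np.all(new[:12] == old)

The same keep-then-extend move is applied three times in the hunk: seeded rows for syn0, zeroed rows for syn1 and syn1neg, mirroring how reset_weights initializes each matrix.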
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 0f1c58208a..92b682c48d 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -47,6 +47,11 @@ def __iter__(self):
     ['graph', 'minors', 'survey']
 ]
 
+new_sentences = [
+    ['computer', 'artificial', 'intelligence'],
+    ['artificial', 'trees']
+]
+
 def testfile():
     # temporary data will be stored to this file
     return os.path.join(tempfile.gettempdir(), 'gensim_word2vec.tst')
@@ -91,6 +96,17 @@ def testLambdaRule(self):
         model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=rule)
         self.assertTrue("human" not in model.vocab)
 
+class TestWord2VecModel(unittest.TestCase):
+    def testOnlineLearning(self):
+        """Test that the algorithm is able to add new words to the
+        vocabulary and to a trained model"""
+        model = word2vec.Word2Vec(sentences, min_count=1)
+        model.build_vocab(new_sentences, update=True)
+        model.train(new_sentences)
+        self.assertEqual(len(model.vocab), 14)
+        self.assertEqual(model.syn0.shape[0], 14)
+        self.assertEqual(model.syn0.shape[1], 100)
+
     def testPersistenceWord2VecFormat(self):
         """Test storing/loading the entire model in word2vec format."""
         model = word2vec.Word2Vec(sentences, min_count=1)
From be1a0f06852e617e764b10f58ebdd0d45a13db4f Mon Sep 17 00:00:00 2001
From: Zachary Deane-Mayer
Date: Tue, 23 Feb 2016 09:53:00 -0500
Subject: [PATCH 2/3] fix test?

---
 gensim/test/test_word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 92b682c48d..cb54e4cd62 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -100,7 +100,7 @@ class TestWord2VecModel(unittest.TestCase):
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the
         vocabulary and to a trained model"""
-        model = word2vec.Word2Vec(sentences, min_count=1)
+        model = word2vec.Word2Vec(sentences, min_count=1, sorted_vocab=0)
         model.build_vocab(new_sentences, update=True)
         model.train(new_sentences)
         self.assertEqual(len(model.vocab), 14)

From 08480182b0bd7640827958deb792705280982055 Mon Sep 17 00:00:00 2001
From: Zachary Deane-Mayer
Date: Tue, 23 Feb 2016 09:55:03 -0500
Subject: [PATCH 3/3] don't sort vocab when updating

---
 gensim/models/word2vec.py    |  2 +-
 gensim/test/test_word2vec.py | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index e409b03f26..7f8823fba2 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -649,7 +649,7 @@ def finalize_vocab(self, update):
         """Build tables and model weights based on final vocabulary settings."""
         if not self.index2word:
             self.scale_vocab(False)
-        if self.sorted_vocab:
+        if self.sorted_vocab and not update:
             self.sort_vocab()
         if self.hs:
             # add info about each word's Huffman encoding
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index cb54e4cd62..06619f27f1 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -107,6 +107,16 @@ def testOnlineLearning(self):
         self.assertEqual(model.syn0.shape[0], 14)
         self.assertEqual(model.syn0.shape[1], 100)
 
+    def testOnlineLearningWithSortedVocab(self):
+        """Test that the algorithm is able to add new words to the
+        vocabulary and to a trained model when using a sorted vocabulary"""
+        model = word2vec.Word2Vec(sentences, min_count=0)
+        model.build_vocab(new_sentences, update=True)
+        model.train(new_sentences)
+        self.assertEqual(len(model.vocab), 14)
+        self.assertEqual(model.syn0.shape[0], 14)
+        self.assertEqual(model.syn0.shape[1], 100)
+
     def testPersistenceWord2VecFormat(self):
         """Test storing/loading the entire model in word2vec format."""
         model = word2vec.Word2Vec(sentences, min_count=1)
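Patch 3 matters because sorting is only safe before weights are allocated: sort_vocab reorders index2word and reassigns each word's index, so running it after update_weights has already laid out syn0 rows would silently mismatch words and vectors. A toy illustration of the hazard (standalone numpy, not gensim code):

    import numpy as np

    index2word = ['system', 'graph', 'artificial']   # row i of syn0 belongs to index2word[i]
    syn0 = np.array([[1., 1.], [2., 2.], [3., 3.]])

    # re-sorting the word list without permuting the rows breaks the pairing
    index2word_sorted = sorted(index2word)           # gensim actually sorts by frequency
    row = index2word_sorted.index('system')          # now 2, but 'system' trained in row 0
    print(syn0[row])                                 # [3. 3.] -- another word's vector

Skipping the sort during an update keeps existing indexes stable, at the cost that newly added words are appended at the end rather than interleaved by frequency.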