From e2750f1f3ae45f628c534606f67a1cc1b6d0eb74 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 3 Feb 2019 18:32:22 +1100 Subject: [PATCH 01/10] WIP --- gensim/models/fasttext.py | 27 ++-- gensim/models/keyedvectors.py | 292 +++++++++++++++++++++++----------- gensim/test/test_fasttext.py | 14 ++ 3 files changed, 232 insertions(+), 101 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 712ad93a0c..0cc17a3fb5 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -690,31 +690,33 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.epochs) """ - if update: - if not len(self.wv.vocab): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " - "First build the vocabulary of your model with a corpus " - "before doing an online update.") + if not update: + self.wv.init_ngrams_weights(self.trainables.seed) + elif not len(self.wv.vocab): + raise RuntimeError( + "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " + "First build the vocabulary of your model with a corpus " + "before doing an online update.") + else: self.vocabulary.old_vocab_len = len(self.wv.vocab) - self.trainables.old_hash2index_len = len(self.wv.hash2index) - return super(FastText, self).build_vocab( + retval = super(FastText, self).build_vocab( sentences=sentences, corpus_file=corpus_file, update=update, progress_per=progress_per, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) + if update: + self.wv.update_ngrams_weights(self.trainables.seed, self.vocabulary.old_vocab_len) + def _set_train_params(self, **kwargs): # # We need the wv.buckets_word member to be initialized in order to # continue training. The _clear_post_train method destroys this # variable, so we reinitialize it here, if needed. # - # The .old_vocab_len and .old_hash2index_len members are set only to - # keep the init_ngrams_weights method happy. + # The .old_vocab_len member is set only to keep the init_ngrams_weights method happy. # if self.wv.buckets_word is None: self.vocabulary.old_vocab_len = len(self.wv.vocab) - self.trainables.old_hash2index_len = len(self.wv.hash2index) self.trainables.init_ngrams_weights(self.wv, update=True, vocabulary=self.vocabulary) def _clear_post_train(self): @@ -1068,6 +1070,9 @@ def load(cls, *args, **kwargs): """ try: model = super(FastText, cls).load(*args, **kwargs) + if hasattr(model.wv, 'hash2index'): + gensim.models.keyedvectors._rollback_optimization(model.wv) + if not hasattr(model.trainables, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'): model.trainables.vectors_vocab_lockf = ones(model.wv.vectors_vocab.shape, dtype=REAL) if not hasattr(model.trainables, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'): diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 1428503c8a..17cbf55e31 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1964,14 +1964,6 @@ class FastTextKeyedVectors(WordEmbeddingsKeyedVectors): replace=True. buckets_word : dict Maps vocabulary items (by their index) to the buckets they occur in. - hash2index : dict - Maps bucket numbers to an index within vectors_ngrams. So, given an - ngram, you can get its vector by determining its bucket, mapping the - bucket to an index, and then indexing into vectors_ngrams (in other - words, vectors_ngrams[hash2index[hash_fn(ngram) % bucket]]. - num_ngram_vectors : int - The number of vectors that correspond to ngrams, as opposed to terms - (full words). """ def __init__(self, vector_size, min_n, max_n, bucket, compatible_hash): @@ -1981,11 +1973,9 @@ def __init__(self, vector_size, min_n, max_n, bucket, compatible_hash): self.vectors_ngrams = None self.vectors_ngrams_norm = None self.buckets_word = None - self.hash2index = {} self.min_n = min_n self.max_n = max_n self.bucket = bucket - self.num_ngram_vectors = 0 self.compatible_hash = compatible_hash @classmethod @@ -2030,12 +2020,32 @@ def __contains__(self, word): bool True if `word` or any character ngrams in `word` are present in the vocabulary, False otherwise. + Note + ---- + This method **always** returns True, because of the way FastText works. + + If you want to check if a word is an in-vocabulary term, use this instead: + + .. pycon: + + >>> from gensim.test.utils import datapath + >>> from gensim.models import FastText + >>> cap_path = datapath("crime-and-punishment.bin") + >>> model = FastText.load_fasttext_format(cap_path, full_model=False) + >>> 'steamtrain' in model.wv.vocab # If False, is an OOV term + False + """ - if word in self.vocab: - return True - else: - hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash) - return any(h in self.hash2index for h in hashes) + # + # FIXME: what's the right behavior here? + # + # If we maintain the previous behavior of this method (check for word + # _and_ ngram presence) then this method will never return False. + # This is because there will _always_ be a vector for any ngram. + # However, there is no guarantee that it will be "learned" - it may + # be still at its initial "unlearned" value. + # + return True def save(self, *args, **kwargs): """Save object. @@ -2052,8 +2062,14 @@ def save(self, *args, **kwargs): """ # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get( - 'ignore', ['vectors_norm', 'vectors_vocab_norm', 'vectors_ngrams_norm', 'buckets_word']) + ignore_attrs = [ + 'vectors_norm', + 'vectors_vocab_norm', + 'vectors_ngrams_norm', + 'buckets_word', + 'hash2index', + ] + kwargs['ignore'] = kwargs.get('ignore', ignore_attrs) super(FastTextKeyedVectors, self).save(*args, **kwargs) def word_vec(self, word, use_norm=False): @@ -2087,15 +2103,18 @@ def word_vec(self, word, use_norm=False): ngram_weights = self.vectors_ngrams_norm else: ngram_weights = self.vectors_ngrams - ngrams_found = 0 - for ngram_hash in ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash): - if ngram_hash in self.hash2index: - word_vec += ngram_weights[self.hash2index[ngram_hash]] - ngrams_found += 1 - if word_vec.any(): - return word_vec / max(1, ngrams_found) - else: # No ngrams of the word are present in self.ngrams - raise KeyError('all ngrams for word %s absent from model' % word) + ngram_hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash) + for nh in ngram_hashes: + word_vec += ngram_weights[nh] + return word_vec / len(ngram_hashes) + + @classmethod + def load(cls, fname_or_handle, **kwargs): + kv = super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs) + if kv.hasattr('hash2index'): + _rollback_optimization(kv) + + return kv def init_sims(self, replace=False): """Precompute L2-normalized vectors. @@ -2140,41 +2159,69 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None) fname, self.vocab, self.vectors, fvocab=fvocab, binary=binary, total_vec=total_vec) def init_ngrams_weights(self, seed): - self.hash2index = {} - ngram_indices, self.buckets_word = _process_fasttext_vocab( + """Initialize the vocabulary and ngrams weights prior to training. + + Creates the weight matrices and initializes them with uniform random values. + + Parameters + ---------- + seed : float + The seed for the PRNG. + + Note + ---- + Call this **after** the vocabulary has been fully initialized. + + """ + self.buckets_word = _process_fasttext_vocab( self.vocab.items(), self.min_n, self.max_n, self.bucket, self.compatible_hash, - self.hash2index ) - self.num_ngram_vectors = len(ngram_indices) - logger.info("Total number of ngrams is %d", self.num_ngram_vectors) rand_obj = np.random rand_obj.seed(seed) lo, hi = -1.0 / self.vector_size, 1.0 / self.vector_size vocab_shape = (len(self.vocab), self.vector_size) - ngrams_shape = (len(ngram_indices), self.vector_size) + ngrams_shape = (self.bucket, self.vector_size) self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL) + + # + # We could have initialized vectors_ngrams at construction time, but we + # do it here for two reasons: + # + # 1. The constructor does not have access to the random seed + # 2. We want to use the same rand_obj to fill vectors_vocab _and_ + # vectors_ngrams, and vectors_vocab cannot happen at construction + # time because the vocab is not initialized at that stage. + # self.vectors_ngrams = rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL) def update_ngrams_weights(self, seed, old_vocab_len): - old_hash2index_len = len(self.hash2index) + """Update the vocabulary weights for training continuation. - new_ngram_hashes, self.buckets_word = _process_fasttext_vocab( + Parameters + ---------- + seed : float + The seed for the PRNG. + old_vocab_length : int + The length of the vocabulary prior to its update. + + Note + ---- + Call this **after** the vocabulary has been updated. + + """ + self.buckets_word = _process_fasttext_vocab( self.vocab.items(), self.min_n, self.max_n, self.bucket, self.compatible_hash, - self.hash2index ) - num_new_ngrams = len(new_ngram_hashes) - self.num_ngram_vectors += num_new_ngrams - logger.info("Number of new ngrams is %d", num_new_ngrams) rand_obj = np.random rand_obj.seed(seed) @@ -2182,10 +2229,7 @@ def update_ngrams_weights(self, seed, old_vocab_len): new_vocab = len(self.vocab) - old_vocab_len self.vectors_vocab = _pad_random(self.vectors_vocab, new_vocab, rand_obj) - new_ngrams = len(self.hash2index) - old_hash2index_len - self.vectors_ngrams = _pad_random(self.vectors_ngrams, new_ngrams, rand_obj) - - def init_post_load(self, vectors, match_gensim=False): + def init_post_load(self, vectors): """Perform initialization after loading a native Facebook model. Expects that the vocabulary (self.vocab) has already been initialized. @@ -2198,9 +2242,7 @@ def init_post_load(self, vectors, match_gensim=False): The order of the vectors must correspond to the indices in the vocabulary. match_gensim : boolean, optional - Match the behavior of gensim's FastText implementation and take a - subset of vectors_ngrams. This behavior appears to be incompatible - with Facebook's implementation. + No longer supported. """ vocab_words = len(self.vocab) @@ -2215,31 +2257,7 @@ def init_post_load(self, vectors, match_gensim=False): self.vectors = np.array(vectors[:vocab_words, :]) self.vectors_vocab = np.array(vectors[:vocab_words, :]) self.vectors_ngrams = np.array(vectors[vocab_words:, :]) - self.hash2index = {i: i for i in range(self.bucket)} self.buckets_word = None # This can get initialized later - self.num_ngram_vectors = self.bucket - - if match_gensim: - # - # This gives us the same shape for vectors_ngrams, and we can - # satisfy our unit tests when running gensim vs native comparisons, - # but because we're discarding some ngrams, the accuracy of the - # model suffers. - # - ngram_hashes, _ = _process_fasttext_vocab( - self.vocab.items(), - self.min_n, - self.max_n, - self.bucket, - self.compatible_hash, - dict(), # we don't care what goes here in this case - ) - ngram_hashes = sorted(set(ngram_hashes)) - - keep_indices = [self.hash2index[h] for h in self.hash2index if h in ngram_hashes] - self.num_ngram_vectors = len(keep_indices) - self.vectors_ngrams = self.vectors_ngrams.take(keep_indices, axis=0) - self.hash2index = {hsh: idx for (idx, hsh) in enumerate(ngram_hashes)} self.adjust_vectors() @@ -2257,12 +2275,17 @@ def adjust_vectors(self): word_vec = np.copy(self.vectors_vocab[v.index]) ngram_hashes = ft_ngram_hashes(w, self.min_n, self.max_n, self.bucket, self.compatible_hash) for nh in ngram_hashes: - word_vec += self.vectors_ngrams[self.hash2index[nh]] + word_vec += self.vectors_ngrams[nh] word_vec /= len(ngram_hashes) + 1 self.vectors[v.index] = word_vec + @property + @deprecated("Attribute will be removed in 4.0.0, use self.bucket instead") + def num_ngram_vectors(self): + return self.bucket + -def _process_fasttext_vocab(iterable, min_n, max_n, num_buckets, compatible_hash, hash2index): +def _process_fasttext_vocab(iterable, min_n, max_n, num_buckets, compatible_hash): """ Performs a common operation for FastText weight initialization and updates: scan the vocabulary, calculate ngrams and their hashes, keep @@ -2282,42 +2305,27 @@ def _process_fasttext_vocab(iterable, min_n, max_n, num_buckets, compatible_hash compatible_hash : boolean True for compatibility with the Facebook implementation. False for compatibility with the old Gensim implementation. - hash2index : dict - Updated in-place. Returns ------- - A tuple of two elements. - - word_indices : dict + dict Keys are indices of entities in the vocabulary (words). Values are arrays containing indices into vectors_ngrams for each ngram of the word. - new_ngram_hashes : list - A list of hashes for newly encountered ngrams. Each hash is modulo - num_buckets. """ - old_hash2index_len = len(hash2index) word_indices = {} - new_ngram_hashes = [] if num_buckets == 0: - return [], {v.index: np.array([], dtype=np.uint32) for w, v in iterable} + return {v.index: np.array([], dtype=np.uint32) for w, v in iterable} for word, vocab in iterable: wi = [] for ngram_hash in ft_ngram_hashes(word, min_n, max_n, num_buckets, compatible_hash): - if ngram_hash not in hash2index: - # - # This is a new ngram. Reserve a new index in hash2index. - # - hash2index[ngram_hash] = old_hash2index_len + len(new_ngram_hashes) - new_ngram_hashes.append(ngram_hash) - wi.append(hash2index[ngram_hash]) + wi.append(ngram_hash) word_indices[vocab.index] = np.array(wi, dtype=np.uint32) - return new_ngram_hashes, word_indices + return word_indices def _pad_random(m, new_rows, rand): @@ -2349,3 +2357,107 @@ def _l2_norm(m, replace=False): return m else: return (m / dist).astype(REAL) + + +def _rollback_optimization(kv): + """Undo the optimization that pruned buckets. + + This unfortunate optimization saves memory and CPU cycles, but breaks + compatibility with Facebook's model by introducing divergent behavior + for OOV words. + + """ + logger.warning( + "This older model was trained with a premature optimization. " + "The current Gensim version reverses this optimization during loading. " + "Save the model to a new file and reload to suppress this message." + ) + assert hasattr(kv, 'hash2index') + assert hasattr(kv, 'num_ngram_vectors') + + kv.vectors_ngrams = _unpack_copy(kv.vectors_ngrams, kv.bucket, kv.hash2index) + + # + # We have replaced num_ngram_vectors with a property and deprecated it. + # We can't delete it because the new attribute masks the member. + # + del kv.hash2index + + +def _unpack_copy(m, num_rows, hash2index, seed=1): + """Same as _unpack, but makes a copy of the matrix. + + Simpler implementation, but uses more RAM. + + """ + rows, columns = m.shape + if rows == num_rows: + # + # Nothing to do. + # + return m + assert num_rows > rows + + rand_obj = np.random + rand_obj.seed(seed) + + n = np.empty((0, columns), dtype=m.dtype) + n = _pad_random(n, num_rows, rand_obj) + + for src, dst in hash2index.items(): + n[src] = m[dst] + + return n + + +def _unpack(m, num_rows, seed, hash2index): + """Restore the array to its natural shape, undoing the optimization. + + Parameters + ---------- + + a : np.ndarray + The matrix to restore. + num_rows : int + The number of rows that this array should have. + seed : float + The seed for the PRNG. Will be used to initialize new rows. + hash2index : dict + the product of the optimization we are undoing. + + Returns + ------- + np.array + The unpacked matrix. + + Notes + ----- + + The unpacked matrix will reference some rows in the input matrix to save memory. + Throw away the old matrix after calling this function, or use np.copy. + + This implementation is a work in progress. + + """ + rows, columns = a.shape + if rows == num_rows: + # + # Nothing to do. + # + return a + assert num_rows > rows + + rand_obj = np.random + rand_obj.seed(seed) + + # + # Rows at the bottom of the matrix will be "free": initialized to random values. + # Rows at the top of the matrix will contain learned vectors. + # Move the learned vectors down into their "proper" positions, starting from the bottom. + # + m = _pad_random(m, num_rows - rows, rand_obj) + for src, dst in reversed(hash2index.items()): + assert src < dst + m[src], m[dst] = m[dst], m[src] + + return m diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 5437a9b3c8..d0dd778555 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -1186,6 +1186,20 @@ def test_out_of_vocab(self): self.assertRaises(KeyError, model.wv.word_vec, 'streamtrain') +class UnpackTest(unittest.TestCase): + def test_copy_sanity(self): + import gensim.models.keyedvectors + + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {10: 0, 11: 1, 12: 2} + + n = gensim.models.keyedvectors._unpack_copy(m, 25, hash2index) + self.assertTrue(np.all(m[0] == n[10])) + self.assertTrue(np.all(m[1] == n[11])) + self.assertTrue(np.all(m[2] == n[12])) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From ed52a5f4d25a0aa5924650dbcb0998cf15a0ef3d Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 3 Feb 2019 18:42:27 +1100 Subject: [PATCH 02/10] flake8 --- gensim/models/fasttext.py | 2 ++ gensim/models/keyedvectors.py | 17 ++++++----------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 0cc17a3fb5..5b72d531a5 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -707,6 +707,8 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p if update: self.wv.update_ngrams_weights(self.trainables.seed, self.vocabulary.old_vocab_len) + return retval + def _set_train_params(self, **kwargs): # # We need the wv.buckets_word member to be initialized in order to diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 17cbf55e31..6421e29d70 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1984,6 +1984,9 @@ def load(cls, fname_or_handle, **kwargs): if not hasattr(model, 'compatible_hash'): model.compatible_hash = False + if model.hasattr('hash2index'): + _rollback_optimization(model) + return model @property @@ -2108,14 +2111,6 @@ def word_vec(self, word, use_norm=False): word_vec += ngram_weights[nh] return word_vec / len(ngram_hashes) - @classmethod - def load(cls, fname_or_handle, **kwargs): - kv = super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs) - if kv.hasattr('hash2index'): - _rollback_optimization(kv) - - return kv - def init_sims(self, replace=False): """Precompute L2-normalized vectors. @@ -2416,7 +2411,7 @@ def _unpack(m, num_rows, seed, hash2index): Parameters ---------- - a : np.ndarray + m : np.ndarray The matrix to restore. num_rows : int The number of rows that this array should have. @@ -2439,12 +2434,12 @@ def _unpack(m, num_rows, seed, hash2index): This implementation is a work in progress. """ - rows, columns = a.shape + rows, columns = m.shape if rows == num_rows: # # Nothing to do. # - return a + return m assert num_rows > rows rand_obj = np.random From 8fd9f81c58badbedafbd4ac444924980d6b97e2a Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 3 Feb 2019 23:01:52 +1100 Subject: [PATCH 03/10] implement memory-efficient _unpack function --- gensim/models/keyedvectors.py | 50 ++++++++++++++++++++++++----------- gensim/test/test_fasttext.py | 24 +++++++++++++++++ 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 6421e29d70..de0efde925 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -2370,7 +2370,7 @@ def _rollback_optimization(kv): assert hasattr(kv, 'hash2index') assert hasattr(kv, 'num_ngram_vectors') - kv.vectors_ngrams = _unpack_copy(kv.vectors_ngrams, kv.bucket, kv.hash2index) + kv.vectors_ngrams = _unpack(kv.vectors_ngrams, kv.bucket, kv.hash2index) # # We have replaced num_ngram_vectors with a property and deprecated it. @@ -2405,9 +2405,19 @@ def _unpack_copy(m, num_rows, hash2index, seed=1): return n -def _unpack(m, num_rows, seed, hash2index): +def _unpack(m, num_rows, hash2index, seed=1): """Restore the array to its natural shape, undoing the optimization. + A packed matrix contains contiguous vectors for ngrams, as well as a hashmap. + The hash map maps the ngram hash to its index in the packed matrix. + To unpack the matrix, we need to do several things: + + 1. Restore the matrix to its "natural" shape, where the number of rows + equals the number of buckets. + 2. Rearrange the existing rows such that the hashmap becomes the identity + function and is thus redundant. + 3. Fill the new rows with random values. + Parameters ---------- @@ -2415,10 +2425,10 @@ def _unpack(m, num_rows, seed, hash2index): The matrix to restore. num_rows : int The number of rows that this array should have. - seed : float - The seed for the PRNG. Will be used to initialize new rows. hash2index : dict the product of the optimization we are undoing. + seed : float, optional + The seed for the PRNG. Will be used to initialize new rows. Returns ------- @@ -2431,28 +2441,38 @@ def _unpack(m, num_rows, seed, hash2index): The unpacked matrix will reference some rows in the input matrix to save memory. Throw away the old matrix after calling this function, or use np.copy. - This implementation is a work in progress. - """ - rows, columns = m.shape - if rows == num_rows: + orig_rows, orig_columns = m.shape + if orig_rows == num_rows: # # Nothing to do. # return m - assert num_rows > rows + assert num_rows > orig_rows rand_obj = np.random rand_obj.seed(seed) # + # Rows at the top of the matrix (the first orig_rows) will contain "packed" learned vectors. # Rows at the bottom of the matrix will be "free": initialized to random values. - # Rows at the top of the matrix will contain learned vectors. - # Move the learned vectors down into their "proper" positions, starting from the bottom. # - m = _pad_random(m, num_rows - rows, rand_obj) - for src, dst in reversed(hash2index.items()): - assert src < dst - m[src], m[dst] = m[dst], m[src] + m = _pad_random(m, num_rows - orig_rows, rand_obj) + + # + # Swap rows to transform hash2index into the identify function. + # There are two kinds of swaps. + # First, rearrange the rows that belong entirely within the original matrix dimensions. + # Second, swap out rows from the original matrix dimensions, replacing them with + # randomly initialized values. + # + # N.B. We only do the swap in one direction, because doing it in both directions + # nullifies the effect. + # + swap = {h: i for (h, i) in hash2index.items() if h < i < orig_rows} + swap.update({h: i for (h, i) in hash2index.items() if h >= orig_rows}) + for h, i in swap.items(): + assert h != i + m[[h, i]] = m[[i, h]] # swap rows i and h return m diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index d0dd778555..e6bcd7d121 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -1199,6 +1199,30 @@ def test_copy_sanity(self): self.assertTrue(np.all(m[1] == n[11])) self.assertTrue(np.all(m[2] == n[12])) + def test_sanity(self): + import gensim.models.keyedvectors + + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {10: 0, 11: 1, 12: 2} + + n = gensim.models.keyedvectors._unpack(m, 25, hash2index) + self.assertTrue(np.all(np.array([0, 1, 2]) == n[10])) + self.assertTrue(np.all(np.array([3, 4, 5]) == n[11])) + self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) + + def test_tricky(self): + import gensim.models.keyedvectors + + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {1: 0, 0: 1, 12: 2} + + n = gensim.models.keyedvectors._unpack(m, 25, hash2index) + self.assertTrue(np.all(np.array([3, 4, 5]) == n[0])) + self.assertTrue(np.all(np.array([0, 1, 2]) == n[1])) + self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From 1423298a9c556695ab4fe393aa3507b7652d63c3 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 3 Feb 2019 23:03:25 +1100 Subject: [PATCH 04/10] fixup --- gensim/models/keyedvectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index de0efde925..b52746bfe8 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1984,7 +1984,7 @@ def load(cls, fname_or_handle, **kwargs): if not hasattr(model, 'compatible_hash'): model.compatible_hash = False - if model.hasattr('hash2index'): + if hasattr(model, 'hash2index'): _rollback_optimization(model) return model From 0604de2505758dacf9e817854ae277cc5fc82c07 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Mon, 4 Feb 2019 14:14:16 +1100 Subject: [PATCH 05/10] move tests, add test_identity --- gensim/test/test_fasttext.py | 38 ----------------------------- gensim/test/test_keyedvectors.py | 42 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index e6bcd7d121..5437a9b3c8 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -1186,44 +1186,6 @@ def test_out_of_vocab(self): self.assertRaises(KeyError, model.wv.word_vec, 'streamtrain') -class UnpackTest(unittest.TestCase): - def test_copy_sanity(self): - import gensim.models.keyedvectors - - m = np.array(range(9)) - m.shape = (3, 3) - hash2index = {10: 0, 11: 1, 12: 2} - - n = gensim.models.keyedvectors._unpack_copy(m, 25, hash2index) - self.assertTrue(np.all(m[0] == n[10])) - self.assertTrue(np.all(m[1] == n[11])) - self.assertTrue(np.all(m[2] == n[12])) - - def test_sanity(self): - import gensim.models.keyedvectors - - m = np.array(range(9)) - m.shape = (3, 3) - hash2index = {10: 0, 11: 1, 12: 2} - - n = gensim.models.keyedvectors._unpack(m, 25, hash2index) - self.assertTrue(np.all(np.array([0, 1, 2]) == n[10])) - self.assertTrue(np.all(np.array([3, 4, 5]) == n[11])) - self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) - - def test_tricky(self): - import gensim.models.keyedvectors - - m = np.array(range(9)) - m.shape = (3, 3) - hash2index = {1: 0, 0: 1, 12: 2} - - n = gensim.models.keyedvectors._unpack(m, 25, hash2index) - self.assertTrue(np.all(np.array([3, 4, 5]) == n[0])) - self.assertTrue(np.all(np.array([0, 1, 2]) == n[1])) - self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) - - if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index 59e361cc6c..13a5b8c5a9 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -306,6 +306,48 @@ def test(self): self.assertTrue(np.allclose(m, norm)) +class UnpackTest(unittest.TestCase): + def test_copy_sanity(self): + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {10: 0, 11: 1, 12: 2} + + n = gensim.models.keyedvectors._unpack_copy(m, 25, hash2index) + self.assertTrue(np.all(m[0] == n[10])) + self.assertTrue(np.all(m[1] == n[11])) + self.assertTrue(np.all(m[2] == n[12])) + + def test_sanity(self): + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {10: 0, 11: 1, 12: 2} + + n = gensim.models.keyedvectors._unpack(m, 25, hash2index) + self.assertTrue(np.all(np.array([0, 1, 2]) == n[10])) + self.assertTrue(np.all(np.array([3, 4, 5]) == n[11])) + self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) + + def test_tricky(self): + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {1: 0, 0: 1, 12: 2} + + n = gensim.models.keyedvectors._unpack(m, 25, hash2index) + self.assertTrue(np.all(np.array([3, 4, 5]) == n[0])) + self.assertTrue(np.all(np.array([0, 1, 2]) == n[1])) + self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) + + def test_identity(self): + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {0: 0, 1: 1, 2: 2} + + n = gensim.models.keyedvectors._unpack(m, 25, hash2index) + self.assertTrue(np.all(np.array([0, 1, 2]) == n[0])) + self.assertTrue(np.all(np.array([3, 4, 5]) == n[1])) + self.assertTrue(np.all(np.array([6, 7, 8]) == n[2])) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From a52caba4f91adf7f47bed17629186a58b09ebd9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Thu, 7 Feb 2019 21:04:17 +1100 Subject: [PATCH 06/10] Update gensim/models/keyedvectors.py Co-Authored-By: mpenkov --- gensim/models/keyedvectors.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index b52746bfe8..ce680ccb2a 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -2363,7 +2363,9 @@ def _rollback_optimization(kv): """ logger.warning( - "This older model was trained with a premature optimization. " + "This saved FastText model was trained with an optimization we no longer support. " + "The current Gensim version automatically reverses this optimization during loading. " + "Save the loaded model to a new file and reload to suppress this message." "The current Gensim version reverses this optimization during loading. " "Save the model to a new file and reload to suppress this message." ) From 009db283bb0a4bcb2c71dce7c98bbf7e5a378e81 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 7 Feb 2019 21:08:37 +1100 Subject: [PATCH 07/10] review response: improve comment --- gensim/models/fasttext.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 5b72d531a5..fda40c4b0a 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -696,7 +696,9 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p raise RuntimeError( "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " "First build the vocabulary of your model with a corpus " - "before doing an online update.") + "by calling the :meth:`gensim.models.fasttext.FastText.build_vocab` method " + "before doing an online update." + ) else: self.vocabulary.old_vocab_len = len(self.wv.vocab) From 5a6eda041d604eb8ec2f1ed74571d661fd80d352 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Wed, 13 Feb 2019 13:34:07 +0900 Subject: [PATCH 08/10] review response: fix redundant log message --- gensim/models/keyedvectors.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index ce680ccb2a..8c450149ca 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -2366,8 +2366,6 @@ def _rollback_optimization(kv): "This saved FastText model was trained with an optimization we no longer support. " "The current Gensim version automatically reverses this optimization during loading. " "Save the loaded model to a new file and reload to suppress this message." - "The current Gensim version reverses this optimization during loading. " - "Save the model to a new file and reload to suppress this message." ) assert hasattr(kv, 'hash2index') assert hasattr(kv, 'num_ngram_vectors') From 26254017d6973ebcfa89ae5e27240a0f727afcaa Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Wed, 13 Feb 2019 13:35:00 +0900 Subject: [PATCH 09/10] review response: remove sphinx markup from log message --- gensim/models/fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index fda40c4b0a..7381d272a2 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -696,7 +696,7 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p raise RuntimeError( "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " "First build the vocabulary of your model with a corpus " - "by calling the :meth:`gensim.models.fasttext.FastText.build_vocab` method " + "by calling the gensim.models.fasttext.FastText.build_vocab method " "before doing an online update." ) else: From 1cb15fd3e8ad3d5d90bf2a9d7476e861c811d91e Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 7 Mar 2019 13:17:18 +0900 Subject: [PATCH 10/10] remove FIXME --- gensim/models/keyedvectors.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 8c450149ca..b4cf61abd6 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -2039,15 +2039,6 @@ def __contains__(self, word): False """ - # - # FIXME: what's the right behavior here? - # - # If we maintain the previous behavior of this method (check for word - # _and_ ngram presence) then this method will never return False. - # This is because there will _always_ be a vector for any ngram. - # However, there is no guarantee that it will be "learned" - it may - # be still at its initial "unlearned" value. - # return True def save(self, *args, **kwargs):