diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 712ad93a0c..7381d272a2 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -690,31 +690,37 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.epochs) """ - if update: - if not len(self.wv.vocab): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " - "First build the vocabulary of your model with a corpus " - "before doing an online update.") + if not update: + self.wv.init_ngrams_weights(self.trainables.seed) + elif not len(self.wv.vocab): + raise RuntimeError( + "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " + "First build the vocabulary of your model with a corpus " + "by calling the gensim.models.fasttext.FastText.build_vocab method " + "before doing an online update." + ) + else: self.vocabulary.old_vocab_len = len(self.wv.vocab) - self.trainables.old_hash2index_len = len(self.wv.hash2index) - return super(FastText, self).build_vocab( + retval = super(FastText, self).build_vocab( sentences=sentences, corpus_file=corpus_file, update=update, progress_per=progress_per, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) + if update: + self.wv.update_ngrams_weights(self.trainables.seed, self.vocabulary.old_vocab_len) + + return retval + def _set_train_params(self, **kwargs): # # We need the wv.buckets_word member to be initialized in order to # continue training. The _clear_post_train method destroys this # variable, so we reinitialize it here, if needed. # - # The .old_vocab_len and .old_hash2index_len members are set only to - # keep the init_ngrams_weights method happy. + # The .old_vocab_len member is set only to keep the init_ngrams_weights method happy. # if self.wv.buckets_word is None: self.vocabulary.old_vocab_len = len(self.wv.vocab) - self.trainables.old_hash2index_len = len(self.wv.hash2index) self.trainables.init_ngrams_weights(self.wv, update=True, vocabulary=self.vocabulary) def _clear_post_train(self): @@ -1068,6 +1074,9 @@ def load(cls, *args, **kwargs): """ try: model = super(FastText, cls).load(*args, **kwargs) + if hasattr(model.wv, 'hash2index'): + gensim.models.keyedvectors._rollback_optimization(model.wv) + if not hasattr(model.trainables, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'): model.trainables.vectors_vocab_lockf = ones(model.wv.vectors_vocab.shape, dtype=REAL) if not hasattr(model.trainables, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'): diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 1428503c8a..b4cf61abd6 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1964,14 +1964,6 @@ class FastTextKeyedVectors(WordEmbeddingsKeyedVectors): replace=True. buckets_word : dict Maps vocabulary items (by their index) to the buckets they occur in. - hash2index : dict - Maps bucket numbers to an index within vectors_ngrams. So, given an - ngram, you can get its vector by determining its bucket, mapping the - bucket to an index, and then indexing into vectors_ngrams (in other - words, vectors_ngrams[hash2index[hash_fn(ngram) % bucket]]. - num_ngram_vectors : int - The number of vectors that correspond to ngrams, as opposed to terms - (full words). 
""" def __init__(self, vector_size, min_n, max_n, bucket, compatible_hash): @@ -1981,11 +1973,9 @@ def __init__(self, vector_size, min_n, max_n, bucket, compatible_hash): self.vectors_ngrams = None self.vectors_ngrams_norm = None self.buckets_word = None - self.hash2index = {} self.min_n = min_n self.max_n = max_n self.bucket = bucket - self.num_ngram_vectors = 0 self.compatible_hash = compatible_hash @classmethod @@ -1994,6 +1984,9 @@ def load(cls, fname_or_handle, **kwargs): if not hasattr(model, 'compatible_hash'): model.compatible_hash = False + if hasattr(model, 'hash2index'): + _rollback_optimization(model) + return model @property @@ -2030,12 +2023,23 @@ def __contains__(self, word): bool True if `word` or any character ngrams in `word` are present in the vocabulary, False otherwise. + Note + ---- + This method **always** returns True, because of the way FastText works. + + If you want to check if a word is an in-vocabulary term, use this instead: + + .. pycon: + + >>> from gensim.test.utils import datapath + >>> from gensim.models import FastText + >>> cap_path = datapath("crime-and-punishment.bin") + >>> model = FastText.load_fasttext_format(cap_path, full_model=False) + >>> 'steamtrain' in model.wv.vocab # If False, is an OOV term + False + """ - if word in self.vocab: - return True - else: - hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash) - return any(h in self.hash2index for h in hashes) + return True def save(self, *args, **kwargs): """Save object. @@ -2052,8 +2056,14 @@ def save(self, *args, **kwargs): """ # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get( - 'ignore', ['vectors_norm', 'vectors_vocab_norm', 'vectors_ngrams_norm', 'buckets_word']) + ignore_attrs = [ + 'vectors_norm', + 'vectors_vocab_norm', + 'vectors_ngrams_norm', + 'buckets_word', + 'hash2index', + ] + kwargs['ignore'] = kwargs.get('ignore', ignore_attrs) super(FastTextKeyedVectors, self).save(*args, **kwargs) def word_vec(self, word, use_norm=False): @@ -2087,15 +2097,10 @@ def word_vec(self, word, use_norm=False): ngram_weights = self.vectors_ngrams_norm else: ngram_weights = self.vectors_ngrams - ngrams_found = 0 - for ngram_hash in ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash): - if ngram_hash in self.hash2index: - word_vec += ngram_weights[self.hash2index[ngram_hash]] - ngrams_found += 1 - if word_vec.any(): - return word_vec / max(1, ngrams_found) - else: # No ngrams of the word are present in self.ngrams - raise KeyError('all ngrams for word %s absent from model' % word) + ngram_hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash) + for nh in ngram_hashes: + word_vec += ngram_weights[nh] + return word_vec / len(ngram_hashes) def init_sims(self, replace=False): """Precompute L2-normalized vectors. @@ -2140,41 +2145,69 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None) fname, self.vocab, self.vectors, fvocab=fvocab, binary=binary, total_vec=total_vec) def init_ngrams_weights(self, seed): - self.hash2index = {} - ngram_indices, self.buckets_word = _process_fasttext_vocab( + """Initialize the vocabulary and ngrams weights prior to training. + + Creates the weight matrices and initializes them with uniform random values. + + Parameters + ---------- + seed : float + The seed for the PRNG. + + Note + ---- + Call this **after** the vocabulary has been fully initialized. 
+ + """ + self.buckets_word = _process_fasttext_vocab( self.vocab.items(), self.min_n, self.max_n, self.bucket, self.compatible_hash, - self.hash2index ) - self.num_ngram_vectors = len(ngram_indices) - logger.info("Total number of ngrams is %d", self.num_ngram_vectors) rand_obj = np.random rand_obj.seed(seed) lo, hi = -1.0 / self.vector_size, 1.0 / self.vector_size vocab_shape = (len(self.vocab), self.vector_size) - ngrams_shape = (len(ngram_indices), self.vector_size) + ngrams_shape = (self.bucket, self.vector_size) self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL) + + # + # We could have initialized vectors_ngrams at construction time, but we + # do it here for two reasons: + # + # 1. The constructor does not have access to the random seed + # 2. We want to use the same rand_obj to fill vectors_vocab _and_ + # vectors_ngrams, and vectors_vocab cannot happen at construction + # time because the vocab is not initialized at that stage. + # self.vectors_ngrams = rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL) def update_ngrams_weights(self, seed, old_vocab_len): - old_hash2index_len = len(self.hash2index) + """Update the vocabulary weights for training continuation. - new_ngram_hashes, self.buckets_word = _process_fasttext_vocab( + Parameters + ---------- + seed : float + The seed for the PRNG. + old_vocab_length : int + The length of the vocabulary prior to its update. + + Note + ---- + Call this **after** the vocabulary has been updated. + + """ + self.buckets_word = _process_fasttext_vocab( self.vocab.items(), self.min_n, self.max_n, self.bucket, self.compatible_hash, - self.hash2index ) - num_new_ngrams = len(new_ngram_hashes) - self.num_ngram_vectors += num_new_ngrams - logger.info("Number of new ngrams is %d", num_new_ngrams) rand_obj = np.random rand_obj.seed(seed) @@ -2182,10 +2215,7 @@ def update_ngrams_weights(self, seed, old_vocab_len): new_vocab = len(self.vocab) - old_vocab_len self.vectors_vocab = _pad_random(self.vectors_vocab, new_vocab, rand_obj) - new_ngrams = len(self.hash2index) - old_hash2index_len - self.vectors_ngrams = _pad_random(self.vectors_ngrams, new_ngrams, rand_obj) - - def init_post_load(self, vectors, match_gensim=False): + def init_post_load(self, vectors): """Perform initialization after loading a native Facebook model. Expects that the vocabulary (self.vocab) has already been initialized. @@ -2198,9 +2228,7 @@ def init_post_load(self, vectors, match_gensim=False): The order of the vectors must correspond to the indices in the vocabulary. match_gensim : boolean, optional - Match the behavior of gensim's FastText implementation and take a - subset of vectors_ngrams. This behavior appears to be incompatible - with Facebook's implementation. + No longer supported. """ vocab_words = len(self.vocab) @@ -2215,31 +2243,7 @@ def init_post_load(self, vectors, match_gensim=False): self.vectors = np.array(vectors[:vocab_words, :]) self.vectors_vocab = np.array(vectors[:vocab_words, :]) self.vectors_ngrams = np.array(vectors[vocab_words:, :]) - self.hash2index = {i: i for i in range(self.bucket)} self.buckets_word = None # This can get initialized later - self.num_ngram_vectors = self.bucket - - if match_gensim: - # - # This gives us the same shape for vectors_ngrams, and we can - # satisfy our unit tests when running gensim vs native comparisons, - # but because we're discarding some ngrams, the accuracy of the - # model suffers. 
- # - ngram_hashes, _ = _process_fasttext_vocab( - self.vocab.items(), - self.min_n, - self.max_n, - self.bucket, - self.compatible_hash, - dict(), # we don't care what goes here in this case - ) - ngram_hashes = sorted(set(ngram_hashes)) - - keep_indices = [self.hash2index[h] for h in self.hash2index if h in ngram_hashes] - self.num_ngram_vectors = len(keep_indices) - self.vectors_ngrams = self.vectors_ngrams.take(keep_indices, axis=0) - self.hash2index = {hsh: idx for (idx, hsh) in enumerate(ngram_hashes)} self.adjust_vectors() @@ -2257,12 +2261,17 @@ def adjust_vectors(self): word_vec = np.copy(self.vectors_vocab[v.index]) ngram_hashes = ft_ngram_hashes(w, self.min_n, self.max_n, self.bucket, self.compatible_hash) for nh in ngram_hashes: - word_vec += self.vectors_ngrams[self.hash2index[nh]] + word_vec += self.vectors_ngrams[nh] word_vec /= len(ngram_hashes) + 1 self.vectors[v.index] = word_vec + @property + @deprecated("Attribute will be removed in 4.0.0, use self.bucket instead") + def num_ngram_vectors(self): + return self.bucket -def _process_fasttext_vocab(iterable, min_n, max_n, num_buckets, compatible_hash, hash2index): + +def _process_fasttext_vocab(iterable, min_n, max_n, num_buckets, compatible_hash): """ Performs a common operation for FastText weight initialization and updates: scan the vocabulary, calculate ngrams and their hashes, keep @@ -2282,42 +2291,27 @@ def _process_fasttext_vocab(iterable, min_n, max_n, num_buckets, compatible_hash compatible_hash : boolean True for compatibility with the Facebook implementation. False for compatibility with the old Gensim implementation. - hash2index : dict - Updated in-place. Returns ------- - A tuple of two elements. - - word_indices : dict + dict Keys are indices of entities in the vocabulary (words). Values are arrays containing indices into vectors_ngrams for each ngram of the word. - new_ngram_hashes : list - A list of hashes for newly encountered ngrams. Each hash is modulo - num_buckets. """ - old_hash2index_len = len(hash2index) word_indices = {} - new_ngram_hashes = [] if num_buckets == 0: - return [], {v.index: np.array([], dtype=np.uint32) for w, v in iterable} + return {v.index: np.array([], dtype=np.uint32) for w, v in iterable} for word, vocab in iterable: wi = [] for ngram_hash in ft_ngram_hashes(word, min_n, max_n, num_buckets, compatible_hash): - if ngram_hash not in hash2index: - # - # This is a new ngram. Reserve a new index in hash2index. - # - hash2index[ngram_hash] = old_hash2index_len + len(new_ngram_hashes) - new_ngram_hashes.append(ngram_hash) - wi.append(hash2index[ngram_hash]) + wi.append(ngram_hash) word_indices[vocab.index] = np.array(wi, dtype=np.uint32) - return new_ngram_hashes, word_indices + return word_indices def _pad_random(m, new_rows, rand): @@ -2349,3 +2343,127 @@ def _l2_norm(m, replace=False): return m else: return (m / dist).astype(REAL) + + +def _rollback_optimization(kv): + """Undo the optimization that pruned buckets. + + This unfortunate optimization saves memory and CPU cycles, but breaks + compatibility with Facebook's model by introducing divergent behavior + for OOV words. + + """ + logger.warning( + "This saved FastText model was trained with an optimization we no longer support. " + "The current Gensim version automatically reverses this optimization during loading. " + "Save the loaded model to a new file and reload to suppress this message." 
+ ) + assert hasattr(kv, 'hash2index') + assert hasattr(kv, 'num_ngram_vectors') + + kv.vectors_ngrams = _unpack(kv.vectors_ngrams, kv.bucket, kv.hash2index) + + # + # We have replaced num_ngram_vectors with a property and deprecated it. + # We can't delete it because the new attribute masks the member. + # + del kv.hash2index + + +def _unpack_copy(m, num_rows, hash2index, seed=1): + """Same as _unpack, but makes a copy of the matrix. + + Simpler implementation, but uses more RAM. + + """ + rows, columns = m.shape + if rows == num_rows: + # + # Nothing to do. + # + return m + assert num_rows > rows + + rand_obj = np.random + rand_obj.seed(seed) + + n = np.empty((0, columns), dtype=m.dtype) + n = _pad_random(n, num_rows, rand_obj) + + for src, dst in hash2index.items(): + n[src] = m[dst] + + return n + + +def _unpack(m, num_rows, hash2index, seed=1): + """Restore the array to its natural shape, undoing the optimization. + + A packed matrix contains contiguous vectors for ngrams, as well as a hashmap. + The hash map maps the ngram hash to its index in the packed matrix. + To unpack the matrix, we need to do several things: + + 1. Restore the matrix to its "natural" shape, where the number of rows + equals the number of buckets. + 2. Rearrange the existing rows such that the hashmap becomes the identity + function and is thus redundant. + 3. Fill the new rows with random values. + + Parameters + ---------- + + m : np.ndarray + The matrix to restore. + num_rows : int + The number of rows that this array should have. + hash2index : dict + The product of the optimization we are undoing. + seed : float, optional + The seed for the PRNG. Will be used to initialize new rows. + + Returns + ------- + np.array + The unpacked matrix. + + Notes + ----- + + The unpacked matrix will reference some rows in the input matrix to save memory. + Throw away the old matrix after calling this function, or use np.copy. + + """ + orig_rows, orig_columns = m.shape + if orig_rows == num_rows: + # + # Nothing to do. + # + return m + assert num_rows > orig_rows + + rand_obj = np.random + rand_obj.seed(seed) + + # + # Rows at the top of the matrix (the first orig_rows) will contain "packed" learned vectors. + # Rows at the bottom of the matrix will be "free": initialized to random values. + # + m = _pad_random(m, num_rows - orig_rows, rand_obj) + + # + # Swap rows to transform hash2index into the identity function. + # There are two kinds of swaps. + # First, rearrange the rows that belong entirely within the original matrix dimensions. + # Second, swap out rows from the original matrix dimensions, replacing them with + # randomly initialized values. + # + # N.B. We only do the swap in one direction, because doing it in both directions + # nullifies the effect.
+ # + swap = {h: i for (h, i) in hash2index.items() if h < i < orig_rows} + swap.update({h: i for (h, i) in hash2index.items() if h >= orig_rows}) + for h, i in swap.items(): + assert h != i + m[[h, i]] = m[[i, h]] # swap rows i and h + + return m diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index 59e361cc6c..13a5b8c5a9 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -306,6 +306,48 @@ def test(self): self.assertTrue(np.allclose(m, norm)) +class UnpackTest(unittest.TestCase): + def test_copy_sanity(self): + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {10: 0, 11: 1, 12: 2} + + n = gensim.models.keyedvectors._unpack_copy(m, 25, hash2index) + self.assertTrue(np.all(m[0] == n[10])) + self.assertTrue(np.all(m[1] == n[11])) + self.assertTrue(np.all(m[2] == n[12])) + + def test_sanity(self): + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {10: 0, 11: 1, 12: 2} + + n = gensim.models.keyedvectors._unpack(m, 25, hash2index) + self.assertTrue(np.all(np.array([0, 1, 2]) == n[10])) + self.assertTrue(np.all(np.array([3, 4, 5]) == n[11])) + self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) + + def test_tricky(self): + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {1: 0, 0: 1, 12: 2} + + n = gensim.models.keyedvectors._unpack(m, 25, hash2index) + self.assertTrue(np.all(np.array([3, 4, 5]) == n[0])) + self.assertTrue(np.all(np.array([0, 1, 2]) == n[1])) + self.assertTrue(np.all(np.array([6, 7, 8]) == n[12])) + + def test_identity(self): + m = np.array(range(9)) + m.shape = (3, 3) + hash2index = {0: 0, 1: 1, 2: 2} + + n = gensim.models.keyedvectors._unpack(m, 25, hash2index) + self.assertTrue(np.all(np.array([0, 1, 2]) == n[0])) + self.assertTrue(np.all(np.array([3, 4, 5]) == n[1])) + self.assertTrue(np.all(np.array([6, 7, 8]) == n[2])) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main()
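The sketch below is not part of the patch; it only illustrates the behavior the change enables. With hash2index gone, each ngram hash is used directly as a row index into vectors_ngrams, so a vector can be assembled for any out-of-vocabulary word by averaging the rows its ngrams hash to, and no ngram can be "absent from the model" (which is also why FastTextKeyedVectors.__contains__ now always returns True). The helpers toy_hash and char_ngrams are hypothetical stand-ins for gensim's internal ft_ngram_hashes; only numpy is assumed.

import numpy as np

# toy_hash and char_ngrams are illustrative stand-ins, NOT gensim's real ngram
# hashing (ft_ngram_hashes); they only mimic its hash-modulo-bucket shape.
def toy_hash(ngram, bucket):
    return sum(ord(c) for c in ngram) % bucket

def char_ngrams(word, min_n=3, max_n=6):
    padded = "<%s>" % word  # FastText pads words with angle brackets
    return [padded[i:i + n] for n in range(min_n, max_n + 1) for i in range(len(padded) - n + 1)]

bucket, vector_size = 2000, 4
rand_obj = np.random
rand_obj.seed(1)
# One row per bucket, the shape init_ngrams_weights now allocates (ngrams_shape = (self.bucket, vector_size)).
vectors_ngrams = rand_obj.uniform(-1.0 / vector_size, 1.0 / vector_size, (bucket, vector_size))

# Every hash is already a valid row index: no hash2index lookup, no KeyError.
hashes = [toy_hash(ng, bucket) for ng in char_ngrams("steamtrain")]
oov_vector = sum(vectors_ngrams[h] for h in hashes) / len(hashes)
print(oov_vector.shape)  # (4,)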