From e2750f1f3ae45f628c534606f67a1cc1b6d0eb74 Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Sun, 3 Feb 2019 18:32:22 +1100
Subject: [PATCH 01/10] WIP

---
 gensim/models/fasttext.py     |  27 ++--
 gensim/models/keyedvectors.py | 292 +++++++++++++++++++++++-----------
 gensim/test/test_fasttext.py  |  14 ++
 3 files changed, 232 insertions(+), 101 deletions(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 712ad93a0c..0cc17a3fb5 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -690,31 +690,33 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p
             >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.epochs)
 
         """
-        if update:
-            if not len(self.wv.vocab):
-                raise RuntimeError(
-                    "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
-                    "First build the vocabulary of your model with a corpus "
-                    "before doing an online update.")
+        if not update:
+            self.wv.init_ngrams_weights(self.trainables.seed)
+        elif not len(self.wv.vocab):
+            raise RuntimeError(
+                "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
+                "First build the vocabulary of your model with a corpus "
+                "before doing an online update.")
+        else:
             self.vocabulary.old_vocab_len = len(self.wv.vocab)
-            self.trainables.old_hash2index_len = len(self.wv.hash2index)
 
-        return super(FastText, self).build_vocab(
+        retval = super(FastText, self).build_vocab(
             sentences=sentences, corpus_file=corpus_file, update=update, progress_per=progress_per,
             keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs)
 
+        if update:
+            self.wv.update_ngrams_weights(self.trainables.seed, self.vocabulary.old_vocab_len)
+
     def _set_train_params(self, **kwargs):
         #
         # We need the wv.buckets_word member to be initialized in order to
         # continue training. The _clear_post_train method destroys this
         # variable, so we reinitialize it here, if needed.
         #
-        # The .old_vocab_len and .old_hash2index_len members are set only to
-        # keep the init_ngrams_weights method happy.
+        # The .old_vocab_len member is set only to keep the init_ngrams_weights method happy.
         #
         if self.wv.buckets_word is None:
             self.vocabulary.old_vocab_len = len(self.wv.vocab)
-            self.trainables.old_hash2index_len = len(self.wv.hash2index)
             self.trainables.init_ngrams_weights(self.wv, update=True, vocabulary=self.vocabulary)
 
     def _clear_post_train(self):
@@ -1068,6 +1070,9 @@ def load(cls, *args, **kwargs):
         """
         try:
             model = super(FastText, cls).load(*args, **kwargs)
+            if hasattr(model.wv, 'hash2index'):
+                gensim.models.keyedvectors._rollback_optimization(model.wv)
+
             if not hasattr(model.trainables, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'):
                 model.trainables.vectors_vocab_lockf = ones(model.wv.vectors_vocab.shape, dtype=REAL)
             if not hasattr(model.trainables, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'):
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index 1428503c8a..17cbf55e31 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -1964,14 +1964,6 @@ class FastTextKeyedVectors(WordEmbeddingsKeyedVectors):
         replace=True.
     buckets_word : dict
         Maps vocabulary items (by their index) to the buckets they occur in.
-    hash2index : dict
-        Maps bucket numbers to an index within vectors_ngrams.  So, given an
-        ngram, you can get its vector by determining its bucket, mapping the
-        bucket to an index, and then indexing into vectors_ngrams (in other
-        words, vectors_ngrams[hash2index[hash_fn(ngram) % bucket]].
-    num_ngram_vectors : int
-        The number of vectors that correspond to ngrams, as opposed to terms
-        (full words).
 
     """
     def __init__(self, vector_size, min_n, max_n, bucket, compatible_hash):
@@ -1981,11 +1973,9 @@ def __init__(self, vector_size, min_n, max_n, bucket, compatible_hash):
         self.vectors_ngrams = None
         self.vectors_ngrams_norm = None
         self.buckets_word = None
-        self.hash2index = {}
         self.min_n = min_n
         self.max_n = max_n
         self.bucket = bucket
-        self.num_ngram_vectors = 0
         self.compatible_hash = compatible_hash
 
     @classmethod
@@ -2030,12 +2020,32 @@ def __contains__(self, word):
         bool
             True if `word` or any character ngrams in `word` are present in the vocabulary, False otherwise.
 
+        Note
+        ----
+        This method **always** returns True, because of the way FastText works.
+
+        If you want to check if a word is an in-vocabulary term, use this instead:
+
+        .. pycon:
+
+            >>> from gensim.test.utils import datapath
+            >>> from gensim.models import FastText
+            >>> cap_path = datapath("crime-and-punishment.bin")
+            >>> model = FastText.load_fasttext_format(cap_path, full_model=False)
+            >>> 'steamtrain' in model.wv.vocab  # If False, is an OOV term
+            False
+
         """
-        if word in self.vocab:
-            return True
-        else:
-            hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash)
-            return any(h in self.hash2index for h in hashes)
+        #
+        # FIXME: what's the right behavior here?
+        #
+        # If we maintain the previous behavior of this method (check for word
+        # _and_ ngram presence) then this method will never return False.
+        # This is because there will _always_ be a vector for any ngram.
+        # However, there is no guarantee that it will be "learned" - it may
+        # be still at its initial "unlearned" value.
+        #
+        return True
 
     def save(self, *args, **kwargs):
         """Save object.
@@ -2052,8 +2062,14 @@ def save(self, *args, **kwargs):
 
         """
         # don't bother storing the cached normalized vectors
-        kwargs['ignore'] = kwargs.get(
-            'ignore', ['vectors_norm', 'vectors_vocab_norm', 'vectors_ngrams_norm', 'buckets_word'])
+        ignore_attrs = [
+            'vectors_norm',
+            'vectors_vocab_norm',
+            'vectors_ngrams_norm',
+            'buckets_word',
+            'hash2index',
+        ]
+        kwargs['ignore'] = kwargs.get('ignore', ignore_attrs)
         super(FastTextKeyedVectors, self).save(*args, **kwargs)
 
     def word_vec(self, word, use_norm=False):
@@ -2087,15 +2103,18 @@ def word_vec(self, word, use_norm=False):
                 ngram_weights = self.vectors_ngrams_norm
             else:
                 ngram_weights = self.vectors_ngrams
-            ngrams_found = 0
-            for ngram_hash in ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash):
-                if ngram_hash in self.hash2index:
-                    word_vec += ngram_weights[self.hash2index[ngram_hash]]
-                    ngrams_found += 1
-            if word_vec.any():
-                return word_vec / max(1, ngrams_found)
-            else:  # No ngrams of the word are present in self.ngrams
-                raise KeyError('all ngrams for word %s absent from model' % word)
+            ngram_hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash)
+            for nh in ngram_hashes:
+                word_vec += ngram_weights[nh]
+            return word_vec / len(ngram_hashes)
+
+    @classmethod
+    def load(cls, fname_or_handle, **kwargs):
+        kv = super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs)
+        if kv.hasattr('hash2index'):
+            _rollback_optimization(kv)
+
+        return kv
 
     def init_sims(self, replace=False):
         """Precompute L2-normalized vectors.
@@ -2140,41 +2159,69 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None)
             fname, self.vocab, self.vectors, fvocab=fvocab, binary=binary, total_vec=total_vec)
 
     def init_ngrams_weights(self, seed):
-        self.hash2index = {}
-        ngram_indices, self.buckets_word = _process_fasttext_vocab(
+        """Initialize the vocabulary and ngrams weights prior to training.
+
+        Creates the weight matrices and initializes them with uniform random values.
+
+        Parameters
+        ----------
+        seed : float
+            The seed for the PRNG.
+
+        Note
+        ----
+        Call this **after** the vocabulary has been fully initialized.
+
+        """
+        self.buckets_word = _process_fasttext_vocab(
             self.vocab.items(),
             self.min_n,
             self.max_n,
             self.bucket,
             self.compatible_hash,
-            self.hash2index
         )
-        self.num_ngram_vectors = len(ngram_indices)
-        logger.info("Total number of ngrams is %d", self.num_ngram_vectors)
 
         rand_obj = np.random
         rand_obj.seed(seed)
 
         lo, hi = -1.0 / self.vector_size, 1.0 / self.vector_size
         vocab_shape = (len(self.vocab), self.vector_size)
-        ngrams_shape = (len(ngram_indices), self.vector_size)
+        ngrams_shape = (self.bucket, self.vector_size)
         self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL)
+
+        #
+        # We could have initialized vectors_ngrams at construction time, but we
+        # do it here for two reasons:
+        #
+        # 1. The constructor does not have access to the random seed
+        # 2. We want to use the same rand_obj to fill vectors_vocab _and_
+        #    vectors_ngrams, and vectors_vocab cannot happen at construction
+        #    time because the vocab is not initialized at that stage.
+        #
         self.vectors_ngrams = rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL)
 
     def update_ngrams_weights(self, seed, old_vocab_len):
-        old_hash2index_len = len(self.hash2index)
+        """Update the vocabulary weights for training continuation.
 
-        new_ngram_hashes, self.buckets_word = _process_fasttext_vocab(
+        Parameters
+        ----------
+        seed : float
+            The seed for the PRNG.
+        old_vocab_length : int
+            The length of the vocabulary prior to its update.
+
+        Note
+        ----
+        Call this **after** the vocabulary has been updated.
+
+        """
+        self.buckets_word = _process_fasttext_vocab(
             self.vocab.items(),
             self.min_n,
             self.max_n,
             self.bucket,
             self.compatible_hash,
-            self.hash2index
         )
-        num_new_ngrams = len(new_ngram_hashes)
-        self.num_ngram_vectors += num_new_ngrams
-        logger.info("Number of new ngrams is %d", num_new_ngrams)
 
         rand_obj = np.random
         rand_obj.seed(seed)
@@ -2182,10 +2229,7 @@ def update_ngrams_weights(self, seed, old_vocab_len):
         new_vocab = len(self.vocab) - old_vocab_len
         self.vectors_vocab = _pad_random(self.vectors_vocab, new_vocab, rand_obj)
 
-        new_ngrams = len(self.hash2index) - old_hash2index_len
-        self.vectors_ngrams = _pad_random(self.vectors_ngrams, new_ngrams, rand_obj)
-
-    def init_post_load(self, vectors, match_gensim=False):
+    def init_post_load(self, vectors):
         """Perform initialization after loading a native Facebook model.
 
         Expects that the vocabulary (self.vocab) has already been initialized.
@@ -2198,9 +2242,7 @@ def init_post_load(self, vectors, match_gensim=False):
             The order of the vectors must correspond to the indices in
             the vocabulary.
         match_gensim : boolean, optional
-            Match the behavior of gensim's FastText implementation and take a
-            subset of vectors_ngrams.  This behavior appears to be incompatible
-            with Facebook's implementation.
+            No longer supported.
 
         """
         vocab_words = len(self.vocab)
@@ -2215,31 +2257,7 @@ def init_post_load(self, vectors, match_gensim=False):
         self.vectors = np.array(vectors[:vocab_words, :])
         self.vectors_vocab = np.array(vectors[:vocab_words, :])
         self.vectors_ngrams = np.array(vectors[vocab_words:, :])
-        self.hash2index = {i: i for i in range(self.bucket)}
         self.buckets_word = None  # This can get initialized later
-        self.num_ngram_vectors = self.bucket
-
-        if match_gensim:
-            #
-            # This gives us the same shape for vectors_ngrams, and we can
-            # satisfy our unit tests when running gensim vs native comparisons,
-            # but because we're discarding some ngrams, the accuracy of the
-            # model suffers.
-            #
-            ngram_hashes, _ = _process_fasttext_vocab(
-                self.vocab.items(),
-                self.min_n,
-                self.max_n,
-                self.bucket,
-                self.compatible_hash,
-                dict(),  # we don't care what goes here in this case
-            )
-            ngram_hashes = sorted(set(ngram_hashes))
-
-            keep_indices = [self.hash2index[h] for h in self.hash2index if h in ngram_hashes]
-            self.num_ngram_vectors = len(keep_indices)
-            self.vectors_ngrams = self.vectors_ngrams.take(keep_indices, axis=0)
-            self.hash2index = {hsh: idx for (idx, hsh) in enumerate(ngram_hashes)}
 
         self.adjust_vectors()
 
@@ -2257,12 +2275,17 @@ def adjust_vectors(self):
             word_vec = np.copy(self.vectors_vocab[v.index])
             ngram_hashes = ft_ngram_hashes(w, self.min_n, self.max_n, self.bucket, self.compatible_hash)
             for nh in ngram_hashes:
-                word_vec += self.vectors_ngrams[self.hash2index[nh]]
+                word_vec += self.vectors_ngrams[nh]
             word_vec /= len(ngram_hashes) + 1
             self.vectors[v.index] = word_vec
 
+    @property
+    @deprecated("Attribute will be removed in 4.0.0, use self.bucket instead")
+    def num_ngram_vectors(self):
+        return self.bucket
+
 
-def _process_fasttext_vocab(iterable, min_n, max_n, num_buckets, compatible_hash, hash2index):
+def _process_fasttext_vocab(iterable, min_n, max_n, num_buckets, compatible_hash):
     """
     Performs a common operation for FastText weight initialization and
     updates: scan the vocabulary, calculate ngrams and their hashes, keep
@@ -2282,42 +2305,27 @@ def _process_fasttext_vocab(iterable, min_n, max_n, num_buckets, compatible_hash
     compatible_hash : boolean
         True for compatibility with the Facebook implementation.
         False for compatibility with the old Gensim implementation.
-    hash2index : dict
-        Updated in-place.
 
     Returns
     -------
-    A tuple of two elements.
-
-    word_indices : dict
+    dict
         Keys are indices of entities in the vocabulary (words).  Values are
         arrays containing indices into vectors_ngrams for each ngram of the
         word.
-    new_ngram_hashes : list
-        A list of hashes for newly encountered ngrams.  Each hash is modulo
-        num_buckets.
 
     """
-    old_hash2index_len = len(hash2index)
     word_indices = {}
-    new_ngram_hashes = []
 
     if num_buckets == 0:
-        return [], {v.index: np.array([], dtype=np.uint32) for w, v in iterable}
+        return {v.index: np.array([], dtype=np.uint32) for w, v in iterable}
 
     for word, vocab in iterable:
         wi = []
         for ngram_hash in ft_ngram_hashes(word, min_n, max_n, num_buckets, compatible_hash):
-            if ngram_hash not in hash2index:
-                #
-                # This is a new ngram.  Reserve a new index in hash2index.
-                #
-                hash2index[ngram_hash] = old_hash2index_len + len(new_ngram_hashes)
-                new_ngram_hashes.append(ngram_hash)
-            wi.append(hash2index[ngram_hash])
+            wi.append(ngram_hash)
         word_indices[vocab.index] = np.array(wi, dtype=np.uint32)
 
-    return new_ngram_hashes, word_indices
+    return word_indices
 
 
 def _pad_random(m, new_rows, rand):
@@ -2349,3 +2357,107 @@ def _l2_norm(m, replace=False):
         return m
     else:
         return (m / dist).astype(REAL)
+
+
+def _rollback_optimization(kv):
+    """Undo the optimization that pruned buckets.
+
+    This unfortunate optimization saves memory and CPU cycles, but breaks
+    compatibility with Facebook's model by introducing divergent behavior
+    for OOV words.
+
+    """
+    logger.warning(
+        "This older model was trained with a premature optimization. "
+        "The current Gensim version reverses this optimization during loading. "
+        "Save the model to a new file and reload to suppress this message."
+    )
+    assert hasattr(kv, 'hash2index')
+    assert hasattr(kv, 'num_ngram_vectors')
+
+    kv.vectors_ngrams = _unpack_copy(kv.vectors_ngrams, kv.bucket, kv.hash2index)
+
+    #
+    # We have replaced num_ngram_vectors with a property and deprecated it.
+    # We can't delete it because the new attribute masks the member.
+    #
+    del kv.hash2index
+
+
+def _unpack_copy(m, num_rows, hash2index, seed=1):
+    """Same as _unpack, but makes a copy of the matrix.
+
+    Simpler implementation, but uses more RAM.
+
+    """
+    rows, columns = m.shape
+    if rows == num_rows:
+        #
+        # Nothing to do.
+        #
+        return m
+    assert num_rows > rows
+
+    rand_obj = np.random
+    rand_obj.seed(seed)
+
+    n = np.empty((0, columns), dtype=m.dtype)
+    n = _pad_random(n, num_rows, rand_obj)
+
+    for src, dst in hash2index.items():
+        n[src] = m[dst]
+
+    return n
+
+
+def _unpack(m, num_rows, seed, hash2index):
+    """Restore the array to its natural shape, undoing the optimization.
+
+    Parameters
+    ----------
+
+    a : np.ndarray
+        The matrix to restore.
+    num_rows : int
+        The number of rows that this array should have.
+    seed : float
+        The seed for the PRNG.  Will be used to initialize new rows.
+    hash2index : dict
+        the product of the optimization we are undoing.
+
+    Returns
+    -------
+    np.array
+        The unpacked matrix.
+
+    Notes
+    -----
+
+    The unpacked matrix will reference some rows in the input matrix to save memory.
+    Throw away the old matrix after calling this function, or use np.copy.
+
+    This implementation is a work in progress.
+
+    """
+    rows, columns = a.shape
+    if rows == num_rows:
+        #
+        # Nothing to do.
+        #
+        return a
+    assert num_rows > rows
+
+    rand_obj = np.random
+    rand_obj.seed(seed)
+
+    #
+    # Rows at the bottom of the matrix will be "free": initialized to random values.
+    # Rows at the top of the matrix will contain learned vectors.
+    # Move the learned vectors down into their "proper" positions, starting from the bottom.
+    #
+    m = _pad_random(m, num_rows - rows, rand_obj)
+    for src, dst in reversed(hash2index.items()):
+        assert src < dst
+        m[src], m[dst] = m[dst], m[src]
+
+    return m
diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index 5437a9b3c8..d0dd778555 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -1186,6 +1186,20 @@ def test_out_of_vocab(self):
         self.assertRaises(KeyError, model.wv.word_vec, 'streamtrain')
 
 
+class UnpackTest(unittest.TestCase):
+    def test_copy_sanity(self):
+        import gensim.models.keyedvectors
+
+        m = np.array(range(9))
+        m.shape = (3, 3)
+        hash2index = {10: 0, 11: 1, 12: 2}
+
+        n = gensim.models.keyedvectors._unpack_copy(m, 25, hash2index)
+        self.assertTrue(np.all(m[0] == n[10]))
+        self.assertTrue(np.all(m[1] == n[11]))
+        self.assertTrue(np.all(m[2] == n[12]))
+
+
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()

From ed52a5f4d25a0aa5924650dbcb0998cf15a0ef3d Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Sun, 3 Feb 2019 18:42:27 +1100
Subject: [PATCH 02/10] flake8

---
 gensim/models/fasttext.py     |  2 ++
 gensim/models/keyedvectors.py | 17 ++++++-----------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 0cc17a3fb5..5b72d531a5 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -707,6 +707,8 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p
         if update:
             self.wv.update_ngrams_weights(self.trainables.seed, self.vocabulary.old_vocab_len)
 
+        return retval
+
     def _set_train_params(self, **kwargs):
         #
         # We need the wv.buckets_word member to be initialized in order to
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index 17cbf55e31..6421e29d70 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -1984,6 +1984,9 @@ def load(cls, fname_or_handle, **kwargs):
         if not hasattr(model, 'compatible_hash'):
             model.compatible_hash = False
 
+        if model.hasattr('hash2index'):
+            _rollback_optimization(model)
+
         return model
 
     @property
@@ -2108,14 +2111,6 @@ def word_vec(self, word, use_norm=False):
                 word_vec += ngram_weights[nh]
             return word_vec / len(ngram_hashes)
 
-    @classmethod
-    def load(cls, fname_or_handle, **kwargs):
-        kv = super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs)
-        if kv.hasattr('hash2index'):
-            _rollback_optimization(kv)
-
-        return kv
-
     def init_sims(self, replace=False):
         """Precompute L2-normalized vectors.
 
@@ -2416,7 +2411,7 @@ def _unpack(m, num_rows, seed, hash2index):
     Parameters
     ----------
 
-    a : np.ndarray
+    m : np.ndarray
         The matrix to restore.
     num_rows : int
         The number of rows that this array should have.
@@ -2439,12 +2434,12 @@ def _unpack(m, num_rows, seed, hash2index):
     This implementation is a work in progress.
 
     """
-    rows, columns = a.shape
+    rows, columns = m.shape
     if rows == num_rows:
         #
         # Nothing to do.
         #
-        return a
+        return m
     assert num_rows > rows
 
     rand_obj = np.random

From 8fd9f81c58badbedafbd4ac444924980d6b97e2a Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Sun, 3 Feb 2019 23:01:52 +1100
Subject: [PATCH 03/10] implement memory-efficient _unpack function

---
 gensim/models/keyedvectors.py | 50 ++++++++++++++++++++++++-----------
 gensim/test/test_fasttext.py  | 24 +++++++++++++++++
 2 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index 6421e29d70..de0efde925 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -2370,7 +2370,7 @@ def _rollback_optimization(kv):
     assert hasattr(kv, 'hash2index')
     assert hasattr(kv, 'num_ngram_vectors')
 
-    kv.vectors_ngrams = _unpack_copy(kv.vectors_ngrams, kv.bucket, kv.hash2index)
+    kv.vectors_ngrams = _unpack(kv.vectors_ngrams, kv.bucket, kv.hash2index)
 
     #
     # We have replaced num_ngram_vectors with a property and deprecated it.
@@ -2405,9 +2405,19 @@ def _unpack_copy(m, num_rows, hash2index, seed=1):
     return n
 
 
-def _unpack(m, num_rows, seed, hash2index):
+def _unpack(m, num_rows, hash2index, seed=1):
     """Restore the array to its natural shape, undoing the optimization.
 
+    A packed matrix contains contiguous vectors for ngrams, as well as a hashmap.
+    The hash map maps the ngram hash to its index in the packed matrix.
+    To unpack the matrix, we need to do several things:
+
+    1. Restore the matrix to its "natural" shape, where the number of rows
+       equals the number of buckets.
+    2. Rearrange the existing rows such that the hashmap becomes the identity
+       function and is thus redundant.
+    3. Fill the new rows with random values.
+
     Parameters
     ----------
 
@@ -2415,10 +2425,10 @@ def _unpack(m, num_rows, seed, hash2index):
         The matrix to restore.
     num_rows : int
         The number of rows that this array should have.
-    seed : float
-        The seed for the PRNG.  Will be used to initialize new rows.
     hash2index : dict
         the product of the optimization we are undoing.
+    seed : float, optional
+        The seed for the PRNG.  Will be used to initialize new rows.
 
     Returns
     -------
@@ -2431,28 +2441,38 @@ def _unpack(m, num_rows, seed, hash2index):
     The unpacked matrix will reference some rows in the input matrix to save memory.
     Throw away the old matrix after calling this function, or use np.copy.
 
-    This implementation is a work in progress.
-
     """
-    rows, columns = m.shape
-    if rows == num_rows:
+    orig_rows, orig_columns = m.shape
+    if orig_rows == num_rows:
         #
         # Nothing to do.
         #
         return m
-    assert num_rows > rows
+    assert num_rows > orig_rows
 
     rand_obj = np.random
     rand_obj.seed(seed)
 
     #
+    # Rows at the top of the matrix (the first orig_rows) will contain "packed" learned vectors.
     # Rows at the bottom of the matrix will be "free": initialized to random values.
-    # Rows at the top of the matrix will contain learned vectors.
-    # Move the learned vectors down into their "proper" positions, starting from the bottom.
     #
-    m = _pad_random(m, num_rows - rows, rand_obj)
-    for src, dst in reversed(hash2index.items()):
-        assert src < dst
-        m[src], m[dst] = m[dst], m[src]
+    m = _pad_random(m, num_rows - orig_rows, rand_obj)
+
+    #
+    # Swap rows to transform hash2index into the identify function.
+    # There are two kinds of swaps.
+    # First, rearrange the rows that belong entirely within the original matrix dimensions.
+    # Second, swap out rows from the original matrix dimensions, replacing them with
+    # randomly initialized values.
+    #
+    # N.B. We only do the swap in one direction, because doing it in both directions
+    # nullifies the effect.
+    #
+    swap = {h: i for (h, i) in hash2index.items() if h < i < orig_rows}
+    swap.update({h: i for (h, i) in hash2index.items() if h >= orig_rows})
+    for h, i in swap.items():
+        assert h != i
+        m[[h, i]] = m[[i, h]]  # swap rows i and h
 
     return m
diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index d0dd778555..e6bcd7d121 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -1199,6 +1199,30 @@ def test_copy_sanity(self):
         self.assertTrue(np.all(m[1] == n[11]))
         self.assertTrue(np.all(m[2] == n[12]))
 
+    def test_sanity(self):
+        import gensim.models.keyedvectors
+
+        m = np.array(range(9))
+        m.shape = (3, 3)
+        hash2index = {10: 0, 11: 1, 12: 2}
+
+        n = gensim.models.keyedvectors._unpack(m, 25, hash2index)
+        self.assertTrue(np.all(np.array([0, 1, 2]) == n[10]))
+        self.assertTrue(np.all(np.array([3, 4, 5]) == n[11]))
+        self.assertTrue(np.all(np.array([6, 7, 8]) == n[12]))
+
+    def test_tricky(self):
+        import gensim.models.keyedvectors
+
+        m = np.array(range(9))
+        m.shape = (3, 3)
+        hash2index = {1: 0, 0: 1, 12: 2}
+
+        n = gensim.models.keyedvectors._unpack(m, 25, hash2index)
+        self.assertTrue(np.all(np.array([3, 4, 5]) == n[0]))
+        self.assertTrue(np.all(np.array([0, 1, 2]) == n[1]))
+        self.assertTrue(np.all(np.array([6, 7, 8]) == n[12]))
+
 
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

From 1423298a9c556695ab4fe393aa3507b7652d63c3 Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Sun, 3 Feb 2019 23:03:25 +1100
Subject: [PATCH 04/10] fixup

---
 gensim/models/keyedvectors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index de0efde925..b52746bfe8 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -1984,7 +1984,7 @@ def load(cls, fname_or_handle, **kwargs):
         if not hasattr(model, 'compatible_hash'):
             model.compatible_hash = False
 
-        if model.hasattr('hash2index'):
+        if hasattr(model, 'hash2index'):
             _rollback_optimization(model)
 
         return model

From 0604de2505758dacf9e817854ae277cc5fc82c07 Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Mon, 4 Feb 2019 14:14:16 +1100
Subject: [PATCH 05/10] move tests, add test_identity

---
 gensim/test/test_fasttext.py     | 38 -----------------------------
 gensim/test/test_keyedvectors.py | 42 ++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index e6bcd7d121..5437a9b3c8 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -1186,44 +1186,6 @@ def test_out_of_vocab(self):
         self.assertRaises(KeyError, model.wv.word_vec, 'streamtrain')
 
 
-class UnpackTest(unittest.TestCase):
-    def test_copy_sanity(self):
-        import gensim.models.keyedvectors
-
-        m = np.array(range(9))
-        m.shape = (3, 3)
-        hash2index = {10: 0, 11: 1, 12: 2}
-
-        n = gensim.models.keyedvectors._unpack_copy(m, 25, hash2index)
-        self.assertTrue(np.all(m[0] == n[10]))
-        self.assertTrue(np.all(m[1] == n[11]))
-        self.assertTrue(np.all(m[2] == n[12]))
-
-    def test_sanity(self):
-        import gensim.models.keyedvectors
-
-        m = np.array(range(9))
-        m.shape = (3, 3)
-        hash2index = {10: 0, 11: 1, 12: 2}
-
-        n = gensim.models.keyedvectors._unpack(m, 25, hash2index)
-        self.assertTrue(np.all(np.array([0, 1, 2]) == n[10]))
-        self.assertTrue(np.all(np.array([3, 4, 5]) == n[11]))
-        self.assertTrue(np.all(np.array([6, 7, 8]) == n[12]))
-
-    def test_tricky(self):
-        import gensim.models.keyedvectors
-
-        m = np.array(range(9))
-        m.shape = (3, 3)
-        hash2index = {1: 0, 0: 1, 12: 2}
-
-        n = gensim.models.keyedvectors._unpack(m, 25, hash2index)
-        self.assertTrue(np.all(np.array([3, 4, 5]) == n[0]))
-        self.assertTrue(np.all(np.array([0, 1, 2]) == n[1]))
-        self.assertTrue(np.all(np.array([6, 7, 8]) == n[12]))
-
-
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()
diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py
index 59e361cc6c..13a5b8c5a9 100644
--- a/gensim/test/test_keyedvectors.py
+++ b/gensim/test/test_keyedvectors.py
@@ -306,6 +306,48 @@ def test(self):
         self.assertTrue(np.allclose(m, norm))
 
 
+class UnpackTest(unittest.TestCase):
+    def test_copy_sanity(self):
+        m = np.array(range(9))
+        m.shape = (3, 3)
+        hash2index = {10: 0, 11: 1, 12: 2}
+
+        n = gensim.models.keyedvectors._unpack_copy(m, 25, hash2index)
+        self.assertTrue(np.all(m[0] == n[10]))
+        self.assertTrue(np.all(m[1] == n[11]))
+        self.assertTrue(np.all(m[2] == n[12]))
+
+    def test_sanity(self):
+        m = np.array(range(9))
+        m.shape = (3, 3)
+        hash2index = {10: 0, 11: 1, 12: 2}
+
+        n = gensim.models.keyedvectors._unpack(m, 25, hash2index)
+        self.assertTrue(np.all(np.array([0, 1, 2]) == n[10]))
+        self.assertTrue(np.all(np.array([3, 4, 5]) == n[11]))
+        self.assertTrue(np.all(np.array([6, 7, 8]) == n[12]))
+
+    def test_tricky(self):
+        m = np.array(range(9))
+        m.shape = (3, 3)
+        hash2index = {1: 0, 0: 1, 12: 2}
+
+        n = gensim.models.keyedvectors._unpack(m, 25, hash2index)
+        self.assertTrue(np.all(np.array([3, 4, 5]) == n[0]))
+        self.assertTrue(np.all(np.array([0, 1, 2]) == n[1]))
+        self.assertTrue(np.all(np.array([6, 7, 8]) == n[12]))
+
+    def test_identity(self):
+        m = np.array(range(9))
+        m.shape = (3, 3)
+        hash2index = {0: 0, 1: 1, 2: 2}
+
+        n = gensim.models.keyedvectors._unpack(m, 25, hash2index)
+        self.assertTrue(np.all(np.array([0, 1, 2]) == n[0]))
+        self.assertTrue(np.all(np.array([3, 4, 5]) == n[1]))
+        self.assertTrue(np.all(np.array([6, 7, 8]) == n[2]))
+
+
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()

From a52caba4f91adf7f47bed17629186a58b09ebd9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <me@radimrehurek.com>
Date: Thu, 7 Feb 2019 21:04:17 +1100
Subject: [PATCH 06/10] Update gensim/models/keyedvectors.py

Co-Authored-By: mpenkov <penkov@pm.me>
---
 gensim/models/keyedvectors.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index b52746bfe8..ce680ccb2a 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -2363,7 +2363,9 @@ def _rollback_optimization(kv):
 
     """
     logger.warning(
-        "This older model was trained with a premature optimization. "
+        "This saved FastText model was trained with an optimization we no longer support. "
+        "The current Gensim version automatically reverses this optimization during loading. "
+        "Save the loaded model to a new file and reload to suppress this message."
         "The current Gensim version reverses this optimization during loading. "
         "Save the model to a new file and reload to suppress this message."
     )

From 009db283bb0a4bcb2c71dce7c98bbf7e5a378e81 Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Thu, 7 Feb 2019 21:08:37 +1100
Subject: [PATCH 07/10] review response: improve comment

---
 gensim/models/fasttext.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 5b72d531a5..fda40c4b0a 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -696,7 +696,9 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p
             raise RuntimeError(
                 "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
                 "First build the vocabulary of your model with a corpus "
-                "before doing an online update.")
+                "by calling the :meth:`gensim.models.fasttext.FastText.build_vocab` method "
+                "before doing an online update."
+            )
         else:
             self.vocabulary.old_vocab_len = len(self.wv.vocab)
 

From 5a6eda041d604eb8ec2f1ed74571d661fd80d352 Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Wed, 13 Feb 2019 13:34:07 +0900
Subject: [PATCH 08/10] review response: fix redundant log message

---
 gensim/models/keyedvectors.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index ce680ccb2a..8c450149ca 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -2366,8 +2366,6 @@ def _rollback_optimization(kv):
         "This saved FastText model was trained with an optimization we no longer support. "
         "The current Gensim version automatically reverses this optimization during loading. "
         "Save the loaded model to a new file and reload to suppress this message."
-        "The current Gensim version reverses this optimization during loading. "
-        "Save the model to a new file and reload to suppress this message."
     )
     assert hasattr(kv, 'hash2index')
     assert hasattr(kv, 'num_ngram_vectors')

From 26254017d6973ebcfa89ae5e27240a0f727afcaa Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Wed, 13 Feb 2019 13:35:00 +0900
Subject: [PATCH 09/10] review response: remove sphinx markup from log message

---
 gensim/models/fasttext.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index fda40c4b0a..7381d272a2 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -696,7 +696,7 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p
             raise RuntimeError(
                 "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
                 "First build the vocabulary of your model with a corpus "
-                "by calling the :meth:`gensim.models.fasttext.FastText.build_vocab` method "
+                "by calling the gensim.models.fasttext.FastText.build_vocab method "
                 "before doing an online update."
             )
         else:

From 1cb15fd3e8ad3d5d90bf2a9d7476e861c811d91e Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Thu, 7 Mar 2019 13:17:18 +0900
Subject: [PATCH 10/10] remove FIXME

---
 gensim/models/keyedvectors.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index 8c450149ca..b4cf61abd6 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -2039,15 +2039,6 @@ def __contains__(self, word):
             False
 
         """
-        #
-        # FIXME: what's the right behavior here?
-        #
-        # If we maintain the previous behavior of this method (check for word
-        # _and_ ngram presence) then this method will never return False.
-        # This is because there will _always_ be a vector for any ngram.
-        # However, there is no guarantee that it will be "learned" - it may
-        # be still at its initial "unlearned" value.
-        #
         return True
 
     def save(self, *args, **kwargs):