From 5a615946159853e28f63450e2ba161b38cbf69a2 Mon Sep 17 00:00:00 2001
From: Carl Saroufim
Date: Thu, 14 Dec 2017 17:02:30 -0800
Subject: [PATCH] Bug Fix 1771

---
 gensim/models/wrappers/fasttext.py | 38 ++++++------------------
 1 file changed, 7 insertions(+), 31 deletions(-)

diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py
index 8e7a966cee..e32fd9e747 100644
--- a/gensim/models/wrappers/fasttext.py
+++ b/gensim/models/wrappers/fasttext.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
-# Author: Jayant Jain
-# Copyright (C) 2017 Radim Rehurek
+# Copyright (C) 2017 Radim Rehurek
+# Copyright (C) 2017 Carl Saroufim
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
@@ -390,10 +390,7 @@ def struct_unpack(self, file_handle, fmt):
 
     def init_ngrams(self):
         """
-        Computes ngrams of all words present in vocabulary and stores vectors for only those ngrams.
-        Vectors for other ngrams are initialized with a random uniform distribution in FastText. These
-        vectors are discarded here to save space.
-
+        Computes ngrams of all words present in vocabulary and stores vectors for those ngrams.
         """
         self.wv.ngrams = {}
         all_ngrams = []
@@ -403,32 +400,11 @@ def init_ngrams(self):
             all_ngrams += compute_ngrams(w, self.wv.min_n, self.wv.max_n)
             self.wv.syn0[vocab.index] += np.array(self.wv.syn0_ngrams[vocab.index])
 
-        all_ngrams = set(all_ngrams)
-        self.num_ngram_vectors = len(all_ngrams)
-        ngram_indices = []
-        for i, ngram in enumerate(all_ngrams):
-            ngram_hash = ft_hash(ngram)
-            ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket)
-            self.wv.ngrams[ngram] = i
-        self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
-
-        ngram_weights = self.wv.syn0_ngrams
-
-        logger.info(
-            "loading weights for %s words for fastText model from %s",
-            len(self.wv.vocab), self.file_name
-        )
+        self.wv.syn0_ngrams = self.wv.syn0_ngrams[len(self.wv.vocab):]
 
-        for w, vocab in self.wv.vocab.items():
-            word_ngrams = compute_ngrams(w, self.wv.min_n, self.wv.max_n)
-            for word_ngram in word_ngrams:
-                self.wv.syn0[vocab.index] += np.array(ngram_weights[self.wv.ngrams[word_ngram]])
-
-            self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1)
-        logger.info(
-            "loaded %s weight matrix for fastText model from %s",
-            self.wv.syn0.shape, self.file_name
-        )
+        all_ngrams = set(all_ngrams)
+        for ngram in all_ngrams:
+            self.wv.ngrams[ngram] = ft_hash(ngram) % self.bucket
 
 
 def compute_ngrams(word, min_n, max_n):
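
Note for readers following the change: the patched init_ngrams no longer compacts syn0_ngrams down to only the ngrams seen in the vocabulary. It keeps the full hash-bucket matrix (after slicing off the leading vocab rows) and maps each known ngram directly to its bucket index, which is the same scheme FastText itself uses to resolve ngrams for out-of-vocabulary words. Below is a minimal standalone sketch of that lookup path, assuming FastText's FNV-1a hashing over UTF-8 bytes and boundary-marked character ngrams; ft_hash_sketch, char_ngrams, and oov_vector are illustrative names for this note, not the wrapper's API, and the hash is not guaranteed byte-for-byte identical to the wrapper's ft_hash.

    import numpy as np


    def ft_hash_sketch(s):
        # FNV-1a over the string's UTF-8 bytes, the scheme FastText uses to
        # assign ngrams to buckets (a sketch, not the wrapper's exact ft_hash).
        h = 2166136261
        for b in s.encode('utf-8'):
            h = ((h ^ b) * 16777619) & 0xffffffff
        return h


    def char_ngrams(word, min_n, max_n):
        # Character ngrams over '<word>' with boundary markers, in the spirit
        # of compute_ngrams in the patched file.
        w = '<%s>' % word
        return [w[i:i + n]
                for n in range(min_n, max_n + 1)
                for i in range(len(w) - n + 1)]


    def oov_vector(word, syn0_ngrams, ngrams, min_n=3, max_n=6):
        # Average the bucket rows of the word's known ngrams; `ngrams` is the
        # ngram -> bucket-index dict that the patched init_ngrams builds.
        known = [ng for ng in char_ngrams(word, min_n, max_n) if ng in ngrams]
        if not known:
            raise KeyError('no ngrams of %r are present in the model' % word)
        return np.mean([syn0_ngrams[ngrams[ng]] for ng in known], axis=0)


    if __name__ == '__main__':
        # Toy demo: pretend 'night' was trained, then query the OOV 'nights'.
        rng = np.random.RandomState(0)
        bucket, dim = 100, 4
        syn0_ngrams = rng.rand(bucket, dim)  # stands in for model.wv.syn0_ngrams
        ngrams = {ng: ft_hash_sketch(ng) % bucket
                  for ng in char_ngrams('night', 3, 6)}  # model.wv.ngrams
        print(oov_vector('nights', syn0_ngrams, ngrams))

Keeping the matrix keyed by bucket, rather than re-indexing it to a dense set of seen ngrams as the removed code did, trades some memory for simpler, hash-consistent lookups; distinct ngrams can still collide into the same bucket, exactly as in FastText itself.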