Loading fastText models using only bin file #1341

Merged · merged 34 commits · Jun 28, 2017
Changes from 20 commits

Commits (34):
7759a95
french wiki issue resolved
May 22, 2017
c12b4fa
Merge branch 'develop' into french
prakhar2b May 22, 2017
8025710
bin and vec mismatch handled
prakhar2b May 22, 2017
7ee83d9
updating with latest codes and resolving conflicts
May 23, 2017
041a6e9
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim…
Jun 2, 2017
22c6710
added test from bin only loading
Jun 2, 2017
61be613
[WIP] loading bin only
Jun 2, 2017
e11ac44
word vec from its ngrams
Jun 6, 2017
a63a3bc
[WIP] word vec from ngrams
Jun 6, 2017
f80410f
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim…
Jun 7, 2017
454d74e
[WIP] getting syn0 from all n-grams
Jun 7, 2017
e6b0d8b
[TDD] test comparing word vector from bin_only and default loading
Jun 7, 2017
9b03ea3
cleaned up test code
Jun 8, 2017
c496be9
added docstring for bin_only
Jun 8, 2017
2c4a8dd
Merge branch 'ft_oov_fix' of https://github.com/jayantj/gensim into f…
Jun 12, 2017
d2ab903
resolved wiki.fr issue
Jun 12, 2017
82507d1
pep8 fixes
Jun 12, 2017
c44b958
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim…
Jun 16, 2017
0fc1159
default bin file loading only
Jun 16, 2017
f421b05
logging info modified plus changes a/c review
Jun 19, 2017
68ec73b
removed unused code in fasttext.py
Jun 19, 2017
f7b372e
removed unused codes and vec files from test
Jun 19, 2017
5f7fe02
added lee_fasttext vec files again
Jun 20, 2017
8bd56cf
re-added removed files and unused codes
Jun 21, 2017
b916187
added file name in logging info
Jun 21, 2017
1a0bfc0
removing unused load_word2vec_format code
Jun 22, 2017
98e0287
updated logging info and comments
Jun 22, 2017
f3d2032
input file name with or without .bin both accepted
Jun 22, 2017
bd7e7f6
resolved typo mistake
Jun 22, 2017
800cd01
test for file name
Jun 22, 2017
a15233a
minor change to input filename handling in ft wrapper
jayantj Jun 23, 2017
431aebf
changes to logging and assert messages, pep8 fixes
jayantj Jun 23, 2017
e52fee4
removes redundant .vec files
jayantj Jun 23, 2017
cebb3fc
fixes utf8 bug in flake8_diff.sh script
jayantj Jun 28, 2017
57 changes: 48 additions & 9 deletions gensim/models/wrappers/fasttext.py
@@ -35,7 +35,7 @@
import numpy as np
from numpy import float32 as REAL, sqrt, newaxis
from gensim import utils
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.keyedvectors import KeyedVectors, Vocab
from gensim.models.word2vec import Word2Vec

from six import string_types
@@ -219,9 +219,11 @@ def save(self, *args, **kwargs):
kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_all_norm'])
super(FastText, self).save(*args, **kwargs)

"""
@classmethod
def load_word2vec_format(cls, *args, **kwargs):
return FastTextKeyedVectors.load_word2vec_format(*args, **kwargs)
"""

Comment (Collaborator):
I believe that a load using this method only learns the full-word vectors as in the .vec file. If so, isn't it true that the resulting object doesn't have any other capabilities beyond a plain KeyedVectors? In that case, using a specialized class like FastTextKeyedVectors – that maybe is trying to do more, such as ngram-tracking, but inherently is not because that info was lost in the sequence-of-steps used to load it – seems potentially misleading. So unless I'm misunderstanding, I think this load-technique should use a plain KeyedVectors.

Reply (Contributor Author):
Yes, this method is no longer used now that loading works from the bin file only. I removed this unused code, but got a strange flake8 error on Python 3+, so I re-added it for this PR. I'll try removing the unused code later, maybe in a different PR. @gojomo

Reply (Collaborator):
That is an odd error! I suspect it's not really the presence/absence of that method that triggered it, but something else either random or hidden in the whitespace.

Reply (Contributor Author):
@gojomo ok, test passed this time after removing this code 😄

Reply (Contributor, jayantj, Jun 28, 2017):
For reference, this was a bug in the flake8 script, fixed in cebb3fc
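
To illustrate gojomo's point above: a load from the .vec text file alone yields only full-word vectors, so a plain KeyedVectors already covers everything the resulting object can do. A minimal sketch, assuming a train.vec file in word2vec text format:

    from gensim.models.keyedvectors import KeyedVectors

    # The .vec file holds only full-word vectors and no ngram data, so the
    # loaded object supports in-vocabulary lookups only.
    word_vectors = KeyedVectors.load_word2vec_format('/path/to/train.vec')
    vector = word_vectors['the']  # raises KeyError for out-of-vocabulary words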

"""

@classmethod
def load_fasttext_format(cls, model_file, encoding='utf8'):
@@ -233,11 +235,12 @@ def load_fasttext_format(cls, model_file, encoding='utf8'):

`model_file` is the path to the FastText output files.
FastText outputs two training files - `/path/to/train.vec` and `/path/to/train.bin`
Expected value for this example: `/path/to/train`
Expected value for this example: `/path/to/train`. However, you only need the .bin file to load the entire model.

"""

model = cls()
model.wv = cls.load_word2vec_format('%s.vec' % model_file, encoding=encoding)
model.load_binary_data('%s.bin' % model_file, encoding=encoding)
Comment (Contributor):
I wonder if it would be a good idea to allow taking the whole model filename (including the .bin) as valid input - with the latest changes, we're loading from the bin file only, so it makes intuitive sense for the entire filename to be valid input.
The only reason IMO we're still allowing the filename without extension as valid input is backward compatibility.

return model
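
Sketching the suggestion above (a hypothetical helper, not code from this PR): input with or without the .bin extension could be normalized like this:

    def to_bin_path(model_file):
        # Accept the bare prefix ('/path/to/train') for backward compatibility,
        # or the full binary filename ('/path/to/train.bin'); always return
        # the path to the .bin file.
        if model_file.endswith('.bin'):
            return model_file
        return model_file + '.bin'

    # Either form resolves to the same file:
    # to_bin_path('/path/to/train')     -> '/path/to/train.bin'
    # to_bin_path('/path/to/train.bin') -> '/path/to/train.bin'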

@@ -284,12 +287,12 @@ def load_model_params(self, file_handle):
def load_dict(self, file_handle, encoding='utf8'):
vocab_size, nwords, _ = self.struct_unpack(file_handle, '@3i')
# Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
assert len(self.wv.vocab) == nwords, 'mismatch between vocab sizes'
assert len(self.wv.vocab) == vocab_size, 'mismatch between vocab sizes'
logger.info("loading vocabulary words")

self.struct_unpack(file_handle, '@1q') # number of tokens
if self.new_format:
pruneidx_size, = self.struct_unpack(file_handle, '@q')
for i in range(nwords):
for i in range(vocab_size):
word_bytes = b''
char_byte = file_handle.read(1)
# Read vocab word
@@ -298,8 +301,28 @@
char_byte = file_handle.read(1)
word = word_bytes.decode(encoding)
count, _ = self.struct_unpack(file_handle, '@qb')
assert self.wv.vocab[word].index == i, 'mismatch between gensim word index and fastText word index'
self.wv.vocab[word].count = count

#if word != "__label__":


if i == nwords and i < vocab_size:
    """
    To handle the error in pretrained vector wiki.fr (French).
    For more info : https://github.com/facebookresearch/fastText/issues/218
    """
    assert word == "__label__"
    continue  # don't add word to vocab

Comment (Contributor, jayantj, Jun 22, 2017):

This format -

"""
Some comments
"""

is generally reserved for docstrings. Regular comments would be preferable.

self.wv.vocab[word] = Vocab(index=i, count=count)
self.wv.index2word.append(word)

assert len(self.wv.vocab) == nwords, 'mismatch between vocab sizes'
if len(self.wv.vocab) != vocab_size:
logger.warning("mismatch between vocab sizes")
logger.warning("If you are loading any model other than pretrained vector wiki.fr, ")
Comment (Contributor):
Any particular reason for two separate warning statements? Why not a single one?

logger.warning("Please report to Gensim.")
Comment (Contributor):
Please change the multiple warning statements to a single concatenated statement.

Reply (Owner, piskvorky, Jun 22, 2017):

Also, "Please report to Gensim" is vague, and probably unnecessary.

If people encounter bugs or get exceptions, they'll let us know, don't worry about that.

I'd prefer if the logging message was more concrete instead: what mismatch, what are the mismatched "vocab sizes"?
We want the logs as concrete and useful as possible, for our own sanity (remote debugging via mailing list etc).

Reply (Contributor Author):
@jayantj I remember that a few weeks ago I received a review comment from @piskvorky saying a concatenated statement would contain whitespace, and that it was therefore better to split it into multiple statements. Correct me if I misunderstood that comment.

Reply (Contributor, jayantj, Jun 22, 2017):
I believe @piskvorky meant to split the string itself across multiple lines, like this -

logger.warning(
    "mismatch between vocab sizes "
    "If you are loading any model other than pretrained vector wiki.fr, "
    "Please report to Gensim.")

He left a comment later in the PR clarifying it too.

Reply (Contributor Author):
ohh, thanks for clarifying 😄



if self.new_format:
for j in range(pruneidx_size):
@@ -337,8 +360,12 @@ def init_ngrams(self):
"""
self.wv.ngrams = {}
all_ngrams = []
for w, v in self.wv.vocab.items():
self.wv.syn0 = np.zeros((len(self.wv.vocab), self.vector_size), dtype=REAL)

for w, vocab in self.wv.vocab.items():
all_ngrams += self.compute_ngrams(w, self.wv.min_n, self.wv.max_n)
self.wv.syn0[vocab.index] += np.array(self.wv.syn0_all[vocab.index])

all_ngrams = set(all_ngrams)
self.num_ngram_vectors = len(all_ngrams)
ngram_indices = []
@@ -348,6 +375,18 @@
self.wv.ngrams[ngram] = i
self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0)

ngram_weights = self.wv.syn0_all

logger.info("loading vocabulary weights")
Comment (Owner):
Try to make the INFO messages more informative: loading how many weights, loading from where etc.

When there's a problem, it really helps when the log contains concrete numbers and messages. The problem often becomes apparent even without debugging.
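
For example, a log line along these lines would carry the concrete numbers piskvorky asks for (an illustrative sketch, not the PR's final wording; file_name is a hypothetical variable holding the path being loaded):

    logger.info(
        "loading weights for %d words from %s",
        len(self.wv.vocab), file_name)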


for w, vocab in self.wv.vocab.items():
word_ngrams = self.compute_ngrams(w, self.wv.min_n, self.wv.max_n)
for word_ngram in word_ngrams:
self.wv.syn0[vocab.index] += np.array(ngram_weights[self.wv.ngrams[word_ngram]])

self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1)
logger.info("loaded %s matrix", self.wv.syn0.shape)

@staticmethod
def compute_ngrams(word, min_n, max_n):
ngram_indices = []
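The body of compute_ngrams is collapsed in this diff view. For reference, fastText-style character ngrams come from the word wrapped in boundary markers; a rough sketch of that scheme (an assumption for illustration, not the exact code from this PR):

    def compute_ngrams_sketch(word, min_n, max_n):
        # fastText pads the word with boundary symbols, e.g. 'where' -> '<where>',
        # then collects every character ngram of length min_n through max_n.
        extended = '<' + word + '>'
        ngrams = []
        for n in range(min_n, max_n + 1):
            for i in range(len(extended) - n + 1):
                ngrams.append(extended[i:i + n])
        return ngrams

    # compute_ngrams_sketch('hi', 3, 4) -> ['<hi', 'hi>', '<hi>']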