avoid collisions when decoding bad unicode (#2411)

* avoid collisions when decoding bad unicode
* Py2.7 support
* improve Py2.7 handling during collision avoidance
* avoid division by zero
* backport backslashreplace for Py2
piskvorky · Apr 6, 2019 · bd199aa · bd199aa
1 parent 54d2f69
commit bd199aa
Show file tree

Hide file tree

Showing 3 changed files with 62 additions and 4 deletions.
diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
@@ -29,12 +29,14 @@
 
 """
 
+import codecs
 import collections
 import io
 import logging
 import struct
 
 import numpy as np
+import six
 
 _END_OF_WORD_MARKER = b'\x00'
 
@@ -182,9 +184,9 @@ def _load_vocab(fin, new_format, encoding='utf-8'):
         try:
             word = word_bytes.decode(encoding)
         except UnicodeDecodeError:
-            word = word_bytes.decode(encoding, errors='ignore')
+            word = word_bytes.decode(encoding, errors='backslashreplace')
             logger.error(
-                'failed to decode invalid unicode bytes %r; ignoring invalid characters, using %r',
+                'failed to decode invalid unicode bytes %r; replacing invalid characters, using %r',
                 word_bytes, word
             )
         count, _ = _struct_unpack(fin, '@qb')
@@ -280,3 +282,39 @@ def load(fin, encoding='utf-8', full_model=True):
     model.update(vectors_ngrams=vectors_ngrams, hidden_output=hidden_output)
     model = {k: v for k, v in model.items() if k in _FIELD_NAMES}
     return Model(**model)
+
+
+def _backslashreplace_backport(ex):
+    """Replace the part of a string that failed to decode/encode with backslash escapes.
+
+    Does the same thing as errors="backslashreplace" from Python 3.  Python 2
+    lacks this functionality for decoding, so we need to backport it.
+
+    Parameters
+    ----------
+    ex: UnicodeError
+        A UnicodeDecodeError, UnicodeEncodeError or UnicodeTranslateError.
+        Its `object`, `start` and `end` attributes identify the bad portion.
+
+    Returns
+    -------
+    text: unicode
+        The escaped replacement for the bad section: ``\\xNN`` for every
+        undecodable byte; ``\\xNN``, ``\\uNNNN`` or ``\\UNNNNNNNN`` for every
+        unencodable character, depending on its code point.
+    end: int
+        The index from which to continue processing.
+
+    Raises
+    ------
+    TypeError
+        If `ex` is not one of the supported Unicode error types.
+
+    Note
+    ----
+    Only needed on Py2 (Py3 already has backslashreplace built-in), but the
+    function itself behaves the same on both.
+
+    """
+    #
+    # Based on:
+    # https://stackoverflow.com/questions/42860186/exact-equivalent-of-b-decodeutf-8-backslashreplace-in-python-2
+    #
+    if isinstance(ex, UnicodeDecodeError):
+        #
+        # bytearray yields integers on both Py2 and Py3, whereas iterating
+        # over the raw slice yields 1-char strings on Py2 and ints on Py3.
+        #
+        codes = bytearray(ex.object[ex.start:ex.end])
+    elif isinstance(ex, (UnicodeEncodeError, UnicodeTranslateError)):
+        #
+        # A handler registered as 'backslashreplace' is also invoked for
+        # encoding errors, so those must keep producing valid escapes.
+        #
+        codes = [ord(c) for c in ex.object[ex.start:ex.end]]
+    else:
+        raise TypeError("don't know how to handle %s in error callback" % type(ex).__name__)
+
+    def escape(code):
+        # Pick the shortest escape form that can represent the code point.
+        if code < 0x100:
+            return u'\\x{:02x}'.format(code)
+        if code < 0x10000:
+            return u'\\u{:04x}'.format(code)
+        return u'\\U{:08x}'.format(code)
+
+    text = u''.join(escape(code) for code in codes)
+    return text, ex.end
+
+
+#
+# Install the backport under the name 'backslashreplace' on Py2 only, so that
+# bytes.decode(..., errors='backslashreplace') works there as well.
+# NOTE(review): codecs.register_error is process-wide; other code that relies
+# on a handler with this name on Py2 will get the backport - confirm that is intended.
+#
+if six.PY2:
+    codecs.register_error('backslashreplace', _backslashreplace_backport)
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
@@ -2098,6 +2098,17 @@ def word_vec(self, word, use_norm=False):
             else:
                 ngram_weights = self.vectors_ngrams
             ngram_hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash)
+            if len(ngram_hashes) == 0:
+                #
+                # If it is impossible to extract _any_ ngrams from the input
+                # word, then the best we can do is return a vector that points
+                # to the origin.  The reference FB implementation does this,
+                # too.
+                #
+                # https://github.com/RaRe-Technologies/gensim/issues/2402
+                #
+                logger.warning('could not extract any ngrams from %r, returning origin vector', word)
+                return word_vec
             for nh in ngram_hashes:
                 word_vec += ngram_weights[nh]
             return word_vec / len(ngram_hashes)

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
@@ -1119,6 +1119,13 @@ def test_load_native_vectors(self):
         iv_vector = fbkv['landlady']
         self.assertFalse(np.allclose(oov_vector, iv_vector))
 
+    def test_no_ngrams(self):
+        """Looking up the empty string must yield the all-zero (origin) vector."""
+        fb_model = gensim.models.fasttext.load_facebook_model(datapath('crime-and-punishment.bin'))
+        vector = fb_model.wv['']
+
+        self.assertTrue(np.allclose(vector, np.zeros(vector.shape, vector.dtype)))
+
 
 def _train_model_with_pretrained_vectors():
     """Generate toy-model-pretrained.bin for use in test_load_native_pretrained.
@@ -1261,10 +1268,12 @@ def test_bad_unicode(self):
         buf.seek(0)
 
         raw_vocab, vocab_size, nlabels = gensim.models._fasttext_bin._load_vocab(buf, False)
+
         expected = {
-            u'英語版ウィキペディアへの投稿はいつでも': 1,
-            u'административно-территориальн': 2,
+            u'英語版ウィキペディアへの投稿はいつでも\\xe6': 1,
+            u'административно-территориальн\\xd1': 2,
         }
+
         self.assertEqual(expected, dict(raw_vocab))
 
         self.assertEqual(vocab_size, 2)