Skip to content

Commit

Permalink
avoid collisions when decoding bad unicode (#2411)
Browse files Browse the repository at this point in the history
* avoid collisions when decoding bad unicode

* Py2.7 support

* improve Py2.7 handling during collision avoidance

* avoid division by zero

* backport backslashreplace for Py2
  • Loading branch information
mpenkov authored Apr 6, 2019
1 parent 54d2f69 commit bd199aa
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 4 deletions.
42 changes: 40 additions & 2 deletions gensim/models/_fasttext_bin.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@
"""

import codecs
import collections
import io
import logging
import struct

import numpy as np
import six

_END_OF_WORD_MARKER = b'\x00'

Expand Down Expand Up @@ -182,9 +184,9 @@ def _load_vocab(fin, new_format, encoding='utf-8'):
try:
word = word_bytes.decode(encoding)
except UnicodeDecodeError:
word = word_bytes.decode(encoding, errors='ignore')
word = word_bytes.decode(encoding, errors='backslashreplace')
logger.error(
'failed to decode invalid unicode bytes %r; ignoring invalid characters, using %r',
'failed to decode invalid unicode bytes %r; replacing invalid characters, using %r',
word_bytes, word
)
count, _ = _struct_unpack(fin, '@qb')
Expand Down Expand Up @@ -280,3 +282,39 @@ def load(fin, encoding='utf-8', full_model=True):
model.update(vectors_ngrams=vectors_ngrams, hidden_output=hidden_output)
model = {k: v for k, v in model.items() if k in _FIELD_NAMES}
return Model(**model)


def _backslashreplace_backport(ex):
"""Replace byte sequences that failed to decode with character escapes.
Does the same thing as errors="backslashreplace" from Python 3. Python 2
lacks this functionality out of the box, so we need to backport it.
Parameters
----------
ex: UnicodeDecodeError
contains arguments of the string and start/end indexes of the bad portion.
Returns
-------
text: unicode
The Unicode string corresponding to the decoding of the bad section.
end: int
The index from which to continue decoding.
Note
----
Works on Py2 only. Py3 already has backslashreplace built-in.
"""
#
# Based on:
# https://stackoverflow.com/questions/42860186/exact-equivalent-of-b-decodeutf-8-backslashreplace-in-python-2
#
bstr, start, end = ex.object, ex.start, ex.end
text = u''.join('\\x{:02x}'.format(ord(c)) for c in bstr[start:end])
return text, end


# Python 2 lacks a decode-side "backslashreplace" error handler, so install the
# backport under that name there; on Python 3 nothing is registered and the
# built-in handler stays in effect.
if six.PY2:
codecs.register_error('backslashreplace', _backslashreplace_backport)
11 changes: 11 additions & 0 deletions gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2098,6 +2098,17 @@ def word_vec(self, word, use_norm=False):
else:
ngram_weights = self.vectors_ngrams
ngram_hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash)
if len(ngram_hashes) == 0:
#
# If it is impossible to extract _any_ ngrams from the input
# word, then the best we can do is return a vector that points
# to the origin. The reference FB implementation does this,
# too.
#
# https://github.com/RaRe-Technologies/gensim/issues/2402
#
logger.warning('could not extract any ngrams from %r, returning origin vector', word)
return word_vec
for nh in ngram_hashes:
word_vec += ngram_weights[nh]
return word_vec / len(ngram_hashes)
Expand Down
13 changes: 11 additions & 2 deletions gensim/test/test_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1119,6 +1119,13 @@ def test_load_native_vectors(self):
iv_vector = fbkv['landlady']
self.assertFalse(np.allclose(oov_vector, iv_vector))

# Looking up the empty string in the loaded model's word vectors must return
# a vector that is all zeros (i.e. the origin), with the same shape and dtype
# as the returned vector.
def test_no_ngrams(self):
model = gensim.models.fasttext.load_facebook_model(datapath('crime-and-punishment.bin'))

v1 = model.wv['']
origin = np.zeros(v1.shape, v1.dtype)
self.assertTrue(np.allclose(v1, origin))


def _train_model_with_pretrained_vectors():
"""Generate toy-model-pretrained.bin for use in test_load_native_pretrained.
Expand Down Expand Up @@ -1261,10 +1268,12 @@ def test_bad_unicode(self):
buf.seek(0)

raw_vocab, vocab_size, nlabels = gensim.models._fasttext_bin._load_vocab(buf, False)

expected = {
u'英語版ウィキペディアへの投稿はいつでも': 1,
u'административно-территориальн': 2,
u'英語版ウィキペディアへの投稿はいつでも\\xe6': 1,
u'административно-территориальн\\xd1': 2,
}

self.assertEqual(expected, dict(raw_vocab))

self.assertEqual(vocab_size, 2)
Expand Down

0 comments on commit bd199aa

Please sign in to comment.