Skip to content

Commit

Permalink
respond to review comments
Browse files: browse the repository at this point in the history
  • Loading branch information
mpenkov committed Feb 20, 2019
1 parent 6dc4aef commit 2983a55
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions gensim/models/_fasttext_bin.py
Original file line number Diff line number Diff line change
@@ -36,9 +36,6 @@

import numpy as np

- _UNICODE_REPLACE = u'\ufffd'
- """The character Python's unicode handling uses to denote characters that couldn't be decoded."""

_END_OF_WORD_MARKER = b'\x00'

logger = logging.getLogger(__name__)
@@ -185,8 +182,11 @@ def _load_vocab(fin, new_format, encoding='utf-8'):
try:
word = word_bytes.decode(encoding)
except UnicodeDecodeError:
- word = word_bytes.decode(encoding, errors='replace').replace(_UNICODE_REPLACE, '')
- logger.error('unable to cleanly decode bytes %r to word %r', word_bytes, word)
+ word = word_bytes.decode(encoding, errors='ignore')
+ logger.error(
+ 'failed to decode invalid unicode bytes %r; ignoring invalid characters, using %r',
+ word_bytes, word
+ )
count, _ = _struct_unpack(fin, '@qb')
raw_vocab[word] = count

Expand Down

0 comments on commit 2983a55

Please sign in to comment.