From 6dc4aeff2d91416eb7b8c5b787f8808544880627 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Wed, 13 Feb 2019 13:23:09 +0900 Subject: [PATCH] avoid byte concatenation --- gensim/models/_fasttext_bin.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index e44b293b11..10d1a602b6 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -30,6 +30,7 @@ """ import collections +import io import logging import struct @@ -173,12 +174,14 @@ def _load_vocab(fin, new_format, encoding='utf-8'): raw_vocab = collections.OrderedDict() for i in range(vocab_size): - word_bytes = b'' + word_bytes = io.BytesIO() char_byte = fin.read(1) - # Read vocab word + while char_byte != _END_OF_WORD_MARKER: - word_bytes += char_byte + word_bytes.write(char_byte) char_byte = fin.read(1) + + word_bytes = word_bytes.getvalue() try: word = word_bytes.decode(encoding) except UnicodeDecodeError: