Optimize FastText.load_fasttext_format #2340

Merged 35 commits (branch fb-improv into develop), Jan 24, 2019. The file diffs below show the changes from 3 of the 35 commits.

Commits
e1b9ba0  add docstring for Model namedtuple (mpenkov, Jan 19, 2019)
53497a6  add option to skip hidden matrix loading (mpenkov, Jan 19, 2019)
6ae0bb9  review response: rename fast -> full_model (mpenkov, Jan 19, 2019)
3406bf0  speed up hash function based on ideas from @horpto and @menshikh-iv (mpenkov, Jan 20, 2019)
e458ff0  remove obsolete ft_hash function (mpenkov, Jan 20, 2019)
8144e20  review response: update docstring (mpenkov, Jan 20, 2019)
d735099  attempt to hack around appveyor Py2.7 build missing stdint.h (mpenkov, Jan 21, 2019)
fd32340  fixup: add missing int8_t typedef (mpenkov, Jan 21, 2019)
4e022d8  Merge remote-tracking branch 'upstream/develop' into fb-improv (mpenkov, Jan 21, 2019)
8e7bd40  review response: avoid split and join (mpenkov, Jan 22, 2019)
2ea4672  review response: add comment to explain hack (mpenkov, Jan 22, 2019)
9137a27  review response: improve logging message (mpenkov, Jan 22, 2019)
a654c3c  review response: fix hash_main function (mpenkov, Jan 22, 2019)
f392d80  fixup: fix test_utils.py (mpenkov, Jan 22, 2019)
4fd56d5  add tests for ngram generation (mpenkov, Jan 22, 2019)
98d0a09  Merge remote-tracking branch 'upstream/develop' into fb-improv (mpenkov, Jan 22, 2019)
b614030  fixup in tests (mpenkov, Jan 22, 2019)
46e1ec1  add emoji test case (mpenkov, Jan 22, 2019)
d1b80c0  minor fixup in logging message (mpenkov, Jan 22, 2019)
07c49c2  add byte tests (mpenkov, Jan 22, 2019)
ccaba08  remove FIXME, absence of ord does not influence correctness (mpenkov, Jan 22, 2019)
0b0e46e  review response: introduce list slicing (mpenkov, Jan 22, 2019)
cdab5b6  avoid using f-strings for Py2 compatibility (mpenkov, Jan 23, 2019)
d13e4c9  flake8 (mpenkov, Jan 23, 2019)
a412e48  more Py2 compatibility (mpenkov, Jan 23, 2019)
6617b80  flake8 (mpenkov, Jan 23, 2019)
da025d3  review response: get rid of set() (mpenkov, Jan 23, 2019)
402db7d  review response: remove excess bytes() call (mpenkov, Jan 23, 2019)
bb7282d  fix tests (wide unicode issue) (menshikh-iv, Jan 23, 2019)
8e72e18  Merge remote-tracking branch 'upstream/develop' into fb-improv (menshikh-iv, Jan 23, 2019)
07baf84  add test against actual FB implementation (mpenkov, Jan 23, 2019)
ce1a631  adding temporary benchmarking code (mpenkov, Jan 24, 2019)
e87ca05  replacing non-optimized code with optimized code (mpenkov, Jan 24, 2019)
38146bd  removing temporary benchmarking code (mpenkov, Jan 24, 2019)
136dc85  remove wide characters from fb test code (mpenkov, Jan 24, 2019)
gensim/models/_fasttext_bin.py (53 additions, 5 deletions)

@@ -79,6 +79,49 @@ def _yield_field_names():
 
 _FIELD_NAMES = sorted(set(_yield_field_names()))
 Model = collections.namedtuple('Model', _FIELD_NAMES)
+"""Holds data loaded from the Facebook binary.
+
+Fields
+------
+dim : int
+    The dimensionality of the vectors.
+ws : int
+    The window size.
+epoch : int
+    The number of training epochs.
+neg : int
+    If non-zero, indicates that the model uses negative sampling.
+loss : int
+    If equal to 1, indicates that the model uses hierarchical softmax.
+model : int
+    If equal to 2, indicates that the model uses skip-grams.
+bucket : int
+    The number of buckets.
+min_count : int
+    The threshold below which the model ignores terms.
+t : float
+    The sample threshold.
+minn : int
+    The minimum ngram length.
+maxn : int
+    The maximum ngram length.
+raw_vocab : collections.OrderedDict
+    A map from words (str) to their frequency (int). The order in the dict
+    corresponds to the order of the words in the Facebook binary.
+nwords : int
+    The number of words.
+vocab_size : int
+    The size of the vocabulary.
+vectors_ngrams : numpy.array
+    A matrix containing the vectors learned by the model.
+    Each row corresponds to a vector.
+    The number of vectors equals the number of words plus the number of buckets.
+    The number of columns equals the vector dimensionality.
+hidden_output : numpy.array
+    A matrix containing the shallow neural network output.
+    This array has the same dimensions as vectors_ngrams.
+    May be None; in that case, it is impossible to continue training the model.
+"""
 
 
 def _struct_unpack(fin, fmt):

@@ -177,7 +220,7 @@ def _load_matrix(fin, new_format=True):
     return matrix
 
 
-def load(fin, encoding='utf-8'):
+def load(fin, encoding='utf-8', full_model=True):
     """Load a model from a binary stream.
 
     Parameters
@@ -186,6 +229,9 @@ def load(fin, encoding='utf-8'):
         The readable binary stream.
     encoding : str, optional
        The encoding to use for decoding text
+    full_model : boolean, optional
+        If False, skips loading the hidden output matrix. This saves a fair bit
+        of time, but prevents training continuation.
 
     Returns
     -------
@@ -209,10 +255,12 @@ def load(fin, encoding='utf-8'):
 
     vectors_ngrams = _load_matrix(fin, new_format=new_format)
 
-    hidden_output = _load_matrix(fin, new_format=new_format)
-    model.update(vectors_ngrams=vectors_ngrams, hidden_output=hidden_output)
-
-    assert fin.read() == b'', 'expected to reach EOF'
+    if not full_model:
+        hidden_output = None
+    else:
+        hidden_output = _load_matrix(fin, new_format=new_format)
+        assert fin.read() == b'', 'expected to reach EOF'
 
+    model.update(vectors_ngrams=vectors_ngrams, hidden_output=hidden_output)
     model = {k: v for k, v in model.items() if k in _FIELD_NAMES}
     return Model(**model)
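
To illustrate the effect of the new flag at the lowest level, here is a minimal sketch (not part of the PR; the model path is hypothetical) that loads a native Facebook binary with full_model=False and inspects the resulting Model tuple:

    # Minimal sketch, not part of the PR: exercise the low-level loader with
    # full_model=False. The model path is hypothetical.
    from gensim.models import _fasttext_bin

    with open('/path/to/model.bin', 'rb') as fin:
        m = _fasttext_bin.load(fin, full_model=False)

    # Hyperparameters parsed from the binary's header.
    print(m.dim, m.nwords, m.bucket)

    # One row per word plus one per bucket, dim columns.
    print(m.vectors_ngrams.shape)

    # None when full_model=False: training continuation is impossible.
    print(m.hidden_output)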
gensim/models/fasttext.py (17 additions, 4 deletions)

@@ -704,6 +704,13 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor
         >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
 
         """
+        cant_train = hasattr(self.trainables, 'syn1neg') and self.trainables.syn1neg is None
+        if cant_train:
+            raise ValueError(
+                'this model cannot be trained any further, '
+                'if this is a native model, try loading it with full_model=True'
+            )
+
         super(FastText, self).train(
             sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,

Review discussion on the cant_train check:

> Contributor: Stupid question: what if self.trainables does not have the syn1neg attr at all? Can the model train then?

> mpenkov (Author): Yes. I don't see any other code that sets syn1neg to None. So, the new code uses that value to mean "cannot continue training". If trainables does not have syn1neg at all, it is possible to start training.

@@ -754,7 +761,7 @@ def __contains__(self, word):
         return self.wv.__contains__(word)
 
     @classmethod
-    def load_fasttext_format(cls, model_file, encoding='utf8'):
+    def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
         """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.
 
         Notes
@@ -770,14 +777,17 @@ def load_fasttext_format(cls, model_file, encoding='utf8'):
             as Gensim requires only the `.bin` file to load the entire fastText model.
         encoding : str, optional
             Specifies the file encoding.
+        full_model : boolean, optional
+            If False, skips loading the hidden output matrix. This saves a fair bit
+            of time, but prevents training continuation.
 
         Returns
         -------
         :class:`~gensim.models.fasttext.FastText`
             The loaded model.
 
         """
-        return _load_fasttext_format(model_file, encoding=encoding)
+        return _load_fasttext_format(model_file, encoding=encoding, full_model=full_model)
 
     def load_binary_data(self, encoding='utf8'):
         """Load data from a binary file created by Facebook's native FastText.

@@ -959,7 +969,7 @@ def _pad_ones(m, new_shape):
     return vstack([m, suffix])
 
 
-def _load_fasttext_format(model_file, encoding='utf-8'):
+def _load_fasttext_format(model_file, encoding='utf-8', full_model=True):
     """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.
 
     Parameters
@@ -971,6 +981,9 @@ def _load_fasttext_format(model_file, encoding='utf-8'):
        as Gensim requires only the `.bin` file to load the entire fastText model.
    encoding : str, optional
        Specifies the file encoding.
+    full_model : boolean, optional
+        If False, skips loading the hidden output matrix. This saves a fair bit
+        of time, but prevents training continuation.
 
     Returns
     -------
@@ -980,7 +993,7 @@ def _load_fasttext_format(model_file, encoding='utf-8'):
     if not model_file.endswith('.bin'):
         model_file += '.bin'
     with smart_open(model_file, 'rb') as fin:
-        m = gensim.models._fasttext_bin.load(fin, encoding=encoding)
+        m = gensim.models._fasttext_bin.load(fin, encoding=encoding, full_model=full_model)
 
     model = FastText(
         size=m.dim,
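
Putting the user-facing pieces together, here is a hedged usage sketch (the model path is hypothetical): a partially loaded model still serves vector lookups, including out-of-vocabulary words, while train() now fails fast.

    # Usage sketch, assuming a native fastText binary at a hypothetical path.
    from gensim.models import FastText

    model = FastText.load_fasttext_format('/path/to/model', full_model=False)

    # Lookups still work, even for out-of-vocabulary words, because the
    # ngram vectors were loaded.
    vec = model.wv['hello']

    # Training continuation is blocked: syn1neg is None, so train() raises.
    sentences = [['hello', 'world']]
    try:
        model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    except ValueError as err:
        print(err)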
gensim/test/test_fasttext.py (15 additions, 0 deletions)

@@ -59,6 +59,21 @@ def setUp(self):
         self.test_model = FT_gensim.load_fasttext_format(self.test_model_file)
         self.test_new_model_file = datapath('lee_fasttext_new')
 
+    def test_native_partial_model(self):
+        """Can we skip loading the NN and still get a working model?"""
+        model = FT_gensim.load_fasttext_format(self.test_model_file, full_model=False)
+
+        #
+        # Training continuation should be impossible
+        #
+        self.assertIsNone(model.trainables.syn1neg)
+        self.assertRaises(ValueError, model.train, sentences,
+                          total_examples=model.corpus_count, epochs=model.epochs)
+
+        model.wv['green']
+        model.wv['foobar']
+        model.wv['thisworddoesnotexist']
+
     def test_training(self):
         model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
         model.build_vocab(sentences)
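
The commit history mentions temporary benchmarking code that was removed before the merge. A rough stand-in for measuring the saving yourself (the path and repeat count are illustrative, not from the PR):

    # Illustrative timing sketch; the PR's own benchmark was temporary and
    # removed before merge. The model path and repeat count are arbitrary.
    import timeit

    setup = "from gensim.models import FastText"
    full = timeit.timeit(
        "FastText.load_fasttext_format('/path/to/model', full_model=True)",
        setup=setup, number=3,
    )
    partial = timeit.timeit(
        "FastText.load_fasttext_format('/path/to/model', full_model=False)",
        setup=setup, number=3,
    )
    # % formatting keeps this Py2-compatible, matching the PR's constraints.
    print('full: %.2fs, partial: %.2fs per load' % (full / 3, partial / 3))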