Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Enable continuation of training of models loaded from native fastText #2299

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 76 additions & 23 deletions gensim/models/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,33 +863,31 @@ def _load_vectors(self, file_handle):
Open file handle to persisted vectors.

"""
if self.new_format:
self.struct_unpack(file_handle, '@?') # bool quant_input in fasttext.cc
num_vectors, dim = self.struct_unpack(file_handle, '@2q')
# Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc)
assert self.wv.vector_size == dim, (
'mismatch between vector size in model params ({}) and model vectors ({})'
.format(self.wv.vector_size, dim)
self.wv.vectors_ngrams = _load_matrix(
file_handle,
new_format=self.new_format,
expected_vector_size=self.wv.vector_size
)
float_size = struct.calcsize('@f')
if float_size == 4:
dtype = np.dtype(np.float32)
elif float_size == 8:
dtype = np.dtype(np.float64)

self.num_original_vectors = num_vectors
self.wv.vectors_ngrams = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim)
self.wv.vectors_ngrams = self.wv.vectors_ngrams.reshape((num_vectors, dim))
assert self.wv.vectors_ngrams.shape == (
self.trainables.bucket + len(self.wv.vocab), self.wv.vector_size), \
'mismatch between actual weight matrix shape {} and expected shape {}'\
.format(
self.wv.vectors_ngrams.shape, (self.trainables.bucket + len(self.wv.vocab), self.wv.vector_size)
self.num_original_vectors = self.wv.vectors_ngrams.shape[0]

expected_shape = (self.trainables.bucket + len(self.wv.vocab), self.wv.vector_size)
assert self.wv.vectors_ngrams.shape == expected_shape, \
'mismatch between actual weight matrix shape {} and expected shape {}'.format(
self.wv.vectors_ngrams.shape, expected_shape
)

self.trainables.init_ngrams_post_load(self.file_name, self.wv)
self._clear_post_train()

#
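# FIXME: the hidden-output matrix is read from the file here but is not stored anywhere yet.
# We will need it later (presumably to continue training a model loaded from native fastText).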
#
hidden_output = _load_matrix(
file_handle,
new_format=self.new_format,
expected_vector_size=self.wv.vector_size
)
mpenkov marked this conversation as resolved.
Show resolved Hide resolved

def struct_unpack(self, file_handle, fmt):
"""Read a single object from an open file.

Expand All @@ -906,8 +904,7 @@ def struct_unpack(self, file_handle, fmt):
Unpacked structure.

"""
num_bytes = struct.calcsize(fmt)
return struct.unpack(fmt, file_handle.read(num_bytes))
return _struct_unpack(file_handle, fmt)

def save(self, *args, **kwargs):
"""Save the Fasttext model. This saved model can be loaded again using
Expand Down Expand Up @@ -967,6 +964,62 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse
return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)


def _struct_unpack(file_handle, fmt):
    """Read a single packed structure from an open binary file.

    Parameters
    ----------
    file_handle : file
        A file handle opened for reading in binary mode.
    fmt : str
        Format string in :mod:`struct` syntax describing the data to read.

    Returns
    -------
    tuple
        The unpacked values, one element per item in `fmt`.

    """
    # Compile the format once; it tells us exactly how many bytes to consume.
    layout = struct.Struct(fmt)
    raw = file_handle.read(layout.size)
    return layout.unpack(raw)


def _load_matrix(file_handle, new_format=True, expected_vector_size=None):
    """Load a matrix from fastText native format.

    Interprets the matrix dimensions and type from the file stream.

    Parameters
    ----------
    file_handle : file
        A file handle opened for reading.
    new_format : boolean
        True if the quant_input variable precedes
        the matrix declaration.  Should be True for newer versions of fastText.
    expected_vector_size : int
        The expected dimensionality of each vector.
        If you specify this and the matrix's dimensionality is different,
        will raise an assertion.

    Returns
    -------
    :class:`numpy.array`
        The vectors as an array.
        Each vector will be a row in the array.
        The number of columns of the array will correspond to the vector size.

    Raises
    ------
    AssertionError
        If `expected_vector_size` is given and differs from the dimensionality
        stored in the file.
    ValueError
        If the stored dimensions are negative, if the platform float size is
        unsupported, or if the file holds fewer values than the header declares.

    See Also
    --------
    https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc

    """
    if new_format:
        _struct_unpack(file_handle, '@?')  # bool quant_input in fasttext.cc

    num_vectors, dim = _struct_unpack(file_handle, '@2q')
    assert expected_vector_size is None or expected_vector_size == dim, (
        'mismatch between vector size in model params ({}) and model vectors ({})'
        .format(expected_vector_size, dim)
    )

    # The header is read as signed integers, so a corrupt file can yield negative
    # dimensions.  A negative count would make np.fromfile read everything that is
    # left in the file, and reshape would silently accept a -1 dimension.
    if num_vectors < 0 or dim < 0:
        raise ValueError("Invalid matrix dimensions: %r x %r" % (num_vectors, dim))

    float_size = struct.calcsize('@f')
    if float_size == 4:
        dtype = np.dtype(np.float32)
    elif float_size == 8:
        dtype = np.dtype(np.float64)
    else:
        raise ValueError("Incompatible float size: %r" % float_size)

    count = num_vectors * dim
    matrix = np.fromfile(file_handle, dtype=dtype, count=count)
    # np.fromfile returns a shorter array when the file ends early; report that
    # explicitly instead of letting reshape fail with an unrelated message.
    if matrix.size != count:
        raise ValueError(
            "Truncated matrix data: expected %r values, got %r" % (count, matrix.size)
        )
    return matrix.reshape((num_vectors, dim))


class FastTextVocab(Word2VecVocab):
"""Vocabulary used by :class:`~gensim.models.fasttext.FastText`."""
def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75):
Expand Down
1 change: 1 addition & 0 deletions gensim/test/test_data/toy-data.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economic institutions anarchists advocate social relations based upon voluntary association of autonomous individuals mutual aid and self governance while anarchism is most easily defined by what it is against anarchists also offer positive visions of what they believe to be a truly free society however ideas about how an anarchist society might work vary considerably especially with respect to economics there is also disagreement about how a free society might be brought about origins and predecessors kropotkin and others argue that before recorded history human society was organized on anarchist principles most anthropologists follow kropotkin and engels in believing that hunter gatherer bands were egalitarian and lacked division of labour accumulated wealth or decreed law and had equal access to resources william godwin anarchists including the the anarchy organisation and rothbard find anarchist attitudes in taoism from ancient china kropotkin found similar ideas in stoic zeno of citium according to 
kropotkin zeno repudiated the omnipotence of the state its intervention and regimentation and proclaimed the sovereignty of the moral law of the individual the anabaptists of one six th century europe are sometimes considered to be religious forerunners of modern anarchism bertrand russell in his history of western philosophy writes that the anabaptists repudiated all law since they held that the good man will be guided at every moment by the holy spirit from this premise they arrive at communism the diggers or true levellers were an early communistic movement during the time of the english civil war and are considered by some as forerunners of modern anarchism in the modern era the first to use the term to mean something other than chaos was louis armand baron de lahontan in his nouveaux voyages dans l am rique septentrionale one seven zero three where he described the indigenous american society which had no state laws prisons priests or private property as being in anarchy russell means a libertarian and leader in the american indian movement has repeatedly stated that he is an anarchist and so are all his ancestors in one seven nine three in the thick of the french revolution william godwin published an enquiry concerning political justice although godwin did not use the word anarchism many later anarchists have regarded this book as the first major anarchist text and godwin as the founder of philosophical anarchism but at this point no anarchist movement yet existed and the term anarchiste was known mainly as an insult hurled by the bourgeois girondins at more radical elements in the french revolution the first self labelled anarchist pierre joseph proudhon it is commonly held that it wasn t until pierre joseph proudhon published what is property in one eight four zero that the term anarchist was adopted as a self description it is for this reason that some claim proudhon as the founder of modern anarchist theory in what is property proudhon answers with the 
famous accusation property is theft in this work he opposed the institution of decreed property p
Binary file added gensim/test/test_data/toy-model.bin
Binary file not shown.
Loading