diff --git a/gensim/matutils.py b/gensim/matutils.py
index 38e2431caf..717612cb0f 100644
--- a/gensim/matutils.py
+++ b/gensim/matutils.py
@@ -602,13 +602,12 @@ def jaccard_distance(set1, set2):
 def dirichlet_expectation(alpha):
     """
     For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`.
-
     """
     if len(alpha.shape) == 1:
         result = psi(alpha) - psi(np.sum(alpha))
     else:
         result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
-    return result.astype(alpha.dtype)  # keep the same precision as input
+    return result.astype(alpha.dtype, copy=False)  # keep the same precision as input


 def qr_destroy(la):
diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py
index 9c691b8c25..6abcf6d34a 100755
--- a/gensim/models/atmodel.py
+++ b/gensim/models/atmodel.py
@@ -70,6 +70,7 @@ def __init__(self, eta, lambda_shape, gamma_shape):
         self.sstats = np.zeros(lambda_shape)
         self.gamma = np.zeros(gamma_shape)
         self.numdocs = 0
+        self.dtype = np.float64  # To be compatible with LdaState


 def construct_doc2author(corpus, author2doc):
@@ -203,6 +204,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d
         >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5)  # train asymmetric alpha from data

         """
+        # NOTE: this doesn't call the constructor of the base class, but duplicates most of its code,
+        # so we have to set the dtype to the float64 default here
+        self.dtype = np.float64

         # NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
         # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState.
diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py
index ebae837b57..aeb771e637 100755
--- a/gensim/models/hdpmodel.py
+++ b/gensim/models/hdpmodel.py
@@ -538,7 +538,7 @@ def suggested_lda_model(self):
         The num_topics is m_T (default is 150) so as to preserve the matrice shapes when we assign alpha and beta.
         """
         alpha, beta = self.hdp_to_lda()
-        ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state)
+        ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state, dtype=np.float64)
         ldam.expElogbeta[:] = beta
         return ldam

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 543ee37c3e..a469608bd2 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -60,7 +60,7 @@ def update_dir_prior(prior, N, logphat, rho):
    **Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.**
    http://jonathan-huang.org/research/dirichlet/dirichlet.pdf
    """
-    dprior = np.copy(prior)
+    dprior = np.copy(prior)  # TODO: unused var???
     gradf = N * (psi(np.sum(prior)) - psi(prior) + logphat)

     c = N * polygamma(1, np.sum(prior))
@@ -87,10 +87,11 @@ class LdaState(utils.SaveLoad):
     """

-    def __init__(self, eta, shape):
-        self.eta = eta
-        self.sstats = np.zeros(shape)
+    def __init__(self, eta, shape, dtype=np.float32):
+        self.eta = eta.astype(dtype, copy=False)
+        self.sstats = np.zeros(shape, dtype=dtype)
         self.numdocs = 0
+        self.dtype = dtype

     def reset(self):
         """
@@ -165,6 +166,17 @@ def get_lambda(self):

     def get_Elogbeta(self):
         return dirichlet_expectation(self.get_lambda())
+
+    @classmethod
+    def load(cls, fname, *args, **kwargs):
+        result = super(LdaState, cls).load(fname, *args, **kwargs)
+
+        # dtype could be absent in old models
+        if not hasattr(result, 'dtype'):
+            result.dtype = np.float64  # float64 was implicitly used before (because it's the default in numpy)
+            logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)
+
+        return result
 # endclass LdaState
@@ -191,7 +203,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
                  iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
                  random_state=None, ns_conf=None, minimum_phi_value=0.01,
-                 per_word_topics=False, callbacks=None):
+                 per_word_topics=False, callbacks=None, dtype=np.float32):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -233,9 +245,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         `minimum_probability` controls filtering the topics returned for a document (bow).

-        `random_state` can be a np.random.RandomState object or the seed for one
+        `random_state` can be a np.random.RandomState object or the seed for one.
+
+        `callbacks` a list of metric callbacks to log/visualize evaluation metrics of the topic model during training.

-        `callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training
+        `dtype` is the data type to use during calculations inside the model. All inputs are also converted to this dtype.

         Example:
@@ -247,6 +261,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

         """
+        self.dtype = dtype

         # store user-supplied parameters
         self.id2word = id2word
@@ -330,10 +345,14 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                 raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

         # Initialize the variational distribution q(beta|lambda)
-        self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
-        self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
+        self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype)
+        self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
         self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))
+        # Check that we haven't accidentally fallen back to np.float64
+        assert self.eta.dtype == self.dtype
+        assert self.expElogbeta.dtype == self.dtype
+
         # if a training corpus was provided, start estimating the model right away
         if corpus is not None:
             use_numpy = self.dispatcher is not None
@@ -354,25 +373,25 @@ def init_dir_prior(self, prior, name):

         if isinstance(prior, six.string_types):
             if prior == 'symmetric':
-                logger.info("using symmetric %s at %s", name, 1.0 / prior_shape)
-                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
+                logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
+                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
             elif prior == 'asymmetric':
-                init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)])
+                init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)], dtype=self.dtype)
                 init_prior /= init_prior.sum()
                 logger.info("using asymmetric %s %s", name, list(init_prior))
             elif prior == 'auto':
                 is_auto = True
-                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
+                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
                 if name == 'alpha':
                     logger.info("using autotuned %s, starting with %s", name, list(init_prior))
             else:
                 raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
         elif isinstance(prior, list):
-            init_prior = np.asarray(prior)
+            init_prior = np.asarray(prior, dtype=self.dtype)
         elif isinstance(prior, np.ndarray):
-            init_prior = prior
+            init_prior = prior.astype(self.dtype, copy=False)
         elif isinstance(prior, np.number) or isinstance(prior, numbers.Real):
-            init_prior = np.asarray([prior] * prior_shape)
+            init_prior = np.asarray([prior] * prior_shape, dtype=self.dtype)
         else:
             raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)
@@ -385,6 +404,7 @@ def __str__(self):

     def sync_state(self):
         self.expElogbeta = np.exp(self.state.get_Elogbeta())
+        assert self.expElogbeta.dtype == self.dtype

     def clear(self):
         """Clear model state (free up some memory). Used in the distributed algo."""
@@ -418,11 +438,15 @@ def inference(self, chunk, collect_sstats=False):
             logger.debug("performing inference on a chunk of %i documents", len(chunk))

         # Initialize the variational distribution q(theta|gamma) for the chunk
-        gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
+        gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)).astype(self.dtype, copy=False)
         Elogtheta = dirichlet_expectation(gamma)
         expElogtheta = np.exp(Elogtheta)
+
+        assert Elogtheta.dtype == self.dtype
+        assert expElogtheta.dtype == self.dtype
+
         if collect_sstats:
-            sstats = np.zeros_like(self.expElogbeta)
+            sstats = np.zeros_like(self.expElogbeta, dtype=self.dtype)
         else:
             sstats = None
         converged = 0
@@ -437,7 +461,7 @@ def inference(self, chunk, collect_sstats=False):
                 ids = [int(idx) for idx, _ in doc]
             else:
                 ids = [idx for idx, _ in doc]
-            cts = np.array([cnt for _, cnt in doc])
+            cts = np.array([cnt for _, cnt in doc], dtype=self.dtype)
             gammad = gamma[d, :]
             Elogthetad = Elogtheta[d, :]
             expElogthetad = expElogtheta[d, :]
@@ -464,6 +488,7 @@ def inference(self, chunk, collect_sstats=False):
                     converged += 1
                     break
             gamma[d, :] = gammad
+            assert gammad.dtype == self.dtype
             if collect_sstats:
                 # Contribution of document d to the expected sufficient
                 # statistics for the M step.
@@ -478,6 +503,9 @@ def inference(self, chunk, collect_sstats=False):
             # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
             # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
             sstats *= self.expElogbeta
+            assert sstats.dtype == self.dtype
+
+        assert gamma.dtype == self.dtype
         return gamma, sstats

     def do_estep(self, chunk, state=None):
@@ -491,6 +519,7 @@ def do_estep(self, chunk, state=None):
         gamma, sstats = self.inference(chunk, collect_sstats=True)
         state.sstats += sstats
         state.numdocs += gamma.shape[0]  # avoids calling len(chunk) on a generator
+        assert gamma.dtype == self.dtype
         return gamma

     def update_alpha(self, gammat, rho):
@@ -500,10 +529,12 @@ def update_alpha(self, gammat, rho):
         """
         N = float(len(gammat))
         logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
+        assert logphat.dtype == self.dtype

         self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
         logger.info("optimized alpha %s", list(self.alpha))

+        assert self.alpha.dtype == self.dtype
         return self.alpha

     def update_eta(self, lambdat, rho):
@@ -513,9 +544,11 @@ def update_eta(self, lambdat, rho):
         """
         N = float(lambdat.shape[0])
         logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))
+        assert logphat.dtype == self.dtype

         self.eta = update_dir_prior(self.eta, N, logphat, rho)

+        assert self.eta.dtype == self.dtype
         return self.eta

     def log_perplexity(self, chunk, total_docs=None):
@@ -647,7 +680,7 @@ def rho():
                 logger.info('initializing %s workers', self.numworkers)
                 self.dispatcher.reset(self.state)
             else:
-                other = LdaState(self.eta, self.state.sstats.shape)
+                other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
             dirty = False

         reallen = 0
@@ -691,7 +724,7 @@ def rho():
                         logger.info('initializing workers')
                         self.dispatcher.reset(self.state)
                     else:
-                        other = LdaState(self.eta, self.state.sstats.shape)
+                        other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
                     dirty = False
         # endfor single corpus iteration
@@ -772,6 +805,9 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
                 gammad = gamma[d]
             Elogthetad = dirichlet_expectation(gammad)

+            assert gammad.dtype == self.dtype
+            assert Elogthetad.dtype == self.dtype
+
             # E[log p(doc | theta, beta)]
             score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
@@ -820,6 +856,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

             # add a little random jitter, to randomize results around the same alpha
             sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
+            # random_state.rand returns float64, but converting back to dtype won't speed up anything

             sorted_topics = list(matutils.argsort(sort_alpha))
             chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
@@ -856,7 +893,7 @@ def show_topic(self, topicid, topn=10):
     def get_topics(self):
         """
         Returns:
-            np.ndarray: `num_topics` x `vocabulary_size` array of floats which represents
+            np.ndarray: `num_topics` x `vocabulary_size` array of floats (self.dtype) which represents
                 the term topic matrix learned during inference.
         """
         topics = self.state.get_lambda()
@@ -1028,6 +1065,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100,
         >>> print(mdiff)  # get matrix with difference for each topic pair from `m1` and `m2`
         >>> print(annotation)  # get array with positive/negative words for each topic pair from `m1` and `m2`

+        Note: this ignores differences in model dtypes
         """

         distances = {
@@ -1186,9 +1224,14 @@ def load(cls, fname, *args, **kwargs):
             result.random_state = utils.get_random_state(None)  # using default value `get_random_state(None)`
             logging.warning("random_state not set so using default value")

+        # dtype could be absent in old models
+        if not hasattr(result, 'dtype'):
+            result.dtype = np.float64  # float64 was implicitly used before (because it's the default in numpy)
+            logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)
+
         state_fname = utils.smart_extension(fname, '.state')
         try:
-            result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
+            result.state = LdaState.load(state_fname, *args, **kwargs)
         except Exception as e:
             logging.warning("failed to load state from %s: %s", state_fname, e)
diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py
index 0c22c64f7c..186029c971 100644
--- a/gensim/models/ldamulticore.py
+++ b/gensim/models/ldamulticore.py
@@ -49,6 +49,8 @@

 import logging

+import numpy as np
+
 from gensim import utils
 from gensim.models.ldamodel import LdaModel, LdaState
@@ -82,7 +84,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
                  chunksize=2000, passes=1, batch=False, alpha='symmetric',
                  eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
                  gamma_threshold=0.001, random_state=None, minimum_probability=0.01,
-                 minimum_phi_value=0.01, per_word_topics=False):
+                 minimum_phi_value=0.01, per_word_topics=False, dtype=np.float32):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
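The hunks above thread `dtype` from the `LdaModel` constructor down through `LdaState`, `init_dir_prior()` and `inference()`, and `LdaMulticore` now accepts the same argument. A minimal sketch of the resulting behaviour (the toy two-document corpus and the exact assertions are illustrative, not part of the patch):

```python
import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Toy corpus, just to exercise the new `dtype` parameter.
texts = [["human", "computer", "interface"], ["graph", "trees", "minors"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# np.float32 is now the default; internal arrays should follow it.
lda32 = LdaModel(corpus, id2word=dictionary, num_topics=2)
assert lda32.dtype == np.float32
assert lda32.alpha.dtype == np.float32         # cast in init_dir_prior()
assert lda32.state.sstats.dtype == np.float32  # allocated by LdaState with dtype
assert lda32.expElogbeta.dtype == np.float32   # preserved by dirichlet_expectation()

# Passing dtype=np.float64 restores the old implicit behaviour.
lda64 = LdaModel(corpus, id2word=dictionary, num_topics=2, dtype=np.float64)
assert lda64.get_topics().dtype == np.float64
```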
@@ -148,7 +150,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
             id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
             decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
             gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability,
-            minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics
+            minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype
         )

     def update(self, corpus, chunks_as_numpy=False):
diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 26709c04d6..8173d0d292 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -130,7 +130,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
         if initialize == 'gensim':
             lda_model = ldamodel.LdaModel(
                 corpus, id2word=self.id2word, num_topics=self.num_topics,
-                passes=passes, alpha=self.alphas, random_state=random_state
+                passes=passes, alpha=self.alphas, random_state=random_state,
+                dtype=np.float64
             )
             self.sstats = np.transpose(lda_model.state.sstats)
         if initialize == 'ldamodel':
@@ -244,7 +245,7 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
         vocab_len = self.vocab_len
         bound = 0.0

-        lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word)
+        lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
         lda.topics = np.array(np.split(np.zeros(vocab_len * num_topics), vocab_len))
         ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda)
@@ -419,7 +420,7 @@ def __getitem__(self, doc):
         """
         Similar to the LdaModel __getitem__ function, it returns topic proportions of a document passed.
         """
-        lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word)
+        lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
         lda_model.topics = np.array(np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len))
         ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc)
diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py
index 19c93e5f6c..e58c85f17a 100644
--- a/gensim/models/wrappers/ldamallet.py
+++ b/gensim/models/wrappers/ldamallet.py
@@ -373,7 +373,8 @@ def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
     model_gensim = LdaModel(
         id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
         alpha=mallet_model.alpha, iterations=iterations,
-        gamma_threshold=gamma_threshold
+        gamma_threshold=gamma_threshold,
+        dtype=numpy.float64  # don't lose precision when converting from MALLET
     )
     model_gensim.expElogbeta[:] = mallet_model.wordtopics
     return model_gensim
diff --git a/gensim/models/wrappers/ldavowpalwabbit.py b/gensim/models/wrappers/ldavowpalwabbit.py
index ede5074b99..3c7b747e61 100644
--- a/gensim/models/wrappers/ldavowpalwabbit.py
+++ b/gensim/models/wrappers/ldavowpalwabbit.py
@@ -586,7 +586,8 @@ def vwmodel2ldamodel(vw_model, iterations=50):
     model_gensim = LdaModel(
         num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize,
         passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay,
-        offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold
+        offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold,
+        dtype=numpy.float32
     )
     model_gensim.expElogbeta[:] = vw_model._get_topics()
     return model_gensim
diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py
index 40d7c52db0..76620cb498 100644
--- a/gensim/sklearn_api/ldamodel.py
+++ b/gensim/sklearn_api/ldamodel.py
@@ -26,7 +26,7 @@ class LdaTransformer(TransformerMixin, BaseEstimator):
     def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, update_every=1,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
                  iterations=50, gamma_threshold=0.001,
-                 minimum_probability=0.01, random_state=None, scorer='perplexity'):
+                 minimum_probability=0.01, random_state=None, scorer='perplexity', dtype=np.float32):
         """
         Sklearn wrapper for LDA model. See gensim.model.LdaModel for parameter details.
@@ -50,6 +50,7 @@ def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, updat self.minimum_probability = minimum_probability self.random_state = random_state self.scorer = scorer + self.dtype = dtype def fit(self, X, y=None): """ @@ -67,7 +68,7 @@ def fit(self, X, y=None): alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset, eval_every=self.eval_every, iterations=self.iterations, gamma_threshold=self.gamma_threshold, minimum_probability=self.minimum_probability, - random_state=self.random_state + random_state=self.random_state, dtype=self.dtype ) return self @@ -109,7 +110,8 @@ def partial_fit(self, X): chunksize=self.chunksize, passes=self.passes, update_every=self.update_every, alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset, eval_every=self.eval_every, iterations=self.iterations, gamma_threshold=self.gamma_threshold, - minimum_probability=self.minimum_probability, random_state=self.random_state + minimum_probability=self.minimum_probability, random_state=self.random_state, + dtype=self.dtype ) self.gensim_model.update(corpus=X) diff --git a/gensim/test/basetmtests.py b/gensim/test/basetmtests.py index e8cb1d259d..78587c1e56 100644 --- a/gensim/test/basetmtests.py +++ b/gensim/test/basetmtests.py @@ -48,6 +48,7 @@ def test_get_topics(self): vocab_size = len(self.model.id2word) for topic in topics: self.assertTrue(isinstance(topic, np.ndarray)) - self.assertEqual(topic.dtype, np.float64) + # Note: started moving to np.float32 as default + # self.assertEqual(topic.dtype, np.float64) self.assertEqual(vocab_size, topic.shape[0]) self.assertAlmostEqual(np.sum(topic), 1.0, 5) diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index 5947280f59..f1d68ae7a7 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -541,6 +541,20 @@ def testLargeMmapCompressed(self): # test loading the large model arrays with mmap self.assertRaises(IOError, self.class_.load, fname, mmap='r') + def testDtypeBackwardCompatibility(self): + atmodel_3_0_1_fname = datapath('atmodel_3_0_1_model') + expected_topics = [(0, 0.068200842977296727), (1, 0.93179915702270333)] + + # save model to use in test + # self.model.save(atmodel_3_0_1_fname) + + # load a model saved using a 3.0.1 version of Gensim + model = self.class_.load(atmodel_3_0_1_fname) + + # and test it on a predefined document + topics = model['jane'] + self.assertTrue(np.allclose(expected_topics, topics)) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) diff --git a/gensim/test/test_data/DTM/ldaseq_3_0_1_model b/gensim/test/test_data/DTM/ldaseq_3_0_1_model new file mode 100644 index 0000000000..c34eac22f5 Binary files /dev/null and b/gensim/test/test_data/DTM/ldaseq_3_0_1_model differ diff --git a/gensim/test/test_data/atmodel_3_0_1_model b/gensim/test/test_data/atmodel_3_0_1_model new file mode 100644 index 0000000000..e96b4fb271 Binary files /dev/null and b/gensim/test/test_data/atmodel_3_0_1_model differ diff --git a/gensim/test/test_data/atmodel_3_0_1_model.expElogbeta.npy b/gensim/test/test_data/atmodel_3_0_1_model.expElogbeta.npy new file mode 100644 index 0000000000..4e3098881b Binary files /dev/null and b/gensim/test/test_data/atmodel_3_0_1_model.expElogbeta.npy differ diff --git a/gensim/test/test_data/atmodel_3_0_1_model.id2word b/gensim/test/test_data/atmodel_3_0_1_model.id2word new file mode 100644 index 0000000000..7b3193139b Binary files /dev/null and 
b/gensim/test/test_data/atmodel_3_0_1_model.id2word differ diff --git a/gensim/test/test_data/atmodel_3_0_1_model.state b/gensim/test/test_data/atmodel_3_0_1_model.state new file mode 100644 index 0000000000..d75a70aee0 Binary files /dev/null and b/gensim/test/test_data/atmodel_3_0_1_model.state differ diff --git a/gensim/test/test_data/lda_3_0_1_model b/gensim/test/test_data/lda_3_0_1_model new file mode 100644 index 0000000000..059dfe0d5b Binary files /dev/null and b/gensim/test/test_data/lda_3_0_1_model differ diff --git a/gensim/test/test_data/lda_3_0_1_model.expElogbeta.npy b/gensim/test/test_data/lda_3_0_1_model.expElogbeta.npy new file mode 100644 index 0000000000..b81cca4f6b Binary files /dev/null and b/gensim/test/test_data/lda_3_0_1_model.expElogbeta.npy differ diff --git a/gensim/test/test_data/lda_3_0_1_model.id2word b/gensim/test/test_data/lda_3_0_1_model.id2word new file mode 100644 index 0000000000..7b3193139b Binary files /dev/null and b/gensim/test/test_data/lda_3_0_1_model.id2word differ diff --git a/gensim/test/test_data/lda_3_0_1_model.state b/gensim/test/test_data/lda_3_0_1_model.state new file mode 100644 index 0000000000..2ca675618d Binary files /dev/null and b/gensim/test/test_data/lda_3_0_1_model.state differ diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index df9e0c1c72..98c4a38b74 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -15,6 +15,7 @@ import six import numpy as np +from numpy.testing import assert_allclose from gensim.corpora import mmcorpus, Dictionary from gensim.models import ldamodel, ldamulticore @@ -81,32 +82,32 @@ def testAlpha(self): kwargs['alpha'] = 'symmetric' model = self.class_(**kwargs) self.assertEqual(model.alpha.shape, expected_shape) - self.assertTrue(all(model.alpha == np.array([0.5, 0.5]))) + assert_allclose(model.alpha, np.array([0.5, 0.5])) kwargs['alpha'] = 'asymmetric' model = self.class_(**kwargs) self.assertEqual(model.alpha.shape, expected_shape) - self.assertTrue(np.allclose(model.alpha, [0.630602, 0.369398])) + assert_allclose(model.alpha, [0.630602, 0.369398], rtol=1e-5) kwargs['alpha'] = 0.3 model = self.class_(**kwargs) self.assertEqual(model.alpha.shape, expected_shape) - self.assertTrue(all(model.alpha == np.array([0.3, 0.3]))) + assert_allclose(model.alpha, np.array([0.3, 0.3])) kwargs['alpha'] = 3 model = self.class_(**kwargs) self.assertEqual(model.alpha.shape, expected_shape) - self.assertTrue(all(model.alpha == np.array([3, 3]))) + assert_allclose(model.alpha, np.array([3, 3])) kwargs['alpha'] = [0.3, 0.3] model = self.class_(**kwargs) self.assertEqual(model.alpha.shape, expected_shape) - self.assertTrue(all(model.alpha == np.array([0.3, 0.3]))) + assert_allclose(model.alpha, np.array([0.3, 0.3])) kwargs['alpha'] = np.array([0.3, 0.3]) model = self.class_(**kwargs) self.assertEqual(model.alpha.shape, expected_shape) - self.assertTrue(all(model.alpha == np.array([0.3, 0.3]))) + assert_allclose(model.alpha, np.array([0.3, 0.3])) # all should raise an exception for being wrong shape kwargs['alpha'] = [0.3, 0.3, 0.3] @@ -126,7 +127,7 @@ def testEtaAuto(self): modelauto = self.class_(corpus, id2word=dictionary, eta='auto', passes=10) # did we learn something? 
- self.assertFalse(all(np.equal(model1.eta, modelauto.eta))) + self.assertFalse(np.allclose(model1.eta, modelauto.eta)) def testEta(self): kwargs = dict( @@ -140,32 +141,32 @@ def testEta(self): # should not raise anything model = self.class_(**kwargs) self.assertEqual(model.eta.shape, expected_shape) - self.assertTrue(all(model.eta == np.array([0.5] * num_terms))) + assert_allclose(model.eta, np.array([0.5] * num_terms)) kwargs['eta'] = 'symmetric' model = self.class_(**kwargs) self.assertEqual(model.eta.shape, expected_shape) - self.assertTrue(all(model.eta == np.array([0.5] * num_terms))) + assert_allclose(model.eta, np.array([0.5] * num_terms)) kwargs['eta'] = 0.3 model = self.class_(**kwargs) self.assertEqual(model.eta.shape, expected_shape) - self.assertTrue(all(model.eta == np.array([0.3] * num_terms))) + assert_allclose(model.eta, np.array([0.3] * num_terms)) kwargs['eta'] = 3 model = self.class_(**kwargs) self.assertEqual(model.eta.shape, expected_shape) - self.assertTrue(all(model.eta == np.array([3] * num_terms))) + assert_allclose(model.eta, np.array([3] * num_terms)) kwargs['eta'] = [0.3] * num_terms model = self.class_(**kwargs) self.assertEqual(model.eta.shape, expected_shape) - self.assertTrue(all(model.eta == np.array([0.3] * num_terms))) + assert_allclose(model.eta, np.array([0.3] * num_terms)) kwargs['eta'] = np.array([0.3] * num_terms) model = self.class_(**kwargs) self.assertEqual(model.eta.shape, expected_shape) - self.assertTrue(all(model.eta == np.array([0.3] * num_terms))) + assert_allclose(model.eta, np.array([0.3] * num_terms)) # should be ok with num_topics x num_terms testeta = np.array([[0.5] * len(dictionary)] * 2) @@ -197,14 +198,14 @@ def testTopTopics(self): for v, k in topic: self.assertTrue(isinstance(k, six.string_types)) - self.assertTrue(isinstance(v, float)) + self.assertTrue(np.issubdtype(v, float)) def testGetTopicTerms(self): topic_terms = self.model.get_topic_terms(1) for k, v in topic_terms: self.assertTrue(isinstance(k, numbers.Integral)) - self.assertTrue(isinstance(v, float)) + self.assertTrue(np.issubdtype(v, float)) def testGetDocumentTopics(self): @@ -218,7 +219,7 @@ def testGetDocumentTopics(self): self.assertTrue(isinstance(topic, list)) for k, v in topic: self.assertTrue(isinstance(k, int)) - self.assertTrue(isinstance(v, float)) + self.assertTrue(np.issubdtype(v, float)) # Test case to use the get_document_topic function for the corpus all_topics = model.get_document_topics(self.corpus, per_word_topics=True) @@ -229,7 +230,7 @@ def testGetDocumentTopics(self): self.assertTrue(isinstance(topic, tuple)) for k, v in topic[0]: # list of doc_topics self.assertTrue(isinstance(k, int)) - self.assertTrue(isinstance(v, float)) + self.assertTrue(np.issubdtype(v, float)) for w, topic_list in topic[1]: # list of word_topics self.assertTrue(isinstance(w, int)) @@ -253,7 +254,7 @@ def testGetDocumentTopics(self): self.assertTrue(isinstance(topic, tuple)) for k, v in topic[0]: # list of doc_topics self.assertTrue(isinstance(k, int)) - self.assertTrue(isinstance(v, float)) + self.assertTrue(np.issubdtype(v, float)) if len(topic[0]) != 0: doc_topic_count_na += 1 @@ -274,7 +275,7 @@ def testGetDocumentTopics(self): for k, v in doc_topics: self.assertTrue(isinstance(k, int)) - self.assertTrue(isinstance(v, float)) + self.assertTrue(np.issubdtype(v, float)) for w, topic_list in word_topics: self.assertTrue(isinstance(w, int)) @@ -302,7 +303,7 @@ def testTermTopics(self): result = model.get_term_topics(2) for topic_no, probability in result: 
self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(probability, float)) + self.assertTrue(np.issubdtype(probability, float)) # checks if topic '1' is in the result list # FIXME: Fails on osx and win @@ -312,7 +313,7 @@ def testTermTopics(self): result = model.get_term_topics(str(model.id2word[2])) for topic_no, probability in result: self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(probability, float)) + self.assertTrue(np.issubdtype(probability, float)) # checks if topic '1' is in the result list # FIXME: Fails on osx and win @@ -472,6 +473,21 @@ def testRandomStateBackwardCompatibility(self): self.assertTrue(isinstance(i[0], int)) self.assertTrue(isinstance(i[1], six.string_types)) + def testDtypeBackwardCompatibility(self): + lda_3_0_1_fname = datapath('lda_3_0_1_model') + test_doc = [(0, 1), (1, 1), (2, 1)] + expected_topics = [(0, 0.87005886977475178), (1, 0.12994113022524822)] + + # save model to use in test + # self.model.save(lda_3_0_1_fname) + + # load a model saved using a 3.0.1 version of Gensim + model = self.class_.load(lda_3_0_1_fname) + + # and test it on a predefined document + topics = model[test_doc] + self.assertTrue(np.allclose(expected_topics, topics)) + # endclass TestLdaModel diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py index eac238dcdc..eb6ea120f6 100644 --- a/gensim/test/test_ldaseqmodel.py +++ b/gensim/test/test_ldaseqmodel.py @@ -221,6 +221,21 @@ def testDocTopic(self): expected_doc_topic = 0.00066577896138482028 self.assertAlmostEqual(doc_topic[0], expected_doc_topic, places=2) + def testDtypeBackwardCompatibility(self): + ldaseq_3_0_1_fname = datapath('DTM/ldaseq_3_0_1_model') + test_doc = [(547, 1), (549, 1), (552, 1), (555, 1)] + expected_topics = [0.99751244, 0.00248756] + + # save model to use in test + # self.ldaseq.save(ldaseq_3_0_1_fname) + + # load a model saved using a 3.0.1 version of Gensim + model = ldaseqmodel.LdaSeqModel.load(ldaseq_3_0_1_fname) + + # and test it on a predefined document + topics = model[test_doc] + self.assertTrue(np.allclose(expected_topics, topics)) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
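The switch from `isinstance(v, float)` to `np.issubdtype` in the tests is needed because `np.float32`, the new default dtype, is not a subclass of Python's `float`, while `np.float64` is. A small illustration; note that recent NumPy releases interpret `np.issubdtype(..., float)` strictly as `np.float64`, so the more robust `np.floating` spelling is used here:

```python
import numpy as np

v32, v64 = np.float32(0.25), np.float64(0.25)

print(isinstance(v64, float))  # True  (np.float64 subclasses Python float)
print(isinstance(v32, float))  # False (the old assertions would fail on float32 models)

# A dtype-aware check accepts both scalar types:
print(np.issubdtype(type(v32), np.floating))  # True
print(np.issubdtype(type(v64), np.floating))  # True
```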