Adding dtype to LDAModel to speed it up #1656
Changes from 9 commits
@@ -87,10 +87,11 @@ class LdaState(utils.SaveLoad):

"""
def __init__(self, eta, shape):
self.eta = eta
self.sstats = np.zeros(shape)
def __init__(self, eta, shape, dtype=np.float32):
self.eta = eta.astype(dtype, copy=False)
self.sstats = np.zeros(shape, dtype)
Review comment: Using positional arguments can lead to subtle bugs with numpy. Better use explicit names for keyword parameters.

Reply: fixed
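A minimal sketch of the reviewer's point, using only standard numpy signatures; the variable names and values are illustrative, not taken from the PR:

```python
import numpy as np

shape = (2, 3)

# Positional form: happens to work for np.zeros, whose second parameter is dtype.
sstats = np.zeros(shape, np.float32)
assert sstats.dtype == np.float32

# The same positional habit is fragile elsewhere: np.full's second positional
# parameter is fill_value, not dtype, so a positionally passed dtype would land
# in the wrong slot. The keyword form is unambiguous everywhere:
sstats = np.zeros(shape, dtype=np.float32)
eta = np.full(shape[1], 1.0 / 50, dtype=np.float32)
```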
self.numdocs = 0
self.dtype = dtype

def reset(self):
"""
@@ -165,6 +166,18 @@ def get_lambda(self):

def get_Elogbeta(self):
return dirichlet_expectation(self.get_lambda())

@classmethod
def load(cls, fname, *args, **kwargs):
result = super(LdaState, cls).load(fname, *args, **kwargs)

# Check if `dtype` is set after main pickle load
# if not, then it's an old model and we should set it to default `np.float64`
if not hasattr(result, 'dtype'):
result.dtype = np.float64  # float64 was used before as default in numpy
Review comment: Old LDA used float64, really?

Reply: Pretty much everything is using np.float64, the numpy default.
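For context (not part of the diff): numpy's default floating dtype is float64, so the arrays an old LdaState pickled without an explicit dtype were float64:

```python
import numpy as np

print(np.zeros((2, 2)).dtype)                                          # float64
print(np.asarray([1.0 / 100]).dtype)                                   # float64
print(np.random.RandomState(0).gamma(100., 1. / 100., (2, 2)).dtype)   # float64
```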
logging.warning("dtype was not set in LdaState, so using np.float64")

return result
# endclass LdaState
@@ -191,7 +204,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
random_state=None, ns_conf=None, minimum_phi_value=0.01,
per_word_topics=False, callbacks=None):
per_word_topics=False, callbacks=None, dtype=np.float32):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
@@ -233,9 +246,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

`minimum_probability` controls filtering the topics returned for a document (bow).

`random_state` can be a np.random.RandomState object or the seed for one
`random_state` can be a np.random.RandomState object or the seed for one.

`callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training
`callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training.

`dtype` is data-type to use during calculations inside model. All inputs are also converted to this dtype.

Example:
@@ -247,6 +262,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
>>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

"""
self.dtype = dtype

# store user-supplied parameters
self.id2word = id2word
@@ -330,10 +346,13 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

# Initialize the variational distribution q(beta|lambda)
self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype)
self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

assert self.eta.dtype == self.dtype
assert self.expElogbeta.dtype == self.dtype

# if a training corpus was provided, start estimating the model right away
if corpus is not None:
use_numpy = self.dispatcher is not None
@@ -354,25 +373,25 @@ def init_dir_prior(self, prior, name):

if isinstance(prior, six.string_types):
if prior == 'symmetric':
logger.info("using symmetric %s at %s", name, 1.0 / prior_shape)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
elif prior == 'asymmetric':
init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)])
init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)], dtype=self.dtype)
init_prior /= init_prior.sum()
logger.info("using asymmetric %s %s", name, list(init_prior))
elif prior == 'auto':
is_auto = True
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
if name == 'alpha':
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
else:
raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
elif isinstance(prior, list):
init_prior = np.asarray(prior)
init_prior = np.asarray(prior, dtype=self.dtype)
elif isinstance(prior, np.ndarray):
init_prior = prior
init_prior = prior.astype(self.dtype, copy=False)
elif isinstance(prior, np.number) or isinstance(prior, numbers.Real):
init_prior = np.asarray([prior] * prior_shape)
init_prior = np.asarray([prior] * prior_shape, dtype=self.dtype)
else:
raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)
@@ -385,6 +404,7 @@ def __str__(self):

def sync_state(self):
self.expElogbeta = np.exp(self.state.get_Elogbeta())
assert self.expElogbeta.dtype == self.dtype

def clear(self):
"""Clear model state (free up some memory). Used in the distributed algo."""
@@ -418,11 +438,13 @@ def inference(self, chunk, collect_sstats=False):
logger.debug("performing inference on a chunk of %i documents", len(chunk))

# Initialize the variational distribution q(theta|gamma) for the chunk
gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)).astype(self.dtype, copy=False)
Elogtheta = dirichlet_expectation(gamma)
expElogtheta = np.exp(Elogtheta)
assert expElogtheta.dtype == self.dtype

if collect_sstats:
sstats = np.zeros_like(self.expElogbeta)
sstats = np.zeros_like(self.expElogbeta, dtype=self.dtype)
else:
sstats = None
converged = 0
@@ -478,6 +500,9 @@ def inference(self, chunk, collect_sstats=False):
# sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
# = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
sstats *= self.expElogbeta
assert sstats.dtype == self.dtype

assert gamma.dtype == self.dtype
return gamma, sstats

def do_estep(self, chunk, state=None):
@@ -500,10 +525,13 @@ def update_alpha(self, gammat, rho):
"""
N = float(len(gammat))
logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
assert logphat.dtype == self.dtype

self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
logger.info("optimized alpha %s", list(self.alpha))

assert self.alpha.dtype == self.dtype

return self.alpha

def update_eta(self, lambdat, rho):
@@ -513,9 +541,12 @@ def update_eta(self, lambdat, rho):
"""
N = float(lambdat.shape[0])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))
assert logphat.dtype == self.dtype

self.eta = update_dir_prior(self.eta, N, logphat, rho)

assert self.eta.dtype == self.dtype

return self.eta

def log_perplexity(self, chunk, total_docs=None):
@@ -647,7 +678,7 @@ def rho():
logger.info('initializing %s workers', self.numworkers)
self.dispatcher.reset(self.state)
else:
other = LdaState(self.eta, self.state.sstats.shape)
other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False

reallen = 0
@@ -671,6 +702,7 @@ def rho():
pass_, chunk_no * chunksize + len(chunk), lencorpus
)
gammat = self.do_estep(chunk, other)
assert gammat.dtype == self.dtype

if self.optimize_alpha:
self.update_alpha(gammat, rho())
@@ -691,7 +723,7 @@ def rho():
logger.info('initializing workers')
self.dispatcher.reset(self.state)
else:
other = LdaState(self.eta, self.state.sstats.shape)
other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False
# endfor single corpus iteration
@@ -723,6 +755,7 @@ def do_mstep(self, rho, other, extra_pass=False):

"""
logger.debug("updating topics")
assert other.dtype == self.dtype
# update self with the new blend; also keep track of how much did
# the topics change through this update, to assess convergence
diff = np.log(self.expElogbeta)
@@ -772,6 +805,9 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
gammad = gamma[d]
Elogthetad = dirichlet_expectation(gammad)

assert gammad.dtype == self.dtype
assert Elogthetad.dtype == self.dtype

# E[log p(doc | theta, beta)]
score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
@@ -793,6 +829,8 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
else:
sum_eta = np.sum(self.eta)

assert sum_eta.dtype == self.dtype

score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

return score
@@ -820,6 +858,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

# add a little random jitter, to randomize results around the same alpha
sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
# random_state.rand returns float64, but converting back to dtype won't speed up anything
Review comment: Maybe cast it back to `self.dtype` anyway?

Reply: Consistency vs one additional array copy. I'm not sure :)
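A small illustration of the trade-off being discussed; the values are hypothetical, and since `rand` has no dtype argument, keeping `self.dtype` here costs one extra copy:

```python
import numpy as np

dtype = np.float32
alpha = np.full(10, 0.1, dtype=dtype)
rs = np.random.RandomState(0)

jitter = rs.rand(len(alpha))            # rand always returns float64
sort_alpha = alpha + 0.0001 * jitter    # float32 + float64 -> promoted to float64

# Casting back restores dtype consistency at the price of one more array copy:
sort_alpha = sort_alpha.astype(dtype)
```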
sorted_topics = list(matutils.argsort(sort_alpha))
chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
@@ -856,11 +895,13 @@ def show_topic(self, topicid, topn=10):
def get_topics(self):
"""
Returns:
np.ndarray: `num_topics` x `vocabulary_size` array of floats which represents
np.ndarray: `num_topics` x `vocabulary_size` array of floats (self.dtype) which represents
the term topic matrix learned during inference.
"""
topics = self.state.get_lambda()
return topics / topics.sum(axis=1)[:, None]
tmp = topics / topics.sum(axis=1)[:, None]
assert tmp.dtype == self.dtype
return tmp

def get_topic_terms(self, topicid, topn=10):
"""
@@ -1028,6 +1069,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100,
>>> print(mdiff)  # get matrix with difference for each topic pair from `m1` and `m2`
>>> print(annotation)  # get array with positive/negative words for each topic pair from `m1` and `m2`

Note: this ignores difference in model dtypes
"""

distances = {
@@ -1186,9 +1228,14 @@ def load(cls, fname, *args, **kwargs):
result.random_state = utils.get_random_state(None)  # using default value `get_random_state(None)`
logging.warning("random_state not set so using default value")

# the same goes for dtype (except it was added later)
if not hasattr(result, 'dtype'):
result.dtype = np.float64  # float64 was used before as default in numpy
logging.warning("dtype was not set, so using np.float64")
Review comment: A more concrete message please. When reading this warning, users will be left scratching their heads: set where? Why? What does this mean to me? How about … Question: isn't it better to infer the dtype from the loaded object? Can it ever happen that it's something else, not np.float64?

Reply: Fixed the message; decided the info level suits better. About inferring: it's not clear how to do it — infer from … Anyway, let's imagine a situation where some of …
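A hedged sketch of the alternative the reviewer raises — inferring the dtype from an array the old pickle already carries instead of assuming np.float64. The helper name is hypothetical, this is not the code merged in the PR, and it only works once the state (with its sstats) has actually been loaded:

```python
import numpy as np

def infer_legacy_dtype(result):
    # `result` is assumed to be a freshly unpickled LdaModel without a `dtype`
    # attribute; fall back to float64 (the old numpy default) if no array is
    # available to infer from.
    if not hasattr(result, 'dtype'):
        state = getattr(result, 'state', None)
        sstats = getattr(state, 'sstats', None)
        result.dtype = sstats.dtype if sstats is not None else np.float64
    return result
```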
state_fname = utils.smart_extension(fname, '.state')
try:
result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
result.state = LdaState.load(state_fname, *args, **kwargs)
except Exception as e:
logging.warning("failed to load state from %s: %s", state_fname, e)
@@ -48,6 +48,7 @@ def test_get_topics(self):
vocab_size = len(self.model.id2word)
for topic in topics:
self.assertTrue(isinstance(topic, np.ndarray))
self.assertEqual(topic.dtype, np.float64)
# Note: started moving to np.float32 as default
# self.assertEqual(topic.dtype, np.float64)
Review comment: need to enable + switch to np.float32

Reply: This will break other topic models then
self.assertEqual(vocab_size, topic.shape[0])
self.assertAlmostEqual(np.sum(topic), 1.0, 5)
Review comment: Please return the astype, because np.float32 -> np.float32 and np.float64 -> np.float64, but np.float16 -> np.float32.

Reply: Oh, my bad, you're right! Then the tests that I added in a separate file aren't needed.
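A quick reproduction of the promotion described above, assuming gensim's `dirichlet_expectation` is built on `scipy.special.psi`, which has float32/float64 loops but no float16 loop — so a float16 input comes back as float32 unless it is explicitly cast back:

```python
import numpy as np
from scipy.special import psi  # digamma, used by gensim's dirichlet_expectation

alpha = np.random.gamma(100., 1. / 100., (2, 3)).astype(np.float16)
Elog = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]

print(Elog.dtype)                        # float32: the float16 input was upcast by psi
print(Elog.astype(alpha.dtype).dtype)    # float16 again after an explicit astype
```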