Skip to content

Commit

Permalink
Numpy.random.RandomState instead of numpy.random in LDA. Fixes #113. (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
droudy authored and tmylk committed Jun 25, 2016
1 parent 225fa67 commit 2e0ed26
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Changes
* topics, topn parameters changed to num_topics and num_words in show_topics() and print_topics() (@droudy, #747)
- In hdpmodel and dtmmodel
- NOT BACKWARDS COMPATIBLE!
* Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113)

0.13.1, 2016-06-22

Expand Down Expand Up @@ -37,6 +38,7 @@ Changes
* Mallet wrapper sparse format support (@RishabGoel, #664)
* Doc2vec pre-processing script translated from bash to Python (@andrewjlm, #720)


0.12.4, 2016-01-29

* Better internal handling of job batching in word2vec (#535)
Expand Down
25 changes: 21 additions & 4 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,19 @@ def update_dir_prior(prior, N, logphat, rho):

return prior

def get_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Accepted forms of `seed`:
      * None or the `numpy.random` module -> the module-global RandomState,
      * an integer (Python or numpy) -> a fresh RandomState seeded with it,
      * an existing RandomState -> returned unchanged.
    Anything else raises ValueError.

    Method originally from maciejkula/glove-python, and written by @joshloyal
    """
    if seed is None or seed is numpy.random:
        return numpy.random.mtrand._rand
    if isinstance(seed, numpy.random.RandomState):
        return seed
    if isinstance(seed, (numbers.Integral, numpy.integer)):
        return numpy.random.RandomState(seed)
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState instance' % seed)

class LdaState(utils.SaveLoad):
"""
Expand Down Expand Up @@ -203,7 +216,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
distributed=False, chunksize=2000, passes=1, update_every=1,
alpha='symmetric', eta=None, decay=0.5, offset=1.0,
eval_every=10, iterations=50, gamma_threshold=0.001,
minimum_probability=0.01):
minimum_probability=0.01, random_state=None):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
Expand Down Expand Up @@ -243,6 +256,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
Hoffman et al, respectively.
`minimum_probability` controls filtering the topics returned for a document (bow).
`random_state` can be a numpy.random.RandomState object or the seed for one
Example:
Expand Down Expand Up @@ -288,6 +303,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

self.random_state = get_random_state(random_state)

assert (self.eta.shape == (self.num_topics, 1) or self.eta.shape == (self.num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
Expand Down Expand Up @@ -321,7 +338,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

# Initialize the variational distribution q(beta|lambda)
self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
self.state.sstats = numpy.random.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.expElogbeta = numpy.exp(dirichlet_expectation(self.state.sstats))

# if a training corpus was provided, start estimating the model right away
Expand Down Expand Up @@ -407,7 +424,7 @@ def inference(self, chunk, collect_sstats=False):
logger.debug("performing inference on a chunk of %i documents", len(chunk))

# Initialize the variational distribution q(theta|gamma) for the chunk
gamma = numpy.random.gamma(100., 1. / 100., (len(chunk), self.num_topics))
gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
Elogtheta = dirichlet_expectation(gamma)
expElogtheta = numpy.exp(Elogtheta)
if collect_sstats:
Expand Down Expand Up @@ -776,7 +793,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
num_topics = min(num_topics, self.num_topics)

# add a little random jitter, to randomize results around the same alpha
sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha))
sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))

sorted_topics = list(matutils.argsort(sort_alpha))
chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
Expand Down
6 changes: 4 additions & 2 deletions gensim/models/ldamulticore.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class LdaMulticore(LdaModel):
def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
chunksize=2000, passes=1, batch=False, alpha='symmetric',
eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
gamma_threshold=0.001):
gamma_threshold=0.001, random_state=None):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
Expand Down Expand Up @@ -124,6 +124,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
`decay` and `offset` parameters are the same as Kappa and Tau_0 in
Hoffman et al, respectively.
`random_state` can be a numpy.random.RandomState object or the seed for one
Example:
Expand All @@ -142,7 +144,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
super(LdaMulticore, self).__init__(corpus=corpus, num_topics=num_topics,
id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
gamma_threshold=gamma_threshold)
gamma_threshold=gamma_threshold, random_state=random_state)


def update(self, corpus, chunks_as_numpy=False):
Expand Down
12 changes: 8 additions & 4 deletions gensim/test/test_ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ def testfile():
return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')


def testRandomState():
    """Check get_random_state() accepts every documented seed form.

    Fixes from the original:
      * `numpy.random.seed(0)` returns None, so the old list tested None
        twice instead of the `numpy.random` module itself — use the module.
      * `assertEqual` is a unittest.TestCase method and is undefined at
        module scope (NameError); moreover two RandomState objects never
        compare equal with ==, so that check was wrong in principle.
        Determinism is asserted by comparing drawn samples instead.
    """
    testcases = [numpy.random, None, numpy.random.RandomState(0), 0]
    for testcase in testcases:
        assert(isinstance(ldamodel.get_random_state(testcase), numpy.random.RandomState))
    # seeding with the same integer must yield the same stream of draws
    first = ldamodel.get_random_state(0).rand(5)
    second = ldamodel.get_random_state(0).rand(5)
    assert (first == second).all()

class TestLdaModel(unittest.TestCase):
def setUp(self):
self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
Expand Down Expand Up @@ -251,8 +257,7 @@ def testShowTopics(self):

def testGetDocumentTopics(self):

numpy.random.seed(0)
model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes= 100)
model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes= 100, random_state=numpy.random.seed(0))

doc_topics = model.get_document_topics(self.corpus)

Expand Down Expand Up @@ -285,8 +290,7 @@ def testGetDocumentTopics(self):

def testTermTopics(self):

numpy.random.seed(0)
model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100)
model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100, random_state=numpy.random.seed(0))

# check with word_type
result = model.get_term_topics(2)
Expand Down

0 comments on commit 2e0ed26

Please sign in to comment.