Skip to content

Commit

Permalink
Numpy.random.RandomState instead of numpy.random in LDA. Fixes #113. (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
droudy authored and tmylk committed Jun 25, 2016
1 parent 225fa67 commit 2e0ed26
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Changes
* topics, topn parameters changed to num_topics and num_words in show_topics() and print_topics() (@droudy, #747)
- In hdpmodel and dtmmodel
- NOT BACKWARDS COMPATIBLE!
* Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113)

0.13.1, 2016-06-22

Expand Down Expand Up @@ -37,6 +38,7 @@ Changes
* Mallet wrapper sparse format support (@RishabGoel, #664)
* Doc2vec pre-processing script translated from bash to Python (@andrewjlm, #720)


0.12.4, 2016-01-29

* Better internal handling of job batching in word2vec (#535)
Expand Down
25 changes: 21 additions & 4 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,19 @@ def update_dir_prior(prior, N, logphat, rho):

return prior

def get_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Accepted forms of `seed`:
      * None or the `numpy.random` module -> the module-global RandomState,
      * an integer (Python or numpy) -> a fresh RandomState seeded with it,
      * an existing RandomState -> returned unchanged.
    Anything else raises ValueError.

    Method originally from maciejkula/glove-python, and written by @joshloyal
    """
    if seed is None or seed is numpy.random:
        return numpy.random.mtrand._rand
    if isinstance(seed, numpy.random.RandomState):
        return seed
    if isinstance(seed, (numbers.Integral, numpy.integer)):
        return numpy.random.RandomState(seed)
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState instance' % seed)

class LdaState(utils.SaveLoad):
"""
Expand Down Expand Up @@ -203,7 +216,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
distributed=False, chunksize=2000, passes=1, update_every=1,
alpha='symmetric', eta=None, decay=0.5, offset=1.0,
eval_every=10, iterations=50, gamma_threshold=0.001,
minimum_probability=0.01):
minimum_probability=0.01, random_state=None):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
Expand Down Expand Up @@ -243,6 +256,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
Hoffman et al, respectively.
`minimum_probability` controls filtering the topics returned for a document (bow).
`random_state` can be a numpy.random.RandomState object or the seed for one
Example:
Expand Down Expand Up @@ -288,6 +303,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

self.random_state = get_random_state(random_state)

assert (self.eta.shape == (self.num_topics, 1) or self.eta.shape == (self.num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
Expand Down Expand Up @@ -321,7 +338,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

# Initialize the variational distribution q(beta|lambda)
self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
self.state.sstats = numpy.random.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.expElogbeta = numpy.exp(dirichlet_expectation(self.state.sstats))

# if a training corpus was provided, start estimating the model right away
Expand Down Expand Up @@ -407,7 +424,7 @@ def inference(self, chunk, collect_sstats=False):
logger.debug("performing inference on a chunk of %i documents", len(chunk))

# Initialize the variational distribution q(theta|gamma) for the chunk
gamma = numpy.random.gamma(100., 1. / 100., (len(chunk), self.num_topics))
gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
Elogtheta = dirichlet_expectation(gamma)
expElogtheta = numpy.exp(Elogtheta)
if collect_sstats:
Expand Down Expand Up @@ -776,7 +793,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
num_topics = min(num_topics, self.num_topics)

# add a little random jitter, to randomize results around the same alpha
sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha))
sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))

sorted_topics = list(matutils.argsort(sort_alpha))
chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
Expand Down
6 changes: 4 additions & 2 deletions gensim/models/ldamulticore.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class LdaMulticore(LdaModel):
def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
chunksize=2000, passes=1, batch=False, alpha='symmetric',
eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
gamma_threshold=0.001):
gamma_threshold=0.001, random_state=None):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
Expand Down Expand Up @@ -124,6 +124,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
`decay` and `offset` parameters are the same as Kappa and Tau_0 in
Hoffman et al, respectively.
`random_state` can be a numpy.random.RandomState object or the seed for one
Example:
Expand All @@ -142,7 +144,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
super(LdaMulticore, self).__init__(corpus=corpus, num_topics=num_topics,
id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
gamma_threshold=gamma_threshold)
gamma_threshold=gamma_threshold, random_state=random_state)


def update(self, corpus, chunks_as_numpy=False):
Expand Down
12 changes: 8 additions & 4 deletions gensim/test/test_ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ def testfile():
return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')


def testRandomState():
    """Check get_random_state() accepts every documented seed form.

    Fixes from the original:
      * `numpy.random.seed(0)` returns None, so the old list tested None
        twice instead of the `numpy.random` module itself — use the module.
      * `assertEqual` is a unittest.TestCase method and is undefined at
        module scope (NameError); moreover two RandomState objects never
        compare equal with ==, so that check was wrong in principle.
        Determinism is asserted by comparing drawn samples instead.
    """
    testcases = [numpy.random, None, numpy.random.RandomState(0), 0]
    for testcase in testcases:
        assert(isinstance(ldamodel.get_random_state(testcase), numpy.random.RandomState))
    # seeding with the same integer must yield the same stream of draws
    first = ldamodel.get_random_state(0).rand(5)
    second = ldamodel.get_random_state(0).rand(5)
    assert (first == second).all()

class TestLdaModel(unittest.TestCase):
def setUp(self):
self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
Expand Down Expand Up @@ -251,8 +257,7 @@ def testShowTopics(self):

def testGetDocumentTopics(self):

numpy.random.seed(0)
model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes= 100)
model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes= 100, random_state=numpy.random.seed(0))

doc_topics = model.get_document_topics(self.corpus)

Expand Down Expand Up @@ -285,8 +290,7 @@ def testGetDocumentTopics(self):

def testTermTopics(self):

numpy.random.seed(0)
model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100)
model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100, random_state=numpy.random.seed(0))

# check with word_type
result = model.get_term_topics(2)
Expand Down

0 comments on commit 2e0ed26

Please sign in to comment.