Adding dtype to LDAModel to speed it up #1656
Changes from 9 commits
@@ -87,10 +87,11 @@ class LdaState(utils.SaveLoad):

"""
def __init__(self, eta, shape):
self.eta = eta
self.sstats = np.zeros(shape)
def __init__(self, eta, shape, dtype=np.float32):
self.eta = eta.astype(dtype, copy=False)
self.sstats = np.zeros(shape, dtype)
Review comment: Using positional arguments can lead to subtle bugs with numpy. Better use explicit names for keyword parameters.

Reply: fixed
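A minimal sketch of the reviewer's point, using only standard numpy signatures; the variable names and values are illustrative, not taken from the PR:

```python
import numpy as np

shape = (2, 3)

# Positional form: happens to work for np.zeros, whose second parameter is dtype.
sstats = np.zeros(shape, np.float32)
assert sstats.dtype == np.float32

# The same positional habit is fragile elsewhere: np.full's second positional
# parameter is fill_value, not dtype, so a positionally passed dtype would land
# in the wrong slot. The keyword form is unambiguous everywhere:
sstats = np.zeros(shape, dtype=np.float32)
eta = np.full(shape[1], 1.0 / 50, dtype=np.float32)
```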
self.numdocs = 0
self.dtype = dtype

def reset(self):
"""
@@ -165,6 +166,18 @@ def get_lambda(self):

def get_Elogbeta(self):
return dirichlet_expectation(self.get_lambda())

@classmethod
def load(cls, fname, *args, **kwargs):
result = super(LdaState, cls).load(fname, *args, **kwargs)

# Check if `dtype` is set after main pickle load
# if not, then it's an old model and we should set it to default `np.float64`
if not hasattr(result, 'dtype'):
result.dtype = np.float64  # float64 was used before as default in numpy
Review comment: Old LDA used float64, really?

Reply: Pretty much everything is using np.float64, the numpy default.
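For context (not part of the diff): numpy's default floating dtype is float64, so the arrays an old LdaState pickled without an explicit dtype were float64:

```python
import numpy as np

print(np.zeros((2, 2)).dtype)                                          # float64
print(np.asarray([1.0 / 100]).dtype)                                   # float64
print(np.random.RandomState(0).gamma(100., 1. / 100., (2, 2)).dtype)   # float64
```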
logging.warning("dtype was not set in LdaState, so using np.float64")

return result
# endclass LdaState
@@ -191,7 +204,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
random_state=None, ns_conf=None, minimum_phi_value=0.01,
per_word_topics=False, callbacks=None):
per_word_topics=False, callbacks=None, dtype=np.float32):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
@@ -233,9 +246,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

`minimum_probability` controls filtering the topics returned for a document (bow).

`random_state` can be a np.random.RandomState object or the seed for one
`random_state` can be a np.random.RandomState object or the seed for one.

`callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training
`callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training.

`dtype` is data-type to use during calculations inside model. All inputs are also converted to this dtype.

Example:
@@ -247,6 +262,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
>>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

"""
self.dtype = dtype

# store user-supplied parameters
self.id2word = id2word
@@ -330,10 +346,13 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

# Initialize the variational distribution q(beta|lambda)
self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype)
self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

assert self.eta.dtype == self.dtype
assert self.expElogbeta.dtype == self.dtype

# if a training corpus was provided, start estimating the model right away
if corpus is not None:
use_numpy = self.dispatcher is not None
@@ -354,25 +373,25 @@ def init_dir_prior(self, prior, name):

if isinstance(prior, six.string_types):
if prior == 'symmetric':
logger.info("using symmetric %s at %s", name, 1.0 / prior_shape)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
elif prior == 'asymmetric':
init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)])
init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)], dtype=self.dtype)
init_prior /= init_prior.sum()
logger.info("using asymmetric %s %s", name, list(init_prior))
elif prior == 'auto':
is_auto = True
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
if name == 'alpha':
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
else:
raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
elif isinstance(prior, list):
init_prior = np.asarray(prior)
init_prior = np.asarray(prior, dtype=self.dtype)
elif isinstance(prior, np.ndarray):
init_prior = prior
init_prior = prior.astype(self.dtype, copy=False)
elif isinstance(prior, np.number) or isinstance(prior, numbers.Real):
init_prior = np.asarray([prior] * prior_shape)
init_prior = np.asarray([prior] * prior_shape, dtype=self.dtype)
else:
raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)
@@ -385,6 +404,7 @@ def __str__(self):

def sync_state(self):
self.expElogbeta = np.exp(self.state.get_Elogbeta())
assert self.expElogbeta.dtype == self.dtype

def clear(self):
"""Clear model state (free up some memory). Used in the distributed algo."""
@@ -418,11 +438,13 @@ def inference(self, chunk, collect_sstats=False):
logger.debug("performing inference on a chunk of %i documents", len(chunk))

# Initialize the variational distribution q(theta|gamma) for the chunk
gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)).astype(self.dtype, copy=False)
Elogtheta = dirichlet_expectation(gamma)
expElogtheta = np.exp(Elogtheta)
assert expElogtheta.dtype == self.dtype

if collect_sstats:
sstats = np.zeros_like(self.expElogbeta)
sstats = np.zeros_like(self.expElogbeta, dtype=self.dtype)
else:
sstats = None
converged = 0
@@ -478,6 +500,9 @@ def inference(self, chunk, collect_sstats=False):
# sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
# = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
sstats *= self.expElogbeta
assert sstats.dtype == self.dtype

assert gamma.dtype == self.dtype
return gamma, sstats

def do_estep(self, chunk, state=None):
@@ -500,10 +525,13 @@ def update_alpha(self, gammat, rho):
"""
N = float(len(gammat))
logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
assert logphat.dtype == self.dtype

self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
logger.info("optimized alpha %s", list(self.alpha))

assert self.alpha.dtype == self.dtype

return self.alpha

def update_eta(self, lambdat, rho):
@@ -513,9 +541,12 @@ def update_eta(self, lambdat, rho):
"""
N = float(lambdat.shape[0])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))
assert logphat.dtype == self.dtype

self.eta = update_dir_prior(self.eta, N, logphat, rho)

assert self.eta.dtype == self.dtype

return self.eta

def log_perplexity(self, chunk, total_docs=None):
@@ -647,7 +678,7 @@ def rho():
logger.info('initializing %s workers', self.numworkers)
self.dispatcher.reset(self.state)
else:
other = LdaState(self.eta, self.state.sstats.shape)
other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False

reallen = 0
@@ -671,6 +702,7 @@ def rho():
pass_, chunk_no * chunksize + len(chunk), lencorpus
)
gammat = self.do_estep(chunk, other)
assert gammat.dtype == self.dtype

if self.optimize_alpha:
self.update_alpha(gammat, rho())
@@ -691,7 +723,7 @@ def rho():
logger.info('initializing workers')
self.dispatcher.reset(self.state)
else:
other = LdaState(self.eta, self.state.sstats.shape)
other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False
# endfor single corpus iteration
@@ -723,6 +755,7 @@ def do_mstep(self, rho, other, extra_pass=False):

"""
logger.debug("updating topics")
assert other.dtype == self.dtype
# update self with the new blend; also keep track of how much did
# the topics change through this update, to assess convergence
diff = np.log(self.expElogbeta)
@@ -772,6 +805,9 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
gammad = gamma[d]
Elogthetad = dirichlet_expectation(gammad)

assert gammad.dtype == self.dtype
assert Elogthetad.dtype == self.dtype

# E[log p(doc | theta, beta)]
score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
@@ -793,6 +829,8 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
else:
sum_eta = np.sum(self.eta)

assert sum_eta.dtype == self.dtype

score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

return score
@@ -820,6 +858,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

# add a little random jitter, to randomize results around the same alpha
sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
# random_state.rand returns float64, but converting back to dtype won't speed up anything
Review comment: Maybe cast it back to `self.dtype` anyway?

Reply: Consistency vs one additional array copy. I'm not sure :)
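A small illustration of the trade-off being discussed; the values are hypothetical, and since `rand` has no dtype argument, keeping `self.dtype` here costs one extra copy:

```python
import numpy as np

dtype = np.float32
alpha = np.full(10, 0.1, dtype=dtype)
rs = np.random.RandomState(0)

jitter = rs.rand(len(alpha))            # rand always returns float64
sort_alpha = alpha + 0.0001 * jitter    # float32 + float64 -> promoted to float64

# Casting back restores dtype consistency at the price of one more array copy:
sort_alpha = sort_alpha.astype(dtype)
```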
sorted_topics = list(matutils.argsort(sort_alpha))
chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
@@ -856,11 +895,13 @@ def show_topic(self, topicid, topn=10):
def get_topics(self):
"""
Returns:
np.ndarray: `num_topics` x `vocabulary_size` array of floats which represents
np.ndarray: `num_topics` x `vocabulary_size` array of floats (self.dtype) which represents
the term topic matrix learned during inference.
"""
topics = self.state.get_lambda()
return topics / topics.sum(axis=1)[:, None]
tmp = topics / topics.sum(axis=1)[:, None]
assert tmp.dtype == self.dtype
return tmp

def get_topic_terms(self, topicid, topn=10):
"""
@@ -1028,6 +1069,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100,
>>> print(mdiff)  # get matrix with difference for each topic pair from `m1` and `m2`
>>> print(annotation)  # get array with positive/negative words for each topic pair from `m1` and `m2`

Note: this ignores difference in model dtypes
"""

distances = {
@@ -1186,9 +1228,14 @@ def load(cls, fname, *args, **kwargs):
result.random_state = utils.get_random_state(None)  # using default value `get_random_state(None)`
logging.warning("random_state not set so using default value")

# the same goes for dtype (except it was added later)
if not hasattr(result, 'dtype'):
result.dtype = np.float64  # float64 was used before as default in numpy
logging.warning("dtype was not set, so using np.float64")
Review comment: A more concrete message please. When reading this warning, users will be left scratching their heads: set where? Why? What does this mean to me? How about … Question: isn't it better to infer the dtype from the loaded object? Can it ever happen that it's something else, not np.float64?

Reply: Fixed the message; decided the info level suits better. About inferring: it's not clear how to do it — infer from … Anyway, let's imagine a situation where some of …
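A hedged sketch of the alternative the reviewer raises — inferring the dtype from an array the old pickle already carries instead of assuming np.float64. The helper name is hypothetical, this is not the code merged in the PR, and it only works once the state (with its sstats) has actually been loaded:

```python
import numpy as np

def infer_legacy_dtype(result):
    # `result` is assumed to be a freshly unpickled LdaModel without a `dtype`
    # attribute; fall back to float64 (the old numpy default) if no array is
    # available to infer from.
    if not hasattr(result, 'dtype'):
        state = getattr(result, 'state', None)
        sstats = getattr(state, 'sstats', None)
        result.dtype = sstats.dtype if sstats is not None else np.float64
    return result
```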
state_fname = utils.smart_extension(fname, '.state')
try:
result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
result.state = LdaState.load(state_fname, *args, **kwargs)
except Exception as e:
logging.warning("failed to load state from %s: %s", state_fname, e)
@@ -48,6 +48,7 @@ def test_get_topics(self):
vocab_size = len(self.model.id2word)
for topic in topics:
self.assertTrue(isinstance(topic, np.ndarray))
self.assertEqual(topic.dtype, np.float64)
# Note: started moving to np.float32 as default
# self.assertEqual(topic.dtype, np.float64)
Review comment: need to enable + switch to np.float32

Reply: This will break other topic models then
self.assertEqual(vocab_size, topic.shape[0])
self.assertAlmostEqual(np.sum(topic), 1.0, 5)
Review comment: Please return the astype, because np.float32 -> np.float32 and np.float64 -> np.float64, but np.float16 -> np.float32.

Reply: Oh, my bad, you're right! Then the tests that I added in a separate file aren't needed.
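A quick reproduction of the promotion described above, assuming gensim's `dirichlet_expectation` is built on `scipy.special.psi`, which has float32/float64 loops but no float16 loop — so a float16 input comes back as float32 unless it is explicitly cast back:

```python
import numpy as np
from scipy.special import psi  # digamma, used by gensim's dirichlet_expectation

alpha = np.random.gamma(100., 1. / 100., (2, 3)).astype(np.float16)
Elog = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]

print(Elog.dtype)                        # float32: the float16 input was upcast by psi
print(Elog.astype(alpha.dtype).dtype)    # float16 again after an explicit astype
```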