diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bced80d09..592badab61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ Changes ======= +* Add suggested_lda_model to HDP class and clean HDP class. (@bhargavvader, #996) * Fix automatic learning of eta (prior over words) in LDA (@olavurmortensen, [#1024](https://github.com/RaRe-Technologies/gensim/pull/1024#)). * eta should have dimensionality V (size of vocab) not K (number of topics). eta with shape K x V is still allowed, as the user may want to impose specific prior information to each topic. * eta is no longer allowed the "asymmetric" option. Asymmetric priors over words in general are fine (learned or user defined). diff --git a/gensim/matutils.py b/gensim/matutils.py index 4c92be4562..195d4be136 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -16,8 +16,9 @@ from gensim import utils -import numpy +import numpy as np import scipy.sparse +from scipy.special import gammaln, psi # gamma function utils from scipy.stats import entropy import scipy.linalg from scipy.linalg.lapack import get_lapack_funcs @@ -35,11 +36,11 @@ try: from numpy import triu_indices except ImportError: - # numpy < 1.4 + # np < 1.4 def triu_indices(n, k=0): - m = numpy.ones((n, n), int) + m = np.ones((n, n), int) a = triu(m, k) - return numpy.where(a != 0) + return np.where(a != 0) blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0] @@ -53,21 +54,21 @@ def argsort(x, topn=None, reverse=False): If reverse is True, return the greatest elements instead, in descending order. """ - x = numpy.asarray(x) # unify code path for when `x` is not a numpy array (list, tuple...) + x = np.asarray(x) # unify code path for when `x` is not a np array (list, tuple...) if topn is None: topn = x.size if topn <= 0: return [] if reverse: x = -x - if topn >= x.size or not hasattr(numpy, 'argpartition'): - return numpy.argsort(x)[:topn] - # numpy >= 1.8 has a fast partial argsort, use that! - most_extreme = numpy.argpartition(x, topn)[:topn] - return most_extreme.take(numpy.argsort(x.take(most_extreme))) # resort topn into order + if topn >= x.size or not hasattr(np, 'argpartition'): + return np.argsort(x)[:topn] + # np >= 1.8 has a fast partial argsort, use that! + most_extreme = np.argpartition(x, topn)[:topn] + return most_extreme.take(np.argsort(x.take(most_extreme))) # resort topn into order -def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_nnz=None, printprogress=0): +def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0): """ Convert a streamed corpus into a sparse matrix, in scipy.sparse.csc_matrix format, with documents as columns. 
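The patched `argsort` above leans on `np.argpartition` for its fast partial top-`n` sort. A small illustration of that trick, outside the patch itself and with made-up values:

    import numpy as np

    # np.argpartition puts the indices of the `topn` smallest values in the
    # first `topn` slots (in no particular order); only those few entries
    # then need a full argsort, which is what matutils.argsort exploits.
    x = np.array([0.3, 0.9, 0.1, 0.7, 0.5])
    topn = 3
    most_extreme = np.argpartition(x, topn)[:topn]                 # e.g. indices of 0.1, 0.3, 0.5
    ordered = most_extreme.take(np.argsort(x.take(most_extreme)))  # re-sort just those topn
    print(ordered)                                                 # [2 0 4]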
@@ -96,8 +97,8 @@ def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_n if num_terms is not None and num_docs is not None and num_nnz is not None: # faster and much more memory-friendly version of creating the sparse csc posnow, indptr = 0, [0] - indices = numpy.empty((num_nnz,), dtype=numpy.int32) # HACK assume feature ids fit in 32bit integer - data = numpy.empty((num_nnz,), dtype=dtype) + indices = np.empty((num_nnz,), dtype=np.int32) # HACK assume feature ids fit in 32bit integer + data = np.empty((num_nnz,), dtype=dtype) for docno, doc in enumerate(corpus): if printprogress and docno % printprogress == 0: logger.info("PROGRESS: at document #%i/%i" % (docno, num_docs)) @@ -122,15 +123,15 @@ def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_n num_terms = max(indices) + 1 if indices else 0 num_docs = len(indptr) - 1 # now num_docs, num_terms and num_nnz contain the correct values - data = numpy.asarray(data, dtype=dtype) - indices = numpy.asarray(indices) + data = np.asarray(data, dtype=dtype) + indices = np.asarray(indices) result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype) return result def pad(mat, padrow, padcol): """ - Add additional rows/columns to a numpy.matrix `mat`. The new rows/columns + Add additional rows/columns to a np.matrix `mat`. The new rows/columns will be initialized with zeros. """ if padrow < 0: @@ -138,36 +139,36 @@ def pad(mat, padrow, padcol): if padcol < 0: padcol = 0 rows, cols = mat.shape - return numpy.bmat([[mat, numpy.matrix(numpy.zeros((rows, padcol)))], - [numpy.matrix(numpy.zeros((padrow, cols + padcol)))]]) + return np.bmat([[mat, np.matrix(np.zeros((rows, padcol)))], + [np.matrix(np.zeros((padrow, cols + padcol)))]]) def zeros_aligned(shape, dtype, order='C', align=128): - """Like `numpy.zeros()`, but the array will be aligned at `align` byte boundary.""" - nbytes = numpy.prod(shape, dtype=numpy.int64) * numpy.dtype(dtype).itemsize - buffer = numpy.zeros(nbytes + align, dtype=numpy.uint8) # problematic on win64 ("maximum allowed dimension exceeded") + """Like `np.zeros()`, but the array will be aligned at `align` byte boundary.""" + nbytes = np.prod(shape, dtype=np.int64) * np.dtype(dtype).itemsize + buffer = np.zeros(nbytes + align, dtype=np.uint8) # problematic on win64 ("maximum allowed dimension exceeded") start_index = -buffer.ctypes.data % align return buffer[start_index : start_index + nbytes].view(dtype).reshape(shape, order=order) def ismatrix(m): - return isinstance(m, numpy.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m) + return isinstance(m, np.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m) def any2sparse(vec, eps=1e-9): - """Convert a numpy/scipy vector into gensim document format (=list of 2-tuples).""" - if isinstance(vec, numpy.ndarray): + """Convert a np/scipy vector into gensim document format (=list of 2-tuples).""" + if isinstance(vec, np.ndarray): return dense2vec(vec, eps) if scipy.sparse.issparse(vec): return scipy2sparse(vec, eps) - return [(int(fid), float(fw)) for fid, fw in vec if numpy.abs(fw) > eps] + return [(int(fid), float(fw)) for fid, fw in vec if np.abs(fw) > eps] def scipy2sparse(vec, eps=1e-9): """Convert a scipy.sparse vector into gensim document format (=list of 2-tuples).""" vec = vec.tocsr() assert vec.shape[0] == 1 - return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if numpy.abs(val) > eps] + return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if 
np.abs(val) > eps] class Scipy2Corpus(object): @@ -179,15 +180,15 @@ class Scipy2Corpus(object): """ def __init__(self, vecs): """ - `vecs` is a sequence of dense and/or sparse vectors, such as a 2d numpy array, - or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d numpy/scipy vectors. + `vecs` is a sequence of dense and/or sparse vectors, such as a 2d np array, + or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d np/scipy vectors. """ self.vecs = vecs def __iter__(self): for vec in self.vecs: - if isinstance(vec, numpy.ndarray): + if isinstance(vec, np.ndarray): yield full2sparse(vec) else: yield scipy2sparse(vec) @@ -199,12 +200,12 @@ def __len__(self): def sparse2full(doc, length): """ Convert a document in sparse document format (=sequence of 2-tuples) into a dense - numpy array (of size `length`). + np array (of size `length`). This is the mirror function to `full2sparse`. """ - result = numpy.zeros(length, dtype=numpy.float32) # fill with zeroes (default value) + result = np.zeros(length, dtype=np.float32) # fill with zeroes (default value) doc = dict(doc) # overwrite some of the zeroes with explicit values result[list(doc)] = list(itervalues(doc)) @@ -213,15 +214,15 @@ def sparse2full(doc, length): def full2sparse(vec, eps=1e-9): """ - Convert a dense numpy array into the sparse document format (sequence of 2-tuples). + Convert a dense np array into the sparse document format (sequence of 2-tuples). Values of magnitude < `eps` are treated as zero (ignored). This is the mirror function to `sparse2full`. """ - vec = numpy.asarray(vec, dtype=float) - nnz = numpy.nonzero(abs(vec) > eps)[0] + vec = np.asarray(vec, dtype=float) + nnz = np.nonzero(abs(vec) > eps)[0] return list(zip(nnz, vec.take(nnz))) dense2vec = full2sparse @@ -232,19 +233,19 @@ def full2sparse_clipped(vec, topn, eps=1e-9): Like `full2sparse`, but only return the `topn` elements of the greatest magnitude (abs). """ - # use numpy.argpartition/argsort and only form tuples that are actually returned. + # use np.argpartition/argsort and only form tuples that are actually returned. # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on. if topn <= 0: return [] - vec = numpy.asarray(vec, dtype=float) - nnz = numpy.nonzero(abs(vec) > eps)[0] + vec = np.asarray(vec, dtype=float) + nnz = np.nonzero(abs(vec) > eps)[0] biggest = nnz.take(argsort(abs(vec).take(nnz), topn, reverse=True)) return list(zip(biggest, vec.take(biggest))) -def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32): +def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32): """ - Convert corpus into a dense numpy array (documents will be columns). You + Convert corpus into a dense np array (documents will be columns). You must supply the number of features `num_terms`, because dimensionality cannot be deduced from the sparse vectors alone. 
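For reference, a quick round trip through the converters touched above; the toy document and `length` are made up for the example:

    from gensim import matutils

    doc = [(0, 0.5), (3, 0.25)]                  # gensim sparse format: (id, weight) pairs
    dense = matutils.sparse2full(doc, length=5)  # float32 array [0.5, 0.0, 0.0, 0.25, 0.0]
    sparse_again = matutils.full2sparse(dense)   # entries with |value| <= eps are dropped
    print(sparse_again)                          # [(0, 0.5), (3, 0.25)], possibly as numpy scalar types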
@@ -256,19 +257,19 @@ def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32): """ if num_docs is not None: # we know the number of documents => don't bother column_stacking - docno, result = -1, numpy.empty((num_terms, num_docs), dtype=dtype) + docno, result = -1, np.empty((num_terms, num_docs), dtype=dtype) for docno, doc in enumerate(corpus): result[:, docno] = sparse2full(doc, num_terms) assert docno + 1 == num_docs else: - result = numpy.column_stack(sparse2full(doc, num_terms) for doc in corpus) + result = np.column_stack(sparse2full(doc, num_terms) for doc in corpus) return result.astype(dtype) class Dense2Corpus(object): """ - Treat dense numpy array as a sparse, streamed gensim corpus. + Treat dense np array as a sparse, streamed gensim corpus. No data copy is made (changes to the underlying matrix imply changes in the corpus). @@ -329,18 +330,18 @@ def ret_normalized_vec(vec, length): def ret_log_normalize_vec(vec, axis=1): log_max = 100.0 if len(vec.shape) == 1: - max_val = numpy.max(vec) - log_shift = log_max - numpy.log(len(vec) + 1.0) - max_val - tot = numpy.sum(numpy.exp(vec + log_shift)) - log_norm = numpy.log(tot) - log_shift + max_val = np.max(vec) + log_shift = log_max - np.log(len(vec) + 1.0) - max_val + tot = np.sum(np.exp(vec + log_shift)) + log_norm = np.log(tot) - log_shift vec = vec - log_norm else: if axis == 1: # independently normalize each sample - max_val = numpy.max(vec, 1) - log_shift = log_max - numpy.log(vec.shape[1] + 1.0) - max_val - tot = numpy.sum(numpy.exp(vec + log_shift[:, numpy.newaxis]), 1) - log_norm = numpy.log(tot) - log_shift - vec = vec - log_norm[:, numpy.newaxis] + max_val = np.max(vec, 1) + log_shift = log_max - np.log(vec.shape[1] + 1.0) - max_val + tot = np.sum(np.exp(vec + log_shift[:, np.newaxis]), 1) + log_norm = np.log(tot) - log_shift + vec = vec - log_norm[:, np.newaxis] elif axis == 0: # normalize each feature k = ret_log_normalize_vec(vec.T) return (k[0].T, k[1]) @@ -349,8 +350,8 @@ def ret_log_normalize_vec(vec, axis=1): return (vec, log_norm) -blas_nrm2 = blas('nrm2', numpy.array([], dtype=float)) -blas_scal = blas('scal', numpy.array([], dtype=float)) +blas_nrm2 = blas('nrm2', np.array([], dtype=float)) +blas_scal = blas('scal', np.array([], dtype=float)) def unitvec(vec, norm='l2'): @@ -359,25 +360,25 @@ def unitvec(vec, norm='l2'): is returned back unchanged. Output will be in the same format as input (i.e., gensim vector=>gensim vector, - or numpy array=>numpy array, scipy.sparse=>scipy.sparse). + or np array=>np array, scipy.sparse=>scipy.sparse). """ if norm not in ('l1', 'l2'): raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm) if scipy.sparse.issparse(vec): vec = vec.tocsr() if norm == 'l1': - veclen = numpy.sum(numpy.abs(vec.data)) + veclen = np.sum(np.abs(vec.data)) if norm == 'l2': - veclen = numpy.sqrt(numpy.sum(vec.data ** 2)) + veclen = np.sqrt(np.sum(vec.data ** 2)) if veclen > 0.0: return vec / veclen else: return vec - if isinstance(vec, numpy.ndarray): - vec = numpy.asarray(vec, dtype=float) + if isinstance(vec, np.ndarray): + vec = np.asarray(vec, dtype=float) if norm == 'l1': - veclen = numpy.sum(numpy.abs(vec)) + veclen = np.sum(np.abs(vec)) if norm == 'l2': veclen = blas_nrm2(vec) if veclen > 0.0: @@ -436,6 +437,18 @@ def isbow(vec): return True +def dirichlet_expectation(alpha): + """ + For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`. 
+ + """ + if (len(alpha.shape) == 1): + result = psi(alpha) - psi(np.sum(alpha)) + else: + result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis] + return result.astype(alpha.dtype) # keep the same precision as input + + def kullback_leibler(vec1, vec2, num_features=None): """ A distance metric between two probability distributions. @@ -481,10 +494,10 @@ def hellinger(vec1, vec2): vec1, vec2 = dict(vec1), dict(vec2) if len(vec2) < len(vec1): vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector - sim = numpy.sqrt(0.5*sum((numpy.sqrt(value) - numpy.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))) + sim = np.sqrt(0.5*sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))) return sim else: - sim = numpy.sqrt(0.5 * ((numpy.sqrt(vec1) - numpy.sqrt(vec2))**2).sum()) + sim = np.sqrt(0.5 * ((np.sqrt(vec1) - np.sqrt(vec2))**2).sum()) return sim @@ -514,9 +527,9 @@ def jaccard(vec1, vec2): return 1 - float(intersection) / float(union) else: # if it isn't in bag of words format, we can use sets to calculate intersection and union - if isinstance(vec1, numpy.ndarray): + if isinstance(vec1, np.ndarray): vec1 = vec1.tolist() - if isinstance(vec2, numpy.ndarray): + if isinstance(vec2, np.ndarray): vec2 = vec2.tolist() vec1 = set(vec1) vec2 = set(vec2) @@ -532,7 +545,7 @@ def qr_destroy(la): Using this function should be less memory intense than calling `scipy.linalg.qr(la[0])`, because the memory used in `la[0]` is reclaimed earlier. """ - a = numpy.asfortranarray(la[0]) + a = np.asfortranarray(la[0]) del la[0], la # now `a` is the only reference to the input matrix m, n = a.shape # perform q, r = QR(a); code hacked out of scipy.linalg.qr diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 940295a6c6..ef549bdf81 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -35,34 +35,29 @@ import logging, time import numpy as np -import scipy.special as sp +from scipy.special import gammaln, psi # gamma function utils from gensim import interfaces, utils, matutils -from gensim.models import basemodel +from gensim.models import basemodel, ldamodel from six.moves import xrange +from gensim.matutils import dirichlet_expectation +from gensim.utils import get_random_state + logger = logging.getLogger(__name__) meanchangethresh = 0.00001 rhot_bound = 0.0 -def dirichlet_expectation(alpha): - """ - For a vector theta ~ Dir(alpha), compute E[log(theta)] given alpha. 
- """ - if (len(alpha.shape) == 1): - return(sp.psi(alpha) - sp.psi(np.sum(alpha))) - return(sp.psi(alpha) - sp.psi(np.sum(alpha, 1))[:, np.newaxis]) - def expect_log_sticks(sticks): """ For stick-breaking hdp, return the E[log(sticks)] """ - dig_sum = sp.psi(np.sum(sticks, 0)) - ElogW = sp.psi(sticks[0]) - dig_sum - Elog1_W = sp.psi(sticks[1]) - dig_sum + dig_sum = psi(np.sum(sticks, 0)) + ElogW = psi(sticks[0]) - dig_sum + Elog1_W = psi(sticks[1]) - dig_sum n = len(sticks[0]) + 1 Elogsticks = np.zeros(n) @@ -90,8 +85,8 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100): likelihood = np.sum(counts * np.log(phinorm)) likelihood += np.sum((alpha - gamma) * Elogtheta) - likelihood += np.sum(sp.gammaln(gamma) - sp.gammaln(alpha)) - likelihood += sp.gammaln(np.sum(alpha)) - sp.gammaln(np.sum(gamma)) + likelihood += np.sum(gammaln(gamma) - gammaln(alpha)) + likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma)) return (likelihood, gamma) @@ -130,7 +125,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): def __init__(self, corpus, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, - outputdir=None): + outputdir=None, random_state=None): """ `gamma`: first level concentration `alpha`: second level concentration @@ -151,6 +146,8 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None, self.max_time = max_time self.outputdir = outputdir + self.random_state = get_random_state(random_state) + self.lda_alpha = None self.lda_beta = None @@ -169,7 +166,7 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None, self.m_var_sticks[1] = range(T - 1, 0, -1) self.m_varphi_ss = np.zeros(T) - self.m_lambda = np.random.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta + self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta self.m_eta = eta self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda) @@ -273,8 +270,8 @@ def update_chunk(self, chunk, update=True, opt_o=True): rw = np.array([self.m_r[t] for t in self.m_timestamp[word_list]]) self.m_lambda[:, word_list] *= np.exp(self.m_r[-1] - rw) self.m_Elogbeta[:, word_list] = \ - sp.psi(self.m_eta + self.m_lambda[:, word_list]) - \ - sp.psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) + psi(self.m_eta + self.m_lambda[:, word_list]) - \ + psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) ss = SuffStats(self.m_T, Wt, len(chunk)) @@ -360,9 +357,9 @@ def doc_e_step(self, doc, ss, Elogsticks_1st, word_list, # v part/ v in john's notation, john's beta is alpha here log_alpha = np.log(self.m_alpha) likelihood += (self.m_K - 1) * log_alpha - dig_sum = sp.psi(np.sum(v, 0)) - likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum)) - likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v)) + dig_sum = psi(np.sum(v, 0)) + likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (psi(v) - dig_sum)) + likelihood -= np.sum(gammaln(np.sum(v, 0))) - np.sum(gammaln(v)) # Z part likelihood += np.sum((Elogsticks_2nd - log_phi) * phi) @@ -436,8 +433,8 @@ def update_expectations(self): for w in xrange(self.m_W): self.m_lambda[:, w] *= np.exp(self.m_r[-1] - self.m_r[self.m_timestamp[w]]) - self.m_Elogbeta = sp.psi(self.m_eta + self.m_lambda) - \ - sp.psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) + self.m_Elogbeta = 
psi(self.m_eta + self.m_lambda) - \ + psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis]) self.m_timestamp[:] = self.m_updatect self.m_status_up_to_date = True @@ -457,6 +454,21 @@ def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): hdp_formatter = HdpTopicFormatter(self.id2word, betas) return hdp_formatter.show_topics(num_topics, num_words, log, formatted) + def show_topic(self, topic_id, num_words=20, log=False, formatted=False): + """ + Return the `num_words` most probable words for the single topic `topic_id`, + analogous to `show_topics()` but for one topic. + + Set `formatted=True` to return the topic as a formatted string, or + `False` (the default) to return it as a list of (word, weight) pairs. + + """ + if not self.m_status_up_to_date: + self.update_expectations() + betas = self.m_lambda + self.m_eta + hdp_formatter = HdpTopicFormatter(self.id2word, betas) + return hdp_formatter.show_topic(topic_id, num_words, log, formatted) + def save_topics(self, doc_count=None): """legacy method; use `self.save()` instead""" if not self.outputdir: @@ -490,6 +502,7 @@ def save_options(self): fout.write('eta: %s\n' % str(self.m_eta)) fout.write('gamma: %s\n' % str(self.m_gamma)) + def hdp_to_lda(self): """ Compute the LDA almost equivalent HDP. @@ -510,6 +523,18 @@ def hdp_to_lda(self): return (alpha, beta) + + def suggested_lda_model(self): + """ + Return the closest corresponding LdaModel object for the current HDP model. + The number of topics is m_T (default 150), so that the matrix shapes are preserved when alpha and beta are assigned. + """ + alpha, beta = self.hdp_to_lda() + ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state) + ldam.expElogbeta[:] = beta + return ldam + + def evaluate_test_corpus(self, corpus): logger.info('TEST: evaluating test corpus') if self.lda_alpha is None or self.lda_beta is None: @@ -529,6 +554,7 @@ def evaluate_test_corpus(self, corpus): total_words += sum(doc_word_counts) logger.info('TEST: average score: %.5f, total score: %.5f, test docs: %d' % (score / total_words, score, len(corpus))) return score + #endclass HdpModel @@ -589,6 +615,32 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): return shown + def print_topic(self, topic_id, num_words): + return self.show_topic(topic_id, num_words, formatted=True) + + def show_topic(self, topic_id, num_words, log=False, formatted=False): + + lambdak = list(self.data[topic_id, :]) + lambdak = lambdak / sum(lambdak) + + temp = zip(lambdak, xrange(len(lambdak))) + temp = sorted(temp, key=lambda x: x[0], reverse=True) + + topic_terms = self.show_topic_terms(temp, num_words) + + if formatted: + topic = self.format_topic(topic_id, topic_terms) + + # assuming we only output formatted topics + if log: + logger.info(topic) + else: + topic = (topic_id, topic_terms) + + # return the terms (or the formatted string) without the topic id + return topic[1] + + def show_topic_terms(self, topic_data, num_words): return [(self.dictionary[wid], weight) for (weight, wid) in topic_data[:num_words]] diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 048e5d4c51..c0d447531c 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -44,6 +44,8 @@ from six.moves import xrange import six +from gensim.matutils import dirichlet_expectation +from gensim.utils import get_random_state # log(sum(exp(x))) that tries to avoid overflow try: # try importing from here if older scipy is installed @@ -56,17 +58,6 @@ logger = logging.getLogger('gensim.models.ldamodel')
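The `dirichlet_expectation` helper removed from `ldamodel.py` just below now lives in `gensim.matutils` (see the matutils hunk above). A small numeric sketch of what it computes, with illustrative values:

    import numpy as np
    from scipy.special import psi
    from gensim.matutils import dirichlet_expectation  # shared location after this patch

    # For theta ~ Dir(alpha): E[log(theta_k)] = psi(alpha_k) - psi(sum(alpha)).
    alpha = np.array([0.5, 1.0, 2.0])
    assert np.allclose(dirichlet_expectation(alpha), psi(alpha) - psi(alpha.sum()))

    # For a 2-d alpha the expectation is taken row by row.
    alpha2d = np.array([[0.5, 1.0], [2.0, 3.0]])
    manual = psi(alpha2d) - psi(alpha2d.sum(axis=1))[:, np.newaxis]
    assert np.allclose(dirichlet_expectation(alpha2d), manual)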
-def dirichlet_expectation(alpha): - """ - For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`. - - """ - if (len(alpha.shape) == 1): - result = psi(alpha) - psi(np.sum(alpha)) - else: - result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis] - return result.astype(alpha.dtype) # keep the same precision as input - def update_dir_prior(prior, N, logphat, rho): """ @@ -91,19 +82,6 @@ def update_dir_prior(prior, N, logphat, rho): return prior -def get_random_state(seed): - """ Turn seed into a np.random.RandomState instance. - - Method originally from maciejkula/glove-python, and written by @joshloyal - """ - if seed is None or seed is np.random: - return np.random.mtrand._rand - if isinstance(seed, (numbers.Integral, np.integer)): - return np.random.RandomState(seed) - if isinstance(seed, np.random.RandomState): - return seed - raise ValueError('%r cannot be used to seed a np.random.RandomState' - ' instance' % seed) class LdaState(utils.SaveLoad): """ diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py index 441a5dbcf4..e2b543687c 100644 --- a/gensim/test/test_hdpmodel.py +++ b/gensim/test/test_hdpmodel.py @@ -23,6 +23,7 @@ from gensim import matutils from gensim.test import basetests +import numpy as np module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -51,12 +52,29 @@ class TestHdpModel(unittest.TestCase, basetests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = hdpmodel.HdpModel - self.model = self.class_(corpus, id2word=dictionary) + self.model = self.class_(corpus, id2word=dictionary, random_state=np.random.seed(0)) - def testShowTopic(self): - # TODO create show_topic in HdpModel and then test + def testTopicValues(self): + """ + Check show topics method + """ + results = self.model.show_topics()[0] + expected_prob, expected_word = '0.264', 'trees ' + prob, word = results[1].split('+')[0].split('*') + self.assertEqual(results[0], 0) + self.assertEqual(prob, expected_prob) + self.assertEqual(word, expected_word) + return + def testLDAmodel(self): + """ + Create ldamodel object, and check if the corresponding alphas are equal. + """ + ldam = self.model.suggested_lda_model() + self.assertEqual(ldam.alpha[0], self.model.lda_alpha[0]) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/utils.py b/gensim/utils.py index 5db16a92e5..9bf1b9dd62 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -23,6 +23,7 @@ except ImportError: import pickle as _pickle +import numbers import re import unicodedata import os @@ -36,7 +37,7 @@ from contextlib import contextmanager import subprocess -import numpy +import numpy as np import scipy.sparse if sys.version_info[0] >= 3: @@ -160,6 +161,21 @@ def copytree_hardlink(source, dest): shutil.copy2 = copy2 +def get_random_state(seed): + """ Turn seed into a np.random.RandomState instance. 
+ + Method originally from maciejkula/glove-python, and written by @joshloyal + """ + if seed is None or seed is np.random: + return np.random.mtrand._rand + if isinstance(seed, (numbers.Integral, np.integer)): + return np.random.RandomState(seed) + if isinstance(seed, np.random.RandomState): + return seed + raise ValueError('%r cannot be used to seed a np.random.RandomState' + ' instance' % seed) + + def tokenize(text, lowercase=False, deacc=False, errors="strict", to_lower=False, lower=False): """ Iteratively yield tokens as unicode strings, removing accent marks @@ -266,7 +282,7 @@ def _load_specials(self, fname, mmap, compress, subname): attrib, cfname, mmap)) getattr(self, attrib)._load_specials(cfname, mmap, compress, subname) - for attrib in getattr(self, '__numpys', []): + for attrib in getattr(self, '__nps', []): logger.info("loading %s from %s with mmap=%s" % ( attrib, subname(fname, attrib), mmap)) @@ -274,9 +290,9 @@ def _load_specials(self, fname, mmap, compress, subname): if mmap: raise mmap_error(attrib, subname(fname, attrib)) - val = numpy.load(subname(fname, attrib))['val'] + val = np.load(subname(fname, attrib))['val'] else: - val = numpy.load(subname(fname, attrib), mmap_mode=mmap) + val = np.load(subname(fname, attrib), mmap_mode=mmap) setattr(self, attrib, val) @@ -288,14 +304,14 @@ def _load_specials(self, fname, mmap, compress, subname): if mmap: raise mmap_error(attrib, subname(fname, attrib)) - with numpy.load(subname(fname, attrib, 'sparse')) as f: + with np.load(subname(fname, attrib, 'sparse')) as f: sparse.data = f['data'] sparse.indptr = f['indptr'] sparse.indices = f['indices'] else: - sparse.data = numpy.load(subname(fname, attrib, 'data'), mmap_mode=mmap) - sparse.indptr = numpy.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap) - sparse.indices = numpy.load(subname(fname, attrib, 'indices'), mmap_mode=mmap) + sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap) + sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap) + sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap) setattr(self, attrib, sparse) @@ -322,7 +338,7 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, Save the object to file (also see `load`). If `separately` is None, automatically detect large - numpy/scipy.sparse arrays in the object being stored, and store + np/scipy.sparse arrays in the object being stored, and store them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays back on load efficiently. 
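A usage sketch of the `get_random_state` helper that this patch moves into `gensim.utils`; the seed values are illustrative:

    import numpy as np
    from gensim.utils import get_random_state

    rs = get_random_state(0)                 # an int seed becomes a fresh RandomState
    same = np.random.RandomState(0)
    assert np.allclose(rs.gamma(1.0, 1.0, 3), same.gamma(1.0, 1.0, 3))  # same seed, same draws
    assert get_random_state(same) is same    # an existing RandomState passes through unchanged
    global_rs = get_random_state(None)       # None falls back to numpy's global RandomState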
@@ -371,7 +387,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, if separately is None: separately = [] for attrib, val in iteritems(self.__dict__): - if isinstance(val, numpy.ndarray) and val.size >= sep_limit: + if isinstance(val, np.ndarray) and val.size >= sep_limit: separately.append(attrib) elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit: separately.append(attrib) @@ -392,17 +408,17 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, pickle_protocol, compress, subname)) try: - numpys, scipys, ignoreds = [], [], [] + nps, scipys, ignoreds = [], [], [] for attrib, val in iteritems(asides): - if isinstance(val, numpy.ndarray) and attrib not in ignore: - numpys.append(attrib) - logger.info("storing numpy array '%s' to %s" % ( + if isinstance(val, np.ndarray) and attrib not in ignore: + nps.append(attrib) + logger.info("storing np array '%s' to %s" % ( attrib, subname(fname, attrib))) if compress: - numpy.savez_compressed(subname(fname, attrib), val=numpy.ascontiguousarray(val)) + np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val)) else: - numpy.save(subname(fname, attrib), numpy.ascontiguousarray(val)) + np.save(subname(fname, attrib), np.ascontiguousarray(val)) elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore: scipys.append(attrib) @@ -410,14 +426,14 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, attrib, subname(fname, attrib))) if compress: - numpy.savez_compressed(subname(fname, attrib, 'sparse'), + np.savez_compressed(subname(fname, attrib, 'sparse'), data=val.data, indptr=val.indptr, indices=val.indices) else: - numpy.save(subname(fname, attrib, 'data'), val.data) - numpy.save(subname(fname, attrib, 'indptr'), val.indptr) - numpy.save(subname(fname, attrib, 'indices'), val.indices) + np.save(subname(fname, attrib, 'data'), val.data) + np.save(subname(fname, attrib, 'indptr'), val.indptr) + np.save(subname(fname, attrib, 'indices'), val.indices) data, indptr, indices = val.data, val.indptr, val.indices val.data, val.indptr, val.indices = None, None, None @@ -431,7 +447,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, logger.info("not storing attribute %s" % (attrib)) ignoreds.append(attrib) - self.__dict__['__numpys'] = numpys + self.__dict__['__nps'] = nps self.__dict__['__scipys'] = scipys self.__dict__['__ignoreds'] = ignoreds self.__dict__['__recursive_saveloads'] = recursive_saveloads @@ -454,7 +470,7 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, performed; all attributes will be saved to the same file. If `separately` is None, automatically detect large - numpy/scipy.sparse arrays in the object being stored, and store + np/scipy.sparse arrays in the object being stored, and store them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays back on load efficiently. @@ -588,7 +604,7 @@ def is_corpus(obj): doc1 = next(iter(obj)) # empty corpus is resolved to False here if len(doc1) == 0: # sparse documents must have a __len__ function (list, tuple...) 
return True, obj # the first document is empty=>assume this is a corpus - id1, val1 = next(iter(doc1)) # if obj is a numpy array, it resolves to False here + id1, val1 = next(iter(doc1)) # if obj is a np array, it resolves to False here id1, val1 = int(id1), float(val1) # must be a 2-tuple (integer, float) except Exception: return False, obj @@ -694,11 +710,11 @@ def __init__(self, corpus, slice_): Negative slicing can only be used if the corpus is indexable. Otherwise, the corpus will be iterated over. - Slice can also be a numpy.ndarray to support fancy indexing. + Slice can also be a np.ndarray to support fancy indexing. NOTE: calculating the size of a SlicedCorpus is expensive when using a slice as the corpus has to be iterated over once. - Using a list or numpy.ndarray does not have this drawback, but + Using a list or np.ndarray does not have this drawback, but consumes more memory. """ self.corpus = corpus @@ -716,7 +732,7 @@ def __iter__(self): def __len__(self): # check cached length, calculate if needed if self.length is None: - if isinstance(self.slice_, (list, numpy.ndarray)): + if isinstance(self.slice_, (list, np.ndarray)): self.length = len(self.slice_) else: self.length = sum(1 for x in self) @@ -781,13 +797,13 @@ def chunkize_serial(iterable, chunksize, as_numpy=False): [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] """ - import numpy + import numpy as np it = iter(iterable) while True: if as_numpy: - # convert each document to a 2d numpy array (~6x faster when transmitting + # convert each document to a 2d np array (~6x faster when transmitting # chunk data over the wire, in Pyro) - wrapped_chunk = [[numpy.array(doc) for doc in itertools.islice(it, int(chunksize))]] + wrapped_chunk = [[np.array(doc) for doc in itertools.islice(it, int(chunksize))]] else: wrapped_chunk = [list(itertools.islice(it, int(chunksize)))] if not wrapped_chunk[0]: @@ -799,25 +815,25 @@ def chunkize_serial(iterable, chunksize, as_numpy=False): class InputQueue(multiprocessing.Process): - def __init__(self, q, corpus, chunksize, maxsize, as_numpy): + def __init__(self, q, corpus, chunksize, maxsize, as_np): super(InputQueue, self).__init__() self.q = q self.maxsize = maxsize self.corpus = corpus self.chunksize = chunksize - self.as_numpy = as_numpy + self.as_np = as_np def run(self): - if self.as_numpy: - import numpy # don't clutter the global namespace with a dependency on numpy + if self.as_np: + import numpy as np # don't clutter the global namespace with a dependency on numpy it = iter(self.corpus) while True: chunk = itertools.islice(it, self.chunksize) - if self.as_numpy: - # HACK XXX convert documents to numpy arrays, to save memory. + if self.as_np: + # HACK XXX convert documents to np arrays, to save memory.
# This also gives a scipy warning at runtime: # "UserWarning: indices array has non-integer dtype (float64)" - wrapped_chunk = [[numpy.asarray(doc) for doc in chunk]] + wrapped_chunk = [[np.asarray(doc) for doc in chunk]] else: wrapped_chunk = [list(chunk)] @@ -838,11 +854,11 @@ def run(self): if os.name == 'nt': warnings.warn("detected Windows; aliasing chunkize to chunkize_serial") - def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): - for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy): + def chunkize(corpus, chunksize, maxsize=0, as_np=False): + for chunk in chunkize_serial(corpus, chunksize, as_np=as_np): yield chunk else: - def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): + def chunkize(corpus, chunksize, maxsize=0, as_np=False): """ Split a stream of values into smaller chunks. Each chunk is of length `chunksize`, except the last one which may be smaller. @@ -868,7 +884,7 @@ def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): if maxsize > 0: q = multiprocessing.Queue(maxsize=maxsize) - worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy) + worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_np=as_np) worker.daemon = True worker.start() while True: @@ -877,7 +893,7 @@ def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): break yield chunk.pop() else: - for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy): + for chunk in chunkize_serial(corpus, chunksize, as_np=as_np): yield chunk @@ -1066,8 +1082,8 @@ def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0): a Poisson distribution with parameter lambda equal to `lam`. """ - nnz = numpy.random.uniform(size=(dim,)) - data = [(i, float(numpy.random.poisson(lam=lam) + 1.0)) + nnz = np.random.uniform(size=(dim,)) + data = [(i, float(np.random.poisson(lam=lam) + 1.0)) for i in xrange(dim) if nnz[i] < prob_nnz] return data
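Taken together, the HDP-side additions can be exercised roughly as follows; the toy corpus and dictionary below are placeholders, not part of the patch:

    from gensim.corpora import Dictionary
    from gensim.models import hdpmodel

    texts = [['human', 'interface', 'computer'], ['graph', 'trees'], ['graph', 'minors', 'trees']]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    hdp = hdpmodel.HdpModel(corpus, id2word=dictionary, random_state=0)  # random_state is new in this patch
    print(hdp.show_topic(topic_id=0, num_words=5))    # new single-topic view, (word, weight) pairs
    ldam = hdp.suggested_lda_model()                  # new: LdaModel with num_topics == hdp.m_T
    print(ldam.num_topics, ldam.alpha.shape)          # alpha and beta copied over via hdp_to_lda()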