diff --git a/examples/ctm_network.py b/examples/ctm_network.py
new file mode 100644
index 0000000..8acfec3
--- /dev/null
+++ b/examples/ctm_network.py
@@ -0,0 +1,68 @@
+'''
+This example shows how to train a Correlated Topic Model using tomotopy
+and visualize the correlations between topics.
+
+
+Required Packages:
+    nltk, sklearn, pyvis
+'''
+
+import tomotopy as tp
+import nltk
+from nltk.corpus import stopwords
+import re
+from sklearn.datasets import fetch_20newsgroups
+from pyvis.network import Network
+
+try:
+    # load the preprocessed corpus if it already exists
+    corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
+except IOError:
+    porter_stemmer = nltk.PorterStemmer().stem
+    english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
+    pat = re.compile('^[a-z]{2,}$')
+    corpus = tp.utils.Corpus(
+        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
+        stopwords=lambda x: x in english_stops or not pat.match(x)
+    )
+    newsgroups_train = fetch_20newsgroups()
+    corpus.process(d.lower() for d in newsgroups_train.data)
+    # save the preprocessed corpus for reuse
+    corpus.save('preprocessed_20news.cps')
+
+mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=40, k=30, corpus=corpus)
+mdl.train(0)
+
+# Since we have more than ten thousand documents,
+# setting `num_beta_sample` to a smaller value will not hurt the accuracy of the result.
+mdl.num_beta_sample = 5
+print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
+    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
+))
+print('Removed Top words: ', *mdl.removed_top_words)
+
+# Let's train the model
+for i in range(0, 1000, 20):
+    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
+    mdl.train(20)
+print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))
+
+# Let's visualize the result
+g = Network(width=800, height=800, font_color="#333")
+correl = mdl.get_correlations().reshape([-1])
+correl.sort()
+# threshold for the top 10% of pairwise correlations;
+# the largest mdl.k entries are the self-correlations (1.0) and are skipped
+top_tenth = mdl.k * (mdl.k - 1) // 10
+top_tenth = correl[-mdl.k - top_tenth]
+
+for k in range(mdl.k):
+    label = "#{}".format(k)
+    title = ' '.join(word for word, _ in mdl.get_topic_words(k, top_n=6))
+    print('Topic', label, title)
+    g.add_node(k, label=label, title=title, shape='ellipse')
+    # draw an edge to every earlier topic whose correlation exceeds the threshold
+    for l, correlation in zip(range(k), mdl.get_correlations(k)):
+        if correlation < top_tenth: continue
+        g.add_edge(k, l, value=float(correlation), title='{:.02}'.format(correlation))
+
+g.barnes_hut(gravity=-1000, spring_length=20)
+g.show_buttons()
+g.show("topic_network.html")
diff --git a/examples/gdmr_plot.py b/examples/gdmr_plot.py
new file mode 100644
index 0000000..57e1594
--- /dev/null
+++ b/examples/gdmr_plot.py
@@ -0,0 +1,105 @@
+'''
+This example shows how to train a g-DMR topic model using tomotopy
+and visualize a topic distribution map.
+
+Required Packages:
+    matplotlib
+'''
+
+import tomotopy as tp
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.colors as clr
+
+# A Normalize subclass that rescales values through 1 - exp(-2 * x / scale),
+# so small differences in topic density near zero remain visible in the plot.
+class ExpNormalize(clr.Normalize):
+    def __init__(self, scale):
+        super().__init__()
+        self.scale = scale
+
+    def __call__(self, value, clip=None):
+        if clip is None:
+            clip = self.clip
+
+        result, is_scalar = self.process_value(value)
+
+        self.autoscale_None(result)
+        (vmin,), _ = self.process_value(self.vmin)
+        (vmax,), _ = self.process_value(self.vmax)
+        if vmin == vmax:
+            result.fill(0)
+        elif vmin > vmax:
+            raise ValueError("minvalue must be less than or equal to maxvalue")
+        else:
+            if clip:
+                mask = np.ma.getmask(result)
+                result = np.ma.array(np.clip(result.filled(vmax), vmin, vmax),
+                                     mask=mask)
+            resdat = result.data
+            resdat = 1 - np.exp(-2 * resdat / self.scale)
+            result = np.ma.array(resdat, mask=result.mask, copy=False)
+        if is_scalar:
+            result = result[0]
+        return result
+
+heat = clr.LinearSegmentedColormap.from_list('heat',
+    [(0, 0, 0), (0, 0, 1), (0, 1, 1), (0, 1, 0), (1, 1, 0), (1, 0, 0), (1, 1, 1)],
+    N=1024
+)
+
+'''
+You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data .
+'''
+
+corpus = tp.utils.Corpus()
+for line in open('examples/dataset2.txt', encoding='utf-8'):
+    fd = line.strip().split()
+    corpus.add_doc(fd[2:], metadata=list(map(float, fd[:2])))
+
+# We set the range of the first metadata variable to [2000, 2017]
+# and that of the second one to [0, 1].
+mdl = tp.GDMRModel(tw=tp.TermWeight.PMI, k=30, degrees=[4, 3],
+    alpha=1e-2, sigma=0.25, sigma0=3.0,
+    metadata_range=[(2000, 2017), (0, 1)], corpus=corpus
+)
+mdl.optim_interval = 20
+mdl.burn_in = 200
+
+mdl.train(0)
+
+print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
+    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
+))
+
+# Let's train the model
+for i in range(0, 1000, 20):
+    print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
+    mdl.train(20)
+print('Iteration: {:04} LL per word: {:.4}'.format(1000, mdl.ll_per_word))
+
+# Let's visualize the result
+topic_counts = mdl.get_count_by_topics()
+lambdas = mdl.lambdas
+
+md_range = mdl.metadata_range
+# Our topic distribution map has 400 pixels along the first metadata axis
+# and 200 pixels along the second.
+r = mdl.tdf_linspace(
+    [md_range[0][0], md_range[1][0]],
+    [md_range[0][1], md_range[1][1]],
+    [400, 200]
+)
+
+for k in (-topic_counts).argsort():
+    print('Topic #{} ({})'.format(k, topic_counts[k]))
+    print(*(w for w, _ in mdl.get_topic_words(k)))
+    print('Lambda:', lambdas[k])
+
+    imgplot = plt.imshow(r[:, :, k].transpose(), clim=(0.0, r[:, :, k].max()),
+        origin='lower', cmap=heat, norm=ExpNormalize(scale=0.04),
+        extent=[*md_range[0], *md_range[1]],
+        aspect='auto'
+    )
+    plt.title('#{}\n({})'.format(k, ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=5))))
+    plt.colorbar()
+    plt.show()
diff --git a/examples/lda_visualization.py b/examples/lda_visualization.py
new file mode 100644
index 0000000..17b94a2
--- /dev/null
+++ b/examples/lda_visualization.py
@@ -0,0 +1,61 @@
+'''
+This example shows how to train a Latent Dirichlet Allocation model using tomotopy
+and visualize the result with pyLDAvis.
+
+
+Required Packages:
+    nltk, sklearn, pyldavis
+'''
+
+import tomotopy as tp
+import nltk
+from nltk.corpus import stopwords
+import re
+from sklearn.datasets import fetch_20newsgroups
+import numpy as np
+import pyLDAvis
+
+try:
+    # load the preprocessed corpus if it already exists
+    corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
+except IOError:
+    porter_stemmer = nltk.PorterStemmer().stem
+    english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
+    pat = re.compile('^[a-z]{2,}$')
+    corpus = tp.utils.Corpus(
+        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
+        stopwords=lambda x: x in english_stops or not pat.match(x)
+    )
+    newsgroups_train = fetch_20newsgroups()
+    corpus.process(d.lower() for d in newsgroups_train.data)
+    # save the preprocessed corpus for reuse
+    corpus.save('preprocessed_20news.cps')
+
+mdl = tp.LDAModel(min_df=5, rm_top=40, k=30, corpus=corpus)
+mdl.train(0)
+
+print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
+    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
+))
+print('Removed Top words: ', *mdl.removed_top_words)
+
+# Let's train the model
+for i in range(0, 1000, 20):
+    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
+    mdl.train(20)
+print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))
+
+# extract the statistics that pyLDAvis needs from the trained model
+topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
+doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
+doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
+vocab = list(mdl.used_vocabs)
+term_frequency = mdl.used_vocab_freq
+
+# note: pyLDAvis re-orders topics by size by default;
+# pass sort_topics=False to keep tomotopy's topic indices.
+prepared_data = pyLDAvis.prepare(
+    topic_term_dists,
+    doc_topic_dists,
+    doc_lengths,
+    vocab,
+    term_frequency
+)
+pyLDAvis.save_html(prepared_data, 'ldavis.html')
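
Each of these examples spends most of its runtime in the training loop. Below is a minimal sketch, not part of the patch above, of how a trained tomotopy model could be cached and reloaded with the library's save/load API, in the same spirit as the `preprocessed_20news.cps` corpus caching. The file name `trained_lda.bin` is purely illustrative, and `corpus` is assumed to be the preprocessed corpus built in lda_visualization.py.

    import os
    import tomotopy as tp

    model_path = 'trained_lda.bin'  # illustrative file name
    if os.path.exists(model_path):
        # reuse the model trained on a previous run
        mdl = tp.LDAModel.load(model_path)
    else:
        # `corpus` is assumed to be built as in lda_visualization.py above
        mdl = tp.LDAModel(min_df=5, rm_top=40, k=30, corpus=corpus)
        mdl.train(1000)
        mdl.save(model_path)  # cache the trained model for later runs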