
Commit

add examples
bab2min committed Jun 9, 2020
1 parent 8461e70 commit 400f060
Showing 3 changed files with 234 additions and 0 deletions.
68 changes: 68 additions & 0 deletions examples/ctm_network.py
@@ -0,0 +1,68 @@
'''
This example shows how to train a Correlated Topic Model (CTM) using tomotopy
and visualize the correlations between topics.

Required Packages:
    nltk, sklearn, pyvis
'''

import tomotopy as tp
import nltk
from nltk.corpus import stopwords
import re
from sklearn.datasets import fetch_20newsgroups
from pyvis.network import Network

try:
    # load if preprocessed corpus exists
    corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
except IOError:
    porter_stemmer = nltk.PorterStemmer().stem
    english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
    pat = re.compile('^[a-z]{2,}$')
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
        stopwords=lambda x: x in english_stops or not pat.match(x)
    )
    newsgroups_train = fetch_20newsgroups()
    corpus.process(d.lower() for d in newsgroups_train.data)
    # save preprocessed corpus for reuse
    corpus.save('preprocessed_20news.cps')

mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=40, k=30, corpus=corpus)
mdl.train(0)

# Since the corpus contains more than ten thousand documents,
# setting `num_beta_sample` to a small value will not make the result inaccurate.
mdl.num_beta_sample = 5
print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))
print('Removed Top words: ', *mdl.removed_top_words)

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))

# Let's visualize the result
g = Network(width=800, height=800, font_color="#333")
correl = mdl.get_correlations().reshape([-1])
correl.sort()
# Choose a threshold so that roughly the top tenth of inter-topic correlations
# are drawn as edges; the largest mdl.k entries are the self-correlations on
# the diagonal, so they are skipped.
top_tenth = mdl.k * (mdl.k - 1) // 10
top_tenth = correl[-mdl.k - top_tenth]

for k in range(mdl.k):
    label = "#{}".format(k)
    title = ' '.join(word for word, _ in mdl.get_topic_words(k, top_n=6))
    print('Topic', label, title)
    g.add_node(k, label=label, title=title, shape='ellipse')
    for l, correlation in zip(range(k - 1), mdl.get_correlations(k)):
        if correlation < top_tenth: continue
        g.add_edge(k, l, value=float(correlation), title='{:.02}'.format(correlation))

g.barnes_hut(gravity=-1000, spring_length=20)
g.show_buttons()
g.show("topic_network.html")
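As a usage sketch appended to the end of this script (not part of the committed file), the trained CTM could be saved and reused later for inference on unseen, preprocessed text. The file name 'ctm_20news.bin' and the sample tokens are placeholders; the save/load/infer pattern follows the generic tomotopy API rather than anything specific to this example.

# --- usage sketch (illustrative, not in this commit) ---
# Persist the trained model and reuse it later for inference.
mdl.save('ctm_20news.bin')  # placeholder file name

loaded = tp.CTModel.load('ctm_20news.bin')
# unseen tokens must be preprocessed like the training corpus (lower-cased, stemmed)
doc = loaded.make_doc(['comput', 'graphic', 'imag', 'file'])  # placeholder tokens
topic_dist, ll = loaded.infer(doc)
print('Inferred topic distribution:', topic_dist)
print('Log-likelihood of inference:', ll)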
105 changes: 105 additions & 0 deletions examples/gdmr_plot.py
@@ -0,0 +1,105 @@
'''
This example shows how to train a g-DMR topic model using tomotopy
and visualize a topic distribution map.

Required Packages:
    matplotlib
'''

import tomotopy as tp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as clr

class ExpNormalize(clr.Normalize):
    def __init__(self, scale):
        super().__init__()
        self.scale = scale

    def __call__(self, value, clip=None):
        if clip is None:
            clip = self.clip

        result, is_scalar = self.process_value(value)

        self.autoscale_None(result)
        (vmin,), _ = self.process_value(self.vmin)
        (vmax,), _ = self.process_value(self.vmax)
        if vmin == vmax:
            result.fill(0)
        elif vmin > vmax:
            raise ValueError("minvalue must be less than or equal to maxvalue")
        else:
            if clip:
                mask = np.ma.getmask(result)
                result = np.ma.array(np.clip(result.filled(vmax), vmin, vmax),
                                     mask=mask)
            resdat = result.data
            resdat = 1 - np.exp(-2 * resdat / self.scale)
            result = np.ma.array(resdat, mask=result.mask, copy=False)
        if is_scalar:
            result = result[0]
        return result

heat = clr.LinearSegmentedColormap.from_list('heat',
[(0, 0, 0), (0, 0, 1), (0, 1, 1), (0, 1, 0), (1, 1, 0), (1, 0, 0), (1, 1, 1)],
N=1024
)

'''
You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data .
'''

corpus = tp.utils.Corpus()
for line in open('examples/dataset2.txt', encoding='utf-8'):
    fd = line.strip().split()
    corpus.add_doc(fd[2:], metadata=list(map(float, fd[:2])))

# We set the range of the first metadata variable to [2000, 2017]
# and that of the second to [0, 1].
mdl = tp.GDMRModel(tw=tp.TermWeight.PMI, k=30, degrees=[4, 3],
    alpha=1e-2, sigma=0.25, sigma0=3.0,
    metadata_range=[(2000, 2017), (0, 1)], corpus=corpus
)
mdl.optim_interval = 20
mdl.burn_in = 200

mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04} LL per word: {:.4}'.format(1000, mdl.ll_per_word))

# Let's visualize the result
topic_counts = mdl.get_count_by_topics()
lambdas = mdl.lambdas

md_range = mdl.metadata_range
# Our topic distribution map has
# 400 pixels for the first axis and
# 200 pixels for the second axis.
r = mdl.tdf_linspace(
[md_range[0][0], md_range[1][0]],
[md_range[0][1], md_range[1][1]],
[400, 200]
)

for k in (-topic_counts).argsort():
    print('Topic #{} ({})'.format(k, topic_counts[k]))
    print(*(w for w, _ in mdl.get_topic_words(k)))
    print('Lambda:', lambdas[k])

    imgplot = plt.imshow(r[:, :, k].transpose(), clim=(0.0, r[:, :, k].max()),
        origin='lower', cmap=heat, norm=ExpNormalize(scale=0.04),
        extent=[*md_range[0], *md_range[1]],
        aspect='auto'
    )
    plt.title('#{}\n({})'.format(k, ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=5))))
    plt.colorbar()
    plt.show()
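As a usage sketch appended to the script above (not part of the committed file), the fitted topic distribution function can also be queried at a single metadata point. This assumes `GDMRModel.tdf`, the point-wise counterpart of the `tdf_linspace` call used above, is available in the installed tomotopy version; the coordinate below is a placeholder inside `metadata_range`.

# --- usage sketch (illustrative, not in this commit) ---
# Topic distribution at one (first-metadata, second-metadata) point; assumes GDMRModel.tdf exists.
point = [2010.0, 0.5]  # placeholder coordinate
tdist = np.array(mdl.tdf(point))
for k in (-tdist).argsort()[:5]:
    print('Topic #{} (weight {:.3f}):'.format(k, tdist[k]),
        *(w for w, _ in mdl.get_topic_words(k, top_n=5)))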
61 changes: 61 additions & 0 deletions examples/lda_visualization.py
@@ -0,0 +1,61 @@
'''
This example shows how to train a Latent Dirichlet Allocation (LDA) model using tomotopy
and visualize the result with pyLDAvis.

Required Packages:
    nltk, sklearn, pyLDAvis
'''

import tomotopy as tp
import nltk
from nltk.corpus import stopwords
import re
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pyLDAvis

try:
    # load if preprocessed corpus exists
    corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
except IOError:
    porter_stemmer = nltk.PorterStemmer().stem
    english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
    pat = re.compile('^[a-z]{2,}$')
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
        stopwords=lambda x: x in english_stops or not pat.match(x)
    )
    newsgroups_train = fetch_20newsgroups()
    corpus.process(d.lower() for d in newsgroups_train.data)
    # save preprocessed corpus for reuse
    corpus.save('preprocessed_20news.cps')

mdl = tp.LDAModel(min_df=5, rm_top=40, k=30, corpus=corpus)
mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))
print('Removed Top words: ', *mdl.removed_top_words)

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))

topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq

prepared_data = pyLDAvis.prepare(
topic_term_dists,
doc_topic_dists,
doc_lengths,
vocab,
term_frequency
)
pyLDAvis.save_html(prepared_data, 'ldavis.html')
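A small usage note appended to the script (not part of the committed file): when run inside a Jupyter notebook, the prepared data can be rendered inline instead of, or in addition to, writing 'ldavis.html'.

# --- usage sketch (illustrative, not in this commit) ---
# Inline rendering inside a Jupyter notebook.
pyLDAvis.enable_notebook()
pyLDAvis.display(prepared_data)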
