
Commit

add examples
bab2min committed Jun 9, 2020
1 parent 8461e70 commit 400f060
Showing 3 changed files with 234 additions and 0 deletions.
68 changes: 68 additions & 0 deletions examples/ctm_network.py
@@ -0,0 +1,68 @@
'''
This example shows how to train a Correlated Topic Model (CTM) using tomotopy
and visualize the correlations between topics.

Required Packages:
    nltk, sklearn, pyvis
'''

import tomotopy as tp
import nltk
from nltk.corpus import stopwords
import re
from sklearn.datasets import fetch_20newsgroups
from pyvis.network import Network

try:
    # load if preprocessed corpus exists
    corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
except IOError:
    porter_stemmer = nltk.PorterStemmer().stem
    english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
    pat = re.compile('^[a-z]{2,}$')
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
        stopwords=lambda x: x in english_stops or not pat.match(x)
    )
    newsgroups_train = fetch_20newsgroups()
    corpus.process(d.lower() for d in newsgroups_train.data)
    # save preprocessed corpus for reuse
    corpus.save('preprocessed_20news.cps')

mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=40, k=30, corpus=corpus)
mdl.train(0)

# Since the corpus contains more than ten thousand documents,
# setting `num_beta_sample` to a small value will not make the result inaccurate.
mdl.num_beta_sample = 5
print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))
print('Removed Top words: ', *mdl.removed_top_words)

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))

# Let's visualize the result
g = Network(width=800, height=800, font_color="#333")
correl = mdl.get_correlations().reshape([-1])
correl.sort()
# Choose a threshold so that roughly the top tenth of inter-topic correlations
# are drawn as edges; the largest mdl.k entries are the self-correlations on
# the diagonal, so they are skipped.
top_tenth = mdl.k * (mdl.k - 1) // 10
top_tenth = correl[-mdl.k - top_tenth]

for k in range(mdl.k):
    label = "#{}".format(k)
    title = ' '.join(word for word, _ in mdl.get_topic_words(k, top_n=6))
    print('Topic', label, title)
    g.add_node(k, label=label, title=title, shape='ellipse')
    for l, correlation in zip(range(k - 1), mdl.get_correlations(k)):
        if correlation < top_tenth: continue
        g.add_edge(k, l, value=float(correlation), title='{:.02}'.format(correlation))

g.barnes_hut(gravity=-1000, spring_length=20)
g.show_buttons()
g.show("topic_network.html")
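As a usage sketch appended to the end of this script (not part of the committed file), the trained CTM could be saved and reused later for inference on unseen, preprocessed text. The file name 'ctm_20news.bin' and the sample tokens are placeholders; the save/load/infer pattern follows the generic tomotopy API rather than anything specific to this example.

# --- usage sketch (illustrative, not in this commit) ---
# Persist the trained model and reuse it later for inference.
mdl.save('ctm_20news.bin')  # placeholder file name

loaded = tp.CTModel.load('ctm_20news.bin')
# unseen tokens must be preprocessed like the training corpus (lower-cased, stemmed)
doc = loaded.make_doc(['comput', 'graphic', 'imag', 'file'])  # placeholder tokens
topic_dist, ll = loaded.infer(doc)
print('Inferred topic distribution:', topic_dist)
print('Log-likelihood of inference:', ll)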
105 changes: 105 additions & 0 deletions examples/gdmr_plot.py
@@ -0,0 +1,105 @@
'''
This example shows how to train a g-DMR topic model using tomotopy
and visualize a topic distribution map.

Required Packages:
    matplotlib
'''

import tomotopy as tp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as clr

class ExpNormalize(clr.Normalize):
    def __init__(self, scale):
        super().__init__()
        self.scale = scale

    def __call__(self, value, clip=None):
        if clip is None:
            clip = self.clip

        result, is_scalar = self.process_value(value)

        self.autoscale_None(result)
        (vmin,), _ = self.process_value(self.vmin)
        (vmax,), _ = self.process_value(self.vmax)
        if vmin == vmax:
            result.fill(0)
        elif vmin > vmax:
            raise ValueError("minvalue must be less than or equal to maxvalue")
        else:
            if clip:
                mask = np.ma.getmask(result)
                result = np.ma.array(np.clip(result.filled(vmax), vmin, vmax),
                                     mask=mask)
            resdat = result.data
            resdat = 1 - np.exp(-2 * resdat / self.scale)
            result = np.ma.array(resdat, mask=result.mask, copy=False)
        if is_scalar:
            result = result[0]
        return result

heat = clr.LinearSegmentedColormap.from_list('heat',
[(0, 0, 0), (0, 0, 1), (0, 1, 1), (0, 1, 0), (1, 1, 0), (1, 0, 0), (1, 1, 1)],
N=1024
)

'''
You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data .
'''

corpus = tp.utils.Corpus()
for line in open('examples/dataset2.txt', encoding='utf-8'):
    fd = line.strip().split()
    corpus.add_doc(fd[2:], metadata=list(map(float, fd[:2])))

# We set the range of the first metadata variable to [2000, 2017]
# and that of the second to [0, 1].
mdl = tp.GDMRModel(tw=tp.TermWeight.PMI, k=30, degrees=[4, 3],
    alpha=1e-2, sigma=0.25, sigma0=3.0,
    metadata_range=[(2000, 2017), (0, 1)], corpus=corpus
)
mdl.optim_interval = 20
mdl.burn_in = 200

mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04} LL per word: {:.4}'.format(1000, mdl.ll_per_word))

# Let's visualize the result
topic_counts = mdl.get_count_by_topics()
lambdas = mdl.lambdas

md_range = mdl.metadata_range
# Our topic distribution map has
# 400 pixels for the first axis and
# 200 pixels for the second axis.
r = mdl.tdf_linspace(
[md_range[0][0], md_range[1][0]],
[md_range[0][1], md_range[1][1]],
[400, 200]
)

for k in (-topic_counts).argsort():
    print('Topic #{} ({})'.format(k, topic_counts[k]))
    print(*(w for w, _ in mdl.get_topic_words(k)))
    print('Lambda:', lambdas[k])

    imgplot = plt.imshow(r[:, :, k].transpose(), clim=(0.0, r[:, :, k].max()),
        origin='lower', cmap=heat, norm=ExpNormalize(scale=0.04),
        extent=[*md_range[0], *md_range[1]],
        aspect='auto'
    )
    plt.title('#{}\n({})'.format(k, ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=5))))
    plt.colorbar()
    plt.show()
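As a usage sketch appended to the script above (not part of the committed file), the fitted topic distribution function can also be queried at a single metadata point. This assumes `GDMRModel.tdf`, the point-wise counterpart of the `tdf_linspace` call used above, is available in the installed tomotopy version; the coordinate below is a placeholder inside `metadata_range`.

# --- usage sketch (illustrative, not in this commit) ---
# Topic distribution at one (first-metadata, second-metadata) point; assumes GDMRModel.tdf exists.
point = [2010.0, 0.5]  # placeholder coordinate
tdist = np.array(mdl.tdf(point))
for k in (-tdist).argsort()[:5]:
    print('Topic #{} (weight {:.3f}):'.format(k, tdist[k]),
        *(w for w, _ in mdl.get_topic_words(k, top_n=5)))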
61 changes: 61 additions & 0 deletions examples/lda_visualization.py
@@ -0,0 +1,61 @@
'''
This example shows how to train a Latent Dirichlet Allocation (LDA) model using tomotopy
and visualize the result with pyLDAvis.

Required Packages:
    nltk, sklearn, pyLDAvis
'''

import tomotopy as tp
import nltk
from nltk.corpus import stopwords
import re
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pyLDAvis

try:
    # load if preprocessed corpus exists
    corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
except IOError:
    porter_stemmer = nltk.PorterStemmer().stem
    english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
    pat = re.compile('^[a-z]{2,}$')
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
        stopwords=lambda x: x in english_stops or not pat.match(x)
    )
    newsgroups_train = fetch_20newsgroups()
    corpus.process(d.lower() for d in newsgroups_train.data)
    # save preprocessed corpus for reuse
    corpus.save('preprocessed_20news.cps')

mdl = tp.LDAModel(min_df=5, rm_top=40, k=30, corpus=corpus)
mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))
print('Removed Top words: ', *mdl.removed_top_words)

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))

topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq

prepared_data = pyLDAvis.prepare(
topic_term_dists,
doc_topic_dists,
doc_lengths,
vocab,
term_frequency
)
pyLDAvis.save_html(prepared_data, 'ldavis.html')
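A small usage note appended to the script (not part of the committed file): when run inside a Jupyter notebook, the prepared data can be rendered inline instead of, or in addition to, writing 'ldavis.html'.

# --- usage sketch (illustrative, not in this commit) ---
# Inline rendering inside a Jupyter notebook.
pyLDAvis.enable_notebook()
pyLDAvis.display(prepared_data)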
