-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster_lda.py
29 lines (22 loc) · 955 Bytes
/
cluster_lda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from gensim import corpora, models, similarities
from nltk import WordPunctTokenizer
import re
# Number of latent topics requested from the model built below.
NUM_TOPICS = 40

# Read the stopword list (one entry per line) and close the file promptly
# instead of leaving the handle open until garbage collection.
with open('stopwords.txt') as stopword_file:
    stopwords = stopword_file.read().split('\n')

# Snapshot of ``stopwords`` used for O(1) membership tests in ``tokenize``.
# ``stopwords`` itself stays a list for any code that relies on that type.
_STOPWORD_SET = frozenset(stopwords)

# Raw string so that ``\s`` reaches the regex engine as written and is not
# treated as an (invalid) string escape sequence.
word_re = re.compile(r'[a-z0-9\s]+')
tokenizer = WordPunctTokenizer()


def tokenize(text):
    """Split *text* into lower-cased tokens, dropping stopwords.

    A token is kept when its lower-cased form matches ``word_re`` at its
    start (``match`` anchors only at the beginning) and is not one of the
    entries read from ``stopwords.txt``.

    Returns a list of lower-cased token strings in their original order.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        # Lower-case once and reuse the result for both checks.
        word = raw_token.lower()
        if word_re.match(word) and word not in _STOPWORD_SET:
            kept.append(word)
    return kept
# Load the persisted dictionary once; the original read the same
# 'dictionary.dict' file from disk twice to populate two names.
id2word = corpora.Dictionary.load('dictionary.dict')
# Second name for the same dictionary, kept because other code refers to it.
dic = id2word

# Corpus stored in Matrix Market format.
# NOTE(review): the file name suggests TF-IDF weights -- confirm that the
# vectors passed to the model at query time are weighted the same way.
mm = corpora.MmCorpus('tfidf.mm')

# LSI model built over the corpus with NUM_TOPICS topics.
lsi = models.lsimodel.LsiModel(corpus=mm, id2word=id2word, num_topics=NUM_TOPICS)
def get_topics(text, num, model=lsi):
    """Return up to *num* top-scoring topics of *model* for *text*.

    The text is tokenised with ``tokenize``, converted to a bag-of-words
    with ``dic.doc2bow`` and passed through ``model``. The resulting
    (topic id, score) pairs are ordered by descending score and the first
    *num* are kept.

    Args:
        text: The string to analyse.
        num: Maximum number of topics to return.
        model: Object supporting ``model[bow]`` and ``model.show_topic(id)``;
            defaults to the module-level ``lsi``.

    Returns:
        A list with one entry per selected topic. Each entry is a list of
        the second element of every item returned by
        ``model.show_topic(topic_id)``.
    """
    scored = sorted(model[dic.doc2bow(tokenize(text))],
                    key=lambda pair: pair[1],
                    reverse=True)
    # Query the model that produced the scores. The original always asked
    # the global ``lsi``, which ignored a caller-supplied ``model``.
    # NOTE(review): whether index 1 of a show_topic item is the word or its
    # weight depends on the installed gensim version -- confirm.
    return [[item[1] for item in model.show_topic(pair[0])]
            for pair in scored[:num]]