# get_topics.R
# Topic modelling on Alexander Stubb's tweets
# auth: [email protected]
# 01.02.2016
source("utility/functions.R")
access_libraries()
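# NOTE (assumption): access_libraries() in utility/functions.R is presumed to attach
# the packages this script relies on (slam for row_sums/col_sums, topicmodels for
# LDA/CTM/posterior/terms/topics, tm for nDocs and the document-term matrices).
# If it does not, the equivalent would be:
# library(tm); library(topicmodels); library(slam)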
# load data
tweeter <- "alexstubb"
filepath <- paste0("data/",tweeter,"_tweets.Rda")
tw <- get(load(filepath))
tweeter_languages <- unique(tw$lang)
# prepare data for topic modelling
# separate by language
TW <- lapply(tweeter_languages, function(l) tw[tw$lang==l,])
names(TW) <- tweeter_languages
# get document term matrix by language
DTM <- lapply(TW, function(tw) get_documentTermMatrix(tw$tweet))
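# get_documentTermMatrix() is assumed to be a helper defined in utility/functions.R
# that tokenizes the tweet text and returns a tm-style DocumentTermMatrix.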
# save(file=paste0(tweeter,"_DTM.Rda"),DTM)
# tf-idf per term: mean within-document term frequency times
# log2(N / document frequency); low tf-idf terms (the most used,
# least informative words) are removed below
TFIDF <- lapply(DTM, function(dtm) {
  tapply(dtm$v / row_sums(dtm)[dtm$i], dtm$j, mean) *
    log2(nDocs(dtm) / col_sums(dtm > 0))
})
# term frequencies before cuts
lapply(DTM, function(dtm) summary(col_sums(dtm)))
# use the 0.1 quantile of tf-idf as the cutoff
DTM2 <- mapply(function(dtm, tfidf) {
  cutoff <- quantile(tfidf, 0.1)
  dtm <- dtm[, tfidf >= cutoff]
  dtm <- dtm[, col_sums(dtm) > 1]
  dtm[row_sums(dtm) > 0, ]
}, DTM, TFIDF, SIMPLIFY = FALSE)
names(DTM2) <- tweeter_languages
# term frequencies after cuts
lapply(DTM2, function(dtm) summary(col_sums(dtm)))
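# After the cuts, each DTM has lost its lowest tf-idf decile, terms occurring only
# once, and any tweets left without terms. A quick check of the remaining
# dimensions, if desired:
# lapply(DTM2, dim)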
# fit topic models using three methods (separately for each language)
k <- 4 # number of topics
TM <- lapply(DTM2, function(dtm) {
  list(
    VEM   = LDA(dtm, k = k),
    Gibbs = LDA(dtm, k = k, method = "Gibbs",
                control = list(burnin = 1000, thin = 10, iter = 10000)),
    CTM   = CTM(dtm, k = k)
  )
})
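# Of the three fits: VEM is LDA estimated with variational EM (the topicmodels
# default), Gibbs is LDA estimated with collapsed Gibbs sampling, and CTM is the
# correlated topic model, which allows topic proportions to be correlated.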
# for each language, use the model with lowest entropy
ENTR <- lapply(TM, function(tm) {
  sapply(tm, function(x) {
    mean(apply(posterior(x)$topics, 1,
               function(z) -sum(z * log(z))))
  })
})
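# Rationale: the entropy of a tweet's posterior topic distribution measures how
# spread out it is; the model with the lowest mean entropy gives the most decisive
# (peaked) per-tweet topic assignments.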
model_choice <- sapply(ENTR, function(e) names(which.min(e))) # first minimum wins on ties
model_choice
# keep only the models with lowest entropy
MODELS <- mapply(function(tm, choice) tm[[choice]], TM, model_choice, SIMPLIFY = FALSE)
names(MODELS) <- tweeter_languages
# most likely topic per tweet
TOPICS <- lapply(MODELS, topics)
lapply(TOPICS, table)
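# topics() from topicmodels returns the single most likely topic for each document
# in the fitted model; table() then counts how many tweets fall under each topic.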
# label the tweets by topic and add the posterior probability of the most likely topic
for(i in 1:length(TW)) {
  tpcs <- TOPICS[[i]]
  havelabels <- as.numeric(names(tpcs))
  TW[[i]]$topic <- NA
  TW[[i]][havelabels,]$topic <- tpcs
  posteriors <- posterior(MODELS[[i]])$topics
  TW[[i]]["P_max"] <- NA
  TW[[i]][havelabels,]["P_max"] <- apply(posteriors, 1, max)
  for(j in 1:k) {
    colname <- paste0("P_topic", j)
    TW[[i]][colname] <- NA
    TW[[i]][havelabels,][colname] <- posteriors[, j]
  }
}
# each labelled tweet now carries a topic distribution and a most likely topic; tweets dropped during DTM filtering keep NA
head(TW[["finnish"]])
# Some exploration
# ----------------
# get the most probable terms for each topic
for(i in 1:length(MODELS)) {
  trms <- terms(MODELS[[i]], 40)
  write.csv(file = paste0("data/", tweeter, "_top_terms_by_topics_",
                          names(MODELS[i]), ".csv"),
            row.names = F,
            trms)
}
# get 10 'most representative' tweets from each topic
for(lang in tweeter_languages) {
  tw <- TW[[lang]]
  tw <- tw[!is.na(tw$topic),]
  path <- paste0("data/", tweeter, "_topicsample_", lang, ".csv")
  write(file = path, paste("Top 10", lang, "tweets by topic \n"))
  write(file = path, "Topic distributions: \n", append = T)
  suppressWarnings(
    write.table(file = path, table(TOPICS[[lang]]),
                row.names = F, col.names = c("Topic", "Freq"), append = T))
  for(t in sort(unique(tw$topic))) {
    tw_sub <- tw[tw$topic == t,]
    tw_sub <- tw_sub[with(tw_sub, order(P_max, decreasing = T)),]
    s <- tw_sub$text[1:min(10, nrow(tw_sub))]
    write(file = path, paste("\n ### TOPIC", t, "### \n"), append = T)
    j <- 1
    for(tweet in s) {
      max_post <- round(tw_sub$P_max[j], 2)
      write(file = path, paste0("P(T", t, ")=", max_post, " ", tweet, " \n "), append = T)
      j <- j + 1
    }
  }
}
# save all data frames as a list
save(file=paste0("data/",tweeter,"_topicdata.Rda"),TW)
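# To reload the annotated data later, mirroring the load above (hypothetical usage):
# TW <- get(load(paste0("data/", tweeter, "_topicdata.Rda")))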