-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlda.R
More file actions
136 lines (101 loc) · 4.36 KB
/
lda.R
File metadata and controls
136 lines (101 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
library(dplyr)
library(tidytext)
library(topicmodels)
library(ggplot2)
library(tidyr)
library(SnowballC)
library(reshape2)
#harmonicMean <- function(logLikelihoods, precision = 2000L) {
# llMed <- median(logLikelihoods)
# as.double(llMed - log(mean(exp(-logLikelihoods + llMed))))
#}
ntopics <- 5
CV.questions <-
# read.csv("data/questions_with_body.csv.orig",
read.csv("data/questions_with_body.csv",
header=TRUE, sep=",")
#CV.questions <- transform(CV.questions, body = gsub("\\n"," ",body))
#CV.questions <-
# transform(CV.questions,
# body = gsub("<span class=\"math-container\">.*?</span>","",body))
#CV.questions <- transform(CV.questions, qbody = gsub("<.*?>","",body))
#tidy.words <- (select(CV.questions,c("question_id","title"))
# %>% transform(title=as.character(title))
# %>% unnest_tokens(word,title))
tidy.words <- (select(CV.questions,c("question_id","body")))
rm(CV.questions)
gc()
tidy.words <- transform(tidy.words, body=as.character(body))
tidy.words <- transform(tidy.words, body= gsub("&.*;", " ", body))
tidy.words <- (transform(tidy.words, body= gsub("[^[:alpha:][:space:]]+", " ", body)) %>%
unnest_tokens(word,body))
tidy.words <- (tidy.words %>% anti_join(stop_words))
tidy.words <- transform(tidy.words, word=wordStem(word, language="english"))
tidy.words.1 <- full_join(count(tidy.words,question_id),tidy.words) %>% subset(.,n>=0) %>% select(c("question_id","word"))
#tidy.words.2 <- full_join(count(tidy.words,question_id),tidy.words) %>% subset(.,n<50) %>% select(c("question_id","word"))
write.csv(tidy.words,"data/tidy_words.csv")
nq <- tidy.words.1["question_id"] %>% unique() %>% count()
q_max <- as.numeric(nq*1.1)
q_min <- 20
unique.question.words <- tidy.words.1[c("question_id","word")] %>% unique() %>% count(.,word)
my.stop.words <- subset(unique.question.words,(n < q_min | n > q_max))
tidy.words.1 <- anti_join(tidy.words.1, my.stop.words, by="word")
word.counts.1 <- (tidy.words.1 %>% count(.,question_id,word,sort=TRUE))
#word.counts.2 <- (tidy.words.2 %>% count(.,question_id,word,sort=TRUE))
rm(tidy.words)
gc()
ungroup(word.counts.1)
#ungroup(word.counts.2)
questions.dtm.1 <- word.counts.1 %>% cast_dtm(question_id,word,n)
#questions.dtm.2 <- word.counts.2 %>% cast_dtm(question_id,word,n)
rm(word.counts.1)
#rm(word.counts.2)
gc()
#burnin <- 1000
#iter <- 1000
#keep <- 50
questions.lda.1 <- LDA(questions.dtm.1,
k=ntopics,
method="Gibbs",
control = list(seed=1234,
verbose=10))
#
# burnin=burnin,
# iter=iter,
# keep=keep))
#questions.lda.2 <- posterior(questions.lda.1, questions.dtm.2)
questions.topics <- tidy(questions.lda.1, matrix="beta")
csv.file.name <-
sprintf("data/terms_for_%d_topics.csv", ntopics)
write.csv(questions.topics,csv.file.name)
#documents.topics.2 <- melt(questions.lda.2[[2]])
#names(documents.topics.2) <- c("document","topic","gamma")
documents.topics.1 <- tidy(questions.lda.1, matrix="gamma")
#documents.topics.all <- rbind(documents.topics.1,documents.topics.2)
documents.topics.all <- documents.topics.1
csv.file.name <-
sprintf("data/question_topics_for_%d_topics.csv", ntopics)
write.csv(documents.topics.all,csv.file.name)
questions.top.terms <- (questions.topics
%>% group_by(topic)
%>% top_n(10,beta)
%>% ungroup()
%>% arrange(topic,-beta))
img.height <- 3 * ( ((ntopics-1) %/% 3) + 1 )
img.file.name <-
sprintf("images/terms_for_%d_topics.svg", ntopics)
svg(img.file.name, width=7, height=img.height)
top.terms.plot <- (questions.top.terms
%>% mutate(term=reorder(term,beta))
%>% ggplot(aes(term,beta,fill = factor(topic))) +
geom_col(show.legend=FALSE) +
facet_wrap(~ topic,scales="free") +
coord_flip()) +
labs(x="όρος",y="β" )
print(top.terms.plot)
dev.off()
#beta.spread <- (questions.topics
# %>% mutate(topic=paste0("topic",topic ))
# %>% spread(topic,beta)
# %>% filter(topic1> .001 | topic2 > .001)
# %>% mutate(log_ratio=log2(topic2/topic1)))