-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathBushData-CleaningMethods.R
More file actions
56 lines (49 loc) · 1.5 KB
/
BushData-CleaningMethods.R
File metadata and controls
56 lines (49 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
library(stringr)
library(tm)
library(data.table)
library(lubridate)
library(parallel)
library(SnowballC)
library(topicmodels)
library(slam)
# Read one document file and return its date and cleaned-up text.
#
# Args:
#   title: Path of the file to read (handed to readLines()).
#
# Returns:
#   A one-row data.table with columns `Date` (line 7 of the file parsed by
#   mdy()) and `Content` (everything from line 12 onward joined with single
#   spaces, with `<...>` tags and `[[...]]` spans stripped out).
readFile <- function(title) {
  fileStuff <- readLines(title)
  date <- mdy(fileStuff[7])
  # Drop the 11 header lines with negative indexing instead of
  # 12:length(fileStuff): for a file shorter than 12 lines the colon form
  # counts downwards and would pull NA / header lines into the content.
  body <- paste(fileStuff[-seq_len(11)], collapse = " ")
  content <- gsub("<.*?>", "", body)
  content <- str_replace_all(content, "\\[\\[.+?\\]\\]", "")
  data.table(Date = date, Content = content)
}
# Build a document-term matrix from a vector of document texts.
#
# Args:
#   docs: Texts to index; wrapped as VCorpus(VectorSource(docs)).
#
# Returns:
#   A DocumentTermMatrix (documents in rows, terms in columns) built with the
#   control options below, minus every column whose name appears in the
#   file-level vector `stpwrds`.
genTdm <- function(docs) {
  ctrl <- list(
    tokenize = "MC",
    tolower = TRUE,
    # Was misspelled "removePunctation", an option name tm does not know, so
    # punctuation removal was never actually requested.
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE,
    stemming = wordStem,
    wordLengths = c(1, Inf)
  )
  dtm <- DocumentTermMatrix(VCorpus(VectorSource(docs)), control = ctrl)
  # Column names are stemmed terms; `stpwrds` is looked up in the global
  # environment at call time, so it must be defined before genTdm() is called.
  dtm[, setdiff(colnames(dtm), stpwrds)]
}
# Drop rarely used terms from a document-term matrix.
#
# Args:
#   dtm:     A sparse document-term matrix (slam simple_triplet_matrix) with
#            documents in rows and terms in columns.
#   minDocs: Document-frequency threshold. A term is kept only when it has a
#            positive count in strictly MORE than `minDocs` documents.
#            Defaults to 50, the value that used to be hard-coded.
#
# Returns:
#   `dtm` restricted to the columns that pass the threshold.
pruneWords <- function(dtm, minDocs = 50) {
  # Document frequency: for each column, the number of rows with a count > 0.
  docFreq <- colapply_simple_triplet_matrix(dtm, function(tf) {
    sum(tf > 0)
  })
  dtm[, docFreq > minDocs]
}
# Stop words read from "stopwords.txt" (path relative to the working
# directory) and stemmed with wordStem(); genTdm() drops matrix columns whose
# names appear in this vector.
stpwrds <- wordStem(scan("stopwords.txt", what = ""))
# Write a sparse document-term matrix to two text files.
#
# Files produced:
#   <title>_vocab.txt  one term per line, taken from dtmToWrite$dimnames$Terms
#   <title>.dat        one line per document (matrix row) of the form
#                      "N idx:count idx:count ... ", where N is the number of
#                      non-zero terms in the document and idx is the 0-based
#                      column index of the term
#   (presumably the input layout of an LDA-C style tool -- confirm with the
#   consumer of these files)
#
# Args:
#   dtmToWrite: A document-term matrix stored as a slam simple_triplet_matrix
#               (fields i, j, v, nrow, dimnames are read directly).
#               Assumes at most one stored entry per (document, term) cell.
#   title:      Path prefix for the two output files.
#
# Returns:
#   NULL, invisibly.
write.lda <- function(dtmToWrite, title = "write_lda_output") {
  vocab <- dtmToWrite$dimnames$Terms
  cat(file = paste0(title, "_vocab.txt"), vocab, sep = "\n")

  dtmFile <- file(paste0(title, ".dat"), "wt")
  # Close the connection even if something below fails.
  on.exit(close(dtmFile), add = TRUE)

  # Work on the stored triplets instead of densifying every row.
  # which() also discards NA entries, matching `which(row != 0)` on a dense row.
  keep <- which(dtmToWrite$v != 0)
  keep <- keep[order(dtmToWrite$i[keep], dtmToWrite$j[keep])]

  # Integer indices and fixed-notation counts: printing doubles through cat()
  # would turn values such as 100000 into "1e+05".
  entries <- sprintf(
    "%d:%s",
    as.integer(dtmToWrite$j[keep]) - 1L,
    format(dtmToWrite$v[keep], scientific = FALSE, trim = TRUE)
  )

  # Group by document; the explicit levels keep empty documents as "0 " lines.
  perDoc <- split(
    entries,
    factor(dtmToWrite$i[keep], levels = seq_len(dtmToWrite$nrow))
  )
  docLines <- vapply(
    perDoc,
    function(fields) {
      # The trailing "" reproduces the space that follows the last field.
      paste(c(length(fields), fields, ""), collapse = " ")
    },
    character(1),
    USE.NAMES = FALSE
  )
  writeLines(docLines, dtmFile)
  invisible(NULL)
}
# extractDate <-