-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathglobal.R
More file actions
56 lines (48 loc) · 1.63 KB
/
global.R
File metadata and controls
56 lines (48 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
library(tm)
library(wordcloud)
library(memoise)
library(stopwords)
library(pdftools)
library(qdapTools)
library(tools)
library(qdapTools)
library(rvest)
# Build a sorted term-frequency vector from a single document file.
#
# The reader is selected from the file extension, compared case-insensitively:
# "pdf", "doc", "docx", "html" or "txt". The text is then lower-cased, stripped
# of punctuation, numbers and stop words (the "SMART" list plus the "pl" list
# from the "stopwords-iso" source), and runs of whitespace are collapsed before
# the terms are counted.
#
# The function is wrapped in memoise(), so results are cached per value of
# `files`.
#
# Args:
#   files: path to one input file (length-one, non-NA).
#
# Returns:
#   A named numeric vector of term counts sorted in decreasing order, or NULL
#   (after printing "Wrong file type") when the extension is not supported.
#
# Raises:
#   An error when `files` is not a single non-missing value.
getTermMatrix <- memoise(function(files) {
  # Fail with a clear message instead of an obscure condition-length error.
  if (length(files) != 1L || is.na(files)) {
    stop("`files` must be a single, non-missing file path", call. = FALSE)
  }

  # Lower-case the extension so that e.g. "report.PDF" is accepted as well.
  type <- tolower(file_ext(files))

  text <- switch(
    type,
    pdf = {
      # PDF reader configured with the text option "-layout".
      pdf_reader <- readPDF(control = list(text = "-layout"))
      Corpus(URISource(files), readerControl = list(reader = pdf_reader))
    },
    doc = Corpus(URISource(files), readerControl = list(reader = readDOC)),
    docx = {
      docx_text <- paste(qdapTools::read_docx(files), collapse = "\n")
      VCorpus(VectorSource(docx_text))
    },
    html = {
      # No explicit encoding is passed (default encoding: UTF-8/UTF-16).
      html_text_content <- html_text(read_html(files, options = "NOBLANKS"))
      Corpus(VectorSource(html_text_content))
    },
    txt = {
      # readLines() takes the text verbatim. A table parser (read.delim) would
      # treat double quotes as field delimiters, type-convert the values and
      # fail on an empty file.
      txt_lines <- readLines(files, warn = FALSE)
      VCorpus(VectorSource(paste(txt_lines, collapse = "\n")))
    },
    NULL
  )

  if (is.null(text)) {
    print("Wrong file type")
    return(NULL)
  }

  stop_words <- c(
    stopwords("SMART"),
    stopwords("pl", source = "stopwords-iso")
  )

  corpus <- tm_map(text, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stop_words)
  corpus <- tm_map(corpus, stripWhitespace)

  # `wordLengths` is the control option tm recognises for term length; the
  # former `minWordLength = 1` was not applied, so the default minimum of three
  # characters was used and shorter terms were dropped.
  term_matrix <- TermDocumentMatrix(
    corpus,
    control = list(wordLengths = c(1, Inf))
  )

  sort(rowSums(as.matrix(term_matrix)), decreasing = TRUE)
})