-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpReader.R
More file actions
112 lines (90 loc) · 3.47 KB
/
pReader.R
File metadata and controls
112 lines (90 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
library(pdftools)
library(dplyr)
library(tidytext)
library(tm)
library("stringr")
# Read every PDF under data/<class>/ into one data frame `doc` with one row
# per file and the columns `file`, `text` and `class`.
# Each entry directly inside "data" is used as a class label.
classes <- data.frame(
  class = list.files("data", recursive = FALSE),
  stringsAsFactors = FALSE
)
# Collect one data frame per class and combine them once after the loop,
# instead of growing `doc` with rbind() for every single file.
doc_parts <- vector("list", nrow(classes))
for (k in seq_along(classes$class)) {
  myClass <- classes$class[k]
  files <- list.files(paste0("data/", myClass), recursive = FALSE)
  # file.path() returns character(0) for an empty directory, whereas paste0()
  # would build the bogus path "data/<class>/" and hand it to pdf_text().
  # lapply() always yields one list element per file; sapply() collapses the
  # result into a matrix when every PDF has the same number of pages, which
  # broke the per-file indexing.
  docs <- lapply(file.path("data", myClass, files), pdf_text)
  doc_parts[[k]] <- data.frame(
    file = files,
    # toString() joins the elements returned for one PDF with ", ".
    text = vapply(docs, toString, character(1)),
    class = rep(myClass, length(files)),
    stringsAsFactors = FALSE
  )
}
# The leading empty data frame keeps `doc` a data frame even if nothing was
# read at all.
doc <- do.call(rbind, c(list(data.frame(stringsAsFactors = FALSE)), doc_parts))
# Build `myDataset` (columns file, text, class) from `doc`, with the text
# cleaned for tokenisation.
datasetTemp <- data.frame(doc, stringsAsFactors = FALSE)
# Force plain character columns (older R versions create factors here).
datasetTemp$text <- as.character(datasetTemp$text)
datasetTemp$file <- as.character(datasetTemp$file)
datasetTemp$class <- as.character(datasetTemp$class)
head(datasetTemp)

# Cleaning pipeline: strip tag-like "<...>" spans, lower-case, then remove
# punctuation, digits and English stop words.
# NOTE(review): the previous version ran these steps per document but threw
# the result away (the number removal was overwritten by the next statement
# and the final `myDataset <- datasetTemp` discarded the whole loop output),
# so the raw text was what got tokenised. The cleaned text is now actually
# used; confirm that this is the intended behaviour.
# The tm transformations are applied to the character vector directly, so
# all documents are processed in one vectorised pass instead of building a
# one-document corpus per row.
cleaned_text <- gsub("<.*?>", "", datasetTemp$text)
cleaned_text <- tolower(cleaned_text)
cleaned_text <- removePunctuation(cleaned_text)
cleaned_text <- removeNumbers(cleaned_text)
cleaned_text <- removeWords(cleaned_text, stopwords("en"))

myDataset <- datasetTemp
myDataset$text <- cleaned_text
# Tokenise each document into lower-cased words and count how often every
# word occurs per file; the result is a plain data frame sorted by file with
# the columns file, word and f (occurrences of the word in that file).
book_words <- as.data.frame(
  ungroup(
    count(
      unnest_tokens(myDataset, word, text, to_lower = TRUE),
      file, word,
      sort = TRUE
    )
  )
)
book_words <- setNames(
  book_words[order(book_words$file), ],
  c("file", "word", "f")
)
#' Count the elements of `x`.
#'
#' Used as the aggregation function when counting in how many rows a word
#' appears.
#'
#' @param x A vector or list (may be empty or NULL).
#' @return The number of elements of `x` as a double; 0 for empty input.
conta <- function(x) {
  # length() replaces the manual `for (i in 1:length(x))` counter, which
  # iterated over c(1, 0) and therefore returned 2 for empty input.
  # as.numeric() keeps the double return type of the old accumulator.
  as.numeric(length(x))
}
#' Compute a TF-IDF weight for every (file, word) row.
#'
#' tf = 1 + log2(f), idf = log2(N / n), where N is the number of distinct
#' values in `files`.
#'
#' @param files File identifier of each row; only used to derive N.
#' @param f Per-row value used for the term-frequency part.
#' @param n Per-row value used as the denominator of the idf part.
#' @return Numeric vector of tf * idf values, one per row.
bind_tfidf <- function(files, f, n) {
  # Derive the document count from the `files` argument. The previous
  # version ignored it and read the global `book_words` instead, so the
  # function only worked when that global happened to exist.
  n_files <- length(unique(files))
  tf <- 1 + log2(f)
  idf <- log2(n_files / n)
  # Return the weights explicitly. The old body assigned tf/idf columns to a
  # local copy of `book_words` (discarded on exit) and relied on the value
  # of the last assignment as its return value.
  tf * idf
}
# Document frequency: in how many files does each word occur?
# `book_words` holds one row per (file, word) pair, so counting the rows per
# word gives the number of files that contain it.
total_file <- aggregate(
  book_words["file"],
  by = list(word = book_words$word),
  FUN = conta
)
names(total_file) <- c("word", "f")

# Rename the per-file count to `n` before joining so the two count columns do
# not collide. The resulting columns are:
#   file, word, n (occurrences of the word in the file),
#   f (number of files containing the word).
# A single left_join() is used. The previous merge() followed by left_join()
# produced five columns with the key column first, so assigning four names
# afterwards swapped the `file` and `word` labels.
names(book_words)[names(book_words) == "f"] <- "n"
book_words <- left_join(book_words, total_file, by = "word")
head(book_words)

# Create the TF-IDF column.
# bind_tfidf() expects the term count as its `f` argument and the document
# frequency as its `n` argument, which is the opposite of this table's
# column names; the arguments are therefore passed by name (they were
# swapped before).
book_words$tf_idf <- bind_tfidf(
  files = book_words$file,
  f = book_words$n,
  n = book_words$f
)
head(book_words)
tail(book_words)
library("wordcloud")
# Word cloud for a single document: keep only that file's rows, put the
# highest TF-IDF terms first and size the words by their TF-IDF weight.
classFlagged <- book_words[which(book_words$file == "file_a (1).pdf"), ]
classFlagged <- classFlagged[
  order(classFlagged$tf_idf, decreasing = TRUE),
]
wordcloud(
  words = classFlagged$word,
  freq = classFlagged$tf_idf,
  min.freq = 1,
  max.words = 500,
  random.order = FALSE,
  rot.per = .5,
  colors = brewer.pal(8, "Dark2")
)
# Persist the TF-IDF table and read it back for the per-document plots.
write.csv(book_words, file = "book_words.csv")
df <- read.csv("book_words.csv")

# Rows of four individual documents.
s1df <- df[df$file == "file_a (1).pdf", ]
s2df <- df[df$file == "file_a (2).pdf", ]
s3df <- df[df$file == "file_a (3).pdf", ]
s4df <- df[df$file == "file_a (4).pdf", ]

# 2 x 2 grid of count-vs-TF-IDF scatter plots; the previous graphics
# settings are restored afterwards so later plots are not affected.
old_par <- par(mfrow = c(2, 2))
plot(s1df$n, s1df$tf_idf)
plot(s2df$f, s2df$tf_idf)
plot(s3df$n, s3df$tf_idf)
plot(s4df$f, s4df$tf_idf)
par(old_par)

library("psych")
# cancor() needs both inputs to have the same number of rows, and row i must
# describe the same word in both. The two documents generally contain
# different word sets, so align them on the words they share first.
# NOTE(review): words occurring in only one of the two documents are dropped
# by this inner join — confirm that is the comparison wanted.
shared_words <- merge(
  s1df[, c("word", "tf_idf")],
  s2df[, c("word", "tf_idf")],
  by = "word",
  suffixes = c("_s1", "_s2")
)
correl <- cancor(
  as.matrix(shared_words$tf_idf_s1),
  as.matrix(shared_words$tf_idf_s2)
)