-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAssignment 3-NLP.Rmd
More file actions
324 lines (212 loc) · 9.44 KB
/
Assignment 3-NLP.Rmd
File metadata and controls
324 lines (212 loc) · 9.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
---
title: "Assignment 3 - NLP"
author: "Devan Goto"
date: "2/23/2017"
output: html_document
---
## Libraries
```{r}
#Install any required packages that are not already present.
#(The original called install.packages() unconditionally, re-downloading
#on every knit, and the installed set did not match the loaded set.)
needed <- c("NLP", "RColorBrewer", "topicmodels", "tm", "SnowballC",
            "wordcloud", "ggplot2", "dplyr", "tidyr")
missing <- needed[!vapply(needed, requireNamespace, logical(1),
                          quietly = TRUE)]
if (length(missing) > 0) {
  install.packages(missing)
}
#Load the libraries used throughout this analysis
library(tm)
library(SnowballC)
library(wordcloud)
library(ggplot2)
library(dplyr)
library(tidyr)
library(topicmodels)
#IF USING A MAC PLEASE RUN THIS CODE
#(forces the C locale so tm's transformations handle non-ASCII bytes)
Sys.setlocale("LC_ALL", "C")
```
## Import all document files and the list of weeks file
```{r}
#Directory holding the weekly note CSV exports
data.dir <- "~/HUDK 4051/Assignment 3 - NLP/A3-files"
#Create a list of all the csv files.
#full.names = TRUE returns complete paths, so no setwd() juggling is
#needed before read.csv(); the pattern is anchored ("\\.csv$") so only
#files *ending* in ".csv" match - the original pattern ".csv" treated
#the dot as "any character" and matched anywhere in the name.
file.list <- list.files(path = data.dir, pattern = "\\.csv$",
                        full.names = TRUE)
#Loop over the file list, importing each csv and binding them together
D1 <- do.call("rbind",
              lapply(file.list, read.csv, header = TRUE,
                     stringsAsFactors = FALSE))
#Week lookup table mapping each note Title to a course week
#(lives in the Week-List subfolder, so it is not swept up above)
D2 <- read.csv(file.path(data.dir, "Week-List", "week-list.csv"),
               header = TRUE)
```
## Clean the html tags from your text
```{r}
#gsub(pattern, replacement, x) replaces ALL matches of a regex in each
#element of a character vector; unmatched elements pass through
#unchanged.
#Strip html tags: <.*?> lazily matches anything between < and >,
#replaced with a space so adjacent words do not fuse together.
D1$Notes2 <- gsub("<.*?>", " ", D1$Notes)
#Strip html character entities (&nbsp; &amp; &quot; &#39; ...) in one
#anchored pass. The original deleted the bare string "nbsp" (which also
#mangles ordinary words containing those letters) and then only caught
#the "&;" residue, leaving every other entity in the text.
D1$Notes2 <- gsub("&#?[A-Za-z0-9]+;", " ", D1$Notes2)
```
## Merge with week list so you have a variable representing weeks for each entry
```{r}
#Left-join the week lookup (D2) onto the notes (D1) by note Title so
#every entry carries its course week; Titles absent from D2 get NA
D1<-dplyr::left_join(D1, D2, by = "Title")
```
## Process text using the tm package - Code has been altered to account for changes in the tm package
```{r}
#Convert the data frame to the corpus format that the tm package uses
corpus <- Corpus(VectorSource(D1$Notes2))
#Collapse runs of whitespace to single spaces
corpus <- tm_map(corpus, stripWhitespace)
#Convert to lower case (the stopword list and stemmer expect lower case)
corpus <- tm_map(corpus, content_transformer(tolower))
#Remove pre-defined stop words ('the', 'a', etc)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
#Remove numbers and punctuation BEFORE stemming so the stemmer sees
#clean tokens; stemming first (as originally written) left
#punctuation-attached words like "learning," unstemmed.
corpus <- tm_map(corpus, removeNumbers, lazy=TRUE)
corpus <- tm_map(corpus, removePunctuation, lazy=TRUE)
#Convert words to Porter stems ("education" -> "educ") for analysis,
#for more info see http://tartarus.org/~martin/PorterStemmer/
corpus <- tm_map(corpus, stemDocument, lazy=TRUE)
```
#### Create a Term Document Matrix
```{r}
#Convert the corpus to a term-document matrix (rows = terms, columns =
#documents) so each word's per-document frequency can be analyzed
#individually
tdm.corpus <- TermDocumentMatrix(corpus)
```
# Sentiment Analysis
### Match words in corpus to lexicons of positive & negative words
```{r}
#Sentiment Analysis: computationally identifying and categorizing the
#opinion expressed in a piece of text as positive, negative, or neutral.
#Directory holding the lexicon txt files; full paths are used below
#instead of mutating the working directory with setwd().
lex.dir <- "~/HUDK 4051/Assignment 3 - NLP/A3-files"
#Upload positive and negative word lexicons
positive <- readLines(file.path(lex.dir, "positive-words.txt"))
negative <- readLines(file.path(lex.dir, "negative-words.txt"))
#The Hu & Liu lexicon files open with ";" comment lines and blank
#lines; drop them so they are not matched as words
positive <- positive[!grepl("^;", positive) & nzchar(positive)]
negative <- negative[!grepl("^;", negative) & nzchar(negative)]
#Count matches between each document's terms and the two lexicons
D1$positive <- tm_term_score(tdm.corpus, positive)
D1$negative <- tm_term_score(tdm.corpus, negative)
#Generate an overall pos-neg sentiment score for each line
D1$score <- D1$positive - D1$negative
```
## Generate a graph of the sum of the sentiment score over weeks
```{r}
#Build the table we want to graph: week plus sentiment score, with NA
#rows dropped. (G1 is reused later in the Main Task chunk, so keep it.)
G1 <- D1 %>% dplyr::select(week.x, score) %>% na.omit()
#Total the sentiment score within each week
G2 <- G1 %>%
  dplyr::group_by(week.x) %>%
  dplyr::summarise(sum(score))
#Plot the weekly totals
plot(G2, col = "red", main = "Sentiment Score Over Weeks")
```
# LDA Topic Modelling
```{r}
#Document-term matrix with raw term-frequency weighting (weightTf).
#(The original comment said "tf-idf", but weightTf is plain counts.)
dtm.tfi <- DocumentTermMatrix(corpus, control = list(weighting = weightTf))
#NOTE(review): the original line dtm.tfi[, dtm.tfi$v >= 0.1] indexed
#columns with the sparse *values* slot ($v), whose length is the number
#of non-zero cells, not the number of terms; with weightTf every value
#is >= 1 anyway, so the intended rare-term filter was a no-op and has
#been removed.
#Find the sum of words in each document
rowTotals <- apply(dtm.tfi, 1, sum)
#Drop empty documents - LDA cannot fit all-zero rows
dtm.tfi <- dtm.tfi[rowTotals > 0, ]
#Fit a 3-topic LDA model (fixed seed for reproducibility)
lda.model = LDA(dtm.tfi, k = 3, seed = 150)
#Most probable term in each topic. Example output (was pasted into the
#chunk as bare code, which made the document fail to knit):
#  Topic 1   Topic 2   Topic 3
#   "data" "network"    "data"
terms(lda.model)
#Which documents belong to which topic
topics(lda.model)
tlm<-as.data.frame(topics(lda.model))
```
# Main Task
Your task is to generate a *single* visualization showing:
- Sentiment for each week and
- One important topic for that week
```{r}
#TASK 1: mean sentiment per week (G1 was built in the graph chunk)
M1 <- G1
M2 <- M1 %>%
  dplyr::group_by(week.x) %>%
  dplyr::summarise(mean(score))
#TASK 2: find one important topic for each week.
#Table with only the variables needed, dropping NA rows
Q1 <- na.omit(dplyr::select(D1, week.x, Notes2))
#Delete residual html escape bytes missed by the earlier cleaning pass
Q2 <- Q1
Q2$Notes2 <- gsub("<e2><80><90>", "", Q2$Notes2)
#Turn empty strings into NA so they can be dropped as well
Q3 <- Q2
Q3[Q3 == ""] <- NA
Q3 <- na.omit(Q3)
#Collapse all note text within each week into one document per week so
#a topic can be mined per week. (Some weeks end up with little or no
#text after cleaning; the analysis simply continues with what remains.)
Q4 <- Q3 %>%
  dplyr::group_by(week.x) %>%
  dplyr::summarise(paste(Notes2, sep = " ", collapse = "+"))
#Prepare corpus and document-term matrix used to find one topic per week.
#Q4's second column holds the collapsed weekly text; index it by
#position because its auto-generated name is unwieldy, and the original
#backtick reference was truncated (`...collapse = "..."`) so it did not
#match the real column name and resolved to NULL.
corpus2 <- Corpus(VectorSource(Q4[[2]]))
#Collapse runs of whitespace
corpus2 <- tm_map(corpus2, stripWhitespace)
#Convert to lower case
corpus2 <- tm_map(corpus2, content_transformer(tolower))
#Remove pre-defined stop words ('the', 'a', etc)
corpus2 <- tm_map(corpus2, removeWords, stopwords('english'))
#Convert words to Porter stems ("education" -> "educ") for analysis
corpus2 <- tm_map(corpus2, stemDocument, lazy=TRUE)
#Remove numbers
corpus2 <- tm_map(corpus2, removeNumbers, lazy=TRUE)
#Remove punctuation
corpus2 <- tm_map(corpus2, removePunctuation, lazy=TRUE)
#Term-document matrix (one column per week-document)
tdm.corpus2 <- TermDocumentMatrix(corpus2)
#Document-term matrix with raw term-frequency weighting
dtm.tfi2 <- DocumentTermMatrix(corpus2, control = list(weighting = weightTf))
#NOTE(review): the original rare-term filter referenced dtm.tfi - the
#*previous* chunk's matrix - and misused the sparse $v slot; with
#weightTf all values are >= 1, so it was a no-op and has been removed.
#Drop week-documents with no terms - LDA cannot fit all-zero rows
rowTotals <- apply(dtm.tfi2, 1, sum)
dtm.tfi2 <- dtm.tfi2[rowTotals > 0, ]
#Fit a 3-topic LDA model (fixed seed for reproducibility)
lda.model2 = LDA(dtm.tfi2, k = 3, seed = 150)
#Most probable term in each topic. (This call's output was pasted into
#the chunk as bare code, breaking the knit; note the pasted output also
#showed a "Topic 4", which cannot occur with k = 3.)
terms(lda.model2)
#Create a data frame mapping each week-document to its dominant topic
tlm2<-topics(lda.model2)
tlm2<-as.data.frame(topics(lda.model2))
#Rename the single column for readability
tlm3<-tlm2
colnames(tlm3) <- "topics"
#Both tables are indexed 1..n in the same (week) order, so expose the
#row names as an explicit "delete" join key rather than relying on
#position, then join the mean-sentiment and topic tables on it
tlm4<-tlm3
tlm4$delete<-row.names(tlm4)
M3<-M2
M3$delete<-row.names(M3)
M4<-dplyr::left_join(M3, tlm4, by = "delete")
#Drop the helper key
M4$delete<-NULL
#Rename the remaining columns (makes the plot code easier to read)
M5<-M4
colnames(M5) [1]<- "week"
colnames(M5) [2]<- "score"
#Replace topic numbers with each topic's most probable term, taken
#directly from the fitted model. The original hard-coded four labels
#("data"/"student"/"skill"/"learn") from one run's console output -
#including a topic 4 that cannot exist with k = 3 - so the labels went
#stale whenever the model or seed changed.
M5$topics <- terms(lda.model2)[as.integer(M5$topics)]
#Task 3: plot mean weekly sentiment as bars, filled by each week's
#dominant topic - the single required visualization.
ggplot(data = M5, aes(x = week, y = score, fill = topics)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(breaks = 1:14) +
  ylab("Mean Sentiment Score") +
  ggtitle("Mean Sentiment For Each Week With Topics")
```