-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathgot_sentiment.py
More file actions
424 lines (370 loc) · 16.8 KB
/
got_sentiment.py
File metadata and controls
424 lines (370 loc) · 16.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
# coding: utf-8
# authors: dagrha, ngreeney
# Game of Thrones (Book 1) Sentiment Analysis.
'''This class creates a basic chart and a statistical summary table for a selected
chapter in the novel Game of Thrones (George R.R. Martin). Can upload to WordPress
site automatically.'''
# collections from the Standard Library
import collections
import logging
# libepub (https://github.com/jharjono/libepub/)
from libepub.book import Book
# pandas dataframe library (http://pandas.pydata.org/)
import pandas as pd
# Beautiful Soup (http://www.crummy.com/software/BeautifulSoup/)
from bs4 import BeautifulSoup
# TextBlob (http://textblob.readthedocs.org/en/dev/)
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
# Bokeh plotting library (http://bokeh.pydata.org/en/latest/)
from bokeh.io import show, save
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.embed import file_html
# Matplotlib for jpg
import matplotlib.pyplot as plt
# needs --> pip install python-wordpress-xmlrpc
from blogpost import BlogPost
# NLTK
import nltk.tokenize.punkt as punkt
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk.data as nldata
# Numpy - handy arrays
import numpy as np
class BookAnalysis:
    '''Sentiment analysis of an epub novel (Game of Thrones, Book 1).

    Parses POV chapters out of an epub file, runs TextBlob NaiveBayes
    sentiment analysis over them, produces charts and summary tables,
    and can upload the results to a WordPress site.
    '''

    def __init__(self, epub):
        '''
        Pull all the chapter info from an epub file.

        epub -- filesystem path to the epub file
        '''
        ebook = Book(epub)
        self.make_book(ebook)

    def make_book(self, ebook):
        '''
        Break the epub book into chapters and build ``self.book_dict``.

        Takes in *ebook*, which is a libepub ``Book`` instance.  Side
        effects: populates ``book_dict`` (chapter code -> [author, text])
        and ``self.pf`` (via ``dict_to_frame``).
        '''
        self.book_dict = collections.OrderedDict()
        for chapter in ebook.chapters:
            soup = BeautifulSoup(chapter.content, 'xml')
            try:
                # Real chapters carry the 'chapter0' class on their <h1>.
                if soup.h1.attrs['class'] == 'chapter0':
                    # Join paragraph text; one trailing space per paragraph
                    # (preserves the original concatenation format).
                    string = ''.join(
                        str(tag.get_text()) + ' '
                        for tag in soup('p', class_=['indent', 'nonindent']))
                    # Normalize curly quotes so tokenizers see plain quotes.
                    string = string.replace('“', '"').replace('”', '"')
                    chapter_num = soup.h1.attrs['id']
                    chapter_author = soup.h1.get_text()
                    page_num = soup.a.attrs['id']
                    self.book_dict[page_num] = [chapter_num, chapter_author, string]
            except AttributeError:
                # Section without an <h1> (front matter, maps, ...) -- skip.
                print('No h1 in {} --- Skipping'.format(soup.div.attrs))
        self.renumber_prologue()
        self.repack_dict()
        self.dict_to_frame()

    def renumber_prologue(self):
        '''The book is slightly malformed in that it labels two chapters as
        "c01" by default, so re-label the prologue as "c00".'''
        self.book_dict['page1'][0] = 'c00'

    def repack_dict(self):
        '''Re-key ``book_dict`` by chapter code (e.g. "c01") instead of page id.'''
        new_dict = collections.OrderedDict()
        for item in self.book_dict.values():
            new_dict[item[0]] = item[1:]
        self.book_dict = new_dict

    def dict_to_frame(self):
        '''Build ``self.pf``: a DataFrame indexed by chapter code with
        "Character" and "Text" columns.'''
        self.pf = pd.DataFrame.from_dict(self.book_dict, orient="index")
        self.pf.columns = ["Character", "Text"]

    def text_for_character(self, character):
        '''Return all chapter text for the given POV *character* (case
        insensitive), concatenated in chapter order.'''
        try:
            temp = self.pf[self.pf.Character == character.upper()]
        except AttributeError:
            # self.pf not built yet -- build it and retry.
            # BUGFIX: original called self.dictToFrame(), which does not exist.
            self.dict_to_frame()
            temp = self.pf[self.pf.Character == character.upper()]
        chapters = np.array(temp.index)
        chapters.sort()
        # join() avoids the quadratic += string build of the original
        return ''.join(temp.loc[chap]['Text'] + '\n\n' for chap in chapters)

    def tokenize(self, text):
        '''
        Split *text* into sentences using a Punkt tokenizer trained on the
        whole book (so book-specific abbreviations are learned).
        '''
        trainer = punkt.PunktTrainer()
        trainer.ABBREV = 1.0
        for chapter_text in self.pf.Text:
            trainer.train(chapter_text, verbose=True, finalize=False)
        params = trainer.get_params()
        tok = punkt.PunktSentenceTokenizer(params, True)
        return tok.tokenize(text)

    def natural(self):
        '''Tokenize the active chapter with NLTK (sentences, then words) into
        ``self.nat_analysis`` and diff against TextBlob's sentence breakdown,
        printing the first mismatch found.'''
        self.select_chapter()
        text = self.book_dict[self.chapter_code][1]
        # Insert a space before closing quotes so NLTK splits sentences the
        # same way TextBlob does.
        text = text.replace('?”', '? ”').replace('!”', '! ”').replace('.”', '. ”')
        self.nat_analysis = [word_tokenize(t) for t in sent_tokenize(text)]
        # Compare against the TextBlob breakdown of the same chapter.
        temp_df = self.single_chapter()
        for i in range(len(self.nat_analysis)):
            x = ''.join(temp_df.iat[i, 3].split())
            y = ''.join(self.nat_analysis[i])
            if x != y:
                print("Difference at:")
                print(x)
                print(y)
                break

    def blobWholeBook(self):
        '''Build ``self.df`` with per-sentence sentiment for every chapter,
        plus cumulative polarity per chapter, per character, and book-wide.'''
        self.df = pd.DataFrame()
        for chapter in self.book_dict:
            author = self.book_dict[chapter][0]
            text = self.book_dict[chapter][1]
            tb = TextBlob(text, analyzer=NaiveBayesAnalyzer())
            chap_df = pd.DataFrame(tb.serialized)
            chap_df['chapter'] = chapter
            chap_df['author'] = author
            self.df = pd.concat([self.df, chap_df])
        # Group by chapter/author and run cumulative sums of the polarity.
        self.df.reset_index(drop=True, inplace=True)
        self.df['sent_index'] = self.df.index
        self.df['chap_cumsum'] = self.df.groupby(['chapter'])['polarity'].cumsum()
        self.df['char_cumsum'] = self.df.groupby(['author'])['polarity'].cumsum()
        self.df['book_cumsum'] = self.df['polarity'].cumsum()

    def blobChapter(self):
        '''Build ``self.df`` with per-sentence sentiment for the active chapter.

        Prompts for a chapter if none is currently active.'''
        self.df = pd.DataFrame()
        try:
            chapter = self.chapter_code
        except AttributeError:
            # no active chapter yet -- ask the user for one
            self.select_chapter()
            chapter = self.chapter_code
        author = self.book_dict[chapter][0]
        text = self.book_dict[chapter][1]
        tb = TextBlob(text, analyzer=NaiveBayesAnalyzer())
        chap_df = pd.DataFrame(tb.serialized)
        chap_df['chapter'] = chapter
        chap_df['author'] = author
        self.df = pd.concat([self.df, chap_df])
        # Cumulative polarity over the chapter.
        self.df['chap_cumsum'] = self.df.groupby(['chapter'])['polarity'].cumsum()

    def blobText(self, grouping, text):
        '''Build ``self.df`` from arbitrary *text*.

        grouping -- string labeling the text, e.g. 'Bran', 'Chapters 1-10'
        text     -- raw text passed to TextBlob
        '''
        self.df = pd.DataFrame()
        tb = TextBlob(text, analyzer=NaiveBayesAnalyzer())
        chap_df = pd.DataFrame(tb.serialized)
        chap_df['chapter'] = '999'   # sentinel chapter code for ad-hoc text
        chap_df['author'] = grouping
        self.df = pd.concat([self.df, chap_df])
        # Cumulative polarity over the pseudo-chapter.
        self.df['chap_cumsum'] = self.df.groupby(['chapter'])['polarity'].cumsum()

    def select_chapter(self):
        '''Prompt the user for the chapter number to examine and set it active.'''
        user_input = 0
        while True:
            try:
                try:
                    if self.chapter_code is not None:
                        print("Current active Chapter is {}.".format(self.chapter_code), end='')
                except AttributeError:
                    # no chapter selected yet -- nothing to report
                    pass
                user_input = int(input("Enter the number of the chapter you'd like to examine: "))
            except ValueError:
                print("Please give an integer instead.")
                print()
                continue
            else:
                print("Chapter %s will be set as active." % user_input)
                print()
                break
        self.set_chapter(user_input)

    def event_locator(self, sub_df, text):
        '''Locate the sentence containing *text* in the whole-book frame and
        return the index of the sentence in *sub_df* whose ``sent_index`` is
        closest to it.  *sub_df* could be a single character's frame or the
        entire book.'''
        text_filter = self.df.raw.str.contains(text)
        # values[0]: take the first match (the original int(ndarray) raised
        # on multiple matches and is deprecated in recent numpy)
        sent_idx = int(self.df[text_filter].sent_index.values[0])
        sub_sent_idx = int(
            sub_df.iloc[(sub_df.sent_index - sent_idx).abs().argsort()[:1]].index.values[0])
        return sub_sent_idx

    def set_chapter(self, chap_num):
        '''Set the active chapter code, e.g. 3 -> "c03".

        BUGFIX: original referenced the undefined name ``user_input``.'''
        self.chapter_code = 'c' + str(chap_num).zfill(2)

    def single_chapter(self):
        '''Return the subset of ``self.df`` matching the active chapter.'''
        return self.df[self.df.chapter == self.chapter_code]

    def get_chardf(self, character):
        '''Return the subset of ``self.df`` for one POV *character*,
        re-indexed from 0.'''
        char_df = self.df[self.df['author'] == character.upper()]
        char_df.reset_index(inplace=True, drop=True)
        return char_df

    def chapter_info(self):
        '''Print (and store in ``self.info``) the most negative and most
        positive sentences of the active chapter, plus summary statistics.

        NOTE: sentences with polarity exactly 0 are excluded from the stats.
        '''
        temp_df = self.single_chapter()
        self.info = list()
        print('The most negative sentences are: ')
        self.info.append(temp_df[temp_df.polarity < -0.5][['polarity', 'raw']].values)
        print(self.info[-1])
        print()
        print('The most positive sentences are: ')
        self.info.append(temp_df[temp_df.polarity > 0.5][['polarity', 'raw']].values)
        print(self.info[-1])
        print()
        self.info.append(temp_df[temp_df.polarity != 0.0].describe().round(2))
        print(self.info[-1])
        print()

    def plot_html(self):
        '''Plot cumulative sentiment polarity of the active chapter with
        Bokeh, show it, and save png and embeddable-html copies.'''
        temp_df = self.single_chapter()
        author = temp_df.author.unique()[0]
        # BUGFIX: "senitment" typo in the original title
        self.title = ' '.join(['Chapter', str(int(self.chapter_code[1:])), '-', author,
                               ':', 'NB sentiment polarity'])
        png_name = self.chapter_code + '_' + author + '.png'
        html_name = self.chapter_code + '_' + author + '_embed.html'
        TOOLS = "pan,wheel_zoom,reset,save"
        p1 = figure(title=self.title, tools=TOOLS, title_text_font_size='18')
        p1.line(temp_df.index, temp_df['chap_cumsum'])
        p1.line(temp_df.index, temp_df['polarity'])
        show(p1)
        save(p1, png_name, title=self.title, resources=CDN)
        self.html = file_html(p1, CDN, html_name)
        with open(html_name, 'w') as f:
            f.write(self.html)

    def plot_singleChap(self):
        '''Plot cumulative sentiment polarity and subjectivity of the active
        chapter and save as a JPEG image.'''
        temp_df = self.single_chapter()
        try:
            author = temp_df.author.unique()[0]
            self.title = ' '.join(['Chapter', str(int(self.chapter_code[1:])), '-', author,
                                   ':', 'NB sentiment polarity'])
            self.filename = self.chapter_code + '_' + author + '.jpg'
        except Exception:
            # best-effort fallback when no chapter metadata is available
            self.title = "Test"
            self.filename = "test.jpg"
        plt.figure()
        plt.plot(temp_df.index, temp_df['chap_cumsum'], label="Polarity")
        plt.plot(temp_df.index, temp_df['subjectivity'], label="Subjectivity")
        plt.title(self.title)
        plt.xlim(temp_df.index[0], temp_df.index[-1])
        plt.xlabel("Sentence Number")
        plt.ylabel("Cumulative Sentiment Polarity")
        plt.legend(loc="upper left")
        plt.savefig(self.filename, bbox='tight')

    def plot_singleChar(self, character):
        '''Plot just the chapters written from a single character's POV.

        character -- POV character name, e.g. 'bran'
        '''
        char_df = self.get_chardf(character)
        plt.figure()
        plt.plot(char_df.char_cumsum)
        plt.title('Sentiment Polarity across all {} chapters'.format(character.upper()))
        plt.xlabel('Sentence Number')
        plt.ylabel('Cumulative Sentiment Polarity')

    def plot_wholeBook(self):
        '''Plot the cumulative sentiment polarity across the whole book.'''
        plt.figure()
        plt.plot(self.df.book_cumsum.tolist())
        plt.title('Sentiment Polarity across Game of Thrones')
        plt.xlabel('Sentence Number')
        plt.ylabel('Cumulative Sentiment Polarity')

    def add_chapterlines(self, df):
        '''Add a dotted vertical line and chapter label at the start of each
        chapter on the current matplotlib figure.'''
        df['sent_num'] = df.index
        idx = df.groupby('chapter')['start_index'].idxmin()
        chap_bounds = df.loc[idx, ['sent_num', 'char_cumsum', 'chapter']]
        chap_bound_list = chap_bounds[['sent_num', 'char_cumsum', 'chapter']].values.tolist()
        ylim = plt.ylim()
        for sent_num, _cumsum, chap_label in chap_bound_list:
            plt.vlines(sent_num, ylim[0], ylim[1], 'k', 'dotted')
            plt.text(sent_num + 5, ylim[1] * 0.97, chap_label, fontsize=14)

    def add_annot(self, sub_df, event, text, text_height=0.9):
        '''Annotate the current plot with *event* text at the sentence
        containing *text*.

        text_height -- vertical position of the label as a fraction of the
                       y-axis range
        '''
        event_ix = self.event_locator(sub_df, text)
        event_y = sub_df.loc[event_ix, 'char_cumsum']
        ylim = plt.ylim()
        text_y = (ylim[1] - ylim[0]) * text_height + ylim[0]
        plt.annotate(event, (event_ix, event_y), (event_ix, text_y),
                     arrowprops=dict(width=1, headwidth=3))

    def start_post(self):
        '''Post the positive and negative sentences along with the summary
        table to WordPress as a draft.  Prompts for the blog password and
        whether to upload the chapter JPEG.'''
        password = input('Password:')
        wp = BlogPost('python', password)
        user_input = input("Upload File? (enter yes):")
        if user_input == 'yes':
            wp.uploadJPG(self.filename)
        # BUGFIX: original referenced self.df_chapter, which is never assigned.
        author = self.single_chapter().author.unique()[0]
        self.title = ' '.join(['Chapter', str(int(self.chapter_code[1:])), '-', author,
                               ':', 'NB sentiment polarity'])
        self.neg_sentences = str()
        for i in self.info[0]:
            self.neg_sentences += '\t'.join(['%.2f' % float(str(i[0])), i[1], '\n'])
        self.pos_sentences = str()
        for i in self.info[1]:
            self.pos_sentences += '\t'.join(['%.2f' % float(str(i[0])), i[1], '\n'])
        neg_title = '<strong>The most negative sentences are:</strong>'
        pos_title = '\n<strong>The most positive sentences are:</strong>'
        table_id_string = '\n\n[table id=' + self.chapter_code[1:].zfill(3) + ' /]'
        copy_string = "Copy code below to TablePress Import as HTML\nChange ID to " + self.title + "\n"
        body = '\n'.join([neg_title, self.neg_sentences, pos_title, self.pos_sentences,
                          table_id_string, copy_string, self.info[2].to_html()])
        wp.postDraft(self.title, body)
if __name__ == '__main__':
    # Load the book as an epub book.
    got = BookAnalysis(r'books/game.epub')
    user_input = input("What to analyze?\nChapter Number, Character Name, else Whole Book: ")
    try:
        # Integer input -> analyze that single chapter.
        chap_num = int(user_input)
        got.set_chapter(chap_num)
        got.blobChapter()
        got.chapter_info()
    except Exception:
        # Non-integer input (or any chapter-analysis failure) falls back to
        # character / whole-book analysis.  Deliberately broad at this
        # top-level boundary; the original used a bare ``except:``, which
        # also swallowed KeyboardInterrupt/SystemExit.
        if user_input.upper() in got.pf['Character'].unique():
            # Input names a POV character: plot just their chapters.
            print(user_input + ' in Book')
            got.blobWholeBook()
            got.plot_singleChar(user_input)
            got.add_chapterlines(got.get_chardf(user_input))
        else:
            # Anything else: analyze and plot the whole book.
            print("Blob-ing the whole book")
            got.blobWholeBook()
            got.plot_wholeBook()
            got.add_chapterlines(got.df)