Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# TextMining

This is the base repo for the text mining and analysis project for Software Design at Olin College.

Look for my PDF submitted in the general repository.
Binary file added TextMiningWriteUp.pdf
Binary file not shown.
36 changes: 36 additions & 0 deletions text_mining.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import re
from collections import Counter

import requests
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def _fetch_text(url):
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address of the document to download.

    Returns:
        The response body as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of analysing an HTML error page as if it were
    # the requested text.
    response.raise_for_status()
    return response.text


# Plain-text files fetched from gutenberg.org when the module is loaded.
amontillado = _fetch_text('http://www.gutenberg.org/cache/epub/1063/pg1063.txt')
raven = _fetch_text('http://www.gutenberg.org/cache/epub/17192/pg17192.txt')
house = _fetch_text('http://www.gutenberg.org/cache/epub/932/pg932.txt')

def words(text):
    """Split *text* into a list of word tokens.

    A token is a maximal run of word characters (letters, digits and
    underscore). Everything else acts as a separator and is dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in order of appearance. No empty strings are
        produced, so an empty or punctuation-only text yields ``[]``.
    """
    # findall collects the word runs directly; splitting on the separators
    # instead leaves '' tokens when the text starts or ends with one, and
    # those would then be counted as a "word" by the frequency code.
    # re.UNICODE is carried over from the original pattern.
    return re.findall(r'\w+', text, re.UNICODE)

def makeDict(wlist):
    """Count how often each word occurs in *wlist*.

    Args:
        wlist: An iterable of hashable items (typically the word list
            returned by ``words``).

    Returns:
        A plain ``dict`` mapping each distinct item to the number of times
        it appears in *wlist*. An empty input gives an empty dict.
    """
    # Counter tallies everything in a single pass. Calling list.count() once
    # per word rescans the whole list each time, which is quadratic and very
    # slow on a full-length book.
    return dict(Counter(wlist))

def sortDict(dfreq):
    """Return the entries of a frequency dict ordered by descending count.

    Args:
        dfreq: A dict mapping words to their counts, as built by
            ``makeDict``.

    Returns:
        A list of ``(count, word)`` tuples, highest count first. Entries
        with the same count are ordered by word, also descending, because
        the tuples are compared element by element.
    """
    # Putting the count first in each tuple makes it the primary sort key.
    pairs = ((count, word) for word, count in dfreq.items())
    return sorted(pairs, reverse=True)

# Three separate sentiment analyzers, one per downloaded text, constructed
# in order and bound to the module-level names used by the script below.
analyzer, analyzer2, analyzer3 = (
    SentimentIntensityAnalyzer() for _ in range(3)
)


if __name__ == "__main__":
    texts = (amontillado, raven, house)
    scorers = (analyzer, analyzer2, analyzer3)

    # First the sorted word-frequency list of every text...
    for text in texts:
        tokens = words(text)
        frequencies = makeDict(tokens)
        print(sortDict(frequencies))

    # ...then the polarity scores of every text, each from its own analyzer.
    for scorer, text in zip(scorers, texts):
        print(scorer.polarity_scores(text))