"""Word-frequency and sentiment analysis of three Project Gutenberg texts.

Run as a script, this downloads each text, prints its word counts from most
to least frequent, and then prints the VADER polarity scores of each text.

The accompanying write-up for this project is ``TextMiningWriteUp.pdf``.
"""

import re
from collections import Counter

# NOTE: the third-party packages (requests, vaderSentiment) are imported
# inside the functions that use them, so the pure text helpers below can be
# imported and reused without those packages or a network connection.

# (label, url) pairs, in the order their results are printed.
TEXT_SOURCES = (
    ('amontillado', 'http://www.gutenberg.org/cache/epub/1063/pg1063.txt'),
    ('raven', 'http://www.gutenberg.org/cache/epub/17192/pg17192.txt'),
    ('house', 'http://www.gutenberg.org/cache/epub/932/pg932.txt'),
)

# Seconds to wait for the server before giving up on a download.
REQUEST_TIMEOUT = 30

# Compiled once: a "word" is a maximal run of word characters.
_WORD_RE = re.compile(r'\w+', re.UNICODE)


def words(text):
    """Return the words of *text*, in order of appearance.

    A word is a maximal run of word characters (letters, digits and
    underscore); everything else acts as a separator. Matching the words
    directly, rather than splitting on the separators, avoids the empty
    strings a split produces when the text begins or ends with a separator.
    An empty *text* gives an empty list.
    """
    return _WORD_RE.findall(text)


def makeDict(wlist):
    """Return a dict mapping each word in *wlist* to its number of occurrences.

    Keys appear in order of first occurrence. Counting is a single pass over
    the list instead of one ``list.count`` scan per word.
    """
    return dict(Counter(wlist))


def sortDict(dfreq):
    """Return ``(count, word)`` tuples from *dfreq*, largest first.

    Ties on count are ordered by word, also descending.
    """
    return sorted(((count, word) for word, count in dfreq.items()),
                  reverse=True)


def fetch_text(url, timeout=REQUEST_TIMEOUT):
    """Download *url* and return the response body as text.

    Raises ``requests.HTTPError`` on a non-success status so that an error
    page is never analysed as if it were the requested text.
    """
    import requests

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def main():
    """Download every text, then print word frequencies and sentiment."""
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    texts = [fetch_text(url) for _label, url in TEXT_SOURCES]

    for text in texts:
        print(sortDict(makeDict(words(text))))

    # One analyzer is enough; it is reused for every text.
    analyzer = SentimentIntensityAnalyzer()
    for text in texts:
        print(analyzer.polarity_scores(text))


if __name__ == "__main__":
    main()