"""Markov-chain sentence generator built from text scraped off the web.

NOTE(review): this module was reconstructed from a whitespace-collapsed git
patch.  The same patch also adds the binary files "Final writeup.odt" and
"Finalwriteup.pdf" and appends the following link to README.md:
https://github.com/msausville/TextMining/blob/master/Finalwriteup.pdf
"""
from numpy.random import choice

# Shared module state: every scraped word in order of arrival, and the
# word -> list-of-following-words table built from it by markov().
megalist = []
markovdict = {}

_HONOR_CODE_URL = ('http://www.olin.edu/academic-life/'
                   'student-affairs-resources/student-life/honor-code/')


def _extract_words(text, start, stop):
    """Return the whitespace-separated words of *text* between two markers.

    The slice begins at the first occurrence of *start* and ends just before
    the first occurrence of *stop* found at or after that point.  A missing
    *start* falls back to the beginning of the text and a missing *stop* to
    the end, because ``str.find`` returns -1 for "not found" and using -1 as
    a slice bound would silently select the wrong region.
    """
    begin = text.find(start)
    if begin == -1:
        begin = 0
    end = text.find(stop, begin)
    if end == -1:
        end = len(text)
    return text[begin:end].split()


def words_from_internet(link=_HONOR_CODE_URL,
                        start='The Olin Honor Code Values',
                        stop='Quick Links'):
    """Download a page and append the words between two markers to megalist.

    Args:
        link: URL of the page to fetch.
        start: Text marking where the wanted passage begins (inclusive).
        stop: Text marking where the wanted passage ends (exclusive).

    Returns:
        The list of words that were appended to the module-level ``megalist``.
        (The original returned the result of ``list.extend``, i.e. always
        ``None``.)
    """
    # Imported here so the pure text functions in this module can be imported
    # and used without the scraping dependencies being installed.
    from bs4 import BeautifulSoup
    import requests

    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(link, timeout=30)
    # Extract the page text once; the original recomputed it three times.
    text = BeautifulSoup(response.text, 'lxml').get_text()

    words = _extract_words(text, start, stop)
    megalist.extend(words)
    return words


def markov(megalist):
    """Record, for every word, the words that directly follow it.

    Entries are accumulated into the module-level ``markovdict`` (so repeated
    calls keep adding to the same table), which is also returned.  The final
    word of *megalist* gets no entry of its own unless it also occurs earlier.

    Args:
        megalist: Sequence of words in reading order.

    Returns:
        The module-level ``markovdict`` mapping word -> list of followers.
    """
    for current, following in zip(megalist, megalist[1:]):
        markovdict.setdefault(current, []).append(following)
    return markovdict


def smushit(markovdict, megalist, max_words=100):
    """Generate one sentence by walking the Markov table.

    The walk starts at a random word of *megalist* containing an uppercase
    character (any word if there is none) and repeatedly picks a random
    follower until a word ends with ".".

    Args:
        markovdict: Mapping word -> list of words seen directly after it.
        megalist: Sequence of words from which the first word is chosen.
        max_words: Upper bound on the sentence length, so a table whose
            reachable words never end with "." cannot loop forever.  The
            result always contains at least the starting word.

    Returns:
        The generated words joined by single spaces, or "" when *megalist*
        is empty.
    """
    starters = [w for w in megalist if w.lower() != w]
    candidates = starters or list(megalist)
    if not candidates:
        return ""

    # numpy's choice returns numpy.str_; normalise to a plain str.
    word = str(choice(candidates))
    finallist = [word]
    while not word.endswith(".") and len(finallist) < max_words:
        followers = markovdict.get(word)
        if not followers:
            # Dead end: this word was never seen with a successor.
            break
        word = str(choice(followers))
        finallist.append(word)
    return " ".join(finallist)


def main_important_part():
    """Scrape the three source passages, build the table, print a sentence."""
    sources = (
        (_HONOR_CODE_URL, 'The Olin Honor Code Values', 'Quick Links'),
        ('https://en.wikipedia.org/wiki/Felony',
         'Broadly, felonies', 'are the least serious'),
        ('http://www.olin.edu', 'At Olin', 'institutions.'),
    )
    for link, start, stop in sources:
        words_from_internet(link=link, start=start, stop=stop)
    markov(megalist)
    print(smushit(markovdict, megalist))


if __name__ == "__main__":
    main_important_part()