Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added Final writeup.odt
Binary file not shown.
Binary file added Finalwriteup.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# TextMining
This is the base repo for the text mining and analysis project for Software Design, Spring 2016 at Olin College.
https://github.com/msausville/TextMining/blob/master/Finalwriteup.pdf
62 changes: 62 additions & 0 deletions markov.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from bs4 import BeautifulSoup
import requests
from numpy.random import choice


"""
Function that pulls text from web
"""
# Module-level state shared by the functions in this file.
# megalist: every word scraped so far, in reading order; words_from_internet
#           appends to it.
# markovdict: maps a word to the list of words seen immediately after it;
#             markov fills it in.
megalist = []
markovdict = {}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: commented test code.

def words_from_internet(link='http://www.olin.edu/academic-life/student-affairs-resources/student-life/honor-code/', start='The Olin Honor Code Values', stop='Quick Links'):
    """Download a page and append the words between two markers to ``megalist``.

    The page at ``link`` is fetched and reduced to plain text with
    BeautifulSoup. The passage that begins at the first occurrence of
    ``start`` and ends just before the following occurrence of ``stop`` is
    split on whitespace, and the resulting words are appended to the
    module-level ``megalist``.

    Args:
        link: URL of the page to download.
        start: Text marking the beginning of the passage (included).
        stop: Text marking the end of the passage (excluded).

    Returns:
        None. ``megalist`` is extended in place. If ``start`` does not occur
        in the page, nothing is added; if ``stop`` does not occur after
        ``start``, the passage runs to the end of the page.
    """
    # A timeout keeps the script from hanging forever on an unresponsive host.
    response = requests.get(link, timeout=30)

    # Extract the visible text once and reuse it for both searches and the
    # slice, instead of re-walking the parsed document three times.
    page_text = BeautifulSoup(response.text, 'lxml').get_text()

    startpoint = page_text.find(start)
    if startpoint == -1:
        # str.find signals "not found" with -1; using that as a slice index
        # would silently select from the last character of the page.
        return

    # Look for the end marker only after the start marker so an earlier
    # occurrence of ``stop`` cannot produce an empty passage.
    stoppoint = page_text.find(stop, startpoint + len(start))
    if stoppoint == -1:
        stoppoint = len(page_text)

    # Combining all text into megalist
    megalist.extend(page_text[startpoint:stoppoint].split())


"""
Make dictionary
"""


def markov(megalist):
    """Record, for every word in ``megalist``, the words that follow it.

    Each adjacent pair ``(word, next_word)`` appends ``next_word`` to the
    list stored under ``word`` in the module-level ``markovdict``. Repeated
    pairs are appended again, so a follower that occurs more often appears
    more often in the list.

    Args:
        megalist: Sequence of words in reading order.

    Returns:
        The module-level ``markovdict`` (the same object that was updated).
    """
    # Pair each word with its successor; the last word has no successor and
    # therefore contributes no entry of its own.
    for current_word, next_word in zip(megalist, megalist[1:]):
        markovdict.setdefault(current_word, []).append(next_word)
    return markovdict


def smushit(markovdict, megalist, max_words=100):
    """Generate one sentence by randomly walking the word-successor table.

    Reviewer note from the pull request: "Could use a bit more by way of
    documentation. You've cleaned up the script/jumbles into clean functions,
    but it's not immediately clear what the functions themselves do."

    A starting word is picked at random from the words in ``megalist`` that
    contain an uppercase character. The walk then repeatedly replaces the
    current word with a random choice from ``markovdict[word]`` until a word
    ending in "." is produced.

    Args:
        markovdict: Mapping from a word to the list of words that may follow.
        megalist: Sequence of source words used to pick the starting word.
        max_words: Upper bound on the sentence length, so a chain that never
            reaches a word ending in "." cannot loop forever.

    Returns:
        The generated words joined by single spaces, or an empty string when
        ``megalist`` is empty.
    """
    words = list(megalist)
    if not words:
        return ""

    # Prefer a word containing an uppercase character as the sentence opener;
    # fall back to any word so that choice() is never handed an empty list.
    capitals = [candidate for candidate in words if candidate.lower() != candidate]
    word = str(choice(capitals or words))

    finallist = [word]
    while not word.endswith(".") and len(finallist) < max_words:
        successors = markovdict.get(word)
        if not successors:
            # Dead end: this word was never seen with a follower.
            break
        # Advance the walk; without this step the loop could never finish.
        word = str(choice(successors))
        finallist.append(word)
    return " ".join(finallist)


def main_important_part():
    """Scrape the three source passages, build the chain and print a sentence."""
    # Reviewer note from the pull request: "FWIW, not the ideal name, but it
    # gets the point across!"

    # (url, first marker, last marker) for each passage to collect.
    passages = (
        ('http://www.olin.edu/academic-life/student-affairs-resources/student-life/honor-code/',
         'The Olin Honor Code Values', 'Quick Links'),
        ('https://en.wikipedia.org/wiki/Felony',
         'Broadly, felonies', 'are the least serious'),
        ('http://www.olin.edu',
         'At Olin', 'institutions.'),
    )
    for url, first_marker, last_marker in passages:
        words_from_internet(link=url, start=first_marker, stop=last_marker)

    # Fill the shared successor table from everything gathered above, then
    # print one generated sentence.
    markov(megalist)
    print(smushit(markovdict, megalist))


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main_important_part()