2 changes: 2 additions & 0 deletions .gitignore
@@ -94,3 +94,5 @@ ENV/
.ropeproject

# End of https://www.gitignore.io/api/python

*.txt
31 changes: 31 additions & 0 deletions bad_words.py
@@ -0,0 +1,31 @@

def bad_words():
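    """Write the stopword list to bad_words.txt as one space-separated line."""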
    list_bad_words = ['george', 'washington', 'adams', 'thomas', 'washingtons',
'jefferson', 'madison', 'james', 'monroe', 'quincy', 'jacksons',
'adams', 'andrew', 'jackson','martin', 'buren', 'tylers',
'henry', 'harrison', 'tyler', 'polk', 'zachary', 'polks', 'taylors',
'taylor', 'millard', 'fillmore', 'pierce', 'fillmores', 'pierces',
'buchanan', 'abraham', 'lincoln', 'andrew', 'johnson', 'ulysses',
'grant','rutherford', 'hayes', 'garfield', 'chester', 'johnsons',
'arthur','grover', 'cleveland', 'benjamin', 'harrison', 'william',
'mckinley','theodore', 'roosevelt','howard', 'grants', 'hayess',
'woodrow', 'wilson','warren', 'harding', 'calvin', 'coolidge',
        'herbert', 'hoover', 'franklin', 'harry', 'truman', 'dwight', 'arthurs',
        'eisenhower', 'kennedy', 'lyndon', 'johnson', 'richard', 'nixon',
'gerald', 'ford','jimmy', 'carter', 'ronald', 'reagan', 'clevelands',
'bill', 'clinton', 'barack', 'obama', 'donald', 'trump', 'trumps',
'president', 'william', 'would', 'which', 'years', 'zachary', 'presidential',
'though', 'while', 'because', 'harrisons', 'mckinleys', 'hardings',
'eisenhowers', 'kennedys', 'johnsons', 'carters', 'reagans', 'clintons',
'state', 'states', 'wilsons', 'after', 'trumans', 'roosevelts', 'garfields',
'coolidges', 'burens', 'lincolns', 'buchanans', 'nixons', 'fords', 'during',
'their', 'united', 'house', 'later', 'american', 'republican', 'jeffersons'
]
    # Write the stopwords to disk as one space-separated line.
    with open('bad_words.txt', 'w') as f:
        for word in list_bad_words:
            f.write(word + ' ')


if __name__ == '__main__':
bad_words()
30 changes: 30 additions & 0 deletions pages.py
@@ -0,0 +1,30 @@
import wikipedia

def presidents_to_texts():
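    """Download the full Wikipedia article for each president and save it to '<name>.txt'."""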
presidents = ['George Washington', 'John Adams', 'Thomas Jefferson',
'James Madison', 'James Monroe', 'John Quincy Adams', 'Andrew Jackson',
'Martin Van Buren', 'William Henry Harrison', 'John Tyler',
'James K. Polk', 'Zachary Taylor', 'Millard Fillmore', 'Franklin Pierce',
'James Buchanan', 'Abraham Lincoln', 'Andrew Johnson', 'Ulysses S. Grant',
'Rutherford B. Hayes', 'James A. Garfield', 'Chester Arthur',
'Grover Cleveland', 'Benjamin Harrison', 'William McKinley',
'Theodore Roosevelt', 'William Howard Taft', 'Woodrow Wilson',
'Warren G. Harding', 'Calvin Coolidge', 'Herbert Hoover',
'Franklin D. Roosevelt', 'Harry S. Truman', 'Dwight D. Eisenhower',
'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon', 'Gerald Ford',
'Jimmy Carter', 'Ronald Reagan', 'George H. W. Bush', 'Bill Clinton',
'George W. Bush', 'Barack Obama', 'Donald Trump'
]
for name in presidents:
page = wikipedia.page(name)
# text = page.summary
text = page.content
with open(name + '.txt', 'w') as f:
f.write(text)

# def format_filename(president_name):
#     return ('/tmp/president_data/' +
#             '_'.join(president_name.lower().split(' ')) + '.txt')

if __name__ == '__main__':
presidents_to_texts()
22 changes: 22 additions & 0 deletions project_writeup.md
@@ -0,0 +1,22 @@
Kerry McConnaughay

Project 3 Writeup and Reflection

I used Wikipedia as my data source and saved the page for each President of the United States. To analyze the word frequency of the pages, I chained several functions together. I wanted to determine the most frequent word on each page so that I could relate that word to trends from the corresponding President’s time period. I was interested in learning whether there is a correlation between the major issues of the time and the most frequent word in the article.

In choosing how to implement my system, I needed several basic functions before the system could actually analyze word frequency. I also had to split the lines of text from the Wikipedia articles into individual words and then format those words to strip punctuation and whitespace. To avoid querying Wikipedia repeatedly for every page I needed, I fetched each page once in a separate file and saved the online articles as local text files. My main file then reads those text files to get the words it needs. In short, one file fetches the specified Wikipedia pages, which are converted to text and then run through a number of functions; a sketch of that pipeline follows below.
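Here is a minimal sketch of that pipeline, assuming the third-party `wikipedia` package is installed; `fetch_and_tokenize` is a hypothetical helper, since the real code splits these steps across `pages.py` and `textmining.py`:

```python
import string
import wikipedia

def fetch_and_tokenize(title):
    """Fetch one Wikipedia page, cache it locally, and return cleaned words."""
    page = wikipedia.page(title)           # one network request per page
    with open(title + '.txt', 'w') as f:   # cache to disk for later runs
        f.write(page.content)
    # Lowercase each word and strip punctuation, as format_word() does.
    table = str.maketrans('', '', string.punctuation)
    return [word.translate(table).lower() for word in page.content.split()]
```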

When I started running the code, I realized that it would return frequent words such as “the” or “and” from the articles. Because such words carry little meaning on their own, I had to choose among several ways of excluding them. I considered making separate lists of conjunctions, helping verbs, pronouns, and other unimportant short words, but decided against multiple lists because maintaining them is a large amount of work and increases the likelihood of mistakes in the bodies of the functions. Initially, I kept one list of “bad words” inside the function, but every time I ran the program, more undesired “bad words” surfaced. That was also inefficient, so I moved the unwanted words into a separate text file that the function reads; words found in the file are skipped and never added to the dictionary. I also drew on the list of presidents’ names that I use to search Wikipedia, to make sure the word counter would not report a president’s name as the most frequent word. Additionally, I added a length condition that excludes words shorter than five characters from the dictionary, since such short words are typically not very interesting. There are exceptions to that rule, of course, but I chose to sacrifice some words. This final approach was better than the alternatives because it kept the code neater and made it easy to add words to the list of “bad words.” A condensed sketch of the filter appears below.
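This is a condensed sketch of that filter, assuming a space-separated stopword file; the parameter names here are illustrative, and the project’s own versions live in `word_frequency()` and `most_frequent()`:

```python
def most_frequent_word(words, stopword_file='bad_words.txt', min_length=5):
    """Return the most frequent word, ignoring stopwords and short words."""
    with open(stopword_file) as f:
        stopwords = set(f.read().split())   # a set gives O(1) membership tests
    counts = {}
    for word in words:
        if word in stopwords or len(word) < min_length:
            continue
        counts[word] = counts.get(word, 0) + 1
    return max(counts, key=counts.get)
```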

One other important consideration was whether to read the entire content of each Wikipedia page or just the summary of each President’s profile. To analyze more data, I chose the entire page, even though some sections are not strictly about a President’s life or history.
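The `wikipedia` package exposes both options as page attributes; for example:

```python
import wikipedia

page = wikipedia.page('James Madison')
short_text = page.summary   # the lead section only
full_text = page.content    # the full article, ancillary sections included
```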

After doing some text analysis, I saw that the most frequent words often correlated with the issue or topic each President is best known for. For example, “constitution” is the most frequent word on the Wikipedia page for James Madison, and he is known as the “Father of the Constitution” because he helped draft and promote the United States Constitution and the Bill of Rights. I thought it was interesting that the Wikipedia articles reflected the key events and topics people usually associate with each president. Repetition of those words reinforces the automatic association that matches, say, George Washington to his role in fighting the British for American independence. Such associations are helpful for memorizing a rough timeline of the presidents and their contributions to American history. The table below lists some of the most interesting words and their relation to certain political topics and events.

It is also interesting that some of the words do not match up with the actions a president is best remembered for. Jimmy Carter, for example, had a very rich political history in Georgia before he became President of the United States. Although individuals may not see that as his most important contribution, that part of his history certainly influenced him and gave him the experience necessary to be President.

![table](table.jpeg)

Looking at the overall process of the project, I am happy with the progress I made. After reading the initial notes on the Mini Project, I was a little lost, so I took notes on paper and wrote down some ideas before going to my terminal. Once I decided on Wikipedia and the Presidents, I started thinking about how to analyze the text using word frequency. Writing out the basic steps of such a program on paper was very helpful overall; I think it saved time I would otherwise have wasted writing bad code. Another part that went well was that I fully understood everything I needed to write for the program to work the way I wanted. By storing text as separate files and reading those files back, I expanded my ability to pull data from other sources. Although I have had difficulties writing code with clear doctests and explanations of what the functions should be doing, I was able to write the tests and comments myself.

One thing that could be improved is the time spent on the project. Had I started a little earlier, I would have been able to explore more and possibly gain a deeper understanding of analysis and processing techniques. It would also have been better if I could have done some more challenging analysis of the texts. For example, I considered returning multiple frequent words and sorting them into categories, but I was unable to do this. The scope of this project was appropriate for me; I struggled with writing some of the code, but in the end the program was successful after I spent enough time debugging and doing outside research. Going forward, I will continue to increase my use of unit testing, which is very useful, and plan out the code I need on paper before starting to write it in Atom.


177 changes: 177 additions & 0 deletions revised_textmining.py
@@ -0,0 +1,177 @@
import wikipedia
import string
import unittest

class Text():

def __init__(self):
self.presidents = [
'George Washington',
'John Adams',
'Thomas Jefferson',
'James Madison',
'James Monroe',
'John Quincy Adams',
'Andrew Jackson',
'Martin Van Buren',
'William Henry Harrison',
'John Tyler',
'James K. Polk',
'Zachary Taylor',
'Millard Fillmore',
'Franklin Pierce',
'James Buchanan',
'Abraham Lincoln',
'Andrew Johnson',
'Ulysses S. Grant',
'Rutherford B. Hayes',
'James A. Garfield',
'Chester Arthur',
'Grover Cleveland',
'Benjamin Harrison',
'William McKinley',
'Theodore Roosevelt',
'William Howard Taft',
'Woodrow Wilson',
'Warren G. Harding',
'Calvin Coolidge',
'Herbert Hoover',
'Franklin D. Roosevelt',
'Harry S. Truman',
'Dwight D. Eisenhower',
'John F. Kennedy',
'Lyndon B. Johnson',
'Richard Nixon',
'Gerald Ford',
'Jimmy Carter',
'Ronald Reagan',
'George H. W. Bush',
'Bill Clinton',
'George W. Bush',
'Barack Obama',
'Donald Trump']

self.blacklist = ['george', 'washington', 'adams', 'thomas', 'washingtons',
'jefferson', 'madison', 'james', 'monroe', 'quincy', 'jacksons',
'adams', 'andrew', 'jackson','martin', 'buren', 'tylers',
'henry', 'harrison', 'tyler', 'polk', 'zachary', 'polks', 'taylors',
'taylor', 'millard', 'fillmore', 'pierce', 'fillmores', 'pierces',
'buchanan', 'abraham', 'lincoln', 'andrew', 'johnson', 'ulysses',
'grant','rutherford', 'hayes', 'garfield', 'chester', 'johnsons',
'arthur','grover', 'cleveland', 'benjamin', 'harrison', 'william',
'mckinley','theodore', 'roosevelt','howard', 'grants', 'hayess',
'woodrow', 'wilson','warren', 'harding', 'calvin', 'coolidge',
            'herbert', 'hoover', 'franklin', 'harry', 'truman', 'dwight', 'arthurs',
            'eisenhower', 'kennedy', 'lyndon', 'johnson', 'richard', 'nixon',
'gerald', 'ford','jimmy', 'carter', 'ronald', 'reagan', 'clevelands',
'bill', 'clinton', 'barack', 'obama', 'donald', 'trump', 'trumps',
'president', 'william', 'would', 'which', 'years', 'zachary', 'presidential',
'though', 'while', 'because', 'harrisons', 'mckinleys', 'hardings',
'eisenhowers', 'kennedys', 'johnsons', 'carters', 'reagans', 'clintons',
'state', 'states', 'wilsons', 'after', 'trumans', 'roosevelts', 'garfields',
'coolidges', 'burens', 'lincolns', 'buchanans', 'nixons', 'fords', 'during',
'their', 'united', 'house', 'later', 'american', 'republican', 'jeffersons'
]

def load_file(self, filename):
""" Opens each file and saves the text in a list. """

# Open the file and store it as text
with open(filename, 'r') as f:
text = [line for line in f]

return text

def to_words(self, text):
""" Takes a President text and splits into individual words. """
split_words = []

for line in text:
split = line.split()
split_words.extend(split)

return split_words

def remove_punctuation_and_whitespace(self, word):
""" Excludes characters classified as punctuation and whitespace. """
letters = [
letter.lower() for letter in word
if letter not in string.punctuation
and letter not in string.whitespace]

return ''.join(letters) # Join list of letters into a string

def most_frequent(self, words, blacklist):
"""This function will take all words from the Wikipedia pages and return
the most frequent from each page, with certain exclusions. """

counts = {}
for word in words:
            long_enough = len(word) >= 5
            not_blacklisted = word not in blacklist

            if long_enough and not_blacklisted:
counts[word] = counts.get(word, 0) + 1

return max(counts, key=counts.get)

def main(self):
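        """ Returns the most frequent interesting word for each president's page. """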
most_common_words = []

for filename in self.presidents:
text = self.load_file(filename + '.txt')
words = self.to_words(text)
stripped = [
self.remove_punctuation_and_whitespace(word) for word in words]
mode = self.most_frequent(stripped, self.blacklist)
most_common_words.append(mode)

return most_common_words

class TestText(unittest.TestCase):

def setUp(self):
self.subject = Text()

def test_to_words(self):
# Arrange
lines = ['That makes some sense.', 'What is up?']

# Act
result = self.subject.to_words(lines)

# Assert
expected = ['That', 'makes', 'some', 'sense.', 'What', 'is', 'up?']
self.assertEqual(result, expected)

def test_most_frequent(self):
# Arrange
blacklist = ['hello']
words = [
'then', 'then', 'then',
'hello', 'hello', 'hello', 'hello',
'fires', 'fires',
'huzzah']

# Act
result = self.subject.most_frequent(words, blacklist)

# Assert
expected = 'fires'
self.assertEqual(result, expected)

def test_remove_punctuation_and_whitespace(self):
# Arrange
word = 'W,h!At\'\n '
# Act
result = self.subject.remove_punctuation_and_whitespace(word)

# Assert
expected = 'what'
self.assertEqual(result, expected)


if __name__ == '__main__':
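    # unittest.main() parses sys.argv and exits the process once the tests run.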
t = Text()
print(t.main())
unittest.main()
Binary file added table.jpeg
115 changes: 115 additions & 0 deletions textmining.py
@@ -0,0 +1,115 @@
import wikipedia
import string

def read_presidents_texts():
presidents = ['George Washington', 'John Adams', 'Thomas Jefferson',
'James Madison', 'James Monroe', 'John Quincy Adams', 'Andrew Jackson',
'Martin Van Buren', 'William Henry Harrison', 'John Tyler',
'James K. Polk', 'Zachary Taylor', 'Millard Fillmore', 'Franklin Pierce',
'James Buchanan', 'Abraham Lincoln', 'Andrew Johnson', 'Ulysses S. Grant',
'Rutherford B. Hayes', 'James A. Garfield', 'Chester Arthur',
'Grover Cleveland', 'Benjamin Harrison', 'William McKinley',
'Theodore Roosevelt', 'William Howard Taft', 'Woodrow Wilson',
'Warren G. Harding', 'Calvin Coolidge', 'Herbert Hoover',
'Franklin D. Roosevelt', 'Harry S. Truman', 'Dwight D. Eisenhower',
'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon', 'Gerald Ford',
'Jimmy Carter', 'Ronald Reagan', 'George H. W. Bush', 'Bill Clinton',
'George W. Bush', 'Barack Obama', 'Donald Trump'
]
for name in presidents:
with open(name + '.txt', 'r') as f:
words = make_words(f.readlines())
formatted_words = format_words(words)
frequencies = word_frequency(formatted_words)
print(name)
print(frequencies)


def make_words(president_file_lines):
""" This function takes each President file and splits the lines into words.
>>> make_words(['What do you mean?'])
['What', 'do', 'you', 'mean?']
"""
    split_words = []
    # Use `line`, not `string`, as the loop variable so the imported
    # `string` module is not shadowed.
    for line in president_file_lines:
        split_words.extend(line.split())
    return split_words


def exclude(letter):
"""This function should exclude characters that are punctuation or
whitespace.
>>> exclude(' ')
True
>>> exclude('#')
True
"""
return letter in string.punctuation or letter in string.whitespace


def format_word(word):
"""This function will strip all of the words of punctuation and whitespaces
then make the words lowercase.
>>> format_word('DOG')
'dog'
>>> format_word('!!!')
''
"""
return ''.join(letter for letter in word if not exclude(letter)).lower()


def format_words(words):
"""This function should run all of the words from the Wikipedia pages through
format_word function and return a list of the formatted words.
>>> format_words(['D!oG', 'what', 'CELLAR'])
['dog', 'what', 'cellar']
"""
formatted_words = []
for word in words:
better_word = format_word(word)
formatted_words.append(better_word)
return formatted_words


def word_frequency(formatted_words):
"""This function will take all words from the Wikipedia pages and return
the most frequent from each page, with certain exclusions.
>>> word_frequency(['dogs', 'cat', 'the', 'fires', 'fires'])
'fires'
>>> word_frequency(['tetris', 'tetris', 'and', 'but'])
'tetris'
"""
    # Load the stopword set once so the file is read a single time and
    # membership tests are fast.
    try:
        with open('bad_words.txt', 'r') as f:
            bad_words = set(f.read().split())
    except FileNotFoundError:
        bad_words = set()
    d = {}
    for word in formatted_words:
        if word in bad_words or len(word) < 5:
            continue
        d[word] = d.get(word, 0) + 1
    maximum_frequency = max(d.values())
    word_frequencies = lookup(d, maximum_frequency)
    return word_frequencies


def lookup(d, v):
"""This function should take a value and search for its corresponding key in
the dictionary.
>>> d = {'dog' : 5}
>>> lookup(d, 5)
'dog'
"""
for k in d:
if d[k] == v:
return k
raise ValueError


if __name__ == '__main__':
import doctest
doctest.testmod()
read_presidents_texts()