Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,27 @@
# TextMining
### Long German Words (lange deutsche Wörter)

This is the base repo for the text mining and analysis project for Software Design at Olin College.
### What it Does
To English speakers, very long German words often look silly. This software
analyzes some of the works of Johann Wolfgang von Goethe. It uses the texts
from Project Gutenberg (in German) to find the three longest words in each
of several of his works.

### How to Use It
Install the requests library before using (pickle is part of Python's standard library). Run
"textmining_requests.py" to get the text files from Project Gutenberg.
Then, run "textmining_processing.py" to find the longest German words!
They will print in the terminal.

### Progress
This project was originally written in Week 6 of Software Design, and I
reworked it in Week 13. It is now written in an object-oriented style and
has some updated text mining processes.

### Some long words found
- freundschaftliche - friendly
- Allerdurchlauchtigster - most gracious/serene
- unwahrscheinlichem - unlikely
- zusammenschrumpfte - shriveled up
- Freundschaftsbezeigungen - demonstrations of friendship
- durcheinandergeschüttelt - agitated
- Amtschreiberstochter - office clerk's daughter
69 changes: 69 additions & 0 deletions classText.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
Sarah Barden
This is a class that creates Text objects for my text mining project. It
includes two methods called cleanText and longestWords.
"""
import pickle


class Text:
    """One Project Gutenberg work: a title, an author, and the path to a
    pickled text file (as produced by textmining_requests.py).
    """

    def __init__(self, title, author, fileName):
        self.title = title        # title of the work
        self.author = author      # author's full name
        self.fileName = fileName  # path to the pickled Gutenberg text

    def __str__(self):
        return '{} by {}'.format(self.title, self.author)

    def cleanText(self):
        """
        Load the pickled Gutenberg text, replace punctuation with spaces,
        and cut off the Project Gutenberg header and footer.

        Returns the cleaned text as a single string.
        """
        # 'with' guarantees the file is closed even if unpickling fails.
        with open(self.fileName, 'rb') as inputFile:
            text = pickle.load(inputFile)

        # Replace unwanted punctuation with spaces in one C-level pass.
        # (The original list also held '--', but replacing '-' first makes
        # a separate '--' entry redundant.)
        punctuation = ",;.-!?)('@*"
        text = text.translate(str.maketrans(punctuation, ' ' * len(punctuation)))

        # Project Gutenberg files carry a long header and footer in every
        # file. Find the end of the header and the start of the footer and
        # keep only the text between them.
        start = text.find('START OF THIS PROJECT GUTENBERG')
        if start == -1:
            older = text.find('END THE SMALL PRINT')
            # 150 characters past the marker is approximately the header's
            # end; if neither marker exists, keep the whole beginning.
            start = older + 150 if older != -1 else 0

        end = text.find('END OF THIS PROJECT GUTENBERG')
        if end == -1:
            end = text.find('Ende dieses')  # ending statement in German
        if end == -1:
            end = len(text)  # no footer marker found: keep text to the end

        return text[start:end]

    def longestWords(self, number):
        """
        Find the longest words in the cleaned text.

        number: how many words to return.
        Returns a list of (length, word) tuples sorted longest-first; ties
        are broken by reverse lexicographic order (plain tuple comparison).
        """
        words = [(len(word), word) for word in self.cleanText().split()]
        words.sort(reverse=True)
        return words[:number]


if __name__ == '__main__':
    # Nothing to run when executed directly; this module only defines Text.
    pass
45 changes: 45 additions & 0 deletions textmining_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
Sarah Barden
This script is the final processing for my text mining project. It initializes
seven different works by Goethe as Text objects (see classText.py) and finds
the three longest words in each.
"""

from classText import Text

# Goethe's works to analyze: each Text pairs a title with its pickled file.
author = 'Johann Wolfgang von Goethe'
geschwister = Text('Die Geschwister', author, 'geschwister.txt')
berlichingen = Text('Götz von Berlichingen', author, 'berlichingen.txt')
iphigenie = Text('Iphigenie auf Tauris', author, 'iphigenie.txt')
reinekefuchs = Text('Reineke Fuchs', author, 'reinekefuchs.txt')
werther1 = Text('Die Leiden des jungen Werthers 1', author, 'werther1.txt')
werther2 = Text('Die Leiden des jungen Werthers 2', author, 'werther2.txt')

works = [geschwister, berlichingen, iphigenie,
         reinekefuchs, werther1, werther2]


def analyze(text):
    """
    Find the three longest words in a single work. Takes a Text object and
    returns a list of just the word strings, longest first.
    """
    top_three = text.longestWords(3)
    return [word for _, word in top_three]


def analyzeAll(works):
    """
    Run analyze on every work in a list of Text objects. Returns one list
    of the three longest word strings per work.
    """
    return list(map(analyze, works))


# Run the full analysis on all six works initialized above and print each
# of the longest words in the console.
final = analyzeAll(works)
for longest in final:
    for entry in longest:
        print(entry)
36 changes: 36 additions & 0 deletions textmining_requests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Sarah Barden
Requests project gutenberg pages. There are six requests for Goethe's works.
"""
import pickle
import requests

geschwister = requests.get('http://www.gutenberg.org/cache/epub/2406/pg2406.txt').text
f = open('geschwister.txt', 'wb')
pickle.dump(geschwister, f)
f.close()

berlichingen = requests.get('http://www.gutenberg.org/cache/epub/2321/pg2321.txt').text
f = open('berlichingen.txt', 'wb')
pickle.dump(berlichingen, f)
f.close()

reinekefuchs = requests.get('http://www.gutenberg.org/cache/epub/2228/pg2228.txt').text
f = open('reinekefuchs.txt', 'wb')
pickle.dump(reinekefuchs, f)
f.close()

iphigenie = requests.get('http://www.gutenberg.org/cache/epub/2054/pg2054.txt').text
f = open('iphigenie.txt', 'wb')
pickle.dump(iphigenie, f)
f.close()

werther1 = requests.get('http://www.gutenberg.org/cache/epub/2407/pg2407.txt').text
f = open('werther1.txt', 'wb')
pickle.dump(werther1, f)
f.close()

werther2 = requests.get('http://www.gutenberg.org/cache/epub/2408/pg2408.txt').text
f = open('werther2.txt', 'wb')
pickle.dump(werther2, f)
f.close()