Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,27 @@
# TextMining
### Long German Words (lange deutsche Wörter)

This is the base repo for the text mining and analysis project for Software Design at Olin College.
### What it Does
To English speakers, very long German words often look silly. This software
analyzes some of the works of Johann Wolfgang von Goethe. It uses the texts
from Project Gutenberg (in German) to find the three longest words in each
of several of his works.

### How to Use It
Install the requests library before using (pickle is part of Python's standard library). Run
"textmining_requests.py" to get the text files from Project Gutenberg.
Then, run "textmining_processing.py" to find the longest German words!
They will print in the terminal.

### Progress
This project was originally written in Week 6 of Software Design, and I
reworked it in Week 13. It is now written in an object-oriented style and
has some updated text mining processes.

### Some long words found
- freundschaftliche - friendly
- Allerdurchlauchtigster - most gracious/serene
- unwahrscheinlichem - unlikely
- zusammenschrumpfte - shriveled up
- Freundschaftsbezeigungen - demonstrations of friendship
- durcheinandergeschüttelt - agitated
- Amtschreiberstochter - office clerk's daughter
69 changes: 69 additions & 0 deletions classText.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
Sarah Barden
This is a class that creates Text objects for my text mining project. It
includes two methods called cleanText and longestWords.
"""
import pickle


class Text:
    """One Project Gutenberg work: a title, an author, and the path to a
    pickled text file (as produced by textmining_requests.py).
    """

    def __init__(self, title, author, fileName):
        self.title = title        # title of the work
        self.author = author      # author's full name
        self.fileName = fileName  # path to the pickled Gutenberg text

    def __str__(self):
        return '{} by {}'.format(self.title, self.author)

    def cleanText(self):
        """
        Load the pickled Gutenberg text, replace punctuation with spaces,
        and cut off the Project Gutenberg header and footer.

        Returns the cleaned text as a single string.
        """
        # 'with' guarantees the file is closed even if unpickling fails.
        with open(self.fileName, 'rb') as inputFile:
            text = pickle.load(inputFile)

        # Replace unwanted punctuation with spaces in one C-level pass.
        # (The original list also held '--', but replacing '-' first makes
        # a separate '--' entry redundant.)
        punctuation = ",;.-!?)('@*"
        text = text.translate(str.maketrans(punctuation, ' ' * len(punctuation)))

        # Project Gutenberg files carry a long header and footer in every
        # file. Find the end of the header and the start of the footer and
        # keep only the text between them.
        start = text.find('START OF THIS PROJECT GUTENBERG')
        if start == -1:
            older = text.find('END THE SMALL PRINT')
            # 150 characters past the marker is approximately the header's
            # end; if neither marker exists, keep the whole beginning.
            start = older + 150 if older != -1 else 0

        end = text.find('END OF THIS PROJECT GUTENBERG')
        if end == -1:
            end = text.find('Ende dieses')  # ending statement in German
        if end == -1:
            end = len(text)  # no footer marker found: keep text to the end

        return text[start:end]

    def longestWords(self, number):
        """
        Find the longest words in the cleaned text.

        number: how many words to return.
        Returns a list of (length, word) tuples sorted longest-first; ties
        are broken by reverse lexicographic order (plain tuple comparison).
        """
        words = [(len(word), word) for word in self.cleanText().split()]
        words.sort(reverse=True)
        return words[:number]


if __name__ == '__main__':
    # Nothing to run when executed directly; this module only defines Text.
    pass
45 changes: 45 additions & 0 deletions textmining_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
Sarah Barden
This script is the final processing for my text mining project. It initializes
seven different works by Goethe as Text objects (see classText.py) and finds
the three longest words in each.
"""

from classText import Text

# Goethe's works to analyze: each Text pairs a title with its pickled file.
author = 'Johann Wolfgang von Goethe'
geschwister = Text('Die Geschwister', author, 'geschwister.txt')
berlichingen = Text('Götz von Berlichingen', author, 'berlichingen.txt')
iphigenie = Text('Iphigenie auf Tauris', author, 'iphigenie.txt')
reinekefuchs = Text('Reineke Fuchs', author, 'reinekefuchs.txt')
werther1 = Text('Die Leiden des jungen Werthers 1', author, 'werther1.txt')
werther2 = Text('Die Leiden des jungen Werthers 2', author, 'werther2.txt')

works = [geschwister, berlichingen, iphigenie,
         reinekefuchs, werther1, werther2]


def analyze(text):
    """
    Find the three longest words in a single work. Takes a Text object and
    returns a list of just the word strings, longest first.
    """
    top_three = text.longestWords(3)
    return [word for _, word in top_three]


def analyzeAll(works):
    """
    Run analyze on every work in a list of Text objects. Returns one list
    of the three longest word strings per work.
    """
    return list(map(analyze, works))


# Run the full analysis on all six works initialized above and print each
# of the longest words in the console.
final = analyzeAll(works)
for longest in final:
    for entry in longest:
        print(entry)
36 changes: 36 additions & 0 deletions textmining_requests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Sarah Barden
Requests project gutenberg pages. There are six requests for Goethe's works.
"""
import pickle
import requests

geschwister = requests.get('http://www.gutenberg.org/cache/epub/2406/pg2406.txt').text
f = open('geschwister.txt', 'wb')
pickle.dump(geschwister, f)
f.close()

berlichingen = requests.get('http://www.gutenberg.org/cache/epub/2321/pg2321.txt').text
f = open('berlichingen.txt', 'wb')
pickle.dump(berlichingen, f)
f.close()

reinekefuchs = requests.get('http://www.gutenberg.org/cache/epub/2228/pg2228.txt').text
f = open('reinekefuchs.txt', 'wb')
pickle.dump(reinekefuchs, f)
f.close()

iphigenie = requests.get('http://www.gutenberg.org/cache/epub/2054/pg2054.txt').text
f = open('iphigenie.txt', 'wb')
pickle.dump(iphigenie, f)
f.close()

werther1 = requests.get('http://www.gutenberg.org/cache/epub/2407/pg2407.txt').text
f = open('werther1.txt', 'wb')
pickle.dump(werther1, f)
f.close()

werther2 = requests.get('http://www.gutenberg.org/cache/epub/2408/pg2408.txt').text
f = open('werther2.txt', 'wb')
pickle.dump(werther2, f)
f.close()