diff --git a/README.md b/README.md
index 8cce527..453bc3b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,39 @@
-# TextMining
+### Long German Words (lange deutsche Wörter)
 
-This is the base repo for the text mining and analysis project for Software Design at Olin College.
+### What it Does
+To English speakers, very long German words often look silly. This software
+analyzes some of the works of Johann Wolfgang von Goethe. It uses the German
+texts from Project Gutenberg to find the three longest words in each of six
+of his works.
+
+### How to Use It
+Install the requests library first (pickle is part of the Python standard
+library). Run "textmining_requests.py" to download the text files from
+Project Gutenberg, then run "textmining_processing.py" to find the longest
+German words. They print in the terminal; see the sketch at the end of this README.
+
+### Progress
+This project was originally written in Week 6 of Software Design, and I
+reworked it in Week 13. It is now written in an object-oriented style and
+has some updated text mining processes.
+
+### Some long words found
+- freundschaftliche - friendly
+- Allerdurchlauchtigster - most gracious/serene
+- unwahrscheinlichem - unlikely
+- zusammenschrumpfte - shriveled up
+- Freundschaftsbezeigungen - demonstrations of friendship
+- durcheinandergeschüttelt - agitated
+- Amtschreiberstochter - office clerk's daughter
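+
+### Usage Sketch
+A minimal sketch of using the Text class directly on one work (it assumes
+"geschwister.txt" has already been downloaded by "textmining_requests.py"):
+
+```python
+from classText import Text
+
+werk = Text('Die Geschwister', 'Johann Wolfgang von Goethe', 'geschwister.txt')
+print(werk)                  # Die Geschwister by Johann Wolfgang von Goethe
+print(werk.longestWords(3))  # the three longest (length, word) tuples
+```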
diff --git a/classText.py b/classText.py
new file mode 100644
index 0000000..293b046
--- /dev/null
+++ b/classText.py
@@ -0,0 +1,69 @@
+"""
+Sarah Barden
+This module defines a Text class for my text mining project. It includes
+two methods, cleanText and longestWords.
+"""
+import pickle
+
+
+class Text:
+
+    def __init__(self, title, author, fileName):
+        self.title = title
+        self.author = author
+        self.fileName = fileName
+
+    def __str__(self):
+        string = ''
+        string += '{} by {}'.format(self.title, self.author)
+        return string
+
+    def cleanText(self):
+        """
+        Loads the pickled Gutenberg text and cleans it, removing punctuation
+        and the Project Gutenberg header/footer. Returns the text as a string.
+        """
+        with open(self.fileName, 'rb') as inputFile:
+            text = pickle.load(inputFile)
+
+        # removing unwanted punctuation
+        punctuation = [',', ';', '.', '-', '--', '!', '?', ')', '(', "'", '@', '*']
+        for mark in punctuation:
+            text = text.replace(mark, ' ')
+
+        # Project Gutenberg text files have a long footer and header in every
+        # file. The following finds the end of the header and the start of the
+        # footer and removes those sections.
+        start = text.find('START OF THIS PROJECT GUTENBERG')
+        if start == -1:
+            start = text.find('END THE SMALL PRINT') + 150  # approx end of the header text
+
+        end = text.find('END OF THIS PROJECT GUTENBERG')
+        if end == -1:
+            end = text.find('Ende dieses')  # ending statement in German
+
+        text = text[start:end]  # Cut the text to remove footer/header
+        return text
+
+    def longestWords(self, number):
+        """
+        Finds the longest words in the cleaned text, where the number of words
+        is given by the input. Returns a list of tuples, where each tuple is
+        the length of a word and the word itself as a string.
+        """
+
+        text = self.cleanText()
+        text = text.split()
+        words = []
+        # After splitting the whole text into a list of words, this sorts them
+        # all by length, from longest to shortest.
+        for word in text:
+            length = len(word)
+            words.append((length, word))
+        words.sort(reverse=True)
+        top = words[0:number]
+        return top
+
+
+if __name__ == '__main__':
+    pass
diff --git a/textmining_processing.py b/textmining_processing.py
new file mode 100644
index 0000000..23dfdf3
--- /dev/null
+++ b/textmining_processing.py
@@ -0,0 +1,45 @@
+"""
+Sarah Barden
+This script is the final processing for my text mining project. It initializes
+six different works by Goethe as Text objects (see classText.py) and finds
+the three longest words in each.
+"""
+
+from classText import Text
+
+geschwister = Text('Die Geschwister', 'Johann Wolfgang von Goethe', 'geschwister.txt')
+berlichingen = Text('Götz von Berlichingen', 'Johann Wolfgang von Goethe', 'berlichingen.txt')
+iphigenie = Text('Iphigenie auf Tauris', 'Johann Wolfgang von Goethe', 'iphigenie.txt')
+reinekefuchs = Text('Reineke Fuchs', 'Johann Wolfgang von Goethe', 'reinekefuchs.txt')
+werther1 = Text('Die Leiden des jungen Werthers 1', 'Johann Wolfgang von Goethe', 'werther1.txt')
+werther2 = Text('Die Leiden des jungen Werthers 2', 'Johann Wolfgang von Goethe', 'werther2.txt')
+
+works = [geschwister, berlichingen, iphigenie, reinekefuchs,
+         werther1, werther2]
+
+
+def analyze(text):
+    """
+    Find the longest three words in a single text. Takes a Text object as input
+    and outputs a list of the three words (as strings, without their lengths).
+    """
+    wordsAndLengths = text.longestWords(3)
+    wordsOnly = [word[1] for word in wordsAndLengths]
+    return wordsOnly
+
+
+def analyzeAll(works):
+    """
+    Finds the longest three words for each work in a list of multiple works.
+    Takes in a list of Text objects and outputs one list of words per work.
+    """
+    result = [analyze(work) for work in works]
+    return result
+
+
+# running full analysis on all six works initialized above. Prints each word
+# in the console.
+final = analyzeAll(works)
+for work in final:
+    for word in work:
+        print(word)
diff --git a/textmining_requests.py b/textmining_requests.py
new file mode 100644
index 0000000..c875b1c
--- /dev/null
+++ b/textmining_requests.py
@@ -0,0 +1,36 @@
+"""
+Sarah Barden
+Requests Project Gutenberg pages. There are six requests, one per Goethe work.
+"""
+import pickle
+import requests
+
+geschwister = requests.get('http://www.gutenberg.org/cache/epub/2406/pg2406.txt').text
+f = open('geschwister.txt', 'wb')
+pickle.dump(geschwister, f)
+f.close()
+
+berlichingen = requests.get('http://www.gutenberg.org/cache/epub/2321/pg2321.txt').text
+f = open('berlichingen.txt', 'wb')
+pickle.dump(berlichingen, f)
+f.close()
+
+reinekefuchs = requests.get('http://www.gutenberg.org/cache/epub/2228/pg2228.txt').text
+f = open('reinekefuchs.txt', 'wb')
+pickle.dump(reinekefuchs, f)
+f.close()
+
+iphigenie = requests.get('http://www.gutenberg.org/cache/epub/2054/pg2054.txt').text
+f = open('iphigenie.txt', 'wb')
+pickle.dump(iphigenie, f)
+f.close()
+
+werther1 = requests.get('http://www.gutenberg.org/cache/epub/2407/pg2407.txt').text
+f = open('werther1.txt', 'wb')
+pickle.dump(werther1, f)
+f.close()
+
+werther2 = requests.get('http://www.gutenberg.org/cache/epub/2408/pg2408.txt').text
+f = open('werther2.txt', 'wb')
+pickle.dump(werther2, f)
+f.close()