# Recovered from a collapsed git patch. The same patch also adds four binary
# data files next to this script: berlichingen.txt, geschwister.txt,
# iphigenie.txt and reinekefuchs.txt.
#
# File: textmining_processing.py
"""Finding the longest word."""

import pickle
import string

# File names grouped by kind of work; the kind decides which marker word is
# used to locate the beginning of the actual text.
plays = ['geschwister.txt', 'berlichingen.txt', 'iphigenie.txt']
prose = ['reinekefuchs.txt']
book = ['werther1.txt', 'werther2.txt']

# (old, new) substitutions, applied strictly in this order. Order matters:
# ',' and '.' are removed before '--' is turned into a space, so a sequence
# such as '-,-' collapses to '--' first and is then replaced as well.
_REPLACEMENTS = (
    (',', ''),
    ('.', ''),
    ('--', ' '),
    ('!', ''),
    ('?', ''),
    (')', ''),
    ('(', ''),
    ("'", ''),
)


def _start_marker(file_name):
    """Return the marker word for *file_name*, or None for unknown files."""
    if file_name in plays:
        return 'Personen'  # beginning of play
    if file_name in prose:
        return 'Inhalt'  # beginning of prose
    if file_name in book:
        return 'Ausgabe'  # beginning of book
    return None


def clean_text(file_name):
    """Load a pickled text and strip its preamble and punctuation.

    The file is expected to contain a single pickled ``str``. Everything
    before the first occurrence of the marker word for this file (see
    ``plays``, ``prose`` and ``book``) is dropped. If the file is not listed
    in any of those groups, or the marker does not occur, the whole text is
    kept. Finally the characters ``, . ! ? ( ) '`` are removed and ``--`` is
    replaced by a single space.

    Args:
        file_name: Path of the pickled text file. Group membership is checked
            against this exact string.

    Returns:
        The cleaned text.

    Side effects:
        Prints *file_name* to stdout.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # unpickling. Only use this on files produced locally by the companion
    # download script, never on files from an untrusted source.
    with open(file_name, 'rb') as input_file:
        text = pickle.load(input_file)

    marker = _start_marker(file_name)
    # str.find returns -1 when the marker is missing; slicing with -1 would
    # keep only the last character, so only slice on a real, positive hit.
    start = text.find(marker) if marker is not None else -1
    if start > 0:
        text = text[start:]

    for old, new in _REPLACEMENTS:
        text = text.replace(old, new)

    print(file_name)
    return text


def longest_words(text):
    """Return the four longest whitespace-separated words of *text*.

    Args:
        text: The text to analyse.

    Returns:
        A list of up to four ``(length, word)`` tuples in descending order.
        Ties in length are ordered by descending word. A word that occurs
        several times in *text* appears several times in the result.

    Side effects:
        Prints the complete sorted list of ``(length, word)`` tuples.
    """
    words = sorted(((len(word), word) for word in text.split()), reverse=True)
    print(words)
    return words[:4]


def final_analysis(text):
    """Clean one pickled text file and print its four longest words.

    Args:
        text: Despite its name, this is the *file name* of the pickled text;
            it is passed straight to ``clean_text``.
    """
    for pair in longest_words(clean_text(text)):
        print(pair)


# final_analysis('berlichingen.txt')
# Remaining top-level calls of textmining_processing.py; swap the comment
# markers to analyse a different text.
final_analysis('geschwister.txt')
# final_analysis('iphigenie.txt')
# final_analysis('reinekefuchs.txt')
# final_analysis('werther1.txt')
# final_analysis('werther2.txt')


# ---------------------------------------------------------------------------
# File: textmining_requests.py
#
# Downloads six texts from Project Gutenberg and stores each one as a pickled
# str in a local file. The same patch also adds the resulting binary data
# files werther1.txt and werther2.txt.
# ---------------------------------------------------------------------------
import pickle
import requests


def _download_and_pickle(url, file_name):
    """Fetch *url* and store the response text, pickled, in *file_name*.

    Args:
        url: Address of the plain-text document to download.
        file_name: Path of the file that receives the pickled text.

    Returns:
        The downloaded text.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that an error page is never pickled in place of the text.
        requests.Timeout: If the server does not respond in time.
    """
    # Without a timeout a stalled connection would hang the script forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    text = response.text
    # The context manager closes the file even if pickling fails.
    with open(file_name, 'wb') as f:
        pickle.dump(text, f)
    return text


geschwister = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2406/pg2406.txt',
    'geschwister.txt',
)

berlichingen = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2321/pg2321.txt',
    'berlichingen.txt',
)

reinekefuchs = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2228/pg2228.txt',
    'reinekefuchs.txt',
)

iphigenie = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2054/pg2054.txt',
    'iphigenie.txt',
)

werther1 = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2407/pg2407.txt',
    'werther1.txt',
)

werther2 = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2408/pg2408.txt',
    'werther2.txt',
)