Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added berlichingen.txt
Binary file not shown.
Binary file added geschwister.txt
Binary file not shown.
Binary file added iphigenie.txt
Binary file not shown.
Binary file added reinekefuchs.txt
Binary file not shown.
60 changes: 60 additions & 0 deletions textmining_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@

import pickle
import string

"""
Find the longest words in a pickled text file.
"""
# Known input files, grouped by kind. clean_text() picks the marker that
# starts the actual work according to the group a file name belongs to.
plays = ['geschwister.txt', 'berlichingen.txt', 'iphigenie.txt']
prose = ['reinekefuchs.txt']
book = ['werther1.txt', 'werther2.txt']


def clean_text(file_name):
    """Load a pickled text, cut off its preamble and strip punctuation.

    The file is expected to contain a single pickled ``str``. Everything
    before the first occurrence of a group-specific marker word is
    discarded; the marker depends on whether ``file_name`` is listed in
    ``plays``, ``prose`` or ``book``. If the file name belongs to no group,
    or the marker does not occur in the text, the whole text is kept.

    Args:
        file_name: Path of the pickled text file. It is compared verbatim
            against the entries of ``plays``, ``prose`` and ``book``.

    Returns:
        The trimmed text with ``, . ! ? ( ) '`` removed and ``--`` replaced
        by a single space.

    Side effects:
        Prints ``file_name`` to stdout.
    """
    # NOTE(review): unpickling can execute arbitrary code. Only call this
    # on files you created yourself, never on files from an untrusted source.
    with open(file_name, 'rb') as input_file:
        text = pickle.load(input_file)

    if file_name in plays:
        marker = 'Personen'  # beginning of play
    elif file_name in prose:
        marker = 'Inhalt'  # beginning of prose
    elif file_name in book:
        marker = 'Ausgabe'  # beginning of book
    else:
        marker = None  # unknown file: nothing to trim

    if marker is not None:
        start = text.find(marker)
        # find() returns -1 when the marker is absent; slicing with -1
        # would keep only the last character, so leave the text untouched.
        if start != -1:
            text = text[start:]

    # Applied in this exact order: '--' is replaced after ',' and '.' are
    # removed but before the remaining punctuation is.
    replacements = (
        (',', ''),
        ('.', ''),
        ('--', ' '),
        ('!', ''),
        ('?', ''),
        (')', ''),
        ('(', ''),
        ("'", ''),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    print(file_name)
    return text


def longest_words(text, count=4):
    """Return the longest distinct words of *text*.

    The text is split on whitespace. Each distinct word is paired with its
    length and the pairs are ordered longest first; words of equal length
    are ordered in descending string order.

    Args:
        text: The text to analyse.
        count: Maximum number of entries to return (default 4).

    Returns:
        A list of at most ``count`` ``(length, word)`` tuples, longest
        first. The list is empty when ``text`` contains no words.
    """
    # A set removes repeated occurrences so one frequent long word cannot
    # fill every slot of the result.
    unique_pairs = {(len(word), word) for word in text.split()}
    return sorted(unique_pairs, reverse=True)[:count]


def final_analysis(text):
    """Print the longest words found in one pickled text file.

    Args:
        text: Despite the name, this is the file name that is passed on to
            ``clean_text``; the cleaned text is then handed to
            ``longest_words``.

    Returns:
        None. Every entry returned by ``longest_words`` is printed on its
        own line.
    """
    cleaned = clean_text(text)
    for entry in longest_words(cleaned):
        print(entry)


# Script entry: analyse one text per run. Exactly one call is active; the
# commented-out calls are the other available input files -- swap the
# comment marker to analyse a different one.
# final_analysis('berlichingen.txt')
final_analysis('geschwister.txt')
# final_analysis('iphigenie.txt')
# final_analysis('reinekefuchs.txt')
# final_analysis('werther1.txt')
# final_analysis('werther2.txt')
32 changes: 32 additions & 0 deletions textmining_requests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pickle
import requests

def _download_and_pickle(url, file_name):
    """Download the text at *url* and store it pickled in *file_name*.

    Args:
        url: Address of the plain-text resource to fetch.
        file_name: Path of the file the decoded text is pickled into; an
            existing file is overwritten.

    Returns:
        The downloaded text as ``str``.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an error status.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    # Without this check an HTTP error page would be pickled as the book.
    response.raise_for_status()
    text = response.text
    # The context manager closes the file even if pickling fails.
    with open(file_name, 'wb') as f:
        pickle.dump(text, f)
    return text


geschwister = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2406/pg2406.txt', 'geschwister.txt')

berlichingen = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2321/pg2321.txt', 'berlichingen.txt')

reinekefuchs = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2228/pg2228.txt', 'reinekefuchs.txt')

iphigenie = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2054/pg2054.txt', 'iphigenie.txt')

werther1 = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2407/pg2407.txt', 'werther1.txt')

werther2 = _download_and_pickle(
    'http://www.gutenberg.org/cache/epub/2408/pg2408.txt', 'werther2.txt')
Binary file added werther1.txt
Binary file not shown.
Binary file added werther2.txt
Binary file not shown.