diff --git a/TextMining.py b/TextMining.py
new file mode 100644
index 0000000..ac50701
--- /dev/null
+++ b/TextMining.py
@@ -0,0 +1,249 @@
+"""
+This is a self-directed project with guidance from the Software Design
+teaching team.
+
+My project is focused on letter frequency across all words of the English
+language compared with the letter frequencies of individual authors.
+
+In addition, the same functions can be used to compare word choice (by way of
+letter frequencies) between two authors.
+
+@author: Colvin Chapman
+
+NLTK package:
+    Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language
+    Processing with Python. O’Reilly Media Inc.
+"""
+import nltk
+import doctest
+import pickle
+import string
+import math
+
+# #nltk.download()
+#
+# unabridged_webster = requests.get(
+#     'http://www.gutenberg.org/cache/epub/29765/pg29765.txt').text
+#
+# pickle.dump('Webster_Dictionary', unabridged_webster,)
+#
+# print(len(unabridged_webster))
+# # print(unabridged_webster[0: 3000])
+
+
+def word(webster_dict, start, end):
+    """Takes in a string of words and returns a list of all the words in the
+    slice of that string between start and end.
+
+    >>> word("I want to go", 1, 12)
+    ['want', 'to', 'go']
+    """
+    return nltk.word_tokenize(webster_dict[start:end])
+
+
+def trim_string(st):
+    r"""Turns a string into a more manageable format: uppercase letters only,
+    with punctuation, digits, whitespace and common typographic marks removed.
+
+    >>> trim_string('SGHsdg.14!\n')
+    'SGHSDG'
+    """
+    st = st.upper()
+    exclude = set(string.punctuation + string.digits + '\n' + '’ ‘ÉÈÀ“ ”')
+    st = ''.join(ch for ch in st if ch not in exclude)
+    return st
+
+
+def trim_string_webster(st):
+    r"""Turns the dictionary text into a more manageable format: only uppercase
+    letters and spaces survive, so the all-caps entry words stand out.
+
+    >>> trim_string_webster('SG Hsdg.14!\n')
+    'SG H'
+    """
+    exclude = set(string.punctuation + string.digits + '\n' +
+                  string.ascii_lowercase)
+    st = ''.join(ch for ch in st if ch not in exclude)
+    return st
+
+
+def word_entries(word_list):
+    """Takes in a list of words and returns only the words that are longer
+    than two characters and written entirely in capital letters.
+
+    >>> word_entries(['adH', 'HAND', 'HAHA', 'UNiCoRN'])
+    ['HAND', 'HAHA']
+    """
+    good_list = []
+    for word in word_list:
+        if len(word) > 2:
+            if str.upper(word) == word:
+                good_list.append(word)
+    return good_list
+
+
+def pickle_a_list(name, filey):
+    """name: string giving the name of the pickle file (without extension)."""
+    with open(name + '.pickle', 'wb') as item:
+        pickle.dump(filey, item)
+
+
+def pickle_test():
+    """Making sure the functions work before wasting time processing an entire
+    dictionary.
+
+    It works!"""
+    pickle_a_list('test_list', ['GAAA', 'BE', 'bsvavv'])
+    with open('test_list.pickle', 'rb') as test_pickle:
+        reloaded_test_pickle = pickle.load(test_pickle)
+    return reloaded_test_pickle
+
+
+def english_language():
+    """Builds and pickles the list of dictionary entry words from Webster.txt."""
+    with open('Webster.txt') as websterfile:
+        webster = trim_string_webster(websterfile.read())
+    web_words = word(webster, 0, len(webster))
+    web_word_list = word_entries(web_words)
+    pickle_a_list('english_words_only', web_word_list)
+
+
+def any_text(name_of_file):
+    """Opens the text file and returns a histogram (dictionary) mapping each
+    character to the number of times it appears."""
+    with open(name_of_file) as of_file:
+        s = trim_string(of_file.read())
+    d = dict()
+    for c in s:
+        d[c] = d.get(c, 0) + 1
+    return d
+
+
+def word_choice(name_of_file):
+    """Compiles the letters of a text into a string ordered from most to
+    least common.
+    """
+    hist = any_text(name_of_file)
+    couples = []
+    end_string = ''
+    for item in hist:
+        couples.append((hist[item], item))
+    couples.sort(key=lambda tup: tup[0])  # sorts in place, least common first
+    coupless = couples[::-1]              # reversed: most common first
+    for pair in coupless:
+        end_string += pair[1]
+    return end_string
+
+
+def levenshtein(s1, s2):
+    """Computes the Levenshtein edit distance between s1 and s2, used here to
+    see how similar two word_choice strings are.
+
+    (taken from a reading journal)
+    """
+    if len(s1) < len(s2):
+        return levenshtein(s2, s1)
+
+    # len(s1) >= len(s2)
+    if len(s2) == 0:
+        return len(s1)
+
+    previous_row = range(len(s2) + 1)
+    for i, c1 in enumerate(s1):
+        current_row = [i + 1]
+        for j, c2 in enumerate(s2):
+            insertions = previous_row[j + 1] + 1
+            deletions = current_row[j] + 1
+            substitutions = previous_row[j] + (c1 != c2)
+            current_row.append(min(insertions, deletions, substitutions))
+        previous_row = current_row
+
+    return previous_row[-1]
+
+
+def histogram(wordstring):
+    """Returns a dictionary mapping each character in wordstring to its count."""
+    d = dict()
+    for char in wordstring:
+        d[char] = d.get(char, 0) + 1
+    return d
+
+
+def pickle_tester(file_name):
+    """Primarily for the dictionary, which is stored as a pickled list of
+    words. Returns a string of its characters ordered from most to least
+    common.
+    """
+    with open(file_name, 'rb') as english_file:
+        reloaded_english_words = pickle.load(english_file)
+
+    hist = histogram(''.join(reloaded_english_words))
+    couples = []
+    end_string = ''
+    for item in hist:
+        couples.append((hist[item], item))
+    couples.sort(key=lambda tup: tup[0])  # sorts in place, least common first
+    coupless = couples[::-1]              # reversed: most common first
+    for pair in coupless:
+        end_string += pair[1]
+    return end_string
+
+
+def norm(d1, d2):
+    """Takes in two frequency dictionaries and returns a scalar distance:
+    100 times the Euclidean norm of the difference of their normalized
+    frequencies. Smaller values mean the two are more similar. Only keys
+    present in d1 are compared."""
+    resultant_sum = 0
+    total1 = 0
+    total2 = 0
+    for key in d1:
+        total1 += d1[key]
+        total2 += d2.get(key, 0)
+    for key in d1:
+        resultant_sum += (d1[key]/total1 - d2.get(key, 0)/total2)**2
+    return 100*resultant_sum**.5
+
+
+def dict_resultant(filename):
+    """Compares the letter frequencies of a text file against those of the
+    pickled dictionary word list."""
+    with open('english_words_only.pickle', 'rb') as english_file:
+        reloaded_english_words = pickle.load(english_file)
+
+    hist = histogram(''.join(reloaded_english_words))
+    return norm(any_text(filename), hist)
+
+
+print(norm(any_text('Oliver_Twist'), any_text('David_Copperfield')))
+print(norm(any_text('Oliver_Twist'), any_text('Sherlock_Holmes')))
+
+print(dict_resultant('Oliver_Twist'))
+
+print(word_choice('Oliver_Twist'))
+print(word_choice('David_Copperfield'))
+print(word_choice('Sherlock_Holmes'))
+print(word_choice('bible.txt'))
+
+end_dict = 'EAISRTONCLPDMUHBGYFWVKJZXQ'
+end_Oliver = 'ETAOINHRSDLUMWCGFYPBVKXJQZ'
+end_David = 'ETAOINSHRDLMUWCYFGPBVKXJQZ'
+end_Sherlock = 'ETAOINHSRDLUMWCYFGPBVKXJQZ'
+end_Bible = 'ETHAONSIRDLUMFWCYGBPVKJZXQ'
+
+
+print('Dict vs Oliver: ' + str(levenshtein(end_dict, end_Oliver)) + '\n' +
+      str(dict_resultant('Oliver_Twist')))
+print('Oliver vs David: ' + str(levenshtein(end_Oliver, end_David)) + '\n' +
+      str(norm(any_text('Oliver_Twist'), any_text('David_Copperfield'))))
+print('David vs Dict: ' + str(levenshtein(end_David, end_dict)) + '\n' +
+      str(dict_resultant('David_Copperfield')))
+print('Dict vs Sherlock: ' + str(levenshtein(end_dict, end_Sherlock)) + '\n' +
+      str(dict_resultant('Sherlock_Holmes')))
+print('Oliver vs Sherlock: ' + str(levenshtein(end_Oliver, end_Sherlock)) +
+      '\n' + str(norm(any_text('Oliver_Twist'), any_text('Sherlock_Holmes'))))
+print('David vs Sherlock: ' + str(levenshtein(end_David, end_Sherlock)) + '\n' +
+      str(norm(any_text('Sherlock_Holmes'), any_text('David_Copperfield'))))
+print('Bible vs Sherlock: ' + str(levenshtein(end_Bible, end_Sherlock)) +
+      '\n' + str(norm(any_text('bible.txt'), any_text('Sherlock_Holmes'))))
+print('Dict vs Bible: ' + str(levenshtein(end_dict, end_Bible)) + '\n' +
+      str(dict_resultant('bible.txt')))
diff --git a/TextMiningReflection.pdf b/TextMiningReflection.pdf
new file mode 100644
index 0000000..66123ef
Binary files /dev/null and b/TextMiningReflection.pdf differ
diff --git a/dictionary_pickler.py b/dictionary_pickler.py
new file mode 100644
index 0000000..bb7fa2e
--- /dev/null
+++ b/dictionary_pickler.py
@@ -0,0 +1,42 @@
+"""
+This is a self-directed project with guidance from the Software Design
+teaching team.
+
+@author: Colvin Chapman
+"""
+
+import requests
+import pickle
+
+unabridged_webster = requests.get(
+    'http://www.gutenberg.org/cache/epub/29765/pg29765.txt').text
+
+'''I attempted to do some pickling, but in the interest of time, left this as
+   unfinished business:
+
+pickle.dump('Webster_Dictionary', unabridged_webster,)
+   '''
+
+
+def test_writing():
+    """Hey, it made a file!"""
+    with open('test.txt', 'w') as test:
+        test.write('This should appear in a text file somewhere!')
+
+
+def write_dictionary():
+    """Saves the downloaded Webster's dictionary text to Webster.txt."""
+    with open('Webster.txt', 'w') as cooldict:
+        cooldict.write(unabridged_webster)
+
+
+write_dictionary()
+
+print(len(unabridged_webster))
+# print(unabridged_webster[0: 3000])
+
+# The unabridged Webster text is 28,930,471 characters.
+# The English language has about 129,000 words.
diff --git a/txtSaver.py b/txtSaver.py
new file mode 100644
index 0000000..b0646d0
--- /dev/null
+++ b/txtSaver.py
@@ -0,0 +1,26 @@
+"""
+This is a self-directed project with guidance from the Software Design
+teaching team.
+
+@author: Colvin Chapman
+"""
+
+import requests
+
+
+def write_book(URL, name_file):
+    """Downloads the text at URL and saves it to a file called name_file."""
+    body = requests.get(URL).text
+    with open(name_file, 'w') as cooldict:
+        cooldict.write(body)
+
+
+write_book("http://www.gutenberg.org/cache/epub/730/pg730.txt", "Oliver_Twist")
+write_book("http://www.gutenberg.org/files/766/766-0.txt", "David_Copperfield")
+write_book("http://www.gutenberg.org/cache/epub/1661/pg1661.txt",
+           "Sherlock_Holmes")
+
+
+# The unabridged Webster text is 28,930,471 characters.
+# The English language has about 129,000 words.
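
For reference, a rough usage sketch (an editor's illustration, not part of the committed files) showing how the pieces above fit together. It assumes the Gutenberg texts written by txtSaver.py and the english_words_only.pickle built by english_language() already exist on disk, since importing TextMining runs its module-level print statements; the small histograms passed to norm() are made-up numbers used only to show the call shape.

    # Rough usage sketch: exercises the comparison functions on toy inputs.
    from TextMining import levenshtein, norm, word_choice

    # Distance between two made-up letter-frequency histograms (hypothetical
    # counts, chosen only to demonstrate the call signature).
    print(norm({'E': 12, 'T': 9, 'A': 8}, {'E': 11, 'T': 10, 'A': 8}))

    # Plain Levenshtein edit distance between two most-to-least-common letter
    # orderings; one swapped pair of letters costs two edits.
    print(levenshtein('ETAOIN', 'ETAION'))  # 2

    # Letter ordering of a downloaded book, most common letter first.
    print(word_choice('Oliver_Twist'))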