text_analysis.py — 74 lines (60 loc) · 2.12 KB
Forked from sd17spring/TextMining (fork count: 0).
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
This file does the word frequency analysis on any text that is passed
through the program
"""
import string
import doctest
from heapq import nlargest
from wordcloud import WordCloud
def count_words(text, stop_words=None):
    """
    Count the frequency of words in a text.

    Each whitespace-separated token is normalized before counting: digits
    are removed, surrounding punctuation/whitespace is stripped, and the
    word is lowercased.  Words that normalize to the empty string or that
    appear in stop_words are skipped.

    input: text is the string of text
           stop_words is an optional iterable of extraneous words to
           exclude from the count (defaults to no stop words)
    returns: a dictionary mapping each word to its count

    >>> count_words('Java Java Java a', ['a', 'an'])
    {'java': 3}
    >>> count_words('Hello, hello!')
    {'hello': 2}
    """
    # Normalize the default: the original code crashed with a TypeError on
    # `word in stop_words` when stop_words was omitted (None).
    if stop_words is None:
        stop_words = ()
    # Use a set for O(1) membership tests instead of O(n) list scans.
    stop_words = set(stop_words)
    word_count = {}
    for token in text.split():
        # remove digits from the word
        word = ''.join(c for c in token if not c.isdigit())
        # remove punctuation and whitespace, then lowercase
        word = word.strip(string.punctuation + string.whitespace).lower()
        # skip empty results and stop words
        if not word or word in stop_words:
            continue
        word_count[word] = word_count.get(word, 0) + 1
    return word_count
def top50_cloud(dictionary, title, header):
    """
    Print a table of the 50 most frequent words and show a word cloud.

    input: dictionary is the word frequency dict
           title is the name of the section
           header are the column titles
    """
    # Section heading with an underline matching the title's length.
    print('\n' + title)
    print('-' * len(title))
    print(header)
    # Collect each top word repeated by its count so the cloud weights
    # words by frequency; hyphens are stripped so compound words render
    # as a single token.
    fragments = []
    for word in nlargest(50, dictionary, key=dictionary.get):
        print(word, dictionary[word], sep='\t')
        fragments.append((word.replace('-', '') + ' ') * dictionary[word])
    corpus = ''.join(fragments)
    # Render the weighted corpus as an image and display it.
    cloud = WordCloud().generate(corpus)
    cloud.to_image().show()
if __name__ == '__main__':
    # Run this module's doctests (e.g. the count_words example) with
    # verbose reporting when the file is executed as a script.
    doctest.testmod(verbose=True)