-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnormalize_text.py
More file actions
93 lines (82 loc) · 2.92 KB
/
normalize_text.py
File metadata and controls
93 lines (82 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
import sys
from collections import Counter

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
def do_lowercasing(file):
    """Return the whitespace-separated tokens of *file*, each lowercased.

    Despite its name, *file* is the text content (a string), not a file
    object. Tokens keep their original order and any attached punctuation.
    """
    return [token.lower() for token in file.split()]
def clean_text(file):
    """Strip punctuation from every whitespace-separated token of *file*.

    Despite its name, *file* is the text content (a string), not a file
    object. Every character that is neither a word character (``\\w``:
    letters, digits, underscore) nor whitespace is removed from each token.

    Returns:
        The cleaned tokens in their original order. Tokens that consisted
        solely of punctuation are omitted.
    """
    punctuation = re.compile(r'[^\w\s]')
    stripped = (punctuation.sub('', token) for token in file.split())
    # A token such as "--" or "..." collapses to "", which is not a word;
    # drop it so it never shows up as an empty entry in the word counts.
    return [token for token in stripped if token]
def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into a WordNet POS constant.

    Only the leading letter of *tag* is inspected: ``J`` maps to adjective,
    ``V`` to verb and ``R`` to adverb. Every other tag, including those
    starting with ``N``, maps to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Nouns and any unrecognised tag share the same fallback.
    return wordnet.NOUN
def lemmatize_passage(text):
    """Tokenize *text* and return the lemma of each token, in order.

    Each token is POS-tagged first, and the tag (converted with
    ``get_wordnet_pos``) is passed to the WordNet lemmatizer alongside
    the token.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas
def do_stemming(text):
    """Tokenize *text* and return the Porter stem of each token, in order."""
    tokens = word_tokenize(text)
    stem = PorterStemmer().stem
    return list(map(stem, tokens))
def remove_stopwords(text):
    """Return the whitespace-separated tokens of *text* minus English stopwords.

    Tokens are compared case-insensitively against NLTK's English stopword
    list, so "The" is removed just like "the". Surviving tokens keep their
    original casing and order.
    """
    stop_words = set(stopwords.words('english'))
    # The stopword list is lowercase; lowercase only for the membership test
    # so capitalised stopwords (e.g. at sentence starts) are filtered too.
    return [word for word in text.split() if word.lower() not in stop_words]
def count_words(words):
    """Count how many times each item occurs in *words*.

    Args:
        words: An iterable of hashable items (normally word strings).

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences, keyed in order of first appearance.
    """
    # Counter does the tallying; convert back so callers still get a dict.
    return dict(Counter(words))
def main(argv=None):
    """Count word frequencies in a text file after one preprocessing step.

    The text file path and the preprocessing method are taken from the
    argument vector. The chosen step is applied to the file's contents and
    the resulting word counts are written to ``output.txt`` in the current
    directory: a header line followed by one ``<rank> <word> <count>`` line
    per word, most frequent first.

    Supported methods: ``lowercasing``, ``lemmatization``, ``stemming``,
    ``removal_of_stopwords`` and ``text_cleaning``.

    Args:
        argv: Argument vector laid out like ``sys.argv`` (program name,
            text file, preprocessing method). Defaults to ``sys.argv``.

    Raises:
        SystemExit: With a usage message when fewer than two arguments
            follow the program name.

    A missing input file, an unknown method, or any other failure while
    processing is reported on stdout instead of being raised.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        program = argv[0] if argv else "normalize_text.py"
        # A string argument makes sys.exit print it to stderr and exit
        # with a non-zero status.
        sys.exit(f"Usage: {program} <text_file> <preprocessing_method>")
    text_file, preprocessing_method = argv[1], argv[2]

    try:
        # Read everything up front so the input file is closed before the
        # (potentially slow) preprocessing starts.
        with open(text_file, "r") as source:
            text = source.read()

        # Looked up only after the file was read, so a missing file is
        # still reported ahead of an unknown method.
        preprocessors = {
            "lowercasing": do_lowercasing,
            "lemmatization": lemmatize_passage,
            "stemming": do_stemming,
            "removal_of_stopwords": remove_stopwords,
            "text_cleaning": clean_text,
        }
        preprocess = preprocessors.get(preprocessing_method)
        if preprocess is None:
            raise ValueError("Preprocessing method not valid")

        counted_words = count_words(preprocess(text))
        # sorted() is stable, so words with equal counts keep the order in
        # which they were first seen.
        ranked = sorted(
            counted_words.items(), key=lambda item: item[1], reverse=True
        )

        with open("output.txt", "w") as output:
            output.write(f"Word counts using {preprocessing_method}\n")
            output.writelines(
                f"{rank} {word} {count}\n"
                for rank, (word, count) in enumerate(ranked, start=1)
            )
    except FileNotFoundError:
        print(f"Error: File '{text_file}' not found.")
    except Exception as e:
        # Top-level boundary of the script: report the failure to the user
        # rather than dumping a traceback.
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()