preprocessing-methods/normalize_text.py at main · hemalr24/preprocessing-methods · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import sys
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet, stopwords
from nltk import word_tokenize, pos_tag

def do_lowercasing(file):
    """Split text on whitespace and lowercase every resulting token.

    Args:
        file: The text to process, as a single string (the contents of a
            file, not a file object).

    Returns:
        A list of the lowercased tokens, in their original order.
    """
    return [token.lower() for token in file.split()]

def clean_text(file):
    """Strip punctuation from each whitespace-separated token of a text.

    Args:
        file: The text to clean, as a single string (the contents of a
            file, not a file object).

    Returns:
        A list of tokens, in their original order, with every character
        that is neither a word character (letter, digit, underscore) nor
        whitespace removed. Tokens made up entirely of such characters
        (for example "--" or "...") are dropped instead of being returned
        as empty strings, so they are not counted as words downstream.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in file.split())
    # Skip tokens that became empty once their punctuation was removed.
    return [token for token in stripped_tokens if token]

def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into a WordNet POS constant.

    The decision is made on the tag's leading letter: 'J' maps to
    adjective, 'V' to verb, 'N' to noun and 'R' to adverb. Any other tag
    falls back to noun.

    Args:
        tag: The part-of-speech tag string (presumably a Penn Treebank tag
            as produced by ``pos_tag`` -- confirm against the caller).

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def lemmatize_passage(text):
    """Tokenize a passage and lemmatize each token using its POS tag.

    Args:
        text: The passage to process, as a single string.

    Returns:
        A list with one lemma per token, in token order. Each token is
        lemmatized with the WordNet part of speech that
        ``get_wordnet_pos`` derives from the token's tag.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas

def do_stemming(text):
    """Tokenize a passage and reduce every token to its Porter stem.

    Args:
        text: The passage to process, as a single string.

    Returns:
        A list with one stemmed token per input token, in token order.
    """
    tokens = word_tokenize(text)
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

def remove_stopwords(text):
    """Split text on whitespace and drop English stop words.

    Args:
        text: The text to filter, as a single string.

    Returns:
        A list of the remaining tokens, in their original order and with
        their original casing.

    Note:
        Matching is case-insensitive: each token is lowercased before it
        is looked up, because the NLTK English stop-word list contains
        lowercase entries only and would otherwise miss capitalised stop
        words such as "The" at the start of a sentence. Tokens are not
        stripped of punctuation, so a token like "the," is still kept.
    """
    stop_words = set(stopwords.words('english'))
    return [token for token in text.split() if token.lower() not in stop_words]

def count_words(words):
    """Tally how many times each item occurs in an iterable.

    Args:
        words: An iterable of hashable items (typically word strings).

    Returns:
        A dict mapping each distinct item to its number of occurrences,
        with keys in order of first appearance.
    """
    tally = {}
    for token in words:
        tally[token] = tally.get(token, 0) + 1
    return tally

def main():
    """Run one preprocessing method on a text file and write word counts.

    Command line: ``<text_file> <preprocessing_method>``, where the method
    is one of ``lowercasing``, ``lemmatization``, ``stemming``,
    ``removal_of_stopwords`` or ``text_cleaning``.

    The tokens produced by the chosen method are counted and written to
    ``output.txt`` (relative to the current working directory) as a header
    line followed by one ``<rank> <word> <count>`` line per distinct word,
    ordered by descending count.

    Missing command-line arguments terminate the program with a usage
    message. A missing input file, an unknown method name, or any other
    error during processing is reported with a printed message rather
    than a traceback.
    """
    if len(sys.argv) < 3:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("Usage: python normalize_text.py <text_file> <preprocessing_method>")

    text_file = sys.argv[1]
    preprocessing_method = sys.argv[2]

    # Maps each accepted method name to the function implementing it.
    preprocessors = {
        "lowercasing": do_lowercasing,
        "lemmatization": lemmatize_passage,
        "stemming": do_stemming,
        "removal_of_stopwords": remove_stopwords,
        "text_cleaning": clean_text,
    }

    try:
        with open(text_file, "r") as myfile:
            preprocess = preprocessors.get(preprocessing_method)
            if preprocess is None:
                raise ValueError("Preprocessing method not valid")
            words = preprocess(myfile.read())

        counted_words = count_words(words)
        # Stable sort: words with equal counts keep first-appearance order.
        sorted_counts = sorted(counted_words.items(), key=lambda item: item[1], reverse=True)

        # The context manager closes the file even if a write fails.
        with open("output.txt", "w") as output:
            output.write(f"Word counts using {preprocessing_method}\n")
            for index, (word, count) in enumerate(sorted_counts, start=1):
                output.write(f"{index} {word} {count}\n")

    except FileNotFoundError:
        print(f"Error: File '{text_file}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()