-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
166 lines (125 loc) · 4.93 KB
/
main.py
File metadata and controls
166 lines (125 loc) · 4.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import os
import re
import string
from collections import Counter
from dataclasses import dataclass
from typing import Dict, List, Tuple
# Default directory scanned for the `.txt` test documents loaded by `load_data`.
TEST_DATA_PATH = r"./test_docs/"
@dataclass
class Word:
    """
    This is a dataclass which encapsulates the data associated with a word from the test data.
    """

    # The word itself (lower-cased by the counting code).
    word: str
    # Total number of times the word occurs across all loaded documents.
    count: int
    # Names of the documents the word appears in.
    documents: List[str]
    # Every sentence, across all documents, that the word appears in.
    sentences: List[str]
def get_file_paths(directory: str) -> List[str]:
    """
    Recursively collect the absolute paths of all ``.txt`` files under a directory.

    :param directory: root directory to walk.
    :return: absolute paths of every file whose name ends with ``.txt``.
    """
    files: List[str] = []
    for dirpath, _, file_names in os.walk(directory):
        for f in file_names:
            # endswith(".txt") instead of `".txt" in f`: the substring test
            # wrongly matched names such as "notes.txt.bak".
            if f.endswith(".txt"):
                files.append(os.path.abspath(os.path.join(dirpath, f)))
    return files
def load_data(path=TEST_DATA_PATH) -> List[Dict[str, str]]:
    """
    Read every ``.txt`` file found under *path* and collect its contents.

    :param path: directory to search; defaults to the bundled test-data folder.
    :return: one dict per file, mapping the file's base name (extension
        stripped) to the full text of that file.
    """
    documents: List[Dict[str, str]] = []
    for file_path in get_file_paths(path):
        # Key each document by its file name without directory or extension.
        name = os.path.splitext(os.path.basename(file_path))[0]
        with open(file=file_path, mode="r", encoding="utf-8") as handle:
            documents.append({name: handle.read()})
    return documents
def get_word_count(data: List[Dict[str, str]]) -> List[Tuple[str, int]]:
    """
    Count how often each word occurs across a list of dicts containing data.

    Text is lower-cased and stripped of punctuation before counting.

    :param data: list of ``{document_name: document_text}`` dicts.
    :return: ``(word, count)`` pairs sorted by count, most frequent first.
    """
    counts: Counter = Counter()
    for document in data:
        for text in document.values():
            # Lower-case and drop punctuation, then extract word characters.
            # re.findall(r"\w+") never yields empty strings, unlike the
            # re.split(r"\W+") approach, which produced "" entries for
            # leading/trailing non-word characters and counted them as words.
            cleaned = text.translate(str.maketrans("", "", string.punctuation)).lower()
            counts.update(re.findall(r"\w+", cleaned))
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)
def get_sentences_containing_word(word: str, data: List[Dict[str, str]]) -> List[str]:
    """
    Find all of the sentences a word appears in across a given list of dicts of data.

    Matching is case-insensitive and on whole words only, so e.g. "time"
    does not match inside "sometimes".

    :param word: the word to search for.
    :param data: list of ``{document_name: document_text}`` dicts.
    :return: every matching sentence (terminated by a ``.``) in document order.
    """
    # re.escape guards against regex metacharacters in `word` (the original
    # interpolated it raw), and \b anchors the match to whole words.
    # Compile once instead of re-parsing the pattern per document.
    pattern = re.compile(r"([^.]*?\b{}\b[^.]*\.)".format(re.escape(word.lower())))
    sentences: List[str] = []
    for document in data:
        for text in document.values():
            # extend() flattens directly, avoiding the nested-list structure
            # the original had to flatten with a comprehension afterwards.
            sentences.extend(pattern.findall(text.lower()))
    return sentences
def get_document_names_containing_word(
    word: str, data: List[Dict[str, str]]
) -> List[str]:
    """
    Get the names of all the documents which contain a given word.

    :param word: the word to look for (exact token match after lower-casing
        and punctuation removal of the document text).
    :param data: list of ``{document_name: document_text}`` dicts.
    :return: document names in encounter order, without duplicates.
    """
    docs: List[str] = []
    # Set for O(1) duplicate checks; a `k in docs` list scan would be O(n).
    seen: set = set()
    for document in data:
        for name, text in document.items():
            # Skip documents already recorded — no need to re-scan their text.
            if name in seen:
                continue
            # Split based on words only, remove the punctuation, lower-case.
            words = re.split(
                r"\W+", text.translate(str.maketrans("", "", string.punctuation)).lower()
            )
            # Set membership replaces the original O(words) inner loop.
            if word in set(words):
                seen.add(name)
                docs.append(name)
    return docs
if __name__ == "__main__":
    # The words which will be made into `Word` objects. A set literal keeps
    # the membership test below O(1).
    chosen_words = {
        "audacity",
        "homegrown",
        "lobbyists",
        "generation",
        "humility",
        "freedom",
        "party",
        "time",
        "progress",
        "corruption",
        "promise",
        "iraq",
        "recommendation",
    }
    # Load the data into our project.
    corpus = load_data()
    # Build a `Word` object for every counted word that is in our chosen set,
    # attaching the documents and sentences it appears in.
    final_data: List[Word] = [
        Word(
            token,
            occurrences,
            get_document_names_containing_word(token, corpus),
            get_sentences_containing_word(token, corpus),
        )
        for token, occurrences in get_word_count(corpus)
        if token in chosen_words
    ]
    # Print out the final `Word` objects we created from the test data.
    for entry in final_data:
        print(entry)