SearchEngine_Implementation/_f_indexer.py at master · s-mrb/SearchEngine_Implementation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import csv
from nltk.stem import PorterStemmer
import time
import re
import winsound
from _helper import store_on_hashed_directory, title_year_artist_genera_dir


def forward_indexer(stopwords_file, data_set, output_file):
    start = time.time()
    # string that will hold entire posting of a batch of forward index , this is to avoid loops that will slow program
    out = ""
    f = None
    try:
        # read stopwords
        f = open(stopwords_file, 'r')

    except Exception as e:
        print(e)
        f.close()

    # rstrip() method returns a copy of the string with trailing characters removed
    stopwords = [line.rstrip() for line in f]

    # close stopwords file
    f.close()

    # sw_d = dict.fromkeys(stopwords)
    ps = PorterStemmer()

    # forward_batch is a dictionary, its key is word and element is a list of positions
    forward_batch = {}

    try:

        with open(data_set, encoding="utf8", errors='ignore') as csvFile:

            # creating a csv reader object
            read_csv = csv.reader(csvFile, delimiter=',')

            # parse row-wise
            for row in read_csv:
                # if row[0] != "8651":
                #     continue

                # concatenate title,year,artist,genre and lyric

                # row[5] = lyric
                tuple_a = row[1] + " " + row[2] + " " + row[3] + " " + row[4] + " " + row[5]
                # tyag_string = row[1] + "," + row[2] + "," + row[3] + "," + row[4]

                # store_on_hashed_directory(str(row[0]), tyag_string, title_year_artist_genera_dir,0)

                # wrd_loci is all the possible locations of word
                # on the basis of which precedence could be given to it on a query
                wrd_loci = row[1] + " " + row[2] + " " + row[3] + " " + row[4]

                # to check whether the word in any of wrd_loci we must convert wrd_loci into the form in which
                # our corpus is indexed that is we must convert them to lower case and stem them

                wrd_loci = wrd_loci.lower()

                # get only alphanumeric and replace other by space
                # put spaces instead of non-alphanumeric characters
                wrd_loci = re.sub(r'[^a-z0-9 ]', ' ', wrd_loci)

                # convert to list
                wrd_loci = wrd_loci.split()

                # eliminate the stopwords
                wrd_loci = [x for x in wrd_loci if x not in stopwords]

                wrd_loci = [ps.stem(word) for word in wrd_loci]

                # lowercase all
                # tuple_a is an entire tuple entire tuple of excel
                tuple_a = tuple_a.lower()

                # get only alphanumeric and replace other by space
                tokens = re.sub(r'[^a-z0-9 ]', ' ', tuple_a)  # put spaces instead of non-alphanumeric characters

                # convert tokens string into a list so that it gets easy to remove stop words
                tokens = tokens.split()

                tokens = [x for x in tokens if x not in stopwords]  # eliminate the stopwords
                # stemming tokens
                tokens = [ps.stem(word) for word in tokens]
                doc_size = len(tokens)
                print(row[0])
                # index storing index of word in tokenized  list
                for index, word in enumerate(tokens):

                    # whether word is located in title/artist_name/year/genera or not
                    # if it is in these columns then in how many of them it is present
                    # this location weight will be used for ranking purpose
                    location_weight = 0

                    for wrd_locus in wrd_loci:
                        if word == wrd_locus:
                            location_weight = location_weight + 1

                    # storing (|)+word in dictionary, this is to make inverted indexing easier,
                    # | will act as signal character for word while reading forward index file
                    if "(|)" + word in forward_batch:
                        forward_batch["(|)" + word].append(index)

                    else:
                        temp_list = [str(location_weight / 10), index]
                        forward_batch["(|)" + word] = temp_list

                out = out + row[0] + "," + str(doc_size) + "," + str(forward_batch) + "\n"
                forward_batch = {}
                # tokens = []
    except Exception as e:
        # print(tuple_a)
        freq = 2500
        duration = 1000

        winsound.Beep(freq, duration)

        print("doc:" + row[0] + " index: " + index )
        print(word)
        print("In Read:" + str(e))
        # no need to close as "with open" method automatically does this

    try:
        # print(out)

        with open(output_file, "w", encoding="utf8") as fileOut:

            fileOut.write(out)

            end = time.time()
            print(end - start)
    except Exception as e:
        freqc = 2500
        durationn = 1000

        winsound.Beep(freqc, durationn)
        print("In write:" + str(e))
        # no need to close as "with open" method automatically does this


freq = 2500
duration = 1000

winsound.Beep(freq, duration)