-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path_f_indexer.py
More file actions
151 lines (107 loc) · 5.11 KB
/
_f_indexer.py
File metadata and controls
151 lines (107 loc) · 5.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import csv
from nltk.stem import PorterStemmer
import time
import re
import winsound
from _helper import store_on_hashed_directory, title_year_artist_genera_dir
def _tokenize(text, stopwords, stemmer):
    """Return the stemmed, stopword-free tokens of *text*.

    The text is lower-cased, every character outside ``[a-z0-9 ]`` is replaced
    by a space, the result is split on whitespace, stopwords are dropped and
    the surviving words are stemmed with *stemmer*.
    """
    cleaned = re.sub(r'[^a-z0-9 ]', ' ', text.lower())
    return [stemmer.stem(word) for word in cleaned.split() if word not in stopwords]


def forward_indexer(stopwords_file, data_set, output_file):
    """Build a forward index for a CSV data set and write it to a file.

    Each CSV row is treated as one document: column 0 is the document id,
    columns 1-4 are the title/year/artist/genre fields and column 5 is the
    lyric. One line is written per document, in the form::

        <doc id>,<token count>,<posting dict>

    where the posting dict maps ``"(|)" + stemmed_word`` to a list whose first
    element is the location weight (as a string) followed by every position of
    the word in the document's token list. The ``(|)`` prefix is a signal
    marker for whoever reads the forward index back.

    Args:
        stopwords_file: Path of a text file holding one stopword per line.
        data_set: Path of the CSV file to index (read as UTF-8, undecodable
            bytes ignored).
        output_file: Path the forward index is written to (UTF-8, overwritten).

    Raises:
        OSError: If the stopwords file cannot be opened or read. Errors while
            reading the data set or writing the output are reported on stdout
            (with a beep) instead of being raised; whatever was indexed before
            a read error is still written.
    """
    start = time.time()
    beep_freq = 2500
    beep_duration = 1000

    try:
        # A set gives O(1) membership tests; the context manager guarantees
        # the file is closed even if reading fails.
        with open(stopwords_file, 'r') as sw_file:
            # rstrip() drops the trailing newline/whitespace of each line
            stopwords = {line.rstrip() for line in sw_file}
    except OSError as e:
        # Indexing without stopwords would silently change the index, so the
        # failure is reported and propagated.
        print(e)
        raise

    ps = PorterStemmer()
    # One formatted line per document; joined once at write time instead of
    # growing a single string with repeated concatenation.
    out_lines = []
    # Tracked outside the loop so the error handler can always report them.
    doc_id = index = word = None
    try:
        # newline='' lets the csv module handle newlines embedded in quoted
        # fields itself, as its documentation requires.
        with open(data_set, encoding="utf8", errors='ignore', newline='') as csvFile:
            # parse row-wise
            for row in csv.reader(csvFile, delimiter=','):
                doc_id = row[0]
                # Tokens of title, year, artist and genre: a word found here
                # gets precedence on a query through its location weight.
                meta_text = row[1] + " " + row[2] + " " + row[3] + " " + row[4]
                meta_tokens = _tokenize(meta_text, stopwords, ps)
                # The document's tokens are the metadata tokens followed by the
                # lyric tokens; this equals tokenizing the space-joined columns
                # but avoids cleaning and stemming the metadata twice.
                tokens = meta_tokens + _tokenize(row[5], stopwords, ps)
                print(doc_id)
                # forward_batch maps marked word -> [location weight, positions...]
                forward_batch = {}
                for index, word in enumerate(tokens):
                    key = "(|)" + word
                    if key in forward_batch:
                        forward_batch[key].append(index)
                    else:
                        # Number of occurrences of the word in the metadata
                        # columns; only needed the first time a word is seen.
                        location_weight = meta_tokens.count(word)
                        forward_batch[key] = [str(location_weight / 10), index]
                out_lines.append(
                    doc_id + "," + str(len(tokens)) + "," + str(forward_batch) + "\n"
                )
    except Exception as e:
        winsound.Beep(beep_freq, beep_duration)
        # str() keeps the report itself from failing on the integer index or
        # on values that were never assigned.
        print("doc:" + str(doc_id) + " index: " + str(index))
        print(word)
        print("In Read:" + str(e))

    try:
        with open(output_file, "w", encoding="utf8") as fileOut:
            fileOut.writelines(out_lines)
        end = time.time()
        print(end - start)
    except Exception as e:
        winsound.Beep(beep_freq, beep_duration)
        print("In write:" + str(e))

    # Audible signal that the run has finished.
    winsound.Beep(beep_freq, beep_duration)