# indexer.py
import sys
import re
import json
from stop_words import get_stop_words
from bs4 import BeautifulSoup
from nltk.stem.snowball import SnowballStemmer
# global declarations for the index structures: document list, document
# lengths, cached titles/snippets, postings lists and vocabulary
docids = []
doclength = {}
cache = []
postings = {}
vocab = []
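# Illustrative shape of these structures after indexing one page
# (values are made up for the example):
#   docids    = ['http://www.example.com/']
#   doclength = {0: 42}
#   cache     = [['Example Title', 'First paragraph text ...']]
#   vocab     = ['index', 'search']
#   postings  = {0: [[0, 3, 1]], 1: [[0, 1, 0]]}
# where each posting is [docid, term frequency, keyword weight]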
# main() is used for offline testing only
def main():
    if len(sys.argv) != 2:
        print('usage: ./indexer.py file')
        sys.exit(1)
    filename = sys.argv[1]
    try:
        with open(filename, 'r') as input_file:  # with-block closes the file even on error
            page_contents = input_file.read()  # read the whole input file
    except IOError as ex:
        print('Cannot open', filename, '\nError:', ex)
        sys.exit(1)
    url = 'http://www.' + filename + '/'
    print(url, page_contents)
    make_index(url, page_contents)
# writes the index structures out to their JSON files:
# docids, vocab, postings, doclength and cache
def write_index():
    for name, data in (('docids.txt', docids), ('vocab.txt', vocab),
                       ('postings.txt', postings), ('doclength.txt', doclength),
                       ('cache.txt', cache)):
        with open(name, 'w') as outfile:
            json.dump(data, outfile)
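# Note: json.dump writes the integer word IDs that key the postings dict as
# JSON strings, e.g. {0: [[0, 2, 1]]} is stored as {"0": [[0, 2, 1]]}, so any
# code that reloads postings.txt must convert the keys back to int.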
# strips unwanted HTML and tokenizes the remaining text
def clean_html(page_contents):
    # remove markup that carries no content: scripts, headers, nav bars,
    # footers, styles, menu divs, comments, any remaining tags, tabs and
    # literal \uXXXX escapes
    remove = r"(<script(\s|\S)*?<\/script>)|(<header(\s|\S)*?<\/header>)|(<nav(\s|\S)*?<\/nav>)|" \
             r"(<footer(\s|\S)*?<\/footer>)|(<style(\s|\S)*?<\/style>)|" \
             r"(<div(.+)?id=\"(.+)?menu\">(\s|\S)*?<\/div>)|(<div(.+)?class=\"(.+)?menu\">(\s|\S)*?<\/div>)|" \
             r"(<!--(\s|\S)*?-->)|(<\/?(\s|\S)*?>)|\t|\b\\u....\b"
    regex = r"\w{3,}|\d"  # words of three or more characters, or single digits
    get_words = re.sub(remove, '\n', page_contents)  # replace unwanted markup with newlines
    get_words = get_words.lower()  # convert to lower case
    get_words = re.findall(regex, get_words)  # split into a list of tokens
    stop_words = set(get_stop_words('english'))  # build the stop-word set once
    get_words = [word for word in get_words if word not in stop_words]
    # stem each remaining term with the NLTK English Snowball stemmer
    stemmer = SnowballStemmer("english")
    return [stemmer.stem(word) for word in get_words]
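# Illustrative example of the whole pipeline:
#   clean_html('<p>The runners were running</p>')
#   -> ['runner', 'run']   (markup stripped, stop words removed, terms stemmed)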
# finds keywords in the page and gives them extra weight
def keywords(page_contents):
    soup = BeautifulSoup(page_contents, "html.parser")
    # collect the text of the <title> tag and of every <h1>-<h5> heading
    parts = []
    for tag in ('title', 'h1', 'h2', 'h3', 'h4', 'h5'):
        parts.append(' '.join(element.get_text() for element in soup.find_all(tag)))
    return ' '.join(parts).lower()  # one lower-cased string of keyword text
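# Illustrative example:
#   keywords('<title>My Site</title><h1>Search</h1>')
# returns 'my site search' (plus padding spaces for the empty heading levels)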
# gets the page title and a snippet of its text
def get_title(page_contents):
    soup = BeautifulSoup(page_contents, "html.parser")
    title = soup.title.string if soup.title else ''  # guard against pages with no <title>
    _p = [element.get_text() for element in soup.find_all('p')]  # gather all <p> text for the snippet
    p = ' '.join(_p)
    p = p[0:295] + " ..."  # keep the first 295 characters and append an ellipsis
    return [title, p]  # title and snippet as a two-element list
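# Illustrative example:
#   get_title('<title>My Site</title><p>Welcome to my site.</p>')
#   -> ['My Site', 'Welcome to my site. ...']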
def make_index(url, page_contents):
    # decode bytes to a string if necessary
    if isinstance(page_contents, bytes):
        page_contents = page_contents.decode('latin-1', 'ignore')
    print('===============================================')
    print('make_index: url =', url)
    print('===============================================')
    # extract the title/snippet, the body terms and the weighted keyword text
    title = get_title(page_contents)
    page_text = clean_html(page_contents)
    keyword_text = keywords(page_contents)
    # assign the document an ID and cache its title and snippet
    docids.append(url)
    cache.append(title)
    docid = docids.index(url)
    docwordcount = 0
    # loop through page_text; look up each term's vocab ID, adding new terms
    # to the vocabulary as they appear
    for word in page_text:
        docwordcount += 1
        if word not in vocab:
            vocab.append(word)
        wordid = vocab.index(word)
        # a term that also appears in the keyword text gets weight 1, else 0
        # (substring check against the title/heading text)
        weight = 1 if word in keyword_text else 0
        if wordid not in postings:
            # first posting for this term
            postings[wordid] = [[docid, 1, weight]]
        elif postings[wordid][-1][0] == docid:
            # the last posting belongs to this document, so bump its counts
            postings[wordid][-1][1] += 1
            postings[wordid][-1][2] += weight
        else:
            # first occurrence of the term in this document: start a new posting
            postings[wordid].append([docid, 1, weight])
    doclength[docid] = docwordcount
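# Illustrative example: after
#   make_index('http://www.example.com/', '<title>Pets</title><p>Cats and dogs. Dogs!</p>')
# postings holds one [docid, frequency, keyword-weight] triple per term per
# document; the entry for the stemmed term 'dog' would be [[0, 2, 0]], while
# 'pet' (which also appears in the <title> keyword text) gets [[0, 1, 1]].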
# Standard boilerplate to call the main() function to begin
# the program.
if __name__ == '__main__':
main()
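# Example usage for offline testing (assuming a saved page in the local file
# 'example.com'):
#   ./indexer.py example.com
# This indexes the file as http://www.example.com/ in memory; call
# write_index() separately to persist the results.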