# TESTING correctness of the crawler
# Place the URL .txt files in the same directory as this file and run
# test_crawler.py from a terminal.
# Some test cases are provided (the file names are listed later in the code).
# This tester gives an indication of possible errors and checks the correctness
# of the crawler by verifying that its persistent data structures were built
# correctly.
from crawler import crawler
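
# Assumed shapes of the crawler's data structures, inferred from how they are
# used in the checks below (the names come from the crawler module; the
# example values are hypothetical):
#   obj._word_id_cache:   {"news": 1, ...}                  word   -> word_id
#   obj._doc_id_cache:    {"http://example.com/": 7, ...}   url    -> doc_id
#   obj._doc_idx_cache:   {7: ["news", ...], ...}           doc_id -> words on page
#   obj.get_inverted_index():          {1: {7, ...}, ...}   word_id -> doc_ids
#   obj.get_resolved_inverted_index(): {"news": {"http://example.com/", ...}}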

# Check that every word has at least one doc_id associated with it [crawler]
# Note that by construction, every word has a word_id
def verify_lexicon(obj):
    """Verify that every word actually comes from a document."""
    res_inv_idx = obj.get_resolved_inverted_index()
    # Ensure each word has at least one URL attached to it
    for word in obj._word_id_cache:
        # print(word, ":", res_inv_idx[word])
        if word not in res_inv_idx or len(res_inv_idx[word]) == 0:
            return False
    return True
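
# Illustrative example (hypothetical values, not produced by a real crawl):
# with _word_id_cache == {"news": 1} and a resolved inverted index of
# {"news": {"http://example.com/"}}, verify_lexicon returns True; if the
# URL set for "news" were empty, it would return False.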

# Check that every URL in the text file has an index characterizing it
def verify_doc_idx(obj, url_file):
    """Verify that every processed URL is in the crawler's database."""
    non_empty_urls = True
    seen = set()
    # Read through all URLs provided in the .txt file
    with open(url_file, "r") as txt_file:
        for url in txt_file:
            # If this is a duplicate URL, no need to check its index again
            if url in seen:
                continue
            seen.add(url)
            fixed = obj._fix_url(url.strip(), "")
            print(fixed, "has", end=" ")
            if obj._doc_id_cache[fixed] in obj._doc_idx_cache:
                # Valid URL
                print(len(obj._doc_idx_cache[obj._doc_id_cache[fixed]]), "words on its page")
            else:
                # URL has no doc_idx associated with it, which means the URL
                # is invalid or was not parsed
                print("nothing on its page")
                non_empty_urls = False
    return non_empty_urls
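
# Example console output (hypothetical URLs and counts): a parsed page prints
#     http://example.com/ has 42 words on its page
# while an invalid or unparsed URL prints
#     http://bad.example/ has nothing on its page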

# Check that inverted_index == resolved_inverted_index
def verify_inverted_index(obj):
    """Verify that inverted_index and resolved_inverted_index agree."""
    inv_idx = obj.get_inverted_index()
    res_idx = obj.get_resolved_inverted_index()
    for keyword in res_idx:
        # Check that every word in resolved_inverted_index has its word_id
        # in inverted_index
        if obj._word_id_cache[keyword] not in inv_idx:
            print("There is a word without a word_id!")
            return False
        for url in res_idx[keyword]:
            # Check that every URL in resolved_inverted_index has its doc_id
            # in inverted_index
            if obj._doc_id_cache[url] not in inv_idx[obj._word_id_cache[keyword]]:
                print("There is a URL without a doc_id OR mapped to the wrong keyword!")
                return False
    return True
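
# Illustrative correspondence (hypothetical values): if
# _word_id_cache == {"news": 1} and _doc_id_cache == {"http://example.com/": 7},
# then res_idx == {"news": {"http://example.com/"}} must be mirrored by
# inv_idx == {1: {7}} for this check to pass.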

# TESTING
# Provided URL test cases (a sketch for looping over all four follows the
# checks below):
#   urls.txt
#   empty_urls.txt
#   duplicate_urls.txt
#   invalid_urls.txt
# Set the file containing the URLs - CHANGE ME!
url_files = "urls/urls.txt"

# Crawl the provided URLs
bot = crawler(None, url_files)
bot.crawl(depth=0)

# Run the test cases!
print("\nTESTING the CRAWLER for", url_files, "\n")

# Check that our lexicon is valid
if not verify_lexicon(bot):
    print("\nThere is a word not associated with any document!\n")
else:
    print("\nAll words in the lexicon are associated with at least one document!\n")

# NOTE: invalid URLs must be empty
# NOTE: if a URL has no doc_idx, then it cannot be found, since no words are
# associated with it
if not verify_doc_idx(bot, url_files):
    print("\nThere is a URL that is empty!\n")
else:
    print("\nAll URLs have words associated with them!\n")

# Check that our inverted_index and resolved_inverted_index match
if not verify_inverted_index(bot):
    print("\ninverted_index and resolved_inverted_index don't match!\n")
else:
    print("\ninverted_index and resolved_inverted_index are the same!\n")

# OBSERVATIONS:
# - http and https are considered different URLs despite mapping to the same website
# - In _doc_id_cache there are URLs that were never parsed into a doc_idx
#   because of the restriction on depth