-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawler.py
More file actions
491 lines (387 loc) · 17.6 KB
/
crawler.py
File metadata and controls
491 lines (387 loc) · 17.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
# Copyright (C) 2011 by Peter Goodman
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# import urllib2
# import urlparse
# from BeautifulSoup import *
import urllib.request as urllib2
import urllib.parse as urlparse
from bs4 import BeautifulSoup, Tag
from collections import defaultdict
import re
from pagerank import page_rank
import sqlite3
def attr(elem, attr):
    """Return an html attribute from an html element.

    E.g. for <a href="...">, attr(elem, "href") returns the href value,
    or an empty string when the attribute (or the element itself) cannot
    provide it.
    """
    try:
        return elem[attr]
    except (KeyError, TypeError):
        # KeyError: the attribute is not present on the tag.
        # TypeError: elem is not subscriptable (e.g. a bare text node).
        # The original bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit, which could make the crawler impossible to stop.
        return ""
# Splits raw text into candidate words: any whitespace, or any character
# other than letters, digits, '-' and '_', acts as a separator.  (The \s
# alternation already covers \n, \r and \t; they are kept for clarity.)
WORD_SEPARATORS = re.compile(r'\s|\n|\r|\t|[^a-zA-Z0-9\-_]')
class crawler(object):
    """Represents 'Googlebot'. Populates a database by crawling and indexing
    a subset of the Internet.

    This crawler keeps track of font sizes and makes it simpler to manage word
    ids and document ids."""

    def __init__(self, db_conn, url_file):
        """Initialize the crawler with a connection to the database to populate
        and with the file containing the list of seed URLs to begin indexing."""
        self._url_queue = []
        # In-memory caches; some mirror the persistent tables created below.
        self._doc_id_cache = {}       # maps: url --> doc_id
        self._word_id_cache = {}      # maps: word --> word_id (stores lexicon as keys)
        self._doc_idx_cache = {}      # maps: doc_id --> doc_idx
        self._inv_idx_cache = {}      # maps: word_id --> doc_id(s)
        self._res_inv_idx_cache = {}  # maps: word --> url(s)
        # (from_doc_id, to_doc_id) pairs; fed to page_rank() after crawling.
        self.links = []
        self.pagerank = {}            # maps: doc_id --> page-rank score
        self._db_conn = db_conn
        # Database Initialization (Persistent storage)
        # NOTE(review): CREATE TABLE raises if the tables already exist —
        # presumably each run starts from a fresh database file; confirm.
        if db_conn is not None:
            crsr = db_conn.cursor()
            crsr.execute("CREATE TABLE lexicon (word_id INTEGER, word TEXT)")
            crsr.execute("CREATE TABLE page_content (doc_id INTEGER, word_id INTEGER)")
            crsr.execute("CREATE TABLE document (url TEXT, doc_id INTEGER, page_rank REAL)")
            crsr.execute("CREATE TABLE resolved_map (word TEXT, url TEXT, page_rank REAL)")
            # Save changes to db
            db_conn.commit()

        # functions to call when entering and exiting specific tags;
        # any tag without an explicit handler falls back to _visit_ignore
        # via the defaultdict factory.
        self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
        self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

        # add a link to our graph, and indexing info to the related page
        self._enter['a'] = self._visit_a

        # record the currently indexed document's title and increase
        # the font size
        def visit_title(*args, **kargs):
            self._visit_title(*args, **kargs)
            self._increase_font_factor(7)(*args, **kargs)

        # increase the font size when we enter these tags
        self._enter['b'] = self._increase_font_factor(2)
        self._enter['strong'] = self._increase_font_factor(2)
        self._enter['i'] = self._increase_font_factor(1)
        self._enter['em'] = self._increase_font_factor(1)
        self._enter['h1'] = self._increase_font_factor(7)
        self._enter['h2'] = self._increase_font_factor(6)
        self._enter['h3'] = self._increase_font_factor(5)
        self._enter['h4'] = self._increase_font_factor(4)
        self._enter['h5'] = self._increase_font_factor(3)
        self._enter['title'] = visit_title

        # decrease the font size when we exit these tags
        self._exit['b'] = self._increase_font_factor(-2)
        self._exit['strong'] = self._increase_font_factor(-2)
        self._exit['i'] = self._increase_font_factor(-1)
        self._exit['em'] = self._increase_font_factor(-1)
        self._exit['h1'] = self._increase_font_factor(-7)
        self._exit['h2'] = self._increase_font_factor(-6)
        self._exit['h3'] = self._increase_font_factor(-5)
        self._exit['h4'] = self._increase_font_factor(-4)
        self._exit['h5'] = self._increase_font_factor(-3)
        self._exit['title'] = self._increase_font_factor(-7)

        # never go in and parse these tags
        self._ignored_tags = set([
            'meta', 'script', 'link', 'meta', 'embed', 'iframe', 'frame',
            'noscript', 'object', 'svg', 'canvas', 'applet', 'frameset',
            'textarea', 'style', 'area', 'map', 'base', 'basefont', 'param',
        ])

        # set of words to ignore (stop words and single letters)
        self._ignored_words = set([
            '', 'the', 'of', 'at', 'on', 'in', 'is', 'it',
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
            'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
            'u', 'v', 'w', 'x', 'y', 'z', 'and', 'or',
        ])

        # TODO remove me in real version — counters for the mock id allocators
        self._mock_next_doc_id = 1
        self._mock_next_word_id = 1

        # keep track of some info about the page we are currently parsing
        self._curr_depth = 0
        self._curr_url = ""
        self._curr_doc_id = 0
        self._font_size = 0
        self._curr_words = None

        # get all urls into the queue; each entry is (url, depth)
        try:
            with open(url_file, 'r') as f:
                for line in f:
                    self._url_queue.append((self._fix_url(line.strip(), ""), 0))
        except IOError:
            # missing seed file -> start with an empty queue
            pass

    # TODO remove me in real version
    def _mock_insert_document(self, url):
        """A function that pretends to insert a url into a document db table
        and then returns that newly inserted document's id."""
        # NOTE: `url` is unused here; the id is a simple running counter.
        ret_id = self._mock_next_doc_id
        self._mock_next_doc_id += 1
        return ret_id

    # TODO remove me in real version
    def _mock_insert_word(self, word):
        """A function that pretends to insert a word into the lexicon db table
        and then returns that newly inserted word's id."""
        # Insert word to lexicon db table
        lex_pair = (self._mock_next_word_id, word)
        self._db_conn.cursor().execute("INSERT INTO lexicon VALUES (?, ?)", lex_pair)
        # Update current word_id
        ret_id = self._mock_next_word_id
        self._mock_next_word_id += 1
        return ret_id

    def word_id(self, word):
        """Get the word id of some specific word, allocating one on first use."""
        if word in self._word_id_cache:
            return self._word_id_cache[word]
        # TODO: 1) add the word to the lexicon, if that fails, then the
        #          word is in the lexicon
        #       2) query the lexicon for the id assigned to this word,
        #          store it in the word id cache, and return the id.
        word_id = self._mock_insert_word(word)
        self._word_id_cache[word] = word_id
        return word_id

    def document_id(self, url):
        """Get the document id for some url, allocating one on first use."""
        if url in self._doc_id_cache:
            return self._doc_id_cache[url]
        # TODO: just like word id cache, but for documents. if the document
        #       doesn't exist in the db then only insert the url and leave
        #       the rest to their defaults.
        doc_id = self._mock_insert_document(url)
        self._doc_id_cache[url] = doc_id
        return doc_id

    def _fix_url(self, curr_url, rel):
        """Given a url and either something relative to that url or another url,
        get a properly parsed url."""
        rel_l = rel.lower()
        # An absolute `rel` replaces the base url entirely.
        if rel_l.startswith("http://") or rel_l.startswith("https://"):
            curr_url, rel = rel, ""
        # strip any #fragment before joining
        curr_url = urlparse.urldefrag(curr_url)[0]
        parsed_url = urlparse.urlparse(curr_url)
        return urlparse.urljoin(parsed_url.geturl(), rel)

    def add_link(self, from_doc_id, to_doc_id):
        """Add a link into the database, or increase the number of links between
        two pages in the database."""
        # NOTE(review): duplicates are skipped, so link multiplicity is NOT
        # actually counted despite the docstring; membership test on a list
        # is O(n) per call.
        if (from_doc_id, to_doc_id) not in self.links:
            self.links.append((from_doc_id, to_doc_id))

    def _visit_title(self, elem):
        """Called when visiting the <title> tag."""
        title_text = self._text_of(elem).strip()
        # print("document title=" + repr(title_text))
        # TODO update document title for document id self._curr_doc_id

    def _visit_a(self, elem):
        """Called when visiting <a> tags: queue the target and record the link."""
        dest_url = self._fix_url(self._curr_url, attr(elem, "href"))
        # add the just found URL to the url queue at the current crawl depth
        self._url_queue.append((dest_url, self._curr_depth))
        # add a link entry into the database from the current document to the
        # other document
        self.add_link(self._curr_doc_id, self.document_id(dest_url))
        # TODO add title/alt/text to index for destination url

    def _add_words_to_document(self):
        """Persist the (doc_id, word_id) pairs collected for the current page."""
        # TODO: knowing self._curr_doc_id and the list of all words and their
        #       font sizes (in self._curr_words), add all the words into the
        #       database for this document
        # Add information (doc_idx) pertaining to current document indexed by doc_id
        self._doc_idx_cache[self._curr_doc_id] = self._curr_words
        # Add word_ids (doc_idx) to document_idx db table
        doc_idx = []
        for word_font_pair in self._curr_words:
            # word_font_pair is (word_id, font_size); only word_id is stored.
            doc_idx.append((self._curr_doc_id, word_font_pair[0]))
        self._db_conn.cursor().executemany("INSERT INTO page_content VALUES(?, ?)", doc_idx)
        # TODO: add Title and Description to doc_idx

    def _increase_font_factor(self, factor):
        """Return a tag-visit callback that adds `factor` to the current font size
        (negative factors decrease it on tag exit)."""
        def increase_it(elem):
            self._font_size += factor
        return increase_it

    def _visit_ignore(self, elem):
        """Ignore visiting this type of tag"""
        pass

    def _add_text(self, elem):
        """Add some text to the document. This records word ids and word font sizes
        into the self._curr_words list for later processing."""
        words = WORD_SEPARATORS.split(elem.string.lower())
        for word in words:
            word = word.strip()
            if word in self._ignored_words:
                continue
            self._curr_words.append((self.word_id(word), self._font_size))
            # Populate inverted index cache
            # NOTE(review): word_id() is called a second time here; the cache
            # makes the second call cheap, but one call would suffice.
            word_id = self.word_id(word)
            if word_id in self._inv_idx_cache:
                self._inv_idx_cache[word_id].add(self._curr_doc_id)
            else:
                self._inv_idx_cache[word_id] = {self._curr_doc_id}
            # Populate resolved inverted index cache
            if word in self._res_inv_idx_cache:
                self._res_inv_idx_cache[word].add(self._curr_url)
            else:
                self._res_inv_idx_cache[word] = {self._curr_url}

    def _text_of(self, elem):
        """Get the text inside some element without any tags."""
        if isinstance(elem, Tag):
            # recursively concatenate the text of all children
            text = []
            for sub_elem in elem:
                text.append(self._text_of(sub_elem))
            return " ".join(text)
        else:
            return elem.string

    def _index_document(self, soup):
        """Traverse the document in depth-first order and call functions when entering
        and leaving tags. When we come across some text, add it into the index. This
        handles ignoring tags that we have no business looking at."""

        # Sentinel that sits at the bottom of the tag stack.
        class DummyTag(object):
            next = False
            name = ''

        # Wrapper so the loop's `tag.next` step can jump to an arbitrary node.
        class NextTag(object):
            def __init__(self, obj):
                self.next = obj

        tag = soup.html
        stack = [DummyTag(), soup.html]
        while tag and tag.next:
            tag = tag.next
            # html tag
            if isinstance(tag, Tag):
                # Moving to a node whose parent is not the stack top means we
                # left the previous tag: fire its exit handler and pop it.
                if tag.parent != stack[-1]:
                    self._exit[stack[-1].name.lower()](stack[-1])
                    stack.pop()
                tag_name = tag.name.lower()
                # ignore this tag and everything in it
                if tag_name in self._ignored_tags:
                    if tag.nextSibling:
                        tag = NextTag(tag.nextSibling)
                    else:
                        self._exit[stack[-1].name.lower()](stack[-1])
                        stack.pop()
                        tag = NextTag(tag.parent.nextSibling)
                    continue
                # enter the tag
                self._enter[tag_name](tag)
                stack.append(tag)
            # text (text, cdata, comments, etc.)
            else:
                self._add_text(tag)

    def crawl(self, depth=2, timeout=3):  # was set depth = 2
        """Crawl the web!

        Pops urls off the queue, fetches and indexes each page up to `depth`
        levels from the seeds, then computes page ranks and flushes the
        document and resolved-map tables to the database."""
        seen = set()
        while len(self._url_queue):
            url, depth_ = self._url_queue.pop()
            # skip this url; it's too deep
            if depth_ > depth:
                continue
            doc_id = self.document_id(url)
            # we've already seen this document
            if doc_id in seen:
                continue
            seen.add(doc_id)  # mark this document as visited
            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read(), features='html.parser')
                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = []
                self._index_document(soup)  # Inside this function is where we call at some point add_link()
                self._add_words_to_document()  # updates dict() that maps: doc_id --> doc_idx
                # print (" url=" + repr(self._curr_url))
            except Exception as e:
                # best-effort crawl: log and move on to the next url
                print(e)
                pass
            finally:
                if socket:
                    socket.close()
        # Calculate Page_Ranks
        if len(self.links) > 0:
            self.pagerank = page_rank(self.links)
        # Populate DOCUMENTS
        new_data = []
        for url in self._doc_id_cache:
            doc_id = self._doc_id_cache[url]
            if doc_id in self.pagerank:
                # int() cast: sqlite was otherwise storing some ids as binary
                new_data.append((url, int(doc_id), self.pagerank[doc_id]))
            else:
                new_data.append((url, doc_id, 0.0))
        self._db_conn.cursor().executemany("INSERT INTO document VALUES (?, ?, ?)", new_data)
        # STORE Resolved Inverted Index
        resolved_map = []
        for word in self._res_inv_idx_cache:
            for url in self._res_inv_idx_cache[word]:
                doc_id = self._doc_id_cache[url]
                # Insert URL with page_rank score
                if doc_id in self.pagerank:
                    resolved_map.append((word, url, self.pagerank[doc_id]))
                # NOTE. there are some urls that don't have any links to them.
                # These will be given a default score of "0" because they aren't sent to page_rank()
                else:
                    resolved_map.append((word, url, 0.0))
        self._db_conn.cursor().executemany("INSERT INTO resolved_map VALUES (?, ?, ?)", resolved_map)
        # Save changes to database
        self._db_conn.commit()

    # Returns a dict() that maps: word_id --> doc_id(s)
    def get_inverted_index(self):
        """Return all doc_id(s) pertaining to any word_id"""
        return self._inv_idx_cache

    # Returns a dict() that maps: word --> url(s)
    def get_resolved_inverted_index(self):
        """Return all urls matching a specific word"""
        return self._res_inv_idx_cache
if __name__ == "__main__":
    # Demo driver: crawl the seed urls and populate dbFile.db.  The guard was
    # previously commented out, so merely importing this module kicked off a
    # full crawl; restoring it limits the side effects to script execution.
    import pprint

    # Initialize database (crawler.__init__ creates the tables).
    db_conn = sqlite3.connect("dbFile.db")
    try:
        # Populate the database by crawling one level deep from the seeds.
        bot = crawler(db_conn, "urls/urls.txt")
        bot.crawl(depth=1)

        # Fetch each table's rows; uncomment pprint to inspect the results.
        # (Table names come from a fixed tuple, so string formatting here is
        # not an injection risk.)
        for table_name in ("lexicon", "page_content", "document", "resolved_map"):
            data = list(db_conn.cursor().execute("SELECT * FROM %s" % table_name))
            # pprint.pprint(data)

        # NOTE. some doc_ids were stored as binary in the document table until
        # they were explicitly cast to int() before insertion (see crawl()).
    finally:
        # Close the connection even if the crawl raises.
        db_conn.close()
    # print('\nLinks\n')
    # print(bot.links)