SimpleWebCrawler/readwg.py at master · ConorDrew/SimpleWebCrawler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#! /usr/bin/python

import sys;
import url_errors

# Module-wide "stop reading" flag.  scroll_to_next_webpage() sets it to True
# when it reads the DONE_string marker line or reaches the end of the input.
DONE = False
# Marker line that signals the end of a webcrawl dump.
DONE_string = "[-- DONE --]"

'''
Assumes that we are at the current line that starts with '*'.  The format
follows this example:

--------------------------

*  1:2009-09-09:12:56:19 http://physics.nist.gov
e796febb7593fde042a7511a9761da8ce6d84299
http://physics.nist.gov/Divisions/Div842/div842.html
http://physics.nist.gov/MajResProj/Nanotech/nanotech.html

*  2:2009-09-09:12:56:20 http://physics.nist.gov/Divisions/Div842/div842.html
68e24912e780a2076f29b76b64066003a0f05fd0
http://physics.nist.gov/Divisions/Div842/Gp4/group4.html
http://physics.nist.gov/Divisions/Div842/Gp5/index.html
http://www.doc.gov

--------------------------


Processing adds the current page to visited_links (along with its hash code)
and adds its outgoing links to frontier.

current line:   string
frontier:       set of strings (URLs) to visit
visited_links:  set of strings (URLs) already processed
hash_codes:     set of strings (alphanumeric hash codes) for visited pages

'''


'''
  read a URL (with possible spaces) in the current line
'''
def get_url(line):
    '''
    Return the part of line that starts at the first occurrence of 'http'
    and runs to the end of the line (embedded spaces included).

    Returns '' when line contains no 'http'.
    '''
    # partition() yields ('', '', '')-style empties for the separator and the
    # tail when 'http' is absent, so the concatenation collapses to ''.
    _before, scheme, remainder = line.partition('http')
    return scheme + remainder


#  readline() keeps the trailing '\n' when the line has one, so strip it.
#  A blank line is returned as ' ' so that '' can only mean end of input.
#
def get_next_line(file):
    '''
    Read the next line from file and return it without its line terminator.

    file     object with a readline() method that returns text

    Returns:
      ''     at end of input (readline() returned an empty string)
      ' '    for a blank line
      text   the line with its trailing newline removed otherwise
    '''
    line = file.readline()
    if not line:
        return line          # end of input

    # Only drop the last character when it really is a newline: the final
    # line of a file may lack one, and must not lose its last character.
    if line.endswith('\n'):
        line = line[:-1]

    return line or ' '


def scroll_to_next_webpage(file):
    '''
    Advance file to the next page header (a line whose first character is
    '*') and return that line.

    Returns '' and sets the global DONE flag when the DONE_string marker or
    the end of the input is reached first.  Also returns '' straight away
    when file is falsy or DONE is already set.
    '''
    global DONE
    if not file:
        DONE = True

    while file and not DONE:
        candidate = get_next_line(file)
        # get_next_line() yields '' only at end of input; blank lines are ' '.
        if candidate in ('', DONE_string):
            DONE = True
            break
        if candidate.startswith('*'):
            return candidate

    return ''


'''
   file              text input file  (the webcrawl dump)
   current_line      string (current line of webcrawl dump)
   frontier          set of strings (URLs to visit)
   visited_links     set of strings (URLs already visited)
   sha_codes         set of strings (hexadecimal hash codes for visited URLs)
'''
def process_page(file, current_line, frontier, visited_links, sha_codes):
    '''
    Record one page entry of the webcrawl dump.

    current_line is the '*' header line of the entry.  The lines after it are
    read from file: a hash-code line (optionally preceded by a line starting
    with '#'), then outgoing links, one per line, up to the first line that
    contains no URL.

    file              text input file  (the webcrawl dump)
    current_line      string (current line of webcrawl dump)
    frontier          set of strings (URLs to visit)
    visited_links     set of strings (URLs already visited)
    sha_codes         set of strings (hexadecimal hash codes for visited URLs)

    Returns None; the three sets are updated in place.  Does nothing when the
    global DONE flag is set or current_line contains no URL.
    '''
    if DONE:
        return

    home_url = get_url(current_line)
    if not home_url:
        return

    visited_links.add(home_url)
    frontier.discard(home_url)    # discard() tolerates a missing element

    digest = get_next_line(file)
    if digest in url_errors.URL_errors:     # one of the special cases
        return

    # startswith() is safe on an empty string, unlike indexing digest[0].
    if digest.startswith('#'):
        digest = get_next_line(file)
    elif digest.startswith('!'):   # already processed URL (via hash contents)
        return

    if not digest:
        # get_next_line() returns '' only at end of input, so the dump was
        # cut off before the hash code: there is nothing more to record.
        return

    sha_codes.add(digest)

    # Outgoing links follow until the first line without a URL.
    while True:
        url = get_url(get_next_line(file))
        if not url:
            break
        if url not in visited_links:
            frontier.add(url)


def process_wg_file(file, visited_links, hash_codes, frontier):
    '''
    Read a whole webcrawl dump from file, one page entry at a time.

    file              text input file  (the webcrawl dump)
    visited_links     set of strings, filled by process_page()
    hash_codes        set of strings, filled by process_page()
    frontier          set of strings, filled by process_page()

    Loops until file is falsy or the global DONE flag is set.
    '''
    while True:
        if not file or DONE:
            break
        header = scroll_to_next_webpage(file)
        process_page(file, header, frontier, visited_links, hash_codes)


# ***** MAIN *********

def main():
    '''
    Read a webcrawl dump from standard input and print the visited links,
    the frontier and the hash codes, each under its own heading.
    '''
    file = sys.stdin
    frontier = set()
    visited_links = set()
    hash_codes = set()
    process_wg_file(file, visited_links, hash_codes, frontier)

    # now print out the results: (heading, underline, entries) per section
    sections = (
        ("Visited Links: ", "-------------  ", visited_links),
        ("Frontier: ", "--------  ", frontier),
        ("Hash Codes: ", "-----------  ", hash_codes),
    )
    for heading, underline, entries in sections:
        print(" ")
        print(heading)
        print(underline)
        for entry in entries:
            print(entry)

    print(" ")


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
  main()