import urllib2
import re
import urlparse
import robotparser
import datetime
import time
import random
from html_parser import ScrapeCallback
from db_cache import MongoCache


class downloader:
    """Callable that downloads a url, with caching, throttling, proxy
    support and retries on server errors."""

    def __init__(self, cache=None, delay=1,
                 user_agent='Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
                 proxies=None, num_retries=2):
        self.cache = cache
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.throttle = Throttle(delay)

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # url is not available in cache
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # server error, so ignore the cached result
                    # and re-download
                    result = None
        if result is None:
            # result could not be loaded from the cache,
            # so still need to download
            self.throttle.wait(url)
            if self.proxies:
                proxy = random.choice(self.proxies)
            else:
                proxy = None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        return result['html']
    # modified to support a proxy
    def download(self, url, headers, proxy, num_retries):
        print 'Downloading:', url
        # data=None keeps this a GET request; passing an empty string
        # would turn it into a POST
        request = urllib2.Request(url, data=None, headers=headers)
        opener = urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except urllib2.URLError as e:
            print 'Download error:', e.reason
            html = None
            code = getattr(e, 'code', None)
            if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return self.download(url, headers, proxy, num_retries - 1)
        # return a dict so the cache can store the HTTP code with the html,
        # which __call__ checks before reusing a cached result
        return {'html': html, 'code': code}
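

# A minimal usage sketch of the downloader above (no cache and no proxies
# are assumed; the URL is the test site used elsewhere in this file):
def downloader_example():
    D = downloader(delay=2, num_retries=1)
    html = D('http://example.webscraping.com')
    print len(html or '')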


def download_test():
    response = urllib2.urlopen('http://example.webscraping.com')
    html = response.read()
    print html


# def crawl_sitemap(url):
#     sitemap = download(url)
#     links = re.findall()


def get_links(html):
    """Return a list of links extracted from the html of a webpage."""
    if html is None:
        # a failed download yields no links
        return []
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
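

# A quick illustration of get_links on an inline snippet (the href values
# here are made up for the example):
def get_links_example():
    html = '<a href="/places/view/1">one</a> <a href=\'/places/index/0\'>two</a>'
    print get_links(html)  # ['/places/view/1', '/places/index/0']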


def link_crawler(seed_url, delay=1, link_regex=None, proxies=None, max_depth=2,
                 user_agent='wswp', num_retries=2,
                 scrape_callback=ScrapeCallback(), cache=None):
    crawl_queue = [seed_url]
    # map each seen url to the depth at which it was found
    seen = {seed_url: 0}
    rp = robotparser.RobotFileParser()
    # initialize the downloader, defaulting to a MongoDB-backed cache
    if cache is None:
        cache = MongoCache()
    D = downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries)
    while crawl_queue:
        url = crawl_queue.pop()
        # detect whether this url is banned
        # if rp.can_fetch(user_agent, url):
        depth = seen[url]
        if depth != max_depth:
            html = D(url)
            # parse the html and convert the records to csv
            if scrape_callback:
                scrape_callback(url, html)
            for link in get_links(html):
                if link_regex is None or re.search(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
        # else:
        #     print 'Blocked by robots.txt:', url
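

# Note: rp above is created but never loaded, so the commented-out
# can_fetch() check would not work as-is. A sketch of how it could be wired
# up before the crawl loop (assumes the site serves /robots.txt):
def load_robots(rp, seed_url):
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp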


class Throttle:
    """Add a delay between two consecutive downloads to the same domain."""

    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently,
                # so need to sleep
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()
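

# A small sketch of the Throttle behaviour (assumes a 2-second delay): the
# second wait() on the same domain sleeps so requests stay ~2 seconds apart.
def throttle_example():
    t = Throttle(delay=2)
    t.wait('http://example.webscraping.com/')       # first hit, no sleep
    t.wait('http://example.webscraping.com/other')  # same domain, sleeps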


#
# test
#
def main():
    print datetime.datetime.now()
    link_crawler('http://example.webscraping.com', link_regex='/(view|index)')
    print datetime.datetime.now()


if __name__ == '__main__':
    main()