forked from yaojialyu/crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebPage.py
More file actions
80 lines (67 loc) · 2.98 KB
/
webPage.py
File metadata and controls
80 lines (67 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#coding:utf8
"""
WebPage.py
~~~~~~~~~~~~~
该模块用于下载网页源代码, 允许自定义header与使用代理服务器
"""
import traceback
import re
import logging
import requests
log = logging.getLogger('Main.WebPage')
class WebPage(object):
    """Download the HTML source of a web page.

    Supports custom request headers and optional proxy servers.
    """

    def __init__(self, url):
        self.url = url
        self.pageSource = None  # set by fetch() on success
        self.customeHeaders()   # install default browser-like headers

    def fetch(self, retry=2, proxies=None):
        """Fetch the HTML source of self.url into self.pageSource.

        retry   -- number of retries after a failure (e.g. timeout)
        proxies -- optional proxy dict passed straight to requests.get

        Returns True on success, None otherwise (matching the original
        contract callers rely on).
        """
        try:
            # stream=True defers the body download until response.text is
            # accessed, so non-HTML responses are never fully downloaded.
            # (BUGFIX: 'prefetch=False' was renamed 'stream=True' in
            # requests 1.0 and no longer exists.)
            response = requests.get(self.url, headers=self.headers,
                                    timeout=10, stream=True, proxies=proxies)
            if self._isResponseAvaliable(response):
                self._handleEncoding(response)
                self.pageSource = response.text
                return True
            else:
                log.warning('Page not avaliable. Status code:%d URL: %s \n' % (
                    response.status_code, self.url))
        except Exception as e:  # broad by design: best-effort fetch with retry
            if retry > 0:
                # BUGFIX: propagate proxies on retry (was dropped before).
                return self.fetch(retry - 1, proxies)
            else:
                log.debug(str(e) + ' URL: %s \n' % self.url)
        return None

    def customeHeaders(self, **kargs):
        """Install default browser-like headers; kargs override or extend them.

        Custom headers reduce the chance of being banned; some sites
        (e.g. douban) additionally require cookies, which can be supplied
        via kargs.
        """
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'gb18030,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive',
            # Setting Host causes TooManyRedirects: the hostname does not
            # follow redirects away from the original URL, so leave it unset.
            # 'Host': urlparse(self.url).hostname
            'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.4',
            'Referer': self.url,
        }
        self.headers.update(kargs)

    def getDatas(self):
        """Return the (url, pageSource) pair."""
        return self.url, self.pageSource

    def _isResponseAvaliable(self, response):
        # Accept only 200 responses whose Content-Type mentions html.
        # BUGFIX: use .get() so a missing Content-Type header does not
        # raise KeyError.
        if response.status_code == requests.codes.ok:
            if 'html' in response.headers.get('Content-Type', ''):
                return True
        return False

    def _handleEncoding(self, response):
        # requests handles encoding automatically, but when the header has
        # a text/* Content-Type without an explicit charset it falls back
        # to ISO-8859-1 per RFC 2616; in that case sniff the real charset
        # from the page source's meta tag (e.g. '; charset=gb2312').
        if response.encoding == 'ISO-8859-1':
            charset_re = re.compile(r"((^|;)\s*charset\s*=)([^\"']*)", re.M)
            match = charset_re.search(response.text)
            charset = match.group(3).strip() if match else None
            # BUGFIX: only override when a non-empty charset was found;
            # previously a failed sniff set response.encoding = None.
            if charset:
                response.encoding = charset