forked from colons/pyfoot-plugins
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhttp.py
More file actions
179 lines (158 loc) · 7.67 KB
/
http.py
File metadata and controls
179 lines (158 loc) · 7.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# -*- coding: utf-8 -*-
#import lxml.html
#import lxml.etree
import requests
import urlparse
import re
from random import choice
import urllib
from BaseHTTPServer import BaseHTTPRequestHandler
import chardet
import htmlentitydefs
from hurry import filesize
import time
import plugin
# Default settings for this plugin.
# NOTE(review): presumably merged into the bot configuration by the plugin
# framework -- confirm against plugin.Plugin.
# 'http_url_blacklist' holds strings of the form '<channel> <regex>'; a URL
# posted in <channel> that matches <regex> is ignored by Plugin.title.
defaults = {
'http_url_blacklist': [],
}
# Pool of browser User-Agent header values; choose_agent() picks one of these
# at random and Plugin.title sends it with its HTTP requests.
user_agents = [
'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
]
# Content types (without parameters) whose body is fetched and searched for a
# <title>; any other type is reported by content type and size instead.
html_types = ['text/html','application/xhtml+xml']
# Status codes of a final redirect hop whose Location header Plugin.title
# follows up on.
redirect_codes = [301,302,303]
# Standard-library table mapping an HTTP status code to a
# (short message, long message) pair; used to word HTTP error reports.
responses = BaseHTTPRequestHandler.responses
def choose_agent():
    """Return one entry of `user_agents`, selected at random."""
    agent = choice(user_agents)
    return agent
def ajax_url(url):
    """Rewrite a hashbang (``#!``) URL into its AJAX HTML-snapshot form.

    Follows the URL mapping of Google's AJAX crawling specification
    (https://developers.google.com/webmasters/ajax-crawling/docs/specification):
    the text after ``#!`` is percent-encoded and moved into an
    ``_escaped_fragment_`` query parameter, appended with ``?`` or ``&``
    depending on whether the part before ``#!`` already has a query string.

    url -- the URL, as a byte string or a unicode string.

    Returns the rewritten URL, or `url` unchanged when it contains no ``#!``.
    """
    hashbang_index = url.find('#!')
    if hashbang_index == -1:
        return url

    base = url[:hashbang_index]
    fragment = url[hashbang_index + 2:]
    # Quote UTF-8 bytes rather than text: Python 2's urllib.quote raises
    # KeyError when given a unicode string containing non-ASCII characters.
    if not isinstance(fragment, bytes):
        fragment = fragment.encode('utf-8')
    joiner = '&' if '?' in base else '?'
    # '=' is passed as the `safe` argument, so it is left unescaped while
    # '/' (safe by default) is percent-encoded.
    return ''.join((base, joiner, '_escaped_fragment_=', urllib.quote(fragment, '=')))
def prettify_url(url):
    """Return a compact ``hostname/path`` rendering of a URL for display.

    Only the hostname (as reported by urlparse, i.e. lower-cased and without
    port or credentials) and the path are kept; one trailing slash on the
    path is removed.

    url -- a URL string or an already parsed urlparse.ParseResult.

    Returns the concatenated hostname and path. For a URL without a network
    location the hostname part is empty.
    """
    if not isinstance(url, urlparse.ParseResult):
        url = urlparse.urlparse(url)
    # hostname is None when the URL has no network location (for example a
    # scheme-less 'example.com/x'); fall back to '' instead of failing on
    # None + str.
    hostname = url.hostname or ''
    return hostname + re.sub('/$', '', url.path)
class NoTitleError(Exception):
    """Raised when no title can be reported for a fetched resource."""

    def __init__(self):
        # Takes no arguments: the exception carries no message or payload.
        Exception.__init__(self)
""" Thanks to Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html """
def unescape(text):
    """Replace HTML/XML character references and named entities in `text`.

    Handles decimal references (``&#65;``), hexadecimal references with
    either ``x`` or ``X`` (``&#x41;``, ``&#X41;``) and named entities known
    to htmlentitydefs (``&amp;``). Anything that cannot be decoded -- an
    unknown name, malformed digits, or a code point outside the range
    unichr() accepts -- is left in the output exactly as written.

    text -- the HTML (or XML) source text.

    Returns the plain text, as a unicode string if necessary.
    """
    def fixup(match):
        entity = match.group(0)
        body = entity[1:-1]  # drop the leading '&' and trailing ';'
        try:
            if body[:1] == '#':
                # numeric character reference
                if body[1:2] in ('x', 'X'):
                    return unichr(int(body[2:], 16))
                return unichr(int(body[1:]))
            # named entity
            return unichr(htmlentitydefs.name2codepoint[body])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name. ValueError: bad digits or an
            # invalid code point. OverflowError: number too large for
            # unichr(). In every case leave the reference as is.
            return entity
    return re.sub(r"&#?\w+;", fixup, text)
# Unit table of (size in bytes, suffix) pairs using binary (1024-based)
# prefixes, ordered from largest to smallest. Plugin.title passes it to
# hurry.filesize.size() as the unit system when formatting Content-Length.
filesizes = [
(1024 ** 5, ' PiB'),
(1024 ** 4, ' TiB'),
(1024 ** 3, ' GiB'),
(1024 ** 2, ' MiB'),
(1024 ** 1, ' KiB'),
(1024 ** 0, ' B'),
]
class Plugin(plugin.Plugin):
    """Announce the HTML title, or the content type and size, of posted URLs."""

    # Seconds to wait on each HTTP request, so that a server which never
    # answers cannot stall the handler indefinitely.
    _request_timeout = 10

    def register_commands(self):
        """Route every message that contains an http(s) URL to `title`."""
        self.regexes = [
            ('(?i).*https?://.', self.title)
        ]

    def _is_blacklisted(self, source, url):
        """Return True if `url` matches a blacklist regex configured for `source`."""
        for entry in self.conf.conf['http_url_blacklist']:
            # Entries look like '<channel> <regex>'. Split on the first space
            # only, so that the regex itself may contain spaces.
            channel, _, pattern = entry.partition(' ')
            if not pattern:
                # Malformed entry without a regex: ignore it rather than fail.
                continue
            if channel == source and re.match(pattern, url):
                return True
        return False

    @staticmethod
    def _extract_title(markup):
        """Return the whitespace-normalised text of the first <title> in `markup`.

        Raises NoTitleError when there is no title element or its text is
        empty once entities are decoded and whitespace is collapsed.
        """
        # Non-greedy, so a later </title> (e.g. inside inline SVG) is not
        # swallowed; [^>]* tolerates attributes on the opening tag.
        found = re.search(r'(?si)<title\b[^>]*>(.*?)</title>', markup)
        if found is None:
            raise NoTitleError
        text = re.sub(r'(?s)\s+', ' ', unescape(found.group(1)).strip())
        if text == '':
            raise NoTitleError
        return text

    # Handler for messages containing URLs. For each URL that is not
    # blacklisted for the source channel it issues a HEAD request, fetches
    # the body when the content type is HTML, and posts one summary line
    # ('<title or type/size> | <elapsed> | <hostname>') back to the source.
    #   message -- incoming message; `content` (UTF-8 bytes) and `source`
    #              are read.
    #   args    -- unused.
    # NOTE(review): the docstring below is kept verbatim; its '$' / '>' lines
    # look like an example exchange read by the bot framework -- confirm
    # before rewording it.
    def title(self, message, args):
        """ Returns the HTML title tag of URLs posted.
        $https://twitter.com/#!/camh/statuses/147449116551680001
        >Twitter / Cameron Kenley Hunt: There are only three hard ... \x03#|\x03 \x02twitter.com\x02
        """
        # 'replace' keeps the handler alive when a client sends non-UTF-8 bytes.
        content = message.content.decode('utf-8', 'replace')
        for word in re.findall('(?iu)https?://.*?(?=\\s|\\Z)', content):
            if self._is_blacklisted(message.source, word):
                continue

            # Set it up.
            # NOTE(review): the hostname is taken before IRC formatting is
            # stripped, as in the original; confirm whether that is intended.
            url_hostname = urlparse.urlparse(word).hostname
            word = ajax_url(self.irc.strip_formatting(word))

            agent = choose_agent()
            request_headers = {'User-Agent': agent}
            request_headers_405 = {'User-Agent': agent, 'Range': 'bytes=1-5'}
            timeout = self._request_timeout

            # GO!
            start_time = time.time()
            try:
                resource = requests.head(word, headers=request_headers,
                                         allow_redirects=True, timeout=timeout)
                if resource.status_code == 405:
                    # HEAD is not allowed here: fall back to a GET that asks
                    # for only a few bytes through a Range header.
                    resource = requests.get(word, headers=request_headers_405,
                                            allow_redirects=True, timeout=timeout)
                else:
                    resource.raise_for_status()

                if resource.history and resource.history[-1].status_code in redirect_codes:
                    last_hop = resource.history[-1]
                    location = last_hop.headers.get('Location')
                    if location:
                        # urljoin resolves a relative Location against the URL
                        # that issued the redirect and keeps its query string.
                        word = urlparse.urljoin(last_hop.url, location)
                        redirect_hostname = urlparse.urlparse(word).hostname
                        if redirect_hostname and redirect_hostname != url_hostname:
                            url_hostname = '%s \x03#->\x03 %s' % (url_hostname, prettify_url(word))
                        word = ajax_url(word)

                # A missing header is None in old requests releases and a
                # KeyError in newer ones; .get() covers both.
                content_type = resource.headers.get('Content-Type') or ''
                resource_type = content_type.split(';')[0].strip().lower() or 'unknown type'

                if resource_type in html_types:
                    resource = requests.get(word, headers=request_headers, timeout=timeout)
                    resource.raise_for_status()
                    if resource.encoding == 'ISO-8859-1':
                        # requests falls back to ISO-8859-1 for text responses
                        # that declare no charset, so let chardet guess from
                        # the raw bytes instead.
                        resource.encoding = chardet.detect(resource.content)['encoding']

                    # Seems that most pages claiming to be XHTML -- including many
                    # large websites -- are not strict enough to parse correctly,
                    # usually for some very minor reason, and it's a waste to attempt
                    # to parse it as XML first. This code will remain for the day we
                    # can reliably parse XHTML as XML for the majority of sites.
                    #if (html_types[1] in resource_type) or (('xhtml' or 'xml') in resource.text.split('>')[0].lower()): # application/xhtml+xml
                    #    title = lxml.etree.fromstring(resource.text).find('.//xhtml:title', namespaces={'xhtml':'http://www.w3.org/1999/xhtml'}).text.strip()
                    #else: # text/html
                    #title = lxml.html.fromstring(resource.text).find(".//title").text.replace('\n','').strip()
                    title = self._extract_title(resource.text)
                else:
                    # TODO: Make this feature togglable, since it can seem
                    # spammy for image dumps.
                    raise NoTitleError
            except requests.exceptions.ConnectionError:
                title = 'server connection error'
            except requests.exceptions.Timeout:
                title = 'server timed out'
            except requests.exceptions.HTTPError as httpe:
                status = httpe.response.status_code
                # The stdlib table does not know every status code in use.
                reason = responses.get(status, ('',))[0]
                title = ('%s %s' % (status, reason)).strip()
            except requests.exceptions.RequestException:
                # Invalid URL, redirect loop, and similar request failures.
                title = 'request failed'
            except NoTitleError:
                # No title to show: report content type and size instead.
                try:
                    data_length = filesize.size(float(resource.headers['Content-Length']), filesizes)
                except (KeyError, TypeError, ValueError):
                    # Header missing (KeyError or None) or not a number.
                    data_length = 'size unknown'
                title = '%s \x03#|\x03 %s' % (resource_type, data_length)

            end_time = time.time()
            time_length = '%s seconds' % round(end_time - start_time, 2)
            summary = '%s \x03#|\x03 %s \x03#|\x03 \x02%s\x02' % (title, time_length, url_hostname)
            self.irc.privmsg(message.source, summary)