Skip to content

Commit f46b4c4

Browse files
authored
Merge pull request #71 from ArturGaspar/master
[MRG+1] data URI parser.
2 parents a997816 + 5ec787b commit f46b4c4

File tree

2 files changed

+189
-2
lines changed

2 files changed

+189
-2
lines changed

tests/test_url.py

Lines changed: 90 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,8 @@
44
import unittest
55
from w3lib.url import (is_url, safe_url_string, safe_download_url,
66
url_query_parameter, add_or_replace_parameter, url_query_cleaner,
7-
file_uri_to_path, path_to_file_uri, any_to_uri, urljoin_rfc,
8-
canonicalize_url, parse_url)
7+
file_uri_to_path, parse_data_uri, path_to_file_uri, any_to_uri,
8+
urljoin_rfc, canonicalize_url, parse_url)
99
from six.moves.urllib.parse import urlparse
1010

1111

@@ -574,6 +574,94 @@ def test_canonicalize_url_idna_exceptions(self):
574574
label=u"example"*11))
575575

576576

577+
class DataURITests(unittest.TestCase):

    def test_default_mediatype_charset(self):
        # A URI with neither media type nor parameters gets both defaults.
        parsed = parse_data_uri("data:,A%20brief%20note")
        self.assertEqual(parsed.media_type, "text/plain")
        self.assertEqual(parsed.media_type_parameters, {"charset": "US-ASCII"})
        self.assertEqual(parsed.data, b"A brief note")

    def test_text_uri(self):
        # Text (unicode) input is accepted.
        parsed = parse_data_uri(u"data:,A%20brief%20note")
        self.assertEqual(parsed.data, b"A brief note")

    def test_bytes_uri(self):
        # Bytes input is accepted.
        parsed = parse_data_uri(b"data:,A%20brief%20note")
        self.assertEqual(parsed.data, b"A brief note")

    def test_unicode_uri(self):
        # Non-ASCII text in the payload comes back as its UTF-8 bytes.
        parsed = parse_data_uri(u"data:,é")
        self.assertEqual(parsed.data, u"é".encode('utf-8'))

    def test_default_mediatype(self):
        # Parameters without a media type: the type defaults, the explicit
        # charset is kept.
        parsed = parse_data_uri("data:;charset=iso-8859-7,%be%d3%be")
        self.assertEqual(parsed.media_type, "text/plain")
        self.assertEqual(parsed.media_type_parameters,
                         {"charset": "iso-8859-7"})
        self.assertEqual(parsed.data, b"\xbe\xd3\xbe")

    def test_text_charset(self):
        parsed = parse_data_uri("data:text/plain;charset=iso-8859-7,%be%d3%be")
        self.assertEqual(parsed.media_type, "text/plain")
        self.assertEqual(parsed.media_type_parameters,
                         {"charset": "iso-8859-7"})
        self.assertEqual(parsed.data, b"\xbe\xd3\xbe")

    def test_mediatype_parameters(self):
        # Quoted parameter values may contain delimiters and escaped quotes.
        uri = ('data:text/plain;'
               'foo=%22foo;bar%5C%22%22;'
               'charset=utf-8;'
               'bar=%22foo;%5C%22foo%20;/%20,%22,'
               '%CE%8E%CE%A3%CE%8E')
        expected_parameters = {"charset": "utf-8",
                               "foo": 'foo;bar"',
                               "bar": 'foo;"foo ;/ ,'}

        parsed = parse_data_uri(uri)

        self.assertEqual(parsed.media_type, "text/plain")
        self.assertEqual(parsed.media_type_parameters, expected_parameters)
        self.assertEqual(parsed.data, b"\xce\x8e\xce\xa3\xce\x8e")

    def test_base64(self):
        parsed = parse_data_uri("data:text/plain;base64,"
                                "SGVsbG8sIHdvcmxkLg%3D%3D")
        self.assertEqual(parsed.media_type, "text/plain")
        self.assertEqual(parsed.data, b"Hello, world.")

    def test_base64_spaces(self):
        # Whitespace inside the base64 payload is ignored, whether it is
        # percent-encoded or literal.
        spaced_uris = (
            "data:text/plain;base64,SGVsb%20G8sIH%0A%20%20"
            "dvcm%20%20%20xk%20Lg%3D%0A%3D",
            "data:text/plain;base64,SGVsb G8sIH\n "
            "dvcm xk Lg%3D\n%3D",
        )
        for uri in spaced_uris:
            parsed = parse_data_uri(uri)
            self.assertEqual(parsed.media_type, "text/plain")
            self.assertEqual(parsed.data, b"Hello, world.")

    def test_wrong_base64_param(self):
        self.assertRaises(ValueError, parse_data_uri,
                          "data:text/plain;baes64,SGVsbG8sIHdvcmxkLg%3D%3D")

    def test_missing_comma(self):
        self.assertRaises(ValueError, parse_data_uri,
                          "data:A%20brief%20note")

    def test_missing_scheme(self):
        self.assertRaises(ValueError, parse_data_uri,
                          "text/plain,A%20brief%20note")

    def test_wrong_scheme(self):
        self.assertRaises(ValueError, parse_data_uri,
                          "http://example.com/")

    def test_scheme_case_insensitive(self):
        # The scheme is matched without regard to letter case.
        for uri in ("DATA:,A%20brief%20note", "DaTa:,A%20brief%20note"):
            parsed = parse_data_uri(uri)
            self.assertEqual(parsed.data, b"A brief note")
663+
664+
577665
if __name__ == "__main__":
578666
unittest.main()
579667

w3lib/url.py

Lines changed: 99 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,14 @@
22
This module contains general purpose URL functions not found in the standard
33
library.
44
"""
5+
import base64
56
import codecs
67
import os
78
import re
89
import posixpath
910
import warnings
1011
import six
12+
from collections import namedtuple
1113
from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
1214
urldefrag, urlencode, urlparse,
1315
quote, parse_qs, parse_qsl,
@@ -285,11 +287,108 @@ def any_to_uri(uri_or_path):
285287
return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)
286288

287289

290+
# The characters with codes 0 through 126.
_char = {chr(code) for code in range(127)}

# Characters that may not appear in an RFC 2045 token: the control
# characters, and the tspecials plus space.
_control_chars = {chr(code) for code in range(0, 32)}
_tspecials_and_space = set('()<>@,;:\\"/[]?= ')

# RFC 2045 token.
_token_chars = _char - _control_chars - _tspecials_and_space
_token = r'[{}]+'.format(re.escape(''.join(sorted(_token_chars))))

# RFC 822 quoted-string, without surrounding quotation marks: a run of plain
# characters and backslash-escaped pairs.
_quoted_plain_chars = _char - {'"', '\\', '\r'}
_quoted_string = r'(?:[{}]|(?:\\[{}]))*'.format(
    re.escape(''.join(sorted(_quoted_plain_chars))),
    re.escape(''.join(sorted(_char))),
)

# The expressions are assembled as text because Python 3 bytes have no
# format() method; they are encoded just before compiling because only a
# bytes pattern can be matched against bytes.

# RFC 2397 mediatype.
_mediatype_regex = r'{token}/{token}'.format(token=_token)
_mediatype_pattern = re.compile(_mediatype_regex.encode())

# One ";attribute=value" parameter; the value is a token or a quoted string.
_mediatype_parameter_regex = (
    r';({token})=(?:({token})|"({quoted})")'.format(token=_token,
                                                    quoted=_quoted_string)
)
_mediatype_parameter_pattern = re.compile(_mediatype_parameter_regex.encode())

_ParseDataURIResult = namedtuple(
    "ParseDataURIResult",
    "media_type media_type_parameters data",
)
322+
323+
def parse_data_uri(uri):
    """Parse a ``data:`` URI into media type, parameters and data.

    ``uri`` may be bytes or text; text is first passed through
    ``safe_url_string()`` and encoded as ASCII.

    Returns a ``ParseDataURIResult`` named tuple with three fields:

    * ``media_type``: the media type as a string, ``"text/plain"`` when the
      URI does not name one;
    * ``media_type_parameters``: a dict mapping parameter names to values.
      When the URI names no media type, ``charset`` is preset to
      ``"US-ASCII"``; a ``charset`` parameter in the URI replaces it;
    * ``data``: the payload as bytes, percent-decoded and, when the URI is
      marked ``;base64``, base64-decoded.

    Raises ``ValueError`` if the URI has no ``:``, if its scheme is not
    ``data`` (compared case-insensitively), if it has no ``,`` separating
    the header from the data, or if anything other than ``;base64`` is left
    between the parameters and the comma. Errors raised by
    ``base64.b64decode()`` for a malformed payload propagate unchanged.
    """
    if not isinstance(uri, bytes):
        uri = safe_url_string(uri).encode('ascii')

    try:
        scheme, uri = uri.split(b':', 1)
    except ValueError:
        raise ValueError("invalid URI")
    if scheme.lower() != b'data':
        raise ValueError("not a data URI")

    # RFC 3986 section 2.1 allows percent encoding to escape characters that
    # would be interpreted as delimiters, implying that actual delimiters
    # should not be percent-encoded.
    # Decoding before parsing will allow malformed URIs with percent-encoded
    # delimiters, but it makes parsing easier and should not affect
    # well-formed URIs, as the delimiters used in this URI scheme are not
    # allowed, percent-encoded or not, in tokens.
    if six.PY2:
        uri = unquote(uri)
    else:
        uri = unquote_to_bytes(uri)

    media_type = "text/plain"
    media_type_params = {}

    m = _mediatype_pattern.match(uri)
    if m:
        media_type = m.group().decode()
        uri = uri[m.end():]
    else:
        media_type_params['charset'] = "US-ASCII"

    # Consume ";attribute=value" parameters one at a time from the front.
    while True:
        m = _mediatype_parameter_pattern.match(uri)
        if m is None:
            break
        attribute, value, value_quoted = m.groups()
        if value is None:
            # The quoted-string alternative matched. Test for None rather
            # than truthiness so that an empty quoted value ("") is kept as
            # an empty string instead of leaving value as None.
            # Strip the backslash from every quoted pair; (?s) lets "."
            # match a newline too, which the quoted-string grammar allows
            # after a backslash.
            value = re.sub(br'(?s)\\(.)', br'\1', value_quoted)
        media_type_params[attribute.decode()] = value.decode()
        uri = uri[m.end():]

    try:
        is_base64, data = uri.split(b',', 1)
    except ValueError:
        raise ValueError("invalid data URI")
    if is_base64:
        # Whatever is left before the comma must be exactly the base64 flag.
        if is_base64 != b";base64":
            raise ValueError("invalid data URI")
        data = base64.b64decode(data)

    return _ParseDataURIResult(media_type, media_type_params, data)
384+
385+
288386
__all__ = ["add_or_replace_parameter",
289387
"any_to_uri",
290388
"canonicalize_url",
291389
"file_uri_to_path",
292390
"is_url",
391+
"parse_data_uri",
293392
"path_to_file_uri",
294393
"safe_download_url",
295394
"safe_url_string",

0 commit comments

Comments
 (0)