Merge pull request #71 from ArturGaspar/master

kmike · web-flow · commit f46b4c4140df · 2017-02-08T18:50:42.000+05:00
[MRG+1] data URI parser.
diff --git a/tests/test_url.py b/tests/test_url.py
@@ -4,8 +4,8 @@
 import unittest
 from w3lib.url import (is_url, safe_url_string, safe_download_url,
     url_query_parameter, add_or_replace_parameter, url_query_cleaner,
-    file_uri_to_path, path_to_file_uri, any_to_uri, urljoin_rfc,
-    canonicalize_url, parse_url)
+    file_uri_to_path, parse_data_uri, path_to_file_uri, any_to_uri,
+    urljoin_rfc, canonicalize_url, parse_url)
 from six.moves.urllib.parse import urlparse
 
 
@@ -574,6 +574,94 @@ def test_canonicalize_url_idna_exceptions(self):
                     label=u"example"*11))
 
 
+class DataURITests(unittest.TestCase):  # covers w3lib.url.parse_data_uri
+
+    def test_default_mediatype_charset(self):
+        result = parse_data_uri("data:,A%20brief%20note")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.media_type_parameters, {"charset": "US-ASCII"})
+        self.assertEqual(result.data, b"A brief note")
+
+    def test_text_uri(self):
+        result = parse_data_uri(u"data:,A%20brief%20note")
+        self.assertEqual(result.data, b"A brief note")
+
+    def test_bytes_uri(self):
+        result = parse_data_uri(b"data:,A%20brief%20note")
+        self.assertEqual(result.data, b"A brief note")
+
+    def test_unicode_uri(self):
+        result = parse_data_uri(u"data:,é")  # raw (unescaped) non-ASCII
+        self.assertEqual(result.data, u"é".encode('utf-8'))
+
+    def test_default_mediatype(self):
+        result = parse_data_uri("data:;charset=iso-8859-7,%be%d3%be")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.media_type_parameters,
+                         {"charset": "iso-8859-7"})
+        self.assertEqual(result.data, b"\xbe\xd3\xbe")
+
+    def test_text_charset(self):
+        result = parse_data_uri("data:text/plain;charset=iso-8859-7,%be%d3%be")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.media_type_parameters,
+                         {"charset": "iso-8859-7"})
+        self.assertEqual(result.data, b"\xbe\xd3\xbe")
+
+    def test_mediatype_parameters(self):
+        result = parse_data_uri('data:text/plain;'
+                                'foo=%22foo;bar%5C%22%22;'
+                                'charset=utf-8;'
+                                'bar=%22foo;%5C%22foo%20;/%20,%22,'
+                                '%CE%8E%CE%A3%CE%8E')
+        # In the URI above %22 is a double quote and %5C a backslash.
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.media_type_parameters,
+                         {"charset": "utf-8",
+                          "foo": 'foo;bar"',
+                          "bar": 'foo;"foo ;/ ,'})
+        self.assertEqual(result.data, b"\xce\x8e\xce\xa3\xce\x8e")
+
+    def test_base64(self):
+        result = parse_data_uri("data:text/plain;base64,"
+                                "SGVsbG8sIHdvcmxkLg%3D%3D")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.data, b"Hello, world.")
+
+    def test_base64_spaces(self):
+        result = parse_data_uri("data:text/plain;base64,SGVsb%20G8sIH%0A%20%20"
+                                "dvcm%20%20%20xk%20Lg%3D%0A%3D")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.data, b"Hello, world.")
+        # Same payload, with literal rather than percent-encoded whitespace.
+        result = parse_data_uri("data:text/plain;base64,SGVsb G8sIH\n  "
+                                "dvcm   xk Lg%3D\n%3D")
+        self.assertEqual(result.media_type, "text/plain")
+        self.assertEqual(result.data, b"Hello, world.")
+
+    def test_wrong_base64_param(self):
+        with self.assertRaises(ValueError):  # "baes64" is not "base64"
+            parse_data_uri("data:text/plain;baes64,SGVsbG8sIHdvcmxkLg%3D%3D")
+
+    def test_missing_comma(self):
+        with self.assertRaises(ValueError):
+            parse_data_uri("data:A%20brief%20note")
+
+    def test_missing_scheme(self):
+        with self.assertRaises(ValueError):
+            parse_data_uri("text/plain,A%20brief%20note")
+
+    def test_wrong_scheme(self):
+        with self.assertRaises(ValueError):
+            parse_data_uri("http://example.com/")
+
+    def test_scheme_case_insensitive(self):
+        result = parse_data_uri("DATA:,A%20brief%20note")
+        self.assertEqual(result.data, b"A brief note")
+        result = parse_data_uri("DaTa:,A%20brief%20note")
+        self.assertEqual(result.data, b"A brief note")
+
+
 if __name__ == "__main__":
     unittest.main()
 
diff --git a/w3lib/url.py b/w3lib/url.py
@@ -2,12 +2,14 @@
 This module contains general purpose URL functions not found in the standard
 library.
 """
+import base64
 import codecs
 import os
 import re
 import posixpath
 import warnings
 import six
+from collections import namedtuple
 from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
                                     urldefrag, urlencode, urlparse,
                                     quote, parse_qs, parse_qsl,
@@ -285,11 +287,108 @@ def any_to_uri(uri_or_path):
     return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)
 
 
+# ASCII characters 0-126; range(127) stops before DEL (0x7F).
+_char = set(map(chr, range(127)))
+
+# RFC 2045 token: a run of _char minus controls, tspecials and space.
+_token = r'[{}]+'.format(re.escape(''.join(_char -
+                                           # Control characters (0-31).
+                                           set(map(chr, range(0, 32))) -
+                                           # tspecials and space.
+                                           set('()<>@,;:\\"/[]?= '))))
+
+# RFC 822 quoted-string without its surrounding quotes: qtext or "\" + char.
+_quoted_string = r'(?:[{}]|(?:\\[{}]))*'.format(
+    re.escape(''.join(_char - {'"', '\\', '\r'})),
+    re.escape(''.join(_char))
+)
+
+# Encode the regular expression strings to make them into bytes, as Python 3
+# bytes have no format() method, but bytes must be passed to re.compile() in
+# order to make a pattern object that can be used to match on bytes.
+
+# RFC 2397 mediatype: "type/subtype" only; parameters are matched separately.
+_mediatype_pattern = re.compile(
+    r'{token}/{token}'.format(token=_token).encode()
+)
+_mediatype_parameter_pattern = re.compile(  # one ";attr=value" pair
+    r';({token})=(?:({token})|"({quoted})")'.format(token=_token,
+                                                    quoted=_quoted_string
+                                                    ).encode()
+)
+
+_ParseDataURIResult = namedtuple("ParseDataURIResult",
+                                 "media_type media_type_parameters data")
+
+def parse_data_uri(uri):
+    """Parse a ``data:`` URI (RFC 2397) given as text or bytes.
+
+    Return a ``(media_type, media_type_parameters, data)`` named tuple whose
+    ``data`` is bytes (percent-decoded, and base64-decoded if so flagged).
+    Raise ValueError if the scheme is not ``data`` or the header is malformed.
+    """
+    if not isinstance(uri, bytes):
+        uri = safe_url_string(uri).encode('ascii')
+
+    try:
+        scheme, uri = uri.split(b':', 1)
+    except ValueError:
+        raise ValueError("invalid URI")
+    if scheme.lower() != b'data':
+        raise ValueError("not a data URI")
+
+    # RFC 3986 section 2.1 allows percent encoding to escape characters that
+    # would be interpreted as delimiters, implying that actual delimiters
+    # should not be percent-encoded.
+    # Decoding before parsing will allow malformed URIs with percent-encoded
+    # delimiters, but it makes parsing easier and should not affect
+    # well-formed URIs, as the delimiters used in this URI scheme are not
+    # allowed, percent-encoded or not, in tokens.
+    if six.PY2:
+        uri = unquote(uri)
+    else:
+        uri = unquote_to_bytes(uri)
+
+    media_type = "text/plain"
+    media_type_params = {}
+
+    m = _mediatype_pattern.match(uri)
+    if m:
+        media_type = m.group().decode()
+        uri = uri[m.end():]
+    else:
+        media_type_params['charset'] = "US-ASCII"  # RFC 2397 default
+
+    while True:
+        m = _mediatype_parameter_pattern.match(uri)
+        if not m:
+            break
+        attribute, value, value_quoted = m.groups()
+        # Compare with None: an empty quoted value ("") is b'', which is falsy.
+        if value_quoted is not None:
+            # Unescape quoted-pairs; (?s) lets "." also match an escaped LF.
+            value = re.sub(br'(?s)\\(.)', br'\1', value_quoted)
+        media_type_params[attribute.decode()] = value.decode()
+        uri = uri[m.end():]
+
+    try:
+        is_base64, data = uri.split(b',', 1)  # header remainder, payload
+    except ValueError:
+        raise ValueError("invalid data URI")
+    if is_base64:
+        if is_base64 != b";base64":
+            raise ValueError("invalid data URI")
+        data = base64.b64decode(data)
+
+    return _ParseDataURIResult(media_type, media_type_params, data)
+
+
 __all__ = ["add_or_replace_parameter",
            "any_to_uri",
            "canonicalize_url",
            "file_uri_to_path",
            "is_url",
+           "parse_data_uri",
            "path_to_file_uri",
            "safe_download_url",
            "safe_url_string",