Skip to content

Commit 19f33ac

Browse files
committed
Use constants from RFC 3986
* "|" is removed from the reserved (safe) characters.
* "[" and "]" are added to the reserved (safe) characters.
1 parent 1095a42 commit 19f33ac

File tree

1 file changed

+9
-10
lines changed

1 file changed

+9
-10
lines changed

w3lib/url.py

Lines changed: 9 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,9 @@
88
import re
99
import posixpath
1010
import warnings
11-
import six
11+
import string
1212
from collections import namedtuple
13+
import six
1314
from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
1415
urldefrag, urlencode, urlparse,
1516
quote, parse_qs, parse_qsl,
@@ -24,12 +25,11 @@ def _quote_byte(error):
2425

2526
codecs.register_error('percentencode', _quote_byte)
2627

27-
28-
# Python 2.x urllib.always_safe became private in Python 3.x;
29-
# its content is copied here
30-
_ALWAYS_SAFE_BYTES = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
31-
b'abcdefghijklmnopqrstuvwxyz'
32-
b'0123456789' b'_.-')
28+
# constants from RFC 3986, Sections 2.2 and 2.3
29+
RFC3986_GEN_DELIMS = b':/?#[]@'
30+
RFC3986_SUB_DELIMS = b"!$&'()*+,;="
31+
RFC3986_RESERVED = RFC3986_GEN_DELIMS + RFC3986_SUB_DELIMS
32+
RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode('ascii')
3333

3434

3535
def urljoin_rfc(base, ref, encoding='utf-8'):
@@ -66,9 +66,8 @@ def urljoin_rfc(base, ref, encoding='utf-8'):
6666
str_ref = to_bytes(ref, encoding)
6767
return urljoin(str_base, str_ref)
6868

69-
_reserved = b';/?:@&=+$|,#' # RFC 3986 (Generic Syntax)
70-
_unreserved_marks = b"-_.!~*'()" # RFC 3986 sec 2.3
71-
_safe_chars = _ALWAYS_SAFE_BYTES + b'%' + _reserved + _unreserved_marks
69+
70+
_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + b'%'
7271

7372
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
7473
"""Convert the given URL into a legal URL by escaping unsafe characters

0 commit comments

Comments
 (0)