Skip to content

Commit b8c753f

Browse files
jvanasco authored and Gallaecio committed
fix: preservation of url encoded hash signs.
* issue #91
* description: don't decode `%23` to `#` when it appears in a url
* tests-new: test_url.CanonicalizeUrlTest.test_preserve_nonfragment_hash
* tests-pass: py27, py36
* notes: adjustment to _safe_chars suggested by @Gallaecio
1 parent 165f165 commit b8c753f

File tree

2 files changed

+18
-0
lines changed

2 files changed

+18
-0
lines changed

tests/test_url.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -650,6 +650,21 @@ def test_canonicalize_url_idna_exceptions(self):
650650
"http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
651651
label=u"example"*11))
652652

653+
def test_preserve_nonfragment_hash(self):
654+
# don't decode `%23` to `#`
655+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar"),
656+
"http://www.example.com/path/to/%23/foo/bar")
657+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar#frag"),
658+
"http://www.example.com/path/to/%23/foo/bar")
659+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar#frag", keep_fragments=True),
660+
"http://www.example.com/path/to/%23/foo/bar#frag")
661+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2Fpath%2Fto%2F%23%2Fbar%2Ffoo"),
662+
"http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2Fpath%2Fto%2F%23%2Fbar%2Ffoo")
663+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2F%2Fpath%2Fto%2F%23%2Fbar%2Ffoo#frag"),
664+
"http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2F%2Fpath%2Fto%2F%23%2Fbar%2Ffoo")
665+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2F%2Fpath%2Fto%2F%23%2Fbar%2Ffoo#frag", keep_fragments=True),
666+
"http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2F%2Fpath%2Fto%2F%23%2Fbar%2Ffoo#frag")
667+
653668

654669
class DataURITests(unittest.TestCase):
655670

w3lib/url.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ def _quote_byte(error):
3434

3535
_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b'%'
3636

37+
# see https://github.com/scrapy/w3lib/issues/91
38+
_safe_chars = _safe_chars.replace(b'#', b'')
39+
3740
_ascii_tab_newline_re = re.compile(r'[\t\n\r]') # see https://infra.spec.whatwg.org/#ascii-tab-or-newline
3841

3942
def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True):

0 commit comments

Comments
 (0)