Skip to content

Commit 5861039

Browse files
committed
strip_html5_whitespace
1 parent f46b4c4 commit 5861039

File tree

1 file changed

+19
-0
lines changed

1 file changed

+19
-0
lines changed

w3lib/html.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
# Matches a <meta ...> tag whose attributes contain "http-equiv", then
# "refresh", then a quoted ``content`` value of the form "<number>; url=<url>".
# Named groups:
#   quote - the opening quote character (' or "); the same character must
#           close the value (back-reference ``(?P=quote)``)
#   int   - an integer or decimal number (presumably the refresh delay --
#           confirm against the caller)
#   url   - everything after "url=" up to the closing quote (non-greedy)
# re.DOTALL lets "." span newlines; re.IGNORECASE makes the match
# case-insensitive.
_meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
1818
# Matches a whole CDATA section, ``<![CDATA[ ... ]]>``.
# Named groups: cdata_s (the opening "<![CDATA[" marker), cdata_d (the
# enclosed data, matched non-greedily) and cdata_e (the closing "]]>" marker).
# re.DOTALL lets the enclosed data span multiple lines.
_cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
1919

20+
# Set of characters treated as whitespace when stripping: space, tab,
# line feed, carriage return and form feed (\x0c).
HTML5_WHITESPACE = ' \t\n\r\x0c'
21+
22+
2023
def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
2124
r"""
2225
@@ -317,3 +320,19 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
317320
return interval, url
318321
else:
319322
return None, None
323+
324+
325+
def strip_html5_whitespace(text):
    r"""
    Remove HTML5 "space characters" from both ends of ``text``.

    The characters removed are the ones listed in ``HTML5_WHITESPACE``;
    the HTML5 definition of a space character is given at
    https://www.w3.org/TR/html5/infrastructure.html#space-character.
    Only the leading and trailing runs are removed.

    This comes in handy when handling HTML attribute values that hold
    URLs (``href``, ``src``, a form's ``action``, ...): the HTML5 standard
    describes such values as a "valid URL potentially surrounded by spaces"
    or a "valid non-empty URL potentially surrounded by spaces".

    >>> strip_html5_whitespace(' hello\n')
    'hello'
    """
    chars_to_strip = HTML5_WHITESPACE
    return text.strip(chars_to_strip)

0 commit comments

Comments
 (0)