Remove unneeded to_bytes() and to_unicode() usages

Gallaecio · Gallaecio · commit 3f505486e188 · 2021-04-21T12:24:30.000+02:00
diff --git a/w3lib/url.py b/w3lib/url.py
@@ -24,12 +24,12 @@
     urlunsplit,
 )
 from urllib.request import pathname2url, url2pathname
-from w3lib.util import to_bytes, to_unicode
+from w3lib.util import to_unicode
 
 
 # error handling function for bytes-to-Unicode decoding errors with URLs
 def _quote_byte(error):
-    return (to_unicode(quote(error.object[error.start:error.end])), error.end)
+    return (quote(error.object[error.start:error.end]), error.end)
 
 codecs.register_error('percentencode', _quote_byte)
 
@@ -77,26 +77,22 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True)
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
     try:
-        netloc = parts.netloc.encode('idna')
+        netloc = parts.netloc.encode('idna').decode()
     except UnicodeError:
         netloc = parts.netloc
 
     # default encoding for path component SHOULD be UTF-8
     if quote_path:
-        path = quote(to_bytes(parts.path, path_encoding), _path_safe_chars)
+        path = quote(parts.path.encode(path_encoding), _path_safe_chars)
     else:
-        path = to_unicode(parts.path)
+        path = parts.path
     
-    # quote() in Python2 return type follows input type;
-    # quote() in Python3 always returns Unicode (native str)
     return urlunsplit((
-        to_unicode(parts.scheme),
-        to_unicode(netloc).rstrip(':'),
+        parts.scheme,
+        netloc.rstrip(':'),
         path,
-        # encoding of query and fragment follows page encoding
-        # or form-charset (if known and passed)
-        quote(to_bytes(parts.query, encoding), _safe_chars),
-        quote(to_bytes(parts.fragment, encoding), _safe_chars),
+        quote(parts.query.encode(encoding), _safe_chars),
+        quote(parts.fragment.encode(encoding), _safe_chars),
     ))
 
 
@@ -410,22 +406,17 @@ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
     try:
-        netloc = parts.netloc.encode('idna')
+        netloc = parts.netloc.encode('idna').decode()
     except UnicodeError:
         netloc = parts.netloc
 
     return (
-        to_unicode(parts.scheme),
-        to_unicode(netloc),
-
-        # default encoding for path component SHOULD be UTF-8
-        quote(to_bytes(parts.path, path_encoding), _path_safe_chars),
-        quote(to_bytes(parts.params, path_encoding), _safe_chars),
-
-        # encoding of query and fragment follows page encoding
-        # or form-charset (if known and passed)
-        quote(to_bytes(parts.query, encoding), _safe_chars),
-        quote(to_bytes(parts.fragment, encoding), _safe_chars)
+        parts.scheme,
+        netloc,
+        quote(parts.path.encode(path_encoding), _path_safe_chars),
+        quote(parts.params.encode(path_encoding), _safe_chars),
+        quote(parts.query.encode(encoding), _safe_chars),
+        quote(parts.fragment.encode(encoding), _safe_chars)
     )
 
 
@@ -466,7 +457,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
     # if not for proper URL expected by remote website.
     try:
         scheme, netloc, path, params, query, fragment = _safe_ParseResult(
-            parse_url(url), encoding=encoding)
+            parse_url(url), encoding=encoding or 'utf8')
     except UnicodeEncodeError as e:
         scheme, netloc, path, params, query, fragment = _safe_ParseResult(
             parse_url(url), encoding='utf8')
diff --git a/w3lib/util.py b/w3lib/util.py
@@ -33,8 +33,9 @@ def to_unicode(text, encoding=None, errors='strict'):
     if isinstance(text, str):
         return text
     if not isinstance(text, (bytes, str)):
-        raise TypeError('to_unicode must receive a bytes, str or unicode '
-                        'object, got %s' % type(text).__name__)
+        raise TypeError(
+            f'to_unicode must receive bytes or str, got {type(text).__name__}'
+        )
     if encoding is None:
         encoding = 'utf-8'
     return text.decode(encoding, errors)
@@ -45,8 +46,9 @@ def to_bytes(text, encoding=None, errors='strict'):
     if isinstance(text, bytes):
         return text
     if not isinstance(text, str):
-        raise TypeError('to_bytes must receive a unicode, str or bytes '
-                        'object, got %s' % type(text).__name__)
+        raise TypeError(
+            f'to_bytes must receive str or bytes, got {type(text).__name__}'
+        )
     if encoding is None:
         encoding = 'utf-8'
     return text.encode(encoding, errors)