Skip to content

Commit f6c53cb

Browse files
committed
gh-150771: Use output charset in set_text_content for shift_jis/euc-jp
Encode the payload with the charset's output mapping (iso-2022-jp) when set_content is called with shift_jis or euc-jp, instead of patching serialization in body_encode and set_payload. This reverts the earlier changes to those two methods.
1 parent 24f6067 commit f6c53cb

5 files changed

Lines changed: 12 additions & 24 deletions

File tree

Lib/email/charset.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import email.quoprimime
1717

1818
from email import errors
19-
from email import utils
2019
from email.encoders import encode_7or8bit
2120

2221

@@ -439,12 +438,5 @@ def body_encode(self, string):
439438
return email.quoprimime.body_encode(string)
440439
else:
441440
if isinstance(string, str):
442-
if utils._has_surrogates(string):
443-
string = string.encode('ascii', 'surrogateescape')
444-
if self.input_charset != self.output_charset:
445-
string = (string.decode(self.input_codec)
446-
.encode(self.output_codec))
447-
string = string.decode('ascii', 'surrogateescape')
448-
else:
449-
string = string.encode(self.output_charset).decode('ascii')
441+
string = string.encode(self.output_charset).decode('ascii')
450442
return string

Lib/email/contentmanager.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,8 @@ def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None,
174174
params=None, headers=None):
175175
_prepare_set(msg, 'text', subtype, headers)
176176

177-
charset = email.charset.Charset(charset).input_charset
177+
cs = email.charset.Charset(charset)
178+
charset = cs.output_charset
178179
cte, payload = _encode_text(string, charset, cte, msg.policy)
179180
msg.set_payload(payload)
180181
msg.set_param('charset', charset, replace=True)

Lib/email/message.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -352,9 +352,7 @@ def set_payload(self, payload, charset=None):
352352
return
353353
if not isinstance(charset, Charset):
354354
charset = Charset(charset)
355-
if not utils._has_surrogates(payload):
356-
payload = payload.encode(charset.output_charset,
357-
'surrogateescape')
355+
payload = payload.encode(charset.output_charset, 'surrogateescape')
358356
if hasattr(payload, 'decode'):
359357
self._payload = payload.decode('ascii', 'surrogateescape')
360358
else:

Lib/test/test_email/test_contentmanager.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -366,11 +366,9 @@ def test_set_text_charset_shift_jis(self):
366366
m = self._make_message()
367367
content = "\u65e5\u672c\u8a9e\n"
368368
raw_data_manager.set_content(m, content, charset='shift_jis')
369-
self.assertEqual(m['Content-Type'], 'text/plain; charset="shift_jis"')
370-
self.assertEqual(m['Content-Transfer-Encoding'], '8bit')
371-
self.assertEqual(m.get_payload(decode=True), content.encode('shift_jis'))
369+
self.assertEqual(m['Content-Type'], 'text/plain; charset="iso-2022-jp"')
370+
self.assertEqual(m.get_payload(decode=True), content.encode('iso-2022-jp'))
372371
self.assertEqual(m.get_content(), content)
373-
# Serialization converts the payload to iso-2022-jp for output.
374372
self.assertEqual(str(m), textwrap.dedent("""\
375373
Content-Type: text/plain; charset="iso-2022-jp"
376374
Content-Transfer-Encoding: 7bit
@@ -382,11 +380,9 @@ def test_set_text_charset_euc_jp(self):
382380
m = self._make_message()
383381
content = "\u65e5\u672c\u8a9e\n"
384382
raw_data_manager.set_content(m, content, charset='euc-jp')
385-
self.assertEqual(m['Content-Type'], 'text/plain; charset="euc-jp"')
386-
self.assertEqual(m['Content-Transfer-Encoding'], '8bit')
387-
self.assertEqual(m.get_payload(decode=True), content.encode('euc-jp'))
383+
self.assertEqual(m['Content-Type'], 'text/plain; charset="iso-2022-jp"')
384+
self.assertEqual(m.get_payload(decode=True), content.encode('iso-2022-jp'))
388385
self.assertEqual(m.get_content(), content)
389-
# Serialization converts the payload to iso-2022-jp for output.
390386
self.assertEqual(str(m), textwrap.dedent("""\
391387
Content-Type: text/plain; charset="iso-2022-jp"
392388
Content-Transfer-Encoding: 7bit
Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1-
Fix serialization of :mod:`email` messages using ``shift_jis`` or ``euc-jp``
2-
charsets. Converting surrogate-escaped payloads to the required
3-
``iso-2022-jp`` output charset no longer raises :exc:`UnicodeEncodeError`.
1+
Fix serialization of :mod:`email` messages created with ``shift_jis`` or ``euc-jp`` charsets.
2+
:func:`email.contentmanager.set_text_content` now stores the payload using
3+
the output charset (``iso-2022-jp``) so :func:`str` on the message no longer
4+
raises :exc:`UnicodeEncodeError`.

0 commit comments

Comments
 (0)