Skip to content

Commit e58d8fd

Browse files
Default to UTF-8 when no charset in emails
1 parent 75b9ce6 commit e58d8fd

File tree

2 files changed

+28
-1
lines changed

2 files changed

+28
-1
lines changed

cardinal_pythonlib/extract_text.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1286,7 +1286,7 @@ def _get_email_content(
1286 1286
if ext is not None and ext in ext_map:
1287 1287
content = message.get_content()
1288 1288
if isinstance(content, str):
1289-
charset = message["Content-Type"].params["charset"]
1289+
charset = message["Content-Type"].params.get("charset", "utf-8")
1290 1290
blob = content.encode(charset)
1291 1291
elif isinstance(content, EmailMessage):
1292 1292
blob = content.as_bytes()

cardinal_pythonlib/tests/extract_text_tests.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,33 @@ def test_eml_latin1_html_decoded_correctly(self) -> None:
424 424

425 425
self.assertIn("Café", text)
426 426

427+
def test_eml_with_no_charset_converted(self) -> None:
428+
text_content = self.fake.paragraph(nb_sentences=10)
429+
430+
content = f"""From: bar@example.org
431+
Subject: No charset
432+
To: foo@example.org
433+
Mime-Version: 1.0
434+
Content-Type: multipart/mixed;boundary="==="
435+
436+
--===
437+
Content-Type: text/plain
438+
439+
{text_content}
440+
441+
--===--
442+
443+
"""
444+
445+
message = message_from_string(content, policy=policy.default)
446+
blob = message.as_bytes()
447+
448+
text = document_to_text(
449+
blob=blob, extension=".eml", config=self.config
450+
)
451+
452+
self.assertIn(text_content, text)
453+
427 454
def test_unsupported_converted(self) -> None:
428 455
with mock.patch.multiple(
429 456
"cardinal_pythonlib.extract_text.subprocess",

0 commit comments

Comments
 (0)