Skip to content

Commit 4a11b49

Browse files
Default to UTF-8 when an email part has no Content-Type header
1 parent e58d8fd commit 4a11b49

File tree

2 files changed

+30
-1
lines changed

2 files changed

+30
-1
lines changed

cardinal_pythonlib/extract_text.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1286,7 +1286,10 @@ def _get_email_content(
12861286
if ext is not None and ext in ext_map:
12871287
content = message.get_content()
12881288
if isinstance(content, str):
1289-
charset = message["Content-Type"].params.get("charset", "utf-8")
1289+
charset = "utf-8"
1290+
content_type_header = message.get("Content-Type")
1291+
if content_type_header:
1292+
charset = content_type_header.params.get("charset", "utf-8")
12901293
blob = content.encode(charset)
12911294
elif isinstance(content, EmailMessage):
12921295
blob = content.as_bytes()

cardinal_pythonlib/tests/extract_text_tests.py

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -440,6 +440,32 @@ def test_eml_with_no_charset_converted(self) -> None:
440440
441441
--===--
442442
443+
"""
444+
445+
message = message_from_string(content, policy=policy.default)
446+
blob = message.as_bytes()
447+
448+
text = document_to_text(
449+
blob=blob, extension=".eml", config=self.config
450+
)
451+
452+
self.assertIn(text_content, text)
453+
454+
def test_eml_with_no_content_type_converted(self) -> None:
455+
text_content = self.fake.paragraph(nb_sentences=10)
456+
457+
content = f"""From: bar@example.org
458+
Subject: No content type
459+
To: foo@example.org
460+
Mime-Version: 1.0
461+
Content-Type: multipart/mixed;boundary="==="
462+
463+
--===
464+
465+
{text_content}
466+
467+
--===--
468+
443469
"""
444470

445471
message = message_from_string(content, policy=policy.default)

0 commit comments

Comments (0)