From 16731d8fbd57477704f7ff6aa213c4d3c2e407b7 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Fri, 6 Feb 2026 11:13:40 +0000 Subject: [PATCH 1/3] Handle Outlook msg attachments with no extension --- cardinal_pythonlib/extract_text.py | 15 ++++++------ .../tests/extract_text_tests.py | 24 +++++++++++++++++++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index b0b7371..a5b2a7e 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1369,13 +1369,14 @@ def _gen_msg_content( ) for attachment in message.attachments: - # null termination seen in the real world - # https://github.com/TeamMsgExtractor/msg-extractor/issues/464 - ext = attachment.extension.replace("\x00", "") - if ext is not None and ext in ext_map: - yield document_to_text( - blob=attachment.data, extension=ext, config=config - ) + if (ext := attachment.extension) is not None: + # null termination seen in the real world + # https://github.com/TeamMsgExtractor/msg-extractor/issues/464 + ext = ext.replace("\x00", "") + if ext in ext_map: + yield document_to_text( + blob=attachment.data, extension=ext, config=config + ) # ============================================================================= diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index ce4bab9..3f9a5ce 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -735,3 +735,27 @@ def test_attachment_converted(self) -> None: converted = convert_msg_to_text(dummy_filename, config=self.config) self.assertEqual(converted.strip(), content) + + def test_attachment_with_no_extension_skipped(self) -> None: + self.fake.add_provider(DocxFileProvider) + + dummy_filename = "dummy_filename.msg" + + content = self.fake.paragraph(nb_sentences=10) + docx = self.fake.docx_file(content=content, raw=True) + 
mock_attachment = mock.Mock( + extension=None, + data=BytesIO(docx).read(), + ) + mock_msgfile = mock.Mock( + body=None, htmlBody=None, attachments=[mock_attachment] + ) + mock_openmsg = mock.Mock(return_value=mock_msgfile) + with mock.patch.multiple( + "cardinal_pythonlib.extract_text", + openMsg=mock_openmsg, + ): + self.config.width = 0 + converted = convert_msg_to_text(dummy_filename, config=self.config) + + self.assertEqual(converted.strip(), "") From b8ed2928392d11222d048d91832fada7b30a124f Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Fri, 6 Feb 2026 11:28:59 +0000 Subject: [PATCH 2/3] Handle Outlook SignedAttachment with no extension --- cardinal_pythonlib/extract_text.py | 8 +++--- .../tests/extract_text_tests.py | 27 ++++++++++++++++++- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index a5b2a7e..b74d676 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1369,13 +1369,13 @@ def _gen_msg_content( ) for attachment in message.attachments: - if (ext := attachment.extension) is not None: + if (extension := getattr(attachment, "extension", None)) is not None: # null termination seen in the real world # https://github.com/TeamMsgExtractor/msg-extractor/issues/464 - ext = ext.replace("\x00", "") - if ext in ext_map: + extension = extension.replace("\x00", "") + if extension in ext_map: yield document_to_text( - blob=attachment.data, extension=ext, config=config + blob=attachment.data, extension=extension, config=config ) diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 3f9a5ce..2f9dd14 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -33,6 +33,7 @@ from tempfile import mkdtemp, NamedTemporaryFile from unittest import mock, TestCase +from extract_msg import SignedAttachment from faker import Faker 
from faker_file.providers.docx_file import DocxFileProvider from faker_file.providers.eml_file import EmlFileProvider @@ -736,7 +737,7 @@ def test_attachment_converted(self) -> None: self.assertEqual(converted.strip(), content) - def test_attachment_with_no_extension_skipped(self) -> None: + def test_attachment_with_null_extension_skipped(self) -> None: self.fake.add_provider(DocxFileProvider) dummy_filename = "dummy_filename.msg" @@ -759,3 +760,27 @@ def test_attachment_with_no_extension_skipped(self) -> None: converted = convert_msg_to_text(dummy_filename, config=self.config) self.assertEqual(converted.strip(), "") + + def test_signed_attachment_with_no_extension_skipped(self) -> None: + self.fake.add_provider(DocxFileProvider) + + dummy_filename = "dummy_filename.msg" + + content = self.fake.paragraph(nb_sentences=10) + docx = self.fake.docx_file(content=content, raw=True) + mock_attachment = mock.Mock( + spec=SignedAttachment, + data=BytesIO(docx).read(), + ) + mock_msgfile = mock.Mock( + body=None, htmlBody=None, attachments=[mock_attachment] + ) + mock_openmsg = mock.Mock(return_value=mock_msgfile) + with mock.patch.multiple( + "cardinal_pythonlib.extract_text", + openMsg=mock_openmsg, + ): + self.config.width = 0 + converted = convert_msg_to_text(dummy_filename, config=self.config) + + self.assertEqual(converted.strip(), "") From 21a9f571ca3a201365f84f96431c433fae9c6802 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Fri, 6 Feb 2026 11:32:54 +0000 Subject: [PATCH 3/3] Update changelog --- docs/source/changelog.rst | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 52732aa..e3d989a 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -910,6 +910,11 @@ Quick links: document converters (``.docx``, ``.pdf``, ``.odt`` etc.) to :func:`cardinal_pythonlib.extract_text.document_to_text`. +.. 
_changelog_2026: + +2026 +~~~~ + **2.1.2 (2026-01-27)** - ``cardinalpythonlib_grep_in_openxml``: new facility to search XML node text @@ -919,9 +924,9 @@ Quick links: - Fix extraction of text from HTML files in :func:`cardinal_pythonlib.extract_text.document_to_text`. -.. _changelog_2026: - -2026 -~~~~ **2.1.3 (IN PROGRESS)** + +- Skip attachments in Outlook ``.msg`` files whose file extension is null or + missing (e.g. ``SignedAttachment``) when extracting text in + :func:`cardinal_pythonlib.extract_text.document_to_text`.