diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index b0b7371..b74d676 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1369,13 +1369,14 @@ def _gen_msg_content( ) for attachment in message.attachments: - # null termination seen in the real world - # https://github.com/TeamMsgExtractor/msg-extractor/issues/464 - ext = attachment.extension.replace("\x00", "") - if ext is not None and ext in ext_map: - yield document_to_text( - blob=attachment.data, extension=ext, config=config - ) + if (extension := getattr(attachment, "extension", None)) is not None: + # null termination seen in the real world + # https://github.com/TeamMsgExtractor/msg-extractor/issues/464 + extension = extension.replace("\x00", "") + if extension in ext_map: + yield document_to_text( + blob=attachment.data, extension=extension, config=config + ) # ============================================================================= diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index ce4bab9..2f9dd14 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -33,6 +33,7 @@ from tempfile import mkdtemp, NamedTemporaryFile from unittest import mock, TestCase +from extract_msg import SignedAttachment from faker import Faker from faker_file.providers.docx_file import DocxFileProvider from faker_file.providers.eml_file import EmlFileProvider @@ -735,3 +736,51 @@ def test_attachment_converted(self) -> None: converted = convert_msg_to_text(dummy_filename, config=self.config) self.assertEqual(converted.strip(), content) + + def test_attachment_with_null_extension_skipped(self) -> None: + self.fake.add_provider(DocxFileProvider) + + dummy_filename = "dummy_filename.msg" + + content = self.fake.paragraph(nb_sentences=10) + docx = self.fake.docx_file(content=content, raw=True) + mock_attachment = mock.Mock( + 
extension=None, + data=BytesIO(docx).read(), + ) + mock_msgfile = mock.Mock( + body=None, htmlBody=None, attachments=[mock_attachment] + ) + mock_openmsg = mock.Mock(return_value=mock_msgfile) + with mock.patch.multiple( + "cardinal_pythonlib.extract_text", + openMsg=mock_openmsg, + ): + self.config.width = 0 + converted = convert_msg_to_text(dummy_filename, config=self.config) + + self.assertEqual(converted.strip(), "") + + def test_signed_attachment_with_no_extension_skipped(self) -> None: + self.fake.add_provider(DocxFileProvider) + + dummy_filename = "dummy_filename.msg" + + content = self.fake.paragraph(nb_sentences=10) + docx = self.fake.docx_file(content=content, raw=True) + mock_attachment = mock.Mock( + spec=SignedAttachment, + data=BytesIO(docx).read(), + ) + mock_msgfile = mock.Mock( + body=None, htmlBody=None, attachments=[mock_attachment] + ) + mock_openmsg = mock.Mock(return_value=mock_msgfile) + with mock.patch.multiple( + "cardinal_pythonlib.extract_text", + openMsg=mock_openmsg, + ): + self.config.width = 0 + converted = convert_msg_to_text(dummy_filename, config=self.config) + + self.assertEqual(converted.strip(), "") diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 52732aa..e3d989a 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -910,6 +910,11 @@ Quick links: document converters (``.docx``, ``.pdf``, ``.odt`` etc.) to :func:`cardinal_pythonlib.extract_text.document_to_text`. +.. _changelog_2026: + +2026 +~~~~ + **2.1.2 (2026-01-27)** - ``cardinalpythonlib_grep_in_openxml``: new facility to search XML node text @@ -919,9 +924,9 @@ Quick links: - Fix extraction of text from HTML files in :func:`cardinal_pythonlib.extract_text.document_to_text`. -.. _changelog_2026: - -2026 -~~~~ **2.1.3 (IN PROGRESS)** + +- Skip extraction of text from attachments to Outlook ``.msg`` files where the + attachment's file extension is null or missing (``SignedAttachment``); see + :func:`cardinal_pythonlib.extract_text.document_to_text`.