Skip to content

Commit 4a61661

Browse files
Merge pull request #43 from RudolfCardinal/outlook-msg-fixups
Skip extraction of text from Outlook .msg files where the file extension is null or missing (e.g. SignedAttachment)
2 parents 3bb8bce + 21a9f57 commit 4a61661

File tree

3 files changed

+66
-11
lines changed

3 files changed

+66
-11
lines changed

cardinal_pythonlib/extract_text.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,13 +1369,14 @@ def _gen_msg_content(
13691369
)
13701370

13711371
for attachment in message.attachments:
1372-
# null termination seen in the real world
1373-
# https://github.com/TeamMsgExtractor/msg-extractor/issues/464
1374-
ext = attachment.extension.replace("\x00", "")
1375-
if ext is not None and ext in ext_map:
1376-
yield document_to_text(
1377-
blob=attachment.data, extension=ext, config=config
1378-
)
1372+
if (extension := getattr(attachment, "extension", None)) is not None:
1373+
# null termination seen in the real world
1374+
# https://github.com/TeamMsgExtractor/msg-extractor/issues/464
1375+
extension = extension.replace("\x00", "")
1376+
if extension in ext_map:
1377+
yield document_to_text(
1378+
blob=attachment.data, extension=extension, config=config
1379+
)
13791380

13801381

13811382
# =============================================================================

cardinal_pythonlib/tests/extract_text_tests.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from tempfile import mkdtemp, NamedTemporaryFile
3434
from unittest import mock, TestCase
3535

36+
from extract_msg import SignedAttachment
3637
from faker import Faker
3738
from faker_file.providers.docx_file import DocxFileProvider
3839
from faker_file.providers.eml_file import EmlFileProvider
@@ -735,3 +736,51 @@ def test_attachment_converted(self) -> None:
735736
converted = convert_msg_to_text(dummy_filename, config=self.config)
736737

737738
self.assertEqual(converted.strip(), content)
739+
740+
def test_attachment_with_null_extension_skipped(self) -> None:
741+
self.fake.add_provider(DocxFileProvider)
742+
743+
dummy_filename = "dummy_filename.msg"
744+
745+
content = self.fake.paragraph(nb_sentences=10)
746+
docx = self.fake.docx_file(content=content, raw=True)
747+
mock_attachment = mock.Mock(
748+
extension=None,
749+
data=BytesIO(docx).read(),
750+
)
751+
mock_msgfile = mock.Mock(
752+
body=None, htmlBody=None, attachments=[mock_attachment]
753+
)
754+
mock_openmsg = mock.Mock(return_value=mock_msgfile)
755+
with mock.patch.multiple(
756+
"cardinal_pythonlib.extract_text",
757+
openMsg=mock_openmsg,
758+
):
759+
self.config.width = 0
760+
converted = convert_msg_to_text(dummy_filename, config=self.config)
761+
762+
self.assertEqual(converted.strip(), "")
763+
764+
def test_signed_attachment_with_no_extension_skipped(self) -> None:
765+
self.fake.add_provider(DocxFileProvider)
766+
767+
dummy_filename = "dummy_filename.msg"
768+
769+
content = self.fake.paragraph(nb_sentences=10)
770+
docx = self.fake.docx_file(content=content, raw=True)
771+
mock_attachment = mock.Mock(
772+
spec=SignedAttachment,
773+
data=BytesIO(docx).read(),
774+
)
775+
mock_msgfile = mock.Mock(
776+
body=None, htmlBody=None, attachments=[mock_attachment]
777+
)
778+
mock_openmsg = mock.Mock(return_value=mock_msgfile)
779+
with mock.patch.multiple(
780+
"cardinal_pythonlib.extract_text",
781+
openMsg=mock_openmsg,
782+
):
783+
self.config.width = 0
784+
converted = convert_msg_to_text(dummy_filename, config=self.config)
785+
786+
self.assertEqual(converted.strip(), "")

docs/source/changelog.rst

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -910,6 +910,11 @@ Quick links:
910910
document converters (``.docx``, ``.pdf``, ``.odt`` etc.) to
911911
:func:`cardinal_pythonlib.extract_text.document_to_text`.
912912

913+
.. _changelog_2026:
914+
915+
2026
916+
~~~~
917+
913918
**2.1.2 (2026-01-27)**
914919

915920
- ``cardinalpythonlib_grep_in_openxml``: new facility to search XML node text
@@ -919,9 +924,9 @@ Quick links:
919924
- Fix extraction of text from HTML files in
920925
:func:`cardinal_pythonlib.extract_text.document_to_text`.
921926

922-
.. _changelog_2026:
923-
924-
2026
925-
~~~~
926927

927928
**2.1.3 (IN PROGRESS)**
929+
930+
- Skip extraction of text from attachments to Outlook ``.msg`` files where the
931+
attachment's file extension is null or missing (e.g. ``SignedAttachment``), in
932+
:func:`cardinal_pythonlib.extract_text.document_to_text`.

0 commit comments

Comments
 (0)