Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions cardinal_pythonlib/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -1369,13 +1369,14 @@ def _gen_msg_content(
)

for attachment in message.attachments:
# null termination seen in the real world
# https://github.com/TeamMsgExtractor/msg-extractor/issues/464
ext = attachment.extension.replace("\x00", "")
if ext is not None and ext in ext_map:
yield document_to_text(
blob=attachment.data, extension=ext, config=config
)
if (extension := getattr(attachment, "extension", None)) is not None:
# null termination seen in the real world
# https://github.com/TeamMsgExtractor/msg-extractor/issues/464
extension = extension.replace("\x00", "")
if extension in ext_map:
yield document_to_text(
blob=attachment.data, extension=extension, config=config
)


# =============================================================================
Expand Down
49 changes: 49 additions & 0 deletions cardinal_pythonlib/tests/extract_text_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from tempfile import mkdtemp, NamedTemporaryFile
from unittest import mock, TestCase

from extract_msg import SignedAttachment
from faker import Faker
from faker_file.providers.docx_file import DocxFileProvider
from faker_file.providers.eml_file import EmlFileProvider
Expand Down Expand Up @@ -735,3 +736,51 @@ def test_attachment_converted(self) -> None:
converted = convert_msg_to_text(dummy_filename, config=self.config)

self.assertEqual(converted.strip(), content)

def test_attachment_with_null_extension_skipped(self) -> None:
    # An attachment whose ``extension`` attribute is None must be ignored:
    # a message with no body and only such an attachment yields empty text.
    self.fake.add_provider(DocxFileProvider)

    msg_path = "dummy_filename.msg"

    paragraph = self.fake.paragraph(nb_sentences=10)
    docx_bytes = self.fake.docx_file(content=paragraph, raw=True)

    # The attachment carries real .docx data, so any text in the output
    # would mean the attachment was converted rather than skipped.
    attachment = mock.Mock(extension=None, data=BytesIO(docx_bytes).read())
    message = mock.Mock(body=None, htmlBody=None, attachments=[attachment])
    open_msg = mock.Mock(return_value=message)

    patched = mock.patch.multiple(
        "cardinal_pythonlib.extract_text", openMsg=open_msg
    )
    with patched:
        self.config.width = 0
        text = convert_msg_to_text(msg_path, config=self.config)

    self.assertEqual(text.strip(), "")

def test_signed_attachment_with_no_extension_skipped(self) -> None:
    # The mock is built with spec=SignedAttachment and is given no
    # ``extension``; a message with no body and only this attachment must
    # convert to empty text.
    self.fake.add_provider(DocxFileProvider)

    msg_path = "dummy_filename.msg"

    paragraph = self.fake.paragraph(nb_sentences=10)
    docx_bytes = self.fake.docx_file(content=paragraph, raw=True)

    # The attachment carries real .docx data, so any text in the output
    # would mean the attachment was converted rather than skipped.
    signed_attachment = mock.Mock(
        spec=SignedAttachment, data=BytesIO(docx_bytes).read()
    )
    message = mock.Mock(
        body=None, htmlBody=None, attachments=[signed_attachment]
    )
    open_msg = mock.Mock(return_value=message)

    patched = mock.patch.multiple(
        "cardinal_pythonlib.extract_text", openMsg=open_msg
    )
    with patched:
        self.config.width = 0
        text = convert_msg_to_text(msg_path, config=self.config)

    self.assertEqual(text.strip(), "")
13 changes: 9 additions & 4 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -910,6 +910,11 @@ Quick links:
document converters (``.docx``, ``.pdf``, ``.odt`` etc.) to
:func:`cardinal_pythonlib.extract_text.document_to_text`.

.. _changelog_2026:

2026
~~~~

**2.1.2 (2026-01-27)**

- ``cardinalpythonlib_grep_in_openxml``: new facility to search XML node text
Expand All @@ -919,9 +924,9 @@ Quick links:
- Fix extraction of text from HTML files in
:func:`cardinal_pythonlib.extract_text.document_to_text`.

.. _changelog_2026:

2026
~~~~

**2.1.3 (IN PROGRESS)**

- Skip extraction of text from attachments in Outlook ``.msg`` files where the
attachment's file extension is null or missing (e.g. ``SignedAttachment``), in
:func:`cardinal_pythonlib.extract_text.document_to_text`.