Skip to content

Commit 409d8da

Browse files
Merge pull request #38 from RudolfCardinal/outlook-msg-text-extraction
Support text extraction from Outlook msg files
2 parents ccca6e2 + 55ded1a commit 409d8da

File tree

4 files changed

+201
-12
lines changed

4 files changed

+201
-12
lines changed

cardinal_pythonlib/extract_text.py

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,13 +100,15 @@
100100
Iterator,
101101
List,
102102
Optional,
103+
TYPE_CHECKING,
103104
)
104105
from xml.etree import ElementTree as ElementTree
105106
import zipfile
106107

107108
import bs4
108109
import chardet
109110
from chardet.universaldetector import UniversalDetector
111+
from extract_msg import openMsg
110112
import pdfminer # pip install pdfminer.six
111113
import pdfminer.pdfinterp
112114
import pdfminer.converter
@@ -117,6 +119,8 @@
117119

118120
from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler
119121

122+
if TYPE_CHECKING:
123+
from extract_msg import MSGFile
120124

121125
log = get_brace_style_log_with_null_handler(__name__)
122126

@@ -1320,6 +1324,60 @@ def _get_email_content(
13201324
return None
13211325

13221326

1327+
# =============================================================================
1328+
# MSG (Outlook binary format)
1329+
# =============================================================================
1330+
1331+
1332+
def convert_msg_to_text(
    filename: Optional[str] = None,
    blob: Optional[bytes] = None,
    config: TextProcessingConfig = _DEFAULT_CONFIG,
) -> str:
    """
    Converts an Outlook ``.msg`` file (or its binary contents) to text.

    Exactly one of ``filename`` or ``blob`` must be supplied. The message is
    opened with ``extract_msg.openMsg``; the text pieces produced by
    :func:`_gen_msg_content` (message body, then convertible attachments) are
    joined with newlines.

    Args:
        filename: path of the ``.msg`` file to read
        blob: contents of a ``.msg`` file, as an alternative to ``filename``
        config: :class:`TextProcessingConfig` control object, passed through
            to the converters used for the HTML body and for attachments

    Returns:
        the extracted text, with one newline between each piece

    Raises:
        ValueError: if neither, or both, of ``filename`` and ``blob`` are
            specified
    """
    if not filename and blob is None:
        raise ValueError("convert_msg_to_text: no filename and no blob")
    if filename and blob:
        raise ValueError(
            "convert_msg_to_text: specify either filename or blob"
        )

    filename_or_blob = blob if blob is not None else filename

    # delayAttachments=False: attachments are needed straight away, because
    # _gen_msg_content() iterates over them.
    message = openMsg(filename_or_blob, delayAttachments=False)
    try:
        # The generator is typed as yielding Optional[str]; drop any None so
        # that str.join() cannot fail on a non-string item.
        message_content_list = [
            message_content
            for message_content in _gen_msg_content(message, config=config)
            if message_content is not None
        ]
    finally:
        # Release the underlying file handle even if a converter raises.
        message.close()

    return "\n".join(message_content_list)
1359+
1360+
1361+
def _gen_msg_content(
    message: "MSGFile", config: TextProcessingConfig
) -> Generator[Optional[str], None, None]:
    """
    Generates the text content of an Outlook message, piece by piece.

    The plain-text body is yielded if present; otherwise the HTML body (if
    any) is converted to text and yielded. Then each attachment whose
    extension has an entry in ``ext_map`` is converted via
    :func:`document_to_text` and yielded. Attachments with no extension, or
    with an extension not in ``ext_map``, are skipped.

    Args:
        message: the opened message, as returned by ``extract_msg.openMsg``
        config: :class:`TextProcessingConfig` control object, passed to
            :func:`document_to_text`

    Yields:
        text for the body and for each convertible attachment
    """
    if message.body is not None:
        yield message.body
    elif message.htmlBody is not None:
        yield document_to_text(
            blob=message.htmlBody, extension=".htm", config=config
        )

    for attachment in message.attachments:
        raw_ext = attachment.extension
        if raw_ext is None:
            # Nothing to look up; check this *before* calling str methods.
            continue

        # null termination seen in the real world
        # https://github.com/TeamMsgExtractor/msg-extractor/issues/464
        # Lower-cased so that e.g. ".PDF" matches the lower-case ext_map keys.
        ext = raw_ext.replace("\x00", "").lower()
        if ext in ext_map:
            yield document_to_text(
                blob=attachment.data, extension=ext, config=config
            )
1379+
1380+
13231381
# =============================================================================
13241382
# Anything
13251383
# =============================================================================
@@ -1371,12 +1429,7 @@ def availability_anything() -> bool:
13711429
".htm": {CONVERTER: convert_html_to_text, AVAILABILITY: True},
13721430
".html": {CONVERTER: convert_html_to_text, AVAILABILITY: True},
13731431
".log": {CONVERTER: get_file_contents_text, AVAILABILITY: True},
1374-
# .msg is often Outlook binary, not text
1375-
#
1376-
# '.msg': {
1377-
# CONVERTER: get_file_contents_text,
1378-
# AVAILABILITY: True,
1379-
# },
1432+
".msg": {CONVERTER: convert_msg_to_text, AVAILABILITY: True},
13801433
".odt": {CONVERTER: convert_odt_to_text, AVAILABILITY: True},
13811434
".pdf": {CONVERTER: convert_pdf_to_txt, AVAILABILITY: availability_pdf},
13821435
".rtf": {CONVERTER: convert_rtf_to_text, AVAILABILITY: availability_rtf},

cardinal_pythonlib/tests/extract_text_tests.py

Lines changed: 135 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
from email import message_from_string, policy
2929
from email.message import EmailMessage
30+
from io import BytesIO
3031
import os
3132
import subprocess
3233
from tempfile import mkdtemp, NamedTemporaryFile
@@ -44,18 +45,26 @@
4445
from faker_file.providers.xml_file import XmlFileProvider
4546

4647
from cardinal_pythonlib.extract_text import (
48+
convert_msg_to_text,
4749
document_to_text,
4850
TextProcessingConfig,
4951
update_external_tools,
5052
)
5153

5254

53-
class DocumentToTextTests(TestCase):
55+
class ExtractTextTestCase(TestCase):
    """
    Base class for the text extraction tests, providing the fixtures they
    share: a default ``TextProcessingConfig`` and a seeded ``Faker`` instance.
    """

    def setUp(self) -> None:
        fake = Faker("en-US")  # en-US to avoid Lorem Ipsum from en-GB
        fake.seed_instance(12345)  # fixed seed for the generated content

        self.fake = fake
        self.config = TextProcessingConfig()
60+
61+
62+
class DocumentToTextTests(ExtractTextTestCase):
63+
def setUp(self) -> None:
64+
super().setUp()
5565
self.empty_dir = mkdtemp()
5666

5767
self._replace_external_tools_with_fakes()
58-
self.config = TextProcessingConfig()
5968
self._create_mock_objects()
6069
self._register_faker_providers()
6170

@@ -69,8 +78,6 @@ def _create_mock_objects(self) -> None:
6978
)
7079

7180
def _register_faker_providers(self) -> None:
72-
self.fake = Faker("en-US") # To avoid Lorem Ipsum
73-
self.fake.seed_instance(12345)
7481
self.fake.add_provider(DocxFileProvider)
7582
self.fake.add_provider(EmlFileProvider)
7683
self.fake.add_provider(OdtFileProvider)
@@ -109,13 +116,13 @@ def test_raises_when_filename_empty(self) -> None:
109116

110117
def test_raises_when_filename_and_blob(self) -> None:
111118
with self.assertRaises(ValueError) as cm:
112-
document_to_text(filename="foo", blob="bar")
119+
document_to_text(filename="foo", blob=b"bar")
113120

114121
self.assertIn("specify either filename or blob", str(cm.exception))
115122

116123
def test_raises_when_blob_but_no_extension(self) -> None:
117124
with self.assertRaises(ValueError) as cm:
118-
document_to_text(blob="bar")
125+
document_to_text(blob=b"bar")
119126

120127
self.assertIn("need extension hint for blob", str(cm.exception))
121128

@@ -586,3 +593,125 @@ def test_unsupported_will_be_converted_with_strings(self) -> None:
586593
),
587594
]
588595
self.mock_popen.assert_has_calls(expected_calls)
596+
597+
598+
class ConvertMsgToTextTests(ExtractTextTestCase):
    """
    Tests for ``convert_msg_to_text``.

    There is no easy way to create test Outlook msg files and we don't want
    to store real ones, so the interface to extract-msg (``openMsg``) is
    mocked and the library itself is assumed to be working correctly.
    """

    OPENMSG_TARGET = "cardinal_pythonlib.extract_text.openMsg"

    def setUp(self) -> None:
        super().setUp()
        self.dummy_filename = "dummy_filename.msg"
        self.dummy_blob = b"dummy blob"

    @staticmethod
    def _fake_openmsg(
        body=None, html_body=None, attachments=None
    ) -> mock.Mock:
        # Build a stand-in for openMsg() that returns a mock message with the
        # given body, HTML body and attachments.
        fake_message = mock.Mock(
            body=body,
            htmlBody=html_body,
            attachments=[] if attachments is None else attachments,
        )
        return mock.Mock(return_value=fake_message)

    def test_raises_when_no_filename_or_blob(self) -> None:
        with self.assertRaises(ValueError) as cm:
            convert_msg_to_text()

        self.assertIn("no filename and no blob", str(cm.exception))

    def test_raises_when_filename_and_blob(self) -> None:
        with self.assertRaises(ValueError) as cm:
            convert_msg_to_text(filename="foo", blob=b"bar")

        self.assertIn("specify either filename or blob", str(cm.exception))

    def test_blob_passed_to_openmsg(self) -> None:
        text = self.fake.paragraph(nb_sentences=10)
        fake_openmsg = self._fake_openmsg(body=text)

        with mock.patch(self.OPENMSG_TARGET, fake_openmsg):
            convert_msg_to_text(blob=self.dummy_blob, config=self.config)

        fake_openmsg.assert_has_calls(
            [mock.call(self.dummy_blob, delayAttachments=False)]
        )

    def test_file_passed_to_openmsg(self) -> None:
        text = self.fake.paragraph(nb_sentences=10)
        fake_openmsg = self._fake_openmsg(body=text)

        with mock.patch(self.OPENMSG_TARGET, fake_openmsg):
            convert_msg_to_text(
                filename=self.dummy_filename, config=self.config
            )

        fake_openmsg.assert_has_calls(
            [mock.call(self.dummy_filename, delayAttachments=False)]
        )

    def test_text_body_converted(self) -> None:
        text = self.fake.paragraph(nb_sentences=10)
        fake_openmsg = self._fake_openmsg(body=text)

        with mock.patch(self.OPENMSG_TARGET, fake_openmsg):
            result = convert_msg_to_text(
                filename=self.dummy_filename, config=self.config
            )

        self.assertEqual(result, text)

    def test_html_body_converted(self) -> None:
        content = self.fake.paragraph(nb_sentences=10)
        html = f"""
<!DOCTYPE html>
<html>
<head>
</head>
<body>
{content}
</body>
</html>
"""
        fake_openmsg = self._fake_openmsg(html_body=html.encode("utf-8"))

        with mock.patch(self.OPENMSG_TARGET, fake_openmsg):
            result = convert_msg_to_text(
                filename=self.dummy_filename, config=self.config
            )

        self.assertEqual(result.strip(), content)

    def test_attachment_converted(self) -> None:
        self.fake.add_provider(DocxFileProvider)

        text = self.fake.paragraph(nb_sentences=10)
        docx = self.fake.docx_file(content=text, raw=True)
        docx_attachment = mock.Mock(
            # null termination seen in the real world
            # https://github.com/TeamMsgExtractor/msg-extractor/issues/464
            extension=".docx\x00",
            data=BytesIO(docx).read(),
        )
        fake_openmsg = self._fake_openmsg(attachments=[docx_attachment])

        self.config.width = 0
        with mock.patch(self.OPENMSG_TARGET, fake_openmsg):
            result = convert_msg_to_text(
                self.dummy_filename, config=self.config
            )

        self.assertEqual(result.strip(), text)

docs/source/changelog.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -902,3 +902,9 @@ Quick links:
902902
- Add support for ``.eml`` files with attachments processed by supported
903903
document converters (``.docx``, ``.pdf``, ``.odt`` etc.) to
904904
:func:`cardinal_pythonlib.extract_text.document_to_text`.
905+
906+
**2.1.1**
907+
908+
- Add support for Outlook ``.msg`` files with attachments processed by supported
909+
document converters (``.docx``, ``.pdf``, ``.odt`` etc.) to
910+
:func:`cardinal_pythonlib.extract_text.document_to_text`.

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
"beautifulsoup4", # "import bs4" or "from bs4 import ..."
5454
"chardet>=5.0.0",
5555
"colorlog",
56+
"extract_msg",
5657
"isodate>=0.5.4",
5758
"numba", # just-in-time compilation
5859
"numpy>=1.20.0,<2.0", # 1.20.0 required for numpy.typing

0 commit comments

Comments
 (0)