2727
2828from email import message_from_string , policy
2929from email .message import EmailMessage
30+ from io import BytesIO
3031import os
3132import subprocess
3233from tempfile import mkdtemp , NamedTemporaryFile
4445from faker_file .providers .xml_file import XmlFileProvider
4546
4647from cardinal_pythonlib .extract_text import (
48+ convert_msg_to_text ,
4749 document_to_text ,
4850 TextProcessingConfig ,
4951 update_external_tools ,
5052)
5153
5254
53- class DocumentToTextTests (TestCase ):
class ExtractTextTestCase(TestCase):
    """Shared fixture for the text-extraction test classes.

    Gives every test a fresh ``TextProcessingConfig`` (``self.config``) and a
    seeded ``Faker`` instance (``self.fake``).
    """

    def setUp(self) -> None:
        """Create the per-test config object and the seeded fake-data source."""
        # en-US to avoid Lorem Ipsum from en-GB
        fake = Faker("en-US")
        # Fixed seed so the generated text is the same on every run.
        fake.seed_instance(12345)

        self.config = TextProcessingConfig()
        self.fake = fake
60+
61+
62+ class DocumentToTextTests (ExtractTextTestCase ):
def setUp(self) -> None:
    """Prepare the fixtures used by the document_to_text() tests.

    The base-class ``setUp`` runs first, so ``self.config`` and
    ``self.fake`` already exist before the helpers below are called.
    """
    super().setUp()

    # Scratch directory created with mkdtemp().
    self.empty_dir = mkdtemp()

    # The helpers run in their original order; they are defined elsewhere
    # in this class and may depend on the attributes set above.
    self._replace_external_tools_with_fakes()
    self._create_mock_objects()
    self._register_faker_providers()
6170
@@ -69,8 +78,6 @@ def _create_mock_objects(self) -> None:
6978 )
7079
7180 def _register_faker_providers (self ) -> None :
72- self .fake = Faker ("en-US" ) # To avoid Lorem Ipsum
73- self .fake .seed_instance (12345 )
7481 self .fake .add_provider (DocxFileProvider )
7582 self .fake .add_provider (EmlFileProvider )
7683 self .fake .add_provider (OdtFileProvider )
@@ -109,13 +116,13 @@ def test_raises_when_filename_empty(self) -> None:
109116
def test_raises_when_filename_and_blob(self) -> None:
    """Passing both a filename and a blob must raise ValueError."""
    with self.assertRaises(ValueError) as raised:
        document_to_text(filename="foo", blob=b"bar")

    message = str(raised.exception)
    self.assertIn("specify either filename or blob", message)
115122
def test_raises_when_blob_but_no_extension(self) -> None:
    """A blob given without an extension hint must raise ValueError."""
    with self.assertRaises(ValueError) as raised:
        document_to_text(blob=b"bar")

    message = str(raised.exception)
    self.assertIn("need extension hint for blob", message)
121128
@@ -586,3 +593,125 @@ def test_unsupported_will_be_converted_with_strings(self) -> None:
586593 ),
587594 ]
588595 self .mock_popen .assert_has_calls (expected_calls )
596+
597+
class ConvertMsgToTextTests(ExtractTextTestCase):
    """Tests for convert_msg_to_text().

    There is no easy way to create test Outlook msg files and we don't want
    to store real ones, so we mock the interface to extract-msg and assume
    the library itself is working correctly.
    """

    def setUp(self) -> None:
        """Add the dummy filename and blob handed to convert_msg_to_text()."""
        super().setUp()
        self.dummy_filename = "dummy_filename.msg"
        self.dummy_blob = b"dummy blob"

    def _patch_openmsg(
        self, body=None, html_body=None, attachments=None
    ) -> mock.Mock:
        """Replace ``openMsg`` in extract_text with a mock for this test.

        The mock returns a fake message object exposing ``body``,
        ``htmlBody`` and ``attachments``. The patch is undone automatically
        when the test finishes.

        Args:
            body: value for the fake message's ``body`` attribute.
            html_body: value for the fake message's ``htmlBody`` attribute.
            attachments: list for the fake message's ``attachments``
                attribute; an empty list when omitted.

        Returns:
            The mock standing in for ``openMsg``, so that tests can inspect
            how it was called.
        """
        mock_msgfile = mock.Mock(
            body=body,
            htmlBody=html_body,
            attachments=[] if attachments is None else attachments,
        )
        mock_openmsg = mock.Mock(return_value=mock_msgfile)

        patcher = mock.patch.multiple(
            "cardinal_pythonlib.extract_text",
            openMsg=mock_openmsg,
        )
        patcher.start()
        # Registered straight after start() so the patch is always removed,
        # even if the test body raises.
        self.addCleanup(patcher.stop)

        return mock_openmsg

    def test_raises_when_no_filename_or_blob(self) -> None:
        with self.assertRaises(ValueError) as cm:
            convert_msg_to_text()

        self.assertIn("no filename and no blob", str(cm.exception))

    def test_raises_when_filename_and_blob(self) -> None:
        with self.assertRaises(ValueError) as cm:
            convert_msg_to_text(filename="foo", blob=b"bar")

        self.assertIn("specify either filename or blob", str(cm.exception))

    def test_blob_passed_to_openmsg(self) -> None:
        content = self.fake.paragraph(nb_sentences=10)
        mock_openmsg = self._patch_openmsg(body=content)

        convert_msg_to_text(blob=self.dummy_blob, config=self.config)

        expected_calls = [mock.call(self.dummy_blob, delayAttachments=False)]
        mock_openmsg.assert_has_calls(expected_calls)

    def test_file_passed_to_openmsg(self) -> None:
        content = self.fake.paragraph(nb_sentences=10)
        mock_openmsg = self._patch_openmsg(body=content)

        convert_msg_to_text(filename=self.dummy_filename, config=self.config)

        expected_calls = [
            mock.call(self.dummy_filename, delayAttachments=False)
        ]
        mock_openmsg.assert_has_calls(expected_calls)

    def test_text_body_converted(self) -> None:
        content = self.fake.paragraph(nb_sentences=10)
        self._patch_openmsg(body=content)

        converted = convert_msg_to_text(
            filename=self.dummy_filename, config=self.config
        )

        self.assertEqual(converted, content)

    def test_html_body_converted(self) -> None:
        content = self.fake.paragraph(nb_sentences=10)
        html = f"""
<!DOCTYPE html>
<html>
<head>
</head>
<body>
{content}
</body>
</html>
"""
        # The HTML body is supplied as UTF-8 encoded bytes, not str.
        self._patch_openmsg(html_body=html.encode("utf-8"))

        converted = convert_msg_to_text(
            filename=self.dummy_filename, config=self.config
        )

        # Surrounding whitespace from the HTML layout is not significant.
        self.assertEqual(converted.strip(), content)

    def test_attachment_converted(self) -> None:
        self.fake.add_provider(DocxFileProvider)

        content = self.fake.paragraph(nb_sentences=10)
        docx = self.fake.docx_file(content=content, raw=True)
        mock_attachment = mock.Mock(
            # null termination seen in the real world
            # https://github.com/TeamMsgExtractor/msg-extractor/issues/464
            extension=".docx\x00",
            data=BytesIO(docx).read(),
        )
        self._patch_openmsg(attachments=[mock_attachment])

        # NOTE(review): presumably width 0 turns off word wrapping so the
        # extracted text can be compared with the generated paragraph
        # verbatim -- confirm against TextProcessingConfig.
        self.config.width = 0
        converted = convert_msg_to_text(
            filename=self.dummy_filename, config=self.config
        )

        self.assertEqual(converted.strip(), content)
0 commit comments