File tree Expand file tree Collapse file tree 2 files changed +39
-1
lines changed
Expand file tree Collapse file tree 2 files changed +39
-1
lines changed Original file line number Diff line number Diff line change @@ -1149,7 +1149,15 @@ def convert_html_to_text(
11491149
11501150 with get_filelikeobject (filename , blob ) as fp :
11511151 soup = bs4 .BeautifulSoup (fp , "html.parser" )
1152- return soup .get_text ()
1152+
1153+ # In the real world we can end up with UTF-16 characters embedded as
1154+ # numbered entities in Windows-1252 encoded HTML such as
1155+ # &#55357;&#56898; "Slightly smiling face". Replacing these here
1156+ # avoids "UnicodeEncodeError: 'utf-8' codec can't encode characters in
1157+ # position ... surrogates not allowed".
1158+ text = soup .get_text ().encode (errors = "replace" ).decode ()
1159+
1160+ return text
11531161
11541162
11551163# =============================================================================
Original file line number Diff line number Diff line change @@ -531,6 +531,36 @@ def test_eml_with_illegal_multibyte_sequence_replaced(self) -> None:
531531
532532 self .assertEqual (text .strip (), "??" )
533533
534+ def test_eml_invalid_surrogate_characters_replaced (self ) -> None :
535+ content = """From: bar@example.org
536+ Subject: Invalid surrogate characters
537+ To: foo@example.org
538+ Mime-Version: 1.0
539+ Content-Type: multipart/mixed;boundary="==="
540+
541+ --===
542+ Content-Type: text/html; charset="windows-1252"
543+ Content-Transfer-Encoding: quoted-printable
544+
545+ <html><head>
546+ <meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3DWindows-1=
547+ 252">
548+ </head>
549+ <body>
550+ &#55357;&#56898;
551+ </body>
552+ </html>
553+ --===--
554+ """
555+ message = message_from_string (content , policy = policy .default )
556+ blob = message .as_bytes ()
557+
558+ text = document_to_text (
559+ blob = blob , extension = ".eml" , config = self .config
560+ )
561+
562+ self .assertEqual (text .strip (), "??" )
563+
534564 def test_unsupported_converted (self ) -> None :
535565 with mock .patch .multiple (
536566 "cardinal_pythonlib.extract_text.subprocess" ,
You can’t perform that action at this time.
0 commit comments