Skip to content

Commit 51e9295

Browse files
Handle invalid surrogate characters in HTML conversion
1 parent dc92a17 commit 51e9295

File tree

2 files changed

+39
-1
lines changed

2 files changed

+39
-1
lines changed

cardinal_pythonlib/extract_text.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1149,7 +1149,15 @@ def convert_html_to_text(
11491149

11501150
with get_filelikeobject(filename, blob) as fp:
11511151
soup = bs4.BeautifulSoup(fp, "html.parser")
1152-
return soup.get_text()
1152+
1153+
# In the real world we can end up with UTF-16 characters embedded as
1154+
# numbered entities in Windows-1252 encoded HTML such as
1155+
# &#55357;&#56898; "Slightly smiling face". Replacing these here
1156+
# avoids "UnicodeEncodeError: 'utf-8' codec can't encode characters in
1157+
# position ... surrogates not allowed".
1158+
text = soup.get_text().encode(errors="replace").decode()
1159+
1160+
return text
11531161

11541162

11551163
# =============================================================================

cardinal_pythonlib/tests/extract_text_tests.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,36 @@ def test_eml_with_illegal_multibyte_sequence_replaced(self) -> None:
531531

532532
self.assertEqual(text.strip(), "??")
533533

534+
def test_eml_invalid_surrogate_characters_replaced(self) -> None:
535+
content = """From: bar@example.org
536+
Subject: Invalid surrogate characters
537+
To: foo@example.org
538+
Mime-Version: 1.0
539+
Content-Type: multipart/mixed;boundary="==="
540+
541+
--===
542+
Content-Type: text/html; charset="windows-1252"
543+
Content-Transfer-Encoding: quoted-printable
544+
545+
<html><head>
546+
<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3DWindows-1=
547+
252">
548+
</head>
549+
<body>
550+
&#55357;&#56898;
551+
</body>
552+
</html>
553+
--===--
554+
"""
555+
message = message_from_string(content, policy=policy.default)
556+
blob = message.as_bytes()
557+
558+
text = document_to_text(
559+
blob=blob, extension=".eml", config=self.config
560+
)
561+
562+
self.assertEqual(text.strip(), "??")
563+
534564
def test_unsupported_converted(self) -> None:
535565
with mock.patch.multiple(
536566
"cardinal_pythonlib.extract_text.subprocess",

0 commit comments

Comments
 (0)