From 4f25d3fbd30734cff99dd1dbc73bb6ab9e554915 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Mon, 26 Jan 2026 17:07:43 +0000 Subject: [PATCH 1/3] Fix silly mistake introduced in bdc9983e1cb1e2cf07783284903cab1a789a2b9e --- cardinal_pythonlib/extract_text.py | 2 +- .../tests/extract_text_tests.py | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index c589402..b0b7371 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -1148,7 +1148,7 @@ def convert_html_to_text( # https://bugs.launchpad.net/beautifulsoup/+bug/2110492 # beautifulsoup4==4.13.4 returns "b''" for an empty bytes array # So we just workaround this here: - if bytes is not None and len(blob) == 0: + if blob is not None and len(blob) == 0: return "" with get_filelikeobject(filename, blob) as fp: diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py index 202555b..ce4bab9 100644 --- a/cardinal_pythonlib/tests/extract_text_tests.py +++ b/cardinal_pythonlib/tests/extract_text_tests.py @@ -217,6 +217,26 @@ def test_htm_converted(self) -> None: ) self.assertEqual(text.strip(), content) + def test_htm_file_converted(self) -> None: + content = self.fake.paragraph(nb_sentences=10) + + html = f""" + + + + + +{content} + + +""" + with NamedTemporaryFile(suffix=".htm", delete=False) as temp_file: + temp_file.write(html.encode("utf-8")) + temp_file.close() + text = document_to_text(filename=temp_file.name) + + self.assertEqual(text.strip(), content) + def test_empty_htm_converted(self) -> None: text = document_to_text( blob="".encode("utf-8"), extension="htm", config=self.config From d8f60cdf7e08496777b2059fbb737f612aa22f92 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Mon, 26 Jan 2026 17:13:15 +0000 Subject: [PATCH 2/3] Update changelog --- docs/source/changelog.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index ba425c9..cd93226 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -914,3 +914,6 @@ Quick links: - ``cardinalpythonlib_grep_in_openxml``: new facility to search XML node text (rather than raw file text), and this is now the default. Also, behind the scenes, exceptions in subprocesses are now reported. + +- Fix extraction of text from HTML files in + :func:`cardinal_pythonlib.extract_text.document_to_text`. From 2f98e5fbf973119b9c11078c627d0a2ebb0e77b4 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Mon, 26 Jan 2026 19:13:35 +0000 Subject: [PATCH 3/3] Install pycairo dependencies before running tests --- .github/workflows/run_tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 2cc89ef..b543127 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -25,6 +25,10 @@ jobs: python-version: ${{ matrix.python-version }} - name: Change apt mirror run: ${GITHUB_WORKSPACE}/.github/scripts/change_apt_mirror.sh + - name: Install prerequisites + run: | + set -euo pipefail + sudo apt-get -y install libcairo2-dev pkg-config python3-dev - name: Create virtualenv run: ${GITHUB_WORKSPACE}/.github/scripts/create_virtualenv.sh - name: Install test Python packages