diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 2cc89ef..b543127 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -25,6 +25,10 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Change apt mirror
         run: ${GITHUB_WORKSPACE}/.github/scripts/change_apt_mirror.sh
+      - name: Install prerequisites
+        run: |
+          set -euo pipefail
+          sudo apt-get -y install libcairo2-dev pkg-config python3-dev
       - name: Create virtualenv
         run: ${GITHUB_WORKSPACE}/.github/scripts/create_virtualenv.sh
       - name: Install test Python packages
diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
index c589402..b0b7371 100755
--- a/cardinal_pythonlib/extract_text.py
+++ b/cardinal_pythonlib/extract_text.py
@@ -1148,7 +1148,7 @@ def convert_html_to_text(
     # https://bugs.launchpad.net/beautifulsoup/+bug/2110492
     # beautifulsoup4==4.13.4 returns "b''" for an empty bytes array
     # So we just workaround this here:
-    if bytes is not None and len(blob) == 0:
+    if blob is not None and len(blob) == 0:
         return ""
 
     with get_filelikeobject(filename, blob) as fp:
diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
index 202555b..ce4bab9 100644
--- a/cardinal_pythonlib/tests/extract_text_tests.py
+++ b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -217,6 +217,39 @@ def test_htm_converted(self) -> None:
         )
         self.assertEqual(text.strip(), content)
 
+    def test_htm_file_converted(self) -> None:
+        """
+        An HTML document supplied by filename (rather than as a blob) is
+        converted to its plain text content.
+        """
+        import os  # imported locally so this test needs no module-level change
+
+        content = self.fake.paragraph(nb_sentences=10)
+
+        html = f"""
+<!DOCTYPE html>
+<html>
+<head>
+</head>
+<body>
+{content}
+</body>
+</html>
+"""
+        # delete=False so that the file can be closed and then reopened by
+        # name (reopening a still-open NamedTemporaryFile fails on Windows).
+        # The "with" block closes it; we remove it ourselves afterwards.
+        with NamedTemporaryFile(suffix=".htm", delete=False) as temp_file:
+            temp_file.write(html.encode("utf-8"))
+        try:
+            text = document_to_text(
+                filename=temp_file.name, config=self.config
+            )
+        finally:
+            os.remove(temp_file.name)
+
+        self.assertEqual(text.strip(), content)
+
     def test_empty_htm_converted(self) -> None:
         text = document_to_text(
             blob="".encode("utf-8"), extension="htm", config=self.config
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index ba425c9..cd93226 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -914,3 +914,6 @@ Quick links:
 - ``cardinalpythonlib_grep_in_openxml``: new facility to search XML node text
   (rather than raw file text), and this is now the default. Also, behind the
   scenes, exceptions in subprocesses are now reported.
+
+- Fix extraction of text from HTML files in
+  :func:`cardinal_pythonlib.extract_text.document_to_text`.