From 63e3b6e3987b7db8eb7809da7404efdc88b87404 Mon Sep 17 00:00:00 2001 From: Rudolf Cardinal Date: Tue, 18 Feb 2025 17:36:29 +0000 Subject: [PATCH] bugfix relating to pdfminer when pdftotext not available, and some related tidying --- cardinal_pythonlib/extract_text.py | 182 +++++---------------------- cardinal_pythonlib/version_string.py | 2 +- docs/source/changelog.rst | 5 + 3 files changed, 40 insertions(+), 149 deletions(-) diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py index 8cf4f78..e4cdb94 100755 --- a/cardinal_pythonlib/extract_text.py +++ b/cardinal_pythonlib/extract_text.py @@ -29,8 +29,14 @@ .. code-block:: bash - sudo apt-get install antiword - pip install docx pdfminer + sudo apt-get install antiword # required for DOC + sudo apt-get install pdftotext # optional, but best way for PDF + sudo apt-get install strings # strings/strings2 needed as generic fallback + sudo apt-get install strings2 # as above + sudo apt-get install unrtf # required for RTF + + pip install chardet # improves character type detection + pip install pdfminer.six # optional, backup optional for PDF - Author: Rudolf Cardinal (rudolf@pobox.com) - Created: Feb 2015 @@ -71,7 +77,7 @@ # ============================================================================= import argparse -from io import StringIO # Python 3 +from io import StringIO import io import logging import os @@ -88,7 +94,6 @@ Iterator, List, Optional, - Union, ) from xml.etree import ElementTree as ElementTree @@ -101,8 +106,6 @@ import prettytable from semantic_version import Version -# import texttable # ... can't deal with Unicode properly - from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler try: @@ -112,47 +115,14 @@ chardet = None UniversalDetector = None -try: - # noinspection PyPackageRequirements - import docx # pip install python-docx (NOT docx) - BUT python-docx requires lxml which has C dependencies # noqa: E501 - - # noinspection PyPackageRequirements - import docx.document - - # noinspection PyPackageRequirements - import docx.oxml.table - - # noinspection PyPackageRequirements - import docx.oxml.text.paragraph - - # noinspection PyPackageRequirements - import docx.table - - # noinspection PyPackageRequirements - import docx.text.paragraph - - DOCX_DOCUMENT_TYPE = "docx.document.Document" - DOCX_TABLE_TYPE = Union["docx.table.Table", "CustomDocxTable"] - DOCX_CONTAINER_TYPE = Union[DOCX_DOCUMENT_TYPE, "docx.table._Cell"] - DOCX_BLOCK_ITEM_TYPE = Union[ - "docx.text.paragraph.Paragraph", "docx.table.Table" - ] -except ImportError: - docx = None - DOCX_DOCUMENT_TYPE = None - DOCX_TABLE_TYPE = "CustomDocxTable" - DOCX_CONTAINER_TYPE = None - DOCX_BLOCK_ITEM_TYPE = None - -try: - import docx2txt # pip install docx2txt -except ImportError: - docx2txt = None - try: # noinspection PyPackageRequirements import pdfminer # pip install pdfminer + assert ( + int(pdfminer.__version__) > 20191010 + ), "pdfminer installed but too old" # version string is e.g. '20191125' + # noinspection PyPackageRequirements import pdfminer.pdfinterp @@ -167,20 +137,9 @@ except ImportError: pdfminer = None -try: - # noinspection PyPackageRequirements - import pyth # pip install pyth (PYTHON 2 ONLY; https://pypi.python.org/pypi/pyth/0.5.4) # noqa: E501 - - # noinspection PyPackageRequirements - import pyth.plugins.rtf15.reader - - # noinspection PyPackageRequirements - import pyth.plugins.plaintext.writer -except ImportError: - pyth = None - log = get_brace_style_log_with_null_handler(__name__) + # ============================================================================= # Constants # ============================================================================= @@ -192,6 +151,7 @@ SYS_ENCODING = sys.getdefaultencoding() ENCODING = "utf-8" + # ============================================================================= # External tool map # ============================================================================= @@ -542,6 +502,7 @@ def rstrip_all_lines(text: str) -> str: # PDF # ============================================================================= + # noinspection PyUnresolvedReferences,PyUnusedLocal def convert_pdf_to_txt( filename: str = None, @@ -561,11 +522,11 @@ def convert_pdf_to_txt( elif pdfminer: # Memory-hogging method with get_filelikeobject(filename, blob) as fp: rsrcmgr = pdfminer.pdfinterp.PDFResourceManager() - retstr = StringIO() + str_io = StringIO() codec = ENCODING laparams = pdfminer.layout.LAParams() device = pdfminer.converter.TextConverter( - rsrcmgr, retstr, codec=codec, laparams=laparams + rsrcmgr, str_io, codec=codec, laparams=laparams ) interpreter = pdfminer.pdfinterp.PDFPageInterpreter( rsrcmgr, device @@ -583,7 +544,7 @@ def convert_pdf_to_txt( check_extractable=True, ): interpreter.process_page(page) - text = retstr.getvalue().decode(ENCODING) + text = str_io.getvalue() return text else: raise AssertionError("No PDF-reading tool available") @@ -949,7 +910,7 @@ def wordwrap(text: str, width: int) -> str: def docx_process_table( - table: DOCX_TABLE_TYPE, config: TextProcessingConfig + table: CustomDocxTable, config: TextProcessingConfig ) -> str: """ Converts a DOCX table to text. @@ -1061,69 +1022,9 @@ def get_cell_text(cell_) -> str: # ----------------------------------------------------------------------------- -# With the docx library +# DOCX # ----------------------------------------------------------------------------- -_ = ''' -# noinspection PyProtectedMember,PyUnresolvedReferences -def docx_docx_iter_block_items(parent: DOCX_CONTAINER_TYPE) \ - -> Iterator[DOCX_BLOCK_ITEM_TYPE]: - """ - Iterate through items of a DOCX file. - - See https://github.com/python-openxml/python-docx/issues/40. - - Yield each paragraph and table child within ``parent``, in document order. - Each returned value is an instance of either :class:`Table` or - :class:`Paragraph`. ``parent`` would most commonly be a reference to a main - :class:`Document` object, but also works for a :class:`_Cell` object, which - itself can contain paragraphs and tables. - - NOTE: uses internals of the ``python-docx`` (``docx``) library; subject to - change; this version works with ``docx==0.8.5``. - """ - if isinstance(parent, docx.document.Document): - parent_elm = parent.element.body - elif isinstance(parent, docx.table._Cell): - parent_elm = parent._tc - else: - raise ValueError("something's not right") - - for child in parent_elm.iterchildren(): - if isinstance(child, docx.oxml.text.paragraph.CT_P): - yield docx.text.paragraph.Paragraph(child, parent) - elif isinstance(child, docx.oxml.table.CT_Tbl): - yield docx.table.Table(child, parent) - - -# noinspection PyUnresolvedReferences -def docx_docx_gen_text(doc: DOCX_DOCUMENT_TYPE, - config: TextProcessingConfig) -> Iterator[str]: - """ - Iterate through a DOCX file and yield text. - - Args: - doc: DOCX document to process - config: :class:`TextProcessingConfig` control object - - Yields: - pieces of text (paragraphs) - - """ - if in_order: - for thing in docx_docx_iter_block_items(doc): - if isinstance(thing, docx.text.paragraph.Paragraph): - yield docx_process_simple_text(thing.text, config.width) - elif isinstance(thing, docx.table.Table): - yield docx_process_table(thing, config) - else: - for paragraph in doc.paragraphs: - yield docx_process_simple_text(paragraph.text, config.width) - for table in doc.tables: - yield docx_process_table(table, config) -''' - - # noinspection PyUnusedLocal def convert_docx_to_text( filename: str = None, @@ -1211,25 +1112,12 @@ def convert_docx_to_text( text += docx_text_from_xml(xml, config) return text - # elif docx: - # with get_filelikeobject(filename, blob) as fp: - # # noinspection PyUnresolvedReferences - # document = docx.Document(fp) - # return '\n\n'.join( - # docx_docx_gen_text(document, config)) - # elif docx2txt: - # if filename: - # return docx2txt.process(filename) - # else: - # raise NotImplementedError("docx2txt BLOB handling not written") - # else: - # raise AssertionError("No DOCX-reading tool available") - # ============================================================================= # ODT # ============================================================================= + # noinspection PyUnusedLocal def convert_odt_to_text( filename: str = None, @@ -1259,6 +1147,7 @@ def convert_odt_to_text( # HTML # ============================================================================= + # noinspection PyUnusedLocal def convert_html_to_text( filename: str = None, @@ -1277,6 +1166,7 @@ def convert_html_to_text( # XML # ============================================================================= + # noinspection PyUnusedLocal def convert_xml_to_text( filename: str = None, @@ -1295,6 +1185,7 @@ def convert_xml_to_text( # RTF # ============================================================================= + # noinspection PyUnresolvedReferences,PyUnusedLocal def convert_rtf_to_text( filename: str = None, @@ -1314,13 +1205,6 @@ def convert_rtf_to_text( return get_cmd_output(*args) else: return get_cmd_output_from_stdin(blob, *args) - elif pyth: # Very memory-consuming: - # https://github.com/brendonh/pyth/blob/master/pyth/plugins/rtf15/reader.py # noqa: E501 - with get_filelikeobject(filename, blob) as fp: - doc = pyth.plugins.rtf15.reader.Rtf15Reader.read(fp) - return pyth.plugins.plaintext.writer.PlaintextWriter.write( - doc - ).getvalue() else: raise AssertionError("No RTF-reading tool available") @@ -1332,11 +1216,6 @@ def availability_rtf() -> bool: unrtf = tools["unrtf"] if unrtf: return True - elif pyth: - log.warning( - "RTF conversion: unrtf missing; " "using pyth (less efficient)" - ) - return True else: return False @@ -1345,6 +1224,7 @@ def availability_rtf() -> bool: # DOC # ============================================================================= + # noinspection PyUnusedLocal def convert_doc_to_text( filename: str = None, @@ -1378,6 +1258,7 @@ def availability_doc() -> bool: # Anything # ============================================================================= + # noinspection PyUnusedLocal def convert_anything_to_text( filename: str = None, @@ -1523,7 +1404,7 @@ def is_text_extractor_available(extension: str) -> bool: if info is None: return False availability = info[AVAILABILITY] - if type(availability) == bool: + if type(availability) is bool: return availability elif callable(availability): return availability() @@ -1551,7 +1432,6 @@ def main() -> None: """ Command-line processor. See ``--help`` for details. """ - logging.basicConfig(level=logging.DEBUG) parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter ) @@ -1582,7 +1462,13 @@ def main() -> None: default=DEFAULT_MIN_COL_WIDTH, help="Minimum column width for tables", ) + parser.add_argument( + "--verbose", + action="store_true", + help="Be verbose", + ) args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) if args.availability: for ext in args.availability: if ext.lower() == "none": diff --git a/cardinal_pythonlib/version_string.py b/cardinal_pythonlib/version_string.py index 280fa93..610f996 100644 --- a/cardinal_pythonlib/version_string.py +++ b/cardinal_pythonlib/version_string.py @@ -31,5 +31,5 @@ """ -VERSION_STRING = "2.0.1" +VERSION_STRING = "2.0.2" # Use semantic versioning: https://semver.org/ diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 4cd5b76..029d7f0 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -859,3 +859,8 @@ Quick links: :func:`cardinal_pythonlib.sqlalchemy.alembic_func.get_current_revision` where since SQLAlchemy 2.0, the database connection was persisting, resulting in a metadata lock. + +- Bugfix to :func:`cardinal_pythonlib.extract_text.convert_pdf_to_txt` where + ``pdftotext`` was unavailable. Also remove antique ``pyth`` support. And + shift from unmaintained ``pdfminer`` to maintained ``pdfminer.six``. Also + removed unused code around importing ``docx`` and ``docx2txt``.