From 9bdd8449920ee2d01bfc5aa00bb7d230f3480cd5 Mon Sep 17 00:00:00 2001 From: vladd-bit Date: Wed, 24 Dec 2025 10:51:50 +0000 Subject: [PATCH 1/2] Made PDF conversion use mp properly. --- ocr_service/processor/converter.py | 62 +++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/ocr_service/processor/converter.py b/ocr_service/processor/converter.py index ac4df2f..79d2e0f 100644 --- a/ocr_service/processor/converter.py +++ b/ocr_service/processor/converter.py @@ -1,11 +1,12 @@ from __future__ import annotations +import atexit +import multiprocessing import os import time import traceback import uuid from io import BytesIO -from multiprocessing.dummy import Pool from subprocess import PIPE, Popen from threading import Timer from typing import Any @@ -66,36 +67,61 @@ def _extract_text_fallback(self, stream: bytes, *, is_html: bool, is_xml: bool, return text.strip() + @staticmethod + def initialize_pdf_worker(stream) -> None: + # we are making this a global so that we can use it in the process pool + # since Pypdfium2 PdfDocument objects are not thread-safe + global CURRENT_PDF_FILE + CURRENT_PDF_FILE = pdfium.PdfDocument(stream) + + def _close_pdf(): + global CURRENT_PDF_FILE + if CURRENT_PDF_FILE is not None: + CURRENT_PDF_FILE.close() + + atexit.register(_close_pdf) + + @staticmethod + def render_page(page_num) -> Image.Image: + scale = int(settings.OCR_SERVICE_IMAGE_DPI / 72) + page = CURRENT_PDF_FILE.get_page(page_num) + img = page.render( + scale=scale, + may_draw_forms=False, + no_smoothtext=True, + no_smoothimage=True, + no_smoothpath=True, + rotation=0, + crop=(0, 0, 0, 0), + grayscale=settings.OCR_CONVERT_GRAYSCALE_IMAGES, + ).to_pil() + page.close() + + return img + def _pdf_to_img(self, stream: bytes) -> tuple[list[Image.Image], dict]: pdf_image_pages = [] doc_metadata: dict[str, Any] = {} pdf = pdfium.PdfDocument(stream) + page_count = len(pdf) + pdf.close() + + doc_metadata["pages"] = page_count 
pdf_conversion_start_time = time.time() - scale = int(settings.OCR_SERVICE_IMAGE_DPI / 72) - def render_page(index: int) -> Image.Image: - page = pdf[index] - return page.render( - scale=scale, - may_draw_forms=False, - no_smoothtext=True, - no_smoothimage=True, - no_smoothpath=True, - rotation=0, - crop=(0, 0, 0, 0), - grayscale=settings.OCR_CONVERT_GRAYSCALE_IMAGES - ).to_pil() - - with Pool(settings.CONVERTER_THREAD_NUM) as pool: - pdf_image_pages = pool.map(render_page, range(len(pdf))) + ctx = multiprocessing.get_context("spawn") + + with ctx.Pool(processes=max(1, min(settings.CONVERTER_THREAD_NUM, page_count)), + initializer=DocumentConverter.initialize_pdf_worker, + initargs=(stream,)) as pool: + pdf_image_pages = list(pool.imap(DocumentConverter.render_page, range(page_count), chunksize=1)) pdf_conversion_end_time = time.time() self.log.info("PDF conversion to image(s) finished | Elapsed : " + str(pdf_conversion_end_time - pdf_conversion_start_time) + " seconds") - return pdf_image_pages, doc_metadata def _pdf_to_text(self, stream: bytes) -> tuple[str, dict]: From 80c1dd76d19abd5139201721423cdd9a17dd1040 Mon Sep 17 00:00:00 2001 From: vladd-bit Date: Wed, 24 Dec 2025 11:19:07 +0000 Subject: [PATCH 2/2] Requirements cleanup. 
--- requirements-dev.txt | 2 +- requirements.txt | 17 +++++------------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 5547e70..0a375dd 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -ruff==0.12.12 +ruff==0.14.10 mypy==1.17.0 mypy-extensions==1.1.0 types-aiofiles==24.1.0.20250708 diff --git a/requirements.txt b/requirements.txt index bf191ab..cca693a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,14 @@ setuptools==80.9.0 wheel==0.45.0 -pkgconfig==1.5.0 -cython==3.1.2 -virtualenv==20.31.2 psutil==6.1.1 filetype==1.2.0 -Pillow==11.3.0 +Pillow==12.0.0 html2image==2.0.7 -MarkupSafe==3.0.2 -python-multipart==0.0.20 -tesserocr==2.9.1 +tesserocr==2.9.2 gunicorn==23.0.0 pypdfium2==5.2.0 -uharfbuzz==0.50.2 -pyxml2pdf==0.3.4 -matplotlib==3.10.3 opencv-python-headless==4.12.0.88 -atomicwrites==1.4.1 +pyxml2pdf==0.3.4 fastapi==0.116.1 orjson==3.11.2 a2wsgi==1.10.10 @@ -24,7 +16,8 @@ pydantic==2.12.5 pydantic-settings==2.12.0 httpx==0.28.1 beautifulsoup4==4.12.3 -striprtf==0.0.26 +striprtf==0.0.29 +python-multipart==0.0.21 # Pillow package dependencies defusedxml==0.7.1