From df510233a35508c54b5dec1e44d31c779c7494c4 Mon Sep 17 00:00:00 2001
From: zeetee
Date: Thu, 12 Mar 2026 21:21:18 +0900
Subject: [PATCH] Improve OCR throughput with parallel workers and optional CUDA preprocess

---
 README.md                |  9 +++-
 ocr/bridge/ocr_bridge.py | 98 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 96 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 2cab4b9..8eddd29 100644
--- a/README.md
+++ b/README.md
@@ -551,7 +551,14 @@
 | `output` | `PathBuf` | Path to the output directory |
 | `dpi` | `u32` | Rendering DPI passed to `pdftoppm` and the OCR bridge |
 
-Environment overrides for the OCR bridge path are respected at runtime (see `src/ocr/bridge.rs`).
+Environment overrides for the OCR bridge runtime:
+
+| Variable | Default | Description |
+|---|---|---|
+| `DOCSTRUCT_BRIDGE` | `ocr/bridge/ocr_bridge.py` | Override OCR bridge script path |
+| `DOCSTRUCT_PYTHON` | `python3` | Python executable used to run the OCR bridge |
+| `DOCSTRUCT_OCR_WORKERS` | `min(cpu_count, 8)` | Number of parallel OCR workers for per-block Tesseract calls |
+| `DOCSTRUCT_OCR_USE_CUDA` | `0` | If `1`, attempts OpenCV CUDA acceleration for grayscale/threshold preprocessing |
 
 ---
 
diff --git a/ocr/bridge/ocr_bridge.py b/ocr/bridge/ocr_bridge.py
index e23cf49..c38016b 100644
--- a/ocr/bridge/ocr_bridge.py
+++ b/ocr/bridge/ocr_bridge.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
 import argparse
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
 import json
+import os
 import re
 import sys
 import unicodedata
@@ -17,6 +19,54 @@
 _latex_model = None
 
 
+def _resolve_ocr_workers() -> int:
+    """Resolve OCR worker count from env (default: min(logical CPU count, 8))."""
+    raw = os.environ.get("DOCSTRUCT_OCR_WORKERS", "")
+    if raw:
+        try:
+            return max(1, int(raw))
+        except ValueError:
+            pass
+    cpu_count = os.cpu_count() or 1
+    return max(1, min(cpu_count, 8))
+
+
+def _cuda_enabled() -> bool:
+    return os.environ.get("DOCSTRUCT_OCR_USE_CUDA", "0") == "1"
+
+
+def _to_grayscale(img: np.ndarray) -> np.ndarray:
+    if len(img.shape) == 2:
+        return img
+
+    if _cuda_enabled() and hasattr(cv2, "cuda"):
+        try:
+            gpu = cv2.cuda_GpuMat()
+            gpu.upload(img)
+            gray_gpu = cv2.cuda.cvtColor(gpu, cv2.COLOR_BGR2GRAY)
+            return gray_gpu.download()
+        except Exception:
+            pass
+
+    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+
+def _otsu_threshold(gray: np.ndarray) -> np.ndarray:
+    if _cuda_enabled() and hasattr(cv2, "cuda"):
+        try:
+            gpu = cv2.cuda_GpuMat()
+            gpu.upload(gray)
+            # Otsu is not available in cv2.cuda; use a fixed threshold from Otsu estimate.
+            otsu_val, _ = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+            thresh_gpu = cv2.cuda.threshold(gpu, otsu_val, 255, cv2.THRESH_BINARY)[1]
+            return thresh_gpu.download()
+        except Exception:
+            pass
+
+    _, roi_bin = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    return roi_bin
+
+
 def combine_hangul_jamos(text: str) -> str:
     """
     Combine separated Hangul jamos into complete syllables.
@@ -277,7 +327,7 @@ def detect_blocks(image_path: Path, min_area: int = 2000, merge_kernel: tuple =
         Reduced from (25,15) to (15,10) to avoid over-merging equations
     """
     img = cv2.imread(str(image_path))
-    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    gray = _to_grayscale(img)
     thresh = cv2.adaptiveThreshold(
         gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 35, 15
     )
@@ -521,21 +571,21 @@ def run_ocr(image_path: Path, lang: str = "eng") -> list[dict]:
 
     results = []
     latex_model = None
-
-    for block in blocks:
+
+    def process_block(block: dict) -> dict | None:
         x, y, w, h = block["x"], block["y"], block["w"], block["h"]
         roi = img[y:y+h, x:x+w]
 
         if roi.size == 0:
-            continue
+            return None
 
         # Multi-pass OCR for higher recall/precision:
         # - original ROI
         # - grayscale ROI
         # - Otsu-thresholded ROI
         # with both PSM 6 (block) and PSM 11 (sparse text).
-        roi_gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) if len(roi.shape) == 3 else roi
-        _, roi_bin = cv2.threshold(roi_gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        roi_gray = _to_grayscale(roi)
+        roi_bin = _otsu_threshold(roi_gray)
         variants = [roi, roi_gray, roi_bin]
         psm_modes = [6, 11]
         best_text = ""
@@ -553,15 +603,43 @@ def run_ocr(image_path: Path, lang: str = "eng") -> list[dict]:
 
         text = normalize_ocr_text(best_text)
         block_type = classify_block_type(roi, text)
-        result = {
+        return {
             "text": text,
             "bbox": [float(x), float(y), float(x + w), float(y + h)],
             "block_type": block_type,
             "confidence": max(0.05, min(1.0, best_conf / 100.0)),
         }
-
+
+    worker_count = _resolve_ocr_workers()
+    indexed_blocks = list(enumerate(blocks))
+    ordered_results: list[tuple[int, dict]] = []
+
+    if worker_count <= 1 or len(indexed_blocks) <= 1:
+        for idx, block in indexed_blocks:
+            result = process_block(block)
+            if result is not None:
+                ordered_results.append((idx, result))
+    else:
+        def process_indexed(pair: tuple[int, dict]) -> tuple[int, dict | None]:
+            idx, block = pair
+            return idx, process_block(block)
+
+        with ThreadPoolExecutor(max_workers=worker_count) as pool:
+            for idx, result in pool.map(process_indexed, indexed_blocks):
+                if result is not None:
+                    ordered_results.append((idx, result))
+
+    ordered_results.sort(key=lambda item: item[0])
+
+    for _, result in ordered_results:
         # For math blocks, try LaTeX OCR
-        if block_type == "math":
+        if result["block_type"] == "math":
+            x0, y0, x1, y1 = (int(v) for v in result["bbox"])
+            roi = img[y0:y1, x0:x1]
+            if roi.size == 0:
+                results.append(result)
+                continue
+
             if latex_model is None:
                 latex_model = get_latex_model()
 
@@ -577,7 +655,7 @@ def run_ocr(image_path: Path, lang: str = "eng") -> list[dict]:
                 result["latex"] = ""
             else:
                 result["latex"] = ""
-
+
         results.append(result)
     return results
 