Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,14 @@ output_dir/
| `output` | `PathBuf` | Path to the output directory |
| `dpi` | `u32` | Rendering DPI passed to `pdftoppm` and the OCR bridge |

Environment overrides for the OCR bridge path are respected at runtime (see `src/ocr/bridge.rs`).
Environment overrides for the OCR bridge runtime:

| Variable | Default | Description |
|---|---|---|
| `DOCSTRUCT_BRIDGE` | `ocr/bridge/ocr_bridge.py` | Override OCR bridge script path |
| `DOCSTRUCT_PYTHON` | `python3` | Python executable used to run the OCR bridge |
| `DOCSTRUCT_OCR_WORKERS` | `min(cpu_count, 8)` | Number of parallel OCR workers for per-block Tesseract calls |
| `DOCSTRUCT_OCR_USE_CUDA` | `0` | If `1`, attempts OpenCV CUDA acceleration for grayscale/threshold preprocessing |

---

Expand Down
98 changes: 88 additions & 10 deletions ocr/bridge/ocr_bridge.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#!/usr/bin/env python3
import argparse
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
import json
import os
import re
import sys
import unicodedata
Expand All @@ -17,6 +19,54 @@
_latex_model = None


def _resolve_ocr_workers() -> int:
    """Return the number of OCR workers to use.

    ``DOCSTRUCT_OCR_WORKERS`` takes precedence when it parses as an integer;
    values below 1 are raised to 1. When the variable is unset, empty, or not
    an integer, the count falls back to ``os.cpu_count()`` capped at 8 (and
    never less than 1).
    """
    override = os.environ.get("DOCSTRUCT_OCR_WORKERS", "")
    try:
        requested = int(override) if override else None
    except ValueError:
        # An unparsable override is ignored in favour of the CPU-based default.
        requested = None

    if requested is not None:
        return max(1, requested)

    # os.cpu_count() may return None; treat that as a single CPU.
    detected = os.cpu_count() or 1
    return max(1, min(detected, 8))


def _cuda_enabled() -> bool:
    """Report whether CUDA preprocessing was requested via the environment.

    True only when ``DOCSTRUCT_OCR_USE_CUDA`` is exactly ``"1"``; an unset
    variable or any other value yields False.
    """
    flag = os.environ.get("DOCSTRUCT_OCR_USE_CUDA", "0")
    return flag == "1"


def _to_grayscale(img: np.ndarray) -> np.ndarray:
    """Return a single-channel version of *img*.

    A two-dimensional array is returned untouched (the same object). Any
    other input is converted with ``cv2.COLOR_BGR2GRAY``: on the GPU when
    ``_cuda_enabled()`` is true and this OpenCV build exposes ``cv2.cuda``,
    otherwise -- or when the GPU attempt raises -- on the CPU.
    """
    if img.ndim == 2:
        return img

    gpu_requested = _cuda_enabled() and hasattr(cv2, "cuda")
    if gpu_requested:
        try:
            device_img = cv2.cuda_GpuMat()
            device_img.upload(img)
            device_gray = cv2.cuda.cvtColor(device_img, cv2.COLOR_BGR2GRAY)
            return device_gray.download()
        except Exception:
            # Best-effort acceleration: any GPU failure falls through to
            # the CPU conversion below instead of aborting OCR.
            pass

    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)


def _otsu_threshold(gray: np.ndarray) -> np.ndarray:
    """Binarize *gray* using an Otsu-selected threshold.

    When ``_cuda_enabled()`` is true and this OpenCV build exposes
    ``cv2.cuda``, the threshold value is estimated on the CPU and the binary
    thresholding itself is applied on the GPU. If that path is not taken, or
    raises, the whole operation runs on the CPU with ``cv2.threshold``.
    """
    if _cuda_enabled() and hasattr(cv2, "cuda"):
        try:
            device_gray = cv2.cuda_GpuMat()
            device_gray.upload(gray)
            # cv2.cuda has no Otsu mode, so the threshold level is estimated
            # on the CPU and then applied as a fixed threshold on the GPU.
            # NOTE(review): this CPU call also returns the binarized image
            # (discarded here), so the GPU pass looks redundant -- confirm
            # whether this branch is worth keeping.
            level, _ = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )
            gpu_result = cv2.cuda.threshold(
                device_gray, level, 255, cv2.THRESH_BINARY
            )
            return gpu_result[1].download()
        except Exception:
            # Best-effort acceleration: fall back to the CPU path below.
            pass

    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary


def combine_hangul_jamos(text: str) -> str:
"""
Combine separated Hangul jamos into complete syllables.
Expand Down Expand Up @@ -277,7 +327,7 @@ def detect_blocks(image_path: Path, min_area: int = 2000, merge_kernel: tuple =
Reduced from (25,15) to (15,10) to avoid over-merging equations
"""
img = cv2.imread(str(image_path))
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
gray = _to_grayscale(img)
thresh = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 35, 15
)
Expand Down Expand Up @@ -521,21 +571,21 @@ def run_ocr(image_path: Path, lang: str = "eng") -> list[dict]:

results = []
latex_model = None
for block in blocks:

def process_block(block: dict) -> dict | None:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Avoid Python 3.10-only union syntax in bridge

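The new annotation -> dict | None in run_ocr uses PEP 604 union syntax, which only evaluates successfully on Python 3.10+. The syntax itself parses on older versions, but because the annotation sits on a function nested inside run_ocr, it is evaluated every time run_ocr is called, so on Python 3.9 runtimes that previously worked each OCR call now raises a TypeError (unsupported operand type(s) for |); this is a functional regression because OCR cannot run at all in those environments (README still documents Python 3.8+ support).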

Useful? React with 👍 / 👎.

x, y, w, h = block["x"], block["y"], block["w"], block["h"]
roi = img[y:y+h, x:x+w]

if roi.size == 0:
continue
return None

# Multi-pass OCR for higher recall/precision:
# - original ROI
# - grayscale ROI
# - Otsu-thresholded ROI
# with both PSM 6 (block) and PSM 11 (sparse text).
roi_gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY) if len(roi.shape) == 3 else roi
_, roi_bin = cv2.threshold(roi_gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
roi_gray = _to_grayscale(roi)
roi_bin = _otsu_threshold(roi_gray)
variants = [roi, roi_gray, roi_bin]
psm_modes = [6, 11]
best_text = ""
Expand All @@ -553,15 +603,43 @@ def run_ocr(image_path: Path, lang: str = "eng") -> list[dict]:
text = normalize_ocr_text(best_text)
block_type = classify_block_type(roi, text)

result = {
return {
"text": text,
"bbox": [float(x), float(y), float(x + w), float(y + h)],
"block_type": block_type,
"confidence": max(0.05, min(1.0, best_conf / 100.0)),
}


worker_count = _resolve_ocr_workers()
indexed_blocks = list(enumerate(blocks))
ordered_results: list[tuple[int, dict]] = []

if worker_count <= 1 or len(indexed_blocks) <= 1:
for idx, block in indexed_blocks:
result = process_block(block)
if result is not None:
ordered_results.append((idx, result))
else:
def process_indexed(pair: "tuple[int, dict]") -> "tuple[int, dict | None]":
    """Run ``process_block`` on one ``(index, block)`` pair, keeping the index.

    Args:
        pair: A ``(index, block)`` tuple as produced by ``enumerate(blocks)``.

    Returns:
        ``(index, result)`` where ``result`` is whatever ``process_block``
        returned for the block (a result dict or ``None``).
    """
    # The annotations above are quoted so they are never evaluated: an
    # unquoted ``dict | None`` raises TypeError when this nested ``def``
    # executes on Python versions older than 3.10.
    idx, block = pair
    return idx, process_block(block)

with ThreadPoolExecutor(max_workers=worker_count) as pool:
for idx, result in pool.map(process_indexed, indexed_blocks):
if result is not None:
ordered_results.append((idx, result))

ordered_results.sort(key=lambda item: item[0])

for _, result in ordered_results:
# For math blocks, try LaTeX OCR
if block_type == "math":
if result["block_type"] == "math":
x0, y0, x1, y1 = (int(v) for v in result["bbox"])
roi = img[y0:y1, x0:x1]
if roi.size == 0:
results.append(result)
continue

if latex_model is None:
latex_model = get_latex_model()

Expand All @@ -577,7 +655,7 @@ def run_ocr(image_path: Path, lang: str = "eng") -> list[dict]:
result["latex"] = ""
else:
result["latex"] = ""

results.append(result)

return results
Expand Down
Loading