From 5d5033ac1a535d307643f3f9cf0f3fcf1bd12745 Mon Sep 17 00:00:00 2001 From: chengke <404835780@qq.com> Date: Tue, 9 Jun 2026 17:19:53 +0800 Subject: [PATCH 1/4] feat: unify PDF document profile routing --- AGENTS.md | 26 +- .../bootstrap/aggregate_stats.py | 37 + .../services/document_agent/coordinator.py | 150 ++-- .../app/services/document_agent/manifest.py | 1 + .../document_agent/planner/planner.py | 9 +- .../document_agent/planner/prompts.py | 14 +- .../services/document_agent/profile_agent.py | 22 +- .../formats/atlas/classifier.py | 179 ----- .../document_parser/formats/atlas/parser.py | 6 +- .../document_parser/formats/pdf/parser.py | 28 +- .../formats/pdf/shard_splitter.py | 38 +- .../orchestration/parse_session.py | 68 +- .../profiling/doc_profile_model.py | 75 -- .../profiling/doc_profile_pdf.py | 671 ------------------ .../document_parser/profiling/doc_profiler.py | 100 ++- .../profiling/profile_model.py | 56 ++ .../document_parser/profiling/taxonomy.py | 24 + .../contract/test_parse_task_contract.py | 108 ++- 18 files changed, 425 insertions(+), 1187 deletions(-) delete mode 100644 apps/worker/app/services/document_parser/formats/atlas/classifier.py delete mode 100644 apps/worker/app/services/document_parser/profiling/doc_profile_model.py delete mode 100644 apps/worker/app/services/document_parser/profiling/doc_profile_pdf.py create mode 100644 apps/worker/app/services/document_parser/profiling/profile_model.py create mode 100644 apps/worker/app/services/document_parser/profiling/taxonomy.py diff --git a/AGENTS.md b/AGENTS.md index a625cdf5..6cceff82 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -124,8 +124,13 @@ flowchart TB This is the typed `ParseOutput` entry for all file types. The parser flow: -1. **Profiles** the document via `profiling.doc_profiler.profile_document()` to detect - file type, page count, and special categories (e.g. `atlas`). +1. **Profiles** the document via `profiling.doc_profiler.profile_document()`. PDF + profiling uses `document_agent` as the single PyMuPDF feature source, then runs + VLM coarse classification with two fields: open semantic `category` (for example, + `Financial Prospectus`) and routing-only `routing_category` + (`atlas/scanned/slides/generic`). Oversized non-atlas PDFs additionally run the + structural anatomy stage once at the entry point and pass the resulting shard + plan to PDF parsing. 2. **Routes** to the appropriate parser based on file extension. 3. **Post-processes**: cleans up unreferenced images, compresses PNG→JPG. 4. Returns typed parse output with task-local artifact paths. @@ -134,7 +139,7 @@ This is the typed `ParseOutput` entry for all file types. The parser flow: | Extension | Parser Module | Strategy | |:---|:---|:---| -| `.pdf` | `formats.pdf.parser.parse_pdfs` | MinerU API → Markdown parser → `structure.layout_parser.pred_titles` | +| `.pdf` | `formats.pdf.parser.parse_pdfs` | DOC_PROFILE category dispatch: `atlas` → atlas parser; oversized with entry anatomy → shard MinerU; otherwise MinerU API → Markdown parser → `structure.layout_parser.pred_titles` | | `.docx` | `formats.docx.parser.parse_docx` + `convert_doc2dics` | OXML iteration → heading detection → hierarchical tree | | `.doc` | `conversion.legacy_converter.doc_to_docx` → `.docx` pipeline | LibreOffice headless conversion first | | `.pptx` | `formats.pptx.parser.parse_pptx` | iLoveAPI PPTX→PDF → MinerU pipeline | @@ -187,12 +192,19 @@ Key logic in `parse_docx()`: - **Table handling**: `table2html()` converts python-docx Table to HTML with accurate `rowspan`/`colspan` via direct OXML inspection. -### PDF Parsing: MinerU Pipeline +### PDF Parsing: DOC_PROFILE + MinerU Pipeline ```mermaid flowchart LR - PDF[formats.pdf.parser] --> MinerU[MinerU Cloud API] - MinerU --> MDFile[Markdown + layout.json] + PDF[profiling.doc_profiler.profile_document] --> Probe[document_agent probe_page_features] + Probe --> Coarse[VLM coarse category] + Coarse -->|atlas| Atlas[formats.atlas.parser] + Coarse -->|oversized generic/scanned/slides| Anatomy[document_agent structural anatomy + shard_plan] + Coarse -->|standard| MinerU[MinerU Cloud API] + Anatomy --> Shards[Shard MinerU pipeline] + Shards --> MDFile[Markdown + layout.json] + Atlas --> Chunks[Atlas page chunks] + MinerU --> MDFile MDFile --> MDParser[formats.markdown.parser.parse_md] MDParser --> EvalHeadings[eval_md_headings + layout.json] EvalHeadings --> PredTitles[structure.layout_parser.pred_titles] @@ -207,7 +219,7 @@ flowchart LR | Heading hierarchy recognition | `HIERARCHY_LLM_MODEL` | Falls back to `NORMOL_MODEL` | | Image description (VLM) | `IMAGE_MODEL` | `qwen3.5-flash` | | Image OCR / Q&A | `IMAGE_MODEL_MAX` | `qwen3.5-flash` | -| Atlas classification | VLM via `formats.atlas.classifier` | `IMAGE_MODEL` | +| PDF coarse classification | `IMAGE_MODEL` | `qwen3.5-flash` | --- diff --git a/apps/worker/app/services/document_agent/bootstrap/aggregate_stats.py b/apps/worker/app/services/document_agent/bootstrap/aggregate_stats.py index 2177cbf6..f0739edf 100644 --- a/apps/worker/app/services/document_agent/bootstrap/aggregate_stats.py +++ b/apps/worker/app/services/document_agent/bootstrap/aggregate_stats.py @@ -54,6 +54,7 @@ def aggregate_doc_stats(ctx: ToolContext, _args: dict[str, Any]) -> ToolResult: start = time.monotonic() features = list(ctx.blackboard.page_features) stats: dict[str, Any] = {} + page_count = len(features) extrema_pages: list[int] = [] extrema_samples: list[dict[str, Any]] = [] for metric in PROFILE_METRICS: @@ -95,9 +96,43 @@ def aggregate_doc_stats(ctx: ToolContext, _args: dict[str, Any]) -> ToolResult: ) deduped_extrema = sorted(set(extrema_pages)) + landscape_pages = sum(1 for feature in features if feature.orientation == "landscape") + scan_like_pages = sum( + 1 + for feature in features + if feature.raw_text_length < 50 and feature.image_coverage >= 0.5 + ) + image_heavy_pages = sum( + 1 for feature in features if feature.image_coverage >= 0.35 + ) + table_signal_pages = sum( + 1 + for feature in features + if feature.table_count > 0 or feature.drawings_count >= 25 + ) + doc_shape = { + "page_count": page_count, + "landscape_pages": landscape_pages, + "landscape_ratio": round(landscape_pages / page_count, 4) + if page_count + else 0.0, + "scan_like_pages": scan_like_pages, + "scan_like_ratio": round(scan_like_pages / page_count, 4) + if page_count + else 0.0, + "image_heavy_pages": image_heavy_pages, + "image_heavy_ratio": round(image_heavy_pages / page_count, 4) + if page_count + else 0.0, + "table_signal_pages": table_signal_pages, + "table_signal_ratio": round(table_signal_pages / page_count, 4) + if page_count + else 0.0, + } ctx.blackboard.doc_stats = stats ctx.blackboard.extrema_pages = deduped_extrema ctx.blackboard.global_signals["doc_stats"] = stats + ctx.blackboard.global_signals["doc_shape"] = doc_shape ctx.blackboard.global_signals["extrema_pages"] = deduped_extrema ctx.blackboard.global_signals["extrema_samples"] = extrema_samples return ToolResult( @@ -106,10 +141,12 @@ def aggregate_doc_stats(ctx: ToolContext, _args: dict[str, Any]) -> ToolResult: "metric_count": len(PROFILE_METRICS), "extrema_pages": deduped_extrema, "extrema_samples": extrema_samples, + "doc_shape": doc_shape, }, latency_ms=int((time.monotonic() - start) * 1000), output_summary={ "doc_stats": stats, + "doc_shape": doc_shape, "extrema_pages": deduped_extrema, "extrema_samples": extrema_samples, }, diff --git a/apps/worker/app/services/document_agent/coordinator.py b/apps/worker/app/services/document_agent/coordinator.py index c32b05d4..374adbf3 100644 --- a/apps/worker/app/services/document_agent/coordinator.py +++ b/apps/worker/app/services/document_agent/coordinator.py @@ -14,7 +14,11 @@ ) from app.services.document_agent.budget import BudgetTracker from app.services.document_agent.executor import ReActExecutor -from app.services.document_agent.manifest import PageAnatomyMap, ToolContext +from app.services.document_agent.manifest import ( + DocumentProfile, + PageAnatomyMap, + ToolContext, +) from app.services.document_agent.persist import build_anatomy_map, persist_anatomy_map from app.services.document_agent.planner import ProfilePlanner from app.services.document_agent.registry import REGISTRY @@ -58,64 +62,100 @@ def __init__( def run(self) -> PageAnatomyMap: try: - self.state = DocumentAgentState.RUNNING - self._run_bootstrap() - self._run_toc_pipeline() - profile, initial_decision, planner_result = ProfilePlanner(self.ctx).propose() - self.blackboard.document_profile = profile - self.blackboard.global_signals["document_profile"] = profile.to_dict() - self.trace.record_step( - round_index=self.round_index, - actor="planner", - action_type="plan", - result=planner_result, - tool_name=None, - tool_args={}, - ) - self.round_index += 1 + return self._run_structural() + except Exception as exc: + self._record_failure(exc) + raise - executor_result = ReActExecutor( - self.ctx, - registry=REGISTRY, - max_rounds=int(self.ctx.settings.get("max_rounds", 30)), - initial_decision=initial_decision, - ).run() - if executor_result.verdict.status != "success": - raise RuntimeError( - f"profile aborted: {executor_result.verdict.rationale}" - ) - anatomy = build_anatomy_map(self.ctx) - persist_result = persist_anatomy_map(self.ctx, {}) - self.trace.record_step( - round_index=self.round_index, - actor="persist", - action_type="persist", - result=persist_result, - tool_name="persist.anatomy_map", - tool_args={}, - ) - self.state = DocumentAgentState.READY - self.trace.write_trace_artifact( - self.ctx.output_dir, - final_status="ready", - summary=anatomy.trace_summary | self.trace.summary(), - ) - self.trace.flush( - final_status="ready", - summary=anatomy.trace_summary | self.trace.summary(), - ) - return anatomy + def run_coarse(self) -> DocumentProfile: + try: + return self._run_coarse() except Exception as exc: - logger.error(f"[document_agent] profile failed: {exc}") - self.state = DocumentAgentState.FAILED - self.trace.write_trace_artifact( - self.ctx.output_dir, - final_status="failed", - summary={"error": str(exc), "budget": self.ctx.budget.snapshot()}, - ) - self.trace.flush(final_status="failed", summary={"error": str(exc)}) + self._record_failure(exc) + raise + + def run_structural(self) -> PageAnatomyMap: + try: + return self._run_structural() + except Exception as exc: + self._record_failure(exc) raise + def _run_coarse(self) -> DocumentProfile: + self.state = DocumentAgentState.RUNNING + if not self.blackboard.page_features: + self._run_bootstrap() + profile, _initial_decision, planner_result = ProfilePlanner(self.ctx).propose() + self.blackboard.document_profile = profile + self.blackboard.global_signals["document_profile"] = profile.to_dict() + self.trace.record_step( + round_index=self.round_index, + actor="planner:coarse", + action_type="plan", + result=planner_result, + tool_name=None, + tool_args={}, + ) + self.round_index += 1 + return profile + + def _run_structural(self) -> PageAnatomyMap: + self.state = DocumentAgentState.RUNNING + if not self.blackboard.page_features: + self._run_bootstrap() + self._run_toc_pipeline() + profile, initial_decision, planner_result = ProfilePlanner(self.ctx).propose() + self.blackboard.document_profile = profile + self.blackboard.global_signals["document_profile"] = profile.to_dict() + self.trace.record_step( + round_index=self.round_index, + actor="planner", + action_type="plan", + result=planner_result, + tool_name=None, + tool_args={}, + ) + self.round_index += 1 + executor_result = ReActExecutor( + self.ctx, + registry=REGISTRY, + max_rounds=int(self.ctx.settings.get("max_rounds", 30)), + initial_decision=initial_decision, + ).run() + if executor_result.verdict.status != "success": + raise RuntimeError(f"profile aborted: {executor_result.verdict.rationale}") + anatomy = build_anatomy_map(self.ctx) + persist_result = persist_anatomy_map(self.ctx, {}) + self.trace.record_step( + round_index=self.round_index, + actor="persist", + action_type="persist", + result=persist_result, + tool_name="persist.anatomy_map", + tool_args={}, + ) + self.state = DocumentAgentState.READY + self.trace.write_trace_artifact( + self.ctx.output_dir, + final_status="ready", + summary=anatomy.trace_summary | self.trace.summary(), + ) + self.trace.flush( + final_status="ready", + summary=anatomy.trace_summary | self.trace.summary(), + ) + return anatomy + + def _record_failure(self, exc: Exception) -> None: + logger.error(f"[document_agent] profile failed: {exc}") + self.state = DocumentAgentState.FAILED + self.trace.write_trace_artifact( + self.ctx.output_dir, + final_status="failed", + summary={"error": str(exc), "budget": self.ctx.budget.snapshot()}, + ) + self.trace.flush(final_status="failed", summary={"error": str(exc)}) + def _run_bootstrap(self) -> None: for tool_name, handler in ( ("probe.page_features", probe_page_features), diff --git a/apps/worker/app/services/document_agent/manifest.py b/apps/worker/app/services/document_agent/manifest.py index 3fd747ca..b89f6164 100644 --- a/apps/worker/app/services/document_agent/manifest.py +++ b/apps/worker/app/services/document_agent/manifest.py @@ -47,6 +47,7 @@ def to_dict(self) -> dict[str, Any]: class DocumentProfile: is_scanned: bool category: str + routing_category: str = "generic" category_rationale: str = "" language: str = "unknown" rationale: str = "" diff --git a/apps/worker/app/services/document_agent/planner/planner.py b/apps/worker/app/services/document_agent/planner/planner.py index bada42b0..7c796586 100644 --- a/apps/worker/app/services/document_agent/planner/planner.py +++ b/apps/worker/app/services/document_agent/planner/planner.py @@ -18,6 +18,7 @@ ) from app.services.document_agent.planner.prompts import PLANNER_INSTRUCTIONS from app.services.document_agent.visual import render_pages +from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory from shared.utils.token_estimate import estimate_tokens PAGE_KIND_DEFINITIONS = { @@ -119,6 +120,9 @@ def _parse_profile_and_decision(raw: str) -> tuple[DocumentProfile, ReflexionDec if not isinstance(data, dict): raise ValueError("planner output must be a JSON object") category = " ".join(str(data.get("category") or "unknown document").split()[:5]) + routing_category = PdfRoutingCategory.normalize( + data.get("routing_category") or data.get("category") + ).value raw_is_scanned = data.get("is_scanned") if isinstance(raw_is_scanned, bool): is_scanned = raw_is_scanned @@ -129,6 +133,7 @@ def _parse_profile_and_decision(raw: str) -> tuple[DocumentProfile, ReflexionDec profile = DocumentProfile( is_scanned=is_scanned, category=category or "unknown document", + routing_category=routing_category, category_rationale=str(data.get("category_rationale") or ""), language=str(data.get("language") or "unknown"), rationale=str(data.get("rationale") or ""), @@ -196,6 +201,7 @@ def propose(self) -> tuple[DocumentProfile, ReflexionDecision, ToolResult]: profile = DocumentProfile( is_scanned=False, category="unknown document", + routing_category=PdfRoutingCategory.GENERIC.value, rationale="No planner model configured.", ) decision = ReflexionDecision( @@ -227,6 +233,7 @@ def propose(self) -> tuple[DocumentProfile, ReflexionDecision, ToolResult]: {}, ), "page_kind_definitions": PAGE_KIND_DEFINITIONS, + "doc_shape": self.ctx.blackboard.global_signals.get("doc_shape", {}), "doc_stats": self.ctx.blackboard.doc_stats, "extrema_samples": self.ctx.blackboard.global_signals.get( "extrema_samples", @@ -314,5 +321,3 @@ def propose(self) -> tuple[DocumentProfile, ReflexionDecision, ToolResult]: except Exception: self.ctx.budget.refund("visual", est=prompt_tokens_est) raise - - diff --git a/apps/worker/app/services/document_agent/planner/prompts.py b/apps/worker/app/services/document_agent/planner/prompts.py index 49feab39..7bb1f316 100644 --- a/apps/worker/app/services/document_agent/planner/prompts.py +++ b/apps/worker/app/services/document_agent/planner/prompts.py @@ -2,11 +2,15 @@ PLANNER_INSTRUCTIONS = ( "You are a document profile agent. Use global page-feature statistics, " - "TOC/H1 evidence, and page screenshots to classify the document and decide " - "whether enough evidence exists to continue toward sharding. Return strict " - "JSON only with keys: is_scanned, category, category_rationale, language, " - "rationale, next_action, inspect_pages, grep_query. category must be at " - "most 5 English words. next_action must be one of inspect_more, grep_text, " + "optional TOC/H1 evidence, and page screenshots to classify the PDF. Return strict " + "JSON only with keys: is_scanned, category, routing_category, " + "category_rationale, language, rationale, next_action, inspect_pages, grep_query. " + "category is a concise semantic document type, at most 5 English words, such " + "as Financial Prospectus, Technical Manual, Corporate Policy, Research Report, " + "Engineering Atlas, or Scanned Handbook. routing_category must be one of " + "atlas, scanned, slides, generic. Set routing_category=atlas only for " + "engineering drawing collections, construction standard atlases, or page sets " + "whose primary unit is a drawing/detail sheet rather than prose. next_action must be one of inspect_more, grep_text, " "ready_to_shard, verdict_now. Use inspect_more only when specific extra " "page screenshots are needed. Use grep_text only for native PDFs when a " "global text search would clarify structure. Do not output a fixed step " diff --git a/apps/worker/app/services/document_agent/profile_agent.py b/apps/worker/app/services/document_agent/profile_agent.py index 6a73cf74..c56e18c1 100644 --- a/apps/worker/app/services/document_agent/profile_agent.py +++ b/apps/worker/app/services/document_agent/profile_agent.py @@ -6,7 +6,7 @@ from typing import Any from app.services.document_agent.coordinator import ProfileCoordinator -from app.services.document_agent.manifest import PageAnatomyMap +from app.services.document_agent.manifest import DocumentProfile, PageAnatomyMap class ProfileAgent: @@ -38,3 +38,23 @@ def run( settings=self._settings, ) return coordinator.run() + + def run_coarse( + self, + file_path: str, + job_id: str, + *, + output_dir: str | None = None, + db: Any | None = None, + ) -> DocumentProfile: + if not os.path.exists(file_path): + raise FileNotFoundError(file_path) + coordinator = ProfileCoordinator( + pdf_path=file_path, + job_id=job_id, + output_dir=output_dir, + db=db, + model=self._model, + settings=self._settings, + ) + return coordinator.run_coarse() diff --git a/apps/worker/app/services/document_parser/formats/atlas/classifier.py b/apps/worker/app/services/document_parser/formats/atlas/classifier.py deleted file mode 100644 index 87c43e86..00000000 --- a/apps/worker/app/services/document_parser/formats/atlas/classifier.py +++ /dev/null @@ -1,179 +0,0 @@ -""" -VLM-based Atlas Classifier - -Second-pass visual confirmation for atlas_candidate documents. -Renders the first 3 pages of a PDF as PNG images, then asks the vision -model to decide whether the document is an engineering atlas. - -Architecture: - - Page rendering via PyMuPDF runs in a spawned child process (consistent - with the rest of the parsing pipeline). - - VLM call is made in the *main process* after the child exits cleanly. - - Fails gracefully: any error returns False (treat as non-atlas). -""" - -import base64 -import os -import tempfile -from typing import Optional - -from app.services.document_parser.formats.pdf.pymupdf_subprocess import run_in_child_process, worker -from loguru import logger -from openai.types.chat import ( - ChatCompletionContentPartImageParam, - ChatCompletionContentPartParam, - ChatCompletionContentPartTextParam, - ChatCompletionMessageParam, -) - -# ── Prompt ────────────────────────────────────────────────────────────────── -_ATLAS_JUDGE_PROMPT = """You are a document classification expert. Please observe the following PDF page screenshots and determine whether the document is an engineering atlas (drawing collection). - -[Typical Characteristics of an Engineering Atlas] -- Page content is primarily technical drawings (e.g., architectural floor plans, structural details, pipeline installation diagrams, equipment layout plans). -- Usually contains a title block / info bar (including drawing name, drawing number, design institute, scale, date, etc.). -- Pages consist mainly of graphics, lines, annotations, and dimensions, with very little pure text. -- Page orientation is typically landscape (mostly A3 landscape). -- Common types: National standard design atlases (e.g., 09 series, 22 series), construction drawings, installation detail drawings. - -[Judgment Criteria] -- If this IS an engineering atlas, reply ONLY with: yes -- If this IS NOT an engineering atlas (e.g., normal report, academic paper, presentation slides), reply ONLY with: no - -You must reply ONLY with "yes" or "no", do not say anything else.""" - -# ── Child-process renderer ─────────────────────────────────────────────────── - - -@worker -def _render_pages_worker( - queue, pdf_path: str, page_indices: list, dpi: int, out_dir: str -) -> None: - """Child process: render given PDF pages to PNG files in out_dir.""" - import pymupdf - - mat = pymupdf.Matrix(dpi / 72, dpi / 72) - rendered: list[str] = [] - try: - doc = pymupdf.open(pdf_path) - for idx in page_indices: - if idx >= doc.page_count: - break - page = doc[idx] - pix = page.get_pixmap(matrix=mat, alpha=False) - out_path = os.path.join(out_dir, f"atlas_preview_p{idx + 1}.png") - pix.save(out_path) - rendered.append(out_path) - pix = None - page = None - doc.close() - except Exception as exc: - queue.put({"ok": False, "error": str(exc), "rendered": []}) - return - queue.put({"ok": True, "rendered": rendered}) - - -def _render_preview_pages( - pdf_path: str, - page_indices: list[int], - out_dir: str, - dpi: int = 120, -) -> list[str]: - """Render pages to PNG files. Returns list of file paths.""" - result = run_in_child_process( - _render_pages_worker, pdf_path, page_indices, dpi, out_dir, timeout=30 - ) - if not result.get("ok"): - raise RuntimeError(f"Page render failed: {result.get('error')}") - return result["rendered"] - - -def _png_to_data_url(path: str) -> Optional[str]: - """Base64-encode a PNG file as a data URL.""" - try: - with open(path, "rb") as f: - data = base64.b64encode(f.read()).decode("utf-8") - return f"data:image/png;base64,{data}" - except Exception as exc: - logger.warning(f"[atlas_classifier] Failed to encode {path}: {exc}") - return None - - -def _call_vlm(image_data_urls: list[str]) -> bool: - """Call VLM with preview images. Returns True if atlas, False otherwise.""" - from shared.core.config import settings - from shared.services.ai.openai_compatible_client_sync import get_openai_client - - model = settings.IMAGE_MODEL or "qwen-vl-plus" - client = get_openai_client(model=model) - - content: list[ChatCompletionContentPartParam] = [ - ChatCompletionContentPartTextParam( - type="text", - text=_ATLAS_JUDGE_PROMPT, - ) - ] - for url in image_data_urls: - content.append( - ChatCompletionContentPartImageParam( - type="image_url", - image_url={"url": url}, - ) - ) - - messages: list[ChatCompletionMessageParam] = [ - {"role": "user", "content": content} - ] - resp: str = client.chat_completion( - messages=messages, - model=model, - temperature=0.0, - max_tokens=8, - ) - answer = resp.strip().lower().strip(".") - logger.info(f"[atlas_classifier] VLM answer: {repr(resp)}") - return answer.startswith("yes") or answer == "1" or answer == "true" - - -# ── Public API ─────────────────────────────────────────────────────────────── - - -def classify_atlas_with_vlm(pdf_path: str, n_pages: int = 3) -> bool: - """ - Render the first `n_pages` pages of `pdf_path` and ask the VLM whether - the document is an engineering atlas. - - Returns: - True → confirmed atlas - False → not an atlas (or error — fail-safe default) - """ - page_indices = list(range(n_pages)) - with tempfile.TemporaryDirectory(prefix="atlas_clf_") as tmp_dir: - try: - png_paths = _render_preview_pages(pdf_path, page_indices, tmp_dir) - if not png_paths: - logger.warning( - "[atlas_classifier] No pages rendered, defaulting to non-atlas" - ) - return False - - data_urls = [u for p in png_paths if (u := _png_to_data_url(p)) is not None] - if not data_urls: - logger.warning( - "[atlas_classifier] No images encoded, defaulting to non-atlas" - ) - return False - - logger.info( - f"[atlas_classifier] Sending {len(data_urls)} page(s) to VLM for atlas check" - ) - is_atlas = _call_vlm(data_urls) - logger.info(f"[atlas_classifier] VLM result: is_atlas={is_atlas}") - return is_atlas - - except Exception as exc: - logger.warning( - f"[atlas_classifier] VLM atlas check failed for {pdf_path!r}, " - f"defaulting to non-atlas. Error: {exc}" - ) - return False diff --git a/apps/worker/app/services/document_parser/formats/atlas/parser.py b/apps/worker/app/services/document_parser/formats/atlas/parser.py index ddac6a6d..efca015e 100644 --- a/apps/worker/app/services/document_parser/formats/atlas/parser.py +++ b/apps/worker/app/services/document_parser/formats/atlas/parser.py @@ -2,7 +2,7 @@ """ Atlas-specific parsing pipeline. -For documents detected as atlas (doc_category="atlas") — e.g. engineering drawing +For documents detected as atlas (category="atlas") — e.g. engineering drawing collections — this module BYPASSES MinerU entirely and uses PyMuPDF directly to: 1. Extract text from each page (for naming and content) 2. Render each page as a single complete image (preserving full-page layout) @@ -282,7 +282,7 @@ def parse_atlas( output_dir: output directory for images, full.md, etc. base_llm_paras: LLM parameters dict relative_root: path prefix for chunk path field - profile: DocProfile with scan_type info + profile: parser profile with is_scanned info Returns: pd.DataFrame with ALL_DF_COLS columns @@ -305,7 +305,7 @@ def parse_atlas( # ── Determine if VLM is needed ── use_vlm = True - is_scanned = profile and profile.scan_type == "scanned" + is_scanned = bool(profile and profile.is_scanned) scan_label = "scanned" if is_scanned else "non-scanned" logger.info(f"📐 Atlas: {scan_label} document, VLM enabled for info extraction") diff --git a/apps/worker/app/services/document_parser/formats/pdf/parser.py b/apps/worker/app/services/document_parser/formats/pdf/parser.py index 217c89f6..08cfab8d 100755 --- a/apps/worker/app/services/document_parser/formats/pdf/parser.py +++ b/apps/worker/app/services/document_parser/formats/pdf/parser.py @@ -8,6 +8,7 @@ build_oversized_pdf_processing_failed_exception, ) from app.services.document_parser.providers.mineru.pdf_service import parse_via_full +from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory from app.services.document_parser.support.stage_profiler import stage_timer from loguru import logger @@ -25,11 +26,10 @@ def parse_pdfs( s3_key=None, job_id=None, ): - route = profile.route if profile else "standard" base_llm_paras.update({"doc_name": filename}) # ── Atlas routing: bypass MinerU entirely ── - if profile and profile.doc_category == "atlas": + if profile and profile.routing_category is PdfRoutingCategory.ATLAS: logger.info(f"📐 Atlas detected, bypassing MinerU for {filename}") from app.services.document_parser.formats.atlas.parser import parse_atlas @@ -61,7 +61,7 @@ def parse_pdfs( ) from exc # ── Standard single-pass MinerU ── - logger.info(f"📄 Standard MinerU parse for {filename} [route={route}]") + logger.info(f"📄 Standard MinerU parse for {filename}") with stage_timer("pdf.extract.standard", filename=filename): parse_via_full(pdf_path, filename, output_dir, s3_key=s3_key) @@ -100,23 +100,21 @@ def _parse_oversized_pdf( merge_html_tables, ) from app.services.document_parser.formats.pdf.shard_merger import merge_images, merge_shard_lines - from app.services.document_parser.formats.pdf.shard_splitter import ( - bin_pack_shards, - run_doc_agent, - split_pdf, - ) + from app.services.document_parser.formats.pdf.shard_splitter import bin_pack_shards, split_pdf - doc_agent_job_id = job_id or base_llm_paras.get("doc_name", filename) work_dir: str | None = None temp_shard_s3_keys: list[str] = [] try: - # 1. Run doc_agent to get full anatomy map (shard plan + TOC info) - with stage_timer("pdf.doc_agent", filename=filename): - anatomy = run_doc_agent( - pdf_path, - job_id=doc_agent_job_id, - output_dir=output_dir, + # 1. Reuse the entry DOC_PROFILE anatomy map (shard plan + TOC info). + anatomy = getattr(profile, "anatomy", None) + if anatomy is None: + raise RuntimeError( + f"Oversized PDF profile for {filename} is missing structural anatomy" + ) + if not anatomy.shard_plan.enabled or not anatomy.shard_plan.shards: + raise RuntimeError( + f"Oversized PDF profile for {filename} did not produce a shard plan" ) agent_shards = anatomy.shard_plan.shards diff --git a/apps/worker/app/services/document_parser/formats/pdf/shard_splitter.py b/apps/worker/app/services/document_parser/formats/pdf/shard_splitter.py index 99413e75..6fe1308b 100644 --- a/apps/worker/app/services/document_parser/formats/pdf/shard_splitter.py +++ b/apps/worker/app/services/document_parser/formats/pdf/shard_splitter.py @@ -1,4 +1,4 @@ -"""PDF shard splitting: doc_agent integration + bin-packing + physical split.""" +"""PDF shard splitting: bin-packing + physical split.""" from __future__ import annotations @@ -10,7 +10,7 @@ from loguru import logger if TYPE_CHECKING: - from app.services.document_agent.manifest import PageAnatomyMap, Shard + from app.services.document_agent.manifest import Shard @dataclass @@ -31,40 +31,6 @@ def page_offset(self) -> int: return self.page_start - 1 -def run_doc_agent( - pdf_path: str, job_id: str, output_dir: str -) -> "PageAnatomyMap": - """Run doc_agent ProfileCoordinator and return the full anatomy map. - - Returns the complete PageAnatomyMap so callers can access TOC info - (toc_result.toc_pages, toc_hierarchies) in addition to the shard plan. - - Raises RuntimeError if the agent fails or produces no shards. - """ - from app.services.document_agent.coordinator import ProfileCoordinator - - agent_output_dir = os.path.join(output_dir, "_doc_agent") - os.makedirs(agent_output_dir, exist_ok=True) - - coordinator = ProfileCoordinator( - pdf_path=pdf_path, - job_id=job_id, - output_dir=agent_output_dir, - ) - anatomy = coordinator.run() - - if not anatomy.shard_plan.enabled or not anatomy.shard_plan.shards: - raise RuntimeError( - f"Doc agent did not produce a valid shard plan for {job_id}" - ) - - shards = anatomy.shard_plan.shards - logger.info( - f"📋 Doc agent: {len(shards)} shards via {anatomy.shard_plan.reason}" - ) - return anatomy - - def bin_pack_shards( agent_shards: list["Shard"], max_pages: int, diff --git a/apps/worker/app/services/document_parser/orchestration/parse_session.py b/apps/worker/app/services/document_parser/orchestration/parse_session.py index beec3c35..a5cf33d3 100644 --- a/apps/worker/app/services/document_parser/orchestration/parse_session.py +++ b/apps/worker/app/services/document_parser/orchestration/parse_session.py @@ -4,14 +4,10 @@ from dataclasses import dataclass from typing import Any -from app.services.document_parser.formats.atlas.classifier import classify_atlas_with_vlm from app.services.document_parser.orchestration.path_segment import ( build_parser_path_segment, ) from app.services.document_parser.orchestration.parse_input import ParseInput -from app.services.document_parser.orchestration.oversized_pdf_policy import ( - raise_if_oversized_pdf_not_supported, -) from app.services.document_parser.profiling.doc_profiler import profile_document from app.services.document_parser.support.stage_profiler import stage_timer from loguru import logger @@ -91,49 +87,11 @@ def build_parse_session(parse_input: ParseInput) -> ParseSession: profile = profile_document( parse_input.file_full_path, parse_input.internal_output_filename, - ) - logger.info(f"📋 DocProfile: {profile.summary()}") - logger.debug(f"📋 Reasoning: {profile.reasoning}") - - if profile.atlas_candidate and profile.doc_category not in ("atlas", "ppt_converted"): - logger.info( - f"🔍 Atlas candidate detected, running VLM visual check for {parse_input.filename}" - ) - with stage_timer("document.atlas_vlm_check", filename=parse_input.filename): - vlm_is_atlas = classify_atlas_with_vlm(parse_input.file_full_path) - if vlm_is_atlas: - profile.doc_category = "atlas" - profile.reasoning += " | vlm_confirmed_atlas=True" - logger.info(f"✅ VLM confirmed atlas for {parse_input.filename}") - else: - profile.reasoning += " | vlm_confirmed_atlas=False" - logger.info( - f"ℹ️ VLM rejected atlas for {parse_input.filename}, routing as generic" - ) - - if profile.file_type == "pdf" and profile.page_count > settings.MAX_PDF_PAGE_LIMIT: - raise_if_oversized_pdf_not_supported(page_count=profile.page_count) - - if profile.doc_category == "atlas": - filename, internal_output_filename, relative_root, full_output_dir = ( - _rename_atlas_output( - filename=parse_input.filename, - internal_output_filename=parse_input.internal_output_filename, - output_dir=parse_input.output_dir, - ) - ) - logger.info(f"📐 Atlas output renamed: {filename}") - parse_input = ParseInput( - file_full_path=parse_input.file_full_path, - filename=filename, - output_dir=parse_input.output_dir, - internal_output_filename=internal_output_filename, job_id=parse_input.job_id, - options=parse_input.options, - base_url=parse_input.base_url, - fragment_content=parse_input.fragment_content, - s3_key=parse_input.s3_key, + output_dir=full_output_dir, ) + logger.info(f"📋 DOC_PROFILE: {profile.summary()}") + logger.debug(f"📋 Reasoning: {profile.reasoning}") return ParseSession.from_input( parse_input=parse_input, @@ -141,25 +99,7 @@ def build_parse_session(parse_input: ParseInput) -> ParseSession: full_output_dir=full_output_dir, profile=profile, relative_root=relative_root, - ) - - -def _rename_atlas_output( - *, - filename: str, - internal_output_filename: str, - output_dir: str, -) -> tuple[str, str, str, str]: - name_base, _ = os.path.splitext(filename) - internal_name_base, _ = os.path.splitext(internal_output_filename) - atlas_filename = name_base + ".atlas" - atlas_internal_filename = internal_name_base + ".atlas" - relative_root, full_output_dir = _resolve_output_paths( - filename=atlas_filename, - internal_output_filename=atlas_internal_filename, - output_dir=output_dir, - ) - return atlas_filename, atlas_internal_filename, relative_root, full_output_dir +) def _resolve_output_paths( diff --git a/apps/worker/app/services/document_parser/profiling/doc_profile_model.py b/apps/worker/app/services/document_parser/profiling/doc_profile_model.py deleted file mode 100644 index ad1a12e6..00000000 --- a/apps/worker/app/services/document_parser/profiling/doc_profile_model.py +++ /dev/null @@ -1,75 +0,0 @@ -from __future__ import annotations - -import gc -import json -import os -from dataclasses import asdict, dataclass, field -from typing import List, Literal, Optional - -from loguru import logger - - -@dataclass -class DocProfile: - """Document profile data contract used by parser routing.""" - - file_type: str = "" - route: Literal["fast", "standard"] = "standard" - decision_band: Literal["safe_fast", "gray_zone", "safe_standard"] = "safe_standard" - scan_type: Optional[Literal["electronic", "scanned", "mixed"]] = None - doc_category: Literal["generic", "atlas", "ppt_converted"] = "generic" - page_count: int = 0 - avg_text_density: float = 0.0 - avg_image_coverage: float = 0.0 - has_tables: bool = False - has_embedded_fonts: bool = False - is_multi_column: bool = False - is_degraded_electronic: bool = False - sample_text: str = "" - has_significant_images: bool = False - significant_image_count: int = 0 - max_image_coverage_on_page: float = 0.0 - pages_with_significant_images: int = 0 - large_image_page_ratio: float = 0.0 - table_signal_pages: int = 0 - table_signal_strength: float = 0.0 - complex_pages: int = 0 - complex_page_ratio: float = 0.0 - max_drawing_count: int = 0 - min_text_density_page: float = 0.0 - text_density_std: float = 0.0 - estimated_fast_benefit: float = 0.0 - estimated_risk_score: float = 0.0 - atlas_candidate: bool = False - page_details: List[dict] = field(default_factory=list) - reasoning: str = "" - - def to_dict(self) -> dict: - data = asdict(self) - data.pop("page_details", None) - data.pop("sample_text", None) - return data - - def summary(self) -> str: - parts = ( - f"[{self.file_type.upper()}] route={self.route}, band={self.decision_band}, " - f"scan={self.scan_type}, category={self.doc_category}, " - f"pages={self.page_count}, text_density={self.avg_text_density:.0f}, " - f"img_coverage={self.avg_image_coverage:.1%}, " - f"risk={self.estimated_risk_score:.2f}, gain={self.estimated_fast_benefit:.2f}" - ) - if self.is_degraded_electronic: - parts += ", degraded=True" - return parts - - -def publish_profile_result(queue, profile: DocProfile) -> None: - gc.collect() - queue.put({"ok": True, "profile": asdict(profile)}) - - -def save_profile_metadata(profile: DocProfile, output_dir: str) -> None: - profile_path = os.path.join(output_dir, "profile.json") - with open(profile_path, "w", encoding="utf-8") as file_obj: - json.dump(profile.to_dict(), file_obj, ensure_ascii=False, indent=2) - logger.debug(f"Profile metadata saved to {profile_path}") diff --git a/apps/worker/app/services/document_parser/profiling/doc_profile_pdf.py b/apps/worker/app/services/document_parser/profiling/doc_profile_pdf.py deleted file mode 100644 index 32a13840..00000000 --- a/apps/worker/app/services/document_parser/profiling/doc_profile_pdf.py +++ /dev/null @@ -1,671 +0,0 @@ -# pyright: reportAttributeAccessIssue=false, reportOperatorIssue=false -from __future__ import annotations - -import math -from typing import Any - -from app.services.document_parser.profiling.doc_profile_model import ( - DocProfile, - publish_profile_result, -) -from app.services.document_parser.formats.pdf.pymupdf_subprocess import run_in_child_process, worker -from loguru import logger - -# Thresholds -SCAN_TEXT_THRESHOLD = 50 -SCAN_IMAGE_COVERAGE_MIN = 0.6 -SCAN_PAGE_RATIO = 0.7 - -ATLAS_TEXT_THRESHOLD = 200 -ATLAS_CANDIDATE_IMAGE_COVERAGE_MIN = 0.30 -ATLAS_MIN_LANDSCAPE_RATIO = 0.5 # ≥50% of sampled pages must be landscape -ATLAS_MIN_PAGES = 2 # single-page scans (resumes, posters) are not atlases - -FAST_TEXT_THRESHOLD = 500 -MIN_FAST_TEXT_DENSITY_FLOOR = 120 -SAFE_FAST_MAX_PAGE_COUNT = 80 -HARD_STANDARD_PAGE_COUNT = 150 - -MULTI_COL_GAP_RATIO = 0.15 -MULTI_COL_MIN_BLOCKS = 4 - -DEGRADED_SKINNY_ASPECT = 50 -DEGRADED_SKINNY_MAX_H = 30 -DEGRADED_SKINNY_MIN_PER_PAGE = 50 -DEGRADED_PAGE_RATIO = 0.5 - -SIGNIFICANT_IMAGE_AREA_RATIO = 0.12 -MEDIUM_IMAGE_AREA_RATIO = 0.03 -LARGE_IMAGE_PAGE_RATIO = 0.25 -SIGNIFICANT_IMAGE_MIN_DIM = 400 -SIGNIFICANT_IMAGE_MIN_PIXELS = 250_000 - -PROFILE_MAX_NEW_XREFS_PER_PAGE = 30 - -TABLE_DRAWING_LINE_THRESHOLD = 12 -TABLE_DRAWING_STRONG_THRESHOLD = 18 -TABLE_DRAWING_RECT_THRESHOLD = 2 - -SAFE_FAST_MAX_COMPLEX_PAGE_RATIO = 0.05 -SAFE_FAST_MAX_IMAGE_COVERAGE_ON_PAGE = 0.08 -SAFE_FAST_MAX_AVG_IMAGE_COVERAGE = 0.03 -SAFE_FAST_MAX_TEXT_STD = 600.0 -HARD_COMPLEX_PAGE_RATIO = 0.2 -HARD_SIGNIFICANT_IMAGE_PAGES = 3 -HARD_LARGE_IMAGE_PAGE_RATIO = 0.15 - - -def _clamp(value: float, min_value: float = 0.0, max_value: float = 1.0) -> float: - return max(min_value, min(max_value, value)) - - -def _stddev(values: list[float]) -> float: - if not values: - return 0.0 - mean = sum(values) / len(values) - variance = sum((value - mean) ** 2 for value in values) / len(values) - return math.sqrt(variance) - - -def _count_detected_tables(page: Any) -> int: - try: - finder = page.find_tables() - except Exception: - return 0 - - if not finder: - return 0 - - tables = getattr(finder, "tables", finder) - try: - return len(tables) - except TypeError: - return 1 if tables else 0 - - -def _is_stroked_drawing(drawing: dict[str, Any]) -> bool: - stroke_width = drawing.get("width") - return drawing.get("color") is not None or ( - stroke_width is not None and stroke_width > 0 - ) - - -def _estimate_fast_benefit(profile: DocProfile) -> float: - if profile.page_count <= 2: - page_factor = 0.35 - elif profile.page_count <= 10: - page_factor = 0.7 - elif profile.page_count <= SAFE_FAST_MAX_PAGE_COUNT: - page_factor = 1.0 - elif profile.page_count <= HARD_STANDARD_PAGE_COUNT: - page_factor = 0.8 - else: - page_factor = 0.45 - - density_factor = _clamp(profile.avg_text_density / 1200.0) - stability_factor = _clamp( - 1.0 - - (profile.complex_page_ratio * 1.5) - - (profile.large_image_page_ratio * 1.2) - - (profile.table_signal_strength * 0.8) - ) - return _clamp( - (0.35 * page_factor) + (0.40 * density_factor) + (0.25 * stability_factor) - ) - - -def _estimate_risk_score(profile: DocProfile) -> float: - risk = 0.0 - if profile.scan_type != "electronic": - risk += 0.35 - if profile.doc_category != "generic": - risk += 0.20 - if profile.is_multi_column: - risk += 0.20 - if profile.is_degraded_electronic: - risk += 0.20 - if profile.has_tables: - risk += 0.30 - - risk += min(0.20, profile.large_image_page_ratio * 1.2) - risk += min(0.20, profile.complex_page_ratio * 0.8) - risk += min(0.15, profile.table_signal_strength * 0.2) - risk += min(0.12, profile.pages_with_significant_images * 0.04) - - if profile.page_count > HARD_STANDARD_PAGE_COUNT: - risk += 0.10 - - return _clamp(risk) - - -def _classify_route(profile: DocProfile) -> tuple[str, str, float, float, list[str]]: - hard_gate_reasons: list[str] = [] - - if profile.scan_type != "electronic": - hard_gate_reasons.append(f"scan_type={profile.scan_type}") - if profile.doc_category != "generic": - hard_gate_reasons.append(f"doc_category={profile.doc_category}") - if profile.is_multi_column: - hard_gate_reasons.append("multi_column") - if profile.is_degraded_electronic: - hard_gate_reasons.append("degraded_electronic") - if profile.has_tables: - hard_gate_reasons.append( - f"table_signals={profile.table_signal_pages}p/{profile.table_signal_strength:.2f}" - ) - if ( - profile.max_image_coverage_on_page >= LARGE_IMAGE_PAGE_RATIO - or profile.pages_with_significant_images >= HARD_SIGNIFICANT_IMAGE_PAGES - or profile.large_image_page_ratio >= HARD_LARGE_IMAGE_PAGE_RATIO - ): - hard_gate_reasons.append( - "significant_images=" - f"{profile.pages_with_significant_images}p,max={profile.max_image_coverage_on_page:.1%}" - ) - if profile.complex_page_ratio >= HARD_COMPLEX_PAGE_RATIO: - hard_gate_reasons.append(f"complex_pages={profile.complex_page_ratio:.0%}") - if profile.page_count > HARD_STANDARD_PAGE_COUNT: - hard_gate_reasons.append( - f"page_count={profile.page_count}>{HARD_STANDARD_PAGE_COUNT}" - ) - - benefit = _estimate_fast_benefit(profile) - risk = _estimate_risk_score(profile) - - if hard_gate_reasons: - return ( - "standard", - "safe_standard", - benefit, - risk, - [ - "decision=safe_standard: hard gate matched", - "hard_gates=" + ",".join(hard_gate_reasons), - ], - ) - - safe_fast_checks = [ - ( - profile.page_count <= SAFE_FAST_MAX_PAGE_COUNT, - f"page_count={profile.page_count}<={SAFE_FAST_MAX_PAGE_COUNT}", - ), - ( - profile.avg_text_density >= MIN_FAST_TEXT_DENSITY_FLOOR, - "text_density_floor=" - f"{profile.avg_text_density:.0f}>={MIN_FAST_TEXT_DENSITY_FLOOR}", - ), - ( - not profile.has_significant_images, - f"has_significant_images={profile.has_significant_images}", - ), - ( - profile.max_image_coverage_on_page <= SAFE_FAST_MAX_IMAGE_COVERAGE_ON_PAGE, - "max_image_coverage_on_page=" - f"{profile.max_image_coverage_on_page:.1%}<={SAFE_FAST_MAX_IMAGE_COVERAGE_ON_PAGE:.0%}", - ), - ( - profile.avg_image_coverage <= SAFE_FAST_MAX_AVG_IMAGE_COVERAGE, - f"avg_image_coverage={profile.avg_image_coverage:.1%}<={SAFE_FAST_MAX_AVG_IMAGE_COVERAGE:.0%}", - ), - ( - profile.complex_page_ratio <= SAFE_FAST_MAX_COMPLEX_PAGE_RATIO, - f"complex_page_ratio={profile.complex_page_ratio:.0%}<={SAFE_FAST_MAX_COMPLEX_PAGE_RATIO:.0%}", - ), - ( - profile.text_density_std <= SAFE_FAST_MAX_TEXT_STD, - f"text_density_std={profile.text_density_std:.0f}<={SAFE_FAST_MAX_TEXT_STD:.0f}", - ), - ( - risk <= 0.35, - f"estimated_risk_score={risk:.2f}<=0.35", - ), - ] - - failed_checks = [reason for passed, reason in safe_fast_checks if not passed] - if not failed_checks: - return ( - "fast", - "safe_fast", - benefit, - risk, - [ - "decision=safe_fast: low-complexity high-yield pdf", - "safe_fast_checks_passed", - ], - ) - - return ( - "standard", - "gray_zone", - benefit, - risk, - [ - "decision=gray_zone: conservative fallback to standard in phase1", - "borderline=" + ",".join(failed_checks[:4]), - ], - ) - - - -@worker -def _profile_pdf_worker(queue, file_path: str) -> None: - """Child process: analyze PDF features, return profile as dict.""" - import pymupdf - - profile = DocProfile(file_type="pdf") - reasons: list[str] = [] - - try: - doc = pymupdf.open(file_path) - except Exception as exc: - profile.reasoning = f"Cannot open file: {exc}" - publish_profile_result(queue, profile) - return - - profile.page_count = doc.page_count - - if doc.page_count == 0: - profile.reasoning = "Empty file (0 pages)" - doc.close() - del doc - publish_profile_result(queue, profile) - return - - if doc.page_count <= 50: - sample_indices = list(range(doc.page_count)) - else: - step = max(1, doc.page_count // 20) - sample_indices = list(range(0, doc.page_count, step))[:20] - sample_indices = sorted( - set( - sample_indices - + [0, 1, 2] - + [doc.page_count - 3, doc.page_count - 2, doc.page_count - 1] - ) - ) - sample_indices = [idx for idx in sample_indices if 0 <= idx < doc.page_count] - - page_details = [] - text_lengths: list[float] = [] - total_text_len = 0 - total_image_coverage = 0.0 - scanned_pages = 0 - all_text_parts: list[str] = [] - has_any_fonts = False - has_any_tables = False - table_signal_pages = 0 - total_table_signal_strength = 0.0 - multi_col_pages = 0 - landscape_pages = 0 - degraded_pages = 0 - doc_page_sizes = [] - - significant_image_count = 0 - pages_with_significant_images = 0 - large_image_pages = 0 - max_image_coverage_on_page = 0.0 - - complex_pages = 0 - max_drawing_count = 0 - - # Track xrefs already processed across pages to avoid redundant - # get_image_rects() calls on shared/inherited image resources. - # PDFs with shared xrefs (e.g. scanned docs) report ALL document - # images on every page; without dedup this causes O(pages × images) - # content-stream scans. - seen_xrefs: set = set() - - for idx in sample_indices: - page = doc[idx] - page_width = page.rect.width - page_height = page.rect.height - page_area = page_width * page_height - - if page_width > page_height: - landscape_pages += 1 - doc_page_sizes.append((page_width, page_height)) - - text = page.get_text().strip() - text_len = len(text) - text_lengths.append(float(text_len)) - total_text_len += text_len - - if len("".join(all_text_parts)) < 500: - all_text_parts.append(text[:200]) - - images = page.get_images(full=True) - img_total_area = 0.0 - page_significant_image_count = 0 - page_max_rect_ratio = 0.0 - page_medium_image_coverage = 0.0 - skinny_count = 0 - - new_xref_count = 0 - for img in images: - xref = img[0] - img_w, img_h = img[2], img[3] - if ( - img_h > 0 - and img_w / img_h > DEGRADED_SKINNY_ASPECT - and img_h < DEGRADED_SKINNY_MAX_H - ): - skinny_count += 1 - - # ── xref dedup: skip images already analyzed on earlier pages ── - if xref in seen_xrefs: - continue - seen_xrefs.add(xref) - new_xref_count += 1 - # Cap expensive get_image_rects calls per page - if new_xref_count > PROFILE_MAX_NEW_XREFS_PER_PAGE: - continue - - try: - rects = page.get_image_rects(xref) - except Exception: - rects = [] - - for rect in rects: - rect_area = rect.width * rect.height - img_total_area += rect_area - - area_ratio = rect_area / page_area if page_area > 0 else 0.0 - page_max_rect_ratio = max(page_max_rect_ratio, area_ratio) - - is_significant = ( - area_ratio >= SIGNIFICANT_IMAGE_AREA_RATIO - or ( - area_ratio >= 0.05 - and ( - max(img_w, img_h) >= SIGNIFICANT_IMAGE_MIN_DIM - or (img_w * img_h) >= SIGNIFICANT_IMAGE_MIN_PIXELS - ) - ) - or ( - area_ratio >= 0.02 - and (img_w * img_h) >= (SIGNIFICANT_IMAGE_MIN_PIXELS * 2) - ) - ) - - if is_significant: - page_significant_image_count += 1 - elif area_ratio >= MEDIUM_IMAGE_AREA_RATIO: - page_medium_image_coverage += area_ratio - - if skinny_count >= DEGRADED_SKINNY_MIN_PER_PAGE: - degraded_pages += 1 - - img_coverage = img_total_area / page_area if page_area > 0 else 0.0 - img_coverage = min(img_coverage, 1.0) - total_image_coverage += img_coverage - - fonts = page.get_fonts() - if fonts: - has_any_fonts = True - - drawings = page.get_drawings() - drawing_count = len(drawings) - max_drawing_count = max(max_drawing_count, drawing_count) - line_like_items = 0 - horizontal_line_items = 0 - vertical_line_items = 0 - rect_items = 0 - fill_only_rect_items = 0 - for drawing in drawings: - is_stroked = _is_stroked_drawing(drawing) - for item in drawing.get("items", []): - if item[0] == "l": - if is_stroked: - line_like_items += 1 - point_a = item[1] - point_b = item[2] - if abs(point_a.y - point_b.y) <= 2: - horizontal_line_items += 1 - if abs(point_a.x - point_b.x) <= 2: - vertical_line_items += 1 - elif item[0] == "re": - if is_stroked: - rect_items += 1 - line_like_items += 4 - horizontal_line_items += 2 - vertical_line_items += 2 - else: - fill_only_rect_items += 1 - - detected_table_count = _count_detected_tables(page) - drawing_table_signal = line_like_items >= TABLE_DRAWING_LINE_THRESHOLD and ( - (horizontal_line_items >= 2 and vertical_line_items >= 2) - or rect_items >= TABLE_DRAWING_RECT_THRESHOLD - ) - # NOTE: - # `page.find_tables()` produces too many false positives on Word / Writer - # exported pure-text PDFs, where paragraph background boxes are inferred as - # full-page tables. For Phase 1 fast-path routing, keep `find_tables()` - # only as debug evidence and rely on explicit drawing-grid signals for - # table hard gates. - table_hit = drawing_table_signal - page_table_strength = 0.0 - if drawing_table_signal: - page_table_strength = min( - 1.0, - line_like_items / float(TABLE_DRAWING_STRONG_THRESHOLD), - ) - - if table_hit: - has_any_tables = True - table_signal_pages += 1 - total_table_signal_strength += page_table_strength - - blocks = page.get_text("blocks") - text_blocks = [ - block - for block in blocks - if block[6] == 0 - and (block[2] - block[0]) > 20 - and (block[3] - block[1]) > 10 - ] - - is_multi_col_page = False - if len(text_blocks) >= MULTI_COL_MIN_BLOCKS: - min_x_gap = page.rect.width * MULTI_COL_GAP_RATIO - side_by_side_count = 0 - - for i in range(len(text_blocks)): - for j in range(i + 1, len(text_blocks)): - block_i = text_blocks[i] - block_j = text_blocks[j] - y_overlap = min(block_i[3], block_j[3]) - max( - block_i[1], block_j[1] - ) - if y_overlap <= 0: - continue - x_gap = max(block_j[0] - block_i[2], block_i[0] - block_j[2]) - if x_gap > min_x_gap: - side_by_side_count += 1 - if side_by_side_count >= 3: - is_multi_col_page = True - break - if is_multi_col_page: - break - - if is_multi_col_page: - multi_col_pages += 1 - - is_scan_page = ( - text_len < SCAN_TEXT_THRESHOLD and img_coverage > SCAN_IMAGE_COVERAGE_MIN - ) - if is_scan_page: - scanned_pages += 1 - - page_has_significant_images = ( - page_significant_image_count > 0 or page_medium_image_coverage >= 0.18 - ) - if page_has_significant_images: - pages_with_significant_images += 1 - significant_image_count += page_significant_image_count or 1 - - page_has_large_image = ( - page_max_rect_ratio >= LARGE_IMAGE_PAGE_RATIO or img_coverage >= 0.35 - ) - if page_has_large_image: - large_image_pages += 1 - - max_image_coverage_on_page = max( - max_image_coverage_on_page, page_max_rect_ratio - ) - - is_complex_page = ( - table_hit - or page_has_large_image - or is_multi_col_page - or (page_has_significant_images and text_len < FAST_TEXT_THRESHOLD) - or (drawing_count >= 25 and text_len < FAST_TEXT_THRESHOLD) - ) - if is_complex_page: - complex_pages += 1 - - page_details.append( - { - "page": idx + 1, - "text_len": text_len, - "image_count": len(images), - "img_coverage": round(img_coverage, 3), - "font_count": len(fonts), - "drawing_count": drawing_count, - "line_like_items": line_like_items, - "horizontal_line_items": horizontal_line_items, - "vertical_line_items": vertical_line_items, - "table_hit": table_hit, - "detected_table_count": detected_table_count, - "stroked_rect_count": rect_items, - "fill_only_rect_count": fill_only_rect_items, - "significant_image_count": page_significant_image_count, - "max_image_coverage": round(page_max_rect_ratio, 3), - "is_multi_col_page": is_multi_col_page, - "is_scan_page": is_scan_page, - "is_complex_page": is_complex_page, - "text_block_count": len(text_blocks), - } - ) - - del text_blocks - del blocks - del drawings - del fonts - del images - del page - - doc.close() - del doc - - n_sampled = len(sample_indices) - profile.avg_text_density = total_text_len / n_sampled if n_sampled > 0 else 0.0 - profile.avg_image_coverage = ( - total_image_coverage / n_sampled if n_sampled > 0 else 0.0 - ) - profile.has_embedded_fonts = has_any_fonts - profile.has_tables = has_any_tables - profile.is_multi_column = multi_col_pages > (n_sampled * 0.3) - profile.is_degraded_electronic = degraded_pages > (n_sampled * DEGRADED_PAGE_RATIO) - profile.sample_text = " ".join(all_text_parts)[:500] - profile.page_details = page_details - - profile.has_significant_images = pages_with_significant_images > 0 - profile.significant_image_count = significant_image_count - profile.max_image_coverage_on_page = max_image_coverage_on_page - profile.pages_with_significant_images = pages_with_significant_images - profile.large_image_page_ratio = ( - large_image_pages / n_sampled if n_sampled > 0 else 0.0 - ) - - profile.table_signal_pages = table_signal_pages - profile.table_signal_strength = ( - total_table_signal_strength / n_sampled if n_sampled > 0 else 0.0 - ) - - profile.complex_pages = complex_pages - profile.complex_page_ratio = complex_pages / n_sampled if n_sampled > 0 else 0.0 - profile.max_drawing_count = max_drawing_count - profile.min_text_density_page = min(text_lengths) if text_lengths else 0.0 - profile.text_density_std = _stddev(text_lengths) - - scan_ratio = scanned_pages / n_sampled if n_sampled > 0 else 0.0 - if scan_ratio >= SCAN_PAGE_RATIO: - profile.scan_type = "scanned" - reasons.append( - f"scanned: {scanned_pages}/{n_sampled} sampled pages are scanned ({scan_ratio:.0%})" - ) - elif scanned_pages > 0: - profile.scan_type = "mixed" - reasons.append(f"mixed: {scanned_pages}/{n_sampled} sampled pages are scanned") - else: - profile.scan_type = "electronic" - reasons.append( - f"electronic: sampled pages contain extractable text (avg={profile.avg_text_density:.0f})" - ) - - landscape_ratio = landscape_pages / n_sampled if n_sampled > 0 else 0.0 - - # ── Linear atlas gate: VLM always makes the final call ── - # Any document meeting all 4 conditions is sent for VLM visual confirmation. - # We do NOT heuristically commit here — VLM decides in parse_service. - is_atlas_candidate = ( - profile.avg_text_density - < ATLAS_TEXT_THRESHOLD # text-sparse (< 200 chars/page) - and profile.avg_image_coverage - > ATLAS_CANDIDATE_IMAGE_COVERAGE_MIN # image-heavy (> 30%) - and landscape_ratio >= ATLAS_MIN_LANDSCAPE_RATIO # mostly landscape (>= 50%) - and profile.page_count >= ATLAS_MIN_PAGES # multi-page (>= 2) - ) - if is_atlas_candidate: - profile.doc_category = ( - "generic" # provisional — VLM will promote to "atlas" if confirmed - ) - profile.atlas_candidate = True - reasons.append( - f"atlas_candidate: text={profile.avg_text_density:.0f}<{ATLAS_TEXT_THRESHOLD}, " - f"img={profile.avg_image_coverage:.1%}>{ATLAS_CANDIDATE_IMAGE_COVERAGE_MIN:.0%}, " - f"landscape={landscape_ratio:.0%}>={ATLAS_MIN_LANDSCAPE_RATIO:.0%}, " - f"pages={profile.page_count}>={ATLAS_MIN_PAGES} → VLM confirmation required" - ) - else: - profile.doc_category = "generic" - - if landscape_ratio >= 0.8 and profile.doc_category == "generic": - slide_ratios = [1.333, 1.778, 1.600] - tolerance = 0.05 - ref_page = doc_page_sizes[0] if doc_page_sizes else None - if ref_page: - page_ratio = ref_page[0] / ref_page[1] if ref_page[1] > 0 else 0.0 - is_slide_ratio = any( - abs(page_ratio - ratio) < tolerance for ratio in slide_ratios - ) - if is_slide_ratio: - profile.doc_category = "ppt_converted" - reasons.append( - f"ppt_converted: {landscape_pages}/{n_sampled} landscape, ratio={page_ratio:.2f}" - ) - - route, decision_band, benefit, risk, route_reasons = _classify_route(profile) - profile.route = route - profile.decision_band = decision_band - profile.estimated_fast_benefit = benefit - profile.estimated_risk_score = risk - reasons.extend(route_reasons) - - profile.reasoning = " | ".join(reasons) - publish_profile_result(queue, profile) - - -def profile_pdf(file_path: str) -> DocProfile: - """Profile a PDF by running PyMuPDF analysis in a spawned child process.""" - result = run_in_child_process(_profile_pdf_worker, file_path, timeout=300) - profile = DocProfile(**result["profile"]) - logger.info( - f"[doc-profiler] route={profile.route} band={profile.decision_band} " - f"category={profile.doc_category} scan={profile.scan_type} " - f"pages={profile.page_count} text_density={profile.avg_text_density:.0f} " - f"img_coverage={profile.avg_image_coverage:.1%} risk={profile.estimated_risk_score:.2f} " - f"gain={profile.estimated_fast_benefit:.2f}" - ) - return profile diff --git a/apps/worker/app/services/document_parser/profiling/doc_profiler.py b/apps/worker/app/services/document_parser/profiling/doc_profiler.py index 184f1364..28076bfb 100644 --- a/apps/worker/app/services/document_parser/profiling/doc_profiler.py +++ b/apps/worker/app/services/document_parser/profiling/doc_profiler.py @@ -1,41 +1,107 @@ -""" -Agentic Document Profiler +"""Parser-entry document profiling.""" -Before data enters the pipeline, use lightweight analysis (~50ms) to generate -DocProfile, driving routing decisions and type annotations. - -Usage: - from app.services.document_parser.profiling.doc_profiler import profile_document - profile = profile_document("/path/to/file.pdf") -""" +from __future__ import annotations import os -from app.services.document_parser.profiling.doc_profile_model import DocProfile -from app.services.document_parser.profiling.doc_profile_pdf import profile_pdf +from app.services.document_agent.coordinator import ProfileCoordinator +from app.services.document_parser.orchestration.oversized_pdf_policy import ( + build_oversized_pdf_processing_failed_exception, + raise_if_oversized_pdf_not_supported, +) +from app.services.document_parser.profiling.profile_model import ParserDocumentProfile +from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory + +from shared.core.config import settings -def profile_document(file_path: str, filename: str = "") -> DocProfile: +def profile_document( + file_path: str, + filename: str = "", + *, + job_id: str | None = None, + output_dir: str | None = None, +) -> ParserDocumentProfile: """ General document profiling entry point. Args: file_path: Local file path filename: File name (used to infer type) + job_id: Parse job id for profile trace artifacts + output_dir: Parser output directory Returns: - DocProfile + ParserDocumentProfile """ if not filename: filename = os.path.basename(file_path) ext = os.path.splitext(filename)[1].lower() if ext == ".pdf": - return profile_pdf(file_path) + return _profile_pdf(file_path, filename, job_id=job_id, output_dir=output_dir) - return DocProfile( + return ParserDocumentProfile( file_type=ext.lstrip("."), - route="standard", - decision_band="safe_standard", + category=f"{ext.lstrip('.') or 'unknown'} document", + routing_category=PdfRoutingCategory.GENERIC, reasoning=f"Non-PDF format ({ext}), using default route", ) + + +def _profile_pdf( + file_path: str, + filename: str, + *, + job_id: str | None, + output_dir: str | None, +) -> ParserDocumentProfile: + profile_job_id = job_id or filename + agent_output_dir = os.path.join(output_dir, "_doc_agent") if output_dir else None + coordinator = ProfileCoordinator( + pdf_path=file_path, + job_id=profile_job_id, + output_dir=agent_output_dir, + model=settings.IMAGE_MODEL, + settings={ + "planner_model": settings.IMAGE_MODEL, + "vlm_model": settings.IMAGE_MODEL, + "model": settings.HIERARCHY_LLM_MODEL or settings.NORMOL_MODEL, + }, + ) + agent_profile = coordinator.run_coarse() + routing_category = PdfRoutingCategory.normalize(agent_profile.routing_category) + profile = ParserDocumentProfile( + file_type="pdf", + category=agent_profile.category, + routing_category=routing_category, + is_scanned=agent_profile.is_scanned, + page_count=coordinator.blackboard.page_count, + language=agent_profile.language, + reasoning=agent_profile.rationale, + category_rationale=agent_profile.category_rationale, + metrics={ + "doc_stats": coordinator.blackboard.doc_stats, + "doc_shape": coordinator.blackboard.global_signals.get("doc_shape", {}), + "page_kind_counts": coordinator.blackboard.global_signals.get( + "page_kind_counts", + {}, + ), + }, + ) + + if profile.page_count > settings.MAX_PDF_PAGE_LIMIT: + raise_if_oversized_pdf_not_supported(page_count=profile.page_count) + if not profile.is_atlas: + try: + profile.anatomy = coordinator.run_structural() + except Exception as exc: + raise build_oversized_pdf_processing_failed_exception( + page_count=profile.page_count, + original_exception=exc, + ) from exc + + return profile + + +__all__ = ["profile_document"] diff --git a/apps/worker/app/services/document_parser/profiling/profile_model.py b/apps/worker/app/services/document_parser/profiling/profile_model.py new file mode 100644 index 00000000..e553ca41 --- /dev/null +++ b/apps/worker/app/services/document_parser/profiling/profile_model.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from dataclasses import asdict, dataclass, field +from typing import Any + +from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory + + +@dataclass +class ParserDocumentProfile: + """Parser-entry document profile used for routing and PDF anatomy reuse.""" + + file_type: str + category: str = "unknown document" + routing_category: PdfRoutingCategory = PdfRoutingCategory.GENERIC + is_scanned: bool = False + page_count: int = 0 + language: str = "unknown" + reasoning: str = "" + category_rationale: str = "" + anatomy: Any | None = None + metrics: dict[str, Any] = field(default_factory=dict) + + @property + def is_pdf(self) -> bool: + return self.file_type == "pdf" + + @property + def is_atlas(self) -> bool: + return self.routing_category is PdfRoutingCategory.ATLAS + + @property + def has_structural_anatomy(self) -> bool: + return self.anatomy is not None + + def to_dict(self) -> dict[str, Any]: + data = asdict(self) + data["routing_category"] = self.routing_category.value + if self.anatomy is not None and hasattr(self.anatomy, "to_dict"): + data["anatomy"] = self.anatomy.to_dict() + else: + data["anatomy"] = None + return data + + def summary(self) -> str: + parts = ( + f"[{self.file_type.upper()}] category={self.category}, " + f"routing={self.routing_category.value}, " + f"scanned={self.is_scanned}, pages={self.page_count}" + ) + if self.has_structural_anatomy: + parts += ", anatomy=True" + return parts + + +__all__ = ["ParserDocumentProfile"] diff --git a/apps/worker/app/services/document_parser/profiling/taxonomy.py b/apps/worker/app/services/document_parser/profiling/taxonomy.py new file mode 100644 index 00000000..2fb0303d --- /dev/null +++ b/apps/worker/app/services/document_parser/profiling/taxonomy.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from enum import Enum + + +class PdfRoutingCategory(str, Enum): + ATLAS = "atlas" + GENERIC = "generic" + SCANNED = "scanned" + SLIDES = "slides" + + @classmethod + def normalize(cls, value: object) -> "PdfRoutingCategory": + raw = str(value or "").strip().lower().replace("-", "_").replace(" ", "_") + if raw in {"atlas", "engineering_atlas", "drawing_atlas", "drawing_collection"}: + return cls.ATLAS + if raw in {"scan", "scanned", "scanned_pdf", "image_only"}: + return cls.SCANNED + if raw in {"slide", "slides", "ppt", "pptx", "presentation"}: + return cls.SLIDES + return cls.GENERIC + + +__all__ = ["PdfRoutingCategory"] diff --git a/apps/worker/tests/contract/test_parse_task_contract.py b/apps/worker/tests/contract/test_parse_task_contract.py index 355966cd..c2f33121 100644 --- a/apps/worker/tests/contract/test_parse_task_contract.py +++ b/apps/worker/tests/contract/test_parse_task_contract.py @@ -499,13 +499,10 @@ def test_oversized_pdf_shard_failure_preserves_processing_error( monkeypatch.setenv("S3_TEMP_PATH", str(tmp_path)) from app.services.document_parser.formats.pdf import parser as pdf_parser + from app.services.document_parser.profiling.profile_model import ParserDocumentProfile + from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory from shared.core.exceptions.domain_exceptions import PDFParsingException - class _Profile: - route = "standard" - doc_category = "generic" - page_count = 2 - monkeypatch.setattr(pdf_parser.settings, "MAX_PDF_PAGE_LIMIT", 1) def _fail_oversized_parse(*args, **kwargs): @@ -519,7 +516,12 @@ def _fail_oversized_parse(*args, **kwargs): "source.pdf", str(tmp_path), {}, - profile=_Profile(), + profile=ParserDocumentProfile( + file_type="pdf", + category="generic document", + routing_category=PdfRoutingCategory.GENERIC, + page_count=2, + ), ) assert exc_info.value.details["reason"] == "OVERSIZED_SHARD_PIPELINE_FAILED" @@ -546,17 +548,14 @@ def test_oversized_pdf_happy_path_uses_shard_pipeline_without_external_services( TocResult, ) from app.services.document_parser.formats.pdf import parser as pdf_parser + from app.services.document_parser.profiling.profile_model import ParserDocumentProfile + from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory pdf_path = tmp_path / "oversized.pdf" output_dir = tmp_path / "output" output_dir.mkdir() _write_blank_pdf(pdf_path, page_count=3) - class _Profile: - route = "standard" - doc_category = "generic" - page_count = 3 - calls: dict[str, object] = {} parse_s3_keys: list[str | None] = [] deleted_s3_keys: list[str] = [] @@ -566,46 +565,40 @@ def delete_upload_file(self, storage_key: str) -> bool: deleted_s3_keys.append(storage_key) return True - def _fake_run_doc_agent(pdf_path_arg: str, job_id: str, output_dir: str): - calls["doc_agent"] = { - "pdf_path": pdf_path_arg, - "job_id": job_id, - "output_dir": output_dir, - } - return PageAnatomyMap( - job_id=job_id, - file_path=pdf_path_arg, - page_count=3, - page_features=[], - page_labels=[], - toc_result=TocResult(toc_pages=[1], method="vlm_batch"), - h1_result=H1BoundaryResult(method="toc_grep"), - shard_plan=ShardPlan( - enabled=True, - reason="too_large", - shards=[ - Shard( - shard_index=0, - page_start=1, - page_end=2, - page_offset=0, - anchor_type="h1_boundary", - anchor_evidence="Chapter 1", - confidence=0.9, - ), - Shard( - shard_index=1, - page_start=3, - page_end=3, - page_offset=2, - anchor_type="h1_boundary", - anchor_evidence="Chapter 2", - confidence=0.9, - ), - ], - ), - toc_hierarchies=[{"toc_tree": {"Chapter 1": {}, "Chapter 2": {}}}], - ) + anatomy = PageAnatomyMap( + job_id="job-oversized", + file_path=str(pdf_path), + page_count=3, + page_features=[], + page_labels=[], + toc_result=TocResult(toc_pages=[1], method="vlm_batch"), + h1_result=H1BoundaryResult(method="toc_grep"), + shard_plan=ShardPlan( + enabled=True, + reason="too_large", + shards=[ + Shard( + shard_index=0, + page_start=1, + page_end=2, + page_offset=0, + anchor_type="h1_boundary", + anchor_evidence="Chapter 1", + confidence=0.9, + ), + Shard( + shard_index=1, + page_start=3, + page_end=3, + page_offset=2, + anchor_type="h1_boundary", + anchor_evidence="Chapter 2", + confidence=0.9, + ), + ], + ), + toc_hierarchies=[{"toc_tree": {"Chapter 1": {}, "Chapter 2": {}}}], + ) def _fake_split_pdf(pdf_path_arg, shards, work_dir, exclude_pages=None): calls["exclude_pages"] = exclude_pages @@ -643,10 +636,6 @@ def _identity_eval_md_headings( monkeypatch.setattr(pdf_parser.settings, "MAX_PDF_PAGE_LIMIT", 2) monkeypatch.setattr(pdf_parser.settings, "MINERU_SHARD_CONCURRENCY", 1) - monkeypatch.setattr( - "app.services.document_parser.formats.pdf.shard_splitter.run_doc_agent", - _fake_run_doc_agent, - ) monkeypatch.setattr( "app.services.document_parser.formats.pdf.shard_splitter.split_pdf", _fake_split_pdf, @@ -671,14 +660,19 @@ def _identity_eval_md_headings( "model_name": "mock-model", "hierarchy_model_name": "mock-model", }, - profile=_Profile(), + profile=ParserDocumentProfile( + file_type="pdf", + category="generic document", + routing_category=PdfRoutingCategory.GENERIC, + page_count=3, + anatomy=anatomy, + ), relative_root="oversized.pdf", s3_key="uploads/job-oversized.pdf", job_id="job-oversized", ) assert calls["exclude_pages"] == {1} - assert calls["doc_agent"]["job_id"] == "job-oversized" assert len(calls["heading_dirs"]) == 2 expected_s3_keys = [ "tmp/mineru-shards/job-oversized/shard_0.pdf", From 474ef68595a5d15d36336c7bd50ecd4fdcb137ae Mon Sep 17 00:00:00 2001 From: chengke <404835780@qq.com> Date: Wed, 10 Jun 2026 16:15:57 +0800 Subject: [PATCH 2/4] feat: add TOC profiling support and upgrade Qwen models to 3.6-flash --- apps/api/.env.example | 5 +- apps/worker/.env.example | 5 +- .../services/document_agent/coordinator.py | 210 ++++- .../app/services/document_agent/manifest.py | 17 +- .../tools/extract_toc_with_boundaries.py | 82 +- .../document_agent/tools/match_h1_pages.py | 70 +- .../app/services/document_agent/validators.py | 2 +- .../document_parser/formats/image/parser.py | 6 +- .../formats/markdown/parser.py | 4 + .../document_parser/formats/pdf/parser.py | 197 +++-- .../document_parser/profiling/doc_profiler.py | 37 +- .../profiling/profile_model.py | 28 +- .../structure/body_boundary.py | 72 ++ .../structure/heading_hierarchy.py | 2 + .../structure/layout_parser.py | 25 + .../test_doc_profile_anatomy_contract.py | 783 ++++++++++++++++++ .../contract/test_parse_task_contract.py | 34 +- .../shared-python/shared/core/config/ai.py | 4 +- .../shared/core/config/storage.py | 8 + .../shared/services/retrieval/llm_adapter.py | 4 +- 20 files changed, 1402 insertions(+), 193 deletions(-) create mode 100644 apps/worker/app/services/document_parser/structure/body_boundary.py create mode 100644 apps/worker/tests/contract/test_doc_profile_anatomy_contract.py diff --git a/apps/api/.env.example b/apps/api/.env.example index 139aa41c..820e849a 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -81,8 +81,8 @@ ARK_API_KEY= # ARK_URL=https://ark.cn-beijing.volces.com/api/v3/chat/completions # NORMOL_MODEL=deepseek-chat # HIERARCHY_LLM_MODEL= -# IMAGE_MODEL=qwen3.5-flash -# IMAGE_MODEL_MAX=qwen3.5-flash +# IMAGE_MODEL=qwen3.6-flash +# IMAGE_MODEL_MAX=qwen3.6-flash # Optional retrieval overrides have code defaults. Retrieval is evidence-only: # evidence_text is the primary output and answer_text is always empty. Set @@ -95,6 +95,7 @@ MAX_FILE_SIZE=314572800 MAX_PDF_PAGE_LIMIT=200 OVERSIZED_PDF_SHARD_ENABLED=true OVERSIZED_PDF_SOFT_LIMIT=1500 +PDF_PROFILE_TOC_ENABLED=false MINERU_SHARD_CONCURRENCY=3 # Required for specific features: webhooks and callbacks diff --git a/apps/worker/.env.example b/apps/worker/.env.example index cc2a3687..c9029a58 100644 --- a/apps/worker/.env.example +++ b/apps/worker/.env.example @@ -86,8 +86,8 @@ ARK_API_KEY= # ARK_URL=https://ark.cn-beijing.volces.com/api/v3/chat/completions # NORMOL_MODEL=deepseek-chat # HIERARCHY_LLM_MODEL= -# IMAGE_MODEL=qwen3.5-flash -# IMAGE_MODEL_MAX=qwen3.5-flash +# IMAGE_MODEL=qwen3.6-flash +# IMAGE_MODEL_MAX=qwen3.6-flash # Optional retrieval overrides have code defaults. Retrieval is evidence-only: # evidence_text is the primary output and answer_text is always empty. Set @@ -120,6 +120,7 @@ MAX_FILE_SIZE=314572800 MAX_PDF_PAGE_LIMIT=200 OVERSIZED_PDF_SHARD_ENABLED=true OVERSIZED_PDF_SOFT_LIMIT=1500 +PDF_PROFILE_TOC_ENABLED=false MINERU_SHARD_CONCURRENCY=3 # Legacy parser compatibility fields. diff --git a/apps/worker/app/services/document_agent/coordinator.py b/apps/worker/app/services/document_agent/coordinator.py index 374adbf3..bcb4edb9 100644 --- a/apps/worker/app/services/document_agent/coordinator.py +++ b/apps/worker/app/services/document_agent/coordinator.py @@ -17,7 +17,9 @@ from app.services.document_agent.manifest import ( DocumentProfile, PageAnatomyMap, + TocResult, ToolContext, + ToolResult, ) from app.services.document_agent.persist import build_anatomy_map, persist_anatomy_map from app.services.document_agent.planner import ProfilePlanner @@ -59,6 +61,7 @@ def __init__( self.trace = ParseRunRecorder(job_id=job_id, db=db) self.ctx.trace = self.trace self.round_index = 0 + self._planner_cache: tuple[DocumentProfile, Any, ToolResult] | None = None def run(self) -> PageAnatomyMap: try: @@ -81,41 +84,49 @@ def run_structural(self) -> PageAnatomyMap: self._record_failure(exc) raise + def run_toc(self) -> TocResult: + try: + return self._run_toc() + except Exception as exc: + logger.warning( + "[document_agent] TOC profiling failed, degrading to empty TOC: {}", + exc, + ) + self.blackboard.toc_result = TocResult( + method="none", + notes=f"degraded: {type(exc).__name__}: {exc}", + failure_kind="degraded", + ) + self.blackboard.toc_hierarchies = None + return self.blackboard.toc_result + + def run_lightweight_anatomy(self) -> PageAnatomyMap: + try: + return self._run_lightweight_anatomy() + except Exception as exc: + self._record_failure(exc) + raise + def _run_coarse(self) -> DocumentProfile: self.state = DocumentAgentState.RUNNING if not self.blackboard.page_features: self._run_bootstrap() - profile, _initial_decision, planner_result = ProfilePlanner(self.ctx).propose() - self.blackboard.document_profile = profile - self.blackboard.global_signals["document_profile"] = profile.to_dict() - self.trace.record_step( - round_index=self.round_index, - actor="planner:coarse", - action_type="plan", - result=planner_result, - tool_name=None, - tool_args={}, + if self._should_run_toc_before_coarse(): + self._ensure_toc_profile(strict=False) + profile, _initial_decision, _planner_result = self._propose_profile( + actor="planner:coarse" ) - self.round_index += 1 return profile def _run_structural(self) -> PageAnatomyMap: self.state = DocumentAgentState.RUNNING if not self.blackboard.page_features: self._run_bootstrap() - self._run_toc_pipeline() - profile, initial_decision, planner_result = ProfilePlanner(self.ctx).propose() - self.blackboard.document_profile = profile - self.blackboard.global_signals["document_profile"] = profile.to_dict() - self.trace.record_step( - round_index=self.round_index, - actor="planner", - action_type="plan", - result=planner_result, - tool_name=None, - tool_args={}, + self._ensure_toc_profile(strict=True) + profile, initial_decision, _planner_result = self._propose_profile( + actor="planner" ) - self.round_index += 1 + self._run_h1_boundary_pipeline() executor_result = ReActExecutor( self.ctx, registry=REGISTRY, @@ -125,6 +136,48 @@ def _run_structural(self) -> PageAnatomyMap: if executor_result.verdict.status != "success": raise RuntimeError(f"profile aborted: {executor_result.verdict.rationale}") anatomy = build_anatomy_map(self.ctx) + self._persist_ready_anatomy(anatomy) + return anatomy + + def _run_toc(self) -> TocResult: + self.state = DocumentAgentState.RUNNING + if not self.blackboard.page_features: + self._run_bootstrap() + self._ensure_toc_profile(strict=False) + if self.blackboard.toc_result is None: + self.blackboard.toc_result = TocResult( + method="none", + notes="TOC extraction completed without a result", + ) + return self.blackboard.toc_result + + def _run_lightweight_anatomy(self) -> PageAnatomyMap: + self.state = DocumentAgentState.RUNNING + if not self.blackboard.page_features: + self._run_bootstrap() + if self.blackboard.toc_result is None: + self.blackboard.toc_result = TocResult( + method="none", + notes="TOC profiling disabled or not attempted", + ) + self._run_h1_boundary_pipeline() + result = REGISTRY.dispatch("propose.shard_plan", self.ctx, {}) + self.trace.record_step( + round_index=self.round_index, + actor="anatomy:propose.shard_plan", + action_type="anatomy", + result=result, + tool_name="propose.shard_plan", + tool_args={}, + ) + if result.status not in {"ok", "invalid"}: + raise RuntimeError(result.error or "propose.shard_plan failed") + self.round_index += 1 + anatomy = build_anatomy_map(self.ctx) + self._persist_ready_anatomy(anatomy) + return anatomy + + def _persist_ready_anatomy(self, anatomy: PageAnatomyMap) -> None: persist_result = persist_anatomy_map(self.ctx, {}) self.trace.record_step( round_index=self.round_index, @@ -144,7 +197,6 @@ def _run_structural(self) -> PageAnatomyMap: final_status="ready", summary=anatomy.trace_summary | self.trace.summary(), ) - return anatomy def _record_failure(self, exc: Exception) -> None: logger.error(f"[document_agent] profile failed: {exc}") @@ -175,21 +227,99 @@ def _run_bootstrap(self) -> None: raise RuntimeError(result.error or f"{tool_name} failed") self.round_index += 1 - def _run_toc_pipeline(self) -> None: - for tool_name in ( - "find.toc_anchor_pages", - "extract.toc_with_boundaries", - "match.h1_pages", - ): - result = REGISTRY.dispatch(tool_name, self.ctx, {}) - self.trace.record_step( - round_index=self.round_index, - actor=f"toc:{tool_name}", - action_type="toc", - result=result, + def _toc_result_requires_strict_retry(self) -> bool: + toc_result = self.blackboard.toc_result + return bool( + toc_result + and toc_result.method == "none" + and toc_result.failure_kind in {"confirm_failed", "degraded"} + ) + + def _should_run_toc_before_coarse(self) -> bool: + if self.ctx.settings.get("toc_before_coarse"): + return True + try: + page_limit = int(self.ctx.settings.get("toc_before_coarse_page_limit", 0)) + except (TypeError, ValueError): + page_limit = 0 + return page_limit > 0 and self.blackboard.page_count > page_limit + + def _ensure_toc_profile(self, *, strict: bool) -> None: + should_run = self.blackboard.toc_result is None + if strict and self._toc_result_requires_strict_retry(): + self.blackboard.toc_result = None + self.blackboard.toc_hierarchies = None + should_run = True + + if not should_run: + return + + self._planner_cache = None + try: + self._run_toc_extraction_pipeline() + except Exception as exc: + logger.warning( + "[document_agent] TOC profiling failed, " + "degrading to empty TOC: {}", + exc, + ) + self.blackboard.toc_result = TocResult( + method="none", + notes=f"degraded: {type(exc).__name__}: {exc}", + failure_kind="degraded", + ) + self.blackboard.toc_hierarchies = None + return + + if self.blackboard.toc_result is None: + self.blackboard.toc_result = TocResult( + method="none", + notes="TOC extraction completed without a result", + ) + + def _propose_profile(self, *, actor: str) -> tuple[DocumentProfile, Any, ToolResult]: + if self._planner_cache is not None: + return self._planner_cache + + profile, initial_decision, planner_result = ProfilePlanner(self.ctx).propose() + self.blackboard.document_profile = profile + self.blackboard.global_signals["document_profile"] = profile.to_dict() + self.trace.record_step( + round_index=self.round_index, + actor=actor, + action_type="plan", + result=planner_result, + tool_name=None, + tool_args={}, + ) + self.round_index += 1 + self._planner_cache = (profile, initial_decision, planner_result) + return self._planner_cache + + def _dispatch_profile_tool(self, *, tool_name: str, actor: str) -> ToolResult: + result = REGISTRY.dispatch(tool_name, self.ctx, {}) + self.trace.record_step( + round_index=self.round_index, + actor=actor, + action_type="toc", + result=result, + tool_name=tool_name, + tool_args={}, + ) + if result.status not in {"ok", "invalid"}: + raise RuntimeError(result.error or f"{tool_name} failed") + self.round_index += 1 + return result + + def _run_toc_extraction_pipeline(self) -> None: + for tool_name in ("find.toc_anchor_pages", "extract.toc_with_boundaries"): + self._dispatch_profile_tool( tool_name=tool_name, - tool_args={}, + actor=f"toc:{tool_name}", ) - if result.status not in {"ok", "invalid"}: - raise RuntimeError(result.error or f"{tool_name} failed") - self.round_index += 1 + + def _run_h1_boundary_pipeline(self) -> None: + self._dispatch_profile_tool( + tool_name="match.h1_pages", + actor="toc:match.h1_pages", + ) diff --git a/apps/worker/app/services/document_agent/manifest.py b/apps/worker/app/services/document_agent/manifest.py index b89f6164..2518e282 100644 --- a/apps/worker/app/services/document_agent/manifest.py +++ b/apps/worker/app/services/document_agent/manifest.py @@ -8,6 +8,7 @@ PageKind = Literal["normal", "table_heavy", "image_heavy", "low_content", "landscape"] +TocFailureKind = Literal["none", "confirm_failed", "rejected_all", "degraded"] ReflexionAction = Literal["tool_call", "verdict_now"] VerdictStatus = Literal["success", "abort"] @@ -107,16 +108,30 @@ def to_dict(self) -> dict[str, Any]: return asdict(self) +@dataclass +class TocEvidence: + page_index: int + source: str + confidence: float + reason: str = "" + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @dataclass class TocResult: toc_pages: list[int] = field(default_factory=list) - candidates: list[TocCandidate] = field(default_factory=list) + candidates: list[TocAnchorPage] = field(default_factory=list) + evidence: list[TocEvidence] = field(default_factory=list) method: Literal["toc_marker", "vlm_progressive", "vlm_batch", "visual_scan", "none"] = "none" notes: str = "" + failure_kind: TocFailureKind = "none" def to_dict(self) -> dict[str, Any]: data = asdict(self) data["candidates"] = [candidate.to_dict() for candidate in self.candidates] + data["evidence"] = [item.to_dict() for item in self.evidence] return data diff --git a/apps/worker/app/services/document_agent/tools/extract_toc_with_boundaries.py b/apps/worker/app/services/document_agent/tools/extract_toc_with_boundaries.py index 4a424a5c..ddb876e4 100644 --- a/apps/worker/app/services/document_agent/tools/extract_toc_with_boundaries.py +++ b/apps/worker/app/services/document_agent/tools/extract_toc_with_boundaries.py @@ -13,6 +13,7 @@ from app.services.document_agent.manifest import ( TocAnchorPage, + TocEvidence, TocResult, ToolContext, ToolResult, @@ -67,12 +68,12 @@ def _vlm_confirm_anchors( anchor_pages: list[TocAnchorPage], model: str, budget: Any | None = None, -) -> tuple[list[TocAnchorPage], bool]: +) -> tuple[list[TocAnchorPage], bool, list[TocEvidence]]: """Phase 1: send all anchor PNGs to VLM, ask which are real TOC starts.""" from shared.services.ai.openai_compatible_client_sync import get_openai_client if not anchor_pages: - return [], False + return [], False, [] import base64 @@ -120,7 +121,7 @@ def _vlm_confirm_anchors( est = estimate_tokens(str(content_parts[0]["text"])) + len(anchor_pages) * 800 if budget and not budget.try_reserve("visual", est): logger.warning("[extract.toc] insufficient visual budget for anchor confirmation") - return [], True + return [], True, [] try: client = get_openai_client(model=model) @@ -144,18 +145,50 @@ def _vlm_confirm_anchors( items = [] confirmed_pages: set[int] = set() + evidence_by_page: dict[int, TocEvidence] = {} for item in items: - if isinstance(item, dict) and item.get("is_toc_start"): - confirmed_pages.add(int(item["page"])) + if not isinstance(item, dict) or "page" not in item: + continue + page = int(item["page"]) + is_toc_start = bool(item.get("is_toc_start")) + if is_toc_start: + confirmed_pages.add(page) + raw_confidence = item.get("confidence") + try: + confidence = ( + float(raw_confidence) + if raw_confidence is not None + else (0.95 if is_toc_start else 0.05) + ) + except (TypeError, ValueError): + confidence = 0.95 if is_toc_start else 0.05 + evidence_by_page[page] = TocEvidence( + page_index=page, + source="vlm", + confidence=max(0.0, min(1.0, confidence)), + reason=str(item.get("reason") or ""), + ) confirmed = [a for a in anchor_pages if a.page in confirmed_pages] rejected = [a.page for a in anchor_pages if a.page not in confirmed_pages] + evidence = [ + evidence_by_page.get( + a.page, + TocEvidence( + page_index=a.page, + source="vlm", + confidence=0.05, + reason="VLM response omitted this candidate page", + ), + ) + for a in anchor_pages + ] logger.info( "[extract.toc] VLM confirmed {} TOC starts, rejected pages: {}", len(confirmed), rejected, ) - return confirmed, False + return confirmed, False, evidence except Exception as exc: if budget: budget.refund("visual", est=est) @@ -164,7 +197,7 @@ def _vlm_confirm_anchors( "falling back to no confirmed anchors (safe degradation)", exc, ) - return [], True + return [], True, [] # -- Main tool ----------------------------------------------------------------- @@ -190,6 +223,7 @@ def extract_toc_with_boundaries( ctx.blackboard.toc_result = TocResult( method="none", notes="No TOC anchor pages found by find.toc_anchor_pages", + failure_kind="none", ) return ToolResult( status="ok", @@ -203,6 +237,7 @@ def extract_toc_with_boundaries( ctx.blackboard.toc_result = TocResult( method="none", notes="No VLM model configured for TOC extraction", + failure_kind="degraded", ) return ToolResult( status="ok", @@ -219,18 +254,40 @@ def extract_toc_with_boundaries( os.makedirs(output_dir, exist_ok=True) # -- Phase 1: VLM confirm anchors ----------------------------------------- - confirmed, confirm_failed = _vlm_confirm_anchors(anchors, model, budget=ctx.budget) + confirmed, confirm_failed, confirm_evidence = _vlm_confirm_anchors( + anchors, model, budget=ctx.budget + ) if confirm_failed: warnings.append("vlm_anchor_confirmation_failed") debug_info["phase1_confirmed"] = [a.page for a in confirmed] - debug_info["phase1_rejected"] = [ - a.page for a in anchors if a not in confirmed - ] + debug_info["phase1_rejected"] = ( + [] if confirm_failed else [a.page for a in anchors if a not in confirmed] + ) + if confirm_failed: + debug_info["phase1_unconfirmed"] = [a.page for a in anchors] if not confirmed: + if confirm_failed: + ctx.blackboard.toc_result = TocResult( + candidates=list(anchors), + evidence=confirm_evidence, + method="none", + notes="VLM anchor confirmation failed; TOC candidates left unconfirmed", + failure_kind="confirm_failed", + ) + return ToolResult( + status="ok", + payload={"toc_count": 0}, + latency_ms=int((time.monotonic() - start) * 1000), + warnings=warnings, + debug=debug_info, + ) ctx.blackboard.toc_result = TocResult( + candidates=list(anchors), + evidence=confirm_evidence, method="none", notes="VLM rejected all TOC anchor candidates", + failure_kind="rejected_all", ) return ToolResult( status="ok", @@ -364,12 +421,14 @@ def extract_toc_with_boundaries( ctx.blackboard.toc_result = TocResult( toc_pages=all_toc_pages_sorted, + evidence=confirm_evidence, method="vlm_batch", notes=( f"VLM confirmed {len(confirmed)} TOC starts, " f"batch classify+extract found {toc_region_count} regions, " f"toc_pages={all_toc_pages_sorted}" ), + failure_kind="none", ) ctx.blackboard.toc_hierarchies = toc_hierarchies if toc_hierarchies else None ctx.blackboard.global_signals["vlm_toc_entries"] = { @@ -421,4 +480,3 @@ def extract_toc_with_boundaries( warnings=warnings, debug=debug_info, ) - diff --git a/apps/worker/app/services/document_agent/tools/match_h1_pages.py b/apps/worker/app/services/document_agent/tools/match_h1_pages.py index 1a6d0d07..7cae13bf 100644 --- a/apps/worker/app/services/document_agent/tools/match_h1_pages.py +++ b/apps/worker/app/services/document_agent/tools/match_h1_pages.py @@ -5,9 +5,7 @@ import base64 import json import os -import re import time -import unicodedata from typing import Any, cast from app.services.document_agent.manifest import ( @@ -19,42 +17,12 @@ from app.services.document_agent.pdf_text import read_page_texts from app.services.document_agent.registry import has_toc_result, register_tool from app.services.document_agent.visual import render_pages -from loguru import logger - - -# ── Text normalization for matching ────────────────────────────────────── - -_LEADING_NUMBER_RE = re.compile( - r"""^ - (?: - [#]+\s* - | 第\s*[零一二三四五六七八九十百千\d]+\s*[章节篇部分] - | [零一二三四五六七八九十百千]+\s*[、。,,] - | [((]\s*[零一二三四五六七八九十百千\d]+\s*[))] - | \d+(?:\.\d+)*\.?\s* - | [IVXLCDM]+\.?\s+ - | [A-Za-z]\.\s+ - | Chapter\s+\w+\s* - ) - """, - re.VERBOSE | re.IGNORECASE, +from app.services.document_parser.structure.body_boundary import ( + clean_toc_title, + extract_level1_titles, + normalize_heading_text, ) - -_PAGE_SUFFIX_RE = re.compile(r"[\s\.\-·…]+\d+\s*$") - - -def _normalize(text: str) -> str: - """Normalize text for fuzzy heading matching.""" - text = unicodedata.normalize("NFKC", text or "") - text = re.sub(r"\s+", " ", text).strip() - return text - - -def _clean_toc_title(title: str) -> str: - """Remove leading numbering/hashes and trailing page numbers from a TOC title.""" - cleaned = _PAGE_SUFFIX_RE.sub("", title or "").strip() - cleaned = _LEADING_NUMBER_RE.sub("", cleaned).strip() - return cleaned +from loguru import logger # ── C1: Unified grep matching ──────────────────────────────────────────── @@ -80,14 +48,14 @@ def grep_titles_in_pages( unmatched: list[str] = [] for title in titles: - normalized_title = _normalize(title) + normalized_title = normalize_heading_text(title) found = False for page in search_pages: text = page_texts.get(page, "") - if normalized_title in _normalize(text): + if normalized_title in normalize_heading_text(text): matched_line = "" for line in text.splitlines(): - if normalized_title in _normalize(line): + if normalized_title in normalize_heading_text(line): matched_line = line.strip()[:100] break candidates.append( @@ -130,30 +98,20 @@ def extract_children_titles( in_scope = False for entry in entries: if entry.get("level") == 1: - cleaned = _clean_toc_title(entry.get("heading", "")) - in_scope = _normalize(cleaned) == _normalize(parent_title) + cleaned = clean_toc_title(entry.get("heading", "")) + in_scope = normalize_heading_text(cleaned) == normalize_heading_text( + parent_title + ) continue if in_scope and entry.get("level") == 2: - cleaned = _clean_toc_title(entry.get("heading", "")) + cleaned = clean_toc_title(entry.get("heading", "")) if cleaned and len(cleaned) >= 2: titles.append(cleaned) return titles def _extract_level1_titles(toc_hierarchies: list[dict[str, Any]]) -> list[str]: - """Extract level-1 titles from toc_hierarchies. - - Each hierarchy dict contains ``toc_tree`` – a nested dict where top-level - keys are level-1 headings (values are sub-heading dicts). - """ - titles: list[str] = [] - for hier in toc_hierarchies: - toc_tree = hier.get("toc_tree") or {} - for raw_title in toc_tree.keys(): - cleaned = _clean_toc_title(raw_title) - if cleaned and len(cleaned) >= 2: - titles.append(cleaned) - return titles + return extract_level1_titles(toc_hierarchies) # ── C2: Lazy VLM verification ──────────────────────────────────────────── diff --git a/apps/worker/app/services/document_agent/validators.py b/apps/worker/app/services/document_agent/validators.py index 6f6f01f5..12ea930b 100644 --- a/apps/worker/app/services/document_agent/validators.py +++ b/apps/worker/app/services/document_agent/validators.py @@ -35,7 +35,7 @@ def validate_shard_plan( if shard.page_offset != shard.page_start - 1: errors.append(f"shard {shard.shard_index} page_offset mismatch") length = shard.page_end - shard.page_start + 1 - if plan.enabled and length > max_pages: + if length > max_pages: errors.append(f"shard {shard.shard_index} exceeds max_pages={max_pages}") if plan.enabled and length < min_pages: if is_last: diff --git a/apps/worker/app/services/document_parser/formats/image/parser.py b/apps/worker/app/services/document_parser/formats/image/parser.py index 251e2e28..df7555bb 100755 --- a/apps/worker/app/services/document_parser/formats/image/parser.py +++ b/apps/worker/app/services/document_parser/formats/image/parser.py @@ -55,7 +55,7 @@ def perceptual_hash(data: bytes) -> str: def _get_vision_client() -> OpenAICompatibleClientSync: """Create OpenAI-compatible client for vision models, auto-routing by IMAGE_MODEL name.""" - image_model = settings.IMAGE_MODEL or "qwen-vl-plus" + image_model = settings.IMAGE_MODEL or "qwen3.6-flash" return get_openai_client(model=image_model) @@ -141,9 +141,9 @@ def ask_image( urls_ = process_img_path4read(valid_paths, image_root_dir, size_cut) if task in ("summary-images", "atlas-page-info"): - image_model = settings.IMAGE_MODEL or "gpt-4-vision-preview" + image_model = settings.IMAGE_MODEL or "qwen3.6-flash" else: # OCR and image type classification use higher-capability models - image_model = settings.IMAGE_MODEL_MAX or "gpt-4-vision-preview" + image_model = settings.IMAGE_MODEL_MAX or "qwen3.6-flash" if len(urls_) > 0: prompt, temperature, top_p, max_tokens = build_prompt( diff --git a/apps/worker/app/services/document_parser/formats/markdown/parser.py b/apps/worker/app/services/document_parser/formats/markdown/parser.py index 7fdfd4df..925bd524 100755 --- a/apps/worker/app/services/document_parser/formats/markdown/parser.py +++ b/apps/worker/app/services/document_parser/formats/markdown/parser.py @@ -113,6 +113,7 @@ def eval_md_headings( model_name=None, output_dir=None, layout_json_path=None, + is_first_shard=True, ): """Evaluate markdown headings with optional TOC hierarchies context""" heading_preds = predict_heading_hierarchy( @@ -125,6 +126,7 @@ def eval_md_headings( model_name=model_name, output_dir=output_dir, layout_json_path=layout_json_path, + is_first_shard=is_first_shard, ) ) @@ -225,6 +227,7 @@ def parse_md( relative_root=None, toc_hierarchies=None, lines_with_heading=None, + is_first_shard=True, ): if lines_with_heading is not None: # ── Phase A bypass ── @@ -312,6 +315,7 @@ def parse_md( model_name=hierarchy_model_name, output_dir=output_dir, layout_json_path=layout_json_path, + is_first_shard=is_first_shard, ) # ── Phase B: MarkdownParseState traversal ── diff --git a/apps/worker/app/services/document_parser/formats/pdf/parser.py b/apps/worker/app/services/document_parser/formats/pdf/parser.py index 08cfab8d..791cc40e 100755 --- a/apps/worker/app/services/document_parser/formats/pdf/parser.py +++ b/apps/worker/app/services/document_parser/formats/pdf/parser.py @@ -9,7 +9,9 @@ ) from app.services.document_parser.providers.mineru.pdf_service import parse_via_full from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory +from app.services.document_parser.structure.toc_parser import detect_tocs_in_texts from app.services.document_parser.support.stage_profiler import stage_timer +from app.services.document_parser.support.text_helpers import normalize_md from loguru import logger from shared.core.config import settings @@ -37,19 +39,27 @@ def parse_pdfs( pdf_path, output_dir, base_llm_paras, relative_root, profile=profile ) - # ── Oversized PDF: doc_agent → shard → parallel MinerU → merge → parse_md ── - if profile and profile.page_count > settings.MAX_PDF_PAGE_LIMIT: - logger.info( - f"📄 Oversized PDF: {profile.page_count} pages > " - f"{settings.MAX_PDF_PAGE_LIMIT} limit, entering shard pipeline" - ) + # ── Unified anatomy path: DOC_PROFILE → shard wrapper → parse_md Phase B ── + if profile and getattr(profile, "anatomy", None) is not None: + is_oversized = profile.page_count > settings.MAX_PDF_PAGE_LIMIT + if is_oversized: + logger.info( + f"📄 Oversized PDF: {profile.page_count} pages > " + f"{settings.MAX_PDF_PAGE_LIMIT} limit, entering shard pipeline" + ) + else: + logger.info( + f"📄 Profile anatomy detected, entering PDF shard pipeline for {filename}" + ) try: - return _parse_oversized_pdf( + return _parse_pdf_via_shards( pdf_path, filename, output_dir, base_llm_paras, profile=profile, relative_root=relative_root, s3_key=s3_key, job_id=job_id, ) except Exception as exc: + if not is_oversized: + raise logger.exception( "Oversized PDF shard pipeline failed for {} (pages={})", filename, @@ -77,11 +87,11 @@ def parse_pdfs( ) -def _parse_oversized_pdf( +def _parse_pdf_via_shards( pdf_path, filename, output_dir, base_llm_paras, profile=None, relative_root=None, s3_key=None, job_id=None, ): - """Handle PDFs exceeding MinerU's page limit via shard-first hierarchy. + """Handle PDFs via the unified shard-first hierarchy pipeline. Pipeline: 1. DOC_AGENT → shard plan + TOC @@ -99,8 +109,14 @@ def _parse_oversized_pdf( eval_md_headings, merge_html_tables, ) - from app.services.document_parser.formats.pdf.shard_merger import merge_images, merge_shard_lines - from app.services.document_parser.formats.pdf.shard_splitter import bin_pack_shards, split_pdf + from app.services.document_parser.formats.pdf.shard_merger import ( + merge_images, + merge_shard_lines, + ) + from app.services.document_parser.formats.pdf.shard_splitter import ( + bin_pack_shards, + split_pdf, + ) work_dir: str | None = None temp_shard_s3_keys: list[str] = [] @@ -110,21 +126,20 @@ def _parse_oversized_pdf( anatomy = getattr(profile, "anatomy", None) if anatomy is None: raise RuntimeError( - f"Oversized PDF profile for {filename} is missing structural anatomy" + f"PDF profile for {filename} is missing structural anatomy" ) - if not anatomy.shard_plan.enabled or not anatomy.shard_plan.shards: + if not anatomy.shard_plan.shards: raise RuntimeError( - f"Oversized PDF profile for {filename} did not produce a shard plan" + f"PDF profile for {filename} did not produce a shard plan" ) agent_shards = anatomy.shard_plan.shards # 2. Extract TOC info from anatomy for page exclusion and heading constraint toc_pages: set[int] = set() - toc_hierarchies = None + toc_hierarchies = anatomy.toc_hierarchies if anatomy.toc_result and anatomy.toc_result.toc_pages: toc_pages = set(anatomy.toc_result.toc_pages) - toc_hierarchies = anatomy.toc_hierarchies logger.info( f"📌 DOC_AGENT TOC detected: {len(toc_pages)} pages to exclude " f"({sorted(toc_pages)}), " @@ -146,55 +161,77 @@ def _parse_oversized_pdf( f"({ms.page_count} pages)" ) - # 4. Physically split PDF (exclude TOC pages if detected) - work_dir = os.path.join(output_dir, "_shards") - os.makedirs(work_dir, exist_ok=True) - with stage_timer("pdf.split", filename=filename): - shard_pdf_paths, _page_remap = split_pdf( - pdf_path, merged_shards, work_dir, - exclude_pages=toc_pages if toc_pages else None, - ) + fast_path_original_pdf = len(merged_shards) == 1 and not toc_pages - temp_shard_s3_keys = [ - _build_temp_shard_s3_key( - source_s3_key=s3_key, - job_id=job_id, - filename=filename, - shard_index=shard_index, - ) - for shard_index, _shard_pdf_path in enumerate(shard_pdf_paths) - ] - - # 5. Parse each shard via MinerU (parallel) - shard_output_dirs: list[str | None] = [None] * len(shard_pdf_paths) + # 4. Parse via MinerU. The 1-shard/no-TOC case keeps the original + # PDF/S3 object to avoid temporary split/upload churn. + shard_output_dirs: list[str | None] concurrency = settings.MINERU_SHARD_CONCURRENCY - def _parse_single_shard(shard_idx, shard_pdf): - assert work_dir is not None - shard_out = os.path.join(work_dir, f"shard_{shard_idx}_output") - os.makedirs(shard_out, exist_ok=True) - shard_filename = ( - f"{os.path.splitext(filename)[0]}_shard{shard_idx}.pdf" - ) - shard_s3_key = temp_shard_s3_keys[shard_idx] - logger.info( - f" 🔄 MinerU shard_{shard_idx}: parsing via S3 URL " - f"({shard_s3_key})" - ) - parse_via_full(shard_pdf, shard_filename, shard_out, s3_key=shard_s3_key) - return shard_out - - with stage_timer( - "pdf.mineru_parallel", filename=filename, shard_count=len(shard_pdf_paths) - ): - with ThreadPoolExecutor(max_workers=concurrency) as executor: - futures = { - executor.submit(_parse_single_shard, i, shard_pdf_path): i - for i, shard_pdf_path in enumerate(shard_pdf_paths) - } - for future in as_completed(futures): - idx = futures[future] - shard_output_dirs[idx] = future.result() + if fast_path_original_pdf: + logger.info("📄 Single shard without TOC pages; using original PDF fast path") + with stage_timer("pdf.extract.single_shard_fast", filename=filename): + parse_via_full(pdf_path, filename, output_dir, s3_key=s3_key) + shard_output_dirs = [output_dir] + else: + # Physically split PDF when TOC pages must be excluded or multiple + # MinerU requests are required. + work_dir = os.path.join(output_dir, "_shards") + os.makedirs(work_dir, exist_ok=True) + with stage_timer("pdf.split", filename=filename): + shard_pdf_paths, _page_remap = split_pdf( + pdf_path, merged_shards, work_dir, + exclude_pages=toc_pages if toc_pages else None, + ) + + if not shard_pdf_paths: + raise RuntimeError( + f"PDF shard split for {filename} produced no shard PDFs" + ) + + temp_shard_s3_keys = [ + _build_temp_shard_s3_key( + source_s3_key=s3_key, + job_id=job_id, + filename=filename, + shard_index=shard_index, + ) + for shard_index, _shard_pdf_path in enumerate(shard_pdf_paths) + ] + + # 5. Parse each shard via MinerU (parallel) + shard_output_dirs = [None] * len(shard_pdf_paths) + + def _parse_single_shard(shard_idx, shard_pdf): + assert work_dir is not None + shard_out = os.path.join(work_dir, f"shard_{shard_idx}_output") + os.makedirs(shard_out, exist_ok=True) + shard_filename = ( + f"{os.path.splitext(filename)[0]}_shard{shard_idx}.pdf" + ) + shard_s3_key = temp_shard_s3_keys[shard_idx] + logger.info( + f" 🔄 MinerU shard_{shard_idx}: parsing via S3 URL " + f"({shard_s3_key})" + ) + parse_via_full( + shard_pdf, shard_filename, shard_out, s3_key=shard_s3_key + ) + return shard_out + + with stage_timer( + "pdf.mineru_parallel", + filename=filename, + shard_count=len(shard_pdf_paths), + ): + with ThreadPoolExecutor(max_workers=concurrency) as executor: + futures = { + executor.submit(_parse_single_shard, i, shard_pdf_path): i + for i, shard_pdf_path in enumerate(shard_pdf_paths) + } + for future in as_completed(futures): + idx = futures[future] + shard_output_dirs[idx] = future.result() # 6. Per-shard heading prediction (parallel) @dataclass @@ -204,12 +241,16 @@ class ShardHeadingResult: heading_count: int smart_parse = base_llm_paras.get("smart_title_parse", True) + toc_model_name = base_llm_paras.get("model_name", settings.NORMOL_MODEL) hierarchy_model_name = ( base_llm_paras.get("hierarchy_model_name") or base_llm_paras.get("model_name", settings.NORMOL_MODEL) ) - def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingResult: + def _predict_shard_headings( + shard_idx: int, + shard_out_dir: str, + ) -> ShardHeadingResult: """Run full heading prediction pipeline on a single shard's full.md.""" md_path = os.path.join(shard_out_dir, "full.md") if not os.path.exists(md_path): @@ -220,10 +261,18 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR md_lines = [line.strip() for line in md_lines if line.strip() != ""] md_lines = merge_html_tables(md_lines) - # TOC context: first TOC shared by all shards; subsequent TOCs assigned - # by page boundary. For simplicity, all TOCs are passed since pred_titles - # only matches headings actually present in this shard's content. + is_first_shard = shard_idx == 0 shard_toc = toc_hierarchies + if shard_toc is None and is_first_shard and _md_has_toc_keyword(md_lines): + logger.info( + f"📌 shard_{shard_idx}: TOC keyword found without profile TOC; " + "reusing markdown TOC detector" + ) + shard_toc, md_lines = detect_tocs_in_texts( + md_lines, + model_name=toc_model_name, + hierarchy_model_name=hierarchy_model_name, + ) lines_with_heading = eval_md_headings( md_lines, @@ -237,6 +286,7 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR if os.path.exists(os.path.join(shard_out_dir, "layout.json")) else None ), + is_first_shard=is_first_shard, ) heading_count = sum(1 for line in lines_with_heading if line.startswith("#")) @@ -250,7 +300,9 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR heading_count=heading_count, ) - shard_heading_results: list[ShardHeadingResult | None] = [None] * len(shard_output_dirs) + shard_heading_results: list[ShardHeadingResult | None] = [None] * len( + shard_output_dirs + ) with stage_timer( "pdf.shard_headings", filename=filename, shard_count=len(shard_output_dirs) @@ -274,7 +326,7 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR # Compute level offsets: continuation shards get shifted deeper. shard_offsets: list[int] = [] - for shard in agent_shards: + for shard in agent_shards[: len(complete_heading_results)]: if shard.is_continuation: shard_offsets.append(max(shard.split_depth - 1, 0)) else: @@ -296,7 +348,8 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR ) with stage_timer("pdf.merge_images", filename=filename): - merge_images(shard_output_dirs, output_dir) + if not fast_path_original_pdf: + merge_images(shard_output_dirs, output_dir) logger.info("✅ Shard-first hierarchy complete, entering parse_md Phase B") @@ -314,6 +367,11 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR _cleanup_local_shard_workspace(work_dir) +def _md_has_toc_keyword(md_lines: list[str]) -> bool: + toc_keywords = {"目录", "目次", "tableofcontents", "contents"} + return any(normalize_md(line) in toc_keywords for line in md_lines) + + def _build_temp_shard_s3_key( *, source_s3_key: str | None, @@ -334,6 +392,7 @@ def _source_key_stem(source_s3_key: str | None) -> str | None: stem, _extension = os.path.splitext(key_name) return stem or None + def _sanitize_temp_storage_segment(value: object) -> str: normalized = re.sub(r"[^A-Za-z0-9_.-]+", "-", str(value)).strip(".-") return normalized or "document" diff --git a/apps/worker/app/services/document_parser/profiling/doc_profiler.py b/apps/worker/app/services/document_parser/profiling/doc_profiler.py index 28076bfb..8944c7ab 100644 --- a/apps/worker/app/services/document_parser/profiling/doc_profiler.py +++ b/apps/worker/app/services/document_parser/profiling/doc_profiler.py @@ -9,7 +9,11 @@ build_oversized_pdf_processing_failed_exception, raise_if_oversized_pdf_not_supported, ) -from app.services.document_parser.profiling.profile_model import ParserDocumentProfile +from app.services.document_parser.profiling.profile_model import ( + ParserDocumentProfile, + ParserTocProfile, + TocEvidence, +) from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory from shared.core.config import settings @@ -67,6 +71,8 @@ def _profile_pdf( "planner_model": settings.IMAGE_MODEL, "vlm_model": settings.IMAGE_MODEL, "model": settings.HIERARCHY_LLM_MODEL or settings.NORMOL_MODEL, + "toc_before_coarse": settings.PDF_PROFILE_TOC_ENABLED, + "toc_before_coarse_page_limit": settings.MAX_PDF_PAGE_LIMIT, }, ) agent_profile = coordinator.run_coarse() @@ -95,13 +101,42 @@ def _profile_pdf( if not profile.is_atlas: try: profile.anatomy = coordinator.run_structural() + profile.toc = _map_toc_profile(coordinator) except Exception as exc: raise build_oversized_pdf_processing_failed_exception( page_count=profile.page_count, original_exception=exc, ) from exc + elif settings.PDF_PROFILE_TOC_ENABLED: + if not profile.is_atlas: + profile.anatomy = coordinator.run_lightweight_anatomy() + profile.toc = _map_toc_profile(coordinator) return profile +def _map_toc_profile(coordinator: ProfileCoordinator) -> ParserTocProfile: + toc_result = coordinator.blackboard.toc_result + if toc_result is None: + return ParserTocProfile() + evidence = [ + TocEvidence( + page_index=item.page_index, + source=item.source, + confidence=item.confidence, + reason=item.reason, + ) + for item in toc_result.evidence + ] + source = "pdf_vlm" if toc_result.method != "none" else "none" + return ParserTocProfile( + toc_pages=list(toc_result.toc_pages), + hierarchies=coordinator.blackboard.toc_hierarchies, + evidence=evidence, + source=source, + method=toc_result.method, + notes=toc_result.notes, + ) + + __all__ = ["profile_document"] diff --git a/apps/worker/app/services/document_parser/profiling/profile_model.py b/apps/worker/app/services/document_parser/profiling/profile_model.py index e553ca41..9334ae77 100644 --- a/apps/worker/app/services/document_parser/profiling/profile_model.py +++ b/apps/worker/app/services/document_parser/profiling/profile_model.py @@ -6,6 +6,28 @@ from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory +@dataclass +class TocEvidence: + page_index: int + source: str + confidence: float + reason: str = "" + + +@dataclass +class ParserTocProfile: + toc_pages: list[int] = field(default_factory=list) + hierarchies: list[dict[str, Any]] | None = None + evidence: list[TocEvidence] = field(default_factory=list) + source: str = "none" + method: str = "none" + notes: str = "" + + @property + def has_toc(self) -> bool: + return bool(self.toc_pages or self.hierarchies) + + @dataclass class ParserDocumentProfile: """Parser-entry document profile used for routing and PDF anatomy reuse.""" @@ -18,6 +40,8 @@ class ParserDocumentProfile: language: str = "unknown" reasoning: str = "" category_rationale: str = "" + toc: ParserTocProfile = field(default_factory=ParserTocProfile) + granularity: str = "page" anatomy: Any | None = None metrics: dict[str, Any] = field(default_factory=dict) @@ -48,9 +72,11 @@ def summary(self) -> str: f"routing={self.routing_category.value}, " f"scanned={self.is_scanned}, pages={self.page_count}" ) + if self.toc.has_toc: + parts += f", toc={self.toc.method}" if self.has_structural_anatomy: parts += ", anatomy=True" return parts -__all__ = ["ParserDocumentProfile"] +__all__ = ["ParserDocumentProfile", "ParserTocProfile", "TocEvidence"] diff --git a/apps/worker/app/services/document_parser/structure/body_boundary.py b/apps/worker/app/services/document_parser/structure/body_boundary.py new file mode 100644 index 00000000..a7ad6d3d --- /dev/null +++ b/apps/worker/app/services/document_parser/structure/body_boundary.py @@ -0,0 +1,72 @@ +"""Line-based body boundary helpers for TOC-derived headings.""" + +from __future__ import annotations + +import re +import unicodedata +from typing import Any + + +_LEADING_NUMBER_RE = re.compile( + r"""^ + (?: + [#]+\s* + | 第\s*[零一二三四五六七八九十百千\d]+\s*[章节篇部分] + | [零一二三四五六七八九十百千]+\s*[、。,,] + | [((]\s*[零一二三四五六七八九十百千\d]+\s*[))] + | \d+(?:\.\d+)*\.?\s* + | [IVXLCDM]+\.?\s+ + | [A-Za-z]\.\s+ + | Chapter\s+\w+\s* + ) + """, + re.VERBOSE | re.IGNORECASE, +) + +_PAGE_SUFFIX_RE = re.compile(r"[\s\.\-·…]+\d+\s*$") + + +def normalize_heading_text(text: str) -> str: + """Normalize text for fuzzy heading matching.""" + text = unicodedata.normalize("NFKC", text or "") + text = re.sub(r"\s+", " ", text).strip() + return text + + +def clean_toc_title(title: str) -> str: + """Remove leading numbering/hashes and trailing page numbers from a TOC title.""" + cleaned = _PAGE_SUFFIX_RE.sub("", title or "").strip() + cleaned = _LEADING_NUMBER_RE.sub("", cleaned).strip() + return cleaned + + +def extract_level1_titles(toc_hierarchies: list[dict[str, Any]]) -> list[str]: + """Extract cleaned level-1 titles from TOC hierarchy payloads.""" + titles: list[str] = [] + for hier in toc_hierarchies: + toc_tree = hier.get("toc_tree") or {} + for raw_title in toc_tree.keys(): + cleaned = clean_toc_title(str(raw_title)) + if cleaned and len(cleaned) >= 2: + titles.append(cleaned) + return titles + + +def find_first_body_boundary( + lines: list[str], + level1_titles: list[str], +) -> int | None: + """Return the first line index matching a TOC level-1 title, if any.""" + normalized_titles = [ + normalize_heading_text(title) + for title in level1_titles + if normalize_heading_text(title) + ] + if not normalized_titles: + return None + + for index, line in enumerate(lines): + normalized_line = normalize_heading_text(line.lstrip("#").strip()) + if any(title in normalized_line for title in normalized_titles): + return index + return None diff --git a/apps/worker/app/services/document_parser/structure/heading_hierarchy.py b/apps/worker/app/services/document_parser/structure/heading_hierarchy.py index 20a9b170..30f6f0f1 100644 --- a/apps/worker/app/services/document_parser/structure/heading_hierarchy.py +++ b/apps/worker/app/services/document_parser/structure/heading_hierarchy.py @@ -20,6 +20,7 @@ class HeadingHierarchyInput: output_dir: str | None = None layout_json_path: str | None = None first_toc_ele_num: int | None = None + is_first_shard: bool = True def predict_heading_hierarchy(heading_input: HeadingHierarchyInput) -> pd.DataFrame: @@ -34,4 +35,5 @@ def predict_heading_hierarchy(heading_input: HeadingHierarchyInput) -> pd.DataFr output_dir=heading_input.output_dir, layout_json_path=heading_input.layout_json_path, first_toc_ele_num=heading_input.first_toc_ele_num, + is_first_shard=heading_input.is_first_shard, ) diff --git a/apps/worker/app/services/document_parser/structure/layout_parser.py b/apps/worker/app/services/document_parser/structure/layout_parser.py index 7c2545a6..6c49910c 100755 --- a/apps/worker/app/services/document_parser/structure/layout_parser.py +++ b/apps/worker/app/services/document_parser/structure/layout_parser.py @@ -20,6 +20,10 @@ from app.services.document_parser.structure.heading_tree import ( tree_to_dataframe as heading_tree_to_dataframe, ) +from app.services.document_parser.structure.body_boundary import ( + extract_level1_titles, + find_first_body_boundary, +) from app.services.document_parser.support.stage_profiler import stage_timer from app.services.document_parser.tables.table_text_parser import df2md from gevent.pool import Pool as GeventPool @@ -415,6 +419,12 @@ def _resolve_first_toc_boundary(toc_hierarchies=None, first_toc_ele_num=None): return resolved_start +def _first_toc_range_unit(toc_hierarchies=None) -> str | None: + if not toc_hierarchies: + return None + return toc_hierarchies[0].get("toc_range_unit") + + def pred_titles( infos, doc_type, @@ -426,6 +436,7 @@ def pred_titles( output_dir=None, layout_json_path=None, first_toc_ele_num=None, + is_first_shard=True, ): """ predict title hierarchy @@ -441,6 +452,7 @@ def pred_titles( output_dir: output directory for saving intermediate CSV results layout_json_path: path to layout.json for META features (optional) first_toc_ele_num: ele_num of the first TOC block in DOCX (for pre-TOC exclusion) + is_first_shard: whether PDF-derived markdown belongs to the first document shard """ model_name = _resolve_hierarchy_model_name(model_name) logger.info( @@ -465,6 +477,18 @@ def pred_titles( first_toc_start = None if doc_type == "md": first_toc_start = _resolve_first_toc_boundary(toc_hierarchies=toc_hierarchies) + if ( + first_toc_start is None + and is_first_shard + and _first_toc_range_unit(toc_hierarchies) == "page" + ): + level1_titles = extract_level1_titles(toc_hierarchies or []) + first_toc_start = find_first_body_boundary(infos, level1_titles) + if first_toc_start is not None: + logger.info( + f"📌 Demoting PDF front matter before first TOC H1 line " + f"(id < {first_toc_start})" + ) elif doc_type == "docx": first_toc_start = _resolve_first_toc_boundary( toc_hierarchies=toc_hierarchies, @@ -494,6 +518,7 @@ def pred_titles( and len(toc_hierarchies) > 1 and doc_type in {"md", "docx"} and smart_parse + and _first_toc_range_unit(toc_hierarchies) != "page" ): # Multiple TOCs divide the document into independent zones. # Each zone gets its own naive + LLM pipeline with zone-specific TOC context. diff --git a/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py b/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py new file mode 100644 index 00000000..f1d108ad --- /dev/null +++ b/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py @@ -0,0 +1,783 @@ +from __future__ import annotations + +import os +from pathlib import Path +from types import SimpleNamespace + +os.environ.setdefault("DATABASE_URL", "postgresql+asyncpg://test:test@localhost/test") +os.environ.setdefault("TMP_PATH", "/tmp/knowhere-test") +os.environ.setdefault("S3_BUCKET_NAME", "test-uploads") +os.environ.setdefault("S3_ACCESS_KEY_ID", "test") +os.environ.setdefault("S3_SECRET_ACCESS_KEY", "test") +os.environ.setdefault("S3_TEMP_PATH", "/tmp") + +from app.services.document_agent.coordinator import ProfileCoordinator +from app.services.document_agent.manifest import ( + DocumentProfile, + H1BoundaryResult, + PageAnatomyMap, + PageFeature, + PageLabel, + Shard, + ShardPlan, + TocAnchorPage, + TocEvidence as AgentTocEvidence, + TocResult, + ToolResult, +) +from app.services.document_agent.validators import validate_shard_plan +from app.services.document_parser.profiling.doc_profiler import profile_document +from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory + + +def _page_feature(page: int = 1) -> PageFeature: + return PageFeature( + page=page, + raw_text_length=20, + text_density=0.1, + image_coverage=0.0, + image_count=0, + table_count=0, + drawings_count=0, + orientation="portrait", + width=72.0, + height=72.0, + is_blank_like=False, + text_lines_preview=["Section 1"], + ) + + +def test_run_toc_degrades_to_empty_result_on_standard_failure(tmp_path: Path) -> None: + coordinator = ProfileCoordinator( + pdf_path=str(tmp_path / "standard.pdf"), + job_id="job-toc-fail-soft", + output_dir=str(tmp_path / "profile"), + ) + coordinator.blackboard.page_count = 1 + coordinator.blackboard.page_features = [_page_feature()] + + def _fail_toc_extraction() -> None: + raise RuntimeError("VLM JSON parse failed") + + coordinator._run_toc_extraction_pipeline = _fail_toc_extraction # type: ignore[method-assign] + + toc_result = coordinator.run_toc() + + assert toc_result.method == "none" + assert toc_result.toc_pages == [] + assert toc_result.failure_kind == "degraded" + assert "degraded" in toc_result.notes + assert coordinator.blackboard.toc_hierarchies is None + + +def test_run_lightweight_anatomy_builds_single_shard_without_planner_llm( + tmp_path: Path, +) -> None: + output_dir = tmp_path / "profile" + coordinator = ProfileCoordinator( + pdf_path=str(tmp_path / "standard.pdf"), + job_id="job-lightweight", + output_dir=str(output_dir), + settings={"shard_threshold": 200}, + ) + coordinator.blackboard.page_count = 2 + coordinator.blackboard.page_features = [_page_feature(1), _page_feature(2)] + coordinator.blackboard.page_labels = [ + PageLabel(page=1, kind="normal", confidence=1.0), + PageLabel(page=2, kind="normal", confidence=1.0), + ] + coordinator.blackboard.doc_stats = {"page_count": 2} + coordinator.blackboard.global_signals["page_kind_counts"] = {"normal": 2} + coordinator.blackboard.document_profile = DocumentProfile( + is_scanned=False, + category="Research Report", + routing_category=PdfRoutingCategory.GENERIC.value, + ) + coordinator.blackboard.toc_result = TocResult(method="none") + + anatomy = coordinator.run_lightweight_anatomy() + + assert anatomy.shard_plan.enabled is False + assert len(anatomy.shard_plan.shards) == 1 + assert anatomy.shard_plan.shards[0].page_start == 1 + assert anatomy.shard_plan.shards[0].page_end == 2 + assert anatomy.toc_result.method == "none" + assert (output_dir / "anatomy_map.json").exists() + + +def test_run_structural_retries_transient_confirm_failed_toc_result( + monkeypatch, + tmp_path: Path, +) -> None: + coordinator = ProfileCoordinator( + pdf_path=str(tmp_path / "oversized.pdf"), + job_id="job-suspect-empty-toc", + output_dir=str(tmp_path / "profile"), + ) + (tmp_path / "profile").mkdir() + coordinator.blackboard.page_count = 3 + coordinator.blackboard.page_features = [_page_feature(1), _page_feature(2)] + coordinator.blackboard.page_labels = [ + PageLabel(page=1, kind="normal", confidence=1.0), + PageLabel(page=2, kind="normal", confidence=1.0), + ] + coordinator.blackboard.doc_stats = {"page_count": 3} + coordinator.blackboard.global_signals["page_kind_counts"] = {"normal": 3} + coordinator.blackboard.document_profile = DocumentProfile( + is_scanned=False, + category="Prospectus", + routing_category=PdfRoutingCategory.GENERIC.value, + ) + coordinator.blackboard.toc_result = TocResult( + candidates=[ + TocAnchorPage(page=17, png_path="/tmp/toc_anchor_page_17.png", source="text_scan") + ], + evidence=[ + AgentTocEvidence( + page_index=17, + source="vlm", + confidence=0.05, + reason="rejected", + ) + ], + method="none", + notes="VLM anchor confirmation failed; TOC candidates left unconfirmed", + failure_kind="confirm_failed", + ) + + calls: list[str] = [] + + def fake_toc_extraction() -> None: + calls.append("toc") + coordinator.blackboard.toc_result = TocResult(toc_pages=[17], method="vlm_batch") + coordinator.blackboard.toc_hierarchies = [ + {"toc_range": [17, 17], "toc_range_unit": "page", "toc_tree": {}} + ] + + def fake_h1_boundary() -> None: + calls.append("h1") + coordinator.blackboard.h1_result = H1BoundaryResult(method="toc_grep") + + def fake_persist(_anatomy): + calls.append("persist") + + monkeypatch.setattr(coordinator, "_run_toc_extraction_pipeline", fake_toc_extraction) + monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary) + monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist) + + from app.services.document_agent import coordinator as coordinator_module + + monkeypatch.setattr( + coordinator_module.ProfilePlanner, + "propose", + lambda self: ( + coordinator.blackboard.document_profile, + None, + ToolResult(status="ok", payload={}), + ), + ) + + class FakeExecutor: + def __init__(self, *_args, **_kwargs) -> None: + pass + + def run(self): + coordinator.blackboard.shard_plan = ShardPlan( + enabled=True, + reason="too_large", + shards=[ + Shard( + shard_index=0, + page_start=1, + page_end=3, + page_offset=0, + anchor_type="forced_max_size", + anchor_evidence="fixture", + confidence=1.0, + ) + ], + ) + return SimpleNamespace( + success=True, + verdict=SimpleNamespace(status="success", rationale="ok"), + trace_summary={}, + ) + + monkeypatch.setattr(coordinator_module, "ReActExecutor", FakeExecutor) + + anatomy = coordinator.run_structural() + + assert calls[:2] == ["toc", "h1"] + assert anatomy.toc_result.toc_pages == [17] + + +def test_run_structural_trusts_rejected_all_toc_and_fails_open( + monkeypatch, + tmp_path: Path, +) -> None: + coordinator = ProfileCoordinator( + pdf_path=str(tmp_path / "oversized.pdf"), + job_id="job-rejected-all-toc-fail-open", + output_dir=str(tmp_path / "profile"), + ) + (tmp_path / "profile").mkdir() + coordinator.blackboard.page_count = 3 + coordinator.blackboard.page_features = [_page_feature(1), _page_feature(2)] + coordinator.blackboard.page_labels = [ + PageLabel(page=1, kind="normal", confidence=1.0), + PageLabel(page=2, kind="normal", confidence=1.0), + ] + coordinator.blackboard.doc_stats = {"page_count": 3} + coordinator.blackboard.global_signals["page_kind_counts"] = {"normal": 3} + coordinator.blackboard.document_profile = DocumentProfile( + is_scanned=False, + category="Prospectus", + routing_category=PdfRoutingCategory.GENERIC.value, + ) + coordinator.blackboard.toc_result = TocResult( + candidates=[ + TocAnchorPage( + page=17, + png_path="/tmp/toc_anchor_page_17.png", + source="text_scan", + ) + ], + method="none", + notes="VLM rejected all TOC anchor candidates", + failure_kind="rejected_all", + ) + + def fake_toc_extraction() -> None: + raise AssertionError("rejected_all should be trusted and not retried") + + calls: list[str] = [] + + def fake_h1_boundary() -> None: + calls.append("h1") + coordinator.blackboard.h1_result = H1BoundaryResult(method="none") + + def fake_persist(_anatomy): + calls.append("persist") + + monkeypatch.setattr(coordinator, "_run_toc_extraction_pipeline", fake_toc_extraction) + monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary) + monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist) + + from app.services.document_agent import coordinator as coordinator_module + + monkeypatch.setattr( + coordinator_module.ProfilePlanner, + "propose", + lambda self: ( + coordinator.blackboard.document_profile, + None, + ToolResult(status="ok", payload={}), + ), + ) + + class FakeExecutor: + def __init__(self, *_args, **_kwargs) -> None: + pass + + def run(self): + coordinator.blackboard.shard_plan = ShardPlan( + enabled=True, + reason="too_large", + shards=[ + Shard( + shard_index=0, + page_start=1, + page_end=3, + page_offset=0, + anchor_type="forced_max_size", + anchor_evidence="fixture", + confidence=1.0, + ) + ], + ) + return SimpleNamespace( + success=True, + verdict=SimpleNamespace(status="success", rationale="ok"), + trace_summary={}, + ) + + monkeypatch.setattr(coordinator_module, "ReActExecutor", FakeExecutor) + + anatomy = coordinator.run_structural() + + assert calls == ["h1", "persist"] + assert anatomy.toc_result.failure_kind == "rejected_all" + assert anatomy.toc_result.toc_pages == [] + + +def test_run_coarse_runs_toc_before_planner_for_oversized_and_reuses_planner( + monkeypatch, + tmp_path: Path, +) -> None: + coordinator = ProfileCoordinator( + pdf_path=str(tmp_path / "oversized.pdf"), + job_id="job-toc-before-coarse", + output_dir=str(tmp_path / "profile"), + settings={"toc_before_coarse_page_limit": 2}, + ) + (tmp_path / "profile").mkdir() + coordinator.blackboard.page_count = 3 + coordinator.blackboard.page_features = [_page_feature(1), _page_feature(2)] + coordinator.blackboard.page_labels = [ + PageLabel(page=1, kind="normal", confidence=1.0), + PageLabel(page=2, kind="normal", confidence=1.0), + ] + coordinator.blackboard.doc_stats = {"page_count": 3} + coordinator.blackboard.global_signals["page_kind_counts"] = {"normal": 3} + + calls: list[str] = [] + + def fake_toc_extraction() -> None: + calls.append("toc") + coordinator.blackboard.toc_result = TocResult(toc_pages=[17], method="vlm_batch") + coordinator.blackboard.toc_hierarchies = [ + {"toc_range": [17, 17], "toc_range_unit": "page", "toc_tree": {}} + ] + + def fake_h1_boundary() -> None: + calls.append("h1") + coordinator.blackboard.h1_result = H1BoundaryResult(method="toc_grep") + + def fake_persist(_anatomy): + calls.append("persist") + + monkeypatch.setattr(coordinator, "_run_toc_extraction_pipeline", fake_toc_extraction) + monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary) + monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist) + + from app.services.document_agent import coordinator as coordinator_module + + def fake_propose(_self): + calls.append("planner") + return ( + DocumentProfile( + is_scanned=False, + category="Prospectus", + routing_category=PdfRoutingCategory.GENERIC.value, + ), + None, + ToolResult(status="ok", payload={}), + ) + + monkeypatch.setattr(coordinator_module.ProfilePlanner, "propose", fake_propose) + + class FakeExecutor: + def __init__(self, *_args, **_kwargs) -> None: + pass + + def run(self): + coordinator.blackboard.shard_plan = ShardPlan( + enabled=True, + reason="too_large", + shards=[ + Shard( + shard_index=0, + page_start=1, + page_end=3, + page_offset=0, + anchor_type="forced_max_size", + anchor_evidence="fixture", + confidence=1.0, + ) + ], + ) + return SimpleNamespace( + success=True, + verdict=SimpleNamespace(status="success", rationale="ok"), + trace_summary={}, + ) + + monkeypatch.setattr(coordinator_module, "ReActExecutor", FakeExecutor) + + coordinator.run_coarse() + anatomy = coordinator.run_structural() + + assert calls == ["toc", "planner", "h1", "persist"] + assert anatomy.toc_result.toc_pages == [17] + + +def test_anchor_confirmation_failure_requires_one_strict_retry(tmp_path: Path) -> None: + coordinator = ProfileCoordinator( + pdf_path=str(tmp_path / "oversized.pdf"), + job_id="job-confirm-failed-not-suspect", + output_dir=str(tmp_path / "profile"), + ) + coordinator.blackboard.toc_result = TocResult( + candidates=[ + TocAnchorPage( + page=17, + png_path="/tmp/toc_anchor_page_17.png", + source="text_scan", + ) + ], + method="none", + notes="VLM anchor confirmation failed; TOC candidates left unconfirmed", + failure_kind="confirm_failed", + ) + + assert coordinator._toc_result_requires_strict_retry() is True + + +def test_oversized_single_shard_plan_is_invalid() -> None: + report = validate_shard_plan( + ShardPlan( + enabled=False, + reason="not_needed", + shards=[ + Shard( + shard_index=0, + page_start=1, + page_end=407, + page_offset=0, + anchor_type="forced_max_size", + anchor_evidence="final shard", + confidence=1.0, + ) + ], + ), + page_count=407, + min_pages=20, + max_pages=200, + ) + + assert report.valid is False + assert report.errors == ["shard 0 exceeds max_pages=200"] + + +def test_standard_pdf_profile_toc_flag_off_preserves_current_behavior( + monkeypatch, + tmp_path: Path, +) -> None: + from app.services.document_parser.profiling import doc_profiler + + fake_instances: list[object] = [] + + class FakeCoordinator: + def __init__(self, **_kwargs) -> None: + self.calls: list[str] = [] + self.blackboard = SimpleNamespace( + page_count=2, + doc_stats={"page_count": 2}, + global_signals={}, + toc_result=None, + toc_hierarchies=None, + ) + fake_instances.append(self) + + def run_coarse(self) -> DocumentProfile: + self.calls.append("run_coarse") + return DocumentProfile( + is_scanned=False, + category="Research Report", + routing_category=PdfRoutingCategory.GENERIC.value, + ) + + def run_toc(self) -> TocResult: + self.calls.append("run_toc") + raise AssertionError("run_toc should be flag-gated for standard PDFs") + + def run_lightweight_anatomy(self): + self.calls.append("run_lightweight_anatomy") + raise AssertionError("lightweight anatomy should be flag-gated") + + monkeypatch.setattr(doc_profiler, "ProfileCoordinator", FakeCoordinator) + monkeypatch.setattr(doc_profiler.settings, "PDF_PROFILE_TOC_ENABLED", False) + monkeypatch.setattr(doc_profiler.settings, "MAX_PDF_PAGE_LIMIT", 200) + + profile = profile_document( + str(tmp_path / "standard.pdf"), + "standard.pdf", + job_id="job-flag-off", + output_dir=str(tmp_path), + ) + + assert profile.toc.has_toc is False + assert profile.anatomy is None + assert fake_instances[0].calls == ["run_coarse"] + + +def test_standard_pdf_profile_toc_flag_on_builds_toc_and_lightweight_anatomy( + monkeypatch, + tmp_path: Path, +) -> None: + from app.services.document_parser.profiling import doc_profiler + + fake_anatomy = object() + + class FakeCoordinator: + def __init__(self, **_kwargs) -> None: + self.calls: list[str] = [] + self.blackboard = SimpleNamespace( + page_count=2, + doc_stats={"page_count": 2}, + global_signals={}, + toc_result=None, + toc_hierarchies=None, + ) + + def run_coarse(self) -> DocumentProfile: + self.calls.append("run_coarse") + self.blackboard.toc_result = TocResult( + toc_pages=[2], + evidence=[ + AgentTocEvidence( + page_index=2, + source="vlm", + confidence=0.95, + reason="table of contents", + ) + ], + method="vlm_batch", + ) + self.blackboard.toc_hierarchies = [ + {"toc_range": [2, 2], "toc_range_unit": "page", "toc_tree": {}} + ] + return DocumentProfile( + is_scanned=False, + category="Research Report", + routing_category=PdfRoutingCategory.GENERIC.value, + ) + + def run_toc(self) -> TocResult: + self.calls.append("run_toc") + raise AssertionError("run_toc should be no-op after TOC-before-coarse") + + def run_lightweight_anatomy(self): + self.calls.append("run_lightweight_anatomy") + return fake_anatomy + + fake_instances: list[FakeCoordinator] = [] + + class CapturingCoordinator(FakeCoordinator): + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + fake_instances.append(self) + + monkeypatch.setattr(doc_profiler, "ProfileCoordinator", CapturingCoordinator) + monkeypatch.setattr(doc_profiler.settings, "PDF_PROFILE_TOC_ENABLED", True) + monkeypatch.setattr(doc_profiler.settings, "MAX_PDF_PAGE_LIMIT", 200) + + profile = profile_document( + str(tmp_path / "standard.pdf"), + "standard.pdf", + job_id="job-flag-on", + output_dir=str(tmp_path), + ) + + assert fake_instances[0].calls == [ + "run_coarse", + "run_lightweight_anatomy", + ] + assert profile.toc.has_toc is True + assert profile.toc.method == "vlm_batch" + assert profile.toc.evidence[0].confidence == 0.95 + assert profile.anatomy is fake_anatomy + + +def test_pdf_shard_pipeline_accepts_single_shard_fast_path( + monkeypatch, + tmp_path: Path, +) -> None: + from app.services.document_parser.formats.markdown import parser as markdown_parser + from app.services.document_parser.formats.pdf import parser as pdf_parser + from app.services.document_parser.formats.pdf import shard_splitter + + output_dir = tmp_path / "out" + output_dir.mkdir() + calls: list[str] = [] + + def fake_parse_via_full(pdf_path, filename, out_dir, s3_key=None): + calls.append(f"parse:{filename}:{s3_key}") + Path(out_dir, "full.md").write_text("1. Introduction\nBody\n", encoding="utf-8") + + def fail_split(*_args, **_kwargs): + raise AssertionError("single shard without TOC should not split") + + def fake_eval_md_headings(md_lines, *_args, **_kwargs): + return [f"# {line}" if line.startswith("1.") else line for line in md_lines] + + def fake_parse_md(*_args, **kwargs): + calls.append("parse_md") + return {"lines": kwargs["lines_with_heading"]} + + monkeypatch.setattr(pdf_parser, "parse_via_full", fake_parse_via_full) + monkeypatch.setattr(shard_splitter, "split_pdf", fail_split) + monkeypatch.setattr(markdown_parser, "eval_md_headings", fake_eval_md_headings) + monkeypatch.setattr(pdf_parser, "parse_md", fake_parse_md) + + profile = SimpleNamespace( + anatomy=PageAnatomyMap( + job_id="job-single-shard", + file_path=str(tmp_path / "standard.pdf"), + page_count=2, + page_features=[_page_feature(1), _page_feature(2)], + page_labels=[ + PageLabel(page=1, kind="normal", confidence=1.0), + PageLabel(page=2, kind="normal", confidence=1.0), + ], + toc_result=TocResult(method="none"), + h1_result=H1BoundaryResult(method="none"), + shard_plan=ShardPlan( + enabled=False, + reason="not_needed", + shards=[ + Shard( + shard_index=0, + page_start=1, + page_end=2, + page_offset=0, + anchor_type="forced_max_size", + anchor_evidence="document within shard threshold", + confidence=1.0, + ) + ], + ), + ) + ) + + result = pdf_parser._parse_pdf_via_shards( + str(tmp_path / "standard.pdf"), + "standard.pdf", + str(output_dir), + {"smart_title_parse": False, "model_name": "test-model"}, + profile=profile, + s3_key="uploads/source.pdf", + ) + + assert calls == ["parse:standard.pdf:uploads/source.pdf", "parse_md"] + assert result["lines"] == ["# 1. Introduction", "Body"] + + +def test_pdf_first_shard_reuses_markdown_toc_detector_when_profile_misses_toc( + monkeypatch, + tmp_path: Path, +) -> None: + from app.services.document_parser.formats.markdown import parser as markdown_parser + from app.services.document_parser.formats.pdf import parser as pdf_parser + + output_dir = tmp_path / "out" + output_dir.mkdir() + detector_calls: list[list[str]] = [] + heading_contexts: list[object] = [] + + def fake_parse_via_full(_pdf_path, _filename, out_dir, s3_key=None): + Path(out_dir, "full.md").write_text( + "Contents\n1 Introduction .... 2\n1 Introduction\nBody\n", + encoding="utf-8", + ) + + def fake_detect_tocs_in_texts(md_lines, **_kwargs): + detector_calls.append(list(md_lines)) + return ( + [ + { + "toc_range": [0, 1], + "toc_range_unit": "line", + "toc_tree": {"Introduction": {}}, + } + ], + ["1 Introduction", "Body"], + ) + + def fake_eval_md_headings(md_lines, *_args, **kwargs): + heading_contexts.append(kwargs.get("toc_hierarchies")) + return [f"# {line}" if line.startswith("1 ") else line for line in md_lines] + + monkeypatch.setattr(pdf_parser, "parse_via_full", fake_parse_via_full) + monkeypatch.setattr(pdf_parser, "detect_tocs_in_texts", fake_detect_tocs_in_texts) + monkeypatch.setattr(markdown_parser, "eval_md_headings", fake_eval_md_headings) + monkeypatch.setattr( + pdf_parser, + "parse_md", + lambda *_args, **kwargs: {"lines": kwargs["lines_with_heading"]}, + ) + + profile = SimpleNamespace( + anatomy=PageAnatomyMap( + job_id="job-missed-toc", + file_path=str(tmp_path / "standard.pdf"), + page_count=3, + page_features=[_page_feature(1), _page_feature(2), _page_feature(3)], + page_labels=[ + PageLabel(page=1, kind="normal", confidence=1.0), + PageLabel(page=2, kind="normal", confidence=1.0), + PageLabel(page=3, kind="normal", confidence=1.0), + ], + toc_result=TocResult(method="none"), + h1_result=H1BoundaryResult(method="none"), + shard_plan=ShardPlan( + enabled=False, + reason="not_needed", + shards=[ + Shard( + shard_index=0, + page_start=1, + page_end=3, + page_offset=0, + anchor_type="forced_max_size", + anchor_evidence="document within shard threshold", + confidence=1.0, + ) + ], + ), + ) + ) + + result = pdf_parser._parse_pdf_via_shards( + str(tmp_path / "standard.pdf"), + "standard.pdf", + str(output_dir), + {"smart_title_parse": False, "model_name": "test-model"}, + profile=profile, + ) + + assert len(detector_calls) == 1 + assert heading_contexts[0][0]["toc_range_unit"] == "line" + assert result["lines"] == ["# 1 Introduction", "Body"] + + +def test_page_based_toc_demotes_front_matter_only_on_first_shard() -> None: + from app.services.document_parser.structure.layout_parser import pred_titles + + toc_hierarchies = [ + { + "toc_range": [2, 2], + "toc_range_unit": "page", + "toc_tree": {"Risk Factors": {}}, + } + ] + lines = [ + "1. Cover", + "2. Legal Notice", + "3. Risk Factors", + "4. Business", + ] + + first_shard = pred_titles( + lines, + doc_type="md", + toc_hierarchies=toc_hierarchies, + smart_parse=False, + is_first_shard=True, + ) + continuation = pred_titles( + lines, + doc_type="md", + toc_hierarchies=toc_hierarchies, + smart_parse=False, + is_first_shard=False, + ) + + first_levels = dict(zip(first_shard["id"], first_shard["level"], strict=False)) + continuation_levels = dict( + zip(continuation["id"], continuation["level"], strict=False) + ) + assert first_levels[0] == -1 + assert first_levels[1] == -1 + assert first_levels[2] > 0 + assert continuation_levels[0] > 0 diff --git a/apps/worker/tests/contract/test_parse_task_contract.py b/apps/worker/tests/contract/test_parse_task_contract.py index c2f33121..6548eb43 100644 --- a/apps/worker/tests/contract/test_parse_task_contract.py +++ b/apps/worker/tests/contract/test_parse_task_contract.py @@ -498,6 +498,13 @@ def test_oversized_pdf_shard_failure_preserves_processing_error( monkeypatch.setenv("S3_SECRET_ACCESS_KEY", "test") monkeypatch.setenv("S3_TEMP_PATH", str(tmp_path)) + from app.services.document_agent.manifest import ( + H1BoundaryResult, + PageAnatomyMap, + Shard, + ShardPlan, + TocResult, + ) from app.services.document_parser.formats.pdf import parser as pdf_parser from app.services.document_parser.profiling.profile_model import ParserDocumentProfile from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory @@ -508,7 +515,7 @@ def test_oversized_pdf_shard_failure_preserves_processing_error( def _fail_oversized_parse(*args, **kwargs): raise RuntimeError("MinerU shard 0 failed") - monkeypatch.setattr(pdf_parser, "_parse_oversized_pdf", _fail_oversized_parse) + monkeypatch.setattr(pdf_parser, "_parse_pdf_via_shards", _fail_oversized_parse) with pytest.raises(PDFParsingException) as exc_info: pdf_parser.parse_pdfs( @@ -521,6 +528,30 @@ def _fail_oversized_parse(*args, **kwargs): category="generic document", routing_category=PdfRoutingCategory.GENERIC, page_count=2, + anatomy=PageAnatomyMap( + job_id="job-oversized-fail", + file_path=str(tmp_path / "source.pdf"), + page_count=2, + page_features=[], + page_labels=[], + toc_result=TocResult(method="none"), + h1_result=H1BoundaryResult(method="none"), + shard_plan=ShardPlan( + enabled=True, + reason="too_large", + shards=[ + Shard( + shard_index=0, + page_start=1, + page_end=2, + page_offset=0, + anchor_type="forced_max_size", + anchor_evidence="fixture", + confidence=1.0, + ) + ], + ), + ), ), ) @@ -630,6 +661,7 @@ def _identity_eval_md_headings( model_name=None, output_dir=None, layout_json_path=None, + is_first_shard=True, ): calls.setdefault("heading_dirs", []).append(output_dir) return list(md_lines) diff --git a/packages/shared-python/shared/core/config/ai.py b/packages/shared-python/shared/core/config/ai.py index 9d035051..559e53f3 100644 --- a/packages/shared-python/shared/core/config/ai.py +++ b/packages/shared-python/shared/core/config/ai.py @@ -29,12 +29,12 @@ class AIConfig(BaseModel): description="Heading and outline recognition model; falls back to NORMOL_MODEL when empty", ) IMAGE_MODEL: str = Field( - default="qwen3.5-flash", + default="qwen3.6-flash", description="Image model for image summary, atlas, and OCR flows", ) IMAGE_MODEL_MAX: str = Field( - default="qwen3.5-flash", + default="qwen3.6-flash", description="Higher-capability image model for OCR and image type classification", ) RETRIEVAL_DECOMPOSITION_ENABLED: bool = Field( diff --git a/packages/shared-python/shared/core/config/storage.py b/packages/shared-python/shared/core/config/storage.py index 3daf7551..75046091 100644 --- a/packages/shared-python/shared/core/config/storage.py +++ b/packages/shared-python/shared/core/config/storage.py @@ -79,6 +79,14 @@ class StorageConfig(BaseModel): description="Soft page limit for oversized PDF shard pipeline. " "Documents exceeding this are rejected with a contact-support message.", ) + PDF_PROFILE_TOC_ENABLED: bool = Field( + default=False, + description=( + "Enable PDF TOC extraction during parser-entry DOC_PROFILE for " + "standard and atlas PDFs. Oversized PDFs still run TOC profiling as " + "part of the shard pipeline." + ), + ) MINERU_SHARD_CONCURRENCY: int = Field( default=3, ge=1, diff --git a/packages/shared-python/shared/services/retrieval/llm_adapter.py b/packages/shared-python/shared/services/retrieval/llm_adapter.py index f0a981b9..8de055a0 100644 --- a/packages/shared-python/shared/services/retrieval/llm_adapter.py +++ b/packages/shared-python/shared/services/retrieval/llm_adapter.py @@ -174,7 +174,7 @@ def create_retrieval_vlm_fn( ) -> LLMFn | None: """Create an async VLM callable for image-aware answer generation. - Uses the IMAGE_MODEL (e.g. qwen3.5-flash) for multimodal input. + Uses the IMAGE_MODEL (e.g. qwen3.6-flash) for multimodal input. Returns None when the image model is not configured. The returned function accepts the same ``LLMFnInput`` type as @@ -183,7 +183,7 @@ def create_retrieval_vlm_fn( """ from shared.core.config import settings - effective_model = model or getattr(settings, 'IMAGE_MODEL', '') or 'qwen3.5-flash' + effective_model = model or getattr(settings, 'IMAGE_MODEL', '') or 'qwen3.6-flash' if not _has_llm_credentials(): logger.debug('retrieval: no LLM credentials for VLM, image-aware answering disabled') From 637ba234e421141531e7e2224b209d0810b7e8a7 Mon Sep 17 00:00:00 2001 From: chengke <404835780@qq.com> Date: Wed, 10 Jun 2026 16:29:38 +0800 Subject: [PATCH 3/4] fix: update Qwen model references to version 3.6-flash in AGENTS.md and README.md --- AGENTS.md | 6 +++--- README.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 6cceff82..9e68dc18 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -217,9 +217,9 @@ flowchart LR |:---|:---|:---| | Text/table summarization | `NORMOL_MODEL` | `deepseek-chat` | | Heading hierarchy recognition | `HIERARCHY_LLM_MODEL` | Falls back to `NORMOL_MODEL` | -| Image description (VLM) | `IMAGE_MODEL` | `qwen3.5-flash` | -| Image OCR / Q&A | `IMAGE_MODEL_MAX` | `qwen3.5-flash` | -| PDF coarse classification | `IMAGE_MODEL` | `qwen3.5-flash` | +| Image description (VLM) | `IMAGE_MODEL` | `qwen3.6-flash` | +| Image OCR / Q&A | `IMAGE_MODEL_MAX` | `qwen3.6-flash` | +| PDF coarse classification | `IMAGE_MODEL` | `qwen3.6-flash` | --- diff --git a/README.md b/README.md index 703876dc..6aca6f3a 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ A: Knowhere uses MinerU as its default parser because it performs best in our te **Q: What LLM / VLM dependencies does Knowhere have?** -A: By default, DeepSeek (`deepseek-chat`) handles text and table summarization, and Qwen-VL (`qwen3.5-flash`) handles image OCR and descriptions. Knowhere is model-agnostic. Swap in OpenAI, DashScope, Zhipu, or Volcengine via environment variables. +A: By default, DeepSeek (`deepseek-chat`) handles text and table summarization, and Qwen-VL (`qwen3.6-flash`) handles image OCR and descriptions. Knowhere is model-agnostic. Swap in OpenAI, DashScope, Zhipu, or Volcengine via environment variables. **Q: How is Agentic Retrieval different from traditional RAG?** From ac87030e6ca5131f234f70cc7618f7f5143d7ef8 Mon Sep 17 00:00:00 2001 From: chengke <404835780@qq.com> Date: Wed, 10 Jun 2026 16:47:51 +0800 Subject: [PATCH 4/4] test: stabilize worker contract imports --- apps/worker/tests/contract/conftest.py | 39 +++++++++++++++-- .../test_doc_profile_anatomy_contract.py | 43 ++++++++++--------- 2 files changed, 57 insertions(+), 25 deletions(-) diff --git a/apps/worker/tests/contract/conftest.py b/apps/worker/tests/contract/conftest.py index 41ba3846..f9738370 100644 --- a/apps/worker/tests/contract/conftest.py +++ b/apps/worker/tests/contract/conftest.py @@ -17,6 +17,7 @@ _REPO_ROOT: Path = Path(__file__).resolve().parents[4] _WORKER_ROOT: Path = _REPO_ROOT / "apps" / "worker" +_API_ROOT: Path = _REPO_ROOT / "apps" / "api" _DOCUMENT_INGESTION_TASK_NAMES: tuple[str, ...] = ( "app.core.tasks.document_ingestion_tasks.upload_url_file_task", "app.core.tasks.kb_tasks.upload_url_file_task", @@ -25,6 +26,39 @@ ) +def _module_loaded_from(module_name: str, root: Path) -> bool: + module = sys.modules.get(module_name) + if module is None: + return False + + root_value = str(root) + module_file = getattr(module, "__file__", None) + if isinstance(module_file, str) and module_file.startswith(root_value): + return True + + module_paths = getattr(module, "__path__", ()) + return any(str(module_path).startswith(root_value) for module_path in module_paths) + + +def _ensure_worker_import_context() -> None: + worker_root_value = str(_WORKER_ROOT) + if worker_root_value in sys.path: + sys.path.remove(worker_root_value) + sys.path.insert(0, worker_root_value) + + cached_module_names = list(sys.modules) + for module_name in cached_module_names: + if module_name == "app" or module_name.startswith("app."): + if _module_loaded_from(module_name, _API_ROOT): + sys.modules.pop(module_name, None) + + +@pytest.fixture(autouse=True) +def worker_contract_import_context() -> Generator[None, None, None]: + _ensure_worker_import_context() + yield + + def _resolve_postgresql_executable() -> str | None: configured_executable: str | None = os.getenv("PYTEST_POSTGRESQL_EXECUTABLE") @@ -65,10 +99,7 @@ def worker_contract_environment( contract_runtime.configure_contract_environment(monkeypatch, postgresql_proc) asyncio.run(contract_runtime.prepare_contract_storage()) - worker_root_value = str(_WORKER_ROOT) - if worker_root_value in sys.path: - sys.path.remove(worker_root_value) - sys.path.insert(0, worker_root_value) + _ensure_worker_import_context() contract_runtime.clear_application_modules() from shared.core.celery_app import get_celery_app diff --git a/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py b/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py index f1d108ad..448e1630 100644 --- a/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py +++ b/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py @@ -1,5 +1,6 @@ from __future__ import annotations +import importlib import os from pathlib import Path from types import SimpleNamespace @@ -11,6 +12,7 @@ os.environ.setdefault("S3_SECRET_ACCESS_KEY", "test") os.environ.setdefault("S3_TEMP_PATH", "/tmp") +from app.services.document_agent import coordinator as coordinator_module from app.services.document_agent.coordinator import ProfileCoordinator from app.services.document_agent.manifest import ( DocumentProfile, @@ -26,8 +28,12 @@ ToolResult, ) from app.services.document_agent.validators import validate_shard_plan +from app.services.document_parser.formats.pdf import parser as pdf_parser +from app.services.document_parser.formats.pdf import shard_splitter +from app.services.document_parser.profiling import doc_profiler from app.services.document_parser.profiling.doc_profiler import profile_document from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory +from app.services.document_parser.structure.layout_parser import pred_titles def _page_feature(page: int = 1) -> PageFeature: @@ -165,8 +171,6 @@ def fake_persist(_anatomy): monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary) monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist) - from app.services.document_agent import coordinator as coordinator_module - monkeypatch.setattr( coordinator_module.ProfilePlanner, "propose", @@ -263,8 +267,6 @@ def fake_persist(_anatomy): monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary) monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist) - from app.services.document_agent import coordinator as coordinator_module - monkeypatch.setattr( coordinator_module.ProfilePlanner, "propose", @@ -350,8 +352,6 @@ def fake_persist(_anatomy): monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary) monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist) - from app.services.document_agent import coordinator as coordinator_module - def fake_propose(_self): calls.append("planner") return ( @@ -453,8 +453,6 @@ def test_standard_pdf_profile_toc_flag_off_preserves_current_behavior( monkeypatch, tmp_path: Path, ) -> None: - from app.services.document_parser.profiling import doc_profiler - fake_instances: list[object] = [] class FakeCoordinator: @@ -505,8 +503,6 @@ def test_standard_pdf_profile_toc_flag_on_builds_toc_and_lightweight_anatomy( monkeypatch, tmp_path: Path, ) -> None: - from app.services.document_parser.profiling import doc_profiler - fake_anatomy = object() class FakeCoordinator: @@ -583,10 +579,6 @@ def test_pdf_shard_pipeline_accepts_single_shard_fast_path( monkeypatch, tmp_path: Path, ) -> None: - from app.services.document_parser.formats.markdown import parser as markdown_parser - from app.services.document_parser.formats.pdf import parser as pdf_parser - from app.services.document_parser.formats.pdf import shard_splitter - output_dir = tmp_path / "out" output_dir.mkdir() calls: list[str] = [] @@ -605,9 +597,16 @@ def fake_parse_md(*_args, **kwargs): calls.append("parse_md") return {"lines": kwargs["lines_with_heading"]} + active_markdown_parser = importlib.import_module( + "app.services.document_parser.formats.markdown.parser" + ) monkeypatch.setattr(pdf_parser, "parse_via_full", fake_parse_via_full) monkeypatch.setattr(shard_splitter, "split_pdf", fail_split) - monkeypatch.setattr(markdown_parser, "eval_md_headings", fake_eval_md_headings) + monkeypatch.setattr( + active_markdown_parser, + "eval_md_headings", + fake_eval_md_headings, + ) monkeypatch.setattr(pdf_parser, "parse_md", fake_parse_md) profile = SimpleNamespace( @@ -657,9 +656,6 @@ def test_pdf_first_shard_reuses_markdown_toc_detector_when_profile_misses_toc( monkeypatch, tmp_path: Path, ) -> None: - from app.services.document_parser.formats.markdown import parser as markdown_parser - from app.services.document_parser.formats.pdf import parser as pdf_parser - output_dir = tmp_path / "out" output_dir.mkdir() detector_calls: list[list[str]] = [] @@ -688,9 +684,16 @@ def fake_eval_md_headings(md_lines, *_args, **kwargs): heading_contexts.append(kwargs.get("toc_hierarchies")) return [f"# {line}" if line.startswith("1 ") else line for line in md_lines] + active_markdown_parser = importlib.import_module( + "app.services.document_parser.formats.markdown.parser" + ) monkeypatch.setattr(pdf_parser, "parse_via_full", fake_parse_via_full) monkeypatch.setattr(pdf_parser, "detect_tocs_in_texts", fake_detect_tocs_in_texts) - monkeypatch.setattr(markdown_parser, "eval_md_headings", fake_eval_md_headings) + monkeypatch.setattr( + active_markdown_parser, + "eval_md_headings", + fake_eval_md_headings, + ) monkeypatch.setattr( pdf_parser, "parse_md", @@ -742,8 +745,6 @@ def fake_eval_md_headings(md_lines, *_args, **kwargs): def test_page_based_toc_demotes_front_matter_only_on_first_shard() -> None: - from app.services.document_parser.structure.layout_parser import pred_titles - toc_hierarchies = [ { "toc_range": [2, 2],