From 5d5033ac1a535d307643f3f9cf0f3fcf1bd12745 Mon Sep 17 00:00:00 2001
From: chengke <404835780@qq.com>
Date: Tue, 9 Jun 2026 17:19:53 +0800
Subject: [PATCH 1/4] feat: unify PDF document profile routing

---
 AGENTS.md                                     |  26 +-
 .../bootstrap/aggregate_stats.py              |  37 +
 .../services/document_agent/coordinator.py    | 150 ++--
 .../app/services/document_agent/manifest.py   |   1 +
 .../document_agent/planner/planner.py         |   9 +-
 .../document_agent/planner/prompts.py         |  14 +-
 .../services/document_agent/profile_agent.py  |  22 +-
 .../formats/atlas/classifier.py               | 179 -----
 .../document_parser/formats/atlas/parser.py   |   6 +-
 .../document_parser/formats/pdf/parser.py     |  28 +-
 .../formats/pdf/shard_splitter.py             |  38 +-
 .../orchestration/parse_session.py            |  68 +-
 .../profiling/doc_profile_model.py            |  75 --
 .../profiling/doc_profile_pdf.py              | 671 ------------------
 .../document_parser/profiling/doc_profiler.py | 100 ++-
 .../profiling/profile_model.py                |  56 ++
 .../document_parser/profiling/taxonomy.py     |  24 +
 .../contract/test_parse_task_contract.py      | 108 ++-
 18 files changed, 425 insertions(+), 1187 deletions(-)
 delete mode 100644 apps/worker/app/services/document_parser/formats/atlas/classifier.py
 delete mode 100644 apps/worker/app/services/document_parser/profiling/doc_profile_model.py
 delete mode 100644 apps/worker/app/services/document_parser/profiling/doc_profile_pdf.py
 create mode 100644 apps/worker/app/services/document_parser/profiling/profile_model.py
 create mode 100644 apps/worker/app/services/document_parser/profiling/taxonomy.py

diff --git a/AGENTS.md b/AGENTS.md
index a625cdf5..6cceff82 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -124,8 +124,13 @@ flowchart TB
 
 This is the typed `ParseOutput` entry for all file types. The parser flow:
 
-1. **Profiles** the document via `profiling.doc_profiler.profile_document()` to detect
-   file type, page count, and special categories (e.g. `atlas`).
+1. **Profiles** the document via `profiling.doc_profiler.profile_document()`. PDF
+   profiling uses `document_agent` as the single PyMuPDF feature source, then runs
+   a VLM coarse classification that returns two fields: an open-ended semantic
+   `category` (for example, `Financial Prospectus`) and a routing-only
+   `routing_category` (one of `atlas`, `scanned`, `slides`, `generic`). Oversized
+   non-atlas PDFs additionally run the structural anatomy stage once at the entry
+   point and pass the resulting shard plan to PDF parsing.
 2. **Routes** to the appropriate parser based on file extension.
 3. **Post-processes**: cleans up unreferenced images, compresses PNG→JPG.
 4. Returns typed parse output with task-local artifact paths.
@@ -134,7 +139,7 @@ This is the typed `ParseOutput` entry for all file types. The parser flow:
 
 | Extension | Parser Module | Strategy |
 |:---|:---|:---|
-| `.pdf` | `formats.pdf.parser.parse_pdfs` | MinerU API → Markdown parser → `structure.layout_parser.pred_titles` |
+| `.pdf` | `formats.pdf.parser.parse_pdfs` | DOC_PROFILE `routing_category` dispatch: `atlas` → atlas parser; oversized non-atlas PDFs with an entry-point anatomy map → sharded MinerU; otherwise MinerU API → Markdown parser → `structure.layout_parser.pred_titles` |
 | `.docx` | `formats.docx.parser.parse_docx` + `convert_doc2dics` | OXML iteration → heading detection → hierarchical tree |
 | `.doc` | `conversion.legacy_converter.doc_to_docx` → `.docx` pipeline | LibreOffice headless conversion first |
 | `.pptx` | `formats.pptx.parser.parse_pptx` | iLoveAPI PPTX→PDF → MinerU pipeline |
@@ -187,12 +192,19 @@ Key logic in `parse_docx()`:
 - **Table handling**: `table2html()` converts python-docx Table to HTML with
   accurate `rowspan`/`colspan` via direct OXML inspection.
 
-### PDF Parsing: MinerU Pipeline
+### PDF Parsing: DOC_PROFILE + MinerU Pipeline
 
 ```mermaid
 flowchart LR
-    PDF[formats.pdf.parser] --> MinerU[MinerU Cloud API]
-    MinerU --> MDFile[Markdown + layout.json]
+    PDF[profiling.doc_profiler.profile_document] --> Probe[document_agent probe_page_features]
+    Probe --> Coarse[VLM coarse category]
+    Coarse -->|atlas| Atlas[formats.atlas.parser]
+    Coarse -->|oversized generic/scanned/slides| Anatomy[document_agent structural anatomy + shard_plan]
+    Coarse -->|standard| MinerU[MinerU Cloud API]
+    Anatomy --> Shards[Shard MinerU pipeline]
+    Shards --> MDFile[Markdown + layout.json]
+    Atlas --> Chunks[Atlas page chunks]
+    MinerU --> MDFile
     MDFile --> MDParser[formats.markdown.parser.parse_md]
     MDParser --> EvalHeadings[eval_md_headings + layout.json]
     EvalHeadings --> PredTitles[structure.layout_parser.pred_titles]
@@ -207,7 +219,7 @@ flowchart LR
 | Heading hierarchy recognition | `HIERARCHY_LLM_MODEL` | Falls back to `NORMOL_MODEL` |
 | Image description (VLM) | `IMAGE_MODEL` | `qwen3.5-flash` |
 | Image OCR / Q&A | `IMAGE_MODEL_MAX` | `qwen3.5-flash` |
-| Atlas classification | VLM via `formats.atlas.classifier` | `IMAGE_MODEL` |
+| PDF coarse classification | `IMAGE_MODEL` | `qwen3.5-flash` |
 
 ---
 
diff --git a/apps/worker/app/services/document_agent/bootstrap/aggregate_stats.py b/apps/worker/app/services/document_agent/bootstrap/aggregate_stats.py
index 2177cbf6..f0739edf 100644
--- a/apps/worker/app/services/document_agent/bootstrap/aggregate_stats.py
+++ b/apps/worker/app/services/document_agent/bootstrap/aggregate_stats.py
@@ -54,6 +54,7 @@ def aggregate_doc_stats(ctx: ToolContext, _args: dict[str, Any]) -> ToolResult:
     start = time.monotonic()
     features = list(ctx.blackboard.page_features)
     stats: dict[str, Any] = {}
+    page_count = len(features)
     extrema_pages: list[int] = []
     extrema_samples: list[dict[str, Any]] = []
     for metric in PROFILE_METRICS:
@@ -95,9 +96,43 @@ def aggregate_doc_stats(ctx: ToolContext, _args: dict[str, Any]) -> ToolResult:
             )
 
     deduped_extrema = sorted(set(extrema_pages))
+    landscape_pages = sum(1 for feature in features if feature.orientation == "landscape")
+    scan_like_pages = sum(
+        1
+        for feature in features
+        if feature.raw_text_length < 50 and feature.image_coverage >= 0.5
+    )
+    image_heavy_pages = sum(
+        1 for feature in features if feature.image_coverage >= 0.35
+    )
+    table_signal_pages = sum(
+        1
+        for feature in features
+        if feature.table_count > 0 or feature.drawings_count >= 25
+    )
+    doc_shape = {
+        "page_count": page_count,
+        "landscape_pages": landscape_pages,
+        "landscape_ratio": round(landscape_pages / page_count, 4)
+        if page_count
+        else 0.0,
+        "scan_like_pages": scan_like_pages,
+        "scan_like_ratio": round(scan_like_pages / page_count, 4)
+        if page_count
+        else 0.0,
+        "image_heavy_pages": image_heavy_pages,
+        "image_heavy_ratio": round(image_heavy_pages / page_count, 4)
+        if page_count
+        else 0.0,
+        "table_signal_pages": table_signal_pages,
+        "table_signal_ratio": round(table_signal_pages / page_count, 4)
+        if page_count
+        else 0.0,
+    }
     ctx.blackboard.doc_stats = stats
     ctx.blackboard.extrema_pages = deduped_extrema
     ctx.blackboard.global_signals["doc_stats"] = stats
+    ctx.blackboard.global_signals["doc_shape"] = doc_shape
     ctx.blackboard.global_signals["extrema_pages"] = deduped_extrema
     ctx.blackboard.global_signals["extrema_samples"] = extrema_samples
     return ToolResult(
@@ -106,10 +141,12 @@ def aggregate_doc_stats(ctx: ToolContext, _args: dict[str, Any]) -> ToolResult:
             "metric_count": len(PROFILE_METRICS),
             "extrema_pages": deduped_extrema,
             "extrema_samples": extrema_samples,
+            "doc_shape": doc_shape,
         },
         latency_ms=int((time.monotonic() - start) * 1000),
         output_summary={
             "doc_stats": stats,
+            "doc_shape": doc_shape,
             "extrema_pages": deduped_extrema,
             "extrema_samples": extrema_samples,
         },
diff --git a/apps/worker/app/services/document_agent/coordinator.py b/apps/worker/app/services/document_agent/coordinator.py
index c32b05d4..374adbf3 100644
--- a/apps/worker/app/services/document_agent/coordinator.py
+++ b/apps/worker/app/services/document_agent/coordinator.py
@@ -14,7 +14,11 @@
 )
 from app.services.document_agent.budget import BudgetTracker
 from app.services.document_agent.executor import ReActExecutor
-from app.services.document_agent.manifest import PageAnatomyMap, ToolContext
+from app.services.document_agent.manifest import (
+    DocumentProfile,
+    PageAnatomyMap,
+    ToolContext,
+)
 from app.services.document_agent.persist import build_anatomy_map, persist_anatomy_map
 from app.services.document_agent.planner import ProfilePlanner
 from app.services.document_agent.registry import REGISTRY
@@ -58,64 +62,100 @@ def __init__(
 
     def run(self) -> PageAnatomyMap:
         try:
-            self.state = DocumentAgentState.RUNNING
-            self._run_bootstrap()
-            self._run_toc_pipeline()
-            profile, initial_decision, planner_result = ProfilePlanner(self.ctx).propose()
-            self.blackboard.document_profile = profile
-            self.blackboard.global_signals["document_profile"] = profile.to_dict()
-            self.trace.record_step(
-                round_index=self.round_index,
-                actor="planner",
-                action_type="plan",
-                result=planner_result,
-                tool_name=None,
-                tool_args={},
-            )
-            self.round_index += 1
+            return self._run_structural()
+        except Exception as exc:
+            self._record_failure(exc)
+            raise
 
-            executor_result = ReActExecutor(
-                self.ctx,
-                registry=REGISTRY,
-                max_rounds=int(self.ctx.settings.get("max_rounds", 30)),
-                initial_decision=initial_decision,
-            ).run()
-            if executor_result.verdict.status != "success":
-                raise RuntimeError(
-                    f"profile aborted: {executor_result.verdict.rationale}"
-                )
-            anatomy = build_anatomy_map(self.ctx)
-            persist_result = persist_anatomy_map(self.ctx, {})
-            self.trace.record_step(
-                round_index=self.round_index,
-                actor="persist",
-                action_type="persist",
-                result=persist_result,
-                tool_name="persist.anatomy_map",
-                tool_args={},
-            )
-            self.state = DocumentAgentState.READY
-            self.trace.write_trace_artifact(
-                self.ctx.output_dir,
-                final_status="ready",
-                summary=anatomy.trace_summary | self.trace.summary(),
-            )
-            self.trace.flush(
-                final_status="ready",
-                summary=anatomy.trace_summary | self.trace.summary(),
-            )
-            return anatomy
+    def run_coarse(self) -> DocumentProfile:
+        try:
+            return self._run_coarse()
         except Exception as exc:
-            logger.error(f"[document_agent] profile failed: {exc}")
-            self.state = DocumentAgentState.FAILED
-            self.trace.write_trace_artifact(
-                self.ctx.output_dir,
-                final_status="failed",
-                summary={"error": str(exc), "budget": self.ctx.budget.snapshot()},
-            )
-            self.trace.flush(final_status="failed", summary={"error": str(exc)})
+            self._record_failure(exc)
+            raise
+
+    def run_structural(self) -> PageAnatomyMap:
+        try:
+            return self._run_structural()
+        except Exception as exc:
+            self._record_failure(exc)
             raise
 
+    def _run_coarse(self) -> DocumentProfile:
+        self.state = DocumentAgentState.RUNNING
+        if not self.blackboard.page_features:
+            self._run_bootstrap()
+        profile, _initial_decision, planner_result = ProfilePlanner(self.ctx).propose()
+        self.blackboard.document_profile = profile
+        self.blackboard.global_signals["document_profile"] = profile.to_dict()
+        self.trace.record_step(
+            round_index=self.round_index,
+            actor="planner:coarse",
+            action_type="plan",
+            result=planner_result,
+            tool_name=None,
+            tool_args={},
+        )
+        self.round_index += 1
+        return profile
+
+    def _run_structural(self) -> PageAnatomyMap:
+        self.state = DocumentAgentState.RUNNING
+        if not self.blackboard.page_features:
+            self._run_bootstrap()
+        self._run_toc_pipeline()
+        profile, initial_decision, planner_result = ProfilePlanner(self.ctx).propose()
+        self.blackboard.document_profile = profile
+        self.blackboard.global_signals["document_profile"] = profile.to_dict()
+        self.trace.record_step(
+            round_index=self.round_index,
+            actor="planner",
+            action_type="plan",
+            result=planner_result,
+            tool_name=None,
+            tool_args={},
+        )
+        self.round_index += 1
+        executor_result = ReActExecutor(
+            self.ctx,
+            registry=REGISTRY,
+            max_rounds=int(self.ctx.settings.get("max_rounds", 30)),
+            initial_decision=initial_decision,
+        ).run()
+        if executor_result.verdict.status != "success":
+            raise RuntimeError(f"profile aborted: {executor_result.verdict.rationale}")
+        anatomy = build_anatomy_map(self.ctx)
+        persist_result = persist_anatomy_map(self.ctx, {})
+        self.trace.record_step(
+            round_index=self.round_index,
+            actor="persist",
+            action_type="persist",
+            result=persist_result,
+            tool_name="persist.anatomy_map",
+            tool_args={},
+        )
+        self.state = DocumentAgentState.READY
+        self.trace.write_trace_artifact(
+            self.ctx.output_dir,
+            final_status="ready",
+            summary=anatomy.trace_summary | self.trace.summary(),
+        )
+        self.trace.flush(
+            final_status="ready",
+            summary=anatomy.trace_summary | self.trace.summary(),
+        )
+        return anatomy
+
+    def _record_failure(self, exc: Exception) -> None:
+        logger.error(f"[document_agent] profile failed: {exc}")
+        self.state = DocumentAgentState.FAILED
+        self.trace.write_trace_artifact(
+            self.ctx.output_dir,
+            final_status="failed",
+            summary={"error": str(exc), "budget": self.ctx.budget.snapshot()},
+        )
+        self.trace.flush(final_status="failed", summary={"error": str(exc)})
+
     def _run_bootstrap(self) -> None:
         for tool_name, handler in (
             ("probe.page_features", probe_page_features),
diff --git a/apps/worker/app/services/document_agent/manifest.py b/apps/worker/app/services/document_agent/manifest.py
index 3fd747ca..b89f6164 100644
--- a/apps/worker/app/services/document_agent/manifest.py
+++ b/apps/worker/app/services/document_agent/manifest.py
@@ -47,6 +47,7 @@ def to_dict(self) -> dict[str, Any]:
 class DocumentProfile:
     is_scanned: bool
     category: str
+    routing_category: str = "generic"
     category_rationale: str = ""
     language: str = "unknown"
     rationale: str = ""
diff --git a/apps/worker/app/services/document_agent/planner/planner.py b/apps/worker/app/services/document_agent/planner/planner.py
index bada42b0..7c796586 100644
--- a/apps/worker/app/services/document_agent/planner/planner.py
+++ b/apps/worker/app/services/document_agent/planner/planner.py
@@ -18,6 +18,7 @@
 )
 from app.services.document_agent.planner.prompts import PLANNER_INSTRUCTIONS
 from app.services.document_agent.visual import render_pages
+from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
 from shared.utils.token_estimate import estimate_tokens
 
 PAGE_KIND_DEFINITIONS = {
@@ -119,6 +120,9 @@ def _parse_profile_and_decision(raw: str) -> tuple[DocumentProfile, ReflexionDec
     if not isinstance(data, dict):
         raise ValueError("planner output must be a JSON object")
     category = " ".join(str(data.get("category") or "unknown document").split()[:5])
+    routing_category = PdfRoutingCategory.normalize(
+        data.get("routing_category") or data.get("category")
+    ).value
     raw_is_scanned = data.get("is_scanned")
     if isinstance(raw_is_scanned, bool):
         is_scanned = raw_is_scanned
@@ -129,6 +133,7 @@ def _parse_profile_and_decision(raw: str) -> tuple[DocumentProfile, ReflexionDec
     profile = DocumentProfile(
         is_scanned=is_scanned,
         category=category or "unknown document",
+        routing_category=routing_category,
         category_rationale=str(data.get("category_rationale") or ""),
         language=str(data.get("language") or "unknown"),
         rationale=str(data.get("rationale") or ""),
@@ -196,6 +201,7 @@ def propose(self) -> tuple[DocumentProfile, ReflexionDecision, ToolResult]:
             profile = DocumentProfile(
                 is_scanned=False,
                 category="unknown document",
+                routing_category=PdfRoutingCategory.GENERIC.value,
                 rationale="No planner model configured.",
             )
             decision = ReflexionDecision(
@@ -227,6 +233,7 @@ def propose(self) -> tuple[DocumentProfile, ReflexionDecision, ToolResult]:
                 {},
             ),
             "page_kind_definitions": PAGE_KIND_DEFINITIONS,
+            "doc_shape": self.ctx.blackboard.global_signals.get("doc_shape", {}),
             "doc_stats": self.ctx.blackboard.doc_stats,
             "extrema_samples": self.ctx.blackboard.global_signals.get(
                 "extrema_samples",
@@ -314,5 +321,3 @@ def propose(self) -> tuple[DocumentProfile, ReflexionDecision, ToolResult]:
         except Exception:
             self.ctx.budget.refund("visual", est=prompt_tokens_est)
             raise
-
-
diff --git a/apps/worker/app/services/document_agent/planner/prompts.py b/apps/worker/app/services/document_agent/planner/prompts.py
index 49feab39..7bb1f316 100644
--- a/apps/worker/app/services/document_agent/planner/prompts.py
+++ b/apps/worker/app/services/document_agent/planner/prompts.py
@@ -2,11 +2,15 @@
 
 PLANNER_INSTRUCTIONS = (
     "You are a document profile agent. Use global page-feature statistics, "
-    "TOC/H1 evidence, and page screenshots to classify the document and decide "
-    "whether enough evidence exists to continue toward sharding. Return strict "
-    "JSON only with keys: is_scanned, category, category_rationale, language, "
-    "rationale, next_action, inspect_pages, grep_query. category must be at "
-    "most 5 English words. next_action must be one of inspect_more, grep_text, "
+    "optional TOC/H1 evidence, and page screenshots to classify the PDF. Return strict "
+    "JSON only with keys: is_scanned, category, routing_category, "
+    "category_rationale, language, rationale, next_action, inspect_pages, grep_query. "
+    "category is a concise semantic document type, at most 5 English words, such "
+    "as Financial Prospectus, Technical Manual, Corporate Policy, Research Report, "
+    "Engineering Atlas, or Scanned Handbook. routing_category must be one of "
+    "atlas, scanned, slides, generic. Set routing_category=atlas only for "
+    "engineering drawing collections, construction standard atlases, or page sets "
+    "whose primary unit is a drawing/detail sheet rather than prose. next_action must be one of inspect_more, grep_text, "
     "ready_to_shard, verdict_now. Use inspect_more only when specific extra "
     "page screenshots are needed. Use grep_text only for native PDFs when a "
     "global text search would clarify structure. Do not output a fixed step "
diff --git a/apps/worker/app/services/document_agent/profile_agent.py b/apps/worker/app/services/document_agent/profile_agent.py
index 6a73cf74..c56e18c1 100644
--- a/apps/worker/app/services/document_agent/profile_agent.py
+++ b/apps/worker/app/services/document_agent/profile_agent.py
@@ -6,7 +6,7 @@
 from typing import Any
 
 from app.services.document_agent.coordinator import ProfileCoordinator
-from app.services.document_agent.manifest import PageAnatomyMap
+from app.services.document_agent.manifest import DocumentProfile, PageAnatomyMap
 
 
 class ProfileAgent:
@@ -38,3 +38,23 @@ def run(
             settings=self._settings,
         )
         return coordinator.run()
+
+    def run_coarse(
+        self,
+        file_path: str,
+        job_id: str,
+        *,
+        output_dir: str | None = None,
+        db: Any | None = None,
+    ) -> DocumentProfile:
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(file_path)
+        coordinator = ProfileCoordinator(
+            pdf_path=file_path,
+            job_id=job_id,
+            output_dir=output_dir,
+            db=db,
+            model=self._model,
+            settings=self._settings,
+        )
+        return coordinator.run_coarse()
diff --git a/apps/worker/app/services/document_parser/formats/atlas/classifier.py b/apps/worker/app/services/document_parser/formats/atlas/classifier.py
deleted file mode 100644
index 87c43e86..00000000
--- a/apps/worker/app/services/document_parser/formats/atlas/classifier.py
+++ /dev/null
@@ -1,179 +0,0 @@
-"""
-VLM-based Atlas Classifier
-
-Second-pass visual confirmation for atlas_candidate documents.
-Renders the first 3 pages of a PDF as PNG images, then asks the vision
-model to decide whether the document is an engineering atlas.
-
-Architecture:
-  - Page rendering via PyMuPDF runs in a spawned child process (consistent
-    with the rest of the parsing pipeline).
-  - VLM call is made in the *main process* after the child exits cleanly.
-  - Fails gracefully: any error returns False (treat as non-atlas).
-"""
-
-import base64
-import os
-import tempfile
-from typing import Optional
-
-from app.services.document_parser.formats.pdf.pymupdf_subprocess import run_in_child_process, worker
-from loguru import logger
-from openai.types.chat import (
-    ChatCompletionContentPartImageParam,
-    ChatCompletionContentPartParam,
-    ChatCompletionContentPartTextParam,
-    ChatCompletionMessageParam,
-)
-
-# ── Prompt ──────────────────────────────────────────────────────────────────
-_ATLAS_JUDGE_PROMPT = """You are a document classification expert. Please observe the following PDF page screenshots and determine whether the document is an engineering atlas (drawing collection).
-
-[Typical Characteristics of an Engineering Atlas]
-- Page content is primarily technical drawings (e.g., architectural floor plans, structural details, pipeline installation diagrams, equipment layout plans).
-- Usually contains a title block / info bar (including drawing name, drawing number, design institute, scale, date, etc.).
-- Pages consist mainly of graphics, lines, annotations, and dimensions, with very little pure text.
-- Page orientation is typically landscape (mostly A3 landscape).
-- Common types: National standard design atlases (e.g., 09 series, 22 series), construction drawings, installation detail drawings.
-
-[Judgment Criteria]
-- If this IS an engineering atlas, reply ONLY with: yes
-- If this IS NOT an engineering atlas (e.g., normal report, academic paper, presentation slides), reply ONLY with: no
-
-You must reply ONLY with "yes" or "no", do not say anything else."""
-
-# ── Child-process renderer ───────────────────────────────────────────────────
-
-
-@worker
-def _render_pages_worker(
-    queue, pdf_path: str, page_indices: list, dpi: int, out_dir: str
-) -> None:
-    """Child process: render given PDF pages to PNG files in out_dir."""
-    import pymupdf
-
-    mat = pymupdf.Matrix(dpi / 72, dpi / 72)
-    rendered: list[str] = []
-    try:
-        doc = pymupdf.open(pdf_path)
-        for idx in page_indices:
-            if idx >= doc.page_count:
-                break
-            page = doc[idx]
-            pix = page.get_pixmap(matrix=mat, alpha=False)
-            out_path = os.path.join(out_dir, f"atlas_preview_p{idx + 1}.png")
-            pix.save(out_path)
-            rendered.append(out_path)
-            pix = None
-            page = None
-        doc.close()
-    except Exception as exc:
-        queue.put({"ok": False, "error": str(exc), "rendered": []})
-        return
-    queue.put({"ok": True, "rendered": rendered})
-
-
-def _render_preview_pages(
-    pdf_path: str,
-    page_indices: list[int],
-    out_dir: str,
-    dpi: int = 120,
-) -> list[str]:
-    """Render pages to PNG files. Returns list of file paths."""
-    result = run_in_child_process(
-        _render_pages_worker, pdf_path, page_indices, dpi, out_dir, timeout=30
-    )
-    if not result.get("ok"):
-        raise RuntimeError(f"Page render failed: {result.get('error')}")
-    return result["rendered"]
-
-
-def _png_to_data_url(path: str) -> Optional[str]:
-    """Base64-encode a PNG file as a data URL."""
-    try:
-        with open(path, "rb") as f:
-            data = base64.b64encode(f.read()).decode("utf-8")
-        return f"data:image/png;base64,{data}"
-    except Exception as exc:
-        logger.warning(f"[atlas_classifier] Failed to encode {path}: {exc}")
-        return None
-
-
-def _call_vlm(image_data_urls: list[str]) -> bool:
-    """Call VLM with preview images. Returns True if atlas, False otherwise."""
-    from shared.core.config import settings
-    from shared.services.ai.openai_compatible_client_sync import get_openai_client
-
-    model = settings.IMAGE_MODEL or "qwen-vl-plus"
-    client = get_openai_client(model=model)
-
-    content: list[ChatCompletionContentPartParam] = [
-        ChatCompletionContentPartTextParam(
-            type="text",
-            text=_ATLAS_JUDGE_PROMPT,
-        )
-    ]
-    for url in image_data_urls:
-        content.append(
-            ChatCompletionContentPartImageParam(
-                type="image_url",
-                image_url={"url": url},
-            )
-        )
-
-    messages: list[ChatCompletionMessageParam] = [
-        {"role": "user", "content": content}
-    ]
-    resp: str = client.chat_completion(
-        messages=messages,
-        model=model,
-        temperature=0.0,
-        max_tokens=8,
-    )
-    answer = resp.strip().lower().strip(".")
-    logger.info(f"[atlas_classifier] VLM answer: {repr(resp)}")
-    return answer.startswith("yes") or answer == "1" or answer == "true"
-
-
-# ── Public API ───────────────────────────────────────────────────────────────
-
-
-def classify_atlas_with_vlm(pdf_path: str, n_pages: int = 3) -> bool:
-    """
-    Render the first `n_pages` pages of `pdf_path` and ask the VLM whether
-    the document is an engineering atlas.
-
-    Returns:
-        True  → confirmed atlas
-        False → not an atlas (or error — fail-safe default)
-    """
-    page_indices = list(range(n_pages))
-    with tempfile.TemporaryDirectory(prefix="atlas_clf_") as tmp_dir:
-        try:
-            png_paths = _render_preview_pages(pdf_path, page_indices, tmp_dir)
-            if not png_paths:
-                logger.warning(
-                    "[atlas_classifier] No pages rendered, defaulting to non-atlas"
-                )
-                return False
-
-            data_urls = [u for p in png_paths if (u := _png_to_data_url(p)) is not None]
-            if not data_urls:
-                logger.warning(
-                    "[atlas_classifier] No images encoded, defaulting to non-atlas"
-                )
-                return False
-
-            logger.info(
-                f"[atlas_classifier] Sending {len(data_urls)} page(s) to VLM for atlas check"
-            )
-            is_atlas = _call_vlm(data_urls)
-            logger.info(f"[atlas_classifier] VLM result: is_atlas={is_atlas}")
-            return is_atlas
-
-        except Exception as exc:
-            logger.warning(
-                f"[atlas_classifier] VLM atlas check failed for {pdf_path!r}, "
-                f"defaulting to non-atlas. Error: {exc}"
-            )
-            return False
diff --git a/apps/worker/app/services/document_parser/formats/atlas/parser.py b/apps/worker/app/services/document_parser/formats/atlas/parser.py
index ddac6a6d..efca015e 100644
--- a/apps/worker/app/services/document_parser/formats/atlas/parser.py
+++ b/apps/worker/app/services/document_parser/formats/atlas/parser.py
@@ -2,7 +2,7 @@
 """
 Atlas-specific parsing pipeline.
 
-For documents detected as atlas (doc_category="atlas") — e.g. engineering drawing
+For documents detected as atlas (routing_category="atlas") — e.g. engineering drawing
 collections — this module BYPASSES MinerU entirely and uses PyMuPDF directly to:
   1. Extract text from each page (for naming and content)
   2. Render each page as a single complete image (preserving full-page layout)
@@ -282,7 +282,7 @@ def parse_atlas(
         output_dir: output directory for images, full.md, etc.
         base_llm_paras: LLM parameters dict
         relative_root: path prefix for chunk path field
-        profile: DocProfile with scan_type info
+        profile: parser profile with is_scanned info
 
     Returns:
         pd.DataFrame with ALL_DF_COLS columns
@@ -305,7 +305,7 @@ def parse_atlas(
 
     # ── Determine if VLM is needed ──
     use_vlm = True
-    is_scanned = profile and profile.scan_type == "scanned"
+    is_scanned = bool(profile and profile.is_scanned)
     scan_label = "scanned" if is_scanned else "non-scanned"
     logger.info(f"📐 Atlas: {scan_label} document, VLM enabled for info extraction")
 
diff --git a/apps/worker/app/services/document_parser/formats/pdf/parser.py b/apps/worker/app/services/document_parser/formats/pdf/parser.py
index 217c89f6..08cfab8d 100755
--- a/apps/worker/app/services/document_parser/formats/pdf/parser.py
+++ b/apps/worker/app/services/document_parser/formats/pdf/parser.py
@@ -8,6 +8,7 @@
     build_oversized_pdf_processing_failed_exception,
 )
 from app.services.document_parser.providers.mineru.pdf_service import parse_via_full
+from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
 from app.services.document_parser.support.stage_profiler import stage_timer
 from loguru import logger
 
@@ -25,11 +26,10 @@ def parse_pdfs(
     s3_key=None,
     job_id=None,
 ):
-    route = profile.route if profile else "standard"
     base_llm_paras.update({"doc_name": filename})
 
     # ── Atlas routing: bypass MinerU entirely ──
-    if profile and profile.doc_category == "atlas":
+    if profile and profile.routing_category is PdfRoutingCategory.ATLAS:
         logger.info(f"📐 Atlas detected, bypassing MinerU for {filename}")
         from app.services.document_parser.formats.atlas.parser import parse_atlas
 
@@ -61,7 +61,7 @@ def parse_pdfs(
             ) from exc
 
     # ── Standard single-pass MinerU ──
-    logger.info(f"📄 Standard MinerU parse for {filename} [route={route}]")
+    logger.info(f"📄 Standard MinerU parse for {filename}")
     with stage_timer("pdf.extract.standard", filename=filename):
         parse_via_full(pdf_path, filename, output_dir, s3_key=s3_key)
 
@@ -100,23 +100,21 @@ def _parse_oversized_pdf(
         merge_html_tables,
     )
     from app.services.document_parser.formats.pdf.shard_merger import merge_images, merge_shard_lines
-    from app.services.document_parser.formats.pdf.shard_splitter import (
-        bin_pack_shards,
-        run_doc_agent,
-        split_pdf,
-    )
+    from app.services.document_parser.formats.pdf.shard_splitter import bin_pack_shards, split_pdf
 
-    doc_agent_job_id = job_id or base_llm_paras.get("doc_name", filename)
     work_dir: str | None = None
     temp_shard_s3_keys: list[str] = []
 
     try:
-        # 1. Run doc_agent to get full anatomy map (shard plan + TOC info)
-        with stage_timer("pdf.doc_agent", filename=filename):
-            anatomy = run_doc_agent(
-                pdf_path,
-                job_id=doc_agent_job_id,
-                output_dir=output_dir,
+        # 1. Reuse the entry DOC_PROFILE anatomy map (shard plan + TOC info).
+        anatomy = getattr(profile, "anatomy", None)
+        if anatomy is None:
+            raise RuntimeError(
+                f"Oversized PDF profile for {filename} is missing structural anatomy"
+            )
+        if not anatomy.shard_plan.enabled or not anatomy.shard_plan.shards:
+            raise RuntimeError(
+                f"Oversized PDF profile for {filename} did not produce a shard plan"
             )
 
         agent_shards = anatomy.shard_plan.shards
diff --git a/apps/worker/app/services/document_parser/formats/pdf/shard_splitter.py b/apps/worker/app/services/document_parser/formats/pdf/shard_splitter.py
index 99413e75..6fe1308b 100644
--- a/apps/worker/app/services/document_parser/formats/pdf/shard_splitter.py
+++ b/apps/worker/app/services/document_parser/formats/pdf/shard_splitter.py
@@ -1,4 +1,4 @@
-"""PDF shard splitting: doc_agent integration + bin-packing + physical split."""
+"""PDF shard splitting: bin-packing + physical split."""
 
 from __future__ import annotations
 
@@ -10,7 +10,7 @@
 from loguru import logger
 
 if TYPE_CHECKING:
-    from app.services.document_agent.manifest import PageAnatomyMap, Shard
+    from app.services.document_agent.manifest import Shard
 
 
 @dataclass
@@ -31,40 +31,6 @@ def page_offset(self) -> int:
         return self.page_start - 1
 
 
-def run_doc_agent(
-    pdf_path: str, job_id: str, output_dir: str
-) -> "PageAnatomyMap":
-    """Run doc_agent ProfileCoordinator and return the full anatomy map.
-
-    Returns the complete PageAnatomyMap so callers can access TOC info
-    (toc_result.toc_pages, toc_hierarchies) in addition to the shard plan.
-
-    Raises RuntimeError if the agent fails or produces no shards.
-    """
-    from app.services.document_agent.coordinator import ProfileCoordinator
-
-    agent_output_dir = os.path.join(output_dir, "_doc_agent")
-    os.makedirs(agent_output_dir, exist_ok=True)
-
-    coordinator = ProfileCoordinator(
-        pdf_path=pdf_path,
-        job_id=job_id,
-        output_dir=agent_output_dir,
-    )
-    anatomy = coordinator.run()
-
-    if not anatomy.shard_plan.enabled or not anatomy.shard_plan.shards:
-        raise RuntimeError(
-            f"Doc agent did not produce a valid shard plan for {job_id}"
-        )
-
-    shards = anatomy.shard_plan.shards
-    logger.info(
-        f"📋 Doc agent: {len(shards)} shards via {anatomy.shard_plan.reason}"
-    )
-    return anatomy
-
-
 def bin_pack_shards(
     agent_shards: list["Shard"],
     max_pages: int,
diff --git a/apps/worker/app/services/document_parser/orchestration/parse_session.py b/apps/worker/app/services/document_parser/orchestration/parse_session.py
index beec3c35..a5cf33d3 100644
--- a/apps/worker/app/services/document_parser/orchestration/parse_session.py
+++ b/apps/worker/app/services/document_parser/orchestration/parse_session.py
@@ -4,14 +4,10 @@
 from dataclasses import dataclass
 from typing import Any
 
-from app.services.document_parser.formats.atlas.classifier import classify_atlas_with_vlm
 from app.services.document_parser.orchestration.path_segment import (
     build_parser_path_segment,
 )
 from app.services.document_parser.orchestration.parse_input import ParseInput
-from app.services.document_parser.orchestration.oversized_pdf_policy import (
-    raise_if_oversized_pdf_not_supported,
-)
 from app.services.document_parser.profiling.doc_profiler import profile_document
 from app.services.document_parser.support.stage_profiler import stage_timer
 from loguru import logger
@@ -91,49 +87,11 @@ def build_parse_session(parse_input: ParseInput) -> ParseSession:
         profile = profile_document(
             parse_input.file_full_path,
             parse_input.internal_output_filename,
-        )
-    logger.info(f"📋 DocProfile: {profile.summary()}")
-    logger.debug(f"📋 Reasoning: {profile.reasoning}")
-
-    if profile.atlas_candidate and profile.doc_category not in ("atlas", "ppt_converted"):
-        logger.info(
-            f"🔍 Atlas candidate detected, running VLM visual check for {parse_input.filename}"
-        )
-        with stage_timer("document.atlas_vlm_check", filename=parse_input.filename):
-            vlm_is_atlas = classify_atlas_with_vlm(parse_input.file_full_path)
-        if vlm_is_atlas:
-            profile.doc_category = "atlas"
-            profile.reasoning += " | vlm_confirmed_atlas=True"
-            logger.info(f"✅ VLM confirmed atlas for {parse_input.filename}")
-        else:
-            profile.reasoning += " | vlm_confirmed_atlas=False"
-            logger.info(
-                f"ℹ️ VLM rejected atlas for {parse_input.filename}, routing as generic"
-            )
-
-    if profile.file_type == "pdf" and profile.page_count > settings.MAX_PDF_PAGE_LIMIT:
-        raise_if_oversized_pdf_not_supported(page_count=profile.page_count)
-
-    if profile.doc_category == "atlas":
-        filename, internal_output_filename, relative_root, full_output_dir = (
-            _rename_atlas_output(
-                filename=parse_input.filename,
-                internal_output_filename=parse_input.internal_output_filename,
-                output_dir=parse_input.output_dir,
-            )
-        )
-        logger.info(f"📐 Atlas output renamed: {filename}")
-        parse_input = ParseInput(
-            file_full_path=parse_input.file_full_path,
-            filename=filename,
-            output_dir=parse_input.output_dir,
-            internal_output_filename=internal_output_filename,
             job_id=parse_input.job_id,
-            options=parse_input.options,
-            base_url=parse_input.base_url,
-            fragment_content=parse_input.fragment_content,
-            s3_key=parse_input.s3_key,
+            output_dir=full_output_dir,
         )
+    logger.info(f"📋 DOC_PROFILE: {profile.summary()}")
+    logger.debug(f"📋 Reasoning: {profile.reasoning}")
 
     return ParseSession.from_input(
         parse_input=parse_input,
@@ -141,25 +99,7 @@ def build_parse_session(parse_input: ParseInput) -> ParseSession:
         full_output_dir=full_output_dir,
         profile=profile,
         relative_root=relative_root,
-    )
-
-
-def _rename_atlas_output(
-    *,
-    filename: str,
-    internal_output_filename: str,
-    output_dir: str,
-) -> tuple[str, str, str, str]:
-    name_base, _ = os.path.splitext(filename)
-    internal_name_base, _ = os.path.splitext(internal_output_filename)
-    atlas_filename = name_base + ".atlas"
-    atlas_internal_filename = internal_name_base + ".atlas"
-    relative_root, full_output_dir = _resolve_output_paths(
-        filename=atlas_filename,
-        internal_output_filename=atlas_internal_filename,
-        output_dir=output_dir,
-    )
-    return atlas_filename, atlas_internal_filename, relative_root, full_output_dir
+    )
 
 
 def _resolve_output_paths(
diff --git a/apps/worker/app/services/document_parser/profiling/doc_profile_model.py b/apps/worker/app/services/document_parser/profiling/doc_profile_model.py
deleted file mode 100644
index ad1a12e6..00000000
--- a/apps/worker/app/services/document_parser/profiling/doc_profile_model.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from __future__ import annotations
-
-import gc
-import json
-import os
-from dataclasses import asdict, dataclass, field
-from typing import List, Literal, Optional
-
-from loguru import logger
-
-
-@dataclass
-class DocProfile:
-    """Document profile data contract used by parser routing."""
-
-    file_type: str = ""
-    route: Literal["fast", "standard"] = "standard"
-    decision_band: Literal["safe_fast", "gray_zone", "safe_standard"] = "safe_standard"
-    scan_type: Optional[Literal["electronic", "scanned", "mixed"]] = None
-    doc_category: Literal["generic", "atlas", "ppt_converted"] = "generic"
-    page_count: int = 0
-    avg_text_density: float = 0.0
-    avg_image_coverage: float = 0.0
-    has_tables: bool = False
-    has_embedded_fonts: bool = False
-    is_multi_column: bool = False
-    is_degraded_electronic: bool = False
-    sample_text: str = ""
-    has_significant_images: bool = False
-    significant_image_count: int = 0
-    max_image_coverage_on_page: float = 0.0
-    pages_with_significant_images: int = 0
-    large_image_page_ratio: float = 0.0
-    table_signal_pages: int = 0
-    table_signal_strength: float = 0.0
-    complex_pages: int = 0
-    complex_page_ratio: float = 0.0
-    max_drawing_count: int = 0
-    min_text_density_page: float = 0.0
-    text_density_std: float = 0.0
-    estimated_fast_benefit: float = 0.0
-    estimated_risk_score: float = 0.0
-    atlas_candidate: bool = False
-    page_details: List[dict] = field(default_factory=list)
-    reasoning: str = ""
-
-    def to_dict(self) -> dict:
-        data = asdict(self)
-        data.pop("page_details", None)
-        data.pop("sample_text", None)
-        return data
-
-    def summary(self) -> str:
-        parts = (
-            f"[{self.file_type.upper()}] route={self.route}, band={self.decision_band}, "
-            f"scan={self.scan_type}, category={self.doc_category}, "
-            f"pages={self.page_count}, text_density={self.avg_text_density:.0f}, "
-            f"img_coverage={self.avg_image_coverage:.1%}, "
-            f"risk={self.estimated_risk_score:.2f}, gain={self.estimated_fast_benefit:.2f}"
-        )
-        if self.is_degraded_electronic:
-            parts += ", degraded=True"
-        return parts
-
-
-def publish_profile_result(queue, profile: DocProfile) -> None:
-    gc.collect()
-    queue.put({"ok": True, "profile": asdict(profile)})
-
-
-def save_profile_metadata(profile: DocProfile, output_dir: str) -> None:
-    profile_path = os.path.join(output_dir, "profile.json")
-    with open(profile_path, "w", encoding="utf-8") as file_obj:
-        json.dump(profile.to_dict(), file_obj, ensure_ascii=False, indent=2)
-    logger.debug(f"Profile metadata saved to {profile_path}")
diff --git a/apps/worker/app/services/document_parser/profiling/doc_profile_pdf.py b/apps/worker/app/services/document_parser/profiling/doc_profile_pdf.py
deleted file mode 100644
index 32a13840..00000000
--- a/apps/worker/app/services/document_parser/profiling/doc_profile_pdf.py
+++ /dev/null
@@ -1,671 +0,0 @@
-# pyright: reportAttributeAccessIssue=false, reportOperatorIssue=false
-from __future__ import annotations
-
-import math
-from typing import Any
-
-from app.services.document_parser.profiling.doc_profile_model import (
-    DocProfile,
-    publish_profile_result,
-)
-from app.services.document_parser.formats.pdf.pymupdf_subprocess import run_in_child_process, worker
-from loguru import logger
-
-# Thresholds
-SCAN_TEXT_THRESHOLD = 50
-SCAN_IMAGE_COVERAGE_MIN = 0.6
-SCAN_PAGE_RATIO = 0.7
-
-ATLAS_TEXT_THRESHOLD = 200
-ATLAS_CANDIDATE_IMAGE_COVERAGE_MIN = 0.30
-ATLAS_MIN_LANDSCAPE_RATIO = 0.5  # ≥50% of sampled pages must be landscape
-ATLAS_MIN_PAGES = 2  # single-page scans (resumes, posters) are not atlases
-
-FAST_TEXT_THRESHOLD = 500
-MIN_FAST_TEXT_DENSITY_FLOOR = 120
-SAFE_FAST_MAX_PAGE_COUNT = 80
-HARD_STANDARD_PAGE_COUNT = 150
-
-MULTI_COL_GAP_RATIO = 0.15
-MULTI_COL_MIN_BLOCKS = 4
-
-DEGRADED_SKINNY_ASPECT = 50
-DEGRADED_SKINNY_MAX_H = 30
-DEGRADED_SKINNY_MIN_PER_PAGE = 50
-DEGRADED_PAGE_RATIO = 0.5
-
-SIGNIFICANT_IMAGE_AREA_RATIO = 0.12
-MEDIUM_IMAGE_AREA_RATIO = 0.03
-LARGE_IMAGE_PAGE_RATIO = 0.25
-SIGNIFICANT_IMAGE_MIN_DIM = 400
-SIGNIFICANT_IMAGE_MIN_PIXELS = 250_000
-
-PROFILE_MAX_NEW_XREFS_PER_PAGE = 30
-
-TABLE_DRAWING_LINE_THRESHOLD = 12
-TABLE_DRAWING_STRONG_THRESHOLD = 18
-TABLE_DRAWING_RECT_THRESHOLD = 2
-
-SAFE_FAST_MAX_COMPLEX_PAGE_RATIO = 0.05
-SAFE_FAST_MAX_IMAGE_COVERAGE_ON_PAGE = 0.08
-SAFE_FAST_MAX_AVG_IMAGE_COVERAGE = 0.03
-SAFE_FAST_MAX_TEXT_STD = 600.0
-HARD_COMPLEX_PAGE_RATIO = 0.2
-HARD_SIGNIFICANT_IMAGE_PAGES = 3
-HARD_LARGE_IMAGE_PAGE_RATIO = 0.15
-
-
-def _clamp(value: float, min_value: float = 0.0, max_value: float = 1.0) -> float:
-    return max(min_value, min(max_value, value))
-
-
-def _stddev(values: list[float]) -> float:
-    if not values:
-        return 0.0
-    mean = sum(values) / len(values)
-    variance = sum((value - mean) ** 2 for value in values) / len(values)
-    return math.sqrt(variance)
-
-
-def _count_detected_tables(page: Any) -> int:
-    try:
-        finder = page.find_tables()
-    except Exception:
-        return 0
-
-    if not finder:
-        return 0
-
-    tables = getattr(finder, "tables", finder)
-    try:
-        return len(tables)
-    except TypeError:
-        return 1 if tables else 0
-
-
-def _is_stroked_drawing(drawing: dict[str, Any]) -> bool:
-    stroke_width = drawing.get("width")
-    return drawing.get("color") is not None or (
-        stroke_width is not None and stroke_width > 0
-    )
-
-
-def _estimate_fast_benefit(profile: DocProfile) -> float:
-    if profile.page_count <= 2:
-        page_factor = 0.35
-    elif profile.page_count <= 10:
-        page_factor = 0.7
-    elif profile.page_count <= SAFE_FAST_MAX_PAGE_COUNT:
-        page_factor = 1.0
-    elif profile.page_count <= HARD_STANDARD_PAGE_COUNT:
-        page_factor = 0.8
-    else:
-        page_factor = 0.45
-
-    density_factor = _clamp(profile.avg_text_density / 1200.0)
-    stability_factor = _clamp(
-        1.0
-        - (profile.complex_page_ratio * 1.5)
-        - (profile.large_image_page_ratio * 1.2)
-        - (profile.table_signal_strength * 0.8)
-    )
-    return _clamp(
-        (0.35 * page_factor) + (0.40 * density_factor) + (0.25 * stability_factor)
-    )
-
-
-def _estimate_risk_score(profile: DocProfile) -> float:
-    risk = 0.0
-    if profile.scan_type != "electronic":
-        risk += 0.35
-    if profile.doc_category != "generic":
-        risk += 0.20
-    if profile.is_multi_column:
-        risk += 0.20
-    if profile.is_degraded_electronic:
-        risk += 0.20
-    if profile.has_tables:
-        risk += 0.30
-
-    risk += min(0.20, profile.large_image_page_ratio * 1.2)
-    risk += min(0.20, profile.complex_page_ratio * 0.8)
-    risk += min(0.15, profile.table_signal_strength * 0.2)
-    risk += min(0.12, profile.pages_with_significant_images * 0.04)
-
-    if profile.page_count > HARD_STANDARD_PAGE_COUNT:
-        risk += 0.10
-
-    return _clamp(risk)
-
-
-def _classify_route(profile: DocProfile) -> tuple[str, str, float, float, list[str]]:
-    hard_gate_reasons: list[str] = []
-
-    if profile.scan_type != "electronic":
-        hard_gate_reasons.append(f"scan_type={profile.scan_type}")
-    if profile.doc_category != "generic":
-        hard_gate_reasons.append(f"doc_category={profile.doc_category}")
-    if profile.is_multi_column:
-        hard_gate_reasons.append("multi_column")
-    if profile.is_degraded_electronic:
-        hard_gate_reasons.append("degraded_electronic")
-    if profile.has_tables:
-        hard_gate_reasons.append(
-            f"table_signals={profile.table_signal_pages}p/{profile.table_signal_strength:.2f}"
-        )
-    if (
-        profile.max_image_coverage_on_page >= LARGE_IMAGE_PAGE_RATIO
-        or profile.pages_with_significant_images >= HARD_SIGNIFICANT_IMAGE_PAGES
-        or profile.large_image_page_ratio >= HARD_LARGE_IMAGE_PAGE_RATIO
-    ):
-        hard_gate_reasons.append(
-            "significant_images="
-            f"{profile.pages_with_significant_images}p,max={profile.max_image_coverage_on_page:.1%}"
-        )
-    if profile.complex_page_ratio >= HARD_COMPLEX_PAGE_RATIO:
-        hard_gate_reasons.append(f"complex_pages={profile.complex_page_ratio:.0%}")
-    if profile.page_count > HARD_STANDARD_PAGE_COUNT:
-        hard_gate_reasons.append(
-            f"page_count={profile.page_count}>{HARD_STANDARD_PAGE_COUNT}"
-        )
-
-    benefit = _estimate_fast_benefit(profile)
-    risk = _estimate_risk_score(profile)
-
-    if hard_gate_reasons:
-        return (
-            "standard",
-            "safe_standard",
-            benefit,
-            risk,
-            [
-                "decision=safe_standard: hard gate matched",
-                "hard_gates=" + ",".join(hard_gate_reasons),
-            ],
-        )
-
-    safe_fast_checks = [
-        (
-            profile.page_count <= SAFE_FAST_MAX_PAGE_COUNT,
-            f"page_count={profile.page_count}<={SAFE_FAST_MAX_PAGE_COUNT}",
-        ),
-        (
-            profile.avg_text_density >= MIN_FAST_TEXT_DENSITY_FLOOR,
-            "text_density_floor="
-            f"{profile.avg_text_density:.0f}>={MIN_FAST_TEXT_DENSITY_FLOOR}",
-        ),
-        (
-            not profile.has_significant_images,
-            f"has_significant_images={profile.has_significant_images}",
-        ),
-        (
-            profile.max_image_coverage_on_page <= SAFE_FAST_MAX_IMAGE_COVERAGE_ON_PAGE,
-            "max_image_coverage_on_page="
-            f"{profile.max_image_coverage_on_page:.1%}<={SAFE_FAST_MAX_IMAGE_COVERAGE_ON_PAGE:.0%}",
-        ),
-        (
-            profile.avg_image_coverage <= SAFE_FAST_MAX_AVG_IMAGE_COVERAGE,
-            f"avg_image_coverage={profile.avg_image_coverage:.1%}<={SAFE_FAST_MAX_AVG_IMAGE_COVERAGE:.0%}",
-        ),
-        (
-            profile.complex_page_ratio <= SAFE_FAST_MAX_COMPLEX_PAGE_RATIO,
-            f"complex_page_ratio={profile.complex_page_ratio:.0%}<={SAFE_FAST_MAX_COMPLEX_PAGE_RATIO:.0%}",
-        ),
-        (
-            profile.text_density_std <= SAFE_FAST_MAX_TEXT_STD,
-            f"text_density_std={profile.text_density_std:.0f}<={SAFE_FAST_MAX_TEXT_STD:.0f}",
-        ),
-        (
-            risk <= 0.35,
-            f"estimated_risk_score={risk:.2f}<=0.35",
-        ),
-    ]
-
-    failed_checks = [reason for passed, reason in safe_fast_checks if not passed]
-    if not failed_checks:
-        return (
-            "fast",
-            "safe_fast",
-            benefit,
-            risk,
-            [
-                "decision=safe_fast: low-complexity high-yield pdf",
-                "safe_fast_checks_passed",
-            ],
-        )
-
-    return (
-        "standard",
-        "gray_zone",
-        benefit,
-        risk,
-        [
-            "decision=gray_zone: conservative fallback to standard in phase1",
-            "borderline=" + ",".join(failed_checks[:4]),
-        ],
-    )
-
-
-
-@worker
-def _profile_pdf_worker(queue, file_path: str) -> None:
-    """Child process: analyze PDF features, return profile as dict."""
-    import pymupdf
-
-    profile = DocProfile(file_type="pdf")
-    reasons: list[str] = []
-
-    try:
-        doc = pymupdf.open(file_path)
-    except Exception as exc:
-        profile.reasoning = f"Cannot open file: {exc}"
-        publish_profile_result(queue, profile)
-        return
-
-    profile.page_count = doc.page_count
-
-    if doc.page_count == 0:
-        profile.reasoning = "Empty file (0 pages)"
-        doc.close()
-        del doc
-        publish_profile_result(queue, profile)
-        return
-
-    if doc.page_count <= 50:
-        sample_indices = list(range(doc.page_count))
-    else:
-        step = max(1, doc.page_count // 20)
-        sample_indices = list(range(0, doc.page_count, step))[:20]
-        sample_indices = sorted(
-            set(
-                sample_indices
-                + [0, 1, 2]
-                + [doc.page_count - 3, doc.page_count - 2, doc.page_count - 1]
-            )
-        )
-        sample_indices = [idx for idx in sample_indices if 0 <= idx < doc.page_count]
-
-    page_details = []
-    text_lengths: list[float] = []
-    total_text_len = 0
-    total_image_coverage = 0.0
-    scanned_pages = 0
-    all_text_parts: list[str] = []
-    has_any_fonts = False
-    has_any_tables = False
-    table_signal_pages = 0
-    total_table_signal_strength = 0.0
-    multi_col_pages = 0
-    landscape_pages = 0
-    degraded_pages = 0
-    doc_page_sizes = []
-
-    significant_image_count = 0
-    pages_with_significant_images = 0
-    large_image_pages = 0
-    max_image_coverage_on_page = 0.0
-
-    complex_pages = 0
-    max_drawing_count = 0
-
-    # Track xrefs already processed across pages to avoid redundant
-    # get_image_rects() calls on shared/inherited image resources.
-    # PDFs with shared xrefs (e.g. scanned docs) report ALL document
-    # images on every page; without dedup this causes O(pages × images)
-    # content-stream scans.
-    seen_xrefs: set = set()
-
-    for idx in sample_indices:
-        page = doc[idx]
-        page_width = page.rect.width
-        page_height = page.rect.height
-        page_area = page_width * page_height
-
-        if page_width > page_height:
-            landscape_pages += 1
-        doc_page_sizes.append((page_width, page_height))
-
-        text = page.get_text().strip()
-        text_len = len(text)
-        text_lengths.append(float(text_len))
-        total_text_len += text_len
-
-        if len("".join(all_text_parts)) < 500:
-            all_text_parts.append(text[:200])
-
-        images = page.get_images(full=True)
-        img_total_area = 0.0
-        page_significant_image_count = 0
-        page_max_rect_ratio = 0.0
-        page_medium_image_coverage = 0.0
-        skinny_count = 0
-
-        new_xref_count = 0
-        for img in images:
-            xref = img[0]
-            img_w, img_h = img[2], img[3]
-            if (
-                img_h > 0
-                and img_w / img_h > DEGRADED_SKINNY_ASPECT
-                and img_h < DEGRADED_SKINNY_MAX_H
-            ):
-                skinny_count += 1
-
-            # ── xref dedup: skip images already analyzed on earlier pages ──
-            if xref in seen_xrefs:
-                continue
-            seen_xrefs.add(xref)
-            new_xref_count += 1
-            # Cap expensive get_image_rects calls per page
-            if new_xref_count > PROFILE_MAX_NEW_XREFS_PER_PAGE:
-                continue
-
-            try:
-                rects = page.get_image_rects(xref)
-            except Exception:
-                rects = []
-
-            for rect in rects:
-                rect_area = rect.width * rect.height
-                img_total_area += rect_area
-
-                area_ratio = rect_area / page_area if page_area > 0 else 0.0
-                page_max_rect_ratio = max(page_max_rect_ratio, area_ratio)
-
-                is_significant = (
-                    area_ratio >= SIGNIFICANT_IMAGE_AREA_RATIO
-                    or (
-                        area_ratio >= 0.05
-                        and (
-                            max(img_w, img_h) >= SIGNIFICANT_IMAGE_MIN_DIM
-                            or (img_w * img_h) >= SIGNIFICANT_IMAGE_MIN_PIXELS
-                        )
-                    )
-                    or (
-                        area_ratio >= 0.02
-                        and (img_w * img_h) >= (SIGNIFICANT_IMAGE_MIN_PIXELS * 2)
-                    )
-                )
-
-                if is_significant:
-                    page_significant_image_count += 1
-                elif area_ratio >= MEDIUM_IMAGE_AREA_RATIO:
-                    page_medium_image_coverage += area_ratio
-
-        if skinny_count >= DEGRADED_SKINNY_MIN_PER_PAGE:
-            degraded_pages += 1
-
-        img_coverage = img_total_area / page_area if page_area > 0 else 0.0
-        img_coverage = min(img_coverage, 1.0)
-        total_image_coverage += img_coverage
-
-        fonts = page.get_fonts()
-        if fonts:
-            has_any_fonts = True
-
-        drawings = page.get_drawings()
-        drawing_count = len(drawings)
-        max_drawing_count = max(max_drawing_count, drawing_count)
-        line_like_items = 0
-        horizontal_line_items = 0
-        vertical_line_items = 0
-        rect_items = 0
-        fill_only_rect_items = 0
-        for drawing in drawings:
-            is_stroked = _is_stroked_drawing(drawing)
-            for item in drawing.get("items", []):
-                if item[0] == "l":
-                    if is_stroked:
-                        line_like_items += 1
-                        point_a = item[1]
-                        point_b = item[2]
-                        if abs(point_a.y - point_b.y) <= 2:
-                            horizontal_line_items += 1
-                        if abs(point_a.x - point_b.x) <= 2:
-                            vertical_line_items += 1
-                elif item[0] == "re":
-                    if is_stroked:
-                        rect_items += 1
-                        line_like_items += 4
-                        horizontal_line_items += 2
-                        vertical_line_items += 2
-                    else:
-                        fill_only_rect_items += 1
-
-        detected_table_count = _count_detected_tables(page)
-        drawing_table_signal = line_like_items >= TABLE_DRAWING_LINE_THRESHOLD and (
-            (horizontal_line_items >= 2 and vertical_line_items >= 2)
-            or rect_items >= TABLE_DRAWING_RECT_THRESHOLD
-        )
-        # NOTE:
-        # `page.find_tables()` produces too many false positives on Word / Writer
-        # exported pure-text PDFs, where paragraph background boxes are inferred as
-        # full-page tables. For Phase 1 fast-path routing, keep `find_tables()`
-        # only as debug evidence and rely on explicit drawing-grid signals for
-        # table hard gates.
-        table_hit = drawing_table_signal
-        page_table_strength = 0.0
-        if drawing_table_signal:
-            page_table_strength = min(
-                1.0,
-                line_like_items / float(TABLE_DRAWING_STRONG_THRESHOLD),
-            )
-
-        if table_hit:
-            has_any_tables = True
-            table_signal_pages += 1
-        total_table_signal_strength += page_table_strength
-
-        blocks = page.get_text("blocks")
-        text_blocks = [
-            block
-            for block in blocks
-            if block[6] == 0
-            and (block[2] - block[0]) > 20
-            and (block[3] - block[1]) > 10
-        ]
-
-        is_multi_col_page = False
-        if len(text_blocks) >= MULTI_COL_MIN_BLOCKS:
-            min_x_gap = page.rect.width * MULTI_COL_GAP_RATIO
-            side_by_side_count = 0
-
-            for i in range(len(text_blocks)):
-                for j in range(i + 1, len(text_blocks)):
-                    block_i = text_blocks[i]
-                    block_j = text_blocks[j]
-                    y_overlap = min(block_i[3], block_j[3]) - max(
-                        block_i[1], block_j[1]
-                    )
-                    if y_overlap <= 0:
-                        continue
-                    x_gap = max(block_j[0] - block_i[2], block_i[0] - block_j[2])
-                    if x_gap > min_x_gap:
-                        side_by_side_count += 1
-                        if side_by_side_count >= 3:
-                            is_multi_col_page = True
-                            break
-                if is_multi_col_page:
-                    break
-
-        if is_multi_col_page:
-            multi_col_pages += 1
-
-        is_scan_page = (
-            text_len < SCAN_TEXT_THRESHOLD and img_coverage > SCAN_IMAGE_COVERAGE_MIN
-        )
-        if is_scan_page:
-            scanned_pages += 1
-
-        page_has_significant_images = (
-            page_significant_image_count > 0 or page_medium_image_coverage >= 0.18
-        )
-        if page_has_significant_images:
-            pages_with_significant_images += 1
-            significant_image_count += page_significant_image_count or 1
-
-        page_has_large_image = (
-            page_max_rect_ratio >= LARGE_IMAGE_PAGE_RATIO or img_coverage >= 0.35
-        )
-        if page_has_large_image:
-            large_image_pages += 1
-
-        max_image_coverage_on_page = max(
-            max_image_coverage_on_page, page_max_rect_ratio
-        )
-
-        is_complex_page = (
-            table_hit
-            or page_has_large_image
-            or is_multi_col_page
-            or (page_has_significant_images and text_len < FAST_TEXT_THRESHOLD)
-            or (drawing_count >= 25 and text_len < FAST_TEXT_THRESHOLD)
-        )
-        if is_complex_page:
-            complex_pages += 1
-
-        page_details.append(
-            {
-                "page": idx + 1,
-                "text_len": text_len,
-                "image_count": len(images),
-                "img_coverage": round(img_coverage, 3),
-                "font_count": len(fonts),
-                "drawing_count": drawing_count,
-                "line_like_items": line_like_items,
-                "horizontal_line_items": horizontal_line_items,
-                "vertical_line_items": vertical_line_items,
-                "table_hit": table_hit,
-                "detected_table_count": detected_table_count,
-                "stroked_rect_count": rect_items,
-                "fill_only_rect_count": fill_only_rect_items,
-                "significant_image_count": page_significant_image_count,
-                "max_image_coverage": round(page_max_rect_ratio, 3),
-                "is_multi_col_page": is_multi_col_page,
-                "is_scan_page": is_scan_page,
-                "is_complex_page": is_complex_page,
-                "text_block_count": len(text_blocks),
-            }
-        )
-
-        del text_blocks
-        del blocks
-        del drawings
-        del fonts
-        del images
-        del page
-
-    doc.close()
-    del doc
-
-    n_sampled = len(sample_indices)
-    profile.avg_text_density = total_text_len / n_sampled if n_sampled > 0 else 0.0
-    profile.avg_image_coverage = (
-        total_image_coverage / n_sampled if n_sampled > 0 else 0.0
-    )
-    profile.has_embedded_fonts = has_any_fonts
-    profile.has_tables = has_any_tables
-    profile.is_multi_column = multi_col_pages > (n_sampled * 0.3)
-    profile.is_degraded_electronic = degraded_pages > (n_sampled * DEGRADED_PAGE_RATIO)
-    profile.sample_text = " ".join(all_text_parts)[:500]
-    profile.page_details = page_details
-
-    profile.has_significant_images = pages_with_significant_images > 0
-    profile.significant_image_count = significant_image_count
-    profile.max_image_coverage_on_page = max_image_coverage_on_page
-    profile.pages_with_significant_images = pages_with_significant_images
-    profile.large_image_page_ratio = (
-        large_image_pages / n_sampled if n_sampled > 0 else 0.0
-    )
-
-    profile.table_signal_pages = table_signal_pages
-    profile.table_signal_strength = (
-        total_table_signal_strength / n_sampled if n_sampled > 0 else 0.0
-    )
-
-    profile.complex_pages = complex_pages
-    profile.complex_page_ratio = complex_pages / n_sampled if n_sampled > 0 else 0.0
-    profile.max_drawing_count = max_drawing_count
-    profile.min_text_density_page = min(text_lengths) if text_lengths else 0.0
-    profile.text_density_std = _stddev(text_lengths)
-
-    scan_ratio = scanned_pages / n_sampled if n_sampled > 0 else 0.0
-    if scan_ratio >= SCAN_PAGE_RATIO:
-        profile.scan_type = "scanned"
-        reasons.append(
-            f"scanned: {scanned_pages}/{n_sampled} sampled pages are scanned ({scan_ratio:.0%})"
-        )
-    elif scanned_pages > 0:
-        profile.scan_type = "mixed"
-        reasons.append(f"mixed: {scanned_pages}/{n_sampled} sampled pages are scanned")
-    else:
-        profile.scan_type = "electronic"
-        reasons.append(
-            f"electronic: sampled pages contain extractable text (avg={profile.avg_text_density:.0f})"
-        )
-
-    landscape_ratio = landscape_pages / n_sampled if n_sampled > 0 else 0.0
-
-    # ── Linear atlas gate: VLM always makes the final call ──
-    # Any document meeting all 4 conditions is sent for VLM visual confirmation.
-    # We do NOT heuristically commit here — VLM decides in parse_service.
-    is_atlas_candidate = (
-        profile.avg_text_density
-        < ATLAS_TEXT_THRESHOLD  # text-sparse (< 200 chars/page)
-        and profile.avg_image_coverage
-        > ATLAS_CANDIDATE_IMAGE_COVERAGE_MIN  # image-heavy (> 30%)
-        and landscape_ratio >= ATLAS_MIN_LANDSCAPE_RATIO  # mostly landscape (>= 50%)
-        and profile.page_count >= ATLAS_MIN_PAGES  # multi-page (>= 2)
-    )
-    if is_atlas_candidate:
-        profile.doc_category = (
-            "generic"  # provisional — VLM will promote to "atlas" if confirmed
-        )
-        profile.atlas_candidate = True
-        reasons.append(
-            f"atlas_candidate: text={profile.avg_text_density:.0f}<{ATLAS_TEXT_THRESHOLD}, "
-            f"img={profile.avg_image_coverage:.1%}>{ATLAS_CANDIDATE_IMAGE_COVERAGE_MIN:.0%}, "
-            f"landscape={landscape_ratio:.0%}>={ATLAS_MIN_LANDSCAPE_RATIO:.0%}, "
-            f"pages={profile.page_count}>={ATLAS_MIN_PAGES} → VLM confirmation required"
-        )
-    else:
-        profile.doc_category = "generic"
-
-    if landscape_ratio >= 0.8 and profile.doc_category == "generic":
-        slide_ratios = [1.333, 1.778, 1.600]
-        tolerance = 0.05
-        ref_page = doc_page_sizes[0] if doc_page_sizes else None
-        if ref_page:
-            page_ratio = ref_page[0] / ref_page[1] if ref_page[1] > 0 else 0.0
-            is_slide_ratio = any(
-                abs(page_ratio - ratio) < tolerance for ratio in slide_ratios
-            )
-            if is_slide_ratio:
-                profile.doc_category = "ppt_converted"
-                reasons.append(
-                    f"ppt_converted: {landscape_pages}/{n_sampled} landscape, ratio={page_ratio:.2f}"
-                )
-
-    route, decision_band, benefit, risk, route_reasons = _classify_route(profile)
-    profile.route = route
-    profile.decision_band = decision_band
-    profile.estimated_fast_benefit = benefit
-    profile.estimated_risk_score = risk
-    reasons.extend(route_reasons)
-
-    profile.reasoning = " | ".join(reasons)
-    publish_profile_result(queue, profile)
-
-
-def profile_pdf(file_path: str) -> DocProfile:
-    """Profile a PDF by running PyMuPDF analysis in a spawned child process."""
-    result = run_in_child_process(_profile_pdf_worker, file_path, timeout=300)
-    profile = DocProfile(**result["profile"])
-    logger.info(
-        f"[doc-profiler] route={profile.route} band={profile.decision_band} "
-        f"category={profile.doc_category} scan={profile.scan_type} "
-        f"pages={profile.page_count} text_density={profile.avg_text_density:.0f} "
-        f"img_coverage={profile.avg_image_coverage:.1%} risk={profile.estimated_risk_score:.2f} "
-        f"gain={profile.estimated_fast_benefit:.2f}"
-    )
-    return profile
diff --git a/apps/worker/app/services/document_parser/profiling/doc_profiler.py b/apps/worker/app/services/document_parser/profiling/doc_profiler.py
index 184f1364..28076bfb 100644
--- a/apps/worker/app/services/document_parser/profiling/doc_profiler.py
+++ b/apps/worker/app/services/document_parser/profiling/doc_profiler.py
@@ -1,41 +1,107 @@
-"""
-Agentic Document Profiler
+"""Parser-entry document profiling."""
 
-Before data enters the pipeline, use lightweight analysis (~50ms) to generate
-DocProfile, driving routing decisions and type annotations.
-
-Usage:
-    from app.services.document_parser.profiling.doc_profiler import profile_document
-    profile = profile_document("/path/to/file.pdf")
-"""
+from __future__ import annotations
 
 import os
 
-from app.services.document_parser.profiling.doc_profile_model import DocProfile
-from app.services.document_parser.profiling.doc_profile_pdf import profile_pdf
+from app.services.document_agent.coordinator import ProfileCoordinator
+from app.services.document_parser.orchestration.oversized_pdf_policy import (
+    build_oversized_pdf_processing_failed_exception,
+    raise_if_oversized_pdf_not_supported,
+)
+from app.services.document_parser.profiling.profile_model import ParserDocumentProfile
+from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
+
+from shared.core.config import settings
 
 
-def profile_document(file_path: str, filename: str = "") -> DocProfile:
+def profile_document(
+    file_path: str,
+    filename: str = "",
+    *,
+    job_id: str | None = None,
+    output_dir: str | None = None,
+) -> ParserDocumentProfile:
     """
     General document profiling entry point.
 
     Args:
         file_path: Local file path
         filename: File name (used to infer type)
+        job_id: Parse job id for profile trace artifacts
+        output_dir: Parser output directory
 
     Returns:
-        DocProfile
+        ParserDocumentProfile
     """
     if not filename:
         filename = os.path.basename(file_path)
 
     ext = os.path.splitext(filename)[1].lower()
     if ext == ".pdf":
-        return profile_pdf(file_path)
+        return _profile_pdf(file_path, filename, job_id=job_id, output_dir=output_dir)
 
-    return DocProfile(
+    return ParserDocumentProfile(
         file_type=ext.lstrip("."),
-        route="standard",
-        decision_band="safe_standard",
+        category=f"{ext.lstrip('.') or 'unknown'} document",
+        routing_category=PdfRoutingCategory.GENERIC,
         reasoning=f"Non-PDF format ({ext}), using default route",
     )
+
+
+def _profile_pdf(
+    file_path: str,
+    filename: str,
+    *,
+    job_id: str | None,
+    output_dir: str | None,
+) -> ParserDocumentProfile:
+    profile_job_id = job_id or filename
+    agent_output_dir = os.path.join(output_dir, "_doc_agent") if output_dir else None
+    coordinator = ProfileCoordinator(
+        pdf_path=file_path,
+        job_id=profile_job_id,
+        output_dir=agent_output_dir,
+        model=settings.IMAGE_MODEL,
+        settings={
+            "planner_model": settings.IMAGE_MODEL,
+            "vlm_model": settings.IMAGE_MODEL,
+            "model": settings.HIERARCHY_LLM_MODEL or settings.NORMOL_MODEL,
+        },
+    )
+    agent_profile = coordinator.run_coarse()
+    routing_category = PdfRoutingCategory.normalize(agent_profile.routing_category)
+    profile = ParserDocumentProfile(
+        file_type="pdf",
+        category=agent_profile.category,
+        routing_category=routing_category,
+        is_scanned=agent_profile.is_scanned,
+        page_count=coordinator.blackboard.page_count,
+        language=agent_profile.language,
+        reasoning=agent_profile.rationale,
+        category_rationale=agent_profile.category_rationale,
+        metrics={
+            "doc_stats": coordinator.blackboard.doc_stats,
+            "doc_shape": coordinator.blackboard.global_signals.get("doc_shape", {}),
+            "page_kind_counts": coordinator.blackboard.global_signals.get(
+                "page_kind_counts",
+                {},
+            ),
+        },
+    )
+
+    if profile.page_count > settings.MAX_PDF_PAGE_LIMIT:
+        raise_if_oversized_pdf_not_supported(page_count=profile.page_count)
+        if not profile.is_atlas:
+            try:
+                profile.anatomy = coordinator.run_structural()
+            except Exception as exc:
+                raise build_oversized_pdf_processing_failed_exception(
+                    page_count=profile.page_count,
+                    original_exception=exc,
+                ) from exc
+
+    return profile
+
+
+__all__ = ["profile_document"]
diff --git a/apps/worker/app/services/document_parser/profiling/profile_model.py b/apps/worker/app/services/document_parser/profiling/profile_model.py
new file mode 100644
index 00000000..e553ca41
--- /dev/null
+++ b/apps/worker/app/services/document_parser/profiling/profile_model.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from typing import Any
+
+from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
+
+
+@dataclass
+class ParserDocumentProfile:
+    """Parser-entry document profile used for routing and PDF anatomy reuse."""
+
+    file_type: str
+    category: str = "unknown document"
+    routing_category: PdfRoutingCategory = PdfRoutingCategory.GENERIC
+    is_scanned: bool = False
+    page_count: int = 0
+    language: str = "unknown"
+    reasoning: str = ""
+    category_rationale: str = ""
+    anatomy: Any | None = None
+    metrics: dict[str, Any] = field(default_factory=dict)
+
+    @property
+    def is_pdf(self) -> bool:
+        return self.file_type == "pdf"
+
+    @property
+    def is_atlas(self) -> bool:
+        return self.routing_category is PdfRoutingCategory.ATLAS
+
+    @property
+    def has_structural_anatomy(self) -> bool:
+        return self.anatomy is not None
+
+    def to_dict(self) -> dict[str, Any]:
+        data = asdict(self)
+        data["routing_category"] = self.routing_category.value
+        if self.anatomy is not None and hasattr(self.anatomy, "to_dict"):
+            data["anatomy"] = self.anatomy.to_dict()
+        else:
+            data["anatomy"] = None
+        return data
+
+    def summary(self) -> str:
+        parts = (
+            f"[{self.file_type.upper()}] category={self.category}, "
+            f"routing={self.routing_category.value}, "
+            f"scanned={self.is_scanned}, pages={self.page_count}"
+        )
+        if self.has_structural_anatomy:
+            parts += ", anatomy=True"
+        return parts
+
+
+__all__ = ["ParserDocumentProfile"]
diff --git a/apps/worker/app/services/document_parser/profiling/taxonomy.py b/apps/worker/app/services/document_parser/profiling/taxonomy.py
new file mode 100644
index 00000000..2fb0303d
--- /dev/null
+++ b/apps/worker/app/services/document_parser/profiling/taxonomy.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from enum import Enum
+
+
+class PdfRoutingCategory(str, Enum):
+    ATLAS = "atlas"
+    GENERIC = "generic"
+    SCANNED = "scanned"
+    SLIDES = "slides"
+
+    @classmethod
+    def normalize(cls, value: object) -> "PdfRoutingCategory":
+        """Map a label (or a member, returned as-is) to a member; unknown -> GENERIC."""
+        label = value.value if isinstance(value, cls) else str(value or "")
+        raw = label.strip().lower().replace("-", "_").replace(" ", "_")
+        atlas = {"atlas", "engineering_atlas", "drawing_atlas", "drawing_collection"}
+        scanned = {"scan", "scanned", "scanned_pdf", "image_only"}
+        slides = {"slide", "slides", "ppt", "pptx", "presentation"}
+        table = {cls.ATLAS: atlas, cls.SCANNED: scanned, cls.SLIDES: slides}
+        return next((m for m, names in table.items() if raw in names), cls.GENERIC)
+
+
+__all__ = ["PdfRoutingCategory"]
diff --git a/apps/worker/tests/contract/test_parse_task_contract.py b/apps/worker/tests/contract/test_parse_task_contract.py
index 355966cd..c2f33121 100644
--- a/apps/worker/tests/contract/test_parse_task_contract.py
+++ b/apps/worker/tests/contract/test_parse_task_contract.py
@@ -499,13 +499,10 @@ def test_oversized_pdf_shard_failure_preserves_processing_error(
     monkeypatch.setenv("S3_TEMP_PATH", str(tmp_path))
 
     from app.services.document_parser.formats.pdf import parser as pdf_parser
+    from app.services.document_parser.profiling.profile_model import ParserDocumentProfile
+    from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
     from shared.core.exceptions.domain_exceptions import PDFParsingException
 
-    class _Profile:
-        route = "standard"
-        doc_category = "generic"
-        page_count = 2
-
     monkeypatch.setattr(pdf_parser.settings, "MAX_PDF_PAGE_LIMIT", 1)
 
     def _fail_oversized_parse(*args, **kwargs):
@@ -519,7 +516,12 @@ def _fail_oversized_parse(*args, **kwargs):
             "source.pdf",
             str(tmp_path),
             {},
-            profile=_Profile(),
+            profile=ParserDocumentProfile(
+                file_type="pdf",
+                category="generic document",
+                routing_category=PdfRoutingCategory.GENERIC,
+                page_count=2,
+            ),
         )
 
     assert exc_info.value.details["reason"] == "OVERSIZED_SHARD_PIPELINE_FAILED"
@@ -546,17 +548,14 @@ def test_oversized_pdf_happy_path_uses_shard_pipeline_without_external_services(
         TocResult,
     )
     from app.services.document_parser.formats.pdf import parser as pdf_parser
+    from app.services.document_parser.profiling.profile_model import ParserDocumentProfile
+    from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
 
     pdf_path = tmp_path / "oversized.pdf"
     output_dir = tmp_path / "output"
     output_dir.mkdir()
     _write_blank_pdf(pdf_path, page_count=3)
 
-    class _Profile:
-        route = "standard"
-        doc_category = "generic"
-        page_count = 3
-
     calls: dict[str, object] = {}
     parse_s3_keys: list[str | None] = []
     deleted_s3_keys: list[str] = []
@@ -566,46 +565,40 @@ def delete_upload_file(self, storage_key: str) -> bool:
             deleted_s3_keys.append(storage_key)
             return True
 
-    def _fake_run_doc_agent(pdf_path_arg: str, job_id: str, output_dir: str):
-        calls["doc_agent"] = {
-            "pdf_path": pdf_path_arg,
-            "job_id": job_id,
-            "output_dir": output_dir,
-        }
-        return PageAnatomyMap(
-            job_id=job_id,
-            file_path=pdf_path_arg,
-            page_count=3,
-            page_features=[],
-            page_labels=[],
-            toc_result=TocResult(toc_pages=[1], method="vlm_batch"),
-            h1_result=H1BoundaryResult(method="toc_grep"),
-            shard_plan=ShardPlan(
-                enabled=True,
-                reason="too_large",
-                shards=[
-                    Shard(
-                        shard_index=0,
-                        page_start=1,
-                        page_end=2,
-                        page_offset=0,
-                        anchor_type="h1_boundary",
-                        anchor_evidence="Chapter 1",
-                        confidence=0.9,
-                    ),
-                    Shard(
-                        shard_index=1,
-                        page_start=3,
-                        page_end=3,
-                        page_offset=2,
-                        anchor_type="h1_boundary",
-                        anchor_evidence="Chapter 2",
-                        confidence=0.9,
-                    ),
-                ],
-            ),
-            toc_hierarchies=[{"toc_tree": {"Chapter 1": {}, "Chapter 2": {}}}],
-        )
+    anatomy = PageAnatomyMap(
+        job_id="job-oversized",
+        file_path=str(pdf_path),
+        page_count=3,
+        page_features=[],
+        page_labels=[],
+        toc_result=TocResult(toc_pages=[1], method="vlm_batch"),
+        h1_result=H1BoundaryResult(method="toc_grep"),
+        shard_plan=ShardPlan(
+            enabled=True,
+            reason="too_large",
+            shards=[
+                Shard(
+                    shard_index=0,
+                    page_start=1,
+                    page_end=2,
+                    page_offset=0,
+                    anchor_type="h1_boundary",
+                    anchor_evidence="Chapter 1",
+                    confidence=0.9,
+                ),
+                Shard(
+                    shard_index=1,
+                    page_start=3,
+                    page_end=3,
+                    page_offset=2,
+                    anchor_type="h1_boundary",
+                    anchor_evidence="Chapter 2",
+                    confidence=0.9,
+                ),
+            ],
+        ),
+        toc_hierarchies=[{"toc_tree": {"Chapter 1": {}, "Chapter 2": {}}}],
+    )
 
     def _fake_split_pdf(pdf_path_arg, shards, work_dir, exclude_pages=None):
         calls["exclude_pages"] = exclude_pages
@@ -643,10 +636,6 @@ def _identity_eval_md_headings(
 
     monkeypatch.setattr(pdf_parser.settings, "MAX_PDF_PAGE_LIMIT", 2)
     monkeypatch.setattr(pdf_parser.settings, "MINERU_SHARD_CONCURRENCY", 1)
-    monkeypatch.setattr(
-        "app.services.document_parser.formats.pdf.shard_splitter.run_doc_agent",
-        _fake_run_doc_agent,
-    )
     monkeypatch.setattr(
         "app.services.document_parser.formats.pdf.shard_splitter.split_pdf",
         _fake_split_pdf,
@@ -671,14 +660,19 @@ def _identity_eval_md_headings(
             "model_name": "mock-model",
             "hierarchy_model_name": "mock-model",
         },
-        profile=_Profile(),
+        profile=ParserDocumentProfile(
+            file_type="pdf",
+            category="generic document",
+            routing_category=PdfRoutingCategory.GENERIC,
+            page_count=3,
+            anatomy=anatomy,
+        ),
         relative_root="oversized.pdf",
         s3_key="uploads/job-oversized.pdf",
         job_id="job-oversized",
     )
 
     assert calls["exclude_pages"] == {1}
-    assert calls["doc_agent"]["job_id"] == "job-oversized"
     assert len(calls["heading_dirs"]) == 2
     expected_s3_keys = [
         "tmp/mineru-shards/job-oversized/shard_0.pdf",

From 474ef68595a5d15d36336c7bd50ecd4fdcb137ae Mon Sep 17 00:00:00 2001
From: chengke <404835780@qq.com>
Date: Wed, 10 Jun 2026 16:15:57 +0800
Subject: [PATCH 2/4] feat: add TOC profiling support and upgrade Qwen models
 to 3.6-flash

---
 apps/api/.env.example                         |   5 +-
 apps/worker/.env.example                      |   5 +-
 .../services/document_agent/coordinator.py    | 210 ++++-
 .../app/services/document_agent/manifest.py   |  17 +-
 .../tools/extract_toc_with_boundaries.py      |  82 +-
 .../document_agent/tools/match_h1_pages.py    |  70 +-
 .../app/services/document_agent/validators.py |   2 +-
 .../document_parser/formats/image/parser.py   |   6 +-
 .../formats/markdown/parser.py                |   4 +
 .../document_parser/formats/pdf/parser.py     | 197 +++--
 .../document_parser/profiling/doc_profiler.py |  37 +-
 .../profiling/profile_model.py                |  28 +-
 .../structure/body_boundary.py                |  72 ++
 .../structure/heading_hierarchy.py            |   2 +
 .../structure/layout_parser.py                |  25 +
 .../test_doc_profile_anatomy_contract.py      | 783 ++++++++++++++++++
 .../contract/test_parse_task_contract.py      |  34 +-
 .../shared-python/shared/core/config/ai.py    |   4 +-
 .../shared/core/config/storage.py             |   8 +
 .../shared/services/retrieval/llm_adapter.py  |   4 +-
 20 files changed, 1402 insertions(+), 193 deletions(-)
 create mode 100644 apps/worker/app/services/document_parser/structure/body_boundary.py
 create mode 100644 apps/worker/tests/contract/test_doc_profile_anatomy_contract.py

diff --git a/apps/api/.env.example b/apps/api/.env.example
index 139aa41c..820e849a 100644
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@@ -81,8 +81,8 @@ ARK_API_KEY=
 # ARK_URL=https://ark.cn-beijing.volces.com/api/v3/chat/completions
 # NORMOL_MODEL=deepseek-chat
 # HIERARCHY_LLM_MODEL=
-# IMAGE_MODEL=qwen3.5-flash
-# IMAGE_MODEL_MAX=qwen3.5-flash
+# IMAGE_MODEL=qwen3.6-flash
+# IMAGE_MODEL_MAX=qwen3.6-flash
 
 # Optional retrieval overrides have code defaults. Retrieval is evidence-only:
 # evidence_text is the primary output and answer_text is always empty. Set
@@ -95,6 +95,7 @@ MAX_FILE_SIZE=314572800
 MAX_PDF_PAGE_LIMIT=200
 OVERSIZED_PDF_SHARD_ENABLED=true
 OVERSIZED_PDF_SOFT_LIMIT=1500
+PDF_PROFILE_TOC_ENABLED=false
 MINERU_SHARD_CONCURRENCY=3
 
 # Required for specific features: webhooks and callbacks
diff --git a/apps/worker/.env.example b/apps/worker/.env.example
index cc2a3687..c9029a58 100644
--- a/apps/worker/.env.example
+++ b/apps/worker/.env.example
@@ -86,8 +86,8 @@ ARK_API_KEY=
 # ARK_URL=https://ark.cn-beijing.volces.com/api/v3/chat/completions
 # NORMOL_MODEL=deepseek-chat
 # HIERARCHY_LLM_MODEL=
-# IMAGE_MODEL=qwen3.5-flash
-# IMAGE_MODEL_MAX=qwen3.5-flash
+# IMAGE_MODEL=qwen3.6-flash
+# IMAGE_MODEL_MAX=qwen3.6-flash
 
 # Optional retrieval overrides have code defaults. Retrieval is evidence-only:
 # evidence_text is the primary output and answer_text is always empty. Set
@@ -120,6 +120,7 @@ MAX_FILE_SIZE=314572800
 MAX_PDF_PAGE_LIMIT=200
 OVERSIZED_PDF_SHARD_ENABLED=true
 OVERSIZED_PDF_SOFT_LIMIT=1500
+PDF_PROFILE_TOC_ENABLED=false
 MINERU_SHARD_CONCURRENCY=3
 
 # Legacy parser compatibility fields.
diff --git a/apps/worker/app/services/document_agent/coordinator.py b/apps/worker/app/services/document_agent/coordinator.py
index 374adbf3..bcb4edb9 100644
--- a/apps/worker/app/services/document_agent/coordinator.py
+++ b/apps/worker/app/services/document_agent/coordinator.py
@@ -17,7 +17,9 @@
 from app.services.document_agent.manifest import (
     DocumentProfile,
     PageAnatomyMap,
+    TocResult,
     ToolContext,
+    ToolResult,
 )
 from app.services.document_agent.persist import build_anatomy_map, persist_anatomy_map
 from app.services.document_agent.planner import ProfilePlanner
@@ -59,6 +61,7 @@ def __init__(
         self.trace = ParseRunRecorder(job_id=job_id, db=db)
         self.ctx.trace = self.trace
         self.round_index = 0
+        self._planner_cache: tuple[DocumentProfile, Any, ToolResult] | None = None
 
     def run(self) -> PageAnatomyMap:
         try:
@@ -81,41 +84,49 @@ def run_structural(self) -> PageAnatomyMap:
             self._record_failure(exc)
             raise
 
+    def run_toc(self) -> TocResult:
+        try:
+            return self._run_toc()
+        except Exception as exc:
+            logger.warning(
+                "[document_agent] TOC profiling failed, degrading to empty TOC: {}",
+                exc,
+            )
+            self.blackboard.toc_result = TocResult(
+                method="none",
+                notes=f"degraded: {type(exc).__name__}: {exc}",
+                failure_kind="degraded",
+            )
+            self.blackboard.toc_hierarchies = None
+            return self.blackboard.toc_result
+
+    def run_lightweight_anatomy(self) -> PageAnatomyMap:
+        try:
+            return self._run_lightweight_anatomy()
+        except Exception as exc:
+            self._record_failure(exc)
+            raise
+
     def _run_coarse(self) -> DocumentProfile:
         self.state = DocumentAgentState.RUNNING
         if not self.blackboard.page_features:
             self._run_bootstrap()
-        profile, _initial_decision, planner_result = ProfilePlanner(self.ctx).propose()
-        self.blackboard.document_profile = profile
-        self.blackboard.global_signals["document_profile"] = profile.to_dict()
-        self.trace.record_step(
-            round_index=self.round_index,
-            actor="planner:coarse",
-            action_type="plan",
-            result=planner_result,
-            tool_name=None,
-            tool_args={},
+        if self._should_run_toc_before_coarse():
+            self._ensure_toc_profile(strict=False)
+        profile, _initial_decision, _planner_result = self._propose_profile(
+            actor="planner:coarse"
         )
-        self.round_index += 1
         return profile
 
     def _run_structural(self) -> PageAnatomyMap:
         self.state = DocumentAgentState.RUNNING
         if not self.blackboard.page_features:
             self._run_bootstrap()
-        self._run_toc_pipeline()
-        profile, initial_decision, planner_result = ProfilePlanner(self.ctx).propose()
-        self.blackboard.document_profile = profile
-        self.blackboard.global_signals["document_profile"] = profile.to_dict()
-        self.trace.record_step(
-            round_index=self.round_index,
-            actor="planner",
-            action_type="plan",
-            result=planner_result,
-            tool_name=None,
-            tool_args={},
+        self._ensure_toc_profile(strict=True)
+        profile, initial_decision, _planner_result = self._propose_profile(
+            actor="planner"
         )
-        self.round_index += 1
+        self._run_h1_boundary_pipeline()
         executor_result = ReActExecutor(
             self.ctx,
             registry=REGISTRY,
@@ -125,6 +136,48 @@ def _run_structural(self) -> PageAnatomyMap:
         if executor_result.verdict.status != "success":
             raise RuntimeError(f"profile aborted: {executor_result.verdict.rationale}")
         anatomy = build_anatomy_map(self.ctx)
+        self._persist_ready_anatomy(anatomy)
+        return anatomy
+
+    def _run_toc(self) -> TocResult:
+        self.state = DocumentAgentState.RUNNING
+        if not self.blackboard.page_features:
+            self._run_bootstrap()
+        self._ensure_toc_profile(strict=False)
+        if self.blackboard.toc_result is None:
+            self.blackboard.toc_result = TocResult(
+                method="none",
+                notes="TOC extraction completed without a result",
+            )
+        return self.blackboard.toc_result
+
+    def _run_lightweight_anatomy(self) -> PageAnatomyMap:
+        self.state = DocumentAgentState.RUNNING
+        if not self.blackboard.page_features:
+            self._run_bootstrap()
+        if self.blackboard.toc_result is None:
+            self.blackboard.toc_result = TocResult(
+                method="none",
+                notes="TOC profiling disabled or not attempted",
+            )
+        self._run_h1_boundary_pipeline()
+        result = REGISTRY.dispatch("propose.shard_plan", self.ctx, {})
+        self.trace.record_step(
+            round_index=self.round_index,
+            actor="anatomy:propose.shard_plan",
+            action_type="anatomy",
+            result=result,
+            tool_name="propose.shard_plan",
+            tool_args={},
+        )
+        if result.status not in {"ok", "invalid"}:
+            raise RuntimeError(result.error or "propose.shard_plan failed")
+        self.round_index += 1
+        anatomy = build_anatomy_map(self.ctx)
+        self._persist_ready_anatomy(anatomy)
+        return anatomy
+
+    def _persist_ready_anatomy(self, anatomy: PageAnatomyMap) -> None:
         persist_result = persist_anatomy_map(self.ctx, {})
         self.trace.record_step(
             round_index=self.round_index,
@@ -144,7 +197,6 @@ def _run_structural(self) -> PageAnatomyMap:
             final_status="ready",
             summary=anatomy.trace_summary | self.trace.summary(),
         )
-        return anatomy
 
     def _record_failure(self, exc: Exception) -> None:
         logger.error(f"[document_agent] profile failed: {exc}")
@@ -175,21 +227,99 @@ def _run_bootstrap(self) -> None:
                 raise RuntimeError(result.error or f"{tool_name} failed")
             self.round_index += 1
 
-    def _run_toc_pipeline(self) -> None:
-        for tool_name in (
-            "find.toc_anchor_pages",
-            "extract.toc_with_boundaries",
-            "match.h1_pages",
-        ):
-            result = REGISTRY.dispatch(tool_name, self.ctx, {})
-            self.trace.record_step(
-                round_index=self.round_index,
-                actor=f"toc:{tool_name}",
-                action_type="toc",
-                result=result,
+    def _toc_result_requires_strict_retry(self) -> bool:
+        toc_result = self.blackboard.toc_result
+        return bool(
+            toc_result
+            and toc_result.method == "none"
+            and toc_result.failure_kind in {"confirm_failed", "degraded"}
+        )
+
+    def _should_run_toc_before_coarse(self) -> bool:
+        if self.ctx.settings.get("toc_before_coarse"):
+            return True
+        try:
+            page_limit = int(self.ctx.settings.get("toc_before_coarse_page_limit", 0))
+        except (TypeError, ValueError):
+            page_limit = 0
+        return page_limit > 0 and self.blackboard.page_count > page_limit
+
+    def _ensure_toc_profile(self, *, strict: bool) -> None:
+        should_run = self.blackboard.toc_result is None
+        if strict and self._toc_result_requires_strict_retry():
+            self.blackboard.toc_result = None
+            self.blackboard.toc_hierarchies = None
+            should_run = True
+
+        if not should_run:
+            return
+
+        self._planner_cache = None
+        try:
+            self._run_toc_extraction_pipeline()
+        except Exception as exc:
+            logger.warning(
+                "[document_agent] TOC profiling failed, "
+                "degrading to empty TOC: {}",
+                exc,
+            )
+            self.blackboard.toc_result = TocResult(
+                method="none",
+                notes=f"degraded: {type(exc).__name__}: {exc}",
+                failure_kind="degraded",
+            )
+            self.blackboard.toc_hierarchies = None
+            return
+
+        if self.blackboard.toc_result is None:
+            self.blackboard.toc_result = TocResult(
+                method="none",
+                notes="TOC extraction completed without a result",
+            )
+
+    def _propose_profile(self, *, actor: str) -> tuple[DocumentProfile, Any, ToolResult]:
+        if self._planner_cache is not None:
+            return self._planner_cache
+
+        profile, initial_decision, planner_result = ProfilePlanner(self.ctx).propose()
+        self.blackboard.document_profile = profile
+        self.blackboard.global_signals["document_profile"] = profile.to_dict()
+        self.trace.record_step(
+            round_index=self.round_index,
+            actor=actor,
+            action_type="plan",
+            result=planner_result,
+            tool_name=None,
+            tool_args={},
+        )
+        self.round_index += 1
+        self._planner_cache = (profile, initial_decision, planner_result)
+        return self._planner_cache
+
+    def _dispatch_profile_tool(self, *, tool_name: str, actor: str) -> ToolResult:
+        result = REGISTRY.dispatch(tool_name, self.ctx, {})
+        self.trace.record_step(
+            round_index=self.round_index,
+            actor=actor,
+            action_type="toc",
+            result=result,
+            tool_name=tool_name,
+            tool_args={},
+        )
+        if result.status not in {"ok", "invalid"}:
+            raise RuntimeError(result.error or f"{tool_name} failed")
+        self.round_index += 1
+        return result
+
+    def _run_toc_extraction_pipeline(self) -> None:
+        for tool_name in ("find.toc_anchor_pages", "extract.toc_with_boundaries"):
+            self._dispatch_profile_tool(
                 tool_name=tool_name,
-                tool_args={},
+                actor=f"toc:{tool_name}",
             )
-            if result.status not in {"ok", "invalid"}:
-                raise RuntimeError(result.error or f"{tool_name} failed")
-            self.round_index += 1
+
+    def _run_h1_boundary_pipeline(self) -> None:
+        self._dispatch_profile_tool(
+            tool_name="match.h1_pages",
+            actor="toc:match.h1_pages",
+        )
diff --git a/apps/worker/app/services/document_agent/manifest.py b/apps/worker/app/services/document_agent/manifest.py
index b89f6164..2518e282 100644
--- a/apps/worker/app/services/document_agent/manifest.py
+++ b/apps/worker/app/services/document_agent/manifest.py
@@ -8,6 +8,7 @@
 
 
 PageKind = Literal["normal", "table_heavy", "image_heavy", "low_content", "landscape"]
+TocFailureKind = Literal["none", "confirm_failed", "rejected_all", "degraded"]
 
 ReflexionAction = Literal["tool_call", "verdict_now"]
 VerdictStatus = Literal["success", "abort"]
@@ -107,16 +108,30 @@ def to_dict(self) -> dict[str, Any]:
         return asdict(self)
 
 
+@dataclass
+class TocEvidence:
+    page_index: int
+    source: str
+    confidence: float
+    reason: str = ""
+
+    def to_dict(self) -> dict[str, Any]:
+        return asdict(self)
+
+
 @dataclass
 class TocResult:
     toc_pages: list[int] = field(default_factory=list)
-    candidates: list[TocCandidate] = field(default_factory=list)
+    candidates: list[TocAnchorPage] = field(default_factory=list)
+    evidence: list[TocEvidence] = field(default_factory=list)
     method: Literal["toc_marker", "vlm_progressive", "vlm_batch", "visual_scan", "none"] = "none"
     notes: str = ""
+    failure_kind: TocFailureKind = "none"
 
     def to_dict(self) -> dict[str, Any]:
         data = asdict(self)
         data["candidates"] = [candidate.to_dict() for candidate in self.candidates]
+        data["evidence"] = [item.to_dict() for item in self.evidence]
         return data
 
 
diff --git a/apps/worker/app/services/document_agent/tools/extract_toc_with_boundaries.py b/apps/worker/app/services/document_agent/tools/extract_toc_with_boundaries.py
index 4a424a5c..ddb876e4 100644
--- a/apps/worker/app/services/document_agent/tools/extract_toc_with_boundaries.py
+++ b/apps/worker/app/services/document_agent/tools/extract_toc_with_boundaries.py
@@ -13,6 +13,7 @@
 
 from app.services.document_agent.manifest import (
     TocAnchorPage,
+    TocEvidence,
     TocResult,
     ToolContext,
     ToolResult,
@@ -67,12 +68,12 @@ def _vlm_confirm_anchors(
     anchor_pages: list[TocAnchorPage],
     model: str,
     budget: Any | None = None,
-) -> tuple[list[TocAnchorPage], bool]:
+) -> tuple[list[TocAnchorPage], bool, list[TocEvidence]]:
     """Phase 1: send all anchor PNGs to VLM, ask which are real TOC starts."""
     from shared.services.ai.openai_compatible_client_sync import get_openai_client
 
     if not anchor_pages:
-        return [], False
+        return [], False, []
 
     import base64
 
@@ -120,7 +121,7 @@ def _vlm_confirm_anchors(
     est = estimate_tokens(str(content_parts[0]["text"])) + len(anchor_pages) * 800
     if budget and not budget.try_reserve("visual", est):
         logger.warning("[extract.toc] insufficient visual budget for anchor confirmation")
-        return [], True
+        return [], True, []
 
     try:
         client = get_openai_client(model=model)
@@ -144,18 +145,50 @@ def _vlm_confirm_anchors(
             items = []
 
         confirmed_pages: set[int] = set()
+        evidence_by_page: dict[int, TocEvidence] = {}
         for item in items:
-            if isinstance(item, dict) and item.get("is_toc_start"):
-                confirmed_pages.add(int(item["page"]))
+            if not isinstance(item, dict) or "page" not in item:
+                continue
+            page = int(item["page"])
+            is_toc_start = bool(item.get("is_toc_start"))
+            if is_toc_start:
+                confirmed_pages.add(page)
+            raw_confidence = item.get("confidence")
+            try:
+                confidence = (
+                    float(raw_confidence)
+                    if raw_confidence is not None
+                    else (0.95 if is_toc_start else 0.05)
+                )
+            except (TypeError, ValueError):
+                confidence = 0.95 if is_toc_start else 0.05
+            evidence_by_page[page] = TocEvidence(
+                page_index=page,
+                source="vlm",
+                confidence=max(0.0, min(1.0, confidence)),
+                reason=str(item.get("reason") or ""),
+            )
 
         confirmed = [a for a in anchor_pages if a.page in confirmed_pages]
         rejected = [a.page for a in anchor_pages if a.page not in confirmed_pages]
+        evidence = [
+            evidence_by_page.get(
+                a.page,
+                TocEvidence(
+                    page_index=a.page,
+                    source="vlm",
+                    confidence=0.05,
+                    reason="VLM response omitted this candidate page",
+                ),
+            )
+            for a in anchor_pages
+        ]
         logger.info(
             "[extract.toc] VLM confirmed {} TOC starts, rejected pages: {}",
             len(confirmed),
             rejected,
         )
-        return confirmed, False
+        return confirmed, False, evidence
     except Exception as exc:
         if budget:
             budget.refund("visual", est=est)
@@ -164,7 +197,7 @@ def _vlm_confirm_anchors(
             "falling back to no confirmed anchors (safe degradation)",
             exc,
         )
-        return [], True
+        return [], True, []
 
 
 # -- Main tool -----------------------------------------------------------------
@@ -190,6 +223,7 @@ def extract_toc_with_boundaries(
         ctx.blackboard.toc_result = TocResult(
             method="none",
             notes="No TOC anchor pages found by find.toc_anchor_pages",
+            failure_kind="none",
         )
         return ToolResult(
             status="ok",
@@ -203,6 +237,7 @@ def extract_toc_with_boundaries(
         ctx.blackboard.toc_result = TocResult(
             method="none",
             notes="No VLM model configured for TOC extraction",
+            failure_kind="degraded",
         )
         return ToolResult(
             status="ok",
@@ -219,18 +254,40 @@ def extract_toc_with_boundaries(
     os.makedirs(output_dir, exist_ok=True)
 
     # -- Phase 1: VLM confirm anchors -----------------------------------------
-    confirmed, confirm_failed = _vlm_confirm_anchors(anchors, model, budget=ctx.budget)
+    confirmed, confirm_failed, confirm_evidence = _vlm_confirm_anchors(
+        anchors, model, budget=ctx.budget
+    )
     if confirm_failed:
         warnings.append("vlm_anchor_confirmation_failed")
     debug_info["phase1_confirmed"] = [a.page for a in confirmed]
-    debug_info["phase1_rejected"] = [
-        a.page for a in anchors if a not in confirmed
-    ]
+    debug_info["phase1_rejected"] = (
+        [] if confirm_failed else [a.page for a in anchors if a not in confirmed]
+    )
+    if confirm_failed:
+        debug_info["phase1_unconfirmed"] = [a.page for a in anchors]
 
     if not confirmed:
+        if confirm_failed:
+            ctx.blackboard.toc_result = TocResult(
+                candidates=list(anchors),
+                evidence=confirm_evidence,
+                method="none",
+                notes="VLM anchor confirmation failed; TOC candidates left unconfirmed",
+                failure_kind="confirm_failed",
+            )
+            return ToolResult(
+                status="ok",
+                payload={"toc_count": 0},
+                latency_ms=int((time.monotonic() - start) * 1000),
+                warnings=warnings,
+                debug=debug_info,
+            )
         ctx.blackboard.toc_result = TocResult(
+            candidates=list(anchors),
+            evidence=confirm_evidence,
             method="none",
             notes="VLM rejected all TOC anchor candidates",
+            failure_kind="rejected_all",
         )
         return ToolResult(
             status="ok",
@@ -364,12 +421,14 @@ def extract_toc_with_boundaries(
 
     ctx.blackboard.toc_result = TocResult(
         toc_pages=all_toc_pages_sorted,
+        evidence=confirm_evidence,
         method="vlm_batch",
         notes=(
             f"VLM confirmed {len(confirmed)} TOC starts, "
             f"batch classify+extract found {toc_region_count} regions, "
             f"toc_pages={all_toc_pages_sorted}"
         ),
+        failure_kind="none",
     )
     ctx.blackboard.toc_hierarchies = toc_hierarchies if toc_hierarchies else None
     ctx.blackboard.global_signals["vlm_toc_entries"] = {
@@ -421,4 +480,3 @@ def extract_toc_with_boundaries(
         warnings=warnings,
         debug=debug_info,
     )
-
diff --git a/apps/worker/app/services/document_agent/tools/match_h1_pages.py b/apps/worker/app/services/document_agent/tools/match_h1_pages.py
index 1a6d0d07..7cae13bf 100644
--- a/apps/worker/app/services/document_agent/tools/match_h1_pages.py
+++ b/apps/worker/app/services/document_agent/tools/match_h1_pages.py
@@ -5,9 +5,7 @@
 import base64
 import json
 import os
-import re
 import time
-import unicodedata
 from typing import Any, cast
 
 from app.services.document_agent.manifest import (
@@ -19,42 +17,12 @@
 from app.services.document_agent.pdf_text import read_page_texts
 from app.services.document_agent.registry import has_toc_result, register_tool
 from app.services.document_agent.visual import render_pages
-from loguru import logger
-
-
-# ── Text normalization for matching ──────────────────────────────────────
-
-_LEADING_NUMBER_RE = re.compile(
-    r"""^
-    (?:
-        [#]+\s*
-        | 第\s*[零一二三四五六七八九十百千\d]+\s*[章节篇部分]
-        | [零一二三四五六七八九十百千]+\s*[、。，,]
-        | [（(]\s*[零一二三四五六七八九十百千\d]+\s*[）)]
-        | \d+(?:\.\d+)*\.?\s*
-        | [IVXLCDM]+\.?\s+
-        | [A-Za-z]\.\s+
-        | Chapter\s+\w+\s*
-    )
-    """,
-    re.VERBOSE | re.IGNORECASE,
+from app.services.document_parser.structure.body_boundary import (
+    clean_toc_title,
+    extract_level1_titles,
+    normalize_heading_text,
 )
-
-_PAGE_SUFFIX_RE = re.compile(r"[\s\.\-·…]+\d+\s*$")
-
-
-def _normalize(text: str) -> str:
-    """Normalize text for fuzzy heading matching."""
-    text = unicodedata.normalize("NFKC", text or "")
-    text = re.sub(r"\s+", " ", text).strip()
-    return text
-
-
-def _clean_toc_title(title: str) -> str:
-    """Remove leading numbering/hashes and trailing page numbers from a TOC title."""
-    cleaned = _PAGE_SUFFIX_RE.sub("", title or "").strip()
-    cleaned = _LEADING_NUMBER_RE.sub("", cleaned).strip()
-    return cleaned
+from loguru import logger
 
 
 # ── C1: Unified grep matching ────────────────────────────────────────────
@@ -80,14 +48,14 @@ def grep_titles_in_pages(
     unmatched: list[str] = []
 
     for title in titles:
-        normalized_title = _normalize(title)
+        normalized_title = normalize_heading_text(title)
         found = False
         for page in search_pages:
             text = page_texts.get(page, "")
-            if normalized_title in _normalize(text):
+            if normalized_title in normalize_heading_text(text):
                 matched_line = ""
                 for line in text.splitlines():
-                    if normalized_title in _normalize(line):
+                    if normalized_title in normalize_heading_text(line):
                         matched_line = line.strip()[:100]
                         break
                 candidates.append(
@@ -130,30 +98,20 @@ def extract_children_titles(
         in_scope = False
         for entry in entries:
             if entry.get("level") == 1:
-                cleaned = _clean_toc_title(entry.get("heading", ""))
-                in_scope = _normalize(cleaned) == _normalize(parent_title)
+                cleaned = clean_toc_title(entry.get("heading", ""))
+                in_scope = normalize_heading_text(cleaned) == normalize_heading_text(
+                    parent_title
+                )
                 continue
             if in_scope and entry.get("level") == 2:
-                cleaned = _clean_toc_title(entry.get("heading", ""))
+                cleaned = clean_toc_title(entry.get("heading", ""))
                 if cleaned and len(cleaned) >= 2:
                     titles.append(cleaned)
     return titles
 
 
 def _extract_level1_titles(toc_hierarchies: list[dict[str, Any]]) -> list[str]:
-    """Extract level-1 titles from toc_hierarchies.
-
-    Each hierarchy dict contains ``toc_tree`` – a nested dict where top-level
-    keys are level-1 headings (values are sub-heading dicts).
-    """
-    titles: list[str] = []
-    for hier in toc_hierarchies:
-        toc_tree = hier.get("toc_tree") or {}
-        for raw_title in toc_tree.keys():
-            cleaned = _clean_toc_title(raw_title)
-            if cleaned and len(cleaned) >= 2:
-                titles.append(cleaned)
-    return titles
+    return extract_level1_titles(toc_hierarchies)
 
 
 # ── C2: Lazy VLM verification ────────────────────────────────────────────
diff --git a/apps/worker/app/services/document_agent/validators.py b/apps/worker/app/services/document_agent/validators.py
index 6f6f01f5..12ea930b 100644
--- a/apps/worker/app/services/document_agent/validators.py
+++ b/apps/worker/app/services/document_agent/validators.py
@@ -35,7 +35,7 @@ def validate_shard_plan(
         if shard.page_offset != shard.page_start - 1:
             errors.append(f"shard {shard.shard_index} page_offset mismatch")
         length = shard.page_end - shard.page_start + 1
-        if plan.enabled and length > max_pages:
+        if length > max_pages:
             errors.append(f"shard {shard.shard_index} exceeds max_pages={max_pages}")
         if plan.enabled and length < min_pages:
             if is_last:
diff --git a/apps/worker/app/services/document_parser/formats/image/parser.py b/apps/worker/app/services/document_parser/formats/image/parser.py
index 251e2e28..df7555bb 100755
--- a/apps/worker/app/services/document_parser/formats/image/parser.py
+++ b/apps/worker/app/services/document_parser/formats/image/parser.py
@@ -55,7 +55,7 @@ def perceptual_hash(data: bytes) -> str:
 
 def _get_vision_client() -> OpenAICompatibleClientSync:
     """Create OpenAI-compatible client for vision models, auto-routing by IMAGE_MODEL name."""
-    image_model = settings.IMAGE_MODEL or "qwen-vl-plus"
+    image_model = settings.IMAGE_MODEL or "qwen3.6-flash"
     return get_openai_client(model=image_model)
 
 
@@ -141,9 +141,9 @@ def ask_image(
     urls_ = process_img_path4read(valid_paths, image_root_dir, size_cut)
 
     if task in ("summary-images", "atlas-page-info"):
-        image_model = settings.IMAGE_MODEL or "gpt-4-vision-preview"
+        image_model = settings.IMAGE_MODEL or "qwen3.6-flash"
     else:  # OCR and image type classification use higher-capability models
-        image_model = settings.IMAGE_MODEL_MAX or "gpt-4-vision-preview"
+        image_model = settings.IMAGE_MODEL_MAX or "qwen3.6-flash"
 
     if len(urls_) > 0:
         prompt, temperature, top_p, max_tokens = build_prompt(
diff --git a/apps/worker/app/services/document_parser/formats/markdown/parser.py b/apps/worker/app/services/document_parser/formats/markdown/parser.py
index 7fdfd4df..925bd524 100755
--- a/apps/worker/app/services/document_parser/formats/markdown/parser.py
+++ b/apps/worker/app/services/document_parser/formats/markdown/parser.py
@@ -113,6 +113,7 @@ def eval_md_headings(
     model_name=None,
     output_dir=None,
     layout_json_path=None,
+    is_first_shard=True,
 ):
     """Evaluate markdown headings with optional TOC hierarchies context"""
     heading_preds = predict_heading_hierarchy(
@@ -125,6 +126,7 @@ def eval_md_headings(
             model_name=model_name,
             output_dir=output_dir,
             layout_json_path=layout_json_path,
+            is_first_shard=is_first_shard,
         )
     )
 
@@ -225,6 +227,7 @@ def parse_md(
     relative_root=None,
     toc_hierarchies=None,
     lines_with_heading=None,
+    is_first_shard=True,
 ):
     if lines_with_heading is not None:
         # ── Phase A bypass ──
@@ -312,6 +315,7 @@ def parse_md(
                 model_name=hierarchy_model_name,
                 output_dir=output_dir,
                 layout_json_path=layout_json_path,
+                is_first_shard=is_first_shard,
             )
 
     # ── Phase B: MarkdownParseState traversal ──
diff --git a/apps/worker/app/services/document_parser/formats/pdf/parser.py b/apps/worker/app/services/document_parser/formats/pdf/parser.py
index 08cfab8d..791cc40e 100755
--- a/apps/worker/app/services/document_parser/formats/pdf/parser.py
+++ b/apps/worker/app/services/document_parser/formats/pdf/parser.py
@@ -9,7 +9,9 @@
 )
 from app.services.document_parser.providers.mineru.pdf_service import parse_via_full
 from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
+from app.services.document_parser.structure.toc_parser import detect_tocs_in_texts
 from app.services.document_parser.support.stage_profiler import stage_timer
+from app.services.document_parser.support.text_helpers import normalize_md
 from loguru import logger
 
 from shared.core.config import settings
@@ -37,19 +39,27 @@ def parse_pdfs(
             pdf_path, output_dir, base_llm_paras, relative_root, profile=profile
         )
 
-    # ── Oversized PDF: doc_agent → shard → parallel MinerU → merge → parse_md ──
-    if profile and profile.page_count > settings.MAX_PDF_PAGE_LIMIT:
-        logger.info(
-            f"📄 Oversized PDF: {profile.page_count} pages > "
-            f"{settings.MAX_PDF_PAGE_LIMIT} limit, entering shard pipeline"
-        )
+    # ── Unified anatomy path: DOC_PROFILE → shard wrapper → parse_md Phase B ──
+    if profile and getattr(profile, "anatomy", None) is not None:
+        is_oversized = profile.page_count > settings.MAX_PDF_PAGE_LIMIT
+        if is_oversized:
+            logger.info(
+                f"📄 Oversized PDF: {profile.page_count} pages > "
+                f"{settings.MAX_PDF_PAGE_LIMIT} limit, entering shard pipeline"
+            )
+        else:
+            logger.info(
+                f"📄 Profile anatomy detected, entering PDF shard pipeline for {filename}"
+            )
         try:
-            return _parse_oversized_pdf(
+            return _parse_pdf_via_shards(
                 pdf_path, filename, output_dir, base_llm_paras,
                 profile=profile, relative_root=relative_root, s3_key=s3_key,
                 job_id=job_id,
             )
         except Exception as exc:
+            if not is_oversized:
+                raise
             logger.exception(
                 "Oversized PDF shard pipeline failed for {} (pages={})",
                 filename,
@@ -77,11 +87,11 @@ def parse_pdfs(
         )
 
 
-def _parse_oversized_pdf(
+def _parse_pdf_via_shards(
     pdf_path, filename, output_dir, base_llm_paras,
     profile=None, relative_root=None, s3_key=None, job_id=None,
 ):
-    """Handle PDFs exceeding MinerU's page limit via shard-first hierarchy.
+    """Handle PDFs via the unified shard-first hierarchy pipeline.
 
     Pipeline:
     1. DOC_AGENT → shard plan + TOC
@@ -99,8 +109,14 @@ def _parse_oversized_pdf(
         eval_md_headings,
         merge_html_tables,
     )
-    from app.services.document_parser.formats.pdf.shard_merger import merge_images, merge_shard_lines
-    from app.services.document_parser.formats.pdf.shard_splitter import bin_pack_shards, split_pdf
+    from app.services.document_parser.formats.pdf.shard_merger import (
+        merge_images,
+        merge_shard_lines,
+    )
+    from app.services.document_parser.formats.pdf.shard_splitter import (
+        bin_pack_shards,
+        split_pdf,
+    )
 
     work_dir: str | None = None
     temp_shard_s3_keys: list[str] = []
@@ -110,21 +126,20 @@ def _parse_oversized_pdf(
         anatomy = getattr(profile, "anatomy", None)
         if anatomy is None:
             raise RuntimeError(
-                f"Oversized PDF profile for {filename} is missing structural anatomy"
+                f"PDF profile for {filename} is missing structural anatomy"
             )
-        if not anatomy.shard_plan.enabled or not anatomy.shard_plan.shards:
+        if not anatomy.shard_plan.shards:
             raise RuntimeError(
-                f"Oversized PDF profile for {filename} did not produce a shard plan"
+                f"PDF profile for {filename} did not produce a shard plan"
             )
 
         agent_shards = anatomy.shard_plan.shards
 
         # 2. Extract TOC info from anatomy for page exclusion and heading constraint
         toc_pages: set[int] = set()
-        toc_hierarchies = None
+        toc_hierarchies = anatomy.toc_hierarchies
         if anatomy.toc_result and anatomy.toc_result.toc_pages:
             toc_pages = set(anatomy.toc_result.toc_pages)
-            toc_hierarchies = anatomy.toc_hierarchies
             logger.info(
                 f"📌 DOC_AGENT TOC detected: {len(toc_pages)} pages to exclude "
                 f"({sorted(toc_pages)}), "
@@ -146,55 +161,77 @@ def _parse_oversized_pdf(
                 f"({ms.page_count} pages)"
             )
 
-        # 4. Physically split PDF (exclude TOC pages if detected)
-        work_dir = os.path.join(output_dir, "_shards")
-        os.makedirs(work_dir, exist_ok=True)
-        with stage_timer("pdf.split", filename=filename):
-            shard_pdf_paths, _page_remap = split_pdf(
-                pdf_path, merged_shards, work_dir,
-                exclude_pages=toc_pages if toc_pages else None,
-            )
+        fast_path_original_pdf = len(merged_shards) == 1 and not toc_pages
 
-        temp_shard_s3_keys = [
-            _build_temp_shard_s3_key(
-                source_s3_key=s3_key,
-                job_id=job_id,
-                filename=filename,
-                shard_index=shard_index,
-            )
-            for shard_index, _shard_pdf_path in enumerate(shard_pdf_paths)
-        ]
-
-        # 5. Parse each shard via MinerU (parallel)
-        shard_output_dirs: list[str | None] = [None] * len(shard_pdf_paths)
+        # 4. Parse via MinerU. The 1-shard/no-TOC case keeps the original
+        # PDF/S3 object to avoid temporary split/upload churn.
+        shard_output_dirs: list[str | None]
         concurrency = settings.MINERU_SHARD_CONCURRENCY
 
-        def _parse_single_shard(shard_idx, shard_pdf):
-            assert work_dir is not None
-            shard_out = os.path.join(work_dir, f"shard_{shard_idx}_output")
-            os.makedirs(shard_out, exist_ok=True)
-            shard_filename = (
-                f"{os.path.splitext(filename)[0]}_shard{shard_idx}.pdf"
-            )
-            shard_s3_key = temp_shard_s3_keys[shard_idx]
-            logger.info(
-                f"  🔄 MinerU shard_{shard_idx}: parsing via S3 URL "
-                f"({shard_s3_key})"
-            )
-            parse_via_full(shard_pdf, shard_filename, shard_out, s3_key=shard_s3_key)
-            return shard_out
-
-        with stage_timer(
-            "pdf.mineru_parallel", filename=filename, shard_count=len(shard_pdf_paths)
-        ):
-            with ThreadPoolExecutor(max_workers=concurrency) as executor:
-                futures = {
-                    executor.submit(_parse_single_shard, i, shard_pdf_path): i
-                    for i, shard_pdf_path in enumerate(shard_pdf_paths)
-                }
-                for future in as_completed(futures):
-                    idx = futures[future]
-                    shard_output_dirs[idx] = future.result()
+        if fast_path_original_pdf:
+            logger.info("📄 Single shard without TOC pages; using original PDF fast path")
+            with stage_timer("pdf.extract.single_shard_fast", filename=filename):
+                parse_via_full(pdf_path, filename, output_dir, s3_key=s3_key)
+            shard_output_dirs = [output_dir]
+        else:
+            # Physically split PDF when TOC pages must be excluded or multiple
+            # MinerU requests are required.
+            work_dir = os.path.join(output_dir, "_shards")
+            os.makedirs(work_dir, exist_ok=True)
+            with stage_timer("pdf.split", filename=filename):
+                shard_pdf_paths, _page_remap = split_pdf(
+                    pdf_path, merged_shards, work_dir,
+                    exclude_pages=toc_pages if toc_pages else None,
+                )
+
+            if not shard_pdf_paths:
+                raise RuntimeError(
+                    f"PDF shard split for {filename} produced no shard PDFs"
+                )
+
+            temp_shard_s3_keys = [
+                _build_temp_shard_s3_key(
+                    source_s3_key=s3_key,
+                    job_id=job_id,
+                    filename=filename,
+                    shard_index=shard_index,
+                )
+                for shard_index, _shard_pdf_path in enumerate(shard_pdf_paths)
+            ]
+
+            # 5. Parse each shard via MinerU (parallel)
+            shard_output_dirs = [None] * len(shard_pdf_paths)
+
+            def _parse_single_shard(shard_idx, shard_pdf):
+                assert work_dir is not None
+                shard_out = os.path.join(work_dir, f"shard_{shard_idx}_output")
+                os.makedirs(shard_out, exist_ok=True)
+                shard_filename = (
+                    f"{os.path.splitext(filename)[0]}_shard{shard_idx}.pdf"
+                )
+                shard_s3_key = temp_shard_s3_keys[shard_idx]
+                logger.info(
+                    f"  🔄 MinerU shard_{shard_idx}: parsing via S3 URL "
+                    f"({shard_s3_key})"
+                )
+                parse_via_full(
+                    shard_pdf, shard_filename, shard_out, s3_key=shard_s3_key
+                )
+                return shard_out
+
+            with stage_timer(
+                "pdf.mineru_parallel",
+                filename=filename,
+                shard_count=len(shard_pdf_paths),
+            ):
+                with ThreadPoolExecutor(max_workers=concurrency) as executor:
+                    futures = {
+                        executor.submit(_parse_single_shard, i, shard_pdf_path): i
+                        for i, shard_pdf_path in enumerate(shard_pdf_paths)
+                    }
+                    for future in as_completed(futures):
+                        idx = futures[future]
+                        shard_output_dirs[idx] = future.result()
 
         # 6. Per-shard heading prediction (parallel)
         @dataclass
@@ -204,12 +241,16 @@ class ShardHeadingResult:
             heading_count: int
 
         smart_parse = base_llm_paras.get("smart_title_parse", True)
+        toc_model_name = base_llm_paras.get("model_name", settings.NORMOL_MODEL)
         hierarchy_model_name = (
             base_llm_paras.get("hierarchy_model_name")
             or base_llm_paras.get("model_name", settings.NORMOL_MODEL)
         )
 
-        def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingResult:
+        def _predict_shard_headings(
+            shard_idx: int,
+            shard_out_dir: str,
+        ) -> ShardHeadingResult:
             """Run full heading prediction pipeline on a single shard's full.md."""
             md_path = os.path.join(shard_out_dir, "full.md")
             if not os.path.exists(md_path):
@@ -220,10 +261,18 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR
             md_lines = [line.strip() for line in md_lines if line.strip() != ""]
             md_lines = merge_html_tables(md_lines)
 
-            # TOC context: first TOC shared by all shards; subsequent TOCs assigned
-            # by page boundary. For simplicity, all TOCs are passed since pred_titles
-            # only matches headings actually present in this shard's content.
+            is_first_shard = shard_idx == 0
             shard_toc = toc_hierarchies
+            if shard_toc is None and is_first_shard and _md_has_toc_keyword(md_lines):
+                logger.info(
+                    f"📌 shard_{shard_idx}: TOC keyword found without profile TOC; "
+                    "reusing markdown TOC detector"
+                )
+                shard_toc, md_lines = detect_tocs_in_texts(
+                    md_lines,
+                    model_name=toc_model_name,
+                    hierarchy_model_name=hierarchy_model_name,
+                )
 
             lines_with_heading = eval_md_headings(
                 md_lines,
@@ -237,6 +286,7 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR
                     if os.path.exists(os.path.join(shard_out_dir, "layout.json"))
                     else None
                 ),
+                is_first_shard=is_first_shard,
             )
 
             heading_count = sum(1 for line in lines_with_heading if line.startswith("#"))
@@ -250,7 +300,9 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR
                 heading_count=heading_count,
             )
 
-        shard_heading_results: list[ShardHeadingResult | None] = [None] * len(shard_output_dirs)
+        shard_heading_results: list[ShardHeadingResult | None] = [None] * len(
+            shard_output_dirs
+        )
 
         with stage_timer(
             "pdf.shard_headings", filename=filename, shard_count=len(shard_output_dirs)
@@ -274,7 +326,7 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR
 
         # Compute level offsets: continuation shards get shifted deeper.
         shard_offsets: list[int] = []
-        for shard in agent_shards:
+        for shard in agent_shards[: len(complete_heading_results)]:
             if shard.is_continuation:
                 shard_offsets.append(max(shard.split_depth - 1, 0))
             else:
@@ -296,7 +348,8 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR
         )
 
         with stage_timer("pdf.merge_images", filename=filename):
-            merge_images(shard_output_dirs, output_dir)
+            if not fast_path_original_pdf:
+                merge_images(shard_output_dirs, output_dir)
 
         logger.info("✅ Shard-first hierarchy complete, entering parse_md Phase B")
 
@@ -314,6 +367,11 @@ def _predict_shard_headings(shard_idx: int, shard_out_dir: str) -> ShardHeadingR
         _cleanup_local_shard_workspace(work_dir)
 
 
+def _md_has_toc_keyword(md_lines: list[str]) -> bool:
+    markers = frozenset(("目录", "目次", "tableofcontents", "contents"))
+    return any(normalize_md(entry) in markers for entry in md_lines)
+
+
 def _build_temp_shard_s3_key(
     *,
     source_s3_key: str | None,
@@ -334,6 +392,7 @@ def _source_key_stem(source_s3_key: str | None) -> str | None:
     stem, _extension = os.path.splitext(key_name)
     return stem or None
 
+
 def _sanitize_temp_storage_segment(value: object) -> str:
     normalized = re.sub(r"[^A-Za-z0-9_.-]+", "-", str(value)).strip(".-")
     return normalized or "document"
diff --git a/apps/worker/app/services/document_parser/profiling/doc_profiler.py b/apps/worker/app/services/document_parser/profiling/doc_profiler.py
index 28076bfb..8944c7ab 100644
--- a/apps/worker/app/services/document_parser/profiling/doc_profiler.py
+++ b/apps/worker/app/services/document_parser/profiling/doc_profiler.py
@@ -9,7 +9,11 @@
     build_oversized_pdf_processing_failed_exception,
     raise_if_oversized_pdf_not_supported,
 )
-from app.services.document_parser.profiling.profile_model import ParserDocumentProfile
+from app.services.document_parser.profiling.profile_model import (
+    ParserDocumentProfile,
+    ParserTocProfile,
+    TocEvidence,
+)
 from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
 
 from shared.core.config import settings
@@ -67,6 +71,8 @@ def _profile_pdf(
             "planner_model": settings.IMAGE_MODEL,
             "vlm_model": settings.IMAGE_MODEL,
             "model": settings.HIERARCHY_LLM_MODEL or settings.NORMOL_MODEL,
+            "toc_before_coarse": settings.PDF_PROFILE_TOC_ENABLED,
+            "toc_before_coarse_page_limit": settings.MAX_PDF_PAGE_LIMIT,
         },
     )
     agent_profile = coordinator.run_coarse()
@@ -95,13 +101,42 @@ def _profile_pdf(
         if not profile.is_atlas:
             try:
                 profile.anatomy = coordinator.run_structural()
+                profile.toc = _map_toc_profile(coordinator)
             except Exception as exc:
                 raise build_oversized_pdf_processing_failed_exception(
                     page_count=profile.page_count,
                     original_exception=exc,
                 ) from exc
+    elif settings.PDF_PROFILE_TOC_ENABLED:
+        if not profile.is_atlas:
+            profile.anatomy = coordinator.run_lightweight_anatomy()
+        profile.toc = _map_toc_profile(coordinator)
 
     return profile
 
 
+def _map_toc_profile(coordinator: ProfileCoordinator) -> ParserTocProfile:
+    """Copy the coordinator blackboard's TOC result into a ParserTocProfile."""
+    result = coordinator.blackboard.toc_result
+    if result is None:
+        return ParserTocProfile()
+    copied_evidence = [
+        TocEvidence(
+            page_index=entry.page_index,
+            source=entry.source,
+            confidence=entry.confidence,
+            reason=entry.reason,
+        )
+        for entry in result.evidence
+    ]
+    return ParserTocProfile(
+        toc_pages=list(result.toc_pages),
+        hierarchies=coordinator.blackboard.toc_hierarchies,
+        evidence=copied_evidence,
+        source="none" if result.method == "none" else "pdf_vlm",
+        method=result.method,
+        notes=result.notes,
+    )
+
+
 __all__ = ["profile_document"]
diff --git a/apps/worker/app/services/document_parser/profiling/profile_model.py b/apps/worker/app/services/document_parser/profiling/profile_model.py
index e553ca41..9334ae77 100644
--- a/apps/worker/app/services/document_parser/profiling/profile_model.py
+++ b/apps/worker/app/services/document_parser/profiling/profile_model.py
@@ -6,6 +6,28 @@
 from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
 
 
+@dataclass
+class TocEvidence:  # one page-level TOC evidence record
+    page_index: int  # page the evidence refers to (indexing base not shown here)
+    source: str  # producer tag, e.g. "vlm" -- TODO confirm the full set of values
+    confidence: float  # presumably within [0.0, 1.0]; not enforced by this class
+    reason: str = ""  # free-text explanation; defaults to empty
+
+
+@dataclass
+class ParserTocProfile:  # TOC summary held in ParserDocumentProfile.toc
+    toc_pages: list[int] = field(default_factory=list)  # TOC page numbers
+    hierarchies: list[dict[str, Any]] | None = None  # TOC hierarchy payloads
+    evidence: list[TocEvidence] = field(default_factory=list)  # per-page evidence
+    source: str = "none"  # the PDF profiler sets "pdf_vlm" or "none"
+    method: str = "none"  # extraction method label; "none" when no TOC was extracted
+    notes: str = ""  # free-text notes about how the TOC result was obtained
+
+    @property
+    def has_toc(self) -> bool:  # True when toc_pages or hierarchies is non-empty
+        return bool(self.toc_pages or self.hierarchies)
+
+
 @dataclass
 class ParserDocumentProfile:
     """Parser-entry document profile used for routing and PDF anatomy reuse."""
@@ -18,6 +40,8 @@ class ParserDocumentProfile:
     language: str = "unknown"
     reasoning: str = ""
     category_rationale: str = ""
+    toc: ParserTocProfile = field(default_factory=ParserTocProfile)
+    granularity: str = "page"
     anatomy: Any | None = None
     metrics: dict[str, Any] = field(default_factory=dict)
 
@@ -48,9 +72,11 @@ def summary(self) -> str:
             f"routing={self.routing_category.value}, "
             f"scanned={self.is_scanned}, pages={self.page_count}"
         )
+        if self.toc.has_toc:
+            parts += f", toc={self.toc.method}"
         if self.has_structural_anatomy:
             parts += ", anatomy=True"
         return parts
 
 
-__all__ = ["ParserDocumentProfile"]
+__all__ = ["ParserDocumentProfile", "ParserTocProfile", "TocEvidence"]
diff --git a/apps/worker/app/services/document_parser/structure/body_boundary.py b/apps/worker/app/services/document_parser/structure/body_boundary.py
new file mode 100644
index 00000000..a7ad6d3d
--- /dev/null
+++ b/apps/worker/app/services/document_parser/structure/body_boundary.py
@@ -0,0 +1,72 @@
+"""Line-based body boundary helpers for TOC-derived headings."""
+
+from __future__ import annotations
+
+import re
+import unicodedata
+from typing import Any
+
+
+_LEADING_NUMBER_RE = re.compile(
+    r"""^
+    (?:
+        [#]+\s*
+        | 第\s*[零一二三四五六七八九十百千\d]+\s*[章节篇部分]
+        | [零一二三四五六七八九十百千]+\s*[、。，,]
+        | [（(]\s*[零一二三四五六七八九十百千\d]+\s*[）)]
+        | \d+(?:\.\d+)*\.?\s*
+        | [IVXLCDM]+\.?\s+
+        | [A-Za-z]\.\s+
+        | Chapter\s+\w+\s*
+    )
+    """,
+    re.VERBOSE | re.IGNORECASE,
+)
+
+_PAGE_SUFFIX_RE = re.compile(r"[\s\.\-·…]+\d+\s*$")
+
+
+def normalize_heading_text(text: str) -> str:
+    """Return *text* NFKC-normalized with whitespace runs collapsed and trimmed."""
+    folded = unicodedata.normalize("NFKC", text or "")
+    collapsed = re.sub(r"\s+", " ", folded)
+    return collapsed.strip()
+
+
+def clean_toc_title(title: str) -> str:
+    """Strip the trailing page-number suffix and leading numbering/hash prefix."""
+    without_page = _PAGE_SUFFIX_RE.sub("", title or "")
+    without_number = _LEADING_NUMBER_RE.sub("", without_page.strip())
+    return without_number.strip()
+
+
+def extract_level1_titles(toc_hierarchies: list[dict[str, Any]]) -> list[str]:
+    """Collect cleaned top-level ``toc_tree`` keys from TOC hierarchy payloads."""
+    level1: list[str] = []
+    for hierarchy in toc_hierarchies:
+        tree = hierarchy.get("toc_tree") or {}
+        candidates = (clean_toc_title(str(key)) for key in tree.keys())
+        # Keep only titles that still have at least two characters after
+        # cleaning; anything shorter is discarded.
+        level1.extend(title for title in candidates if len(title) >= 2)
+    return level1
+
+
+def find_first_body_boundary(
+    lines: list[str],
+    level1_titles: list[str],
+) -> int | None:
+    """Return the first line index containing a TOC level-1 title, or None."""
+    normalized_titles = [
+        normalized
+        for normalized in map(normalize_heading_text, level1_titles)
+        if normalized
+    ]
+    if not normalized_titles:
+        return None
+
+    for index, line in enumerate(lines):
+        normalized_line = normalize_heading_text(line.lstrip("#").strip())
+        if any(title in normalized_line for title in normalized_titles):
+            return index
+    return None
diff --git a/apps/worker/app/services/document_parser/structure/heading_hierarchy.py b/apps/worker/app/services/document_parser/structure/heading_hierarchy.py
index 20a9b170..30f6f0f1 100644
--- a/apps/worker/app/services/document_parser/structure/heading_hierarchy.py
+++ b/apps/worker/app/services/document_parser/structure/heading_hierarchy.py
@@ -20,6 +20,7 @@ class HeadingHierarchyInput:
     output_dir: str | None = None
     layout_json_path: str | None = None
     first_toc_ele_num: int | None = None
+    is_first_shard: bool = True
 
 
 def predict_heading_hierarchy(heading_input: HeadingHierarchyInput) -> pd.DataFrame:
@@ -34,4 +35,5 @@ def predict_heading_hierarchy(heading_input: HeadingHierarchyInput) -> pd.DataFr
         output_dir=heading_input.output_dir,
         layout_json_path=heading_input.layout_json_path,
         first_toc_ele_num=heading_input.first_toc_ele_num,
+        is_first_shard=heading_input.is_first_shard,
     )
diff --git a/apps/worker/app/services/document_parser/structure/layout_parser.py b/apps/worker/app/services/document_parser/structure/layout_parser.py
index 7c2545a6..6c49910c 100755
--- a/apps/worker/app/services/document_parser/structure/layout_parser.py
+++ b/apps/worker/app/services/document_parser/structure/layout_parser.py
@@ -20,6 +20,10 @@
 from app.services.document_parser.structure.heading_tree import (
     tree_to_dataframe as heading_tree_to_dataframe,
 )
+from app.services.document_parser.structure.body_boundary import (
+    extract_level1_titles,
+    find_first_body_boundary,
+)
 from app.services.document_parser.support.stage_profiler import stage_timer
 from app.services.document_parser.tables.table_text_parser import df2md
 from gevent.pool import Pool as GeventPool
@@ -415,6 +419,12 @@ def _resolve_first_toc_boundary(toc_hierarchies=None, first_toc_ele_num=None):
     return resolved_start
 
 
+def _first_toc_range_unit(toc_hierarchies=None) -> str | None:
+    """Return the first hierarchy's ``toc_range_unit``; None without hierarchies."""
+    head = toc_hierarchies[0] if toc_hierarchies else {}
+    return head.get("toc_range_unit")
+
+
 def pred_titles(
     infos,
     doc_type,
@@ -426,6 +436,7 @@ def pred_titles(
     output_dir=None,
     layout_json_path=None,
     first_toc_ele_num=None,
+    is_first_shard=True,
 ):
     """
     predict title hierarchy
@@ -441,6 +452,7 @@ def pred_titles(
         output_dir: output directory for saving intermediate CSV results
         layout_json_path: path to layout.json for META features (optional)
         first_toc_ele_num: ele_num of the first TOC block in DOCX (for pre-TOC exclusion)
+        is_first_shard: whether PDF-derived markdown belongs to the first document shard
     """
     model_name = _resolve_hierarchy_model_name(model_name)
     logger.info(
@@ -465,6 +477,18 @@ def pred_titles(
     first_toc_start = None
     if doc_type == "md":
         first_toc_start = _resolve_first_toc_boundary(toc_hierarchies=toc_hierarchies)
+        if (
+            first_toc_start is None
+            and is_first_shard
+            and _first_toc_range_unit(toc_hierarchies) == "page"
+        ):
+            level1_titles = extract_level1_titles(toc_hierarchies or [])
+            first_toc_start = find_first_body_boundary(infos, level1_titles)
+            if first_toc_start is not None:
+                logger.info(
+                    f"📌 Demoting PDF front matter before first TOC H1 line "
+                    f"(id < {first_toc_start})"
+                )
     elif doc_type == "docx":
         first_toc_start = _resolve_first_toc_boundary(
             toc_hierarchies=toc_hierarchies,
@@ -494,6 +518,7 @@ def pred_titles(
         and len(toc_hierarchies) > 1
         and doc_type in {"md", "docx"}
         and smart_parse
+        and _first_toc_range_unit(toc_hierarchies) != "page"
     ):
         # Multiple TOCs divide the document into independent zones.
         # Each zone gets its own naive + LLM pipeline with zone-specific TOC context.
diff --git a/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py b/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py
new file mode 100644
index 00000000..f1d108ad
--- /dev/null
+++ b/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py
@@ -0,0 +1,783 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from types import SimpleNamespace
+
+os.environ.setdefault("DATABASE_URL", "postgresql+asyncpg://test:test@localhost/test")
+os.environ.setdefault("TMP_PATH", "/tmp/knowhere-test")
+os.environ.setdefault("S3_BUCKET_NAME", "test-uploads")
+os.environ.setdefault("S3_ACCESS_KEY_ID", "test")
+os.environ.setdefault("S3_SECRET_ACCESS_KEY", "test")
+os.environ.setdefault("S3_TEMP_PATH", "/tmp")  # presumably read when app.* is imported
+
+from app.services.document_agent.coordinator import ProfileCoordinator
+from app.services.document_agent.manifest import (
+    DocumentProfile,
+    H1BoundaryResult,
+    PageAnatomyMap,
+    PageFeature,
+    PageLabel,
+    Shard,
+    ShardPlan,
+    TocAnchorPage,
+    TocEvidence as AgentTocEvidence,
+    TocResult,
+    ToolResult,
+)
+from app.services.document_agent.validators import validate_shard_plan
+from app.services.document_parser.profiling.doc_profiler import profile_document
+from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
+
+
+def _page_feature(page: int = 1) -> PageFeature:
+    return PageFeature(  # fixed fixture values; only the page number varies
+        page=page,
+        raw_text_length=20,
+        text_density=0.1,
+        image_coverage=0.0,
+        image_count=0,
+        table_count=0,
+        drawings_count=0,
+        orientation="portrait",
+        width=72.0,
+        height=72.0,
+        is_blank_like=False,
+        text_lines_preview=["Section 1"],
+    )
+
+
+def test_run_toc_degrades_to_empty_result_on_standard_failure(tmp_path: Path) -> None:
+    coordinator = ProfileCoordinator(
+        pdf_path=str(tmp_path / "standard.pdf"),
+        job_id="job-toc-fail-soft",
+        output_dir=str(tmp_path / "profile"),
+    )
+    coordinator.blackboard.page_count = 1
+    coordinator.blackboard.page_features = [_page_feature()]
+
+    def _fail_toc_extraction() -> None:  # stand-in pipeline that always raises
+        raise RuntimeError("VLM JSON parse failed")
+
+    coordinator._run_toc_extraction_pipeline = _fail_toc_extraction  # type: ignore[method-assign]
+
+    toc_result = coordinator.run_toc()  # expected to return, not re-raise
+
+    assert toc_result.method == "none"
+    assert toc_result.toc_pages == []
+    assert toc_result.failure_kind == "degraded"
+    assert "degraded" in toc_result.notes
+    assert coordinator.blackboard.toc_hierarchies is None  # no hierarchy recorded
+
+
+def test_run_lightweight_anatomy_builds_single_shard_without_planner_llm(
+    tmp_path: Path,
+) -> None:
+    output_dir = tmp_path / "profile"
+    coordinator = ProfileCoordinator(
+        pdf_path=str(tmp_path / "standard.pdf"),
+        job_id="job-lightweight",
+        output_dir=str(output_dir),
+        settings={"shard_threshold": 200},  # the 2-page fixture is far below this
+    )
+    coordinator.blackboard.page_count = 2
+    coordinator.blackboard.page_features = [_page_feature(1), _page_feature(2)]
+    coordinator.blackboard.page_labels = [
+        PageLabel(page=1, kind="normal", confidence=1.0),
+        PageLabel(page=2, kind="normal", confidence=1.0),
+    ]
+    coordinator.blackboard.doc_stats = {"page_count": 2}
+    coordinator.blackboard.global_signals["page_kind_counts"] = {"normal": 2}
+    coordinator.blackboard.document_profile = DocumentProfile(
+        is_scanned=False,
+        category="Research Report",
+        routing_category=PdfRoutingCategory.GENERIC.value,
+    )
+    coordinator.blackboard.toc_result = TocResult(method="none")
+
+    anatomy = coordinator.run_lightweight_anatomy()
+
+    assert anatomy.shard_plan.enabled is False
+    assert len(anatomy.shard_plan.shards) == 1  # one shard spanning pages 1-2
+    assert anatomy.shard_plan.shards[0].page_start == 1
+    assert anatomy.shard_plan.shards[0].page_end == 2
+    assert anatomy.toc_result.method == "none"
+    assert (output_dir / "anatomy_map.json").exists()  # anatomy written to disk
+
+
+def test_run_structural_retries_transient_confirm_failed_toc_result(
+    monkeypatch,
+    tmp_path: Path,
+) -> None:
+    coordinator = ProfileCoordinator(
+        pdf_path=str(tmp_path / "oversized.pdf"),
+        job_id="job-suspect-empty-toc",
+        output_dir=str(tmp_path / "profile"),
+    )
+    (tmp_path / "profile").mkdir()
+    coordinator.blackboard.page_count = 3  # NOTE(review): only pages 1-2 seeded below
+    coordinator.blackboard.page_features = [_page_feature(1), _page_feature(2)]
+    coordinator.blackboard.page_labels = [
+        PageLabel(page=1, kind="normal", confidence=1.0),
+        PageLabel(page=2, kind="normal", confidence=1.0),
+    ]
+    coordinator.blackboard.doc_stats = {"page_count": 3}
+    coordinator.blackboard.global_signals["page_kind_counts"] = {"normal": 3}
+    coordinator.blackboard.document_profile = DocumentProfile(
+        is_scanned=False,
+        category="Prospectus",
+        routing_category=PdfRoutingCategory.GENERIC.value,
+    )
+    coordinator.blackboard.toc_result = TocResult(
+        candidates=[
+            TocAnchorPage(page=17, png_path="/tmp/toc_anchor_page_17.png", source="text_scan")
+        ],
+        evidence=[
+            AgentTocEvidence(
+                page_index=17,
+                source="vlm",
+                confidence=0.05,
+                reason="rejected",
+            )
+        ],
+        method="none",
+        notes="VLM anchor confirmation failed; TOC candidates left unconfirmed",
+        failure_kind="confirm_failed",  # the kind this test expects to be retried
+    )
+
+    calls: list[str] = []  # records the order in which stubbed stages run
+
+    def fake_toc_extraction() -> None:
+        calls.append("toc")
+        coordinator.blackboard.toc_result = TocResult(toc_pages=[17], method="vlm_batch")
+        coordinator.blackboard.toc_hierarchies = [
+            {"toc_range": [17, 17], "toc_range_unit": "page", "toc_tree": {}}
+        ]
+
+    def fake_h1_boundary() -> None:
+        calls.append("h1")
+        coordinator.blackboard.h1_result = H1BoundaryResult(method="toc_grep")
+
+    def fake_persist(_anatomy):
+        calls.append("persist")
+
+    monkeypatch.setattr(coordinator, "_run_toc_extraction_pipeline", fake_toc_extraction)
+    monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary)
+    monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist)
+
+    from app.services.document_agent import coordinator as coordinator_module
+
+    monkeypatch.setattr(
+        coordinator_module.ProfilePlanner,
+        "propose",
+        lambda self: (
+            coordinator.blackboard.document_profile,
+            None,
+            ToolResult(status="ok", payload={}),
+        ),
+    )
+
+    class FakeExecutor:  # replaces ReActExecutor; writes a fixed one-shard plan
+        def __init__(self, *_args, **_kwargs) -> None:
+            pass
+
+        def run(self):
+            coordinator.blackboard.shard_plan = ShardPlan(
+                enabled=True,
+                reason="too_large",
+                shards=[
+                    Shard(
+                        shard_index=0,
+                        page_start=1,
+                        page_end=3,
+                        page_offset=0,
+                        anchor_type="forced_max_size",
+                        anchor_evidence="fixture",
+                        confidence=1.0,
+                    )
+                ],
+            )
+            return SimpleNamespace(
+                success=True,
+                verdict=SimpleNamespace(status="success", rationale="ok"),
+                trace_summary={},
+            )
+
+    monkeypatch.setattr(coordinator_module, "ReActExecutor", FakeExecutor)
+
+    anatomy = coordinator.run_structural()
+
+    assert calls[:2] == ["toc", "h1"]  # TOC extraction re-ran before H1 detection
+    assert anatomy.toc_result.toc_pages == [17]  # value written by the retry stub
+
+
+def test_run_structural_trusts_rejected_all_toc_and_fails_open(
+    monkeypatch,
+    tmp_path: Path,
+) -> None:
+    coordinator = ProfileCoordinator(
+        pdf_path=str(tmp_path / "oversized.pdf"),
+        job_id="job-rejected-all-toc-fail-open",
+        output_dir=str(tmp_path / "profile"),
+    )
+    (tmp_path / "profile").mkdir()
+    coordinator.blackboard.page_count = 3  # NOTE(review): only pages 1-2 seeded below
+    coordinator.blackboard.page_features = [_page_feature(1), _page_feature(2)]
+    coordinator.blackboard.page_labels = [
+        PageLabel(page=1, kind="normal", confidence=1.0),
+        PageLabel(page=2, kind="normal", confidence=1.0),
+    ]
+    coordinator.blackboard.doc_stats = {"page_count": 3}
+    coordinator.blackboard.global_signals["page_kind_counts"] = {"normal": 3}
+    coordinator.blackboard.document_profile = DocumentProfile(
+        is_scanned=False,
+        category="Prospectus",
+        routing_category=PdfRoutingCategory.GENERIC.value,
+    )
+    coordinator.blackboard.toc_result = TocResult(
+        candidates=[
+            TocAnchorPage(
+                page=17,
+                png_path="/tmp/toc_anchor_page_17.png",
+                source="text_scan",
+            )
+        ],
+        method="none",
+        notes="VLM rejected all TOC anchor candidates",
+        failure_kind="rejected_all",  # expected to be kept as-is, with no re-run
+    )
+
+    def fake_toc_extraction() -> None:  # fails the test if extraction is retried
+        raise AssertionError("rejected_all should be trusted and not retried")
+
+    calls: list[str] = []
+
+    def fake_h1_boundary() -> None:
+        calls.append("h1")
+        coordinator.blackboard.h1_result = H1BoundaryResult(method="none")
+
+    def fake_persist(_anatomy):
+        calls.append("persist")
+
+    monkeypatch.setattr(coordinator, "_run_toc_extraction_pipeline", fake_toc_extraction)
+    monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary)
+    monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist)
+
+    from app.services.document_agent import coordinator as coordinator_module
+
+    monkeypatch.setattr(
+        coordinator_module.ProfilePlanner,
+        "propose",
+        lambda self: (
+            coordinator.blackboard.document_profile,
+            None,
+            ToolResult(status="ok", payload={}),
+        ),
+    )
+
+    class FakeExecutor:  # replaces ReActExecutor; writes a fixed one-shard plan
+        def __init__(self, *_args, **_kwargs) -> None:
+            pass
+
+        def run(self):
+            coordinator.blackboard.shard_plan = ShardPlan(
+                enabled=True,
+                reason="too_large",
+                shards=[
+                    Shard(
+                        shard_index=0,
+                        page_start=1,
+                        page_end=3,
+                        page_offset=0,
+                        anchor_type="forced_max_size",
+                        anchor_evidence="fixture",
+                        confidence=1.0,
+                    )
+                ],
+            )
+            return SimpleNamespace(
+                success=True,
+                verdict=SimpleNamespace(status="success", rationale="ok"),
+                trace_summary={},
+            )
+
+    monkeypatch.setattr(coordinator_module, "ReActExecutor", FakeExecutor)
+
+    anatomy = coordinator.run_structural()
+
+    assert calls == ["h1", "persist"]  # the TOC stub never ran
+    assert anatomy.toc_result.failure_kind == "rejected_all"
+    assert anatomy.toc_result.toc_pages == []
+
+
+def test_run_coarse_runs_toc_before_planner_for_oversized_and_reuses_planner(
+    monkeypatch,
+    tmp_path: Path,
+) -> None:
+    coordinator = ProfileCoordinator(
+        pdf_path=str(tmp_path / "oversized.pdf"),
+        job_id="job-toc-before-coarse",
+        output_dir=str(tmp_path / "profile"),
+        settings={"toc_before_coarse_page_limit": 2},  # page_count=3 exceeds this
+    )
+    (tmp_path / "profile").mkdir()
+    coordinator.blackboard.page_count = 3  # NOTE(review): only pages 1-2 seeded below
+    coordinator.blackboard.page_features = [_page_feature(1), _page_feature(2)]
+    coordinator.blackboard.page_labels = [
+        PageLabel(page=1, kind="normal", confidence=1.0),
+        PageLabel(page=2, kind="normal", confidence=1.0),
+    ]
+    coordinator.blackboard.doc_stats = {"page_count": 3}
+    coordinator.blackboard.global_signals["page_kind_counts"] = {"normal": 3}
+
+    calls: list[str] = []  # records the order in which stubbed stages run
+
+    def fake_toc_extraction() -> None:
+        calls.append("toc")
+        coordinator.blackboard.toc_result = TocResult(toc_pages=[17], method="vlm_batch")
+        coordinator.blackboard.toc_hierarchies = [
+            {"toc_range": [17, 17], "toc_range_unit": "page", "toc_tree": {}}
+        ]
+
+    def fake_h1_boundary() -> None:
+        calls.append("h1")
+        coordinator.blackboard.h1_result = H1BoundaryResult(method="toc_grep")
+
+    def fake_persist(_anatomy):
+        calls.append("persist")
+
+    monkeypatch.setattr(coordinator, "_run_toc_extraction_pipeline", fake_toc_extraction)
+    monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary)
+    monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist)
+
+    from app.services.document_agent import coordinator as coordinator_module
+
+    def fake_propose(_self):
+        calls.append("planner")
+        return (
+            DocumentProfile(
+                is_scanned=False,
+                category="Prospectus",
+                routing_category=PdfRoutingCategory.GENERIC.value,
+            ),
+            None,
+            ToolResult(status="ok", payload={}),
+        )
+
+    monkeypatch.setattr(coordinator_module.ProfilePlanner, "propose", fake_propose)
+
+    class FakeExecutor:  # replaces ReActExecutor; writes a fixed one-shard plan
+        def __init__(self, *_args, **_kwargs) -> None:
+            pass
+
+        def run(self):
+            coordinator.blackboard.shard_plan = ShardPlan(
+                enabled=True,
+                reason="too_large",
+                shards=[
+                    Shard(
+                        shard_index=0,
+                        page_start=1,
+                        page_end=3,
+                        page_offset=0,
+                        anchor_type="forced_max_size",
+                        anchor_evidence="fixture",
+                        confidence=1.0,
+                    )
+                ],
+            )
+            return SimpleNamespace(
+                success=True,
+                verdict=SimpleNamespace(status="success", rationale="ok"),
+                trace_summary={},
+            )
+
+    monkeypatch.setattr(coordinator_module, "ReActExecutor", FakeExecutor)
+
+    coordinator.run_coarse()
+    anatomy = coordinator.run_structural()
+
+    assert calls == ["toc", "planner", "h1", "persist"]  # each stage ran exactly once
+    assert anatomy.toc_result.toc_pages == [17]
+
+
+def test_anchor_confirmation_failure_requires_one_strict_retry(tmp_path: Path) -> None:
+    coordinator = ProfileCoordinator(
+        pdf_path=str(tmp_path / "oversized.pdf"),
+        job_id="job-confirm-failed-not-suspect",
+        output_dir=str(tmp_path / "profile"),
+    )
+    coordinator.blackboard.toc_result = TocResult(
+        candidates=[
+            TocAnchorPage(
+                page=17,
+                png_path="/tmp/toc_anchor_page_17.png",
+                source="text_scan",
+            )
+        ],
+        method="none",
+        notes="VLM anchor confirmation failed; TOC candidates left unconfirmed",
+        failure_kind="confirm_failed",  # unconfirmed candidate remains on the result
+    )
+
+    assert coordinator._toc_result_requires_strict_retry() is True  # flagged for retry
+
+
+def test_oversized_single_shard_plan_is_invalid() -> None:
+    report = validate_shard_plan(
+        ShardPlan(
+            enabled=False,
+            reason="not_needed",
+            shards=[
+                Shard(
+                    shard_index=0,
+                    page_start=1,
+                    page_end=407,  # one shard covering all 407 pages
+                    page_offset=0,
+                    anchor_type="forced_max_size",
+                    anchor_evidence="final shard",
+                    confidence=1.0,
+                )
+            ],
+        ),
+        page_count=407,
+        min_pages=20,
+        max_pages=200,  # the single shard above is larger than this
+    )
+
+    assert report.valid is False
+    assert report.errors == ["shard 0 exceeds max_pages=200"]  # exactly one error
+
+
+def test_standard_pdf_profile_toc_flag_off_preserves_current_behavior(
+    monkeypatch,
+    tmp_path: Path,
+) -> None:
+    from app.services.document_parser.profiling import doc_profiler
+
+    fake_instances: list[object] = []  # every FakeCoordinator registers itself here
+
+    class FakeCoordinator:
+        def __init__(self, **_kwargs) -> None:
+            self.calls: list[str] = []
+            self.blackboard = SimpleNamespace(
+                page_count=2,
+                doc_stats={"page_count": 2},
+                global_signals={},
+                toc_result=None,
+                toc_hierarchies=None,
+            )
+            fake_instances.append(self)
+
+        def run_coarse(self) -> DocumentProfile:
+            self.calls.append("run_coarse")
+            return DocumentProfile(
+                is_scanned=False,
+                category="Research Report",
+                routing_category=PdfRoutingCategory.GENERIC.value,
+            )
+
+        def run_toc(self) -> TocResult:  # any call fails the test
+            self.calls.append("run_toc")
+            raise AssertionError("run_toc should be flag-gated for standard PDFs")
+
+        def run_lightweight_anatomy(self):  # any call fails the test
+            self.calls.append("run_lightweight_anatomy")
+            raise AssertionError("lightweight anatomy should be flag-gated")
+
+    monkeypatch.setattr(doc_profiler, "ProfileCoordinator", FakeCoordinator)
+    monkeypatch.setattr(doc_profiler.settings, "PDF_PROFILE_TOC_ENABLED", False)
+    monkeypatch.setattr(doc_profiler.settings, "MAX_PDF_PAGE_LIMIT", 200)
+
+    profile = profile_document(
+        str(tmp_path / "standard.pdf"),
+        "standard.pdf",
+        job_id="job-flag-off",
+        output_dir=str(tmp_path),
+    )
+
+    assert profile.toc.has_toc is False
+    assert profile.anatomy is None  # no anatomy is attached with the flag off
+    assert fake_instances[0].calls == ["run_coarse"]  # only the coarse stage ran
+
+
+def test_standard_pdf_profile_toc_flag_on_builds_toc_and_lightweight_anatomy(
+    monkeypatch,
+    tmp_path: Path,
+) -> None:
+    from app.services.document_parser.profiling import doc_profiler
+
+    fake_anatomy = object()  # sentinel checked by identity at the end
+
+    class FakeCoordinator:
+        def __init__(self, **_kwargs) -> None:
+            self.calls: list[str] = []
+            self.blackboard = SimpleNamespace(
+                page_count=2,
+                doc_stats={"page_count": 2},
+                global_signals={},
+                toc_result=None,
+                toc_hierarchies=None,
+            )
+
+        def run_coarse(self) -> DocumentProfile:  # also seeds TOC state on blackboard
+            self.calls.append("run_coarse")
+            self.blackboard.toc_result = TocResult(
+                toc_pages=[2],
+                evidence=[
+                    AgentTocEvidence(
+                        page_index=2,
+                        source="vlm",
+                        confidence=0.95,
+                        reason="table of contents",
+                    )
+                ],
+                method="vlm_batch",
+            )
+            self.blackboard.toc_hierarchies = [
+                {"toc_range": [2, 2], "toc_range_unit": "page", "toc_tree": {}}
+            ]
+            return DocumentProfile(
+                is_scanned=False,
+                category="Research Report",
+                routing_category=PdfRoutingCategory.GENERIC.value,
+            )
+
+        def run_toc(self) -> TocResult:  # any call fails the test
+            self.calls.append("run_toc")
+            raise AssertionError("run_toc should be no-op after TOC-before-coarse")
+
+        def run_lightweight_anatomy(self):
+            self.calls.append("run_lightweight_anatomy")
+            return fake_anatomy
+
+    fake_instances: list[FakeCoordinator] = []
+
+    class CapturingCoordinator(FakeCoordinator):  # registers each created instance
+        def __init__(self, **kwargs) -> None:
+            super().__init__(**kwargs)
+            fake_instances.append(self)
+
+    monkeypatch.setattr(doc_profiler, "ProfileCoordinator", CapturingCoordinator)
+    monkeypatch.setattr(doc_profiler.settings, "PDF_PROFILE_TOC_ENABLED", True)
+    monkeypatch.setattr(doc_profiler.settings, "MAX_PDF_PAGE_LIMIT", 200)
+
+    profile = profile_document(
+        str(tmp_path / "standard.pdf"),
+        "standard.pdf",
+        job_id="job-flag-on",
+        output_dir=str(tmp_path),
+    )
+
+    assert fake_instances[0].calls == [
+        "run_coarse",
+        "run_lightweight_anatomy",
+    ]
+    assert profile.toc.has_toc is True
+    assert profile.toc.method == "vlm_batch"
+    assert profile.toc.evidence[0].confidence == 0.95
+    assert profile.anatomy is fake_anatomy  # stub's return value passed through
+
+
+def test_pdf_shard_pipeline_accepts_single_shard_fast_path(
+    monkeypatch,
+    tmp_path: Path,
+) -> None:
+    from app.services.document_parser.formats.markdown import parser as markdown_parser
+    from app.services.document_parser.formats.pdf import parser as pdf_parser
+    from app.services.document_parser.formats.pdf import shard_splitter
+
+    output_dir = tmp_path / "out"
+    output_dir.mkdir()
+    calls: list[str] = []  # ordered record of stubbed pipeline calls
+
+    def fake_parse_via_full(pdf_path, filename, out_dir, s3_key=None):
+        calls.append(f"parse:{filename}:{s3_key}")
+        Path(out_dir, "full.md").write_text("1. Introduction\nBody\n", encoding="utf-8")
+
+    def fail_split(*_args, **_kwargs):  # any call fails the test
+        raise AssertionError("single shard without TOC should not split")
+
+    def fake_eval_md_headings(md_lines, *_args, **_kwargs):
+        return [f"# {line}" if line.startswith("1.") else line for line in md_lines]
+
+    def fake_parse_md(*_args, **kwargs):
+        calls.append("parse_md")
+        return {"lines": kwargs["lines_with_heading"]}  # echo heading-marked lines
+
+    monkeypatch.setattr(pdf_parser, "parse_via_full", fake_parse_via_full)
+    monkeypatch.setattr(shard_splitter, "split_pdf", fail_split)
+    monkeypatch.setattr(markdown_parser, "eval_md_headings", fake_eval_md_headings)
+    monkeypatch.setattr(pdf_parser, "parse_md", fake_parse_md)
+
+    profile = SimpleNamespace(
+        anatomy=PageAnatomyMap(
+            job_id="job-single-shard",
+            file_path=str(tmp_path / "standard.pdf"),
+            page_count=2,
+            page_features=[_page_feature(1), _page_feature(2)],
+            page_labels=[
+                PageLabel(page=1, kind="normal", confidence=1.0),
+                PageLabel(page=2, kind="normal", confidence=1.0),
+            ],
+            toc_result=TocResult(method="none"),
+            h1_result=H1BoundaryResult(method="none"),
+            shard_plan=ShardPlan(
+                enabled=False,
+                reason="not_needed",
+                shards=[
+                    Shard(
+                        shard_index=0,
+                        page_start=1,
+                        page_end=2,
+                        page_offset=0,
+                        anchor_type="forced_max_size",
+                        anchor_evidence="document within shard threshold",
+                        confidence=1.0,
+                    )
+                ],
+            ),
+        )
+    )
+
+    result = pdf_parser._parse_pdf_via_shards(
+        str(tmp_path / "standard.pdf"),
+        "standard.pdf",
+        str(output_dir),
+        {"smart_title_parse": False, "model_name": "test-model"},
+        profile=profile,
+        s3_key="uploads/source.pdf",
+    )
+
+    assert calls == ["parse:standard.pdf:uploads/source.pdf", "parse_md"]
+    assert result["lines"] == ["# 1. Introduction", "Body"]  # from fake_parse_md
+
+
+def test_pdf_first_shard_reuses_markdown_toc_detector_when_profile_misses_toc(
+    monkeypatch,
+    tmp_path: Path,
+) -> None:
+    from app.services.document_parser.formats.markdown import parser as markdown_parser
+    from app.services.document_parser.formats.pdf import parser as pdf_parser
+
+    output_dir = tmp_path / "out"
+    output_dir.mkdir()
+    detector_calls: list[list[str]] = []  # one entry per TOC-detector invocation
+    heading_contexts: list[object] = []  # toc_hierarchies seen by heading stub
+
+    def fake_parse_via_full(_pdf_path, _filename, out_dir, s3_key=None):
+        Path(out_dir, "full.md").write_text(
+            "Contents\n1 Introduction .... 2\n1 Introduction\nBody\n",
+            encoding="utf-8",
+        )
+
+    def fake_detect_tocs_in_texts(md_lines, **_kwargs):
+        detector_calls.append(list(md_lines))
+        return (
+            [
+                {
+                    "toc_range": [0, 1],
+                    "toc_range_unit": "line",
+                    "toc_tree": {"Introduction": {}},
+                }
+            ],
+            ["1 Introduction", "Body"],
+        )
+
+    def fake_eval_md_headings(md_lines, *_args, **kwargs):
+        heading_contexts.append(kwargs.get("toc_hierarchies"))
+        return [f"# {line}" if line.startswith("1 ") else line for line in md_lines]
+
+    monkeypatch.setattr(pdf_parser, "parse_via_full", fake_parse_via_full)
+    monkeypatch.setattr(pdf_parser, "detect_tocs_in_texts", fake_detect_tocs_in_texts)
+    monkeypatch.setattr(markdown_parser, "eval_md_headings", fake_eval_md_headings)
+    monkeypatch.setattr(
+        pdf_parser,
+        "parse_md",
+        lambda *_args, **kwargs: {"lines": kwargs["lines_with_heading"]},
+    )
+
+    profile = SimpleNamespace(
+        anatomy=PageAnatomyMap(
+            job_id="job-missed-toc",
+            file_path=str(tmp_path / "standard.pdf"),
+            page_count=3,
+            page_features=[_page_feature(1), _page_feature(2), _page_feature(3)],
+            page_labels=[
+                PageLabel(page=1, kind="normal", confidence=1.0),
+                PageLabel(page=2, kind="normal", confidence=1.0),
+                PageLabel(page=3, kind="normal", confidence=1.0),
+            ],
+            toc_result=TocResult(method="none"),  # the profile reports no TOC
+            h1_result=H1BoundaryResult(method="none"),
+            shard_plan=ShardPlan(
+                enabled=False,
+                reason="not_needed",
+                shards=[
+                    Shard(
+                        shard_index=0,
+                        page_start=1,
+                        page_end=3,
+                        page_offset=0,
+                        anchor_type="forced_max_size",
+                        anchor_evidence="document within shard threshold",
+                        confidence=1.0,
+                    )
+                ],
+            ),
+        )
+    )
+
+    result = pdf_parser._parse_pdf_via_shards(
+        str(tmp_path / "standard.pdf"),
+        "standard.pdf",
+        str(output_dir),
+        {"smart_title_parse": False, "model_name": "test-model"},
+        profile=profile,
+    )
+
+    assert len(detector_calls) == 1  # the markdown TOC detector ran exactly once
+    assert heading_contexts[0][0]["toc_range_unit"] == "line"
+    assert result["lines"] == ["# 1 Introduction", "Body"]  # detector's line list
+
+
+def test_page_based_toc_demotes_front_matter_only_on_first_shard() -> None:
+    from app.services.document_parser.structure.layout_parser import pred_titles
+
+    toc_hierarchies = [
+        {
+            "toc_range": [2, 2],
+            "toc_range_unit": "page",
+            "toc_tree": {"Risk Factors": {}},  # the only level-1 TOC title
+        }
+    ]
+    lines = [
+        "1. Cover",
+        "2. Legal Notice",
+        "3. Risk Factors",
+        "4. Business",
+    ]
+
+    first_shard = pred_titles(
+        lines,
+        doc_type="md",
+        toc_hierarchies=toc_hierarchies,
+        smart_parse=False,
+        is_first_shard=True,
+    )
+    continuation = pred_titles(  # same input; only is_first_shard differs
+        lines,
+        doc_type="md",
+        toc_hierarchies=toc_hierarchies,
+        smart_parse=False,
+        is_first_shard=False,
+    )
+
+    first_levels = dict(zip(first_shard["id"], first_shard["level"], strict=False))
+    continuation_levels = dict(
+        zip(continuation["id"], continuation["level"], strict=False)
+    )
+    assert first_levels[0] == -1  # lines before "Risk Factors" are demoted
+    assert first_levels[1] == -1
+    assert first_levels[2] > 0  # the matching TOC title line stays a heading
+    assert continuation_levels[0] > 0  # no demotion on a continuation shard
diff --git a/apps/worker/tests/contract/test_parse_task_contract.py b/apps/worker/tests/contract/test_parse_task_contract.py
index c2f33121..6548eb43 100644
--- a/apps/worker/tests/contract/test_parse_task_contract.py
+++ b/apps/worker/tests/contract/test_parse_task_contract.py
@@ -498,6 +498,13 @@ def test_oversized_pdf_shard_failure_preserves_processing_error(
     monkeypatch.setenv("S3_SECRET_ACCESS_KEY", "test")
     monkeypatch.setenv("S3_TEMP_PATH", str(tmp_path))
 
+    from app.services.document_agent.manifest import (
+        H1BoundaryResult,
+        PageAnatomyMap,
+        Shard,
+        ShardPlan,
+        TocResult,
+    )
     from app.services.document_parser.formats.pdf import parser as pdf_parser
     from app.services.document_parser.profiling.profile_model import ParserDocumentProfile
     from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
@@ -508,7 +515,7 @@ def test_oversized_pdf_shard_failure_preserves_processing_error(
     def _fail_oversized_parse(*args, **kwargs):
         raise RuntimeError("MinerU shard 0 failed")
 
-    monkeypatch.setattr(pdf_parser, "_parse_oversized_pdf", _fail_oversized_parse)
+    monkeypatch.setattr(pdf_parser, "_parse_pdf_via_shards", _fail_oversized_parse)
 
     with pytest.raises(PDFParsingException) as exc_info:
         pdf_parser.parse_pdfs(
@@ -521,6 +528,30 @@ def _fail_oversized_parse(*args, **kwargs):
                 category="generic document",
                 routing_category=PdfRoutingCategory.GENERIC,
                 page_count=2,
+                anatomy=PageAnatomyMap(
+                    job_id="job-oversized-fail",
+                    file_path=str(tmp_path / "source.pdf"),
+                    page_count=2,
+                    page_features=[],
+                    page_labels=[],
+                    toc_result=TocResult(method="none"),
+                    h1_result=H1BoundaryResult(method="none"),
+                    shard_plan=ShardPlan(
+                        enabled=True,
+                        reason="too_large",
+                        shards=[
+                            Shard(
+                                shard_index=0,
+                                page_start=1,
+                                page_end=2,
+                                page_offset=0,
+                                anchor_type="forced_max_size",
+                                anchor_evidence="fixture",
+                                confidence=1.0,
+                            )
+                        ],
+                    ),
+                ),
             ),
         )
 
@@ -630,6 +661,7 @@ def _identity_eval_md_headings(
         model_name=None,
         output_dir=None,
         layout_json_path=None,
+        is_first_shard=True,
     ):
         calls.setdefault("heading_dirs", []).append(output_dir)
         return list(md_lines)
diff --git a/packages/shared-python/shared/core/config/ai.py b/packages/shared-python/shared/core/config/ai.py
index 9d035051..559e53f3 100644
--- a/packages/shared-python/shared/core/config/ai.py
+++ b/packages/shared-python/shared/core/config/ai.py
@@ -29,12 +29,12 @@ class AIConfig(BaseModel):
         description="Heading and outline recognition model; falls back to NORMOL_MODEL when empty",
     )
     IMAGE_MODEL: str = Field(
-        default="qwen3.5-flash",
+        default="qwen3.6-flash",
         description="Image model for image summary, atlas, and OCR flows",
     )
 
     IMAGE_MODEL_MAX: str = Field(
-        default="qwen3.5-flash",
+        default="qwen3.6-flash",
         description="Higher-capability image model for OCR and image type classification",
     )
     RETRIEVAL_DECOMPOSITION_ENABLED: bool = Field(
diff --git a/packages/shared-python/shared/core/config/storage.py b/packages/shared-python/shared/core/config/storage.py
index 3daf7551..75046091 100644
--- a/packages/shared-python/shared/core/config/storage.py
+++ b/packages/shared-python/shared/core/config/storage.py
@@ -79,6 +79,14 @@ class StorageConfig(BaseModel):
         description="Soft page limit for oversized PDF shard pipeline. "
         "Documents exceeding this are rejected with a contact-support message.",
     )
+    PDF_PROFILE_TOC_ENABLED: bool = Field(
+        default=False,
+        description=(
+            "Enable PDF TOC extraction during parser-entry DOC_PROFILE for "
+            "standard and atlas PDFs. Oversized PDFs still run TOC profiling as "
+            "part of the shard pipeline."
+        ),
+    )
     MINERU_SHARD_CONCURRENCY: int = Field(
         default=3,
         ge=1,
diff --git a/packages/shared-python/shared/services/retrieval/llm_adapter.py b/packages/shared-python/shared/services/retrieval/llm_adapter.py
index f0a981b9..8de055a0 100644
--- a/packages/shared-python/shared/services/retrieval/llm_adapter.py
+++ b/packages/shared-python/shared/services/retrieval/llm_adapter.py
@@ -174,7 +174,7 @@ def create_retrieval_vlm_fn(
 ) -> LLMFn | None:
     """Create an async VLM callable for image-aware answer generation.
 
-    Uses the IMAGE_MODEL (e.g. qwen3.5-flash) for multimodal input.
+    Uses the IMAGE_MODEL (e.g. qwen3.6-flash) for multimodal input.
     Returns None when the image model is not configured.
 
     The returned function accepts the same ``LLMFnInput`` type as
@@ -183,7 +183,7 @@ def create_retrieval_vlm_fn(
     """
     from shared.core.config import settings
 
-    effective_model = model or getattr(settings, 'IMAGE_MODEL', '') or 'qwen3.5-flash'
+    effective_model = model or getattr(settings, 'IMAGE_MODEL', '') or 'qwen3.6-flash'
 
     if not _has_llm_credentials():
         logger.debug('retrieval: no LLM credentials for VLM, image-aware answering disabled')

From 637ba234e421141531e7e2224b209d0810b7e8a7 Mon Sep 17 00:00:00 2001
From: chengke <404835780@qq.com>
Date: Wed, 10 Jun 2026 16:29:38 +0800
Subject: [PATCH 3/4] fix: update Qwen model references to version 3.6-flash in
 AGENTS.md and README.md

---
 AGENTS.md | 6 +++---
 README.md | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 6cceff82..9e68dc18 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -217,9 +217,9 @@ flowchart LR
 |:---|:---|:---|
 | Text/table summarization | `NORMOL_MODEL` | `deepseek-chat` |
 | Heading hierarchy recognition | `HIERARCHY_LLM_MODEL` | Falls back to `NORMOL_MODEL` |
-| Image description (VLM) | `IMAGE_MODEL` | `qwen3.5-flash` |
-| Image OCR / Q&A | `IMAGE_MODEL_MAX` | `qwen3.5-flash` |
-| PDF coarse classification | `IMAGE_MODEL` | `qwen3.5-flash` |
+| Image description (VLM) | `IMAGE_MODEL` | `qwen3.6-flash` |
+| Image OCR / Q&A | `IMAGE_MODEL_MAX` | `qwen3.6-flash` |
+| PDF coarse classification | `IMAGE_MODEL` | `qwen3.6-flash` |
 
 ---
 
diff --git a/README.md b/README.md
index 703876dc..6aca6f3a 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ A: Knowhere uses MinerU as its default parser because it performs best in our te
 
 **Q: What LLM / VLM dependencies does Knowhere have?**
 
-A: By default, DeepSeek (`deepseek-chat`) handles text and table summarization, and Qwen-VL (`qwen3.5-flash`) handles image OCR and descriptions. Knowhere is model-agnostic. Swap in OpenAI, DashScope, Zhipu, or Volcengine via environment variables.
+A: By default, DeepSeek (`deepseek-chat`) handles text and table summarization, and Qwen-VL (`qwen3.6-flash`) handles image OCR and descriptions. Knowhere is model-agnostic. Swap in OpenAI, DashScope, Zhipu, or Volcengine via environment variables.
 
 **Q: How is Agentic Retrieval different from traditional RAG?**
 

From ac87030e6ca5131f234f70cc7618f7f5143d7ef8 Mon Sep 17 00:00:00 2001
From: chengke <404835780@qq.com>
Date: Wed, 10 Jun 2026 16:47:51 +0800
Subject: [PATCH 4/4] test: stabilize worker contract imports

---
 apps/worker/tests/contract/conftest.py        | 39 +++++++++++++++--
 .../test_doc_profile_anatomy_contract.py      | 43 ++++++++++---------
 2 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/apps/worker/tests/contract/conftest.py b/apps/worker/tests/contract/conftest.py
index 41ba3846..f9738370 100644
--- a/apps/worker/tests/contract/conftest.py
+++ b/apps/worker/tests/contract/conftest.py
@@ -17,6 +17,7 @@
 
 _REPO_ROOT: Path = Path(__file__).resolve().parents[4]
 _WORKER_ROOT: Path = _REPO_ROOT / "apps" / "worker"
+_API_ROOT: Path = _REPO_ROOT / "apps" / "api"
 _DOCUMENT_INGESTION_TASK_NAMES: tuple[str, ...] = (
     "app.core.tasks.document_ingestion_tasks.upload_url_file_task",
     "app.core.tasks.kb_tasks.upload_url_file_task",
@@ -25,6 +26,39 @@
 )
 
 
+def _module_loaded_from(module_name: str, root: Path) -> bool:
+    """Return True if the cached module's file or package path is inside ``root``."""
+    module = sys.modules.get(module_name)
+    if module is None:
+        return False
+    # Match whole path components: ``str.startswith`` would also accept sibling
+    # directories that merely share the prefix (e.g. ``apps/api_v2`` for ``apps/api``).
+    module_file = getattr(module, "__file__", None)
+    if isinstance(module_file, str) and Path(module_file).is_relative_to(root):
+        return True
+    module_paths = getattr(module, "__path__", ())
+    return any(Path(str(entry)).is_relative_to(root) for entry in module_paths)
+
+
+def _ensure_worker_import_context() -> None:
+    """Prioritise the worker root on sys.path; evict API-loaded ``app`` modules."""
+    worker_path = str(_WORKER_ROOT)
+    if worker_path in sys.path:
+        sys.path.remove(worker_path)
+    sys.path.insert(0, worker_path)
+    # Snapshot the names first: ``sys.modules`` is mutated while we walk it.
+    for name in tuple(sys.modules):
+        is_app_module = name == "app" or name.startswith("app.")
+        if is_app_module and _module_loaded_from(name, _API_ROOT):
+            sys.modules.pop(name, None)
+
+
+@pytest.fixture(autouse=True)
+def worker_contract_import_context() -> Generator[None, None, None]:
+    _ensure_worker_import_context()  # autouse: re-pin worker imports before each test
+    yield  # no teardown step follows the yield
+
+
 def _resolve_postgresql_executable() -> str | None:
     configured_executable: str | None = os.getenv("PYTEST_POSTGRESQL_EXECUTABLE")
 
@@ -65,10 +99,7 @@ def worker_contract_environment(
     contract_runtime.configure_contract_environment(monkeypatch, postgresql_proc)
     asyncio.run(contract_runtime.prepare_contract_storage())
 
-    worker_root_value = str(_WORKER_ROOT)
-    if worker_root_value in sys.path:
-        sys.path.remove(worker_root_value)
-    sys.path.insert(0, worker_root_value)
+    _ensure_worker_import_context()
     contract_runtime.clear_application_modules()
 
     from shared.core.celery_app import get_celery_app
diff --git a/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py b/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py
index f1d108ad..448e1630 100644
--- a/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py
+++ b/apps/worker/tests/contract/test_doc_profile_anatomy_contract.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import importlib
 import os
 from pathlib import Path
 from types import SimpleNamespace
@@ -11,6 +12,7 @@
 os.environ.setdefault("S3_SECRET_ACCESS_KEY", "test")
 os.environ.setdefault("S3_TEMP_PATH", "/tmp")
 
+from app.services.document_agent import coordinator as coordinator_module
 from app.services.document_agent.coordinator import ProfileCoordinator
 from app.services.document_agent.manifest import (
     DocumentProfile,
@@ -26,8 +28,12 @@
     ToolResult,
 )
 from app.services.document_agent.validators import validate_shard_plan
+from app.services.document_parser.formats.pdf import parser as pdf_parser
+from app.services.document_parser.formats.pdf import shard_splitter
+from app.services.document_parser.profiling import doc_profiler
 from app.services.document_parser.profiling.doc_profiler import profile_document
 from app.services.document_parser.profiling.taxonomy import PdfRoutingCategory
+from app.services.document_parser.structure.layout_parser import pred_titles
 
 
 def _page_feature(page: int = 1) -> PageFeature:
@@ -165,8 +171,6 @@ def fake_persist(_anatomy):
     monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary)
     monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist)
 
-    from app.services.document_agent import coordinator as coordinator_module
-
     monkeypatch.setattr(
         coordinator_module.ProfilePlanner,
         "propose",
@@ -263,8 +267,6 @@ def fake_persist(_anatomy):
     monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary)
     monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist)
 
-    from app.services.document_agent import coordinator as coordinator_module
-
     monkeypatch.setattr(
         coordinator_module.ProfilePlanner,
         "propose",
@@ -350,8 +352,6 @@ def fake_persist(_anatomy):
     monkeypatch.setattr(coordinator, "_run_h1_boundary_pipeline", fake_h1_boundary)
     monkeypatch.setattr(coordinator, "_persist_ready_anatomy", fake_persist)
 
-    from app.services.document_agent import coordinator as coordinator_module
-
     def fake_propose(_self):
         calls.append("planner")
         return (
@@ -453,8 +453,6 @@ def test_standard_pdf_profile_toc_flag_off_preserves_current_behavior(
     monkeypatch,
     tmp_path: Path,
 ) -> None:
-    from app.services.document_parser.profiling import doc_profiler
-
     fake_instances: list[object] = []
 
     class FakeCoordinator:
@@ -505,8 +503,6 @@ def test_standard_pdf_profile_toc_flag_on_builds_toc_and_lightweight_anatomy(
     monkeypatch,
     tmp_path: Path,
 ) -> None:
-    from app.services.document_parser.profiling import doc_profiler
-
     fake_anatomy = object()
 
     class FakeCoordinator:
@@ -583,10 +579,6 @@ def test_pdf_shard_pipeline_accepts_single_shard_fast_path(
     monkeypatch,
     tmp_path: Path,
 ) -> None:
-    from app.services.document_parser.formats.markdown import parser as markdown_parser
-    from app.services.document_parser.formats.pdf import parser as pdf_parser
-    from app.services.document_parser.formats.pdf import shard_splitter
-
     output_dir = tmp_path / "out"
     output_dir.mkdir()
     calls: list[str] = []
@@ -605,9 +597,16 @@ def fake_parse_md(*_args, **kwargs):
         calls.append("parse_md")
         return {"lines": kwargs["lines_with_heading"]}
 
+    active_markdown_parser = importlib.import_module(
+        "app.services.document_parser.formats.markdown.parser"
+    )
     monkeypatch.setattr(pdf_parser, "parse_via_full", fake_parse_via_full)
     monkeypatch.setattr(shard_splitter, "split_pdf", fail_split)
-    monkeypatch.setattr(markdown_parser, "eval_md_headings", fake_eval_md_headings)
+    monkeypatch.setattr(
+        active_markdown_parser,
+        "eval_md_headings",
+        fake_eval_md_headings,
+    )
     monkeypatch.setattr(pdf_parser, "parse_md", fake_parse_md)
 
     profile = SimpleNamespace(
@@ -657,9 +656,6 @@ def test_pdf_first_shard_reuses_markdown_toc_detector_when_profile_misses_toc(
     monkeypatch,
     tmp_path: Path,
 ) -> None:
-    from app.services.document_parser.formats.markdown import parser as markdown_parser
-    from app.services.document_parser.formats.pdf import parser as pdf_parser
-
     output_dir = tmp_path / "out"
     output_dir.mkdir()
     detector_calls: list[list[str]] = []
@@ -688,9 +684,16 @@ def fake_eval_md_headings(md_lines, *_args, **kwargs):
         heading_contexts.append(kwargs.get("toc_hierarchies"))
         return [f"# {line}" if line.startswith("1 ") else line for line in md_lines]
 
+    active_markdown_parser = importlib.import_module(
+        "app.services.document_parser.formats.markdown.parser"
+    )
     monkeypatch.setattr(pdf_parser, "parse_via_full", fake_parse_via_full)
     monkeypatch.setattr(pdf_parser, "detect_tocs_in_texts", fake_detect_tocs_in_texts)
-    monkeypatch.setattr(markdown_parser, "eval_md_headings", fake_eval_md_headings)
+    monkeypatch.setattr(
+        active_markdown_parser,
+        "eval_md_headings",
+        fake_eval_md_headings,
+    )
     monkeypatch.setattr(
         pdf_parser,
         "parse_md",
@@ -742,8 +745,6 @@ def fake_eval_md_headings(md_lines, *_args, **kwargs):
 
 
 def test_page_based_toc_demotes_front_matter_only_on_first_shard() -> None:
-    from app.services.document_parser.structure.layout_parser import pred_titles
-
     toc_hierarchies = [
         {
             "toc_range": [2, 2],