# contract-intelligence/src/shared/layout.py (starman69/contract-intelligence, main)
"""LayoutClient abstraction — extract paragraphs+pages from binary documents.

Two implementations:
- AzureLayoutClient        wraps Azure Document Intelligence prebuilt-layout
- UnstructuredLayoutClient calls the unstructured.io REST API and normalizes
                           its element list to the DI as_dict() shape that
                           pipeline.py already consumes.

Returned shape (the subset of DI's as_dict() output that pipeline.py reads):

    {
      "paragraphs": [
        {
          "content": "text body",
          "role": "title" | "sectionHeading" | "pageHeader" | "pageFooter" |
                  "pageNumber" | None,
          "boundingRegions": [{"pageNumber": 1}],
        },
        ...
      ],
      "pages": [{"pageNumber": N}, ...],
    }

Pure module — no module-level Azure SDK imports (AzureLayoutClient imports the
SDK lazily inside analyze()) — so unit tests can exercise the normalizer
without azure-ai-documentintelligence installed.
"""
from __future__ import annotations

from typing import Any, Protocol


class LayoutClient(Protocol):
    """Structural interface implemented by every layout-extraction backend."""

    def analyze(self, content: bytes) -> dict[str, Any]:
        """Analyze raw document bytes and return the layout as a dict."""


class AzureLayoutClient:
    """LayoutClient backed by an
    azure.ai.documentintelligence.DocumentIntelligenceClient instance."""

    def __init__(self, di_client: Any) -> None:
        # Typed as Any so this module never has to import the SDK's client
        # class just to annotate the parameter.
        self._di = di_client

    def analyze(self, content: bytes) -> dict[str, Any]:
        """Run the ``prebuilt-layout`` model on *content*.

        Returns the ``as_dict()`` form of the finished analysis result.
        """
        # Deferred import: keeps the module importable when the azure SDK is
        # not installed.
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

        begin = self._di.begin_analyze_document
        operation = begin(
            "prebuilt-layout", AnalyzeDocumentRequest(bytes_source=content)
        )
        # result() waits for the long-running operation to complete.
        analysis = operation.result()
        return analysis.as_dict()


class UnstructuredLayoutClient:
    """LayoutClient that talks to an unstructured.io REST API server
    (`quay.io/unstructured-io/unstructured-api`)."""

    def __init__(self, base_url: str, timeout: float = 120.0) -> None:
        self._timeout = timeout
        # Trailing slashes are dropped so the endpoint path can be appended
        # with exactly one "/".
        self._base = base_url.rstrip("/")

    def analyze(self, content: bytes) -> dict[str, Any]:
        """POST *content* to the server and return the normalized layout.

        Raises whatever ``requests`` raises for transport failures and for
        non-success HTTP statuses (via ``raise_for_status``).
        """
        import requests

        endpoint = f"{self._base}/general/v0/general"
        upload = {"files": ("doc.pdf", content, "application/pdf")}
        form_fields = {"strategy": "auto", "coordinates": "false"}
        response = requests.post(
            endpoint,
            files=upload,
            data=form_fields,
            timeout=self._timeout,
        )
        # Fail loudly on HTTP errors rather than normalizing an error body.
        response.raise_for_status()
        elements = response.json()
        return normalize_unstructured_to_di_shape(elements)


# unstructured element types -> DI paragraph roles. Anything not mapped becomes
# a paragraph with no role (i.e. body text).
_ROLE_MAP: dict[str, str | None] = {
    "Title": "title",
    "Header": "pageHeader",
    "Footer": "pageFooter",
    "PageNumber": "pageNumber",
    "Section-header": "sectionHeading",
    "SectionHeader": "sectionHeading",
    "NarrativeText": None,
    "ListItem": None,
    "UncategorizedText": None,
    "Address": None,
    "EmailAddress": None,
    "Image": None,  # included as paragraph for completeness; text may be empty
    "FigureCaption": None,
    "Formula": None,
}


def normalize_unstructured_to_di_shape(elements: list[dict[str, Any]]) -> dict[str, Any]:
    """Convert unstructured.io element list -> DI as_dict()-compatible subset.

    Only emits the keys pipeline.py reads: `paragraphs` (with `content`,
    optional `role`, and `boundingRegions[0].pageNumber`) and `pages`
    (with `pageNumber`).
    """
    paragraphs: list[dict[str, Any]] = []
    pages: dict[int, dict[str, Any]] = {}
    for el in elements:
        text = (el.get("text") or "").strip()
        if not text:
            continue
        type_ = el.get("type", "")
        if type_ == "Table":
            # Tables are rendered as paragraphs with the textified table; DI
            # consumers read table-as-text just fine for our extraction prompt.
            text = el.get("metadata", {}).get("text_as_html") or text
        meta = el.get("metadata") or {}
        page = int(meta.get("page_number") or 1)
        para: dict[str, Any] = {
            "content": text,
            "boundingRegions": [{"pageNumber": page}],
        }
        role = _ROLE_MAP.get(type_)
        if role is not None:
            para["role"] = role
        paragraphs.append(para)
        pages.setdefault(page, {"pageNumber": page})
    return {
        "paragraphs": paragraphs,
        "pages": [pages[k] for k in sorted(pages)],
    }