-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlayout.py
More file actions
126 lines (104 loc) · 4.22 KB
/
layout.py
File metadata and controls
126 lines (104 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""LayoutClient abstraction — extract paragraphs+pages from binary documents.
Two implementations:
- AzureLayoutClient wraps Azure Document Intelligence prebuilt-layout
- UnstructuredLayoutClient calls the unstructured.io REST API and normalizes
its element list to the DI as_dict() shape that
pipeline.py already consumes.
Returned shape (the subset of DI's as_dict() output that pipeline.py reads):
{
"paragraphs": [
{
"content": "text body",
"role": "title" | "sectionHeading" | "pageHeader" | "pageFooter" |
"pageNumber" | None,
"boundingRegions": [{"pageNumber": 1}],
},
...
],
"pages": [{"pageNumber": N}, ...],
}
Pure module — no Azure SDK imports — so unit tests can exercise the normalizer
without azure-ai-documentintelligence installed.
"""
from __future__ import annotations
from typing import Any, Protocol
class LayoutClient(Protocol):
    """Structural interface shared by the layout backends in this module.

    Any object exposing a compatible ``analyze`` method satisfies it; no
    inheritance is required.
    """

    def analyze(self, content: bytes) -> dict[str, Any]:
        """Analyze the raw bytes of one document and return its layout as a dict.

        The module docstring lists the keys the returned dict carries
        (``paragraphs`` and ``pages``).
        """
class AzureLayoutClient:
    """LayoutClient backed by an Azure Document Intelligence client object
    (azure.ai.documentintelligence.DocumentIntelligenceClient)."""

    def __init__(self, di_client: Any) -> None:
        # The client is stored untouched; analyze() only ever calls
        # begin_analyze_document() on it.
        self._di = di_client

    def analyze(self, content: bytes) -> dict[str, Any]:
        """Run the ``prebuilt-layout`` model on ``content`` and return the result dict.

        Blocks until the poller returned by the client yields its result, then
        converts that result with ``as_dict()``.
        """
        # Deferred import: the module must stay importable (and parseable by
        # unit tests) when the Azure SDK is not installed.
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

        begin_analysis = self._di.begin_analyze_document
        request = AnalyzeDocumentRequest(bytes_source=content)
        operation = begin_analysis("prebuilt-layout", request)
        analysis = operation.result()
        return analysis.as_dict()
class UnstructuredLayoutClient:
    """Adapter over the unstructured.io REST API
    (`quay.io/unstructured-io/unstructured-api`).

    The upload's file name and MIME type are configurable so that documents
    other than PDFs can be labeled correctly; the defaults keep the previous
    hard-coded ``doc.pdf`` / ``application/pdf`` behavior.
    """

    def __init__(
        self,
        base_url: str,
        timeout: float = 120.0,
        *,
        filename: str = "doc.pdf",
        content_type: str = "application/pdf",
    ) -> None:
        """Configure the client.

        Args:
            base_url: Root URL of the API; any trailing slashes are removed.
            timeout: Timeout in seconds passed to ``requests.post``.
            filename: File name reported in the multipart upload.
            content_type: MIME type reported in the multipart upload.
        """
        self._base = base_url.rstrip("/")
        self._timeout = timeout
        self._filename = filename
        self._content_type = content_type

    def analyze(self, content: bytes) -> dict[str, Any]:
        """POST ``content`` to ``/general/v0/general`` and normalize the reply.

        Returns the element list from the JSON response converted by
        ``normalize_unstructured_to_di_shape``. ``raise_for_status()`` turns an
        HTTP error response into a ``requests.HTTPError``.
        """
        # Imported lazily so the module can be imported without requests.
        import requests

        resp = requests.post(
            f"{self._base}/general/v0/general",
            files={"files": (self._filename, content, self._content_type)},
            data={"strategy": "auto", "coordinates": "false"},
            timeout=self._timeout,
        )
        resp.raise_for_status()
        return normalize_unstructured_to_di_shape(resp.json())
# unstructured element types -> DI paragraph roles. Anything not mapped becomes
# a paragraph with no role (i.e. body text).
_ROLE_MAP: dict[str, str | None] = {
    "Title": "title",
    "Header": "pageHeader",
    "Footer": "pageFooter",
    "PageNumber": "pageNumber",
    "Section-header": "sectionHeading",
    "SectionHeader": "sectionHeading",
    "NarrativeText": None,
    "ListItem": None,
    "UncategorizedText": None,
    "Address": None,
    "EmailAddress": None,
    "Image": None,  # included as paragraph for completeness; text may be empty
    "FigureCaption": None,
    "Formula": None,
}


def normalize_unstructured_to_di_shape(elements: list[dict[str, Any]]) -> dict[str, Any]:
    """Convert unstructured.io element list -> DI as_dict()-compatible subset.

    Only emits the keys pipeline.py reads: `paragraphs` (with `content`,
    optional `role`, and `boundingRegions[0].pageNumber`) and `pages`
    (with `pageNumber`).

    Elements whose stripped `text` is empty are skipped. An element with a
    missing or null `metadata` (or no `page_number`) is placed on page 1.
    `role` is only set when `_ROLE_MAP` maps the element type to a non-None
    role. `pages` holds one entry per distinct page number seen, in ascending
    order.
    """
    paragraphs: list[dict[str, Any]] = []
    pages: dict[int, dict[str, Any]] = {}
    for el in elements:
        text = (el.get("text") or "").strip()
        if not text:
            continue
        # "metadata" can be absent or explicitly null; normalize it once so
        # both the table branch and the page lookup are safe.
        meta = el.get("metadata") or {}
        type_ = el.get("type", "")
        if type_ == "Table":
            # Tables are rendered as paragraphs with the textified table; DI
            # consumers read table-as-text just fine for our extraction prompt.
            text = meta.get("text_as_html") or text
        page = int(meta.get("page_number") or 1)
        para: dict[str, Any] = {
            "content": text,
            "boundingRegions": [{"pageNumber": page}],
        }
        role = _ROLE_MAP.get(type_)
        if role is not None:
            para["role"] = role
        paragraphs.append(para)
        pages.setdefault(page, {"pageNumber": page})
    return {
        "paragraphs": paragraphs,
        "pages": [pages[k] for k in sorted(pages)],
    }