Skip to content

Commit 6f108c8

Browse files
authored
feat: add api layout_parsing (#62)
1 parent dcc7926 commit 6f108c8

8 files changed

Lines changed: 246 additions & 6 deletions

File tree

examples/layout_parsing_example.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""
2+
Layout Parsing Example
3+
4+
This example demonstrates how to use the layout_parsing API to parse
5+
images and PDFs for OCR with detailed layout detection.
6+
7+
The API returns:
8+
- Markdown formatted text
9+
- Detailed layout information (element positions, types, content)
10+
- Visualization images
11+
"""
12+
from zai import ZaiClient
13+
14+
def _preview(text, limit=50):
    """Return a short preview of `text` for single-line display.

    Arguments:
        text: The element content; may be None or empty.
        limit (int): Maximum number of characters to keep.

    Returns:
        str: 'N/A' when there is no content, the text unchanged when it
        already fits, otherwise the first `limit` characters followed by
        '...' so the ellipsis only appears when something was cut off.
    """
    if not text:
        return "N/A"
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


def _print_response(response):
    """Print every section of a layout parsing response that is present.

    Arguments:
        response: The object returned by `client.layout_parsing.create`.
            Optional sections (data_info, layout_details,
            layout_visualization, usage) are skipped when falsy.
    """
    print("\n✓ Request successful!")
    print(f"Task ID: {response.id}")
    print(f"Model: {response.model}")
    print(f"Created at: {response.created}")
    print(f"Request ID: {response.request_id}")

    # Print document info
    data_info = response.data_info
    if data_info:
        print("\nDocument Info:")
        print(f" - Total pages: {data_info.num_pages}")
        for page_no, page in enumerate(data_info.pages or [], start=1):
            print(f" - Page {page_no}: {page.width}x{page.height}")

    # Print markdown results
    print("\n=== Markdown Results ===\n")
    print(response.md_results)

    # Print layout details if available
    if response.layout_details:
        print("\n=== Layout Details ===")
        for page_no, page_details in enumerate(response.layout_details, start=1):
            print(f"\nPage {page_no}:")
            for element in page_details:
                print(f" [{element.index}] {element.label}: {_preview(element.content)}")
                if element.bbox_2d:
                    print(f" BBox: {element.bbox_2d}")

    # Print visualization URLs
    if response.layout_visualization:
        print("\n=== Visualization URLs ===")
        for page_no, url in enumerate(response.layout_visualization, start=1):
            print(f" Page {page_no}: {url}")

    # Print usage information
    usage = response.usage
    if usage:
        print("\n=== Token Usage ===")
        print(f" - Prompt tokens: {usage.prompt_tokens}")
        print(f" - Completion tokens: {usage.completion_tokens}")
        print(f" - Total tokens: {usage.total_tokens}")
        if usage.prompt_tokens_details:
            print(f" - Prompt details: {usage.prompt_tokens_details}")


def layout_parsing_example_with_url():
    """Example using an image URL for layout parsing.

    Sends one request for a sample image with the 'glm-ocr' model and prints
    the response section by section.

    Raises:
        Exception: Any error raised by the request is reported on stdout and
            then re-raised unchanged.
    """
    client = ZaiClient()

    # Image URL to parse
    image_url = "https://cdn.bigmodel.cn/static/platform/images/trialcenter/example/visual_img1.jpeg"

    print(f"Sending request to parse image: {image_url}")

    # Only the request itself is guarded, so the error banner is printed for
    # request failures and not for problems while printing the result.
    try:
        response = client.layout_parsing.create(
            model="glm-ocr",
            file=image_url,
        )
    except Exception as e:
        print(f"\n✗ Error: {e}")
        raise
    else:
        _print_response(response)


if __name__ == "__main__":
    # Run the URL example with the test image
    layout_parsing_example_with_url()

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "zai-sdk"
3-
version = "0.2.1"
3+
version = "0.2.2"
44
description = "A SDK library for accessing big model apis from Z.ai"
55
authors = ["Z.ai"]
66
readme = "README.md"

src/zai/_client.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from zai.api_resource.web_search import WebSearchApi
2525
from zai.api_resource.web_reader import WebReaderApi
2626
from zai.api_resource.file_parser import FileParser
27-
from zai.api_resource.ocr import HandwritingOCR
27+
from zai.api_resource.ocr import HandwritingOCR, LayoutParsing
2828

2929
from .core import (
3030
NOT_GIVEN,
@@ -206,6 +206,11 @@ def ocr(self) -> HandwritingOCR:
206206
from zai.api_resource.ocr import HandwritingOCR
207207
return HandwritingOCR(self)
208208

209+
@cached_property
def layout_parsing(self) -> LayoutParsing:
    """Return the layout parsing API resource bound to this client.

    The accessor is wrapped in `cached_property`; assuming the usual
    functools semantics, the resource is built on first access and reused
    afterwards (TODO confirm which `cached_property` this module imports).
    """
    # Function-level import, same pattern as the `ocr` accessor.
    from zai.api_resource.ocr import LayoutParsing

    resource = LayoutParsing(self)
    return resource
213+
209214
@property
210215
@override
211216
def auth_headers(self) -> dict[str, str]:

src/zai/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
__title__ = 'Z.ai'
2-
__version__ = '0.2.1'
2+
__version__ = '0.2.2'

src/zai/api_resource/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from .files import Files, FilesWithRawResponse
1515
from .images import Images
1616
from .moderations import Moderations
17-
from .ocr import HandwritingOCR
17+
from .ocr import HandwritingOCR, LayoutParsing
1818
from .tools import Tools
1919
from .videos import (
2020
Videos,
@@ -40,5 +40,6 @@
4040
'WebReaderApi',
4141
'Agents',
4242
'FileParser',
43-
'HandwritingOCR'
43+
'HandwritingOCR',
44+
'LayoutParsing',
4445
]
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from .handwriting_ocr import HandwritingOCR
2+
from .layout_parsing import LayoutParsing
23

3-
__all__ = ["HandwritingOCR"]
4+
__all__ = ["HandwritingOCR", "LayoutParsing"]
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING, Optional
4+
5+
import httpx
6+
7+
from zai.core import (
8+
BaseAPI,
9+
NOT_GIVEN,
10+
Body,
11+
Headers,
12+
NotGiven,
13+
deepcopy_minimal,
14+
make_request_options,
15+
)
16+
from zai.types.ocr.layout_parsing_resp import LayoutParsingResp
17+
18+
if TYPE_CHECKING:
19+
from zai._client import ZaiClient
20+
21+
__all__ = ["LayoutParsing"]
22+
23+
24+
class LayoutParsing(BaseAPI):
    """
    API resource for OCR with layout detection on images and PDF documents.

    Exposes a single `create` call that posts to the `/layout_parsing`
    endpoint and returns the parsed result as a `LayoutParsingResp`.
    """

    def __init__(self, client: "ZaiClient") -> None:
        super().__init__(client)

    def create(
        self,
        *,
        model: str,
        file: str,
        return_crop_images: Optional[bool] | NotGiven = NOT_GIVEN,
        need_layout_visualization: Optional[bool] | NotGiven = NOT_GIVEN,
        start_page_id: Optional[int] | NotGiven = NOT_GIVEN,
        end_page_id: Optional[int] | NotGiven = NOT_GIVEN,
        request_id: Optional[str] | NotGiven = NOT_GIVEN,
        user_id: Optional[str] | NotGiven = NOT_GIVEN,
        extra_headers: Headers | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> LayoutParsingResp:
        """
        Submit an image or PDF for layout parsing and text extraction.

        All arguments are keyword-only.

        Arguments:
            model (str): Model code, e.g., 'GLM-OCR' or 'glm-ocr'. Required.
            file (str): URL or base64 encoded image/PDF to parse. Required.
                Supported formats: PDF, JPG, PNG.
                Size limits: Image ≤ 10MB, PDF ≤ 50MB, max 100 pages.
            return_crop_images (Optional[bool]): Whether to return cropped
                image information. Documented default: False.
            need_layout_visualization (Optional[bool]): Whether to return
                detailed layout visualization results. Documented default: False.
            start_page_id (Optional[int]): Starting page number for PDF parsing.
            end_page_id (Optional[int]): Ending page number for PDF parsing.
            request_id (Optional[str]): Unique request identifier.
                Auto-generated if not provided.
            user_id (Optional[str]): End user ID for abuse monitoring.
                Length: 6-128 characters.
            extra_headers (Headers): Additional HTTP headers.
            extra_body (Body): Additional request body parameters.
            timeout (float | httpx.Timeout): Request timeout.

        Returns:
            LayoutParsingResp: Parsed layout result including:
                - id: Task ID
                - created: Unix timestamp
                - model: Model name
                - md_results: Markdown formatted recognition result
                - layout_details: Per-page lists of detected layout elements
                - layout_visualization: Detailed layout visualization
                  information (if need_layout_visualization=True)
                - data_info: Document metadata (page count, dimensions)
                - usage: Token usage information
                - request_id: Request identifier

            NOTE(review): `LayoutParsingResp` declares no `crop_images`
            field; how cropped-image data is surfaced when
            return_crop_images=True is not visible here — TODO confirm.

        Raises:
            ValueError: If `model` or `file` is empty.
        """
        # Reject missing required values before building the request.
        if not model:
            raise ValueError("`model` must be provided.")
        if not file:
            raise ValueError("`file` must be provided.")

        # Optional arguments are forwarded as-is, including NOT_GIVEN sentinels.
        payload = {
            "model": model,
            "file": file,
            "return_crop_images": return_crop_images,
            "need_layout_visualization": need_layout_visualization,
            "start_page_id": start_page_id,
            "end_page_id": end_page_id,
            "request_id": request_id,
            "user_id": user_id,
        }
        body = deepcopy_minimal(payload)

        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_body=extra_body,
            timeout=timeout,
        )

        return self._post(
            "/layout_parsing",
            body=body,
            options=request_options,
            cast_type=LayoutParsingResp,
        )
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from typing import List, Optional, Dict, Any
2+
3+
from zai.core import BaseModel
4+
5+
__all__ = ["LayoutParsingResp", "LayoutDetail", "DataInfo", "PageInfo", "Usage"]
6+
7+
8+
class Usage(BaseModel):
    """Token accounting reported with a layout parsing response."""

    # Tokens counted for the completion.
    completion_tokens: int
    # Tokens counted for the prompt.
    prompt_tokens: int
    # Optional free-form breakdown of the prompt tokens; keys are not fixed here.
    prompt_tokens_details: Optional[Dict[str, Any]] = None
    # Total token count as reported in the response.
    total_tokens: int
14+
15+
16+
class PageInfo(BaseModel):
    """Size of a single page."""

    # Page width (unit not specified in this module).
    width: int
    # Page height (unit not specified in this module).
    height: int
20+
21+
22+
class DataInfo(BaseModel):
    """Basic information about the parsed document."""

    # Number of pages in the document.
    num_pages: int
    # Per-page size entries, when provided.
    pages: Optional[List[PageInfo]] = None
26+
27+
28+
class LayoutDetail(BaseModel):
    """One detected layout element."""

    # Index of the element.
    index: int
    # Label assigned to the element.
    label: str
    # 2D bounding box values; coordinate order and units are not shown here.
    bbox_2d: Optional[List[float]] = None
    # Content of the element, when present.
    content: Optional[str] = None
    # Optional height value; what it measures is not specified in this module.
    height: Optional[int] = None
    # Optional width value; what it measures is not specified in this module.
    width: Optional[int] = None
36+
37+
38+
class LayoutParsingResp(BaseModel):
    """Top-level response returned by the layout parsing API."""

    # Identifier of the response.
    id: str
    # Creation time as an integer.
    created: int
    # Name of the model.
    model: str
    # Markdown results, when present.
    md_results: Optional[str] = None
    # Nested list of layout elements (a list of lists of LayoutDetail).
    layout_details: Optional[List[List[LayoutDetail]]] = None
    # List of layout visualization strings, when present.
    layout_visualization: Optional[List[str]] = None
    # Document information such as page count and page sizes.
    data_info: Optional[DataInfo] = None
    # Token usage information.
    usage: Optional[Usage] = None
    # Request identifier, when present.
    request_id: Optional[str] = None

0 commit comments

Comments
 (0)