Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions examples/layout_parsing_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""
Layout Parsing Example

This example demonstrates how to use the layout_parsing API to parse
images and PDFs for OCR with detailed layout detection.

The API returns:
- Markdown formatted text
- Detailed layout information (element positions, types, content)
- Visualization images
"""
from zai import ZaiClient

def _content_preview(content, limit=50):
    """Return *content* shortened to at most *limit* characters for display.

    Missing or empty content is shown as ``'N/A'``. An ellipsis is appended
    only when the text was actually cut off, so short content and the
    placeholder are printed as-is.
    """
    if not content:
        return "N/A"
    if len(content) <= limit:
        return content
    return f"{content[:limit]}..."


def layout_parsing_example_with_url():
    """Parse a sample image by URL and print each section of the response.

    Creates a ``ZaiClient`` with default arguments, sends the image URL to
    ``client.layout_parsing.create`` using the ``glm-ocr`` model, and prints
    the task metadata, document info, markdown result, per-page layout
    elements, visualization URLs and token usage (each optional section only
    when it is present on the response).

    Raises:
        Exception: Any error raised by the request is printed and then
            re-raised unchanged.
    """
    client = ZaiClient()

    # Image URL to parse
    image_url = "https://cdn.bigmodel.cn/static/platform/images/trialcenter/example/visual_img1.jpeg"

    print(f"Sending request to parse image: {image_url}")

    # Only the request is guarded: a failure while printing below is a bug in
    # this script and should not be reported as a request error.
    try:
        response = client.layout_parsing.create(
            model="glm-ocr",
            file=image_url,
        )
    except Exception as e:
        print(f"\n✗ Error: {e}")
        raise

    print("\n✓ Request successful!")
    print(f"Task ID: {response.id}")
    print(f"Model: {response.model}")
    print(f"Created at: {response.created}")
    print(f"Request ID: {response.request_id}")

    # Print document info
    if response.data_info:
        print("\nDocument Info:")
        print(f" - Total pages: {response.data_info.num_pages}")
        if response.data_info.pages:
            for page_no, page in enumerate(response.data_info.pages, start=1):
                print(f" - Page {page_no}: {page.width}x{page.height}")

    # Print markdown results
    print("\n=== Markdown Results ===\n")
    print(response.md_results)

    # Print layout details if available
    if response.layout_details:
        print("\n=== Layout Details ===")
        for page_no, page_details in enumerate(response.layout_details, start=1):
            print(f"\nPage {page_no}:")
            for element in page_details:
                preview = _content_preview(element.content)
                print(f" [{element.index}] {element.label}: {preview}")
                if element.bbox_2d:
                    print(f" BBox: {element.bbox_2d}")

    # Print visualization URLs
    if response.layout_visualization:
        print("\n=== Visualization URLs ===")
        for page_no, url in enumerate(response.layout_visualization, start=1):
            print(f" Page {page_no}: {url}")

    # Print usage information
    if response.usage:
        usage = response.usage
        print("\n=== Token Usage ===")
        print(f" - Prompt tokens: {usage.prompt_tokens}")
        print(f" - Completion tokens: {usage.completion_tokens}")
        print(f" - Total tokens: {usage.total_tokens}")
        if usage.prompt_tokens_details:
            print(f" - Prompt details: {usage.prompt_tokens_details}")


if __name__ == "__main__":
    # Run the URL example with the test image
    layout_parsing_example_with_url()
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "zai-sdk"
version = "0.2.1"
version = "0.2.2"
description = "A SDK library for accessing big model apis from Z.ai"
authors = ["Z.ai"]
readme = "README.md"
Expand Down
7 changes: 6 additions & 1 deletion src/zai/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from zai.api_resource.web_search import WebSearchApi
from zai.api_resource.web_reader import WebReaderApi
from zai.api_resource.file_parser import FileParser
from zai.api_resource.ocr import HandwritingOCR
from zai.api_resource.ocr import HandwritingOCR, LayoutParsing

from .core import (
NOT_GIVEN,
Expand Down Expand Up @@ -206,6 +206,11 @@ def ocr(self) -> HandwritingOCR:
from zai.api_resource.ocr import HandwritingOCR
return HandwritingOCR(self)

@cached_property
def layout_parsing(self) -> LayoutParsing:
    """Return the ``LayoutParsing`` API resource bound to this client.

    Decorated with ``cached_property``, so the resource is presumably built
    on first access and reused afterwards.
    """
    # Imported at call time, the same way the neighbouring ``ocr`` accessor
    # imports its resource class.
    from zai.api_resource.ocr import LayoutParsing

    resource = LayoutParsing(self)
    return resource

@property
@override
def auth_headers(self) -> dict[str, str]:
Expand Down
2 changes: 1 addition & 1 deletion src/zai/_version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__title__ = 'Z.ai'
__version__ = '0.2.1'
__version__ = '0.2.2'
5 changes: 3 additions & 2 deletions src/zai/api_resource/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .files import Files, FilesWithRawResponse
from .images import Images
from .moderations import Moderations
from .ocr import HandwritingOCR
from .ocr import HandwritingOCR, LayoutParsing
from .tools import Tools
from .videos import (
Videos,
Expand All @@ -40,5 +40,6 @@
'WebReaderApi',
'Agents',
'FileParser',
'HandwritingOCR'
'HandwritingOCR',
'LayoutParsing',
]
3 changes: 2 additions & 1 deletion src/zai/api_resource/ocr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .handwriting_ocr import HandwritingOCR
from .layout_parsing import LayoutParsing

__all__ = ["HandwritingOCR"]
__all__ = ["HandwritingOCR", "LayoutParsing"]
106 changes: 106 additions & 0 deletions src/zai/api_resource/ocr/layout_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Optional

import httpx

from zai.core import (
BaseAPI,
NOT_GIVEN,
Body,
Headers,
NotGiven,
deepcopy_minimal,
make_request_options,
)
from zai.types.ocr.layout_parsing_resp import LayoutParsingResp

if TYPE_CHECKING:
from zai._client import ZaiClient

__all__ = ["LayoutParsing"]


class LayoutParsing(BaseAPI):
    """
    API resource for OCR with layout detection on documents and images.

    Wraps the ``/layout_parsing`` endpoint, which accepts an image or PDF and
    returns the extracted text together with layout information.
    """

    def __init__(self, client: "ZaiClient") -> None:
        super().__init__(client)

    def create(
        self,
        *,
        model: str,
        file: str,
        return_crop_images: Optional[bool] | NotGiven = NOT_GIVEN,
        need_layout_visualization: Optional[bool] | NotGiven = NOT_GIVEN,
        start_page_id: Optional[int] | NotGiven = NOT_GIVEN,
        end_page_id: Optional[int] | NotGiven = NOT_GIVEN,
        request_id: Optional[str] | NotGiven = NOT_GIVEN,
        user_id: Optional[str] | NotGiven = NOT_GIVEN,
        extra_headers: Headers | None = None,
        extra_body: Body | None = None,
        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    ) -> LayoutParsingResp:
        """
        Submit a document or image for layout parsing and text extraction.

        Arguments:
            model (str): Model code, e.g., 'GLM-OCR' or 'glm-ocr'.
            file (str): URL or base64 encoded image/PDF to parse.
                Supported formats: PDF, JPG, PNG.
                Size limits: Image ≤ 10MB, PDF ≤ 50MB, max 100 pages.
            return_crop_images (Optional[bool]): Whether to return crop images.
                Defaults to False. When True, returns cropped image information.
            need_layout_visualization (Optional[bool]): Whether to return detailed
                layout visualization results. Defaults to False. When True,
                returns detailed layout image result information.
            start_page_id (Optional[int]): Starting page number for PDF parsing.
            end_page_id (Optional[int]): Ending page number for PDF parsing.
            request_id (Optional[str]): Unique request identifier.
                Auto-generated if not provided.
            user_id (Optional[str]): End user ID for abuse monitoring.
                Length: 6-128 characters.
            extra_headers (Headers): Additional HTTP headers.
            extra_body (Body): Additional request body parameters.
            timeout (float | httpx.Timeout): Request timeout.

        Returns:
            LayoutParsingResp: Parsed layout result including:
                - id: Task ID
                - created: Unix timestamp
                - model: Model name
                - md_results: Markdown formatted recognition result
                - crop_images: Cropped image information (if return_crop_images=True)
                - layout_visualization: Detailed layout visualization information
                  (if need_layout_visualization=True)
                - data_info: Document metadata (page count, dimensions)

        Raises:
            ValueError: If `model` or `file` is empty or otherwise falsy.
        """
        # Both required arguments are rejected when falsy; `model` is checked first.
        for arg_name, arg_value in (("model", model), ("file", file)):
            if not arg_value:
                raise ValueError(f"`{arg_name}` must be provided.")

        payload = {
            "model": model,
            "file": file,
            "return_crop_images": return_crop_images,
            "need_layout_visualization": need_layout_visualization,
            "start_page_id": start_page_id,
            "end_page_id": end_page_id,
            "request_id": request_id,
            "user_id": user_id,
        }
        body = deepcopy_minimal(payload)
        request_options = make_request_options(
            extra_headers=extra_headers,
            extra_body=extra_body,
            timeout=timeout,
        )

        return self._post(
            "/layout_parsing",
            body=body,
            options=request_options,
            cast_type=LayoutParsingResp,
        )
48 changes: 48 additions & 0 deletions src/zai/types/ocr/layout_parsing_resp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import List, Optional, Dict, Any

from zai.core import BaseModel

__all__ = ["LayoutParsingResp", "LayoutDetail", "DataInfo", "PageInfo", "Usage"]


class Usage(BaseModel):
    """Token usage information attached to a layout parsing response."""
    # Completion token count (required).
    completion_tokens: int
    # Prompt token count (required).
    prompt_tokens: int
    # Optional free-form breakdown of the prompt tokens; the keys are not
    # specified here — TODO confirm against the API reference.
    prompt_tokens_details: Optional[Dict[str, Any]] = None
    # Total token count (required); presumably prompt + completion — verify.
    total_tokens: int


class PageInfo(BaseModel):
    """Page size information for a single page."""
    # Both dimensions are required integers. Units are not stated here —
    # presumably pixels; confirm against the API reference.
    width: int
    height: int


class DataInfo(BaseModel):
    """Document basic information: page count and optional per-page sizes."""
    # Number of pages in the document (required).
    num_pages: int
    # Optional list with one PageInfo per page; may be omitted (None).
    pages: Optional[List[PageInfo]] = None


class LayoutDetail(BaseModel):
    """A single layout element detected on a page."""
    # Element index (required); presumably its position within the page — verify.
    index: int
    # Element label (required); presumably the element type — the set of
    # possible values is not listed here.
    label: str
    # Optional bounding box as a list of floats. Coordinate order and units
    # are not specified here — presumably [x1, y1, x2, y2]; TODO confirm.
    bbox_2d: Optional[List[float]] = None
    # Optional content of the element.
    content: Optional[str] = None
    # NOTE(review): unclear whether height/width describe the element or the
    # page it belongs to — confirm against the API reference.
    height: Optional[int] = None
    width: Optional[int] = None


class LayoutParsingResp(BaseModel):
    """Response model for layout parsing API.

    ``id``, ``created`` and ``model`` are required; every other field is
    optional and defaults to ``None``.
    """
    # Task ID.
    id: str
    # Creation time as a Unix timestamp.
    created: int
    # Model name.
    model: str
    # Markdown formatted recognition result.
    md_results: Optional[str] = None
    # Nested list of layout elements; presumably the outer list is one entry
    # per page and the inner list holds that page's elements — verify.
    layout_details: Optional[List[List[LayoutDetail]]] = None
    # Layout visualization results as strings; presumably one image URL per page.
    layout_visualization: Optional[List[str]] = None
    # Document metadata (page count, dimensions).
    data_info: Optional[DataInfo] = None
    # Token usage for the request.
    usage: Optional[Usage] = None
    # Request identifier.
    request_id: Optional[str] = None
    # NOTE(review): the `create` docstring in api_resource/ocr/layout_parsing.py
    # lists a `crop_images` result, but no such field is declared here —
    # confirm whether it should be added and with which type.