feat: add api layout_parsing (#62)

tomsun28 · web-flow · commit 6f108c8ef8f2 · 2026-02-03T00:07:26.000+08:00
diff --git a/examples/layout_parsing_example.py b/examples/layout_parsing_example.py
@@ -0,0 +1,79 @@
+"""
+Layout Parsing Example
+
+This example demonstrates how to use the layout_parsing API to parse
+images and PDFs for OCR with detailed layout detection.
+
+The API returns:
+- Markdown formatted text
+- Detailed layout information (element positions, types, content)
+- Visualization images
+"""
+from zai import ZaiClient
+
+def layout_parsing_example_with_url():
+    """Parse a sample image by URL and print each section of the response.
+
+    Sends one `client.layout_parsing.create` request for a fixed image URL,
+    then prints the response metadata, page sizes, Markdown text, per-page
+    layout elements, visualization URLs and token usage. The Markdown text is
+    always printed; the other optional sections are printed only when present.
+
+    Raises:
+        Exception: any error from the request or from reading the response is
+            printed and then re-raised unchanged.
+    """
+    # Longest content preview printed for a single layout element.
+    preview_chars = 50
+
+    client = ZaiClient()
+
+    # Image URL to parse
+    image_url = "https://cdn.bigmodel.cn/static/platform/images/trialcenter/example/visual_img1.jpeg"
+
+    print(f"Sending request to parse image: {image_url}")
+
+    try:
+        response = client.layout_parsing.create(
+            model="glm-ocr",
+            file=image_url
+        )
+
+        print("\n✓ Request successful!")
+        print(f"Task ID: {response.id}")
+        print(f"Model: {response.model}")
+        print(f"Created at: {response.created}")
+        print(f"Request ID: {response.request_id}")
+
+        # Print document info
+        if response.data_info:
+            print("\nDocument Info:")
+            print(f"  - Total pages: {response.data_info.num_pages}")
+            if response.data_info.pages:
+                for i, page in enumerate(response.data_info.pages):
+                    print(f"  - Page {i+1}: {page.width}x{page.height}")
+
+        # Print markdown results
+        print("\n=== Markdown Results ===\n")
+        print(response.md_results)
+
+        # Print layout details if available
+        if response.layout_details:
+            print("\n=== Layout Details ===")
+            for page_idx, page_details in enumerate(response.layout_details):
+                print(f"\nPage {page_idx + 1}:")
+                for element in page_details:
+                    content = element.content
+                    if not content:
+                        preview = "N/A"
+                    elif len(content) > preview_chars:
+                        # Mark truncation only when text was actually cut off,
+                        # so short content is not shown as if it were clipped.
+                        preview = f"{content[:preview_chars]}..."
+                    else:
+                        preview = content
+                    print(f"  [{element.index}] {element.label}: {preview}")
+                    if element.bbox_2d:
+                        print(f"       BBox: {element.bbox_2d}")
+
+        # Print visualization URLs
+        if response.layout_visualization:
+            print("\n=== Visualization URLs ===")
+            for i, url in enumerate(response.layout_visualization):
+                print(f"  Page {i + 1}: {url}")
+
+        # Print usage information
+        if response.usage:
+            print("\n=== Token Usage ===")
+            print(f"  - Prompt tokens: {response.usage.prompt_tokens}")
+            print(f"  - Completion tokens: {response.usage.completion_tokens}")
+            print(f"  - Total tokens: {response.usage.total_tokens}")
+            if response.usage.prompt_tokens_details:
+                print(f"  - Prompt details: {response.usage.prompt_tokens_details}")
+
+    except Exception as e:
+        # Report the failure for the person running the example, then let the
+        # original exception propagate so the script still exits with an error.
+        print(f"\n✗ Error: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    # Script entry point: run the URL example, which sends a real layout_parsing request for the sample image
+    layout_parsing_example_with_url()
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "zai-sdk"
-version = "0.2.1"
+version = "0.2.2"
 description = "A SDK library for accessing big model apis from Z.ai"
 authors = ["Z.ai"]
 readme = "README.md"
diff --git a/src/zai/_client.py b/src/zai/_client.py
@@ -24,7 +24,7 @@
     from zai.api_resource.web_search import WebSearchApi
     from zai.api_resource.web_reader import WebReaderApi
     from zai.api_resource.file_parser import FileParser
-    from zai.api_resource.ocr import HandwritingOCR
+    from zai.api_resource.ocr import HandwritingOCR, LayoutParsing
 
 from .core import (
     NOT_GIVEN,
@@ -206,6 +206,11 @@ def ocr(self) -> HandwritingOCR:
         from zai.api_resource.ocr import HandwritingOCR
         return HandwritingOCR(self)
 
+    @cached_property
+    def layout_parsing(self) -> LayoutParsing:  # accessor for the layout-parsing resource bound to this client; presumably computed once per client via cached_property — confirm which cached_property is imported
+        from zai.api_resource.ocr import LayoutParsing  # function-scope import, same pattern as the `ocr` accessor just above
+        return LayoutParsing(self)
+
     @property
     @override
     def auth_headers(self) -> dict[str, str]:
diff --git a/src/zai/_version.py b/src/zai/_version.py
@@ -1,2 +1,2 @@
 __title__ = 'Z.ai'
-__version__ = '0.2.1'
+__version__ = '0.2.2'
diff --git a/src/zai/api_resource/__init__.py b/src/zai/api_resource/__init__.py
@@ -14,7 +14,7 @@
 from .files import Files, FilesWithRawResponse
 from .images import Images
 from .moderations import Moderations
-from .ocr import HandwritingOCR
+from .ocr import HandwritingOCR, LayoutParsing
 from .tools import Tools
 from .videos import (
 	Videos,
@@ -40,5 +40,6 @@
     'WebReaderApi',
     'Agents',
     'FileParser',
-    'HandwritingOCR'
+    'HandwritingOCR',
+    'LayoutParsing',
 ]
diff --git a/src/zai/api_resource/ocr/__init__.py b/src/zai/api_resource/ocr/__init__.py
@@ -1,3 +1,4 @@
 from .handwriting_ocr import HandwritingOCR
+from .layout_parsing import LayoutParsing
 
-__all__ = ["HandwritingOCR"]
+__all__ = ["HandwritingOCR", "LayoutParsing"]
diff --git a/src/zai/api_resource/ocr/layout_parsing.py b/src/zai/api_resource/ocr/layout_parsing.py
@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+import httpx
+
+from zai.core import (
+    BaseAPI,
+    NOT_GIVEN,
+    Body,
+    Headers,
+    NotGiven,
+    deepcopy_minimal,
+    make_request_options,
+)
+from zai.types.ocr.layout_parsing_resp import LayoutParsingResp
+
+if TYPE_CHECKING:
+    from zai._client import ZaiClient
+
+__all__ = ["LayoutParsing"]
+
+
+class LayoutParsing(BaseAPI):
+    """
+    Client-side resource for the `/layout_parsing` endpoint.
+
+    Exposes a single `create` call that submits an image or PDF (given as a
+    URL or base64 string) and returns the parsed result as a
+    `LayoutParsingResp`.
+    """
+
+    def __init__(self, client: "ZaiClient") -> None:
+        """Bind this resource to the given client."""
+        super().__init__(client)
+
+    def create(
+        self,
+        *,
+        model: str,
+        file: str,
+        return_crop_images: Optional[bool] | NotGiven = NOT_GIVEN,
+        need_layout_visualization: Optional[bool] | NotGiven = NOT_GIVEN,
+        start_page_id: Optional[int] | NotGiven = NOT_GIVEN,
+        end_page_id: Optional[int] | NotGiven = NOT_GIVEN,
+        request_id: Optional[str] | NotGiven = NOT_GIVEN,
+        user_id: Optional[str] | NotGiven = NOT_GIVEN,
+        extra_headers: Headers | None = None,
+        extra_body: Body | None = None,
+        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+    ) -> LayoutParsingResp:
+        """
+        Submit a document or image for layout parsing and return the result.
+
+        Arguments:
+            model (str): Model code, e.g., 'GLM-OCR' or 'glm-ocr'.
+            file (str): URL or base64 encoded image/PDF to parse.
+                Supported formats: PDF, JPG, PNG.
+                Size limits: Image ≤ 10MB, PDF ≤ 50MB, max 100 pages.
+            return_crop_images (Optional[bool]): When True, cropped image
+                information is returned. Defaults to False.
+            need_layout_visualization (Optional[bool]): When True, detailed
+                layout visualization results are returned. Defaults to False.
+            start_page_id (Optional[int]): Starting page number for PDF parsing.
+            end_page_id (Optional[int]): Ending page number for PDF parsing.
+            request_id (Optional[str]): Unique request identifier.
+                Auto-generated if not provided.
+            user_id (Optional[str]): End user ID for abuse monitoring.
+                Length: 6-128 characters.
+            extra_headers (Headers): Additional HTTP headers.
+            extra_body (Body): Additional request body parameters.
+            timeout (float | httpx.Timeout): Request timeout.
+
+        Returns:
+            LayoutParsingResp: Parsed layout result with these fields:
+                - id: Task ID
+                - created: Unix timestamp
+                - model: Model name
+                - md_results: Markdown formatted recognition result
+                - layout_details: Per-page lists of detected layout elements
+                - layout_visualization: Detailed layout visualization
+                  information (if need_layout_visualization=True)
+                - data_info: Document metadata (page count, dimensions)
+                - usage: Token usage information
+                - request_id: Request identifier
+            Cropped image information is requested with
+            return_crop_images=True. NOTE(review): `crop_images` is not a
+            declared field of LayoutParsingResp; confirm how it is exposed.
+
+        Raises:
+            ValueError: If `model` or `file` is empty.
+        """
+        # Reject empty required arguments locally, before any request is built.
+        if not model:
+            raise ValueError("`model` must be provided.")
+        if not file:
+            raise ValueError("`file` must be provided.")
+
+        # Optional arguments the caller left unset are forwarded as the
+        # NOT_GIVEN sentinel; nothing is filtered out at this level.
+        payload = deepcopy_minimal(
+            {
+                "model": model,
+                "file": file,
+                "return_crop_images": return_crop_images,
+                "need_layout_visualization": need_layout_visualization,
+                "start_page_id": start_page_id,
+                "end_page_id": end_page_id,
+                "request_id": request_id,
+                "user_id": user_id,
+            }
+        )
+        request_options = make_request_options(
+            extra_headers=extra_headers,
+            extra_body=extra_body,
+            timeout=timeout,
+        )
+
+        return self._post(
+            "/layout_parsing",
+            body=payload,
+            options=request_options,
+            cast_type=LayoutParsingResp,
+        )
diff --git a/src/zai/types/ocr/layout_parsing_resp.py b/src/zai/types/ocr/layout_parsing_resp.py
@@ -0,0 +1,48 @@
+from typing import List, Optional, Dict, Any
+
+from zai.core import BaseModel
+
+__all__ = ["LayoutParsingResp", "LayoutDetail", "DataInfo", "PageInfo", "Usage"]
+
+
+class Usage(BaseModel):  # type of LayoutParsingResp.usage
+    """Token usage information"""
+    completion_tokens: int  # required (no default)
+    prompt_tokens: int  # required (no default)
+    prompt_tokens_details: Optional[Dict[str, Any]] = None  # free-form mapping; its keys are not specified in this module
+    total_tokens: int  # required; presumably prompt + completion tokens — confirm against the API
+
+
+class PageInfo(BaseModel):  # element type of DataInfo.pages
+    """Page size information"""
+    width: int  # units are not stated here; presumably pixels — confirm
+    height: int  # presumably the same units as width — confirm
+
+
+class DataInfo(BaseModel):  # type of LayoutParsingResp.data_info
+    """Document basic information"""
+    num_pages: int  # page count of the parsed document
+    pages: Optional[List[PageInfo]] = None  # per-page size entries; None when not supplied
+
+
+class LayoutDetail(BaseModel):  # one detected element; LayoutParsingResp.layout_details holds lists of these
+    """Layout detail element"""
+    index: int  # element index; presumably its position within the page — confirm
+    label: str  # presumably the element type — confirm the allowed values against the API
+    bbox_2d: Optional[List[float]] = None  # bounding box; coordinate order and units are not stated here — confirm
+    content: Optional[str] = None  # text content of the element; None when not supplied
+    height: Optional[int] = None  # NOTE(review): unclear whether this is the element's or the page's height — confirm
+    width: Optional[int] = None  # NOTE(review): same open question as `height`
+
+
+class LayoutParsingResp(BaseModel):  # return type of LayoutParsing.create; NOTE(review): that method's docstring also lists `crop_images`, which is not declared here
+    """Response model for layout parsing API"""
+    id: str  # task ID
+    created: int  # Unix timestamp
+    model: str  # model name
+    md_results: Optional[str] = None  # Markdown-formatted recognition result
+    layout_details: Optional[List[List[LayoutDetail]]] = None  # outer list is iterated as pages, inner list as that page's elements
+    layout_visualization: Optional[List[str]] = None  # visualization results; presumably one URL per page — confirm
+    data_info: Optional[DataInfo] = None  # document metadata (page count, page sizes)
+    usage: Optional[Usage] = None  # token usage; None when not supplied
+    request_id: Optional[str] = None  # request identifier

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`__title__ = 'Z.ai'`
`2`		`-__version__ = '0.2.1'`
	`2`	`+__version__ = '0.2.2'`