diff --git a/examples/layout_parsing_example.py b/examples/layout_parsing_example.py
new file mode 100644
index 0000000..f9fbc66
--- /dev/null
+++ b/examples/layout_parsing_example.py
@@ -0,0 +1,100 @@
+"""
+Layout Parsing Example
+
+This example demonstrates how to use the layout_parsing API to parse
+images and PDFs for OCR with detailed layout detection.
+
+The API returns:
+- Markdown formatted text
+- Detailed layout information (element positions, types, content)
+- Visualization images
+"""
+from zai import ZaiClient
+
+
+def layout_parsing_example_with_url():
+    """Parse a sample image by URL and print each section of the response.
+
+    Sends one request through ``client.layout_parsing.create`` and prints the
+    task metadata, Markdown text, and (when present) page sizes, per-element
+    layout details, visualization URLs and token usage.
+
+    Raises:
+        Exception: Any error raised while sending the request or printing
+            the response is reported on stdout and then re-raised.
+    """
+    client = ZaiClient()
+
+    # Image URL to parse
+    image_url = "https://cdn.bigmodel.cn/static/platform/images/trialcenter/example/visual_img1.jpeg"
+
+    # Longest content preview printed for a single layout element.
+    preview_length = 50
+
+    print(f"Sending request to parse image: {image_url}")
+
+    try:
+        response = client.layout_parsing.create(
+            model="glm-ocr",
+            file=image_url,
+        )
+
+        print("\n✓ Request successful!")
+        print(f"Task ID: {response.id}")
+        print(f"Model: {response.model}")
+        print(f"Created at: {response.created}")
+        print(f"Request ID: {response.request_id}")
+
+        # Print document info
+        if response.data_info:
+            print("\nDocument Info:")
+            print(f" - Total pages: {response.data_info.num_pages}")
+            if response.data_info.pages:
+                for page_number, page in enumerate(response.data_info.pages, start=1):
+                    print(f" - Page {page_number}: {page.width}x{page.height}")
+
+        # Print markdown results
+        print("\n=== Markdown Results ===\n")
+        print(response.md_results)
+
+        # Print layout details if available
+        if response.layout_details:
+            print("\n=== Layout Details ===")
+            for page_number, page_details in enumerate(response.layout_details, start=1):
+                print(f"\nPage {page_number}:")
+                for element in page_details:
+                    content = element.content
+                    if not content:
+                        preview = "N/A"
+                    elif len(content) > preview_length:
+                        # Mark the cut only when text was actually dropped.
+                        preview = f"{content[:preview_length]}..."
+                    else:
+                        preview = content
+                    print(f" [{element.index}] {element.label}: {preview}")
+                    if element.bbox_2d:
+                        print(f" BBox: {element.bbox_2d}")
+
+        # Print visualization URLs
+        if response.layout_visualization:
+            print("\n=== Visualization URLs ===")
+            for page_number, url in enumerate(response.layout_visualization, start=1):
+                print(f" Page {page_number}: {url}")
+
+        # Print usage information
+        if response.usage:
+            print("\n=== Token Usage ===")
+            print(f" - Prompt tokens: {response.usage.prompt_tokens}")
+            print(f" - Completion tokens: {response.usage.completion_tokens}")
+            print(f" - Total tokens: {response.usage.total_tokens}")
+            if response.usage.prompt_tokens_details:
+                print(f" - Prompt details: {response.usage.prompt_tokens_details}")
+
+    except Exception as e:
+        print(f"\n✗ Error: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    # Run the URL example with the test image
+    layout_parsing_example_with_url()
diff --git a/pyproject.toml b/pyproject.toml
index 21dd79b..a539fd6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "zai-sdk"
-version = "0.2.1"
+version = "0.2.2"
 description = "A SDK library for accessing big model apis from Z.ai"
 authors = ["Z.ai"]
 readme = "README.md"
diff --git a/src/zai/_client.py b/src/zai/_client.py
index faadce9..a9d7c09 100644
--- a/src/zai/_client.py
+++ b/src/zai/_client.py
@@ -24,7 +24,7 @@
     from zai.api_resource.web_search import WebSearchApi
     from zai.api_resource.web_reader import WebReaderApi
     from zai.api_resource.file_parser import FileParser
-    from zai.api_resource.ocr import HandwritingOCR
+    from zai.api_resource.ocr import HandwritingOCR, LayoutParsing
 
 from .core import (
     NOT_GIVEN,
@@ -206,6 +206,11 @@ def ocr(self) -> HandwritingOCR:
         from zai.api_resource.ocr import HandwritingOCR
         return HandwritingOCR(self)
 
+    @cached_property
+    def layout_parsing(self) -> LayoutParsing:
+        from zai.api_resource.ocr import LayoutParsing
+        return LayoutParsing(self)
+
     @property
     @override
     def auth_headers(self) -> dict[str, str]:
diff --git a/src/zai/_version.py b/src/zai/_version.py
index 9883b7a..3acae28 100644
--- a/src/zai/_version.py
+++ b/src/zai/_version.py
@@ -1,2 +1,2 @@
 __title__ = 'Z.ai'
-__version__ = '0.2.1'
+__version__ = '0.2.2'
diff --git
a/src/zai/api_resource/__init__.py b/src/zai/api_resource/__init__.py
index f2bd160..e0aba3b 100644
--- a/src/zai/api_resource/__init__.py
+++ b/src/zai/api_resource/__init__.py
@@ -14,7 +14,7 @@
 from .files import Files, FilesWithRawResponse
 from .images import Images
 from .moderations import Moderations
-from .ocr import HandwritingOCR
+from .ocr import HandwritingOCR, LayoutParsing
 from .tools import Tools
 from .videos import (
     Videos,
@@ -40,5 +40,6 @@
     'WebReaderApi',
     'Agents',
     'FileParser',
-    'HandwritingOCR'
+    'HandwritingOCR',
+    'LayoutParsing',
 ]
diff --git a/src/zai/api_resource/ocr/__init__.py b/src/zai/api_resource/ocr/__init__.py
index 0a9a47a..ac2f751 100644
--- a/src/zai/api_resource/ocr/__init__.py
+++ b/src/zai/api_resource/ocr/__init__.py
@@ -1,3 +1,4 @@
 from .handwriting_ocr import HandwritingOCR
+from .layout_parsing import LayoutParsing
 
-__all__ = ["HandwritingOCR"]
+__all__ = ["HandwritingOCR", "LayoutParsing"]
diff --git a/src/zai/api_resource/ocr/layout_parsing.py b/src/zai/api_resource/ocr/layout_parsing.py
new file mode 100644
index 0000000..ff10ef6
--- /dev/null
+++ b/src/zai/api_resource/ocr/layout_parsing.py
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+import httpx
+
+from zai.core import (
+    BaseAPI,
+    NOT_GIVEN,
+    Body,
+    Headers,
+    NotGiven,
+    deepcopy_minimal,
+    make_request_options,
+)
+from zai.types.ocr.layout_parsing_resp import LayoutParsingResp
+
+if TYPE_CHECKING:
+    from zai._client import ZaiClient
+
+__all__ = ["LayoutParsing"]
+
+
+class LayoutParsing(BaseAPI):
+    """
+    Layout parsing API resource for document/image OCR with layout detection.
+
+    This API supports parsing images and PDF documents to extract text content
+    with detailed layout information.
+    """
+
+    def __init__(self, client: "ZaiClient") -> None:
+        super().__init__(client)
+
+    def create(
+        self,
+        *,
+        model: str,
+        file: str,
+        return_crop_images: Optional[bool] | NotGiven = NOT_GIVEN,
+        need_layout_visualization: Optional[bool] | NotGiven = NOT_GIVEN,
+        start_page_id: Optional[int] | NotGiven = NOT_GIVEN,
+        end_page_id: Optional[int] | NotGiven = NOT_GIVEN,
+        request_id: Optional[str] | NotGiven = NOT_GIVEN,
+        user_id: Optional[str] | NotGiven = NOT_GIVEN,
+        extra_headers: Headers | None = None,
+        extra_body: Body | None = None,
+        timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
+    ) -> LayoutParsingResp:
+        """
+        Parse document or image layout and extract text content.
+
+        Arguments:
+            model (str): Model code, e.g., 'GLM-OCR' or 'glm-ocr'
+            file (str): URL or base64 encoded image/PDF to parse.
+                Supported formats: PDF, JPG, PNG.
+                Size limits: Image ≤ 10MB, PDF ≤ 50MB, max 100 pages.
+            return_crop_images (Optional[bool]): Whether to return crop images.
+                Defaults to False. When True, returns cropped image information.
+            need_layout_visualization (Optional[bool]): Whether to return detailed layout visualization results.
+                Defaults to False. When True, returns detailed layout image result information.
+            start_page_id (Optional[int]): Starting page number for PDF parsing.
+            end_page_id (Optional[int]): Ending page number for PDF parsing.
+            request_id (Optional[str]): Unique request identifier. Auto-generated if not provided.
+            user_id (Optional[str]): End user ID for abuse monitoring.
+                Length: 6-128 characters.
+            extra_headers (Headers): Additional HTTP headers.
+            extra_body (Body): Additional request body parameters.
+            timeout (float | httpx.Timeout): Request timeout.
+
+        Returns:
+            LayoutParsingResp: Parsed layout result including:
+                - id: Task ID
+                - created: Unix timestamp
+                - model: Model name
+                - md_results: Markdown formatted recognition result
+                - layout_details: Per-page lists of detected layout elements
+                - crop_images: Cropped image information (if return_crop_images=True)
+                - layout_visualization: Detailed layout visualization information (if need_layout_visualization=True)
+                - data_info: Document metadata (page count, dimensions)
+                - usage: Token usage information
+                - request_id: Request identifier
+
+        Raises:
+            ValueError: If `model` or `file` is empty.
+        """
+        if not model:
+            raise ValueError("`model` must be provided.")
+        if not file:
+            raise ValueError("`file` must be provided.")
+
+        body = deepcopy_minimal(
+            {
+                "model": model,
+                "file": file,
+                "return_crop_images": return_crop_images,
+                "need_layout_visualization": need_layout_visualization,
+                "start_page_id": start_page_id,
+                "end_page_id": end_page_id,
+                "request_id": request_id,
+                "user_id": user_id,
+            }
+        )
+
+        return self._post(
+            "/layout_parsing",
+            body=body,
+            options=make_request_options(
+                extra_headers=extra_headers, extra_body=extra_body, timeout=timeout
+            ),
+            cast_type=LayoutParsingResp,
+        )
diff --git a/src/zai/types/ocr/layout_parsing_resp.py b/src/zai/types/ocr/layout_parsing_resp.py
new file mode 100644
index 0000000..129e65f
--- /dev/null
+++ b/src/zai/types/ocr/layout_parsing_resp.py
@@ -0,0 +1,52 @@
+from typing import List, Optional, Dict, Any
+
+from zai.core import BaseModel
+
+__all__ = ["LayoutParsingResp", "LayoutDetail", "DataInfo", "PageInfo", "Usage"]
+
+
+class Usage(BaseModel):
+    """Token usage information"""
+    completion_tokens: int
+    prompt_tokens: int
+    prompt_tokens_details: Optional[Dict[str, Any]] = None
+    total_tokens: int
+
+
+class PageInfo(BaseModel):
+    """Page size information"""
+    width: int
+    height: int
+
+
+class DataInfo(BaseModel):
+    """Document basic information"""
+    num_pages: int
+    pages: Optional[List[PageInfo]] = None
+
+
+class LayoutDetail(BaseModel):
+    """Layout detail element"""
+    index: int
+    label: str
+    bbox_2d: Optional[List[float]] = None
+    content: Optional[str] = None
+    height: Optional[int] = None
+    width: Optional[int] = None
+
+
+class LayoutParsingResp(BaseModel):
+    """Response model for layout parsing API"""
+    id: str
+    created: int
+    model: str
+    md_results: Optional[str] = None
+    # One inner list of elements per page.
+    layout_details: Optional[List[List[LayoutDetail]]] = None
+    # Returned when `return_crop_images=True` is requested. The payload schema
+    # is not defined in this module, so it is left untyped -- TODO confirm.
+    crop_images: Optional[Any] = None
+    layout_visualization: Optional[List[str]] = None
+    data_info: Optional[DataInfo] = None
+    usage: Optional[Usage] = None
+    request_id: Optional[str] = None