From a4c4c201543d4b3e9611ae397a7f18e0b156e6ab Mon Sep 17 00:00:00 2001 From: tomsun28 Date: Sun, 1 Feb 2026 17:31:19 +0800 Subject: [PATCH 1/6] feat: add api layout_parsing --- src/zai/_client.py | 7 +- src/zai/api_resource/__init__.py | 5 +- src/zai/api_resource/ocr/__init__.py | 3 +- src/zai/api_resource/ocr/layout_parsing.py | 102 +++++++++++++++++++++ src/zai/types/ocr/layout_parsing_resp.py | 39 ++++++++ 5 files changed, 152 insertions(+), 4 deletions(-) create mode 100644 src/zai/api_resource/ocr/layout_parsing.py create mode 100644 src/zai/types/ocr/layout_parsing_resp.py diff --git a/src/zai/_client.py b/src/zai/_client.py index faadce9..a9d7c09 100644 --- a/src/zai/_client.py +++ b/src/zai/_client.py @@ -24,7 +24,7 @@ from zai.api_resource.web_search import WebSearchApi from zai.api_resource.web_reader import WebReaderApi from zai.api_resource.file_parser import FileParser - from zai.api_resource.ocr import HandwritingOCR + from zai.api_resource.ocr import HandwritingOCR, LayoutParsing from .core import ( NOT_GIVEN, @@ -206,6 +206,11 @@ def ocr(self) -> HandwritingOCR: from zai.api_resource.ocr import HandwritingOCR return HandwritingOCR(self) + @cached_property + def layout_parsing(self) -> LayoutParsing: + from zai.api_resource.ocr import LayoutParsing + return LayoutParsing(self) + @property @override def auth_headers(self) -> dict[str, str]: diff --git a/src/zai/api_resource/__init__.py b/src/zai/api_resource/__init__.py index f2bd160..e0aba3b 100644 --- a/src/zai/api_resource/__init__.py +++ b/src/zai/api_resource/__init__.py @@ -14,7 +14,7 @@ from .files import Files, FilesWithRawResponse from .images import Images from .moderations import Moderations -from .ocr import HandwritingOCR +from .ocr import HandwritingOCR, LayoutParsing from .tools import Tools from .videos import ( Videos, @@ -40,5 +40,6 @@ 'WebReaderApi', 'Agents', 'FileParser', - 'HandwritingOCR' + 'HandwritingOCR', + 'LayoutParsing', ] diff --git a/src/zai/api_resource/ocr/__init__.py 
b/src/zai/api_resource/ocr/__init__.py index 0a9a47a..ac2f751 100644 --- a/src/zai/api_resource/ocr/__init__.py +++ b/src/zai/api_resource/ocr/__init__.py @@ -1,3 +1,4 @@ from .handwriting_ocr import HandwritingOCR +from .layout_parsing import LayoutParsing -__all__ = ["HandwritingOCR"] +__all__ = ["HandwritingOCR", "LayoutParsing"] diff --git a/src/zai/api_resource/ocr/layout_parsing.py b/src/zai/api_resource/ocr/layout_parsing.py new file mode 100644 index 0000000..b8982ba --- /dev/null +++ b/src/zai/api_resource/ocr/layout_parsing.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +import httpx + +from zai.core import ( + BaseAPI, + NOT_GIVEN, + Body, + Headers, + NotGiven, + deepcopy_minimal, + make_request_options, +) +from zai.types.ocr.layout_parsing_resp import LayoutParsingResp + +if TYPE_CHECKING: + from zai._client import ZaiClient + +__all__ = ["LayoutParsing"] + + +class LayoutParsing(BaseAPI): + """ + Layout parsing API resource for document/image OCR with layout detection. + + This API supports parsing images and PDF documents to extract text content + with detailed layout information. + """ + + def __init__(self, client: "ZaiClient") -> None: + super().__init__(client) + + def create( + self, + *, + model: str, + file: str, + use_layout_details: Optional[bool] | NotGiven = NOT_GIVEN, + start_page_id: Optional[int] | NotGiven = NOT_GIVEN, + end_page_id: Optional[int] | NotGiven = NOT_GIVEN, + request_id: Optional[str] | NotGiven = NOT_GIVEN, + user_id: Optional[str] | NotGiven = NOT_GIVEN, + extra_headers: Headers | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> LayoutParsingResp: + """ + Parse document or image layout and extract text content. + + Arguments: + model (str): Model code, e.g., 'GLM-OCR' or 'glm-ocr' + file (str): URL or base64 encoded image/PDF to parse. + Supported formats: PDF, JPG, PNG. 
+ Size limits: Image ≤ 10MB, PDF ≤ 50MB, max 100 pages. + use_layout_details (Optional[bool]): Whether to return detailed layout info. + Defaults to True. When True, returns more detailed element positions and types. + start_page_id (Optional[int]): Starting page number for PDF parsing. + end_page_id (Optional[int]): Ending page number for PDF parsing. + request_id (Optional[str]): Unique request identifier. Auto-generated if not provided. + user_id (Optional[str]): End user ID for abuse monitoring. + Length: 6-128 characters. + extra_headers (Headers): Additional HTTP headers. + extra_body (Body): Additional request body parameters. + timeout (float | httpx.Timeout): Request timeout. + + Returns: + LayoutParsingResp: Parsed layout result including: + - id: Task ID + - created: Unix timestamp + - model: Model name + - md_results: Markdown formatted recognition result + - layout_details: Detailed layout information (if use_layout_details=True) + - layout_visualization: Visualization image URLs + - data_info: Document metadata (page count, dimensions) + """ + if not model: + raise ValueError("`model` must be provided.") + if not file: + raise ValueError("`file` must be provided.") + + body = deepcopy_minimal( + { + "model": model, + "file": file, + "use_layout_details": use_layout_details, + "start_page_id": start_page_id, + "end_page_id": end_page_id, + "request_id": request_id, + "user_id": user_id, + } + ) + + return self._post( + "/layout_parsing", + body=body, + options=make_request_options( + extra_headers=extra_headers, extra_body=extra_body, timeout=timeout + ), + cast_type=LayoutParsingResp, + ) diff --git a/src/zai/types/ocr/layout_parsing_resp.py b/src/zai/types/ocr/layout_parsing_resp.py new file mode 100644 index 0000000..a2b13c1 --- /dev/null +++ b/src/zai/types/ocr/layout_parsing_resp.py @@ -0,0 +1,39 @@ +from typing import List, Optional + +from zai.core import BaseModel + +__all__ = ["LayoutParsingResp", "LayoutDetail", "DataInfo", "PageInfo"] + + 
+class PageInfo(BaseModel): + """Page size information""" + width: int + height: int + + +class DataInfo(BaseModel): + """Document basic information""" + num_pages: int + pages: Optional[List[PageInfo]] = None + + +class LayoutDetail(BaseModel): + """Layout detail element""" + index: int + label: str + bbox_2d: Optional[List[float]] = None + content: Optional[str] = None + height: Optional[int] = None + width: Optional[int] = None + + +class LayoutParsingResp(BaseModel): + """Response model for layout parsing API""" + id: str + created: int + model: str + md_results: Optional[str] = None + layout_details: Optional[List[List[LayoutDetail]]] = None + layout_visualization: Optional[List[str]] = None + data_info: Optional[DataInfo] = None + request_id: Optional[str] = None From c1dce70024a4c10332bc0eb48e33fe8787ca6013 Mon Sep 17 00:00:00 2001 From: tomsun28 Date: Sun, 1 Feb 2026 17:40:44 +0800 Subject: [PATCH 2/6] feat: add layout parsing example demonstrating OCR with layout detection --- examples/layout_parsing_example.py | 136 +++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 examples/layout_parsing_example.py diff --git a/examples/layout_parsing_example.py b/examples/layout_parsing_example.py new file mode 100644 index 0000000..48e8fcb --- /dev/null +++ b/examples/layout_parsing_example.py @@ -0,0 +1,136 @@ +""" +Layout Parsing Example + +This example demonstrates how to use the layout_parsing API to parse +images and PDFs for OCR with detailed layout detection. 
+ +The API returns: +- Markdown formatted text +- Detailed layout information (element positions, types, content) +- Visualization images +""" +from zai import ZaiClient + +def layout_parsing_example_with_url(): + """Example using an image URL for layout parsing.""" + client = ZaiClient( + base_url=BASE_URL, + api_key=API_KEY + ) + + # Image URL to parse + image_url = "https://cdn.bigmodel.cn/static/platform/images/trialcenter/example/visual_img1.jpeg" + + print("=== Layout Parsing Example (URL) ===\n") + print(f"Sending request to parse image: {image_url}") + + try: + response = client.layout_parsing.create( + model="glm-ocr", + file=image_url, + use_layout_details=True # Get detailed layout info + ) + + print("\n✓ Request successful!") + print(f"Task ID: {response.id}") + print(f"Model: {response.model}") + print(f"Created at: {response.created}") + print(f"Request ID: {response.request_id}") + + # Print document info + if response.data_info: + print(f"\nDocument Info:") + print(f" - Total pages: {response.data_info.num_pages}") + if response.data_info.pages: + for i, page in enumerate(response.data_info.pages): + print(f" - Page {i+1}: {page.width}x{page.height}") + + # Print markdown results + print(f"\n=== Markdown Results ===\n") + print(response.md_results) + + # Print layout details if available + if response.layout_details: + print(f"\n=== Layout Details ===") + for page_idx, page_details in enumerate(response.layout_details): + print(f"\nPage {page_idx + 1}:") + for element in page_details: + print(f" [{element.index}] {element.label}: {element.content[:50] if element.content else 'N/A'}...") + if element.bbox_2d: + print(f" BBox: {element.bbox_2d}") + + # Print visualization URLs + if response.layout_visualization: + print(f"\n=== Visualization URLs ===") + for i, url in enumerate(response.layout_visualization): + print(f" Page {i + 1}: {url}") + + except Exception as e: + print(f"\n✗ Error: {e}") + raise + + +def layout_parsing_example_with_base64(): + 
"""Example using a base64 encoded image for layout parsing.""" + client = ZaiClient() + + print("\n=== Layout Parsing Example (Base64) ===\n") + print("Note: Replace 'your_base64_encoded_image' with an actual base64 string") + + # Example with base64 (you would read and encode an actual image file) + # import base64 + # with open("image.png", "rb") as f: + # base64_image = base64.b64encode(f.read()).decode("utf-8") + # base64_url = f"data:image/png;base64,{base64_image}" + + # For demonstration, this shows the API usage + # Uncomment and use with actual base64 data: + """ + response = client.layout_parsing.create( + model="glm-ocr", + file="data:image/jpeg;base64,/9j/4AAQ...", # your base64 data + use_layout_details=True + ) + print(response.md_results) + """ + print("(Skipping base64 example - no actual image data provided)") + + +def layout_parsing_example_with_pdf_pages(): + """Example using PDF with page range selection.""" + client = ZaiClient( + base_url=BASE_URL, + api_key=API_KEY + ) + + print("\n=== Layout Parsing Example (PDF with page range) ===\n") + print("Note: Replace 'your_pdf_url' with an actual PDF URL") + + # For demonstration, this shows the API usage + # Uncomment and use with actual PDF: + """ + pdf_url = "https://example.com/document.pdf" + + response = client.layout_parsing.create( + model="glm-ocr", + file=pdf_url, + use_layout_details=True, + start_page_id=1, # Start from page 1 + end_page_id=5, # Parse up to page 5 + request_id="unique-request-id-123", # Optional custom request ID + user_id="user-456" # Optional user ID for tracking + ) + + print(f"Parsed {response.data_info.num_pages} pages") + print(response.md_results) + """ + print("(Skipping PDF example - no actual PDF URL provided)") + + +if __name__ == "__main__": + # Run the URL example with the test image + layout_parsing_example_with_url() + + # Uncomment to run other examples: + # layout_parsing_example_with_base64() + # layout_parsing_example_with_pdf_pages() From 
4af699029709ca5edb9505eb3240aa8696e81839 Mon Sep 17 00:00:00 2001 From: tomsun28 Date: Sun, 1 Feb 2026 17:42:44 +0800 Subject: [PATCH 3/6] chore: bump version to 0.2.2 --- pyproject.toml | 2 +- src/zai/_version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 21dd79b..a539fd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "zai-sdk" -version = "0.2.1" +version = "0.2.2" description = "A SDK library for accessing big model apis from Z.ai" authors = ["Z.ai"] readme = "README.md" diff --git a/src/zai/_version.py b/src/zai/_version.py index 9883b7a..3acae28 100644 --- a/src/zai/_version.py +++ b/src/zai/_version.py @@ -1,2 +1,2 @@ __title__ = 'Z.ai' -__version__ = '0.2.1' +__version__ = '0.2.2' From 23ab6286098b4499a2c2a4f0a1dfa794fe8bb0c8 Mon Sep 17 00:00:00 2001 From: tomsun28 Date: Sun, 1 Feb 2026 17:48:21 +0800 Subject: [PATCH 4/6] refactor: simplify layout parsing example by removing unused functions and comments --- examples/layout_parsing_example.py | 72 +----------------------------- 1 file changed, 1 insertion(+), 71 deletions(-) diff --git a/examples/layout_parsing_example.py b/examples/layout_parsing_example.py index 48e8fcb..059753a 100644 --- a/examples/layout_parsing_example.py +++ b/examples/layout_parsing_example.py @@ -3,25 +3,16 @@ This example demonstrates how to use the layout_parsing API to parse images and PDFs for OCR with detailed layout detection. 
- -The API returns: -- Markdown formatted text -- Detailed layout information (element positions, types, content) -- Visualization images """ from zai import ZaiClient def layout_parsing_example_with_url(): """Example using an image URL for layout parsing.""" - client = ZaiClient( - base_url=BASE_URL, - api_key=API_KEY - ) + client = ZaiClient() # Image URL to parse image_url = "https://cdn.bigmodel.cn/static/platform/images/trialcenter/example/visual_img1.jpeg" - print("=== Layout Parsing Example (URL) ===\n") print(f"Sending request to parse image: {image_url}") try: @@ -70,67 +61,6 @@ def layout_parsing_example_with_url(): raise -def layout_parsing_example_with_base64(): - """Example using a base64 encoded image for layout parsing.""" - client = ZaiClient() - - print("\n=== Layout Parsing Example (Base64) ===\n") - print("Note: Replace 'your_base64_encoded_image' with an actual base64 string") - - # Example with base64 (you would read and encode an actual image file) - # import base64 - # with open("image.png", "rb") as f: - # base64_image = base64.b64encode(f.read()).decode("utf-8") - # base64_url = f"data:image/png;base64,{base64_image}" - - # For demonstration, this shows the API usage - # Uncomment and use with actual base64 data: - """ - response = client.layout_parsing.create( - model="glm-ocr", - file="data:image/jpeg;base64,/9j/4AAQ...", # your base64 data - use_layout_details=True - ) - print(response.md_results) - """ - print("(Skipping base64 example - no actual image data provided)") - - -def layout_parsing_example_with_pdf_pages(): - """Example using PDF with page range selection.""" - client = ZaiClient( - base_url=BASE_URL, - api_key=API_KEY - ) - - print("\n=== Layout Parsing Example (PDF with page range) ===\n") - print("Note: Replace 'your_pdf_url' with an actual PDF URL") - - # For demonstration, this shows the API usage - # Uncomment and use with actual PDF: - """ - pdf_url = "https://example.com/document.pdf" - - response = 
client.layout_parsing.create( - model="glm-ocr", - file=pdf_url, - use_layout_details=True, - start_page_id=1, # Start from page 1 - end_page_id=5, # Parse up to page 5 - request_id="unique-request-id-123", # Optional custom request ID - user_id="user-456" # Optional user ID for tracking - ) - - print(f"Parsed {response.data_info.num_pages} pages") - print(response.md_results) - """ - print("(Skipping PDF example - no actual PDF URL provided)") - - if __name__ == "__main__": # Run the URL example with the test image layout_parsing_example_with_url() - - # Uncomment to run other examples: - # layout_parsing_example_with_base64() - # layout_parsing_example_with_pdf_pages() From 7219af8fd62668e0c52d6a45a82407d7d65e0ce9 Mon Sep 17 00:00:00 2001 From: tomsun28 Date: Mon, 2 Feb 2026 16:11:48 +0800 Subject: [PATCH 5/6] feat: add token usage to layout_parsing response --- examples/layout_parsing_example.py | 14 ++++++++++++++ src/zai/types/ocr/layout_parsing_resp.py | 13 +++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/examples/layout_parsing_example.py b/examples/layout_parsing_example.py index 059753a..5a76139 100644 --- a/examples/layout_parsing_example.py +++ b/examples/layout_parsing_example.py @@ -3,6 +3,11 @@ This example demonstrates how to use the layout_parsing API to parse images and PDFs for OCR with detailed layout detection. 
+ +The API returns: +- Markdown formatted text +- Detailed layout information (element positions, types, content) +- Visualization images """ from zai import ZaiClient @@ -56,6 +61,15 @@ def layout_parsing_example_with_url(): for i, url in enumerate(response.layout_visualization): print(f" Page {i + 1}: {url}") + # Print usage information + if response.usage: + print(f"\n=== Token Usage ===") + print(f" - Prompt tokens: {response.usage.prompt_tokens}") + print(f" - Completion tokens: {response.usage.completion_tokens}") + print(f" - Total tokens: {response.usage.total_tokens}") + if response.usage.prompt_tokens_details: + print(f" - Prompt details: {response.usage.prompt_tokens_details}") + except Exception as e: print(f"\n✗ Error: {e}") raise diff --git a/src/zai/types/ocr/layout_parsing_resp.py b/src/zai/types/ocr/layout_parsing_resp.py index a2b13c1..129e65f 100644 --- a/src/zai/types/ocr/layout_parsing_resp.py +++ b/src/zai/types/ocr/layout_parsing_resp.py @@ -1,8 +1,16 @@ -from typing import List, Optional +from typing import List, Optional, Dict, Any from zai.core import BaseModel -__all__ = ["LayoutParsingResp", "LayoutDetail", "DataInfo", "PageInfo"] +__all__ = ["LayoutParsingResp", "LayoutDetail", "DataInfo", "PageInfo", "Usage"] + + +class Usage(BaseModel): + """Token usage information""" + completion_tokens: int + prompt_tokens: int + prompt_tokens_details: Optional[Dict[str, Any]] = None + total_tokens: int class PageInfo(BaseModel): @@ -36,4 +44,5 @@ class LayoutParsingResp(BaseModel): layout_details: Optional[List[List[LayoutDetail]]] = None layout_visualization: Optional[List[str]] = None data_info: Optional[DataInfo] = None + usage: Optional[Usage] = None request_id: Optional[str] = None From 08085f6ef65c968fb522057b4bbe4a1d1ba3589a Mon Sep 17 00:00:00 2001 From: tomsun28 Date: Mon, 2 Feb 2026 23:54:18 +0800 Subject: [PATCH 6/6] refactor: rename layout parsing parameters for clarity Replace `use_layout_details` with `return_crop_images` and 
`need_layout_visualization` to better reflect their specific purposes. Update documentation and remove parameter from example. --- examples/layout_parsing_example.py | 3 +-- src/zai/api_resource/ocr/layout_parsing.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/examples/layout_parsing_example.py b/examples/layout_parsing_example.py index 5a76139..f9fbc66 100644 --- a/examples/layout_parsing_example.py +++ b/examples/layout_parsing_example.py @@ -23,8 +23,7 @@ def layout_parsing_example_with_url(): try: response = client.layout_parsing.create( model="glm-ocr", - file=image_url, - use_layout_details=True # Get detailed layout info + file=image_url ) print("\n✓ Request successful!") diff --git a/src/zai/api_resource/ocr/layout_parsing.py b/src/zai/api_resource/ocr/layout_parsing.py index b8982ba..ff10ef6 100644 --- a/src/zai/api_resource/ocr/layout_parsing.py +++ b/src/zai/api_resource/ocr/layout_parsing.py @@ -37,7 +37,8 @@ def create( *, model: str, file: str, - use_layout_details: Optional[bool] | NotGiven = NOT_GIVEN, + return_crop_images: Optional[bool] | NotGiven = NOT_GIVEN, + need_layout_visualization: Optional[bool] | NotGiven = NOT_GIVEN, start_page_id: Optional[int] | NotGiven = NOT_GIVEN, end_page_id: Optional[int] | NotGiven = NOT_GIVEN, request_id: Optional[str] | NotGiven = NOT_GIVEN, @@ -54,8 +55,10 @@ def create( file (str): URL or base64 encoded image/PDF to parse. Supported formats: PDF, JPG, PNG. Size limits: Image ≤ 10MB, PDF ≤ 50MB, max 100 pages. - use_layout_details (Optional[bool]): Whether to return detailed layout info. - Defaults to True. When True, returns more detailed element positions and types. + return_crop_images (Optional[bool]): Whether to return crop images. + Defaults to False. When True, returns cropped image information. + need_layout_visualization (Optional[bool]): Whether to return detailed layout visualization results. + Defaults to False. 
When True, returns detailed layout image result information. start_page_id (Optional[int]): Starting page number for PDF parsing. end_page_id (Optional[int]): Ending page number for PDF parsing. request_id (Optional[str]): Unique request identifier. Auto-generated if not provided. @@ -71,8 +74,8 @@ def create( - created: Unix timestamp - model: Model name - md_results: Markdown formatted recognition result - - layout_details: Detailed layout information (if use_layout_details=True) - - layout_visualization: Visualization image URLs + - crop_images: Cropped image information (if return_crop_images=True) + - layout_visualization: Detailed layout visualization information (if need_layout_visualization=True) - data_info: Document metadata (page count, dimensions) """ if not model: @@ -84,7 +87,8 @@ def create( { "model": model, "file": file, - "use_layout_details": use_layout_details, + "return_crop_images": return_crop_images, + "need_layout_visualization": need_layout_visualization, "start_page_id": start_page_id, "end_page_id": end_page_id, "request_id": request_id,