Ontos-AI · suguanYang · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026
diff --git a/docs/usage.md b/docs/usage.md
@@ -249,7 +249,7 @@ Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`).
 | `content` | `str` | The text content |
 | `path` | `str \| None` | Document structure path (e.g. `"Section 1 > Subsection 2"`) |
 | `length` | `int` | Character count |
-| `tokens` | `int \| None` | Token count (if computed) |
+| `tokens` | `List[str] \| None` | Tokenized words returned by the parser pipeline |
 | `keywords` | `List[str] \| None` | Extracted keywords (requires `summary_txt: True`) |
 | `summary` | `str \| None` | AI-generated summary (requires `summary_txt: True`) |
 | `relationships` | `List \| None` | Relationships to other chunks |
@@ -729,4 +729,4 @@ def verify_webhook(payload: bytes, signature: str, secret: str) -> bool:
 
 ### Retry behavior
 
-If your webhook endpoint returns a non-2xx status code, the API retries delivery up to 6 times with exponential backoff.
+If your webhook endpoint returns a non-2xx status code, the API retries delivery up to 6 times with exponential backoff.
diff --git a/src/knowhere/lib/result_parser.py b/src/knowhere/lib/result_parser.py
@@ -18,6 +18,7 @@
     ParseResult,
     TableChunk,
     TextChunk,
+    TextChunkTokens,
 )
 
 _logger = getLogger()
@@ -79,6 +80,38 @@ def _extractFilePath(raw: Dict[str, Any]) -> Optional[str]:
     return fallback
 
 
+def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]:
+    """Coerce each item with str(), strip whitespace, and drop tokens left empty."""
+    normalized_tokens: List[str] = []
+    for raw_token in raw_tokens:
+        token_text: str = str(raw_token).strip()  # NOTE(review): None becomes "None", not dropped
+        if token_text:
+            normalized_tokens.append(token_text)
+    return normalized_tokens
+
+
+def _parseTextChunkTokens(
+    raw_tokens: Any,
+    *,
+    chunk_id: str,
+) -> Optional[TextChunkTokens]:
+    """Return None for a missing payload or a normalized list; raise KnowhereError otherwise."""
+    if raw_tokens is None:
+        return None
+    if isinstance(raw_tokens, bool):  # NOTE(review): redundant; the final raise also reports "got bool"
+        raise KnowhereError(
+            f"Invalid tokens payload for text chunk '{chunk_id}': expected list[str], got bool."
+        )
+    if isinstance(raw_tokens, list):
+        return _normalizeTokenList(raw_tokens)
+
+    raise KnowhereError(
+        "Invalid tokens payload for text chunk "
+        f"'{chunk_id}': expected list[str], "
+        f"got {type(raw_tokens).__name__}."
+    )
+
+
 def _buildChunks(
     raw_chunks: List[Dict[str, Any]],
     zf: zipfile.ZipFile,
@@ -127,13 +160,15 @@ def _buildChunks(
             )
         else:
             metadata = raw.get("metadata", {})
+            chunk_id: str = raw.get("chunk_id", "")
+            raw_tokens: Any = metadata.get("tokens", raw.get("tokens"))
             chunk = TextChunk(
-                chunk_id=raw.get("chunk_id", ""),
+                chunk_id=chunk_id,
                 type="text",
                 content=raw.get("content", ""),
                 path=raw.get("path"),
                 length=metadata.get("length", raw.get("length", 0)),
-                tokens=metadata.get("tokens", raw.get("tokens")),
+                tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
                 keywords=metadata.get("keywords", raw.get("keywords")),
                 summary=metadata.get("summary", raw.get("summary")),
                 relationships=metadata.get("relationships", raw.get("relationships")),

diff --git a/src/knowhere/types/result.py b/src/knowhere/types/result.py
@@ -8,6 +8,7 @@
 from typing import Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel, Field
+from typing_extensions import TypeAlias
 
 from knowhere._exceptions import ValidationError
 
@@ -124,12 +125,15 @@ class BaseChunk(BaseModel):
     path: Optional[str] = None
 
 
+TextChunkTokens: TypeAlias = List[str]  # token strings carried by a text chunk
+
+
 class TextChunk(BaseChunk):
     """A text chunk extracted from the document."""
 
     type: str = "text"
     length: int = 0
-    tokens: Optional[int] = None
+    tokens: Optional[TextChunkTokens] = None
     keywords: Optional[List[str]] = None
     summary: Optional[str] = None
     relationships: Optional[List[Union[Dict[str, Any], str]]] = None

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -138,7 +138,7 @@ def _factory(
         "content": "Hello world",
         "path": "test/section1",
         "length": 11,
-        "tokens": 2,
+        "tokens": ["Hello", "world"],
         "keywords": ["hello"],
         "summary": "A greeting",
         "relationships": [],

diff --git a/tests/fixtures/real_result.zip b/tests/fixtures/real_result.zip
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -391,7 +391,7 @@ def test_from_dict(self) -> None:
             content="Some text content",
             path="doc/section1",
             length=17,
-            tokens=4,
+            tokens=["Some", "text", "content"],
             keywords=["text", "content"],
             summary="A text chunk",
             relationships=[{"target": "text_2", "type": "follows"}],
@@ -400,7 +400,7 @@ def test_from_dict(self) -> None:
         assert chunk.type == "text"
         assert chunk.content == "Some text content"
         assert chunk.length == 17
-        assert chunk.tokens == 4
+        assert chunk.tokens == ["Some", "text", "content"]
         assert chunk.keywords == ["text", "content"]
         assert chunk.summary == "A text chunk"
         assert chunk.relationships is not None
@@ -419,6 +419,13 @@ def test_is_instance_of_base_chunk(self) -> None:
         chunk: TextChunk = TextChunk(chunk_id="text_3")
         assert isinstance(chunk, BaseChunk)
 
+    def test_accepts_tokens_list(self) -> None:
+        chunk: TextChunk = TextChunk(
+            chunk_id="text_4",
+            tokens=["attention", "transformer"],
+        )
+        assert chunk.tokens == ["attention", "transformer"]  # list is stored unchanged
+
 
 # ---------------------------------------------------------------------------
 # ImageChunk model

diff --git a/tests/test_result_parser.py b/tests/test_result_parser.py
@@ -35,7 +35,7 @@
         "content": "Hello world",
         "path": "test/section1",
         "length": 11,
-        "tokens": 2,
+        "tokens": ["Hello", "world"],
         "keywords": ["hello"],
         "summary": "A greeting",
         "relationships": [],
@@ -52,6 +52,8 @@
     },
 ]
 
+TEXT_TOKENS_LIST: List[str] = ["Ashish", "Vaswani", "attention", "transformer"]
+
 MARKDOWN: str = "# Test\n\nHello world"
 IMAGE_BYTES: bytes = b"\xff\xd8\xff\xe0"
 
@@ -147,6 +149,74 @@ def test_loads_text_chunks(self) -> None:
         assert text_chunks[0].chunk_id == "text_chunk_1"
         assert text_chunks[0].content == "Hello world"
 
+    def test_accepts_text_chunk_tokens_as_list(self) -> None:
+        manifest: Dict[str, Any] = _make_manifest()
+        chunks: List[Dict[str, Any]] = [
+            {
+                "chunk_id": "text_chunk_tokens_list",
+                "type": "text",
+                "content": "Attention is all you need",
+                "path": "paper/abstract",
+                "metadata": {
+                    "length": 25,
+                    "tokens": TEXT_TOKENS_LIST,  # tokens supplied under "metadata"
+                    "keywords": ["attention", "transformer"],
+                    "summary": "Transformer introduction",
+                    "relationships": [],
+                },
+            }
+        ]
+        zip_bytes: bytes = _build_zip(manifest, chunks=chunks)
+
+        result: ParseResult = parseResultZip(zip_bytes, verify_checksum=False)
+
+        assert len(result.text_chunks) == 1
+        assert result.text_chunks[0].tokens == TEXT_TOKENS_LIST  # clean tokens come back unchanged
+
+    def test_rejects_legacy_text_chunk_tokens_string(self) -> None:
+        manifest: Dict[str, Any] = _make_manifest()
+        chunks: List[Dict[str, Any]] = [
+            {
+                "chunk_id": "text_chunk_tokens_string",
+                "type": "text",
+                "content": "Attention is all you need",
+                "path": "paper/abstract",
+                "metadata": {
+                    "length": 25,
+                    "tokens": "Ashish;Vaswani;attention;transformer",  # delimited string, not a list
+                    "keywords": ["attention", "transformer"],
+                    "summary": "Transformer introduction",
+                    "relationships": [],
+                },
+            }
+        ]
+        zip_bytes: bytes = _build_zip(manifest, chunks=chunks)
+
+        with pytest.raises(KnowhereError, match="expected list\\[str\\]"):  # must raise, not split the string
+            parseResultZip(zip_bytes, verify_checksum=False)
+
+    def test_rejects_integer_text_chunk_tokens(self) -> None:
+        manifest: Dict[str, Any] = _make_manifest()
+        chunks: List[Dict[str, Any]] = [
+            {
+                "chunk_id": "text_chunk_tokens_int",
+                "type": "text",
+                "content": "Attention is all you need",
+                "path": "paper/abstract",
+                "metadata": {
+                    "length": 25,
+                    "tokens": 4,  # integer count, the shape the replaced fixtures used
+                    "keywords": ["attention", "transformer"],
+                    "summary": "Transformer introduction",
+                    "relationships": [],
+                },
+            }
+        ]
+        zip_bytes: bytes = _build_zip(manifest, chunks=chunks)
+
+        with pytest.raises(KnowhereError, match="expected list\\[str\\]"):  # an int is not a token list
+            parseResultZip(zip_bytes, verify_checksum=False)
+
     def test_loads_image_chunks_with_data(self) -> None:
         manifest: Dict[str, Any] = _make_manifest()
         zip_bytes: bytes = _build_zip(manifest)