diff --git a/docs/usage.md b/docs/usage.md index fadedc5..cf32420 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -249,7 +249,7 @@ Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`). | `content` | `str` | The text content | | `path` | `str \| None` | Document structure path (e.g. `"Section 1 > Subsection 2"`) | | `length` | `int` | Character count | -| `tokens` | `int \| None` | Token count (if computed) | +| `tokens` | `List[str] \| None` | Tokenized words returned by the parser pipeline | | `keywords` | `List[str] \| None` | Extracted keywords (requires `summary_txt: True`) | | `summary` | `str \| None` | AI-generated summary (requires `summary_txt: True`) | | `relationships` | `List \| None` | Relationships to other chunks | @@ -729,4 +729,4 @@ def verify_webhook(payload: bytes, signature: str, secret: str) -> bool: ### Retry behavior -If your webhook endpoint returns a non-2xx status code, the API retries delivery up to 6 times with exponential backoff. \ No newline at end of file +If your webhook endpoint returns a non-2xx status code, the API retries delivery up to 6 times with exponential backoff. 
def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]:
    """Return a string-only token list with empty values removed.

    Every entry is converted with ``str()`` and stripped of surrounding
    whitespace. ``None`` entries and entries that are empty after stripping
    are dropped; the relative order of the remaining tokens is kept.
    """
    normalized_tokens: List[str] = []
    for raw_token in raw_tokens:
        # str(None) is the non-empty text "None", so a missing entry must be
        # skipped explicitly or it would survive as a bogus token.
        if raw_token is None:
            continue
        token_text: str = str(raw_token).strip()
        if token_text:
            normalized_tokens.append(token_text)
    return normalized_tokens


def _parseTextChunkTokens(
    raw_tokens: Any,
    *,
    chunk_id: str,
) -> Optional[TextChunkTokens]:
    """Normalize text chunk tokens from the current backend payload.

    Args:
        raw_tokens: The ``tokens`` value read from the chunk payload.
        chunk_id: Identifier of the chunk; used only in the error message.

    Returns:
        ``None`` when ``raw_tokens`` is ``None``, otherwise the list produced
        by ``_normalizeTokenList``.

    Raises:
        KnowhereError: If ``raw_tokens`` is neither ``None`` nor a list.
    """
    if raw_tokens is None:
        return None
    if isinstance(raw_tokens, list):
        return _normalizeTokenList(raw_tokens)

    # Every non-list value (bool, int, str, dict, ...) is rejected here. A
    # separate bool check is unnecessary: bool is not a list, and the type
    # name below already yields "got bool." for it.
    raise KnowhereError(
        "Invalid tokens payload for text chunk "
        f"'{chunk_id}': expected list[str], "
        f"got {type(raw_tokens).__name__}."
    )
def test_accepts_tokens_list(self) -> None:
    """A list of strings passed as ``tokens`` can be read back from the chunk."""
    text_chunk = TextChunk(chunk_id="text_4", tokens=["attention", "transformer"])
    assert text_chunk.tokens == ["attention", "transformer"]
"text_chunk_1" assert text_chunks[0].content == "Hello world" + def test_accepts_text_chunk_tokens_as_list(self) -> None: + manifest: Dict[str, Any] = _make_manifest() + chunks: List[Dict[str, Any]] = [ + { + "chunk_id": "text_chunk_tokens_list", + "type": "text", + "content": "Attention is all you need", + "path": "paper/abstract", + "metadata": { + "length": 25, + "tokens": TEXT_TOKENS_LIST, + "keywords": ["attention", "transformer"], + "summary": "Transformer introduction", + "relationships": [], + }, + } + ] + zip_bytes: bytes = _build_zip(manifest, chunks=chunks) + + result: ParseResult = parseResultZip(zip_bytes, verify_checksum=False) + + assert len(result.text_chunks) == 1 + assert result.text_chunks[0].tokens == TEXT_TOKENS_LIST + + def test_rejects_legacy_text_chunk_tokens_string(self) -> None: + manifest: Dict[str, Any] = _make_manifest() + chunks: List[Dict[str, Any]] = [ + { + "chunk_id": "text_chunk_tokens_string", + "type": "text", + "content": "Attention is all you need", + "path": "paper/abstract", + "metadata": { + "length": 25, + "tokens": "Ashish;Vaswani;attention;transformer", + "keywords": ["attention", "transformer"], + "summary": "Transformer introduction", + "relationships": [], + }, + } + ] + zip_bytes: bytes = _build_zip(manifest, chunks=chunks) + + with pytest.raises(KnowhereError, match="expected list\\[str\\]"): + parseResultZip(zip_bytes, verify_checksum=False) + + def test_rejects_integer_text_chunk_tokens(self) -> None: + manifest: Dict[str, Any] = _make_manifest() + chunks: List[Dict[str, Any]] = [ + { + "chunk_id": "text_chunk_tokens_int", + "type": "text", + "content": "Attention is all you need", + "path": "paper/abstract", + "metadata": { + "length": 25, + "tokens": 4, + "keywords": ["attention", "transformer"], + "summary": "Transformer introduction", + "relationships": [], + }, + } + ] + zip_bytes: bytes = _build_zip(manifest, chunks=chunks) + + with pytest.raises(KnowhereError, match="expected list\\[str\\]"): + 
parseResultZip(zip_bytes, verify_checksum=False) + def test_loads_image_chunks_with_data(self) -> None: manifest: Dict[str, Any] = _make_manifest() zip_bytes: bytes = _build_zip(manifest)