Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`).
| `content` | `str` | The text content |
| `path` | `str \| None` | Document structure path (e.g. `"Section 1 > Subsection 2"`) |
| `length` | `int` | Character count |
| `tokens` | `int \| None` | Token count (if computed) |
| `tokens` | `List[str] \| None` | Tokenized words returned by the parser pipeline |
| `keywords` | `List[str] \| None` | Extracted keywords (requires `summary_txt: True`) |
| `summary` | `str \| None` | AI-generated summary (requires `summary_txt: True`) |
| `relationships` | `List \| None` | Relationships to other chunks |
Expand Down Expand Up @@ -729,4 +729,4 @@ def verify_webhook(payload: bytes, signature: str, secret: str) -> bool:

### Retry behavior

If your webhook endpoint returns a non-2xx status code, the API retries delivery up to 6 times with exponential backoff.
If your webhook endpoint returns a non-2xx status code, the API retries delivery up to 6 times with exponential backoff.
39 changes: 37 additions & 2 deletions src/knowhere/lib/result_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
ParseResult,
TableChunk,
TextChunk,
TextChunkTokens,
)

_logger = getLogger()
Expand Down Expand Up @@ -79,6 +80,38 @@ def _extractFilePath(raw: Dict[str, Any]) -> Optional[str]:
return fallback


def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]:
    """Return a string-only token list with empty values removed.

    Every entry is converted with ``str()`` and stripped of surrounding
    whitespace. Entries that are ``None`` or that strip down to an empty
    string are dropped; the relative order of the remaining tokens is kept.

    Args:
        raw_tokens: Token values as they appear in the payload.

    Returns:
        The cleaned tokens, in their original order.
    """
    # ``str(None)`` would yield the literal token "None", so a missing value
    # is filtered out before conversion instead of being stringified.
    stripped_tokens = (
        str(raw_token).strip()
        for raw_token in raw_tokens
        if raw_token is not None
    )
    return [token_text for token_text in stripped_tokens if token_text]


def _parseTextChunkTokens(
    raw_tokens: Any,
    *,
    chunk_id: str,
) -> Optional[TextChunkTokens]:
    """Validate the ``tokens`` value of a text chunk and normalize it.

    Args:
        raw_tokens: The tokens value read from the chunk payload.
        chunk_id: Identifier of the chunk, used only in the error message.

    Returns:
        ``None`` when ``raw_tokens`` is ``None``; otherwise the result of
        passing the list through ``_normalizeTokenList``.

    Raises:
        KnowhereError: If ``raw_tokens`` is neither ``None`` nor a list.
    """
    if raw_tokens is None:
        return None

    if not isinstance(raw_tokens, list):
        # Covers bool as well: a bool is not a list and its type name is
        # "bool", so no dedicated branch is needed for the same message.
        received_type: str = type(raw_tokens).__name__
        raise KnowhereError(
            "Invalid tokens payload for text chunk "
            f"'{chunk_id}': expected list[str], "
            f"got {received_type}."
        )

    return _normalizeTokenList(raw_tokens)


def _buildChunks(
raw_chunks: List[Dict[str, Any]],
zf: zipfile.ZipFile,
Expand Down Expand Up @@ -127,13 +160,15 @@ def _buildChunks(
)
else:
metadata = raw.get("metadata", {})
chunk_id: str = raw.get("chunk_id", "")
raw_tokens: Any = metadata.get("tokens", raw.get("tokens"))
chunk = TextChunk(
chunk_id=raw.get("chunk_id", ""),
chunk_id=chunk_id,
type="text",
content=raw.get("content", ""),
path=raw.get("path"),
length=metadata.get("length", raw.get("length", 0)),
tokens=metadata.get("tokens", raw.get("tokens")),
tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
keywords=metadata.get("keywords", raw.get("keywords")),
summary=metadata.get("summary", raw.get("summary")),
relationships=metadata.get("relationships", raw.get("relationships")),
Expand Down
6 changes: 5 additions & 1 deletion src/knowhere/types/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, Field
from typing_extensions import TypeAlias

from knowhere._exceptions import ValidationError

Expand Down Expand Up @@ -124,12 +125,15 @@ class BaseChunk(BaseModel):
path: Optional[str] = None


TextChunkTokens: TypeAlias = List[str]


class TextChunk(BaseChunk):
"""A text chunk extracted from the document."""

type: str = "text"
length: int = 0
tokens: Optional[int] = None
tokens: Optional[TextChunkTokens] = None
keywords: Optional[List[str]] = None
summary: Optional[str] = None
relationships: Optional[List[Union[Dict[str, Any], str]]] = None
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def _factory(
"content": "Hello world",
"path": "test/section1",
"length": 11,
"tokens": 2,
"tokens": ["Hello", "world"],
"keywords": ["hello"],
"summary": "A greeting",
"relationships": [],
Expand Down
Binary file modified tests/fixtures/real_result.zip
Binary file not shown.
11 changes: 9 additions & 2 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ def test_from_dict(self) -> None:
content="Some text content",
path="doc/section1",
length=17,
tokens=4,
tokens=["Some", "text", "content"],
keywords=["text", "content"],
summary="A text chunk",
relationships=[{"target": "text_2", "type": "follows"}],
Expand All @@ -400,7 +400,7 @@ def test_from_dict(self) -> None:
assert chunk.type == "text"
assert chunk.content == "Some text content"
assert chunk.length == 17
assert chunk.tokens == 4
assert chunk.tokens == ["Some", "text", "content"]
assert chunk.keywords == ["text", "content"]
assert chunk.summary == "A text chunk"
assert chunk.relationships is not None
Expand All @@ -419,6 +419,13 @@ def test_is_instance_of_base_chunk(self) -> None:
chunk: TextChunk = TextChunk(chunk_id="text_3")
assert isinstance(chunk, BaseChunk)

def test_accepts_tokens_list(self) -> None:
    """A TextChunk constructed with a list of tokens compares equal to that list."""
    expected_tokens = ["attention", "transformer"]
    chunk: TextChunk = TextChunk(chunk_id="text_4", tokens=list(expected_tokens))
    assert chunk.tokens == expected_tokens


# ---------------------------------------------------------------------------
# ImageChunk model
Expand Down
72 changes: 71 additions & 1 deletion tests/test_result_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
"content": "Hello world",
"path": "test/section1",
"length": 11,
"tokens": 2,
"tokens": ["Hello", "world"],
"keywords": ["hello"],
"summary": "A greeting",
"relationships": [],
Expand All @@ -52,6 +52,8 @@
},
]

TEXT_TOKENS_LIST: List[str] = ["Ashish", "Vaswani", "attention", "transformer"]

MARKDOWN: str = "# Test\n\nHello world"
IMAGE_BYTES: bytes = b"\xff\xd8\xff\xe0"

Expand Down Expand Up @@ -147,6 +149,74 @@ def test_loads_text_chunks(self) -> None:
assert text_chunks[0].chunk_id == "text_chunk_1"
assert text_chunks[0].content == "Hello world"

def test_accepts_text_chunk_tokens_as_list(self) -> None:
    """A list under ``metadata.tokens`` comes back equal on the parsed text chunk."""
    chunk_metadata: Dict[str, Any] = {
        "length": 25,
        "tokens": TEXT_TOKENS_LIST,
        "keywords": ["attention", "transformer"],
        "summary": "Transformer introduction",
        "relationships": [],
    }
    text_chunk: Dict[str, Any] = {
        "chunk_id": "text_chunk_tokens_list",
        "type": "text",
        "content": "Attention is all you need",
        "path": "paper/abstract",
        "metadata": chunk_metadata,
    }
    archive: bytes = _build_zip(_make_manifest(), chunks=[text_chunk])

    parsed: ParseResult = parseResultZip(archive, verify_checksum=False)

    assert len(parsed.text_chunks) == 1
    first_chunk = parsed.text_chunks[0]
    assert first_chunk.tokens == TEXT_TOKENS_LIST

def test_rejects_legacy_text_chunk_tokens_string(self) -> None:
    """A semicolon-joined string under ``metadata.tokens`` makes parsing raise KnowhereError."""
    chunk_metadata: Dict[str, Any] = {
        "length": 25,
        "tokens": "Ashish;Vaswani;attention;transformer",
        "keywords": ["attention", "transformer"],
        "summary": "Transformer introduction",
        "relationships": [],
    }
    text_chunk: Dict[str, Any] = {
        "chunk_id": "text_chunk_tokens_string",
        "type": "text",
        "content": "Attention is all you need",
        "path": "paper/abstract",
        "metadata": chunk_metadata,
    }
    archive: bytes = _build_zip(_make_manifest(), chunks=[text_chunk])

    # The error message must name the expected type.
    with pytest.raises(KnowhereError, match="expected list\\[str\\]"):
        parseResultZip(archive, verify_checksum=False)

def test_rejects_integer_text_chunk_tokens(self) -> None:
    """An integer under ``metadata.tokens`` makes parsing raise KnowhereError."""
    chunk_metadata: Dict[str, Any] = {
        "length": 25,
        "tokens": 4,
        "keywords": ["attention", "transformer"],
        "summary": "Transformer introduction",
        "relationships": [],
    }
    text_chunk: Dict[str, Any] = {
        "chunk_id": "text_chunk_tokens_int",
        "type": "text",
        "content": "Attention is all you need",
        "path": "paper/abstract",
        "metadata": chunk_metadata,
    }
    archive: bytes = _build_zip(_make_manifest(), chunks=[text_chunk])

    # The error message must name the expected type.
    with pytest.raises(KnowhereError, match="expected list\\[str\\]"):
        parseResultZip(archive, verify_checksum=False)

def test_loads_image_chunks_with_data(self) -> None:
manifest: Dict[str, Any] = _make_manifest()
zip_bytes: bytes = _build_zip(manifest)
Expand Down
Loading