From 409aabbee32b47fc96b7f52188c65e09b54c2430 Mon Sep 17 00:00:00 2001 From: Kenny Rch Date: Tue, 16 Jun 2026 05:30:33 +0300 Subject: [PATCH] feat: implement lexical index search --- README.md | 4 +- src/docs_buddy/adapters/__init__.py | 22 ++++++++- src/docs_buddy/adapters/whoosh_index.py | 63 ++++++++++++++++++++----- src/docs_buddy/domain/__init__.py | 34 +++++++++++++ src/docs_buddy/services/use_cases.py | 24 +++++++++- tests/integration/test_adapters.py | 36 +++++++++++++- tests/unit/test_domain.py | 16 +++++++ tests/unit/test_services.py | 24 ++++++++++ todo.md | 2 + 9 files changed, 208 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 5300520..2a87dae 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Docs Buddy + # Docs Buddy Navigate docs like a Pro. @@ -20,7 +20,7 @@ git repository. ### Indexing the documentation -- The document chunks are indexed for lexical (TODO: and semantic) +- The document chunks are indexed for lexical (TODO: semantic) search ### Answering queries from the documentation diff --git a/src/docs_buddy/adapters/__init__.py b/src/docs_buddy/adapters/__init__.py index fa83aff..dc70992 100644 --- a/src/docs_buddy/adapters/__init__.py +++ b/src/docs_buddy/adapters/__init__.py @@ -15,7 +15,7 @@ from docs_buddy.common import PathLike, DocsBuddyError from docs_buddy import domain, services from docs_buddy.services import events, commands -from .whoosh_index import WhooshDocumentIndex +from .whoosh_index import WhooshDocumentIndex, WhooshIndexError log = logging.getLogger(__name__) @@ -141,6 +141,8 @@ class FakeIntermediateStorage: def __init__(self, destination): self._destination = Path(destination) + # todo: consider refactoring so that sink is merely + # manipulated here but belongs to containing object self.sink = {} def __repr__(self): @@ -156,6 +158,10 @@ def get_temp_location(self): finally: self.sink.pop(temp_location, None) + @property + def destination_content(self): + return self.sink[str(self._destination)] + def replace_destination(self, temp_location: PathLike) -> None: self.sink[str(self._destination)] = self.sink.pop(temp_location) @@ -224,6 +230,11 @@ def __init__(self, source: PathLike, destination: PathLike): def sink(self): return self._intermediate_storage.sink + @property + def destination_content(self): + """ """ + return self._intermediate_storage.destination_content + @contextmanager def get_temp_location(self): with self._intermediate_storage.get_temp_location() as temp_location: @@ -251,6 +262,15 @@ def fit(self, chunks, destination): """Index document chunks in memory""" self._pipeline.sink[destination] = list(chunks) + def search(self, query, max_results): + """Return results from the existing chunks""" + chunks = self._pipeline.destination_content + return [ + domain.QueryResult(c.chunk, c.path, c.metadata) + for c in chunks + if str(query).lower() in c.chunk.lower() + ][:max_results] + class FileSystemIntermediateStorage: """File system implementation of the intermediate storage protocol""" diff --git a/src/docs_buddy/adapters/whoosh_index.py b/src/docs_buddy/adapters/whoosh_index.py index c3809a5..571b01b 100644 --- a/src/docs_buddy/adapters/whoosh_index.py +++ b/src/docs_buddy/adapters/whoosh_index.py @@ -4,28 +4,42 @@ from pathlib import Path import json -from docs_buddy.common import PathLike, json_datetime_handler +from docs_buddy.common import PathLike, json_datetime_handler, DocsBuddyError from docs_buddy import domain from whoosh import index from whoosh.fields import Schema, TEXT, ID, KEYWORD +from whoosh import qparser + + +class WhooshIndexError(DocsBuddyError): + pass class WhooshDocumentIndex: """Whoosh-based implementation of DocumentIndex protocol.""" - def __init__(self): + _SCHEMA = Schema( + chunk_id=ID(stored=True, unique=True), + content=TEXT(stored=True), + path=ID(stored=True), + path_keywords=KEYWORD(lowercase=True, scorable=True), + metadata=TEXT(stored=True), + ) + _SEARCH_FIELDS = ["content", "metadata", "path_keywords"] + + def __init__(self, index_location: PathLike | None = None): """ Initialize a Whoosh document index. """ - - self._schema = Schema( - chunk_id=ID(stored=True, unique=True), - content=TEXT(stored=True), - path=ID(stored=True), - path_keywords=KEYWORD(lowercase=True, scorable=True), - metadata=TEXT(stored=True), - ) + self._index = None + if index_location: + self._index = index.open_dir(index_location) + self._query_parser = qparser.MultifieldParser( + self._SEARCH_FIELDS, + schema=self._SCHEMA, + group=qparser.OrGroup, + ) def fit( self, chunks: Iterator[domain.DocumentChunk], destination: PathLike @@ -37,7 +51,7 @@ def fit( chunks: Iterator of DocumentChunk objects to index destination: Path where the index should be stored """ - ix = index.create_in(str(destination), self._schema) + ix = index.create_in(str(destination), self._SCHEMA) writer = ix.writer() @@ -55,3 +69,30 @@ def fit( ) writer.commit() + + def search(self, query: domain.Query, max_results: int) -> list[domain.QueryResult]: + """Search the whoosh index""" + + if not self._index: + # todo: consider refactoring index into builder and searcher for better + # interface segregation. Would help avoid this error + cls_name = type(self).__name__ + raise WhooshIndexError( + f"Index not properly initialized. Initialize {cls_name} with index location" + ) + + parsed_query = self._query_parser.parse(str(query)) + + with self._index.searcher() as searcher: + # todo: consider interaction between indexing and searching + # is locking required for coordination? + results = searcher.search(parsed_query, limit=max_results) + results = [ + domain.QueryResult( + content=r["content"], + path=r["path"], + metadata=json.loads(r["metadata"]), + ) + for r in results + ] + return results diff --git a/src/docs_buddy/domain/__init__.py b/src/docs_buddy/domain/__init__.py index 742aec5..571a5b0 100644 --- a/src/docs_buddy/domain/__init__.py +++ b/src/docs_buddy/domain/__init__.py @@ -7,6 +7,10 @@ from docs_buddy import common +class InvalidQueryError(common.DocsBuddyError): + pass + + @dataclass(frozen=True) class RawDocument: """Representation of an unprocessed document""" @@ -58,6 +62,36 @@ def __str__(self): return json.dumps(asdict(self), default=common.json_datetime_handler) +@dataclass(frozen=True) +class Query: + """Representation of a user query""" + + text: str + + def __post_init__(self): + clean_query = self.text.strip() + if not clean_query: + raise InvalidQueryError( + f"Invalid query: '{clean_query}'. Length must be > 0 after stripping" + ) + super().__setattr__("text", clean_query) + + def __str__(self): + return self.text + + +@dataclass(frozen=True) +class QueryResult: + """Result of a user query""" + + content: str + path: str + metadata: dict[str, Any] + + def __str__(self): + return json.dumps(asdict(self)) + + def sliding_window(seq: Sequence, size: int, step: int) -> Iterator[dict]: """Returns chunks from the sequence""" return ({"chunk": seq[i : i + size], "index": i} for i in range(0, len(seq), step)) diff --git a/src/docs_buddy/services/use_cases.py b/src/docs_buddy/services/use_cases.py index 5778299..dc5b5eb 100644 --- a/src/docs_buddy/services/use_cases.py +++ b/src/docs_buddy/services/use_cases.py @@ -1,4 +1,4 @@ -"""Use case handlers, adapter interfaces, events and commands""" +"""Use case handlers, adapter interfaces""" import functools from dataclasses import dataclass @@ -8,11 +8,17 @@ from docs_buddy.common import PathLike, DocsBuddyError from docs_buddy import domain +DEFAULT_MAX_RESULTS = 10 + class RepositorySyncError(DocsBuddyError): pass +class SearchIndexError(DocsBuddyError): + pass + + class RepoStorage(Protocol): """Protocol that manages repository updates""" @@ -52,6 +58,10 @@ def fit( self, chunks: Iterator[domain.DocumentChunk], destination: PathLike ) -> None: ... + def search( + self, query: domain.Query, max_results: int + ) -> list[domain.QueryResult]: ... + class DocumentChunksPipeline(SupportsIntermediateStorage, Protocol): """Protocol for providing document chunks""" @@ -159,8 +169,20 @@ def apply(args, func): def index_document_chunks( pipeline: DocumentChunksPipeline, index: DocumentIndex ) -> None: + """Indexes the document chunks""" + document_chunks = pipeline.get_document_chunks() with pipeline.get_temp_location() as tmp_location: index.fit(document_chunks, destination=tmp_location) pipeline.replace_destination(tmp_location) + + +def search_index( + query: domain.Query, index: DocumentIndex, max_results: int = DEFAULT_MAX_RESULTS +) -> list[domain.QueryResult]: + """Returns search results from the index""" + if max_results < 1: + raise SearchIndexError("max results must be at least 1") + + return index.search(query, max_results) diff --git a/tests/integration/test_adapters.py b/tests/integration/test_adapters.py index 2bc9643..4750acd 100644 --- a/tests/integration/test_adapters.py +++ b/tests/integration/test_adapters.py @@ -1,14 +1,14 @@ """Adapter tests that interact with infrastructure""" -import pytest from pathlib import Path import shutil import tempfile import json +import pytest import whoosh.index -from docs_buddy import adapters, domain +from docs_buddy import adapters, domain, common def test_get_temp_location_creates_and_cleans_up() -> None: @@ -114,6 +114,38 @@ def test_whoosh_document_index_fit_creates_index() -> None: assert "metadata" in doc +def test_incorrectly_initialized_index_raises() -> None: + # initialize index without index directory + index = adapters.WhooshDocumentIndex() + + # should raise an error on search + with pytest.raises(adapters.WhooshIndexError): + query = domain.Query("providers") + results = index.search(query, max_results=10) + + +def test_can_search_whoosh_index() -> None: + + with tempfile.TemporaryDirectory() as temp_dir: + + indexer = adapters.WhooshDocumentIndex() + + chunks = [ + domain.DocumentChunk.fromstring(json.dumps(_SAMPLE_CHUNK_1)), + domain.DocumentChunk.fromstring(json.dumps(_SAMPLE_CHUNK_2)), + ] + + indexer.fit(iter(chunks), temp_dir) + + document_index = adapters.WhooshDocumentIndex(temp_dir) + query = domain.Query("providers") + results = document_index.search(query, max_results=10) + assert len(results) > 1 + + # it should be possible to specify max length of results + assert len(document_index.search(query, max_results=1)) == 1 + + def test_whoosh_document_index_fitting_for_empty_documents() -> None: """Test that WhooshDocumentIndex creates an index from DocumentChunks.""" diff --git a/tests/unit/test_domain.py b/tests/unit/test_domain.py index c7ef0f3..876446b 100644 --- a/tests/unit/test_domain.py +++ b/tests/unit/test_domain.py @@ -197,3 +197,19 @@ def test_overlapping_chunks_newlines_preserved() -> None: # Newlines should remain in first 3 chunks assert all(["\n" in item["chunk"] for item in result[:3]]) + + +def test_query_parsing_processes_text() -> None: + valid_query = "foo" + empty_query = "" + whitespace_query = " " * 10 + padded_query = " bar " + + assert str(domain.Query(valid_query)) == valid_query + assert str(domain.Query(padded_query)) == "bar" + + with pytest.raises(domain.InvalidQueryError): + _ = domain.Query(empty_query) + + with pytest.raises(domain.InvalidQueryError): + _ = domain.Query(whitespace_query) diff --git a/tests/unit/test_services.py b/tests/unit/test_services.py index 4db2247..403844c 100644 --- a/tests/unit/test_services.py +++ b/tests/unit/test_services.py @@ -239,3 +239,27 @@ def test_can_index_documents() -> None: assert (action1, arg1) == ("MKDIR", tmp_location) assert (action2, arg2) == ("RMRF", dest) assert (action3, arg3_1, arg3_2) == ("MV", tmp_location, dest) + + +def test_can_search_index() -> None: + source = ".chunks/programmmer-ke/akash-docs-buddy" + dest = ".index/programmer-ke/akash-docs-buddy" + + pipeline = adapters.FakeDocumentChunksPipeline(source, dest) + index = adapters.FakeIndex(pipeline) + + services.index_document_chunks(pipeline, index) + + query = domain.Query(text="provider") + results = services.search_index(query, index) + assert len(results) > 0 + + # can specify max results + results = services.search_index(query, index, max_results=1) + assert len(results) == 1 + + # max results must be > 0 + bad_values = [0, -1, -30] + for bad_value in bad_values: + with pytest.raises(services.SearchIndexError): + _ = services.search_index(query, index, max_results=bad_value) diff --git a/todo.md b/todo.md index 2581f5b..a2de17c 100644 --- a/todo.md +++ b/todo.md @@ -1,5 +1,7 @@ # todo +- address todo comments + # in progress - [>] Implement lexical search over document index