From 409aabbee32b47fc96b7f52188c65e09b54c2430 Mon Sep 17 00:00:00 2001
From: Kenny Rch <ken@99nth.com>
Date: Tue, 16 Jun 2026 05:30:33 +0300
Subject: [PATCH] feat: implement lexical index search

---
 README.md                               |  4 +-
 src/docs_buddy/adapters/__init__.py     | 22 ++++++++-
 src/docs_buddy/adapters/whoosh_index.py | 63 ++++++++++++++++++++-----
 src/docs_buddy/domain/__init__.py       | 34 +++++++++++++
 src/docs_buddy/services/use_cases.py    | 24 +++++++++-
 tests/integration/test_adapters.py      | 36 +++++++++++++-
 tests/unit/test_domain.py               | 16 +++++++
 tests/unit/test_services.py             | 24 ++++++++++
 todo.md                                 |  2 +
 9 files changed, 208 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 5300520..2a87dae 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Docs Buddy
+# Docs Buddy
 
 Navigate docs like a Pro.
 
@@ -20,7 +20,7 @@ git repository.
 
 ### Indexing the documentation
 
-- The document chunks are indexed for lexical (TODO: and semantic)
+- The document chunks are indexed for lexical (TODO: semantic)
   search
 
 ### Answering queries from the documentation
diff --git a/src/docs_buddy/adapters/__init__.py b/src/docs_buddy/adapters/__init__.py
index fa83aff..dc70992 100644
--- a/src/docs_buddy/adapters/__init__.py
+++ b/src/docs_buddy/adapters/__init__.py
@@ -15,7 +15,7 @@
 from docs_buddy.common import PathLike, DocsBuddyError
 from docs_buddy import domain, services
 from docs_buddy.services import events, commands
-from .whoosh_index import WhooshDocumentIndex
+from .whoosh_index import WhooshDocumentIndex, WhooshIndexError
 
 log = logging.getLogger(__name__)
 
@@ -141,6 +141,8 @@ class FakeIntermediateStorage:
 
     def __init__(self, destination):
         self._destination = Path(destination)
+        # todo: consider refactoring so that sink is merely
+        # manipulated here but belongs to containing object
         self.sink = {}
 
     def __repr__(self):
@@ -156,6 +158,10 @@ def get_temp_location(self):
         finally:
             self.sink.pop(temp_location, None)
 
+    @property
+    def destination_content(self):
+        return self.sink[str(self._destination)]  # KeyError if nothing is stored there yet
+
     def replace_destination(self, temp_location: PathLike) -> None:
         self.sink[str(self._destination)] = self.sink.pop(temp_location)
 
@@ -224,6 +230,11 @@ def __init__(self, source: PathLike, destination: PathLike):
     def sink(self):
         return self._intermediate_storage.sink
 
+    @property
+    def destination_content(self):
+        """Return the content held at the destination of the intermediate storage"""
+        return self._intermediate_storage.destination_content
+
     @contextmanager
     def get_temp_location(self):
         with self._intermediate_storage.get_temp_location() as temp_location:
@@ -251,6 +262,15 @@ def fit(self, chunks, destination):
         """Index document chunks in memory"""
         self._pipeline.sink[destination] = list(chunks)
 
+    def search(self, query, max_results):
+        """Return up to max_results chunks whose text contains the query (case-insensitive)"""
+        needle = str(query).lower()
+        return [
+            domain.QueryResult(c.chunk, c.path, c.metadata)
+            for c in self._pipeline.destination_content
+            if needle in c.chunk.lower()
+        ][:max_results]
+
 
 class FileSystemIntermediateStorage:
     """File system implementation of the intermediate storage protocol"""
diff --git a/src/docs_buddy/adapters/whoosh_index.py b/src/docs_buddy/adapters/whoosh_index.py
index c3809a5..571b01b 100644
--- a/src/docs_buddy/adapters/whoosh_index.py
+++ b/src/docs_buddy/adapters/whoosh_index.py
@@ -4,28 +4,42 @@
 from pathlib import Path
 import json
 
-from docs_buddy.common import PathLike, json_datetime_handler
+from docs_buddy.common import PathLike, json_datetime_handler, DocsBuddyError
 from docs_buddy import domain
 from whoosh import index
 from whoosh.fields import Schema, TEXT, ID, KEYWORD
+from whoosh import qparser
+
+
+class WhooshIndexError(DocsBuddyError):
+    """Raised when the Whoosh index is used without being properly initialized"""
 
 
 class WhooshDocumentIndex:
     """Whoosh-based implementation of DocumentIndex protocol."""
 
-    def __init__(self):
+    _SCHEMA = Schema(
+        chunk_id=ID(stored=True, unique=True),
+        content=TEXT(stored=True),
+        path=ID(stored=True),
+        path_keywords=KEYWORD(lowercase=True, scorable=True),
+        metadata=TEXT(stored=True),
+    )
+    _SEARCH_FIELDS = ["content", "metadata", "path_keywords"]
+
+    def __init__(self, index_location: PathLike | None = None):
         """
         Initialize a Whoosh document index.
 
         """
-
-        self._schema = Schema(
-            chunk_id=ID(stored=True, unique=True),
-            content=TEXT(stored=True),
-            path=ID(stored=True),
-            path_keywords=KEYWORD(lowercase=True, scorable=True),
-            metadata=TEXT(stored=True),
-        )
+        # Only open an index when a location is given; fit() needs no existing index
+        self._index = index.open_dir(str(index_location)) if index_location else None
+        # The parser depends only on the schema, so it is always available
+        self._query_parser = qparser.MultifieldParser(
+            self._SEARCH_FIELDS,
+            schema=self._SCHEMA,
+            group=qparser.OrGroup,
+        )
 
     def fit(
         self, chunks: Iterator[domain.DocumentChunk], destination: PathLike
@@ -37,7 +51,7 @@ def fit(
             chunks: Iterator of DocumentChunk objects to index
             destination: Path where the index should be stored
         """
-        ix = index.create_in(str(destination), self._schema)
+        ix = index.create_in(str(destination), self._SCHEMA)
 
         writer = ix.writer()
 
@@ -55,3 +69,30 @@ def fit(
             )
 
         writer.commit()
+
+    def search(self, query: domain.Query, max_results: int) -> list[domain.QueryResult]:
+        """Return up to max_results results for query; raises WhooshIndexError without an index"""
+
+        if self._index is None:
+            # todo: consider refactoring index into builder and searcher for better
+            # interface segregation. Would help avoid this error
+            cls_name = type(self).__name__
+            raise WhooshIndexError(
+                f"Index not properly initialized. Initialize {cls_name} with index location"
+            )
+
+        parsed_query = self._query_parser.parse(str(query))
+
+        with self._index.searcher() as searcher:
+            # todo: consider interaction between indexing and searching
+            # is locking required for coordination?
+            hits = searcher.search(parsed_query, limit=max_results)
+            results = [
+                domain.QueryResult(
+                    content=hit["content"],
+                    path=hit["path"],
+                    metadata=json.loads(hit["metadata"]),
+                )
+                for hit in hits
+            ]
+        return results
diff --git a/src/docs_buddy/domain/__init__.py b/src/docs_buddy/domain/__init__.py
index 742aec5..571a5b0 100644
--- a/src/docs_buddy/domain/__init__.py
+++ b/src/docs_buddy/domain/__init__.py
@@ -7,6 +7,10 @@
 from docs_buddy import common
 
 
+class InvalidQueryError(common.DocsBuddyError):
+    """Raised when a query's text is empty after stripping whitespace"""
+
+
 @dataclass(frozen=True)
 class RawDocument:
     """Representation of an unprocessed document"""
@@ -58,6 +62,36 @@ def __str__(self):
         return json.dumps(asdict(self), default=common.json_datetime_handler)
 
 
+@dataclass(frozen=True)
+class Query:
+    """A user query; text is stored stripped and must be non-empty"""
+
+    text: str
+
+    def __post_init__(self):
+        clean_query = self.text.strip()
+        if not clean_query:
+            raise InvalidQueryError(
+                f"Invalid query: {self.text!r}. Length must be > 0 after stripping"
+            )
+        object.__setattr__(self, "text", clean_query)  # bypass the frozen guard
+
+    def __str__(self):
+        return self.text
+
+
+@dataclass(frozen=True)
+class QueryResult:
+    """Result of a user query; str() renders it as JSON"""
+
+    content: str
+    path: str
+    metadata: dict[str, Any]
+
+    def __str__(self):
+        return json.dumps(asdict(self), default=common.json_datetime_handler)
+
+
 def sliding_window(seq: Sequence, size: int, step: int) -> Iterator[dict]:
     """Returns chunks from the sequence"""
     return ({"chunk": seq[i : i + size], "index": i} for i in range(0, len(seq), step))
diff --git a/src/docs_buddy/services/use_cases.py b/src/docs_buddy/services/use_cases.py
index 5778299..dc5b5eb 100644
--- a/src/docs_buddy/services/use_cases.py
+++ b/src/docs_buddy/services/use_cases.py
@@ -1,4 +1,4 @@
-"""Use case handlers, adapter interfaces, events and commands"""
+"""Use case handlers, adapter interfaces"""
 
 import functools
 from dataclasses import dataclass
@@ -8,11 +8,17 @@
 from docs_buddy.common import PathLike, DocsBuddyError
 from docs_buddy import domain
 
+DEFAULT_MAX_RESULTS = 10
+
 
 class RepositorySyncError(DocsBuddyError):
     pass
 
 
+class SearchIndexError(DocsBuddyError):
+    """Raised when an index search is requested with invalid arguments"""
+
+
 class RepoStorage(Protocol):
     """Protocol that manages repository updates"""
 
@@ -52,6 +58,10 @@ def fit(
         self, chunks: Iterator[domain.DocumentChunk], destination: PathLike
     ) -> None: ...
 
+    def search(
+        self, query: domain.Query, max_results: int
+    ) -> list[domain.QueryResult]: ...  # implementations cap results at max_results
+
 
 class DocumentChunksPipeline(SupportsIntermediateStorage, Protocol):
     """Protocol for providing document chunks"""
@@ -159,8 +169,20 @@ def apply(args, func):
 def index_document_chunks(
     pipeline: DocumentChunksPipeline, index: DocumentIndex
 ) -> None:
+    """Index the pipeline's chunks in a temp location, then swap it into the destination"""
+
     document_chunks = pipeline.get_document_chunks()
 
     with pipeline.get_temp_location() as tmp_location:
         index.fit(document_chunks, destination=tmp_location)
         pipeline.replace_destination(tmp_location)
+
+
+def search_index(
+    query: domain.Query, index: DocumentIndex, max_results: int = DEFAULT_MAX_RESULTS
+) -> list[domain.QueryResult]:
+    """Return up to max_results results from the index; reject max_results < 1"""
+    if max_results < 1:
+        raise SearchIndexError("max results must be at least 1")
+
+    return index.search(query, max_results)
diff --git a/tests/integration/test_adapters.py b/tests/integration/test_adapters.py
index 2bc9643..4750acd 100644
--- a/tests/integration/test_adapters.py
+++ b/tests/integration/test_adapters.py
@@ -1,14 +1,14 @@
 """Adapter tests that interact with infrastructure"""
 
-import pytest
 from pathlib import Path
 import shutil
 import tempfile
 import json
 
+import pytest
 import whoosh.index
 
-from docs_buddy import adapters, domain
+from docs_buddy import adapters, domain, common
 
 
 def test_get_temp_location_creates_and_cleans_up() -> None:
@@ -114,6 +114,38 @@ def test_whoosh_document_index_fit_creates_index() -> None:
                 assert "metadata" in doc
 
 
+def test_incorrectly_initialized_index_raises() -> None:
+    # initialize index without index directory
+    index = adapters.WhooshDocumentIndex()
+    query = domain.Query("providers")
+
+    # only the search call itself should raise
+    with pytest.raises(adapters.WhooshIndexError):
+        index.search(query, max_results=10)
+
+
+def test_can_search_whoosh_index() -> None:
+    """Index two sample chunks, then search them with and without a result cap"""
+    with tempfile.TemporaryDirectory() as temp_dir:
+
+        indexer = adapters.WhooshDocumentIndex()
+
+        chunks = [
+            domain.DocumentChunk.fromstring(json.dumps(_SAMPLE_CHUNK_1)),
+            domain.DocumentChunk.fromstring(json.dumps(_SAMPLE_CHUNK_2)),
+        ]
+
+        indexer.fit(iter(chunks), temp_dir)
+
+        document_index = adapters.WhooshDocumentIndex(temp_dir)
+        query = domain.Query("providers")
+        results = document_index.search(query, max_results=10)
+        assert len(results) > 1
+
+        # it should be possible to specify max length of results
+        assert len(document_index.search(query, max_results=1)) == 1
+
+
 def test_whoosh_document_index_fitting_for_empty_documents() -> None:
     """Test that WhooshDocumentIndex creates an index from DocumentChunks."""
 
diff --git a/tests/unit/test_domain.py b/tests/unit/test_domain.py
index c7ef0f3..876446b 100644
--- a/tests/unit/test_domain.py
+++ b/tests/unit/test_domain.py
@@ -197,3 +197,19 @@ def test_overlapping_chunks_newlines_preserved() -> None:
 
     # Newlines should remain in first 3 chunks
     assert all(["\n" in item["chunk"] for item in result[:3]])
+
+
+def test_query_parsing_processes_text() -> None:
+    valid_query = "foo"
+    empty_query = ""
+    whitespace_query = " " * 10  # strips to empty, so it must be rejected
+    padded_query = " bar   "  # surrounding whitespace must be stripped
+
+    assert str(domain.Query(valid_query)) == valid_query
+    assert str(domain.Query(padded_query)) == "bar"
+
+    with pytest.raises(domain.InvalidQueryError):
+        _ = domain.Query(empty_query)
+
+    with pytest.raises(domain.InvalidQueryError):
+        _ = domain.Query(whitespace_query)
diff --git a/tests/unit/test_services.py b/tests/unit/test_services.py
index 4db2247..403844c 100644
--- a/tests/unit/test_services.py
+++ b/tests/unit/test_services.py
@@ -239,3 +239,27 @@ def test_can_index_documents() -> None:
     assert (action1, arg1) == ("MKDIR", tmp_location)
     assert (action2, arg2) == ("RMRF", dest)
     assert (action3, arg3_1, arg3_2) == ("MV", tmp_location, dest)
+
+
+def test_can_search_index() -> None:
+    source = ".chunks/programmer-ke/akash-docs-buddy"
+    dest = ".index/programmer-ke/akash-docs-buddy"
+
+    pipeline = adapters.FakeDocumentChunksPipeline(source, dest)
+    index = adapters.FakeIndex(pipeline)
+
+    services.index_document_chunks(pipeline, index)
+
+    query = domain.Query(text="provider")
+    results = services.search_index(query, index)
+    assert len(results) > 0
+
+    # can specify max results
+    results = services.search_index(query, index, max_results=1)
+    assert len(results) == 1
+
+    # max results must be > 0
+    bad_values = [0, -1, -30]
+    for bad_value in bad_values:
+        with pytest.raises(services.SearchIndexError):
+            _ = services.search_index(query, index, max_results=bad_value)
diff --git a/todo.md b/todo.md
index 2581f5b..a2de17c 100644
--- a/todo.md
+++ b/todo.md
@@ -1,5 +1,7 @@
 # todo
 
+- address todo comments
+
 # in progress
 
 - [>] Implement lexical search over document index