Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Docs Buddy
# Docs Buddy

Navigate docs like a Pro.

Expand All @@ -20,7 +20,7 @@ git repository.

### Indexing the documentation

- The document chunks are indexed for lexical (TODO: and semantic)
- The document chunks are indexed for lexical (TODO: semantic)
search

### Answering queries from the documentation
Expand Down
22 changes: 21 additions & 1 deletion src/docs_buddy/adapters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from docs_buddy.common import PathLike, DocsBuddyError
from docs_buddy import domain, services
from docs_buddy.services import events, commands
from .whoosh_index import WhooshDocumentIndex
from .whoosh_index import WhooshDocumentIndex, WhooshIndexError

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -141,6 +141,8 @@ class FakeIntermediateStorage:

def __init__(self, destination):
"""Remember the destination as a Path and start with an empty in-memory sink."""
self._destination = Path(destination)
# todo: consider refactoring so that sink is merely
# manipulated here but belongs to containing object
# The sink is a plain dict standing in for real storage.
self.sink = {}

def __repr__(self):
Expand All @@ -156,6 +158,10 @@ def get_temp_location(self):
finally:
self.sink.pop(temp_location, None)

@property
def destination_content(self):
"""Return whatever the sink holds under the destination key.

Raises KeyError if nothing has been stored for the destination yet.
"""
return self.sink[str(self._destination)]

def replace_destination(self, temp_location: PathLike) -> None:
"""Move the sink entry stored under temp_location to the destination key.

Raises KeyError if the sink has no entry for temp_location.
"""
self.sink[str(self._destination)] = self.sink.pop(temp_location)

Expand Down Expand Up @@ -224,6 +230,11 @@ def __init__(self, source: PathLike, destination: PathLike):
def sink(self):
return self._intermediate_storage.sink

@property
def destination_content(self):
"""Return the destination content held by the intermediate storage."""
return self._intermediate_storage.destination_content

@contextmanager
def get_temp_location(self):
with self._intermediate_storage.get_temp_location() as temp_location:
Expand Down Expand Up @@ -251,6 +262,15 @@ def fit(self, chunks, destination):
"""Index document chunks in memory"""
self._pipeline.sink[destination] = list(chunks)

def search(self, query, max_results):
    """Return at most ``max_results`` stored chunks that contain the query.

    Matching is a case-insensitive substring test of the query text
    against each chunk's text.
    """
    stored = self._pipeline.destination_content
    needle = str(query).lower()

    matches = []
    for item in stored:
        if needle in item.chunk.lower():
            matches.append(
                domain.QueryResult(
                    content=item.chunk,
                    path=item.path,
                    metadata=item.metadata,
                )
            )
    return matches[:max_results]


class FileSystemIntermediateStorage:
"""File system implementation of the intermediate storage protocol"""
Expand Down
63 changes: 52 additions & 11 deletions src/docs_buddy/adapters/whoosh_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,42 @@
from pathlib import Path
import json

from docs_buddy.common import PathLike, json_datetime_handler
from docs_buddy.common import PathLike, json_datetime_handler, DocsBuddyError
from docs_buddy import domain
from whoosh import index
from whoosh.fields import Schema, TEXT, ID, KEYWORD
from whoosh import qparser


class WhooshIndexError(DocsBuddyError):
    """Error raised by the Whoosh index adapter (e.g. searching without an opened index)."""


class WhooshDocumentIndex:
"""Whoosh-based implementation of DocumentIndex protocol."""

def __init__(self):
_SCHEMA = Schema(
chunk_id=ID(stored=True, unique=True),
content=TEXT(stored=True),
path=ID(stored=True),
path_keywords=KEYWORD(lowercase=True, scorable=True),
metadata=TEXT(stored=True),
)
_SEARCH_FIELDS = ["content", "metadata", "path_keywords"]

def __init__(self, index_location: PathLike | None = None):
"""
Initialize a Whoosh document index.

"""

self._schema = Schema(
chunk_id=ID(stored=True, unique=True),
content=TEXT(stored=True),
path=ID(stored=True),
path_keywords=KEYWORD(lowercase=True, scorable=True),
metadata=TEXT(stored=True),
)
self._index = None
if index_location:
self._index = index.open_dir(index_location)
self._query_parser = qparser.MultifieldParser(
self._SEARCH_FIELDS,
schema=self._SCHEMA,
group=qparser.OrGroup,
)

def fit(
self, chunks: Iterator[domain.DocumentChunk], destination: PathLike
Expand All @@ -37,7 +51,7 @@ def fit(
chunks: Iterator of DocumentChunk objects to index
destination: Path where the index should be stored
"""
ix = index.create_in(str(destination), self._schema)
ix = index.create_in(str(destination), self._SCHEMA)

writer = ix.writer()

Expand All @@ -55,3 +69,30 @@ def fit(
)

writer.commit()

def search(self, query: domain.Query, max_results: int) -> list[domain.QueryResult]:
    """Search the whoosh index.

    Args:
        query: The user query; its string form is parsed by the
            multi-field query parser.
        max_results: Upper bound on the number of hits returned.

    Returns:
        One QueryResult per hit, with the stored metadata decoded from JSON.

    Raises:
        WhooshIndexError: If this instance was created without an index
            location and therefore has no index to search.
    """

    # Compare against None explicitly: only "no index was opened" is an
    # error, not an index object that happens to evaluate as falsy.
    if self._index is None:
        # todo: consider refactoring index into builder and searcher for better
        # interface segregation. Would help avoid this error
        cls_name = type(self).__name__
        raise WhooshIndexError(
            f"Index not properly initialized. Initialize {cls_name} with index location"
        )

    parsed_query = self._query_parser.parse(str(query))

    with self._index.searcher() as searcher:
        # todo: consider interaction between indexing and searching
        # is locking required for coordination?
        hits = searcher.search(parsed_query, limit=max_results)
        # Materialise the hits while the searcher is still open.
        results = [
            domain.QueryResult(
                content=hit["content"],
                path=hit["path"],
                metadata=json.loads(hit["metadata"]),
            )
            for hit in hits
        ]
    return results
34 changes: 34 additions & 0 deletions src/docs_buddy/domain/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
from docs_buddy import common


class InvalidQueryError(common.DocsBuddyError):
    """Error raised when a query's text is unusable (e.g. empty after stripping)."""


@dataclass(frozen=True)
class RawDocument:
"""Representation of an unprocessed document"""
Expand Down Expand Up @@ -58,6 +62,36 @@ def __str__(self):
return json.dumps(asdict(self), default=common.json_datetime_handler)


@dataclass(frozen=True)
class Query:
    """Representation of a user query.

    The text is stripped of surrounding whitespace on construction. A query
    that is not a string, or that is empty after stripping, is rejected.
    """

    text: str

    def __post_init__(self):
        """Validate and normalise ``text``.

        Raises:
            InvalidQueryError: If ``text`` is not a string or is empty
                after stripping whitespace.
        """
        if not isinstance(self.text, str):
            raise InvalidQueryError(
                f"Invalid query: {self.text!r}. Query text must be a string"
            )

        clean_query = self.text.strip()
        if not clean_query:
            # Report the text as supplied; the stripped value is always
            # empty here and would tell the caller nothing.
            raise InvalidQueryError(
                f"Invalid query: {self.text!r}. Length must be > 0 after stripping"
            )
        # The dataclass is frozen, so bypass its generated __setattr__.
        object.__setattr__(self, "text", clean_query)

    def __str__(self):
        return self.text


@dataclass(frozen=True)
class QueryResult:
    """Result of a user query"""

    content: str
    path: str
    metadata: dict[str, Any]

    def __str__(self):
        """Return the result serialised as a JSON object string."""
        # Metadata may carry datetime values (the fake index passes chunk
        # metadata through unchanged), so use the same handler as the
        # other JSON-serialised domain objects in this module.
        return json.dumps(asdict(self), default=common.json_datetime_handler)


def sliding_window(seq: Sequence, size: int, step: int) -> Iterator[dict]:
    """Yield windows over ``seq`` as ``{"chunk": ..., "index": ...}`` dicts.

    Windows start at offsets 0, step, 2*step, ... below ``len(seq)``; each
    chunk is the slice of at most ``size`` items beginning at that offset.
    """
    offsets = range(0, len(seq), step)
    return (
        {"chunk": seq[start : start + size], "index": start} for start in offsets
    )
Expand Down
24 changes: 23 additions & 1 deletion src/docs_buddy/services/use_cases.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Use case handlers, adapter interfaces, events and commands"""
"""Use case handlers, adapter interfaces"""

import functools
from dataclasses import dataclass
Expand All @@ -8,11 +8,17 @@
from docs_buddy.common import PathLike, DocsBuddyError
from docs_buddy import domain

DEFAULT_MAX_RESULTS = 10


class RepositorySyncError(DocsBuddyError):
    """Error type for repository sync failures (raising sites are outside this excerpt)."""


class SearchIndexError(DocsBuddyError):
    """Error raised when an index search is requested with invalid arguments."""


class RepoStorage(Protocol):
"""Protocol that manages repository updates"""

Expand Down Expand Up @@ -52,6 +58,10 @@ def fit(
self, chunks: Iterator[domain.DocumentChunk], destination: PathLike
) -> None: ...

# Search the index for `query`, capped by `max_results`.
# The body is intentionally elided; concrete indexes provide it.
def search(
self, query: domain.Query, max_results: int
) -> list[domain.QueryResult]: ...


class DocumentChunksPipeline(SupportsIntermediateStorage, Protocol):
"""Protocol for providing document chunks"""
Expand Down Expand Up @@ -159,8 +169,20 @@ def apply(args, func):
def index_document_chunks(
    pipeline: DocumentChunksPipeline, index: DocumentIndex
) -> None:
    """Fit the index on the pipeline's chunks and publish the result.

    The index is built in a temporary location supplied by the pipeline;
    the destination is replaced only once fitting has finished.
    """
    chunk_stream = pipeline.get_document_chunks()

    with pipeline.get_temp_location() as staging_area:
        index.fit(chunk_stream, destination=staging_area)
        pipeline.replace_destination(staging_area)


def search_index(
    query: domain.Query, index: DocumentIndex, max_results: int = DEFAULT_MAX_RESULTS
) -> list[domain.QueryResult]:
    """Run ``query`` against ``index`` and return its results.

    Raises:
        SearchIndexError: If ``max_results`` is less than 1.
    """
    limit_too_small = max_results < 1
    if limit_too_small:
        raise SearchIndexError("max results must be at least 1")

    hits = index.search(query, max_results)
    return hits
36 changes: 34 additions & 2 deletions tests/integration/test_adapters.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
"""Adapter tests that interact with infrastructure"""

import pytest
from pathlib import Path
import shutil
import tempfile
import json

import pytest
import whoosh.index

from docs_buddy import adapters, domain
from docs_buddy import adapters, domain, common


def test_get_temp_location_creates_and_cleans_up() -> None:
Expand Down Expand Up @@ -114,6 +114,38 @@ def test_whoosh_document_index_fit_creates_index() -> None:
assert "metadata" in doc


def test_incorrectly_initialized_index_raises() -> None:
    """Searching an index created without an index directory must fail."""
    # initialize index without index directory
    index = adapters.WhooshDocumentIndex()
    # Build the query outside the raises block so that only the search
    # call can satisfy the expectation.
    query = domain.Query("providers")

    # should raise an error on search
    with pytest.raises(adapters.WhooshIndexError):
        index.search(query, max_results=10)


def test_can_search_whoosh_index() -> None:
    """Fit an index on disk, reopen it from that directory and search it."""
    sample_chunks = [
        domain.DocumentChunk.fromstring(json.dumps(raw))
        for raw in (_SAMPLE_CHUNK_1, _SAMPLE_CHUNK_2)
    ]

    with tempfile.TemporaryDirectory() as index_dir:
        builder = adapters.WhooshDocumentIndex()
        builder.fit(iter(sample_chunks), index_dir)

        searchable = adapters.WhooshDocumentIndex(index_dir)
        query = domain.Query("providers")

        hits = searchable.search(query, max_results=10)
        assert len(hits) > 1

        # it should be possible to specify max length of results
        capped_hits = searchable.search(query, max_results=1)
        assert len(capped_hits) == 1


def test_whoosh_document_index_fitting_for_empty_documents() -> None:
"""Test that WhooshDocumentIndex creates an index from DocumentChunks."""

Expand Down
16 changes: 16 additions & 0 deletions tests/unit/test_domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,19 @@ def test_overlapping_chunks_newlines_preserved() -> None:

# Newlines should remain in first 3 chunks
assert all(["\n" in item["chunk"] for item in result[:3]])


def test_query_parsing_processes_text() -> None:
    """Query strips its text and rejects text that is empty after stripping."""
    # accepted inputs, paired with the text expected after normalisation
    accepted = [("foo", "foo"), (" bar ", "bar")]
    for raw_text, expected_text in accepted:
        assert str(domain.Query(raw_text)) == expected_text

    # empty and whitespace-only inputs are rejected
    rejected = ["", " " * 10]
    for raw_text in rejected:
        with pytest.raises(domain.InvalidQueryError):
            _ = domain.Query(raw_text)
24 changes: 24 additions & 0 deletions tests/unit/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,3 +239,27 @@ def test_can_index_documents() -> None:
assert (action1, arg1) == ("MKDIR", tmp_location)
assert (action2, arg2) == ("RMRF", dest)
assert (action3, arg3_1, arg3_2) == ("MV", tmp_location, dest)


def test_can_search_index() -> None:
    """The search use case returns results and enforces the max_results bound."""
    # Owner name spelled consistently with `dest` (was "programmmer-ke").
    source = ".chunks/programmer-ke/akash-docs-buddy"
    dest = ".index/programmer-ke/akash-docs-buddy"

    pipeline = adapters.FakeDocumentChunksPipeline(source, dest)
    index = adapters.FakeIndex(pipeline)

    services.index_document_chunks(pipeline, index)

    query = domain.Query(text="provider")
    results = services.search_index(query, index)
    assert len(results) > 0

    # can specify max results
    results = services.search_index(query, index, max_results=1)
    assert len(results) == 1

    # max results must be > 0
    bad_values = [0, -1, -30]
    for bad_value in bad_values:
        with pytest.raises(services.SearchIndexError):
            _ = services.search_index(query, index, max_results=bad_value)
2 changes: 2 additions & 0 deletions todo.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# todo

- address todo comments

# in progress

- [>] Implement lexical search over document index
Expand Down