Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/semble/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def _load_index(path: str, content: list[ContentType]) -> SembleIndex:
def _run_search(path: str, query: str, top_k: int, content: list[ContentType], max_snippet_lines: int | None) -> None:
"""Handle the `search` subcommand."""
index = _load_index(path, content)
results = index.search(query, top_k=top_k)
results = index.search(query, top_k=top_k, max_snippet_lines=max_snippet_lines)
out = format_results(query, results, max_snippet_lines) if results else {"error": "No results found."}
print(json.dumps(out))
_maybe_save_index(index, path)
Expand All @@ -130,7 +130,7 @@ def _run_find_related(
if chunk is None:
print(f"No chunk found at {file_path}:{line}.", file=sys.stderr)
sys.exit(1)
results = index.find_related(chunk, top_k=top_k)
results = index.find_related(chunk, top_k=top_k, max_snippet_lines=max_snippet_lines)
label = f"Chunks related to {file_path}:{line}"
out = (
format_results(label, results, max_snippet_lines)
Expand Down
11 changes: 8 additions & 3 deletions src/semble/index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,18 +224,21 @@ def from_git(
content=normalized,
)

def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]:
def find_related(
self, source: Chunk | SearchResult, *, top_k: int = 5, max_snippet_lines: int | None = None
) -> list[SearchResult]:
"""Return chunks semantically similar to the given chunk or search result.

:param source: A SearchResult or Chunk to use as the seed.
:param top_k: Number of similar chunks to return.
:param max_snippet_lines: Maximum number of lines of each chunk's content counted toward the savings stats. 0 counts no content; None counts the full chunk.
:return: Ranked list of SearchResult objects, most similar first.
"""
target = source.chunk if isinstance(source, SearchResult) else source
selector = self._get_selector_vector(filter_languages=[target.language]) if target.language else None
results = _search_semantic(target.content, self.model, self._semantic_index, self.chunks, top_k + 1, selector)
results = [r for r in results if r.chunk != target][:top_k]
save_search_stats(results, CallType.FIND_RELATED, self._file_sizes)
save_search_stats(results, CallType.FIND_RELATED, self._file_sizes, max_snippet_lines)
return results

def _get_selector_vector(
Expand All @@ -258,6 +261,7 @@ def search(
filter_languages: list[str] | None = None,
filter_paths: list[str] | None = None,
rerank: bool | None = None,
max_snippet_lines: int | None = None,
) -> list[SearchResult]:
"""Search the index and return the top-k most relevant chunks.

Expand All @@ -271,6 +275,7 @@ def search(
chunks from these files are returned.
:param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties).
Defaults to True when ContentType.CODE was indexed.
:param max_snippet_lines: Maximum number of lines of each chunk's content counted toward the savings stats. 0 counts no content; None counts the full chunk.
:return: Ranked list of SearchResult objects, best match first.
"""
if not self.chunks or not query.strip():
Expand All @@ -290,7 +295,7 @@ def search(
selector=selector,
rerank=resolved_rerank,
)
save_search_stats(results, CallType.SEARCH, self._file_sizes)
save_search_stats(results, CallType.SEARCH, self._file_sizes, max_snippet_lines)
return results

@classmethod
Expand Down
8 changes: 5 additions & 3 deletions src/semble/mcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ async def search(
"If the snippet does not contain enough context to confirm you have the right location, "
"call again with max_snippet_lines=None."
),
ge=0,
),
] = 10,
) -> str:
Expand All @@ -91,7 +92,7 @@ async def search(
index = await _get_index(repo, default_source, cache)
except ValueError as exc:
return str(exc)
results = index.search(query, top_k=top_k)
results = index.search(query, top_k=top_k, max_snippet_lines=max_snippet_lines)
if not results:
return json.dumps({"error": "No results found."})
return json.dumps(format_results(query, results, max_snippet_lines))
Expand All @@ -111,7 +112,8 @@ async def find_related(
description=(
"Lines of source per result. "
"Default 10 = signature + first body lines. 0 = location only. None = full chunk."
)
),
ge=0,
),
] = 10,
) -> str:
Expand All @@ -131,7 +133,7 @@ async def find_related(
f"No chunk found at {file_path}:{line}. "
"Make sure the file is indexed and the line number is within a known chunk."
)
results = index.find_related(chunk, top_k=top_k)
results = index.find_related(chunk, top_k=top_k, max_snippet_lines=max_snippet_lines)
if not results:
return json.dumps({"error": f"No related chunks found for {file_path}:{line}."})
label = f"Chunks related to {file_path}:{line}"
Expand Down
10 changes: 9 additions & 1 deletion src/semble/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,18 @@ def save_search_stats(
results: list[SearchResult],
call_type: CallType,
file_sizes: dict[str, int],
max_snippet_lines: int | None = None,
) -> None:
"""Save stats about a search or find_related call to the stats file."""
try:
snippet_chars = sum(len(result.chunk.content) for result in results)
snippet_chars = sum(
len("\n".join(result.chunk.content.splitlines()[:max_snippet_lines]))
if max_snippet_lines and max_snippet_lines > 0
else 0
if max_snippet_lines == 0
else len(result.chunk.content)
for result in results
)
file_chars = sum(
file_sizes[path] for path in {result.chunk.file_path for result in results} if path in file_sizes
)
Expand Down
Loading