Skip to content

Commit 875dd2a

Browse files
Copilot authored and Mte90 committed
Fix vector search, add file content retrieval, markdown rendering, and incremental indexing
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent 3743fa7 commit 875dd2a

7 files changed

Lines changed: 199 additions & 32 deletions

File tree

ai/analyzer.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -440,17 +440,42 @@ def analyze_local_path_background(local_path: str, database_path: str, venv_path
440440

441441

442442

443-
def search_semantic(query: str, database_path: str, top_k: int = 5, include_content: bool = True):
    """
    Uses sqlite-vector's vector_full_scan to retrieve best-matching chunks and returns
    a list of {file_id, path, chunk_index, score, content (optional)}.

    Args:
        query: Search query text
        database_path: Path to the SQLite database
        top_k: Number of results to return
        include_content: Whether to retrieve and include the actual chunk text

    Returns:
        List of dicts with file_id, path, chunk_index, score, and optionally content.
        Returns an empty list when no embedding could be produced for the query.
    """
    # Embed the query; "<query>" is a synthetic path so the embedding client
    # has something to key on even though this is not a real file.
    q_emb = _embedding_client.embed_text(query, file_path="<query>", chunk_index=0)
    if not q_emb:
        # Embedding backend returned nothing: no vector to search with.
        return []

    # NOTE: the previous `try: ... except Exception: raise` wrapper was a
    # no-op (catching and immediately re-raising); removed. Any exception
    # from _search_vectors still propagates to the caller unchanged.
    results = _search_vectors(database_path, q_emb, top_k=top_k)

    if include_content:
        # Best-effort enrichment: a failure fetching one chunk's text must
        # not fail the whole search, so each lookup is guarded individually.
        for result in results:
            try:
                chunk_text = _get_chunk_text(
                    database_path,
                    result["file_id"],
                    result["chunk_index"],
                )
                result["content"] = chunk_text or ""
            except Exception as e:
                logger.warning(f"Failed to retrieve chunk text for {result['path']} chunk {result['chunk_index']}: {e}")
                result["content"] = ""

    return results
456481

db/models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@ class CreateProjectRequest(BaseModel):
1212

1313
class IndexProjectRequest(BaseModel):
    """Request body for the project (re-)indexing endpoint."""
    # Unique project identifier.
    project_id: str
    # Default to incremental indexing (only new/changed files); False forces
    # a full re-index. The endpoint treats an explicit None the same as True.
    incremental: Optional[bool] = True
1516

1617

1718
class QueryRequest(BaseModel):
    """Request body for the semantic-search query endpoint."""
    # Unique project identifier.
    project_id: str
    # Free-text search query to embed and match against indexed chunks.
    query: str
    # Number of results to return.
    top_k: Optional[int] = 5
    # Whether to include file content in results; the endpoint treats an
    # explicit None the same as True.
    include_content: Optional[bool] = True
2123

endpoints/project_endpoints.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,12 @@ def api_index_project(http_request: Request, request: IndexProjectRequest, backg
160160
Index or re-index a project in the background.
161161
162162
- **project_id**: Unique project identifier
163+
- **incremental**: If True (default), only index new/changed files. If False, re-index all files.
163164
164165
Starts background indexing process:
165166
- Scans project directory for code files
166167
- Generates embeddings for semantic search
167-
- Uses incremental indexing (skips unchanged files)
168+
- Uses incremental indexing by default (skips unchanged files)
168169
169170
Rate limit: 10 requests per minute per IP.
170171
@@ -195,20 +196,31 @@ def api_index_project(http_request: Request, request: IndexProjectRequest, backg
195196
# Update status to indexing
196197
update_project_status(request.project_id, "indexing")
197198

198-
# Start background indexing
199+
# Start background indexing with incremental flag
199200
venv_path = CFG.get("venv_path")
201+
incremental = request.incremental if request.incremental is not None else True
200202

201203
def index_callback():
202204
try:
203-
analyze_local_path_background(project_path, db_path, venv_path, MAX_FILE_SIZE, CFG)
205+
from ai.analyzer import analyze_local_path_sync
206+
# Use sync version directly with incremental flag
207+
analyze_local_path_sync(project_path, db_path, venv_path, MAX_FILE_SIZE, CFG, incremental=incremental)
204208
update_project_status(request.project_id, "ready", datetime.utcnow().isoformat())
205209
except Exception as e:
210+
logger.exception(f"Indexing failed for project {request.project_id}: {e}")
206211
update_project_status(request.project_id, "error")
207212
raise
208213

209214
background_tasks.add_task(index_callback)
210215

211-
return JSONResponse({"status": "indexing", "project_id": request.project_id})
216+
indexing_type = "incremental" if incremental else "full"
217+
logger.info(f"Started {indexing_type} indexing for project {request.project_id}")
218+
219+
return JSONResponse({
220+
"status": "indexing",
221+
"project_id": request.project_id,
222+
"incremental": incremental
223+
})
212224
except Exception as e:
213225
logger.exception(f"Error starting project indexing: {e}")
214226
return JSONResponse({"error": "Failed to start indexing"}, status_code=500)

endpoints/query_endpoints.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,18 @@ def api_query(http_request: Request, request: QueryRequest):
2929
- **project_id**: Unique project identifier
3030
- **query**: Search query text
3131
- **top_k**: Number of results to return (default: 5, max: 20)
32+
- **include_content**: Whether to include file content in results (default: True)
3233
3334
Performs semantic search using vector embeddings:
3435
- Generates embedding for query
3536
- Finds most similar code chunks
3637
- Returns ranked results with scores
38+
- Optionally includes actual file content
3739
3840
Rate limit: 100 requests per minute per IP.
3941
4042
Returns:
41-
- **results**: Array of matching code chunks
43+
- **results**: Array of matching code chunks (with content if requested)
4244
- **project_id**: Project identifier
4345
- **query**: Original query text
4446
"""
@@ -58,7 +60,8 @@ def api_query(http_request: Request, request: QueryRequest):
5860
project_id=request.project_id,
5961
query=request.query,
6062
top_k=request.top_k,
61-
use_cache=True
63+
use_cache=True,
64+
include_content=request.include_content if request.include_content is not None else True
6265
)
6366
return JSONResponse(result)
6467
except ValueError as e:

endpoints/web_endpoints.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -184,24 +184,53 @@ async def code_endpoint(request: Request):
184184
# If RAG requested, perform semantic search and build context
185185
if use_rag:
186186
try:
187-
retrieved = search_semantic(prompt, database_path, top_k=top_k)
188-
# Build context WITHOUT including snippets: only include file references and scores
187+
# Retrieve with content included
188+
retrieved = search_semantic(prompt, database_path, top_k=top_k, include_content=True)
189+
# Build context WITH actual file content for better RAG results
189190
context_parts = []
190191
total_len = len(combined_context)
191192
for r in retrieved:
192-
part = f"File: {r.get('path')} (score: {r.get('score', 0):.4f})\n"
193+
content = r.get("content", "")
194+
path = r.get("path", "")
195+
score = r.get("score", 0)
196+
197+
# Include file path, score, and actual content
198+
part = f"File: {path} (score: {score:.4f})\n{content}\n"
199+
193200
if total_len + len(part) > TOTAL_CONTEXT_LIMIT:
201+
# If full content doesn't fit, try to include at least partial content
202+
remaining = TOTAL_CONTEXT_LIMIT - total_len
203+
if remaining > 200: # Only include if we have meaningful space
204+
truncated_content = content[:remaining - 100] + "..."
205+
part = f"File: {path} (score: {score:.4f})\n{truncated_content}\n"
206+
context_parts.append(part)
207+
used_context.append({
208+
"path": path,
209+
"score": score,
210+
"content": truncated_content,
211+
"file_id": r.get("file_id"),
212+
"chunk_index": r.get("chunk_index")
213+
})
194214
break
215+
195216
context_parts.append(part)
196217
total_len += len(part)
197-
used_context.append({"path": r.get("path"), "score": r.get("score")})
218+
used_context.append({
219+
"path": path,
220+
"score": score,
221+
"content": content,
222+
"file_id": r.get("file_id"),
223+
"chunk_index": r.get("chunk_index")
224+
})
225+
198226
if context_parts:
199-
retrieved_text = "\n".join(context_parts)
227+
retrieved_text = "\n---\n".join(context_parts)
200228
if combined_context:
201-
combined_context = combined_context + "\n\nRetrieved:\n" + retrieved_text
229+
combined_context = combined_context + "\n\nRetrieved Context:\n" + retrieved_text
202230
else:
203-
combined_context = "Retrieved:\n" + retrieved_text
204-
except Exception:
231+
combined_context = "Retrieved Context:\n" + retrieved_text
232+
except Exception as e:
233+
logger.exception(f"RAG search failed: {e}")
205234
used_context = []
206235

207236
# Call the coding model with prompt and combined_context

services/search_service.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ def semantic_search(
2424
project_id: str,
2525
query: str,
2626
top_k: int = 5,
27-
use_cache: bool = True
27+
use_cache: bool = True,
28+
include_content: bool = True
2829
) -> Dict[str, Any]:
2930
"""
3031
Perform semantic search on a project.
@@ -34,6 +35,7 @@ def semantic_search(
3435
query: Search query text
3536
top_k: Number of results to return
3637
use_cache: Whether to use result caching
38+
include_content: Whether to include actual file content in results
3739
3840
Returns:
3941
Dictionary with results, project_id, and query
@@ -53,8 +55,8 @@ def semantic_search(
5355
if stats.get("file_count", 0) == 0:
5456
raise ValueError(f"Project not indexed: {project_id}")
5557

56-
# Check cache
57-
if use_cache:
58+
# Check cache (only if content is not required, as content makes cache key complex)
59+
if use_cache and not include_content:
5860
cache_key = SearchService._make_cache_key(project_id, query, top_k)
5961
cached = search_cache.get(cache_key)
6062
if cached is not None:
@@ -63,7 +65,7 @@ def semantic_search(
6365

6466
# Perform search
6567
try:
66-
results = search_semantic(query, db_path, top_k=top_k)
68+
results = search_semantic(query, db_path, top_k=top_k, include_content=include_content)
6769

6870
response = {
6971
"results": results,
@@ -72,8 +74,8 @@ def semantic_search(
7274
"count": len(results)
7375
}
7476

75-
# Cache results
76-
if use_cache:
77+
# Cache results (only if content not included to keep cache size reasonable)
78+
if use_cache and not include_content:
7779
search_cache.set(cache_key, response)
7880

7981
logger.info(f"Search completed: {len(results)} results for '{query[:50]}'")

0 commit comments

Comments (0)