Skip to content

Commit 875dd2a

Browse files
Copilot authored and Mte90 committed
Fix vector search, add file content retrieval, markdown rendering, and incremental indexing
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent 3743fa7 commit 875dd2a

7 files changed

Lines changed: 199 additions & 32 deletions

File tree

ai/analyzer.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -440,17 +440,42 @@ def analyze_local_path_background(local_path: str, database_path: str, venv_path
440440

441441

442442

443-
def search_semantic(query: str, database_path: str, top_k: int = 5, include_content: bool = True):
    """
    Uses sqlite-vector's vector_full_scan to retrieve best-matching chunks and returns
    a list of {file_id, path, chunk_index, score, content (optional)}.

    Args:
        query: Search query text
        database_path: Path to the SQLite database
        top_k: Number of results to return
        include_content: Whether to retrieve and include the actual chunk text

    Returns:
        List of dicts with file_id, path, chunk_index, score, and optionally content.
        Returns an empty list when no embedding could be produced for the query.
    """
    # Embed the query; "<query>" is a synthetic path so the embedding client
    # has something to key on even though this is not a real file.
    q_emb = _embedding_client.embed_text(query, file_path="<query>", chunk_index=0)
    if not q_emb:
        # Embedding backend returned nothing: no vector to search with.
        return []

    # NOTE: the previous `try: ... except Exception: raise` wrapper was a
    # no-op (catching and immediately re-raising); removed. Any exception
    # from _search_vectors still propagates to the caller unchanged.
    results = _search_vectors(database_path, q_emb, top_k=top_k)

    if include_content:
        # Best-effort enrichment: a failure fetching one chunk's text must
        # not fail the whole search, so each lookup is guarded individually.
        for result in results:
            try:
                chunk_text = _get_chunk_text(
                    database_path,
                    result["file_id"],
                    result["chunk_index"],
                )
                result["content"] = chunk_text or ""
            except Exception as e:
                logger.warning(f"Failed to retrieve chunk text for {result['path']} chunk {result['chunk_index']}: {e}")
                result["content"] = ""

    return results
456481

db/models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@ class CreateProjectRequest(BaseModel):
1212

1313
class IndexProjectRequest(BaseModel):
    """Request body for the project (re-)indexing endpoint."""
    # Unique project identifier.
    project_id: str
    # Default to incremental indexing (only new/changed files); False forces
    # a full re-index. The endpoint treats an explicit None the same as True.
    incremental: Optional[bool] = True
1516

1617

1718
class QueryRequest(BaseModel):
    """Request body for the semantic-search query endpoint."""
    # Unique project identifier.
    project_id: str
    # Free-text search query to embed and match against indexed chunks.
    query: str
    # Number of results to return.
    top_k: Optional[int] = 5
    # Whether to include file content in results; the endpoint treats an
    # explicit None the same as True.
    include_content: Optional[bool] = True
2123

endpoints/project_endpoints.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,12 @@ def api_index_project(http_request: Request, request: IndexProjectRequest, backg
160160
Index or re-index a project in the background.
161161
162162
- **project_id**: Unique project identifier
163+
- **incremental**: If True (default), only index new/changed files. If False, re-index all files.
163164
164165
Starts background indexing process:
165166
- Scans project directory for code files
166167
- Generates embeddings for semantic search
167-
- Uses incremental indexing (skips unchanged files)
168+
- Uses incremental indexing by default (skips unchanged files)
168169
169170
Rate limit: 10 requests per minute per IP.
170171
@@ -195,20 +196,31 @@ def api_index_project(http_request: Request, request: IndexProjectRequest, backg
195196
# Update status to indexing
196197
update_project_status(request.project_id, "indexing")
197198

198-
# Start background indexing
199+
# Start background indexing with incremental flag
199200
venv_path = CFG.get("venv_path")
201+
incremental = request.incremental if request.incremental is not None else True
200202

201203
def index_callback():
202204
try:
203-
analyze_local_path_background(project_path, db_path, venv_path, MAX_FILE_SIZE, CFG)
205+
from ai.analyzer import analyze_local_path_sync
206+
# Use sync version directly with incremental flag
207+
analyze_local_path_sync(project_path, db_path, venv_path, MAX_FILE_SIZE, CFG, incremental=incremental)
204208
update_project_status(request.project_id, "ready", datetime.utcnow().isoformat())
205209
except Exception as e:
210+
logger.exception(f"Indexing failed for project {request.project_id}: {e}")
206211
update_project_status(request.project_id, "error")
207212
raise
208213

209214
background_tasks.add_task(index_callback)
210215

211-
return JSONResponse({"status": "indexing", "project_id": request.project_id})
216+
indexing_type = "incremental" if incremental else "full"
217+
logger.info(f"Started {indexing_type} indexing for project {request.project_id}")
218+
219+
return JSONResponse({
220+
"status": "indexing",
221+
"project_id": request.project_id,
222+
"incremental": incremental
223+
})
212224
except Exception as e:
213225
logger.exception(f"Error starting project indexing: {e}")
214226
return JSONResponse({"error": "Failed to start indexing"}, status_code=500)

endpoints/query_endpoints.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,18 @@ def api_query(http_request: Request, request: QueryRequest):
2929
- **project_id**: Unique project identifier
3030
- **query**: Search query text
3131
- **top_k**: Number of results to return (default: 5, max: 20)
32+
- **include_content**: Whether to include file content in results (default: True)
3233
3334
Performs semantic search using vector embeddings:
3435
- Generates embedding for query
3536
- Finds most similar code chunks
3637
- Returns ranked results with scores
38+
- Optionally includes actual file content
3739
3840
Rate limit: 100 requests per minute per IP.
3941
4042
Returns:
41-
- **results**: Array of matching code chunks
43+
- **results**: Array of matching code chunks (with content if requested)
4244
- **project_id**: Project identifier
4345
- **query**: Original query text
4446
"""
@@ -58,7 +60,8 @@ def api_query(http_request: Request, request: QueryRequest):
5860
project_id=request.project_id,
5961
query=request.query,
6062
top_k=request.top_k,
61-
use_cache=True
63+
use_cache=True,
64+
include_content=request.include_content if request.include_content is not None else True
6265
)
6366
return JSONResponse(result)
6467
except ValueError as e:

endpoints/web_endpoints.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -184,24 +184,53 @@ async def code_endpoint(request: Request):
184184
# If RAG requested, perform semantic search and build context
185185
if use_rag:
186186
try:
187-
retrieved = search_semantic(prompt, database_path, top_k=top_k)
188-
# Build context WITHOUT including snippets: only include file references and scores
187+
# Retrieve with content included
188+
retrieved = search_semantic(prompt, database_path, top_k=top_k, include_content=True)
189+
# Build context WITH actual file content for better RAG results
189190
context_parts = []
190191
total_len = len(combined_context)
191192
for r in retrieved:
192-
part = f"File: {r.get('path')} (score: {r.get('score', 0):.4f})\n"
193+
content = r.get("content", "")
194+
path = r.get("path", "")
195+
score = r.get("score", 0)
196+
197+
# Include file path, score, and actual content
198+
part = f"File: {path} (score: {score:.4f})\n{content}\n"
199+
193200
if total_len + len(part) > TOTAL_CONTEXT_LIMIT:
201+
# If full content doesn't fit, try to include at least partial content
202+
remaining = TOTAL_CONTEXT_LIMIT - total_len
203+
if remaining > 200: # Only include if we have meaningful space
204+
truncated_content = content[:remaining - 100] + "..."
205+
part = f"File: {path} (score: {score:.4f})\n{truncated_content}\n"
206+
context_parts.append(part)
207+
used_context.append({
208+
"path": path,
209+
"score": score,
210+
"content": truncated_content,
211+
"file_id": r.get("file_id"),
212+
"chunk_index": r.get("chunk_index")
213+
})
194214
break
215+
195216
context_parts.append(part)
196217
total_len += len(part)
197-
used_context.append({"path": r.get("path"), "score": r.get("score")})
218+
used_context.append({
219+
"path": path,
220+
"score": score,
221+
"content": content,
222+
"file_id": r.get("file_id"),
223+
"chunk_index": r.get("chunk_index")
224+
})
225+
198226
if context_parts:
199-
retrieved_text = "\n".join(context_parts)
227+
retrieved_text = "\n---\n".join(context_parts)
200228
if combined_context:
201-
combined_context = combined_context + "\n\nRetrieved:\n" + retrieved_text
229+
combined_context = combined_context + "\n\nRetrieved Context:\n" + retrieved_text
202230
else:
203-
combined_context = "Retrieved:\n" + retrieved_text
204-
except Exception:
231+
combined_context = "Retrieved Context:\n" + retrieved_text
232+
except Exception as e:
233+
logger.exception(f"RAG search failed: {e}")
205234
used_context = []
206235

207236
# Call the coding model with prompt and combined_context

services/search_service.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ def semantic_search(
2424
project_id: str,
2525
query: str,
2626
top_k: int = 5,
27-
use_cache: bool = True
27+
use_cache: bool = True,
28+
include_content: bool = True
2829
) -> Dict[str, Any]:
2930
"""
3031
Perform semantic search on a project.
@@ -34,6 +35,7 @@ def semantic_search(
3435
query: Search query text
3536
top_k: Number of results to return
3637
use_cache: Whether to use result caching
38+
include_content: Whether to include actual file content in results
3739
3840
Returns:
3941
Dictionary with results, project_id, and query
@@ -53,8 +55,8 @@ def semantic_search(
5355
if stats.get("file_count", 0) == 0:
5456
raise ValueError(f"Project not indexed: {project_id}")
5557

56-
# Check cache
57-
if use_cache:
58+
# Check cache (only if content is not required, as content makes cache key complex)
59+
if use_cache and not include_content:
5860
cache_key = SearchService._make_cache_key(project_id, query, top_k)
5961
cached = search_cache.get(cache_key)
6062
if cached is not None:
@@ -63,7 +65,7 @@ def semantic_search(
6365

6466
# Perform search
6567
try:
66-
results = search_semantic(query, db_path, top_k=top_k)
68+
results = search_semantic(query, db_path, top_k=top_k, include_content=include_content)
6769

6870
response = {
6971
"results": results,
@@ -72,8 +74,8 @@ def semantic_search(
7274
"count": len(results)
7375
}
7476

75-
# Cache results
76-
if use_cache:
77+
# Cache results (only if content not included to keep cache size reasonable)
78+
if use_cache and not include_content:
7779
search_cache.set(cache_key, response)
7880

7981
logger.info(f"Search completed: {len(results)} results for '{query[:50]}'")

0 commit comments

Comments (0)