
Commit c1748cc

Copilot and Mte90 committed
Add llama-index integration for vector search and project path field in UI
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent 875dd2a commit c1748cc

4 files changed

Lines changed: 303 additions & 52 deletions


ai/analyzer.py

Lines changed: 24 additions & 20 deletions
@@ -442,7 +442,7 @@ def analyze_local_path_background(local_path: str, database_path: str, venv_path
 
 def search_semantic(query: str, database_path: str, top_k: int = 5, include_content: bool = True):
     """
-    Uses sqlite-vector's vector_full_scan to retrieve best-matching chunks and returns
+    Uses llama-index with sqlite-vector backend to retrieve best-matching chunks and returns
     a list of {file_id, path, chunk_index, score, content (optional)}.
 
     Args:
@@ -454,29 +454,33 @@ def search_semantic(query: str, database_path: str, top_k: int = 5, include_cont
     Returns:
         List of dicts with file_id, path, chunk_index, score, and optionally content
     """
-    q_emb = _embedding_client.embed_text(query, file_path="<query>", chunk_index=0)
-    if not q_emb:
-        return []
-
     try:
-        results = _search_vectors(database_path, q_emb, top_k=top_k)
+        # Use llama-index for semantic search
+        from .llama_integration import llama_index_search
 
-        # If content is requested, retrieve chunk text for each result
-        if include_content:
-            for result in results:
-                try:
-                    chunk_text = _get_chunk_text(
-                        database_path,
-                        result["file_id"],
-                        result["chunk_index"]
-                    )
-                    result["content"] = chunk_text or ""
-                except Exception as e:
-                    logger.warning(f"Failed to retrieve chunk text for {result['path']} chunk {result['chunk_index']}: {e}")
-                    result["content"] = ""
+        docs = llama_index_search(query, database_path, top_k=top_k)
 
+        results = []
+        for doc in docs:
+            metadata = doc.metadata or {}
+            result = {
+                "file_id": metadata.get("file_id", 0),
+                "path": metadata.get("path", ""),
+                "chunk_index": metadata.get("chunk_index", 0),
+                "score": metadata.get("score", 0.0)
+            }
+
+            # Include content if requested
+            if include_content:
+                result["content"] = doc.text or ""
+
+            results.append(result)
+
+        logger.info(f"llama-index search returned {len(results)} results")
         return results
-    except Exception:
+
+    except Exception as e:
+        logger.exception(f"Semantic search failed: {e}")
         raise
 
 
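As a quick orientation, here is a minimal sketch of calling the rewritten search_semantic; the query string and "project.db" path are placeholders, and the result keys mirror the dict assembled in the hunk above.

# Hypothetical caller of the new search path; "project.db" is a placeholder path.
from ai.analyzer import search_semantic

hits = search_semantic("where are embeddings generated?", database_path="project.db", top_k=3)
for hit in hits:
    # Keys come from the result dict built above: file_id, path, chunk_index, score, content.
    print(f"{hit['score']:.3f}  {hit['path']}#{hit['chunk_index']}")
    if hit.get("content"):
        print(hit["content"][:120])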

ai/llama_integration.py

Lines changed: 59 additions & 21 deletions
@@ -1,10 +1,13 @@
 """
 LlamaIndex integration for document retrieval.
+Provides RAG functionality using llama-index with sqlite-vector backend.
 """
-from typing import List
+from typing import List, Optional
 from llama_index.core import Document
+from llama_index.core.vector_stores.types import VectorStoreQuery
 
 from .openai import EmbeddingClient
+from .llama_vector_store import SQLiteVectorStore
 from utils.logger import get_logger
 
 logger = get_logger(__name__)
@@ -13,36 +16,71 @@
 _embedding_client = EmbeddingClient()
 
 
+def llama_index_search(query: str, database_path: str, top_k: int = 5) -> List[Document]:
+    """
+    Perform semantic search using llama-index with sqlite-vector backend.
+
+    Args:
+        query: Search query text
+        database_path: Path to project database
+        top_k: Number of results to return
+
+    Returns:
+        List of Document objects with chunk text and metadata
+    """
+    try:
+        # Get query embedding
+        q_emb = _embedding_client.embed_text(query, file_path="<query>", chunk_index=0)
+        if not q_emb:
+            logger.warning("Failed to generate query embedding")
+            return []
+
+        # Create vector store
+        vector_store = SQLiteVectorStore(database_path)
+
+        # Create query
+        vector_query = VectorStoreQuery(
+            query_embedding=q_emb,
+            similarity_top_k=top_k
+        )
+
+        # Execute query
+        query_result = vector_store.query(vector_query)
+
+        # Convert TextNodes to Documents
+        docs: List[Document] = []
+        for node, score in zip(query_result.nodes, query_result.similarities):
+            doc = Document(
+                text=node.text,
+                metadata={
+                    **node.metadata,
+                    "score": score
+                }
+            )
+            docs.append(doc)
+
+        logger.info(f"llama-index search returned {len(docs)} documents")
+        return docs
+
+    except Exception as e:
+        logger.exception(f"llama-index search failed: {e}")
+        return []
+
+
 def llama_index_retrieve_documents(query: str, database_path: str, top_k: int = 5,
                                    search_func=None, get_chunk_func=None) -> List[Document]:
     """
-    Return llama_index.core.Document objects for the top_k matching chunks using sqlite-vector.
+    Legacy function - now redirects to llama_index_search.
 
     Args:
         query: Search query text
         database_path: Path to project database
         top_k: Number of results to return
-        search_func: Function to search vectors (injected from analyzer)
-        get_chunk_func: Function to get chunk text (injected from analyzer)
+        search_func: Deprecated - not used
+        get_chunk_func: Deprecated - not used
 
     Returns:
         List of Document objects with chunk text and metadata
     """
-    if search_func is None or get_chunk_func is None:
-        raise ValueError("search_func and get_chunk_func must be provided")
-
-    q_emb = _embedding_client.embed_text(query, file_path="<query>", chunk_index=0)
-    if not q_emb:
-        return []
+    return llama_index_search(query, database_path, top_k)
 
-    rows = search_func(database_path, q_emb, top_k=top_k)
-    docs: List[Document] = []
-    for r in rows:
-        fid = r.get("file_id")
-        path = r.get("path")
-        chunk_idx = r.get("chunk_index", 0)
-        score = r.get("score", 0.0)
-        chunk_text = get_chunk_func(database_path, fid, chunk_idx) or ""
-        doc = Document(text=chunk_text, extra_info={"path": path, "file_id": fid, "chunk_index": chunk_idx, "score": score})
-        docs.append(doc)
-    return docs
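A short, hedged usage sketch of the new llama_index_search helper; the query and "project.db" path are placeholders, and the score is copied into each Document's metadata by the loop above.

from ai.llama_integration import llama_index_search

# Placeholder query and database path; scores arrive via Document.metadata.
docs = llama_index_search("how are file chunks stored?", "project.db", top_k=5)
for doc in docs:
    meta = doc.metadata or {}
    print(meta.get("path"), meta.get("chunk_index"), meta.get("score"))
    print((doc.text or "")[:80])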

ai/llama_vector_store.py

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
+"""
+Custom LlamaIndex Vector Store implementation using sqlite-vector.
+This bridges llama-index's vector store interface with our sqlite-vector backend.
+"""
+from typing import List, Optional, Any, Dict
+from llama_index.core.vector_stores.types import (
+    VectorStore,
+    VectorStoreQuery,
+    VectorStoreQueryResult,
+)
+from llama_index.core.schema import TextNode, BaseNode
+
+from db.vector_operations import search_vectors, get_chunk_text
+from utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class SQLiteVectorStore(VectorStore):
+    """
+    Custom vector store implementation that uses sqlite-vector backend.
+    Compatible with llama-index's VectorStore interface.
+    """
+
+    def __init__(self, database_path: str):
+        """
+        Initialize the SQLite vector store.
+
+        Args:
+            database_path: Path to the SQLite database with vector extension
+        """
+        self.database_path = database_path
+        self._is_embedding_query = True
+        logger.info(f"Initialized SQLiteVectorStore with database: {database_path}")
+
+    @property
+    def client(self) -> Any:
+        """Return the database path as the client."""
+        return self.database_path
+
+    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
+        """
+        Add nodes to the vector store.
+        Note: In our implementation, nodes are added during the indexing process
+        via the analyzer module, not through this interface.
+        """
+        logger.warning("add() called on SQLiteVectorStore - nodes should be added via analyzer module")
+        return []
+
+    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
+        """Delete a document from the vector store."""
+        logger.warning(f"delete() called on SQLiteVectorStore for {ref_doc_id} - not implemented")
+        pass
+
+    def query(
+        self,
+        query: VectorStoreQuery,
+        **kwargs: Any,
+    ) -> VectorStoreQueryResult:
+        """
+        Query the vector store.
+
+        Args:
+            query: VectorStoreQuery with query embedding and parameters
+
+        Returns:
+            VectorStoreQueryResult with nodes, similarities, and ids
+        """
+        if query.query_embedding is None:
+            logger.error("Query embedding is None")
+            return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])
+
+        # Get top_k from query, default to 5
+        top_k = query.similarity_top_k or 5
+
+        try:
+            # Use our existing search_vectors function
+            results = search_vectors(
+                database_path=self.database_path,
+                q_vector=query.query_embedding,
+                top_k=top_k
+            )
+
+            nodes: List[TextNode] = []
+            similarities: List[float] = []
+            ids: List[str] = []
+
+            for result in results:
+                file_id = result["file_id"]
+                path = result["path"]
+                chunk_index = result["chunk_index"]
+                score = result["score"]
+
+                # Retrieve the actual chunk text
+                chunk_text = get_chunk_text(self.database_path, file_id, chunk_index)
+
+                if chunk_text:
+                    # Create a TextNode for llama-index
+                    node = TextNode(
+                        text=chunk_text,
+                        metadata={
+                            "file_id": file_id,
+                            "path": path,
+                            "chunk_index": chunk_index,
+                        },
+                        id_=f"{file_id}_{chunk_index}"
+                    )
+
+                    nodes.append(node)
+                    similarities.append(score)
+                    ids.append(node.id_)
+
+            logger.debug(f"Vector query returned {len(nodes)} results")
+
+            return VectorStoreQueryResult(
+                nodes=nodes,
+                similarities=similarities,
+                ids=ids
+            )
+
+        except Exception as e:
+            logger.exception(f"Error querying vector store: {e}")
+            return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])
+
+    def persist(
+        self,
+        persist_path: str,
+        fs: Optional[Any] = None,
+    ) -> None:
+        """
+        Persist the vector store.
+        Note: Our SQLite database is already persistent.
+        """
+        logger.debug("persist() called - SQLite database is already persistent")
+        pass
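For completeness, a hedged sketch of driving SQLiteVectorStore directly with a VectorStoreQuery; the "project.db" path is a placeholder, and the ai.openai import path is assumed from the relative imports shown above.

from llama_index.core.vector_stores.types import VectorStoreQuery
from ai.llama_vector_store import SQLiteVectorStore
from ai.openai import EmbeddingClient  # assumed module path, based on "from .openai import EmbeddingClient"

# Placeholder database path; embed_text is called with the same signature used in the diffs.
q_emb = EmbeddingClient().embed_text("vector search backend", file_path="<query>", chunk_index=0)
store = SQLiteVectorStore("project.db")
result = store.query(VectorStoreQuery(query_embedding=q_emb, similarity_top_k=3))
for node, score in zip(result.nodes, result.similarities):
    print(f"{score:.3f}", node.metadata["path"], node.metadata["chunk_index"])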

0 commit comments
