
Commit c1748cc

Copilot and Mte90 committed
Add llama-index integration for vector search and project path field in UI
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent 875dd2a commit c1748cc

4 files changed

Lines changed: 303 additions & 52 deletions


ai/analyzer.py

Lines changed: 24 additions & 20 deletions
@@ -442,7 +442,7 @@ def analyze_local_path_background(local_path: str, database_path: str, venv_path
 
 def search_semantic(query: str, database_path: str, top_k: int = 5, include_content: bool = True):
     """
-    Uses sqlite-vector's vector_full_scan to retrieve best-matching chunks and returns
+    Uses llama-index with sqlite-vector backend to retrieve best-matching chunks and returns
     a list of {file_id, path, chunk_index, score, content (optional)}.
 
     Args:
@@ -454,29 +454,33 @@ def search_semantic(query: str, database_path: str, top_k: int = 5, include_cont
     Returns:
         List of dicts with file_id, path, chunk_index, score, and optionally content
     """
-    q_emb = _embedding_client.embed_text(query, file_path="<query>", chunk_index=0)
-    if not q_emb:
-        return []
-
     try:
-        results = _search_vectors(database_path, q_emb, top_k=top_k)
+        # Use llama-index for semantic search
+        from .llama_integration import llama_index_search
 
-        # If content is requested, retrieve chunk text for each result
-        if include_content:
-            for result in results:
-                try:
-                    chunk_text = _get_chunk_text(
-                        database_path,
-                        result["file_id"],
-                        result["chunk_index"]
-                    )
-                    result["content"] = chunk_text or ""
-                except Exception as e:
-                    logger.warning(f"Failed to retrieve chunk text for {result['path']} chunk {result['chunk_index']}: {e}")
-                    result["content"] = ""
+        docs = llama_index_search(query, database_path, top_k=top_k)
 
+        results = []
+        for doc in docs:
+            metadata = doc.metadata or {}
+            result = {
+                "file_id": metadata.get("file_id", 0),
+                "path": metadata.get("path", ""),
+                "chunk_index": metadata.get("chunk_index", 0),
+                "score": metadata.get("score", 0.0)
+            }
+
+            # Include content if requested
+            if include_content:
+                result["content"] = doc.text or ""
+
+            results.append(result)
+
+        logger.info(f"llama-index search returned {len(results)} results")
         return results
-    except Exception:
+
+    except Exception as e:
+        logger.exception(f"Semantic search failed: {e}")
         raise
 
 
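As a quick orientation, here is a minimal sketch of calling the rewritten search_semantic; the query string and "project.db" path are placeholders, and the result keys mirror the dict assembled in the hunk above.

# Hypothetical caller of the new search path; "project.db" is a placeholder path.
from ai.analyzer import search_semantic

hits = search_semantic("where are embeddings generated?", database_path="project.db", top_k=3)
for hit in hits:
    # Keys come from the result dict built above: file_id, path, chunk_index, score, content.
    print(f"{hit['score']:.3f}  {hit['path']}#{hit['chunk_index']}")
    if hit.get("content"):
        print(hit["content"][:120])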

ai/llama_integration.py

Lines changed: 59 additions & 21 deletions
@@ -1,10 +1,13 @@
 """
 LlamaIndex integration for document retrieval.
+Provides RAG functionality using llama-index with sqlite-vector backend.
 """
-from typing import List
+from typing import List, Optional
 from llama_index.core import Document
+from llama_index.core.vector_stores.types import VectorStoreQuery
 
 from .openai import EmbeddingClient
+from .llama_vector_store import SQLiteVectorStore
 from utils.logger import get_logger
 
 logger = get_logger(__name__)
@@ -13,36 +16,71 @@
 _embedding_client = EmbeddingClient()
 
 
+def llama_index_search(query: str, database_path: str, top_k: int = 5) -> List[Document]:
+    """
+    Perform semantic search using llama-index with sqlite-vector backend.
+
+    Args:
+        query: Search query text
+        database_path: Path to project database
+        top_k: Number of results to return
+
+    Returns:
+        List of Document objects with chunk text and metadata
+    """
+    try:
+        # Get query embedding
+        q_emb = _embedding_client.embed_text(query, file_path="<query>", chunk_index=0)
+        if not q_emb:
+            logger.warning("Failed to generate query embedding")
+            return []
+
+        # Create vector store
+        vector_store = SQLiteVectorStore(database_path)
+
+        # Create query
+        vector_query = VectorStoreQuery(
+            query_embedding=q_emb,
+            similarity_top_k=top_k
+        )
+
+        # Execute query
+        query_result = vector_store.query(vector_query)
+
+        # Convert TextNodes to Documents
+        docs: List[Document] = []
+        for node, score in zip(query_result.nodes, query_result.similarities):
+            doc = Document(
+                text=node.text,
+                metadata={
+                    **node.metadata,
+                    "score": score
+                }
+            )
+            docs.append(doc)
+
+        logger.info(f"llama-index search returned {len(docs)} documents")
+        return docs
+
+    except Exception as e:
+        logger.exception(f"llama-index search failed: {e}")
+        return []
+
+
 def llama_index_retrieve_documents(query: str, database_path: str, top_k: int = 5,
                                    search_func=None, get_chunk_func=None) -> List[Document]:
     """
-    Return llama_index.core.Document objects for the top_k matching chunks using sqlite-vector.
+    Legacy function - now redirects to llama_index_search.
 
     Args:
         query: Search query text
         database_path: Path to project database
         top_k: Number of results to return
-        search_func: Function to search vectors (injected from analyzer)
-        get_chunk_func: Function to get chunk text (injected from analyzer)
+        search_func: Deprecated - not used
+        get_chunk_func: Deprecated - not used
 
     Returns:
         List of Document objects with chunk text and metadata
     """
-    if search_func is None or get_chunk_func is None:
-        raise ValueError("search_func and get_chunk_func must be provided")
-
-    q_emb = _embedding_client.embed_text(query, file_path="<query>", chunk_index=0)
-    if not q_emb:
-        return []
+    return llama_index_search(query, database_path, top_k)
 
-    rows = search_func(database_path, q_emb, top_k=top_k)
-    docs: List[Document] = []
-    for r in rows:
-        fid = r.get("file_id")
-        path = r.get("path")
-        chunk_idx = r.get("chunk_index", 0)
-        score = r.get("score", 0.0)
-        chunk_text = get_chunk_func(database_path, fid, chunk_idx) or ""
-        doc = Document(text=chunk_text, extra_info={"path": path, "file_id": fid, "chunk_index": chunk_idx, "score": score})
-        docs.append(doc)
-    return docs
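A short, hedged usage sketch of the new llama_index_search helper; the query and "project.db" path are placeholders, and the score is copied into each Document's metadata by the loop above.

from ai.llama_integration import llama_index_search

# Placeholder query and database path; scores arrive via Document.metadata.
docs = llama_index_search("how are file chunks stored?", "project.db", top_k=5)
for doc in docs:
    meta = doc.metadata or {}
    print(meta.get("path"), meta.get("chunk_index"), meta.get("score"))
    print((doc.text or "")[:80])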

ai/llama_vector_store.py

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
+"""
+Custom LlamaIndex Vector Store implementation using sqlite-vector.
+This bridges llama-index's vector store interface with our sqlite-vector backend.
+"""
+from typing import List, Optional, Any, Dict
+from llama_index.core.vector_stores.types import (
+    VectorStore,
+    VectorStoreQuery,
+    VectorStoreQueryResult,
+)
+from llama_index.core.schema import TextNode, BaseNode
+
+from db.vector_operations import search_vectors, get_chunk_text
+from utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class SQLiteVectorStore(VectorStore):
+    """
+    Custom vector store implementation that uses sqlite-vector backend.
+    Compatible with llama-index's VectorStore interface.
+    """
+
+    def __init__(self, database_path: str):
+        """
+        Initialize the SQLite vector store.
+
+        Args:
+            database_path: Path to the SQLite database with vector extension
+        """
+        self.database_path = database_path
+        self._is_embedding_query = True
+        logger.info(f"Initialized SQLiteVectorStore with database: {database_path}")
+
+    @property
+    def client(self) -> Any:
+        """Return the database path as the client."""
+        return self.database_path
+
+    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
+        """
+        Add nodes to the vector store.
+        Note: In our implementation, nodes are added during the indexing process
+        via the analyzer module, not through this interface.
+        """
+        logger.warning("add() called on SQLiteVectorStore - nodes should be added via analyzer module")
+        return []
+
+    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
+        """Delete a document from the vector store."""
+        logger.warning(f"delete() called on SQLiteVectorStore for {ref_doc_id} - not implemented")
+        pass
+
+    def query(
+        self,
+        query: VectorStoreQuery,
+        **kwargs: Any,
+    ) -> VectorStoreQueryResult:
+        """
+        Query the vector store.
+
+        Args:
+            query: VectorStoreQuery with query embedding and parameters
+
+        Returns:
+            VectorStoreQueryResult with nodes, similarities, and ids
+        """
+        if query.query_embedding is None:
+            logger.error("Query embedding is None")
+            return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])
+
+        # Get top_k from query, default to 5
+        top_k = query.similarity_top_k or 5
+
+        try:
+            # Use our existing search_vectors function
+            results = search_vectors(
+                database_path=self.database_path,
+                q_vector=query.query_embedding,
+                top_k=top_k
+            )
+
+            nodes: List[TextNode] = []
+            similarities: List[float] = []
+            ids: List[str] = []
+
+            for result in results:
+                file_id = result["file_id"]
+                path = result["path"]
+                chunk_index = result["chunk_index"]
+                score = result["score"]
+
+                # Retrieve the actual chunk text
+                chunk_text = get_chunk_text(self.database_path, file_id, chunk_index)
+
+                if chunk_text:
+                    # Create a TextNode for llama-index
+                    node = TextNode(
+                        text=chunk_text,
+                        metadata={
+                            "file_id": file_id,
+                            "path": path,
+                            "chunk_index": chunk_index,
+                        },
+                        id_=f"{file_id}_{chunk_index}"
+                    )
+
+                    nodes.append(node)
+                    similarities.append(score)
+                    ids.append(node.id_)
+
+            logger.debug(f"Vector query returned {len(nodes)} results")
+
+            return VectorStoreQueryResult(
+                nodes=nodes,
+                similarities=similarities,
+                ids=ids
+            )
+
+        except Exception as e:
+            logger.exception(f"Error querying vector store: {e}")
+            return VectorStoreQueryResult(nodes=[], similarities=[], ids=[])
+
+    def persist(
+        self,
+        persist_path: str,
+        fs: Optional[Any] = None,
+    ) -> None:
+        """
+        Persist the vector store.
+        Note: Our SQLite database is already persistent.
+        """
+        logger.debug("persist() called - SQLite database is already persistent")
+        pass
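For completeness, a hedged sketch of driving SQLiteVectorStore directly with a VectorStoreQuery; the "project.db" path is a placeholder, and the ai.openai import path is assumed from the relative imports shown above.

from llama_index.core.vector_stores.types import VectorStoreQuery
from ai.llama_vector_store import SQLiteVectorStore
from ai.openai import EmbeddingClient  # assumed module path, based on "from .openai import EmbeddingClient"

# Placeholder database path; embed_text is called with the same signature used in the diffs.
q_emb = EmbeddingClient().embed_text("vector search backend", file_path="<query>", chunk_index=0)
store = SQLiteVectorStore("project.db")
result = store.query(VectorStoreQuery(query_embedding=q_emb, similarity_top_k=3))
for node, score in zip(result.nodes, result.similarities):
    print(f"{score:.3f}", node.metadata["path"], node.metadata["chunk_index"])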

0 commit comments
