
Commit 46d8495

Copilot and Mte90 authored
Fix vector search error and integrate llama-index for RAG operations (#17)
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1 parent 5decaa1 commit 46d8495

File tree: 14 files changed (+688 −622 lines)

ai/analyzer.py

Lines changed: 45 additions & 24 deletions
```diff
@@ -15,14 +15,14 @@
     load_sqlite_vector_extension as _load_sqlite_vector_extension,
     ensure_chunks_and_meta as _ensure_chunks_and_meta,
     insert_chunk_vector_with_retry as _insert_chunk_vector_with_retry,
-    search_vectors as _search_vectors,
     get_chunk_text as _get_chunk_text,
 )
-from .openai import call_coding_api, EmbeddingClient
+from .openai import call_coding_api
+from .llama_embeddings import OpenAICompatibleEmbedding
+from .llama_chunker import chunk_with_llama_index
 from llama_index.core import Document
 from utils.logger import get_logger
-from utils import compute_file_hash, chunk_text, norm, cosine
-from .smart_chunker import smart_chunk
+from utils import compute_file_hash, norm, cosine
 import logging
 
 # reduce noise from httpx used by external libs
@@ -64,8 +64,8 @@
 
 logger = get_logger(__name__)
 
-# Initialize EmbeddingClient for structured logging and retry logic
-_embedding_client = EmbeddingClient()
+# Initialize llama-index embedding client
+_embedding_client = OpenAICompatibleEmbedding()
 
 # Thread-local storage to track execution state inside futures
 _thread_state = threading.local()
@@ -86,7 +86,8 @@ def _get_embedding_with_semaphore(semaphore: threading.Semaphore, text: str, fil
     semaphore.acquire()
     try:
         _thread_state.stage = "calling_embed_text"
-        result = _embedding_client.embed_text(text, file_path=file_path, chunk_index=chunk_index)
+        # Use llama-index embedding client
+        result = _embedding_client._get_text_embedding(text)
         _thread_state.stage = "completed"
         return result
     except Exception as e:
@@ -171,14 +172,8 @@ def _process_file_sync(
     if isinstance(cfg, dict):
         embedding_model = cfg.get("embedding_model")
 
-    # Use smart chunking for supported code languages
-    use_smart_chunking = cfg.get("smart_chunking", True) if isinstance(cfg, dict) else True
-    supported_languages = ["python", "javascript", "typescript", "java", "go", "rust", "c", "cpp"]
-
-    if use_smart_chunking and lang in supported_languages:
-        chunks = smart_chunk(content, language=lang, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
-    else:
-        chunks = chunk_text(content, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
+    # Use llama-index chunking for all content
+    chunks = chunk_with_llama_index(content, language=lang, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
 
     if not chunks:
         chunks = [content]
@@ -395,11 +390,13 @@ def analyze_local_path_sync(
 
     try:
         # Use batch update for efficiency - single database transaction
+        # Store total_files for performance (avoid re-scanning directory on every request)
         set_project_metadata_batch(database_path, {
             "last_indexed_at": time.strftime("%Y-%m-%d %H:%M:%S"),
             "last_index_duration": str(duration),
             "files_indexed": str(file_count),
-            "files_skipped": str(skipped_count)
+            "files_skipped": str(skipped_count),
+            "total_files": str(total_files)  # Store total files found during indexing
         })
     except Exception:
         logger.exception("Failed to store indexing metadata")
@@ -442,16 +439,40 @@ def analyze_local_path_background(local_path: str, database_path: str, venv_path
 
 def search_semantic(query: str, database_path: str, top_k: int = 5):
     """
-    Uses sqlite-vector's vector_full_scan to retrieve best-matching chunks and returns
-    a list of {file_id, path, chunk_index, score}.
+    Uses llama-index with sqlite-vector backend to retrieve best-matching chunks.
+    Always includes content as it's needed for the coding model context.
+
+    Args:
+        query: Search query text
+        database_path: Path to the SQLite database
+        top_k: Number of results to return
+
+    Returns:
+        List of dicts with file_id, path, chunk_index, score, and content
     """
-    q_emb = _embedding_client.embed_text(query, file_path="<query>", chunk_index=0)
-    if not q_emb:
-        return []
-
     try:
-        return _search_vectors(database_path, q_emb, top_k=top_k)
-    except Exception:
+        # Use llama-index for semantic search
+        from .llama_integration import llama_index_search
+
+        docs = llama_index_search(query, database_path, top_k=top_k)
+
+        results = []
+        for doc in docs:
+            metadata = doc.metadata or {}
+            result = {
+                "file_id": metadata.get("file_id", 0),
+                "path": metadata.get("path", ""),
+                "chunk_index": metadata.get("chunk_index", 0),
+                "score": metadata.get("score", 0.0),
+                "content": doc.text or ""  # Always include content for LLM context
+            }
+            results.append(result)
+
+        logger.info(f"llama-index search returned {len(results)} results")
+        return results
+
+    except Exception as e:
+        logger.exception(f"Semantic search failed: {e}")
         raise
```
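Since the reworked `search_semantic` now returns the chunk text alongside the metadata, a caller no longer needs a separate `get_chunk_text` lookup. A minimal sketch of consuming the results, assuming the `ai/` directory is importable as a package; the database path and query are placeholders, and joining chunks into one context string is an illustrative choice, not something this commit prescribes:

```python
# Sketch only: "project.db" and the query are placeholder values.
from ai.analyzer import search_semantic

results = search_semantic("where is vector search implemented?", "project.db", top_k=5)

# Each result carries its chunk text, so the context for the coding model
# can be assembled directly from the returned dicts.
context = "\n\n".join(
    f"# {r['path']} (chunk {r['chunk_index']}, score {r['score']:.3f})\n{r['content']}"
    for r in results
)
```
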
ai/llama_chunker.py

Lines changed: 111 additions & 0 deletions
```python
"""
LlamaIndex-based chunking for code and text.
Replaces smart_chunker.py with llama-index's built-in splitters.
"""
from typing import List
from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
from llama_index.core.schema import Document

from utils.logger import get_logger

logger = get_logger(__name__)


def chunk_with_llama_index(
    content: str,
    language: str = "text",
    chunk_size: int = 800,
    chunk_overlap: int = 100
) -> List[str]:
    """
    Chunk text or code using llama-index's splitters.

    Args:
        content: Text or code content to chunk
        language: Programming language (python, javascript, etc.) or "text"
        chunk_size: Target size for each chunk in characters
        chunk_overlap: Overlap between chunks in characters

    Returns:
        List of text chunks
    """
    # Map language names to llama-index language identifiers
    language_map = {
        "python": "python",
        "javascript": "js",
        "typescript": "ts",
        "java": "java",
        "go": "go",
        "rust": "rust",
        "c": "c",
        "cpp": "cpp",
        "c++": "cpp",
    }

    try:
        # Check if it's a supported code language
        llama_lang = language_map.get(language.lower())

        if llama_lang:
            # Use CodeSplitter for code
            splitter = CodeSplitter(
                language=llama_lang,
                chunk_lines=40,  # Target lines per chunk (approximation)
                chunk_lines_overlap=5,  # Overlap in lines
                max_chars=chunk_size
            )
            logger.debug(f"Using CodeSplitter for language: {llama_lang}")
        else:
            # Use SentenceSplitter for text or unknown languages
            splitter = SentenceSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                paragraph_separator="\n\n",
                secondary_chunking_regex="[^,.;。?!]+[,.;。?!]?"
            )
            logger.debug(f"Using SentenceSplitter for language: {language}")

        # Create a document and split it
        doc = Document(text=content)
        nodes = splitter.get_nodes_from_documents([doc])

        # Extract text from nodes
        chunks = [node.text for node in nodes if node.text]

        logger.debug(f"Split content into {len(chunks)} chunks")
        return chunks if chunks else [content]

    except Exception as e:
        logger.exception(f"Error chunking with llama-index: {e}")
        # Fallback to simple chunking
        return simple_chunk(content, chunk_size, chunk_overlap)


def simple_chunk(text: str, chunk_size: int = 800, chunk_overlap: int = 100) -> List[str]:
    """
    Simple character-based chunking fallback.

    Args:
        text: Text to chunk
        chunk_size: Size of each chunk
        chunk_overlap: Overlap between chunks

    Returns:
        List of text chunks
    """
    if not text:
        return []

    chunks = []
    step = max(1, chunk_size - chunk_overlap)

    for i in range(0, len(text), step):
        end = min(i + chunk_size, len(text))
        chunk = text[i:end]
        if chunk.strip():
            chunks.append(chunk)

        if end >= len(text):
            break

    return chunks if chunks else [text]
```
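A quick sketch of exercising the new chunker, assuming llama-index and the tree-sitter grammars that `CodeSplitter` depends on are installed; the sample source is made up. Note that for code the overlap is expressed in lines (`chunk_lines_overlap=5`), so the character-based `chunk_overlap` parameter only affects the `SentenceSplitter` path. If the splitter raises for any reason, the `except` branch above falls back to `simple_chunk`, so non-empty input always yields at least one chunk:

```python
# Sketch only: the sample source is fabricated for illustration.
from ai.llama_chunker import chunk_with_llama_index

source = "def add(a, b):\n    return a + b\n\n" * 50
chunks = chunk_with_llama_index(source, language="python", chunk_size=800, chunk_overlap=100)
print(f"{len(chunks)} code chunks, first starts with: {chunks[0][:40]!r}")

# Unknown languages take the SentenceSplitter path instead of CodeSplitter.
prose_chunks = chunk_with_llama_index("Plain prose, split by sentences. " * 100, language="text")
print(f"{len(prose_chunks)} prose chunks")
```
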

ai/llama_embeddings.py

Lines changed: 100 additions & 0 deletions
```python
"""
LlamaIndex-compatible embeddings using OpenAI API.
Replaces the custom EmbeddingClient with llama-index's embedding abstraction.
"""
from typing import List, Optional
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.bridge.pydantic import PrivateAttr
from openai import OpenAI

from utils.config import CFG
from utils.logger import get_logger

logger = get_logger(__name__)


class OpenAICompatibleEmbedding(BaseEmbedding):
    """
    LlamaIndex-compatible embedding model using OpenAI-compatible API.
    Works with any OpenAI-compatible endpoint (OpenAI, Azure, local servers, etc.)
    """

    _client: OpenAI = PrivateAttr()
    _model: str = PrivateAttr()

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        model: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize the embedding model.

        Args:
            api_key: OpenAI API key (defaults to config)
            api_base: API base URL (defaults to config)
            model: Model name (defaults to config)
        """
        super().__init__(**kwargs)

        # Get config values
        self._client = OpenAI(
            api_key=api_key or CFG.get("api_key"),
            base_url=api_base or CFG.get("api_url")
        )
        self._model = model or CFG.get("embedding_model") or "text-embedding-3-small"

        logger.info(f"Initialized OpenAICompatibleEmbedding with model: {self._model}")

    @classmethod
    def class_name(cls) -> str:
        return "OpenAICompatibleEmbedding"

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Get query embedding asynchronously."""
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Get text embedding asynchronously."""
        return self._get_text_embedding(text)

    def _get_query_embedding(self, query: str) -> List[float]:
        """Get embedding for a query."""
        return self._get_text_embedding(query)

    def _get_text_embedding(self, text: str) -> List[float]:
        """Get embedding for a text."""
        try:
            # Clean the text
            text = text.replace("\n", " ").strip()
            if not text:
                logger.warning("Empty text provided for embedding")
                return []

            # Call OpenAI API
            response = self._client.embeddings.create(
                input=[text],
                model=self._model
            )

            if response.data and len(response.data) > 0:
                embedding = response.data[0].embedding
                logger.debug(f"Generated embedding with dimension: {len(embedding)}")
                return embedding
            else:
                logger.error("No embedding returned from API")
                return []

        except Exception as e:
            logger.exception(f"Failed to generate embedding: {e}")
            return []

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Get embeddings for multiple texts."""
        embeddings = []
        for text in texts:
            embedding = self._get_text_embedding(text)
            embeddings.append(embedding)
        return embeddings
```
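A brief sketch of using the new embedding class on its own; the endpoint, key, and model values below are illustrative placeholders, not values from this commit. `get_query_embedding` is the public entry point inherited from `BaseEmbedding`, which dispatches to the `_get_query_embedding` defined above:

```python
# Sketch only: endpoint, key, and model are placeholder values.
from ai.llama_embeddings import OpenAICompatibleEmbedding

embed_model = OpenAICompatibleEmbedding(
    api_base="http://localhost:8080/v1",  # any OpenAI-compatible server
    api_key="sk-placeholder",
    model="text-embedding-3-small",
)

# Public BaseEmbedding method; internally calls _get_query_embedding.
vec = embed_model.get_query_embedding("where is vector search implemented?")
print(len(vec))  # embedding dimension, e.g. 1536 for text-embedding-3-small
```

Because the class subclasses `BaseEmbedding`, it can also be plugged into llama-index's global configuration (e.g. `Settings.embed_model = embed_model`) rather than called directly.
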
