
Commit 2dfc4ed

Copilot and Mte90 committed
Improve indexing logging and storage efficiency
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent fe46b99 · commit 2dfc4ed

2 files changed: 59 additions & 5 deletions

ai/analyzer.py

Lines changed: 55 additions & 2 deletions
```diff
@@ -255,14 +255,48 @@ def _search_vectors(database_path: str, q_vector: List[float], top_k: int = 5) -


 def _get_chunk_text(database_path: str, file_id: int, chunk_index: int) -> Optional[str]:
+    """
+    Get chunk text by reading from filesystem instead of database.
+    Uses project_path metadata and file path to read the actual file.
+    """
+    from db.operations import get_project_metadata
+
     conn = _connect_db(database_path)
     try:
         cur = conn.cursor()
-        cur.execute("SELECT content FROM files WHERE id = ?", (file_id,))
+        # Get file path from database
+        cur.execute("SELECT path FROM files WHERE id = ?", (file_id,))
         row = cur.fetchone()
         if not row:
+            logger.warning(f"File not found in database: file_id={file_id}")
             return None
-        content = row[0] or ""
+
+        file_path = row[0]
+        if not file_path:
+            logger.warning(f"File path is empty for file_id={file_id}")
+            return None
+
+        # Get project path from metadata
+        project_path = get_project_metadata(database_path, "project_path")
+        if not project_path:
+            logger.error("Project path not found in metadata, cannot read file from filesystem")
+            return None
+
+        # Construct full file path
+        full_path = os.path.join(project_path, file_path)
+
+        # Read file content from filesystem
+        try:
+            with open(full_path, "r", encoding="utf-8", errors="ignore") as fh:
+                content = fh.read()
+        except Exception as e:
+            logger.warning(f"Failed to read file from filesystem: {full_path}, error: {e}")
+            return None
+
+        if not content:
+            return None
+
+        # Extract the chunk
         if CHUNK_SIZE <= 0:
             return content
         step = max(1, CHUNK_SIZE - CHUNK_OVERLAP)
```
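The hunk above cuts off right after `step` is computed. For context, here is a minimal sketch of the overlapping-window arithmetic it presumably continues with; `CHUNK_SIZE` and `CHUNK_OVERLAP` are the constants from the diff, and the values and function name below are illustrative:

```python
# Sketch of overlapping-window chunk extraction (values are illustrative).
CHUNK_SIZE = 1024    # characters per chunk (assumed unit)
CHUNK_OVERLAP = 128  # characters shared between adjacent chunks

def extract_chunk(content: str, chunk_index: int) -> str:
    # Each chunk starts one "step" after the previous one, so adjacent
    # chunks overlap by CHUNK_OVERLAP characters; this mirrors
    # step = max(1, CHUNK_SIZE - CHUNK_OVERLAP) in the diff.
    step = max(1, CHUNK_SIZE - CHUNK_OVERLAP)
    start = chunk_index * step
    return content[start:start + CHUNK_SIZE]
```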
```diff
@@ -309,8 +343,12 @@ def _process_file_sync(

     # Check if file needs reindexing (incremental mode)
     if incremental and not needs_reindex(database_path, rel_path, mtime, file_hash):
+        logger.debug(f"Skipping unchanged file: {rel_path}")
         return {"stored": False, "embedded": False, "skipped": True}

+    # Log file processing
+    logger.info(f"Processing file: {rel_path}")
+
     # store file (synchronous DB writer) with metadata
     try:
         fid = store_file(database_path, rel_path, content, lang, mtime, file_hash)
```
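`needs_reindex` itself is not part of this diff; a plausible sketch, assuming it compares the stored `last_modified` and `file_hash` columns that `store_file` writes:

```python
import sqlite3

def needs_reindex(database_path: str, rel_path: str, mtime: float, file_hash: str) -> bool:
    # A file needs reindexing when it is new, or when its mtime or
    # content hash differs from what the database last recorded.
    conn = sqlite3.connect(database_path)
    try:
        row = conn.execute(
            "SELECT last_modified, file_hash FROM files WHERE path = ?",
            (rel_path,),
        ).fetchone()
    finally:
        conn.close()
    return row is None or row[0] != mtime or row[1] != file_hash
```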
```diff
@@ -426,16 +464,26 @@ def analyze_local_path_sync(
     Submits per-file tasks to a shared ThreadPoolExecutor.
     Supports incremental indexing to skip unchanged files.
     """
+    from db.operations import set_project_metadata
+
     semaphore = threading.Semaphore(EMBEDDING_CONCURRENCY)
     start_time = time.time()

+    # Store project path in metadata for filesystem access
+    try:
+        set_project_metadata(database_path, "project_path", local_path)
+        logger.info(f"Starting indexing for project at: {local_path}")
+    except Exception as e:
+        logger.warning(f"Failed to store project path in metadata: {e}")
+
     try:
         file_count = 0
         emb_count = 0
         skipped_count = 0
         file_paths: List[Dict[str, str]] = []

         # Collect files to process
+        logger.info("Collecting files to index...")
         for root, dirs, files in os.walk(local_path):
             for fname in files:
                 full = os.path.join(root, fname)
```
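`set_project_metadata` and its counterpart `get_project_metadata` (imported in the first hunk) live in db/operations.py but are not shown in this commit; a minimal sketch of the pair, assuming a simple key/value `project_metadata` table:

```python
import sqlite3

def set_project_metadata(database_path: str, key: str, value: str) -> None:
    # Upsert one key/value pair; mirrors the ON CONFLICT pattern store_file uses.
    conn = sqlite3.connect(database_path)
    try:
        conn.execute(
            "INSERT INTO project_metadata (key, value) VALUES (?, ?) "
            "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
            (key, value),
        )
        conn.commit()
    finally:
        conn.close()

def get_project_metadata(database_path: str, key: str):
    conn = sqlite3.connect(database_path)
    try:
        row = conn.execute(
            "SELECT value FROM project_metadata WHERE key = ?", (key,)
        ).fetchone()
        return row[0] if row else None
    finally:
        conn.close()
```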
```diff
@@ -447,6 +495,8 @@
                 except Exception:
                     continue
                 file_paths.append({"full": full, "rel": rel})
+
+        logger.info(f"Found {len(file_paths)} files to process")

         # Process files in chunks to avoid too many futures at once.
         CHUNK_SUBMIT = 256
```
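The submission loop referenced by the `CHUNK_SUBMIT` comment falls outside this hunk; a sketch of the pattern the comment describes, with `worker` and `items` as stand-ins for the per-file task and the collected paths:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

CHUNK_SUBMIT = 256  # cap on outstanding futures, as in the diff

def submit_in_batches(executor: ThreadPoolExecutor, items, worker):
    # Submitting at most CHUNK_SUBMIT tasks at a time keeps the futures
    # list (and its queued memory) bounded on large repositories.
    for i in range(0, len(items), CHUNK_SUBMIT):
        futures = [executor.submit(worker, item) for item in items[i:i + CHUNK_SUBMIT]]
        for fut in as_completed(futures):
            yield fut.result()
```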
```diff
@@ -482,6 +532,9 @@
         end_time = time.time()
         duration = end_time - start_time

+        # Log summary
+        logger.info(f"Indexing completed: {file_count} files processed, {emb_count} embeddings created, {skipped_count} files skipped in {duration:.2f}s")
+
         try:
             # Use batch update for efficiency - single database transaction
             set_project_metadata_batch(database_path, {
```
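`set_project_metadata_batch` is only called here, not defined in the diff; a sketch of what a single-transaction batch upsert could look like, reusing the assumed `project_metadata` table from above:

```python
import sqlite3

def set_project_metadata_batch(database_path: str, values: dict) -> None:
    # One connection, one executemany, one commit: a single transaction
    # instead of one round trip per key.
    conn = sqlite3.connect(database_path)
    try:
        conn.executemany(
            "INSERT INTO project_metadata (key, value) VALUES (?, ?) "
            "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
            list(values.items()),
        )
        conn.commit()
    finally:
        conn.close()
```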

db/operations.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -233,21 +233,22 @@ def store_file(database_path, path, content, language, last_modified=None, file_
     Insert or update a file record into the DB using a queued single-writer to avoid
     sqlite 'database is locked' errors in multithreaded scenarios.
     Supports incremental indexing with last_modified and file_hash tracking.
+    Note: the full file content is no longer stored in the database (only a snippet); content is read from the filesystem when needed.
     Returns lastrowid (same as the previous store_file implementation).
     """
     snippet = (content[:512] if content else "")
     sql = """
     INSERT INTO files (path, content, language, snippet, last_modified, file_hash, updated_at)
-    VALUES (?, ?, ?, ?, ?, ?, datetime('now'))
+    VALUES (?, NULL, ?, ?, ?, ?, datetime('now'))
     ON CONFLICT(path) DO UPDATE SET
-        content=excluded.content,
+        content=NULL,
         language=excluded.language,
         snippet=excluded.snippet,
         last_modified=excluded.last_modified,
         file_hash=excluded.file_hash,
         updated_at=datetime('now')
     """
-    params = (path, content, language, snippet, last_modified, file_hash)
+    params = (path, language, snippet, last_modified, file_hash)

     writer = _get_writer(database_path)
     # We wait for the background writer to complete the insert and then return the row id.
```
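The queued single-writer behind `_get_writer` is not in this diff; a minimal sketch of the pattern the docstring describes, with class and field names as illustrative assumptions:

```python
import queue
import sqlite3
import threading

class SingleWriter:
    """Sketch of a queued single-writer: all writes for one database funnel
    through one thread, so concurrent callers never hit 'database is locked'."""

    def __init__(self, database_path: str):
        self._queue: queue.Queue = queue.Queue()
        threading.Thread(target=self._run, args=(database_path,), daemon=True).start()

    def _run(self, database_path: str) -> None:
        # The connection lives in the writer thread, since sqlite3
        # connections are bound to their creating thread by default.
        conn = sqlite3.connect(database_path)
        while True:
            sql, params, result, done = self._queue.get()
            cur = conn.execute(sql, params)
            conn.commit()
            result["lastrowid"] = cur.lastrowid
            done.set()

    def execute(self, sql: str, params) -> int:
        # Block until the writer thread commits, then return the row id,
        # matching store_file's wait-for-the-writer behaviour.
        result: dict = {}
        done = threading.Event()
        self._queue.put((sql, params, result, done))
        done.wait()
        return result["lastrowid"]
```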
