From a6772c3a4975b4e8669880240a335245263b9d57 Mon Sep 17 00:00:00 2001 From: Raphael Southall Date: Wed, 3 Jun 2026 09:42:36 +0100 Subject: [PATCH] Prune notes deleted from disk on full index and watcher startup full_index was upsert-only: it added and updated notes but never removed DB rows for files that no longer existed on disk. The only deletion path was the live watcher's per-event handler, so files removed while the watcher was down orphaned their rows indefinitely, polluting co-occurrence and community detection with ghost nodes. Add reconcile_deletions() to diff the DB against a full disk scan and prune orphans (FK cascade drops chunks/summaries/triples; sqlite-vec rows cleared explicitly). full_index runs it by default; the watcher runs it on startup to self-heal offline deletions. An empty scan skips pruning to avoid wiping the index on a misconfigured or unmounted vault_root. Add `neurostack index --no-prune` to opt out, and tests covering the cascade, the empty-scan guard, and exclude-dir handling. --- CHANGELOG.md | 12 +++++ src/neurostack/cli/__init__.py | 5 ++ src/neurostack/cli/index.py | 5 +- src/neurostack/watcher.py | 67 ++++++++++++++++++++++++++ tests/test_reconcile.py | 87 ++++++++++++++++++++++++++++++++++ 5 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 tests/test_reconcile.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b1c2fb8..fc556cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## Unreleased + +### Fixed + +- **`neurostack index` now prunes notes deleted from disk.** A full index was upsert-only: it added and updated notes but never removed DB rows for files that no longer existed. The only deletion path was the live watcher's per-event handler, so any file removed while the watcher was down orphaned its rows forever — inflating note counts, polluting co-occurrence and community detection with ghost nodes, and dragging modularity down. 
A full scan sees the whole vault, so it can now reconcile: anything in the DB but not on disk is pruned (FK cascades drop chunks/summaries/triples; sqlite-vec rows are cleared explicitly). An empty scan is treated as a misconfigured/unmounted vault and skips pruning rather than wiping the index. + +### Added + +- `reconcile_deletions(conn, vault_root, exclude_dirs)` in `watcher.py` — prune orphaned notes; returns the count pruned. +- Startup reconcile in `neurostack watch`: the watcher sweeps offline deletions on boot, so it self-heals without a manual re-index. +- `neurostack index --no-prune` to keep orphaned rows (opt out of the new default). + ## v0.13.0 — Remove vault_capture (2026-05-05) ### Breaking changes diff --git a/src/neurostack/cli/__init__.py b/src/neurostack/cli/__init__.py index 46cf9bf..c734fd9 100644 --- a/src/neurostack/cli/__init__.py +++ b/src/neurostack/cli/__init__.py @@ -442,6 +442,11 @@ def main(): "--workers", "-w", type=int, default=2, help="Number of parallel workers for LLM calls (default: 2)", ) + p.add_argument( + "--no-prune", action="store_true", + help="Keep notes whose files were deleted from disk " + "(default: prune orphaned notes from the index)", + ) p.set_defaults(func=cmd_index) # search diff --git a/src/neurostack/cli/index.py b/src/neurostack/cli/index.py index 6df3508..5da53e4 100644 --- a/src/neurostack/cli/index.py +++ b/src/neurostack/cli/index.py @@ -9,13 +9,14 @@ def cmd_index(args): from ..schema import DB_PATH, get_db from ..watcher import full_index - full_index( + pruned = full_index( vault_root=Path(args.vault), embed_url=args.embed_url, summarize_url=args.summarize_url, skip_summary=args.skip_summary, skip_triples=args.skip_triples, workers=getattr(args, "workers", 2), + prune=not getattr(args, "no_prune", False), ) db_path = Path(os.environ.get("NEUROSTACK_DB_PATH", DB_PATH)) conn = get_db(db_path) @@ -23,6 +24,8 @@ def cmd_index(args): chunks = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0] edges 
= conn.execute("SELECT COUNT(*) FROM graph_edges").fetchone()[0] print(f"Indexed {notes} notes, {chunks} chunks, {edges} graph edges.") + if pruned: + print(f"Pruned {pruned} orphaned notes (deleted from disk).") if notes == 0: print("\n \033[33m!\033[0m No Markdown files found in the vault.") print(" Add .md files to your vault, then run: neurostack index") diff --git a/src/neurostack/watcher.py b/src/neurostack/watcher.py index 3640378..809a31a 100644 --- a/src/neurostack/watcher.py +++ b/src/neurostack/watcher.py @@ -594,6 +594,52 @@ def _write_note_results(conn, result: dict, _has_vec: bool) -> None: conn.commit() +def reconcile_deletions( + conn, + vault_root: Path, + exclude_dirs: list[str] | None = None, +) -> int: + """Prune index rows for notes whose files no longer exist on disk. + + A full disk scan is the source of truth: any note in the DB but not on + disk was deleted while nothing was watching. FK cascades drop the note's + chunks/summaries/triples; the sqlite-vec virtual tables aren't cascaded, + so they're cleared explicitly first. + + Returns the number of orphaned notes pruned. If the scan finds zero files + the prune is skipped — an empty scan almost always means a misconfigured + or unmounted ``vault_root``, not a genuinely emptied vault, and we refuse + to wipe the whole index on that basis. + """ + skip_parts = {".git", ".obsidian", ".trash"} + skip_parts.update(exclude_dirs or []) + disk_paths = { + str(f.relative_to(vault_root)) + for f in vault_root.rglob("*.md") + if not skip_parts.intersection(f.parts) + } + db_paths = [r["path"] for r in conn.execute("SELECT path FROM notes").fetchall()] + orphans = [p for p in db_paths if p not in disk_paths] + if not orphans: + return 0 + if not disk_paths: + log.warning( + "Skipping orphan prune: vault scan found 0 files " + "(likely a misconfigured or unmounted vault_root)." 
+ ) + return 0 + + log.info(f"Pruning {len(orphans)} orphaned notes (deleted from disk)...") + _has_vec = has_vec_index(conn) + for rel_path in orphans: + if _has_vec: + delete_chunk_vecs(conn, rel_path) + delete_triple_vecs(conn, rel_path) + conn.execute("DELETE FROM notes WHERE path = ?", (rel_path,)) + conn.commit() + return len(orphans) + + def full_index( vault_root: Path | None = None, embed_url: str = None, @@ -602,12 +648,19 @@ def full_index( skip_triples: bool = False, exclude_dirs: list[str] | None = None, workers: int = 2, + prune: bool = True, ): """Full re-index of the entire vault. Uses ThreadPoolExecutor to parallelize LLM-bound work (summaries, triples, embeddings) across multiple notes. SQLite writes remain single-threaded. Set workers=1 for sequential processing (original behavior). + + When ``prune`` is True (default), reconciles the index against disk: + notes whose files no longer exist are deleted. A full scan sees the + whole vault, so anything in the DB but not on disk was removed offline + (the live watcher only catches deletions while it is running). Returns + the number of orphaned notes pruned. """ from concurrent.futures import ThreadPoolExecutor, as_completed @@ -695,6 +748,9 @@ def full_index( if done % 50 == 0 or done == total: log.info(f" Progress: {done}/{total}") + # Reconcile against disk: prune notes whose files no longer exist. + pruned = reconcile_deletions(conn, vault_root, exclude_dirs) if prune else 0 + # Build graph log.info("Building wiki-link graph...") build_graph(conn, vault_root) @@ -714,6 +770,7 @@ def full_index( log.info(f"Vector index: {n_chunks} chunks, {n_triples} triples.") log.info("Index complete.") + return pruned def backfill_summaries( @@ -1006,6 +1063,16 @@ def run_watcher( else: log.info(f"Watching {vault_root} for changes...") + # Startup reconcile: the observer only catches deletions that happen + # while it is running, so files removed while the watcher was down would + # orphan their rows forever. 
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024-2026 Raphael Southall
"""Orphan-reconciliation tests: notes removed from disk leave the index."""

from neurostack.watcher import reconcile_deletions

_NOTE_SQL = (
    "INSERT INTO notes (path, title, content_hash, updated_at) "
    "VALUES (?, ?, ?, ?)"
)
_CHUNK_SQL = (
    "INSERT INTO chunks (note_path, heading_path, content, "
    "content_hash, position) VALUES (?, ?, ?, ?, ?)"
)


def _insert_note(conn, path, *, with_chunk=False):
    """Insert a note row (and optionally one chunk for it), then commit."""
    statements = [(_NOTE_SQL, (path, path, "h", "2026-01-15T00:00:00+00:00"))]
    if with_chunk:
        statements.append((_CHUNK_SQL, (path, "", "body", "h", 0)))
    for sql, params in statements:
        conn.execute(sql, params)
    conn.commit()


def _make_file(vault, rel):
    """Create ``vault / rel`` (with parent dirs) holding a stub note."""
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("# note\n")
    return target


def _new_vault(tmp_path):
    """Create and return an empty vault directory under ``tmp_path``."""
    root = tmp_path / "vault"
    root.mkdir()
    return root


def _note_paths(conn):
    """Return the set of note paths currently stored in the index."""
    return {row["path"] for row in conn.execute("SELECT path FROM notes")}


def test_prunes_orphan_and_cascades_chunks(in_memory_db, tmp_path):
    vault = _new_vault(tmp_path)
    _make_file(vault, "keep.md")
    _insert_note(in_memory_db, "keep.md")
    _insert_note(in_memory_db, "ghost.md", with_chunk=True)

    removed = reconcile_deletions(in_memory_db, vault)

    assert removed == 1
    assert _note_paths(in_memory_db) == {"keep.md"}
    # The orphan's chunk must be gone too (FK cascade).
    (remaining_chunks,) = in_memory_db.execute(
        "SELECT COUNT(*) FROM chunks WHERE note_path = 'ghost.md'"
    ).fetchone()[:1]
    assert remaining_chunks == 0


def test_no_orphans_returns_zero(in_memory_db, tmp_path):
    vault = _new_vault(tmp_path)
    _make_file(vault, "keep.md")
    _insert_note(in_memory_db, "keep.md")

    removed = reconcile_deletions(in_memory_db, vault)

    assert removed == 0


def test_empty_scan_is_a_no_op_safety_guard(in_memory_db, tmp_path):
    """An empty scan (unmounted/misconfigured vault) must not wipe the index."""
    vault = _new_vault(tmp_path)  # deliberately left without any .md files
    _insert_note(in_memory_db, "ghost.md")

    removed = reconcile_deletions(in_memory_db, vault)

    assert removed == 0
    # The row survives even though its file is not on disk.
    assert _note_paths(in_memory_db) == {"ghost.md"}


def test_excluded_dir_notes_are_pruned(in_memory_db, tmp_path):
    """Rows pointing into an excluded dir are reconciled away.

    Excluded dirs aren't managed by the indexer; the scan skips their files,
    so a DB row for one of them looks orphaned. This pins that behaviour.
    """
    vault = _new_vault(tmp_path)
    for rel in ("keep.md", "archive/old.md"):
        _make_file(vault, rel)
        _insert_note(in_memory_db, rel)

    removed = reconcile_deletions(in_memory_db, vault, exclude_dirs=["archive"])

    assert removed == 1
    assert _note_paths(in_memory_db) == {"keep.md"}