Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# Changelog

## Unreleased

### Fixed

- **`neurostack index` now prunes notes deleted from disk.** A full index was upsert-only: it added and updated notes but never removed DB rows for files that no longer existed. The only deletion path was the live watcher's per-event handler, so any file removed while the watcher was down orphaned its rows forever — inflating note counts, polluting co-occurrence and community detection with ghost nodes, and dragging modularity down. A full scan sees the whole vault, so it can now reconcile: anything in the DB but not on disk is pruned (FK cascades drop chunks/summaries/triples; sqlite-vec rows are cleared explicitly). An empty scan is treated as a misconfigured/unmounted vault and skips pruning rather than wiping the index.

### Added

- `reconcile_deletions(conn, vault_root, exclude_dirs)` in `watcher.py` — prune orphaned notes; returns the count pruned.
- Startup reconcile in `neurostack watch`: the watcher sweeps offline deletions on boot, so it self-heals without a manual re-index.
- `neurostack index --no-prune` to keep orphaned rows (opt out of the new default).

## v0.13.0 — Remove vault_capture (2026-05-05)

### Breaking changes
Expand Down
5 changes: 5 additions & 0 deletions src/neurostack/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,11 @@ def main():
"--workers", "-w", type=int, default=2,
help="Number of parallel workers for LLM calls (default: 2)",
)
p.add_argument(
"--no-prune", action="store_true",
help="Keep notes whose files were deleted from disk "
"(default: prune orphaned notes from the index)",
)
p.set_defaults(func=cmd_index)

# search
Expand Down
5 changes: 4 additions & 1 deletion src/neurostack/cli/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,23 @@
def cmd_index(args):
from ..schema import DB_PATH, get_db
from ..watcher import full_index
full_index(
pruned = full_index(
vault_root=Path(args.vault),
embed_url=args.embed_url,
summarize_url=args.summarize_url,
skip_summary=args.skip_summary,
skip_triples=args.skip_triples,
workers=getattr(args, "workers", 2),
prune=not getattr(args, "no_prune", False),
)
db_path = Path(os.environ.get("NEUROSTACK_DB_PATH", DB_PATH))
conn = get_db(db_path)
notes = conn.execute("SELECT COUNT(*) FROM notes").fetchone()[0]
chunks = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
edges = conn.execute("SELECT COUNT(*) FROM graph_edges").fetchone()[0]
print(f"Indexed {notes} notes, {chunks} chunks, {edges} graph edges.")
if pruned:
print(f"Pruned {pruned} orphaned notes (deleted from disk).")
if notes == 0:
print("\n \033[33m!\033[0m No Markdown files found in the vault.")
print(" Add .md files to your vault, then run: neurostack index")
Expand Down
67 changes: 67 additions & 0 deletions src/neurostack/watcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,52 @@ def _write_note_results(conn, result: dict, _has_vec: bool) -> None:
conn.commit()


def reconcile_deletions(
conn,
vault_root: Path,
exclude_dirs: list[str] | None = None,
) -> int:
"""Prune index rows for notes whose files no longer exist on disk.

A full disk scan is the source of truth: any note in the DB but not on
disk was deleted while nothing was watching. FK cascades drop the note's
chunks/summaries/triples; the sqlite-vec virtual tables aren't cascaded,
so they're cleared explicitly first.

Returns the number of orphaned notes pruned. If the scan finds zero files
the prune is skipped — an empty scan almost always means a misconfigured
or unmounted ``vault_root``, not a genuinely emptied vault, and we refuse
to wipe the whole index on that basis.
"""
skip_parts = {".git", ".obsidian", ".trash"}
skip_parts.update(exclude_dirs or [])
disk_paths = {
str(f.relative_to(vault_root))
for f in vault_root.rglob("*.md")
if not skip_parts.intersection(f.parts)
}
db_paths = [r["path"] for r in conn.execute("SELECT path FROM notes").fetchall()]
orphans = [p for p in db_paths if p not in disk_paths]
if not orphans:
return 0
if not disk_paths:
log.warning(
"Skipping orphan prune: vault scan found 0 files "
"(likely a misconfigured or unmounted vault_root)."
)
return 0

log.info(f"Pruning {len(orphans)} orphaned notes (deleted from disk)...")
_has_vec = has_vec_index(conn)
for rel_path in orphans:
if _has_vec:
delete_chunk_vecs(conn, rel_path)
delete_triple_vecs(conn, rel_path)
conn.execute("DELETE FROM notes WHERE path = ?", (rel_path,))
conn.commit()
return len(orphans)


def full_index(
vault_root: Path | None = None,
embed_url: str = None,
Expand All @@ -602,12 +648,19 @@ def full_index(
skip_triples: bool = False,
exclude_dirs: list[str] | None = None,
workers: int = 2,
prune: bool = True,
):
"""Full re-index of the entire vault.

Uses ThreadPoolExecutor to parallelize LLM-bound work (summaries, triples,
embeddings) across multiple notes. SQLite writes remain single-threaded.
Set workers=1 for sequential processing (original behavior).

When ``prune`` is True (default), reconciles the index against disk:
notes whose files no longer exist are deleted. A full scan sees the
whole vault, so anything in the DB but not on disk was removed offline
(the live watcher only catches deletions while it is running). Returns
the number of orphaned notes pruned.
"""
from concurrent.futures import ThreadPoolExecutor, as_completed

Expand Down Expand Up @@ -695,6 +748,9 @@ def full_index(
if done % 50 == 0 or done == total:
log.info(f" Progress: {done}/{total}")

# Reconcile against disk: prune notes whose files no longer exist.
pruned = reconcile_deletions(conn, vault_root, exclude_dirs) if prune else 0

# Build graph
log.info("Building wiki-link graph...")
build_graph(conn, vault_root)
Expand All @@ -714,6 +770,7 @@ def full_index(
log.info(f"Vector index: {n_chunks} chunks, {n_triples} triples.")

log.info("Index complete.")
return pruned


def backfill_summaries(
Expand Down Expand Up @@ -1006,6 +1063,16 @@ def run_watcher(
else:
log.info(f"Watching {vault_root} for changes...")

# Startup reconcile: the observer only catches deletions that happen
# while it is running, so files removed while the watcher was down would
# orphan their rows forever. Sweep them on boot.
try:
pruned = reconcile_deletions(get_db(DB_PATH), vault_root, exclude_dirs)
if pruned:
log.info("Startup reconcile pruned %d orphaned notes.", pruned)
except Exception as e:
log.warning("Startup reconcile failed: %s", e)

handler = DebouncedHandler(
vault_root, embed_url, summarize_url,
exclude_dirs=exclude_dirs,
Expand Down
87 changes: 87 additions & 0 deletions tests/test_reconcile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024-2026 Raphael Southall
"""Tests for orphan reconciliation (pruning notes deleted from disk)."""

from neurostack.watcher import reconcile_deletions


def _insert_note(conn, path, *, with_chunk=False):
    """Insert a minimal ``notes`` row for *path* and commit.

    The title is set to the path itself; the hash and timestamp are fixed
    placeholder values. With ``with_chunk=True`` a single ``chunks`` row
    referencing the note is inserted as well.
    """
    note_sql = (
        "INSERT INTO notes (path, title, content_hash, updated_at) "
        "VALUES (?, ?, ?, ?)"
    )
    note_values = (path, path, "h", "2026-01-15T00:00:00+00:00")
    conn.execute(note_sql, note_values)

    if with_chunk:
        chunk_sql = (
            "INSERT INTO chunks (note_path, heading_path, content, "
            "content_hash, position) VALUES (?, ?, ?, ?, ?)"
        )
        chunk_values = (path, "", "body", "h", 0)
        conn.execute(chunk_sql, chunk_values)

    conn.commit()


def _make_file(vault, rel):
    """Create a small Markdown file at ``vault / rel`` and return its path.

    Missing parent directories are created on the way.
    """
    target = vault.joinpath(rel)
    parent_dir = target.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    target.write_text("# note\n")
    return target


def test_prunes_orphan_and_cascades_chunks(in_memory_db, tmp_path):
    """A note missing from disk is pruned, and its chunk rows go with it."""
    db = in_memory_db
    vault_dir = tmp_path / "vault"
    vault_dir.mkdir()

    # "keep.md" exists on disk and in the DB; "ghost.md" only in the DB.
    _insert_note(db, "keep.md")
    _insert_note(db, "ghost.md", with_chunk=True)
    _make_file(vault_dir, "keep.md")

    removed = reconcile_deletions(db, vault_dir)
    assert removed == 1

    remaining = {row["path"] for row in db.execute("SELECT path FROM notes")}
    assert remaining == {"keep.md"}

    # The orphan's chunk is expected to have been removed via FK cascade.
    count_row = db.execute(
        "SELECT COUNT(*) FROM chunks WHERE note_path = 'ghost.md'"
    ).fetchone()
    assert count_row[0] == 0


def test_no_orphans_returns_zero(in_memory_db, tmp_path):
    """When every DB note still exists on disk, nothing is pruned."""
    vault_dir = tmp_path / "vault"
    vault_dir.mkdir()
    _insert_note(in_memory_db, "keep.md")
    _make_file(vault_dir, "keep.md")

    removed = reconcile_deletions(in_memory_db, vault_dir)

    assert removed == 0


def test_empty_scan_is_a_no_op_safety_guard(in_memory_db, tmp_path):
    """An empty scan (unmounted/misconfigured vault) must not wipe the index."""
    db = in_memory_db
    empty_vault = tmp_path / "vault"
    empty_vault.mkdir()  # deliberately contains no .md files
    _insert_note(db, "ghost.md")

    removed = reconcile_deletions(db, empty_vault)
    assert removed == 0

    # The row survives even though its file is not on disk.
    remaining = {row["path"] for row in db.execute("SELECT path FROM notes")}
    assert remaining == {"ghost.md"}


def test_excluded_dir_notes_are_pruned(in_memory_db, tmp_path):
    """Rows pointing into an excluded directory are reconciled away.

    The scan skips excluded directories, so a DB row for a file inside one
    looks orphaned even though the file exists on disk. This test pins that
    behaviour down explicitly.
    """
    db = in_memory_db
    vault_dir = tmp_path / "vault"
    vault_dir.mkdir()
    for rel in ("keep.md", "archive/old.md"):
        _make_file(vault_dir, rel)
    for rel in ("keep.md", "archive/old.md"):
        _insert_note(db, rel)

    # With 'archive' excluded, archive/old.md is invisible to the scan.
    removed = reconcile_deletions(db, vault_dir, exclude_dirs=["archive"])
    assert removed == 1

    remaining = {row["path"] for row in db.execute("SELECT path FROM notes")}
    assert remaining == {"keep.md"}
Loading