Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

### Fixed

- **Community modularity no longer collapses toward random.** In a single-domain vault most note pairs share a moderate baseline embedding cosine (off-diagonal mean ≈0.36 on a ~490-note vault), so the semantic signal was a dense floor connecting nearly everything — Newman modularity sat at Q≈0.06, barely better than a random partition. `_build_similarity_matrix` now prunes that floor with an adaptive threshold (mean + k·std of the off-diagonal distribution, where k is `SEMANTIC_THRESHOLD_K`, default 0.5), zeroing weak semantic edges before community detection. Measured: Q 0.06 → ~0.30 (coarse) / ~0.28 (fine) with stable community counts. The threshold is adaptive, not a fixed cosine, so it self-tunes per vault; set `SEMANTIC_THRESHOLD_K = None` to disable. Diagnosis showed the co-occurrence graph was *not* the cause (entity document-frequency is healthy — 90% of entities appear in a single note), so co-occurrence pruning was a dead end.

- **`neurostack index` now prunes notes deleted from disk.** A full index was upsert-only: it added and updated notes but never removed DB rows for files that no longer existed. The only deletion path was the live watcher's per-event handler, so any file removed while the watcher was down orphaned its rows forever — inflating note counts, polluting co-occurrence and community detection with ghost nodes, and dragging modularity down. A full scan sees the whole vault, so it can now reconcile: anything in the DB but not on disk is pruned (FK cascades drop chunks/summaries/triples; sqlite-vec rows are cleared explicitly). An empty scan is treated as a misconfigured/unmounted vault and skips pruning rather than wiping the index.

### Added
Expand Down
20 changes: 20 additions & 0 deletions src/neurostack/attractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,17 @@
# Minimum shared entities for a note-note edge (co-occurrence signal)
MIN_SHARED = 2

# ── Adaptive semantic edge threshold ──
# In a single-domain vault, most note pairs share a moderate baseline cosine
# (~the off-diagonal mean), so the semantic signal is a dense floor that
# connects nearly everything and collapses community modularity toward random
# (Q≈0.06). We zero semantic edges below mean + k·std of the off-diagonal
# distribution, keeping only meaningfully-similar pairs. Measured effect on a
# ~490-note vault: Q 0.06 → ~0.30 at k=0.5, with stable community counts.
# Adaptive (not a fixed cosine) so it self-tunes to each vault's spread.
#
# This constant is k, the multiplier on the off-diagonal standard deviation.
# The comparison is strict: only entries strictly below the threshold are
# zeroed, so an entry exactly equal to the threshold is kept. Thresholding is
# skipped when the similarity matrix has 2 or fewer rows.
# Set to None to disable thresholding.
SEMANTIC_THRESHOLD_K = 0.5


def _build_similarity_matrix(
conn: sqlite3.Connection,
Expand All @@ -92,6 +103,15 @@ def _build_similarity_matrix(
# Clamp to [0, 1] — negative cosine means unrelated, treat as 0
np.clip(S_semantic, 0.0, 1.0, out=S_semantic)

# Prune the dense low-similarity floor (see SEMANTIC_THRESHOLD_K). Compute
# the threshold on the off-diagonal only — the diagonal is self-similarity
# (1.0) and would skew mean/std. Edges below it are zeroed so only
# meaningfully-similar note pairs feed community detection.
if SEMANTIC_THRESHOLD_K is not None and n > 2:
off = S_semantic[~np.eye(n, dtype=bool)]
threshold = float(off.mean() + SEMANTIC_THRESHOLD_K * off.std())
S_semantic[S_semantic < threshold] = 0.0

# 2. Co-occurrence signal (entity co-occurrence weights → note-note)
S_cooc = np.zeros((n, n), dtype=np.float32)

Expand Down
67 changes: 67 additions & 0 deletions tests/test_attractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,3 +589,70 @@ def test_counts_and_bounds(self):

def test_empty(self):
assert _size_stats({}) == (0, 0, 0, 0.0)


# ---------------------------------------------------------------------------
# Adaptive semantic edge threshold
# ---------------------------------------------------------------------------

class TestSemanticThreshold:
    """The adaptive mean+k*std threshold prunes the dense semantic floor.

    Each test seeds an in-memory DB with bare notes, builds the similarity
    matrix with ``SEMANTIC_THRESHOLD_K`` pinned to an explicit value, and
    inspects individual entries. Pinning the constant in every test keeps
    the assertions independent of whatever the module default happens to be.
    """

    def _two_cluster_embeddings(self, dim=8):
        """Return six unit vectors forming two tight clusters of three.

        Cluster A lies along e0; cluster B is rotated 60° from it in the
        e0/e1 plane. Within-cluster cosine is therefore 1.0 and
        cross-cluster cosine is 0.5. Deterministic, no RNG. ``dim`` must be
        at least 2 because the second axis is written to.
        """
        u = np.zeros(dim, dtype=np.float32)
        u[0] = 1.0
        v = np.zeros(dim, dtype=np.float32)
        v[0] = 0.5
        v[1] = 0.75 ** 0.5  # sin(60°), so v has unit length
        return np.stack([u, u, u, v, v, v])

    def _seed_notes(self, conn, paths):
        """Insert one note row per path and commit."""
        for path in paths:
            _insert_note(conn, path)
        conn.commit()

    def _similarity(self, conn, paths, embs, k):
        """Build the similarity matrix with SEMANTIC_THRESHOLD_K set to *k*.

        The constant is patched on the ``attractor`` module (where
        ``_build_similarity_matrix`` looks it up) and restored on exit.
        """
        from neurostack import attractor

        with patch.object(attractor, "SEMANTIC_THRESHOLD_K", k):
            return _build_similarity_matrix(conn, paths, embs)

    def test_threshold_prunes_cross_cluster_edges(self, in_memory_db):
        conn = in_memory_db
        paths = [f"n{i}.md" for i in range(6)]
        self._seed_notes(conn, paths)
        embs = self._two_cluster_embeddings()

        S_full = self._similarity(conn, paths, embs, k=None)
        S_thr = self._similarity(conn, paths, embs, k=0.5)

        # Thresholding removes edges overall...
        assert np.count_nonzero(S_thr) < np.count_nonzero(S_full)
        # ...specifically the weak cross-cluster ones (cosine 0.5)...
        assert S_full[0, 3] > 0.0
        assert S_thr[0, 3] == 0.0
        # ...while strong within-cluster edges survive.
        assert S_thr[0, 1] > 0.0
        assert S_thr[3, 4] > 0.0

    def test_disabled_keeps_floor(self, in_memory_db):
        conn = in_memory_db
        paths = [f"n{i}.md" for i in range(6)]
        self._seed_notes(conn, paths)
        embs = self._two_cluster_embeddings()

        S = self._similarity(conn, paths, embs, k=None)
        # With thresholding off, the cross-cluster edge is retained.
        assert S[0, 3] > 0.0

    def test_small_vault_not_thresholded(self, in_memory_db):
        """n<=2 skips thresholding (can't estimate a distribution)."""
        conn = in_memory_db
        paths = ["a.md", "b.md"]
        self._seed_notes(conn, paths)
        # Identical embeddings -> cosine 1.0 -> full ALPHA_SEMANTIC retained.
        emb = np.ones(8, dtype=np.float32)
        # Pin the threshold ON so this test covers the small-n path even if
        # the module default is later changed to None.
        S = self._similarity(conn, paths, np.stack([emb, emb]), k=0.5)
        assert S[0, 1] == pytest.approx(ALPHA_SEMANTIC, abs=1e-4)
Loading