diff --git a/CHANGELOG.md b/CHANGELOG.md index fc556cb..5df2218 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ ### Fixed +- **Community modularity no longer collapses toward random.** In a single-domain vault most note pairs share a moderate baseline embedding cosine (off-diagonal mean ≈0.36 on a ~490-note vault), so the semantic signal was a dense floor connecting nearly everything — Newman modularity sat at Q≈0.06, barely better than a random partition. `_build_similarity_matrix` now prunes that floor with an adaptive threshold of mean + k·std of the off-diagonal cosine distribution (k = `SEMANTIC_THRESHOLD_K`, default 0.5), zeroing weak semantic edges before community detection. Measured: Q 0.06 → ~0.30 (coarse) / ~0.28 (fine) with stable community counts. The threshold is adaptive, not a fixed cosine, so it self-tunes per vault; set `SEMANTIC_THRESHOLD_K = None` to disable. Diagnosis showed the co-occurrence graph was *not* the cause (entity document-frequency is healthy — 90% of entities appear in a single note), so co-occurrence pruning was a dead end. + - **`neurostack index` now prunes notes deleted from disk.** A full index was upsert-only: it added and updated notes but never removed DB rows for files that no longer existed. The only deletion path was the live watcher's per-event handler, so any file removed while the watcher was down orphaned its rows forever — inflating note counts, polluting co-occurrence and community detection with ghost nodes, and dragging modularity down. A full scan sees the whole vault, so it can now reconcile: anything in the DB but not on disk is pruned (FK cascades drop chunks/summaries/triples; sqlite-vec rows are cleared explicitly). An empty scan is treated as a misconfigured/unmounted vault and skips pruning rather than wiping the index. 
### Added diff --git a/src/neurostack/attractor.py b/src/neurostack/attractor.py index 00d635a..652d323 100644 --- a/src/neurostack/attractor.py +++ b/src/neurostack/attractor.py @@ -69,6 +69,17 @@ # Minimum shared entities for a note-note edge (co-occurrence signal) MIN_SHARED = 2 +# ── Adaptive semantic edge threshold ── +# In a single-domain vault, most note pairs share a moderate baseline cosine +# (~the off-diagonal mean), so the semantic signal is a dense floor that +# connects nearly everything and collapses community modularity toward random +# (Q≈0.06). We zero semantic edges below mean + k·std of the off-diagonal +# distribution, keeping only meaningfully-similar pairs. Measured effect on a +# ~490-note vault: Q 0.06 → ~0.30 at k=0.5, with stable community counts. +# Adaptive (not a fixed cosine) so it self-tunes to each vault's spread. +# Set to None to disable thresholding. +SEMANTIC_THRESHOLD_K = 0.5 + def _build_similarity_matrix( conn: sqlite3.Connection, @@ -92,6 +103,15 @@ def _build_similarity_matrix( # Clamp to [0, 1] — negative cosine means unrelated, treat as 0 np.clip(S_semantic, 0.0, 1.0, out=S_semantic) + # Prune the dense low-similarity floor (see SEMANTIC_THRESHOLD_K). Compute + # the threshold on the off-diagonal only — the diagonal is self-similarity + # (1.0) and would skew mean/std. Edges below the threshold are zeroed so only + # meaningfully-similar note pairs feed community detection. + if SEMANTIC_THRESHOLD_K is not None and n > 2: + off = S_semantic[~np.eye(n, dtype=bool)] + threshold = float(off.mean() + SEMANTIC_THRESHOLD_K * off.std()) + S_semantic[S_semantic < threshold] = 0.0 + # 2. 
Co-occurrence signal (entity co-occurrence weights → note-note) S_cooc = np.zeros((n, n), dtype=np.float32) diff --git a/tests/test_attractor.py b/tests/test_attractor.py index 2428be9..23c7e44 100644 --- a/tests/test_attractor.py +++ b/tests/test_attractor.py @@ -589,3 +589,70 @@ def test_counts_and_bounds(self): def test_empty(self): assert _size_stats({}) == (0, 0, 0, 0.0) + + +# --------------------------------------------------------------------------- +# Adaptive semantic edge threshold +# --------------------------------------------------------------------------- + +class TestSemanticThreshold: + """The adaptive mean+k*std threshold prunes the dense semantic floor.""" + + def _two_cluster_embeddings(self, dim=8): + # Cluster A along e0, cluster B at 60° (cosine 0.5) — within-cluster + # cosine 1.0, cross-cluster 0.5. Deterministic, no RNG. + u = np.zeros(dim, dtype=np.float32) + u[0] = 1.0 + v = np.zeros(dim, dtype=np.float32) + v[0] = 0.5 + v[1] = 0.75 ** 0.5 + return np.stack([u, u, u, v, v, v]) + + def test_threshold_prunes_cross_cluster_edges(self, in_memory_db): + conn = in_memory_db + paths = [f"n{i}.md" for i in range(6)] + for p in paths: + _insert_note(conn, p) + conn.commit() + embs = self._two_cluster_embeddings() + + from neurostack import attractor + with patch.object(attractor, "SEMANTIC_THRESHOLD_K", None): + S_full = _build_similarity_matrix(conn, paths, embs) + with patch.object(attractor, "SEMANTIC_THRESHOLD_K", 0.5): + S_thr = _build_similarity_matrix(conn, paths, embs) + + # Thresholding removes edges overall... + assert np.count_nonzero(S_thr) < np.count_nonzero(S_full) + # ...specifically the weak cross-cluster ones (cosine 0.5)... + assert S_full[0, 3] > 0.0 + assert S_thr[0, 3] == 0.0 + # ...while strong within-cluster edges survive. 
+ assert S_thr[0, 1] > 0.0 + assert S_thr[3, 4] > 0.0 + + def test_disabled_keeps_floor(self, in_memory_db): + conn = in_memory_db + paths = [f"n{i}.md" for i in range(6)] + for p in paths: + _insert_note(conn, p) + conn.commit() + embs = self._two_cluster_embeddings() + + from neurostack import attractor + with patch.object(attractor, "SEMANTIC_THRESHOLD_K", None): + S = _build_similarity_matrix(conn, paths, embs) + # With thresholding off, the cross-cluster edge is retained. + assert S[0, 3] > 0.0 + + def test_small_vault_not_thresholded(self, in_memory_db): + """n<=2 skips thresholding (can't estimate a distribution).""" + conn = in_memory_db + paths = ["a.md", "b.md"] + for p in paths: + _insert_note(conn, p) + conn.commit() + # Identical embeddings -> cosine 1.0 -> full ALPHA_SEMANTIC retained. + emb = np.ones(8, dtype=np.float32) + S = _build_similarity_matrix(conn, paths, np.stack([emb, emb])) + assert S[0, 1] == pytest.approx(ALPHA_SEMANTIC, abs=1e-4)