raphasouthall · raphasouthall · Jun 3, 2026 · Jun 3, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 ### Fixed
 
+- **Community modularity no longer collapses toward random.** In a single-domain vault most note pairs share a moderate baseline embedding cosine (off-diagonal mean ≈0.36 on a ~490-note vault), so the semantic signal was a dense floor connecting nearly everything — Newman modularity sat at Q≈0.06, barely better than a random partition. `_build_similarity_matrix` now prunes that floor with an adaptive threshold (mean + k·std of the off-diagonal distribution, where k is `SEMANTIC_THRESHOLD_K`, default 0.5), zeroing weak semantic edges before community detection. Measured: Q 0.06 → ~0.30 (coarse) / ~0.28 (fine) with stable community counts. The threshold is adaptive, not a fixed cosine, so it self-tunes per vault; set `SEMANTIC_THRESHOLD_K = None` to disable. Diagnosis showed the co-occurrence graph was *not* the cause (entity document-frequency is healthy — 90% of entities appear in a single note), so co-occurrence pruning was a dead end.
+
 - **`neurostack index` now prunes notes deleted from disk.** A full index was upsert-only: it added and updated notes but never removed DB rows for files that no longer existed. The only deletion path was the live watcher's per-event handler, so any file removed while the watcher was down orphaned its rows forever — inflating note counts, polluting co-occurrence and community detection with ghost nodes, and dragging modularity down. A full scan sees the whole vault, so it can now reconcile: anything in the DB but not on disk is pruned (FK cascades drop chunks/summaries/triples; sqlite-vec rows are cleared explicitly). An empty scan is treated as a misconfigured/unmounted vault and skips pruning rather than wiping the index.
 
 ### Added

diff --git a/src/neurostack/attractor.py b/src/neurostack/attractor.py
@@ -69,6 +69,17 @@
 # Minimum shared entities for a note-note edge (co-occurrence signal)
 MIN_SHARED = 2
 
+# ── Adaptive semantic edge threshold ──
+# In a single-domain vault, most note pairs share a moderate baseline cosine
+# (~the off-diagonal mean), so the semantic signal is a dense floor that
+# connects nearly everything and collapses community modularity toward random
+# (Q≈0.06). We zero semantic edges below mean + k·std of the off-diagonal
+# distribution, keeping only meaningfully-similar pairs. Measured effect on a
+# ~490-note vault: Q 0.06 → ~0.30 at k=0.5, with stable community counts.
+# Adaptive (not a fixed cosine) so it self-tunes to each vault's spread.
+# Set to None to disable thresholding.
+SEMANTIC_THRESHOLD_K = 0.5
+
 
 def _build_similarity_matrix(
     conn: sqlite3.Connection,
@@ -92,6 +103,15 @@ def _build_similarity_matrix(
     # Clamp to [0, 1] — negative cosine means unrelated, treat as 0
     np.clip(S_semantic, 0.0, 1.0, out=S_semantic)
 
+    # Prune the dense low-similarity floor (see SEMANTIC_THRESHOLD_K). Compute
+    # the threshold on the off-diagonal only — the diagonal is self-similarity
+    # (1.0) and would skew mean/std. Entries below the threshold are zeroed so
+    # only meaningfully-similar note pairs feed community detection.
+    if SEMANTIC_THRESHOLD_K is not None and n > 2:
+        off = S_semantic[~np.eye(n, dtype=bool)]
+        threshold = float(off.mean() + SEMANTIC_THRESHOLD_K * off.std())
+        S_semantic[S_semantic < threshold] = 0.0
+
     # 2. Co-occurrence signal (entity co-occurrence weights → note-note)
     S_cooc = np.zeros((n, n), dtype=np.float32)
 

diff --git a/tests/test_attractor.py b/tests/test_attractor.py
@@ -589,3 +589,70 @@ def test_counts_and_bounds(self):
 
     def test_empty(self):
         assert _size_stats({}) == (0, 0, 0, 0.0)
+
+
+# ---------------------------------------------------------------------------
+# Adaptive semantic edge threshold
+# ---------------------------------------------------------------------------
+
+class TestSemanticThreshold:
+    """The adaptive mean+k*std threshold prunes the dense semantic floor."""
+
+    def _two_cluster_embeddings(self, dim=8):
+        # Cluster A along e0, cluster B at 60° (cosine 0.5) — within-cluster
+        # cosine 1.0, cross-cluster 0.5. Deterministic, no RNG.
+        u = np.zeros(dim, dtype=np.float32)
+        u[0] = 1.0
+        v = np.zeros(dim, dtype=np.float32)
+        v[0] = 0.5
+        v[1] = 0.75 ** 0.5
+        return np.stack([u, u, u, v, v, v])
+
+    def test_threshold_prunes_cross_cluster_edges(self, in_memory_db):
+        conn = in_memory_db
+        paths = [f"n{i}.md" for i in range(6)]
+        for p in paths:
+            _insert_note(conn, p)
+        conn.commit()
+        embs = self._two_cluster_embeddings()
+
+        from neurostack import attractor
+        with patch.object(attractor, "SEMANTIC_THRESHOLD_K", None):
+            S_full = _build_similarity_matrix(conn, paths, embs)
+        with patch.object(attractor, "SEMANTIC_THRESHOLD_K", 0.5):
+            S_thr = _build_similarity_matrix(conn, paths, embs)
+
+        # Thresholding removes edges overall...
+        assert np.count_nonzero(S_thr) < np.count_nonzero(S_full)
+        # ...specifically the weak cross-cluster ones (cosine 0.5)...
+        assert S_full[0, 3] > 0.0
+        assert S_thr[0, 3] == 0.0
+        # ...while strong within-cluster edges survive.
+        assert S_thr[0, 1] > 0.0
+        assert S_thr[3, 4] > 0.0
+
+    def test_disabled_keeps_floor(self, in_memory_db):
+        conn = in_memory_db
+        paths = [f"n{i}.md" for i in range(6)]
+        for p in paths:
+            _insert_note(conn, p)
+        conn.commit()
+        embs = self._two_cluster_embeddings()
+
+        from neurostack import attractor
+        with patch.object(attractor, "SEMANTIC_THRESHOLD_K", None):
+            S = _build_similarity_matrix(conn, paths, embs)
+        # With thresholding off, the cross-cluster edge is retained.
+        assert S[0, 3] > 0.0
+
+    def test_small_vault_not_thresholded(self, in_memory_db):
+        """n<=2 skips thresholding (can't estimate a distribution)."""
+        conn = in_memory_db
+        paths = ["a.md", "b.md"]
+        for p in paths:
+            _insert_note(conn, p)
+        conn.commit()
+        # Identical embeddings -> cosine 1.0 -> full ALPHA_SEMANTIC retained.
+        emb = np.ones(8, dtype=np.float32)
+        S = _build_similarity_matrix(conn, paths, np.stack([emb, emb]))
+        assert S[0, 1] == pytest.approx(ALPHA_SEMANTIC, abs=1e-4)