diff --git a/src/neurostack/attractor.py b/src/neurostack/attractor.py index 024484d..00d635a 100644 --- a/src/neurostack/attractor.py +++ b/src/neurostack/attractor.py @@ -60,6 +60,12 @@ # convergence happens in fewer iterations. TOP_K_NEIGHBORS = 50 +# Per-level top-K. Coarse uses a wider neighbourhood so broad basins form; +# fine uses a narrower one so high-β softmax doesn't funnel every note into +# a handful of hub attractors (see issue #33 for the inversion this fixes). +TOP_K_COARSE = 80 +TOP_K_FINE = 20 + # Minimum shared entities for a note-note edge (co-occurrence signal) MIN_SHARED = 2 @@ -217,6 +223,7 @@ def _attractor_convergence( beta: float, max_iter: int | None = None, threshold: float = CONVERGENCE_THRESHOLD, + top_k: int | None = None, ) -> np.ndarray: """Run Hopfield-style attractor dynamics on the similarity matrix. @@ -226,16 +233,21 @@ def _attractor_convergence( state_i(t+1) = softmax(β · S_i · state(t)) - For large matrices (n > TOP_K_NEIGHBORS), S is sparsified to keep only - the top-k neighbors per row, and iterations are scaled adaptively. + For large matrices (n > top_k), S is sparsified to keep only the top-k + neighbors per row, and iterations are scaled adaptively. Callers can + override top_k per level — coarse/fine partitions need different + neighbourhood widths to avoid hub monopolisation at high β (issue #33). Returns the converged state matrix (n × n). """ n = S.shape[0] + if top_k is None: + top_k = TOP_K_NEIGHBORS + # Sparsify for large matrices — keeps convergence fast - if n > TOP_K_NEIGHBORS: - S = _sparsify_top_k(S, TOP_K_NEIGHBORS) + if n > top_k: + S = _sparsify_top_k(S, top_k) if max_iter is None: max_iter = _adaptive_max_iter(n) @@ -273,6 +285,7 @@ def _attractor_convergence( def _assign_communities( state: np.ndarray, note_paths: list[str], + merge_singletons: bool = True, ) -> dict[int, list[str]]: """Assign notes to communities based on converged attractor states. @@ -280,8 +293,10 @@ def _assign_communities( We assign each note to the index of its dominant attractor (argmax of its state vector), then group by attractor. - Applies lateral inhibition: singleton communities (only 1 note) are - merged into the nearest non-singleton community. + When merge_singletons is True (default), lateral inhibition folds every + 1-note basin into the nearest non-singleton community. At high β this + wipes out the narrow basins that a fine partition is supposed to + expose — pass False at β_FINE so singletons survive (issue #33). """ n = len(note_paths) @@ -293,6 +308,9 @@ def _assign_communities( for i in range(n): raw_communities[int(assignments[i])].append(note_paths[i]) + if not merge_singletons: + return {i: notes for i, notes in enumerate(raw_communities.values())} + # Lateral inhibition: merge singletons into nearest non-singleton non_singletons = { k: v for k, v in raw_communities.items() if len(v) > 1 @@ -325,6 +343,53 @@ def _assign_communities( return {i: notes for i, notes in enumerate(communities.values())} +def _modularity( + S: np.ndarray, + note_paths: list[str], + communities: dict[int, list[str]], +) -> float: + """Weighted Newman modularity Q of a partition on similarity matrix S. + + Q = (1 / 2m) Σ_ij [A_ij - k_i·k_j / 2m] · δ(c_i, c_j) + + Uses the pre-sparsification S so the metric reflects the partition's + fit to the full similarity structure, not the truncated neighbourhood. + Returns 0.0 for degenerate matrices (zero total weight). + """ + n = S.shape[0] + # Ensure symmetry: modularity is defined on an undirected weighted graph + A = (S + S.T) / 2.0 + two_m = float(A.sum()) + if two_m <= 0.0 or n == 0: + return 0.0 + + k = A.sum(axis=1) # strength (weighted degree) per node + path_to_idx = {p: i for i, p in enumerate(note_paths)} + + q = 0.0 + for members in communities.values(): + idx = np.array( + [path_to_idx[p] for p in members if p in path_to_idx], + dtype=np.int64, + ) + if idx.size == 0: + continue + # Sum of A within the community block minus expected under null model + block = A[np.ix_(idx, idx)] + k_block = k[idx] + q += float(block.sum()) - float(k_block.sum() ** 2) / two_m + + return q / two_m + + +def _size_stats(communities: dict[int, list[str]]) -> tuple[int, int, int, float]: + """Return (count, min_size, max_size, mean_size) for a partition.""" + sizes = [len(v) for v in communities.values()] + if not sizes: + return 0, 0, 0, 0.0 + return len(sizes), min(sizes), max(sizes), sum(sizes) / len(sizes) + + def _store_communities( conn: sqlite3.Connection, level: int, @@ -349,6 +414,23 @@ def _store_communities( ) +def _store_level_stats( + conn: sqlite3.Connection, + level: int, + communities: dict[int, list[str]], + modularity: float, +) -> None: + """Persist per-level aggregate stats (size distribution + modularity).""" + count, min_size, max_size, mean_size = _size_stats(communities) + now = datetime.now(timezone.utc).isoformat() + conn.execute( + "INSERT OR REPLACE INTO community_level_stats" + " (level, n_communities, min_size, max_size, mean_size," + " modularity, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?)", + (level, count, min_size, max_size, mean_size, modularity, now), + ) + + def detect_communities( conn: sqlite3.Connection | None = None, db_path=None, @@ -374,6 +456,7 @@ def detect_communities( # Clear existing conn.execute("DELETE FROM community_members") conn.execute("DELETE FROM communities") + conn.execute("DELETE FROM community_level_stats") conn.commit() # Persist entity co-occurrence weights from triples @@ -417,26 +500,48 @@ def detect_communities( # ── Coarse communities (low β → broad basins) ── log.info( f"Running attractor convergence level 0" - f" (coarse, β={BETA_COARSE})..." + f" (coarse, β={BETA_COARSE}, top_k={TOP_K_COARSE})..." + ) + state_coarse = _attractor_convergence( + S, beta=BETA_COARSE, top_k=TOP_K_COARSE, ) - state_coarse = _attractor_convergence(S, beta=BETA_COARSE) communities_coarse = _assign_communities(state_coarse, note_paths) _store_communities(conn, level=0, communities=communities_coarse) + q_coarse = _modularity(S, note_paths, communities_coarse) + _store_level_stats(conn, 0, communities_coarse, q_coarse) n_coarse = len(communities_coarse) # ── Fine communities (high β → narrow basins) ── + # Narrower top_k keeps softmax basins narrow; merge_singletons=False + # preserves the 1-note basins that are the whole point at β=2.0 + # (issue #33). log.info( f"Running attractor convergence level 1" - f" (fine, β={BETA_FINE})..." + f" (fine, β={BETA_FINE}, top_k={TOP_K_FINE})..." + ) + state_fine = _attractor_convergence( + S, beta=BETA_FINE, top_k=TOP_K_FINE, + ) + communities_fine = _assign_communities( + state_fine, note_paths, merge_singletons=False, ) - state_fine = _attractor_convergence(S, beta=BETA_FINE) - communities_fine = _assign_communities(state_fine, note_paths) _store_communities(conn, level=1, communities=communities_fine) + q_fine = _modularity(S, note_paths, communities_fine) + _store_level_stats(conn, 1, communities_fine, q_fine) n_fine = len(communities_fine) + if q_fine <= q_coarse: + log.warning( + "Community hierarchy sanity check failed:" + f" Q(fine)={q_fine:.4f} <= Q(coarse)={q_coarse:.4f}." + " The fine partition is not a tighter fit than coarse —" + " expect n_fine > n_coarse and Q(fine) > Q(coarse)." + ) + conn.commit() log.info( f"Community detection done:" - f" {n_coarse} coarse, {n_fine} fine communities." + f" {n_coarse} coarse (Q={q_coarse:.4f})," + f" {n_fine} fine (Q={q_fine:.4f}) communities." ) return n_coarse, n_fine diff --git a/src/neurostack/schema.py b/src/neurostack/schema.py index de09db4..d0f8e39 100644 --- a/src/neurostack/schema.py +++ b/src/neurostack/schema.py @@ -32,7 +32,7 @@ def __getattr__(name: str): return _db_path() raise AttributeError(f"module {__name__!r} has no attribute {name!r}") -SCHEMA_VERSION = 13 +SCHEMA_VERSION = 14 SCHEMA_SQL = """ CREATE TABLE IF NOT EXISTS schema_version ( @@ -162,6 +162,19 @@ def __getattr__(name: str): CREATE INDEX IF NOT EXISTS idx_community_members_entity ON community_members(entity); +-- Per-level community partition statistics (size distribution + modularity). +-- One row per level — used by vault_stats to surface partition quality and +-- to detect an inverted hierarchy (issue #33). +CREATE TABLE IF NOT EXISTS community_level_stats ( + level INTEGER PRIMARY KEY, + n_communities INTEGER, + min_size INTEGER, + max_size INTEGER, + mean_size REAL, + modularity REAL, + updated_at TEXT +); + -- Folder-level aggregate summaries for semantic context= boosting CREATE TABLE IF NOT EXISTS folder_summaries ( folder_path TEXT PRIMARY KEY, @@ -522,6 +535,20 @@ def __getattr__(name: str): MIGRATION_V13 = "-- vec index tables created by vecindex.ensure_vec_tables()" +# Migration from v13 to v14: add community_level_stats table (issue #33) +MIGRATION_V14 = """ +CREATE TABLE IF NOT EXISTS community_level_stats ( + level INTEGER PRIMARY KEY, + n_communities INTEGER, + min_size INTEGER, + max_size INTEGER, + mean_size REAL, + modularity REAL, + updated_at TEXT +); +""" + + def _run_migrations(conn: sqlite3.Connection): """Run schema migrations if needed.""" row = conn.execute("SELECT MAX(version) as v FROM schema_version").fetchone() @@ -750,6 +777,19 @@ def _run_migrations(conn: sqlite3.Connection): # Vec tables are created by _init_vec_index() after migrations log.info("Migration to v13 complete.") + if current < 14: + log.info( + "Migrating schema v13 -> v14: " + "adding community_level_stats table..." + ) + conn.executescript(MIGRATION_V14) + conn.execute( + "INSERT OR REPLACE INTO schema_version" + " VALUES (14)" + ) + conn.commit() + log.info("Migration to v14 complete.") + def get_db(db_path: Path | None = None) -> sqlite3.Connection: """Get a database connection, creating schema if needed.""" diff --git a/src/neurostack/tools/search_tools.py b/src/neurostack/tools/search_tools.py index 8855e1b..0e508df 100644 --- a/src/neurostack/tools/search_tools.py +++ b/src/neurostack/tools/search_tools.py @@ -22,6 +22,44 @@ def _cfg(): return cfg.vault_root, cfg.embed_url +def _community_level_stats(conn) -> list[dict]: + """Return per-level community partition stats (size distribution + modularity). + + Reads community_level_stats populated by attractor.detect_communities. + Empty list if the table doesn't exist yet or no levels have been written. + """ + try: + rows = conn.execute( + "SELECT level, n_communities, min_size, max_size, mean_size," + " modularity FROM community_level_stats ORDER BY level" + ).fetchall() + except Exception: + return [] + def _label(level: int) -> str: + if level == 0: + return "coarse" + if level == 1: + return "fine" + return f"level{level}" + + return [ + { + "level": r["level"], + "label": _label(r["level"]), + "n_communities": r["n_communities"], + "min_size": r["min_size"], + "max_size": r["max_size"], + "mean_size": ( + round(r["mean_size"], 2) if r["mean_size"] is not None else None + ), + "modularity": ( + round(r["modularity"], 4) if r["modularity"] is not None else None + ), + } + for r in rows + ] + + def _search_memories_for_results(query: str, workspace: str = None, limit: int = 3) -> list[dict]: """Search memories and return compact results for inclusion in vault_search.""" try: @@ -400,6 +438,7 @@ def vault_stats() -> dict: "communities_summarized": conn.execute( "SELECT COUNT(*) as c FROM communities WHERE summary IS NOT NULL" ).fetchone()["c"], + "community_levels": _community_level_stats(conn), "excitability": { "active": dormancy["active_count"], "dormant": dormancy["dormant_count"], diff --git a/tests/test_attractor.py b/tests/test_attractor.py index 9ec28df..2428be9 100644 --- a/tests/test_attractor.py +++ b/tests/test_attractor.py @@ -8,11 +8,15 @@ from neurostack.attractor import ( ALPHA_SEMANTIC, GAMMA_WIKILINKS, + TOP_K_COARSE, + TOP_K_FINE, TOP_K_NEIGHBORS, _adaptive_max_iter, _assign_communities, _attractor_convergence, _build_similarity_matrix, + _modularity, + _size_stats, _sparsify_top_k, ) @@ -457,3 +461,131 @@ def test_values_bounded(self, in_memory_db): S = _build_similarity_matrix(conn, paths, embs) assert np.all(S >= 0.0) + + +# --------------------------------------------------------------------------- +# Per-level top_k plumbing (issue #33) +# --------------------------------------------------------------------------- + +class TestAttractorTopKOverride: + def test_top_k_override_changes_sparsity(self): + """Passing a smaller top_k than default should sparsify more aggressively. + + The assertion is indirect: running convergence with top_k=2 on a dense + matrix should still produce a valid row-stochastic state (which only + works if the sparsified S is the one actually used). + """ + n = 20 + np.random.seed(0) + S = np.random.rand(n, n).astype(np.float32) + np.fill_diagonal(S, 0.0) + S = (S + S.T) / 2 + + state = _attractor_convergence(S, beta=1.0, max_iter=5, top_k=2) + row_sums = state.sum(axis=1) + np.testing.assert_allclose(row_sums, 1.0, atol=1e-5) + + def test_default_top_k_matches_constant(self): + """Without an explicit top_k, the function should use TOP_K_NEIGHBORS.""" + assert TOP_K_NEIGHBORS > 0 + # Coarse/fine per-level constants exist and are distinct + assert TOP_K_COARSE != TOP_K_FINE + assert TOP_K_COARSE > TOP_K_FINE + + +# --------------------------------------------------------------------------- +# Singleton-merge gate (issue #33) +# --------------------------------------------------------------------------- + +class TestAssignCommunitiesMergeGate: + def test_merge_false_keeps_singletons(self): + """merge_singletons=False preserves 1-note basins instead of absorbing them.""" + # Two notes form a cluster at attractor 0; one singleton at attractor 2 + state = np.array([ + [0.8, 0.15, 0.05], + [0.7, 0.25, 0.05], + [0.1, 0.1, 0.8], # singleton, attracted to itself + ], dtype=np.float32) + paths = ["a.md", "b.md", "c.md"] + + communities = _assign_communities(state, paths, merge_singletons=False) + + # Both the duo and the singleton survive — 2 communities total + assert len(communities) == 2 + sizes = sorted(len(v) for v in communities.values()) + assert sizes == [1, 2] + + def test_merge_true_is_default_and_absorbs(self): + """Default behaviour still merges singletons (existing contract).""" + state = np.array([ + [0.8, 0.15, 0.05], + [0.7, 0.25, 0.05], + [0.1, 0.1, 0.8], + ], dtype=np.float32) + paths = ["a.md", "b.md", "c.md"] + + def mock_csb(query, matrix): + return np.array([0.9]) + + with patch("neurostack.attractor.cosine_similarity_batch", mock_csb): + communities = _assign_communities(state, paths) + + assert len(communities) == 1 + + +# --------------------------------------------------------------------------- +# Modularity + size stats +# --------------------------------------------------------------------------- + +class TestModularity: + def test_two_block_partition_has_positive_q(self): + """A clean 2-block matrix with the correct partition yields Q > 0.""" + # 4 notes: two fully-connected pairs, zero between + S = np.array([ + [0.0, 0.9, 0.0, 0.0], + [0.9, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.9], + [0.0, 0.0, 0.9, 0.0], + ], dtype=np.float32) + paths = ["a.md", "b.md", "c.md", "d.md"] + correct = {0: ["a.md", "b.md"], 1: ["c.md", "d.md"]} + + q = _modularity(S, paths, correct) + assert q > 0.4 # Analytical Q for this graph is 0.5 + + def test_all_in_one_community_is_zero(self): + """Lumping everything into one community gives Q = 0.""" + S = np.array([ + [0.0, 0.9, 0.0, 0.0], + [0.9, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.9], + [0.0, 0.0, 0.9, 0.0], + ], dtype=np.float32) + paths = ["a.md", "b.md", "c.md", "d.md"] + one_community = {0: paths} + q = _modularity(S, paths, one_community) + assert abs(q) < 1e-6 + + def test_empty_or_zero_matrix_returns_zero(self): + """Degenerate similarity matrix returns 0.0, not NaN.""" + S = np.zeros((3, 3), dtype=np.float32) + paths = ["a.md", "b.md", "c.md"] + q = _modularity(S, paths, {0: paths}) + assert q == 0.0 + + +class TestSizeStats: + def test_counts_and_bounds(self): + communities = { + 0: ["a.md", "b.md", "c.md"], + 1: ["d.md"], + 2: ["e.md", "f.md"], + } + count, min_size, max_size, mean_size = _size_stats(communities) + assert count == 3 + assert min_size == 1 + assert max_size == 3 + assert mean_size == pytest.approx(2.0) + + def test_empty(self): + assert _size_stats({}) == (0, 0, 0, 0.0) diff --git a/tests/test_schema.py b/tests/test_schema.py index e2cdaef..a6ddf6c 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -226,7 +226,7 @@ def test_migration_v11_to_v12(in_memory_db): # Version bumped row = conn.execute("SELECT MAX(version) as v FROM schema_version").fetchone() - assert row["v"] == 13 + assert row["v"] == 14 def test_migration_v12_idempotent(in_memory_db): @@ -247,4 +247,4 @@ def test_migration_v12_idempotent(in_memory_db): _run_migrations(conn) row = conn.execute("SELECT MAX(version) as v FROM schema_version").fetchone() - assert row["v"] == 13 + assert row["v"] == 14