From 4e40038ef405ef06161f6f8fc11af5fe75868071 Mon Sep 17 00:00:00 2001 From: Raphael Southall Date: Wed, 3 Jun 2026 15:24:37 +0100 Subject: [PATCH] Add folder-path signal to community similarity matrix Embeddings see "infrastructure" as one topic regardless of whether a note lives under work/ or home/, so distinct organisational areas collapsed into a single community (the largest coarse cluster was work+literature+home soup). Blend a fourth channel into _build_similarity_matrix: notes sharing a top-level folder prefix get a uniform similarity bump (PATH_SIGNAL_WEIGHT=0.3, PATH_PREFIX_DEPTH=1). A read-only sweep on a ~490-note vault picked these: depth=1 is the work/home grain (depth=2 reduced cohesion); delta=0.3 lifts folder purity 0.68->0.85 coarse / 0.76->0.93 fine while modularity holds or improves; delta>=0.5 over-purifies and collapses community count. Degrades gracefully: root-level files form no path edges, a vault with no folders yields an all-zero signal, weight 0 disables. Tests cover same-folder bump, cross-folder isolation, root-file exclusion, and the disable path. --- CHANGELOG.md | 4 +++ src/neurostack/attractor.py | 32 ++++++++++++++++++++++- tests/test_attractor.py | 52 +++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e05c017..17e0173 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## Unreleased +### Added + +- **Folder-path signal in community detection.** Embeddings treat "infrastructure" as one topic whether a note lives under `work/` or `home/`, so distinct organisational areas bled into one cluster. `_build_similarity_matrix` now blends a fourth channel — notes sharing a top-level folder prefix get a uniform similarity bump (`PATH_SIGNAL_WEIGHT`, default 0.3; `PATH_PREFIX_DEPTH`, default 1). Measured on a ~490-note vault: folder purity rose 0.68 → 0.85 (coarse) / 0.76 → 0.93 (fine) with modularity held or slightly improved. 
Degrades gracefully — root-level files form no path edges, so a vault with no folders yields an all-zero signal (a vault whose notes all sit under one top-level folder instead gets the same bump on every pair); set the weight to 0 to disable. + ### Fixed - **Community hierarchy check no longer false-alarms on every healthy build.** The build asserted `Q(fine) > Q(coarse)` and logged "sanity check failed" whenever it didn't. But Newman modularity is resolution-dependent and maximised at a single scale, so a finer partition scores *lower* at the implicit γ=1 by construction — the assertion could never hold for a healthy hierarchy. Verified empirically: the fine partition only overtakes coarse at γ≈`BETA_FINE` (≈2.0). Replaced with `_hierarchy_health_warning()`, which checks what issue #33 is actually about — the fine level collapsing into *fewer* communities than coarse — plus a `MIN_HEALTHY_Q` floor for near-random partitions. diff --git a/src/neurostack/attractor.py b/src/neurostack/attractor.py index a1985a6..2cd1862 100644 --- a/src/neurostack/attractor.py +++ b/src/neurostack/attractor.py @@ -39,9 +39,22 @@ # α: semantic similarity (embedding cosine) — structural/content overlap # β_cooc: co-occurrence weight — Hebbian "used together" signal # γ: wiki-link weight — explicit human connections +# δ_path: folder-path weight — notes sharing a top-level folder (e.g. work/ +# vs home/) get a similarity bump. Embeddings see "infrastructure" as one +# topic regardless of work-vs-personal context; the folder layout carries +# that organisational signal, which otherwise never reaches detection. ALPHA_SEMANTIC = 0.6 BETA_COOCCURRENCE = 0.25 GAMMA_WIKILINKS = 0.15 +PATH_SIGNAL_WEIGHT = 0.3 + +# Folder depth the path signal groups on. 1 = top-level (work/, home/, +# research/…), the grain that separates work from personal. Deeper grouping +# (e.g. work/proj-a vs work/proj-b) measurably reduced cohesion. Notes with no +# folder (root-level files) form no path edges; a vault with no folders +# yields an all-zero signal, so this degrades gracefully. 
Set the weight to 0 +# to disable. +PATH_PREFIX_DEPTH = 1 # ── Inverse temperature for attractor convergence ── # Low β → broad themes (coarse), high β → narrow sub-themes (fine) @@ -194,11 +207,28 @@ def _build_similarity_matrix( S_links[src, tgt] = 1.0 S_links[tgt, src] = 1.0 # symmetric - # Blend all three signals + # 4. Folder-path signal: notes sharing a top-level folder prefix get a + # uniform similarity bump, carrying the vault's organisational structure + # (work/ vs home/ vs research/…) that embeddings alone don't capture. + S_path = np.zeros((n, n), dtype=np.float32) + if PATH_SIGNAL_WEIGHT > 0: + prefix_groups: dict[str, list[int]] = defaultdict(list) + for i, p in enumerate(note_paths): + prefix_groups["/".join(p.split("/")[:PATH_PREFIX_DEPTH])].append(i) + for members in prefix_groups.values(): + # A root-level file (no folder) is its own singleton group and + # contributes no edges, which is what we want. + if len(members) < 2: + continue + mi = np.array(members) + S_path[np.ix_(mi, mi)] = 1.0 + + # Blend all four signals S = ( ALPHA_SEMANTIC * S_semantic + BETA_COOCCURRENCE * S_cooc + GAMMA_WIKILINKS * S_links + + PATH_SIGNAL_WEIGHT * S_path ) # Zero out self-similarity (diagonal) — a note shouldn't attract itself diff --git a/tests/test_attractor.py b/tests/test_attractor.py index e898fba..0868f83 100644 --- a/tests/test_attractor.py +++ b/tests/test_attractor.py @@ -8,6 +8,7 @@ from neurostack.attractor import ( ALPHA_SEMANTIC, GAMMA_WIKILINKS, + PATH_SIGNAL_WEIGHT, TOP_K_COARSE, TOP_K_FINE, TOP_K_NEIGHBORS, @@ -680,3 +681,54 @@ def test_weak_structure_warns(self): def test_equal_counts_ok(self): # n_fine == n_coarse is a valid (non-collapsed) refinement boundary. 
assert _hierarchy_health_warning(7, 7, 0.30, 0.20) is None + + +# --------------------------------------------------------------------------- +# Folder-path signal +# --------------------------------------------------------------------------- + +class TestPathSignal: + """Notes sharing a top-level folder get a uniform similarity bump.""" + + def test_same_folder_gets_bump_cross_folder_does_not(self, in_memory_db): + conn = in_memory_db + paths = ["work/a.md", "work/b.md", "home/c.md", "home/d.md"] + for p in paths: + _insert_note(conn, p) + conn.commit() + embs = np.eye(4, 16, dtype=np.float32) # orthogonal -> semantic ~0 + + S = _build_similarity_matrix(conn, paths, embs) + + # same top-level folder -> path weight (semantic floor is ~0 here) + assert S[0, 1] == pytest.approx(PATH_SIGNAL_WEIGHT, abs=1e-4) + assert S[2, 3] == pytest.approx(PATH_SIGNAL_WEIGHT, abs=1e-4) + # different folder -> no path edge + assert S[0, 2] == pytest.approx(0.0, abs=1e-4) + + def test_root_level_file_forms_no_path_edge(self, in_memory_db): + conn = in_memory_db + paths = ["home/a.md", "home/b.md", "top.md"] + for p in paths: + _insert_note(conn, p) + conn.commit() + embs = np.eye(3, 16, dtype=np.float32) + + S = _build_similarity_matrix(conn, paths, embs) + + assert S[0, 1] == pytest.approx(PATH_SIGNAL_WEIGHT, abs=1e-4) # home pair + assert S[0, 2] == pytest.approx(0.0, abs=1e-4) # root file, no group + assert S[1, 2] == pytest.approx(0.0, abs=1e-4) + + def test_weight_zero_disables(self, in_memory_db): + conn = in_memory_db + paths = ["work/a.md", "work/b.md", "work/c.md"] + for p in paths: + _insert_note(conn, p) + conn.commit() + embs = np.eye(3, 16, dtype=np.float32) + + from neurostack import attractor + with patch.object(attractor, "PATH_SIGNAL_WEIGHT", 0.0): + S = _build_similarity_matrix(conn, paths, embs) + assert S[0, 1] == pytest.approx(0.0, abs=1e-4) # same folder but off