From 110f43605f83cca2578545b1b12a07d0f8b47227 Mon Sep 17 00:00:00 2001 From: Test User Date: Sun, 3 May 2026 14:32:44 +0300 Subject: [PATCH 1/6] fix: filter audit recursion from default search --- .../Sources/BrainBar/BrainDatabase.swift | 33 +++++++--- brain-bar/Sources/BrainBar/MCPRouter.swift | 7 ++- .../Tests/BrainBarTests/MCPRouterTests.swift | 55 +++++++++++++++++ src/brainlayer/mcp/__init__.py | 16 ++++- src/brainlayer/mcp/search_handler.py | 8 +++ src/brainlayer/search_repo.py | 55 ++++++++++++++++- tests/test_audit_recursion_filter.py | 60 +++++++++++++++++++ 7 files changed, 222 insertions(+), 12 deletions(-) create mode 100644 tests/test_audit_recursion_filter.py diff --git a/brain-bar/Sources/BrainBar/BrainDatabase.swift b/brain-bar/Sources/BrainBar/BrainDatabase.swift index df191af1..3bd3a8d5 100644 --- a/brain-bar/Sources/BrainBar/BrainDatabase.swift +++ b/brain-bar/Sources/BrainBar/BrainDatabase.swift @@ -532,7 +532,8 @@ final class BrainDatabase: @unchecked Sendable { tag: String? = nil, importanceMin: Double? = nil, subscriberID: String? = nil, - unreadOnly: Bool = false + unreadOnly: Bool = false, + includeAudit: Bool = false ) throws -> [[String: Any]] { guard db != nil else { throw DBError.notOpen } let trimmedQuery = query.trimmingCharacters(in: .whitespacesAndNewlines) @@ -551,7 +552,8 @@ final class BrainDatabase: @unchecked Sendable { project: project, source: source, tag: tag, - importanceMin: importanceMin + importanceMin: importanceMin, + includeAudit: includeAudit ) { return exact } @@ -572,7 +574,8 @@ final class BrainDatabase: @unchecked Sendable { importanceMin: importanceMin, subscribedTags: subscribedTags, ackFloor: ackFloor, - unreadOnly: unreadOnly + unreadOnly: unreadOnly, + includeAudit: includeAudit ) maxRowID = max(maxRowID, searchResult.maxRowID) appendDeduped(searchResult.rows, to: &results, seenChunkIDs: &seenChunkIDs, limit: limit) @@ -593,7 +596,8 @@ final class BrainDatabase: @unchecked Sendable { importanceMin: importanceMin, subscribedTags: subscribedTags, ackFloor: ackFloor, - unreadOnly: unreadOnly + unreadOnly: unreadOnly, + includeAudit: includeAudit ) maxRowID = max(maxRowID, searchResult.maxRowID) appendDeduped(searchResult.rows, to: &results, seenChunkIDs: &seenChunkIDs, limit: limit) @@ -618,7 +622,8 @@ final class BrainDatabase: @unchecked Sendable { importanceMin: Double?, subscribedTags: [String], ackFloor: Int64, - unreadOnly: Bool + unreadOnly: Bool, + includeAudit: Bool ) throws -> (rows: [[String: Any]], maxRowID: Int64) { guard let db else { throw DBError.notOpen } let allowedTables = ["chunks_fts", "chunks_fts_trigram"] @@ -636,6 +641,7 @@ final class BrainDatabase: @unchecked Sendable { let tagTerms = Array(repeating: "c.tags LIKE ?", count: subscribedTags.count).joined(separator: " OR ") conditions.append("(\(tagTerms))") } + if !includeAudit { conditions.append(Self.auditRecursionTagExclusionSQL(alias: "c")) } if importanceMin != nil { conditions.append("c.importance >= ?") } if unreadOnly { conditions.append("c.rowid > ?") } @@ -907,7 +913,8 @@ final class BrainDatabase: @unchecked Sendable { tag: String? = nil, importanceMin: Double? = nil, subscriberID: String? = nil, - unreadOnly: Bool = false + unreadOnly: Bool = false, + includeAudit: Bool = false ) throws -> [SearchQueryCandidate] { guard let db else { throw DBError.notOpen } let sanitized = sanitizeFTS5Query(query) @@ -930,6 +937,7 @@ final class BrainDatabase: @unchecked Sendable { let tagTerms = Array(repeating: "c.tags LIKE ?", count: subscribedTags.count).joined(separator: " OR ") conditions.append("(\(tagTerms))") } + if !includeAudit { conditions.append(Self.auditRecursionTagExclusionSQL(alias: "c")) } if importanceMin != nil { conditions.append("c.importance >= ?") } if unreadOnly { conditions.append("c.rowid > ?") } @@ -1487,6 +1495,15 @@ final class BrainDatabase: @unchecked Sendable { """ } + private static func auditRecursionTagExclusionSQL(alias: String) -> String { + let tags = "LOWER(COALESCE(\(alias).tags, ''))" + return """ + \(tags) NOT LIKE '%audit%' + AND \(tags) NOT LIKE '%agent=auditor%' + AND \(tags) NOT GLOB '*r0[0-9]*' + """ + } + private func databaseSizeBytes() -> Int64 { let candidates = [path, "\(path)-wal", "\(path)-shm"] return candidates.reduce(into: Int64(0)) { total, candidate in @@ -1813,7 +1830,8 @@ final class BrainDatabase: @unchecked Sendable { project: String?, source: String?, tag: String?, - importanceMin: Double? + importanceMin: Double?, + includeAudit: Bool ) throws -> [[String: Any]]? { guard let db else { throw DBError.notOpen } guard limit > 0, !query.contains(where: { $0.isWhitespace }), query.contains("-") else { @@ -1825,6 +1843,7 @@ final class BrainDatabase: @unchecked Sendable { if project != nil { conditions.append("c.project = ?") } if sourceFilter != nil { conditions.append("c.source = ?") } if tag != nil { conditions.append("c.tags LIKE ?") } + if !includeAudit { conditions.append(Self.auditRecursionTagExclusionSQL(alias: "c")) } if importanceMin != nil { conditions.append("c.importance >= ?") } let sql = """ diff --git a/brain-bar/Sources/BrainBar/MCPRouter.swift b/brain-bar/Sources/BrainBar/MCPRouter.swift index 208b3a99..494cc535 100644 --- a/brain-bar/Sources/BrainBar/MCPRouter.swift +++ b/brain-bar/Sources/BrainBar/MCPRouter.swift @@ -225,6 +225,7 @@ final class MCPRouter: @unchecked Sendable { let tag = args["tag"] as? String let subscriberID = (args["agent_id"] as? String) ?? (args["subscriber_id"] as? String) let unreadOnly = args["unread_only"] as? Bool ?? false + let includeAudit = args["include_audit"] as? Bool ?? false let sourceCountsAsFilter: Bool if let source { let trimmed = source.trimmingCharacters(in: .whitespacesAndNewlines) @@ -264,7 +265,8 @@ final class MCPRouter: @unchecked Sendable { tag: tag, importanceMin: importanceMin, subscriberID: subscriberID, - unreadOnly: unreadOnly + unreadOnly: unreadOnly, + includeAudit: includeAudit ) let typedResults = results.map(SearchResult.init(payload:)) let textSection = TextFormatter.formatSearchResults(query: query, results: typedResults, total: typedResults.count) @@ -820,7 +822,7 @@ final class MCPRouter: @unchecked Sendable { nonisolated(unsafe) static let toolDefinitions: [[String: Any]] = [ [ "name": "brain_search", - "description": "Search through past conversations and learnings. Hybrid semantic + keyword search.", + "description": "Search through past conversations and learnings. Hybrid semantic + keyword search. Audit/eval chunks tagged audit, r02/r0x, audit-pollution-source, or agent=auditor are excluded by default; set include_audit=true only when explicitly looking up audit history.", "annotations": MCPRouter.readOnlyAnnotations, "inputSchema": MCPRouter.limitedInputSchema([ "type": "object", @@ -833,6 +835,7 @@ final class MCPRouter: @unchecked Sendable { "importance_min": ["type": "number", "description": "Minimum importance score (1-10)"], "agent_id": ["type": "string", "description": "Optional stable agent id for unread filtering"], "unread_only": ["type": "boolean", "description": "Return only chunks not yet acknowledged by agent_id"], + "include_audit": ["type": "boolean", "description": "Opt in to audit/eval memories. Defaults false to prevent audit-recursion pollution."], "detail": ["type": "string", "enum": ["compact", "full"], "description": "Result detail level"], ] as [String: Any], "required": ["query"] diff --git a/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift b/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift index 6032764c..3ecab9b1 100644 --- a/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift +++ b/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift @@ -574,6 +574,61 @@ final class MCPRouterTests: XCTestCase { XCTAssertEqual(text.components(separatedBy: "Sagit meeting notes").count - 1, 1, "Only one matching source should be returned") } + func testBrainSearchExcludesAuditRecursionByDefaultAndAllowsOptIn() throws { + let tempDB = NSTemporaryDirectory() + "brainbar-audit-filter-\(UUID().uuidString).db" + defer { try? FileManager.default.removeItem(atPath: tempDB) } + let db = BrainDatabase(path: tempDB) + defer { db.close() } + + try db.insertChunk( + id: "audit-recursion-source", + content: "why restart BrainBar audit recursion contamination exact match", + sessionId: "s1", + project: "brainlayer", + contentType: "assistant_text", + importance: 8, + tags: "[\"r02\", \"audit\"]" + ) + try db.insertChunk( + id: "ordinary-brainbar-memory", + content: "why restart BrainBar because launchd replaced the old degraded binary", + sessionId: "s2", + project: "brainlayer", + contentType: "assistant_text", + importance: 8, + tags: "[\"brainbar\", \"reliability\"]" + ) + + let router = MCPRouter() + router.setDatabase(db) + let defaultResponse = router.handle([ + "jsonrpc": "2.0", + "id": 160, + "method": "tools/call", + "params": [ + "name": "brain_search", + "arguments": ["query": "why restart BrainBar", "num_results": 3] as [String: Any] + ] as [String: Any] + ]) + let defaultText = ((defaultResponse["result"] as? [String: Any])?["content"] as? [[String: Any]])?.first?["text"] as? String ?? "" + + XCTAssertTrue(defaultText.contains("ordinary-bra"), defaultText) + XCTAssertFalse(defaultText.contains("audit-recurs"), defaultText) + + let optInResponse = router.handle([ + "jsonrpc": "2.0", + "id": 161, + "method": "tools/call", + "params": [ + "name": "brain_search", + "arguments": ["query": "why restart BrainBar", "num_results": 3, "include_audit": true] as [String: Any] + ] as [String: Any] + ]) + let optInText = ((optInResponse["result"] as? [String: Any])?["content"] as? [[String: Any]])?.first?["text"] as? String ?? "" + + XCTAssertTrue(optInText.contains("audit-recurs"), optInText) + } + func testBrainSearchSourceAllKeepsKGAugmentation() throws { let tempDB = NSTemporaryDirectory() + "brainbar-source-all-\(UUID().uuidString).db" defer { try? FileManager.default.removeItem(atPath: tempDB) } diff --git a/src/brainlayer/mcp/__init__.py b/src/brainlayer/mcp/__init__.py index 66882662..50155ed8 100644 --- a/src/brainlayer/mcp/__init__.py +++ b/src/brainlayer/mcp/__init__.py @@ -377,7 +377,7 @@ async def list_tools() -> list[Tool]: Tool( name="brain_search", title="Search Knowledge Base", - description="""Search BrainLayer's persistent memory for past decisions, project history, debugging notes, preferences, and other stored knowledge. Use when: the user asks what was decided before, how something was implemented, what happened to a file, or what you are working on. Don't use when: you need current session context or stats (use brain_recall), a named entity graph lookup (use brain_entity), or to save new information (use brain_store). query should be a natural-language lookup phrase; file_path switches to file-history routing, chunk_id expands a known result, and project narrows scope. num_results defaults to 5 and detail defaults to 'compact'; add date, tag, intent, or source filters only when they materially narrow the search. Returns ranked matches with scores, metadata, and compact snippets or full content; after finding a promising chunk, call brain_search with chunk_id or use brain_recall for session-level context.""", + description="""Search BrainLayer's persistent memory for decisions, project history, debugging notes, preferences, and stored knowledge. Use when: the user asks what was decided before, how something was implemented, what happened to a file, or what you are working on. Don't use when: you need current session context or stats (use brain_recall), a named entity graph lookup (use brain_entity), or to save new information (use brain_store). query is natural language; file_path switches to file-history routing, chunk_id expands a known result, and project narrows scope. num_results defaults to 5 and detail defaults to 'compact'; add date, tag, intent, or source filters only when useful. Audit/eval chunks tagged audit, r02/r0x, audit-pollution-source, or agent=auditor are excluded by default; set include_audit=true only for audit history. Returns ranked matches with scores, metadata, snippets, or full content.""", annotations=_READ_ONLY, inputSchema=_bounded_input_schema( { @@ -505,6 +505,11 @@ async def list_tools() -> list[Tool]: "type": "string", "description": "Filter by correction type tag (e.g. 'correction:preference', 'correction:factual', 'correction:naming'). Matches chunks tagged with the given correction category.", }, + "include_audit": { + "type": "boolean", + "default": False, + "description": "Opt in to audit/eval memories tagged audit, r02/r0x, audit-pollution-source, or agent=auditor. Defaults false to prevent audit-recursion pollution.", + }, "detail": { "type": "string", "enum": ["compact", "full"], @@ -652,7 +657,7 @@ async def list_tools() -> list[Tool]: Tool( name="brain_recall", title="Recall / Search / Entity Lookup", - description="""Get working context, recent sessions, plan/session links, per-session operations, summaries, stats, or routed search from one entry point. Use when: you need 'what am I working on', recent session history, plan linkage, operation groups for a session, or knowledge-base health stats. Don't use when: you already know you want topical memory search (use brain_search), a direct entity graph lookup (use brain_entity), or to store or digest new content (use brain_store or brain_digest). mode can be explicit or auto-detected from query; session_id is required for operations and summary, plan_name targets plan mode, and hours, days, and limit control context windows. In search mode, file_path, chunk_id, content filters, num_results, and detail='compact'|'full' behave like brain_search. Returns structured context, search results, or stats depending on mode; use brain_search after broad routing when you need tighter topical retrieval.""", + description="""Get working context, recent sessions, plan/session links, per-session operations, summaries, stats, or routed search from one entry point. Use when: you need 'what am I working on', recent session history, plan linkage, operation groups for a session, or knowledge-base health stats. Don't use when: you already know you want topical memory search (use brain_search), a direct entity graph lookup (use brain_entity), or to store or digest new content (use brain_store or brain_digest). mode can be explicit or auto-detected from query; session_id is required for operations and summary, plan_name targets plan mode, and hours, days, and limit control context windows. In search mode, file_path, chunk_id, content filters, num_results, include_audit, and detail='compact'|'full' behave like brain_search. Returns structured context, search results, or stats depending on mode; use brain_search after broad routing when you need tighter topical retrieval.""", annotations=_READ_ONLY, inputSchema=_bounded_input_schema( { @@ -792,6 +797,11 @@ async def list_tools() -> list[Tool]: "default": "compact", "description": "Result detail level (mode=search). 'compact': snippet + metadata. 'full': complete content.", }, + "include_audit": { + "type": "boolean", + "default": False, + "description": "Opt in to audit/eval memories in search mode. Defaults false to prevent audit-recursion pollution.", + }, }, } ), @@ -1218,6 +1228,7 @@ async def call_tool(name: str, arguments: dict[str, Any]): detail=arguments.get("detail", "compact"), source_filter=resolved_source_filter, correction_category=arguments.get("correction_category"), + include_audit=arguments.get("include_audit", False), ) ) @@ -1297,6 +1308,7 @@ async def call_tool(name: str, arguments: dict[str, Any]): max_results=arguments.get("max_results", 10), detail=arguments.get("detail", "compact"), entity_type=arguments.get("entity_type"), + include_audit=arguments.get("include_audit", False), ) ) diff --git a/src/brainlayer/mcp/search_handler.py b/src/brainlayer/mcp/search_handler.py index 508965dd..35d98a45 100644 --- a/src/brainlayer/mcp/search_handler.py +++ b/src/brainlayer/mcp/search_handler.py @@ -397,6 +397,7 @@ async def _brain_search( detail: str = "compact", source_filter: str | None = None, correction_category: str | None = None, + include_audit: bool = False, ): """Unified search dispatcher -- routes to the right internal handler.""" @@ -432,6 +433,7 @@ async def _brain_search( detail=detail, source_filter_like=source_filter, correction_category=correction_category, + include_audit=include_audit, ) if chunk_id is not None: @@ -480,6 +482,7 @@ async def _brain_search( detail=detail, source_filter=source_filter, correction_category=correction_category, + include_audit=include_audit, ) if _query_signals_current_context(query): @@ -634,6 +637,7 @@ async def _brain_search( fts_query_override=fts_query_override, source_filter_like=source_filter, correction_category=correction_category, + include_audit=include_audit, ) @@ -732,6 +736,7 @@ async def _brain_recall( # --- T3 filter additions --- source_filter: str | None = None, correction_category: str | None = None, + include_audit: bool = False, ): """Unified recall dispatcher -- routes to session/context/search/entity handlers. @@ -789,6 +794,7 @@ async def _brain_recall( detail=detail, source_filter=source_filter, correction_category=correction_category, + include_audit=include_audit, ) if resolved_mode == "entity": @@ -849,6 +855,7 @@ async def _search( # --- T3 filter additions --- source_filter_like: str | None = None, correction_category: str | None = None, + include_audit: bool = False, ): """Execute a hybrid search query (semantic + keyword via RRF). Retries on BusyError.""" try: @@ -909,6 +916,7 @@ async def _search( entity_id=entity_id, source_filter_like=source_filter_like, correction_category=correction_category, + include_audit=include_audit, ) break except Exception as e: diff --git a/src/brainlayer/search_repo.py b/src/brainlayer/search_repo.py index 4c02941b..85156bfa 100644 --- a/src/brainlayer/search_repo.py +++ b/src/brainlayer/search_repo.py @@ -38,6 +38,20 @@ "Grounding Results — Prompt", ] META_NOISE_PATTERNS_CASEFOLDED = [pattern.casefold() for pattern in META_NOISE_PATTERNS] +AUDIT_RECURSION_TAG_PATTERNS = ( + "LOWER(tag) LIKE '%audit%'", + "LOWER(tag) = 'r02'", + "LOWER(tag) GLOB '*r0[0-9]*'", + "LOWER(tag) = 'audit-pollution-source'", + "LOWER(tag) = 'agent=auditor'", +) +AUDIT_RECURSION_TAG_SQL = ( + "NOT EXISTS (" + "SELECT 1 FROM chunk_tags audit_tags " + "WHERE audit_tags.chunk_id = {chunk_id_expr} " + f"AND ({' OR '.join(AUDIT_RECURSION_TAG_PATTERNS)})" + ")" +) # Module-level LRU cache: {cache_key: (result, timestamp)} _hybrid_cache: "OrderedDict[tuple, tuple[dict, float]]" = OrderedDict() @@ -117,6 +131,30 @@ def _contains_meta_noise(content: Optional[str]) -> bool: return any(pattern in content_folded for pattern in META_NOISE_PATTERNS_CASEFOLDED) +def _audit_recursion_exclusion_sql(chunk_id_expr: str) -> str: + return AUDIT_RECURSION_TAG_SQL.format(chunk_id_expr=chunk_id_expr) + + +def _is_audit_recursion_metadata(meta: dict) -> bool: + tags = meta.get("tags") + if not isinstance(tags, list): + return False + for tag in tags: + normalized = str(tag).casefold() + if "audit" in normalized: + return True + if normalized == "r02": + return True + if len(normalized) >= 3 and any( + normalized[index : index + 2] == "r0" and normalized[index + 2].isdigit() + for index in range(len(normalized) - 2) + ): + return True + if normalized in {"audit-pollution-source", "agent=auditor"}: + return True + return False + + class SearchMixin: """Search and query methods, mixed into VectorStore.""" @@ -294,6 +332,7 @@ def search( include_archived: bool = False, source_filter_like: Optional[str] = None, correction_category: Optional[str] = None, + include_audit: bool = False, ) -> Dict[str, List]: """Search chunks by embedding or text. @@ -355,6 +394,8 @@ def search( if correction_category: where_clauses.append("c.id IN (SELECT chunk_id FROM chunk_tags WHERE tag LIKE ?)") filter_params.append(f"correction:{correction_category}%") + if not include_audit: + where_clauses.append(_audit_recursion_exclusion_sql("c.id")) if not include_archived: where_clauses.append("c.superseded_by IS NULL") where_clauses.append("c.aggregated_into IS NULL") @@ -434,6 +475,8 @@ def search( if correction_category: where_clauses.append("id IN (SELECT chunk_id FROM chunk_tags WHERE tag LIKE ?)") params.append(f"correction:{correction_category}%") + if not include_audit: + where_clauses.append(_audit_recursion_exclusion_sql("id")) if not include_archived: where_clauses.append("superseded_by IS NULL") where_clauses.append("aggregated_into IS NULL") @@ -641,6 +684,7 @@ def _binary_search( include_archived: bool = False, source_filter_like: Optional[str] = None, correction_category: Optional[str] = None, + include_audit: bool = False, ) -> Dict[str, List]: """Run KNN search against binary-quantized vectors.""" cursor = self._read_cursor() @@ -691,6 +735,8 @@ def _binary_search( if correction_category: where_clauses.append("c.id IN (SELECT chunk_id FROM chunk_tags WHERE tag LIKE ?)") filter_params.append(f"correction:{correction_category}%") + if not include_audit: + where_clauses.append(_audit_recursion_exclusion_sql("c.id")) if not include_archived: where_clauses.append("c.superseded_by IS NULL") where_clauses.append("c.aggregated_into IS NULL") @@ -845,6 +891,7 @@ def hybrid_search( source_filter_like: Optional[str] = None, correction_category: Optional[str] = None, filter_meta_noise: bool = True, + include_audit: bool = False, ) -> Dict[str, List]: """Hybrid search combining semantic (vector) + keyword (FTS5) via Reciprocal Rank Fusion. @@ -878,7 +925,7 @@ def hybrid_search( entity_id, k, include_archived, - ) + (fts_query_override, kg_boost, source_filter_like, correction_category, filter_meta_noise) + ) + (fts_query_override, kg_boost, source_filter_like, correction_category, filter_meta_noise, include_audit) now = time.monotonic() if cache_key in _hybrid_cache: cached_result, cached_at = _hybrid_cache[cache_key] @@ -912,6 +959,7 @@ def hybrid_search( include_archived=include_archived, source_filter_like=source_filter_like, correction_category=correction_category, + include_audit=include_audit, ) semantic = self._rerank_binary_results_with_float(query_embedding, semantic) else: @@ -933,6 +981,7 @@ def hybrid_search( include_archived=include_archived, source_filter_like=source_filter_like, correction_category=correction_category, + include_audit=include_audit, ) # Build semantic rank map: chunk_content -> rank @@ -993,6 +1042,8 @@ def hybrid_search( if correction_category: fts_extra.append("AND c.id IN (SELECT chunk_id FROM chunk_tags WHERE tag LIKE ?)") fts_filter_params.append(f"correction:{correction_category}%") + if not include_audit: + fts_extra.append(f"AND {_audit_recursion_exclusion_sql('c.id')}") if filter_meta_noise: for pattern in META_NOISE_PATTERNS_CASEFOLDED: fts_extra.append("AND LOWER(c.content) NOT LIKE ?") @@ -1136,6 +1187,8 @@ def _ingest_keyword_rows(rows: list[tuple], ranks: dict[str, int]) -> None: if filter_meta_noise and _contains_meta_noise(doc): continue + if not include_audit and _is_audit_recursion_metadata(meta): + continue # Apply filters to FTS-only results if fts_rank is not None and sem_entry is None: diff --git a/tests/test_audit_recursion_filter.py b/tests/test_audit_recursion_filter.py new file mode 100644 index 00000000..f6490b66 --- /dev/null +++ b/tests/test_audit_recursion_filter.py @@ -0,0 +1,60 @@ +import json + +from brainlayer._helpers import serialize_f32 +from brainlayer.vector_store import VectorStore + + +def _insert_chunk(store: VectorStore, chunk_id: str, content: str, tags: list[str], embedding: list[float]) -> None: + cursor = store.conn.cursor() + cursor.execute( + """INSERT INTO chunks (id, content, metadata, source_file, project, + content_type, char_count, source, tags) + VALUES (?, ?, '{}', 'audit-filter-test.jsonl', 'brainlayer', + 'assistant_text', ?, 'claude_code', ?)""", + (chunk_id, content, len(content), json.dumps(tags)), + ) + cursor.execute( + "INSERT INTO chunk_vectors (chunk_id, embedding) VALUES (?, ?)", + (chunk_id, serialize_f32(embedding)), + ) + + +def test_hybrid_search_excludes_audit_recursion_by_default(tmp_path): + store = VectorStore(tmp_path / "audit-filter.db") + try: + query_embedding = [0.01] * 1024 + _insert_chunk( + store, + "audit-recursion-source", + "why restart BrainBar audit recursion contamination exact match", + ["r02", "audit"], + query_embedding, + ) + _insert_chunk( + store, + "ordinary-brainbar-memory", + "why restart BrainBar because launchd replaced the old degraded binary", + ["brainbar", "reliability"], + [0.02] * 1024, + ) + + default_results = store.hybrid_search( + query_embedding=query_embedding, + query_text="why restart BrainBar", + n_results=3, + ) + default_ids = default_results["ids"][0] + + assert "audit-recursion-source" not in default_ids + assert "ordinary-brainbar-memory" in default_ids + + audit_results = store.hybrid_search( + query_embedding=query_embedding, + query_text="why restart BrainBar", + n_results=3, + include_audit=True, + ) + + assert "audit-recursion-source" in audit_results["ids"][0] + finally: + store.close() From 316769ba1f53795c1a08915464c92b461bddb968 Mon Sep 17 00:00:00 2001 From: Test User Date: Sun, 3 May 2026 14:48:56 +0300 Subject: [PATCH 2/6] fix: honor audit filter in edge search paths --- .../Sources/BrainBar/BrainDatabase.swift | 13 +++-- .../Tests/BrainBarTests/MCPRouterTests.swift | 32 +++++++++++++ src/brainlayer/mcp/search_handler.py | 10 +++- src/brainlayer/search_repo.py | 7 +-- tests/test_audit_recursion_filter.py | 23 +++++++++ tests/test_search_exact_chunk_id.py | 48 +++++++++++++++++++ 6 files changed, 122 insertions(+), 11 deletions(-) diff --git a/brain-bar/Sources/BrainBar/BrainDatabase.swift b/brain-bar/Sources/BrainBar/BrainDatabase.swift index 3bd3a8d5..1af4a0b1 100644 --- a/brain-bar/Sources/BrainBar/BrainDatabase.swift +++ b/brain-bar/Sources/BrainBar/BrainDatabase.swift @@ -1496,11 +1496,16 @@ final class BrainDatabase: @unchecked Sendable { } private static func auditRecursionTagExclusionSQL(alias: String) -> String { - let tags = "LOWER(COALESCE(\(alias).tags, ''))" + let tagsJSON = "CASE WHEN json_valid(\(alias).tags) THEN \(alias).tags ELSE '[]' END" + let tagValue = "LOWER(CAST(audit_tags.value AS TEXT))" return """ - \(tags) NOT LIKE '%audit%' - AND \(tags) NOT LIKE '%agent=auditor%' - AND \(tags) NOT GLOB '*r0[0-9]*' + NOT EXISTS ( + SELECT 1 + FROM json_each(\(tagsJSON)) audit_tags + WHERE \(tagValue) LIKE '%audit%' + OR \(tagValue) = 'agent=auditor' + OR \(tagValue) GLOB 'r0[0-9]' + ) """ } diff --git a/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift b/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift index 3ecab9b1..d607c31f 100644 --- a/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift +++ b/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift @@ -629,6 +629,38 @@ final class MCPRouterTests: XCTestCase { XCTAssertTrue(optInText.contains("audit-recurs"), optInText) } + func testBrainSearchDoesNotTreatR0xSubstringTagsAsAudit() throws { + let tempDB = NSTemporaryDirectory() + "brainbar-audit-substring-\(UUID().uuidString).db" + defer { try? FileManager.default.removeItem(atPath: tempDB) } + let db = BrainDatabase(path: tempDB) + defer { db.close() } + + try db.insertChunk( + id: "ordinary-mirror07-memory", + content: "mirror07 normal operational memory should remain searchable", + sessionId: "s1", + project: "brainlayer", + contentType: "assistant_text", + importance: 8, + tags: "[\"mirror07\", \"reliability\"]" + ) + + let router = MCPRouter() + router.setDatabase(db) + let response = router.handle([ + "jsonrpc": "2.0", + "id": 162, + "method": "tools/call", + "params": [ + "name": "brain_search", + "arguments": ["query": "mirror07 normal operational memory", "num_results": 3] as [String: Any] + ] as [String: Any] + ]) + let text = ((response["result"] as? [String: Any])?["content"] as? [[String: Any]])?.first?["text"] as? String ?? "" + + XCTAssertTrue(text.contains("ordinary-mir"), text) + } + func testBrainSearchSourceAllKeepsKGAugmentation() throws { let tempDB = NSTemporaryDirectory() + "brainbar-source-all-\(UUID().uuidString).db" defer { try? FileManager.default.removeItem(atPath: tempDB) } diff --git a/src/brainlayer/mcp/search_handler.py b/src/brainlayer/mcp/search_handler.py index 35d98a45..1c4e462e 100644 --- a/src/brainlayer/mcp/search_handler.py +++ b/src/brainlayer/mcp/search_handler.py @@ -10,6 +10,7 @@ from .._helpers import _escape_fts5_query from ..lexical_defense import _normalize_surface, load_lexical_defense_dictionary +from ..search_repo import _is_audit_recursion_metadata # Retry settings for DB lock resilience on reads _RETRY_MAX_ATTEMPTS = 3 @@ -163,6 +164,7 @@ def _exact_chunk_lookup_result( sentiment: str | None = None, source_filter: str | None = None, correction_category: str | None = None, + include_audit: bool = False, ) -> tuple[list[TextContent], dict] | None: """Return an exact chunk hit for chunk-id shaped queries, or None on miss.""" candidate = query.strip() @@ -193,6 +195,8 @@ def _exact_chunk_lookup_result( parsed_tags = None if tag is not None and tag not in (parsed_tags or []): return None + if not include_audit and _is_audit_recursion_metadata({"tags": parsed_tags or []}): + return None if importance_min is not None: chunk_importance = chunk.get("importance") if not isinstance(chunk_importance, (int, float)) or float(chunk_importance) < float(importance_min): @@ -520,8 +524,9 @@ async def _brain_search( intent=intent, sentiment=sentiment, source_filter=source_filter, - correction_category=correction_category, - ) + correction_category=correction_category, + include_audit=include_audit, + ) if exact_chunk_hit is not None: return exact_chunk_hit fts_query_override = _expanded_fts_query(query, store) @@ -566,6 +571,7 @@ async def _brain_search( n_results=num_results, entity_name=entity_name, project_filter=normalized_project, + include_audit=include_audit, ) chunk_results = kg_results.get("chunks", {}) diff --git a/src/brainlayer/search_repo.py b/src/brainlayer/search_repo.py index 85156bfa..324a3fd8 100644 --- a/src/brainlayer/search_repo.py +++ b/src/brainlayer/search_repo.py @@ -41,7 +41,7 @@ AUDIT_RECURSION_TAG_PATTERNS = ( "LOWER(tag) LIKE '%audit%'", "LOWER(tag) = 'r02'", - "LOWER(tag) GLOB '*r0[0-9]*'", + "LOWER(tag) GLOB 'r0[0-9]'", "LOWER(tag) = 'audit-pollution-source'", "LOWER(tag) = 'agent=auditor'", ) @@ -145,10 +145,7 @@ def _is_audit_recursion_metadata(meta: dict) -> bool: return True if normalized == "r02": return True - if len(normalized) >= 3 and any( - normalized[index : index + 2] == "r0" and normalized[index + 2].isdigit() - for index in range(len(normalized) - 2) - ): + if len(normalized) == 3 and normalized[:2] == "r0" and normalized[2].isdigit(): return True if normalized in {"audit-pollution-source", "agent=auditor"}: return True diff --git a/tests/test_audit_recursion_filter.py b/tests/test_audit_recursion_filter.py index f6490b66..8fee62ef 100644 --- a/tests/test_audit_recursion_filter.py +++ b/tests/test_audit_recursion_filter.py @@ -58,3 +58,26 @@ def test_hybrid_search_excludes_audit_recursion_by_default(tmp_path): assert "audit-recursion-source" in audit_results["ids"][0] finally: store.close() + + +def test_hybrid_search_does_not_exclude_r0x_substrings_inside_normal_tags(tmp_path): + store = VectorStore(tmp_path / "audit-filter-substring.db") + try: + query_embedding = [0.03] * 1024 + _insert_chunk( + store, + "ordinary-mirror07-memory", + "mirror07 normal operational memory should remain searchable", + ["mirror07", "reliability"], + query_embedding, + ) + + results = store.hybrid_search( + query_embedding=query_embedding, + query_text="mirror07 normal operational memory", + n_results=3, + ) + + assert "ordinary-mirror07-memory" in results["ids"][0] + finally: + store.close() diff --git a/tests/test_search_exact_chunk_id.py b/tests/test_search_exact_chunk_id.py index 8ad9d339..b87d417e 100644 --- a/tests/test_search_exact_chunk_id.py +++ b/tests/test_search_exact_chunk_id.py @@ -80,6 +80,54 @@ def test_exact_chunk_lookup_skips_lifecycle_managed_chunks(): assert _exact_chunk_lookup_result("brainbar-archived01", mock_store, "compact") is None +def test_exact_chunk_lookup_excludes_audit_chunks_unless_opted_in(): + """Exact chunk lookup must obey the same audit filter as hybrid search.""" + mock_store = MagicMock() + mock_store.get_chunk.return_value = { + "id": "brainbar-audit01", + "content": "Audit recursion source", + "project": "brainlayer", + "tags": '["r02", "audit"]', + } + + assert _exact_chunk_lookup_result("brainbar-audit01", mock_store, "compact") is None + + _, structured = _exact_chunk_lookup_result( + "brainbar-audit01", + mock_store, + "compact", + include_audit=True, + ) + assert structured["results"][0]["chunk_id"] == "brainbar-audit01" + + +@pytest.mark.asyncio +async def test_brain_search_entity_path_forwards_include_audit(): + """Entity-routed KG hybrid search must honor explicit audit opt-in.""" + mock_store = MagicMock() + mock_store.kg_hybrid_search.return_value = { + "chunks": { + "ids": [[]], + "documents": [[]], + "metadatas": [[]], + "distances": [[]], + } + } + mock_model = MagicMock() + mock_model.embed_query.return_value = [0.1] * 1024 + + with ( + patch("brainlayer.mcp.search_handler._get_vector_store", return_value=mock_store), + patch("brainlayer.mcp.search_handler._detect_entities", return_value=[{"name": "BrainBar"}]), + patch("brainlayer.mcp.search_handler._kg_facts_sql", return_value=[]), + patch("brainlayer.mcp.search_handler._get_embedding_model", return_value=mock_model), + patch("brainlayer.mcp.search_handler._search", new=AsyncMock(return_value=(["fallback"], {}))), + ): + await _brain_search(query="BrainBar", include_audit=True) + + assert mock_store.kg_hybrid_search.call_args.kwargs["include_audit"] is True + + @pytest.mark.asyncio async def test_brain_search_chunk_id_context_routing_wins_over_exact_lookup(): """Explicit chunk_id context expansion should run before exact-id short-circuiting.""" From 1605ea5e91daac7578eefc1b37cc192b33ddaa34 Mon Sep 17 00:00:00 2001 From: Test User Date: Sun, 3 May 2026 14:54:09 +0300 Subject: [PATCH 3/6] chore: format search handler --- src/brainlayer/mcp/search_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/brainlayer/mcp/search_handler.py b/src/brainlayer/mcp/search_handler.py index 1c4e462e..3fd72d46 100644 --- a/src/brainlayer/mcp/search_handler.py +++ b/src/brainlayer/mcp/search_handler.py @@ -524,9 +524,9 @@ async def _brain_search( intent=intent, sentiment=sentiment, source_filter=source_filter, - correction_category=correction_category, - include_audit=include_audit, - ) + correction_category=correction_category, + include_audit=include_audit, + ) if exact_chunk_hit is not None: return exact_chunk_hit fts_query_override = _expanded_fts_query(query, store) From 70bdc7e887cb94ffaff3b68df2244c3c4687f58b Mon Sep 17 00:00:00 2001 From: Test User Date: Sun, 3 May 2026 15:40:11 +0300 Subject: [PATCH 4/6] fix: address audit filter review gaps --- .../Tests/BrainBarTests/MCPRouterTests.swift | 1 + src/brainlayer/mcp/search_handler.py | 23 +++++++++++-------- src/brainlayer/search_repo.py | 7 +++++- tests/test_audit_recursion_filter.py | 4 ++++ tests/test_search_exact_chunk_id.py | 12 ++++++---- 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift b/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift index d607c31f..94e622ac 100644 --- a/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift +++ b/brain-bar/Tests/BrainBarTests/MCPRouterTests.swift @@ -627,6 +627,7 @@ final class MCPRouterTests: XCTestCase { let optInText = ((optInResponse["result"] as? [String: Any])?["content"] as? [[String: Any]])?.first?["text"] as? String ?? "" XCTAssertTrue(optInText.contains("audit-recurs"), optInText) + XCTAssertTrue(optInText.contains("ordinary-bra"), optInText) } func testBrainSearchDoesNotTreatR0xSubstringTagsAsAudit() throws { diff --git a/src/brainlayer/mcp/search_handler.py b/src/brainlayer/mcp/search_handler.py index 3fd72d46..90ac704c 100644 --- a/src/brainlayer/mcp/search_handler.py +++ b/src/brainlayer/mcp/search_handler.py @@ -38,6 +38,11 @@ _CHUNK_ID_QUERY_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_]*(?:-[A-Za-z0-9_]+)+$") +def _empty_exact_chunk_lookup_result(query: str) -> tuple[list[TextContent], dict]: + structured: dict[str, Any] = {"query": query, "total": 0, "results": []} + return ([TextContent(type="text", text=format_search_results(query, [], 0))], structured) + + def _quote_fts_phrase(value: str) -> str: return f'"{value.replace(chr(34), "")}"' @@ -175,16 +180,16 @@ def _exact_chunk_lookup_result( if not chunk: return None if any(chunk.get(field) is not None for field in ("superseded_by", "aggregated_into", "archived_at")): - return None + return _empty_exact_chunk_lookup_result(query) if any(value is not None for value in (source, intent, sentiment, source_filter, correction_category)): - return None + return _empty_exact_chunk_lookup_result(query) if project is not None: chunk_project = _normalize_project_name(chunk.get("project")) or chunk.get("project") normalized_project = _normalize_project_name(project) or project if chunk_project not in (normalized_project, None): - return None + return _empty_exact_chunk_lookup_result(query) if content_type is not None and chunk.get("content_type") != content_type: - return None + return _empty_exact_chunk_lookup_result(query) tags = chunk.get("tags") parsed_tags = None @@ -194,18 +199,18 @@ def _exact_chunk_lookup_result( except (json.JSONDecodeError, TypeError): parsed_tags = None if tag is not None and tag not in (parsed_tags or []): - return None + return _empty_exact_chunk_lookup_result(query) if not include_audit and _is_audit_recursion_metadata({"tags": parsed_tags or []}): - return None + return _empty_exact_chunk_lookup_result(query) if importance_min is not None: chunk_importance = chunk.get("importance") if not isinstance(chunk_importance, (int, float)) or float(chunk_importance) < float(importance_min): - return None + return _empty_exact_chunk_lookup_result(query) chunk_date = chunk.get("created_at", "")[:10] if chunk.get("created_at") else None if date_from is not None and (chunk_date is None or chunk_date < date_from): - return None + return _empty_exact_chunk_lookup_result(query) if date_to is not None and (chunk_date is None or chunk_date > date_to): - return None + return _empty_exact_chunk_lookup_result(query) item = { "score": 1.0, diff --git a/src/brainlayer/search_repo.py b/src/brainlayer/search_repo.py index 324a3fd8..b62824de 100644 --- a/src/brainlayer/search_repo.py +++ b/src/brainlayer/search_repo.py @@ -411,6 +411,7 @@ def search( or (source_filter and source_filter != "claude_code") or source_filter_like or correction_category + or not include_audit ) effective_k = min(n_results * 10, 1000) if needs_overfetch else n_results params = [query_bytes, effective_k] + filter_params @@ -744,7 +745,11 @@ def _binary_search( where_sql = "AND " + " AND ".join(where_clauses) needs_overfetch = ( - entity_id or (source_filter and source_filter != "claude_code") or source_filter_like or correction_category + entity_id + or (source_filter and source_filter != "claude_code") + or source_filter_like + or correction_category + or not include_audit ) effective_k = min(n_results * 10, 1000) if needs_overfetch else n_results params = [query_bytes, effective_k] + filter_params diff --git a/tests/test_audit_recursion_filter.py b/tests/test_audit_recursion_filter.py index 8fee62ef..e2c055ec 100644 --- a/tests/test_audit_recursion_filter.py +++ b/tests/test_audit_recursion_filter.py @@ -17,6 +17,10 @@ def _insert_chunk(store: VectorStore, chunk_id: str, content: str, tags: list[st "INSERT INTO chunk_vectors (chunk_id, embedding) VALUES (?, ?)", (chunk_id, serialize_f32(embedding)), ) + cursor.executemany( + "INSERT OR IGNORE INTO chunk_tags (chunk_id, tag) VALUES (?, ?)", + [(chunk_id, tag) for tag in tags], + ) def test_hybrid_search_excludes_audit_recursion_by_default(tmp_path): diff --git a/tests/test_search_exact_chunk_id.py b/tests/test_search_exact_chunk_id.py index b87d417e..1b6baa51 100644 --- a/tests/test_search_exact_chunk_id.py +++ b/tests/test_search_exact_chunk_id.py @@ -77,7 +77,8 @@ def test_exact_chunk_lookup_skips_lifecycle_managed_chunks(): "archived_at": "2026-04-30T09:15:00Z", } - assert _exact_chunk_lookup_result("brainbar-archived01", mock_store, "compact") is None + _, structured = _exact_chunk_lookup_result("brainbar-archived01", mock_store, "compact") + assert structured["total"] == 0 def test_exact_chunk_lookup_excludes_audit_chunks_unless_opted_in(): @@ -90,7 +91,8 @@ def test_exact_chunk_lookup_excludes_audit_chunks_unless_opted_in(): "tags": '["r02", "audit"]', } - assert _exact_chunk_lookup_result("brainbar-audit01", mock_store, "compact") is None + _, hidden = _exact_chunk_lookup_result("brainbar-audit01", mock_store, "compact") + assert hidden["total"] == 0 _, structured = _exact_chunk_lookup_result( "brainbar-audit01", @@ -125,6 +127,7 @@ async def test_brain_search_entity_path_forwards_include_audit(): ): await _brain_search(query="BrainBar", include_audit=True) + mock_store.kg_hybrid_search.assert_called() assert mock_store.kg_hybrid_search.call_args.kwargs["include_audit"] is True @@ -181,5 +184,6 @@ async def test_brain_search_exact_chunk_id_respects_project_scope(): ): result = await _brain_search(query=chunk_id, project="brainlayer", detail="compact") - assert result == (["fallback"], {"total": 0, "results": []}) - search_mock.assert_awaited_once() + _, structured = result + assert structured["total"] == 0 + search_mock.assert_not_awaited() From eb29a3032f463a7b48540318406f4b51fee046f8 Mon Sep 17 00:00:00 2001 From: Test User Date: Sun, 3 May 2026 16:00:53 +0300 Subject: [PATCH 5/6] fix: handle audit filter review edge cases --- src/brainlayer/engine.py | 5 ++ src/brainlayer/mcp/search_handler.py | 43 ++++++++++++--- src/brainlayer/search_repo.py | 63 ++++++++++++--------- src/brainlayer/vector_store.py | 2 + tests/test_audit_recursion_filter.py | 82 ++++++++++++++++++++++++++++ 5 files changed, 161 insertions(+), 34 deletions(-) diff --git a/src/brainlayer/engine.py b/src/brainlayer/engine.py index 80a53655..5511cd5c 100644 --- a/src/brainlayer/engine.py +++ b/src/brainlayer/engine.py @@ -178,6 +178,7 @@ def think( embed_fn: Any, project: str | None = None, max_results: int = 10, + include_audit: bool = False, ) -> ThinkResult: """Given current task context, retrieve relevant past knowledge. @@ -206,6 +207,7 @@ def think( n_results=max_results, project_filter=project, importance_min=3.0, # Skip low-importance noise + include_audit=include_audit, ) if not results["documents"][0]: @@ -239,6 +241,7 @@ def recall( topic: str | None = None, project: str | None = None, max_results: int = 10, + include_audit: bool = False, ) -> RecallResult: """Proactive smart retrieval based on file or topic. @@ -278,6 +281,7 @@ def recall( query_text=fname, n_results=max_results, project_filter=project, + include_audit=include_audit, ) for doc, meta in zip(search_results["documents"][0], search_results["metadatas"][0]): result.related_chunks.append( @@ -299,6 +303,7 @@ def recall( query_text=topic, n_results=max_results, project_filter=project, + include_audit=include_audit, ) for doc, meta in zip(search_results["documents"][0], search_results["metadatas"][0]): result.related_chunks.append( diff --git a/src/brainlayer/mcp/search_handler.py b/src/brainlayer/mcp/search_handler.py index 90ac704c..ced4f6ad 100644 --- a/src/brainlayer/mcp/search_handler.py +++ b/src/brainlayer/mcp/search_handler.py @@ -450,7 +450,12 @@ async def _brain_search( if file_path is not None and _query_has_regression_signal(query): regression_result = await _regression(file_path=file_path, project=project) - recall_result = await _recall(file_path=file_path, project=project, max_results=max_results) + recall_result = await _recall( + file_path=file_path, + project=project, + max_results=max_results, + include_audit=include_audit, + ) merged_text = [] if isinstance(regression_result, list): merged_text.extend(regression_result) @@ -462,7 +467,12 @@ async def _brain_search( if file_path is not None: timeline = await _file_timeline(file_path=file_path, project=project, limit=50) - recall_result = await _recall(file_path=file_path, project=project, max_results=max_results) + recall_result = await _recall( + file_path=file_path, + project=project, + max_results=max_results, + include_audit=include_audit, + ) merged_text = [] if isinstance(timeline, list): merged_text.extend(timeline) @@ -496,7 +506,9 @@ async def _brain_search( if _query_signals_current_context(query): ctx = await _current_context(hours=24) - think_result = await _think(context=query, project=project, max_results=max_results) + think_result = await _think( + context=query, project=project, max_results=max_results, include_audit=include_audit + ) merged_text = [] if isinstance(ctx, tuple): merged_text.extend(ctx[0]) @@ -509,10 +521,10 @@ async def _brain_search( return merged_text if _query_signals_think(query): - return await _think(context=query, project=project, max_results=max_results) + return await _think(context=query, project=project, max_results=max_results, include_audit=include_audit) if _query_signals_recall(query): - return await _recall(topic=query, project=project, max_results=max_results) + return await _recall(topic=query, project=project, max_results=max_results, include_audit=include_audit) store = _get_vector_store() exact_chunk_hit = _exact_chunk_lookup_result( @@ -1210,7 +1222,12 @@ async def _plan_links( return _error_result(f"Plan links error: {str(e)}") -async def _think(context: str, project: str | None = None, max_results: int = 10): +async def _think( + context: str, + project: str | None = None, + max_results: int = 10, + include_audit: bool = False, +): """Execute think -- retrieve relevant memories for current task.""" try: from ..engine import think @@ -1226,7 +1243,12 @@ def _embed(text: str) -> list[float]: result = await loop.run_in_executor( None, lambda: think( - context=context, store=store, embed_fn=_embed, project=normalized_project, max_results=max_results + context=context, + store=store, + embed_fn=_embed, + project=normalized_project, + max_results=max_results, + include_audit=include_audit, ), ) structured = { @@ -1243,7 +1265,11 @@ def _embed(text: str) -> list[float]: async def _recall( - file_path: str | None = None, topic: str | None = None, project: str | None = None, max_results: int = 10 + file_path: str | None = None, + topic: str | None = None, + project: str | None = None, + max_results: int = 10, + include_audit: bool = False, ): """Execute recall -- proactive context retrieval.""" try: @@ -1266,6 +1292,7 @@ def _embed(text: str) -> list[float]: topic=topic, project=normalized_project, max_results=max_results, + include_audit=include_audit, ), ) structured = { diff --git a/src/brainlayer/search_repo.py b/src/brainlayer/search_repo.py index b62824de..ab620323 100644 --- a/src/brainlayer/search_repo.py +++ b/src/brainlayer/search_repo.py @@ -39,18 +39,9 @@ ] META_NOISE_PATTERNS_CASEFOLDED = [pattern.casefold() for pattern in META_NOISE_PATTERNS] AUDIT_RECURSION_TAG_PATTERNS = ( - "LOWER(tag) LIKE '%audit%'", - "LOWER(tag) = 'r02'", - "LOWER(tag) GLOB 'r0[0-9]'", - "LOWER(tag) = 'audit-pollution-source'", - "LOWER(tag) = 'agent=auditor'", -) -AUDIT_RECURSION_TAG_SQL = ( - "NOT EXISTS (" - "SELECT 1 FROM chunk_tags audit_tags " - "WHERE audit_tags.chunk_id = {chunk_id_expr} " - f"AND ({' OR '.join(AUDIT_RECURSION_TAG_PATTERNS)})" - ")" + "{tag_expr} LIKE '%audit%'", + "{tag_expr} = 'r02'", + "{tag_expr} GLOB 'r0[0-9]'", ) # Module-level LRU cache: {cache_key: (result, timestamp)} @@ -131,8 +122,28 @@ def _contains_meta_noise(content: Optional[str]) -> bool: return any(pattern in content_folded for pattern in META_NOISE_PATTERNS_CASEFOLDED) -def _audit_recursion_exclusion_sql(chunk_id_expr: str) -> str: - return AUDIT_RECURSION_TAG_SQL.format(chunk_id_expr=chunk_id_expr) +def _audit_recursion_tag_predicate(tag_expr: str) -> str: + lowered = f"LOWER(CAST({tag_expr} AS TEXT))" + return "(" + " OR ".join(pattern.format(tag_expr=lowered) for pattern in AUDIT_RECURSION_TAG_PATTERNS) + ")" + + +def _audit_recursion_exclusion_sql(chunk_id_expr: str, tags_expr: str, *, use_chunk_tags: bool = True) -> str: + if use_chunk_tags: + return ( + "NOT EXISTS (" + "SELECT 1 FROM chunk_tags audit_tags " + f"WHERE audit_tags.chunk_id = {chunk_id_expr} " + f"AND {_audit_recursion_tag_predicate('audit_tags.tag')}" + ")" + ) + + tags_json = f"CASE WHEN json_valid({tags_expr}) THEN {tags_expr} ELSE '[]' END" + return ( + "NOT EXISTS (" + f"SELECT 1 FROM json_each({tags_json}) audit_tags " + f"WHERE {_audit_recursion_tag_predicate('audit_tags.value')}" + ")" + ) def _is_audit_recursion_metadata(meta: dict) -> bool: @@ -147,14 +158,19 @@ def _is_audit_recursion_metadata(meta: dict) -> bool: return True if len(normalized) == 3 and normalized[:2] == "r0" and normalized[2].isdigit(): return True - if normalized in {"audit-pollution-source", "agent=auditor"}: - return True return False class SearchMixin: """Search and query methods, mixed into VectorStore.""" + def _audit_recursion_exclusion_sql(self, chunk_id_expr: str, tags_expr: str) -> str: + return _audit_recursion_exclusion_sql( + chunk_id_expr, + tags_expr, + use_chunk_tags=getattr(self, "_chunk_tags_available", True), + ) + def _load_chunk_embeddings(self, chunk_ids: List[str]) -> Dict[str, np.ndarray]: """Fetch float embeddings for the provided chunk IDs.""" if not chunk_ids: @@ -392,7 +408,7 @@ def search( where_clauses.append("c.id IN (SELECT chunk_id FROM chunk_tags WHERE tag LIKE ?)") filter_params.append(f"correction:{correction_category}%") if not include_audit: - where_clauses.append(_audit_recursion_exclusion_sql("c.id")) + where_clauses.append(self._audit_recursion_exclusion_sql("c.id", "c.tags")) if not include_archived: where_clauses.append("c.superseded_by IS NULL") where_clauses.append("c.aggregated_into IS NULL") @@ -411,7 +427,6 @@ def search( or (source_filter and source_filter != "claude_code") or source_filter_like or correction_category - or not include_audit ) effective_k = min(n_results * 10, 1000) if needs_overfetch else n_results params = [query_bytes, effective_k] + filter_params @@ -474,7 +489,7 @@ def search( where_clauses.append("id IN (SELECT chunk_id FROM chunk_tags WHERE tag LIKE ?)") params.append(f"correction:{correction_category}%") if not include_audit: - where_clauses.append(_audit_recursion_exclusion_sql("id")) + where_clauses.append(self._audit_recursion_exclusion_sql("id", "tags")) if not include_archived: where_clauses.append("superseded_by IS NULL") where_clauses.append("aggregated_into IS NULL") @@ -734,7 +749,7 @@ def _binary_search( where_clauses.append("c.id IN (SELECT chunk_id FROM chunk_tags WHERE tag LIKE ?)") filter_params.append(f"correction:{correction_category}%") if not include_audit: - where_clauses.append(_audit_recursion_exclusion_sql("c.id")) + where_clauses.append(self._audit_recursion_exclusion_sql("c.id", "c.tags")) if not include_archived: where_clauses.append("c.superseded_by IS NULL") where_clauses.append("c.aggregated_into IS NULL") @@ -745,11 +760,7 @@ def _binary_search( where_sql = "AND " + " AND ".join(where_clauses) needs_overfetch = ( - entity_id - or (source_filter and source_filter != "claude_code") - or source_filter_like - or correction_category - or not include_audit + entity_id or (source_filter and source_filter != "claude_code") or source_filter_like or correction_category ) effective_k = min(n_results * 10, 1000) if needs_overfetch else n_results params = [query_bytes, effective_k] + filter_params @@ -1045,7 +1056,7 @@ def hybrid_search( fts_extra.append("AND c.id IN (SELECT chunk_id FROM chunk_tags WHERE tag LIKE ?)") fts_filter_params.append(f"correction:{correction_category}%") if not include_audit: - fts_extra.append(f"AND {_audit_recursion_exclusion_sql('c.id')}") + fts_extra.append(f"AND {self._audit_recursion_exclusion_sql('c.id', 'c.tags')}") if filter_meta_noise: for pattern in META_NOISE_PATTERNS_CASEFOLDED: fts_extra.append("AND LOWER(c.content) NOT LIKE ?") diff --git a/src/brainlayer/vector_store.py b/src/brainlayer/vector_store.py index a7a98c23..4534ea95 100644 --- a/src/brainlayer/vector_store.py +++ b/src/brainlayer/vector_store.py @@ -94,6 +94,7 @@ def _init_readonly_db(self) -> None: } self._binary_index_available = "chunk_vectors_binary" in existing_tables self._trigram_fts_available = "chunks_fts_trigram" in existing_tables + self._chunk_tags_available = "chunk_tags" in existing_tables self._local = threading.local() def _init_db_with_retry(self) -> None: @@ -380,6 +381,7 @@ def _init_db(self) -> None: PRIMARY KEY (chunk_id, tag) ) """) + self._chunk_tags_available = True cursor.execute("CREATE INDEX IF NOT EXISTS idx_chunk_tags_tag ON chunk_tags(tag)") # Sync triggers: keep chunk_tags in sync with chunks.tags JSON diff --git a/tests/test_audit_recursion_filter.py b/tests/test_audit_recursion_filter.py index e2c055ec..20bec717 100644 --- a/tests/test_audit_recursion_filter.py +++ b/tests/test_audit_recursion_filter.py @@ -1,6 +1,7 @@ import json from brainlayer._helpers import serialize_f32 +from brainlayer.engine import recall, think from brainlayer.vector_store import VectorStore @@ -85,3 +86,84 @@ def test_hybrid_search_does_not_exclude_r0x_substrings_inside_normal_tags(tmp_pa assert "ordinary-mirror07-memory" in results["ids"][0] finally: store.close() + + +def test_readonly_legacy_db_without_chunk_tags_uses_json_tag_fallback(tmp_path): + db_path = tmp_path / "legacy-readonly-audit-filter.db" + store = VectorStore(db_path) + try: + query_embedding = [0.04] * 1024 + _insert_chunk( + store, + "legacy-audit-source", + "legacy readonly audit memory should be filtered", + ["r02", "audit"], + query_embedding, + ) + _insert_chunk( + store, + "legacy-ordinary-memory", + "legacy readonly ordinary memory should be searchable", + ["brainbar", "reliability"], + [0.05] * 1024, + ) + cursor = store.conn.cursor() + for trigger in ( + "chunk_tags_insert", + "chunk_tags_update", + "chunk_tags_update_clear", + "chunk_tags_delete", + ): + cursor.execute(f"DROP TRIGGER IF EXISTS {trigger}") + cursor.execute("DROP TABLE chunk_tags") + finally: + store.close() + + db_path.chmod(0o444) + readonly_store = VectorStore(db_path) + try: + assert readonly_store._chunk_tags_available is False + results = readonly_store.hybrid_search( + query_embedding=query_embedding, + query_text="legacy readonly memory", + n_results=3, + ) + ids = results["ids"][0] + assert "legacy-audit-source" not in ids + assert "legacy-ordinary-memory" in ids + finally: + readonly_store.close() + db_path.chmod(0o644) + + +def test_engine_think_and_recall_forward_include_audit(): + class MockStore: + def __init__(self): + self.calls = [] + + def hybrid_search(self, **kwargs): + self.calls.append(kwargs) + return { + "documents": [["ordinary memory"]], + "metadatas": [[{"intent": "decision", "project": "brainlayer"}]], + } + + def get_file_timeline(self, *_args, **_kwargs): + return [] + + mock_store = MockStore() + think( + "think about audit history", + store=mock_store, + embed_fn=lambda _text: [0.1] * 1024, + include_audit=True, + ) + recall( + store=mock_store, + embed_fn=lambda _text: [0.1] * 1024, + topic="audit history", + include_audit=True, + ) + + assert mock_store.calls[0]["include_audit"] is True + assert mock_store.calls[1]["include_audit"] is True From 4a8c8ee8c10a182a86f771385e268cf6d28f6423 Mon Sep 17 00:00:00 2001 From: Test User Date: Sun, 3 May 2026 16:43:21 +0300 Subject: [PATCH 6/6] fix: preserve results under audit-heavy knn --- src/brainlayer/search_repo.py | 13 +++++- tests/test_audit_recursion_filter.py | 68 ++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/src/brainlayer/search_repo.py b/src/brainlayer/search_repo.py index ab620323..5555757e 100644 --- a/src/brainlayer/search_repo.py +++ b/src/brainlayer/search_repo.py @@ -40,6 +40,7 @@ META_NOISE_PATTERNS_CASEFOLDED = [pattern.casefold() for pattern in META_NOISE_PATTERNS] AUDIT_RECURSION_TAG_PATTERNS = ( "{tag_expr} LIKE '%audit%'", + "{tag_expr} = 'r0x'", "{tag_expr} = 'r02'", "{tag_expr} GLOB 'r0[0-9]'", ) @@ -154,7 +155,7 @@ def _is_audit_recursion_metadata(meta: dict) -> bool: normalized = str(tag).casefold() if "audit" in normalized: return True - if normalized == "r02": + if normalized in {"r02", "r0x"}: return True if len(normalized) == 3 and normalized[:2] == "r0" and normalized[2].isdigit(): return True @@ -422,11 +423,13 @@ def search( # Bump k to over-fetch when post-KNN filters may discard most results: # - entity_id: entity filter applied post-KNN, most candidates won't match # - non-default source: rare sources (youtube, whatsapp) are <0.01% of chunks + # - audit filter: audit-recursion chunks can dominate the nearest neighbors needs_overfetch = ( entity_id or (source_filter and source_filter != "claude_code") or source_filter_like or correction_category + or not include_audit ) effective_k = min(n_results * 10, 1000) if needs_overfetch else n_results params = [query_bytes, effective_k] + filter_params @@ -759,8 +762,14 @@ def _binary_search( if where_clauses: where_sql = "AND " + " AND ".join(where_clauses) + # These predicates are applied after sqlite-vec KNN, so fetch extra candidates + # when they may discard nearest neighbors before the final result cap. needs_overfetch = ( - entity_id or (source_filter and source_filter != "claude_code") or source_filter_like or correction_category + entity_id + or (source_filter and source_filter != "claude_code") + or source_filter_like + or correction_category + or not include_audit ) effective_k = min(n_results * 10, 1000) if needs_overfetch else n_results params = [query_bytes, effective_k] + filter_params diff --git a/tests/test_audit_recursion_filter.py b/tests/test_audit_recursion_filter.py index 20bec717..0fa221b5 100644 --- a/tests/test_audit_recursion_filter.py +++ b/tests/test_audit_recursion_filter.py @@ -42,6 +42,7 @@ def test_hybrid_search_excludes_audit_recursion_by_default(tmp_path): ["brainbar", "reliability"], [0.02] * 1024, ) + store.build_binary_index() default_results = store.hybrid_search( query_embedding=query_embedding, @@ -76,6 +77,7 @@ def test_hybrid_search_does_not_exclude_r0x_substrings_inside_normal_tags(tmp_pa ["mirror07", "reliability"], query_embedding, ) + store.build_binary_index() results = store.hybrid_search( query_embedding=query_embedding, @@ -107,6 +109,7 @@ def test_readonly_legacy_db_without_chunk_tags_uses_json_tag_fallback(tmp_path): ["brainbar", "reliability"], [0.05] * 1024, ) + store.build_binary_index() cursor = store.conn.cursor() for trigger in ( "chunk_tags_insert", @@ -136,6 +139,71 @@ def test_readonly_legacy_db_without_chunk_tags_uses_json_tag_fallback(tmp_path): db_path.chmod(0o644) +def test_hybrid_search_overfetches_when_audit_chunks_dominate_knn(tmp_path): + store = VectorStore(tmp_path / "audit-filter-overfetch.db") + try: + query_embedding = [0.06] * 1024 + for index in range(60): + _insert_chunk( + store, + f"audit-neighbor-{index}", + f"audit recursion neighbor {index}", + ["r02", "audit"], + query_embedding, + ) + _insert_chunk( + store, + "ordinary-after-audit-neighbors", + "ordinary BrainBar restart decision should survive audit-heavy nearest neighbors", + ["brainbar", "reliability"], + [0.061] * 1024, + ) + store.build_binary_index() + + results = store.hybrid_search( + query_embedding=query_embedding, + query_text="ordinary BrainBar restart decision", + n_results=3, + ) + + assert "ordinary-after-audit-neighbors" in results["ids"][0] + assert all(not chunk_id.startswith("audit-neighbor-") for chunk_id in results["ids"][0]) + finally: + store.close() + + +def test_exact_r0x_tag_is_filtered_as_audit_shorthand(tmp_path): + store = VectorStore(tmp_path / "audit-filter-r0x.db") + try: + query_embedding = [0.07] * 1024 + _insert_chunk( + store, + "audit-r0x-source", + "r0x audit shorthand memory should be filtered", + ["r0x"], + query_embedding, + ) + _insert_chunk( + store, + "ordinary-r0x-control", + "ordinary control memory should remain searchable", + ["brainbar"], + [0.071] * 1024, + ) + store.build_binary_index() + + results = store.hybrid_search( + query_embedding=query_embedding, + query_text="ordinary control memory", + n_results=3, + ) + + assert "audit-r0x-source" not in results["ids"][0] + assert "ordinary-r0x-control" in results["ids"][0] + finally: + store.close() + + def test_engine_think_and_recall_forward_include_audit(): class MockStore: def __init__(self):