From 14d7f785c0205087ce0b89ee54b872a2596da9aa Mon Sep 17 00:00:00 2001 From: RuneLind Date: Tue, 12 May 2026 20:49:04 +0200 Subject: [PATCH 1/4] Add CRAG-lite corrective retrieval around the knowledge search tool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 of the corrective-retrieval work (plan: mimir/plans/huginn-muninn-corrective-rag.md). After a copilot-sdk bot calls Huginn's `search_knowledge`, optionally grade the results with a dedicated Haiku call and do a bounded corrective re-query before the model sees them — consuming the Phase-0 contract (`bestScore`, `confidenceBand`, `retryHints`, `noConfidentResults`, `min_relevance`). - `knowledge-grader.ts` — awaiting Haiku evaluator → correct/ambiguous/insufficient + rewritten query / suggested collection. Fail-soft to "correct". - `corrective-retrieval.ts` — grade → re-query `/api/search` (rerank=true) → merge + dedupe by collection/doc_id (parsed from the rendered result text) → consolidated text + `corrective` metadata. Hard cap 1 retry (configurable 2), never recursive. - `knowledge-search-client.ts` — HTTP client for `/api/search` + a renderer mirroring Huginn's MCP-adapter result format. - copilot-sdk connector: registers a `hooks.onPostToolUse` handler that runs the corrective pass and returns a `modifiedResult`; re-appends any trailing Huginn trace marker so downstream trace extraction is unaffected. Claude-CLI bots can't be intercepted this way — left to Phase 3 (prompt-level guidance). - Trace spans: `knowledge_grade` + `knowledge_requery` synthesized under the tool span (`corrective-trace-spans.ts`); a corrective chip on the parent tool span in the dashboard waterfall. - Config: per-bot `correctiveRetrieval` block, `CORRECTIVE_RETRIEVAL_ENABLED` global default, `CORRECTIVE_RETRIEVAL_DISABLED` kill-switch. Off by default — when off the hook isn't registered and behaviour is byte-identical to before. 
- Tests: grader (verdict parsing, fail-soft), orchestrator (retry/merge/dedupe, budget exhaustion, budget clamp, re-query errors), search client (rendering, doc-id extraction, fetch), trace-span planner, connector hook helpers. --- .env.example | 6 + CLAUDE.md | 18 ++ package.json | 4 +- src/ai/CLAUDE.md | 6 +- src/ai/connectors/copilot-sdk.ts | 158 +++++++++- src/ai/connectors/corrective-hook.test.ts | 139 +++++++++ src/ai/corrective-config.test.ts | 48 ++++ src/ai/corrective-config.ts | 46 +++ src/ai/corrective-retrieval.test.ts | 225 +++++++++++++++ src/ai/corrective-retrieval.ts | 240 ++++++++++++++++ src/ai/huginn-trace-pointer.ts | 29 ++ src/ai/knowledge-grader.test.ts | 78 +++++ src/ai/knowledge-grader.ts | 150 ++++++++++ src/ai/knowledge-search-client.test.ts | 138 +++++++++ src/ai/knowledge-search-client.ts | 271 ++++++++++++++++++ src/ai/tool-status.ts | 10 + src/bots/config.ts | 13 +- src/core/corrective-trace-spans.test.ts | 63 ++++ src/core/corrective-trace-spans.ts | 94 ++++++ src/core/tool-spans.ts | 40 ++- src/dashboard/views/components/span-label.ts | 53 +++- .../views/components/traces-waterfall.ts | 18 ++ src/types.ts | 26 ++ 23 files changed, 1851 insertions(+), 22 deletions(-) create mode 100644 src/ai/connectors/corrective-hook.test.ts create mode 100644 src/ai/corrective-config.test.ts create mode 100644 src/ai/corrective-config.ts create mode 100644 src/ai/corrective-retrieval.test.ts create mode 100644 src/ai/corrective-retrieval.ts create mode 100644 src/ai/knowledge-grader.test.ts create mode 100644 src/ai/knowledge-grader.ts create mode 100644 src/ai/knowledge-search-client.test.ts create mode 100644 src/ai/knowledge-search-client.ts create mode 100644 src/core/corrective-trace-spans.test.ts create mode 100644 src/core/corrective-trace-spans.ts diff --git a/.env.example b/.env.example index 714031e..cc5d2c4 100644 --- a/.env.example +++ b/.env.example @@ -34,3 +34,9 @@ WHISPER_MODEL_PATH=./models/ggml-base.en.bin # TRACING_ENABLED=true # 
TRACING_RETENTION_DAYS=7 # PROMPT_SNAPSHOTS_RETENTION_DAYS=3 + +# Corrective retrieval (CRAG-lite) around the knowledge search tool — off by +# default. Opt in per-bot via config.json `correctiveRetrieval`, or globally here. +# CORRECTIVE_RETRIEVAL_ENABLED=true +# CORRECTIVE_RETRIEVAL_BUDGET=1 # max corrective re-queries per search (1–2) +# CORRECTIVE_RETRIEVAL_DISABLED=1 # hard kill-switch — overrides per-bot config diff --git a/CLAUDE.md b/CLAUDE.md index 7142b4e..ca9e53e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -217,6 +217,7 @@ All fields are optional — falls back to global `.env` values: | `showWaterfall` | boolean | `true` | Show request progress waterfall overlay in web chat | | `contextWindow` | number | — | Context window size in tokens (e.g. `32768`). Shown as usage in web chat and percentage in Telegram footer | | `prompts` | object | — | Configurable prompts: `jiraAnalysis` (Jira research instruction, content appended automatically), `investigateCode` (follow-up code investigation prompt) | +| `correctiveRetrieval` | object | off | CRAG-lite corrective loop around the knowledge search tool — `{ enabled?: boolean, retryBudget?: 1\|2 }`. Only the `copilot-sdk` connector honours it; off by default. See "Corrective retrieval" below. | ### Database @@ -253,6 +254,9 @@ PostgreSQL + pgvector via Docker (single container). 
| `SLACK_APP_TOKEN_` | No | — | Slack app-level token (per bot) | | `SLACK_ALLOWED_USER_IDS_` | No | — | Comma-separated Slack user IDs | | `LOG_DIR` | No | `./logs` | Log file directory (set `none` to disable file logging) | +| `CORRECTIVE_RETRIEVAL_ENABLED` | No | `false` | Global default for the CRAG-lite corrective loop (per-bot `correctiveRetrieval.enabled` overrides) | +| `CORRECTIVE_RETRIEVAL_BUDGET` | No | `1` | Default max corrective re-queries per knowledge search (clamped to 1–2) | +| `CORRECTIVE_RETRIEVAL_DISABLED` | No | — | Set to `1` to hard-disable corrective retrieval everywhere, regardless of per-bot config | | `GOAL_CHECK_INTERVAL_MS` | No | — | Legacy alias for `SCHEDULER_INTERVAL_MS` | | `GOAL_CHECK_ENABLED` | No | — | Legacy alias for `SCHEDULER_ENABLED` | @@ -352,6 +356,20 @@ uvx --from "git+https://github.com/oraios/serena" serena project index /path/to/ | `src/dashboard/views/serena-page.ts` | Dashboard UI for managing instances | | `src/dashboard/mcp-client.ts` | MCP Debug client — supports both stdio and HTTP servers | +## Corrective Retrieval (CRAG-lite) + +A CRAG-style "grade the search results, re-query if they're weak" loop wrapped around Huginn's `search_knowledge` MCP tool. **Off by default**; enable per-bot in `config.json` (`"correctiveRetrieval": { "enabled": true, "retryBudget": 1 }`), globally via `CORRECTIVE_RETRIEVAL_ENABLED=true`, or hard-disable everywhere with `CORRECTIVE_RETRIEVAL_DISABLED=1`. + +How it works (copilot-sdk connector only): +1. The connector registers a Copilot SDK `onPostToolUse` hook. When a bot calls `search_knowledge`, the hook intercepts the result before the model sees it. +2. `src/ai/knowledge-grader.ts` — a dedicated **awaiting** Haiku call grades the results (`correct` / `ambiguous` / `insufficient`) and, if weak, proposes a rewritten query and/or a better collection. Fail-soft: any Haiku error → `correct` (no change). +3. 
`src/ai/corrective-retrieval.ts` — if not `correct` and the retry budget (1, configurable to 2) isn't spent, re-queries Huginn's `/api/search` (`src/ai/knowledge-search-client.ts`) with `rerank=true`, merges the fresh hits into the original result text (deduped by `collection/doc_id` parsed from the rendered output), and appends an inline note. Never recursive. +4. Traces: `knowledge_grade` + `knowledge_requery` spans synthesized under the tool span (`src/core/corrective-trace-spans.ts`), rendered in the dashboard waterfall with a corrective chip on the parent tool span. + +**Connector asymmetry:** Claude-CLI bots run the MCP tool inside their own process, so the result can't be intercepted — they get nothing here (Phase 3 will add prompt-level corrective guidance instead). When the toggle is off, the hook isn't registered and behaviour is byte-identical to before. + +Key files: `src/ai/knowledge-grader.ts`, `src/ai/corrective-retrieval.ts`, `src/ai/knowledge-search-client.ts`, `src/ai/corrective-config.ts`, `src/core/corrective-trace-spans.ts`, hook wiring in `src/ai/connectors/copilot-sdk.ts`. + ## Slack Bot When implementing Slack bot features, be aware of the different message contexts (DMs, threads, channels, Assistant API) — each has different API constraints and capabilities. Check Slack app configuration settings (like 'Agent or Assistant' toggle) as a potential root cause before writing code fixes. 
diff --git a/package.json b/package.json index 93559d4..0573af9 100644 --- a/package.json +++ b/package.json @@ -12,8 +12,8 @@ "cleanup": "bun run scripts/cleanup-stale-mcp.ts", "cleanup:kill": "bun run scripts/cleanup-stale-mcp.ts --kill", "typecheck": "tsc --noEmit", - "test": "bun test src/utils/ src/bot/telegram-format.test.ts src/bot/topic-commands.test.ts src/slack/slack-format.test.ts src/ai/result-parser.test.ts src/ai/stream-parser.test.ts src/ai/tool-restrictions.test.ts src/ai/knowledge-search.test.ts src/ai/mcp-status.test.ts src/ai/huginn-trace.test.ts src/db/ src/core/topic-commands.test.ts src/core/mcp-env-snapshot.test.ts src/core/tool-spans.test.ts src/core/process-error.test.ts src/core/search-trace-spans.test.ts src/chat/state.test.ts src/chat/chat-config.test.ts src/chat/views/components/ src/dashboard/routes/route-utils.test.ts src/startup/adapter-audit.test.ts src/voice/tts.test.ts && bun test src/scheduler/executor.test.ts && bun test src/core/message-processor.test.ts && bun test src/ai/prompt-builder.test.ts src/ai/executor.test.ts src/bot/handler.test.ts src/bot/middleware.test.ts src/slack/handler.test.ts src/memory/ src/scheduler/detector.test.ts src/scheduler/briefing-prompt.test.ts src/watchers/ src/goals/detector.test.ts src/dashboard/agent-status.test.ts src/dashboard/activity-log.test.ts src/dashboard/views/components/ && bun test src/tracing/tracer.test.ts", - "test:unit": "bun test src/utils/ src/ai/result-parser.test.ts src/ai/stream-parser.test.ts src/ai/tool-restrictions.test.ts src/ai/knowledge-search.test.ts src/ai/mcp-status.test.ts src/ai/huginn-trace.test.ts src/slack/slack-format.test.ts src/bot/telegram-format.test.ts src/bot/topic-commands.test.ts src/bots/config.test.ts src/chat/views/components/ src/dashboard/routes/route-utils.test.ts src/dashboard/agent-status.test.ts src/dashboard/activity-log.test.ts src/dashboard/views/components/ src/watchers/runner.test.ts src/goals/detector.test.ts 
src/startup/adapter-audit.test.ts src/core/mcp-env-snapshot.test.ts src/core/tool-spans.test.ts src/core/process-error.test.ts src/core/search-trace-spans.test.ts && bun test src/tracing/tracer.test.ts", + "test": "bun test src/utils/ src/bot/telegram-format.test.ts src/bot/topic-commands.test.ts src/slack/slack-format.test.ts src/ai/result-parser.test.ts src/ai/stream-parser.test.ts src/ai/tool-restrictions.test.ts src/ai/knowledge-search.test.ts src/ai/knowledge-search-client.test.ts src/ai/knowledge-grader.test.ts src/ai/corrective-retrieval.test.ts src/ai/corrective-config.test.ts src/ai/connectors/corrective-hook.test.ts src/ai/mcp-status.test.ts src/ai/huginn-trace.test.ts src/db/ src/core/topic-commands.test.ts src/core/mcp-env-snapshot.test.ts src/core/tool-spans.test.ts src/core/process-error.test.ts src/core/search-trace-spans.test.ts src/core/corrective-trace-spans.test.ts src/chat/state.test.ts src/chat/chat-config.test.ts src/chat/views/components/ src/dashboard/routes/route-utils.test.ts src/startup/adapter-audit.test.ts src/voice/tts.test.ts && bun test src/scheduler/executor.test.ts && bun test src/core/message-processor.test.ts && bun test src/ai/prompt-builder.test.ts src/ai/executor.test.ts src/bot/handler.test.ts src/bot/middleware.test.ts src/slack/handler.test.ts src/memory/ src/scheduler/detector.test.ts src/scheduler/briefing-prompt.test.ts src/watchers/ src/goals/detector.test.ts src/dashboard/agent-status.test.ts src/dashboard/activity-log.test.ts src/dashboard/views/components/ && bun test src/tracing/tracer.test.ts", + "test:unit": "bun test src/utils/ src/ai/result-parser.test.ts src/ai/stream-parser.test.ts src/ai/tool-restrictions.test.ts src/ai/knowledge-search.test.ts src/ai/knowledge-search-client.test.ts src/ai/knowledge-grader.test.ts src/ai/corrective-retrieval.test.ts src/ai/corrective-config.test.ts src/ai/connectors/corrective-hook.test.ts src/ai/mcp-status.test.ts src/ai/huginn-trace.test.ts src/slack/slack-format.test.ts 
src/bot/telegram-format.test.ts src/bot/topic-commands.test.ts src/bots/config.test.ts src/chat/views/components/ src/dashboard/routes/route-utils.test.ts src/dashboard/agent-status.test.ts src/dashboard/activity-log.test.ts src/dashboard/views/components/ src/watchers/runner.test.ts src/goals/detector.test.ts src/startup/adapter-audit.test.ts src/core/mcp-env-snapshot.test.ts src/core/tool-spans.test.ts src/core/process-error.test.ts src/core/search-trace-spans.test.ts src/core/corrective-trace-spans.test.ts && bun test src/tracing/tracer.test.ts", "test:db": "bun test src/db/", "test:handlers": "bun test src/core/message-processor.test.ts src/ai/prompt-builder.test.ts src/ai/executor.test.ts src/bot/handler.test.ts src/bot/middleware.test.ts src/slack/handler.test.ts src/memory/ src/scheduler/detector.test.ts src/scheduler/briefing-prompt.test.ts src/watchers/ src/goals/detector.test.ts src/chat/state.test.ts src/voice/tts.test.ts", "test:integration": "bun test src/chat/integration.test.ts", diff --git a/src/ai/CLAUDE.md b/src/ai/CLAUDE.md index a614f54..04b2d42 100644 --- a/src/ai/CLAUDE.md +++ b/src/ai/CLAUDE.md @@ -15,7 +15,11 @@ | `json-extract.ts` | Extract JSON objects from mixed text output | | `haiku-extraction.ts` | Shared Haiku executor for async extraction tasks (memories, goals, tasks) | | `huginn-trace.ts` | Inline-fence Huginn trace handling (legacy mode) — `parseHuginnTrace`, `extractMcpResultText`, oversized-CLI-divert recovery | -| `huginn-trace-pointer.ts` | Phase 2 out-of-band trace channel — parses `huginn-trace-url:` line and fetches the trace from Huginn's `/api/trace/` endpoint. Preferred when `HUGINN_TRACE_POINTER=1` is set on Huginn. Also exports `processMcpToolResult()` — the unwrap → peel → fetch pipeline connectors run on every tool result | +| `huginn-trace-pointer.ts` | Phase 2 out-of-band trace channel — parses `huginn-trace-url:` line and fetches the trace from Huginn's `/api/trace/` endpoint. 
Preferred when `HUGINN_TRACE_POINTER=1` is set on Huginn. Also exports `processMcpToolResult()` — the unwrap → peel → fetch pipeline connectors run on every tool result — and `peelTraceMarkerForRewrite()` for connectors that rewrite a tool result and need to re-append the trace marker | +| `knowledge-grader.ts` | CRAG-lite retrieval evaluator — an awaiting Haiku call that grades knowledge-search results (`correct`/`ambiguous`/`insufficient`) and proposes a rewritten query/collection. Fail-soft to `correct`. | +| `corrective-retrieval.ts` | Corrective grade-and-requery orchestrator — `runCorrectiveRetrieval()`: grade → bounded re-query Huginn → merge+dedupe → consolidated text + `corrective` metadata. ≤1 retry (configurable to 2), non-recursive. | +| `knowledge-search-client.ts` | HTTP client for Huginn's `/api/search` + a renderer mirroring the MCP adapter's result format, used by the corrective re-query path. | +| `corrective-config.ts` | Resolves the per-bot corrective-retrieval toggle + retry budget (kill-switch > per-bot config.json > global env defaults). 
| | `connectors/` | Three connector implementations (see below) | ## Connector Abstraction diff --git a/src/ai/connectors/copilot-sdk.ts b/src/ai/connectors/copilot-sdk.ts index 1b05ed5..72fe97d 100644 --- a/src/ai/connectors/copilot-sdk.ts +++ b/src/ai/connectors/copilot-sdk.ts @@ -1,17 +1,20 @@ -import { CopilotClient, approveAll, type SessionEvent, type CustomAgentConfig } from "@github/copilot-sdk"; +import { CopilotClient, approveAll, type SessionEvent, type SessionConfig, type CustomAgentConfig, type ToolResultObject } from "@github/copilot-sdk"; import type { Config } from "../../config.ts"; import type { BotConfig } from "../../bots/config.ts"; import type { ClaudeExecResult } from "../executor.ts"; import type { StreamProgressCallback } from "../stream-parser.ts"; import { formatToolDisplayName, isReportIntentTool, extractIntentText } from "../stream-parser.ts"; import { truncateOutput } from "../truncate-output.ts"; -import { processMcpToolResult } from "../huginn-trace-pointer.ts"; -import type { ToolCall } from "../../types.ts"; +import { processMcpToolResult, peelTraceMarkerForRewrite } from "../huginn-trace-pointer.ts"; +import type { CorrectiveToolMeta, ToolCall } from "../../types.ts"; import { parseMcpConfig } from "./copilot-mcp.ts"; import { preflightMcpForRequest } from "../mcp-status.ts"; import { getLog } from "../../logging.ts"; import { resolve } from "node:path"; import { discoverSerenaConfigs } from "../../serena/config.ts"; +import { isKnowledgeSearchTool } from "../tool-status.ts"; +import { resolveCorrectiveConfig } from "../corrective-config.ts"; +import { runCorrectiveRetrieval, type CorrectiveMetadata, type CorrectiveRetrievalContext } from "../corrective-retrieval.ts"; const log = getLog("ai", "copilot-sdk"); @@ -80,6 +83,43 @@ export async function executePrompt( // Build custom subagents (e.g. 
verify-code for grep/diff verification) const customAgents = buildCustomAgents(botConfig); + // Corrective retrieval (CRAG-lite): when enabled for this bot, an onPostToolUse + // hook grades each knowledge-search result with Haiku and, if it's weak, does a + // bounded re-query — splicing the fresh hits into the result before the model + // sees it. Off by default (see src/ai/corrective-config.ts); when off, the hook + // isn't registered at all and behaviour is byte-identical to before. + const correctiveCfg = resolveCorrectiveConfig(botConfig); + const correctiveOutcomes: CorrectiveMetadata[] = []; + const correctiveEnabled = correctiveCfg.enabled && hasMcp; + const userQuestion = correctiveEnabled ? extractUserQuestion(prompt) : ""; + const correctiveHooks: SessionConfig["hooks"] | undefined = correctiveEnabled + ? { + onPostToolUse: async (input) => { + if (!isKnowledgeSearchTool(input.toolName)) return; + try { + const result = await applyCorrectiveRetrieval({ + toolName: input.toolName, + toolArgs: input.toolArgs, + toolResult: input.toolResult, + botConfig, + budget: correctiveCfg.retryBudget, + userQuestion, + }); + if (result) { + correctiveOutcomes.push(result.metadata); + if (result.modifiedResult) return { modifiedResult: result.modifiedResult }; + } + } catch (e) { + log.warn("Corrective retrieval hook failed: {error}", { + botName: botConfig.name, + error: e instanceof Error ? e.message : String(e), + }); + } + return; + }, + } + : undefined; + // Create session per request (system prompt is dynamic — memories, goals, history change per message) const session = await cl.createSession({ model, @@ -92,6 +132,7 @@ export async function executePrompt( ...(hasMcp ? { mcpServers } : {}), ...(customAgents.length > 0 ? { customAgents } : {}), ...(botConfig.excludedTools?.length ? { excludedTools: botConfig.excludedTools } : {}), + ...(correctiveHooks ? 
 { hooks: correctiveHooks } : {}), }); // Track tool calls for waterfall @@ -259,6 +300,11 @@ export async function executePrompt( const wallClockMs = performance.now() - wallStart; const content = response?.data?.content ?? ""; + // Attach corrective-retrieval metadata to the matching knowledge-search tool + // calls so the traces waterfall can synthesize knowledge_grade / knowledge_requery + // spans (onPostToolUse gives no toolCallId, so this matches by tool order). + if (correctiveOutcomes.length > 0) attachCorrectiveOutcomes(toolCalls, correctiveOutcomes); + return { result: content, costUsd: 0, // Copilot SDK doesn't report cost (subscription model) @@ -332,3 +378,109 @@ function abbreviateInput(args: unknown): string | undefined { const json = JSON.stringify(args); return json.length > 500 ? json.slice(0, 500) + "…" : json; } + +// ── Corrective retrieval (CRAG-lite) helpers ─────────────────────────────── + +export interface ApplyCorrectiveArgs { + toolName: string; + toolArgs: unknown; + toolResult: ToolResultObject; + botConfig: Pick<BotConfig, "name" | "dir">; + budget: number; + userQuestion: string; + /** Injectable for tests — forwarded to {@link runCorrectiveRetrieval}. */ + searchFn?: CorrectiveRetrievalContext["searchFn"]; + gradeFn?: CorrectiveRetrievalContext["gradeFn"]; +} + +/** + * Run the corrective grade-and-requery pass on a knowledge-search tool result. + * Returns `null` when there's nothing to act on (empty result, tool error); + * otherwise always returns the `metadata` (for tracing) and, when results were + * merged in, a `modifiedResult` to hand back to the model. The trailing Huginn + * trace marker, if any, is peeled off the body before splicing and re-appended + * after, so downstream trace extraction is unaffected. 
 + */ +export async function applyCorrectiveRetrieval( + args: ApplyCorrectiveArgs, +): Promise<{ modifiedResult?: ToolResultObject; metadata: CorrectiveMetadata } | null> { + const { toolResult, toolArgs, botConfig, budget, userQuestion } = args; + const originalText = toolResult?.textResultForLlm; + if (typeof originalText !== "string" || originalText.length === 0) return null; + // Tool errors (server down, bad collection) carry an `error` field — don't + // grade those; the model handles the error itself. + if (toolResult.resultType && toolResult.resultType !== "success") return null; + + const { body, remainder } = peelTraceMarkerForRewrite(originalText); + + const argObj = toolArgs && typeof toolArgs === "object" ? (toolArgs as Record<string, unknown>) : {}; + const originalQuery = typeof argObj.query === "string" ? argObj.query.trim() : ""; + const originalCollections = normalizeCollections(argObj.collection); + + const outcome = await runCorrectiveRetrieval({ + question: userQuestion || originalQuery, + originalQuery, + originalCollections, + originalResultText: body, + budget, + botName: botConfig.name, + cwd: botConfig.dir, + log, + graderTimeoutMs: 30_000, + searchFn: args.searchFn, + gradeFn: args.gradeFn, + }); + + if (!outcome.changed) return { metadata: outcome.metadata }; + + return { + metadata: outcome.metadata, + modifiedResult: { ...toolResult, textResultForLlm: outcome.text + remainder }, + }; +} + +function normalizeCollections(v: unknown): string[] | undefined { + if (typeof v === "string" && v.trim()) return [v.trim()]; + if (Array.isArray(v)) { + const arr = v.filter((x): x is string => typeof x === "string" && x.trim().length > 0); + return arr.length > 0 ? arr : undefined; + } + return undefined; +} + +/** Pull the current user turn out of the assembled prompt for grading. The + * prompt-builder puts history in a `<conversation_history>` block followed by + * the current message, so everything after the last close tag is the turn. + * Capped so the grader prompt stays cheap. 
 */ +export function extractUserQuestion(prompt: string): string { + const closeTag = "</conversation_history>"; + const idx = prompt.lastIndexOf(closeTag); + const tail = idx !== -1 ? prompt.slice(idx + closeTag.length) : prompt; + const trimmed = tail.trim(); + return trimmed.length > 1500 ? trimmed.slice(-1500).trim() : trimmed; +} + +/** Attach corrective outcomes to the knowledge-search tool calls in order + * (onPostToolUse exposes no toolCallId, so the i-th outcome maps to the i-th + * knowledge-search tool call). */ +export function attachCorrectiveOutcomes(toolCalls: ToolCall[], outcomes: CorrectiveMetadata[]): void { + let i = 0; + for (const tc of toolCalls) { + if (i >= outcomes.length) break; + if (!isKnowledgeSearchTool(tc.name)) continue; + tc.corrective = correctiveMetaToToolMeta(outcomes[i++]!); + } +} + +function correctiveMetaToToolMeta(m: CorrectiveMetadata): CorrectiveToolMeta { + return { + retries: m.retries, + verdicts: m.verdicts, + reasons: m.reasons, + queriesTried: m.queriesTried, + collectionsTried: m.collectionsTried.map((c) => c ?? 
 null), + finalVerdict: m.finalVerdict, + graderMs: m.graderMs, + requeryMs: m.requeryMs, + }; +} diff --git a/src/ai/connectors/corrective-hook.test.ts b/src/ai/connectors/corrective-hook.test.ts new file mode 100644 index 0000000..56b66c1 --- /dev/null +++ b/src/ai/connectors/corrective-hook.test.ts @@ -0,0 +1,139 @@ +import { test, expect, describe } from "bun:test"; +import { applyCorrectiveRetrieval, extractUserQuestion, attachCorrectiveOutcomes } from "./copilot-sdk.ts"; +import type { CorrectiveMetadata } from "../corrective-retrieval.ts"; +import type { ToolCall } from "../../types.ts"; +import type { KnowledgeGrade } from "../knowledge-grader.ts"; + +describe("extractUserQuestion", () => { + test("returns the current turn after the conversation_history block", () => { + const prompt = "<conversation_history>\nuser: hi\nassistant: hello\n</conversation_history>\n\nWhat SEDs belong to LA_BUC_02?"; + expect(extractUserQuestion(prompt)).toBe("What SEDs belong to LA_BUC_02?"); + }); + + test("returns the whole prompt when there's no history block", () => { + expect(extractUserQuestion("just a question")).toBe("just a question"); + }); + + test("caps very long tails to the last 1500 chars", () => { + const long = "x".repeat(5000); + const out = extractUserQuestion(long); + expect(out.length).toBe(1500); + }); +}); + +describe("attachCorrectiveOutcomes", () => { + function tc(name: string): ToolCall { + return { id: name, name, displayName: name, durationMs: 1, startOffsetMs: 0 }; + } + function meta(finalVerdict: string): CorrectiveMetadata { + return { retries: 1, verdicts: ["insufficient", finalVerdict] as KnowledgeGrade["verdict"][], reasons: ["x", "y"], queriesTried: ["q"], collectionsTried: [undefined], finalVerdict: finalVerdict as KnowledgeGrade["verdict"], graderMs: 100, requeryMs: [50] }; + } + + test("maps the i-th outcome to the i-th knowledge-search tool call, skipping others", () => { + const calls = [tc("knowledge-search_knowledge"), tc("yggdrasil-symbol_context"), 
tc("knowledge-search_knowledge")]; + attachCorrectiveOutcomes(calls, [meta("correct"), meta("ambiguous")]); + expect(calls[0]!.corrective?.finalVerdict).toBe("correct"); + expect(calls[1]!.corrective).toBeUndefined(); + expect(calls[2]!.corrective?.finalVerdict).toBe("ambiguous"); + expect(calls[2]!.corrective?.collectionsTried).toEqual([null]); + }); + + test("no-op when there are no outcomes", () => { + const calls = [tc("knowledge-search_knowledge")]; + attachCorrectiveOutcomes(calls, []); + expect(calls[0]!.corrective).toBeUndefined(); + }); +}); + +describe("applyCorrectiveRetrieval", () => { + const botConfig = { name: "test", dir: "/tmp/test-bot" }; + const okGrade: KnowledgeGrade = { verdict: "correct", reason: "covered" }; + + function grader(...grades: KnowledgeGrade[]) { + let i = 0; + return async () => grades[Math.min(i++, grades.length - 1)]!; + } + + test("returns null for a tool error result", async () => { + const out = await applyCorrectiveRetrieval({ + toolName: "knowledge-search_knowledge", + toolArgs: { query: "x" }, + toolResult: { textResultForLlm: "Knowledge API server is not running", resultType: "failure" }, + botConfig, + budget: 1, + userQuestion: "q", + gradeFn: grader(okGrade), + searchFn: async () => ({ results: [] }), + }); + expect(out).toBeNull(); + }); + + test("returns null for an empty result", async () => { + const out = await applyCorrectiveRetrieval({ + toolName: "knowledge-search_knowledge", + toolArgs: { query: "x" }, + toolResult: { textResultForLlm: "", resultType: "success" }, + botConfig, + budget: 1, + userQuestion: "q", + gradeFn: grader(okGrade), + searchFn: async () => ({ results: [] }), + }); + expect(out).toBeNull(); + }); + + test("verdict 'correct' → metadata only, no modifiedResult", async () => { + const out = await applyCorrectiveRetrieval({ + toolName: "knowledge-search_knowledge", + toolArgs: { query: "x", collection: "wiki" }, + toolResult: { textResultForLlm: "## Doc (80% relevant · high)\ncollection: 
`wiki` doc_id: `1`\n\nbody", resultType: "success" }, + botConfig, + budget: 1, + userQuestion: "q", + gradeFn: grader(okGrade), + searchFn: async () => ({ results: [] }), + }); + expect(out).not.toBeNull(); + expect(out!.modifiedResult).toBeUndefined(); + expect(out!.metadata.retries).toBe(0); + }); + + test("low-confidence result → exactly one re-query, merged, trace fence preserved at the end", async () => { + const original = + "## Old doc (15% relevant · low)\ncollection: `wiki` doc_id: `1`\n\nweak body\n\n```huginn-trace\n{\"schemaVersion\":1,\"totalMs\":42}\n```"; + let searchCalls = 0; + const out = await applyCorrectiveRetrieval({ + toolName: "knowledge-search_knowledge", + toolArgs: { query: "LA_BUC_02", collection: "wiki" }, + toolResult: { textResultForLlm: original, resultType: "success" }, + botConfig, + budget: 1, + userQuestion: "what SEDs belong to LA_BUC_02?", + gradeFn: grader( + { verdict: "insufficient", rewrittenQuery: "LA_BUC_02 structured electronic documents", reason: "off-topic" }, + { verdict: "correct", reason: "now covered" }, + ), + searchFn: async (query: string) => { + searchCalls++; + expect(query).toBe("LA_BUC_02 structured electronic documents"); + return { + results: [ + { collection: "wiki", id: "1", title: "Old doc", relevance: 0.7, confidenceBand: "high", matchedChunks: [{ content: "x" }] }, // dupe + { collection: "wiki", id: "2", title: "Right doc", relevance: 0.8, confidenceBand: "high", matchedChunks: [{ content: "the answer" }] }, + ], + }; + }, + }); + expect(searchCalls).toBe(1); + expect(out).not.toBeNull(); + expect(out!.modifiedResult).toBeDefined(); + const text = out!.modifiedResult!.textResultForLlm; + expect(text).toContain("Old doc"); + expect(text).toContain("Right doc"); + expect(text).toContain("[corrective retrieval — re-query #1"); + expect(text.match(/doc_id: `1`/g)?.length).toBe(1); // dupe dropped + expect(text.trimEnd().endsWith("```")).toBe(true); // trace fence re-appended at the very end + 
expect(text).toContain("\"schemaVersion\":1"); + expect(out!.metadata.queriesTried).toEqual(["LA_BUC_02 structured electronic documents"]); + }); +}); diff --git a/src/ai/corrective-config.test.ts b/src/ai/corrective-config.test.ts new file mode 100644 index 0000000..b3996fa --- /dev/null +++ b/src/ai/corrective-config.test.ts @@ -0,0 +1,48 @@ +import { test, expect, describe } from "bun:test"; +import { resolveCorrectiveConfig, clampBudget } from "./corrective-config.ts"; + +describe("clampBudget", () => { + test("clamps to the 1–2 range and floors", () => { + expect(clampBudget(0)).toBe(1); + expect(clampBudget(1)).toBe(1); + expect(clampBudget(2)).toBe(2); + expect(clampBudget(5)).toBe(2); + expect(clampBudget(1.9)).toBe(1); + expect(clampBudget(NaN)).toBe(1); + }); +}); + +describe("resolveCorrectiveConfig", () => { + test("off by default when nothing is configured", () => { + expect(resolveCorrectiveConfig({}, {})).toEqual({ enabled: false, retryBudget: 1 }); + }); + + test("per-bot config enables it and clamps the budget", () => { + expect(resolveCorrectiveConfig({ correctiveRetrieval: { enabled: true, retryBudget: 9 } }, {})).toEqual({ + enabled: true, + retryBudget: 2, + }); + }); + + test("global env default enables it when the bot doesn't say otherwise", () => { + const env = { CORRECTIVE_RETRIEVAL_ENABLED: "true", CORRECTIVE_RETRIEVAL_BUDGET: "2" }; + expect(resolveCorrectiveConfig({}, env)).toEqual({ enabled: true, retryBudget: 2 }); + }); + + test("per-bot config overrides the global default (disable wins too)", () => { + const env = { CORRECTIVE_RETRIEVAL_ENABLED: "true" }; + expect(resolveCorrectiveConfig({ correctiveRetrieval: { enabled: false } }, env).enabled).toBe(false); + }); + + test("kill-switch overrides everything", () => { + const env = { CORRECTIVE_RETRIEVAL_DISABLED: "1", CORRECTIVE_RETRIEVAL_ENABLED: "true" }; + expect(resolveCorrectiveConfig({ correctiveRetrieval: { enabled: true, retryBudget: 2 } }, env)).toEqual({ + enabled: false, + 
/**
 * Resolved per-bot corrective-retrieval settings.
 */
export interface ResolvedCorrectiveConfig {
  enabled: boolean;
  /** Max corrective re-queries per knowledge search (1 or 2). */
  retryBudget: number;
}

/**
 * Resolve the effective corrective-retrieval settings for one bot.
 *
 * Precedence, highest first:
 *   1. `CORRECTIVE_RETRIEVAL_DISABLED` kill-switch (`1` or `true`) → always off.
 *   2. The bot's own `correctiveRetrieval` block.
 *   3. The global env defaults (`CORRECTIVE_RETRIEVAL_ENABLED` /
 *      `CORRECTIVE_RETRIEVAL_BUDGET`).
 *
 * @param botConfig Only the `correctiveRetrieval` property is read.
 * @param env Environment to read from; defaults to `process.env` and is
 *   injectable so tests can pass a plain object.
 * @returns The enabled flag plus a retry budget clamped to 1–2.
 */
export function resolveCorrectiveConfig(
  botConfig: Pick<BotConfig, "correctiveRetrieval">,
  env: NodeJS.ProcessEnv = process.env,
): ResolvedCorrectiveConfig {
  if (isKillSwitchOn(env.CORRECTIVE_RETRIEVAL_DISABLED)) {
    return { enabled: false, retryBudget: 1 };
  }

  const bot = botConfig.correctiveRetrieval;
  const globalEnabled = env.CORRECTIVE_RETRIEVAL_ENABLED === "true";
  // `??` (not `||`) so an explicit per-bot `enabled: false` beats a global "true".
  const enabled = bot?.enabled ?? globalEnabled;

  const rawBudget = bot?.retryBudget ?? parseBudgetEnv(env.CORRECTIVE_RETRIEVAL_BUDGET) ?? 1;
  return { enabled, retryBudget: clampBudget(rawBudget) };
}

/**
 * Clamp a requested retry budget to the supported 1–2 range.
 * Fractions are floored; non-finite input (NaN, ±Infinity) yields 1.
 */
export function clampBudget(n: number): number {
  if (!Number.isFinite(n)) return 1;
  return Math.max(1, Math.min(2, Math.floor(n)));
}

/**
 * True when the kill-switch variable is set to an "on" value. Accepts `1`
 * and, case-insensitively, `true` — the sibling enable flag is spelled
 * `true`, so a kill-switch that only honoured `1` would silently ignore the
 * most likely alternative spelling.
 */
function isKillSwitchOn(raw: string | undefined): boolean {
  if (raw === undefined) return false;
  const value = raw.trim().toLowerCase();
  return value === "1" || value === "true";
}

/** Parse the global budget env var; unset, empty or non-numeric → `undefined`. */
function parseBudgetEnv(raw: string | undefined): number | undefined {
  if (!raw) return undefined;
  const n = parseInt(raw, 10);
  return Number.isNaN(n) ? undefined : n;
}
*/ +function searchSequence(...responses: KnowledgeSearchResponse[]) { + const calls: { query: string; collections?: string[] }[] = []; + let i = 0; + const fn = async (query: string, opts?: { collections?: string[] }) => { + calls.push({ query, collections: opts?.collections }); + return responses[Math.min(i++, responses.length - 1)]!; + }; + return Object.assign(fn, { calls }); +} + +const baseCtx = { + question: "what SEDs belong to LA_BUC_02?", + originalQuery: "LA_BUC_02", + botName: "test", + log, +}; + +describe("runCorrectiveRetrieval", () => { + test("verdict 'correct' → no re-query, text unchanged", async () => { + const search = searchSequence(searchResponse([result({ id: "1", collection: "wiki" })])); + const out = await runCorrectiveRetrieval({ + ...baseCtx, + originalResultText: "## Original (80% relevant · high)\ncollection: `wiki` doc_id: `1`\n\nbody", + budget: 1, + gradeFn: gradeSequence({ verdict: "correct", reason: "covered" }), + searchFn: search, + }); + expect(out.changed).toBe(false); + expect(out.text).toContain("## Original"); + expect(out.metadata.retries).toBe(0); + expect(out.metadata.verdicts).toEqual(["correct"]); + expect(out.metadata.queriesTried).toEqual([]); + expect(search.calls.length).toBe(0); + }); + + test("insufficient → one re-query → merged & deduped, then correct", async () => { + const original = renderSearchResults([result({ id: "1", collection: "wiki", title: "Old doc" })]); + const search = searchSequence( + searchResponse([ + result({ id: "1", collection: "wiki", title: "Old doc" }), // dupe — must be dropped + result({ id: "2", collection: "wiki", title: "Fresh doc" }), + ]), + ); + const out = await runCorrectiveRetrieval({ + ...baseCtx, + originalResultText: original, + budget: 1, + gradeFn: gradeSequence( + { verdict: "insufficient", rewrittenQuery: "LA_BUC_02 structured electronic documents", reason: "off-topic" }, + { verdict: "correct", reason: "now covered" }, + ), + searchFn: search, + }); + 
expect(out.changed).toBe(true); + expect(out.text).toContain("Old doc"); // original kept + expect(out.text).toContain("Fresh doc"); // fresh appended + expect(out.text).toContain("[corrective retrieval — re-query #1"); + // doc_id `1` appears once (original) — the dupe from the re-query was dropped. + expect(out.text.match(/doc_id: `1`/g)?.length).toBe(1); + expect(out.metadata.retries).toBe(1); + expect(out.metadata.verdicts).toEqual(["insufficient", "correct"]); + expect(out.metadata.finalVerdict).toBe("correct"); + expect(out.metadata.queriesTried).toEqual(["LA_BUC_02 structured electronic documents"]); + expect(search.calls[0]?.query).toBe("LA_BUC_02 structured electronic documents"); + }); + + test("budget 1 stops after one re-query even if still insufficient", async () => { + const search = searchSequence(searchResponse([result({ id: "9", collection: "wiki", title: "Marginal" })])); + const out = await runCorrectiveRetrieval({ + ...baseCtx, + originalResultText: renderSearchResults([result({ id: "1", collection: "wiki" })]), + budget: 1, + gradeFn: gradeSequence({ verdict: "insufficient", rewrittenQuery: "broader terms", reason: "weak" }), + searchFn: search, + }); + expect(out.metadata.retries).toBe(1); + expect(out.metadata.verdicts).toEqual(["insufficient", "insufficient"]); + expect(out.metadata.finalVerdict).toBe("insufficient"); + expect(search.calls.length).toBe(1); + }); + + test("budget is clamped to 2 even when configured higher", async () => { + const search = searchSequence( + searchResponse([result({ id: "a", collection: "wiki" })]), + searchResponse([result({ id: "b", collection: "wiki" })]), + searchResponse([result({ id: "c", collection: "wiki" })]), + ); + const out = await runCorrectiveRetrieval({ + ...baseCtx, + originalResultText: renderSearchResults([result({ id: "1", collection: "wiki" })]), + budget: 5, + gradeFn: gradeSequence( + { verdict: "insufficient", rewrittenQuery: "q1", reason: "x" }, + { verdict: "insufficient", 
rewrittenQuery: "q2", reason: "x" }, + { verdict: "insufficient", rewrittenQuery: "q3", reason: "x" }, + ), + searchFn: search, + }); + expect(out.metadata.retries).toBe(2); + expect(search.calls.map((c) => c.query)).toEqual(["q1", "q2"]); + }); + + test("re-query throws → loop stops, original unchanged", async () => { + const out = await runCorrectiveRetrieval({ + ...baseCtx, + originalResultText: "## Original\ncollection: `wiki` doc_id: `1`", + budget: 1, + gradeFn: gradeSequence({ verdict: "insufficient", rewrittenQuery: "q", reason: "weak" }), + searchFn: async () => { throw new Error("knowledge api down"); }, + }); + expect(out.changed).toBe(false); + expect(out.metadata.retries).toBe(0); + expect(out.metadata.verdicts).toEqual(["insufficient"]); + }); + + test("re-query returns only duplicates → no append, but retry recorded", async () => { + const original = renderSearchResults([result({ id: "1", collection: "wiki" })]); + const search = searchSequence(searchResponse([result({ id: "1", collection: "wiki" })])); + const out = await runCorrectiveRetrieval({ + ...baseCtx, + originalResultText: original, + budget: 1, + gradeFn: gradeSequence({ verdict: "ambiguous", rewrittenQuery: "rephrased", reason: "broad" }), + searchFn: search, + }); + expect(out.changed).toBe(false); + expect(out.metadata.retries).toBe(1); + expect(out.metadata.queriesTried).toEqual(["rephrased"]); + }); + + test("no rewritten query and no footer hints → no re-query", async () => { + const search = searchSequence(searchResponse([result({ id: "x", collection: "wiki" })])); + const out = await runCorrectiveRetrieval({ + ...baseCtx, + originalResultText: "## Original\ncollection: `wiki` doc_id: `1`", + budget: 1, + gradeFn: gradeSequence({ verdict: "insufficient", reason: "nothing on topic" }), + searchFn: search, + }); + expect(out.changed).toBe(false); + expect(out.metadata.retries).toBe(0); + expect(search.calls.length).toBe(0); + }); + + test("falls back to broaderQuery parsed from the 
result footer", async () => { + const original = + "## Original (12% relevant · low)\ncollection: `wiki` doc_id: `1`\n\n*No confident match — try: broader query: \"LA_BUC concepts\"*"; + const search = searchSequence(searchResponse([result({ id: "2", collection: "wiki", title: "Wider hit" })])); + const out = await runCorrectiveRetrieval({ + ...baseCtx, + originalResultText: original, + budget: 1, + gradeFn: gradeSequence( + { verdict: "insufficient", reason: "weak" }, // no rewrittenQuery — must use the footer hint + { verdict: "correct", reason: "ok" }, + ), + searchFn: search, + }); + expect(search.calls[0]?.query).toBe("LA_BUC concepts"); + expect(out.text).toContain("Wider hit"); + }); + + test("suggestedCollection redirects the re-query scope", async () => { + const search = searchSequence(searchResponse([result({ id: "2", collection: "confluence", title: "Conf doc" })])); + const out = await runCorrectiveRetrieval({ + ...baseCtx, + originalQuery: "LA_BUC_02", + originalCollections: ["wiki"], + originalResultText: renderSearchResults([result({ id: "1", collection: "wiki" })]), + budget: 1, + gradeFn: gradeSequence( + { verdict: "ambiguous", rewrittenQuery: "LA_BUC_02 details", suggestedCollection: "confluence", reason: "wrong collection" }, + { verdict: "correct", reason: "ok" }, + ), + searchFn: search, + }); + expect(search.calls[0]?.collections).toEqual(["confluence"]); + expect(out.changed).toBe(true); + }); + + test("grader unavailable (returns 'correct') → no change", async () => { + const search = searchSequence(searchResponse([result({ id: "x", collection: "wiki" })])); + const out = await runCorrectiveRetrieval({ + ...baseCtx, + originalResultText: "## Original", + budget: 1, + gradeFn: async () => ({ verdict: "correct", reason: "grader unavailable" }), + searchFn: search, + }); + expect(out.changed).toBe(false); + expect(out.metadata.retries).toBe(0); + expect(search.calls.length).toBe(0); + }); +}); diff --git a/src/ai/corrective-retrieval.ts 
b/src/ai/corrective-retrieval.ts new file mode 100644 index 0000000..925f7ad --- /dev/null +++ b/src/ai/corrective-retrieval.ts @@ -0,0 +1,240 @@ +import type { Logger } from "@logtape/logtape"; +import { gradeKnowledgeResults, type GradeVerdict, type KnowledgeGrade } from "./knowledge-grader.ts"; +import { + searchKnowledge, + renderSearchResults, + renderRetryHintsFooter, + extractDocKeysFromRenderedText, + parseQueryHintsFromFooter, + docKey, + type KnowledgeSearchResponse, +} from "./knowledge-search-client.ts"; + +/** + * CRAG-lite corrective loop around the knowledge search tool. After a bot's + * `search_knowledge` call returns, this: + * + * 1. Grades the result with Haiku ({@link gradeKnowledgeResults}). + * 2. If the verdict is "ambiguous" / "insufficient" and the retry budget + * isn't spent, re-queries Huginn's `/api/search` with the grader's + * rewritten query (falling back to the Phase-0 `retryHints.broaderQuery` / + * `narrowerQuery` parsed from the result footer), optionally redirected to + * a `suggestedCollection`, forcing `rerank=true` so the re-query's + * `confidenceBand`s are trustworthy. + * 3. Merges the fresh hits into the original result text — deduped against + * it by `collection/doc_id` — with an inline note explaining the retry. + * 4. Optionally re-grades and retries again, up to the (clamped 1–2) budget; + * never recursive. + * + * Returns the consolidated text to feed the model plus a `corrective` metadata + * block for tracing (`{retries, verdicts, reasons, queriesTried, finalVerdict}`). + * + * Fail-soft throughout: a grader that can't be reached returns "correct" (no + * change); a re-query HTTP error ends the loop with whatever's accumulated. The + * caller is expected to gate on the per-bot toggle — this function assumes the + * feature is enabled and `budget >= 1`. + * + * Plan: `../mimir/plans/huginn-muninn-corrective-rag.md` (Phase 1). 
+ */ + +export interface CorrectiveMetadata { + /** Number of re-queries actually issued (0–budget). */ + retries: number; + /** Grader verdict from each grading pass, in order (length = retries + 1). */ + verdicts: GradeVerdict[]; + /** Grader reason from each grading pass, parallel to `verdicts`. */ + reasons: string[]; + /** The re-query strings actually issued (excludes the original query). */ + queriesTried: string[]; + /** Collections each re-query was scoped to (parallel to `queriesTried`); + * `undefined` entry = searched all collections. */ + collectionsTried: (string[] | undefined)[]; + /** The verdict from the last grading pass — i.e. whether the corrective + * pass left the result set in good shape. */ + finalVerdict: GradeVerdict; + /** Total wall time spent in the Haiku grader across all passes, ms. */ + graderMs: number; + /** Wall time of each re-query HTTP call, parallel to `queriesTried`, ms. */ + requeryMs: number[]; +} + +export interface CorrectiveOutcome { + /** Tool-result text to feed back to the model. Equal to `originalResultText` + * when nothing changed. */ + text: string; + /** True when `text` differs from `originalResultText` (i.e. results were + * merged in). */ + changed: boolean; + metadata: CorrectiveMetadata; +} + +export interface CorrectiveRetrievalContext { + /** The user's information need — used to grade relevance. Typically the + * current user turn (trimmed). */ + question: string; + /** The search query the model issued (from the tool call's args). Used to + * avoid re-issuing an identical query. */ + originalQuery: string; + /** Collection(s) the model restricted the original search to, if any. */ + originalCollections?: string[]; + /** The rendered, trace-marker-peeled tool result the model would otherwise + * see. */ + originalResultText: string; + /** Max re-queries. Clamped to [1, 2]. The caller gates on the per-bot + * toggle; this function only sees enabled invocations. 
/**
 * Run the grade → re-query → merge loop over one knowledge-search result.
 *
 * Each iteration grades `currentText`. The loop ends when the verdict is
 * "correct", the retry budget is spent, no unused retry query is available,
 * the re-query throws, or the re-query returns no document that is not
 * already present in the text. Otherwise the fresh hits are appended under an
 * explanatory note and the merged text is graded again.
 *
 * Note that `metadata.verdicts` gains one entry per grading pass, while
 * `metadata.retries` counts re-queries issued: a re-query that adds nothing
 * new is counted as a retry but is not followed by another grading pass.
 *
 * @param ctx Question, original query/result, retry budget and the
 *   injectable `searchFn` / `gradeFn` (defaulting to the real client/grader).
 * @returns The text to hand to the model, whether it differs from
 *   `ctx.originalResultText`, and the trace metadata.
 */
export async function runCorrectiveRetrieval(ctx: CorrectiveRetrievalContext): Promise<CorrectiveOutcome> {
  // Guard non-finite budgets before clamping: Math.floor/min/max all propagate
  // NaN, and `retries >= NaN` is always false, which would disable the hard
  // retry cap. Non-finite → 1, matching `clampBudget` in corrective-config.
  const requestedBudget = Number.isFinite(ctx.budget) ? Math.floor(ctx.budget) : 1;
  const budget = Math.max(1, Math.min(2, requestedBudget));
  const search = ctx.searchFn ?? searchKnowledge;
  const grade = ctx.gradeFn ?? gradeKnowledgeResults;
  const { question, originalQuery, originalResultText, botName, cwd, log } = ctx;

  let currentText = originalResultText;
  let currentCollections = ctx.originalCollections;
  let lastQuery = originalQuery;

  const verdicts: GradeVerdict[] = [];
  const reasons: string[] = [];
  const queriesTried: string[] = [];
  const collectionsTried: (string[] | undefined)[] = [];
  const requeryMs: number[] = [];
  let graderMs = 0;
  let retries = 0;

  for (;;) {
    let g: KnowledgeGrade;
    const gradeStart = performance.now();
    try {
      g = await grade({
        question,
        toolResultText: currentText,
        botName,
        cwd,
        log,
        model: ctx.graderModel,
        timeoutMs: ctx.graderTimeoutMs,
      });
    } catch (err) {
      // Fail-soft: a throwing grader is treated as "correct" so the loop ends
      // with whatever text has been accumulated so far.
      log.warn("corrective: grader threw — stopping with current results: {error}", {
        botName,
        error: err instanceof Error ? err.message : String(err),
      });
      g = { verdict: "correct", reason: "grader error" };
    }
    graderMs += performance.now() - gradeStart;
    verdicts.push(g.verdict);
    reasons.push(g.reason);

    if (g.verdict === "correct" || retries >= budget) break;

    const nextQuery = pickRetryQuery(g, currentText, { lastQuery, originalQuery, queriesTried });
    if (!nextQuery) break;

    // A grader-suggested collection replaces the current scope for this re-query.
    const collections = g.suggestedCollection ? [g.suggestedCollection] : currentCollections;

    let resp: KnowledgeSearchResponse;
    const requeryStart = performance.now();
    try {
      resp = await search(nextQuery, {
        collections,
        rerank: true,
        limit: 10,
        maxChunksPerDoc: 2,
      });
    } catch (err) {
      // A failed re-query is not counted as a retry and is not recorded.
      log.warn("corrective: re-query failed for {query} — stopping: {error}", {
        botName,
        query: nextQuery,
        error: err instanceof Error ? err.message : String(err),
      });
      break;
    }

    retries++;
    queriesTried.push(nextQuery);
    collectionsTried.push(collections);
    requeryMs.push(Math.round(performance.now() - requeryStart));
    lastQuery = nextQuery;

    // Dedupe against everything already in the text (original + earlier merges).
    const existing = extractDocKeysFromRenderedText(currentText);
    const fresh = resp.results.filter((r) => r.id && r.collection && !existing.has(docKey(r)));
    if (fresh.length === 0) {
      // The re-query surfaced nothing new (or nothing at all). Don't append a
      // confirmation block — keep the model's context clean. The trace still
      // records the attempt via `queriesTried`.
      log.info("corrective: re-query {query} added no new documents", { botName, query: nextQuery });
      break;
    }

    const note = buildCorrectiveNote({
      retryNum: retries,
      verdict: g.verdict,
      reason: g.reason,
      query: nextQuery,
      collections,
      freshCount: fresh.length,
    });
    currentText = `${currentText}\n\n---\n${note}\n\n${renderSearchResults(fresh)}${renderRetryHintsFooter(resp)}`;
    currentCollections = collections;
  }

  return {
    text: currentText,
    changed: currentText !== originalResultText,
    metadata: {
      retries,
      verdicts,
      reasons,
      queriesTried,
      collectionsTried,
      finalVerdict: verdicts[verdicts.length - 1] ?? "correct",
      graderMs: Math.round(graderMs),
      requeryMs,
    },
  };
}

/**
 * Choose the next re-query string: the grader's rewritten query first, then
 * the broader / narrower hints parsed from the result footer. Candidates are
 * trimmed; empty ones and any equal to a query already issued are skipped.
 * Returns `null` when nothing usable is left.
 */
function pickRetryQuery(
  grade: KnowledgeGrade,
  resultText: string,
  used: { lastQuery: string; originalQuery: string; queriesTried: string[] },
): string | null {
  const footer = parseQueryHintsFromFooter(resultText);
  const candidates = [grade.rewrittenQuery, footer.broaderQuery, footer.narrowerQuery]
    .map((q) => (typeof q === "string" ? q.trim() : ""))
    .filter((q) => q.length > 0);
  for (const q of candidates) {
    if (q === used.lastQuery || q === used.originalQuery || used.queriesTried.includes(q)) continue;
    return q;
  }
  return null;
}

/** Build the inline note that introduces a block of merged re-query results. */
function buildCorrectiveNote(args: {
  retryNum: number;
  verdict: GradeVerdict;
  reason: string;
  query: string;
  collections?: string[];
  freshCount: number;
}): string {
  const scope = args.collections?.length
    ? ` in collection${args.collections.length > 1 ? "s" : ""} ${args.collections.join(", ")}`
    : "";
  const plural = args.freshCount === 1 ? "result" : "results";
  return (
    `[corrective retrieval — re-query #${args.retryNum}: prior results graded "${args.verdict}" ` +
    `(${args.reason}); re-searched "${args.query}"${scope}; ${args.freshCount} additional ${plural} below, ` +
    `deduped against the results above]`
  );
}
/**
 * Separate a tool-result string from its trailing Huginn trace marker so a
 * connector can rewrite the body and then put the marker back at the end.
 *
 * The pointer-line form is tried first, then the inline `huginn-trace` fence.
 * No network access happens here.
 *
 * @param text Raw tool-result text, possibly ending in a trace marker.
 * @returns `body` — the text without the marker; `remainder` — the marker
 *   rebuilt as a string that starts with a `\n\n` separator, or `""` when no
 *   marker was found or a pointer was stripped without a usable fetch URL.
 */
export function peelTraceMarkerForRewrite(text: string): { body: string; remainder: string } {
  const pointer = parseHuginnTracePointer(text);
  const pointerWasStripped = pointer.text !== text;
  if (pointerWasStripped) {
    // Only the URL form can be rebuilt; without a URL there is nothing to re-append.
    const remainder = pointer.fetchUrl ? `\n\nhuginn-trace-url: ${pointer.fetchUrl}` : "";
    return { body: pointer.text, remainder };
  }

  const fenced = parseHuginnTrace(text);
  if (fenced.trace === null) {
    return { body: text, remainder: "" };
  }

  const serialized = JSON.stringify(fenced.trace);
  return { body: fenced.text, remainder: `\n\n\`\`\`huginn-trace\n${serialized}\n\`\`\`` };
}
*/ text: string; diff --git a/src/ai/knowledge-grader.test.ts b/src/ai/knowledge-grader.test.ts new file mode 100644 index 0000000..df42d94 --- /dev/null +++ b/src/ai/knowledge-grader.test.ts @@ -0,0 +1,78 @@ +import { test, expect, describe } from "bun:test"; +import { gradeKnowledgeResults, normalizeGrade } from "./knowledge-grader.ts"; +import { getLog } from "../logging.ts"; +import type { HaikuResult } from "../scheduler/executor.ts"; + +const log = getLog("test", "knowledge-grader"); + +function fakeSpawn(result: string): () => Promise { + return async () => ({ result, inputTokens: 0, outputTokens: 0, model: "haiku" }); +} + +describe("normalizeGrade", () => { + test("passes through a valid 'correct' verdict and drops any query", () => { + const g = normalizeGrade({ verdict: "correct", rewrittenQuery: "ignored", reason: "covered" }); + expect(g.verdict).toBe("correct"); + expect(g.rewrittenQuery).toBeUndefined(); + expect(g.suggestedCollection).toBeUndefined(); + expect(g.reason).toBe("covered"); + }); + + test("keeps rewrittenQuery / suggestedCollection for non-correct verdicts", () => { + const g = normalizeGrade({ + verdict: "ambiguous", + rewrittenQuery: " LA_BUC_02 SED list ", + suggestedCollection: " confluence ", + reason: "too broad", + }); + expect(g.verdict).toBe("ambiguous"); + expect(g.rewrittenQuery).toBe("LA_BUC_02 SED list"); + expect(g.suggestedCollection).toBe("confluence"); + }); + + test("unknown / missing verdict falls back to 'correct' (fail-soft)", () => { + expect(normalizeGrade({}).verdict).toBe("correct"); + expect(normalizeGrade({ verdict: "garbage" }).verdict).toBe("correct"); + expect(normalizeGrade({ verdict: 42 }).verdict).toBe("correct"); + }); + + test("blank / non-string rewrittenQuery is dropped", () => { + const g = normalizeGrade({ verdict: "insufficient", rewrittenQuery: " ", reason: "" }); + expect(g.rewrittenQuery).toBeUndefined(); + expect(g.reason).toBeTruthy(); // synthesized default + }); +}); + 
+describe("gradeKnowledgeResults", () => { + const base = { question: "what SEDs belong to LA_BUC_02?", toolResultText: "## Some doc (12% relevant · low)", botName: "test", log }; + + test("parses a clean JSON verdict from Haiku", async () => { + const g = await gradeKnowledgeResults({ + ...base, + spawnFn: fakeSpawn('{"verdict":"insufficient","rewrittenQuery":"LA_BUC_02 structured electronic documents","reason":"off-topic snippets"}'), + }); + expect(g.verdict).toBe("insufficient"); + expect(g.rewrittenQuery).toBe("LA_BUC_02 structured electronic documents"); + }); + + test("tolerates surrounding prose / markdown fence around the JSON", async () => { + const g = await gradeKnowledgeResults({ + ...base, + spawnFn: fakeSpawn('Here is my assessment:\n```json\n{"verdict":"ambiguous","reason":"query too vague"}\n```\n'), + }); + expect(g.verdict).toBe("ambiguous"); + }); + + test("Haiku throwing → verdict 'correct' (no disruption)", async () => { + const g = await gradeKnowledgeResults({ + ...base, + spawnFn: async () => { throw new Error("haiku down"); }, + }); + expect(g.verdict).toBe("correct"); + }); + + test("unparseable Haiku output → verdict 'correct'", async () => { + const g = await gradeKnowledgeResults({ ...base, spawnFn: fakeSpawn("not json at all, sorry") }); + expect(g.verdict).toBe("correct"); + }); +}); diff --git a/src/ai/knowledge-grader.ts b/src/ai/knowledge-grader.ts new file mode 100644 index 0000000..52ed45a --- /dev/null +++ b/src/ai/knowledge-grader.ts @@ -0,0 +1,150 @@ +import { spawnHaiku } from "../scheduler/executor.ts"; +import { extractJson } from "./json-extract.ts"; +import type { Logger } from "@logtape/logtape"; + +/** + * CRAG-style retrieval evaluator for the knowledge search tool. 
Given the + * user's question and the (rendered) search results — which carry per-result + * `confidenceBand` annotations and a `*No confident match — try: …*` footer + * from Huginn's MCP adapter — a dedicated Haiku call decides whether the + * results are good enough to answer from, and if not, proposes a sharper + * query and/or a better collection. + * + * This is an **awaiting** Haiku call (it gates whether a corrective re-query + * happens), so it uses {@link spawnHaiku} directly rather than the + * fire-and-forget {@link runHaikuExtraction} pattern. + * + * Fail-soft: any Haiku error or unparseable output yields `verdict: "correct"` + * — the corrective loop becomes a no-op and the model sees the original result + * unchanged. The corrective feature must never make a search *worse*. + * + * Plan: `../mimir/plans/huginn-muninn-corrective-rag.md` (Phase 1). + */ + +export type GradeVerdict = "correct" | "ambiguous" | "insufficient"; + +export interface KnowledgeGrade { + verdict: GradeVerdict; + /** A single search string (not a question) to re-query with. Present only + * when verdict is "ambiguous" or "insufficient" and the grader had a better + * query to offer. */ + rewrittenQuery?: string; + /** A collection name to try instead — only when the results hint another + * collection is the right home. Never invented. */ + suggestedCollection?: string; + /** One short sentence explaining the verdict. */ + reason: string; +} + +export interface GradeKnowledgeOptions { + question: string; + /** The rendered search-result text the model would see (trace markers + * already peeled). */ + toolResultText: string; + botName: string; + /** Working directory for the Haiku spawn — keeps the session out of the + * project root and gives it the bot's MCP/settings context. */ + cwd?: string; + log: Logger; + /** Haiku model override (defaults to the project's standard Haiku model). 
/** Cap the result text fed into the grader prompt — keeps the Haiku call cheap
 * and well under its context window. The trailing footer (retry hints) lives
 * at the end of the text, so prefer keeping the head + tail. */
const MAX_RESULT_CHARS = 12_000;

/**
 * Ask the Haiku grader whether the rendered search results can answer the
 * question.
 *
 * Fail-soft: a throwing spawn or output that `extractJson` rejects both yield
 * `{ verdict: "correct" }`, so the caller's corrective loop becomes a no-op.
 *
 * @param opts Question, rendered result text, bot name, logger and optional
 *   cwd / model / timeout overrides; `spawnFn` is injectable for tests.
 * @returns The normalized grade (see {@link normalizeGrade}).
 */
export async function gradeKnowledgeResults(opts: GradeKnowledgeOptions): Promise<KnowledgeGrade> {
  const { question, botName, cwd, log } = opts;
  const resultText = clampResultText(opts.toolResultText);

  const prompt = buildGraderPrompt(question, resultText);

  const spawn = opts.spawnFn ?? spawnHaiku;
  let raw: string;
  try {
    const res = await spawn(prompt, {
      source: "knowledge-grader",
      entrypoint: `${botName}-knowledge-grader`,
      cwd,
      botName,
      model: opts.model,
      timeoutMs: opts.timeoutMs,
    });
    raw = res.result;
  } catch (err) {
    log.warn("knowledge grader Haiku call failed — treating as 'correct': {error}", {
      botName,
      error: err instanceof Error ? err.message : String(err),
    });
    return { verdict: "correct", reason: "grader unavailable" };
  }

  let parsed: Record<string, unknown>;
  try {
    parsed = extractJson<Record<string, unknown>>(raw);
  } catch {
    log.warn("knowledge grader: unparseable result — treating as 'correct': {raw}", {
      botName,
      raw: raw.slice(0, 300),
    });
    return { verdict: "correct", reason: "grader output unparseable" };
  }

  return normalizeGrade(parsed);
}

/**
 * Coerce loosely-typed grader output into a {@link KnowledgeGrade}.
 *
 * - Any verdict other than "ambiguous" / "insufficient" becomes "correct".
 * - A missing or blank `reason` is replaced by a default sentence.
 * - `rewrittenQuery` / `suggestedCollection` are kept (trimmed) only for
 *   non-"correct" verdicts and only when they are non-blank strings.
 * - A payload that is not an object (e.g. JSON `null` or a scalar) is treated
 *   as an empty object and therefore grades "correct". This function runs
 *   outside the caller's `try` blocks, so it must not throw on such input.
 */
export function normalizeGrade(parsed: Record<string, unknown>): KnowledgeGrade {
  const fields: Record<string, unknown> =
    parsed !== null && typeof parsed === "object" ? parsed : {};

  const verdict = fields.verdict;
  const safeVerdict: GradeVerdict =
    verdict === "ambiguous" || verdict === "insufficient" ? verdict : "correct";

  const reason = typeof fields.reason === "string" && fields.reason.trim()
    ? fields.reason.trim()
    : safeVerdict === "correct"
      ? "results cover the question"
      : "results do not clearly cover the question";

  const grade: KnowledgeGrade = { verdict: safeVerdict, reason };

  if (safeVerdict !== "correct") {
    const rq = typeof fields.rewrittenQuery === "string" ? fields.rewrittenQuery.trim() : "";
    if (rq) grade.rewrittenQuery = rq;
    const sc = typeof fields.suggestedCollection === "string" ? fields.suggestedCollection.trim() : "";
    if (sc) grade.suggestedCollection = sc;
  }

  return grade;
}

/**
 * Shorten over-long result text to `MAX_RESULT_CHARS` of content by keeping
 * the first 70% and the last 30% around an "omitted" marker. Text at or under
 * the cap is returned unchanged.
 */
function clampResultText(text: string): string {
  if (text.length <= MAX_RESULT_CHARS) return text;
  const head = Math.floor(MAX_RESULT_CHARS * 0.7);
  const tail = MAX_RESULT_CHARS - head;
  return `${text.slice(0, head)}\n…[${text.length - MAX_RESULT_CHARS} chars omitted]…\n${text.slice(-tail)}`;
}

/** Assemble the grader prompt; empty result text is shown as a "no results" placeholder. */
function buildGraderPrompt(question: string, resultText: string): string {
  return `You grade the quality of knowledge-base search results before an assistant answers from them.

USER QUESTION:
${question}

SEARCH RESULTS (each hit is annotated with a confidence band — high / medium / low; a trailing "No confident match" or "Weak match" line, if present, means the search itself was unsure):
${resultText || "(no results were returned)"}

Decide whether these results let the question be answered well, then respond with ONLY a JSON object — no prose, no markdown fence:
{"verdict":"correct"|"ambiguous"|"insufficient","rewrittenQuery":"...","suggestedCollection":"...","reason":"..."}

Guidance:
- "correct": at least one clearly on-topic, reasonably-confident result covers the question. No re-query needed. Omit rewrittenQuery and suggestedCollection.
- "ambiguous": results are partially relevant but the query was too broad, too narrow, or worded differently than the indexed content; a sharper query would likely find better hits.
- "insufficient": nothing on-topic, or only low-confidence / off-topic snippets, or no results at all.
- rewrittenQuery: a single concise SEARCH STRING (keywords / phrase), NOT a question. Only when verdict is "ambiguous" or "insufficient". If you cannot improve on the query, omit it.
- suggestedCollection: only set it if the results clearly hint a different collection is the right home for this topic. Never invent a collection name.
- reason: one short sentence.`;
}
}], + }; + const out = renderSearchResults([r]); + expect(out).toContain("## Knowledge Graph RAG (82.3% relevant · high) | updated: 2026-05-01"); + expect(out).toContain("https://example.test/kg-rag"); + expect(out).toContain("Architecture / Retrieval"); + expect(out).toContain("collection: `wiki` doc_id: `abc-123`"); + expect(out).toContain("**Overview**"); + expect(out).toContain("It combines a graph with vector search."); + }); + + test("falls back to snippet when there are no matched chunks", () => { + const out = renderSearchResults([{ collection: "c", id: "1", title: "T", relevance: 0.5, confidenceBand: "medium", snippet: "a short snippet" }]); + expect(out).toContain("a short snippet"); + expect(out).toContain("collection: `c` doc_id: `1`"); + }); + + test("WIP metadata renders the marker; internal metadata keys are hidden", () => { + const out = renderSearchResults([{ collection: "c", id: "1", title: "Draft", metadata: { wip: "true", page_id: "x", owner: "alice" }, matchedChunks: [{ content: "body" }] }]); + expect(out).toContain("**[UNDER ARBEID]**"); + expect(out).not.toContain("page_id"); + }); +}); + +describe("extractDocKeysFromRenderedText", () => { + test("pulls collection/doc_id pairs out of rendered result text", () => { + const text = renderSearchResults([ + { collection: "wiki", id: "1", title: "A", matchedChunks: [{ content: "x" }] }, + { collection: "confluence", id: "PAGE-2", title: "B", matchedChunks: [{ content: "y" }] }, + ]); + const keys = extractDocKeysFromRenderedText(text); + expect(keys.has("wiki/1")).toBe(true); + expect(keys.has("confluence/PAGE-2")).toBe(true); + expect(keys.size).toBe(2); + }); + + test("returns empty set for text with no doc-id lines", () => { + expect(extractDocKeysFromRenderedText("just some prose").size).toBe(0); + }); + + test("docKey matches the rendered line format", () => { + expect(docKey({ collection: "wiki", id: "1" })).toBe("wiki/1"); + }); +}); + +describe("parseQueryHintsFromFooter", () => { + 
test("extracts broader and narrower query hints", () => { + const footer = '*No confident match — try: related terms: A, B · narrower query: "X Y narrow" · broader query: "X wide"*'; + expect(parseQueryHintsFromFooter(footer)).toEqual({ broaderQuery: "X wide", narrowerQuery: "X Y narrow" }); + }); + + test("returns empty object when no hints present", () => { + expect(parseQueryHintsFromFooter("no hints here")).toEqual({}); + }); +}); + +describe("renderRetryHintsFooter", () => { + test("renders a 'No confident match' line with hints", () => { + const out = renderRetryHintsFooter({ noConfidentResults: true, retryHints: { relatedTerms: ["a", "b"], broaderQuery: "wider" } }); + expect(out).toContain("No confident match — try: related terms: a, b · broader query: \"wider\""); + }); + + test("renders a bare 'No confident match.' when there are no hints", () => { + expect(renderRetryHintsFooter({ noConfidentResults: true })).toBe("\n\n*No confident match.*"); + }); + + test("returns empty string when there's nothing to say", () => { + expect(renderRetryHintsFooter({})).toBe(""); + }); +}); + +describe("searchKnowledge", () => { + const realFetch = globalThis.fetch; + afterEach(() => { globalThis.fetch = realFetch; }); + + test("builds the query string and normalizes the response", async () => { + let seenUrl = ""; + globalThis.fetch = (async (input: RequestInfo | URL) => { + seenUrl = String(input); + return new Response( + JSON.stringify({ + results: [{ collection: "wiki", id: "1", title: "T", relevance: 0.7, confidenceBand: "high", matchedChunks: [{ content: "c" }] }], + bestScore: 0.7, + }), + { status: 200, headers: { "content-type": "application/json" } }, + ); + }) as unknown as typeof fetch; + + const resp = await searchKnowledge("graph rag", { collections: ["wiki"], rerank: true, minRelevance: 0.4, limit: 5 }); + expect(seenUrl).toContain("/api/search?"); + expect(seenUrl).toContain("q=graph+rag"); + expect(seenUrl).toContain("collection=wiki"); + 
expect(seenUrl).toContain("rerank=true"); + expect(seenUrl).toContain("min_relevance=0.4"); + expect(resp.results.length).toBe(1); + expect(resp.results[0]?.confidenceBand).toBe("high"); + expect(resp.bestScore).toBe(0.7); + }); + + test("parses noConfidentResults + retryHints", async () => { + globalThis.fetch = (async () => + new Response(JSON.stringify({ results: [], bestScore: 0.75, noConfidentResults: true, retryHints: { detectedEntities: ["RAG"], broaderQuery: "wider" } }), { + status: 200, + headers: { "content-type": "application/json" }, + })) as unknown as typeof fetch; + const resp = await searchKnowledge("x"); + expect(resp.noConfidentResults).toBe(true); + expect(resp.retryHints?.detectedEntities).toEqual(["RAG"]); + expect(resp.retryHints?.broaderQuery).toBe("wider"); + }); + + test("throws on a non-2xx response", async () => { + globalThis.fetch = (async () => new Response("nope", { status: 503 })) as unknown as typeof fetch; + await expect(searchKnowledge("x")).rejects.toThrow(); + }); +}); diff --git a/src/ai/knowledge-search-client.ts b/src/ai/knowledge-search-client.ts new file mode 100644 index 0000000..599685c --- /dev/null +++ b/src/ai/knowledge-search-client.ts @@ -0,0 +1,271 @@ +import { getLog } from "../logging.ts"; + +const log = getLog("ai", "knowledge-search"); + +/** Base URL for Huginn's HTTP API — same env Huginn-side uses. Read directly + * (not via `loadConfig()`) so this module has no `DATABASE_URL` dependency. */ +function knowledgeApiBaseUrl(): string { + return process.env.KNOWLEDGE_API_URL || "http://localhost:8321"; +} + +/** + * Thin HTTP client for Huginn's `GET /api/search`, plus a renderer that mirrors + * the shape Huginn's MCP adapter produces (so a corrective re-query's hits read + * identically to the ones the model already saw) and a parser for the + * `collection: \`x\` doc_id: \`y\`` lines those results carry (used to dedupe a + * re-query against the original result text). 
+ * + * Scope: this is the Phase-1 corrective-retrieval consumer of the Phase-0 + * contract — `bestScore`, per-result `confidenceBand`, `retryHints`, + * `noConfidentResults`, `min_relevance`. See + * `../mimir/plans/huginn-muninn-corrective-rag.md`. + */ + +export type ConfidenceBand = "high" | "medium" | "low"; + +export interface KnowledgeMatchedChunk { + content?: string; + heading?: string; + relevance?: number; + metadata?: Record; +} + +export interface KnowledgeSearchResult { + collection: string; + id: string; + title: string; + url?: string; + snippet?: string; + breadcrumb?: string; + heading?: string; + relevance?: number; + confidenceBand?: ConfidenceBand; + modifiedTime?: string; + matchedChunks?: KnowledgeMatchedChunk[]; + metadata?: Record; + /** Graph-context annotation lines, when graph augmentation produced any. */ + graphContext?: string[]; +} + +export interface KnowledgeRetryHints { + detectedEntities?: string[]; + relatedTerms?: string[]; + narrowerQuery?: string; + broaderQuery?: string; +} + +export interface KnowledgeSearchResponse { + results: KnowledgeSearchResult[]; + bestScore?: number; + noConfidentResults?: boolean; + retryHints?: KnowledgeRetryHints; + /** Present when Huginn returns a relational graph answer ahead of the hits. */ + graphAnswer?: string; + lowConfidence?: boolean; +} + +export interface SearchKnowledgeOptions { + /** Restrict to specific collection(s). Omit to search all available. */ + collections?: string[]; + limit?: number; + brief?: boolean; + /** Force (or disable) cross-encoder reranking. Default: Huginn's default + * (`true` for full, `false` for brief). Corrective re-queries pass `true` + * so `confidenceBand` is trustworthy on the re-query. */ + rerank?: boolean; + /** Drop results below this relevance (0.0–1.0). When it empties the set the + * response carries `noConfidentResults` + `retryHints`. 
*/ + minRelevance?: number; + maxChunksPerDoc?: number; + timeoutMs?: number; + /** Override the base URL (defaults to `config.knowledgeApiUrl`). */ + baseUrl?: string; +} + +const GRAPH_CONTEXT_KEY = "graph_context"; + +/** Call Huginn's `/api/search`. Throws on network error / non-2xx — callers in + * the corrective path treat that as "no re-query" (fail-soft). */ +export async function searchKnowledge( + query: string, + opts: SearchKnowledgeOptions = {}, +): Promise { + const baseUrl = (opts.baseUrl ?? knowledgeApiBaseUrl()).replace(/\/+$/, ""); + const params = new URLSearchParams(); + params.set("q", query); + if (opts.limit !== undefined) params.set("limit", String(opts.limit)); + if (opts.brief) params.set("brief", "true"); + if (opts.rerank !== undefined) params.set("rerank", String(opts.rerank)); + if (opts.minRelevance !== undefined) params.set("min_relevance", String(opts.minRelevance)); + if (opts.maxChunksPerDoc !== undefined) params.set("max_chunks_per_doc", String(opts.maxChunksPerDoc)); + for (const c of opts.collections ?? []) params.append("collection", c); + + const url = `${baseUrl}/api/search?${params.toString()}`; + const resp = await fetch(url, { signal: AbortSignal.timeout(opts.timeoutMs ?? 8000) }); + if (!resp.ok) { + throw new Error(`knowledge search returned ${resp.status} for ${query}`); + } + const data = (await resp.json()) as Record; + return normalizeResponse(data); +} + +function normalizeResponse(data: Record): KnowledgeSearchResponse { + const rawResults = Array.isArray(data.results) ? (data.results as Record[]) : []; + const results: KnowledgeSearchResult[] = rawResults.map((r) => ({ + collection: String(r.collection ?? ""), + id: String(r.id ?? ""), + title: String(r.title ?? r.id ?? "(untitled)"), + url: r.url ? String(r.url) : undefined, + snippet: r.snippet ? String(r.snippet) : undefined, + breadcrumb: r.breadcrumb ? String(r.breadcrumb) : undefined, + heading: r.heading ? 
String(r.heading) : undefined, + relevance: typeof r.relevance === "number" ? r.relevance : undefined, + confidenceBand: isBand(r.confidenceBand) ? r.confidenceBand : undefined, + modifiedTime: r.modifiedTime ? String(r.modifiedTime) : undefined, + matchedChunks: Array.isArray(r.matchedChunks) + ? (r.matchedChunks as Record[]).map((c) => ({ + content: c.content ? String(c.content) : undefined, + heading: c.heading ? String(c.heading) : undefined, + relevance: typeof c.relevance === "number" ? c.relevance : undefined, + metadata: isRecord(c.metadata) ? c.metadata : undefined, + })) + : undefined, + metadata: isRecord(r.metadata) ? r.metadata : undefined, + graphContext: Array.isArray(r[GRAPH_CONTEXT_KEY]) + ? (r[GRAPH_CONTEXT_KEY] as unknown[]).map(String) + : undefined, + })); + + return { + results, + bestScore: typeof data.bestScore === "number" ? data.bestScore : undefined, + noConfidentResults: data.noConfidentResults === true, + retryHints: parseRetryHints(data.retryHints), + graphAnswer: data.graph_answer ? String(data.graph_answer) : undefined, + lowConfidence: data.lowConfidence === true, + }; +} + +function parseRetryHints(raw: unknown): KnowledgeRetryHints | undefined { + if (!isRecord(raw)) return undefined; + const hints: KnowledgeRetryHints = {}; + if (Array.isArray(raw.detectedEntities)) hints.detectedEntities = raw.detectedEntities.map(String); + if (Array.isArray(raw.relatedTerms)) hints.relatedTerms = raw.relatedTerms.map(String); + if (typeof raw.narrowerQuery === "string") hints.narrowerQuery = raw.narrowerQuery; + if (typeof raw.broaderQuery === "string") hints.broaderQuery = raw.broaderQuery; + return Object.keys(hints).length > 0 ? 
hints : undefined; +} + +function isBand(v: unknown): v is ConfidenceBand { + return v === "high" || v === "medium" || v === "low"; +} + +function isRecord(v: unknown): v is Record { + return typeof v === "object" && v !== null && !Array.isArray(v); +} + +const INTERNAL_METADATA_KEYS = new Set(["page_id", "space", "breadcrumb", "title", "wip"]); + +function isWip(r: Pick): boolean { + return (r.metadata?.wip as unknown) === "true"; +} + +function formatRelevanceBand(r: KnowledgeSearchResult): string { + if (r.relevance === undefined) return ""; + const pct = `${(r.relevance * 100).toFixed(1)}% relevant`; + return ` (${pct}${r.confidenceBand ? ` · ${r.confidenceBand}` : ""})`; +} + +function formatDate(iso?: string): string { + return iso ? iso.slice(0, 10) : ""; +} + +function visibleMetaLine(metadata?: Record): string { + if (!metadata) return ""; + const entries = Object.entries(metadata).filter(([k, v]) => !INTERNAL_METADATA_KEYS.has(k) && v); + if (entries.length === 0) return ""; + return `\n*${entries.map(([k, v]) => `${k}: ${v}`).join(" | ")}*`; +} + +/** + * Render search results in (approximately) the same shape Huginn's MCP adapter + * uses for `brief=false` searches: `## title (NN% relevant · band)` header, + * url, breadcrumb, the `collection: \`x\` doc_id: \`y\`` line, then the matched + * chunks. Used to splice a corrective re-query's hits into the tool result the + * model sees. + */ +export function renderSearchResults(results: KnowledgeSearchResult[]): string { + return results + .map((r) => { + const date = r.modifiedTime ? ` | updated: ${formatDate(r.modifiedTime)}` : ""; + const wip = isWip(r) ? 
" **[UNDER ARBEID]**" : ""; + let header = `## ${r.title}${wip}${formatRelevanceBand(r)}${date}`; + if (r.url) header += `\n${r.url}`; + if (r.breadcrumb) header += `\n${r.breadcrumb}`; + header += `\ncollection: \`${r.collection}\` doc_id: \`${r.id}\``; + if (r.graphContext?.length) header += `\n*${r.graphContext.join(" | ")}*`; + + const bodyLines: string[] = []; + const chunks = r.matchedChunks ?? []; + if (chunks.length > 0) { + for (const chunk of chunks) { + if (chunk.heading) bodyLines.push(`**${chunk.heading}**`); + if (chunk.content) bodyLines.push(chunk.content); + const ml = visibleMetaLine(chunk.metadata); + if (ml) bodyLines.push(ml.replace(/^\n/, "")); + } + } else if (r.snippet) { + bodyLines.push(r.snippet); + } + return bodyLines.length > 0 ? `${header}\n\n${bodyLines.join("\n\n")}` : header; + }) + .join("\n\n"); +} + +/** Render the Phase-0 `retryHints` / `noConfidentResults` footer, mirroring + * the MCP adapter — used when a re-query itself comes back empty/weak so the + * consolidated result still surfaces the next move. Returns "" when nothing + * useful applies. */ +export function renderRetryHintsFooter(resp: Pick): string { + const hints = resp.retryHints ?? {}; + const bits: string[] = []; + if (hints.relatedTerms?.length) bits.push(`related terms: ${hints.relatedTerms.join(", ")}`); + if (hints.narrowerQuery) bits.push(`narrower query: "${hints.narrowerQuery}"`); + if (hints.broaderQuery) bits.push(`broader query: "${hints.broaderQuery}"`); + if (bits.length === 0 && !resp.noConfidentResults) return ""; + const prefix = resp.noConfidentResults ? "No confident match" : "Weak match"; + return bits.length > 0 ? 
`\n\n*${prefix} — try: ${bits.join(" · ")}*` : `\n\n*${prefix}.*`; +} + +const DOC_ID_LINE_RE = /collection:\s*`([^`]+)`\s+doc_id:\s*`([^`]+)`/g; + +/** Extract `collection/doc_id` keys from rendered search-result text — used to + * dedupe a corrective re-query against the original result the model already + * has, since (per the chosen Phase-1 approach) we don't re-fetch the original + * in structured form. The `collection: \`…\` doc_id: \`…\`` line is emitted by + * Huginn's MCP adapter for every hit and is stable. */ +export function extractDocKeysFromRenderedText(text: string): Set { + const keys = new Set(); + for (const m of text.matchAll(DOC_ID_LINE_RE)) { + keys.add(`${m[1]}/${m[2]}`); + } + return keys; +} + +export function docKey(r: Pick): string { + return `${r.collection}/${r.id}`; +} + +/** Parse a `broader query: "..."` / `narrower query: "..."` hint out of a + * rendered "*No confident match — try: …*" footer. Belt-and-suspenders for the + * corrective re-query when the Haiku grader didn't supply a rewritten query. */ +export function parseQueryHintsFromFooter(text: string): { broaderQuery?: string; narrowerQuery?: string } { + const out: { broaderQuery?: string; narrowerQuery?: string } = {}; + const broader = text.match(/broader query:\s*"([^"]+)"/); + if (broader) out.broaderQuery = broader[1]; + const narrower = text.match(/narrower query:\s*"([^"]+)"/); + if (narrower) out.narrowerQuery = narrower[1]; + return out; +} + +export { log as knowledgeSearchLog }; diff --git a/src/ai/tool-status.ts b/src/ai/tool-status.ts index f1247f2..7d807bf 100644 --- a/src/ai/tool-status.ts +++ b/src/ai/tool-status.ts @@ -315,6 +315,16 @@ export function parseToolName(name: string): { server: string; tool: string } | return undefined; } +/** + * True when `toolName` is Huginn's knowledge search tool (`search_knowledge`), + * in any connector's naming format. Used to gate the corrective-retrieval pass. 
+ */ +export function isKnowledgeSearchTool(toolName: string): boolean { + const parsed = parseToolName(toolName); + const tool = parsed?.tool ?? toolName; + return tool === "search_knowledge"; +} + /** * Get human-friendly status text for a tool call. * Returns undefined for tools that should not show status (e.g. report_intent). diff --git a/src/bots/config.ts b/src/bots/config.ts index 7a47b6b..0af4a4b 100644 --- a/src/bots/config.ts +++ b/src/bots/config.ts @@ -77,6 +77,16 @@ export interface BotConfig { hivemind?: HivemindBotConfig; /** MCP status probing config — controls cache TTL and which servers are critical */ mcpStatus?: McpStatusConfig; + /** CRAG-lite corrective retrieval around the knowledge search tool (Phase 1). + * Off unless `enabled: true` here or the global default is on. `retryBudget` + * is clamped to 1–2. Only the copilot-sdk connector honours this. */ + correctiveRetrieval?: CorrectiveRetrievalBotConfig; +} + +export interface CorrectiveRetrievalBotConfig { + enabled?: boolean; + /** Max corrective re-queries per knowledge search. Clamped to 1–2. Default 1. 
*/ + retryBudget?: number; } export interface BotPrompts { @@ -164,7 +174,7 @@ function discoverBotsInternal(opts: { requireTokens: boolean }): BotConfig[] { try { botSettings = JSON.parse(readFileSync(configJsonPath, "utf-8")); // Warn about unknown keys to catch typos - const knownKeys = new Set(["connector", "model", "thinkingMaxTokens", "timeoutMs", "restrictedTools", "channelListening", "serena", "baseUrl", "showWaterfall", "prompts", "contextWindow", "hivemind", "mcpStatus"]); + const knownKeys = new Set(["connector", "model", "thinkingMaxTokens", "timeoutMs", "restrictedTools", "channelListening", "serena", "baseUrl", "showWaterfall", "prompts", "contextWindow", "hivemind", "mcpStatus", "correctiveRetrieval"]); const unknownKeys = Object.keys(botSettings).filter((k) => !knownKeys.has(k)); if (unknownKeys.length > 0) { log.warn("Bot \"{name}\" config.json has unknown keys: {keys} — possible typo?", { name, keys: unknownKeys.join(", ") }); @@ -208,6 +218,7 @@ function discoverBotsInternal(opts: { requireTokens: boolean }): BotConfig[] { contextWindow: botSettings.contextWindow as number | undefined, hivemind: parseHivemindConfig(botSettings.hivemind) ?? 
undefined, mcpStatus: botSettings.mcpStatus as McpStatusConfig | undefined, + correctiveRetrieval: botSettings.correctiveRetrieval as CorrectiveRetrievalBotConfig | undefined, }); const configParts: string[] = []; diff --git a/src/core/corrective-trace-spans.test.ts b/src/core/corrective-trace-spans.test.ts new file mode 100644 index 0000000..be64f3c --- /dev/null +++ b/src/core/corrective-trace-spans.test.ts @@ -0,0 +1,63 @@ +import { test, expect, describe } from "bun:test"; +import { planCorrectiveSpans } from "./corrective-trace-spans.ts"; +import type { CorrectiveToolMeta } from "../types.ts"; + +describe("planCorrectiveSpans", () => { + test("returns empty when there's no corrective metadata", () => { + expect(planCorrectiveSpans(undefined, 100)).toEqual([]); + expect(planCorrectiveSpans({ retries: 0, verdicts: [], reasons: [], queriesTried: [], finalVerdict: "correct" }, 100)).toEqual([]); + }); + + test("one knowledge_grade span when graded but not re-queried", () => { + const corr: CorrectiveToolMeta = { + retries: 0, + verdicts: ["correct"], + reasons: ["covered"], + queriesTried: [], + finalVerdict: "correct", + graderMs: 1200, + }; + const spans = planCorrectiveSpans(corr, 200); + expect(spans.map((s) => s.name)).toEqual(["knowledge_grade"]); + expect(spans[0]!.startOffsetMs).toBe(200); + expect(spans[0]!.durationMs).toBe(1200); + expect(spans[0]!.attributes.model).toBe("haiku"); + expect(spans[0]!.attributes.finalVerdict).toBe("correct"); + expect(spans[0]!.attributes.passes).toBe(1); + }); + + test("grade span + one requery span per re-query, laid out sequentially after the tool", () => { + const corr: CorrectiveToolMeta = { + retries: 2, + verdicts: ["insufficient", "ambiguous", "correct"], + reasons: ["off-topic", "broad", "ok"], + queriesTried: ["q1", "q2"], + collectionsTried: [null, ["confluence"]], + finalVerdict: "correct", + graderMs: 900, + requeryMs: [150, 220], + }; + const spans = planCorrectiveSpans(corr, 300); + expect(spans.map((s) => 
s.name)).toEqual(["knowledge_grade", "knowledge_requery", "knowledge_requery"]); + // grade [300, 1200), requery#1 [1200, 1350), requery#2 [1350, 1570) + expect(spans[0]!.startOffsetMs).toBe(300); + expect(spans[1]!.startOffsetMs).toBe(1200); + expect(spans[1]!.durationMs).toBe(150); + expect(spans[1]!.attributes.query).toBe("q1"); + expect(spans[1]!.attributes.collection).toBe("(all)"); + expect(spans[2]!.startOffsetMs).toBe(1350); + expect(spans[2]!.durationMs).toBe(220); + expect(spans[2]!.attributes.query).toBe("q2"); + expect(spans[2]!.attributes.collection).toBe("confluence"); + }); + + test("uses a 1ms floor when timings are missing", () => { + const spans = planCorrectiveSpans( + { retries: 1, verdicts: ["insufficient", "correct"], reasons: ["x", "y"], queriesTried: ["q"], finalVerdict: "correct" }, + 0, + ); + expect(spans[0]!.durationMs).toBe(1); + expect(spans[1]!.durationMs).toBe(1); + expect(spans[1]!.startOffsetMs).toBe(1); + }); +}); diff --git a/src/core/corrective-trace-spans.ts b/src/core/corrective-trace-spans.ts new file mode 100644 index 0000000..f4ca5c0 --- /dev/null +++ b/src/core/corrective-trace-spans.ts @@ -0,0 +1,94 @@ +import type { Tracer } from "../tracing/index.ts"; +import type { CorrectiveToolMeta } from "../types.ts"; + +/** + * Synthesize waterfall spans for a knowledge-search tool call's CRAG-lite + * corrective pass (see src/ai/corrective-retrieval.ts): + * + * - one `knowledge_grade` span — the Haiku retrieval evaluator (attrs: + * verdicts per pass, the triggering reason, the final verdict, model) + * - one `knowledge_requery` span per corrective re-query (attrs: the rewritten + * query, the collection scope) + * + * They're nested under the tool span and laid out sequentially starting at the + * tool span's nominal end (the corrective work runs *after* Huginn's search, in + * the connector's onPostToolUse hook), so they extend just past the tool bar's + * right edge — which is the honest picture of the added latency. 
+ * + * Mirrors the structure of {@link emitSearchTraceSpans} in search-trace-spans.ts. + */ + +export interface SynthesizedCorrectiveSpan { + name: string; + durationMs: number; + attributes: Record; + /** Offset from the *tool span's* start. */ + startOffsetMs: number; +} + +/** Pure planner — returns the spans we'd emit for a tool call with corrective + * metadata. `toolDurationMs` is where the corrective spans begin (just after + * the tool's own work). Exposed for testing. */ +export function planCorrectiveSpans( + corrective: CorrectiveToolMeta | undefined, + toolDurationMs: number, +): SynthesizedCorrectiveSpan[] { + if (!corrective || !Array.isArray(corrective.verdicts) || corrective.verdicts.length === 0) return []; + + const out: SynthesizedCorrectiveSpan[] = []; + let cursor = Math.max(0, toolDurationMs); + + const graderMs = typeof corrective.graderMs === "number" && corrective.graderMs > 0 ? corrective.graderMs : 1; + out.push({ + name: "knowledge_grade", + durationMs: graderMs, + startOffsetMs: cursor, + attributes: { + model: "haiku", + passes: corrective.verdicts.length, + verdicts: corrective.verdicts, + finalVerdict: corrective.finalVerdict, + reason: corrective.reasons?.[0], + retries: corrective.retries, + synthesized: true, + }, + }); + cursor += graderMs; + + const requeryMs = corrective.requeryMs ?? []; + corrective.queriesTried.forEach((query, i) => { + const ms = typeof requeryMs[i] === "number" && requeryMs[i]! > 0 ? requeryMs[i]! : 1; + const collection = corrective.collectionsTried?.[i] ?? null; + out.push({ + name: "knowledge_requery", + durationMs: ms, + startOffsetMs: cursor, + attributes: { + query, + collection: collection && collection.length > 0 ? collection.join(", ") : "(all)", + index: i + 1, + synthesized: true, + }, + }); + cursor += ms; + }); + + return out; +} + +/** Emit the corrective spans under the given tool span. No-op when there's no + * corrective metadata. 
*/ +export function emitCorrectiveSpans(opts: { + tracer: Tracer; + toolSpanId: string; + toolStartedAt: Date; + toolDurationMs: number; + corrective: CorrectiveToolMeta | undefined; +}): void { + for (const s of planCorrectiveSpans(opts.corrective, opts.toolDurationMs)) { + opts.tracer.addSubSpan(opts.toolSpanId, s.name, s.durationMs, s.attributes, { + parentStartedAt: opts.toolStartedAt, + startOffsetMs: s.startOffsetMs, + }); + } +} diff --git a/src/core/tool-spans.ts b/src/core/tool-spans.ts index a9a4751..9b7789d 100644 --- a/src/core/tool-spans.ts +++ b/src/core/tool-spans.ts @@ -3,6 +3,7 @@ import type { ToolCall } from "../types.ts"; import { getToolStatus } from "../ai/tool-status.ts"; import { parseHuginnTrace } from "../ai/huginn-trace.ts"; import { emitSearchTraceSpans } from "./search-trace-spans.ts"; +import { emitCorrectiveSpans } from "./corrective-trace-spans.ts"; /** * Trace-marker-emitting MCP tools whose spans benefit from an env snapshot. @@ -107,22 +108,39 @@ export async function attachToolSpans( if (captureOutputs && toolOutput !== undefined) { attrs.output = toolOutput; } + // CRAG-lite corrective-retrieval metadata, when the connector ran a + // grade-and-requery pass on this knowledge-search tool result. + if (tool.corrective !== undefined) attrs.corrective = tool.corrective; + const toolSpanId = tracer.addChildSpan("claude", tool.displayName, tool.durationMs, attrs, tool.startOffsetMs); + const claudeStart = tracer.spanStartedAt("claude"); + const toolStart = claudeStart + ? new Date(claudeStart.getTime() + (tool.startOffsetMs ?? 0)) + : undefined; + // If the tool call carries a v1 Huginn search trace, synthesize per-stage // child spans so the waterfall shows where the time went without the // operator having to expand the trace JSON. - if (attrs.searchTrace !== undefined) { - const claudeStart = tracer.spanStartedAt("claude"); - if (claudeStart) { - const toolStart = new Date(claudeStart.getTime() + (tool.startOffsetMs ?? 
0)); - emitSearchTraceSpans({ - tracer, - toolSpanId, - toolStartedAt: toolStart, - searchTrace: attrs.searchTrace, - }); - } + if (attrs.searchTrace !== undefined && toolStart) { + emitSearchTraceSpans({ + tracer, + toolSpanId, + toolStartedAt: toolStart, + searchTrace: attrs.searchTrace, + }); + } + + // If the connector ran a corrective pass, synthesize knowledge_grade / + // knowledge_requery child spans after the tool's own work. + if (tool.corrective !== undefined && toolStart) { + emitCorrectiveSpans({ + tracer, + toolSpanId, + toolStartedAt: toolStart, + toolDurationMs: tool.durationMs, + corrective: tool.corrective, + }); } } } diff --git a/src/dashboard/views/components/span-label.ts b/src/dashboard/views/components/span-label.ts index 7455fb5..3c18414 100644 --- a/src/dashboard/views/components/span-label.ts +++ b/src/dashboard/views/components/span-label.ts @@ -9,6 +9,12 @@ interface SpanLike { toolId?: unknown; input?: unknown; output?: unknown; + corrective?: { + retries?: unknown; + finalVerdict?: unknown; + verdicts?: unknown; + queriesTried?: unknown; + } | unknown; searchTrace?: | { collections?: Array<{ @@ -46,6 +52,11 @@ export function deriveSpanLabelHtml(span: SpanLike): { html: string; tooltip: st ? `${escHtml(verb)}` : ''; + // Corrective-retrieval chip — present only on knowledge-search tool spans that + // went through a CRAG-lite grade/requery pass. Shows the final verdict and the + // retry count so a corrected search is visible at a glance. + const corr = correctiveChipFromAttrs(attrs.corrective); + // Search-tool path: collection chips + counts chip, derived from searchTrace // or input.collection. 
let collections = collectionsFor(attrs); @@ -74,24 +85,58 @@ export function deriveSpanLabelHtml(span: SpanLike): { html: string; tooltip: st if (summary.totalMs != null) tooltipLines.push("total: " + summary.totalMs + "ms"); if (summary.lowConfidence) tooltipLines.push("⚠ low confidence"); } + if (corr) tooltipLines.push(...corr.tooltipLines); return { - html: verbChip + firstChip + moreChip + countsChip, + html: verbChip + (corr ? corr.html : "") + firstChip + moreChip + countsChip, tooltip: tooltipLines.join("\n"), }; } // Per-tool extras path: graph_node / symbol_context / list_files / - // read_source / search_pattern. + // read_source / search_pattern (also the knowledge-search fallback). const extras = toolLabelExtras(canonName, attrs); if (extras) { return { - html: verbChip + extras.chips, - tooltip: [span.name, ...extras.tooltipLines].join("\n"), + html: verbChip + (corr ? corr.html : "") + extras.chips, + tooltip: [span.name, ...(corr ? corr.tooltipLines : []), ...extras.tooltipLines].join("\n"), + }; + } + if (corr) { + return { + html: verbChip + corr.html, + tooltip: [span.name ?? "", ...corr.tooltipLines].join("\n"), }; } return null; } +/** Build the corrective-retrieval chip from a tool span's `attributes.corrective`. + * Returns null when the attribute is absent or malformed. Chip text is the + * final verdict's symbol + retry count (e.g. `⟲1 ✓`); color reflects whether + * the corrective pass left the result set usable. A missing/empty `finalVerdict` falls back to the last entry of `verdicts`. */ +function correctiveChipFromAttrs(raw: unknown): { html: string; tooltipLines: string[] } | null { + if (!raw || typeof raw !== "object") return null; + const c = raw as { retries?: unknown; finalVerdict?: unknown; verdicts?: unknown; queriesTried?: unknown }; + const verdicts = Array.isArray(c.verdicts) ? c.verdicts.map(String) : []; + const finalVerdict = typeof c.finalVerdict === "string" && c.finalVerdict ? c.finalVerdict : verdicts[verdicts.length - 1]; + if (!finalVerdict && verdicts.length === 0) return null; + const retries = typeof c.retries === "number" ?
c.retries : 0; + const queries = Array.isArray(c.queriesTried) ? c.queriesTried.map(String) : []; + + const cls = + finalVerdict === "correct" ? "wf-corrective wf-corrective-ok" + : finalVerdict === "ambiguous" ? "wf-corrective wf-corrective-warn" + : "wf-corrective wf-corrective-bad"; + const sym = finalVerdict === "correct" ? "✓" : finalVerdict === "ambiguous" ? "≈" : "✗"; + const text = retries > 0 ? `⟲${retries} ${sym}` : `grade ${sym}`; + const tip = `corrective retrieval: ${verdicts.join(" → ") || finalVerdict}` + + (queries.length ? `; re-queried: ${queries.map((q) => `"${q}"`).join(", ")}` : "; no re-query"); + return { + html: `${escHtml(text)}`, + tooltipLines: [tip], + }; +} + interface ToolLabelExtras { chips: string; tooltipLines: string[]; } type ExtrasRecipe = { diff --git a/src/dashboard/views/components/traces-waterfall.ts b/src/dashboard/views/components/traces-waterfall.ts index 8d03a57..73e9e61 100644 --- a/src/dashboard/views/components/traces-waterfall.ts +++ b/src/dashboard/views/components/traces-waterfall.ts @@ -128,6 +128,24 @@ export function tracesWaterfallStyles(): string { color: var(--status-warning); border-color: color-mix(in srgb, var(--status-warning) 35%, transparent); } + /* Corrective-retrieval chip — marks a knowledge search that went through a + CRAG-lite grade/requery pass. Color = whether the result set ended usable.
*/ + .wf-chip.wf-corrective { font-variant-numeric: tabular-nums; font-weight: 600; } + .wf-chip.wf-corrective-ok { + background: color-mix(in srgb, var(--status-ok, var(--status-cyan)) 14%, transparent); + color: var(--status-ok, var(--status-cyan)); + border: 1px solid color-mix(in srgb, var(--status-ok, var(--status-cyan)) 35%, transparent); + } + .wf-chip.wf-corrective-warn { + background: color-mix(in srgb, var(--status-warning) 14%, transparent); + color: var(--status-warning); + border: 1px solid color-mix(in srgb, var(--status-warning) 35%, transparent); + } + .wf-chip.wf-corrective-bad { + background: color-mix(in srgb, var(--status-error, var(--status-magenta)) 14%, transparent); + color: var(--status-error, var(--status-magenta)); + border: 1px solid color-mix(in srgb, var(--status-error, var(--status-magenta)) 35%, transparent); + } .waterfall-bar-container { position: relative; height: 16px; diff --git a/src/types.ts b/src/types.ts index 0f5beb7..08507ca 100644 --- a/src/types.ts +++ b/src/types.ts @@ -73,6 +73,32 @@ export interface ToolCall { * (404, timeout, network) — see {@link fetchHuginnTrace}. */ searchTraceFetch?: Promise; + /** + * CRAG-lite corrective-retrieval metadata, set when the copilot-sdk connector + * ran a grade-and-requery pass on this knowledge-search tool result. Drives + * the synthesized `knowledge_grade` / `knowledge_requery` waterfall spans. + * See src/ai/corrective-retrieval.ts. + */ + corrective?: CorrectiveToolMeta; +} + +export interface CorrectiveToolMeta { + /** Number of corrective re-queries actually issued (0–budget). */ + retries: number; + /** Grader verdict from each grading pass, in order ("correct" | "ambiguous" | "insufficient"). */ + verdicts: string[]; + /** Grader reason per pass, parallel to `verdicts`. */ + reasons: string[]; + /** Re-query strings actually issued (excludes the original query). 
*/ + queriesTried: string[]; + /** Collections each re-query was scoped to, parallel to `queriesTried`; `null` = all. */ + collectionsTried?: (string[] | null)[]; + /** Verdict from the final grading pass — whether the result set ended up usable. */ + finalVerdict: string; + /** Total Haiku grader wall time across all passes, ms. */ + graderMs?: number; + /** Wall time of each re-query HTTP call, parallel to `queriesTried`, ms. */ + requeryMs?: number[]; } export interface ClaudeResult { From 1ed5046a6d3eb0e175dc47aad191dd5ce0fa7866 Mon Sep 17 00:00:00 2001 From: RuneLind Date: Tue, 12 May 2026 21:22:18 +0200 Subject: [PATCH 2/4] Add a no-model "signal" grader and make it the default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The awaiting Haiku grader (`claude` CLI per knowledge search, ~11s on a 12 KB result prompt) is too slow for the hot path. Add a second grader mode and make it the default: - `"signal"` (default) — no model call. Reads the cheap signal Huginn already emits (a `*Weak match …*` / `*No confident match …*` footer, or a "No results found" body) and, when present, re-queries with the `broaderQuery` / `narrowerQuery` from Huginn's own `retryHints`. ~0ms for confident searches; ~one extra HTTP call when weak. A fully uneventful signal-mode check emits no trace span. - `"haiku"` (opt-in via `correctiveRetrieval.grader: "haiku"` / `CORRECTIVE_RETRIEVAL_GRADER=haiku`) — the previous behaviour, but the result text is now digested down to the top hits' titles + bands + a short body prefix before being sent to Haiku, so it's ~3–5s instead of ~11s. Also: on a corrective merge, the now-obsolete `*Weak match — try: …*` footer is stripped from the prior result before the fresh hits are spliced in (keeps the model's context clean and stops signal-mode re-grading from re-detecting an already-handled weak signal). `corrective` metadata + the `knowledge_grade` span gain a `graderMode` / `mode` field. 
Tests updated for the new shape; signal-grader paths covered (confident → no-op, weak footer → re-query with hint, related-terms-only → no re-query, budget 2 doesn't loop, "No results" body); Haiku digest covered. --- .env.example | 1 + CLAUDE.md | 19 ++-- src/ai/CLAUDE.md | 8 +- src/ai/connectors/copilot-sdk.ts | 34 ++++-- src/ai/connectors/corrective-hook.test.ts | 111 +++++++++++++++++- src/ai/corrective-config.test.ts | 45 +++++--- src/ai/corrective-config.ts | 25 ++++- src/ai/corrective-retrieval.test.ts | 94 ++++++++++++++++ src/ai/corrective-retrieval.ts | 86 ++++++++------ src/ai/knowledge-grader.test.ts | 56 ++++++++- src/ai/knowledge-grader.ts | 131 ++++++++++++++++------ src/ai/knowledge-search-client.ts | 13 +++ src/bots/config.ts | 5 + src/core/corrective-trace-spans.test.ts | 19 ++-- src/core/corrective-trace-spans.ts | 2 +- src/types.ts | 4 +- 16 files changed, 533 insertions(+), 120 deletions(-) diff --git a/.env.example b/.env.example index cc5d2c4..d50c288 100644 --- a/.env.example +++ b/.env.example @@ -39,4 +39,5 @@ WHISPER_MODEL_PATH=./models/ggml-base.en.bin # default. Opt in per-bot via config.json `correctiveRetrieval`, or globally here. # CORRECTIVE_RETRIEVAL_ENABLED=true # CORRECTIVE_RETRIEVAL_BUDGET=1 # max corrective re-queries per search (1–2) +# CORRECTIVE_RETRIEVAL_GRADER=signal # result judge: "signal" (no model call) or "haiku" (~3–5s/search) # CORRECTIVE_RETRIEVAL_DISABLED=1 # hard kill-switch — overrides per-bot config diff --git a/CLAUDE.md b/CLAUDE.md index ca9e53e..c6bd3f6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -217,7 +217,7 @@ All fields are optional — falls back to global `.env` values: | `showWaterfall` | boolean | `true` | Show request progress waterfall overlay in web chat | | `contextWindow` | number | — | Context window size in tokens (e.g. `32768`). 
Shown as usage in web chat and percentage in Telegram footer | | `prompts` | object | — | Configurable prompts: `jiraAnalysis` (Jira research instruction, content appended automatically), `investigateCode` (follow-up code investigation prompt) | -| `correctiveRetrieval` | object | off | CRAG-lite corrective loop around the knowledge search tool — `{ enabled?: boolean, retryBudget?: 1\|2 }`. Only the `copilot-sdk` connector honours it; off by default. See "Corrective retrieval" below. | +| `correctiveRetrieval` | object | off | CRAG-lite corrective loop around the knowledge search tool — `{ enabled?: boolean, retryBudget?: 1\|2, grader?: "signal"\|"haiku" }`. Only the `copilot-sdk` connector honours it; off by default. `grader` defaults to `"signal"` (no model call). See "Corrective retrieval" below. | ### Database @@ -256,6 +256,7 @@ PostgreSQL + pgvector via Docker (single container). | `LOG_DIR` | No | `./logs` | Log file directory (set `none` to disable file logging) | | `CORRECTIVE_RETRIEVAL_ENABLED` | No | `false` | Global default for the CRAG-lite corrective loop (per-bot `correctiveRetrieval.enabled` overrides) | | `CORRECTIVE_RETRIEVAL_BUDGET` | No | `1` | Default max corrective re-queries per knowledge search (clamped to 1–2) | +| `CORRECTIVE_RETRIEVAL_GRADER` | No | `signal` | Default result-quality judge: `signal` (no model call) or `haiku` (slimmed awaiting Haiku call, ~3–5s/search) | | `CORRECTIVE_RETRIEVAL_DISABLED` | No | — | Set to `1` to hard-disable corrective retrieval everywhere, regardless of per-bot config | | `GOAL_CHECK_INTERVAL_MS` | No | — | Legacy alias for `SCHEDULER_INTERVAL_MS` | | `GOAL_CHECK_ENABLED` | No | — | Legacy alias for `SCHEDULER_ENABLED` | @@ -358,13 +359,17 @@ uvx --from "git+https://github.com/oraios/serena" serena project index /path/to/ ## Corrective Retrieval (CRAG-lite) -A CRAG-style "grade the search results, re-query if they're weak" loop wrapped around Huginn's `search_knowledge` MCP tool. 
**Off by default**; enable per-bot in `config.json` (`"correctiveRetrieval": { "enabled": true, "retryBudget": 1 }`), globally via `CORRECTIVE_RETRIEVAL_ENABLED=true`, or hard-disable everywhere with `CORRECTIVE_RETRIEVAL_DISABLED=1`. +A CRAG-style "judge the search results, re-query if they're weak" loop wrapped around Huginn's `search_knowledge` MCP tool. **Off by default**; enable per-bot in `config.json` (`"correctiveRetrieval": { "enabled": true, "retryBudget": 1, "grader": "signal" }`), globally via `CORRECTIVE_RETRIEVAL_ENABLED=true`, or hard-disable everywhere with `CORRECTIVE_RETRIEVAL_DISABLED=1`. -How it works (copilot-sdk connector only): -1. The connector registers a Copilot SDK `onPostToolUse` hook. When a bot calls `search_knowledge`, the hook intercepts the result before the model sees it. -2. `src/ai/knowledge-grader.ts` — a dedicated **awaiting** Haiku call grades the results (`correct` / `ambiguous` / `insufficient`) and, if weak, proposes a rewritten query and/or a better collection. Fail-soft: any Haiku error → `correct` (no change). -3. `src/ai/corrective-retrieval.ts` — if not `correct` and the retry budget (1, configurable to 2) isn't spent, re-queries Huginn's `/api/search` (`src/ai/knowledge-search-client.ts`) with `rerank=true`, merges the fresh hits into the original result text (deduped by `collection/doc_id` parsed from the rendered output), and appends an inline note. Never recursive. -4. Traces: `knowledge_grade` + `knowledge_requery` spans synthesized under the tool span (`src/core/corrective-trace-spans.ts`), rendered in the dashboard waterfall with a corrective chip on the parent tool span. +How it works (copilot-sdk connector only): the connector registers a Copilot SDK `onPostToolUse` hook that intercepts each `search_knowledge` result before the model sees it (`src/ai/connectors/copilot-sdk.ts` → `applyCorrectiveRetrieval` → `runCorrectiveRetrieval`). 
+ +**Two grader modes** (`src/ai/knowledge-grader.ts`): +- `"signal"` (**default — no model call, ~0ms for confident searches**): reads the cheap signal Huginn already emits — a `*Weak match …*` / `*No confident match …*` footer or a "No results found" body — and, when present, re-queries with the `broaderQuery` / `narrowerQuery` from Huginn's own `retryHints` (parsed from that footer). Most searches add zero latency; a weak one costs ~one extra HTTP call. +- `"haiku"` (opt-in, `grader: "haiku"`): a *slimmed* **awaiting** Haiku call that also reads the result snippets and can propose a semantic rewrite / a better collection. ~3–5s per search (the result text is digested to titles + bands + a taste of each hit before being sent), so it's not the default. Fail-soft: any Haiku error → `correct` (no change). + +On a non-`correct` verdict, `src/ai/corrective-retrieval.ts` re-queries Huginn's `/api/search` (`src/ai/knowledge-search-client.ts`) with `rerank=true`, merges the fresh hits into the original result text (deduped by `collection/doc_id` parsed from the rendered output; the now-obsolete `*Weak match*` footer is stripped), and appends an inline note. Retry budget 1 (configurable to 2); never recursive. + +Traces: `knowledge_grade` (attrs include `mode`, `verdicts`, `finalVerdict`) + `knowledge_requery` spans synthesized under the tool span (`src/core/corrective-trace-spans.ts`), rendered in the dashboard waterfall with a corrective chip on the parent tool span. A fully uneventful signal-mode check (confident, no re-query) emits no span. **Connector asymmetry:** Claude-CLI bots run the MCP tool inside their own process, so the result can't be intercepted — they get nothing here (Phase 3 will add prompt-level corrective guidance instead). When the toggle is off, the hook isn't registered and behaviour is byte-identical to before. 
diff --git a/src/ai/CLAUDE.md b/src/ai/CLAUDE.md index 04b2d42..515ab9a 100644 --- a/src/ai/CLAUDE.md +++ b/src/ai/CLAUDE.md @@ -16,10 +16,10 @@ | `haiku-extraction.ts` | Shared Haiku executor for async extraction tasks (memories, goals, tasks) | | `huginn-trace.ts` | Inline-fence Huginn trace handling (legacy mode) — `parseHuginnTrace`, `extractMcpResultText`, oversized-CLI-divert recovery | | `huginn-trace-pointer.ts` | Phase 2 out-of-band trace channel — parses `huginn-trace-url:` line and fetches the trace from Huginn's `/api/trace/` endpoint. Preferred when `HUGINN_TRACE_POINTER=1` is set on Huginn. Also exports `processMcpToolResult()` — the unwrap → peel → fetch pipeline connectors run on every tool result — and `peelTraceMarkerForRewrite()` for connectors that rewrite a tool result and need to re-append the trace marker | -| `knowledge-grader.ts` | CRAG-lite retrieval evaluator — an awaiting Haiku call that grades knowledge-search results (`correct`/`ambiguous`/`insufficient`) and proposes a rewritten query/collection. Fail-soft to `correct`. | -| `corrective-retrieval.ts` | Corrective grade-and-requery orchestrator — `runCorrectiveRetrieval()`: grade → bounded re-query Huginn → merge+dedupe → consolidated text + `corrective` metadata. ≤1 retry (configurable to 2), non-recursive. | -| `knowledge-search-client.ts` | HTTP client for Huginn's `/api/search` + a renderer mirroring the MCP adapter's result format, used by the corrective re-query path. | -| `corrective-config.ts` | Resolves the per-bot corrective-retrieval toggle + retry budget (kill-switch > per-bot config.json > global env defaults). | +| `knowledge-grader.ts` | CRAG-lite retrieval evaluators — `gradeFromSignal()` (default, no model call: reads Huginn's `*Weak match*` / "No results" signal) and `gradeKnowledgeResults()` (opt-in: a *slimmed* awaiting Haiku call that also reads snippets and can propose a rewrite). Both fail-soft to `correct`. 
| +| `corrective-retrieval.ts` | Corrective grade-and-requery orchestrator — `runCorrectiveRetrieval()`: grade (signal or haiku) → bounded re-query Huginn → merge+dedupe → consolidated text + `corrective` metadata. ≤1 retry (configurable to 2), non-recursive. | +| `knowledge-search-client.ts` | HTTP client for Huginn's `/api/search` + a renderer mirroring the MCP adapter's result format + footer/doc-id parsers, used by the corrective re-query path. | +| `corrective-config.ts` | Resolves the per-bot corrective-retrieval toggle + retry budget + grader mode (kill-switch > per-bot config.json > global env defaults; grader defaults to `"signal"`). | | `connectors/` | Three connector implementations (see below) | ## Connector Abstraction diff --git a/src/ai/connectors/copilot-sdk.ts b/src/ai/connectors/copilot-sdk.ts index 72fe97d..247bf34 100644 --- a/src/ai/connectors/copilot-sdk.ts +++ b/src/ai/connectors/copilot-sdk.ts @@ -84,10 +84,12 @@ export async function executePrompt( const customAgents = buildCustomAgents(botConfig); // Corrective retrieval (CRAG-lite): when enabled for this bot, an onPostToolUse - // hook grades each knowledge-search result with Haiku and, if it's weak, does a - // bounded re-query — splicing the fresh hits into the result before the model - // sees it. Off by default (see src/ai/corrective-config.ts); when off, the hook - // isn't registered at all and behaviour is byte-identical to before. + // hook judges each knowledge-search result and, if it's weak, does a bounded + // re-query — splicing the fresh hits into the result before the model sees it. + // Default judge is `"signal"` (no model call — re-query only when Huginn + // already flags the result weak, using Huginn's `retryHints`); `"haiku"` is + // an opt-in slower/smarter alternative. Off by default (see corrective-config.ts); + // when off, the hook isn't registered and behaviour is byte-identical to before. 
const correctiveCfg = resolveCorrectiveConfig(botConfig); const correctiveOutcomes: CorrectiveMetadata[] = []; const correctiveEnabled = correctiveCfg.enabled && hasMcp; @@ -103,6 +105,7 @@ export async function executePrompt( toolResult: input.toolResult, botConfig, budget: correctiveCfg.retryBudget, + grader: correctiveCfg.grader, userQuestion, }); if (result) { @@ -387,6 +390,7 @@ export interface ApplyCorrectiveArgs { toolResult: ToolResultObject; botConfig: Pick; budget: number; + grader?: CorrectiveRetrievalContext["grader"]; userQuestion: string; /** Injectable for tests — forwarded to {@link runCorrectiveRetrieval}. */ searchFn?: CorrectiveRetrievalContext["searchFn"]; @@ -395,11 +399,12 @@ export interface ApplyCorrectiveArgs { /** * Run the corrective grade-and-requery pass on a knowledge-search tool result. - * Returns `null` when there's nothing to act on (empty result, tool error); - * otherwise always returns the `metadata` (for tracing) and, when results were - * merged in, a `modifiedResult` to hand back to the model. The trailing Huginn - * trace marker, if any, is peeled off the body before splicing and re-appended - * after, so downstream trace extraction is unaffected. + * Returns `null` when there's nothing to act on (empty result, tool error, or a + * fully uneventful signal-mode check — judged confident, no re-query — which + * isn't worth a trace span); otherwise returns the `metadata` (for tracing) and, + * when results were merged in, a `modifiedResult` to hand back to the model. The + * trailing Huginn trace marker, if any, is peeled off the body before splicing + * and re-appended after, so downstream trace extraction is unaffected. 
*/ export async function applyCorrectiveRetrieval( args: ApplyCorrectiveArgs, @@ -423,6 +428,7 @@ export async function applyCorrectiveRetrieval( originalCollections, originalResultText: body, budget, + grader: args.grader, botName: botConfig.name, cwd: botConfig.dir, log, @@ -431,7 +437,14 @@ export async function applyCorrectiveRetrieval( gradeFn: args.gradeFn, }); - if (!outcome.changed) return { metadata: outcome.metadata }; + if (!outcome.changed) { + // A signal-mode check that found nothing wrong is a free no-op — don't + // clutter the trace with a span for every confident search. A Haiku-mode + // check, or any pass that graded something non-"correct", is worth recording. + const uneventful = + outcome.metadata.graderMode === "signal" && outcome.metadata.verdicts.every((v) => v === "correct"); + return uneventful ? null : { metadata: outcome.metadata }; + } return { metadata: outcome.metadata, @@ -480,6 +493,7 @@ function correctiveMetaToToolMeta(m: CorrectiveMetadata): CorrectiveToolMeta { queriesTried: m.queriesTried, collectionsTried: m.collectionsTried.map((c) => c ?? 
null), finalVerdict: m.finalVerdict, + graderMode: m.graderMode, graderMs: m.graderMs, requeryMs: m.requeryMs, }; diff --git a/src/ai/connectors/corrective-hook.test.ts b/src/ai/connectors/corrective-hook.test.ts index 56b66c1..eaefc64 100644 --- a/src/ai/connectors/corrective-hook.test.ts +++ b/src/ai/connectors/corrective-hook.test.ts @@ -26,13 +26,24 @@ describe("attachCorrectiveOutcomes", () => { return { id: name, name, displayName: name, durationMs: 1, startOffsetMs: 0 }; } function meta(finalVerdict: string): CorrectiveMetadata { - return { retries: 1, verdicts: ["insufficient", finalVerdict] as KnowledgeGrade["verdict"][], reasons: ["x", "y"], queriesTried: ["q"], collectionsTried: [undefined], finalVerdict: finalVerdict as KnowledgeGrade["verdict"], graderMs: 100, requeryMs: [50] }; + return { + retries: 1, + verdicts: ["insufficient", finalVerdict] as KnowledgeGrade["verdict"][], + reasons: ["x", "y"], + queriesTried: ["q"], + collectionsTried: [undefined], + finalVerdict: finalVerdict as KnowledgeGrade["verdict"], + graderMode: "signal", + graderMs: 0, + requeryMs: [50], + }; } test("maps the i-th outcome to the i-th knowledge-search tool call, skipping others", () => { const calls = [tc("knowledge-search_knowledge"), tc("yggdrasil-symbol_context"), tc("knowledge-search_knowledge")]; attachCorrectiveOutcomes(calls, [meta("correct"), meta("ambiguous")]); expect(calls[0]!.corrective?.finalVerdict).toBe("correct"); + expect(calls[0]!.corrective?.graderMode).toBe("signal"); expect(calls[1]!.corrective).toBeUndefined(); expect(calls[2]!.corrective?.finalVerdict).toBe("ambiguous"); expect(calls[2]!.corrective?.collectionsTried).toEqual([null]); @@ -45,7 +56,7 @@ describe("attachCorrectiveOutcomes", () => { }); }); -describe("applyCorrectiveRetrieval", () => { +describe("applyCorrectiveRetrieval — haiku mode", () => { const botConfig = { name: "test", dir: "/tmp/test-bot" }; const okGrade: KnowledgeGrade = { verdict: "correct", reason: "covered" }; @@ -61,6 
+72,7 @@ describe("applyCorrectiveRetrieval", () => { toolResult: { textResultForLlm: "Knowledge API server is not running", resultType: "failure" }, botConfig, budget: 1, + grader: "haiku", userQuestion: "q", gradeFn: grader(okGrade), searchFn: async () => ({ results: [] }), @@ -75,6 +87,7 @@ describe("applyCorrectiveRetrieval", () => { toolResult: { textResultForLlm: "", resultType: "success" }, botConfig, budget: 1, + grader: "haiku", userQuestion: "q", gradeFn: grader(okGrade), searchFn: async () => ({ results: [] }), @@ -82,13 +95,14 @@ describe("applyCorrectiveRetrieval", () => { expect(out).toBeNull(); }); - test("verdict 'correct' → metadata only, no modifiedResult", async () => { + test("verdict 'correct' → metadata only, no modifiedResult (haiku still recorded)", async () => { const out = await applyCorrectiveRetrieval({ toolName: "knowledge-search_knowledge", toolArgs: { query: "x", collection: "wiki" }, toolResult: { textResultForLlm: "## Doc (80% relevant · high)\ncollection: `wiki` doc_id: `1`\n\nbody", resultType: "success" }, botConfig, budget: 1, + grader: "haiku", userQuestion: "q", gradeFn: grader(okGrade), searchFn: async () => ({ results: [] }), @@ -96,6 +110,7 @@ describe("applyCorrectiveRetrieval", () => { expect(out).not.toBeNull(); expect(out!.modifiedResult).toBeUndefined(); expect(out!.metadata.retries).toBe(0); + expect(out!.metadata.graderMode).toBe("haiku"); }); test("low-confidence result → exactly one re-query, merged, trace fence preserved at the end", async () => { @@ -108,6 +123,7 @@ describe("applyCorrectiveRetrieval", () => { toolResult: { textResultForLlm: original, resultType: "success" }, botConfig, budget: 1, + grader: "haiku", userQuestion: "what SEDs belong to LA_BUC_02?", gradeFn: grader( { verdict: "insufficient", rewrittenQuery: "LA_BUC_02 structured electronic documents", reason: "off-topic" }, @@ -137,3 +153,92 @@ describe("applyCorrectiveRetrieval", () => { expect(out!.metadata.queriesTried).toEqual(["LA_BUC_02 
structured electronic documents"]); }); }); + +describe("applyCorrectiveRetrieval — signal mode (default)", () => { + const botConfig = { name: "test", dir: "/tmp/test-bot" }; + + test("confident search (no weak footer) → returns null (uneventful free check, no span)", async () => { + let searchCalls = 0; + const out = await applyCorrectiveRetrieval({ + toolName: "knowledge-search_knowledge", + toolArgs: { query: "x", collection: "wiki" }, + toolResult: { textResultForLlm: "## Doc (80% relevant · high)\ncollection: `wiki` doc_id: `1`\n\nbody", resultType: "success" }, + botConfig, + budget: 1, + userQuestion: "q", + searchFn: async () => { searchCalls++; return { results: [] }; }, + }); + expect(out).toBeNull(); + expect(searchCalls).toBe(0); // no grader call, no re-query + }); + + test("Huginn-flagged weak result → re-queries with the footer hint, merges", async () => { + const original = + "## Marginal hit (12% relevant · low)\ncollection: `wiki` doc_id: `1`\n\nmeh\n\n*Weak match — try: broader query: \"LA_BUC concepts\"*"; + let seenQuery = ""; + const out = await applyCorrectiveRetrieval({ + toolName: "knowledge-search_knowledge", + toolArgs: { query: "LA_BUC_02 obscure phrasing", collection: "wiki" }, + toolResult: { textResultForLlm: original, resultType: "success" }, + botConfig, + budget: 1, + userQuestion: "what about LA_BUC_02?", + searchFn: async (query: string) => { + seenQuery = query; + return { + results: [{ collection: "wiki", id: "2", title: "Wider hit", relevance: 0.6, confidenceBand: "medium", matchedChunks: [{ content: "useful" }] }], + }; + }, + }); + expect(seenQuery).toBe("LA_BUC concepts"); + expect(out).not.toBeNull(); + expect(out!.modifiedResult).toBeDefined(); + const text = out!.modifiedResult!.textResultForLlm; + expect(text).toContain("Wider hit"); + expect(text).toContain("[corrective retrieval — re-query #1"); + expect(text).not.toContain("Weak match — try"); // obsolete footer stripped + 
expect(out!.metadata.graderMode).toBe("signal"); + expect(out!.metadata.verdicts).toEqual(["insufficient", "correct"]); + expect(out!.metadata.queriesTried).toEqual(["LA_BUC concepts"]); + }); + + test("weak result but no usable hint → metadata recorded, no re-query, not null", async () => { + const original = + "## Marginal hit (12% relevant · low)\ncollection: `wiki` doc_id: `1`\n\nmeh\n\n*Weak match — try: related terms: foo, bar*"; + let searchCalls = 0; + const out = await applyCorrectiveRetrieval({ + toolName: "knowledge-search_knowledge", + toolArgs: { query: "x", collection: "wiki" }, + toolResult: { textResultForLlm: original, resultType: "success" }, + botConfig, + budget: 1, + userQuestion: "q", + searchFn: async () => { searchCalls++; return { results: [] }; }, + }); + expect(searchCalls).toBe(0); + expect(out).not.toBeNull(); + expect(out!.modifiedResult).toBeUndefined(); + expect(out!.metadata.retries).toBe(0); + expect(out!.metadata.verdicts).toEqual(["insufficient"]); + }); + + test("'No results found' body → re-queries with the footer hint if present", async () => { + const original = "No results found for 'xyz'.\n\n*No confident match — try: narrower query: \"xyz precise term\"*"; + let seenQuery = ""; + const out = await applyCorrectiveRetrieval({ + toolName: "knowledge-search_knowledge", + toolArgs: { query: "xyz", collection: "wiki" }, + toolResult: { textResultForLlm: original, resultType: "success" }, + botConfig, + budget: 1, + userQuestion: "q", + searchFn: async (query: string) => { + seenQuery = query; + return { results: [{ collection: "wiki", id: "9", title: "Found it", relevance: 0.7, confidenceBand: "high", matchedChunks: [{ content: "yes" }] }] }; + }, + }); + expect(seenQuery).toBe("xyz precise term"); + expect(out!.modifiedResult).toBeDefined(); + expect(out!.modifiedResult!.textResultForLlm).toContain("Found it"); + }); +}); diff --git a/src/ai/corrective-config.test.ts b/src/ai/corrective-config.test.ts index b3996fa..8acde74 
100644 --- a/src/ai/corrective-config.test.ts +++ b/src/ai/corrective-config.test.ts @@ -1,5 +1,5 @@ import { test, expect, describe } from "bun:test"; -import { resolveCorrectiveConfig, clampBudget } from "./corrective-config.ts"; +import { resolveCorrectiveConfig, clampBudget, normalizeGraderMode } from "./corrective-config.ts"; describe("clampBudget", () => { test("clamps to the 1–2 range and floors", () => { @@ -12,37 +12,56 @@ describe("clampBudget", () => { }); }); +describe("normalizeGraderMode", () => { + test("only 'haiku' opts into the model grader; everything else is 'signal'", () => { + expect(normalizeGraderMode("haiku")).toBe("haiku"); + expect(normalizeGraderMode("signal")).toBe("signal"); + expect(normalizeGraderMode(undefined)).toBe("signal"); + expect(normalizeGraderMode("nonsense")).toBe("signal"); + }); +}); + describe("resolveCorrectiveConfig", () => { - test("off by default when nothing is configured", () => { - expect(resolveCorrectiveConfig({}, {})).toEqual({ enabled: false, retryBudget: 1 }); + test("off, budget 1, signal grader by default", () => { + expect(resolveCorrectiveConfig({}, {})).toEqual({ enabled: false, retryBudget: 1, grader: "signal" }); }); - test("per-bot config enables it and clamps the budget", () => { - expect(resolveCorrectiveConfig({ correctiveRetrieval: { enabled: true, retryBudget: 9 } }, {})).toEqual({ + test("per-bot config enables it, clamps the budget, and selects the grader", () => { + expect(resolveCorrectiveConfig({ correctiveRetrieval: { enabled: true, retryBudget: 9, grader: "haiku" } }, {})).toEqual({ enabled: true, retryBudget: 2, + grader: "haiku", }); }); - test("global env default enables it when the bot doesn't say otherwise", () => { - const env = { CORRECTIVE_RETRIEVAL_ENABLED: "true", CORRECTIVE_RETRIEVAL_BUDGET: "2" }; - expect(resolveCorrectiveConfig({}, env)).toEqual({ enabled: true, retryBudget: 2 }); + test("global env defaults apply when the bot doesn't say otherwise", () => { + const env = { 
CORRECTIVE_RETRIEVAL_ENABLED: "true", CORRECTIVE_RETRIEVAL_BUDGET: "2", CORRECTIVE_RETRIEVAL_GRADER: "haiku" }; + expect(resolveCorrectiveConfig({}, env)).toEqual({ enabled: true, retryBudget: 2, grader: "haiku" }); }); test("per-bot config overrides the global default (disable wins too)", () => { - const env = { CORRECTIVE_RETRIEVAL_ENABLED: "true" }; - expect(resolveCorrectiveConfig({ correctiveRetrieval: { enabled: false } }, env).enabled).toBe(false); + const env = { CORRECTIVE_RETRIEVAL_ENABLED: "true", CORRECTIVE_RETRIEVAL_GRADER: "haiku" }; + expect(resolveCorrectiveConfig({ correctiveRetrieval: { enabled: false, grader: "signal" } }, env)).toEqual({ + enabled: false, + retryBudget: 1, + grader: "signal", + }); }); test("kill-switch overrides everything", () => { const env = { CORRECTIVE_RETRIEVAL_DISABLED: "1", CORRECTIVE_RETRIEVAL_ENABLED: "true" }; - expect(resolveCorrectiveConfig({ correctiveRetrieval: { enabled: true, retryBudget: 2 } }, env)).toEqual({ + expect(resolveCorrectiveConfig({ correctiveRetrieval: { enabled: true, retryBudget: 2, grader: "haiku" } }, env)).toEqual({ enabled: false, retryBudget: 1, + grader: "signal", }); }); - test("a bare global enable defaults the budget to 1", () => { - expect(resolveCorrectiveConfig({}, { CORRECTIVE_RETRIEVAL_ENABLED: "true" })).toEqual({ enabled: true, retryBudget: 1 }); + test("a bare global enable defaults the budget to 1 and the grader to signal", () => { + expect(resolveCorrectiveConfig({}, { CORRECTIVE_RETRIEVAL_ENABLED: "true" })).toEqual({ + enabled: true, + retryBudget: 1, + grader: "signal", + }); }); }); diff --git a/src/ai/corrective-config.ts b/src/ai/corrective-config.ts index 765cf1b..8b5b14b 100644 --- a/src/ai/corrective-config.ts +++ b/src/ai/corrective-config.ts @@ -4,17 +4,25 @@ import type { BotConfig } from "../bots/config.ts"; * Resolved per-bot corrective-retrieval settings. Precedence: * 1. `CORRECTIVE_RETRIEVAL_DISABLED=1` (hard kill-switch) → always off. * 2. 
The bot's `config.json` `correctiveRetrieval` block. - * 3. The global env defaults (`CORRECTIVE_RETRIEVAL_ENABLED` / - * `CORRECTIVE_RETRIEVAL_BUDGET`). - * `retryBudget` is clamped to 1–2 regardless of source. + * 3. The global env defaults (`CORRECTIVE_RETRIEVAL_ENABLED`, + * `CORRECTIVE_RETRIEVAL_BUDGET`, `CORRECTIVE_RETRIEVAL_GRADER`). + * + * `retryBudget` is clamped to 1–2. `grader` is `"signal"` (default — no model + * call: re-query only when Huginn already flags the result weak, using Huginn's + * own `retryHints`) or `"haiku"` (a slimmed awaiting Haiku call that can also + * propose a semantic rewrite — costs ~3–5s per search, so opt-in only). * * Reads `process.env` directly (rather than going through `loadConfig()`) so it * has no hard `DATABASE_URL` dependency and behaves the same in tests. */ +export type GraderMode = "signal" | "haiku"; + export interface ResolvedCorrectiveConfig { enabled: boolean; /** Max corrective re-queries per knowledge search (1 or 2). */ retryBudget: number; + /** How the result quality is judged before a re-query. */ + grader: GraderMode; } export function resolveCorrectiveConfig( @@ -22,7 +30,7 @@ export function resolveCorrectiveConfig( env: NodeJS.ProcessEnv = process.env, ): ResolvedCorrectiveConfig { if (env.CORRECTIVE_RETRIEVAL_DISABLED === "1") { - return { enabled: false, retryBudget: 1 }; + return { enabled: false, retryBudget: 1, grader: "signal" }; } const bot = botConfig.correctiveRetrieval; @@ -31,7 +39,10 @@ export function resolveCorrectiveConfig( const globalBudget = parseBudgetEnv(env.CORRECTIVE_RETRIEVAL_BUDGET); const rawBudget = bot?.retryBudget ?? globalBudget ?? 1; - return { enabled, retryBudget: clampBudget(rawBudget) }; + + const grader = normalizeGraderMode(bot?.grader ?? 
env.CORRECTIVE_RETRIEVAL_GRADER); + + return { enabled, retryBudget: clampBudget(rawBudget), grader }; } export function clampBudget(n: number): number { @@ -39,6 +50,10 @@ export function clampBudget(n: number): number { return Math.max(1, Math.min(2, Math.floor(n))); } +export function normalizeGraderMode(raw: string | undefined): GraderMode { + return raw === "haiku" ? "haiku" : "signal"; +} + function parseBudgetEnv(raw: string | undefined): number | undefined { if (!raw) return undefined; const n = parseInt(raw, 10); diff --git a/src/ai/corrective-retrieval.test.ts b/src/ai/corrective-retrieval.test.ts index 3501f84..3820370 100644 --- a/src/ai/corrective-retrieval.test.ts +++ b/src/ai/corrective-retrieval.test.ts @@ -39,10 +39,14 @@ function searchSequence(...responses: KnowledgeSearchResponse[]) { return Object.assign(fn, { calls }); } +// Most of these exercise the grader-agnostic loop logic (retry / merge / dedupe +// / budget), driven through the opt-in Haiku grader with an injected fake. The +// "signal grader" describe block at the bottom covers the default (no-model) path. 
const baseCtx = { question: "what SEDs belong to LA_BUC_02?", originalQuery: "LA_BUC_02", botName: "test", + grader: "haiku" as const, log, }; @@ -222,4 +226,94 @@ describe("runCorrectiveRetrieval", () => { expect(out.metadata.retries).toBe(0); expect(search.calls.length).toBe(0); }); + + test("records graderMode in the metadata", async () => { + const out = await runCorrectiveRetrieval({ + ...baseCtx, + originalResultText: "## Original\ncollection: `wiki` doc_id: `1`", + budget: 1, + gradeFn: gradeSequence({ verdict: "correct", reason: "ok" }), + searchFn: searchSequence(searchResponse([result({ id: "1", collection: "wiki" })])), + }); + expect(out.metadata.graderMode).toBe("haiku"); + }); +}); + +describe("runCorrectiveRetrieval — signal grader (default, no model call)", () => { + const signalCtx = { + question: "what about LA_BUC_02?", + originalQuery: "LA_BUC_02 obscure phrasing", + botName: "test", + log, + // grader omitted → defaults to "signal" + }; + + test("confident result (no weak footer) → no grade-driven re-query, gradeFn never consulted", async () => { + const search = searchSequence(searchResponse([result({ id: "z", collection: "wiki" })])); + let graded = false; + const out = await runCorrectiveRetrieval({ + ...signalCtx, + originalResultText: renderSearchResults([result({ id: "1", collection: "wiki" })]), + budget: 1, + gradeFn: async () => { graded = true; return { verdict: "insufficient", reason: "x" }; }, + searchFn: search, + }); + expect(graded).toBe(false); + expect(out.changed).toBe(false); + expect(out.metadata.graderMode).toBe("signal"); + expect(out.metadata.verdicts).toEqual(["correct"]); + expect(search.calls.length).toBe(0); + }); + + test("Huginn 'Weak match' footer → re-queries with the broaderQuery hint, merges", async () => { + const original = + renderSearchResults([result({ id: "1", collection: "wiki", title: "Marginal" })]) + + '\n\n*Weak match — try: broader query: "LA_BUC concepts"*'; + const search = 
searchSequence(searchResponse([result({ id: "2", collection: "wiki", title: "Wider hit" })])); + const out = await runCorrectiveRetrieval({ + ...signalCtx, + originalResultText: original, + budget: 1, + searchFn: search, + }); + expect(search.calls[0]?.query).toBe("LA_BUC concepts"); + expect(out.changed).toBe(true); + expect(out.text).toContain("Wider hit"); + expect(out.text).not.toContain("Weak match — try"); // obsolete footer stripped on merge + expect(out.metadata.graderMode).toBe("signal"); + expect(out.metadata.verdicts).toEqual(["insufficient", "correct"]); + expect(out.metadata.queriesTried).toEqual(["LA_BUC concepts"]); + expect(out.metadata.graderMs).toBeLessThan(50); // ≈0 — no model call + }); + + test("weak footer with only related terms (no broader/narrower) → no re-query", async () => { + const original = + renderSearchResults([result({ id: "1", collection: "wiki" })]) + "\n\n*Weak match — try: related terms: foo, bar*"; + const search = searchSequence(searchResponse([result({ id: "2", collection: "wiki" })])); + const out = await runCorrectiveRetrieval({ + ...signalCtx, + originalResultText: original, + budget: 1, + searchFn: search, + }); + expect(search.calls.length).toBe(0); + expect(out.changed).toBe(false); + expect(out.metadata.verdicts).toEqual(["insufficient"]); + expect(out.metadata.retries).toBe(0); + }); + + test("budget 2 does not loop in signal mode once the footer hint is exhausted", async () => { + const original = + renderSearchResults([result({ id: "1", collection: "wiki" })]) + + '\n\n*Weak match — try: narrower query: "LA_BUC_02 narrow"*'; + const search = searchSequence(searchResponse([result({ id: "2", collection: "wiki" })])); + const out = await runCorrectiveRetrieval({ + ...signalCtx, + originalResultText: original, + budget: 2, + searchFn: search, + }); + expect(search.calls.map((c) => c.query)).toEqual(["LA_BUC_02 narrow"]); + expect(out.metadata.retries).toBe(1); + }); }); diff --git a/src/ai/corrective-retrieval.ts 
b/src/ai/corrective-retrieval.ts index 925f7ad..234ad0d 100644 --- a/src/ai/corrective-retrieval.ts +++ b/src/ai/corrective-retrieval.ts @@ -1,11 +1,13 @@ import type { Logger } from "@logtape/logtape"; -import { gradeKnowledgeResults, type GradeVerdict, type KnowledgeGrade } from "./knowledge-grader.ts"; +import { gradeKnowledgeResults, gradeFromSignal, type GradeVerdict, type KnowledgeGrade } from "./knowledge-grader.ts"; +import type { GraderMode } from "./corrective-config.ts"; import { searchKnowledge, renderSearchResults, renderRetryHintsFooter, extractDocKeysFromRenderedText, parseQueryHintsFromFooter, + stripTrailingRetryFooter, docKey, type KnowledgeSearchResponse, } from "./knowledge-search-client.ts"; @@ -14,25 +16,27 @@ import { * CRAG-lite corrective loop around the knowledge search tool. After a bot's * `search_knowledge` call returns, this: * - * 1. Grades the result with Haiku ({@link gradeKnowledgeResults}). + * 1. Grades the result — by default the **`"signal"`** grader (no model call: + * reads Huginn's `*Weak match …*` / "No results" signal — {@link + * gradeFromSignal}); optionally the **`"haiku"`** grader (a slimmed + * awaiting Haiku call that also reads snippets and can propose a semantic + * rewrite — {@link gradeKnowledgeResults}). * 2. If the verdict is "ambiguous" / "insufficient" and the retry budget * isn't spent, re-queries Huginn's `/api/search` with the grader's - * rewritten query (falling back to the Phase-0 `retryHints.broaderQuery` / - * `narrowerQuery` parsed from the result footer), optionally redirected to - * a `suggestedCollection`, forcing `rerank=true` so the re-query's - * `confidenceBand`s are trustworthy. - * 3. Merges the fresh hits into the original result text — deduped against - * it by `collection/doc_id` — with an inline note explaining the retry. 
+ * rewritten query (Haiku mode) or the Phase-0 `retryHints.broaderQuery` / + * `narrowerQuery` parsed from the result footer (signal mode), optionally + * redirected to a `suggestedCollection`, forcing `rerank=true` so the + * re-query's `confidenceBand`s are trustworthy. + * 3. Merges the fresh hits into the original result text — deduped against it + * by `collection/doc_id` — with an inline note explaining the retry. * 4. Optionally re-grades and retries again, up to the (clamped 1–2) budget; * never recursive. * * Returns the consolidated text to feed the model plus a `corrective` metadata - * block for tracing (`{retries, verdicts, reasons, queriesTried, finalVerdict}`). - * - * Fail-soft throughout: a grader that can't be reached returns "correct" (no - * change); a re-query HTTP error ends the loop with whatever's accumulated. The - * caller is expected to gate on the per-bot toggle — this function assumes the - * feature is enabled and `budget >= 1`. + * block for tracing. Fail-soft throughout: a grader that can't be reached + * returns "correct" (no change); a re-query HTTP error ends the loop with + * whatever's accumulated. The caller gates on the per-bot toggle — this + * function assumes the feature is enabled and `budget >= 1`. * * Plan: `../mimir/plans/huginn-muninn-corrective-rag.md` (Phase 1). */ @@ -52,7 +56,9 @@ export interface CorrectiveMetadata { /** The verdict from the last grading pass — i.e. whether the corrective * pass left the result set in good shape. */ finalVerdict: GradeVerdict; - /** Total wall time spent in the Haiku grader across all passes, ms. */ + /** Which grader judged the result(s). */ + graderMode: GraderMode; + /** Total wall time spent in the grader across all passes, ms (≈0 in signal mode). */ graderMs: number; /** Wall time of each re-query HTTP call, parallel to `queriesTried`, ms. */ requeryMs: number[]; @@ -83,11 +89,13 @@ export interface CorrectiveRetrievalContext { /** Max re-queries. Clamped to [1, 2]. 
The caller gates on the per-bot * toggle; this function only sees enabled invocations. */ budget: number; + /** Which grader to use. `"signal"` (default) makes no model call. */ + grader?: GraderMode; botName: string; - /** Working directory for the grader's Haiku spawn. */ + /** Working directory for the grader's Haiku spawn (Haiku mode only). */ cwd?: string; log: Logger; - /** Haiku model override for the grader. */ + /** Haiku model override for the grader (Haiku mode only). */ graderModel?: string; graderTimeoutMs?: number; /** Injectable for tests. */ @@ -97,8 +105,9 @@ export interface CorrectiveRetrievalContext { export async function runCorrectiveRetrieval(ctx: CorrectiveRetrievalContext): Promise { const budget = Math.max(1, Math.min(2, Math.floor(ctx.budget))); + const graderMode: GraderMode = ctx.grader ?? "signal"; const search = ctx.searchFn ?? searchKnowledge; - const grade = ctx.gradeFn ?? gradeKnowledgeResults; + const haikuGrade = ctx.gradeFn ?? gradeKnowledgeResults; const { question, originalQuery, originalResultText, botName, cwd, log } = ctx; let currentText = originalResultText; @@ -116,22 +125,26 @@ export async function runCorrectiveRetrieval(ctx: CorrectiveRetrievalContext): P for (;;) { let g: KnowledgeGrade; const gradeStart = performance.now(); - try { - g = await grade({ - question, - toolResultText: currentText, - botName, - cwd, - log, - model: ctx.graderModel, - timeoutMs: ctx.graderTimeoutMs, - }); - } catch (err) { - log.warn("corrective: grader threw — stopping with current results: {error}", { - botName, - error: err instanceof Error ? 
err.message : String(err), - }); - g = { verdict: "correct", reason: "grader error" }; + if (graderMode === "haiku") { + try { + g = await haikuGrade({ + question, + toolResultText: currentText, + botName, + cwd, + log, + model: ctx.graderModel, + timeoutMs: ctx.graderTimeoutMs, + }); + } catch (err) { + log.warn("corrective: grader threw — stopping with current results: {error}", { + botName, + error: err instanceof Error ? err.message : String(err), + }); + g = { verdict: "correct", reason: "grader error" }; + } + } else { + g = gradeFromSignal(currentText); } graderMs += performance.now() - gradeStart; verdicts.push(g.verdict); @@ -186,7 +199,9 @@ export async function runCorrectiveRetrieval(ctx: CorrectiveRetrievalContext): P collections, freshCount: fresh.length, }); - currentText = `${currentText}\n\n---\n${note}\n\n${renderSearchResults(fresh)}${renderRetryHintsFooter(resp)}`; + // Drop the now-obsolete "try X" footer from what we had, append the note + + // fresh hits + (the re-query's own footer, if it too came back weak). + currentText = `${stripTrailingRetryFooter(currentText).trimEnd()}\n\n---\n${note}\n\n${renderSearchResults(fresh)}${renderRetryHintsFooter(resp)}`; currentCollections = collections; } @@ -200,6 +215,7 @@ export async function runCorrectiveRetrieval(ctx: CorrectiveRetrievalContext): P queriesTried, collectionsTried, finalVerdict: verdicts[verdicts.length - 1] ?? 
"correct", + graderMode, graderMs: Math.round(graderMs), requeryMs, }, diff --git a/src/ai/knowledge-grader.test.ts b/src/ai/knowledge-grader.test.ts index df42d94..63486ef 100644 --- a/src/ai/knowledge-grader.test.ts +++ b/src/ai/knowledge-grader.test.ts @@ -1,5 +1,5 @@ import { test, expect, describe } from "bun:test"; -import { gradeKnowledgeResults, normalizeGrade } from "./knowledge-grader.ts"; +import { gradeKnowledgeResults, normalizeGrade, gradeFromSignal, digestResultsForGrading } from "./knowledge-grader.ts"; import { getLog } from "../logging.ts"; import type { HaikuResult } from "../scheduler/executor.ts"; @@ -76,3 +76,57 @@ describe("gradeKnowledgeResults", () => { expect(g.verdict).toBe("correct"); }); }); + +describe("gradeFromSignal", () => { + test("'correct' when there's no weak/no-results signal", () => { + const g = gradeFromSignal("## A doc (82% relevant · high)\ncollection: `wiki` doc_id: `1`\n\nbody text"); + expect(g.verdict).toBe("correct"); + expect(g.rewrittenQuery).toBeUndefined(); + }); + + test("'insufficient' on a trailing Weak match footer", () => { + const g = gradeFromSignal('## A doc (12% relevant · low)\ncollection: `wiki` doc_id: `1`\n\nbody\n\n*Weak match — try: broader query: "x"*'); + expect(g.verdict).toBe("insufficient"); + expect(g.rewrittenQuery).toBeUndefined(); // signal mode never rewrites; the loop uses the footer hint + }); + + test("'insufficient' on a No confident match footer", () => { + expect(gradeFromSignal("nothing relevant\n\n*No confident match — try: related terms: a, b*").verdict).toBe("insufficient"); + }); + + test("'insufficient' on a 'No results found' body", () => { + expect(gradeFromSignal("No results found for 'xyz'.").verdict).toBe("insufficient"); + }); + + test("a literal 'weak match' inside body prose does not trigger (must be a `*…*` footer line)", () => { + expect(gradeFromSignal("## Doc\nThis explains why a weak match can happen.").verdict).toBe("correct"); + }); + + test("empty input → 
'correct'", () => { + expect(gradeFromSignal("").verdict).toBe("correct"); + }); +}); + +describe("digestResultsForGrading", () => { + test("keeps the weak-match footer even when the body is large", () => { + const big = Array.from({ length: 8 }, (_, i) => `## Doc ${i} (50% relevant · medium)\nhttps://x/${i}\ncollection: \`c\` doc_id: \`${i}\`\n\n${"lorem ipsum ".repeat(80)}`).join("\n"); + const text = `${big}\n\n*Weak match — try: broader query: "wider"*`; + const digest = digestResultsForGrading(text); + expect(digest).toContain('*Weak match — try: broader query: "wider"*'); + expect(digest.length).toBeLessThan(text.length); + expect(digest).toContain("## Doc 0"); + }); + + test("trims each block's body to a short prefix", () => { + const text = `## Doc (70% relevant · high)\nhttps://x/1\ncollection: \`c\` doc_id: \`1\`\n\n${"A".repeat(2000)}`; + const digest = digestResultsForGrading(text); + expect(digest).toContain("## Doc (70% relevant · high)"); + expect(digest).toContain("…"); // truncation marker + expect(digest.length).toBeLessThan(700); + }); + + test("empty input → empty string", () => { + expect(digestResultsForGrading("")).toBe(""); + expect(digestResultsForGrading(" ")).toBe(""); + }); +}); diff --git a/src/ai/knowledge-grader.ts b/src/ai/knowledge-grader.ts index 52ed45a..081cb07 100644 --- a/src/ai/knowledge-grader.ts +++ b/src/ai/knowledge-grader.ts @@ -3,20 +3,22 @@ import { extractJson } from "./json-extract.ts"; import type { Logger } from "@logtape/logtape"; /** - * CRAG-style retrieval evaluator for the knowledge search tool. Given the - * user's question and the (rendered) search results — which carry per-result - * `confidenceBand` annotations and a `*No confident match — try: …*` footer - * from Huginn's MCP adapter — a dedicated Haiku call decides whether the - * results are good enough to answer from, and if not, proposes a sharper - * query and/or a better collection. 
+ * Retrieval-quality judges for the knowledge search tool, used by the + * corrective-retrieval loop (see corrective-retrieval.ts): * - * This is an **awaiting** Haiku call (it gates whether a corrective re-query - * happens), so it uses {@link spawnHaiku} directly rather than the - * fire-and-forget {@link runHaikuExtraction} pattern. + * - {@link gradeFromSignal} — the **default**: no model call. Just reads the + * cheap signal Huginn already emits (a `*Weak match …*` / `*No confident + * match …*` footer, or a "No results found" body) and returns `insufficient` + * when the search itself was unsure, `correct` otherwise. The re-query, when + * one happens, uses Huginn's own `retryHints` (parsed from that footer). + * - {@link gradeKnowledgeResults} — opt-in (`correctiveRetrieval.grader: + * "haiku"`): a slimmed **awaiting** Haiku call that also reads the result + * snippets and can propose a semantic rewrite / a better collection. Costs + * ~3–5s per search, so it's not the default. * - * Fail-soft: any Haiku error or unparseable output yields `verdict: "correct"` - * — the corrective loop becomes a no-op and the model sees the original result - * unchanged. The corrective feature must never make a search *worse*. + * Both are fail-soft: a Haiku error or unparseable output → `verdict: "correct"` + * (the corrective loop becomes a no-op and the model sees the original result + * unchanged). The corrective feature must never make a search *worse*. * * Plan: `../mimir/plans/huginn-muninn-corrective-rag.md` (Phase 1). */ @@ -25,25 +27,52 @@ export type GradeVerdict = "correct" | "ambiguous" | "insufficient"; export interface KnowledgeGrade { verdict: GradeVerdict; - /** A single search string (not a question) to re-query with. Present only - * when verdict is "ambiguous" or "insufficient" and the grader had a better - * query to offer. */ + /** A single search string (not a question) to re-query with. 
Present only in + * Haiku mode when the grader had a better query to offer; signal mode never + * sets it (the re-query query comes from Huginn's `retryHints` instead). */ rewrittenQuery?: string; /** A collection name to try instead — only when the results hint another - * collection is the right home. Never invented. */ + * collection is the right home. Never invented. Haiku mode only. */ suggestedCollection?: string; /** One short sentence explaining the verdict. */ reason: string; } +// ── Signal grader (default — no model call) ──────────────────────────────── + +/** Matches the `*Weak match …*` / `*No confident match …*` footer Huginn's MCP + * adapter appends when `bestScore` is below its weak-result threshold or the + * result list is empty. */ +const WEAK_FOOTER_RE = /(^|\n)\s*\*(?:No confident match|Weak match)\b/; +const NO_RESULTS_RE = /(^|\n)No results found for /; + +/** + * Judge a search result purely from Huginn's emitted signal — no LLM. Returns + * `insufficient` (no rewritten query — the corrective loop will fall back to + * the `retryHints.broaderQuery` / `narrowerQuery` parsed from the footer) when + * Huginn flagged the result weak/empty, `correct` otherwise. + */ +export function gradeFromSignal(resultText: string): KnowledgeGrade { + const text = resultText ?? ""; + if (NO_RESULTS_RE.test(text)) { + return { verdict: "insufficient", reason: "search returned no results" }; + } + if (WEAK_FOOTER_RE.test(text)) { + return { verdict: "insufficient", reason: "Huginn flagged the result as low confidence" }; + } + return { verdict: "correct", reason: "no low-confidence signal from the search" }; +} + +// ── Haiku grader (opt-in) ────────────────────────────────────────────────── + export interface GradeKnowledgeOptions { question: string; /** The rendered search-result text the model would see (trace markers - * already peeled). */ + * already peeled). Digested down to the top hits before being sent to Haiku. 
*/ toolResultText: string; botName: string; /** Working directory for the Haiku spawn — keeps the session out of the - * project root and gives it the bot's MCP/settings context. */ + * project root. */ cwd?: string; log: Logger; /** Haiku model override (defaults to the project's standard Haiku model). */ @@ -53,16 +82,15 @@ export interface GradeKnowledgeOptions { spawnFn?: typeof spawnHaiku; } -/** Cap the result text fed into the grader prompt — keeps the Haiku call cheap - * and well under its context window. The trailing footer (retry hints) lives - * at the end of the text, so prefer keeping the head + tail. */ -const MAX_RESULT_CHARS = 12_000; +/** Cap the (already-digested) result text fed into the grader prompt. Kept + * small so the Haiku call stays in the ~3–5s range rather than ~10s+. */ +const MAX_GRADER_INPUT_CHARS = 4_000; export async function gradeKnowledgeResults(opts: GradeKnowledgeOptions): Promise { const { question, botName, cwd, log } = opts; - const resultText = clampResultText(opts.toolResultText); + const digest = digestResultsForGrading(opts.toolResultText); - const prompt = buildGraderPrompt(question, resultText); + const prompt = buildGraderPrompt(question, digest); const spawn = opts.spawnFn ?? spawnHaiku; let raw: string; @@ -121,21 +149,60 @@ export function normalizeGrade(parsed: Record): KnowledgeGrade return grade; } -function clampResultText(text: string): string { - if (text.length <= MAX_RESULT_CHARS) return text; - const head = Math.floor(MAX_RESULT_CHARS * 0.7); - const tail = MAX_RESULT_CHARS - head; - return `${text.slice(0, head)}\n…[${text.length - MAX_RESULT_CHARS} chars omitted]…\n${text.slice(-tail)}`; +/** + * Reduce a full rendered result text to a compact digest for the Haiku grader: + * the top result blocks (header line with title + confidence band, the + * breadcrumb/url line, and a short prefix of the body) plus the trailing + * weak-match footer if present. 
Keeps the prompt small without dropping the + * signal the grader needs (titles + bands + a taste of each hit + whether the + * search flagged itself unsure). + */ +export function digestResultsForGrading(text: string): string { + const src = (text ?? "").trim(); + if (!src) return ""; + + // Pull off the trailing weak-match footer (a single `*…*` line at the end) + // so it's never lost to truncation. + let footer = ""; + const footerMatch = src.match(/\n\s*(\*(?:No confident match|Weak match)[^\n]*\*)\s*$/); + const body = footerMatch ? src.slice(0, footerMatch.index).trimEnd() : src; + if (footerMatch) footer = footerMatch[1]!; + + // Split into result blocks at `## ` headers (the MCP adapter's full-mode + // format). If there are no `## ` headers (brief mode uses `1. **Title**`), + // just take the head of the body. + const blocks = body.split(/\n(?=## )/); + const digestedBlocks: string[] = []; + let used = 0; + for (const block of blocks) { + if (used >= MAX_GRADER_INPUT_CHARS) break; + const lines = block.split("\n"); + // Header + the next couple of lines (url / breadcrumb / collection), then a + // short prefix of whatever follows. + const headLines = lines.slice(0, 4).join("\n"); + const rest = lines.slice(4).join("\n").replace(/\n{2,}/g, "\n").trim(); + const restPrefix = rest.length > 240 ? rest.slice(0, 240) + "…" : rest; + const piece = restPrefix ? `${headLines}\n${restPrefix}` : headLines; + digestedBlocks.push(piece); + used += piece.length; + } + + let out = digestedBlocks.join("\n\n"); + if (out.length > MAX_GRADER_INPUT_CHARS) { + out = out.slice(0, MAX_GRADER_INPUT_CHARS) + "\n…[truncated]…"; + } + if (footer) out = `${out}\n\n${footer}`; + return out; } -function buildGraderPrompt(question: string, resultText: string): string { +function buildGraderPrompt(question: string, resultDigest: string): string { return `You grade the quality of knowledge-base search results before an assistant answers from them. 
USER QUESTION: ${question} -SEARCH RESULTS (each hit is annotated with a confidence band — high / medium / low; a trailing "No confident match" or "Weak match" line, if present, means the search itself was unsure): -${resultText || "(no results were returned)"} +SEARCH RESULTS (top hits — each annotated with a confidence band: high / medium / low; a trailing "No confident match" or "Weak match" line, if present, means the search itself was unsure): +${resultDigest || "(no results were returned)"} Decide whether these results let the question be answered well, then respond with ONLY a JSON object — no prose, no markdown fence: {"verdict":"correct"|"ambiguous"|"insufficient","rewrittenQuery":"...","suggestedCollection":"...","reason":"..."} diff --git a/src/ai/knowledge-search-client.ts b/src/ai/knowledge-search-client.ts index 599685c..22dca9b 100644 --- a/src/ai/knowledge-search-client.ts +++ b/src/ai/knowledge-search-client.ts @@ -237,6 +237,19 @@ export function renderRetryHintsFooter(resp: Pick 0 ? `\n\n*${prefix} — try: ${bits.join(" · ")}*` : `\n\n*${prefix}.*`; } +/** A trailing `*Weak match …*` / `*No confident match …*` retry-hints footer + * (Huginn's MCP adapter appends one; {@link renderRetryHintsFooter} produces + * the same shape). */ +const TRAILING_RETRY_FOOTER_RE = /\n+\s*\*(?:No confident match|Weak match)[^\n]*\*\s*$/; + +/** Strip a trailing retry-hints footer from a rendered result text. Used when + * splicing a corrective re-query in: the original "try X" footer is obsolete + * once X has been tried, and leaving it would also confuse the next signal-mode + * grade pass into re-detecting the *already-handled* weak signal. 
*/ +export function stripTrailingRetryFooter(text: string): string { + return text.replace(TRAILING_RETRY_FOOTER_RE, ""); +} + const DOC_ID_LINE_RE = /collection:\s*`([^`]+)`\s+doc_id:\s*`([^`]+)`/g; /** Extract `collection/doc_id` keys from rendered search-result text — used to diff --git a/src/bots/config.ts b/src/bots/config.ts index 0af4a4b..ad11bd5 100644 --- a/src/bots/config.ts +++ b/src/bots/config.ts @@ -87,6 +87,11 @@ export interface CorrectiveRetrievalBotConfig { enabled?: boolean; /** Max corrective re-queries per knowledge search. Clamped to 1–2. Default 1. */ retryBudget?: number; + /** Result-quality judge: `"signal"` (default — no model call; re-query only + * when Huginn already flags the result weak, using Huginn's `retryHints`) or + * `"haiku"` (a slimmed awaiting Haiku call that can also propose a semantic + * rewrite — ~3–5s per search, opt-in). */ + grader?: "signal" | "haiku"; } export interface BotPrompts { diff --git a/src/core/corrective-trace-spans.test.ts b/src/core/corrective-trace-spans.test.ts index be64f3c..39b4e0f 100644 --- a/src/core/corrective-trace-spans.test.ts +++ b/src/core/corrective-trace-spans.test.ts @@ -11,18 +11,19 @@ describe("planCorrectiveSpans", () => { test("one knowledge_grade span when graded but not re-queried", () => { const corr: CorrectiveToolMeta = { retries: 0, - verdicts: ["correct"], - reasons: ["covered"], + verdicts: ["insufficient"], + reasons: ["Huginn flagged the result as low confidence"], queriesTried: [], - finalVerdict: "correct", - graderMs: 1200, + finalVerdict: "insufficient", + graderMode: "signal", + graderMs: 0, }; const spans = planCorrectiveSpans(corr, 200); expect(spans.map((s) => s.name)).toEqual(["knowledge_grade"]); expect(spans[0]!.startOffsetMs).toBe(200); - expect(spans[0]!.durationMs).toBe(1200); - expect(spans[0]!.attributes.model).toBe("haiku"); - expect(spans[0]!.attributes.finalVerdict).toBe("correct"); + expect(spans[0]!.durationMs).toBe(1); // 1ms floor — signal mode has ~0 
grader time + expect(spans[0]!.attributes.mode).toBe("signal"); + expect(spans[0]!.attributes.finalVerdict).toBe("insufficient"); expect(spans[0]!.attributes.passes).toBe(1); }); @@ -34,6 +35,7 @@ describe("planCorrectiveSpans", () => { queriesTried: ["q1", "q2"], collectionsTried: [null, ["confluence"]], finalVerdict: "correct", + graderMode: "haiku", graderMs: 900, requeryMs: [150, 220], }; @@ -41,6 +43,7 @@ describe("planCorrectiveSpans", () => { expect(spans.map((s) => s.name)).toEqual(["knowledge_grade", "knowledge_requery", "knowledge_requery"]); // grade [300, 1200), requery#1 [1200, 1350), requery#2 [1350, 1570) expect(spans[0]!.startOffsetMs).toBe(300); + expect(spans[0]!.attributes.mode).toBe("haiku"); expect(spans[1]!.startOffsetMs).toBe(1200); expect(spans[1]!.durationMs).toBe(150); expect(spans[1]!.attributes.query).toBe("q1"); @@ -53,7 +56,7 @@ describe("planCorrectiveSpans", () => { test("uses a 1ms floor when timings are missing", () => { const spans = planCorrectiveSpans( - { retries: 1, verdicts: ["insufficient", "correct"], reasons: ["x", "y"], queriesTried: ["q"], finalVerdict: "correct" }, + { retries: 1, verdicts: ["insufficient", "correct"], reasons: ["x", "y"], queriesTried: ["q"], finalVerdict: "correct", graderMode: "signal" }, 0, ); expect(spans[0]!.durationMs).toBe(1); diff --git a/src/core/corrective-trace-spans.ts b/src/core/corrective-trace-spans.ts index f4ca5c0..cea500c 100644 --- a/src/core/corrective-trace-spans.ts +++ b/src/core/corrective-trace-spans.ts @@ -44,7 +44,7 @@ export function planCorrectiveSpans( durationMs: graderMs, startOffsetMs: cursor, attributes: { - model: "haiku", + mode: corrective.graderMode ?? 
"signal", passes: corrective.verdicts.length, verdicts: corrective.verdicts, finalVerdict: corrective.finalVerdict, diff --git a/src/types.ts b/src/types.ts index 08507ca..31244dc 100644 --- a/src/types.ts +++ b/src/types.ts @@ -95,7 +95,9 @@ export interface CorrectiveToolMeta { collectionsTried?: (string[] | null)[]; /** Verdict from the final grading pass — whether the result set ended up usable. */ finalVerdict: string; - /** Total Haiku grader wall time across all passes, ms. */ + /** Which grader judged the result(s): `"signal"` (no model call) or `"haiku"`. */ + graderMode?: string; + /** Total grader wall time across all passes, ms (≈0 in signal mode). */ graderMs?: number; /** Wall time of each re-query HTTP call, parallel to `queriesTried`, ms. */ requeryMs?: number[]; From 4282066d013de44ab51911a0645d56dfc3880650 Mon Sep 17 00:00:00 2001 From: RuneLind Date: Tue, 12 May 2026 21:35:42 +0200 Subject: [PATCH 3/4] Surface "0 hits / low confidence" on the search-tool waterfall row MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A Huginn search can keep hundreds of candidates yet hand the model "No results found / low confidence" — the kept/fetched candidate count chip hid that. Now, when the captured tool output (or the trace's Phase-0 `response` block) shows the model got nothing usable, the row replaces the `N/N` candidate chip with a red `0 hits` chip; a weak-match footer flips the count chip to the low-confidence palette and adds a tooltip note. The corrective chip's tooltip also now carries the grader mode and the grade reason (e.g. "corrective retrieval (signal): insufficient — search returned no results; no re-query"). 
--- .../views/components/helpers.test.ts | 45 ++++++++++++++++ src/dashboard/views/components/span-label.ts | 54 ++++++++++++++++--- .../views/components/traces-waterfall.ts | 8 +++ 3 files changed, 99 insertions(+), 8 deletions(-) diff --git a/src/dashboard/views/components/helpers.test.ts b/src/dashboard/views/components/helpers.test.ts index 149968f..e829d51 100644 --- a/src/dashboard/views/components/helpers.test.ts +++ b/src/dashboard/views/components/helpers.test.ts @@ -217,6 +217,51 @@ describe("deriveSpanLabelHtml", () => { expect(single!.html).not.toContain("summed across"); }); + test("shows a '0 hits' chip (not the candidate count) when the search returned nothing to the model", () => { + const out = deriveSpanLabelHtml({ + name: "knowledge-search_knowledge", + attributes: { + output: "No results found for 'meningen med livet' (low confidence).\n\n*No confident match — try: related terms: a, b*", + searchTrace: { + schemaVersion: 1, + collections: [{ name: "kb", candidates: [{ kept: true }, { kept: true }, { kept: true }], confidence: { lowConfidence: false } }], + }, + }, + }); + expect(out!.html).toContain("wf-chip wf-no-hits"); + expect(out!.html).toContain(">0 hits<"); + expect(out!.html).not.toContain("wf-counts"); // kept/fetched counts chip suppressed (independent of the 3/3 numbers) + expect(out!.tooltip).toContain("no results returned to the model"); + }); + + test("flips counts chip to low-conf variant when the output carries a weak-match footer", () => { + const out = deriveSpanLabelHtml({ + name: "knowledge-search_knowledge", + attributes: { + output: "## A doc (18% relevant · low)\ncollection: `kb` doc_id: `1`\n\nbody\n\n*Weak match — try: broader query: \"x\"*", + searchTrace: { + schemaVersion: 1, + collections: [{ name: "kb", candidates: [{ kept: true }, { kept: true }], confidence: { lowConfidence: false } }], + }, + }, + }); + expect(out!.html).toContain("wf-chip wf-counts wf-low-conf"); + expect(out!.tooltip).toContain("low-confidence results"); + }); + + test("corrective chip 
tooltip carries the grader mode and reason", () => { + const out = deriveSpanLabelHtml({ + name: "knowledge-search_knowledge", + attributes: { + input: { query: "x", collection: "kb" }, + corrective: { retries: 0, verdicts: ["insufficient"], reasons: ["search returned no results"], finalVerdict: "insufficient", graderMode: "signal", queriesTried: [] }, + }, + }); + expect(out!.html).toContain("wf-corrective-bad"); + expect(out!.html).toContain(">grade "); + expect(out!.tooltip).toMatch(/corrective retrieval \(signal\): insufficient — search returned no results; no re-query/); + }); + test("flips counts chip to low-conf variant when any collection is low-confidence", () => { const out = deriveSpanLabelHtml({ name: "knowledge-search_knowledge", diff --git a/src/dashboard/views/components/span-label.ts b/src/dashboard/views/components/span-label.ts index 3c18414..9bd29da 100644 --- a/src/dashboard/views/components/span-label.ts +++ b/src/dashboard/views/components/span-label.ts @@ -57,6 +57,12 @@ export function deriveSpanLabelHtml(span: SpanLike): { html: string; tooltip: st // retry count so a corrected search is visible at a glance. const corr = correctiveChipFromAttrs(attrs.corrective); + // Whether the search actually returned anything usable *to the model* — + // distinct from "how many candidates the pipeline kept". A search can keep + // hundreds of candidates yet hand the model "No results found / low + // confidence", which the candidate-count chip alone hides. + const resultSignal = searchResultSignal(attrs); + // Search-tool path: collection chips + counts chip, derived from searchTrace // or input.collection. let collections = collectionsFor(attrs); @@ -67,18 +73,23 @@ export function deriveSpanLabelHtml(span: SpanLike): { html: string; tooltip: st const moreChip = collections.length > 1 ? 
`+${collections.length - 1}` : ''; + const lowConf = !!(summary?.lowConfidence) || resultSignal === "weak"; let countsChip = ""; - if (summary) { - const cls = summary.lowConfidence ? "wf-chip wf-counts wf-low-conf" : "wf-chip wf-counts"; - const scope = collections.length > 1 - ? ` (summed across ${collections.length} collections)` - : ""; - const tip = summary.lowConfidence + if (resultSignal === "empty") { + // The model got "No results found" — the kept/fetched count is candidate + // pipeline noise here, so show the honest outcome instead. + countsChip = `0 hits`; + } else if (summary) { + const cls = lowConf ? "wf-chip wf-counts wf-low-conf" : "wf-chip wf-counts"; + const scope = collections.length > 1 ? ` (summed across ${collections.length} collections)` : ""; + const tip = lowConf ? `${summary.kept} kept / ${summary.fetched} fetched${scope} · low confidence` : `${summary.kept} kept / ${summary.fetched} fetched${scope}`; countsChip = `${summary.kept}/${summary.fetched}`; } const tooltipLines = [span.name, "collections: " + collections.join(", ")]; + if (resultSignal === "empty") tooltipLines.push("⚠ no results returned to the model"); + else if (resultSignal === "weak") tooltipLines.push("⚠ low-confidence results (Huginn flagged a weak match)"); if (summary) { tooltipLines.push(`candidates: ${summary.kept} kept / ${summary.fetched} fetched`); if (summary.topTitle) tooltipLines.push("top: " + summary.topTitle); @@ -116,12 +127,14 @@ export function deriveSpanLabelHtml(span: SpanLike): { html: string; tooltip: st * the corrective pass left the result set usable. 
*/ function correctiveChipFromAttrs(raw: unknown): { html: string; tooltipLines: string[] } | null { if (!raw || typeof raw !== "object") return null; - const c = raw as { retries?: unknown; finalVerdict?: unknown; verdicts?: unknown; queriesTried?: unknown }; + const c = raw as { retries?: unknown; finalVerdict?: unknown; verdicts?: unknown; queriesTried?: unknown; reasons?: unknown; graderMode?: unknown }; const finalVerdict = typeof c.finalVerdict === "string" ? c.finalVerdict : undefined; const verdicts = Array.isArray(c.verdicts) ? c.verdicts.map(String) : []; if (!finalVerdict && verdicts.length === 0) return null; const retries = typeof c.retries === "number" ? c.retries : 0; const queries = Array.isArray(c.queriesTried) ? c.queriesTried.map(String) : []; + const reason = Array.isArray(c.reasons) && typeof c.reasons[0] === "string" ? (c.reasons[0] as string) : ""; + const mode = c.graderMode === "haiku" ? "haiku" : c.graderMode === "signal" ? "signal" : ""; const cls = finalVerdict === "correct" ? "wf-corrective wf-corrective-ok" @@ -129,7 +142,9 @@ function correctiveChipFromAttrs(raw: unknown): { html: string; tooltipLines: st : "wf-corrective wf-corrective-bad"; const sym = finalVerdict === "correct" ? "✓" : finalVerdict === "ambiguous" ? "≈" : "✗"; const text = retries > 0 ? `⟲${retries} ${sym}` : `grade ${sym}`; - const tip = `corrective retrieval: ${verdicts.join(" → ") || finalVerdict}` + + const tip = + `corrective retrieval${mode ? ` (${mode})` : ""}: ${verdicts.join(" → ") || finalVerdict}` + + (reason ? ` — ${reason}` : "") + (queries.length ? 
`; re-queried: ${queries.map((q) => `"${q}"`).join(", ")}` : "; no re-query"); return { html: `${escHtml(text)}`, @@ -137,6 +152,29 @@ function correctiveChipFromAttrs(raw: unknown): { html: string; tooltipLines: st }; } +/** Whether a search-tool span's result was actually usable *by the model*: + * `"empty"` ("No results found" / `noConfidentResults`), `"weak"` (a + * `*Weak match*` / `*No confident match*` footer), or `null` (looks fine). + * Reads the captured tool output first (ground truth of what the model saw), + * falling back to the Huginn trace's Phase-0 `response` block. */ +function searchResultSignal(attrs: NonNullable): "empty" | "weak" | null { + const out = typeof attrs.output === "string" ? attrs.output : null; + if (out) { + if (/(^|\n)\s*No results found for /.test(out)) return "empty"; + if (/(^|\n)\s*\*(?:No confident match|Weak match)\b/.test(out)) return "weak"; + return null; + } + const trace = attrs.searchTrace; + if (trace && typeof trace === "object") { + const resp = (trace as { response?: { noConfidentResults?: unknown; bestScore?: unknown } }).response; + if (resp) { + if (resp.noConfidentResults === true) return "empty"; + if (typeof resp.bestScore === "number" && resp.bestScore < 0.45) return "weak"; + } + } + return null; +} + interface ToolLabelExtras { chips: string; tooltipLines: string[]; } type ExtrasRecipe = { diff --git a/src/dashboard/views/components/traces-waterfall.ts b/src/dashboard/views/components/traces-waterfall.ts index 73e9e61..9604e49 100644 --- a/src/dashboard/views/components/traces-waterfall.ts +++ b/src/dashboard/views/components/traces-waterfall.ts @@ -128,6 +128,14 @@ export function tracesWaterfallStyles(): string { color: var(--status-warning); border-color: color-mix(in srgb, var(--status-warning) 35%, transparent); } + /* "0 hits" chip — the search returned nothing usable to the model, even if + the pipeline kept candidates. Replaces the kept/fetched count in that case. 
*/ + .wf-chip.wf-no-hits { + background: color-mix(in srgb, var(--status-error, var(--status-magenta)) 14%, transparent); + color: var(--status-error, var(--status-magenta)); + border: 1px solid color-mix(in srgb, var(--status-error, var(--status-magenta)) 35%, transparent); + font-weight: 600; + } /* Corrective-retrieval chip — marks a knowledge search that went through a CRAG-lite grade/requery pass. Color = whether the result set ended usable. */ .wf-chip.wf-corrective { font-variant-numeric: tabular-nums; font-weight: 600; } From 364b4272813a0563250c14462495694cc1540710 Mon Sep 17 00:00:00 2001 From: RuneLind Date: Tue, 12 May 2026 22:51:23 +0200 Subject: [PATCH 4/4] Tidy up corrective-retrieval per review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review findings on the corrective-retrieval branch: - attachCorrectiveOutcomes: the hook now pushes one slot per knowledge-search tool call (a null when that search had no outcome), so a skipped search no longer shifts a later search's metadata onto it. This was a real misalignment bug — common in signal mode, where a confident search produces no outcome. - runCorrectiveRetrieval reuses clampBudget instead of re-deriving the clamp inline (the inline version returned NaN for non-finite input). - Consolidate the "weak match / no results" detection regexes into one place (knowledge-search-client.ts: classifyResultSignal, extractTrailingRetryFooter, WEAK_RESULT_RELEVANCE); knowledge-grader.ts now consumes them. - CorrectiveToolMeta uses proper string-union types; named GRADER_TIMEOUT_MS and WEAK_BEST_SCORE constants instead of bare literals; lookup table for the corrective chip's verdict→style mapping; drop the unused KnowledgeSearchResponse.lowConfidence field; trim a few rot-prone "Phase N" doc comments. 
--- src/ai/connectors/copilot-sdk.ts | 34 +++++++---- src/ai/connectors/corrective-hook.test.ts | 10 +++ src/ai/corrective-retrieval.ts | 8 +-- src/ai/knowledge-grader.ts | 31 +++------- src/ai/knowledge-search-client.ts | 64 ++++++++++++++------ src/dashboard/views/components/span-label.ts | 38 ++++++------ src/types.ts | 16 +++-- 7 files changed, 126 insertions(+), 75 deletions(-) diff --git a/src/ai/connectors/copilot-sdk.ts b/src/ai/connectors/copilot-sdk.ts index 247bf34..e5f0f4f 100644 --- a/src/ai/connectors/copilot-sdk.ts +++ b/src/ai/connectors/copilot-sdk.ts @@ -91,13 +91,19 @@ export async function executePrompt( // an opt-in slower/smarter alternative. Off by default (see corrective-config.ts); // when off, the hook isn't registered and behaviour is byte-identical to before. const correctiveCfg = resolveCorrectiveConfig(botConfig); - const correctiveOutcomes: CorrectiveMetadata[] = []; + // One slot per knowledge-search tool call (in order); `null` when that search + // produced no corrective outcome (tool error / uneventful signal pass). Keeping + // the slot is what lets attachCorrectiveOutcomes map outcomes positionally + // without desyncing when an earlier search is skipped. + const correctiveOutcomes: (CorrectiveMetadata | null)[] = []; const correctiveEnabled = correctiveCfg.enabled && hasMcp; const userQuestion = correctiveEnabled ? extractUserQuestion(prompt) : ""; const correctiveHooks: SessionConfig["hooks"] | undefined = correctiveEnabled ? 
{ onPostToolUse: async (input) => { if (!isKnowledgeSearchTool(input.toolName)) return; + let metadata: CorrectiveMetadata | null = null; + let modified: { modifiedResult: ToolResultObject } | undefined; try { const result = await applyCorrectiveRetrieval({ toolName: input.toolName, @@ -109,8 +115,8 @@ export async function executePrompt( userQuestion, }); if (result) { - correctiveOutcomes.push(result.metadata); - if (result.modifiedResult) return { modifiedResult: result.modifiedResult }; + metadata = result.metadata; + if (result.modifiedResult) modified = { modifiedResult: result.modifiedResult }; } } catch (e) { log.warn("Corrective retrieval hook failed: {error}", { @@ -118,7 +124,8 @@ export async function executePrompt( error: e instanceof Error ? e.message : String(e), }); } - return; + correctiveOutcomes.push(metadata); + return modified; }, } : undefined; @@ -384,6 +391,10 @@ function abbreviateInput(args: unknown): string | undefined { // ── Corrective retrieval (CRAG-lite) helpers ─────────────────────────────── +/** Timeout for the (opt-in) Haiku grader subprocess. Kept well under the bot's + * overall response timeout so a slow grader can't dominate the request. */ +const GRADER_TIMEOUT_MS = 30_000; + export interface ApplyCorrectiveArgs { toolName: string; toolArgs: unknown; @@ -432,7 +443,7 @@ export async function applyCorrectiveRetrieval( botName: botConfig.name, cwd: botConfig.dir, log, - graderTimeoutMs: 30_000, + graderTimeoutMs: GRADER_TIMEOUT_MS, searchFn: args.searchFn, gradeFn: args.gradeFn, }); @@ -473,15 +484,16 @@ export function extractUserQuestion(prompt: string): string { return trimmed.length > 1500 ? trimmed.slice(-1500).trim() : trimmed; } -/** Attach corrective outcomes to the knowledge-search tool calls in order - * (onPostToolUse exposes no toolCallId, so the i-th outcome maps to the i-th - * knowledge-search tool call). 
*/ -export function attachCorrectiveOutcomes(toolCalls: ToolCall[], outcomes: CorrectiveMetadata[]): void { +/** Attach corrective outcomes to the knowledge-search tool calls in order. + * `onPostToolUse` exposes no toolCallId, so this maps positionally — which is + * exact because the hook pushes one slot per knowledge-search call (a `null` + * for ones with no outcome), parallel to the order they appear in `toolCalls`. */ +export function attachCorrectiveOutcomes(toolCalls: ToolCall[], outcomes: (CorrectiveMetadata | null)[]): void { let i = 0; for (const tc of toolCalls) { - if (i >= outcomes.length) break; if (!isKnowledgeSearchTool(tc.name)) continue; - tc.corrective = correctiveMetaToToolMeta(outcomes[i++]!); + const m = outcomes[i++]; + if (m) tc.corrective = correctiveMetaToToolMeta(m); } } diff --git a/src/ai/connectors/corrective-hook.test.ts b/src/ai/connectors/corrective-hook.test.ts index eaefc64..3becbd7 100644 --- a/src/ai/connectors/corrective-hook.test.ts +++ b/src/ai/connectors/corrective-hook.test.ts @@ -49,6 +49,16 @@ describe("attachCorrectiveOutcomes", () => { expect(calls[2]!.corrective?.collectionsTried).toEqual([null]); }); + test("a null slot (a knowledge search with no outcome) doesn't shift later outcomes onto it", () => { + // search #1 produced nothing (e.g. confident signal-mode pass → null slot), + // search #2 produced an outcome — #2's metadata must land on call #2, not #1. 
+ const calls = [tc("knowledge-search_knowledge"), tc("yggdrasil-symbol_context"), tc("knowledge-search_knowledge")]; + attachCorrectiveOutcomes(calls, [null, meta("ambiguous")]); + expect(calls[0]!.corrective).toBeUndefined(); + expect(calls[1]!.corrective).toBeUndefined(); + expect(calls[2]!.corrective?.finalVerdict).toBe("ambiguous"); + }); + test("no-op when there are no outcomes", () => { const calls = [tc("knowledge-search_knowledge")]; attachCorrectiveOutcomes(calls, []); diff --git a/src/ai/corrective-retrieval.ts b/src/ai/corrective-retrieval.ts index 234ad0d..d4e1497 100644 --- a/src/ai/corrective-retrieval.ts +++ b/src/ai/corrective-retrieval.ts @@ -1,6 +1,6 @@ import type { Logger } from "@logtape/logtape"; import { gradeKnowledgeResults, gradeFromSignal, type GradeVerdict, type KnowledgeGrade } from "./knowledge-grader.ts"; -import type { GraderMode } from "./corrective-config.ts"; +import { clampBudget, type GraderMode } from "./corrective-config.ts"; import { searchKnowledge, renderSearchResults, @@ -36,9 +36,9 @@ import { * block for tracing. Fail-soft throughout: a grader that can't be reached * returns "correct" (no change); a re-query HTTP error ends the loop with * whatever's accumulated. The caller gates on the per-bot toggle — this - * function assumes the feature is enabled and `budget >= 1`. + * function assumes the feature is enabled. * - * Plan: `../mimir/plans/huginn-muninn-corrective-rag.md` (Phase 1). + * Design: `../mimir/plans/huginn-muninn-corrective-rag.md`. */ export interface CorrectiveMetadata { @@ -104,7 +104,7 @@ export interface CorrectiveRetrievalContext { } export async function runCorrectiveRetrieval(ctx: CorrectiveRetrievalContext): Promise { - const budget = Math.max(1, Math.min(2, Math.floor(ctx.budget))); + const budget = clampBudget(ctx.budget); const graderMode: GraderMode = ctx.grader ?? "signal"; const search = ctx.searchFn ?? searchKnowledge; const haikuGrade = ctx.gradeFn ?? 
gradeKnowledgeResults; diff --git a/src/ai/knowledge-grader.ts b/src/ai/knowledge-grader.ts index 081cb07..3d9aaef 100644 --- a/src/ai/knowledge-grader.ts +++ b/src/ai/knowledge-grader.ts @@ -1,5 +1,6 @@ import { spawnHaiku } from "../scheduler/executor.ts"; import { extractJson } from "./json-extract.ts"; +import { classifyResultSignal, extractTrailingRetryFooter } from "./knowledge-search-client.ts"; import type { Logger } from "@logtape/logtape"; /** @@ -19,8 +20,6 @@ import type { Logger } from "@logtape/logtape"; * Both are fail-soft: a Haiku error or unparseable output → `verdict: "correct"` * (the corrective loop becomes a no-op and the model sees the original result * unchanged). The corrective feature must never make a search *worse*. - * - * Plan: `../mimir/plans/huginn-muninn-corrective-rag.md` (Phase 1). */ export type GradeVerdict = "correct" | "ambiguous" | "insufficient"; @@ -40,12 +39,6 @@ export interface KnowledgeGrade { // ── Signal grader (default — no model call) ──────────────────────────────── -/** Matches the `*Weak match …*` / `*No confident match …*` footer Huginn's MCP - * adapter appends when `bestScore` is below its weak-result threshold or the - * result list is empty. */ -const WEAK_FOOTER_RE = /(^|\n)\s*\*(?:No confident match|Weak match)\b/; -const NO_RESULTS_RE = /(^|\n)No results found for /; - /** * Judge a search result purely from Huginn's emitted signal — no LLM. Returns * `insufficient` (no rewritten query — the corrective loop will fall back to @@ -53,14 +46,14 @@ const NO_RESULTS_RE = /(^|\n)No results found for /; * Huginn flagged the result weak/empty, `correct` otherwise. */ export function gradeFromSignal(resultText: string): KnowledgeGrade { - const text = resultText ?? 
""; - if (NO_RESULTS_RE.test(text)) { - return { verdict: "insufficient", reason: "search returned no results" }; - } - if (WEAK_FOOTER_RE.test(text)) { - return { verdict: "insufficient", reason: "Huginn flagged the result as low confidence" }; + switch (classifyResultSignal(resultText ?? "")) { + case "empty": + return { verdict: "insufficient", reason: "search returned no results" }; + case "weak": + return { verdict: "insufficient", reason: "Huginn flagged the result as low confidence" }; + default: + return { verdict: "correct", reason: "no low-confidence signal from the search" }; } - return { verdict: "correct", reason: "no low-confidence signal from the search" }; } // ── Haiku grader (opt-in) ────────────────────────────────────────────────── @@ -161,12 +154,8 @@ export function digestResultsForGrading(text: string): string { const src = (text ?? "").trim(); if (!src) return ""; - // Pull off the trailing weak-match footer (a single `*…*` line at the end) - // so it's never lost to truncation. - let footer = ""; - const footerMatch = src.match(/\n\s*(\*(?:No confident match|Weak match)[^\n]*\*)\s*$/); - const body = footerMatch ? src.slice(0, footerMatch.index).trimEnd() : src; - if (footerMatch) footer = footerMatch[1]!; + // Pull off the trailing weak-match footer so it's never lost to truncation. + const { body, footer } = extractTrailingRetryFooter(src); // Split into result blocks at `## ` headers (the MCP adapter's full-mode // format). If there are no `## ` headers (brief mode uses `1. 
**Title**`), diff --git a/src/ai/knowledge-search-client.ts b/src/ai/knowledge-search-client.ts index 22dca9b..b9a281c 100644 --- a/src/ai/knowledge-search-client.ts +++ b/src/ai/knowledge-search-client.ts @@ -11,14 +11,12 @@ function knowledgeApiBaseUrl(): string { /** * Thin HTTP client for Huginn's `GET /api/search`, plus a renderer that mirrors * the shape Huginn's MCP adapter produces (so a corrective re-query's hits read - * identically to the ones the model already saw) and a parser for the - * `collection: \`x\` doc_id: \`y\`` lines those results carry (used to dedupe a - * re-query against the original result text). - * - * Scope: this is the Phase-1 corrective-retrieval consumer of the Phase-0 - * contract — `bestScore`, per-result `confidenceBand`, `retryHints`, - * `noConfidentResults`, `min_relevance`. See - * `../mimir/plans/huginn-muninn-corrective-rag.md`. + * identically to the ones the model already saw) and parsers for the signal + * Huginn bakes into result text — the `collection: \`x\` doc_id: \`y\`` lines + * (for deduping a re-query) and the `*Weak match …*` / "No results" footer + * (for grading). Used by the corrective-retrieval loop; consumes Huginn's + * `bestScore` / `confidenceBand` / `retryHints` / `noConfidentResults` / + * `min_relevance` contract. */ export type ConfidenceBand = "high" | "medium" | "low"; @@ -61,7 +59,6 @@ export interface KnowledgeSearchResponse { retryHints?: KnowledgeRetryHints; /** Present when Huginn returns a relational graph answer ahead of the hits. */ graphAnswer?: string; - lowConfidence?: boolean; } export interface SearchKnowledgeOptions { @@ -142,7 +139,6 @@ function normalizeResponse(data: Record): KnowledgeSearchRespon noConfidentResults: data.noConfidentResults === true, retryHints: parseRetryHints(data.retryHints), graphAnswer: data.graph_answer ? 
String(data.graph_answer) : undefined, - lowConfidence: data.lowConfidence === true, }; } @@ -237,10 +233,44 @@ export function renderRetryHintsFooter(resp: Pick 0 ? `\n\n*${prefix} — try: ${bits.join(" · ")}*` : `\n\n*${prefix}.*`; } -/** A trailing `*Weak match …*` / `*No confident match …*` retry-hints footer - * (Huginn's MCP adapter appends one; {@link renderRetryHintsFooter} produces - * the same shape). */ -const TRAILING_RETRY_FOOTER_RE = /\n+\s*\*(?:No confident match|Weak match)[^\n]*\*\s*$/; +/** + * Patterns describing the signal Huginn's MCP adapter emits about result + * quality (the renderer above produces the same shapes). Centralised here for + * the grader and the orchestrator; the dashboard (span-label.ts) mirrors them locally. + * + * - {@link NO_RESULTS_BODY_RE} — a "No results found for …" body (matches + * anywhere a line starts with it, so it still fires after merging). + * - {@link WEAK_MATCH_FOOTER_RE} — a `*Weak match …*` / `*No confident match …*` + * line anywhere in the text (used for detection). + * - {@link TRAILING_RETRY_FOOTER_RE} — the same footer anchored at end-of-string, + * with a capture group (used for stripping/extracting it). + */ +export const NO_RESULTS_BODY_RE = /(^|\n)\s*No results found for /; +export const WEAK_MATCH_FOOTER_RE = /(^|\n)\s*\*(?:No confident match|Weak match)\b/; +const TRAILING_RETRY_FOOTER_RE = /\n+\s*(\*(?:No confident match|Weak match)[^\n]*\*)\s*$/; + +/** Huginn's weak-result relevance threshold — a `bestScore` below this means + * "found something, but nothing confidently relevant". Mirrors Huginn's + * `WEAK_RESULT_RELEVANCE`. */ +export const WEAK_RESULT_RELEVANCE = 0.45; + +/** Classify a rendered search-result text by the quality signal Huginn baked + * into it: `"empty"` (no results), `"weak"` (a weak/no-confident-match footer), + * or `null` (looks fine). 
*/ +export function classifyResultSignal(text: string): "empty" | "weak" | null { + if (!text) return null; + if (NO_RESULTS_BODY_RE.test(text)) return "empty"; + if (WEAK_MATCH_FOOTER_RE.test(text)) return "weak"; + return null; +} + +/** Split a rendered result text into its body and trailing retry-hints footer + * (`""` when there's no footer). */ +export function extractTrailingRetryFooter(text: string): { body: string; footer: string } { + const m = text.match(TRAILING_RETRY_FOOTER_RE); + if (!m) return { body: text, footer: "" }; + return { body: text.slice(0, m.index).trimEnd(), footer: m[1]! }; +} /** Strip a trailing retry-hints footer from a rendered result text. Used when * splicing a corrective re-query in: the original "try X" footer is obsolete @@ -254,9 +284,9 @@ const DOC_ID_LINE_RE = /collection:\s*`([^`]+)`\s+doc_id:\s*`([^`]+)`/g; /** Extract `collection/doc_id` keys from rendered search-result text — used to * dedupe a corrective re-query against the original result the model already - * has, since (per the chosen Phase-1 approach) we don't re-fetch the original - * in structured form. The `collection: \`…\` doc_id: \`…\`` line is emitted by - * Huginn's MCP adapter for every hit and is stable. */ + * has (we don't re-fetch the original in structured form). The + * `collection: \`…\` doc_id: \`…\`` line is emitted by Huginn's MCP adapter + * for every hit and is stable. 
*/ export function extractDocKeysFromRenderedText(text: string): Set { const keys = new Set(); for (const m of text.matchAll(DOC_ID_LINE_RE)) { diff --git a/src/dashboard/views/components/span-label.ts b/src/dashboard/views/components/span-label.ts index 9bd29da..1dc0d07 100644 --- a/src/dashboard/views/components/span-label.ts +++ b/src/dashboard/views/components/span-label.ts @@ -9,12 +9,7 @@ interface SpanLike { toolId?: unknown; input?: unknown; output?: unknown; - corrective?: { - retries?: unknown; - finalVerdict?: unknown; - verdicts?: unknown; - queriesTried?: unknown; - } | unknown; + corrective?: unknown; searchTrace?: | { collections?: Array<{ @@ -121,6 +116,12 @@ export function deriveSpanLabelHtml(span: SpanLike): { html: string; tooltip: st return null; } +const CORRECTIVE_VERDICT_DISPLAY: Record = { + correct: { cls: "wf-corrective wf-corrective-ok", sym: "✓" }, + ambiguous: { cls: "wf-corrective wf-corrective-warn", sym: "≈" }, + insufficient: { cls: "wf-corrective wf-corrective-bad", sym: "✗" }, +}; + /** Build the corrective-retrieval chip from a tool span's `attributes.corrective`. * Returns null when the attribute is absent or malformed. Chip text is the * final verdict's symbol + retry count (e.g. `⟲1 ✓`); color reflects whether @@ -136,32 +137,35 @@ function correctiveChipFromAttrs(raw: unknown): { html: string; tooltipLines: st const reason = Array.isArray(c.reasons) && typeof c.reasons[0] === "string" ? (c.reasons[0] as string) : ""; const mode = c.graderMode === "haiku" ? "haiku" : c.graderMode === "signal" ? "signal" : ""; - const cls = - finalVerdict === "correct" ? "wf-corrective wf-corrective-ok" - : finalVerdict === "ambiguous" ? "wf-corrective wf-corrective-warn" - : "wf-corrective wf-corrective-bad"; - const sym = finalVerdict === "correct" ? "✓" : finalVerdict === "ambiguous" ? "≈" : "✗"; - const text = retries > 0 ? `⟲${retries} ${sym}` : `grade ${sym}`; + const display = CORRECTIVE_VERDICT_DISPLAY[finalVerdict ?? 
"insufficient"] ?? CORRECTIVE_VERDICT_DISPLAY.insufficient!; + const text = retries > 0 ? `⟲${retries} ${display.sym}` : `grade ${display.sym}`; const tip = `corrective retrieval${mode ? ` (${mode})` : ""}: ${verdicts.join(" → ") || finalVerdict}` + (reason ? ` — ${reason}` : "") + (queries.length ? `; re-queried: ${queries.map((q) => `"${q}"`).join(", ")}` : "; no re-query"); return { - html: `${escHtml(text)}`, + html: `${escHtml(text)}`, tooltipLines: [tip], }; } +// Mirrors knowledge-search-client.ts's classifyResultSignal — kept local because +// this file lives in the dashboard layer and shouldn't import from src/ai. +const NO_RESULTS_OUTPUT_RE = /(^|\n)\s*No results found for /; +const WEAK_FOOTER_OUTPUT_RE = /(^|\n)\s*\*(?:No confident match|Weak match)\b/; +/** Huginn's weak-result relevance threshold (its `WEAK_RESULT_RELEVANCE`). */ +const WEAK_BEST_SCORE = 0.45; + /** Whether a search-tool span's result was actually usable *by the model*: * `"empty"` ("No results found" / `noConfidentResults`), `"weak"` (a * `*Weak match*` / `*No confident match*` footer), or `null` (looks fine). * Reads the captured tool output first (ground truth of what the model saw), - * falling back to the Huginn trace's Phase-0 `response` block. */ + * falling back to the Huginn trace's `response` block. */ function searchResultSignal(attrs: NonNullable): "empty" | "weak" | null { const out = typeof attrs.output === "string" ? 
attrs.output : null; if (out) { - if (/(^|\n)\s*No results found for /.test(out)) return "empty"; - if (/(^|\n)\s*\*(?:No confident match|Weak match)\b/.test(out)) return "weak"; + if (NO_RESULTS_OUTPUT_RE.test(out)) return "empty"; + if (WEAK_FOOTER_OUTPUT_RE.test(out)) return "weak"; return null; } const trace = attrs.searchTrace; @@ -169,7 +173,7 @@ function searchResultSignal(attrs: NonNullable): "empty" const resp = (trace as { response?: { noConfidentResults?: unknown; bestScore?: unknown } }).response; if (resp) { if (resp.noConfidentResults === true) return "empty"; - if (typeof resp.bestScore === "number" && resp.bestScore < 0.45) return "weak"; + if (typeof resp.bestScore === "number" && resp.bestScore < WEAK_BEST_SCORE) return "weak"; } } return null; diff --git a/src/types.ts b/src/types.ts index 31244dc..555d017 100644 --- a/src/types.ts +++ b/src/types.ts @@ -82,11 +82,17 @@ export interface ToolCall { corrective?: CorrectiveToolMeta; } +// These unions are duplicated (not imported) from src/ai because src/types.ts +// is a leaf module imported widely — pulling in src/ai would invert the +// dependency direction. The corrective-retrieval code asserts compatibility. +export type CorrectiveVerdict = "correct" | "ambiguous" | "insufficient"; +export type CorrectiveGraderMode = "signal" | "haiku"; + export interface CorrectiveToolMeta { /** Number of corrective re-queries actually issued (0–budget). */ retries: number; - /** Grader verdict from each grading pass, in order ("correct" | "ambiguous" | "insufficient"). */ - verdicts: string[]; + /** Grader verdict from each grading pass, in order. */ + verdicts: CorrectiveVerdict[]; /** Grader reason per pass, parallel to `verdicts`. */ reasons: string[]; /** Re-query strings actually issued (excludes the original query). */ @@ -94,9 +100,9 @@ export interface CorrectiveToolMeta { /** Collections each re-query was scoped to, parallel to `queriesTried`; `null` = all. 
*/ collectionsTried?: (string[] | null)[]; /** Verdict from the final grading pass — whether the result set ended up usable. */ - finalVerdict: string; - /** Which grader judged the result(s): `"signal"` (no model call) or `"haiku"`. */ - graderMode?: string; + finalVerdict: CorrectiveVerdict; + /** Which grader judged the result(s). */ + graderMode?: CorrectiveGraderMode; /** Total grader wall time across all passes, ms (≈0 in signal mode). */ graderMs?: number; /** Wall time of each re-query HTTP call, parallel to `queriesTried`, ms. */