Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions plugins/goal-guard/agents.js
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,19 @@ export const BASE_GATES = Object.freeze([
"goal-final-auditor",
]);

/**
 * Reduced base-gate list used by lean (token-conscious) mode.
 *
 * A full review pass launches one subagent subtask per required gate, so
 * leaving out the two code-only gates (diff reviewer and verifier) removes two
 * subtasks, and their token cost, from every review cycle. The three gates
 * kept here are the safety floor that lean mode never drops: the prompt
 * auditor (goal alignment), the reviewer (correctness) and the final auditor
 * (finality).
 */
export const LEAN_BASE_GATES = Object.freeze([
  "goal-prompt-auditor",
  "goal-reviewer",
  "goal-final-auditor",
]);

/**
* Gates that only make sense when the goal actually changed code. A research,
* analysis, explanation, or planning goal produces a text/evidence deliverable
Expand Down
27 changes: 26 additions & 1 deletion plugins/goal-guard/autocontinue.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* `state`, so it is fully unit-testable.
*/

import { completionAllowed, missingGates } from "./gates.js";
import { completionAllowed, missingGates, evidenceCoverageMet } from "./gates.js";

/** Consecutive no-change idle ticks after which auto-continue pauses for the human. */
export const NO_PROGRESS_LIMIT = 4;
Expand Down Expand Up @@ -54,13 +54,38 @@ export function progressSignature(state, config) {
* exact reviewer gates via the task tool. */
export function continuationMessage(state, config) {
const missing = missingGates(state, config);
const criteria = Array.isArray(state?.contract?.acceptanceCriteria) ? state.contract.acceptanceCriteria : [];
const hasCriteria = criteria.length > 0;
const evidenceOk = evidenceCoverageMet(state);
const lines = ["The goal is NOT complete — do not stop. Continue working now."];
if (!state?.contract) {
lines.push("First, record the Goal Contract with the `goal_contract` tool (title, the original request, and concrete acceptance criteria) so the objective is anchored.");
}
if (state?.dirty) {
lines.push("There are changes that are not yet reviewed/verified after your latest edits — actually run the code/tests and record it with `goal_evidence`.");
}
if (hasCriteria && !evidenceOk) {
const uncovered = criteria.filter((c) => {
const full = String(c).trim().toLowerCase();
if (!full) return false;
const entries = Array.isArray(state?.evidence) ? state.evidence : [];
const lastEditSeq = state?.lastEditSeq || 0;
return !entries.some((entry) => {
const ecriteria = Array.isArray(entry.criteria) ? entry.criteria : [];
if (!ecriteria.some((ec) => String(ec).trim().toLowerCase() === full)) return false;
if (!entry.seq) return lastEditSeq === 0;
return entry.seq > lastEditSeq;
});
});
if (uncovered.length) {
lines.push(
`EVIDENCE COVERAGE MISSING — every acceptance criterion must have fresh recorded ` +
`evidence. Uncovered criteria: ${uncovered.map((c) => `"${c}"`).join(", ")}. ` +
`Run verification and record with \`goal_evidence\`, passing each uncovered criterion. ` +
`Mere exhaustion of approaches is NOT success — the guard requires proof.`,
);
}
}
if (missing.length) {
if (config?.programmaticReview) {
lines.push(
Expand Down
23 changes: 21 additions & 2 deletions plugins/goal-guard/completion.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
* adversarial digit-runs cannot trigger polynomial backtracking (issue #367).
*/

import { missingGates, completionAllowed } from "./gates.js";
import { missingGates, completionAllowed, evidenceCoverageMet } from "./gates.js";
import { summarizeState } from "./summary.js";

const CYCLES_RE = /Review cycles:\s*(\d+)/gi;
Expand Down Expand Up @@ -131,7 +131,26 @@ export function evaluateCompletionClaim(state, config, text) {
reason = `claimed review cycles (${claimedCycles}) do not match recorded review cycles (${state.reviewCycles})`;
} else if (!completionAllowed(state, config)) {
const missing = missingGates(state, config).join(", ");
reason = `required review gates are missing or stale (${missing || "goal session not active"})`;
if (missing) {
reason = `required review gates are missing or stale (${missing})`;
} else if (!evidenceCoverageMet(state)) {
const criteria = Array.isArray(state?.contract?.acceptanceCriteria) ? state.contract.acceptanceCriteria : [];
const uncovered = criteria.filter((c) => {
const full = String(c).trim().toLowerCase();
if (!full) return false;
const entries = Array.isArray(state?.evidence) ? state.evidence : [];
const lastEditSeq = state?.lastEditSeq || 0;
return !entries.some((entry) => {
const ecriteria = Array.isArray(entry.criteria) ? entry.criteria : [];
if (!ecriteria.some((ec) => String(ec).trim().toLowerCase() === full)) return false;
if (!entry.seq) return lastEditSeq === 0;
return entry.seq > lastEditSeq;
});
});
reason = `acceptance criteria lack evidence coverage (${uncovered.map((c) => `"${c}"`).join(", ") || "all criteria"}) — goal not achieved, mere exhaustion is not success`;
} else {
reason = "goal session not active";
}
}

if (!reason) return { blocked: false, claimedCycles };
Expand Down
16 changes: 16 additions & 0 deletions plugins/goal-guard/config.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,18 @@ export const DEFAULT_CONFIG = Object.freeze({
* explanation goal is gated on evidence instead, never on an empty diff);
* "always" forces them on; "never" turns them off. */
requireCodeReview: "auto",
/** Require fresh evidence covering every acceptance criterion before completion
* is allowed. This is the programmatic exhaustion-prevention gate: a strong LLM
* that exhausts every approach and persuades the reviewers it "tried everything"
* cannot satisfy this check for criteria it never actually achieved — the guard
* demands verified, recorded proof, not statements of effort. Default true. */
requireEvidenceCoverage: true,
/** Lean mode — reduces the base gate set to the 3 safety-critical reviewers
* (prompt-auditor, reviewer, final-auditor) and disables contextual gates, so
* each full review round runs fewer subagent subtasks and token consumption per
* cycle drops sharply. Combined with the evidence-coverage check this keeps the
* exhaustion loophole closed. Default false. */
leanGates: false,
/** Block non-Goal agents from invoking the goal-* subagents via the task tool. */
restrictSubagents: true,
/** Maximum tracked sessions before LRU eviction. */
Expand Down Expand Up @@ -134,6 +146,8 @@ export const CONFIG_DOCS = Object.freeze({
// Gates & scope
contextualGates: { group: "Gates", summary: "Require specialist reviewer gates derived from goal text / changed files." },
requireCodeReview: { group: "Gates", summary: "When to require the code-only diff/verification gates: 'auto' (only once the goal edits a file), 'always', or 'never'. Lets non-code agentic goals complete on evidence." },
requireEvidenceCoverage: { group: "Gates", summary: "Require fresh recorded evidence for every acceptance criterion before completion (exhaustion-prevention gate — the guard demands verified proof, not statements of effort)." },
leanGates: { group: "Gates", summary: "Lean mode: reduce the base reviewer set and disable contextual gates for much lower token consumption per review cycle." },
restrictSubagents: { group: "Gates", summary: "Lock the goal-* subagents to the Goal agent (other agents can't call them)." },
// State & lifecycle
injectSystemState: { group: "State", summary: "Inject a live Goal Guard state block into the system prompt." },
Expand Down Expand Up @@ -218,6 +232,8 @@ function fromEnv(env) {
GOAL_GUARD_PERSIST: ["persist", coerceBool],
GOAL_GUARD_CONTEXTUAL_GATES: ["contextualGates", coerceBool],
GOAL_GUARD_REQUIRE_CODE_REVIEW: ["requireCodeReview", coerceStr],
GOAL_GUARD_REQUIRE_EVIDENCE_COVERAGE: ["requireEvidenceCoverage", coerceBool],
GOAL_GUARD_LEAN_GATES: ["leanGates", coerceBool],
GOAL_GUARD_RESTRICT_SUBAGENTS: ["restrictSubagents", coerceBool],
GOAL_GUARD_MAX_SESSIONS: ["maxSessions", coerceInt],
GOAL_GUARD_SESSION_TTL_MS: ["sessionTtlMs", coerceInt],
Expand Down
50 changes: 46 additions & 4 deletions plugins/goal-guard/gates.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* Re-running verification after a clean review no longer re-opens the gates.
*/

import { BASE_GATES, CODE_GATES, CONTEXTUAL_GATES } from "./agents.js";
import { BASE_GATES, LEAN_BASE_GATES, CODE_GATES, CONTEXTUAL_GATES } from "./agents.js";

/**
* Whether this goal has touched code. A goal is code-bearing once it is dirty or has
Expand Down Expand Up @@ -79,14 +79,19 @@ export function refreshStickyGates(state) {

/** The reviewers that must PASS for this state, given config. */
export function requiredGates(state, config) {
// Lean mode uses the minimal base-gate set (3 reviewers instead of 5) and
// disables contextual gates — drastically reduces token consumption per cycle.
const lean = !!(config?.leanGates);
// Code-only gates (diff review, verification) are required only when the goal
// actually changed code; a non-code goal is gated on its evidence instead, so it
// is not blocked forever by an empty `git diff`. `requireCodeReview` overrides the
// auto-detection: "always" forces them on, "never" off.
const mode = config?.requireCodeReview || "auto";
const includeCodeGates = mode === "always" ? true : mode === "never" ? false : isCodeBearing(state);
const gates = BASE_GATES.filter((agent) => includeCodeGates || !CODE_GATES.includes(agent));
if (!config || config.contextualGates) {
const baseSet = lean ? LEAN_BASE_GATES : BASE_GATES;
const gates = baseSet.filter((agent) => includeCodeGates || !CODE_GATES.includes(agent));
// Contextual gates are disabled in lean mode (token economy).
if (!lean && (!config || config.contextualGates)) {
const contextual = new Set([...(state.stickyGates || []), ...contextualGatesFor(state)]);
for (const agent of contextual) {
if (!gates.includes(agent)) gates.push(agent);
Expand All @@ -106,6 +111,43 @@ export function missingGates(state, config) {
return requiredGates(state, config).filter((agent) => !gatePassedFresh(state, agent));
}

/**
 * Whether the evidence ledger covers every recorded acceptance criterion with
 * at least one piece of fresh (post-last-edit) evidence.
 *
 * This is the programmatic exhaustion-prevention gate: it looks only at what
 * was actually recorded in `state.evidence`, so statements of effort made to
 * the reviewers cannot satisfy it.
 *
 * Matching rules:
 * - Criteria are compared after `String(...).trim().toLowerCase()`, so case and
 *   surrounding whitespace do not matter.
 * - A criterion that is blank after normalization is ignored.
 * - An entry with a truthy `seq` is fresh only when `seq > state.lastEditSeq`;
 *   an entry without a `seq` is fresh only when no edit has been recorded
 *   (`lastEditSeq` is 0 or absent).
 * - Malformed ledger entries (null, non-objects, or entries whose `criteria`
 *   is not an array) are skipped rather than throwing.
 *
 * @param {object} [state] - Goal session state; reads `contract.acceptanceCriteria`,
 *   `evidence` and `lastEditSeq`.
 * @returns {boolean} True when there are no criteria, or when every non-blank
 *   criterion is named by at least one fresh evidence entry.
 */
export function evidenceCoverageMet(state) {
  const criteria = Array.isArray(state?.contract?.acceptanceCriteria)
    ? state.contract.acceptanceCriteria
    : [];
  if (criteria.length === 0) return true; // no criteria = nothing to cover

  const normalize = (value) => String(value).trim().toLowerCase();
  const entries = Array.isArray(state?.evidence) ? state.evidence : [];
  const lastEditSeq = state?.lastEditSeq || 0;

  // Walk the ledger once and collect every criterion named by fresh evidence,
  // instead of rescanning (and re-normalizing) the whole ledger per criterion.
  const freshlyCovered = new Set();
  for (const entry of entries) {
    // A corrupt ledger entry must not crash the completion check.
    if (!entry || !Array.isArray(entry.criteria)) continue;
    // Evidence must be fresh (recorded after the last edit). Entries without a
    // sequence number only count while nothing has been edited yet.
    const fresh = entry.seq ? entry.seq > lastEditSeq : lastEditSeq === 0;
    if (!fresh) continue;
    for (const named of entry.criteria) freshlyCovered.add(normalize(named));
  }

  return criteria.every((criterion) => {
    const wanted = normalize(criterion);
    return wanted === "" || freshlyCovered.has(wanted);
  });
}

/**
 * Whether the goal may be declared complete.
 *
 * Completion requires all of the following, checked in order:
 * 1. the goal session is active;
 * 2. no required review gate is missing or stale;
 * 3. unless `config.requireEvidenceCoverage` is explicitly `false`, every
 *    acceptance criterion has fresh recorded evidence.
 *
 * @param {object} state - Goal session state.
 * @param {object} [config] - Resolved config; when absent the evidence-coverage
 *   check stays enabled.
 * @returns {boolean} True only when every check above passes.
 */
export function completionAllowed(state, config) {
  if (!state?.active) return false;
  if (missingGates(state, config).length !== 0) return false;
  // Evidence-coverage check: every acceptance criterion must have fresh recorded
  // evidence. This is a programmatic gate the reviewers cannot be persuaded past,
  // so it must run after the gate check rather than being short-circuited by it.
  if (config?.requireEvidenceCoverage !== false && !evidenceCoverageMet(state)) return false;
  return true;
}
6 changes: 3 additions & 3 deletions plugins/goal-guard/summary.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* messages, and the `goal_status` tool. Kept pure and dependency-light.
*/

import { requiredGates, missingGates, gatePassedFresh } from "./gates.js";
import { requiredGates, missingGates, gatePassedFresh, completionAllowed } from "./gates.js";
import { prettyAgentName } from "./agents.js";

/**
Expand Down Expand Up @@ -99,7 +99,7 @@ export function sidebarView(state, config) {
const cycles = Number(state.reviewCycles) || 0;
const gates = `${passing}/${required.length} gates`;
const todos = sidebarTodos(state, required, missing);
const done = required.length > 0 && missing.length === 0 && !state.dirty;
const done = completionAllowed(state, config) && !state.dirty;
if (done) {
return {
state: "done",
Expand Down Expand Up @@ -190,7 +190,7 @@ export function statusReport(state, config) {
reviewerMemory: reviewerMemoryReport(state),
changedFiles: state.changedFiles.slice(-50),
contract: state.contract,
completionAllowed: Boolean(state.active) && missing.length === 0,
completionAllowed: completionAllowed(state, config),
};
}

Expand Down
1 change: 1 addition & 0 deletions tests/autocontinue.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ test("a COMPLETE goal does not auto-continue and resets the counters", () => {
reviewCycles: 1,
autoContinueCount: 7,
autoContinueNoProgress: 3,
evidence: [{ command: "test", result: "pass", criteria: ["done"], seq: 2 }],
});
const d = evaluateAutoContinue(st, DEFAULT_CONFIG);
assert.equal(d.continue, false);
Expand Down
12 changes: 12 additions & 0 deletions tests/config.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,15 @@ test("REGRESSION: a degenerate maxSessions (0/negative) falls back to the defaul
assert.equal(resolveConfig({ maxSessions: 25 }).maxSessions, 25);
assert.equal(resolveConfig({}, { GOAL_GUARD_MAX_SESSIONS: "0" }).maxSessions, DEFAULT_CONFIG.maxSessions);
});

test("requireEvidenceCoverage defaults to true and is configurable", () => {
  // The shipped default keeps the evidence-coverage gate switched on.
  assert.equal(DEFAULT_CONFIG.requireEvidenceCoverage, true);

  // An explicit option turns it off.
  const viaOptions = resolveConfig({ requireEvidenceCoverage: false });
  assert.equal(viaOptions.requireEvidenceCoverage, false);

  // So does the environment variable, with "off" read as false.
  const viaEnv = resolveConfig(undefined, { GOAL_GUARD_REQUIRE_EVIDENCE_COVERAGE: "off" });
  assert.equal(viaEnv.requireEvidenceCoverage, false);
});

test("leanGates defaults to false and is configurable", () => {
  // Lean mode is opt-in: the shipped default leaves it off.
  assert.equal(DEFAULT_CONFIG.leanGates, false);

  // An explicit option enables it.
  const viaOptions = resolveConfig({ leanGates: true });
  assert.equal(viaOptions.leanGates, true);

  // So does the environment variable, with "1" read as true.
  const viaEnv = resolveConfig(undefined, { GOAL_GUARD_LEAN_GATES: "1" });
  assert.equal(viaEnv.leanGates, true);
});
3 changes: 2 additions & 1 deletion tests/deep-bughunt.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import { evaluateCompletionClaim } from "../plugins/goal-guard/completion.js";
import { analyzeCommand } from "../plugins/goal-guard/shell.js";
import { goalSimilarity, SAME_GOAL_THRESHOLD, createGoalTools } from "../plugins/goal-guard/tools.js";
import { createStore, createState } from "../plugins/goal-guard/state.js";
import { markEdit } from "../plugins/goal-guard/events.js";
import { markEdit, recordEvidence } from "../plugins/goal-guard/events.js";
import { runReviewCycle } from "../plugins/goal-guard/review-runner.js";
import { DEFAULT_CONFIG } from "../plugins/goal-guard/config.js";
import { completionAllowed } from "../plugins/goal-guard/gates.js";
Expand Down Expand Up @@ -59,6 +59,7 @@ async function allowedState(id) {
state.active = true;
state.contract = { title: "Do x", original: "do x", acceptanceCriteria: ["x works"] };
markEdit(store, state, "edit");
recordEvidence(store, state, "test", "PASS", ["x works"]);
await runReviewCycle(mockReviewClient("PASS", id), store, state, DEFAULT_CONFIG, { sessionID: id, sleep: async () => {}, pollMs: 1, timeoutMs: 500 });
return state;
}
Expand Down
Loading
Loading