diff --git a/csis/verification/critic_stack.py b/csis/verification/critic_stack.py index 5623ee8..9057a12 100644 --- a/csis/verification/critic_stack.py +++ b/csis/verification/critic_stack.py @@ -10,18 +10,60 @@ returns a recent precision metric; if it drops below floor, promotions freeze. This is a separate signal from the verified-gain rate so the Critic can't be tuned-into-quietude. + +``CriticEffortLevel`` parameterises review depth, mirroring the effort tiers +shipped in Claude Code v2.1.147 (``/code-review low|medium|high|max``). The +Coordinator picks ``low`` for cheap sanity-checks inside a tight budget and +``max`` for pre-promotion gates where a missed flaw is costlier than latency. +The mapping is intentionally coarse — the point is that the Coordinator can +express intent without hard-coding token counts in call sites. """ from __future__ import annotations import json import re from dataclasses import dataclass, field +from enum import Enum from typing import Iterable from csis.backends.base import LLMBackend, LLMRequest from csis.contracts import Artifact, CriticFinding, GraderResult, Plan +class CriticEffortLevel(str, Enum): + """How deeply should the critic look? + + Maps to (min_attempts, max_tokens) in EFFORT_PARAMS. + + low — quick sanity check; 1 attempt, 500 tokens. Use inside tight + per-iteration budgets (e.g. rapid Researcher → Builder loops). + medium — default; 3 attempts, 2 000 tokens. Matches the historical + Phase-0 baseline. + high — thorough; 5 attempts, 4 000 tokens. Use before Verifier cert. + max — exhaustive; 8 attempts, 8 000 tokens. Use at promotion gates + where a missed flaw is more costly than extra latency. + """ + + low = "low" + medium = "medium" + high = "high" + max = "max" + + +@dataclass(frozen=True) +class _EffortParams: + min_attempts: int + max_tokens: int + + +EFFORT_PARAMS: dict[CriticEffortLevel, _EffortParams] = { + CriticEffortLevel.low: _EffortParams(min_attempts=1, max_tokens=500), + CriticEffortLevel.medium: _EffortParams(min_attempts=3, max_tokens=2000), + CriticEffortLevel.high: _EffortParams(min_attempts=5, max_tokens=4000), + CriticEffortLevel.max: _EffortParams(min_attempts=8, max_tokens=8000), +} + + CRITIC_SYSTEM_PROMPT = """You are the Critic in CSIS Phase-0. Your one job is to falsify the claim that the artifact passes the plan @@ -84,14 +126,24 @@ def run_critic( plan: Plan, artifact: Artifact, grader_results: list[GraderResult], - min_attempts: int = 3, + min_attempts: int | None = None, + effort: CriticEffortLevel = CriticEffortLevel.medium, ) -> list[CriticFinding]: + """Run the V2 critic at the requested effort level. + + ``effort`` selects the (min_attempts, max_tokens) pair from EFFORT_PARAMS. + If ``min_attempts`` is supplied explicitly it overrides the effort-derived + value; this preserves backward compatibility for callers that hard-code a + count (e.g. the F7 seeded-flaw evaluator, which always wants exactly 3). + """ + params = EFFORT_PARAMS[effort] + effective_min = min_attempts if min_attempts is not None else params.min_attempts req = LLMRequest( role="critic", checkpoint_id=checkpoint_id, system=CRITIC_SYSTEM_PROMPT, - prompt=build_critic_prompt(plan, artifact, grader_results, min_attempts), - max_tokens=2000, + prompt=build_critic_prompt(plan, artifact, grader_results, effective_min), + max_tokens=params.max_tokens, ) resp = backend.complete(req) return parse_critic_output(resp.text) diff --git a/tests/test_verification.py b/tests/test_verification.py index 212827a..300da6f 100644 --- a/tests/test_verification.py +++ b/tests/test_verification.py @@ -4,6 +4,7 @@ F1 — mock-vs-mock cross-checkpoint must be structural, not decorative F6 — corrupted grader: pinned source hash check F7 — critic incentive: seeded synthetic flaws + minimum attempts + CriticEffortLevel — effort tiers produce correct min_attempts + max_tokens """ from __future__ import annotations @@ -22,6 +23,8 @@ build_certificate, ) from csis.verification.critic_stack import ( + EFFORT_PARAMS, + CriticEffortLevel, CriticEvaluator, SeededFlaw, parse_critic_output, @@ -202,3 +205,95 @@ def test_seeded_flaw_evaluator_tracks_catch_rate() -> None: caught_2 = evaluator.submit_seeded(backend=backend, checkpoint_id="beta", flaw=flaw) assert caught_1 and not caught_2 assert evaluator.catch_rate() == 0.5 + + +# ---- CriticEffortLevel ------------------------------------------------------ + + +def test_effort_params_ordering() -> None: + """Higher effort levels must demand more attempts and allow more tokens. + + This test fails if the EFFORT_PARAMS table is mis-ordered, which would + let a low-effort critic silently substitute for a high-effort gate. + """ + levels = [CriticEffortLevel.low, CriticEffortLevel.medium, CriticEffortLevel.high, CriticEffortLevel.max] + for i in range(len(levels) - 1): + lo = EFFORT_PARAMS[levels[i]] + hi = EFFORT_PARAMS[levels[i + 1]] + assert lo.min_attempts < hi.min_attempts, ( + f"{levels[i]} min_attempts={lo.min_attempts} must be < " + f"{levels[i+1]} min_attempts={hi.min_attempts}" + ) + assert lo.max_tokens < hi.max_tokens, ( + f"{levels[i]} max_tokens={lo.max_tokens} must be < " + f"{levels[i+1]} max_tokens={hi.max_tokens}" + ) + + +def test_run_critic_effort_low_uses_fewer_attempts_than_high() -> None: + """run_critic at 'low' effort sends a smaller min_attempts prompt than 'high'.""" + captured: list[str] = [] + + class CapturingBackend(MockBackend): + def complete(self, req): # type: ignore[override] + captured.append(req.prompt) + return super().complete(req) + + backend_low = CapturingBackend() + # Return enough falsification attempts for any effort level. + big_response = ( + '[{"attempt":"a","falsified":false},' + '{"attempt":"b","falsified":false},' + '{"attempt":"c","falsified":false},' + '{"attempt":"d","falsified":false},' + '{"attempt":"e","falsified":false},' + '{"attempt":"f","falsified":false},' + '{"attempt":"g","falsified":false},' + '{"attempt":"h","falsified":false}]' + ) + backend_low.script("critic", "beta", big_response) + run_critic(backend=backend_low, checkpoint_id="beta", plan=_plan(), + artifact=_artifact(), grader_results=[], effort=CriticEffortLevel.low) + + backend_max = CapturingBackend() + backend_max.script("critic", "beta", big_response) + run_critic(backend=backend_max, checkpoint_id="beta", plan=_plan(), + artifact=_artifact(), grader_results=[], effort=CriticEffortLevel.max) + + low_prompt, max_prompt = captured[0], captured[1] + # The prompt embeds the min_attempts number; max must request more. + assert "1" in low_prompt # low: at least 1 + assert "8" in max_prompt # max: at least 8 + + +def test_run_critic_explicit_min_attempts_overrides_effort() -> None: + """Explicit min_attempts= kwarg overrides the effort-derived value. + + This preserves backward compatibility for the F7 seeded-flaw evaluator + and any other caller that hard-codes a count. + """ + backend = MockBackend() + backend.script("critic", "beta", + '[{"attempt":"x","falsified":false},{"attempt":"y","falsified":false}]') + # Pass effort=low (which gives 1 attempt) but override to 2 explicitly. + findings = run_critic( + backend=backend, + checkpoint_id="beta", + plan=_plan(), + artifact=_artifact(), + grader_results=[], + effort=CriticEffortLevel.low, + min_attempts=2, + ) + assert len(findings) == 2 + + +def test_effort_medium_matches_historical_baseline() -> None: + """CriticEffortLevel.medium must keep min_attempts=3, max_tokens=2000. + + These values match the Phase-0 baseline so existing certs and the F7 + seeded-flaw floor remain calibrated after the effort-level refactor. + """ + params = EFFORT_PARAMS[CriticEffortLevel.medium] + assert params.min_attempts == 3 + assert params.max_tokens == 2000