Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 55 additions & 3 deletions csis/verification/critic_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,60 @@
returns a recent precision metric; if it drops below floor, promotions
freeze. This is a separate signal from the verified-gain rate so the
Critic can't be tuned-into-quietude.

``CriticEffortLevel`` parameterises review depth, mirroring the effort tiers
shipped in Claude Code v2.1.147 (``/code-review low|medium|high|max``). The
Coordinator picks ``low`` for cheap sanity-checks inside a tight budget and
``max`` for pre-promotion gates where a missed flaw is costlier than latency.
The mapping is intentionally coarse — the point is that the Coordinator can
express intent without hard-coding token counts in call sites.
"""
from __future__ import annotations

import json
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Iterable

from csis.backends.base import LLMBackend, LLMRequest
from csis.contracts import Artifact, CriticFinding, GraderResult, Plan


class CriticEffortLevel(str, Enum):
"""How deeply should the critic look?

Maps to (min_attempts, max_tokens) in EFFORT_PARAMS.

low — quick sanity check; 1 attempt, 500 tokens. Use inside tight
per-iteration budgets (e.g. rapid Researcher → Builder loops).
medium — default; 3 attempts, 2 000 tokens. Matches the historical
Phase-0 baseline.
high — thorough; 5 attempts, 4 000 tokens. Use before Verifier cert.
max — exhaustive; 8 attempts, 8 000 tokens. Use at promotion gates
where a missed flaw is more costly than extra latency.
"""

low = "low"
medium = "medium"
high = "high"
max = "max"


@dataclass(frozen=True)
class _EffortParams:
min_attempts: int
max_tokens: int


EFFORT_PARAMS: dict[CriticEffortLevel, _EffortParams] = {
CriticEffortLevel.low: _EffortParams(min_attempts=1, max_tokens=500),
CriticEffortLevel.medium: _EffortParams(min_attempts=3, max_tokens=2000),
CriticEffortLevel.high: _EffortParams(min_attempts=5, max_tokens=4000),
CriticEffortLevel.max: _EffortParams(min_attempts=8, max_tokens=8000),
}


CRITIC_SYSTEM_PROMPT = """You are the Critic in CSIS Phase-0.

Your one job is to falsify the claim that the artifact passes the plan
Expand Down Expand Up @@ -84,14 +126,24 @@ def run_critic(
plan: Plan,
artifact: Artifact,
grader_results: list[GraderResult],
min_attempts: int = 3,
min_attempts: int | None = None,
effort: CriticEffortLevel = CriticEffortLevel.medium,
) -> list[CriticFinding]:
"""Run the V2 critic at the requested effort level.

``effort`` selects the (min_attempts, max_tokens) pair from EFFORT_PARAMS.
If ``min_attempts`` is supplied explicitly it overrides the effort-derived
value; this preserves backward compatibility for callers that hard-code a
count (e.g. the F7 seeded-flaw evaluator, which always wants exactly 3).
"""
params = EFFORT_PARAMS[effort]
effective_min = min_attempts if min_attempts is not None else params.min_attempts
req = LLMRequest(
role="critic",
checkpoint_id=checkpoint_id,
system=CRITIC_SYSTEM_PROMPT,
prompt=build_critic_prompt(plan, artifact, grader_results, min_attempts),
max_tokens=2000,
prompt=build_critic_prompt(plan, artifact, grader_results, effective_min),
max_tokens=params.max_tokens,
)
resp = backend.complete(req)
return parse_critic_output(resp.text)
Expand Down
95 changes: 95 additions & 0 deletions tests/test_verification.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
F1 — mock-vs-mock cross-checkpoint must be structural, not decorative
F6 — corrupted grader: pinned source hash check
F7 — critic incentive: seeded synthetic flaws + minimum attempts
CriticEffortLevel — effort tiers produce correct min_attempts + max_tokens
"""
from __future__ import annotations

Expand All @@ -22,6 +23,8 @@
build_certificate,
)
from csis.verification.critic_stack import (
EFFORT_PARAMS,
CriticEffortLevel,
CriticEvaluator,
SeededFlaw,
parse_critic_output,
Expand Down Expand Up @@ -202,3 +205,95 @@ def test_seeded_flaw_evaluator_tracks_catch_rate() -> None:
caught_2 = evaluator.submit_seeded(backend=backend, checkpoint_id="beta", flaw=flaw)
assert caught_1 and not caught_2
assert evaluator.catch_rate() == 0.5


# ---- CriticEffortLevel ------------------------------------------------------


def test_effort_params_ordering() -> None:
"""Higher effort levels must demand more attempts and allow more tokens.

This test fails if the EFFORT_PARAMS table is mis-ordered, which would
let a low-effort critic silently substitute for a high-effort gate.
"""
levels = [CriticEffortLevel.low, CriticEffortLevel.medium, CriticEffortLevel.high, CriticEffortLevel.max]
for i in range(len(levels) - 1):
lo = EFFORT_PARAMS[levels[i]]
hi = EFFORT_PARAMS[levels[i + 1]]
assert lo.min_attempts < hi.min_attempts, (
f"{levels[i]} min_attempts={lo.min_attempts} must be < "
f"{levels[i+1]} min_attempts={hi.min_attempts}"
)
assert lo.max_tokens < hi.max_tokens, (
f"{levels[i]} max_tokens={lo.max_tokens} must be < "
f"{levels[i+1]} max_tokens={hi.max_tokens}"
)


def test_run_critic_effort_low_uses_fewer_attempts_than_high() -> None:
"""run_critic at 'low' effort sends a smaller min_attempts prompt than 'high'."""
captured: list[str] = []

class CapturingBackend(MockBackend):
def complete(self, req): # type: ignore[override]
captured.append(req.prompt)
return super().complete(req)

backend_low = CapturingBackend()
# Return enough falsification attempts for any effort level.
big_response = (
'[{"attempt":"a","falsified":false},'
'{"attempt":"b","falsified":false},'
'{"attempt":"c","falsified":false},'
'{"attempt":"d","falsified":false},'
'{"attempt":"e","falsified":false},'
'{"attempt":"f","falsified":false},'
'{"attempt":"g","falsified":false},'
'{"attempt":"h","falsified":false}]'
)
backend_low.script("critic", "beta", big_response)
run_critic(backend=backend_low, checkpoint_id="beta", plan=_plan(),
artifact=_artifact(), grader_results=[], effort=CriticEffortLevel.low)

backend_max = CapturingBackend()
backend_max.script("critic", "beta", big_response)
run_critic(backend=backend_max, checkpoint_id="beta", plan=_plan(),
artifact=_artifact(), grader_results=[], effort=CriticEffortLevel.max)

low_prompt, max_prompt = captured[0], captured[1]
# The prompt embeds the min_attempts number; max must request more.
assert "1" in low_prompt # low: at least 1
assert "8" in max_prompt # max: at least 8


def test_run_critic_explicit_min_attempts_overrides_effort() -> None:
"""Explicit min_attempts= kwarg overrides the effort-derived value.

This preserves backward compatibility for the F7 seeded-flaw evaluator
and any other caller that hard-codes a count.
"""
backend = MockBackend()
backend.script("critic", "beta",
'[{"attempt":"x","falsified":false},{"attempt":"y","falsified":false}]')
# Pass effort=low (which gives 1 attempt) but override to 2 explicitly.
findings = run_critic(
backend=backend,
checkpoint_id="beta",
plan=_plan(),
artifact=_artifact(),
grader_results=[],
effort=CriticEffortLevel.low,
min_attempts=2,
)
assert len(findings) == 2


def test_effort_medium_matches_historical_baseline() -> None:
"""CriticEffortLevel.medium must keep min_attempts=3, max_tokens=2000.

These values match the Phase-0 baseline so existing certs and the F7
seeded-flaw floor remain calibrated after the effort-level refactor.
"""
params = EFFORT_PARAMS[CriticEffortLevel.medium]
assert params.min_attempts == 3
assert params.max_tokens == 2000