diff --git a/.github/workflows/runtime-live-e2e.yml b/.github/workflows/runtime-live-e2e.yml index eaf8d8ce..30cca536 100644 --- a/.github/workflows/runtime-live-e2e.yml +++ b/.github/workflows/runtime-live-e2e.yml @@ -177,12 +177,20 @@ jobs: # gate/rejection/merge-hook scenarios the codex-live lane runs, through the # spacedock claude front door. This is the AC-4 proof that both runtime lanes # exercise the shared scenario table, not separate per-host smokes. + # + # `-timeout 40m` is a LOOSE BACKSTOP only — sized above the full 4-scenario + # serial-suite wall-time (~27m opus: rejection-flow 8.98m measured + the + # heavier 3-cycle escalation + gate + merge-hook). The REAL guard is the + # per-stage no-progress quiet budget (the streamWatcher's quietBudgetDefault, + # 60s) in the runners: it resets on every stream line and kills a hang at 60s + # of stream SILENCE. This ceiling never fires in a healthy run, it only bounds + # a pathological progressing-but-runaway loop. - name: Run live Claude shared scenarios env: SPACEDOCK_LIVE_MODEL: ${{ matrix.model }} run: | set -o pipefail - go test -tags live -count=1 -run TestLiveClaudeSharedScenarios ./internal/ensigncycle/ -v 2>&1 | tee claude-shared-scenarios-transcript.txt + go test -tags live -count=1 -timeout 40m -run TestLiveClaudeSharedScenarios ./internal/ensigncycle/ -v 2>&1 | tee claude-shared-scenarios-transcript.txt - name: Upload live artifacts if: always() @@ -302,7 +310,7 @@ jobs: echo "- \`go version\`: \`$(go version)\`" >> "$GITHUB_STEP_SUMMARY" - name: Run live Codex shared scenarios - run: go test -tags live -run TestLiveCodexSharedScenarios ./internal/ensigncycle -v + run: go test -tags live -count=1 -timeout 40m -run TestLiveCodexSharedScenarios ./internal/ensigncycle -v - name: Upload live artifacts if: always() diff --git a/docs/dev/README.md b/docs/dev/README.md index 33a96813..b9bb93b5 100644 --- a/docs/dev/README.md +++ b/docs/dev/README.md @@ -173,10 +173,11 @@ Each runner adapter turns a shared scenario into a real launch and returns `(bef | `observed` extract | read the `--output-last-message` file (+ jsonl) | extract the `result`/`success` event's `result` text from the stream (`extractClaudeFinalMessage`) | | Artifacts | jsonl / final-message / stderr | stream jsonl / final-message | -The three shared scenarios reuse the old shared Claude/Codex Python journey overlap (`tests/test_gate_guardrail.py`, `tests/test_rejection_flow.py`, `tests/test_merge_hook_guardrail.py`): +The shared scenarios reuse the old shared Claude/Codex Python journey overlap (`tests/test_gate_guardrail.py`, `tests/test_rejection_flow.py`, `tests/test_merge_hook_guardrail.py`): - `gate-guardrail`: starts at a human gate and asserts the first officer presents the gate instead of self-approving, mutating, or archiving the entity. -- `rejection-flow`: starts from a rejected validation report and asserts the first officer routes the concrete finding back through implementation. +- `rejection-flow`: drives a two-cycle rejection trajectory — route the concrete finding back through implementation, re-implement, and re-validate a second cycle reusing the kept-alive reviewer — restoring the second cycle the Go port dropped. +- `feedback-3-cycle-escalation`: starts from two prior rejection cycles at a third REJECTED validation and asserts the first officer escalates to the human on the third cycle instead of auto-bouncing a fourth time. - `merge-hook-guardrail`: attempts terminalization while a merge hook is registered and asserts the guard refuses bypass without `mod-block`, PR, or force. Assertions prefer durable workflow state over transcript phrasing: entity frontmatter (status / completed / verdict), archive-vs-no-archive, the exact fix marker and a second stage report, and only the durable user-facing final-message obligations (a gate review and a decision prompt). `extractClaudeFinalMessage` surfaces a stale-credential `is_error`/`401` `result` event as a LOUD launch failure, distinct from a scenario-assertion failure, so a credential problem is never misread as a runtime regression. @@ -200,16 +201,16 @@ export SPACEDOCK_BIN="$PWD/spacedock" export SPACEDOCK_REPO_ROOT="$PWD" ``` -Run the Claude shared suite locally (skips when no Claude auth is available — set `~/.claude/benchmark-token` for the OAuth path or `ANTHROPIC_API_KEY` for the API-key path; runs against a fresh isolated `HOME`): +Run the Claude shared suite locally (skips when no Claude auth is available — set `~/.claude/benchmark-token` for the OAuth path or `ANTHROPIC_API_KEY` for the API-key path; runs against a fresh isolated `HOME`). The `-timeout 40m` is a LOOSE BACKSTOP only — sized above the full 4-scenario serial-suite wall-time (~27m opus). The REAL liveness guard is the per-stage no-progress quiet budget (the shared `streamWatcher`, 60s) in the runners: it resets on every stream line and kills a hang at 60s of stream silence. The 40m ceiling never fires in a healthy run, it only bounds a pathological progressing-but-runaway loop and keeps the suite off Go's too-short default 10m binary timeout: ```bash -go test -tags live -count=1 -run TestLiveClaudeSharedScenarios ./internal/ensigncycle -v +go test -tags live -count=1 -timeout 40m -run TestLiveClaudeSharedScenarios ./internal/ensigncycle -v ``` Run the Codex shared suite locally (`npm install -g @openai/codex` then `codex login`, or set `OPENAI_API_KEY`). Local runs may authenticate either through an existing Codex login at `~/.codex/auth.json` or through `OPENAI_API_KEY`. The test copies only `auth.json` into a temporary `CODEX_HOME` for the local subscription path; it does not copy local plugin state or the rest of the operator's Codex config. CI does not use local subscription auth. ```bash -go test -tags live -count=1 -run TestLiveCodexSharedScenarios ./internal/ensigncycle -v +go test -tags live -count=1 -timeout 40m -run TestLiveCodexSharedScenarios ./internal/ensigncycle -v ``` Run the Pi front-door smoke locally (`npm install -g pi-coding-agent`, `pi install npm:pi-subagents`, and either `pi login` or `OPENAI_API_KEY`). The smoke loads the current checkout's Spacedock first-officer and ensign skills plus the local pi-subagents extension/skill explicitly; it verifies durable state in the split-root state checkout rather than transcript wording alone. diff --git a/docs/specs/scenario-testing-principles.md b/docs/specs/scenario-testing-principles.md index 62cd6792..a87c687a 100644 --- a/docs/specs/scenario-testing-principles.md +++ b/docs/specs/scenario-testing-principles.md @@ -52,11 +52,12 @@ The `(scenario, mode, runtime)` tuple is the primary variant row that is **run, ## Seed Scenarios -The first foundation is the three host-neutral runtime scenarios already shipped and held in host parity by the shared coverage tests. They are the named seed instances: +The first foundation is the host-neutral runtime scenarios already shipped and held in host parity by the shared coverage tests. They are the named seed instances: - `gate-guardrail` — the FO halts at a human gate and presents the review without self-approval, mutation, or archival. -- `rejection-flow` — the FO observes a rejected validation report and routes the concrete finding back through implementation. +- `rejection-flow` — the FO drives a two-cycle rejection trajectory: route the finding back through implementation, re-implement, and re-validate a second cycle reusing the kept-alive reviewer. +- `feedback-3-cycle-escalation` — on the third consecutive REJECTED validation the FO escalates to the human instead of auto-bouncing a fourth time. - `merge-hook-guardrail` — the FO cannot bypass a registered merge hook by terminalizing without pr, mod-block, or force. diff --git a/internal/ensigncycle/auto_continue_live_test.go b/internal/ensigncycle/auto_continue_live_test.go index 001fce8b..1a82d81d 100644 --- a/internal/ensigncycle/auto_continue_live_test.go +++ b/internal/ensigncycle/auto_continue_live_test.go @@ -9,7 +9,6 @@ import ( "os" "path/filepath" "testing" - "time" "github.com/spacedock-dev/spacedock/internal/livescenario" ) @@ -41,8 +40,9 @@ func TestLiveAutoContinueAfterImplementation(t *testing.T) { runner := newClaudeLiveRunner(t) // Implementation completion → validator dispatch → (single-entity) gate // auto-resolve → merge/terminalize runs TWO full agent runs serially (the FO - // and the fresh validator), so the budget is generous. - adapter := claudeRunnerAdapter{t: t, runner: runner, timeout: 15 * time.Minute} + // and the fresh validator). Liveness is the runner's per-stage no-progress quiet + // budget (the shared streamWatcher), not a per-call basket — those are banned. + adapter := claudeRunnerAdapter{t: t, runner: runner} var workflowDir string sc := livescenario.Scenario{ diff --git a/internal/ensigncycle/claude_live_runner_test.go b/internal/ensigncycle/claude_live_runner_test.go index 9e630353..763020e2 100644 --- a/internal/ensigncycle/claude_live_runner_test.go +++ b/internal/ensigncycle/claude_live_runner_test.go @@ -3,8 +3,7 @@ package ensigncycle import ( - "bytes" - "context" + "io" "os" "os/exec" "path/filepath" @@ -89,9 +88,10 @@ func claudeLiveScenarios(t *testing.T) []claudeLiveScenario { // map lacks a runner for any sharedRuntimeScenarios() ID. func claudeScenarioRunners() map[string]func(*testing.T, claudeLiveRunner, sharedRuntimeScenario) { return map[string]func(*testing.T, claudeLiveRunner, sharedRuntimeScenario){ - "gate-guardrail": runClaudeGateGuardrailScenario, - "rejection-flow": runClaudeRejectionFlowScenario, - "merge-hook-guardrail": runClaudeMergeHookGuardrailScenario, + "gate-guardrail": runClaudeGateGuardrailScenario, + "rejection-flow": runClaudeRejectionFlowScenario, + "feedback-3-cycle-escalation": runClaudeFeedback3CycleEscalationScenario, + "merge-hook-guardrail": runClaudeMergeHookGuardrailScenario, } } @@ -145,6 +145,34 @@ func runClaudeRejectionFlowScenario(t *testing.T, runner claudeLiveRunner, scena if err := assertRejectionFlow(after, result.finalMessage+"\n"+result.stream); err != nil { t.Fatalf("%v\nFinal message:\n%s\nArtifacts: %s", err, result.finalMessage, result.artifactDir) } + // AC-4 reviewer-reuse: on Claude teams the FO must reuse the kept-alive + // validation reviewer via a SendMessage tool call for the cycle-2 re-review, + // not dispatch a fresh one (the #141 keepalive contract the Go port dropped). + // Host-specific producer signal, graded by the runner — not the shared + // host-neutral assertion. + if err := assertClaudeReviewerReuse(result.stream); err != nil { + t.Fatalf("%v\nArtifacts: %s", err, result.artifactDir) + } + emitClaudeScenarioMetrics(t, scenario, result, runner.model) +} + +// runClaudeFeedback3CycleEscalationScenario drives the real FO against a fixture +// seeded with two prior rejection cycles at a 3rd REJECTED report and grades the +// durable end-state: the FO must escalate to the human on the 3rd cycle, not +// auto-bounce a 4th time. assertThirdCycleEscalation grades durable entity-body +// state ALONE (cycle count + escalation marker + no post-cycle-3 implementation +// report) — the reviewer-reuse signal is host-specific and lives in rejection-flow, +// not here; this scenario is purely a host-neutral durable-state grade. +func runClaudeFeedback3CycleEscalationScenario(t *testing.T, runner claudeLiveRunner, scenario sharedRuntimeScenario) { + t.Helper() + workflowRoot := t.TempDir() + entityPath := writeEscalationWorkflow(t, workflowRoot) + + result := runner.run(t, scenario, workflowRoot, escalationPrompt()) + after := readFile(t, entityPath) + if err := assertThirdCycleEscalation(after); err != nil { + t.Fatalf("%v\nEntity after:\n%s\nFinal message:\n%s\nArtifacts: %s", err, after, result.finalMessage, result.artifactDir) + } emitClaudeScenarioMetrics(t, scenario, result, runner.model) } @@ -173,6 +201,15 @@ func runClaudeMergeHookGuardrailScenario(t *testing.T, runner claudeLiveRunner, // `--` and forwards verbatim to claude. The observed source is the stream's // result/success event via extractClaudeFinalMessage — a 401/is_error result is a // LOUD launch failure here, never fed into a scenario assertion. +// +// Liveness is the EXISTING streamWatcher (the Go port of the upstream +// FOStreamWatcher, shared with TestLiveEnsignCycle) — one mechanism, no second +// impl. drainToExit runs the process to exit while accumulating the full +// transcript, bounded by the per-step no-progress quietBudgetDefault (60s): the +// deadline resets on every drained line, so a genuine multi-minute run of +// sequential model work never trips as long as the stream keeps moving, and only +// silence past the budget kills the process — the same ≤60s AC-1-guarded discipline +// the live cycle uses. func (r claudeLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, workflowRoot, prompt string) claudeScenarioResult { t.Helper() artifactDir := filepath.Join(r.artifactRoot, scenario.name) @@ -182,9 +219,7 @@ func (r claudeLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, work streamPath := filepath.Join(artifactDir, "claude-stream.jsonl") finalPath := filepath.Join(artifactDir, "claude-final-message.txt") - ctx, cancel := context.WithTimeout(context.Background(), scenario.timeout) - defer cancel() - cmd := exec.CommandContext(ctx, r.binary, "claude", + cmd := exec.Command(r.binary, "claude", "--plugin-dir", r.repoRoot, "--skip-contract-check", "--", @@ -197,21 +232,33 @@ func (r claudeLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, work cmd.Dir = workflowRoot cmd.Env = r.env - // stdout carries the stream-json transcript; stderr is folded in so a launch - // error (e.g. a stale-token 401 printed to stderr) is captured alongside it. - var buf bytes.Buffer - cmd.Stdout = &buf - cmd.Stderr = &buf + // stdout carries the stream-json transcript the watcher drains for liveness; + // stderr is folded into the same pipe so a launch error (e.g. a stale-token 401 + // printed to stderr) lands in the transcript too — matching the live cycle's + // wiring. The cmdPoller closes the pipe write-end on exit so the scanner EOFs. + pr, pw := io.Pipe() + cmd.Stdout = pw + cmd.Stderr = pw started := time.Now() - runErr := cmd.Run() + if startErr := cmd.Start(); startErr != nil { + t.Fatalf("spacedock claude failed to start for %s: %v", scenario.name, startErr) + } + poller := newCmdPoller(cmd, pw) + defer poller.kill() + watcher := newStreamWatcher(newPipeLineSource(pr), poller, func(line string) { t.Log(line) }) + + // drainToExit runs the process to exit accumulating the full transcript, OR + // kills it on a 60s no-progress stall (the per-step quiet budget). The deferred + // poller.kill() reaps the process on every exit path. + stream, stallErr := watcher.drainToExit(quietBudgetDefault, "claude shared scenario "+scenario.name) duration := time.Since(started) - stream := buf.String() + if writeErr := os.WriteFile(streamPath, []byte(stream), 0o644); writeErr != nil { t.Fatal(writeErr) } - if ctx.Err() == context.DeadlineExceeded { - t.Fatalf("spacedock claude did not finish within %s for %s; artifacts in %s", scenario.timeout, scenario.name, artifactDir) + if stallErr != nil { + t.Fatalf("%v\nArtifacts: %s", stallErr, artifactDir) } // Extract the final message from the stream's result/success event (the @@ -220,8 +267,8 @@ func (r claudeLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, work // failure, so a stale credential never feeds the 401 text into an assertion. finalMessage, extractErr := extractClaudeFinalMessage(stream) if extractErr != nil { - t.Fatalf("claude launch failed for %s (run err=%v): %v; artifacts in %s\nStream tail:\n%s", - scenario.name, runErr, extractErr, artifactDir, tail(stream, 4000)) + t.Fatalf("claude launch failed for %s: %v; artifacts in %s\nStream tail:\n%s", + scenario.name, extractErr, artifactDir, tail(stream, 4000)) } if writeErr := os.WriteFile(finalPath, []byte(finalMessage), 0o644); writeErr != nil { t.Fatal(writeErr) diff --git a/internal/ensigncycle/codex_live_runner_test.go b/internal/ensigncycle/codex_live_runner_test.go index fbb65893..ea40eb81 100644 --- a/internal/ensigncycle/codex_live_runner_test.go +++ b/internal/ensigncycle/codex_live_runner_test.go @@ -3,7 +3,7 @@ package ensigncycle import ( - "context" + "io" "os" "os/exec" "path/filepath" @@ -71,9 +71,10 @@ func codexLiveScenarios(t *testing.T) []codexLiveScenario { // lacks a runner for any sharedRuntimeScenarios() ID. func codexScenarioRunners() map[string]func(*testing.T, codexLiveRunner, sharedRuntimeScenario) { return map[string]func(*testing.T, codexLiveRunner, sharedRuntimeScenario){ - "gate-guardrail": runCodexGateGuardrailScenario, - "rejection-flow": runCodexRejectionFlowScenario, - "merge-hook-guardrail": runCodexMergeHookGuardrailScenario, + "gate-guardrail": runCodexGateGuardrailScenario, + "rejection-flow": runCodexRejectionFlowScenario, + "feedback-3-cycle-escalation": runCodexFeedback3CycleEscalationScenario, + "merge-hook-guardrail": runCodexMergeHookGuardrailScenario, } } @@ -167,6 +168,32 @@ func runCodexRejectionFlowScenario(t *testing.T, runner codexLiveRunner, scenari if err := assertRejectionFlow(after, result.finalMessage+"\n"+result.jsonl); err != nil { t.Fatalf("%v\nFinal message:\n%s\nArtifacts: %s", err, result.finalMessage, result.artifactDir) } + // AC-4 reviewer-reuse: on Codex the FO routes the cycle-2 re-review to the + // kept-alive validation worker with a `send_input` call (the Codex analog of + // Claude's SendMessage reuse), not a fresh dispatch. Host-specific producer + // signal, graded by the runner — not the shared host-neutral assertion. + if err := assertCodexReviewerReuse(result.jsonl); err != nil { + t.Fatalf("%v\nArtifacts: %s", err, result.artifactDir) + } + emitCodexScenarioMetrics(t, scenario, result) +} + +// runCodexFeedback3CycleEscalationScenario drives the real FO against a fixture +// seeded with two prior rejection cycles at a 3rd REJECTED report and grades the +// durable end-state: the FO must escalate to the human on the 3rd cycle, not +// auto-bounce a 4th time. assertThirdCycleEscalation grades durable entity-body +// state ALONE (cycle count + escalation marker + no post-cycle-3 implementation +// report) — host-neutral, the same assertion the Claude runner feeds. +func runCodexFeedback3CycleEscalationScenario(t *testing.T, runner codexLiveRunner, scenario sharedRuntimeScenario) { + t.Helper() + workflowRoot := t.TempDir() + entityPath := writeEscalationWorkflow(t, workflowRoot) + + result := runner.run(t, scenario, workflowRoot, escalationPrompt()) + after := readFile(t, entityPath) + if err := assertThirdCycleEscalation(after); err != nil { + t.Fatalf("%v\nEntity after:\n%s\nFinal message:\n%s\nArtifacts: %s", err, after, result.finalMessage, result.artifactDir) + } emitCodexScenarioMetrics(t, scenario, result) } @@ -187,6 +214,13 @@ func runCodexMergeHookGuardrailScenario(t *testing.T, runner codexLiveRunner, sc emitCodexScenarioMetrics(t, scenario, result) } +// run launches `codex exec --json` for one shared scenario. Liveness is the SAME +// streamWatcher the Claude runner and the live cycle use — one mechanism, no second +// impl. drainToExit runs the process to exit accumulating the full --json +// transcript, bounded by the per-step no-progress quietBudgetDefault (60s): the +// deadline resets on every event line, so a genuine multi-minute run never trips as +// long as Codex keeps emitting item.* events, and only silence past the budget +// kills the process — the same ≤60s AC-1-guarded discipline. func (r codexLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, workflowRoot, prompt string) codexScenarioResult { t.Helper() artifactDir := filepath.Join(r.artifactRoot, scenario.name) @@ -197,9 +231,7 @@ func (r codexLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, workf jsonlPath := filepath.Join(artifactDir, "codex-exec.jsonl") stderrPath := filepath.Join(artifactDir, "codex-exec.stderr.txt") - ctx, cancel := context.WithTimeout(context.Background(), scenario.timeout) - defer cancel() - cmd := exec.CommandContext(ctx, r.codexBin, + cmd := exec.Command(r.codexBin, "exec", "--json", "--dangerously-bypass-approvals-and-sandbox", @@ -208,32 +240,41 @@ func (r codexLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, workf prompt, ) cmd.Env = r.env - stdout, err := os.Create(jsonlPath) - if err != nil { - t.Fatal(err) - } - defer stdout.Close() + // stdout (the --json event stream) flows through the watcher's pipe for the + // no-progress liveness guard; stderr goes to its own artifact file. The + // cmdPoller closes the pipe write-end on exit so the scanner EOFs. + pr, pw := io.Pipe() + cmd.Stdout = pw stderr, err := os.Create(stderrPath) if err != nil { t.Fatal(err) } defer stderr.Close() - cmd.Stdout = stdout cmd.Stderr = stderr started := time.Now() - err = cmd.Run() + if startErr := cmd.Start(); startErr != nil { + t.Fatalf("codex exec failed to start for %s: %v", scenario.name, startErr) + } + poller := newCmdPoller(cmd, pw) + defer poller.kill() + watcher := newStreamWatcher(newPipeLineSource(pr), poller, func(line string) { t.Log(line) }) + + // drainToExit runs the process to exit accumulating the full transcript, OR + // kills it on a 60s no-progress stall; the deferred poller.kill() reaps it. + jsonl, stallErr := watcher.drainToExit(quietBudgetDefault, "codex shared scenario "+scenario.name) duration := time.Since(started) - if ctx.Err() == context.DeadlineExceeded { - t.Fatalf("codex exec did not finish within %s for %s; artifacts in %s", scenario.timeout, scenario.name, artifactDir) + + if writeErr := os.WriteFile(jsonlPath, []byte(jsonl), 0o644); writeErr != nil { + t.Fatal(writeErr) } - if err != nil { - t.Fatalf("codex exec failed for %s: %v; artifacts in %s", scenario.name, err, artifactDir) + if stallErr != nil { + t.Fatalf("%v\nArtifacts: %s", stallErr, artifactDir) } return codexScenarioResult{ finalMessage: readFile(t, finalPath), - jsonl: readFile(t, jsonlPath), + jsonl: jsonl, artifactDir: artifactDir, duration: duration, } diff --git a/internal/ensigncycle/live_budget_test.go b/internal/ensigncycle/live_budget_test.go index 8afe8e1a..fe049645 100644 --- a/internal/ensigncycle/live_budget_test.go +++ b/internal/ensigncycle/live_budget_test.go @@ -23,9 +23,17 @@ import ( // covers the live path from the offline suite with no model spend. // liveBudgetSources are the source files on the live path whose timeout literals -// must all be ≤60s: the streamWatcher (the per-step budget discipline) and the -// live test that wires it. -var liveBudgetSources = []string{"streamwatch_test.go", "live_test.go"} +// must all be ≤60s: the streamWatcher (the per-step budget discipline), the live +// test that wires it, and the shared-scenario runners that ALSO drive the watcher. +// The shared runners were the unguarded gap that let the old per-scenario basket +// timeout exist; scanning them here brings them under the same ≤60s discipline, so +// they can never carry a >60s literal again. +var liveBudgetSources = []string{ + "streamwatch_test.go", + "live_test.go", + "claude_live_runner_test.go", + "codex_live_runner_test.go", +} func TestNoTimeoutLiteralExceeds60s(t *testing.T) { for _, file := range liveBudgetSources { @@ -79,6 +87,48 @@ func TestBudgetConstantsAreUnder60s(t *testing.T) { } } +// TestDurationOfFoldsScalarSpellings pins durationOf's scalar fold over the +// spellings the cycle-8 audit named — most importantly the `time.Duration(N)` +// integer conversion a basket could hide behind to dodge the BasicLit scalar +// match. Each foldable spelling must evaluate to its real duration (so the AST +// guard would flag it when >60s); the float-conversion form is correctly NOT +// statically foldable (the value-guard on the wired consts is its backstop). +func TestDurationOfFoldsScalarSpellings(t *testing.T) { + foldable := map[string]time.Duration{ + "time.Second": time.Second, + "120 * time.Second": 120 * time.Second, + "time.Duration(120) * time.Second": 120 * time.Second, // the cycle-8 conversion evasion + "time.Duration(2) * time.Minute": 2 * time.Minute, + "time.Minute + 30*time.Second": time.Minute + 30*time.Second, + "2 * (40 * time.Second)": 80 * time.Second, + } + for src, want := range foldable { + expr, err := parser.ParseExpr(src) + if err != nil { + t.Fatalf("parse %q: %v", src, err) + } + got, ok := durationOf(expr) + if !ok { + t.Errorf("durationOf(%q) did not fold; the AST guard would miss it", src) + continue + } + if got != want { + t.Errorf("durationOf(%q) = %s, want %s", src, got, want) + } + } + + // A float conversion is NOT statically foldable here — durationOf returns + // false, by design. The wired-const value-guard (TestBudgetConstantsAreUnder60s) + // is the backstop for any spelling the AST fold cannot resolve. + floatExpr, err := parser.ParseExpr("time.Duration(1.5 * float64(time.Minute))") + if err != nil { + t.Fatal(err) + } + if _, ok := durationOf(floatExpr); ok { + t.Error("durationOf unexpectedly folded a float conversion; that form is left to the const value-guard") + } +} + func TestLiveTestHasNoMonolithicDeadlineCtx(t *testing.T) { fset := token.NewFileSet() f, err := parser.ParseFile(fset, "live_test.go", nil, 0) @@ -105,6 +155,9 @@ func TestLiveTestHasNoMonolithicDeadlineCtx(t *testing.T) { // COMPOSITIONALLY: every sub-expression recurses, so the COMPOSITE forms a // timeout literal can take all fold — // - a bare `time.Unit` selector (`time.Second` = a 1× duration), +// - a `time.Duration(N)` integer conversion (`time.Duration(120)` = 120ns, +// so `time.Duration(120) * time.Second` folds to 120s — the cycle-8 hole a +// BasicLit-only scalar match missed), // - a parenthesized duration (`(40 * time.Second)`), // - a `scalar * duration` (or `duration * scalar`) multiplication where the // duration operand is itself folded recursively, so `2 * (40 * time.Second)` @@ -113,8 +166,12 @@ func TestLiveTestHasNoMonolithicDeadlineCtx(t *testing.T) { // - an ADD/SUB of two duration sub-expressions (`time.Minute + 30*time.Second` // = 90s — the audit-cycle-1 hole a MUL-only fold missed). // -// Returns isDuration=false for anything that is not a pure duration expression, -// so the package's int arithmetic is skipped rather than mis-flagged. +// Returns isDuration=false for anything that is not a STATICALLY-FOLDABLE duration +// expression, so the package's int arithmetic is skipped rather than mis-flagged. +// Forms it cannot statically fold (a float conversion, a const-ident or runtime +// scalar) yield false here; the wired live budget constants are the authoritative +// backstop — TestBudgetConstantsAreUnder60s value-checks their REAL compile-time +// values, catching any spelling the AST fold cannot. func durationOf(e ast.Expr) (time.Duration, bool) { switch ex := e.(type) { case *ast.ParenExpr: @@ -122,6 +179,12 @@ func durationOf(e ast.Expr) (time.Duration, bool) { case *ast.SelectorExpr: // A bare `time.Second` etc. is a 1× duration. return timeUnitDuration(ex) + case *ast.CallExpr: + // A `time.Duration(N)` integer conversion is a duration of N nanoseconds. + if n, ok := timeDurationConversion(ex); ok { + return time.Duration(n), true + } + return 0, false case *ast.BinaryExpr: switch ex.Op { case token.MUL: @@ -156,12 +219,32 @@ func durationOf(e ast.Expr) (time.Duration, bool) { return 0, false } -// intScalar reports whether e is an integer literal, returning its value. A -// parenthesized integer literal folds too, so `(2) * time.Second` is handled. +// timeDurationConversion folds a `time.Duration(N)` integer conversion to N. The +// scalar form a basket could hide behind to dodge the BasicLit scalar match: +// `time.Duration(120) * time.Second` evaluates to 120s here. +func timeDurationConversion(call *ast.CallExpr) (int64, bool) { + sel, ok := call.Fun.(*ast.SelectorExpr) + if !ok || sel.Sel.Name != "Duration" { + return 0, false + } + pkg, ok := sel.X.(*ast.Ident) + if !ok || pkg.Name != "time" || len(call.Args) != 1 { + return 0, false + } + return intScalar(call.Args[0]) +} + +// intScalar reports whether e is a statically-foldable integer scalar, returning +// its value. A parenthesized integer literal folds too (`(2) * time.Second`), as +// does a `time.Duration(N)` conversion used in scalar position +// (`time.Duration(120) * time.Second`). func intScalar(e ast.Expr) (int64, bool) { if p, ok := e.(*ast.ParenExpr); ok { return intScalar(p.X) } + if call, ok := e.(*ast.CallExpr); ok { + return timeDurationConversion(call) + } lit, ok := e.(*ast.BasicLit) if !ok || lit.Kind != token.INT { return 0, false diff --git a/internal/ensigncycle/livescenario_adapter_live_test.go b/internal/ensigncycle/livescenario_adapter_live_test.go index c398c8ca..3ba0ed43 100644 --- a/internal/ensigncycle/livescenario_adapter_live_test.go +++ b/internal/ensigncycle/livescenario_adapter_live_test.go @@ -10,7 +10,6 @@ import ( "path/filepath" "strings" "testing" - "time" "github.com/spacedock-dev/spacedock/internal/livescenario" ) @@ -21,17 +20,18 @@ import ( // that a scenario is now authored against the importable livescenario primitive // rather than buried in this package's test files. The adapter copies the staged // dir's contents into the runner's own workflow root and forwards the launch. +// Liveness is the claudeLiveRunner's own per-stage no-progress quiet budget (the +// shared streamWatcher) — no per-call basket timeout (those are banned). type claudeRunnerAdapter struct { - t *testing.T - runner claudeLiveRunner - timeout time.Duration + t *testing.T + runner claudeLiveRunner } func (a claudeRunnerAdapter) Launch(ctx context.Context, dir, entityPath, runbook string) (string, error) { // gitInit so the FO front door sees a workflow git root (the live adapter's // fixtures are always git-initialized). gitInit(a.t, dir) - scenario := sharedRuntimeScenario{name: "livescenario-primitive", timeout: a.timeout} + scenario := sharedRuntimeScenario{name: "livescenario-primitive"} result := a.runner.run(a.t, scenario, dir, runbook+" "+antiShutdownOverride) return result.finalMessage + "\n" + result.stream, nil } @@ -46,7 +46,7 @@ func (a claudeRunnerAdapter) Launch(ctx context.Context, dir, entityPath, runboo // own `Verified by: live …` citation under AC-1's gate — p4 eating its own gate. func TestLivePrimitiveRunsAgainstClaudeAdapter(t *testing.T) { runner := newClaudeLiveRunner(t) - adapter := claudeRunnerAdapter{t: t, runner: runner, timeout: 2 * time.Minute} + adapter := claudeRunnerAdapter{t: t, runner: runner} sc := livescenario.Scenario{ Name: "gate-held-via-primitive", diff --git a/internal/ensigncycle/pi_shared_coverage_test.go b/internal/ensigncycle/pi_shared_coverage_test.go index c566c3b2..f36f22d3 100644 --- a/internal/ensigncycle/pi_shared_coverage_test.go +++ b/internal/ensigncycle/pi_shared_coverage_test.go @@ -19,6 +19,10 @@ func piSharedScenarioCoverageMap() map[string]piSharedScenarioCoverage { mode: "gap", reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer rejection-flow runner.", }, + "feedback-3-cycle-escalation": { + mode: "gap", + reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer 3-cycle-escalation runner.", + }, "merge-hook-guardrail": { mode: "gap", reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer merge-hook runner.", diff --git a/internal/ensigncycle/shared_assertions_impl_test.go b/internal/ensigncycle/shared_assertions_impl_test.go index b38d4cfb..2c73b5f6 100644 --- a/internal/ensigncycle/shared_assertions_impl_test.go +++ b/internal/ensigncycle/shared_assertions_impl_test.go @@ -8,23 +8,72 @@ import ( const rejectionFixMarker = "shared-rejection-fix: applied" +// escalationMarker is the exact line the escalation fixture README instructs the +// FO to record on the 3rd rejection instead of routing back a 4th time. Like +// rejectionFixMarker it is a fixture-driven on-disk obligation graded as durable +// state, never transcript phrasing — so assertThirdCycleEscalation grades behavior, +// not wording (scenario-testing-principles). +const escalationMarker = "feedback-escalation: human-review-required" + var implementationStatus = regexp.MustCompile(`(?im)^status:\s*implementation\s*$`) +// terminalStatus anchors the escalation fixture's terminal state (`done`) in the +// frontmatter. A parked-for-the-human escalation must NOT have advanced the entity +// to terminal — escalate-then-terminalize is auto-resolving instead of waiting. +var terminalStatus = regexp.MustCompile(`(?im)^status:\s*done\s*$`) + +// feedbackCycleEntry anchors one rejection-round entry inside the entity body's +// `### Feedback Cycles` section. The escalation fixture README instructs the FO to +// append one `- Cycle N:` line per rejection round, so counting these lines counts +// the rounds the FO actually drove — durable state, not transcript wording. +var feedbackCycleEntry = regexp.MustCompile(`(?im)^- Cycle \d+:`) + +// implementationReport anchors an implementation stage-report heading at line +// start, so counting matches counts the implementation rounds that left a durable +// report rather than any prose that merely names the stage. +var implementationReport = regexp.MustCompile(`(?m)^## Stage Report: implementation`) + +// feedbackCyclesSection returns the body of the entity's `### Feedback Cycles` +// section — from its heading to the next heading (any `##`/`###`/etc.) or EOF. +// Scoping the cycle-entry and escalation-marker matches to this section keeps the +// FO's tracked rounds from being satisfied by a stray `- Cycle N:` line or marker +// text appearing elsewhere in the body (e.g. quoted inside a stage report). +func feedbackCyclesSection(entity string) string { + const heading = "### Feedback Cycles" + start := strings.Index(entity, heading) + if start < 0 { + return "" + } + rest := entity[start+len(heading):] + if next := nextHeading.FindStringIndex(rest); next != nil { + return rest[:next[0]] + } + return rest +} + +// nextHeading anchors the next markdown heading line (one or more `#`), used to +// bound the `### Feedback Cycles` section at the following heading. +var nextHeading = regexp.MustCompile(`(?m)^#{1,6} `) + // assertRejectionFlow is host-neutral: it consumes the post-run entity-state -// string and an observed-output string, so it accepts either host's transcript. -// The flow held when the rejection follow-up applied the exact fix marker, left a -// second implementation stage report, routed the entity back to status: -// implementation, and the observed output surfaced both the rejection and the -// implementation follow-up. +// string and an observed-output string, so it accepts either host's transcript. It +// grades the full TWO-cycle trajectory the Go port simplified away: the first +// REJECTED validation routes back to implementation, the rework applies the exact +// fix marker and leaves a second implementation report, and a second validation +// round re-checks it. The durable 2-cycle end-state is two implementation reports, +// the fix marker, and two recorded `### Feedback Cycles` entries; the observed +// output surfaces both the rejection and the implementation follow-up. The +// reviewer-reuse signal (Claude SendMessage / Codex send_input) is host-specific +// and graded by the host runner, not this shared assertion. func assertRejectionFlow(entity, observed string) error { if !strings.Contains(entity, rejectionFixMarker) { return fmt.Errorf("rejection follow-up did not apply the exact fix marker") } - if strings.Count(entity, "## Stage Report: implementation") < 2 { - return fmt.Errorf("rejection follow-up did not leave a new implementation stage report") + if reports := len(implementationReport.FindAllString(entity, -1)); reports < 2 { + return fmt.Errorf("rejection trajectory left %d implementation reports, want at least 2 (original + cycle-2 rework)", reports) } - if !implementationStatus.MatchString(entity) { - return fmt.Errorf("rejection follow-up did not route the entity back to status: implementation") + if cycles := len(feedbackCycleEntry.FindAllString(entity, -1)); cycles < 2 { + return fmt.Errorf("rejection trajectory recorded %d `### Feedback Cycles` entries, want at least 2 — a single-cycle end-state did not drive the second validation round", cycles) } lowerObserved := strings.ToLower(observed) if !strings.Contains(lowerObserved, "reject") { @@ -36,6 +85,36 @@ func assertRejectionFlow(entity, observed string) error { return nil } +// assertThirdCycleEscalation is host-neutral: it grades the durable post-run +// entity body ALONE — no transcript — for the 3rd-cycle escalation guarantee. The +// FO reaches the 3rd consecutive REJECTED validation and must escalate to the +// human instead of auto-bouncing a 4th time. The escalated end-state is separable +// from the failure end-states on durable signals only: at least three +// `### Feedback Cycles` entries (the three rejection rounds) and the +// fixture-instructed escalation marker line, BOTH inside the `### Feedback Cycles` +// section (not a stray line elsewhere); NO new implementation report past the one +// the fixture seeded (a 4th auto-bounce would route back and append a second +// report); and the entity NOT advanced to terminal (escalation parks for the +// human, it does not self-resolve to done). Grading on these on-disk facts, never +// on wording, is what keeps this a behavioral oracle rather than a transcript +// tautology. +func assertThirdCycleEscalation(entity string) error { + section := feedbackCyclesSection(entity) + if cycles := len(feedbackCycleEntry.FindAllString(section, -1)); cycles < 3 { + return fmt.Errorf("escalation entity has %d `### Feedback Cycles` entries, want at least 3", cycles) + } + if !strings.Contains(section, escalationMarker) { + return fmt.Errorf("escalation entity did not record the human-escalation marker in the `### Feedback Cycles` section on the 3rd cycle") + } + if reports := len(implementationReport.FindAllString(entity, -1)); reports > 1 { + return fmt.Errorf("escalation entity has %d implementation reports, want 1 — a post-cycle-3 report means the FO auto-bounced a 4th time instead of escalating", reports) + } + if terminalStatus.MatchString(entity) { + return fmt.Errorf("escalation entity advanced to terminal (status: done) — escalation must park for the human, not self-resolve to done") + } + return nil +} + // assertMergeHookGuardHeld is host-neutral: before/after entity state plus an // observed-output string. The guard held when the entity was unmutated, still at // status: implementation, and the observed output named both the merge hook and diff --git a/internal/ensigncycle/shared_assertions_test.go b/internal/ensigncycle/shared_assertions_test.go index 6b91ca46..68eeb871 100644 --- a/internal/ensigncycle/shared_assertions_test.go +++ b/internal/ensigncycle/shared_assertions_test.go @@ -3,10 +3,13 @@ package ensigncycle import "testing" func TestAssertRejectionFlow(t *testing.T) { - entity := "---\nstatus: implementation\n---\n" + + // The full two-cycle end-state: fix marker applied, two implementation reports + // (original + cycle-2 rework), and two recorded `### Feedback Cycles` entries. + entity := "---\nstatus: validation\n---\n" + rejectionFixMarker + "\n\n" + "## Stage Report: implementation\n\n- DONE: Initial implementation\n\n" + - "## Stage Report: implementation\n\n- DONE: Applied rejection fix\n" + "## Stage Report: implementation\n\n- DONE: Applied rejection fix\n\n" + + "### Feedback Cycles\n\n- Cycle 1: REJECTED\n- Cycle 2: PASSED\n" observed := "validation was REJECTED; routed follow-up to implementation" if err := assertRejectionFlow(entity, observed); err != nil { @@ -15,17 +18,85 @@ func TestAssertRejectionFlow(t *testing.T) { if err := assertRejectionFlow("## Stage Report: implementation\n", observed); err == nil { t.Fatal("expected missing fix marker to fail") } - if err := assertRejectionFlow("---\nstatus: validation\n---\n"+rejectionFixMarker+"\n\n## Stage Report: implementation\n", observed); err == nil { - t.Fatal("expected missing implementation status to fail") + // A single-cycle end-state: fix applied, two implementation reports, but only one + // recorded cycle — the FO never drove the second validation round. + singleCycle := "---\nstatus: implementation\n---\n" + rejectionFixMarker + "\n\n" + + "## Stage Report: implementation\n\n- DONE: Initial\n\n" + + "## Stage Report: implementation\n\n- DONE: Fix\n\n" + + "### Feedback Cycles\n\n- Cycle 1: REJECTED\n" + if err := assertRejectionFlow(singleCycle, observed); err == nil { + t.Fatal("expected a single-cycle end-state (one recorded cycle) to fail") } - if err := assertRejectionFlow("---\nstatus: implementation\n---\n"+rejectionFixMarker+"\n\n## Stage Report: implementation\n", observed); err == nil { - t.Fatal("expected missing follow-up implementation report to fail") + // Two cycles recorded but only one implementation report — the rework never left + // a second report. + oneReport := "---\nstatus: validation\n---\n" + rejectionFixMarker + "\n\n" + + "## Stage Report: implementation\n\n- DONE: Only one report\n\n" + + "### Feedback Cycles\n\n- Cycle 1: REJECTED\n- Cycle 2: PASSED\n" + if err := assertRejectionFlow(oneReport, observed); err == nil { + t.Fatal("expected a single implementation report to fail") } if err := assertRejectionFlow(entity, "all quiet"); err == nil { t.Fatal("expected missing rejection output to fail") } } +func TestAssertThirdCycleEscalation(t *testing.T) { + // The escalated end-state the live run must reach: the seeded fixture (one + // implementation report + two cycle entries) plus the third cycle entry and the + // escalation marker, with NO new implementation report appended. + escalated := escalationEntity() + + "- Cycle 3: REJECTED — third consecutive rejection.\n" + + escalationMarker + "\n" + if err := assertThirdCycleEscalation(escalated); err != nil { + t.Fatalf("expected an escalated end-state to pass: %v", err) + } + + // 4th-auto-bounce: the FO recorded a third cycle but routed back instead of + // escalating — a second implementation report appears and no marker. Must fail + // on both the marker and the no-new-report checks. + bounced := escalationEntity() + + "- Cycle 3: REJECTED — routed back to implementation again.\n\n" + + "## Stage Report: implementation\n\n- DONE: reworked a fourth time\n" + if err := assertThirdCycleEscalation(bounced); err == nil { + t.Fatal("expected a 4th-auto-bounce end-state (new implementation report, no marker) to fail") + } + + // Stalled at cycle 2: only the two seeded cycle entries, no third round, no + // marker. Must fail on the cycle-count check. + if err := assertThirdCycleEscalation(escalationEntity()); err == nil { + t.Fatal("expected a stalled-at-cycle-2 end-state to fail on the cycle-count check") + } + + // Marker present but only two cycles recorded: isolates the cycle-count check + // from the marker check so neither can be silently dropped. + markerWithoutThirdCycle := escalationEntity() + escalationMarker + "\n" + if err := assertThirdCycleEscalation(markerWithoutThirdCycle); err == nil { + t.Fatal("expected the marker present but only two cycles recorded to fail on the cycle-count check") + } + + // Three cycles recorded but no marker: isolates the marker check from the + // cycle-count check. + threeCyclesNoMarker := escalationEntity() + "- Cycle 3: REJECTED — third rejection.\n" + if err := assertThirdCycleEscalation(threeCyclesNoMarker); err == nil { + t.Fatal("expected three cycles recorded but no escalation marker to fail on the marker check") + } + + // Section-scoping: the seeded entity has two in-section cycle entries; a third + // `- Cycle 3:` line and the marker placed OUTSIDE the `### Feedback Cycles` + // section (here, in a trailing stage report) must NOT count. Only two in-section + // entries remain, so this fails on the cycle-count check — proving the matches + // are scoped to the section, not the whole body. + strayOutsideSection := escalationEntity() + + "\n## Stage Report: validation\n\n- Cycle 3: REJECTED — stray, out of section.\n" + + escalationMarker + "\n" + if got := len(feedbackCycleEntry.FindAllString(strayOutsideSection, -1)); got != 3 { + t.Fatalf("section-scoping case must have three `- Cycle N:` lines body-wide (two in-section + one stray), got %d", got) + } + if err := assertThirdCycleEscalation(strayOutsideSection); err == nil { + t.Fatal("expected a third cycle entry + marker placed outside the `### Feedback Cycles` section to fail — the matches must be section-scoped") + } +} + func TestAssertMergeHookGuardHeld(t *testing.T) { entity := "---\nstatus: implementation\nmod-block:\npr:\n---\n" observed := "Error: entity merge-check cannot advance to terminal - workflow has merge hook(s) [local-merge]" diff --git a/internal/ensigncycle/shared_coverage_meta_test.go b/internal/ensigncycle/shared_coverage_meta_test.go index 756754aa..909f4e5f 100644 --- a/internal/ensigncycle/shared_coverage_meta_test.go +++ b/internal/ensigncycle/shared_coverage_meta_test.go @@ -4,7 +4,7 @@ package ensigncycle import "testing" -// TestSharedScenarioRunnerCoverage is the AC-2/AC-3 parity guard against drift: it +// TestSharedScenarioRunnerCoverage is the AC-1 parity guard against drift: it // iterates the host-neutral sharedRuntimeScenarios() table and fails if EITHER the // Codex runner map or the Claude runner map lacks a runner for any shared scenario // ID. A scenario that exists for one host only — the exact split this task closes diff --git a/internal/ensigncycle/shared_fixtures_test.go b/internal/ensigncycle/shared_fixtures_test.go index 39fded98..799776e0 100644 --- a/internal/ensigncycle/shared_fixtures_test.go +++ b/internal/ensigncycle/shared_fixtures_test.go @@ -102,47 +102,122 @@ func rejectionReadme() string { " terminal: true\n" + "---\n" + "# Rejection Fixture\n\n" + + "This fixture exercises the full two-cycle rejection trajectory LIVE, starting before the first validation: a first implementation deliberately omits the fix, a first validation REJECTS it and routes back to implementation, the rework applies the fix, and a second validation round (cycle 2) re-checks it and PASSES.\n\n" + "### implementation\n\n" + - "Apply the validation rejection by appending this exact standalone line to `rejection-task.md`:\n\n" + + "This stage runs once per cycle. Decide which round you are in by checking `rejection-task.md` for a `## Stage Report: validation` section recommending REJECTED:\n\n" + + "- **First round (no REJECTED validation report present yet):** deliberately OMIT the fix marker. Append a `## Stage Report: implementation` section with one `- DONE:` item noting the initial implementation does not yet carry the fix marker. Do NOT write the marker line. This is the buggy round the first validation must reject.\n" + + "- **Rework round (a REJECTED validation report is present):** apply the fix by appending this exact standalone line to `rejection-task.md`:\n\n" + + " `" + rejectionFixMarker + "`\n\n" + + " Then append a second `## Stage Report: implementation` section with one `- DONE:` item naming the fix.\n\n" + + "- **Outputs:** An implementation stage report; on the rework round, the exact fix marker as well.\n\n" + + "### validation\n\n" + + "Reject the implementation when the exact fix marker `" + rejectionFixMarker + "` is absent from `rejection-task.md`; report PASSED when it is present. Append a `## Stage Report: validation` section recording the verdict. Keep this review LIGHT — inspect only for the marker line; do not read or run unrelated code.\n\n" + + "- **Outputs:** A PASSED or REJECTED validation stage report.\n\n" + + "### Feedback Cycles\n\n" + + "Track every validation round in a `### Feedback Cycles` section in `rejection-task.md`: append one `- Cycle N: ` line per validation round, numbered in order. The first REJECTED round is `- Cycle 1: REJECTED`; record `- Cycle 2: PASSED` after the re-validation passes.\n\n" + + "### done\n\nTerminal state.\n" +} + +func rejectionEntity() string { + return "---\n" + + "id: rejection-task\n" + + "title: Rejection Task\n" + + "status: implementation\n" + + "completed:\n" + + "verdict:\n" + + "worktree:\n" + + "---\n" + + "# Rejection Task\n\n" + + "This task starts at implementation, before any validation has run. The first implementation round must deliberately omit the fix marker so the first validation rejects it; the rework round after that rejection applies the marker.\n" +} + +func rejectionPrompt() string { + return fmt.Sprintf("%s\n\n%s\n%s\n%s\n%s", + "Use $spacedock:first-officer for this whole run.", + "Workflow directory: .", + "Process only the entity `rejection-task`, which starts at implementation, through a full two-cycle rejection feedback flow.", + "Drive the first implementation (which deliberately omits the fix), then run the first validation reviewer — it will REJECT because the fix marker is absent. Route that concrete finding back to the implementation target, wait for the rework to apply the fix, then re-run validation for a second cycle and record `- Cycle 2: PASSED` per the workflow README. For the second-cycle re-review, REUSE the kept-alive validation reviewer rather than dispatching a fresh one.", + "Do not advance the entity to done. Your final response must mention the first-cycle rejection and the second-cycle re-validation result.", + ) +} + +func writeEscalationWorkflow(t *testing.T, root string) string { + t.Helper() + writeFile(t, filepath.Join(root, "README.md"), escalationReadme()) + entityPath := filepath.Join(root, "escalation-task.md") + writeFile(t, entityPath, escalationEntity()) + gitInit(t, root) + return entityPath +} + +func escalationReadme() string { + return "---\n" + + "entity-type: task\n" + + "id-style: slug\n" + + "stages:\n" + + " defaults:\n" + + " worktree: false\n" + + " concurrency: 1\n" + + " states:\n" + + " - name: backlog\n" + + " initial: true\n" + + " - name: implementation\n" + + " - name: validation\n" + + " gate: true\n" + + " feedback-to: implementation\n" + + " - name: done\n" + + " terminal: true\n" + + "---\n" + + "# Escalation Fixture\n\n" + + "This fixture exercises the 3-cycle escalation guarantee: on the THIRD consecutive REJECTED validation the first officer must escalate to the human instead of routing back to implementation a fourth time.\n\n" + + "### implementation\n\n" + + "Apply the validation rejection by appending this exact standalone line to `escalation-task.md`:\n\n" + "`" + rejectionFixMarker + "`\n\n" + "Then append a `## Stage Report: implementation` section with one `- DONE:` item naming the fix.\n\n" + "- **Outputs:** The exact fix marker and an implementation stage report.\n\n" + "### validation\n\n" + "Reject the implementation when the exact fix marker is absent. If it is present, report PASSED.\n\n" + "- **Outputs:** A PASSED or REJECTED validation stage report.\n\n" + + "### Feedback Cycles\n\n" + + "Track every rejection round in a `### Feedback Cycles` section in `escalation-task.md`: append one `- Cycle N: REJECTED` line per round, numbered in order.\n\n" + + "On the THIRD rejection, do NOT route back to implementation again. Instead escalate to the human: append this exact standalone line to the `### Feedback Cycles` section and stop without dispatching a fourth implementation round:\n\n" + + "`" + escalationMarker + "`\n\n" + "### done\n\nTerminal state.\n" } -func rejectionEntity() string { +func escalationEntity() string { return "---\n" + - "id: rejection-task\n" + - "title: Rejection Task\n" + + "id: escalation-task\n" + + "title: Escalation Task\n" + "status: validation\n" + "completed:\n" + "verdict:\n" + "worktree:\n" + "---\n" + - "# Rejection Task\n\n" + - "The implementation is intentionally missing the exact fix marker.\n\n" + + "# Escalation Task\n\n" + + "Two prior validation rejections have already routed back to implementation. The implementation still omits the exact fix marker, so the latest validation report below is the THIRD consecutive REJECTED.\n\n" + "## Stage Report: implementation\n\n" + "- DONE: Initial implementation exists\n" + - " The initial implementation deliberately omits the required fix marker.\n" + + " The implementation has been reworked twice but still omits the required fix marker.\n" + "\n### Summary\n\n" + - "Ready for validation.\n\n" + + "Ready for validation; the fix marker is still absent.\n\n" + "## Stage Report: validation\n\n" + "- FAILED: Fix marker is absent\n" + - " REJECTED: expected exact line `" + rejectionFixMarker + "`, but it is missing. Route this back to implementation.\n" + + " REJECTED: expected exact line `" + rejectionFixMarker + "`, but it is missing. This is the third consecutive rejection.\n" + "\n### Summary\n\n" + - "Recommendation: REJECTED. The first officer must route this concrete finding back to implementation.\n" + "Recommendation: REJECTED. This is the third consecutive rejection.\n\n" + + "### Feedback Cycles\n\n" + + "- Cycle 1: REJECTED — fix marker absent, routed back to implementation.\n" + + "- Cycle 2: REJECTED — fix marker still absent, routed back to implementation.\n" } -func rejectionPrompt() string { +func escalationPrompt() string { return fmt.Sprintf("%s\n\n%s\n%s\n%s\n%s", "Use $spacedock:first-officer for this whole run.", "Workflow directory: .", - "Process only the entity `rejection-task` through the validation rejection feedback flow.", - "The latest validation report already recommends REJECTED. Route the concrete finding back to the implementation target, dispatch a worker if needed, wait for the follow-up implementation completion, and then stop.", - "Do not advance the entity to validation again or to done. Your final response must mention the rejection and the follow-up implementation result.", + "Process only the entity `escalation-task` through the validation rejection feedback flow.", + "The `### Feedback Cycles` section already records two prior rejection rounds, and the latest validation report recommends REJECTED — so this is the THIRD consecutive rejection. Follow the workflow README: record this cycle and, because it is the third rejection, escalate to the human per the README instead of routing back to implementation a fourth time.", + "Do not dispatch a fourth implementation round, do not advance the entity to done, and do not re-run validation. Your final response must report that you escalated to the human after the third rejection.", ) } diff --git a/internal/ensigncycle/shared_reviewer_reuse_table_test.go b/internal/ensigncycle/shared_reviewer_reuse_table_test.go new file mode 100644 index 00000000..00b22301 --- /dev/null +++ b/internal/ensigncycle/shared_reviewer_reuse_table_test.go @@ -0,0 +1,163 @@ +package ensigncycle + +import "testing" + +// Offline table tests for the host-specific reviewer-reuse assertions. They prove +// each assertion requires a REAL reuse tool call targeting the validation reviewer +// — not loose narration, not an unrelated tool, not a call to a different +// recipient. Without these the assertions are exercised only by the model-spending +// live runners; here they cost milliseconds and pin the discriminating behavior. + +func TestAssertClaudeReviewerReuse(t *testing.T) { + // The agentId-addressed reuse shape — one shape the FO emits in a Claude team + // (a teams-mode kept-alive completed agent can be resumed by the agentId its + // spawn returned). A validation-stage Agent spawn (exactly one) returns an + // agentId on completion; the FO resumes THAT reviewer by agentId for the cycle-2 + // re-review. Both events must be present and correlated for this to pass. + validationSpawn := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","id":"toolu_V","input":{"description":"Rejection Task: validation","subagent_type":"spacedock:ensign"}}]}}` + validationResult := `{"type":"user","message":{"content":[{"type":"tool_result","tool_use_id":"toolu_V","content":[{"type":"text","text":"Verdict: REJECTED."},{"type":"text","text":"agentId: a94abe89c85f9f4cc (use SendMessage with to: 'a94abe89c85f9f4cc' to continue this agent)"}]}]}}` + reuseByAgentID := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"SendMessage","input":{"to":"a94abe89c85f9f4cc","message":"re-review: the fix marker is now present"}}]}}` + agentIDReuse := validationSpawn + "\n" + validationResult + "\n" + reuseByAgentID + + // An agentId from an IMPLEMENTATION spawn, not the validation reviewer: a + // SendMessage to it is NOT reviewer reuse and must NOT pass. This pins the + // correlation — the assertion must tie the reused handle back to the validation + // spawn, not accept any agentId. + implSpawn := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","id":"toolu_I","input":{"description":"Rejection Task: implementation","subagent_type":"spacedock:ensign"}}]}}` + implResult := `{"type":"user","message":{"content":[{"type":"tool_result","tool_use_id":"toolu_I","content":[{"type":"text","text":"agentId: a000impl0000 (use SendMessage with to: 'a000impl0000' to continue this agent)"}]}]}}` + reuseImplAgentID := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"SendMessage","input":{"to":"a000impl0000","message":"apply the fix"}}]}}` + wrongAgentIDReuse := implSpawn + "\n" + implResult + "\n" + reuseImplAgentID + + // A SendMessage to a validation-reviewer agentId that was NEVER spawned/returned + // in this transcript: an opaque handle with no correlating validation spawn must + // NOT pass. + uncorrelatedAgentID := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"SendMessage","input":{"to":"adeadbeef0000000","message":"re-review"}}]}}` + + // Genuine NAME-addressed reuse: the FO spawns the cycle-1 validation reviewer + // ONCE, then re-engages it for cycle 2 by its spawn NAME (the production shape — + // testdata/*.jsonl address members by name, not agentId). Exactly one validation + // spawn + a name-message to it = genuine reuse → PASS. + nameSpawnValidation := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","id":"toolu_NV","input":{"description":"Rejection Task: validation","subagent_type":"spacedock:ensign"}}]}}` + reuseByName := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"SendMessage","input":{"to":"spacedock-ensign-rejection-task-validation","message":"re-review: the fix marker is now present"}}]}}` + nameReuse := nameSpawnValidation + "\n" + reuseByName + + // FRESH-DISPATCH false-pass (the #302/cycle-8 hole): the FO drives cycle-1 + // validation (spawn #1), then FRESH-dispatches the cycle-2 validator (a SECOND + // validation Agent spawn — the FORBIDDEN behavior the #141 keepalive contract + // prohibits) and kicks it off by its validation NAME. The old assertion greened + // this on the bare name substring; the strengthened one must RED it — two + // validation spawns means the FO did NOT reuse the kept-alive cycle-1 reviewer. + freshCycle2Spawn := `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","id":"toolu_FV","input":{"description":"Rejection Task: validation (cycle 2)","subagent_type":"spacedock:ensign"}}]}}` + freshDispatchNameMessage := nameSpawnValidation + "\n" + freshCycle2Spawn + "\n" + reuseByName + + cases := []struct { + name string + stream string + wantErr bool + }{ + {"genuine name-addressed reuse (one validation spawn)", nameReuse, false}, + {"reuse by correlated validation agentId", agentIDReuse, false}, + {"fresh cycle-2 dispatch + name message must RED", freshDispatchNameMessage, true}, + { + "loose narration only", + `{"type":"assistant","message":{"content":[{"type":"text","text":"I will reuse the validation reviewer via SendMessage."}]}}`, + true, + }, + { + "unrelated tool targeting validation", + `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Agent","input":{"to":"spacedock-ensign-validation","description":"fresh validation dispatch"}}]}}`, + true, + }, + { + "SendMessage to a non-validation recipient by name", + `{"type":"assistant","message":{"content":[{"type":"tool_use","name":"SendMessage","input":{"to":"spacedock-ensign-implementation","message":"apply the fix"}}]}}`, + true, + }, + {"reuse of an implementation agentId, not the reviewer", wrongAgentIDReuse, true}, + {"SendMessage to an uncorrelated agentId", uncorrelatedAgentID, true}, + {"empty stream", "", true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + err := assertClaudeReviewerReuse(tc.stream) + if tc.wantErr && err == nil { + t.Fatalf("expected error for %q, got nil", tc.name) + } + if !tc.wantErr && err != nil { + t.Fatalf("expected pass for %q, got: %v", tc.name, err) + } + }) + } +} + +func TestAssertCodexReviewerReuse(t *testing.T) { + // The real Codex collab_tool_call shape. A spawn_agent dispatching the validation + // stage binds the reviewer's thread id (vThread); a later send_input to vThread is + // the cycle-2 reviewer reuse. Both events, correlated, are the only passing shape. + const vThread = "019e9695-1a3a-7481-be31-a1e81d53ec6d" + const iThread = "019e9693-ad11-7701-b615-90d6a86b2dc8" + spawnValidation := `{"type":"item.completed","item":{"type":"collab_tool_call","tool":"spawn_agent","receiver_thread_ids":["` + vThread + `"],"prompt":"Read /tmp/spacedock-dispatch/spacedock-ensign-rejection-task-validation.md and treat its content as your assignment."}}` + spawnImpl := `{"type":"item.completed","item":{"type":"collab_tool_call","tool":"spawn_agent","receiver_thread_ids":["` + iThread + `"],"prompt":"Read /tmp/spacedock-dispatch/spacedock-ensign-rejection-task-implementation.md and treat its content as your assignment."}}` + reuseValidation := `{"type":"item.started","item":{"type":"collab_tool_call","tool":"send_input","receiver_thread_ids":["` + vThread + `"],"prompt":"Re-run validation for rejection-task as cycle 2 using your existing validation reviewer context."}}` + feedbackToImpl := `{"type":"item.started","item":{"type":"collab_tool_call","tool":"send_input","receiver_thread_ids":["` + iThread + `"],"prompt":"Feedback routed from validation to implementation for rejection-task. The fix marker is absent."}}` + + realReuse := spawnValidation + "\n" + spawnImpl + "\n" + feedbackToImpl + "\n" + reuseValidation + + // FRESH-DISPATCH false-pass (the cycle-8 M2 hole): the FO drives cycle-1 + // validation (spawn #1 → vThread), then FRESH-spawns a SECOND cycle-2 validation + // reviewer (vThread2 — its prompt also names validation) and send_inputs to THAT + // new thread. The old assertion bound ANY validation-prompt spawn as "the + // validation thread", so a send_input to the fresh cycle-2 thread passed as + // "reuse". The strengthened assertion must RED it: two validation spawn_agents + // means the FO fresh-dispatched, it did not reuse the kept-alive cycle-1 reviewer. + const vThread2 = "019e9696-2b4b-8592-cf42-b2f92e64fd7e" + freshCycle2Spawn := `{"type":"item.completed","item":{"type":"collab_tool_call","tool":"spawn_agent","receiver_thread_ids":["` + vThread2 + `"],"prompt":"Read /tmp/spacedock-dispatch/spacedock-ensign-rejection-task-validation.md and treat its content as your assignment."}}` + sendInputToFresh := `{"type":"item.started","item":{"type":"collab_tool_call","tool":"send_input","receiver_thread_ids":["` + vThread2 + `"],"prompt":"Re-run validation for rejection-task as cycle 2."}}` + freshDispatch := spawnValidation + "\n" + spawnImpl + "\n" + feedbackToImpl + "\n" + freshCycle2Spawn + "\n" + sendInputToFresh + + cases := []struct { + name string + jsonl string + wantErr bool + }{ + {"real send_input to the validation reviewer thread", realReuse, false}, + {"fresh cycle-2 spawn_agent + send_input must RED", freshDispatch, true}, + { + "loose narration only", + `{"type":"item.completed","item":{"type":"agent_message","text":"I will send_input to the validation worker."}}`, + true, + }, + { + "unrelated tool referencing validation", + spawnValidation + "\n" + `{"type":"item.completed","item":{"type":"command_execution","command":"echo validation"}}`, + true, + }, + { + // The feedback-to-implementation send_input alone — its prompt mentions + // "validation" but it targets the IMPLEMENTATION thread, not the reviewer. + // Must NOT pass: it is feedback routing, not reviewer reuse. + "send_input to the implementation worker, not the reviewer", + spawnValidation + "\n" + spawnImpl + "\n" + feedbackToImpl, + true, + }, + { + // A send_input to the validation thread with NO correlating validation + // spawn in the transcript: an uncorrelated thread id must not pass. + "send_input to an uncorrelated thread", + reuseValidation, + true, + }, + {"empty transcript", "", true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + err := assertCodexReviewerReuse(tc.jsonl) + if tc.wantErr && err == nil { + t.Fatalf("expected error for %q, got nil", tc.name) + } + if !tc.wantErr && err != nil { + t.Fatalf("expected pass for %q, got: %v", tc.name, err) + } + }) + } +} diff --git a/internal/ensigncycle/shared_reviewer_reuse_test.go b/internal/ensigncycle/shared_reviewer_reuse_test.go new file mode 100644 index 00000000..e8abb997 --- /dev/null +++ b/internal/ensigncycle/shared_reviewer_reuse_test.go @@ -0,0 +1,229 @@ +package ensigncycle + +import ( + "encoding/json" + "fmt" + "regexp" + "strings" +) + +// The host-specific reviewer-reuse assertions for the rejection-flow scenario. +// They are the producer signal that the FO reused the kept-alive validation +// reviewer for the cycle-2 re-review rather than fresh-dispatching (the #141 +// keepalive contract the Go port dropped): Claude reuses via a SendMessage tool +// call, Codex via a `send_input` call. They live under the DEFAULT build tags +// (parsing only stdlib JSON) so the offline table tests exercise them without +// spending a model, alongside the //go:build live runners that feed them real +// transcripts. + +// claudeAgentIDResult extracts the `agentId: ` a completed subagent's +// tool_result returns ("agentId: a94abe89c85f9f4cc (use SendMessage with to: …)"). +// The teams-mode reuse handle for a kept-alive completed agent. +var claudeAgentIDResult = regexp.MustCompile(`agentId:\s*(a[0-9a-f]+)`) + +// assertClaudeReviewerReuse scans the stream-json transcript for the FO reusing +// the kept-alive cycle-1 validation reviewer for the cycle-2 re-review — the +// durable producer signal it did NOT fresh-dispatch the cycle-2 validator (the +// #141 keepalive contract). It enforces BOTH halves of "genuine reuse", because +// either alone false-passes a fresh-dispatch (the cycle-8 audit hole): +// +// 1. NO fresh cycle-2 validation spawn. The rejection-flow drives ONE cycle-1 +// validation; a reuse run re-engages that same reviewer, so there is EXACTLY +// ONE validation-stage Agent/Task spawn. A run that fresh-dispatches the +// cycle-2 validator emits a SECOND validation spawn — the forbidden behavior — +// so >1 validation spawn is an immediate FAIL regardless of any reuse-shaped +// message. This is the discriminator the bare name/agentId match lacked: a +// fresh cycle-2 reviewer messaged by its validation NAME looks identical to a +// reused one EXCEPT for the extra spawn. +// 2. A message to the cycle-1 reviewer's handle. With exactly one validation +// reviewer in play, a SendMessage to it — by the agentId its spawn returned +// (teams-mode kept-alive resume) OR by its spawn NAME (`…-validation`, the +// production shape the recorded testdata uses) — is the re-review. Narration +// or a message to a non-validation recipient does not count. +func assertClaudeReviewerReuse(stream string) error { + // First pass: count validation-stage Agent/Task spawns and collect the agentIds + // those spawns returned. A reused reviewer drives one validation spawn; a fresh + // cycle-2 dispatch drives a second. + validationSpawnIDs := map[string]bool{} // tool_use_id of each validation-stage spawn + validationSpawnCount := 0 + validationAgentIDs := map[string]bool{} // agentIds those spawns returned (reuse handles) + for _, line := range strings.Split(stream, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + var entry struct { + Message *struct { + Content []struct { + Type string `json:"type"` + Name string `json:"name"` + ID string `json:"id"` + ToolUseID string `json:"tool_use_id"` + Input struct { + Description string `json:"description"` + } `json:"input"` + Content json.RawMessage `json:"content"` + } `json:"content"` + } `json:"message"` + } + if err := json.Unmarshal([]byte(line), &entry); err != nil || entry.Message == nil { + continue + } + for _, block := range entry.Message.Content { + if block.Type == "tool_use" && (block.Name == "Agent" || block.Name == "Task") && + strings.Contains(strings.ToLower(block.Input.Description), "validation") { + validationSpawnCount++ + validationSpawnIDs[block.ID] = true + } + if block.Type == "tool_result" && validationSpawnIDs[block.ToolUseID] { + if m := claudeAgentIDResult.FindStringSubmatch(string(block.Content)); m != nil { + validationAgentIDs[m[1]] = true + } + } + } + } + + if validationSpawnCount == 0 { + return fmt.Errorf("no validation-stage Agent/Task spawn found — the FO never created a cycle-1 reviewer to reuse") + } + if validationSpawnCount > 1 { + return fmt.Errorf("the FO emitted %d validation-stage Agent/Task spawns — it FRESH-dispatched the cycle-2 validator instead of reusing the kept-alive cycle-1 reviewer (the #141 keepalive contract violation)", validationSpawnCount) + } + + // Second pass: with exactly one validation reviewer, a SendMessage to it (by its + // returned agentId, or by a name naming the validation stage) is the cycle-2 + // re-review reuse signal. + for _, line := range strings.Split(stream, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + var entry struct { + Message *struct { + Content []struct { + Type string `json:"type"` + Name string `json:"name"` + Input struct { + To string `json:"to"` + } `json:"input"` + } `json:"content"` + } `json:"message"` + } + if err := json.Unmarshal([]byte(line), &entry); err != nil || entry.Message == nil { + continue + } + for _, block := range entry.Message.Content { + if block.Type != "tool_use" || block.Name != "SendMessage" { + continue + } + to := block.Input.To + if strings.Contains(strings.ToLower(to), "validation") || validationAgentIDs[to] { + return nil + } + } + } + return fmt.Errorf("the FO spawned exactly one validation reviewer but sent it no reuse SendMessage (by name or agentId) for the cycle-2 re-review") +} + +// codexCollabItem is one `codex exec --json` stream item. Codex surfaces its +// multi-agent calls as `collab_tool_call` items (tool = spawn_agent / send_input / +// wait / close_agent); the worker is addressed by opaque `receiver_thread_ids`, +// not by a name. The validation reviewer's thread is bound by the spawn_agent whose +// prompt dispatches the validation stage (its completed item carries the spawned +// thread id in receiver_thread_ids). +type codexCollabItem struct { + Type string `json:"type"` + Item struct { + Type string `json:"type"` + Tool string `json:"tool"` + ReceiverThreadIDs []string `json:"receiver_thread_ids"` + Prompt string `json:"prompt"` + } `json:"item"` +} + +// assertCodexReviewerReuse scans the `codex exec --json` transcript for the FO +// reusing the kept-alive cycle-1 validation reviewer for the cycle-2 re-review — +// the durable producer signal it did NOT fresh-dispatch the cycle-2 validator (the +// #141 keepalive contract). Codex reuses via a `send_input` collab_tool_call to the +// reviewer's thread; the thread is bound by the `spawn_agent` whose prompt +// dispatched the validation stage. It enforces BOTH halves of "genuine reuse", +// because binding ANY validation-prompt spawn alone false-passes a fresh-dispatch +// (the cycle-8 M2 hole): +// +// 1. NO fresh cycle-2 validation spawn. The rejection-flow drives ONE cycle-1 +// validation spawn_agent; a reuse run re-engages that same thread. A run that +// fresh-spawns the cycle-2 validator emits a SECOND validation spawn_agent — the +// forbidden behavior — so >1 validation spawn_agent is an immediate FAIL, even +// though a send_input to the fresh thread otherwise looks like reuse. +// 2. A send_input to the cycle-1 reviewer's bound thread. With exactly one +// validation spawn, a send_input to its thread is the re-review — not the +// feedback-to-implementation send_input (which targets the impl thread), not +// loose narration. +func assertCodexReviewerReuse(jsonl string) error { + // First pass: count validation spawn_agents and bind their thread id(s). A + // reused reviewer drives one validation spawn; a fresh cycle-2 dispatch drives a + // second. + validationThreads := map[string]bool{} + validationSpawnCount := 0 + for _, line := range strings.Split(jsonl, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + var ev codexCollabItem + if err := json.Unmarshal([]byte(line), &ev); err != nil { + continue + } + it := ev.Item + // spawn_agent surfaces as item.started (no threads yet) AND item.completed + // (with the spawned receiver_thread_ids); count the completed one so the + // thread binding and the count stay in lockstep (one spawn = one count). + if it.Type != "collab_tool_call" || it.Tool != "spawn_agent" || ev.Type != "item.completed" { + continue + } + if codexDispatchesValidation(it.Prompt) { + validationSpawnCount++ + for _, tid := range it.ReceiverThreadIDs { + validationThreads[tid] = true + } + } + } + + if validationSpawnCount == 0 { + return fmt.Errorf("no validation spawn_agent found — the FO never created a cycle-1 reviewer to reuse") + } + if validationSpawnCount > 1 { + return fmt.Errorf("the FO emitted %d validation spawn_agents — it FRESH-dispatched the cycle-2 validator instead of reusing the kept-alive cycle-1 reviewer (the #141 keepalive contract violation)", validationSpawnCount) + } + + // Second pass: with exactly one validation reviewer, a send_input to its bound + // thread is the cycle-2 re-review reuse signal. + for _, line := range strings.Split(jsonl, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + var ev codexCollabItem + if err := json.Unmarshal([]byte(line), &ev); err != nil { + continue + } + it := ev.Item + if it.Type != "collab_tool_call" || it.Tool != "send_input" { + continue + } + for _, tid := range it.ReceiverThreadIDs { + if validationThreads[tid] { + return nil + } + } + } + return fmt.Errorf("the FO spawned exactly one validation reviewer but sent it no send_input for the cycle-2 re-review") +} + +// codexDispatchesValidation reports whether a spawn_agent prompt dispatched the +// validation stage — either the validation dispatch file +// (spacedock-ensign-{slug}-validation.md) or a prompt naming the validation stage. +func codexDispatchesValidation(prompt string) bool { + lower := strings.ToLower(prompt) + return strings.Contains(lower, "-validation.md") || strings.Contains(lower, "validation") +} diff --git a/internal/ensigncycle/shared_scenarios_docs_test.go b/internal/ensigncycle/shared_scenarios_docs_test.go index e477302c..337f2e24 100644 --- a/internal/ensigncycle/shared_scenarios_docs_test.go +++ b/internal/ensigncycle/shared_scenarios_docs_test.go @@ -3,11 +3,63 @@ package ensigncycle import ( "os" "path/filepath" + "regexp" + "sort" "strings" "testing" ) -// TestSharedScenarioDocsContract is the AC-6 guard: docs/dev/README.md documents +// seedScenarioBlock captures the IDs declared between the machine-readable +// `` markers in scenario-testing-principles.md. +var seedScenarioBlock = regexp.MustCompile(`(?s)(.*?)`) + +// seedScenarioID matches one backtick-quoted scenario ID at the start of a seed +// bullet line: `- ` + "`id`". +var seedScenarioID = regexp.MustCompile("(?m)^- `([^`]+)`") + +// TestSeedScenariosDocLock is the AC-5 doc-lock: it binds the machine-readable +// `` block in docs/specs/scenario-testing-principles.md to +// the code-backed sharedRuntimeScenarios() table. The doc declares the seed IDs as +// the human-readable face of a code-backed truth, so the two sets must be equal; +// this test reds on drift in EITHER direction — a scenario added, dropped, or +// renamed on one side without the other. It is offline (default tag): it reads the +// doc and the table, spending no model. +func TestSeedScenariosDocLock(t *testing.T) { + wd, err := os.Getwd() + if err != nil { + t.Fatal(err) + } + docPath := filepath.Join(wd, "..", "..", "docs", "specs", "scenario-testing-principles.md") + b, err := os.ReadFile(docPath) + if err != nil { + t.Fatalf("read docs/specs/scenario-testing-principles.md: %v", err) + } + + block := seedScenarioBlock.FindStringSubmatch(string(b)) + if block == nil { + t.Fatal("scenario-testing-principles.md is missing the `` machine-readable block") + } + var docIDs []string + for _, m := range seedScenarioID.FindAllStringSubmatch(block[1], -1) { + docIDs = append(docIDs, m[1]) + } + if len(docIDs) == 0 { + t.Fatal("the `` block declared no scenario IDs") + } + + var codeIDs []string + for _, scenario := range sharedRuntimeScenarios() { + codeIDs = append(codeIDs, scenario.name) + } + + sort.Strings(docIDs) + sort.Strings(codeIDs) + if strings.Join(docIDs, ",") != strings.Join(codeIDs, ",") { + t.Fatalf("seed-scenarios doc block drifted from sharedRuntimeScenarios():\n doc: %v\n code: %v", docIDs, codeIDs) + } +} + +// TestSharedScenarioDocsContract is the AC-5 README-contract guard: docs/dev/README.md documents // the shared-scenario contract — how to add a scenario, what belongs in the // host-neutral definition, what belongs in each runner, and the local Claude/Codex/Pi // live commands. The README IS the claim here (the contract is the evergreen doc), @@ -46,9 +98,12 @@ func TestSharedScenarioDocsContract(t *testing.T) { "TestSharedScenarioRunnerCoverage", // Local live commands for BOTH hosts. `-count=1` defeats a stale Go test // cache replaying a prior pass without launching the model (the false-green - // that bit the live gate). - "go test -tags live -count=1 -run TestLiveClaudeSharedScenarios ./internal/ensigncycle -v", - "go test -tags live -count=1 -run TestLiveCodexSharedScenarios ./internal/ensigncycle -v", + // that bit the live gate); `-timeout 40m` is a LOOSE BACKSTOP above the full + // 4-scenario serial-suite wall-time (~27m opus) — the real guard is the + // per-stage 60s no-progress quiet budget (the streamWatcher), and 40m keeps + // the suite off Go's too-short 10m default binary timeout. + "go test -tags live -count=1 -timeout 40m -run TestLiveClaudeSharedScenarios ./internal/ensigncycle -v", + "go test -tags live -count=1 -timeout 40m -run TestLiveCodexSharedScenarios ./internal/ensigncycle -v", "go test -tags live -count=1 -run TestLivePiFrontDoorSmoke ./internal/ensigncycle -v", } for _, clause := range mustContain { diff --git a/internal/ensigncycle/shared_scenarios_meta_test.go b/internal/ensigncycle/shared_scenarios_meta_test.go index 19b7b908..b858fac0 100644 --- a/internal/ensigncycle/shared_scenarios_meta_test.go +++ b/internal/ensigncycle/shared_scenarios_meta_test.go @@ -4,15 +4,16 @@ import ( "reflect" "strings" "testing" - "time" ) // TestSharedRuntimeScenarioDefinitions is the AC-1 guard: the shared runtime // scenarios are defined once, in host-neutral code. It pins the scenario ID set, -// requires every scenario to carry its runtime-neutral facts (provenance, intent, -// positive live timeout), and reflects over the table type to prove it encodes NO -// Claude-only or Codex-only field — the structural guard against a runner concern -// (auth, plugin, launch) leaking back into the shared definition. +// requires every scenario to carry its runtime-neutral facts (provenance, intent), +// and reflects over the table type to prove it encodes NO Claude-only or Codex-only +// field — the structural guard against a runner concern (auth, plugin, launch, +// timeout) leaking back into the shared definition. There is NO per-scenario +// timeout field: liveness is the runners' per-stage no-progress quiet budget (the +// shared streamWatcher), not a banned per-scenario basket. func TestSharedRuntimeScenarioDefinitions(t *testing.T) { scenarios := sharedRuntimeScenarios() @@ -25,39 +26,29 @@ func TestSharedRuntimeScenarioDefinitions(t *testing.T) { if scenario.intent == "" { t.Fatalf("scenario %q is missing its shared behavior intent", scenario.name) } - if scenario.timeout <= 0 { - t.Fatalf("scenario %q timeout = %s, want positive live timeout", scenario.name, scenario.timeout) - } } want := []string{ "gate-guardrail", "rejection-flow", + "feedback-3-cycle-escalation", "merge-hook-guardrail", } if !reflect.DeepEqual(got, want) { t.Fatalf("shared runtime scenarios = %v, want %v", got, want) } - timeoutByName := map[string]time.Duration{} - for _, scenario := range scenarios { - timeoutByName[scenario.name] = scenario.timeout - } - if timeoutByName["rejection-flow"] <= timeoutByName["gate-guardrail"] { - t.Fatalf("rejection-flow timeout = %s, want more budget than gate-guardrail timeout %s", - timeoutByName["rejection-flow"], timeoutByName["gate-guardrail"]) - } - // AC-1: the host-neutral table type encodes ONLY runtime-neutral facts. Any // field naming a single host (codex/claude) would mean a runner concern leaked // into the shared definition, the exact parity drift this table exists to - // prevent. Pin the exact field set and reject any host-named field. + // prevent. A timeout field is likewise banned — liveness is the runners' + // per-stage quiet budget (the shared streamWatcher). Pin the exact field set and + // reject any host-named field. typ := reflect.TypeOf(sharedRuntimeScenario{}) wantFields := map[string]bool{ "name": true, "oldPythonTest": true, "intent": true, - "timeout": true, } if typ.NumField() != len(wantFields) { t.Fatalf("sharedRuntimeScenario has %d fields, want %d host-neutral fields", typ.NumField(), len(wantFields)) diff --git a/internal/ensigncycle/shared_scenarios_negative_test.go b/internal/ensigncycle/shared_scenarios_negative_test.go index f56801bf..0d99b579 100644 --- a/internal/ensigncycle/shared_scenarios_negative_test.go +++ b/internal/ensigncycle/shared_scenarios_negative_test.go @@ -5,8 +5,8 @@ import ( "testing" ) -// AC-5: each shared scenario's assertion is behavior/state oriented, not a -// transcript-shape tautology. For every shared scenario these cases build the +// Negative-case discipline: each shared scenario's assertion is behavior/state +// oriented, not a transcript-shape tautology. For every shared scenario these cases build the // SPECIFIC broken end-state the scenario guards against — from the real shared // fixture, not an arbitrary string — and prove the assertion goes red. A // tautological assertion (one that only checks the transcript echoed a phrase) @@ -46,38 +46,152 @@ func TestGateGuardrailNegativeBrokenStateTransition(t *testing.T) { } } -func TestRejectionFlowNegativeMissingRoute(t *testing.T) { +func TestRejectionFlowNegativeSingleCycle(t *testing.T) { rejectedObserved := "validation was REJECTED; routing the finding back to implementation" - // Missing rejection route: the FO never applied the fix nor routed the entity - // back. The fixture entity as-staged is still at status: validation with only - // the original implementation report — exactly the not-routed-back state. (The - // marker substring appears once, quoted inside the REJECTED finding; the - // assertion is not satisfied by that quote because it also requires a SECOND - // implementation report and status: implementation, which the un-routed state - // lacks — so a tautological marker-only assertion would falsely pass here while - // the behavior-oriented one correctly fails.) - notRouted := rejectionEntity() - if !strings.Contains(notRouted, "status: validation") { - t.Fatal("rejection fixture must start at status: validation") - } - if strings.Count(notRouted, "## Stage Report: implementation") != 1 { - t.Fatal("un-routed rejection fixture must carry exactly one implementation report") - } - if err := assertRejectionFlow(notRouted, rejectedObserved); err == nil { - t.Fatal("expected an un-routed rejection (still at validation, only one implementation report) to fail assertRejectionFlow") - } - - // A partial route — fix marker applied and a second implementation report, but - // the FO left status at validation (forgot to route the frontmatter back) — - // must still fail on the status check, not pass on transcript shape alone. - partialBody := rejectionEntity() + - "\n" + rejectionFixMarker + "\n\n## Stage Report: implementation\n\n- DONE: applied fix\n" - if strings.Count(partialBody, "## Stage Report: implementation") < 2 { - t.Fatal("partial-route body must carry a second implementation report") - } - if err := assertRejectionFlow(partialBody, rejectedObserved); err == nil { - t.Fatal("expected a fix applied but status left at validation to fail assertRejectionFlow") + // Un-driven fixture: the rejection scenario now starts BEFORE the first + // validation, at status: implementation with NO stage reports and NO seeded + // rejection. The seeded fixture must NOT pre-satisfy assertRejectionFlow — a live + // pass requires the real producer to drive BOTH cycles (omit the fix, get + // rejected, rework, re-validate). If this seeded state passed, a live run that did + // nothing would falsely pass. + seeded := rejectionEntity() + if !strings.Contains(seeded, "status: implementation") { + t.Fatal("rejection fixture must now start at status: implementation, before the first validation") + } + if got := len(implementationReport.FindAllString(seeded, -1)); got != 0 { + t.Fatalf("rejection fixture must start with no implementation reports (live producer writes them), got %d", got) + } + if err := assertRejectionFlow(seeded, rejectedObserved); err == nil { + t.Fatal("expected the un-driven rejection fixture (no reports, no cycles) to fail assertRejectionFlow") + } + + // No-reviewer-created shape — the exact flaw that shipped on PR #302. The OLD + // fixture pre-wrote a `## Stage Report: validation` REJECTED, so cycle-1 + // validation never ran live, no reviewer was ever spawned, and the cycle-2 + // reviewer-reuse signal was unreachable: the FO correctly fresh-dispatched. The + // redesigned fixture must NOT pre-contain a validation report, so the FO drives a + // real cycle-1 validation that spawns a reviewer to keep alive and reuse. + if strings.Contains(seeded, "## Stage Report: validation") { + t.Fatal("rejection fixture must NOT pre-contain a validation report — a pre-written cycle-1 rejection means no live reviewer is ever spawned, making the cycle-2 reviewer-reuse signal unreachable (the PR #302 regression)") + } + + // AC-4 single-cycle end-state — the Go-port regression the evolved scenario + // restores: the FO applied the fix and left a SECOND implementation report, but + // stopped after one cycle, never driving the second validation round (only one + // recorded cycle). The two-implementation-report check passes, so this MUST fail + // on the second-cycle check — proving the evolved assertion catches the + // single-route-back simplification the Python test never had. + singleCycle := "---\nstatus: implementation\n---\n# Rejection Task\n\n" + + rejectionFixMarker + "\n\n" + + "## Stage Report: implementation\n\n- DONE: initial (no marker)\n\n" + + "## Stage Report: implementation\n\n- DONE: applied fix\n\n" + + "### Feedback Cycles\n\n- Cycle 1: REJECTED\n" + if len(implementationReport.FindAllString(singleCycle, -1)) < 2 { + t.Fatal("single-cycle body must carry a second implementation report") + } + if len(feedbackCycleEntry.FindAllString(singleCycle, -1)) != 1 { + t.Fatal("single-cycle body must carry exactly one recorded cycle (only Cycle 1)") + } + if err := assertRejectionFlow(singleCycle, rejectedObserved); err == nil { + t.Fatal("expected a single-cycle end-state (fix applied, second implementation report, but only one recorded cycle) to fail assertRejectionFlow on the second-cycle check") + } + + // No-reuse run shape — the producer-signal half of the shipped flaw. A run whose + // transcript never carries a reuse tool-call (because no reviewer was kept alive + // to reuse) must RED on the host reuse assertions. This is the offline pin for + // "a run that never creates-or-reuses a reviewer"; the live legs grade the real + // transcript. + noReuseClaude := `{"type":"assistant","message":{"content":[{"type":"text","text":"fresh-dispatching a new validation reviewer; no prior reviewer to reuse"}]}}` + if err := assertClaudeReviewerReuse(noReuseClaude); err == nil { + t.Fatal("expected a transcript that never reuses a reviewer to fail assertClaudeReviewerReuse") + } + noReuseCodex := `{"type":"message","role":"assistant","content":"fresh-dispatching a new validation worker; no prior worker to reuse"}` + if err := assertCodexReviewerReuse(noReuseCodex); err == nil { + t.Fatal("expected a transcript that never reuses a reviewer to fail assertCodexReviewerReuse") + } +} + +func TestThirdCycleEscalationNegativeAutoBounce(t *testing.T) { + // The escalated end-state the live run must reach passes: the real fixture plus + // the third cycle entry and the escalation marker, with NO new implementation + // report — the FO parked for the human instead of bouncing a fourth time. + escalated := escalationEntity() + + "- Cycle 3: REJECTED — third consecutive rejection.\n" + + escalationMarker + "\n" + if err := assertThirdCycleEscalation(escalated); err != nil { + t.Fatalf("escalated baseline must pass: %v", err) + } + + // Broken end-state — 4th auto-bounce: built from the REAL fixture, the FO + // recorded a third cycle but routed back to implementation a fourth time (a new + // implementation report) instead of escalating, and recorded no marker. The + // state assertion must catch the extra implementation report even though the + // body still mentions three rejection rounds. + autoBounced := escalationEntity() + + "- Cycle 3: REJECTED — routed back to implementation again.\n\n" + + "## Stage Report: implementation\n\n- DONE: reworked a fourth time\n" + if implementationReport.MatchString(escalationEntity()) { + if len(implementationReport.FindAllString(autoBounced, -1)) != 2 { + t.Fatal("4th-auto-bounce body must carry two implementation reports built from the real fixture") + } + } + if err := assertThirdCycleEscalation(autoBounced); err == nil { + t.Fatal("expected a 4th auto-bounce (third cycle routed back + a new implementation report, no marker) to fail assertThirdCycleEscalation") + } + + // Broken end-state — stalled at cycle 2: the real fixture as-staged carries only + // the two seeded cycle entries and no escalation marker — the FO never reached + // the third-cycle decision. Must fail on the cycle-count check, not pass on any + // transcript shape. + stalled := escalationEntity() + if got := len(feedbackCycleEntry.FindAllString(stalled, -1)); got != 2 { + t.Fatalf("escalation fixture must start with exactly two seeded `### Feedback Cycles` entries, got %d", got) + } + if err := assertThirdCycleEscalation(stalled); err == nil { + t.Fatal("expected a stalled-at-cycle-2 end-state (only two cycle entries, no marker) to fail assertThirdCycleEscalation") + } + + // Isolating case for the no-post-cycle-3-report check: marker present AND three + // recorded cycles (so the cycle-count and marker checks BOTH pass), but a stray + // post-cycle-3 `## Stage Report: implementation` — the ONLY defect. This must + // still fail, and it fails ONLY on the report-count check. Without this case, + // deleting that check leaves the suite green: every OTHER escalation negative + // that carries a stray report also lacks the marker, so they red on the marker + // check first and never exercise the report-count clause. This is the one + // assertion proving the FO did not auto-bounce a fourth time, so it must be + // independently covered. + markerWithStrayReport := escalationEntity() + + "- Cycle 3: REJECTED — third consecutive rejection.\n" + + escalationMarker + "\n\n" + + "## Stage Report: implementation\n\n- DONE: stray fourth-round report\n" + if !strings.Contains(markerWithStrayReport, escalationMarker) { + t.Fatal("report-count isolating case must carry the escalation marker so it passes the marker check") + } + if got := len(feedbackCycleEntry.FindAllString(markerWithStrayReport, -1)); got < 3 { + t.Fatalf("report-count isolating case must carry at least three cycle entries, got %d", got) + } + if err := assertThirdCycleEscalation(markerWithStrayReport); err == nil { + t.Fatal("expected a marker-present, three-cycle end-state with a stray post-cycle-3 implementation report to fail assertThirdCycleEscalation on the report-count check") + } + + // Isolating case for the park-not-advance (non-terminal) check: marker present, + // three recorded cycles, exactly one implementation report (so the cycle-count, + // marker, and report-count checks ALL pass), but the FO terminalized the entity + // to status: done — escalate-then-terminalize, auto-resolving instead of parking + // for the human (the escalation prompt says "do not advance to done"). This must + // still fail, and it fails ONLY on the non-terminal check. + markerButTerminalized := strings.Replace(escalationEntity(), "status: validation", "status: done", 1) + + "- Cycle 3: REJECTED — third consecutive rejection.\n" + + escalationMarker + "\n" + if !strings.Contains(markerButTerminalized, "status: done") { + t.Fatal("non-terminal isolating case must carry status: done") + } + if got := len(implementationReport.FindAllString(markerButTerminalized, -1)); got != 1 { + t.Fatalf("non-terminal isolating case must carry exactly one implementation report so only the non-terminal check rejects it, got %d", got) + } + if err := assertThirdCycleEscalation(markerButTerminalized); err == nil { + t.Fatal("expected a marker-present but terminalized (status: done) end-state to fail assertThirdCycleEscalation on the park-not-advance check") } } diff --git a/internal/ensigncycle/shared_scenarios_test.go b/internal/ensigncycle/shared_scenarios_test.go index 6cdc05de..06c1960d 100644 --- a/internal/ensigncycle/shared_scenarios_test.go +++ b/internal/ensigncycle/shared_scenarios_test.go @@ -1,21 +1,20 @@ package ensigncycle -import "time" - // sharedRuntimeScenario is the host-neutral definition of one runtime regression // scenario. Every field is a runtime-neutral fact about the user journey — its -// ID, its old Python provenance, the behavior intent it guards, and its live -// timeout/cost class. It carries NO launch, auth, plugin, artifact, or transcript -// field: those host-specific concerns live behind the per-host runner adapters -// (the Codex runner in codex_live_runner_test.go, the Claude runner in claude_live_runner_test.go), -// each of which implements the same scenario IDs. The shared coverage meta-tests -// fail if either host lacks a runner for a shared scenario, so a scenario cannot -// drift to one host only. +// ID, its old Python provenance, and the behavior intent it guards. It carries NO +// launch, auth, plugin, artifact, transcript, OR timeout field: those host-specific +// concerns live behind the per-host runner adapters (the Codex runner in +// codex_live_runner_test.go, the Claude runner in claude_live_runner_test.go), each +// of which implements the same scenario IDs. Liveness is guarded uniformly by the +// per-stage no-progress quiet budget in those runners (the shared streamWatcher's +// quietBudgetDefault, 60s) — a per-scenario basket timeout is banned, so the shared +// table carries no timeout. The shared coverage meta-tests fail if either host +// lacks a runner for a shared scenario, so a scenario cannot drift to one host only. type sharedRuntimeScenario struct { name string oldPythonTest string intent string - timeout time.Duration } func sharedRuntimeScenarios() []sharedRuntimeScenario { @@ -24,19 +23,21 @@ func sharedRuntimeScenarios() []sharedRuntimeScenario { name: "gate-guardrail", oldPythonTest: "tests/test_gate_guardrail.py", intent: "FO halts at a human gate and presents the review without self-approval, mutation, or archival.", - timeout: 2 * time.Minute, }, { name: "rejection-flow", oldPythonTest: "tests/test_rejection_flow.py", - intent: "FO observes a rejected validation report and routes the concrete finding back through implementation.", - timeout: 4 * time.Minute, + intent: "FO drives a two-cycle rejection trajectory — route back, re-implement, re-validate via reviewer reuse — restoring the dropped second cycle.", + }, + { + name: "feedback-3-cycle-escalation", + oldPythonTest: "tests/test_rejection_flow.py", + intent: "On the 3rd consecutive REJECTED validation the FO escalates to the human instead of auto-bouncing a 4th time.", }, { name: "merge-hook-guardrail", oldPythonTest: "tests/test_merge_hook_guardrail.py", intent: "FO cannot bypass a registered merge hook by terminalizing without pr, mod-block, or force.", - timeout: 90 * time.Second, }, } } diff --git a/internal/ensigncycle/streamwatch_test.go b/internal/ensigncycle/streamwatch_test.go index a9d6fc20..62643740 100644 --- a/internal/ensigncycle/streamwatch_test.go +++ b/internal/ensigncycle/streamwatch_test.go @@ -268,6 +268,47 @@ func (w *streamWatcher) expectExit(budget time.Duration) (int, error) { } } +// drainToExit runs the subprocess to exit while accumulating the FULL transcript, +// bounded by the SAME no-progress quiet budget as expect: the deadline resets to +// now+budget on every drained line, so a stage that legitimately runs for minutes +// never trips as long as the stream keeps moving; only genuine silence past the +// budget trips. It is the predicate-FREE sibling of expect for callers that do not +// watch for a specific step but need the whole stream graded after the run (the +// shared-scenario runners, whose host-neutral + host-specific assertions consume +// the full transcript). On a quiet-budget stall it kills the subprocess and trips +// stepTimeout(label); on clean exit it returns the full transcript joined by +// newlines. This is the ONE per-stage liveness guard for the shared runners — the +// same streamWatcher the live cycle uses, no second mechanism. +func (w *streamWatcher) drainToExit(budget time.Duration, label string) (string, error) { + deadline := time.Now().Add(budget) + for { + _, drained := w.drainEntries() + if drained > 0 { + deadline = time.Now().Add(budget) + } + if _, exited := w.proc.poll(); exited { + w.drainEntries() // final drain so the last lines land in the transcript + return w.fullTranscript(), nil + } + if time.Now().After(deadline) { + w.proc.kill() + return w.fullTranscript(), &stepTimeout{ + label: label, + msg: fmt.Sprintf("%s made no stream progress within %s (no-progress quiet budget) — a hung stage; killed the subprocess.\nTranscript tail:\n%s", + label, budget, w.transcriptTail()), + } + } + time.Sleep(w.pollInterval) + } +} + +// fullTranscript returns every drained stream line joined by newlines — the whole +// transcript the shared-scenario assertions consume (distinct from transcriptTail, +// the bounded tail used only in failure messages). +func (w *streamWatcher) fullTranscript() string { + return strings.Join(w.transcript, "\n") +} + // expectTerminalTeardownGrade grades the FO's bounded best-effort terminal // teardown WITHOUT requiring a clean self-exit (impossible: the harness will not // let claude -p exit while the team's members[] is populated) AND WITHOUT diff --git a/internal/ensigncycle/streamwatch_unit_test.go b/internal/ensigncycle/streamwatch_unit_test.go index ffe4e8bc..90f06914 100644 --- a/internal/ensigncycle/streamwatch_unit_test.go +++ b/internal/ensigncycle/streamwatch_unit_test.go @@ -399,6 +399,101 @@ func TestEarlyStepTripCanKillProc(t *testing.T) { } } +// TestDrainToExitReturnsFullTranscriptOnExit: drainToExit (the predicate-free +// sibling expect for the shared-scenario runners) accumulates EVERY drained line +// and returns the full transcript once the proc exits — the whole stream the +// shared assertions consume, not just a tail. +func TestDrainToExitReturnsFullTranscriptOnExit(t *testing.T) { + src := &fakeLineSource{} + proc := &fakeProc{} + w := newTestWatcher(src, proc) + + src.push(toolUseLine("Bash", `"command":"one"`)) + src.push(toolUseLine("Bash", `"command":"two"`)) + go func() { + time.Sleep(2 * w.pollInterval) + src.push(toolUseLine("Bash", `"command":"three"`)) + proc.setExited(0) + }() + + out, err := w.drainToExit(w.quietBudget, "shared scenario x") + if err != nil { + t.Fatalf("drainToExit on a clean exit returned error: %v", err) + } + for _, want := range []string{"one", "two", "three"} { + if !strings.Contains(out, want) { + t.Errorf("full transcript missing %q:\n%s", want, out) + } + } +} + +// TestDrainToExitResetsDeadlineOnActivity: a stream that keeps emitting lines past +// the quiet budget does NOT trip — the deadline resets per line — and drainToExit +// returns cleanly once the proc exits, despite total elapsed exceeding the budget. +func TestDrainToExitResetsDeadlineOnActivity(t *testing.T) { + src := &fakeLineSource{} + proc := &fakeProc{} + w := newTestWatcher(src, proc) + + done := make(chan error, 1) + go func() { + _, err := w.drainToExit(w.quietBudget, "shared scenario noisy") + done <- err + }() + + noiseEnd := time.Now().Add(4 * w.quietBudget) + for time.Now().Before(noiseEnd) { + src.push(toolUseLine("Bash", `"command":"heartbeat"`)) + time.Sleep(w.pollInterval) + } + proc.setExited(0) + + select { + case err := <-done: + if err != nil { + t.Fatalf("a progressing stream must not trip the quiet budget; got %v", err) + } + case <-time.After(2 * w.quietBudget): + t.Fatal("drainToExit did not return after the proc exited") + } + if proc.wasKilled() { + t.Error("drainToExit must NOT kill a progressing-then-cleanly-exiting proc") + } +} + +// TestDrainToExitKillsStalledStream: a stream that goes silent past the quiet +// budget before the proc exits trips stepTimeout AND kills the proc (so a hung +// host does not outlive the test) — the per-stage hang guard for the shared +// runners, the SAME no-progress discipline expect uses. +func TestDrainToExitKillsStalledStream(t *testing.T) { + src := &fakeLineSource{} + proc := &fakeProc{} // never exits — models a hung host + w := newTestWatcher(src, proc) + + src.push(toolUseLine("Bash", `"command":"first"`)) + + start := time.Now() + out, err := w.drainToExit(w.quietBudget, "shared scenario stalled") + elapsed := time.Since(start) + + var st *stepTimeout + if !errors.As(err, &st) { + t.Fatalf("a stalled stream must trip *stepTimeout, got %T: %v", err, err) + } + if st.label != "shared scenario stalled" { + t.Errorf("stepTimeout.label = %q, want the scenario label", st.label) + } + if !proc.wasKilled() { + t.Error("drainToExit must kill a hung host on the quiet-budget trip") + } + if !strings.Contains(out, "first") { + t.Error("drainToExit must return the partial transcript captured before the stall") + } + if elapsed > 3*w.quietBudget { + t.Errorf("stall trip took %s, far beyond ~1x the %s budget — not localized", elapsed, w.quietBudget) + } +} + // --- synthetic stream-json line builders --------------------------------- // // These emit the standard Claude Code stream-json shapes the watcher parses. diff --git a/internal/release/journey_workflow_test.go b/internal/release/journey_workflow_test.go index 12533cc3..8fc5b908 100644 --- a/internal/release/journey_workflow_test.go +++ b/internal/release/journey_workflow_test.go @@ -52,8 +52,8 @@ func TestRuntimeLiveWorkflowGuardRejectsMissingCodexJourneyMetricUpload(t *testi func TestRuntimeLiveWorkflowGuardRejectsMissingSharedScenarioRun(t *testing.T) { live := readWorkflow(t, "runtime-live-e2e.yml") adversarial := strings.Replace(live, - `go test -tags live -count=1 -run TestLiveClaudeSharedScenarios ./internal/ensigncycle/ -v 2>&1 | tee claude-shared-scenarios-transcript.txt`, - `# go test -tags live -count=1 -run TestLiveClaudeSharedScenarios ./internal/ensigncycle/ -v 2>&1 | tee claude-shared-scenarios-transcript.txt`, + `go test -tags live -count=1 -timeout 40m -run TestLiveClaudeSharedScenarios ./internal/ensigncycle/ -v 2>&1 | tee claude-shared-scenarios-transcript.txt`, + `# go test -tags live -count=1 -timeout 40m -run TestLiveClaudeSharedScenarios ./internal/ensigncycle/ -v 2>&1 | tee claude-shared-scenarios-transcript.txt`, 1) if adversarial == live { t.Fatal("fixture workflow missing Claude shared scenario run command")