spacedock-dev · clkao · Jun 5, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 5, 2026
diff --git a/.github/workflows/runtime-live-e2e.yml b/.github/workflows/runtime-live-e2e.yml
@@ -177,12 +177,20 @@ jobs:
       # gate/rejection/merge-hook scenarios the codex-live lane runs, through the
       # spacedock claude front door. This is the AC-4 proof that both runtime lanes
       # exercise the shared scenario table, not separate per-host smokes.
+      #
+      # `-timeout 40m` is a LOOSE BACKSTOP only — sized above the full 4-scenario
+      # serial-suite wall-time (~27m opus: rejection-flow 8.98m measured + the
+      # heavier 3-cycle escalation + gate + merge-hook). The REAL guard is the
+      # per-stage no-progress quiet budget (the streamWatcher's quietBudgetDefault,
+      # 60s) in the runners: it resets on every stream line and kills a hang at 60s
+      # of stream SILENCE. This ceiling never fires in a healthy run; it only bounds
+      # a pathological progressing-but-runaway loop.
       - name: Run live Claude shared scenarios
         env:
           SPACEDOCK_LIVE_MODEL: ${{ matrix.model }}
         run: |
           set -o pipefail
-          go test -tags live -count=1 -run TestLiveClaudeSharedScenarios ./internal/ensigncycle/ -v 2>&1 | tee claude-shared-scenarios-transcript.txt
+          go test -tags live -count=1 -timeout 40m -run TestLiveClaudeSharedScenarios ./internal/ensigncycle/ -v 2>&1 | tee claude-shared-scenarios-transcript.txt
 
       - name: Upload live artifacts
         if: always()
@@ -302,7 +310,7 @@ jobs:
           echo "- \`go version\`: \`$(go version)\`" >> "$GITHUB_STEP_SUMMARY"
 
       - name: Run live Codex shared scenarios
-        run: go test -tags live -run TestLiveCodexSharedScenarios ./internal/ensigncycle -v
+        run: go test -tags live -count=1 -timeout 40m -run TestLiveCodexSharedScenarios ./internal/ensigncycle -v
 
       - name: Upload live artifacts
         if: always()

diff --git a/docs/dev/README.md b/docs/dev/README.md
@@ -173,10 +173,11 @@ Each runner adapter turns a shared scenario into a real launch and returns `(bef
 | `observed` extract | read the `--output-last-message` file (+ jsonl) | extract the `result`/`success` event's `result` text from the stream (`extractClaudeFinalMessage`) |
 | Artifacts | jsonl / final-message / stderr | stream jsonl / final-message |
 
-The three shared scenarios reuse the old shared Claude/Codex Python journey overlap (`tests/test_gate_guardrail.py`, `tests/test_rejection_flow.py`, `tests/test_merge_hook_guardrail.py`):
+The shared scenarios reuse the old shared Claude/Codex Python journey overlap (`tests/test_gate_guardrail.py`, `tests/test_rejection_flow.py`, `tests/test_merge_hook_guardrail.py`):
 
 - `gate-guardrail`: starts at a human gate and asserts the first officer presents the gate instead of self-approving, mutating, or archiving the entity.
-- `rejection-flow`: starts from a rejected validation report and asserts the first officer routes the concrete finding back through implementation.
+- `rejection-flow`: drives a two-cycle rejection trajectory — route the concrete finding back through implementation, re-implement, and re-validate a second cycle reusing the kept-alive reviewer — restoring the second cycle the Go port dropped.
+- `feedback-3-cycle-escalation`: starts from two prior rejection cycles at a third REJECTED validation and asserts the first officer escalates to the human on the third cycle instead of auto-bouncing a fourth time.
 - `merge-hook-guardrail`: attempts terminalization while a merge hook is registered and asserts the guard refuses bypass without `mod-block`, PR, or force.
 
 Assertions prefer durable workflow state over transcript phrasing: entity frontmatter (status / completed / verdict), archive-vs-no-archive, the exact fix marker and a second stage report, and only the durable user-facing final-message obligations (a gate review and a decision prompt). `extractClaudeFinalMessage` surfaces a stale-credential `is_error`/`401` `result` event as a LOUD launch failure, distinct from a scenario-assertion failure, so a credential problem is never misread as a runtime regression.
@@ -200,16 +201,16 @@ export SPACEDOCK_BIN="$PWD/spacedock"
 export SPACEDOCK_REPO_ROOT="$PWD"
 ```
 
-Run the Claude shared suite locally (skips when no Claude auth is available — set `~/.claude/benchmark-token` for the OAuth path or `ANTHROPIC_API_KEY` for the API-key path; runs against a fresh isolated `HOME`):
+Run the Claude shared suite locally (skips when no Claude auth is available — set `~/.claude/benchmark-token` for the OAuth path or `ANTHROPIC_API_KEY` for the API-key path; runs against a fresh isolated `HOME`). The `-timeout 40m` is a LOOSE BACKSTOP only — sized above the full 4-scenario serial-suite wall-time (~27m opus). The REAL liveness guard is the per-stage no-progress quiet budget (the shared `streamWatcher`, 60s) in the runners: it resets on every stream line and kills a hang at 60s of stream silence. The 40m ceiling never fires in a healthy run; it only bounds a pathological progressing-but-runaway loop and keeps the suite off Go's too-short default 10m binary timeout:
 
 ```bash
-go test -tags live -count=1 -run TestLiveClaudeSharedScenarios ./internal/ensigncycle -v
+go test -tags live -count=1 -timeout 40m -run TestLiveClaudeSharedScenarios ./internal/ensigncycle -v
 ```
 
 Run the Codex shared suite locally (`npm install -g @openai/codex` then `codex login`, or set `OPENAI_API_KEY`). Local runs may authenticate either through an existing Codex login at `~/.codex/auth.json` or through `OPENAI_API_KEY`. The test copies only `auth.json` into a temporary `CODEX_HOME` for the local subscription path; it does not copy local plugin state or the rest of the operator's Codex config. CI does not use local subscription auth.
 
 ```bash
-go test -tags live -count=1 -run TestLiveCodexSharedScenarios ./internal/ensigncycle -v
+go test -tags live -count=1 -timeout 40m -run TestLiveCodexSharedScenarios ./internal/ensigncycle -v
 ```
 
 Run the Pi front-door smoke locally (`npm install -g pi-coding-agent`, `pi install npm:pi-subagents`, and either `pi login` or `OPENAI_API_KEY`). The smoke loads the current checkout's Spacedock first-officer and ensign skills plus the local pi-subagents extension/skill explicitly; it verifies durable state in the split-root state checkout rather than transcript wording alone.

diff --git a/docs/specs/scenario-testing-principles.md b/docs/specs/scenario-testing-principles.md
@@ -52,11 +52,12 @@ The `(scenario, mode, runtime)` tuple is the primary variant row that is **run,
 
 ## Seed Scenarios
 
-The first foundation is the three host-neutral runtime scenarios already shipped and held in host parity by the shared coverage tests. They are the named seed instances:
+The first foundation is the host-neutral runtime scenarios already shipped and held in host parity by the shared coverage tests. They are the named seed instances:
 
 <!-- seed-scenarios -->
 - `gate-guardrail` — the FO halts at a human gate and presents the review without self-approval, mutation, or archival.
-- `rejection-flow` — the FO observes a rejected validation report and routes the concrete finding back through implementation.
+- `rejection-flow` — the FO drives a two-cycle rejection trajectory: route the finding back through implementation, re-implement, and re-validate a second cycle reusing the kept-alive reviewer.
+- `feedback-3-cycle-escalation` — on the third consecutive REJECTED validation the FO escalates to the human instead of auto-bouncing a fourth time.
 - `merge-hook-guardrail` — the FO cannot bypass a registered merge hook by terminalizing without pr, mod-block, or force.
 <!-- /seed-scenarios -->
 

diff --git a/internal/ensigncycle/auto_continue_live_test.go b/internal/ensigncycle/auto_continue_live_test.go
@@ -9,7 +9,6 @@ import (
 	"os"
 	"path/filepath"
 	"testing"
-	"time"
 
 	"github.com/spacedock-dev/spacedock/internal/livescenario"
 )
@@ -41,8 +40,9 @@ func TestLiveAutoContinueAfterImplementation(t *testing.T) {
 	runner := newClaudeLiveRunner(t)
 	// Implementation completion → validator dispatch → (single-entity) gate
 	// auto-resolve → merge/terminalize runs TWO full agent runs serially (the FO
-	// and the fresh validator), so the budget is generous.
-	adapter := claudeRunnerAdapter{t: t, runner: runner, timeout: 15 * time.Minute}
+	// and the fresh validator). Liveness is the runner's per-stage no-progress quiet
+	// budget (the shared streamWatcher), not a per-call basket timeout — those are banned.
+	adapter := claudeRunnerAdapter{t: t, runner: runner}
 
 	var workflowDir string
 	sc := livescenario.Scenario{

diff --git a/internal/ensigncycle/claude_live_runner_test.go b/internal/ensigncycle/claude_live_runner_test.go
@@ -3,8 +3,7 @@
 package ensigncycle
 
 import (
-	"bytes"
-	"context"
+	"io"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -89,9 +88,10 @@ func claudeLiveScenarios(t *testing.T) []claudeLiveScenario {
 // map lacks a runner for any sharedRuntimeScenarios() ID.
 func claudeScenarioRunners() map[string]func(*testing.T, claudeLiveRunner, sharedRuntimeScenario) {
 	return map[string]func(*testing.T, claudeLiveRunner, sharedRuntimeScenario){
-		"gate-guardrail":       runClaudeGateGuardrailScenario,
-		"rejection-flow":       runClaudeRejectionFlowScenario,
-		"merge-hook-guardrail": runClaudeMergeHookGuardrailScenario,
+		"gate-guardrail":              runClaudeGateGuardrailScenario,
+		"rejection-flow":              runClaudeRejectionFlowScenario,
+		"feedback-3-cycle-escalation": runClaudeFeedback3CycleEscalationScenario,
+		"merge-hook-guardrail":        runClaudeMergeHookGuardrailScenario,
 	}
 }
 
@@ -145,6 +145,34 @@ func runClaudeRejectionFlowScenario(t *testing.T, runner claudeLiveRunner, scena
 	if err := assertRejectionFlow(after, result.finalMessage+"\n"+result.stream); err != nil {
 		t.Fatalf("%v\nFinal message:\n%s\nArtifacts: %s", err, result.finalMessage, result.artifactDir)
 	}
+	// AC-4 reviewer-reuse: on Claude teams the FO must reuse the kept-alive
+	// validation reviewer via a SendMessage tool call for the cycle-2 re-review,
+	// not dispatch a fresh one (the #141 keepalive contract the Go port dropped).
+	// Host-specific producer signal, graded by the runner — not the shared
+	// host-neutral assertion.
+	if err := assertClaudeReviewerReuse(result.stream); err != nil {
+		t.Fatalf("%v\nArtifacts: %s", err, result.artifactDir)
+	}
+	emitClaudeScenarioMetrics(t, scenario, result, runner.model)
+}
+
+// runClaudeFeedback3CycleEscalationScenario drives the real FO against a fixture
+// seeded with two prior rejection cycles at a 3rd REJECTED report and grades the
+// durable end-state: the FO must escalate to the human on the 3rd cycle, not
+// auto-bounce a 4th time. assertThirdCycleEscalation grades durable entity-body
+// state ALONE (cycle count + escalation marker + no post-cycle-3 implementation
+// report) — the reviewer-reuse signal is host-specific and lives in rejection-flow,
+// not here; this scenario is purely a host-neutral durable-state grade.
+func runClaudeFeedback3CycleEscalationScenario(t *testing.T, runner claudeLiveRunner, scenario sharedRuntimeScenario) {
+	t.Helper()
+	workflowRoot := t.TempDir()
+	entityPath := writeEscalationWorkflow(t, workflowRoot)
+
+	result := runner.run(t, scenario, workflowRoot, escalationPrompt())
+	after := readFile(t, entityPath)
+	if err := assertThirdCycleEscalation(after); err != nil {
+		t.Fatalf("%v\nEntity after:\n%s\nFinal message:\n%s\nArtifacts: %s", err, after, result.finalMessage, result.artifactDir)
+	}
 	emitClaudeScenarioMetrics(t, scenario, result, runner.model)
 }
 
@@ -173,6 +201,15 @@ func runClaudeMergeHookGuardrailScenario(t *testing.T, runner claudeLiveRunner,
 // `--` and forwards verbatim to claude. The observed source is the stream's
 // result/success event via extractClaudeFinalMessage — a 401/is_error result is a
 // LOUD launch failure here, never fed into a scenario assertion.
+//
+// Liveness is the EXISTING streamWatcher (the Go port of the upstream
+// FOStreamWatcher, shared with TestLiveEnsignCycle) — one mechanism, no second
+// impl. drainToExit runs the process to exit while accumulating the full
+// transcript, bounded by the per-step no-progress quietBudgetDefault (60s): the
+// deadline resets on every drained line, so a genuine multi-minute run of
+// sequential model work never trips as long as the stream keeps moving, and only
+// silence past the budget kills the process — the same ≤60s AC-1-guarded discipline
+// the live cycle uses.
 func (r claudeLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, workflowRoot, prompt string) claudeScenarioResult {
 	t.Helper()
 	artifactDir := filepath.Join(r.artifactRoot, scenario.name)
@@ -182,9 +219,7 @@ func (r claudeLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, work
 	streamPath := filepath.Join(artifactDir, "claude-stream.jsonl")
 	finalPath := filepath.Join(artifactDir, "claude-final-message.txt")
 
-	ctx, cancel := context.WithTimeout(context.Background(), scenario.timeout)
-	defer cancel()
-	cmd := exec.CommandContext(ctx, r.binary, "claude",
+	cmd := exec.Command(r.binary, "claude",
 		"--plugin-dir", r.repoRoot,
 		"--skip-contract-check",
 		"--",
@@ -197,21 +232,33 @@ func (r claudeLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, work
 	cmd.Dir = workflowRoot
 	cmd.Env = r.env
 
-	// stdout carries the stream-json transcript; stderr is folded in so a launch
-	// error (e.g. a stale-token 401 printed to stderr) is captured alongside it.
-	var buf bytes.Buffer
-	cmd.Stdout = &buf
-	cmd.Stderr = &buf
+	// stdout carries the stream-json transcript the watcher drains for liveness;
+	// stderr is folded into the same pipe so a launch error (e.g. a stale-token 401
+	// printed to stderr) lands in the transcript too — matching the live cycle's
+	// wiring. The cmdPoller closes the pipe write-end on exit so the scanner EOFs.
+	pr, pw := io.Pipe()
+	cmd.Stdout = pw
+	cmd.Stderr = pw
 
 	started := time.Now()
-	runErr := cmd.Run()
+	if startErr := cmd.Start(); startErr != nil {
+		t.Fatalf("spacedock claude failed to start for %s: %v", scenario.name, startErr)
+	}
+	poller := newCmdPoller(cmd, pw)
+	defer poller.kill()
+	watcher := newStreamWatcher(newPipeLineSource(pr), poller, func(line string) { t.Log(line) })
+
+	// drainToExit runs the process to exit accumulating the full transcript, OR
+	// kills it on a 60s no-progress stall (the per-step quiet budget). The deferred
+	// poller.kill() reaps the process on every exit path.
+	stream, stallErr := watcher.drainToExit(quietBudgetDefault, "claude shared scenario "+scenario.name)
 	duration := time.Since(started)
-	stream := buf.String()
+
 	if writeErr := os.WriteFile(streamPath, []byte(stream), 0o644); writeErr != nil {
 		t.Fatal(writeErr)
 	}
-	if ctx.Err() == context.DeadlineExceeded {
-		t.Fatalf("spacedock claude did not finish within %s for %s; artifacts in %s", scenario.timeout, scenario.name, artifactDir)
+	if stallErr != nil {
+		t.Fatalf("%v\nArtifacts: %s", stallErr, artifactDir)
 	}
 
 	// Extract the final message from the stream's result/success event (the
@@ -220,8 +267,8 @@ func (r claudeLiveRunner) run(t *testing.T, scenario sharedRuntimeScenario, work
 	// failure, so a stale credential never feeds the 401 text into an assertion.
 	finalMessage, extractErr := extractClaudeFinalMessage(stream)
 	if extractErr != nil {
-		t.Fatalf("claude launch failed for %s (run err=%v): %v; artifacts in %s\nStream tail:\n%s",
-			scenario.name, runErr, extractErr, artifactDir, tail(stream, 4000))
+		t.Fatalf("claude launch failed for %s: %v; artifacts in %s\nStream tail:\n%s",
+			scenario.name, extractErr, artifactDir, tail(stream, 4000))
 	}
 	if writeErr := os.WriteFile(finalPath, []byte(finalMessage), 0o644); writeErr != nil {
 		t.Fatal(writeErr)