diff --git a/docs/dev/README.md b/docs/dev/README.md index e3e31185..7d17b464 100644 --- a/docs/dev/README.md +++ b/docs/dev/README.md @@ -95,6 +95,7 @@ A task moves to implementation once its design is approved. The work here is to - **Inputs:** The fleshed-out task body from ideation with approach and acceptance criteria - **Outputs:** The deliverable committed to the relevant repo or state checkout, with a summary of what was produced and where +- Implementation completion is not a stopping point: once the deliverable is committed and the stage report filed, the entity routes immediately to independent `validation` dispatch — a fresh validator, since `validation` is `fresh: true` — unless a gate, blocker, terminal ceremony, or captain decision intervenes. The FO does not park a completed implementation and wait. - **Good:** Minimal changes that satisfy acceptance criteria, clean Go packages, stable CLI output, tests where appropriate, and a self-contained deliverable - **Bad:** Over-engineering, unrelated refactoring, skipping tests, ignoring edge cases identified in ideation, or leaving the deliverable incomplete for validation to finish diff --git a/internal/ensigncycle/auto_continue_fixtures_test.go b/internal/ensigncycle/auto_continue_fixtures_test.go new file mode 100644 index 00000000..af166680 --- /dev/null +++ b/internal/ensigncycle/auto_continue_fixtures_test.go @@ -0,0 +1,162 @@ +package ensigncycle + +import ( + "fmt" + "os" + "path/filepath" + "regexp" + "testing" +) + +// Auto-continue fixtures and assertion for AC-5: a dev-shaped workflow parked at +// an implementation-ready state, exercising whether the FO continues the +// lifecycle (advance to validation + dispatch a fresh validator / present the +// gate) instead of stopping after the implementation report is filed. Like the +// other shared fixtures these live under the DEFAULT build tag so the offline +// negative case (auto_continue_negative_test.go) grades the assertion with no +// model spend, while the //go:build live half (auto_continue_live_test.go) drives +// the same fixture against a real agent. + +// validationStatusOrBeyond matches an entity whose status advanced to validation +// (or the terminal done) — i.e. the FO did NOT leave it parked at implementation. +var validationStatusOrBeyond = regexp.MustCompile(`(?im)^status:\s*(validation|done)\s*$`) + +// implementationStatusAC5 matches an entity still parked at implementation — the +// failure mode this regression guards against. +var implementationStatusAC5 = regexp.MustCompile(`(?im)^status:\s*implementation\s*$`) + +func writeAutoContinueWorkflowNoGit(dir string) (string, error) { + if err := os.WriteFile(filepath.Join(dir, "README.md"), []byte(autoContinueReadme()), 0o644); err != nil { + return "", err + } + entityPath := filepath.Join(dir, "auto-continue-task.md") + if err := os.WriteFile(entityPath, []byte(autoContinueEntity()), 0o644); err != nil { + return "", err + } + return entityPath, nil +} + +func writeAutoContinueWorkflow(t *testing.T, root string) string { + t.Helper() + entityPath, err := writeAutoContinueWorkflowNoGit(root) + if err != nil { + t.Fatal(err) + } + gitInit(t, root) + return entityPath +} + +// autoContinueReadme is the dev-shaped fixture workflow from AC-4: a non-split- +// root workflow backlog → implementation → validation(worktree, fresh, gate) → +// done. validation is `fresh: true` so the FO must dispatch a FRESH validator, +// and `gate: true` so the FO presents the validation gate after the validator +// reports. The implementation stage carries a concrete validator instruction so a +// dispatched validator has real work that leaves a durable validation report. +func autoContinueReadme() string { + return "---\n" + + "entity-type: task\n" + + "id-style: slug\n" + + "stages:\n" + + " defaults:\n" + + " worktree: false\n" + + " concurrency: 1\n" + + " states:\n" + + " - name: backlog\n" + + " initial: true\n" + + " - name: implementation\n" + + " worktree: true\n" + + " - name: validation\n" + + " worktree: true\n" + + " fresh: true\n" + + " feedback-to: implementation\n" + + " gate: true\n" + + " - name: done\n" + + " terminal: true\n" + + "---\n" + + "# Auto-Continue Fixture\n\n" + + "### backlog\n\nSeed the task.\n\n- **Outputs:** A seed task.\n\n" + + "### implementation\n\nProduce the deliverable.\n\n- **Outputs:** The deliverable plus an implementation stage report.\n\n" + + "### validation\n\n" + + "Verify the implementation against the acceptance criteria. Append a `## Stage Report: validation` " + + "section to the entity with one `- DONE:` item and a PASSED or REJECTED recommendation.\n\n" + + "- **Outputs:** A PASSED or REJECTED validation stage report.\n\n" + + "### done\n\nTerminal state.\n" +} + +// autoContinueEntity is parked at status: implementation with a filed +// implementation stage report — the implementation-ready state from AC-4. A +// correct FO verifies this report and immediately advances to validation + +// dispatches a fresh validator (then presents the validation gate); a broken FO +// stops here with the report filed and the status unchanged. +func autoContinueEntity() string { + return "---\n" + + "id: auto-continue-task\n" + + "title: Auto Continue Task\n" + + "status: implementation\n" + + "completed:\n" + + "verdict:\n" + + "worktree:\n" + + "---\n" + + "# Auto Continue Task\n\n" + + "The implementation is complete and its stage report is filed below. The next lifecycle " + + "step is independent validation.\n\n" + + "## Acceptance criteria\n\n" + + "**AC-1** The deliverable exists and is committed.\n" + + "Verified by: the implementation stage report below plus a validation pass.\n\n" + + "## Stage Report: implementation\n\n" + + "- DONE: Produce the deliverable\n" + + " The deliverable is committed and ready for independent verification.\n" + + "\n### Summary\n\n" + + "Implementation is complete. The first officer must advance to validation and dispatch a fresh validator.\n" +} + +// autoContinuePrompt is the NEUTRAL runbook from AC-4 — `Use $spacedock:first- +// officer` with no "drive to done" coaching. It points the FO at the workflow and +// the one parked entity and asks it to proceed normally. It deliberately does NOT +// tell the FO to advance, dispatch, or validate — whether it does so is exactly +// the behavior under test. Run non-interactively (`claude -p`), the FO enters +// single-entity mode and drives the parked implementation forward on its own; a +// broken FO stops after the implementation report instead. +func autoContinuePrompt() string { + return fmt.Sprintf("%s\n\n%s\n%s\n%s", + "Use $spacedock:first-officer for this whole run.", + "Workflow directory: .", + "Process the entity `auto-continue-task`. Its implementation worker has just completed and filed its stage report.", + "Proceed with the workflow as the first-officer contract directs, then give your final response.", + ) +} + +// assertAutoContinue is host-neutral: before/after entity-state strings plus the +// FO's observed output. It grades the DURABLE outcome, not transcript phrasing. +// The lifecycle continued when the entity is no longer parked at implementation +// (status advanced to validation or done) AND a validation stage report appears +// in the entity body — the durable footprint of a fresh validator the FO +// dispatched. A run that narrates "advancing to validation" in the transcript but +// leaves the durable state at status: implementation with no validation report +// fails on the state checks, not on transcript shape. +func assertAutoContinue(before, after, observed string) error { + if implementationStatusAC5.MatchString(after) { + return fmt.Errorf("FO left the entity parked at status: implementation — it stopped instead of advancing") + } + if !validationStatusOrBeyond.MatchString(after) { + return fmt.Errorf("FO did not advance the entity to status: validation (or beyond)") + } + if !regexpValidationReport.MatchString(after) { + return fmt.Errorf("no `## Stage Report: validation` appeared — the FO did not dispatch/run a validator") + } + // The implementation report must still be present (the FO advanced, it did not + // discard the prior stage's report) — guards against an after-state that simply + // replaced the body rather than appending the validation report. + if !regexpImplementationReport.MatchString(before) { + return fmt.Errorf("fixture invariant broken: before-state lacks the implementation stage report") + } + if !regexpImplementationReport.MatchString(after) { + return fmt.Errorf("the implementation stage report was lost from the entity after the run") + } + return nil +} + +var ( + regexpValidationReport = regexp.MustCompile(`(?m)^## Stage Report: validation\b`) + regexpImplementationReport = regexp.MustCompile(`(?m)^## Stage Report: implementation\b`) +) diff --git a/internal/ensigncycle/auto_continue_live_test.go b/internal/ensigncycle/auto_continue_live_test.go new file mode 100644 index 00000000..001fce8b --- /dev/null +++ b/internal/ensigncycle/auto_continue_live_test.go @@ -0,0 +1,89 @@ +//go:build live + +// ABOUTME: AC-5 live regression — a real FO, given an implementation-ready dev +// ABOUTME: entity, must advance to validation and dispatch a fresh validator, not stop. +package ensigncycle + +import ( + "context" + "os" + "path/filepath" + "testing" + "time" + + "github.com/spacedock-dev/spacedock/internal/livescenario" +) + +// TestLiveAutoContinueAfterImplementation is AC-5's live half: a real FO is +// pointed at a dev-shaped workflow whose one entity is parked at an +// implementation-ready state (status: implementation with a filed implementation +// stage report) under the NEUTRAL `Use $spacedock:first-officer` runbook — no +// "drive to done" coaching. The scenario grades on the DURABLE outcome via the +// shared, state-oriented assertAutoContinue: the FO must advance the entity past +// implementation AND leave a `## Stage Report: validation` behind — the durable +// footprint of a fresh validator it dispatched. A run that stops after the +// implementation report, leaving the entity at status: implementation, REDS the +// grade. Running this (`go test -tags live -run TestLiveAutoContinueAfterImplementation`) +// against a real credential produces the session artifact AC-5's `Verified by: +// live …` citation requires. +// +// Non-interactive `claude -p` puts the FO in single-entity mode, where it drives +// the parked entity all the way to terminal `done` and ARCHIVES it to +// `_archive/auto-continue-task.md`. That over-runs the minimum (it more than +// proves the FO did not stop after implementation), so the grade reads the entity +// from wherever it lands — the original path or the archive — via the captured +// workflow dir. The primitive's tolerant post-run read hands the Assert an empty +// after-body when the original path was archived; the Assert then resolves the +// real end-state. It reuses claudeRunnerAdapter + errGraded from +// livescenario_adapter_live_test.go and the assertAutoContinue grade shared with +// the offline negative case. +func TestLiveAutoContinueAfterImplementation(t *testing.T) { + runner := newClaudeLiveRunner(t) + // Implementation completion → validator dispatch → (single-entity) gate + // auto-resolve → merge/terminalize runs TWO full agent runs serially (the FO + // and the fresh validator), so the budget is generous. + adapter := claudeRunnerAdapter{t: t, runner: runner, timeout: 15 * time.Minute} + + var workflowDir string + sc := livescenario.Scenario{ + Name: "auto-continue-after-implementation", + Runbook: autoContinuePrompt(), + Setup: func(dir string) (string, error) { + // Capture the staged workflow dir so the Assert can find the entity even + // after the FO archives it. Stage WITHOUT git-init; the adapter git-inits + // once the primitive has captured the pre-run state (matching the live order). + workflowDir = dir + return writeAutoContinueWorkflowNoGit(dir) + }, + Assert: func(before, after livescenario.EntityState, observed string) error { + afterBody := resolveAutoContinueEndState(workflowDir, after.Body) + if err := assertAutoContinue(before.Body, afterBody, observed); err != nil { + return errGraded(err.Error()) + } + return nil + }, + } + + if err := livescenario.Run(context.Background(), t.TempDir(), sc, adapter); err != nil { + t.Fatalf("live auto-continue scenario graded FAIL: %v", err) + } +} + +// resolveAutoContinueEndState returns the entity's durable end-state body. The FO +// may archive a terminalized entity, moving it out of its original path; in that +// case the primitive's after-body is empty (the original path is gone) and the +// real end-state lives at `_archive/auto-continue-task.md`. This reads that +// archived copy when present, otherwise falls back to the primitive's after-body +// (the entity stayed put — e.g. held at the validation gate). It NEVER fabricates +// state: a genuinely absent entity yields an empty body and the state-oriented +// assertAutoContinue reds. +func resolveAutoContinueEndState(workflowDir, afterBody string) string { + if afterBody != "" { + return afterBody + } + archived := filepath.Join(workflowDir, "_archive", "auto-continue-task.md") + if data, err := os.ReadFile(archived); err == nil { + return string(data) + } + return afterBody +} diff --git a/internal/ensigncycle/auto_continue_negative_test.go b/internal/ensigncycle/auto_continue_negative_test.go new file mode 100644 index 00000000..98fc7af8 --- /dev/null +++ b/internal/ensigncycle/auto_continue_negative_test.go @@ -0,0 +1,58 @@ +package ensigncycle + +import ( + "strings" + "testing" +) + +// AC-5 offline negative case: assertAutoContinue is behavior/state oriented, not +// a transcript-shape tautology. These cases build the SPECIFIC broken end-state +// the regression guards against — an FO that filed the implementation report and +// STOPPED, leaving the entity at status: implementation with no validation report +// — and prove the assertion reds even when the transcript narrates intent to +// continue. They are offline (default tag): the assertion is a pure function over +// entity-state + observed strings, so they spend no model. + +func TestAutoContinueNegativeStoppedAfterImplementation(t *testing.T) { + before := autoContinueEntity() + + // Baseline: an FO that truly advanced — status moved to validation and a + // validation stage report was appended — GRADES PASS. Without this the negative + // cases could pass against an assertion that always errors. + advanced := strings.Replace(before, "status: implementation", "status: validation", 1) + + "\n## Stage Report: validation\n\n- DONE: Verified the deliverable\n PASSED.\n" + if advanced == before { + t.Fatal("fixture must contain `status: implementation` to advance") + } + if err := assertAutoContinue(before, advanced, "Advanced to validation and dispatched a fresh validator; gate presented."); err != nil { + t.Fatalf("a truly-advanced end-state must grade PASS, got: %v", err) + } + + // Broken: the FO stopped after filing the implementation report. The durable + // state is byte-identical to the staged fixture (still status: implementation, + // no validation report). Even WITH a transcript that narrates advancing to + // validation, the state-oriented grade must catch the stop. + stoppedObserved := "Implementation complete. Advancing to validation and dispatching a fresh validator." + if err := assertAutoContinue(before, before, stoppedObserved); err == nil { + t.Fatal("an FO that left the entity at status: implementation with no validation report must RED the grade even with a transcript narrating advancement") + } + + // Broken: status advanced to validation in the frontmatter but NO validation + // stage report was produced — a partial move (the FO bumped status but never + // dispatched/ran the validator). This must still fail on the missing validation + // report, not pass on the status bump alone. + statusBumpedNoReport := strings.Replace(before, "status: implementation", "status: validation", 1) + if statusBumpedNoReport == before { + t.Fatal("fixture must contain `status: implementation` to bump") + } + if err := assertAutoContinue(before, statusBumpedNoReport, "advancing to validation"); err == nil { + t.Fatal("a status bump with no validation stage report must RED the grade (the validator never ran)") + } + + // Broken: a validation report was appended but the status was left at + // implementation — the inverse partial move. Must fail on the status check. + reportNoStatus := before + "\n## Stage Report: validation\n\n- DONE: Verified\n PASSED.\n" + if err := assertAutoContinue(before, reportNoStatus, "validated the deliverable"); err == nil { + t.Fatal("a validation report with status left at implementation must RED the grade") + } +} diff --git a/internal/ensigncycle/auto_continue_pi_live_test.go b/internal/ensigncycle/auto_continue_pi_live_test.go new file mode 100644 index 00000000..72f72789 --- /dev/null +++ b/internal/ensigncycle/auto_continue_pi_live_test.go @@ -0,0 +1,194 @@ +//go:build live + +// ABOUTME: AC-5 Pi live regression — the origin runtime where the bug bit. A real +// ABOUTME: Pi FO, given a completed implementation, must advance + dispatch a validator. +package ensigncycle + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" +) + +// TestLivePiAutoContinueAfterImplementation reproduces the auto-continue lifecycle +// invariant on Pi's `pi-subagents` substrate — the ORIGIN runtime where the bug +// concretely bit (the subagent result returns to the parent and the parent must +// itself drive the next lifecycle step; there is no separate event-loop turn to +// resume into). A real Pi first officer is pointed at a split-root dev-shaped +// workflow whose one entity is parked at status: implementation with a filed +// implementation stage report. Under a NEUTRAL runbook (no "advance" / "dispatch +// the validator" coaching — that is the behavior under test), the FO must continue +// the shared `## Completion and Gates` lifecycle in the same turn: advance the +// entity to validation and dispatch a FRESH validator subagent, which leaves a +// durable `## Stage Report: validation`. +// +// The grade is the shared, state-oriented assertAutoContinue over the entity's +// before→after body — the SAME assertion the Claude leg and the offline negative +// use. A run that verifies the implementation report and STOPS (the bug) leaves +// the entity at status: implementation with no validation report and REDS the grade. +func TestLivePiAutoContinueAfterImplementation(t *testing.T) { + piBin := piBinaryOrSkip(t) + repo := repoRoot(t) + piSubagentsRoot := piSubagentsPackageRoot(t) + binary := piSpacedockBinary(t, repo) + + piHome := t.TempDir() + sessionDir := t.TempDir() + cleanHome := t.TempDir() + seedPiLocalAuth(t, piHome, os.Getenv("HOME")) + workflowRoot, stateRoot, entityPath := writePiAutoContinueWorkflow(t) + artifactDir := filepath.Join(piLiveArtifactDir(t, "pi-auto-continue"), "run") + if err := os.MkdirAll(filepath.Join(artifactDir, "sessions"), 0o755); err != nil { + t.Fatal(err) + } + env := piLiveEnv(piHome, sessionDir, cleanHome, filepath.Dir(binary), piSubagentsRoot) + + before := readFile(t, entityPath) + + prompt := piAutoContinuePrompt(repo, workflowRoot, stateRoot, entityPath) + runPiLiveCommand(t, artifactDir, workflowRoot, env, piBin, + "--print", + "--session-dir", filepath.Join(artifactDir, "sessions"), + "--extension", filepath.Join(piSubagentsRoot, "src", "extension", "index.ts"), + "--skill", filepath.Join(piSubagentsRoot, "skills", "pi-subagents"), + "--skill", filepath.Join(repo, "skills", "first-officer"), + "--skill", filepath.Join(repo, "skills", "ensign"), + prompt, + ) + + after := resolvePiAutoContinueEndState(stateRoot, entityPath) + observed := piLiveObservedOutput(t, artifactDir) + if err := assertAutoContinue(before, after, observed); err != nil { + t.Fatalf("Pi auto-continue scenario graded FAIL: %v; artifacts in %s\n--- entity after ---\n%s", err, artifactDir, after) + } +} + +// resolvePiAutoContinueEndState returns the split-root entity's durable end-state +// body. The Pi FO may archive a terminalized entity, moving it out of its original +// path in the state checkout; this reads the archived copy when the original is +// gone, otherwise the original (entity held at the gate, or still at validation). +// A genuinely absent entity yields an empty body and the state-oriented +// assertAutoContinue reds — it never fabricates state. +func resolvePiAutoContinueEndState(stateRoot, entityPath string) string { + if data, err := os.ReadFile(entityPath); err == nil { + return string(data) + } + for _, p := range []string{ + filepath.Join(stateRoot, "_archive", "auto-continue-task", "index.md"), + filepath.Join(stateRoot, "_archive", "auto-continue-task.md"), + } { + if data, err := os.ReadFile(p); err == nil { + return string(data) + } + } + return "" +} + +// piBinaryOrSkip returns the pi binary path or skips when Pi is not installed — +// the same skip the existing Pi smoke uses, so the offline lane is unaffected. +func piBinaryOrSkip(t *testing.T) string { + t.Helper() + p, err := exec.LookPath("pi") + if err != nil { + t.Skip("pi not on PATH; install Pi CLI before running the live Pi auto-continue drive") + } + return p +} + +// piLiveObservedOutput concatenates the Pi run's stdout + stderr artifacts as the +// observed final output for the grade. assertAutoContinue grades primarily on the +// durable entity state; the observed string is carried for parity with the +// Claude/offline legs (which also pass observed) but the state checks are the +// load-bearing ones, so a transcript-only narration cannot satisfy the grade. +func piLiveObservedOutput(t *testing.T, artifactDir string) string { + t.Helper() + var b strings.Builder + for _, name := range []string{"pi-stdout.txt", "pi-stderr.txt"} { + p := filepath.Join(artifactDir, name) + if data, err := os.ReadFile(p); err == nil { + b.Write(data) + b.WriteString("\n") + } + } + return b.String() +} + +// writePiAutoContinueWorkflow stages a split-root dev-shaped workflow +// (backlog → implementation → validation(fresh,gate) → done) with the entity +// parked at an implementation-ready state in the STATE checkout. It mirrors +// writePiSplitRootSmokeWorkflow's split-root layout (separate code root + state +// checkout, both git-initialized) so the FO drives the real split-root path. The +// validation stage is non-worktree (like the proven Pi split-root smoke); the +// behavior under test is the parent advancing + dispatching the fresh validator, +// not worktree mechanics. +func writePiAutoContinueWorkflow(t *testing.T) (workflowRoot, stateRoot, entityPath string) { + t.Helper() + workflowRoot = t.TempDir() + stateRoot = filepath.Join(workflowRoot, ".spacedock-state") + writeFile(t, filepath.Join(workflowRoot, "README.md"), piAutoContinueReadme()) + entityPath = filepath.Join(stateRoot, "auto-continue-task", "index.md") + writeFile(t, entityPath, autoContinueEntity()) + gitInit(t, workflowRoot) + gitInit(t, stateRoot) + return workflowRoot, stateRoot, entityPath +} + +// piAutoContinueReadme is the split-root variant of autoContinueReadme(): same +// dev-shaped stage graph, plus `state: .spacedock-state` so the entity body and +// stage reports live in the state checkout (the split-root contract the bug's +// origin entity used). +func piAutoContinueReadme() string { + return "---\n" + + "entity-type: task\n" + + "id-style: slug\n" + + "state: .spacedock-state\n" + + "stages:\n" + + " defaults:\n" + + " worktree: false\n" + + " concurrency: 1\n" + + " states:\n" + + " - name: backlog\n" + + " initial: true\n" + + " - name: implementation\n" + + " - name: validation\n" + + " fresh: true\n" + + " feedback-to: implementation\n" + + " gate: true\n" + + " - name: done\n" + + " terminal: true\n" + + "---\n" + + "# Pi Auto-Continue Fixture\n\n" + + "### backlog\n\nSeed the task.\n\n- **Outputs:** A seed task.\n\n" + + "### implementation\n\nProduce the deliverable.\n\n- **Outputs:** The deliverable plus an implementation stage report.\n\n" + + "### validation\n\n" + + "Verify the implementation against the acceptance criteria. Append a `## Stage Report: validation` " + + "section to the entity with one `- DONE:` item and a PASSED or REJECTED recommendation.\n\n" + + "- **Outputs:** A PASSED or REJECTED validation stage report.\n\n" + + "### done\n\nTerminal state.\n" +} + +// piAutoContinuePrompt is the NEUTRAL Pi runbook. It points the FO at the +// split-root workflow and the one entity whose implementation worker has just +// completed, names the substrate facts the Pi FO needs (use pi-subagents +// subagent(...) for dispatch; this is split-root), and asks it to PROCEED per its +// contract — it deliberately does NOT instruct the FO to advance, dispatch a +// validator, or drive to done. Whether the FO continues the lifecycle is exactly +// the behavior under test. +func piAutoContinuePrompt(repo, workflowRoot, stateRoot, entityPath string) string { + return fmt.Sprintf(`You are the Spacedock first officer running on Pi for a live auto-continue regression. + +Use the local Spacedock first-officer skill and the Pi first-officer runtime adapter. Dispatch any workers with the pi-subagents subagent(...) tool. Do not use or mention Claude Agent, SendMessage, TeamCreate, or TeamDelete tools. + +This is a split-root Spacedock workflow. +Repo root: %[1]s +Workflow directory: %[2]s +State checkout: %[3]s +Entity file: %[4]s + +The entity `+"`auto-continue-task`"+` is parked at status: implementation, and its implementation worker has just completed and filed its `+"`## Stage Report: implementation`"+`. Proceed with the workflow exactly as the first-officer contract directs, then give your final response. + +When you dispatch a worker, give it: load and follow the local Spacedock ensign skill at %[1]s/skills/ensign/SKILL.md and the Pi ensign adapter at %[1]s/skills/ensign/references/pi-ensign-runtime.md; the workflow directory, state checkout, and entity file above; and the target stage. The worker must not edit YAML frontmatter, must append its stage report with the exact heading for its stage, and must path-scoped commit only the entity path in the state checkout.`, repo, workflowRoot, stateRoot, entityPath) +} diff --git a/internal/livescenario/scenario.go b/internal/livescenario/scenario.go index 6724942d..a317f214 100644 --- a/internal/livescenario/scenario.go +++ b/internal/livescenario/scenario.go @@ -68,7 +68,7 @@ func Run(ctx context.Context, dir string, sc Scenario, runner Runner) error { if err != nil { return fmt.Errorf("scenario %q launch failed: %w", sc.Name, err) } - after, err := readEntityState(entityPath) + after, err := readPostRunState(entityPath) if err != nil { return fmt.Errorf("scenario %q could not read post-run state: %w", sc.Name, err) } @@ -78,7 +78,9 @@ func Run(ctx context.Context, dir string, sc Scenario, runner Runner) error { return nil } -// readEntityState reads the entity file's body bytes into an EntityState. +// readEntityState reads the entity file's body bytes into an EntityState. The +// pre-run read uses this: the staged entity must exist, so a missing file is a +// setup error. func readEntityState(entityPath string) (EntityState, error) { data, err := os.ReadFile(entityPath) if err != nil { @@ -86,3 +88,20 @@ func readEntityState(entityPath string) (EntityState, error) { } return EntityState{Body: string(data)}, nil } + +// readPostRunState reads the entity body after the run. A real agent may move or +// remove the entity (e.g. archiving a terminalized entity), which is a legitimate +// durable OUTCOME the Assert should grade — not a Run-level read error. So a +// vanished entity yields an empty-body EntityState rather than failing the run; +// the Assert decides whether that is a pass or a fail (and, being workflow-aware, +// can locate the moved copy itself). +func readPostRunState(entityPath string) (EntityState, error) { + data, err := os.ReadFile(entityPath) + if err != nil { + if os.IsNotExist(err) { + return EntityState{Body: ""}, nil + } + return EntityState{}, err + } + return EntityState{Body: string(data)}, nil +} diff --git a/internal/livescenario/scenario_test.go b/internal/livescenario/scenario_test.go index ab68562c..ad2e1c64 100644 --- a/internal/livescenario/scenario_test.go +++ b/internal/livescenario/scenario_test.go @@ -55,6 +55,7 @@ func (e *gradeError) Error() string { return e.msg } // (ensigncycle) supplies the real launch behind //go:build live. type stubRunner struct { mutateAfter func(beforeBody string) string + removeAfter bool observed string } @@ -63,6 +64,12 @@ func (s stubRunner) Launch(ctx context.Context, dir, entityPath, runbook string) if err != nil { return "", err } + if s.removeAfter { + if rerr := os.Remove(entityPath); rerr != nil { + return "", rerr + } + return s.observed, nil + } if s.mutateAfter != nil { if werr := os.WriteFile(entityPath, []byte(s.mutateAfter(string(beforeBody))), 0o644); werr != nil { return "", werr @@ -122,3 +129,32 @@ func TestScenarioSetupFailureSurfaces(t *testing.T) { t.Fatal("a setup failure must surface as a run failure, not a PASS") } } + +// TestScenarioVanishedEntityReachesAssert locks the tolerant post-run read: when +// the agent moves or removes the entity during the run (e.g. archiving a +// terminalized entity), Run does NOT fail with a read error — it hands the Assert +// an empty-body after-state so the Assert can grade the outcome (and, being +// workflow-aware, locate the moved copy). A scenario that grades an empty after as +// FAIL sees the FAIL; a scenario that accepts it sees the PASS — Run never +// pre-empts that decision with its own read error. +func TestScenarioVanishedEntityReachesAssert(t *testing.T) { + var sawAfter EntityState + sc := Scenario{ + Name: "vanished-entity", + Runbook: "remove the entity", + Setup: func(dir string) (string, error) { + path := filepath.Join(dir, "entity.md") + return path, os.WriteFile(path, []byte("status: implementation\n"), 0o644) + }, + Assert: func(before, after EntityState, observed string) error { + sawAfter = after + return nil // accept — the Assert, not Run, owns the vanished-entity verdict + }, + } + if err := Run(context.Background(), t.TempDir(), sc, stubRunner{removeAfter: true, observed: "archived"}); err != nil { + t.Fatalf("a vanished post-run entity must reach the Assert, not fail Run: %v", err) + } + if sawAfter.Body != "" { + t.Fatalf("vanished post-run entity should yield an empty after-body, got %q", sawAfter.Body) + } +} diff --git a/skills/first-officer/references/first-officer-shared-core.md b/skills/first-officer/references/first-officer-shared-core.md index d3e1f689..3415546e 100644 --- a/skills/first-officer/references/first-officer-shared-core.md +++ b/skills/first-officer/references/first-officer-shared-core.md @@ -130,6 +130,8 @@ The checklist review produces an explicit count summary: `{N} done, {N} skipped, If not gated: terminal → merge; else decide reuse-or-fresh. +**A completed non-gated, non-terminal stage is not a stopping point.** After verifying the report, the FO MUST advance the entity to the next stage and dispatch it (reuse-or-fresh per below) BEFORE ending its turn. It does not file a completion-only status and stop, waiting for the captain or a later turn to resume — advancing is the FO's own next action, not the captain's. The only spans that legitimately halt the turn here are: the next stage is `gate: true` (present the gate and wait), the entity is terminal (run the merge/cleanup ceremony), an explicit blocker (a rebase-conflict halt, an unmet clarification), or a captain decision the contract requires. Absent one of those, stopping after a completion-only report is a contract violation. + A completed worker is reusable only when the worker is still addressable through a live runtime handle AND all reuse conditions below pass. Otherwise dispatch fresh. **Reuse conditions** (all must hold — if any fails, dispatch fresh): diff --git a/skills/first-officer/references/pi-first-officer-runtime.md b/skills/first-officer/references/pi-first-officer-runtime.md index 9873191d..7909028d 100644 --- a/skills/first-officer/references/pi-first-officer-runtime.md +++ b/skills/first-officer/references/pi-first-officer-runtime.md @@ -25,6 +25,8 @@ For Spacedock stage dispatches through `pi-subagents`, do not use the `subagent( For `pi-subagents`, the completion signal is the subagent result returned to the parent. After the result arrives, read the entity file and verify the stage report exactly as the shared core requires. Do not advance state based only on a cheerful worker summary. +Verifying the stage report is not the end of the parent's turn. Once the report is verified for a non-gated, non-terminal stage, the parent MUST continue the shared `## Completion and Gates` lifecycle in the same turn — advance the entity and dispatch the next stage (a fresh subagent when the next stage is `fresh: true`) — and only then return its final response. It does not return a completion-only result for the captain to resume from, unless the shared core's halt spans apply (next stage gated, terminal, blocked, or awaiting a captain decision). + For `pi-agent-teams`, completion is observed through the adapter's task/member notification and then verified against the entity file. The adapter should expose a clear completed/failed result to the first officer; the entity stage report remains the source of truth. ## Follow-up and Reuse