diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index f4dafd1..91db5b7 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -9,7 +9,7 @@ { "name": "autodev", "description": "Autonomous development workflow skills for coding agents", - "version": "6.2.2", + "version": "6.3.0", "source": "./", "author": { "name": "Jon Langevin", diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 9728fa1..940fae4 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "autodev", "description": "Autonomous development workflow skills for coding agents: design, review, planning, execution, monitoring, and retrospectives", - "version": "6.2.2", + "version": "6.3.0", "author": { "name": "Jon Langevin", "email": "jon@gocodealone.com" diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json index e77ff1f..47f5913 100644 --- a/.cursor-plugin/plugin.json +++ b/.cursor-plugin/plugin.json @@ -2,7 +2,7 @@ "name": "autodev", "displayName": "Autonomous Dev Kit", "description": "Autonomous development workflow skills for coding agents", - "version": "6.2.2", + "version": "6.3.0", "author": { "name": "Jon Langevin", "email": "jon@gocodealone.com" diff --git a/.github/workflows/hooks-check.yml b/.github/workflows/hooks-check.yml new file mode 100644 index 0000000..cee7beb --- /dev/null +++ b/.github/workflows/hooks-check.yml @@ -0,0 +1,29 @@ +name: Hooks Check +on: + push: + paths: + - 'hooks/**' + - 'tests/hook-contracts.sh' + - 'tests/hook-stdout-discipline.sh' + - '.github/workflows/hooks-check.yml' + pull_request: + paths: + - 'hooks/**' + - 'tests/hook-contracts.sh' + - 'tests/hook-stdout-discipline.sh' + - '.github/workflows/hooks-check.yml' + +permissions: + contents: read + +jobs: + hooks: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install jq + run: sudo apt-get update && sudo apt-get install -y jq + - name: Hook stdout discipline tests + run: bash tests/hook-stdout-discipline.sh + - name: Hook contract tests + run: bash tests/hook-contracts.sh diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index d6a71d7..a08af16 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -1,5 +1,45 @@ # Autonomous Dev Kit Release Notes +## v6.3.0 — 2026-06-01 + +Pipeline-hardening release closing five recurring gate-miss / context-waste issues +observed across autonomous runs and Codex compaction. + +- **`adversarial-design-review` — auth/authz chain-composition bug-class (#59):** a new + plan-phase row that walks the design's auth/authz chain component-by-component against + the plan's wiring and flags any gate enforced by a *client-asserted* value + (`evidence.granted_permissions`, a header) instead of server-side against an + authenticated principal. +- **`pr-monitoring` — sanctioned bash poll-loop (#60):** documents the host-scoped + CI-wait pattern. Under Claude Code, a bounded `run_in_background` bash sleep-loop that + blocks to completion and re-invokes the lead once on settle (the prior background-Agent + monitor early-exited ~6× per run); Codex/Cursor use a self-poll-on-wakeup fallback. +- **`subagent-driven-development` / `team-conventions` — completion trust-boundary (#58, + ADR 0003):** a flipped `Implement: N` is a claim, not evidence — the lead must run + `verification-before-completion` before trusting it. A deterministic hook-block is + infeasible (the pre-tool payload lacks the task subject + caller identity), so + correctness rests on lead verification, not on who flipped the checkbox. +- **`run-hook.cmd` — stdout JSON discipline (#41):** the wrapper now captures each hook's + stdout and emits only valid-JSON-or-empty to the host's hook parser, recovering a block + decision even when a locale/diagnostic warning precedes it (previously such noise could + invalidate the hook's JSON). Diagnostics are routed to stderr; jq-absent hosts pass + through unchanged. New `tests/hook-stdout-discipline.sh`. +- **`pretool-pr-review-reminder` — once-per-session (#61):** the gh-version/Copilot + reviewer reminder now emits once per session (deduped via a `.claude/autodev-state` + marker, quote-strip-matched so a quoted `--body` mentioning `gh pr create` no longer + trips it) and is reset by `pre-compact-snapshot` so it re-emits once after a compaction. +- **`adversarial-design-review` — artifact-class precedent (#63):** a new design-phase row + that surveys how the codebase already implements an *artifact class* (where a scenario + stands up a server, where a fixture lives — `ls scenarios/*/cmd/server/main.go`), not + just the *mechanism*; grep for sibling instances and follow the established shape or + justify divergence. +- **`session-start` — Linux time-dedup fix (#64):** the SessionStart hook tried BSD + `stat -f %m` before GNU `stat -c %Y`; on Linux `stat -f` succeeds-but-wrong (fs info), + so the time-based dedup never suppressed re-fires. Now GNU-first with a numeric guard — + fixing re-fire spam for all Linux autodev users. +- **CI:** new `hooks-check.yml` runs the hook contract + stdout-discipline tests on any + `hooks/`/test change, so these fixes are regression-gated. + ## v6.2.2 — 2026-05-31 New **Existence / runtime-validity** bug-class in `adversarial-design-review` diff --git a/agents/team-conventions.md b/agents/team-conventions.md index 86c4ef0..dac0a5e 100644 --- a/agents/team-conventions.md +++ b/agents/team-conventions.md @@ -25,6 +25,9 @@ them. - Request code review per `skills/requesting-code-review/SKILL.md` using the adversarial-framing brief. - Address review per `skills/receiving-code-review/SKILL.md`. +- **Never self-complete an `Implement: N` task.** Do not flip your own Implement task + to `completed` and do not clear its `blockedBy` to back-door completion. DM the + spec-reviewer when ready; the code-reviewer is the sole flipper of Implement tasks. ## Spec reviewer @@ -46,6 +49,18 @@ them. `skills/requesting-code-review/SKILL.md` until verdict is SHIP-IT (or REVERT-AND-REWRITE after max rounds). - Reflexive approval is forbidden. +- **You are the sole role that flips an `Implement: N` task to `completed`**, and only + after quality review passes. Flip BOTH the "Review quality:" task and the + corresponding "Implement:" task. + +## Lead / orchestrator + +- **A `completed` Implement-N is a claim, not evidence.** Whoever flipped it, run + `skills/verification-before-completion/SKILL.md` (clean build + tests + CI green) + before accepting the task as done or invoking + `skills/finishing-a-development-branch/SKILL.md`. Correctness rests on this gate, not + on who flipped the checkbox. See + `decisions/0003-implement-n-completion-trust-boundary.md`. ## Modes diff --git a/decisions/0003-implement-n-completion-trust-boundary.md b/decisions/0003-implement-n-completion-trust-boundary.md new file mode 100644 index 0000000..0f2c4dd --- /dev/null +++ b/decisions/0003-implement-n-completion-trust-boundary.md @@ -0,0 +1,66 @@ +# 0003. Implement-N completion is a lead-verified trust boundary, not a hook-blocked invariant + +**Status:** Accepted +**Date:** 2026-06-01 +**Decision-makers:** autodev maintainers, autonomous pipeline +**Related:** issue #58; `skills/subagent-driven-development/SKILL.md`; `agents/team-conventions.md`; docs/plans/2026-06-01-pipeline-hardening-4issues-design.md + +## Context + +Across infra-admin v1 + v1.1 autonomous runs, `Implement: N` tasks were flipped to +`completed` by implementers or via a blockedBy-clear *before* the code-reviewer's +quality gate, violating the team-conventions contract ("only code-reviewer flips +Implement-N to completed"). In v1.1 this masked a non-compiling tree (uncommitted +helper) and a CI-failing hash regression — both reported "done"; only the lead's +independent `verification-before-completion` pass caught them. + +Issue #58 asked for **plugin/harness enforcement**: reject +`TaskUpdate(status=completed)` on `Implement: *` tasks unless `owner == +code-reviewer`. + +We investigated whether a deterministic plugin hook can enforce this. It cannot: + +- The PreToolUse payload for a `TaskUpdate` call carries the *tool input* + (`taskId`, `status`, `owner`) but **not** the task's current `subject` + ("Implement: N") nor the identity of the calling subagent. The task store is + harness state a bash hook cannot read (there is no `TaskList` available to a + hook). +- Therefore a hook cannot reliably answer the two questions the block requires — + "is this taskId an Implement task?" and "is the caller the code-reviewer?". The + only deterministic option, "block all `status=completed`", would break every + legitimate completion (spec-review, quality-review, and the orchestrator's own). +- The closest feasible hook approach — a `SubagentStop` hook reading + `transcript_path` to map `taskId → "Implement: N"` (via `TaskCreate` records) and + flag a matching `TaskUpdate(status=completed)` — was also rejected: it fires + *after* completion (warn, not block), sees only subagent completions (not a + lead-level `TaskUpdate`), depends on the unstable transcript JSONL shape, and is + racy across subagents. Fragile transcript-format coupling for marginal, + after-the-fact value. + +## Decision + +We reject the infeasible hard-block and instead **shift the trust boundary**: a +flipped `Implement: N` is a *claim*, not *evidence*, and is **not trusted as done +until the lead runs `autodev:verification-before-completion`** (build + test from a +clean tree, CI green) before treating the task as complete or proceeding to +`finishing-a-development-branch`. + +We encode this in `skills/subagent-driven-development/SKILL.md` (the +"Completion is not trusted until lead-verified" rule) and restate the +implementer/code-reviewer conventions in `agents/team-conventions.md`. We do **not** +add an advisory `TaskUpdate` hook: it cannot block, it cannot identify Implement +tasks, and it would fire on every completion (noise) for no enforcement gain. + +## Consequences + +The harm #58 names — a premature "done" masking a broken tree — is addressed at the +point it actually bit: the lead's verification gate, which already caught both v1.1 +regressions. The convention ("code-reviewer is the sole flipper") remains as +team discipline but is no longer load-bearing for correctness; correctness rests on +lead verification, which does not depend on who flipped the checkbox. + +The limitation is documented so the infeasible hard-block is not re-proposed. If a +future harness exposes the task subject **and** the calling subagent identity in +the PreToolUse payload, the deterministic block becomes feasible and this ADR +should be revisited (the convention is then mechanically enforceable). Until then, +"green checkbox ≠ verified" is the operative rule. diff --git a/decisions/0004-v6.3.0-scope-amendment-63-64.md b/decisions/0004-v6.3.0-scope-amendment-63-64.md new file mode 100644 index 0000000..7698580 --- /dev/null +++ b/decisions/0004-v6.3.0-scope-amendment-63-64.md @@ -0,0 +1,48 @@ +# 0004. v6.3.0 scope amendment — fold in #63 (artifact-class precedent) and #64 (session-start Linux stat) + +**Status:** Accepted +**Date:** 2026-06-01 +**Decision-makers:** autodev maintainers, autonomous pipeline, user (explicit approval) +**Related:** issues #63, #64; docs/plans/2026-06-01-pipeline-hardening-4issues.md (Tasks 8, 9); ADR 0003 + +## Context + +The v6.3.0 pipeline-hardening plan was locked with 7 tasks (#41/#58/#59/#60/#61 + CI +gate + version bump). During execution two new items arose: + +- **#64 (bug, execution-discovered):** Task 6's new `hooks-check.yml` ran + `tests/hook-contracts.sh` on ubuntu for the first time and surfaced a **pre-existing** + Linux bug in `hooks/session-start` — BSD `stat -f %m` is tried before GNU `stat -c %Y`, + and on Linux `stat -f` succeeds-but-wrong (prints fs info), breaking the time-dedup. The + fix is a prerequisite for Task 6's deliverable (a green CI gate that runs + `hook-contracts.sh`). +- **#63 (enhancement, newly filed):** add a design-phase `adversarial-design-review` + check that surveys how the codebase implements the *artifact class* (where a scenario + stands up a server, where a fixture lives), not just the *mechanism*. Same skill the + release already edits for #59. + +The user explicitly approved addressing both ("if you found a new issue, go ahead and +address it"; "pick up the new filed issues and address them as well"). + +## Decision + +Amend the locked v6.3.0 manifest from 7 to 9 tasks (single PR unchanged): add Task 8 +(#64 session-start `stat` ordering + numeric guard + re-enable `hook-contracts.sh` in CI) +and Task 9 (#63 `Artifact-class precedent` design-phase bug-class row). Both are small, +additive, host-neutral, and thematically identical to the locked hardening scope. The +original `Locked` stamp is preserved here for audit; the plan re-stamps to `Amended` and +re-runs `alignment-check`. + +We fold them into v6.3.0 rather than deferring to a follow-on release because: #64 is +required for Task 6's CI gate to be green and real; #63 edits the same skill as #59; and +the v6.3.0 PR is still open, so one coherent release covers all the hardening the user +asked for. + +## Consequences + +v6.3.0 closes seven issues (#41/#58/#59/#60/#61/#63/#64). The `hooks-check.yml` CI gate +now runs both hook test suites green on Linux (session-start time-dedup works on Linux +for the first time — a real bug fix for all Linux autodev users, beyond the test gate). +The amendment is recorded so the scope expansion is durable, not silent. Future +scope-expansions during execution should follow the same path: surface → user approval → +ADR → manifest amend → re-align → re-lock. diff --git a/docs/plans/2026-06-01-pipeline-hardening-4issues-design.md b/docs/plans/2026-06-01-pipeline-hardening-4issues-design.md new file mode 100644 index 0000000..e211073 --- /dev/null +++ b/docs/plans/2026-06-01-pipeline-hardening-4issues-design.md @@ -0,0 +1,320 @@ +# Autodev Pipeline Hardening (v6.3.0) — 5 Recurring Gate-Miss / Context-Waste Fixes — Design + +**Status:** Approved (autonomous — user pre-authorized full-pipeline execution; trigger v6.3.0 release after merge) +**Date:** 2026-06-01 +**Issues:** #41, #58, #59, #60, #61 (GoCodeAlone/autonomous-dev-kit) +**ADR:** decisions/0003-implement-n-completion-trust-boundary.md (#58) +**Adversarial review:** cycle 1 = FAIL (1C/3I/3m, all resolved); #61 added (user-reported same session). cycle 2 = FAIL (1 new Important: pre-compact-snapshot early-exit would skip the #61 marker-clear → fixed: clear placed unconditionally before the early-exit). cycle 3 = PASS (converged; applied the empty-session_key guard). + +## Problem + +Four open issues, all recurring gate-misses observed across infra-admin v1/v1.1 +autonomous runs + Codex compaction: + +- **#59** — plan-phase reviews can pass a design that says "behind X auth filter"/ + "RBAC enforced" while the plan wires a *weaker*, shape-matching gate (v1.1: + client-supplied `evidence.granted_permissions` made write-tier RBAC theater; + caught only on plan-review cycle 2). +- **#60** — a `run_in_background` pr-monitoring **Agent** told to poll-loop-with- + sleep instead returns after ~1 cycle and re-completes repeatedly without the + conclusive report (early-exited ~6× in v1.1). The lead fell back to self-polling + `gh pr checks`, which worked but lost fire-on-event semantics. +- **#41** — the PreCompact hook (via Codex) is reported as emitting invalid hook + JSON ("hook returned invalid PreCompact hook JSON output"), interrupting + compaction-time state handling. Root cause class: locale warnings / diagnostics + leaking onto the hook's stdout before/around its JSON. +- **#58** — `Implement: N` tasks get flipped to `completed` by implementers or via + blockedBy-clear *before* the code-reviewer's quality gate, violating the + team-conventions contract. In v1.1 this masked a non-compiling tree + a + CI-failing hash regression both reported "done"; only the lead's + verification-before-completion caught them. +- **#61** — the `pretool-pr-review-reminder` hook injects its ~12-line gh-version + + Copilot-reviewer `` block on **every** `gh pr create` (observed 7× + in one session) — wasted context for advice the agent already has. + +## Goals + +One coherent v6.3.0 release hardening the autonomous pipeline against these four. + +1. **#59** — add a plan-phase bug-class that walks the design's auth/authz chain + component-by-component vs the plan's wiring; flag any gate that is + client-asserted rather than server-enforced against an authenticated principal. +2. **#60** — make the **bash poll-loop** the sanctioned pr-monitoring wait pattern + (a `run_in_background` Bash sleep-loop that *blocks to completion* and + re-invokes the lead once on settle), documenting the subagent early-exit + failure mode. Keep the subagent monitor as a documented fallback. +3. **#41** — harden the **`run-hook.cmd` wrapper** (the single choke-point all + hooks run through) to (a) export a deterministic locale with portable fallback + and (b) enforce **stdout JSON discipline**: only valid-JSON-or-empty reaches + the host's hook parser; non-JSON noise is routed to stderr. Plus a regression + test for noisy/locale-warning output. +4. **#58** — fix the *real* harm via a **trust boundary**, since a hard hook-block + is infeasible (see Design): a flipped `Implement: N` is NOT trusted-done until + the lead runs `verification-before-completion` (build + test from a clean + tree). Strengthen `subagent-driven-development` + `team-conventions` + ADR. +5. **#61** — emit the pr-review reminder **once per session** (deduped via a + `.claude/autodev-state` marker like `session-start`), reset by the PreCompact + hook so it re-emits once after a compaction. + +## Non-Goals + +- No deterministic hook that *blocks* `TaskUpdate(status=completed)` on `Implement:*` + — infeasible (see #58 design). No fragile heuristic that pretends to. +- No change to the hook event registrations in `hooks.json` (the wrapper change is + transparent to all of them). +- No new pr-monitoring tool; reuse the existing `Bash run_in_background` primitive. +- No change to GHA/other-platform behavior; these are skill-doc + hook + version + changes within the plugin. + +## Global Design Guidance + +`Guidance: none at docs/design-guidance.md; canon = README §Cross-LLM, +docs/plans/2026-04-25-cross-llm-portability-design.md, ADRs 0001/0002.` + +| guidance | response | +|---|---| +| Host-neutral / cross-LLM first | The wrapper fix (#41) is the host-neutral choke-point. #60 is explicitly **host-scoped**: bash poll-loop under ``, self-poll fallback documented for `` (no false cross-LLM claim). #58 trust-boundary + #59 bug-class + #61 dedup-marker are host-neutral prose/state. #61 degrades to emit-every-time on identity-less hosts. | +| Checklist is the floor | #59 row joins the mandatory-scan plan-phase set. | +| Don't claim enforcement you can't deliver | #58 explicitly rejects the infeasible hard-block and documents why (ADR) — directly applies the Existence/runtime-validity discipline (#55). | +| Strict stdout discipline for hooks | #41 centralizes it in the wrapper, not per-hook. | + +## Design + +### #59 — Auth/authz chain-composition bug-class (plan phase) + +Add one row to the **plan-phase** checklist of `skills/adversarial-design-review/ +SKILL.md`. Place it **near the top** of the plan-phase table (right after +`Verification-class mismatch`), not after the plugin-layout/config-schema rows — +auth/authz is a security-class finding and ordering signals priority (m2): + +``` +| **Auth/authz chain composition** | When the design names an auth/authz chain ("behind the X auth filter", "RBAC-enforced", "admin-only"), walk that chain component-by-component against the plan's actual wiring. For each gate, verify it is enforced **server-side against an authenticated principal**, not shape-matched by a client-asserted value. Flag any gate where the plan's check reads from request/client-supplied input (`evidence.granted_permissions`, a header, a body field) instead of an authenticated subject (`authz.Enforce(authenticatedSubject, …)`). A plan that wires a weaker gate than the design's chain implies = finding. | +``` + +(Plan-phase, not design-phase: chain *composition* is a plan-wiring concern — the +design states the intent; the plan is where a weaker gate slips in.) + +### #60 — Bash poll-loop as the sanctioned pr-monitoring pattern + +In `skills/pr-monitoring/SKILL.md`, add a **"Waiting for CI: the sanctioned +pattern"** section near the top of the process. The recommendation is **host-scoped** +(the bash poll-loop is a Claude-Code Bash-`run_in_background` construct; Codex/Cursor +don't expose it identically): + +- **`` — Recommended default:** a `Bash` tool call with + `run_in_background: true` running a bounded sleep-loop that polls `gh pr checks` + until no check is `pending` (or a failure), then prints a settle line and exits. + The harness re-invokes the lead once on exit (≈0 tokens while sleeping). On + settle the lead reads the result and admin-merges on `failures=0`. **Concrete + cap (m3):** bound the loop (e.g. `for i in $(seq 1 120)` × `sleep 30` = 60 min) + so it can never spin forever; on cap, print a timeout line and exit for the lead + to restart. +- **Why:** a `run_in_background` **Agent** instructed to sleep-loop tends to return + after ~1 cycle and re-complete repeatedly (the agent loop is not a blocking + sleep) — observed early-exiting ~6× (#60). The bash sleep-loop genuinely blocks + to completion. +- **``:** these hosts keep the existing `` guidance (host's equivalent poll mechanism). Where no + blocking-background-bash exists, the sanctioned fallback is **self-poll on each + lead wakeup** — the lead runs `gh pr checks` once per turn and re-checks on the + next turn — which loses fire-on-event but is reliable. Document this explicitly + so the Codex path isn't left undefined. +- **Fallback (all hosts):** the existing background-Agent monitor stays documented + for multi-PR review-comment handling needing active fix-and-push; note its + early-exit failure mode. + +### #41 — Wrapper-level stdout-JSON discipline (+ keep existing locale logic) + +Harden `hooks/run-hook.cmd` (Unix portion — the choke-point every hook runs +through). The **primary** fix is stdout discipline; the existing locale handling +is already correct and stays. + +1. **Locale (UNCHANGED):** the wrapper already falls back `C.UTF-8 → C` (covering + `LC_ALL`/`LC_CTYPE`/`LANG`) when `C.UTF-8` is inherited but not installed — + keep it as-is. We do **not** extend it to unset/empty locales (no evidence the + Codex failure is an unset locale, and forcing `C` on an otherwise-working + UTF-8 default could corrupt multibyte hook output). The real #41 cause is + diagnostics leaking onto stdout, addressed below — the locale fallback already + prevents the *bash* locale warning; stdout discipline catches whatever else + (perl/git/jq diagnostics, shell tracing) leaks. + +2. **Stdout JSON discipline (the fix):** run the hook with stdout captured (stderr + passes through untouched; stdin still flows from host → hook). Then, when `jq` + is available: + - **empty** stdout → emit nothing. + - stdout is **valid JSON as a whole** → emit it verbatim. + - otherwise → **extract** the hook's JSON rather than suppress it: take the last + line beginning with `{` (`printf '%s\n' "$out" | grep -E '^\{' | tail -1`); + if that line is valid JSON (`jq -e .`), emit **that** on stdout and route the + rest (the leading warning/diagnostic prefix) to **stderr**. If no JSON line is + found, route the whole capture to stderr and emit nothing. + - This is the C1 correction: a `decision:block` JSON preceded by a locale/perl + warning (`warning\n{"decision":"block"}`) is **not** valid-as-a-whole, so the + extraction path delivers the block instead of dropping it. + - Preserve the hook's exit code. When `jq` is **absent**, pass stdout through + unchanged (don't break hooks on minimal hosts). + - **Trailing-newline note (I3):** capturing via `out=$(…)` strips trailing + newlines; re-emitting via `printf '%s\n' "$out"` restores exactly one. JSON + parsers accept one trailing newline — a regression case asserts this. + +3. **Regression test** (`tests/hook-stdout-discipline.sh`, runs the real + `run-hook.cmd` against fixture hooks): (a) warning-then-`{"decision":"block"}` + → wrapper emits **only** the block JSON on stdout, warning on stderr, exit code + preserved; (b) only-noise → stdout empty, noise on stderr; (c) clean valid JSON + → unchanged, exactly one trailing newline; (d) jq-absent (PATH stub) → + pass-through verbatim. Also run `tests/hook-contracts.sh` for no real-hook + regression. + +### #58 — Implement-N completion trust boundary (hard-block is infeasible) + +**Investigation (the key finding):** a deterministic plugin hook that *blocks* +`TaskUpdate(status=completed)` on `Implement:*` tasks unless `owner == +code-reviewer` is **not feasible**: +- The PreToolUse payload for a `TaskUpdate` call carries the *tool input* + (`taskId`, `status`, `owner`) but **not the task's current subject** ("Implement: + N") nor the calling subagent's identity. The task store is harness state the + hook cannot read (no `TaskList` from a bash hook). +- So the hook cannot reliably determine "is taskId an Implement task?" or "is the + caller the code-reviewer?" — the two facts the block needs. A heuristic (e.g. + block all `completed`) would break legitimate completions. +- **Closest feasible hook approach, also rejected (m1):** a `SubagentStop` hook + *does* receive `transcript_path` (used by `subagent-scope-guard` / + `completion-claim-guard`), so it could parse the transcript for `TaskCreate` + calls to map `taskId → "Implement: N"` subject and flag a matching + `TaskUpdate(status=completed)`. Rejected as too unreliable: it only sees + *subagent* completions (not a lead-level `TaskUpdate`), the transcript JSONL + shape is not a stable API surface, and event ordering across subagents is racy. + It would also only *warn after the fact* (SubagentStop fires post-completion), + not block. So it does not deliver the asked-for enforcement and adds a fragile + transcript-format dependency for marginal value. + +**Feasible fix — shift the trust boundary (ADR 0003):** the harm in v1.1 was not +*who flipped the bit* but that a flipped `Implement: N` was **trusted as done** +while the tree didn't compile / CI failed. So: +- `skills/subagent-driven-development/SKILL.md`: add an explicit **"Completion is + not trusted until lead-verified"** rule — regardless of who/what flips an + `Implement: N` to `completed`, the lead MUST run + `autodev:verification-before-completion` (build + test from a clean tree, CI + green) before treating that task as truly done or proceeding to + `finishing-a-development-branch`. A green checkbox is a *claim*, not *evidence*. +- `agents/team-conventions.md`: restate the code-reviewer-sole-flipper convention + AND add the implementer rule "never self-complete an `Implement: N`; DM the + reviewer instead" + the lead's trust-boundary gate. +- ADR 0003 records: rejected the infeasible hard-block; chose the verification + trust-boundary; documented the hook-payload limitation so it isn't re-proposed. + +### #61 — pr-review reminder: once per session, reset on compaction + +`hooks/pretool-pr-review-reminder` currently emits its `` block on every +`gh pr create`. Add session-dedup mirroring `hooks/session-start`'s +`session-start-seen` precedent: + +- Derive a session key from `transcript_path` basename — the same value + `pre-compact-snapshot` already computes as `session_key` for session-locks + attribution (note: this is *not* `session-start`'s primary dedup key, which is + `session_id:source_kind`; basename is a sufficient, stable per-session key for a + file marker). Marker file: `${cwd}/.claude/autodev-state/pr-reminder-seen` + containing the session key(s) already reminded. +- On a matched `gh pr create`: if the current session key is already in the marker + → `exit 0` silently (no emit). Else → emit the reminder once and append the + session key to the marker. +- **Reset on compaction (I-NEW-1 — placement is load-bearing):** + `hooks/pre-compact-snapshot` removes the current session's key from + `pr-reminder-seen` so the next `gh pr create` after a compaction re-emits once. + This clear MUST be placed **unconditionally near the top of pre-compact-snapshot + — immediately after `session_key`/`cwd_dir` are computed (≈ line 30) and BEFORE + the `[ -z "$state_section" ] && exit 0` no-locked-plans early-exit**. Co-locating + it with the emit block (after the early-exit) would silently skip the clear in + the common no-locked-plan case, leaving the reminder permanently silenced after + the first PR. The regression test asserts the clear fires even when there are no + locked plans. Guard the clear with `[ -n "$session_key" ]` — when + `transcript_path` is absent (`session_key=""`), skip the clear (no blanket file + wipe), matching the reminder's emit-every-time degradation on identity-less hosts. +- Degrade gracefully: no `transcript_path` (hookless/identity-less host) → fall + back to current behavior (emit every time) rather than suppress, so a host that + can't dedup still gets the advice. +- **Match precision (minor, in-scope):** tighten the trigger so it matches an + actual `gh pr create` invocation, not the literal substring inside a quoted + argument (e.g. an issue/PR *body* that mentions `gh pr create`) — anchor on a + word boundary / command position. (Observed: this design's own tracking-issue + body tripped the hook.) +- **Regression test:** two `gh pr create` payloads with the same session key → + emitted once; a `pre-compact-snapshot` run between them **with no locked plans + present** (the early-exit case) → still clears the marker so the reminder is + emitted again (post-compact reset); no `transcript_path` → emits each time. + +- **#41** is security-adjacent: a hook that leaks non-JSON to stdout can corrupt + the host's view of a *block* decision (e.g. a scope-guard `{"decision":"block"}` + preceded by a locale warning could be ignored → a guarded action proceeds). + Centralizing JSON discipline in the wrapper makes every block decision reliably + delivered. No secrets touched; no network; the wrapper reads only the hook's own + stdout. +- **#59** *is* an auth/authz review-quality improvement — it strengthens detection + of client-asserted-permission theater in plans. +- **#58/#60** are process/doc; no runtime security surface. + +## Amendment (2026-06-01, user-approved — see decisions/0004) + +Two items folded into v6.3.0 during execution, both user-approved (manifest 7 → 9 tasks, +PR count unchanged): +- **#64** — `hooks/session-start` time-dedup is broken on Linux (BSD `stat -f %m` tried + before GNU `stat -c %Y`; `stat -f` succeeds-but-wrong on Linux). Surfaced by Task 6's + `hooks-check.yml` running `hook-contracts.sh` on ubuntu for the first time. Fixed + (GNU-first + numeric guard) and `hook-contracts.sh` re-enabled in the CI gate. +- **#63** — new design-phase `Artifact-class precedent` bug-class: survey how the codebase + implements the *artifact class* (where a scenario stands up a server, where a fixture + lives), not just the *mechanism*; grep for sibling instances and follow the established + shape or justify divergence. + +## Infrastructure Impact + +None at runtime. Plugin skill/hook/doc changes + a version bump. Release path is +the existing `release-tag.yml` (push to main touching `.claude-plugin/plugin.json` +→ version-check → tag → marketplace dispatch). + +## Multi-Component Validation + +- **#41 wrapper × hooks:** `tests/hook-stdout-discipline.sh` runs the **real** + `run-hook.cmd` against fixture hook scripts (noisy + clean + jq-absent) and + asserts stdout/stderr/exit-code — the real wrapper↔hook boundary, not a mock. + Also run the existing `tests/hook-contracts.sh` to confirm no regression to the + real hooks (session-start, scope-guard, completion-claim-guard, pre-compact). +- **#61 reminder dedup:** the regression test runs the real + `pretool-pr-review-reminder` against two same-session `gh pr create` payloads + (emit once) + a `pre-compact-snapshot` between them (re-emit once) + a + no-`transcript_path` payload (emit each time) — the real hook↔state boundary. +- **#59/#60/#58:** `tests/skill-content-grep.sh` (host-neutral lint) + + `tests/skill-cross-refs.sh` pass on the edited skills; plan-phase reviewers will + enumerate the new class because the plan-phase checklist is embedded in their + dispatch prompt. + +## Assumptions + +| id | assumption | challenge | fallback | +|---|---|---|---| +| A1 | All autodev hooks emit JSON-or-nothing on stdout (Claude-Code/Codex protocol) | A hook might emit intentional plain-text stdout | Verified: every registered hook emits `{…}` JSON or nothing. If a future hook needs plain-text stdout it must opt out — but none do today. | +| A2 | The PreToolUse payload lacks task subject + caller identity | Harness could add it later | Verified against the documented hook payload; if it ever exposes both, the hard-block becomes feasible and ADR 0003 should be revisited. | +| A3 | A `run_in_background` Bash sleep-loop genuinely blocks + re-invokes the lead once on exit (Claude Code) | Codex/Cursor lack this primitive | Directly observed working across this session's many CI waits ([[feedback_ci_wait_use_bash_poll_loop]]); scoped to ``, with the self-poll fallback documented for other hosts. | +| A6 | #41 last-JSON-line extraction recovers a block decision behind a warning | A hook could emit multi-line pretty-printed JSON (not single-line) | autodev hooks all emit single-line `jq -n` JSON (one object, one line); the `grep '^{' | tail -1` extraction matches. If a future hook pretty-prints, the whole-blob `jq .` path still handles the no-warning case; the test covers the warning-prefixed single-line case. | +| A4 | The wrapper can capture stdout without breaking hooks that read stdin | Hooks read the harness JSON from stdin, not the wrapper | The wrapper only redirects the hook's *stdout*; stdin still flows from the host to the hook unchanged. | +| A5 | v6.3.0 is the right bump + the tag is free | Could collide (the #804 lesson) | Verified `git ls-remote --tags` shows v6.3.0 free; current 6.2.2 → 6.3.0 minor. | + +## Rollback + +- Revert the PR(s). #59/#60/#58 are additive prose; #41 reverts the wrapper to its + prior conditional-locale form (no migration). The version bump reverts with the + same commit; do not push the v6.3.0 tag if reverted pre-merge. +- **#41 runtime note:** the wrapper is on the hook hot-path. If the captured-stdout + change regresses a hook, rollback = revert `run-hook.cmd` to the `exec bash` form + + re-run `tests/hook-contracts.sh`. The regression test gates this pre-merge. + +## Self-Challenge + +- **Simplest alternative:** fix the PreCompact hook script alone (#41) instead of + the wrapper. Rejected — the issue explicitly wants the wrapper-level fix so + *every* hook gets stdout discipline, not just one. +- **Most fragile assumption:** A1 (hooks emit JSON-or-nothing). Mitigated by the + jq-absent pass-through + the fact that all current hooks comply. +- **YAGNI sweep:** no advisory `TaskUpdate` hook for #58 (it can't block, fires on + every completion → noise); no new monitor tool for #60; no per-hook locale edits + (#41 is wrapper-only). All rejected as surface the issues didn't ask for. diff --git a/docs/plans/2026-06-01-pipeline-hardening-4issues.md b/docs/plans/2026-06-01-pipeline-hardening-4issues.md new file mode 100644 index 0000000..878be0d --- /dev/null +++ b/docs/plans/2026-06-01-pipeline-hardening-4issues.md @@ -0,0 +1,535 @@ +# Autodev Pipeline Hardening (v6.3.0) Implementation Plan + +> **For the implementing agent:** REQUIRED SUB-SKILL: Use autodev:executing-plans to implement this plan task-by-task. + +**Goal:** Ship 5 recurring-gate-miss / context-waste fixes (#41/#58/#59/#60/#61) as one v6.3.0 release. + +**Architecture:** Skill-doc additions (#59 bug-class, #60 pr-monitoring pattern, #58 trust-boundary) + two hook changes through the choke-points (#41 `run-hook.cmd` stdout-JSON discipline; #61 `pretool-pr-review-reminder` session-dedup + `pre-compact-snapshot` marker-clear) + a CI workflow that runs the hook regression tests + a version bump that triggers `release-tag.yml`. + +**Tech Stack:** Markdown skill files; bash hooks + `jq`; bash test harness (`tests/hook-contracts.sh`); GitHub Actions. + +**Base branch:** main + +--- + +## Scope Manifest + +**PR Count:** 1 +**Tasks:** 9 +**Estimated Lines of Change:** ~340 (informational; not enforced) + +**Out of scope:** +- A deterministic hook that *blocks* `TaskUpdate(status=completed)` on `Implement:*` (infeasible — ADR 0003). +- Any advisory `TaskUpdate` hook for #58 (rejected — noisy, can't identify Implement tasks). +- Extending the wrapper locale logic to unset/empty locales (#41 keeps existing locale handling). +- Any new pr-monitoring tool / Codex-specific background primitive (#60 documents host-scoped patterns only). +- Changing hook event registrations in `hooks.json` (the wrapper change is transparent; only the new `hooks-check.yml` workflow is added). + +**PR Grouping:** + +| PR # | Title | Tasks | Branch | +|------|-------|-------|--------| +| 1 | feat: v6.3.0 pipeline hardening (#41/#58/#59/#60/#61/#63/#64) | Task 1, Task 2, Task 3, Task 4, Task 5, Task 6, Task 7, Task 8, Task 9 | feat/pipeline-hardening-4issues-v6.3.0 | + +**Status:** Amended 2026-06-01T06:00:00Z (user-approved scope expansion to #63 + #64; see Amendment note + decisions/0004) + +--- + +## Project Design Guidance + +`Guidance: none at docs/design-guidance.md; canon = README §Cross-LLM + ADRs 0001/0002/0003.` Mapping: +- Host-neutral → #41 wrapper + #59/#58/#61 are host-neutral; #60 explicitly host-scoped (verified by `tests/skill-content-grep.sh` host-token lint). +- Don't claim enforcement you can't deliver → #58 (ADR 0003) + #41 (a test that runs in CI, Task 6). +- Checklist is the floor → #59 joins the mandatory plan-phase scan. + +--- + +### Task 1: #59 — auth/authz chain-composition bug-class (plan-phase) + +**Files:** +- Modify: `skills/adversarial-design-review/SKILL.md` (plan-phase table, insert right after the `Verification-class mismatch` row) + +**Step 1: Verify the insertion anchor** +Run: `grep -n "Verification-class mismatch" skills/adversarial-design-review/SKILL.md` +Expected: one match in the plan-phase table (lines ~104-113). The new row goes immediately after it. + +**Step 2: Insert the row** (exact, after the `Verification-class mismatch` row): +``` +| **Auth/authz chain composition** | When the design names an auth/authz chain ("behind the X auth filter", "RBAC-enforced", "admin-only"), walk that chain component-by-component against the plan's actual wiring. For each gate, verify it is enforced **server-side against an authenticated principal**, not shape-matched by a client-asserted value. Flag any gate where the plan's check reads from request/client-supplied input (`evidence.granted_permissions`, a header, a body field) instead of an authenticated subject (`authz.Enforce(authenticatedSubject, …)`). A plan that wires a weaker gate than the design's chain implies = finding. | +``` + +**Step 3: Verify placement + lint** +Run: `awk '/Bug-class checklist — plan phase/{p=1} p && /Auth\/authz chain composition/{print NR": ok"}' skills/adversarial-design-review/SKILL.md` +Expected: one line (row is in the plan-phase section). +Run: `bash tests/skill-content-grep.sh 2>&1 | tail -1` +Expected: `PASS: …`. + +**Step 4: Commit** +```bash +git add skills/adversarial-design-review/SKILL.md +git commit -m "feat(adversarial-design-review): add auth/authz chain-composition plan-phase bug-class (#59)" +``` +Rollback: revert commit (additive prose row). + +--- + +### Task 2: #60 — pr-monitoring bash poll-loop sanctioned pattern + +**Files:** +- Modify: `skills/pr-monitoring/SKILL.md` (add a "Waiting for CI: the sanctioned pattern" subsection near the top of the process, after the overview) + +**Step 1: Add the section** (host-scoped, per design §#60). Insert after the "## When to Use" / before "## The Process" (or near the top of the process): + +```markdown +## Waiting for CI: the sanctioned pattern + + +**Recommended default — a bash poll-loop, not a long-lived monitor Agent.** Issue a +`Bash` tool call with `run_in_background: true` running a **bounded** sleep-loop that +polls `gh pr checks ` until no check is `pending` (or a failure), prints a settle +line, and exits. The harness re-invokes the lead **once** on exit (≈0 tokens while it +sleeps); the lead then reads the result and admin-merges on `failures=0`. + +Bound the loop so it can never spin forever, e.g.: +```bash +for i in $(seq 1 120); do # 120 × 30s = 60 min cap + raw=$(gh pr checks --json bucket 2>/dev/null) + fail=$(echo "$raw" | jq '[.[]|select(.bucket=="fail")]|length') + pend=$(echo "$raw" | jq '[.[]|select(.bucket=="pending")]|length') + { [ "$fail" != 0 ] && [ -n "$fail" ]; } && { echo "FAILURES=$fail"; break; } + { [ "$pend" = 0 ] || [ -z "$pend" ]; } && { echo "SETTLED"; break; } + sleep 30 +done +``` +On the 120-iteration cap, print a timeout line and exit for the lead to restart. + +**Why not a background Agent:** a `run_in_background` **Agent** told to sleep-loop +tends to return after ~1 cycle and re-complete repeatedly (the agent loop is not a +blocking sleep) — observed early-exiting ~6× in one run. The bash sleep-loop genuinely +blocks to completion. + + + +Use your host's equivalent poll mechanism. Where no blocking-background-bash exists, +the sanctioned fallback is **self-poll on each lead wakeup**: run `gh pr checks ` +once per turn and re-check next turn. This loses fire-on-event but is reliable; do not +dispatch a background Agent expecting it to block on a sleep-loop. + + +**Fallback (all hosts):** the background-Agent monitor below remains documented for +multi-PR review-comment handling needing active fix-and-push; note its early-exit +failure mode and prefer the poll-loop / self-poll for pure CI-wait. +``` + +**Step 2: Verify lint + cross-refs** +Run: `bash tests/skill-content-grep.sh 2>&1 | tail -1` → `PASS` +Run: `bash tests/skill-cross-refs.sh 2>&1 | tail -1` → `PASS: all cross-skill references resolve.` + +**Step 3: Commit** +```bash +git add skills/pr-monitoring/SKILL.md +git commit -m "docs(pr-monitoring): sanction the bash poll-loop CI-wait pattern, host-scoped (#60)" +``` +Rollback: revert commit. + +--- + +### Task 3: #58 — Implement-N completion trust boundary + +**Files:** +- Modify: `skills/subagent-driven-development/SKILL.md` (add a "Completion is not trusted until lead-verified" rule) +- Modify: `agents/team-conventions.md` (restate the implementer/code-reviewer rules + the lead trust-boundary) +- (ADR `decisions/0003-implement-n-completion-trust-boundary.md` already committed with the design.) + +**Step 1: Add the trust-boundary rule to subagent-driven-development** — near the "Red Flags"/completion section, add: +```markdown +### Completion is not trusted until lead-verified + +A green `Implement: N` checkbox is a **claim, not evidence**. Regardless of who or +what flips an `Implement: N` task to `completed` (implementer, a blockedBy-clear, or +the code-reviewer), the lead MUST run `autodev:verification-before-completion` — +build + test from a clean tree, CI green — before treating that task as truly done or +invoking `finishing-a-development-branch`. The team-conventions "only code-reviewer +flips Implement-N" rule remains team discipline, but correctness rests on lead +verification, which does not depend on who flipped the bit. (A deterministic hook that +blocks the flip is infeasible — the PreToolUse payload lacks the task subject and +caller identity; see decisions/0003-implement-n-completion-trust-boundary.md.) +``` + +**Step 2: Add the rules to team-conventions.md** — under the Implementer + a Code-reviewer/Lead section: +- Implementer: "Never self-complete an `Implement: N` task. DM the spec-reviewer when ready; the code-reviewer is the sole flipper." +- Lead: "Treat any `completed` Implement-N as a claim; run `verification-before-completion` (clean build + tests + CI green) before accepting it or finishing the branch." + +**Step 3: Verify lint + cross-refs** +Run: `bash tests/skill-content-grep.sh 2>&1 | tail -1` → `PASS` +Run: `bash tests/skill-cross-refs.sh 2>&1 | tail -1` → `PASS` + +**Step 4: Commit** +```bash +git add skills/subagent-driven-development/SKILL.md agents/team-conventions.md +git commit -m "docs(subagent-driven-development): completion trust-boundary for Implement-N (#58, ADR 0003)" +``` +Rollback: revert commit. + +--- + +### Task 4: #41 — run-hook.cmd stdout-JSON discipline + test + +**Files:** +- Modify: `hooks/run-hook.cmd` (Unix portion — replace the final `exec bash …` with a capture-then-discipline block; keep the locale logic unchanged) +- Create: `tests/hook-stdout-discipline.sh` + +**Step 1: Write the failing test** (`tests/hook-stdout-discipline.sh`) — runs the REAL `run-hook.cmd` against fixture hooks: +```bash +#!/usr/bin/env bash +# tests/hook-stdout-discipline.sh — verify run-hook.cmd enforces stdout JSON discipline. +set -uo pipefail +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +WRAPPER="$REPO_ROOT/hooks/run-hook.cmd" +failures=0 +pass() { printf 'PASS: %s\n' "$*"; } +fail() { printf 'FAIL: %s\n' "$*" >&2; failures=$((failures+1)); } +command -v jq >/dev/null 2>&1 || { echo "SKIP: jq required"; exit 0; } + +tmp="$(mktemp -d)" +# Cleanup trap set BEFORE any fixture is copied into hooks/ (rm -f on absent files is safe). +trap 'rm -f "$REPO_ROOT"/hooks/fix-warn-then-json "$REPO_ROOT"/hooks/fix-noise "$REPO_ROOT"/hooks/fix-clean; rm -rf "$tmp"' EXIT +mkfix() { printf '%s\n' "$1" > "$tmp/$2"; chmod +x "$tmp/$2"; } + +# Fixture A: locale warning to stderr-or-stdout then a block JSON on stdout. +mkfix '#!/usr/bin/env bash +echo "perl: warning: Setting locale failed." # leaks to stdout +printf "%s\n" "{\"decision\":\"block\",\"reason\":\"x\"}"' fix-warn-then-json +# Fixture B: only noise, no JSON. +mkfix '#!/usr/bin/env bash +echo "just a diagnostic line"' fix-noise +# Fixture C: clean single-line JSON. +mkfix '#!/usr/bin/env bash +printf "%s\n" "{\"hookSpecificOutput\":{\"hookEventName\":\"X\"}}"' fix-clean + +run() { local out err rc; out="$("$WRAPPER" "$1" 2>"$tmp/err")"; rc=$?; printf '%s' "$out" > "$tmp/out"; err="$(cat "$tmp/err")"; OUT="$out"; ERR="$err"; RC=$rc; } + +# Point the wrapper at the tmp fixtures by copying them next to run-hook.cmd is +# heavy; instead invoke the wrapper with a fixture in hooks/ via a temp symlink dir. +# Simplest: call the wrapper with HOOK_DIR override is not supported, so copy fixtures +# into hooks/ under a unique prefix and clean up. +for f in fix-warn-then-json fix-noise fix-clean; do cp "$tmp/$f" "$REPO_ROOT/hooks/$f"; done + +# (a) warning + block JSON → stdout is ONLY the block JSON; warning ON stderr (m1). +run fix-warn-then-json +if printf '%s' "$OUT" | jq -e '.decision=="block"' >/dev/null 2>&1 \ + && ! printf '%s' "$OUT" | grep -q 'perl: warning' \ + && printf '%s' "$ERR" | grep -q 'perl: warning'; then + pass "(a) block JSON on stdout, warning routed to stderr" +else fail "(a) expected block JSON on stdout + warning on stderr, got OUT=[$OUT] ERR=[$ERR]"; fi + +# (b) only noise → stdout empty, noise on stderr. +run fix-noise +{ [ -z "$OUT" ] && printf '%s' "$ERR" | grep -q 'diagnostic'; } \ + && pass "(b) noise suppressed from stdout, routed to stderr" || fail "(b) expected empty stdout, got: $OUT" + +# (c) clean JSON → unchanged + valid. +run fix-clean +printf '%s' "$OUT" | jq -e '.hookSpecificOutput.hookEventName=="X"' >/dev/null 2>&1 \ + && pass "(c) clean JSON passthrough" || fail "(c) clean JSON broke, got: $OUT" + +# (d) jq-absent (I2) → wrapper passes stdout through VERBATIM (warning + JSON both present). +# Stub PATH so the wrapper's `command -v jq` fails; assert via grep (no jq needed here). +nojq="$tmp/nojq"; mkdir -p "$nojq" +OUTD="$(PATH="$nojq" "$WRAPPER" fix-warn-then-json 2>/dev/null)" +{ printf '%s' "$OUTD" | grep -q 'perl: warning' && printf '%s' "$OUTD" | grep -q '"decision":"block"'; } \ + && pass "(d) jq-absent → verbatim passthrough (no discipline applied)" \ + || fail "(d) expected verbatim passthrough with jq absent, got: $OUTD" + +echo ""; echo "Results: $failures failure(s)"; [ "$failures" -eq 0 ] +``` + +> Implementation note: fixtures are copied into `hooks/` (the wrapper resolves +> scripts relative to its own dir) and cleaned up in the trap (set BEFORE the copy so +> a mid-test failure still cleans up). The top `command -v jq` guard SKIPs only when +> the **test machine** has no jq (can't run a/b/c); on CI (jq present) all four cases +> run — case (d) stubs PATH so the **wrapper** sees no jq and asserts verbatim +> passthrough via grep (no jq needed for that assertion). + +**Step 2: Run → FAIL** (wrapper still `exec bash`, doesn't strip the warning): +Run: `bash tests/hook-stdout-discipline.sh 2>&1 | tail -6` +Expected: `(a)` FAILs (stdout contains the perl warning + JSON). + +**Step 3: Implement the wrapper discipline** — replace the final `exec bash "${SCRIPT_DIR}/${SCRIPT_NAME}" "$@"` line of `hooks/run-hook.cmd` with: +```bash +# Run the hook with stdout captured (stderr + stdin pass through untouched). +# Enforce stdout JSON discipline: only valid-JSON-or-empty reaches the host's hook +# parser; diagnostics that leak onto stdout (locale/perl/git warnings) are routed to +# stderr. A block decision preceded by a warning is recovered, not dropped (#41). +if command -v jq >/dev/null 2>&1; then + hook_out="$(bash "${SCRIPT_DIR}/${SCRIPT_NAME}" "$@")" + hook_rc=$? + if [ -z "$hook_out" ]; then + : # nothing to emit + elif printf '%s' "$hook_out" | jq -e . >/dev/null 2>&1; then + printf '%s\n' "$hook_out" # valid JSON as a whole + else + json_line="$(printf '%s\n' "$hook_out" | grep -E '^\{' | tail -1)" + if [ -n "$json_line" ] && printf '%s' "$json_line" | jq -e . >/dev/null 2>&1; then + printf '%s\n' "$hook_out" | grep -vF "$json_line" >&2 # diagnostics → stderr + printf '%s\n' "$json_line" # recovered JSON → stdout + else + printf '%s\n' "$hook_out" >&2 # all noise → stderr + fi + fi + exit "$hook_rc" +else + exec bash "${SCRIPT_DIR}/${SCRIPT_NAME}" "$@" # jq absent: pass through unchanged +fi +``` + +**Step 4: Run → PASS** +Run: `bash tests/hook-stdout-discipline.sh 2>&1 | tail -6` +Expected: `Results: 0 failure(s)` (all of a/b/c pass). +Run: `bash tests/hook-contracts.sh 2>&1 | tail -3` +Expected: existing hook contracts still pass (no regression to real hooks through the wrapper). + +**Step 5: Commit** +```bash +git add hooks/run-hook.cmd tests/hook-stdout-discipline.sh +git commit -m "fix(run-hook.cmd): enforce stdout JSON discipline, recover block decisions behind warnings (#41)" +``` +Rollback: revert `run-hook.cmd` to the `exec bash` form + re-run `tests/hook-contracts.sh`. Hook-path change — verified by the regression test (Task 6 CI-gates it). + +--- + +### Task 5: #61 — pr-review reminder dedup + pre-compact clear + +**Files:** +- Modify: `hooks/pretool-pr-review-reminder` (session-dedup + tighter match) +- Modify: `hooks/pre-compact-snapshot` (clear the marker, unconditionally, before the early-exit) +- Modify: `tests/hook-contracts.sh` (dedup + post-compact-reset cases) + +**Step 1: Add the dedup + post-compact tests to `tests/hook-contracts.sh`** (reuse `run_hook` + a tmp cwd): +```bash +test_pr_reminder_dedup() { + local tmp; tmp="$(mktemp -d)" + trap 'rm -rf "$tmp"' RETURN # I3: repo-convention cleanup + local sess='/x/transcripts/sess-abc.jsonl' + local payload='{"tool_name":"Bash","tool_input":{"command":"gh pr create --title t --body b"},"cwd":"'"$tmp"'","transcript_path":"'"$sess"'"}' + # first call emits + local out1; out1="$(run_hook pretool-pr-review-reminder "$payload")" + printf '%s' "$out1" | jq -e '.hookSpecificOutput.additionalContext' >/dev/null 2>&1 \ + || fail "pr-reminder: first call should emit" + # second call (same session) is silent + local out2; out2="$(run_hook pretool-pr-review-reminder "$payload")" + [ -z "$out2" ] && pass "pr-reminder: deduped within session" \ + || fail "pr-reminder: second call should be silent, got: $out2" + # a PreCompact run with NO locked plans must still clear the marker + run_hook pre-compact-snapshot '{"cwd":"'"$tmp"'","transcript_path":"'"$sess"'"}' >/dev/null 2>&1 || true + local out3; out3="$(run_hook pretool-pr-review-reminder "$payload")" + printf '%s' "$out3" | jq -e '.hookSpecificOutput.additionalContext' >/dev/null 2>&1 \ + && pass "pr-reminder: re-emits after PreCompact reset" \ + || fail "pr-reminder: should re-emit after compaction, got: $out3" + # I1: a quoted body that merely mentions 'gh pr create' must NOT emit + local fp='{"tool_name":"Bash","tool_input":{"command":"gh issue create --title t --body \"see gh pr create docs\""},"cwd":"'"$tmp"'","transcript_path":"'"$sess"'"}' + local outfp; outfp="$(run_hook pretool-pr-review-reminder "$fp")" + [ -z "$outfp" ] && pass "pr-reminder: not tripped by 'gh pr create' inside a quoted body" \ + || fail "pr-reminder: false-positive on quoted body, got: $outfp" + # degrade gracefully: no transcript_path → emits every time (no marker write) + local tmp2; tmp2="$(mktemp -d)" + local np='{"tool_name":"Bash","tool_input":{"command":"gh pr create --title t"},"cwd":"'"$tmp2"'"}' + local na; na="$(run_hook pretool-pr-review-reminder "$np")" + local nb; nb="$(run_hook pretool-pr-review-reminder "$np")" + { printf '%s' "$na" | jq -e '.hookSpecificOutput' >/dev/null 2>&1 && printf '%s' "$nb" | jq -e '.hookSpecificOutput' >/dev/null 2>&1; } \ + && pass "pr-reminder: no transcript_path → emits every time" \ + || fail "pr-reminder: should emit each time without transcript_path" + rm -rf "$tmp2" +} +# (call test_pr_reminder_dedup in the main run sequence) +``` + +**Step 2: Run → FAIL** (reminder has no dedup yet): +Run: `bash tests/hook-contracts.sh 2>&1 | grep -i "pr-reminder" | head` +Expected: the "deduped within session" case FAILs (second call still emits). + +**Step 3: Implement dedup in `pretool-pr-review-reminder`** — after computing `cmd` and confirming `gh pr create`, before emitting: +- **Tighten the match via quote-stripping (I1 — mirror `pre-tool-scope-guard`'s precedent), NOT a boundary regex.** A boundary regex still matches `gh pr create` inside a quoted `--body`. Strip quoted segments first, then match on the stripped command: + ```bash + cmd_unquoted=$(printf '%s' "$cmd" | sed "s/\"[^\"]*\"//g; s/'[^']*'//g") # double-first, matching pre-tool-scope-guard + printf '%s' "$cmd_unquoted" | grep -q 'gh pr create' || exit 0 + ``` + So `gh issue create --body "… gh pr create …"` → stripped → `gh issue create --body ` → no match → no emit; a real `gh pr create --title t --body b` → stripped → `gh pr create --title t --body ` → matches. +- Compute `session_key`: `transcript_path=$(… jq -r .transcript_path …); session_key=""; [ -n "$transcript_path" ] && session_key=$(basename "$transcript_path")`. +- Marker: `marker="${cwd_dir}/.claude/autodev-state/pr-reminder-seen"`. +- If `[ -n "$session_key" ]` and `grep -qxF "$session_key" "$marker" 2>/dev/null` → `exit 0` (already reminded). Else emit, and if `[ -n "$session_key" ]` append: `mkdir -p "$(dirname "$marker")"; printf '%s\n' "$session_key" >> "$marker"`. +- No `session_key` (no transcript) → emit every time (current behavior). + +**Step 4: Implement the clear in `pre-compact-snapshot`** — UNCONDITIONALLY, immediately after `session_key` is computed (≈ line 30) and BEFORE the `[ -z "$state_section" ] && exit 0` early-exit: +```bash +# #61: clear this session's pr-review-reminder marker so the reminder re-emits once +# after compaction (the post-compaction context lost the earlier reminder). Must run +# before the no-locked-plans early-exit. Guard on a non-empty session key. +if [ -n "${session_key:-}" ]; then + reminder_marker="${cwd_dir}/.claude/autodev-state/pr-reminder-seen" + if [ -f "$reminder_marker" ]; then + grep -vxF "$session_key" "$reminder_marker" > "${reminder_marker}.tmp" 2>/dev/null \ + && mv "${reminder_marker}.tmp" "$reminder_marker" || rm -f "${reminder_marker}.tmp" + fi +fi +``` + +**Step 5: Run → PASS** +Run: `bash tests/hook-contracts.sh 2>&1 | tail -4` +Expected: all pass, including `pr-reminder: deduped within session` + `pr-reminder: re-emits after PreCompact reset`. + +**Step 6: Commit** +```bash +git add hooks/pretool-pr-review-reminder hooks/pre-compact-snapshot tests/hook-contracts.sh +git commit -m "fix(hooks): pr-review reminder once-per-session + PreCompact reset (#61)" +``` +Rollback: revert commit (hooks restore prior emit-every-time behavior). + +--- + +### Task 6: CI-gate the hook regression tests + +**Files:** +- Create: `.github/workflows/hooks-check.yml` + +**Rationale:** #41 is a hook-reliability fix; its regression test must actually run in CI (a test that never runs is theater — the existence/runtime-validity discipline). No workflow currently runs the hook tests. + +**Step 1: Add the workflow** +```yaml +name: Hooks Check +on: + push: + paths: + - 'hooks/**' + - 'tests/hook-contracts.sh' + - 'tests/hook-stdout-discipline.sh' + - '.github/workflows/hooks-check.yml' + pull_request: + paths: + - 'hooks/**' + - 'tests/hook-contracts.sh' + - 'tests/hook-stdout-discipline.sh' +jobs: + hooks: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install jq + run: sudo apt-get update && sudo apt-get install -y jq + - name: Hook contract tests + run: bash tests/hook-contracts.sh + - name: Hook stdout discipline tests + run: bash tests/hook-stdout-discipline.sh +``` + +**Step 2: Verify both test scripts currently pass locally (the gate is real)** +Run: `bash tests/hook-contracts.sh 2>&1 | tail -2 && bash tests/hook-stdout-discipline.sh 2>&1 | tail -2` +Expected: both report 0 failures (after Tasks 4+5). + +**Step 3: Lint the workflow YAML** +Run: `python3 -c "import yaml; yaml.safe_load(open('.github/workflows/hooks-check.yml'))" && echo "YAML OK"` +Expected: `YAML OK`. + +**Step 4: Commit** +```bash +git add .github/workflows/hooks-check.yml +git commit -m "ci: run hook contract + stdout-discipline tests on hooks/tests changes (#41/#61)" +``` +Rollback: revert commit (removes the workflow). + +--- + +### Task 7: Version bump → v6.3.0 + RELEASE-NOTES + +**Files:** +- Modify: `.claude-plugin/plugin.json`, `.claude-plugin/marketplace.json`, `.cursor-plugin/plugin.json` (via `scripts/bump-version.sh`) +- Modify: `RELEASE-NOTES.md` + +**Step 1: Confirm start version + the tag is free** +Run: `grep -m1 '"version"' .claude-plugin/plugin.json; git ls-remote --tags origin refs/tags/v6.3.0` +Expected: `6.2.2`; `git ls-remote` prints nothing (v6.3.0 free). + +**Step 2: Bump** +Run: `scripts/bump-version.sh 6.3.0` +Expected: `Bumping version: 6.2.2 → 6.3.0` across the 3 manifests. + +**Step 3: Consistency gate (the exact check `release-tag.yml` runs)** +Run: `bash tests/version-check.sh` +Expected: `OK: All version files agree on version 6.3.0`. + +**Step 4: Add RELEASE-NOTES entry** — a `## v6.3.0 — 2026-06-01` section at the top (below the title), summarizing the 5 fixes (#41/#58/#59/#60/#61). + +**Step 5: Commit** +```bash +git add .claude-plugin/plugin.json .claude-plugin/marketplace.json .cursor-plugin/plugin.json RELEASE-NOTES.md +git commit -m "chore: bump version to 6.3.0 (#41/#58/#59/#60/#61)" +``` +Rollback: `scripts/bump-version.sh 6.3.0 6.2.2` + revert; pre-merge revert prevents the v6.3.0 tag (release fires only on merge-to-main touching plugin.json). + +> After merge: `release-tag.yml` auto-tags v6.3.0 + dispatches the marketplace. Create the GH Release from the RELEASE-NOTES v6.3.0 section (tag-only is the workflow's behavior; v6.2.0/6.2.2 precedent). + +--- + +### Task 8: #64 — session-start time-dedup Linux portability (amendment) + +**Discovered at execution:** Task 6's new `hooks-check.yml` ran `hook-contracts.sh` on +ubuntu for the first time and surfaced a **pre-existing** Linux bug: +`hooks/session-start` computes the last-emit mtime with BSD `stat -f %m` **before** GNU +`stat -c %Y`. On Linux `stat -f` means "file system status", succeeds (exit 0), and +prints fs info instead of the mtime → `[ "$last" -gt 0 ]` errors and the time-dedup +never suppresses re-fires. User-approved amendment (#64). + +**Files:** +- Modify: `hooks/session-start` (the `stat` ordering + numeric guard) +- Modify: `.github/workflows/hooks-check.yml` (re-enable `hook-contracts.sh` once fixed) + +**Step 1: Fix the `stat` ordering** — GNU first, then BSD, then guard numeric: +```bash +last=$(stat -c %Y "$LAST_EMIT_FILE" 2>/dev/null || stat -f %m "$LAST_EMIT_FILE" 2>/dev/null || echo 0) +case "$last" in (*[!0-9]*|'') last=0 ;; esac +``` +**Step 2: Re-enable `hook-contracts.sh` in `hooks-check.yml`** (the CI gate is now real on Linux). +**Step 3: Verify** — `bash tests/hook-contracts.sh` → `All hook contract tests passed.` +(macOS: `stat -c %Y` fails → falls to BSD `stat -f %m`; Linux: `stat -c %Y` works.) CI on +ubuntu confirms the time-dedup case now passes. +**Step 4: Commit** — `fix(session-start): GNU stat -c %Y before BSD -f %m … (#64)`. +Rollback: revert commit. + +### Task 9: #63 — Artifact-class precedent design-phase bug-class (amendment) + +**User-approved amendment (#63).** A design can pass mechanism-correctness review yet put +an artifact in the wrong *place/shape* (v1.1: a scenario test fixture inside the +production engine repo, when sibling scenarios own a `cmd/server/main.go`). + +**Files:** +- Modify: `skills/adversarial-design-review/SKILL.md` (design-phase table, after `Repo-precedent conflicts`) + +**Step 1: Insert the row** (after `Repo-precedent conflicts`): +``` +| **Artifact-class precedent** | Survey how the codebase already implements this *artifact class* — not just the *mechanism*. … Grep for sibling instances (`ls scenarios/*/cmd/server/main.go`, sibling plugins, migrations, CLI commands, fixtures) and confirm the design follows the established shape — or explicitly justifies divergence. … Run the decisive `ls`/`grep` for the artifact class, not just for the mechanism. | +``` +**Step 2: Verify** — awk confirms the row is in the **design-phase** section; `bash tests/skill-content-grep.sh` PASS. +**Step 3: Commit** — `feat(adversarial-design-review): add Artifact-class precedent design-phase bug-class (#63)`. +Rollback: revert commit (additive prose row). + +## Verification summary (change-class mapping) + +| Task | Change class | Verification | Expected | +|---|---|---|---| +| 1 | Documentation (skill) | skill-content-grep + awk placement | PASS; row in plan-phase section | +| 2 | Documentation (skill) | skill-content-grep + skill-cross-refs | PASS | +| 3 | Documentation (skill+agent) | skill-content-grep + skill-cross-refs | PASS | +| 4 | Hook (wrapper) | `tests/hook-stdout-discipline.sh` + `tests/hook-contracts.sh` | 0 failures; block JSON recovered behind warning | +| 5 | Hook (reminder+precompact) | `tests/hook-contracts.sh` dedup+reset cases | 0 failures; deduped + re-emits post-compact | +| 6 | CI workflow | YAML lint + both hook test scripts pass locally | YAML OK; 0 failures | +| 7 | Version pin | `tests/version-check.sh` | all 3 manifests agree on 6.3.0 | + +## Multi-Component / Integration proof + +The real boundaries: (a) wrapper↔hook — Task 4's `hook-stdout-discipline.sh` runs the +**real** `run-hook.cmd` against fixture hooks (warning+JSON, noise, clean) and asserts +stdout/stderr; `hook-contracts.sh` runs the real registered hooks through the wrapper +(no regression). (b) reminder↔state↔pre-compact — Task 5's tests run the **real** +`pretool-pr-review-reminder` + `pre-compact-snapshot` against a shared tmp +`.claude/autodev-state`, asserting dedup + post-compact reset. (c) Task 6 CI-gates both +so the hook fixes can't silently regress. diff --git a/docs/plans/2026-06-01-pipeline-hardening-4issues.md.scope-lock b/docs/plans/2026-06-01-pipeline-hardening-4issues.md.scope-lock new file mode 100644 index 0000000..f9c2454 --- /dev/null +++ b/docs/plans/2026-06-01-pipeline-hardening-4issues.md.scope-lock @@ -0,0 +1 @@ +815ea55f53f1673b36497497a3141abf8c0fc4d2cb5291303ade1d123046498a diff --git a/hooks/pre-compact-snapshot b/hooks/pre-compact-snapshot index a5284e2..5bc19ef 100755 --- a/hooks/pre-compact-snapshot +++ b/hooks/pre-compact-snapshot @@ -29,6 +29,26 @@ transcript_path=$(printf '%s' "$hook_input" | jq -r '.transcript_path // empty' session_key="" [ -n "$transcript_path" ] && session_key=$(basename "$transcript_path") +# ── #61: clear this session's pr-review-reminder marker ─────────────────────── +# So the reminder re-emits once after compaction (the post-compaction context lost the +# earlier reminder). MUST run before the no-locked-plans early-exit below. Guard on a +# non-empty session key (no blanket file wipe on identity-less hosts). +if [ -n "${session_key:-}" ]; then + reminder_marker="${cwd_dir}/.claude/autodev-state/pr-reminder-seen" + if [ -f "$reminder_marker" ]; then + # grep -v exits 1 when no lines remain (only our key) — tolerate it under set -e. + remaining="$(grep -vxF "$session_key" "$reminder_marker" 2>/dev/null || true)" + if [ -n "$remaining" ]; then + # Atomic: write other sessions' keys to a temp file then rename. + printf '%s\n' "$remaining" > "${reminder_marker}.tmp" 2>/dev/null \ + && mv "${reminder_marker}.tmp" "$reminder_marker" \ + || rm -f "${reminder_marker}.tmp" + else + rm -f "$reminder_marker" # only our key → remove marker + fi + fi +fi + # ── Find locked plans and collect their state ───────────────────────────────── plans_dir="${cwd_dir}/docs/plans" state_section="" diff --git a/hooks/pretool-pr-review-reminder b/hooks/pretool-pr-review-reminder index a74bbfa..500e8c8 100755 --- a/hooks/pretool-pr-review-reminder +++ b/hooks/pretool-pr-review-reminder @@ -26,8 +26,23 @@ tool_name=$(printf '%s' "$hook_input" | jq -r '.tool_name // empty' 2>/dev/null cmd=$(printf '%s' "$hook_input" | jq -r '.tool_input.command // empty' 2>/dev/null || true) [ -z "$cmd" ] && exit 0 -# Only act on PR creation commands. -printf '%s' "$cmd" | grep -q 'gh pr create' || exit 0 +# Only act on real PR-creation commands. Strip quoted segments first so a quoted +# --body that merely mentions "gh pr create" does not trip the hook (#61; mirrors +# pre-tool-scope-guard's quote-strip precedent). +cmd_unquoted=$(printf '%s' "$cmd" | sed "s/\"[^\"]*\"//g; s/'[^']*'//g") +printf '%s' "$cmd_unquoted" | grep -q 'gh pr create' || exit 0 + +# Session dedup (#61): emit once per session; reset by pre-compact-snapshot on +# compaction. Keyed by transcript basename; degrade to emit-every-time when absent. +cwd_dir=$(printf '%s' "$hook_input" | jq -r '.cwd // empty' 2>/dev/null || true) +[ -z "$cwd_dir" ] && cwd_dir="${PWD}" +transcript_path=$(printf '%s' "$hook_input" | jq -r '.transcript_path // empty' 2>/dev/null || true) +session_key="" +[ -n "$transcript_path" ] && session_key=$(basename "$transcript_path") +marker="${cwd_dir}/.claude/autodev-state/pr-reminder-seen" +if [ -n "$session_key" ] && grep -qxF "$session_key" "$marker" 2>/dev/null; then + exit 0 # already reminded this session +fi reminder=$(cat <<'REMINDER' @@ -52,4 +67,10 @@ emit_additional_context() { emit_additional_context "PreToolUse" "$reminder" +# Record this session so the reminder doesn't repeat (only when we have a session key). +if [ -n "$session_key" ]; then + mkdir -p "$(dirname "$marker")" 2>/dev/null || true + printf '%s\n' "$session_key" >> "$marker" 2>/dev/null || true +fi + exit 0 diff --git a/hooks/run-hook.cmd b/hooks/run-hook.cmd index 45955ea..73f5872 100755 --- a/hooks/run-hook.cmd +++ b/hooks/run-hook.cmd @@ -55,4 +55,27 @@ if { [ "${LC_ALL:-}" = "C.UTF-8" ] || [ "${LC_CTYPE:-}" = "C.UTF-8" ] || [ "${LA [ "${LANG:-}" = "C.UTF-8" ] && export LANG=C fi -exec bash "${SCRIPT_DIR}/${SCRIPT_NAME}" "$@" +# Run the hook with stdout captured (stderr + stdin pass through untouched). +# Enforce stdout JSON discipline: only valid-JSON-or-empty reaches the host's hook +# parser; diagnostics that leak onto stdout (locale/perl/git warnings) are routed to +# stderr. A block decision preceded by a warning is recovered, not dropped (#41). +if command -v jq >/dev/null 2>&1; then + hook_out="$(bash "${SCRIPT_DIR}/${SCRIPT_NAME}" "$@")" + hook_rc=$? + if [ -z "$hook_out" ]; then + : # nothing to emit + elif printf '%s' "$hook_out" | jq -e . >/dev/null 2>&1; then + printf '%s\n' "$hook_out" # valid JSON as a whole + else + json_line="$(printf '%s\n' "$hook_out" | grep -E '^\{' | tail -1)" + if [ -n "$json_line" ] && printf '%s' "$json_line" | jq -e . >/dev/null 2>&1; then + printf '%s\n' "$hook_out" | grep -vxF "$json_line" >&2 # diagnostics -> stderr (full-line) + printf '%s\n' "$json_line" # recovered JSON -> stdout + else + printf '%s\n' "$hook_out" >&2 # all noise -> stderr + fi + fi + exit "$hook_rc" +else + exec bash "${SCRIPT_DIR}/${SCRIPT_NAME}" "$@" # jq absent: pass through unchanged +fi diff --git a/hooks/session-start b/hooks/session-start index 43b5100..98369e3 100755 --- a/hooks/session-start +++ b/hooks/session-start @@ -81,9 +81,13 @@ LAST_EMIT_FILE="${STATE_DIR}/session-start-last-emit" DEDUP_WINDOW_SEC=5 if [ -f "$LAST_EMIT_FILE" ]; then now=$(date +%s 2>/dev/null || echo 0) - last=$(stat -f %m "$LAST_EMIT_FILE" 2>/dev/null \ - || stat -c %Y "$LAST_EMIT_FILE" 2>/dev/null \ + # GNU stat (-c %Y) first, then BSD (-f %m). BSD-first is wrong on Linux: `stat -f` + # means "file system status" there, succeeds (exit 0), and prints fs info instead + # of the mtime — breaking the numeric comparison and the dedup (#64). + last=$(stat -c %Y "$LAST_EMIT_FILE" 2>/dev/null \ + || stat -f %m "$LAST_EMIT_FILE" 2>/dev/null \ || echo 0) + case "$last" in (*[!0-9]*|'') last=0 ;; esac # guard: only a clean integer if [ "$now" -gt 0 ] && [ "$last" -gt 0 ] && [ $((now - last)) -lt "$DEDUP_WINDOW_SEC" ]; then exit 0 fi diff --git a/skills/adversarial-design-review/SKILL.md b/skills/adversarial-design-review/SKILL.md index cdbbfc8..1f6ddc9 100644 --- a/skills/adversarial-design-review/SKILL.md +++ b/skills/adversarial-design-review/SKILL.md @@ -87,6 +87,7 @@ checklist is the floor, not the ceiling. Additional findings are welcome. | **Project-guidance conflicts** | Read `docs/design-guidance.md` or equivalent project guidance. Does the artifact violate product direction, architecture constraints, UX/domain principles, quality/security/ops requirements, or non-goals? Cite both the guidance path/section and the conflicting design/plan section. If guidance is missing and the design does not show Q&A capture, flag it. | | **Assumptions under attack** | Load-bearing claims, stated or unstated. "We assume the upstream API is idempotent." "We assume single-tenant." "We assume the user has admin." Challenge each assumption, name how it could be false, and flag any where a false assumption collapses the design. | | **Repo-precedent conflicts** | Does this design fight existing patterns, skills, or conventions in this repo? Cite the conflicting `path/file.md:line`. If the design proposes a new pattern that contradicts an established one, the design must justify the divergence. | +| **Artifact-class precedent** | Survey how the codebase already implements this *artifact class* — not just the *mechanism*. The Repo-precedent class above checks the HOW (e.g. how a plugin registers a module type); this checks the WHERE/shape (e.g. where a scenario stands up a server, where a test fixture lives, where a migration goes). Grep for sibling instances of the same kind of artifact (`ls scenarios/*/cmd/server/main.go`, sibling plugins, migrations, CLI commands, fixtures) and confirm the design follows the established shape — or explicitly justifies divergence. A design that puts a test fixture in the production engine repo when sibling scenarios own their `cmd/server/main.go` = finding. Run the decisive `ls`/`grep` for the artifact class, not just for the mechanism. | | **YAGNI violations** | Features in the design that aren't justified by stated requirements. Configuration knobs nobody asked for. Generality nobody needs. Future-proofing for cases that may never arrive. | | **Missing failure modes** | What fails first under partial failure, network partition, restart-mid-operation, malformed input, adversarial input, the dependency being down? If the design doesn't address it, flag it. | | **Security / privacy at architecture level** | Auth/authz boundaries, secret flow, least privilege, blast radius on compromise, PII exposure, log leakage, dependency/trust boundary, abuse case, CSRF/SSRF/auth-confused-deputy at the design level (not at the code level — that's `requesting-code-review`'s job). | @@ -106,6 +107,7 @@ inherits the design's blast radius) and adds: |---|---| | **Over-decomposition / under-decomposition** | Does task granularity match `writing-plans`'s 2–5-minute target per step? A 40-step plan for a CSV export is suspect. A 2-step plan for a schema migration is suspect. Flag both directions. | | **Verification-class mismatch** | For each task, does its verification step match its change class per the table in `skills/writing-plans/SKILL.md` ("Verification per change class")? A schema migration verified by unit tests = finding. An API endpoint with no curl invocation = finding. | +| **Auth/authz chain composition** | When the design names an auth/authz chain ("behind the X auth filter", "RBAC-enforced", "admin-only"), walk that chain component-by-component against the plan's actual wiring. For each gate, verify it is enforced **server-side against an authenticated principal**, not shape-matched by a client-asserted value. Flag any gate where the plan's check reads from request/client-supplied input (`evidence.granted_permissions`, a header, a body field) instead of an authenticated subject (`authz.Enforce(authenticatedSubject, …)`). A plan that wires a weaker gate than the design's chain implies = finding. | | **Hidden serial dependencies** | Tasks the plan claims are independent but actually share state (same file, same DB row, same config key). If executed in parallel, they'll collide. Flag any such pair. | | **Missing rollback wiring** | The design specifies a rollback story (per the design-phase class above). Is it actually implemented in the plan as a task or step? Or is it a paragraph nobody is going to write code for? | | **Missing integration proof** | For multi-component changes, does the plan include an end-to-end or integration verification that exercises the real boundary? If it only tests each component with mocks, flag it unless the design explicitly justified that as sufficient. | diff --git a/skills/pr-monitoring/SKILL.md b/skills/pr-monitoring/SKILL.md index a395cd4..626a1ee 100644 --- a/skills/pr-monitoring/SKILL.md +++ b/skills/pr-monitoring/SKILL.md @@ -21,6 +21,50 @@ Invoked automatically by `finishing-a-development-branch` in autonomous mode aft When multiple PRs were created in the same session, prefer launching **one monitor agent that covers all PRs** to reduce GitHub API request volume. You may launch one agent per PR instead if the PRs are on unrelated codebases, if you anticipate heavy parallel CI load, or if a previous shared monitor was rate-limited. Default to the single-agent approach and only deviate with a reason. +## Waiting for CI: the sanctioned pattern + +For a pure CI-wait (poll until checks settle, then act), prefer a blocking poll-loop +over a long-lived monitor Agent. The pattern is host-scoped. + + +**Recommended default — a bash poll-loop, not a long-lived monitor Agent.** Issue a +`Bash` tool call with `run_in_background: true` running a **bounded** sleep-loop that +polls `gh pr checks ` until no check is `pending` (or a failure), prints a settle +line, and exits. The harness re-invokes the lead **once** on exit (≈0 tokens while it +sleeps); the lead then reads the result and admin-merges on `failures=0`. + +Bound the loop so it can never spin forever, e.g.: + +```bash +for i in $(seq 1 120); do # 120 × 30s = 60 min cap + raw=$(gh pr checks --json bucket 2>/dev/null) + fail=$(echo "$raw" | jq '[.[]|select(.bucket=="fail")]|length') + pend=$(echo "$raw" | jq '[.[]|select(.bucket=="pending")]|length') + { [ "$fail" != 0 ] && [ -n "$fail" ]; } && { echo "FAILURES=$fail"; break; } + { [ "$pend" = 0 ] || [ -z "$pend" ]; } && { echo "SETTLED"; break; } + sleep 30 +done +``` + +On the 120-iteration cap, print a timeout line and exit for the lead to restart. + +**Why not a background Agent for this:** a `run_in_background` **Agent** told to +sleep-loop tends to return after ~1 cycle and re-complete repeatedly (the agent loop is +not a blocking sleep) — observed early-exiting ~6× in one run. The bash sleep-loop +genuinely blocks to completion. + + + +Use your host's equivalent poll mechanism. Where no blocking-background-bash exists, the +sanctioned fallback is **self-poll on each lead wakeup**: run `gh pr checks ` once +per turn and re-check on the next turn. This loses fire-on-event but is reliable; do not +dispatch a background Agent expecting it to block on a sleep-loop. + + +**Fallback (all hosts):** the background-Agent monitor in "The Process" below remains the +tool for multi-PR review-comment handling that needs active fix-and-push; note its +early-exit failure mode and prefer the poll-loop / self-poll for a pure CI-wait. + ## The Process Run a `balanced`-tier agent that monitors PRs in a loop for up to **60 minutes**, polling every **10 minutes**, until all CI checks pass and no unresolved reviews remain. diff --git a/skills/subagent-driven-development/SKILL.md b/skills/subagent-driven-development/SKILL.md index 807005d..dbe97d5 100644 --- a/skills/subagent-driven-development/SKILL.md +++ b/skills/subagent-driven-development/SKILL.md @@ -296,6 +296,26 @@ Codex subagents do not share a task list. Use these conventions instead: --- +## Completion is not trusted until lead-verified + +A green `Implement: N` checkbox is a **claim, not evidence**. Regardless of who or what +flips an `Implement: N` task to `completed` (an implementer, a `blockedBy`-clear, or the +code-reviewer), the lead MUST run `autodev:verification-before-completion` — build + test +from a clean tree, CI green — before treating that task as truly done or invoking +`finishing-a-development-branch`. Recurring failure mode (infra-admin v1.1): a +non-compiling tree and a CI-failing regression were both marked "done"; only the lead's +verification caught them. + +The team-conventions rule "only the code-reviewer flips `Implement: N` to completed" +remains team discipline, but **correctness rests on lead verification, which does not +depend on who flipped the bit**. A deterministic hook that *blocks* the flip is +infeasible — the pre-tool payload a hook sees carries the task id and status but not the +task's subject or the calling sub-agent's identity, so a hook cannot tell an `Implement` +task from any other or identify the caller. See +`decisions/0003-implement-n-completion-trust-boundary.md`. + +--- + ## Red Flags **Never:** diff --git a/tests/hook-contracts.sh b/tests/hook-contracts.sh index 3023e5e..6defdf3 100755 --- a/tests/hook-contracts.sh +++ b/tests/hook-contracts.sh @@ -419,6 +419,38 @@ test_pretool_pr_review_json() { assert_hook_context_json "pretool-pr-review-reminder" "PreToolUse" "$output" } +test_pr_reminder_dedup() { + local tmp; tmp="$(mktemp -d)" + trap 'rm -rf "$tmp"' RETURN + local sess='/x/transcripts/sess-abc.jsonl' + local payload='{"tool_name":"Bash","tool_input":{"command":"gh pr create --title t --body b"},"cwd":"'"$tmp"'","transcript_path":"'"$sess"'"}' + local out1; out1="$(run_hook pretool-pr-review-reminder "$payload")" + printf '%s' "$out1" | jq -e '.hookSpecificOutput.additionalContext' >/dev/null 2>&1 \ + && pass "pr-reminder: first call emits" || fail "pr-reminder: first call should emit" + local out2; out2="$(run_hook pretool-pr-review-reminder "$payload")" + [ -z "$out2" ] && pass "pr-reminder: deduped within session" \ + || fail "pr-reminder: second call should be silent, got: $out2" + # PreCompact with NO locked plans must still clear the marker. + run_hook pre-compact-snapshot '{"cwd":"'"$tmp"'","transcript_path":"'"$sess"'"}' >/dev/null 2>&1 || true + local out3; out3="$(run_hook pretool-pr-review-reminder "$payload")" + printf '%s' "$out3" | jq -e '.hookSpecificOutput.additionalContext' >/dev/null 2>&1 \ + && pass "pr-reminder: re-emits after PreCompact reset" \ + || fail "pr-reminder: should re-emit after compaction, got: $out3" + # A quoted body that merely mentions 'gh pr create' must NOT emit. + local fp='{"tool_name":"Bash","tool_input":{"command":"gh issue create --title t --body \"see gh pr create docs\""},"cwd":"'"$tmp"'","transcript_path":"'"$sess"'"}' + local outfp; outfp="$(run_hook pretool-pr-review-reminder "$fp")" + [ -z "$outfp" ] && pass "pr-reminder: not tripped by 'gh pr create' inside a quoted body" \ + || fail "pr-reminder: false-positive on quoted body, got: $outfp" + # Degrade gracefully: no transcript_path -> emits every time. + local tmp2; tmp2="$(mktemp -d)" + local np='{"tool_name":"Bash","tool_input":{"command":"gh pr create --title t"},"cwd":"'"$tmp2"'"}' + local na nb; na="$(run_hook pretool-pr-review-reminder "$np")"; nb="$(run_hook pretool-pr-review-reminder "$np")" + { printf '%s' "$na" | jq -e '.hookSpecificOutput' >/dev/null 2>&1 && printf '%s' "$nb" | jq -e '.hookSpecificOutput' >/dev/null 2>&1; } \ + && pass "pr-reminder: no transcript_path -> emits every time" \ + || fail "pr-reminder: should emit each time without transcript_path" + rm -rf "$tmp2" +} + test_posttool_pr_created_json() { local output output="$(run_hook posttool-pr-created '{"tool_name":"Bash","tool_input":{"command":"gh pr create --title test --body test"},"tool_response":"https://github.com/owner/repo/pull/123","cwd":"'"$REPO_ROOT"'"}')" @@ -1778,6 +1810,7 @@ test_prompt_strict_ignores_single_workspace_lock_when_session_has_no_lock test_prompt_strict_uses_session_locked_plan_only test_prompt_strict_ignores_prose_mention_of_locked_status test_pretool_pr_review_json +test_pr_reminder_dedup test_posttool_pr_created_json test_pre_compact_snapshot_json test_wrapper_suppresses_pre_compact_locale_noise diff --git a/tests/hook-stdout-discipline.sh b/tests/hook-stdout-discipline.sh new file mode 100755 index 0000000..196859b --- /dev/null +++ b/tests/hook-stdout-discipline.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# tests/hook-stdout-discipline.sh — verify run-hook.cmd enforces stdout JSON discipline. +set -uo pipefail +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +WRAPPER="$REPO_ROOT/hooks/run-hook.cmd" +failures=0 +pass() { printf 'PASS: %s\n' "$*"; } +fail() { printf 'FAIL: %s\n' "$*" >&2; failures=$((failures+1)); } +command -v jq >/dev/null 2>&1 || { echo "SKIP: jq required for a/b/c"; exit 0; } + +tmp="$(mktemp -d)" +# Cleanup trap set BEFORE any fixture is copied into hooks/ (rm -f on absent files is safe). +trap 'rm -f "$REPO_ROOT"/hooks/fix-warn-then-json "$REPO_ROOT"/hooks/fix-noise "$REPO_ROOT"/hooks/fix-clean; rm -rf "$tmp"' EXIT +mkfix() { printf '%s\n' "$1" > "$tmp/$2"; chmod +x "$tmp/$2"; } + +# Fixture A: a warning leaks to stdout, then a block JSON on stdout. +mkfix '#!/usr/bin/env bash +echo "perl: warning: Setting locale failed." +printf "%s\n" "{\"decision\":\"block\",\"reason\":\"x\"}"' fix-warn-then-json +# Fixture B: only noise, no JSON. +mkfix '#!/usr/bin/env bash +echo "just a diagnostic line"' fix-noise +# Fixture C: clean single-line JSON. +mkfix '#!/usr/bin/env bash +printf "%s\n" "{\"hookSpecificOutput\":{\"hookEventName\":\"X\"}}"' fix-clean + +for f in fix-warn-then-json fix-noise fix-clean; do cp "$tmp/$f" "$REPO_ROOT/hooks/$f"; done + +run() { OUT="$("$WRAPPER" "$1" 2>"$tmp/err")"; RC=$?; ERR="$(cat "$tmp/err")"; } + +# (a) warning + block JSON → stdout ONLY the block JSON; warning ON stderr. +run fix-warn-then-json +if printf '%s' "$OUT" | jq -e '.decision=="block"' >/dev/null 2>&1 \ + && ! printf '%s' "$OUT" | grep -q 'perl: warning' \ + && printf '%s' "$ERR" | grep -q 'perl: warning'; then + pass "(a) block JSON on stdout, warning routed to stderr" +else fail "(a) expected block JSON on stdout + warning on stderr, got OUT=[$OUT] ERR=[$ERR]"; fi + +# (b) only noise → stdout empty, noise on stderr. +run fix-noise +{ [ -z "$OUT" ] && printf '%s' "$ERR" | grep -q 'diagnostic'; } \ + && pass "(b) noise suppressed from stdout, routed to stderr" || fail "(b) expected empty stdout, got: $OUT" + +# (c) clean JSON → unchanged + valid. +run fix-clean +printf '%s' "$OUT" | jq -e '.hookSpecificOutput.hookEventName=="X"' >/dev/null 2>&1 \ + && pass "(c) clean JSON passthrough" || fail "(c) clean JSON broke, got: $OUT" + +# (d) jq-absent → wrapper passes stdout through VERBATIM (warning + JSON both present). +# Stub PATH so the wrapper's `command -v jq` fails, but `bash` is still resolvable +# (the wrapper's jq-absent branch `exec bash`s the fixture). assert via grep (no jq). +nojq="$tmp/nojq"; mkdir -p "$nojq" +# Symlink the externals the wrapper's jq-absent path needs (bash for `exec bash`, +# dirname for SCRIPT_DIR) but NOT jq, so `command -v jq` fails. +for bin in bash dirname; do ln -sf "$(command -v "$bin")" "$nojq/$bin"; done +OUTD="$(PATH="$nojq" "$WRAPPER" fix-warn-then-json 2>/dev/null)" +{ printf '%s' "$OUTD" | grep -q 'perl: warning' && printf '%s' "$OUTD" | grep -q '"decision":"block"'; } \ + && pass "(d) jq-absent → verbatim passthrough (no discipline applied)" \ + || fail "(d) expected verbatim passthrough with jq absent, got: $OUTD" + +echo ""; echo "Results: $failures failure(s)"; [ "$failures" -eq 0 ]