From 22a9071d6e6568bf0ee9d0e2c9357f161cf91b77 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 30 Apr 2026 23:34:42 +0000 Subject: [PATCH 1/7] feat: adversarial design/plan review + holistic lifecycle improvements (v5.4.0) Agent-Logs-Url: https://github.com/GoCodeAlone/claude-superpowers/sessions/00b94a42-65c9-443f-a930-36d72c87dd30 Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> --- .claude-plugin/marketplace.json | 2 +- .claude-plugin/plugin.json | 2 +- .cursor-plugin/plugin.json | 2 +- README.md | 24 +- RELEASE-NOTES.md | 56 +++++ docs/roadmap.md | 67 +++++ skills/adversarial-design-review/SKILL.md | 290 ++++++++++++++++++++++ skills/alignment-check/SKILL.md | 8 +- skills/brainstorming/SKILL.md | 70 ++++-- skills/using-superpowers/SKILL.md | 2 +- skills/writing-plans/SKILL.md | 42 ++-- tests/cross-llm-coverage.md | 1 + 12 files changed, 518 insertions(+), 48 deletions(-) create mode 100644 docs/roadmap.md create mode 100644 skills/adversarial-design-review/SKILL.md diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 047cd23..010cdf4 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -9,7 +9,7 @@ { "name": "superpowers", "description": "Core skills library for Claude Code: TDD, debugging, collaboration patterns, and proven techniques", - "version": "5.3.0", + "version": "5.4.0", "source": "./", "author": { "name": "Jesse Vincent", diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index f489e23..6e1f198 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "superpowers", "description": "Core skills library for Claude Code: TDD, debugging, collaboration patterns, and proven techniques", - "version": "5.3.0", + "version": "5.4.0", "author": { "name": "Jesse Vincent", "email": "jesse@fsck.com" diff --git a/.cursor-plugin/plugin.json 
b/.cursor-plugin/plugin.json index db19b5c..9b62c5d 100644 --- a/.cursor-plugin/plugin.json +++ b/.cursor-plugin/plugin.json @@ -2,7 +2,7 @@ "name": "superpowers", "displayName": "Superpowers", "description": "Core skills library: TDD, debugging, collaboration patterns, and proven techniques", - "version": "5.3.0", + "version": "5.4.0", "author": { "name": "Jesse Vincent", "email": "jesse@fsck.com" diff --git a/README.md b/README.md index a8d0764..92cb7b9 100644 --- a/README.md +++ b/README.md @@ -101,19 +101,25 @@ Per-skill host-conditional audit: [tests/cross-llm-coverage.md](tests/cross-llm- ## The Basic Workflow -1. **brainstorming** - Activates before writing code. Refines rough ideas through questions, explores alternatives, presents design in sections for validation. Saves design document. +1. **brainstorming** - Activates before writing code. Refines rough ideas through questions, explores alternatives, lists load-bearing assumptions, runs a self-challenge round, presents design in sections for validation. Saves design document. -2. **using-git-worktrees** - Activates after design approval. Creates isolated workspace on new branch, runs project setup, verifies clean test baseline. +2. **adversarial-design-review (design phase)** - Activates after design doc is committed. Adversarially attacks the *ideas* in the design (not just structure): unstated assumptions, repo-precedent conflicts, YAGNI violations, missing failure modes, security gaps, rollback story, simpler alternatives, user-intent drift. PASS/FAIL with max 2 revision cycles. -3. **writing-plans** - Activates with approved design. Breaks work into bite-sized tasks (2-5 minutes each). Every task has exact file paths, complete code, verification steps. +3. **using-git-worktrees** - Activates after design approval. Creates isolated workspace on new branch, runs project setup, verifies clean test baseline. -4. **subagent-driven-development** or **executing-plans** - Activates with plan. 
Dispatches fresh subagent per task with two-stage review (spec compliance, then code quality), or executes in batches with human checkpoints. +4. **writing-plans** - Activates with approved design. Breaks work into bite-sized tasks (2-5 minutes each). Every task has exact file paths, complete code, verification steps. Runtime-affecting tasks include rollback notes. -5. **test-driven-development** - Activates during implementation. Enforces RED-GREEN-REFACTOR: write failing test, watch it fail, write minimal code, watch it pass, commit. Deletes code written before tests. +5. **adversarial-design-review (plan phase)** - Activates after plan doc is committed. Inherits the design checklist plus plan-specific scans: task granularity, verification-class match, hidden serial dependencies, rollback wiring. -6. **requesting-code-review** - Activates between tasks. Reviews against plan, reports issues by severity. Critical issues block progress. +6. **alignment-check** - Activates after adversarial review of plan passes. Narrowly structural: every design requirement maps to a plan task; every plan task traces to a design requirement. -7. **finishing-a-development-branch** - Activates when tasks complete. Verifies tests, presents options (merge/PR/keep/discard), cleans up worktree. +7. **subagent-driven-development** or **executing-plans** - Activates with plan. Dispatches fresh subagent per task with two-stage review (spec compliance, then code quality), or executes in batches with human checkpoints. + +8. **test-driven-development** - Activates during implementation. Enforces RED-GREEN-REFACTOR: write failing test, watch it fail, write minimal code, watch it pass, commit. Deletes code written before tests. + +9. **requesting-code-review** - Activates between tasks. Reviews against plan, reports issues by severity. Critical issues block progress. + +10. **finishing-a-development-branch** - Activates when tasks complete. 
Verifies tests, presents options (merge/PR/keep/discard), cleans up worktree. **The agent checks for relevant skills before any task.** Mandatory workflows, not suggestions. @@ -129,9 +135,11 @@ Per-skill host-conditional audit: [tests/cross-llm-coverage.md](tests/cross-llm- - **verification-before-completion** - Ensure it's actually fixed **Collaboration** -- **brainstorming** - Socratic design refinement +- **brainstorming** - Socratic design refinement (with assumption-listing and self-challenge round) +- **adversarial-design-review** - Adversarial attack on design and plan ideas before execution (two phases: design, plan) - **writing-plans** - Detailed implementation plans - **executing-plans** - Batch execution with checkpoints +- **alignment-check** - Structural design ↔ plan trace (forward + reverse) - **dispatching-parallel-agents** - Concurrent subagent workflows - **requesting-code-review** - Pre-review checklist - **receiving-code-review** - Responding to feedback diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 1c95e4e..2213a01 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -1,5 +1,61 @@ # Superpowers Release Notes +## v5.4.0 (2026-04-30) + +### New Features + +**Adversarial design / plan review (`skills/adversarial-design-review/`)** + +A new lifecycle stage that adversarially attacks the *ideas* in designs and plans — not just their structural coverage. Closes the only remaining gap in the review-gate stack: every other gate attacks code or structure; this one attacks ideas. + +Two phases, one skill: + +- **`--phase=design`** — invoked by `brainstorming` after the design doc is committed, before `writing-plans` runs. +- **`--phase=plan`** — invoked by `writing-plans` after the plan is committed, before `alignment-check` runs. 
+ +Mandatory bug-class checklist (design phase): unstated assumptions, repo-precedent conflicts, YAGNI violations, missing failure modes, security/privacy at architecture level, rollback story, simpler alternative not considered, user-intent drift. Plan phase adds: over/under-decomposition, verification-class mismatch, hidden serial dependencies, missing rollback wiring. + +Adversarial framing reused verbatim from `requesting-code-review` (find ≥3 things wrong; reflexive approval forbidden; full bug-class scan transcript required even on Clean). Every report MUST include a non-empty "Options the author may not have considered" section so reviewers offer alternatives, not just objections. + +PASS/FAIL with max 2 revision cycles per gate before user escalation, mirroring `alignment-check`. User overrides are recorded inline in the artifact. + +**Brainstorming: explicit assumptions + self-challenge round** + +`brainstorming` now requires: + +- An explicit list of load-bearing assumptions in every design ("we assume the upstream API is idempotent"). The design doc gets an `## Assumptions` section. +- A lightweight self-challenge round before the design is presented to the user — five quick checks (laziest plausible solution? most fragile assumption? YAGNI? failure modes? repo-pattern conflicts?) that clean up obvious issues before the user sees the design. +- An `## Rollback` section in the design for change classes that affect runtime (build, deployment, version pins, startup config, migrations, plugin loading) — same trigger list as `runtime-launch-validation`. + +The heavyweight pass remains `adversarial-design-review`; the self-challenge is intentionally lightweight. + +**Writing-plans: rollback notes for runtime-affecting tasks** + +For any task whose change class triggers `runtime-launch-validation`, the plan must now include a one-line rollback note in the task body ("Rollback: revert commit + re-run migration tool down + smoke check"). 
This makes the design's rollback story concretely traceable into the plan, so `adversarial-design-review --phase=plan` can verify it isn't an orphaned paragraph. + +**Pipeline rewiring** + +The autonomous pipeline now includes the new gates: + +``` +brainstorming → adversarial-design-review (design) + → writing-plans + → adversarial-design-review (plan) + → alignment-check + → subagent-driven-development + → finishing-a-development-branch → pr-monitoring +``` + +`alignment-check` is now scoped to **structural** trace only — adversarial concerns are cleared by the time it runs, so it stays narrow and fast. + +### Why + +Every existing review gate attacks code (`requesting-code-review`, spec-reviewer, code-reviewer, `verification-before-completion`) or structure (`alignment-check`). Nothing attacked the **ideas** in the design or plan themselves. Misconceptions, unstated assumptions, YAGNI features, and over-engineered approaches survived all the way to implementation, where they were the most expensive to fix. `adversarial-design-review` catches them at the cheapest stage. Stacking it on top of `alignment-check` is additive, not redundant — they catch different bug classes. + +### Roadmap + +`docs/roadmap.md` was added in this release to track items considered during the holistic evaluation that are not landing in this version: durable decision logs (ADRs), post-merge retrospective skill, skill-usage telemetry, brainstorming cost-control gate, and cross-skill consistency invariants. Each entry has a one-paragraph rationale and trigger condition. + ## v5.3.0 (2026-04-29) ### New Features diff --git a/docs/roadmap.md b/docs/roadmap.md new file mode 100644 index 0000000..dc2e754 --- /dev/null +++ b/docs/roadmap.md @@ -0,0 +1,67 @@ +# Roadmap + +Items considered during the v5.4.0 holistic evaluation of the superpowers plugin that are not landing immediately. 
Each entry has a one-paragraph rationale, a trigger condition (when it becomes worth doing), and a sketch of the shape of the change. + +This file is maintained so good ideas surfaced during evaluation are not lost. + +## Decision log / ADR step + +**Status:** considered, not landed. + +**Why not yet:** designs in `docs/plans/` capture *what* we chose, not *why we rejected alternatives* in a durable way. New contributors re-litigate decisions. However, the v5.4.0 `adversarial-design-review` skill now produces a report that includes "Options the author may not have considered" and a verdict reasoning paragraph, both committed alongside the design. That report functions as an organic decision log without inventing a new artifact. Adding ADRs on top would be process tax for a problem the new reviewer largely solves. + +**Trigger to land:** if adversarial-review reports prove insufficient as a decision log in practice (e.g., contributors keep re-asking why a path was rejected, or the reports are too narrow), introduce a lightweight `decisions/` directory with one ADR per significant choice. Use Michael Nygard's template (Context / Decision / Consequences). Wire into `writing-plans` so plans that diverge from a previously-recorded decision are flagged. + +## Post-merge retrospective skill + +**Status:** considered, not landed. + +**Why not yet:** `pr-monitoring` ends the pipeline. There's no closing-the-loop step that asks "what would we change about this skill set based on what happened?" Worth doing eventually, but needs its own design — adversarial review, skill activation effectiveness, post-incident learnings all blur together and should be untangled before a skill is written. + +**Trigger to land:** when a meaningful number of merged PRs have used the v5.4.0 pipeline end-to-end (target: ≥10 distinct features). At that point there's enough signal to design a retrospective skill against real evidence rather than speculation. 
+ +**Sketch:** a `post-merge-retrospective` skill invoked after `pr-monitoring` reports the PR is merged and CI green. Reads the design, plan, adversarial-review reports, code-review threads, and CI history; produces a short report on (a) which adversarial findings were prescient, (b) which gates produced false positives, (c) skill activations that didn't fire when they should have. Output feeds into a `docs/retros/` directory. + +## Skill-usage telemetry / self-audit + +**Status:** considered, not landed. + +**Why not yet:** v5.3.0 introduced `.claude/superpowers-state/in-progress.jsonl` — an append-only activity log written by a `PostToolUse` hook. That log is currently used only for compaction recovery, but it is the natural substrate for skill-activation telemetry. Wiring it up requires a non-trivial design: privacy considerations (don't ship anything off-machine), aggregation cadence, surfacing format, and how to handle hosts without hooks. Out of scope for v5.4.0. + +**Trigger to land:** when there is a reported case of a skill failing to activate when it should have, and we want a deterministic post-hoc check rather than a guess. + +**Sketch:** add a `tests/skill-activation-audit.sh` script that reads the state JSONL and reports counts per skill over a window, plus a `lib/skills-core.js` helper to surface "expected but not invoked" patterns in a session. Strictly local; never transmitted. + +## Brainstorming cost-control gate + +**Status:** considered, not landed. + +**Why not yet:** `brainstorming` can theoretically spiral when the user keeps answering questions. In practice this is rarely cited as a problem — adaptive batching plus the new self-challenge round in v5.4.0 already cap most runaway sessions. Adding a hard upper bound on rounds would impose process tax on a problem we have not actually observed. 
+ +**Trigger to land:** if a user reports a brainstorm that exceeded N round-trips without converging, OR if metrics from the telemetry roadmap entry above show an outlier distribution. + +**Sketch:** soft cap at 5 question-batches; on exceeding, agent forcibly proposes the best-current-approximation design and asks the user to either approve, refine, or explicitly extend the budget. Lives as a single section in `brainstorming/SKILL.md`, not a new skill. + +## Cross-skill consistency invariants (test extension) + +**Status:** considered, not landed in v5.4.0. + +**Why not yet:** several skills now reference each other's filenames and steps (`finishing-a-development-branch` Step 1c is cited from at least three places; `runtime-launch-validation` triggers are referenced from `writing-plans` and `adversarial-design-review`; `requesting-code-review`'s bug-class checklist is cited from `team-conventions.md`). A rename or step-renumbering breaks silently. Not worth blocking v5.4.0 for, but worth a small follow-up because the surface area for silent breakage just grew. + +**Trigger to land:** next time a cross-skill reference breaks in review, OR the next skill PR after v5.4.0. + +**Sketch:** extend `tests/skill-content-grep.sh` (or add a sibling `tests/skill-cross-refs.sh`) that: + +1. Greps every `Step \d[a-z]?` and `# .*` heading reference across `skills/*/SKILL.md` and `agents/*.md`. +2. For each `<skill>/SKILL.md` Step-N reference, verifies the target heading exists in the cited skill. +3. Emits actionable failures with both citing and target file paths. + +Cheap, deterministic, removes a class of silent-rot bugs. The grep guard infrastructure already in place is the right place to add it. + +## Out of scope (not adopted) + +These were considered and explicitly rejected during the v5.4.0 evaluation: + +- **Heavy adversarial debate during brainstorming** — risks turning the user's design conversation into a multi-agent debate they didn't ask for. 
The v5.4.0 lightweight self-challenge is the lighter alternative we picked instead. +- **Hostile / steelman-the-rejection reviewer framing** — theatrical, low signal. The "find ≥3 things wrong" framing from `requesting-code-review` is sharp enough. +- **Separate `pre-mortem` skill** — folded into `adversarial-design-review`'s "Missing failure modes" bug class instead of being a standalone skill. One artifact, one pass, less skill sprawl. diff --git a/skills/adversarial-design-review/SKILL.md b/skills/adversarial-design-review/SKILL.md new file mode 100644 index 0000000..0c17549 --- /dev/null +++ b/skills/adversarial-design-review/SKILL.md @@ -0,0 +1,290 @@ +--- +name: adversarial-design-review +description: Use after a design or implementation plan is drafted, before downstream skills accept it - adversarially attacks the ideas in the artifact (not just structural coverage) to surface unstated assumptions, repo-precedent conflicts, YAGNI violations, missing failure modes, security gaps, and simpler alternatives the author didn't consider +--- + +# Adversarial Design / Plan Review + +## Overview + +Every other review gate in this plugin attacks **code**. `alignment-check` attacks +**structure** (forward + reverse trace). Nothing attacks the **ideas** in the +design or plan themselves. This skill closes that gap. + +The cheapest place to kill a bad idea is **before** the plan is written. The +second-cheapest is before code is written. After that, costs rise sharply. This +skill runs adversarially at both points. + +**Core principle:** a design or plan is a hypothesis. Treat it like a PR diff +that hasn't been reviewed yet. Find what's wrong with it on purpose. + +## When to Use + +Two phases, two invocations: + +- **`--phase=design`** — invoked by `brainstorming` after the design doc is + written and committed, **before** transitioning to `writing-plans`. 
+- **`--phase=plan`** — invoked by `writing-plans` after the plan doc is + written and committed, **before** `alignment-check`. + +Manual invocation is also supported on any committed design or plan in +`docs/plans/`. + +```dot +digraph adversarial_review_flow { + "brainstorming" [shape=box]; + "design doc committed" [shape=box]; + "adversarial-design-review --phase=design" [shape=diamond]; + "writing-plans" [shape=box]; + "plan doc committed" [shape=box]; + "adversarial-design-review --phase=plan" [shape=diamond]; + "alignment-check" [shape=box]; + "subagent-driven-development" [shape=doublecircle]; + + "brainstorming" -> "design doc committed"; + "design doc committed" -> "adversarial-design-review --phase=design"; + "adversarial-design-review --phase=design" -> "brainstorming" [label="FAIL: revise design", style=dashed]; + "adversarial-design-review --phase=design" -> "writing-plans" [label="PASS"]; + "writing-plans" -> "plan doc committed"; + "plan doc committed" -> "adversarial-design-review --phase=plan"; + "adversarial-design-review --phase=plan" -> "writing-plans" [label="FAIL: revise plan", style=dashed]; + "adversarial-design-review --phase=plan" -> "alignment-check" [label="PASS"]; + "alignment-check" -> "subagent-driven-development"; +} +``` + +## Adversarial Framing (mandatory) + +The reviewer's prompt MUST use adversarial framing — not validation framing. +Same discipline as `skills/requesting-code-review/SKILL.md`, applied to design +artifacts. + +**Required prompt phrasing in every dispatch:** + +> Find at least three things wrong with this design (or plan), even if they +> seem minor — or, if fewer than three are found, explicitly document every +> bug-class check you ran and what you found (or didn't). Bias toward finding +> issues. You are NOT validating that the artifact is good — you are looking +> for misconceptions, unstated assumptions, and ideas the author didn't +> consider. Reflexive approval is forbidden. 
+ +**Forbidden phrasing:** "review the design", "verify the plan looks good", +"confirm correctness", any wording that implies the reviewer's job is to +sign off. These produce theatre. + +The reviewer is not a yes-person. The reviewer is a skeptic whose job is to +make the design or plan stronger by attacking it. + +## Bug-class checklist — design phase (must scan) + +The reviewer MUST explicitly scan and report findings for each class. The +checklist is the floor, not the ceiling. Additional findings are welcome. + +| Class | Definition | +|---|---| +| **Unstated assumptions** | Load-bearing claims that aren't written down. "We assume the upstream API is idempotent." "We assume single-tenant." "We assume the user has admin." List them. Flag any where, if the assumption is wrong, the design collapses. | +| **Repo-precedent conflicts** | Does this design fight existing patterns, skills, or conventions in this repo? Cite the conflicting `path/file.md:line`. If the design proposes a new pattern that contradicts an established one, the design must justify the divergence. | +| **YAGNI violations** | Features in the design that aren't justified by stated requirements. Configuration knobs nobody asked for. Generality nobody needs. Future-proofing for cases that may never arrive. | +| **Missing failure modes** | What fails first under partial failure, network partition, restart-mid-operation, malformed input, adversarial input, the dependency being down? If the design doesn't address it, flag it. | +| **Security / privacy at architecture level** | Auth boundaries, secret flow, blast radius on compromise, PII exposure, log leakage, CSRF/SSRF/auth-confused-deputy at the design level (not at the code level — that's `requesting-code-review`'s job). | +| **Rollback story** | How do we undo this if it goes wrong in production? 
For any change class that runtime-launch-validation already triggers on (build/deployment/version pins/startup config/migrations/plugin loading), the design MUST specify a rollback path. If absent → finding. | +| **Simpler alternative not considered** | Name the laziest plausible solution. Did the design consider it and reject it for stated reasons? If not → finding. "Couldn't this be a flat file?" "Couldn't this be a cron job?" "Couldn't this be a single SQL view?" | +| **User-intent drift** | Re-read the original ask. Does the design solve what the user asked for, or does it solve a different problem that was easier to design for? Compare the design's stated goals against the user's stated goals; flag drift. | + +## Bug-class checklist — plan phase (must scan) + +The plan-phase reviewer scans the design-phase classes above (since the plan +inherits the design's blast radius) and adds: + +| Class | Definition | +|---|---| +| **Over-decomposition / under-decomposition** | Does task granularity match `writing-plans`'s 2–5-minute target per step? A 40-step plan for a CSV export is suspect. A 2-step plan for a schema migration is suspect. Flag both directions. | +| **Verification-class mismatch** | For each task, does its verification step match its change class per the table in `skills/writing-plans/SKILL.md` ("Verification per change class")? A schema migration verified by unit tests = finding. An API endpoint with no curl invocation = finding. | +| **Hidden serial dependencies** | Tasks the plan claims are independent but actually share state (same file, same DB row, same config key). If executed in parallel, they'll collide. Flag any such pair. | +| **Missing rollback wiring** | The design specifies a rollback story (per the design-phase class above). Is it actually implemented in the plan as a task or step? Or is it a paragraph nobody is going to write code for? | + +## Process + +1. 
**Read the artifact under review.** For `--phase=design`, read the design + doc at `docs/plans/YYYY-MM-DD-<topic>-design.md`. For `--phase=plan`, read + both the design doc AND the plan doc at + `docs/plans/YYYY-MM-DD-<topic>.md` — the plan inherits the design's + premises and you must attack both layers. +2. **Read the original user ask.** Where available (transcript, issue body, + PR description). User-intent drift can't be caught without it. +3. **Spot-check the repo for precedent conflicts.** Grep for related + skills, similar designs in `docs/plans/`, established patterns. Cite + what you find. +4. **Run every bug-class check** in the relevant checklist. For each class, + record one of: + - **Finding** with file/section + severity (Critical / Important / Minor) + - **Clean** with a one-sentence note on what you specifically checked +5. **Surface options, not just objections.** For findings, propose a + concrete fix or alternative. "This design assumes X" → "Alternative: state + X explicitly, and add a fallback if X is false at runtime." +6. **Write the report.** Format below. Commit verdict: PASS / FAIL. + +## Report format + +````markdown +### Adversarial Review Report + +**Phase:** design | plan +**Artifact:** docs/plans/YYYY-MM-DD-<topic>.md +**Status:** PASS | FAIL + +**Findings (Critical):** +- [class] [section/line]: <finding>. Recommendation: <fix or alternative>. + +**Findings (Important):** +- [class] [section/line]: <finding>. Recommendation: <fix or alternative>. + +**Findings (Minor):** +- [class] [section/line]: <finding>. Recommendation: <fix or alternative>. + +**Bug-class scan transcript:** +| Class | Result | Note | +|---|---|---| +| Unstated assumptions | Finding / Clean | <one-sentence note> | +| Repo-precedent conflicts | Finding / Clean | <one-sentence note> | +| ... | ... | ... | + +**Options the author may not have considered:** +1. <option>: <rationale> +2. <option>: <rationale> + +**Verdict reasoning:** <one paragraph explaining the verdict> +```` + +A bare "looks good" verdict is rejected. The bug-class scan transcript MUST +list every class with a result, even if the result is Clean. 
+ +## PASS / FAIL semantics + +- **PASS** — no Critical findings; Important findings either resolved or + explicitly accepted by the author with reasoning. +- **FAIL** — one or more Critical findings, OR Important findings the + author has not addressed. + +On FAIL: + +- Feed findings back to the upstream skill (`brainstorming` for design + phase, `writing-plans` for plan phase). +- The upstream skill revises the artifact based on Critical and Important + findings, then re-invokes adversarial review. +- **Max 2 revision cycles** before escalating to the user with a summary of + unresolved findings. This mirrors the bound in + `skills/alignment-check/SKILL.md`. +- The user may **override** a finding (mark it accepted with reasoning). + Overrides are recorded in the artifact (e.g., "Reviewer flagged X as + YAGNI; accepted because Y") so the decision is durable. + +On PASS: + +- For `--phase=design`: invoke `writing-plans`. +- For `--phase=plan`: invoke `alignment-check` (which is now narrowly + structural — adversarial concerns are already cleared). + +## Dispatching the reviewer agent + +Dispatch a `balanced`-tier subagent. Same tier as `alignment-check` and +`requesting-code-review` reviewers — this is review-class work, not +orchestration. + + +Use the Agent tool to dispatch: + +```` +Agent tool (general-purpose, model: balanced): + description: "Adversarial review: <design|plan> <artifact path>" + prompt: | + You are adversarially reviewing a software <design|plan> document. + + Read these files: + - <path to design doc> + - <path to plan doc> (only for --phase=plan) + - The original user ask (paste it inline below). + + USER ASK (verbatim): + <paste the original user ask here> + + ## Required framing + Find at least three things wrong with this <design|plan>, even if they + seem minor — or, if fewer than three are found, explicitly document + every bug-class check you ran and what you found (or didn't). Bias + toward finding issues. You are NOT validating that the artifact is + good — you are looking for misconceptions, unstated assumptions, and + ideas the author didn't consider. 
Reflexive approval is forbidden. + + ## Required scans + Scan every bug class listed in the relevant checklist (paste the + checklist for the chosen phase verbatim into the dispatch prompt — do + not make the subagent read this skill file; embed the table inline). + + ## Required output + Use the Report format from the skill. Every bug class must appear in + the scan transcript with a result (Finding or Clean) and a one-sentence + note. Findings must include severity, file/section reference, and a + concrete recommendation. + + Set Status to PASS only if there are zero Critical findings AND every + Important finding either has a fix recommendation accepted by the + author or is escalated as an open question. Otherwise FAIL. +```` + + + +Run the adversarial review inline: read the design (and plan, if +`--phase=plan`), perform every bug-class scan in the checklist, and produce +the Report format above. The framing requirements still apply — adversarial +mindset, ≥3 findings or full transcript, no reflexive approval. + + +## Integration + +**Called by:** +- `brainstorming` — `--phase=design`, after design doc is committed. +- `writing-plans` — `--phase=plan`, after plan doc is committed, before + `alignment-check`. +- Manual — user invokes against any artifact in `docs/plans/`. + +**Calls:** +- `brainstorming` — on FAIL during `--phase=design`, for revision. +- `writing-plans` — on FAIL during `--phase=plan`, for revision. +- `writing-plans` — on PASS during `--phase=design`. +- `alignment-check` — on PASS during `--phase=plan`. + +## Why two phases, not one + +Different bug classes live in different artifacts: + +- The **design** is the place to ask "is this the right idea?". Catching a + YAGNI violation here saves N tasks of plan-writing and N×M lines of + implementation. +- The **plan** is the place to ask "is the breakdown sound?". Verification- + class mismatches and hidden serial dependencies don't show up in the + design — only in the plan. 
+ +Folding them into one pass at one stage misses half the findings. + +## Why "options the author may not have considered" is mandatory + +A reviewer that only objects produces a frustrated author. A reviewer that +**also** offers concrete alternatives produces a stronger artifact. The +"Options" section of the report is non-negotiable: every report must include +at least one alternative the author may not have weighed, even if the +verdict is PASS. This is the antidote to reflexive sign-off and the +antidote to demoralizing critique. + +## Relationship to other review skills + +| Skill | Attacks | When | +|---|---|---| +| `adversarial-design-review --phase=design` | Ideas in the design | After brainstorming | +| `adversarial-design-review --phase=plan` | Ideas in the plan | After writing-plans | +| `alignment-check` | Structural coverage (design ↔ plan trace) | After plan-phase adversarial review | +| `requesting-code-review` | Code (scope + bug classes) | After each task's commit | +| `verification-before-completion` | Claims (evidence before assertions) | Before claiming done | + +Each gate has a distinct target. Stacking them does not produce duplicate +findings — they catch different bug classes at different stages. diff --git a/skills/alignment-check/SKILL.md b/skills/alignment-check/SKILL.md index 343dc85..8d54b50 100644 --- a/skills/alignment-check/SKILL.md +++ b/skills/alignment-check/SKILL.md @@ -7,13 +7,15 @@ description: Use after writing-plans to verify the implementation plan covers al ## Overview -Verify that an implementation plan faithfully covers every requirement in the approved design — nothing missing, nothing extra. This is an automated gate between planning and execution. +Verify that an implementation plan faithfully covers every requirement in the approved design — nothing missing, nothing extra. This is an automated **structural** gate between planning and execution. **Core principle:** Every design requirement maps to a plan task. 
Every plan task traces to a design requirement. Drift in either direction is caught before execution begins. +**Scope:** This skill is narrowly structural. It does NOT attack the ideas in the design or plan — that is the job of `adversarial-design-review`, which runs first (in autonomous mode) on both the design and the plan. By the time `alignment-check` runs, idea-level findings are already resolved; alignment is a forward + reverse trace, nothing more. + ## When to Use -Invoked automatically by `writing-plans` in autonomous mode. Can also be invoked manually after writing a plan. +Invoked automatically by `writing-plans` in autonomous mode, **after** `adversarial-design-review --phase=plan` passes. Can also be invoked manually after writing a plan. ## The Process @@ -125,7 +127,7 @@ Proceed to execution: ## Integration **Called by:** -- `writing-plans` (autonomous mode) — after plan is written +- `writing-plans` (autonomous mode) — after the plan is written AND `adversarial-design-review --phase=plan` has passed - Manual invocation — when user wants to verify alignment **Calls:** diff --git a/skills/brainstorming/SKILL.md b/skills/brainstorming/SKILL.md index de249ad..59b8b45 100644 --- a/skills/brainstorming/SKILL.md +++ b/skills/brainstorming/SKILL.md @@ -26,9 +26,12 @@ You MUST create a task for each of these items and complete them in order: 1. **Explore project context** — check files, docs, recent commits 2. **Ask clarifying questions** — adaptive batching: group related questions to reduce round-trips; use targeted singles for follow-ups 3. **Propose 2-3 approaches** — with trade-offs and your recommendation -4. **Present design** — in sections scaled to their complexity, get user approval after each section -5. **Write design doc** — save to `docs/plans/YYYY-MM-DD--design.md` and commit -6. **Transition to implementation** — invoke writing-plans skill to create implementation plan +4. 
**List load-bearing assumptions explicitly** — every design rests on assumptions ("upstream API is idempotent", "single-tenant", "user has admin"); write them down so the adversarial reviewer can attack them +5. **Self-challenge round** — before presenting to the user, role-play a skeptic against your own design and surface the top 3 doubts (see "Self-challenge round" below). Cleans up obvious issues before the user sees the design. +6. **Present design** — in sections scaled to their complexity, get user approval after each section. Include the assumption list and the top doubts surfaced by the self-challenge so the user sees them. +7. **Write design doc** — save to `docs/plans/YYYY-MM-DD-<topic>-design.md` and commit. The doc MUST include an `## Assumptions` section and, for change classes that affect runtime (build, deployment, version pins, startup config, migrations, plugin loading paths — the same trigger list as `runtime-launch-validation`), a `## Rollback` section. +8. **Adversarial design review** — invoke `adversarial-design-review --phase=design`. On FAIL, revise per findings and re-run (max 2 cycles). On PASS, proceed. +9. **Transition to implementation** — invoke writing-plans skill to create implementation plan ## Process Flow @@ -37,22 +40,29 @@ digraph brainstorming { "Explore project context" [shape=box]; "Ask clarifying questions" [shape=box]; "Propose 2-3 approaches" [shape=box]; + "List assumptions" [shape=box]; + "Self-challenge round" [shape=box]; "Present design sections" [shape=box]; "User approves design?"
[shape=diamond]; "Write design doc" [shape=box]; + "Adversarial design review" [shape=diamond]; "Invoke writing-plans skill" [shape=doublecircle]; "Explore project context" -> "Ask clarifying questions"; "Ask clarifying questions" -> "Propose 2-3 approaches"; - "Propose 2-3 approaches" -> "Present design sections"; + "Propose 2-3 approaches" -> "List assumptions"; + "List assumptions" -> "Self-challenge round"; + "Self-challenge round" -> "Present design sections"; "Present design sections" -> "User approves design?"; "User approves design?" -> "Present design sections" [label="no, revise"]; "User approves design?" -> "Write design doc" [label="yes"]; - "Write design doc" -> "Invoke writing-plans skill"; + "Write design doc" -> "Adversarial design review"; + "Adversarial design review" -> "Write design doc" [label="FAIL: revise"]; + "Adversarial design review" -> "Invoke writing-plans skill" [label="PASS"]; } ``` -**The terminal state is invoking writing-plans.** Do NOT invoke frontend-design, mcp-builder, or any other implementation skill. The ONLY skill you invoke after brainstorming is writing-plans. +**The terminal state is invoking writing-plans** (after adversarial review passes). Do NOT invoke frontend-design, mcp-builder, or any other implementation skill. The ONLY skills you invoke after brainstorming are `adversarial-design-review` and then `writing-plans`, in that order.
## The Process @@ -83,35 +93,60 @@ digraph brainstorming { - Once you believe you understand what you're building, present the design - Scale each section to its complexity: a few sentences if straightforward, up to 200-300 words if nuanced - Ask after each section whether it looks right so far -- Cover: architecture, components, data flow, error handling, testing +- Cover: architecture, components, data flow, error handling, testing, **assumptions** (load-bearing claims), **rollback** (for runtime-affecting change classes) +- Include the top 3 doubts surfaced by the self-challenge round so the user can react before approving - Be ready to go back and clarify if something doesn't make sense +## Self-challenge round + +Before presenting the design to the user, role-play a skeptic against your own design for one short pass. The goal is to clean up the obvious issues *before* the user sees them and *before* the heavyweight `adversarial-design-review` runs. + +Ask yourself, in order, and keep notes: + +1. **What's the laziest plausible solution to the user's actual ask?** Could a flat file, a cron job, or a single SQL view do it? If yes, why does the design pick something heavier? +2. **Which of my listed assumptions, if false, would break this design?** Pick the most fragile one. +3. **What does this design solve that the user did NOT ask for?** YAGNI sweep — list any feature, knob, or generality that isn't traceable to the user's stated goals. +4. **What fails first under partial failure / restart-mid-operation / malformed input?** Pick one and decide whether the design addresses it. +5. **Does this design fight any existing pattern in the repo?** If yes, name the conflicting `path/file.md`. + +If any answer surfaces a real issue, revise the design before presenting it. Otherwise, surface the top 3 doubts to the user when you present (so they can decide whether the trade-offs are acceptable). This is a 30-second exercise, not a debate. 
+ +This is intentionally lightweight; the heavyweight pass is `adversarial-design-review`, which runs after the design is committed. + ## Design-only mode When the user wants design exploration without execution, they pass `--design-only` to brainstorming. **Behavior under `--design-only`:** -1. Run the full brainstorming flow (explore context → questions → approaches → design → write design doc → commit). -2. When invoking writing-plans, propagate the `--design-only` flag. -3. writing-plans honors the flag: alignment-check PASS → STOP (no execution dispatched). On alignment FAIL, writing-plans revises and re-checks per its normal FAIL handling, then stops — still no execution dispatched. On persistent FAIL (after max 2 revision cycles), escalates to user with unresolved drift summary — no execution dispatched regardless. -4. The pipeline ends with a committed design doc + plan in `docs/plans/`. +1. Run the full brainstorming flow (explore context → questions → approaches → assumptions → self-challenge → design → write design doc → commit). +2. Invoke `adversarial-design-review --phase=design`. On FAIL, revise the design and re-run (max 2 cycles). On persistent FAIL, escalate to user with unresolved findings — no plan dispatched. +3. On PASS, invoke writing-plans, propagating the `--design-only` flag. +4. writing-plans honors the flag: adversarial-design-review (plan phase) PASS → alignment-check PASS → STOP (no execution dispatched). On FAIL at any gate, writing-plans revises and re-checks per its normal FAIL handling, then stops — still no execution dispatched. On persistent FAIL (after max 2 revision cycles per gate), escalates to user — no execution dispatched regardless. +5. The pipeline ends with a committed design doc + plan in `docs/plans/` plus the adversarial review reports. -**Default (no flag):** brainstorming → writing-plans → alignment-check → subagent-driven-development → … (autonomous handoff to execution). 
+**Default (no flag):** brainstorming → adversarial-design-review (design) → writing-plans → adversarial-design-review (plan) → alignment-check → subagent-driven-development → … (autonomous handoff to execution). ## After the Design **Documentation:** - Write the validated design to `docs/plans/YYYY-MM-DD--design.md` +- Include explicit `## Assumptions` and `## Rollback` sections (the latter only required for change classes that affect runtime — see the trigger list in `runtime-launch-validation` / `finishing-a-development-branch` Step 1b) - Use elements-of-style:writing-clearly-and-concisely skill if available - Commit the design document to git +**Adversarial review (mandatory):** +- Invoke `adversarial-design-review --phase=design` against the committed design +- On PASS, proceed to autonomous handoff +- On FAIL, revise the design based on Critical and Important findings and re-run (max 2 cycles before escalating to the user with unresolved findings) +- The user may override a finding (mark it accepted with reasoning recorded in the design doc) + **Autonomous handoff:** - This is the user's **last interaction point** — everything after runs autonomously -- Invoke the writing-plans skill with autonomous context: the design is approved, no further user input needed +- Invoke the writing-plans skill with autonomous context: the design is approved AND adversarially reviewed, no further user input needed - The writing-plans skill will prefer Claude's Plan Mode if available (Claude Code), falling back to its built-in planning process in other environments -- The pipeline from here: writing-plans → alignment-check → team execution → PR creation → PR monitoring -- Do NOT invoke any other skill. writing-plans is the next step. It handles the rest of the autonomous pipeline. +- The pipeline from here: writing-plans → adversarial-design-review (plan phase) → alignment-check → team execution → PR creation → PR monitoring +- Do NOT invoke any other skill. 
`adversarial-design-review` runs first; on PASS, writing-plans is the next step. It handles the rest of the autonomous pipeline. ## Key Principles @@ -119,6 +154,9 @@ When the user wants design exploration without execution, they pass `--design-on - **Multiple choice preferred** - Easier to answer than open-ended when possible - **YAGNI ruthlessly** - Remove unnecessary features from all designs - **Explore alternatives** - Always propose 2-3 approaches before settling +- **Make assumptions explicit** - Load-bearing assumptions belong in writing, not in the agent's head +- **Self-challenge before presenting** - Cleans up obvious issues; saves the user a round-trip +- **Adversarial review before execution** - The cheapest place to kill a bad idea is before the plan is written - **Incremental validation** - Present design, get approval before moving on - **Be flexible** - Go back and clarify when something doesn't make sense -- **Design approval = autonomy handoff** - After design approval, the pipeline runs without user input +- **Design approval = autonomy handoff** - After design approval AND adversarial review pass, the pipeline runs without user input diff --git a/skills/using-superpowers/SKILL.md b/skills/using-superpowers/SKILL.md index 8314a6b..06cf7e6 100644 --- a/skills/using-superpowers/SKILL.md +++ b/skills/using-superpowers/SKILL.md @@ -81,7 +81,7 @@ When multiple skills could apply, use this order: 1. **Process skills first** (brainstorming, debugging) - these determine HOW to approach the task 2. **Implementation skills second** (frontend-design, mcp-builder) - these guide execution 3. 
**Pipeline skills auto-chain** — these invoke each other automatically in the autonomous pipeline: - brainstorming → writing-plans → alignment-check → subagent-driven-development → finishing-a-development-branch → pr-monitoring + brainstorming → adversarial-design-review (design phase) → writing-plans → adversarial-design-review (plan phase) → alignment-check → subagent-driven-development → finishing-a-development-branch → pr-monitoring "Let's build X" → brainstorming first, then the pipeline runs autonomously after design approval. "Fix this bug" → debugging first, then domain-specific skills. diff --git a/skills/writing-plans/SKILL.md b/skills/writing-plans/SKILL.md index 1c1e368..4ecd12d 100644 --- a/skills/writing-plans/SKILL.md +++ b/skills/writing-plans/SKILL.md @@ -37,15 +37,17 @@ When Claude's Plan Mode is not available, use the full planning process describe ## Autonomous Mode -When invoked from brainstorming with autonomous context (design already approved): +When invoked from brainstorming with autonomous context (design already approved AND adversarially reviewed): 1. **Skip user plan review** — write the plan directly without presenting it for approval -2. **Invoke `superpowers:alignment-check`** — dispatch the alignment verification agent -3. **On alignment PASS** — invoke `superpowers:subagent-driven-development` to begin execution -4. **On alignment FAIL** — revise the plan based on drift items, re-check (max 2 cycles) -5. **On persistent FAIL** — escalate to user with unresolved drift summary +2. **Invoke `superpowers:adversarial-design-review --phase=plan`** — adversarially attack the plan (and inherited design) before structural alignment is checked +3. **On adversarial-review PASS** — invoke `superpowers:alignment-check` +4. **On adversarial-review FAIL** — revise the plan based on Critical and Important findings, re-run adversarial review (max 2 cycles per gate) +5. 
**On alignment PASS** — invoke `superpowers:subagent-driven-development` to begin execution +6. **On alignment FAIL** — revise the plan based on drift items, re-check (max 2 cycles) +7. **On persistent FAIL at any gate** — escalate to user with unresolved findings/drift summary -The autonomous flag propagates through the entire pipeline: writing-plans → alignment-check → execution → PR creation → PR monitoring. +The autonomous flag propagates through the entire pipeline: writing-plans → adversarial-design-review (plan phase) → alignment-check → execution → PR creation → PR monitoring. ## Design-only mode @@ -59,11 +61,13 @@ Do not add YAML frontmatter to signal design-only mode. Saved plan documents mus 1. Save the plan to `docs/plans/YYYY-MM-DD-.md` as normal. 2. Commit the plan as normal. -3. Invoke `superpowers:alignment-check` as normal. -4. **On alignment PASS: STOP.** Do NOT invoke `superpowers:subagent-driven-development`. -5. **On alignment FAIL:** revise the plan based on drift items and run `superpowers:alignment-check` again, with a maximum of 2 alignment-check cycles total. If a revised plan passes alignment, still STOP and do not proceed to execution. -6. **On persistent FAIL after those 2 cycles:** escalate to the user with an unresolved drift summary. Do NOT invoke `superpowers:subagent-driven-development` or dispatch any execution. -7. The plan + design sit in `docs/plans/` for future execution. The orchestrator (or a future invocation) can resume by passing the plan to `superpowers:subagent-driven-development` directly once alignment issues are resolved. +3. Invoke `superpowers:adversarial-design-review --phase=plan` as normal. +4. On adversarial-review FAIL: revise the plan based on findings and re-run adversarial review (max 2 cycles). +5. On adversarial-review PASS: invoke `superpowers:alignment-check` as normal. +6. **On alignment PASS: STOP.** Do NOT invoke `superpowers:subagent-driven-development`. +7. 
**On alignment FAIL:** revise the plan based on drift items and run `superpowers:alignment-check` again, with a maximum of 2 alignment-check cycles total. If a revised plan passes alignment, still STOP and do not proceed to execution. +8. **On persistent FAIL at any gate (after the cycle bound for that gate):** escalate to the user with an unresolved findings/drift summary. Do NOT invoke `superpowers:subagent-driven-development` or dispatch any execution. +9. The plan + design + adversarial review reports sit in `docs/plans/` for future execution. The orchestrator (or a future invocation) can resume by passing the plan to `superpowers:subagent-driven-development` directly once gate issues are resolved. **When to use:** @@ -71,7 +75,7 @@ Do not add YAML frontmatter to signal design-only mode. Saved plan documents mus - Cross-cutting designs that affect multiple workstreams; lock the design in before any one workstream starts. - Designs with prerequisites in-flight elsewhere; queue the plan now, execute when prerequisites land. -**Default (no flag):** `superpowers:alignment-check` PASS → invoke `superpowers:subagent-driven-development`. Same as before. +**Default (no flag):** `superpowers:adversarial-design-review --phase=plan` PASS → `superpowers:alignment-check` PASS → invoke `superpowers:subagent-driven-development`. Adversarial review runs **before** alignment check so that idea-level findings are resolved before structural trace. ## Verification per change class @@ -92,7 +96,9 @@ When writing a plan task, the verification step must match the change class. A g These examples are illustrative minimums; per-task `Expected:` fields must be literal values the check can assert against. -Every plan task must include the verification step appropriate to its change class, as defined in the table above. 
For tasks whose `finishing-a-development-branch` Step 1b trigger conditions are met (build configuration, deployment configuration, version pins on runtime components, startup configuration, migrations, plugin loading paths), include the runtime-launch-validation step in the TDD breakdown as well. Hook/trigger/event-handler changes are NOT in the Step 1b trigger list — they use only the class-appropriate verification from the table. +Every plan task must include the verification step appropriate to its change class, as defined in the table above. For tasks whose `finishing-a-development-branch` Step 1b trigger conditions are met (build configuration, deployment configuration, version pins on runtime components, startup configuration, migrations, plugin loading paths), include the runtime-launch-validation step in the TDD breakdown as well **and include a one-line rollback note** in the task ("Rollback: revert commit + re-run migration tool down + smoke check"; "Rollback: pin to previous version X.Y.Z and rebuild"). Hook/trigger/event-handler changes are NOT in the Step 1b trigger list — they use only the class-appropriate verification from the table. + +The rollback note exists so that adversarial-design-review (plan phase) can verify the design's rollback story is actually wired into the plan, not orphaned in a paragraph. Plans without rollback notes for runtime-affecting tasks will fail adversarial review. The plan author writes the expected output literally — not "passes tests" but "logs `engine ready` within 10 seconds and `/healthz` returns 200". @@ -177,13 +183,15 @@ git commit -m "feat: add specific feature" ### Autonomous Mode (from brainstorming pipeline) -When running autonomously (design already approved, no user interaction): +When running autonomously (design already approved AND adversarially reviewed, no user interaction): 1. Save the plan to `docs/plans/.md` 2. Commit the plan -3. 
Invoke `superpowers:alignment-check` to verify design-to-plan alignment -4. On PASS: invoke `superpowers:subagent-driven-development` (which uses Agent Teams when available) -5. Do NOT ask the user for execution choice — the pipeline is autonomous +3. Invoke `superpowers:adversarial-design-review --phase=plan` to attack the plan's ideas +4. On adversarial-review PASS: invoke `superpowers:alignment-check` to verify design-to-plan structural alignment +5. On alignment PASS: invoke `superpowers:subagent-driven-development` (which uses Agent Teams when available) +6. On any FAIL: revise per findings/drift, re-run that gate (max 2 cycles per gate), then either continue or escalate to user +7. Do NOT ask the user for execution choice — the pipeline is autonomous ### Manual Mode (direct invocation) diff --git a/tests/cross-llm-coverage.md b/tests/cross-llm-coverage.md index 9f43801..0957c03 100644 --- a/tests/cross-llm-coverage.md +++ b/tests/cross-llm-coverage.md @@ -5,6 +5,7 @@ host-neutral. Updated whenever a skill changes. 
| Skill | Claude Code | Codex | OpenCode | Cursor | Notes | |---|---|---|---|---|---| +| adversarial-design-review | host-conditional | host-conditional | host-conditional | host-conditional | Agent dispatch block in ``; inline-execution prose in `` | | alignment-check | host-conditional | host-conditional | host-conditional | host-conditional | spawn block in ``; prose fallback outside | | brainstorming | host-conditional | host-conditional | host-conditional | host-conditional | `AskUserQuestion` in ``; numbered-list fallback in `` | | dispatching-parallel-agents | host-neutral | host-neutral | host-neutral | host-neutral | generic parallel-dispatch pattern; no tool-specific refs | From f7367aee6f1c8139942d575e3b3e23f1e45ecd61 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 1 May 2026 03:13:11 +0000 Subject: [PATCH 2/7] feat: ship deferred roadmap items as functionality (v5.5.0) Agent-Logs-Url: https://github.com/GoCodeAlone/claude-superpowers/sessions/1e87f44a-1b33-4b41-a3fc-f111acd5069e Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> --- .claude-plugin/marketplace.json | 2 +- .claude-plugin/plugin.json | 2 +- .cursor-plugin/plugin.json | 2 +- README.md | 35 ++-- RELEASE-NOTES.md | 67 +++++++- decisions/0000-template.md | 35 ++++ decisions/README.md | 13 ++ docs/retros/README.md | 7 + docs/roadmap.md | 74 ++------ skills/brainstorming/SKILL.md | 22 ++- skills/post-merge-retrospective/SKILL.md | 151 ++++++++++++++++ skills/pr-monitoring/SKILL.md | 10 +- skills/recording-decisions/SKILL.md | 88 ++++++++++ skills/using-superpowers/SKILL.md | 4 +- skills/writing-plans/SKILL.md | 6 + tests/cross-llm-coverage.md | 2 + tests/skill-activation-audit.sh | 210 +++++++++++++++++++++++ tests/skill-cross-refs.sh | 176 +++++++++++++++++++ 18 files changed, 829 insertions(+), 77 deletions(-) create mode 100644 decisions/0000-template.md create mode 100644 decisions/README.md create mode 100644 
docs/retros/README.md create mode 100644 skills/post-merge-retrospective/SKILL.md create mode 100644 skills/recording-decisions/SKILL.md create mode 100755 tests/skill-activation-audit.sh create mode 100755 tests/skill-cross-refs.sh diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 010cdf4..404ed21 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -9,7 +9,7 @@ { "name": "superpowers", "description": "Core skills library for Claude Code: TDD, debugging, collaboration patterns, and proven techniques", - "version": "5.4.0", + "version": "5.5.0", "source": "./", "author": { "name": "Jesse Vincent", diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 6e1f198..744ef6e 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "superpowers", "description": "Core skills library for Claude Code: TDD, debugging, collaboration patterns, and proven techniques", - "version": "5.4.0", + "version": "5.5.0", "author": { "name": "Jesse Vincent", "email": "jesse@fsck.com" diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json index 9b62c5d..693e62a 100644 --- a/.cursor-plugin/plugin.json +++ b/.cursor-plugin/plugin.json @@ -2,7 +2,7 @@ "name": "superpowers", "displayName": "Superpowers", "description": "Core skills library: TDD, debugging, collaboration patterns, and proven techniques", - "version": "5.4.0", + "version": "5.5.0", "author": { "name": "Jesse Vincent", "email": "jesse@fsck.com" diff --git a/README.md b/README.md index 92cb7b9..9e08d97 100644 --- a/README.md +++ b/README.md @@ -101,28 +101,40 @@ Per-skill host-conditional audit: [tests/cross-llm-coverage.md](tests/cross-llm- ## The Basic Workflow -1. **brainstorming** - Activates before writing code. Refines rough ideas through questions, explores alternatives, lists load-bearing assumptions, runs a self-challenge round, presents design in sections for validation. 
Saves design document. +1. **brainstorming** - Activates before writing code. Refines rough ideas through questions, explores alternatives, lists load-bearing assumptions, runs a self-challenge round, presents design in sections for validation. Soft cap of 5 question-batches; on exceed, agent presents best-current-approximation and asks user to approve / refine / extend the budget. Saves design document. 2. **adversarial-design-review (design phase)** - Activates after design doc is committed. Adversarially attacks the *ideas* in the design (not just structure): unstated assumptions, repo-precedent conflicts, YAGNI violations, missing failure modes, security gaps, rollback story, simpler alternatives, user-intent drift. PASS/FAIL with max 2 revision cycles. -3. **using-git-worktrees** - Activates after design approval. Creates isolated workspace on new branch, runs project setup, verifies clean test baseline. +3. **recording-decisions** - Activates inside brainstorming and writing-plans whenever a non-trivial choice is made (divergence from precedent, trade-off between ≥2 plausible approaches, adversarial-review override, cross-skill structural change). Adds a numbered ADR in `decisions/` so the *why* survives renames and refactors. -4. **writing-plans** - Activates with approved design. Breaks work into bite-sized tasks (2-5 minutes each). Every task has exact file paths, complete code, verification steps. Runtime-affecting tasks include rollback notes. +4. **using-git-worktrees** - Activates after design approval. Creates isolated workspace on new branch, runs project setup, verifies clean test baseline. -5. **adversarial-design-review (plan phase)** - Activates after plan doc is committed. Inherits the design checklist plus plan-specific scans: task granularity, verification-class match, hidden serial dependencies, rollback wiring. +5. **writing-plans** - Activates with approved design. Breaks work into bite-sized tasks (2-5 minutes each). 
Every task has exact file paths, complete code, verification steps. Runtime-affecting tasks include rollback notes. -6. **alignment-check** - Activates after adversarial review of plan passes. Narrowly structural: every design requirement maps to a plan task; every plan task traces to a design requirement. +6. **adversarial-design-review (plan phase)** - Activates after plan doc is committed. Inherits the design checklist plus plan-specific scans: task granularity, verification-class match, hidden serial dependencies, rollback wiring. -7. **subagent-driven-development** or **executing-plans** - Activates with plan. Dispatches fresh subagent per task with two-stage review (spec compliance, then code quality), or executes in batches with human checkpoints. +7. **alignment-check** - Activates after adversarial review of plan passes. Narrowly structural: every design requirement maps to a plan task; every plan task traces to a design requirement. -8. **test-driven-development** - Activates during implementation. Enforces RED-GREEN-REFACTOR: write failing test, watch it fail, write minimal code, watch it pass, commit. Deletes code written before tests. +8. **subagent-driven-development** or **executing-plans** - Activates with plan. Dispatches fresh subagent per task with two-stage review (spec compliance, then code quality), or executes in batches with human checkpoints. -9. **requesting-code-review** - Activates between tasks. Reviews against plan, reports issues by severity. Critical issues block progress. +9. **test-driven-development** - Activates during implementation. Enforces RED-GREEN-REFACTOR: write failing test, watch it fail, write minimal code, watch it pass, commit. Deletes code written before tests. -10. **finishing-a-development-branch** - Activates when tasks complete. Verifies tests, presents options (merge/PR/keep/discard), cleans up worktree. +10. **requesting-code-review** - Activates between tasks. Reviews against plan, reports issues by severity. 
Critical issues block progress. + +11. **finishing-a-development-branch** - Activates when tasks complete. Verifies tests, presents options (merge/PR/keep/discard), cleans up worktree. + +12. **pr-monitoring** - Activates after autonomous PR creation. Watches CI and review comments; fixes failures and responds to feedback until green. + +13. **post-merge-retrospective** - Activates after `pr-monitoring` exits successfully on a merged PR with green CI. Reads the design, plan, adversarial-review reports, code-review threads, and CI history; produces a short retro in `docs/retros/` scoring each adversarial finding (Prescient / Resolved upfront / False positive / Inconclusive), naming gate misses, and surfacing plugin-level follow-ups when patterns emerge across retros. **The agent checks for relevant skills before any task.** Mandatory workflows, not suggestions. +## Auditing skill activations + +`tests/skill-activation-audit.sh` reads `.claude/superpowers-state/in-progress.jsonl` (the activity log written by the `record-activity` hook) and reports which pipeline gates fired during a session. Use it post-hoc when you want to confirm whether the autonomous pipeline ran end-to-end or stopped earlier than expected. Strictly local — never transmits anything. + +`tests/skill-cross-refs.sh` verifies that cross-skill references inside `skills/` and `agents/` markdown resolve (skill names, `Step N` references, `superpowers:` mentions). Run it before committing any skill edit that renames a skill or renumbers a step. 
+ ## What's Inside ### Skills Library @@ -135,8 +147,9 @@ Per-skill host-conditional audit: [tests/cross-llm-coverage.md](tests/cross-llm- - **verification-before-completion** - Ensure it's actually fixed **Collaboration** -- **brainstorming** - Socratic design refinement (with assumption-listing and self-challenge round) +- **brainstorming** - Socratic design refinement (with assumption-listing, self-challenge round, and a 5-batch question budget) - **adversarial-design-review** - Adversarial attack on design and plan ideas before execution (two phases: design, plan) +- **recording-decisions** - ADRs in `decisions/` for non-trivial trade-offs and rejected alternatives - **writing-plans** - Detailed implementation plans - **executing-plans** - Batch execution with checkpoints - **alignment-check** - Structural design ↔ plan trace (forward + reverse) @@ -145,6 +158,8 @@ Per-skill host-conditional audit: [tests/cross-llm-coverage.md](tests/cross-llm- - **receiving-code-review** - Responding to feedback - **using-git-worktrees** - Parallel development branches - **finishing-a-development-branch** - Merge/PR decision workflow +- **pr-monitoring** - Watches CI and reviews after autonomous PR creation +- **post-merge-retrospective** - Closes the loop on merged PRs; scores each adversarial finding and surfaces gate misses - **subagent-driven-development** - Fast iteration with two-stage review (spec compliance, then code quality) **Meta** diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 2213a01..f7f3b38 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -1,5 +1,68 @@ # Superpowers Release Notes +## v5.5.0 (2026-05-01) + +### New Features + +Five items that v5.4.0 deferred into a roadmap have shipped as actual functionality: + +**Decision log / ADRs (`skills/recording-decisions/`, `decisions/`)** + +Architecture Decision Records for non-trivial trade-offs and rejected alternatives. 
Numbered sequentially in `decisions/`, using Michael Nygard's three-section format (Context / Decision / Consequences) with a "Reversibility" addendum. The skill is light by design: a numbering rule, a template, a four-condition trigger, and a commit convention. Wired into `brainstorming` (when designs make non-trivial choices) and `writing-plans` (when plans introduce a non-obvious choice not already covered by an ADR cited from the design). The template lives at `decisions/0000-template.md`. + +**Post-merge retrospective (`skills/post-merge-retrospective/`, `docs/retros/`)** + +Closes the autonomous-pipeline loop. After `pr-monitoring` exits successfully on a merged PR with green CI and a design + plan in `docs/plans/`, this skill: + +- Scores each adversarial-review finding as Prescient / Resolved upfront / False positive / Inconclusive based on what showed up in code reviews and CI. +- Walks every code-review comment and CI failure and names the gate that *should* have caught it earlier (gate misses = the actionable signal). +- Verifies the expected pipeline gates fired using `tests/skill-activation-audit.sh`. +- Produces a one-page retro at `docs/retros/YYYY-MM-DD-<feature>-retro.md`. +- Surfaces plugin-level follow-ups when a gate miss recurs across multiple retros. + +Wired into `pr-monitoring`'s exit conditions. The retro is intentionally short — long retros don't get read. + +**Skill-usage telemetry (`tests/skill-activation-audit.sh`)** + +Parses `.claude/superpowers-state/in-progress.jsonl` (the activity log written by the existing `record-activity` PostToolUse hook) and reports which skills / agents fired during a session. Detects "expected but missing" pipeline gates by walking the canonical chain (brainstorming → adversarial-design-review → … → pr-monitoring). Strictly local; never transmits anything off the machine. Used directly by `post-merge-retrospective`. Exit code 2 when expected gates didn't fire so it can be wired into CI for automation runs.
+ +**Brainstorming cost-control gate (`skills/brainstorming/SKILL.md`)** + +Soft cap of 5 question-batches per brainstorming session. On exceeding the cap, the agent stops asking, presents a best-current-approximation design with confidence annotations, and gives the user three options: approve as-is, refine specific sections (one additional capped batch), or explicitly extend the budget. Convergence is now a feature, not an accident; question fatigue is a real failure mode and this cap addresses it without becoming a hard refusal. + +**Cross-skill consistency invariants (`tests/skill-cross-refs.sh`)** + +New test that scans `skills/**/SKILL.md` and `agents/*.md` for cross-references and verifies they resolve: + +- `<name>/SKILL.md` paths and `superpowers:<name>` mentions resolve to either a skills directory or an `agents/<name>.md` file. +- `<skill> Step <N>[a-z]?` references resolve to a heading or bold-line label in the cited skill. +- Skips fenced code blocks (placeholder examples like `path/SKILL.md` inside ```code``` are not real references). + +Catches a class of silent rot that became more likely as v5.4.0 added cross-skill citations between `runtime-launch-validation`, `writing-plans`, `adversarial-design-review`, and `finishing-a-development-branch` Step 1b/1c. + +### Pipeline integration + +The autonomous chain now extends through the post-merge stage: + +``` +brainstorming → adversarial-design-review (design) + → writing-plans + → adversarial-design-review (plan) + → alignment-check + → subagent-driven-development + → finishing-a-development-branch + → pr-monitoring + → post-merge-retrospective +``` + +Cross-cutting: `recording-decisions` is invoked from inside brainstorming and writing-plans whenever a non-trivial choice is made. + +### Documentation + +- `docs/roadmap.md` rewritten — the previous "deferred" sections are now a "shipped as" mapping table; only the explicit "rejected" entries remain.
+- `README.md` "Basic Workflow" extended through stage 13 (post-merge-retrospective) with a new "Auditing skill activations" section. +- `tests/cross-llm-coverage.md` adds rows for the two new skills. + ## v5.4.0 (2026-04-30) ### New Features @@ -54,7 +117,9 @@ Every existing review gate attacks code (`requesting-code-review`, spec-reviewer ### Roadmap -`docs/roadmap.md` was added in this release to track items considered during the holistic evaluation that are not landing in this version: durable decision logs (ADRs), post-merge retrospective skill, skill-usage telemetry, brainstorming cost-control gate, and cross-skill consistency invariants. Each entry has a one-paragraph rationale and trigger condition. +`docs/roadmap.md` was added in this release to track items considered during the holistic evaluation that did not land in this version: durable decision logs (ADRs), post-merge retrospective skill, skill-usage telemetry, brainstorming cost-control gate, and cross-skill consistency invariants. Each entry had a rationale and trigger condition. + +**Update for v5.5.0:** all five of those items have shipped as actual functionality. See the v5.5.0 entry above. ## v5.3.0 (2026-04-29) diff --git a/decisions/0000-template.md b/decisions/0000-template.md new file mode 100644 index 0000000..5e7c1d4 --- /dev/null +++ b/decisions/0000-template.md @@ -0,0 +1,35 @@ +# 0000. ADR Template + +**Status:** Accepted | Superseded by NNNN | Deprecated +**Date:** YYYY-MM-DD +**Decision-makers:** <handles or roles> +**Related:** <design path>, <plan path>, <adversarial-review report>, <prior ADRs> + +## Context + +<What is the situation? What forces are at play? What constraints exist? What did we know, and explicitly NOT know, at the time?> + +## Decision + +We will <verb> <thing> because <reason>.
+ +**Alternatives considered and rejected:** + +- **<alternative A>** — <one sentence on why it was rejected> +- **<alternative B>** — <one sentence on why it was rejected> + +## Consequences + +**Positive:** + +- <what becomes easier> +- <what becomes easier> + +**Negative:** + +- <what becomes harder, or what new constraint this introduces> +- <what becomes harder, or what new constraint this introduces> + +**Reversibility:** <what it would cost to undo this decision later> diff --git a/decisions/README.md b/decisions/README.md new file mode 100644 index 0000000..8f18c6d --- /dev/null +++ b/decisions/README.md @@ -0,0 +1,13 @@ +# Decision Records + +Architecture Decision Records (ADRs) for this repository, in [Michael Nygard's three-section format](https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions). See `skills/recording-decisions/SKILL.md` for when and how to add new ADRs. + +## Index + +ADRs are numbered sequentially. `0000-template.md` is the template — copy and increment. New ADRs are `Accepted`; superseded ADRs keep their file but their status updates to `Superseded by NNNN`. + +| # | Title | Status | +|------|----------|----------| +| 0000 | Template | (n/a) | + +When you add an ADR, append a row to the index above in the same commit. diff --git a/docs/retros/README.md b/docs/retros/README.md new file mode 100644 index 0000000..d11fd61 --- /dev/null +++ b/docs/retros/README.md @@ -0,0 +1,7 @@ +# Post-merge retrospectives + +Retros produced by `skills/post-merge-retrospective/` after a PR merges with green CI. One file per merged feature: `YYYY-MM-DD-<feature>-retro.md`. + +These exist so we have evidence — not opinion — about which gates pull their weight. Patterns across multiple retros drive plugin-level changes (new bug classes in `adversarial-design-review`, new triggers in `runtime-launch-validation`, etc.). + +See `skills/post-merge-retrospective/SKILL.md` for the format and process. diff --git a/docs/roadmap.md b/docs/roadmap.md index dc2e754..bc7679d 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -1,67 +1,21 @@ # Roadmap -Items considered during the v5.4.0 holistic evaluation of the superpowers plugin that are not landing immediately.
Each entry has a one-paragraph rationale, a trigger condition (when it becomes worth doing), and a sketch of the shape of the change. +This file used to track items that had been considered but deferred. Those items are now shipped (v5.5.0): -This file is maintained so good ideas surfaced during evaluation are not lost. +| Former roadmap item | Shipped as | +|---|---| +| Decision log / ADRs | `skills/recording-decisions/SKILL.md` + `decisions/` directory + `decisions/0000-template.md` | +| Post-merge retrospective | `skills/post-merge-retrospective/SKILL.md` + `docs/retros/` directory; wired into `pr-monitoring` exit | +| Skill-usage telemetry | `tests/skill-activation-audit.sh` (reads `.claude/superpowers-state/in-progress.jsonl`) | +| Brainstorming cost-control gate | 5-batch soft cap section in `skills/brainstorming/SKILL.md` | +| Cross-skill consistency invariants | `tests/skill-cross-refs.sh` | -## Decision log / ADR step +## Out of scope (rejected) -**Status:** considered, not landed. +These were considered and explicitly rejected during plugin evaluation. They are recorded here so future contributors don't re-litigate the same paths: -**Why not yet:** designs in `docs/plans/` capture *what* we chose, not *why we rejected alternatives* in a durable way. New contributors re-litigate decisions. However, the v5.4.0 `adversarial-design-review` skill now produces a report that includes "Options the author may not have considered" and a verdict reasoning paragraph, both committed alongside the design. That report functions as an organic decision log without inventing a new artifact. Adding ADRs on top would be process tax for a problem the new reviewer largely solves. - -**Trigger to land:** if adversarial-review reports prove insufficient as a decision log in practice (e.g., contributors keep re-asking why a path was rejected, or the reports are too narrow), introduce a lightweight `decisions/` directory with one ADR per significant choice. 
Use Michael Nygard's template (Context / Decision / Consequences). Wire into `writing-plans` so plans that diverge from a previously-recorded decision are flagged. - -## Post-merge retrospective skill - -**Status:** considered, not landed. - -**Why not yet:** `pr-monitoring` ends the pipeline. There's no closing-the-loop step that asks "what would we change about this skill set based on what happened?" Worth doing eventually, but needs its own design — adversarial review, skill activation effectiveness, post-incident learnings all blur together and should be untangled before a skill is written. - -**Trigger to land:** when a meaningful number of merged PRs have used the v5.4.0 pipeline end-to-end (target: ≥10 distinct features). At that point there's enough signal to design a retrospective skill against real evidence rather than speculation. - -**Sketch:** a `post-merge-retrospective` skill invoked after `pr-monitoring` reports the PR is merged and CI green. Reads the design, plan, adversarial-review reports, code-review threads, and CI history; produces a short report on (a) which adversarial findings were prescient, (b) which gates produced false positives, (c) skill activations that didn't fire when they should have. Output feeds into a `docs/retros/` directory. - -## Skill-usage telemetry / self-audit - -**Status:** considered, not landed. - -**Why not yet:** v5.3.0 introduced `.claude/superpowers-state/in-progress.jsonl` — an append-only activity log written by a `PostToolUse` hook. That log is currently used only for compaction recovery, but it is the natural substrate for skill-activation telemetry. Wiring it up requires a non-trivial design: privacy considerations (don't ship anything off-machine), aggregation cadence, surfacing format, and how to handle hosts without hooks. Out of scope for v5.4.0. 
- -**Trigger to land:** when there is a reported case of a skill failing to activate when it should have, and we want a deterministic post-hoc check rather than a guess. - -**Sketch:** add a `tests/skill-activation-audit.sh` script that reads the state JSONL and reports counts per skill over a window, plus a `lib/skills-core.js` helper to surface "expected but not invoked" patterns in a session. Strictly local; never transmitted. - -## Brainstorming cost-control gate - -**Status:** considered, not landed. - -**Why not yet:** `brainstorming` can theoretically spiral when the user keeps answering questions. In practice this is rarely cited as a problem — adaptive batching plus the new self-challenge round in v5.4.0 already cap most runaway sessions. Adding a hard upper bound on rounds would impose process tax on a problem we have not actually observed. - -**Trigger to land:** if a user reports a brainstorm that exceeded N round-trips without converging, OR if metrics from the telemetry roadmap entry above show an outlier distribution. - -**Sketch:** soft cap at 5 question-batches; on exceeding, agent forcibly proposes the best-current-approximation design and asks the user to either approve, refine, or explicitly extend the budget. Lives as a single section in `brainstorming/SKILL.md`, not a new skill. - -## Cross-skill consistency invariants (test extension) - -**Status:** considered, not landed in v5.4.0. - -**Why not yet:** several skills now reference each other's filenames and steps (`finishing-a-development-branch` Step 1c is cited from at least three places; `runtime-launch-validation` triggers are referenced from `writing-plans` and `adversarial-design-review`; `requesting-code-review`'s bug-class checklist is cited from `team-conventions.md`). A rename or step-renumbering breaks silently. Not worth blocking v5.4.0 for, but worth a small follow-up because the surface area for silent breakage just grew. 
- -**Trigger to land:** next time a cross-skill reference breaks in review, OR the next skill PR after v5.4.0. - -**Sketch:** extend `tests/skill-content-grep.sh` (or add a sibling `tests/skill-cross-refs.sh`) that: - -1. Greps every `Step \d[a-z]?` and `# .*` heading reference across `skills/*.md` and `agents/*.md`. -2. For each `/SKILL.md` Step-N reference, verifies the target heading exists in the cited skill. -3. Emits actionable failures with both citing and target file paths. - -Cheap, deterministic, removes a class of silent-rot bugs. The grep guard infrastructure already in place is the right place to add it. - -## Out of scope (not adopted) - -These were considered and explicitly rejected during the v5.4.0 evaluation: - -- **Heavy adversarial debate during brainstorming** — risks turning the user's design conversation into a multi-agent debate they didn't ask for. The v5.4.0 lightweight self-challenge is the lighter alternative we picked instead. -- **Hostile / steelman-the-rejection reviewer framing** — theatrical, low signal. The "find ≥3 things wrong" framing from `requesting-code-review` is sharp enough. +- **Heavy adversarial debate during brainstorming** — risks turning the user's design conversation into a multi-agent debate they didn't ask for. The lightweight self-challenge round in `brainstorming` is the chosen alternative. +- **Hostile / steelman-the-rejection reviewer framing** — theatrical, low signal. The "find ≥3 things wrong" framing from `requesting-code-review` and `adversarial-design-review` is sharp enough. - **Separate `pre-mortem` skill** — folded into `adversarial-design-review`'s "Missing failure modes" bug class instead of being a standalone skill. One artifact, one pass, less skill sprawl. + +If a future contributor wants to revisit any of these, the bar is: explain what changed about the trade-off that wasn't true at evaluation time. Then write an ADR before adding the skill. 
diff --git a/skills/brainstorming/SKILL.md b/skills/brainstorming/SKILL.md index 59b8b45..bba5a4e 100644 --- a/skills/brainstorming/SKILL.md +++ b/skills/brainstorming/SKILL.md @@ -113,6 +113,25 @@ If any answer surfaces a real issue, revise the design before presenting it. Oth This is intentionally lightweight; the heavyweight pass is `adversarial-design-review`, which runs after the design is committed. +## Question-batch budget (cost-control gate) + +A brainstorming session has a **soft cap of 5 question-batches** before the agent must converge. A "batch" is one round-trip — one outgoing message that contains questions (whether one targeted follow-up or four AskUserQuestion-style options). The cap exists so a session cannot spiral indefinitely while the user keeps answering; convergence is a feature, not an accident. + +**Counting rule:** every outgoing message that asks the user a question counts as one batch, regardless of how many sub-questions it bundles. Re-asking a question to clarify the user's prior answer also counts. Pure status messages ("I'm exploring the codebase…") do not count. + +**On reaching the cap:** + +1. **Stop asking.** Do not send a 6th question batch. +2. **Present the best-current-approximation design** using whatever signal you have. State explicitly which sections you are less confident about and why (which questions were not asked, which were answered ambiguously). +3. **Ask the user, with three multiple-choice options, how to proceed:** + - **Approve as-is** — accept the approximation; proceed to design doc + adversarial review. + - **Refine specific sections** — name the sections; the agent gets ONE additional batch (capped at 4 questions) targeting those sections only, then re-presents. + - **Extend the budget** — explicit user opt-in to N more batches (user picks N; agent confirms before continuing). 
+ +The "extend the budget" option exists because some genuinely complex designs need more conversation; the cap is soft, not a hard refusal. But the user must explicitly opt in — the agent cannot extend on its own. + +**Why a cap at all:** brainstorming is the most user-facing skill in the plugin. Question fatigue is a real failure mode. Capping at 5 batches forces the agent to commit to an approximation rather than stalling — and the user can always explicitly extend. + ## Design-only mode When the user wants design exploration without execution, they pass `--design-only` to brainstorming. @@ -133,7 +152,8 @@ When the user wants design exploration without execution, they pass `--design-on - Write the validated design to `docs/plans/YYYY-MM-DD--design.md` - Include explicit `## Assumptions` and `## Rollback` sections (the latter only required for change classes that affect runtime — see the trigger list in `runtime-launch-validation` / `finishing-a-development-branch` Step 1b) - Use elements-of-style:writing-clearly-and-concisely skill if available -- Commit the design document to git +- **Record decisions** — if the design triggers any condition in `skills/recording-decisions/SKILL.md` (divergence from precedent, non-trivial trade-off between ≥2 plausible approaches, adversarial-review override, cross-skill structural change), invoke `recording-decisions` to add an ADR in `decisions/`, then cite it from this design doc +- Commit the design document to git (and any new ADRs in the same commit) **Adversarial review (mandatory):** - Invoke `adversarial-design-review --phase=design` against the committed design diff --git a/skills/post-merge-retrospective/SKILL.md b/skills/post-merge-retrospective/SKILL.md new file mode 100644 index 0000000..a83bc98 --- /dev/null +++ b/skills/post-merge-retrospective/SKILL.md @@ -0,0 +1,151 @@ +--- +name: post-merge-retrospective +description: Use after a PR has merged and CI is green - reads design, plan, adversarial-review reports, 
code-review threads, and CI history to produce a short retrospective in docs/retros/ that closes the loop on which gates worked and which didn't +--- + +# Post-Merge Retrospective + +## Overview + +Every other skill in this plugin acts on the work in flight. This one acts on completed work. After a PR merges and CI is green, this skill produces a short retrospective: which adversarial-review findings turned out to matter, which gates produced false positives, which skill activations didn't fire when they should have. The output sits in `docs/retros/` and feeds back into the next iteration of the plugin itself. + +**Core principle:** the only way the gates get sharper is if completed work is read back against the predictions they made. This skill does the reading. + +## When to use this skill + +Invoked automatically by `pr-monitoring` when **all** of the following hold: + +1. The PR is merged. +2. The base-branch CI is green for the merge commit. +3. The PR was created via the autonomous pipeline (i.e., a design + plan + adversarial-review report exist in `docs/plans/` for this branch). + +Manual invocation is also supported on any merged PR with the matching artifacts. + +If the PR was opened ad-hoc (no design / plan in `docs/plans/`), this skill exits cleanly without writing a retro — there's nothing to compare against. + +## Process + +1. **Locate the artifacts.** From the merged PR, identify: + - Design doc: `docs/plans/YYYY-MM-DD-<topic>-design.md` + - Plan doc: `docs/plans/YYYY-MM-DD-<feature>.md` + - Adversarial-review reports for design and plan phases (committed alongside) + - Code-review threads: `gh pr view <number> --json reviews,comments` + - CI history for the branch: `gh run list --branch <branch> --json conclusion,name,createdAt` + - Any ADRs cited from the design or plan + +2.
**Score each adversarial-review finding.** + For every finding raised in either phase's adversarial-review report, classify it as one of: + - **Prescient** — the finding called out something that turned out to matter (showed up as a code-review comment, CI failure, follow-up bug fix, or revert). + - **Resolved upfront** — the finding was addressed during plan revision and prevented an issue downstream (no code-review comment / CI failure traces back to it). + - **False positive** — the finding flagged something that did NOT cause downstream issues and the design rationale held up. + - **Inconclusive** — not enough signal to decide either way. + +3. **Score each code-review comment.** + For every code-review comment that requested a change, ask: which gate, if any, *should* have caught this earlier? Map the comment to the most-upstream gate that could have caught it (`brainstorming` self-challenge / `adversarial-design-review` design / `adversarial-design-review` plan / `alignment-check` / `requesting-code-review` / none). Comments mapped to a gate that was supposed to catch them but didn't are **gate misses** — the most actionable retro signal. + +4. **Score CI failure history.** + For each unique CI failure on the branch, ask: was this caught by `verification-before-completion` / `runtime-launch-validation` / something else, or did it slip past every local gate? Slips are gate misses too. + +5. **Score skill activations.** + Read `.claude/superpowers-state/in-progress.jsonl` (if present in the repo's `.claude/` directory) and verify the expected pipeline ran: + `brainstorming → adversarial-design-review (design) → writing-plans → adversarial-design-review (plan) → alignment-check → subagent-driven-development → finishing-a-development-branch → pr-monitoring → post-merge-retrospective`. + For each gate that was *expected* to fire and didn't, that's a missed-activation. Use `tests/skill-activation-audit.sh` (this repo) to confirm what fired. + +6. 
**Write the retro.** + Save to `docs/retros/YYYY-MM-DD-<feature>-retro.md` using the format below. Commit it. + +## Retro format + +```markdown +# Retro: <feature> + +**PR:** #<number> <title> +**Merged:** YYYY-MM-DD +**Branch:** <branch> +**Design:** docs/plans/YYYY-MM-DD-<topic>-design.md +**Plan:** docs/plans/YYYY-MM-DD-<feature>.md +**Related ADRs:** <decisions/NNNN-...md, ...> + +## Adversarial-review findings, scored + +| Phase | Finding | Severity | Outcome | +|---|---|---|---| +| design | <one-line summary> | Critical / Important / Minor | Prescient / Resolved upfront / False positive / Inconclusive | +| plan | ... | ... | ... | + +## Gate misses + +For each code-review comment or CI failure that *should* have been caught earlier, name the gate that missed it and why. If none — say so. + +| Issue | Gate that missed | Why it slipped | Fix idea (optional) | +|---|---|---|---| +| <one-line description> | adversarial-design-review (plan) | <one sentence> | <one sentence> | + +If there are zero gate misses, write: "No gate misses this PR. All downstream issues were caught by the gate they were assigned to, or were genuinely novel and not in any gate's bug-class scope." + +## Missed skill activations + +Pipeline gates expected to fire (per `using-superpowers`): list any that didn't. Pull from `tests/skill-activation-audit.sh`. + +| Gate | Fired? | Notes | +|---|---|---| +| brainstorming | yes | | +| adversarial-design-review (design) | yes | | +| adversarial-design-review (plan) | no | <why — e.g., manual override; deferred to alignment-check> | +| ... | ... | | + +## What worked + +2-4 bullets, concrete. "Adversarial review caught the missing rollback path; plan was revised before execution started." + +## What didn't + +2-4 bullets, concrete. No abstract laments. "Code review found a thread-safety bug in the cache layer; this should have been an `adversarial-design-review --phase=design` finding under failure-modes — the design doc said `cache is in-process` without addressing concurrency."
+ +## Plugin-level follow-ups + +If a gate miss recurs across multiple retros, propose a concrete plugin change: a new bug class in `adversarial-design-review`, a new line in `runtime-launch-validation`, a new entry in `tests/skill-cross-refs.sh`. Cite the prior retros. + +If no plugin-level changes are warranted, say so. +``` + +## Dispatch + +<host: claude-code> +This is short, structured analysis work — one pass over the artifacts. Run inline, not as a subagent. The lead agent has the context already. If the artifact set is large (10+ code-review threads, dozens of CI runs), dispatch a `balanced`-tier general-purpose subagent with the artifact paths inline. +</host> + +<host: codex, opencode, cursor> +Run inline. The lead agent has the context already. The retro is a structured artifact, not a long-running task — produce the markdown directly. +</host> + +## Why this skill exists + +`pr-monitoring` exits when CI is green and reviews are resolved. That's the end of the in-flight pipeline, but it's not the end of the loop. Without `post-merge-retrospective`, the plugin has no organic way to know which gates are actually pulling their weight. With it, every merged PR produces a small piece of evidence that gets compared across PRs over time. That's how the gate set sharpens. + +The retro is intentionally short. Long retros don't get read. The format above fits on one screen for a typical PR; the gate-miss table is the only required-non-empty section if there's anything to learn. + +## Integration + +**Called by:** +- `pr-monitoring` — on its successful exit (CI green + reviews resolved). +- Manual — any merged PR with matching artifacts. + +**Calls:** none. Retro is a leaf; the next iteration of the pipeline picks up cross-retro patterns when an author writes the next design. 
+ +**Reads:** +- `docs/plans/` (design, plan, adversarial-review reports) +- `decisions/` (ADRs cited from the design / plan) +- `gh pr view` (with `--json reviews,comments` for code-review threads), `gh run list` +- `.claude/superpowers-state/in-progress.jsonl` (if present) +- `tests/skill-activation-audit.sh` (this repo) + +**Writes:** +- `docs/retros/YYYY-MM-DD-<feature>-retro.md` + +## Anti-patterns + +- **Long, narrative retros.** The format is a table-driven one-pager. If it grows past two screens, the structure is being abused. +- **Validating the work.** This isn't "did we ship the right thing?" — that's the user's call. This is "did the gates do their job?" — that's a process question with binary answers per gate. +- **Skipping the gate-miss table.** "Everything went great" is fine as a statement, but the table format forces you to walk every code-review comment and CI failure. Skipping it means signal is being lost. +- **Acting on a single retro.** Plugin-level follow-ups require a pattern across retros. One miss is signal; two is a trend. diff --git a/skills/pr-monitoring/SKILL.md b/skills/pr-monitoring/SKILL.md index eef5374..fb17911 100644 --- a/skills/pr-monitoring/SKILL.md +++ b/skills/pr-monitoring/SKILL.md @@ -84,7 +84,10 @@ Agent tool (general-purpose, model: balanced, run_in_background: true): - No unresolved review comments - No pending "changes requested" reviews - On exit, report final status. + On exit: + - If the PR is merged AND base-branch CI is green for the merge commit AND a design + plan exist in `docs/plans/` for this branch, invoke `superpowers:post-merge-retrospective` to produce a retro in `docs/retros/`. This is the autonomous closing-the-loop step. + - If the PR is closed without merge, skip the retrospective and exit cleanly. + - Report final status either way. ### 4.
Wait Between Checks @@ -101,6 +104,8 @@ Use your host's equivalent mechanism to periodically poll the following in a loo Continue until all checks pass, no unresolved inline comments remain, and no "changes requested" reviews are pending. +When the PR has merged with green base-branch CI and a design + plan exist in `docs/plans/` for this branch, invoke `superpowers:post-merge-retrospective` to write a retro in `docs/retros/`. If the PR was closed without merge, skip the retro and exit cleanly. + </host> @@ -118,6 +123,9 @@ Continue until all checks pass, no unresolved inline comments remain, and no "ch **Called by:** - `finishing-a-development-branch` (autonomous mode) — after PR creation +**Calls:** +- `superpowers:post-merge-retrospective` — on its own clean exit when the PR has merged with green base-branch CI + **Uses:** - `gh` CLI for all GitHub operations - `superpowers:systematic-debugging` principles for CI failure analysis diff --git a/skills/recording-decisions/SKILL.md b/skills/recording-decisions/SKILL.md new file mode 100644 index 0000000..4bdacff --- /dev/null +++ b/skills/recording-decisions/SKILL.md @@ -0,0 +1,88 @@ +--- +name: recording-decisions +description: Use when the design or plan makes a non-trivial trade-off that future contributors will need context for - records an Architecture Decision Record (ADR) in decisions/ so the rejected alternatives and reasoning are durable, not lost in transcript history +--- + +# Recording Decisions + +## Overview + +Architecture Decision Records (ADRs) capture the **why** behind a choice — particularly the alternatives that were rejected and the reasoning. Designs in `docs/plans/` say *what* we built; ADRs say *why this and not that*. + +`adversarial-design-review` produces a report alongside each design that lists "Options the author may not have considered" and a verdict reasoning paragraph. That covers idea-level alternatives at one moment in time. 
ADRs are the persistent index across the project: a rename, a refactor, or a new contributor's "why is it like this?" question goes through `decisions/`, not through hunting for the right adversarial-review report inside an old design folder. + +**Core principle:** record once, in a stable location, with a stable number, in Michael Nygard's three-section format. + +## When to use this skill + +Invoke this skill when **any** of these conditions hold: + +1. **Divergence from precedent.** The design / plan picks a path that differs from a previously-established pattern in this repo (e.g., a different testing strategy, a different state-management choice, a different deployment shape than other components). +2. **Non-trivial trade-off.** The design weighs ≥2 plausible approaches and picks one for reasons that won't be obvious from reading the code. A flat file vs. SQLite vs. Postgres choice. A library-pin floor (`>=X.Y`) vs. exact pin (`==X.Y.Z`) decision. Sync vs. async. Polling vs. webhook. +3. **Adversarial-review override.** The design author accepted an adversarial-review finding as "yes, but here's why" rather than fixing it. The acceptance reasoning belongs in an ADR so future contributors don't re-litigate. +4. **Cross-skill structural change.** Any change that affects multiple skills' integration (e.g., introducing a new gate in the autonomous pipeline, renaming a step that other skills cite). + +If none of the four conditions hold, an ADR is not required. ADRs are not for every commit — they are for choices that future contributors will read the code and ask "why is it like this?" about. + +## Process + +1. **Pick the next free number.** ADRs are numbered sequentially: `0000-template.md`, `0001-...md`, `0002-...md`. Run `ls decisions/ | grep -E '^[0-9]{4}-' | sort -n | tail -1` (or just `ls`) and increment. +2. 
**Copy the template.** + ```bash + cp decisions/0000-template.md decisions/NNNN-<short-slug>.md + ``` + The slug is kebab-case, ≤6 words: `0007-pin-postgres-major-version-only.md`. +3. **Fill the three sections.** Context, Decision, Consequences. Each section is short (≤150 words is a good target). If you can't say it in 150 words, the trade-off probably isn't crisp yet. +4. **Set status.** New ADRs start as `Status: Accepted`. Superseded ADRs are not deleted — they get `Status: Superseded by NNNN` and the new ADR cites them in its Context. +5. **Cite from the design / plan.** The design or plan that triggered the ADR MUST cite it: `See decisions/0007-pin-postgres-major-version-only.md`. This is the back-link that makes ADRs discoverable. +6. **Commit alongside the design / plan.** ADRs are committed in the same commit as the design (when triggered by `brainstorming`) or the same commit as the plan (when triggered by `writing-plans`). + +## ADR format (Michael Nygard, lightly extended) + +```markdown +# NNNN. <Short verb-led title> + +**Status:** Accepted | Superseded by MMMM | Deprecated +**Date:** YYYY-MM-DD +**Decision-makers:** <handles or roles> +**Related:** <design path>, <plan path>, <adversarial-review report>, <prior ADRs> + +## Context + +<What is the situation? What forces are at play? What constraints exist? +What did we know at the time? What did we explicitly NOT know? Cite sources +where possible.> + +## Decision + +<We will <verb> <thing> because <reason>. Be precise. Name the alternatives +considered and rejected, with one sentence each on why they were rejected.> + +## Consequences + +<What becomes easier? What becomes harder? What new constraints does this +introduce? What does this cost us if we want to undo it later? List 2-5 +consequences; both positive and negative are required — a one-sided list +is a smell.> +``` + +## Integration + +**Called by:** +- `brainstorming` — when the design triggers any of the four conditions above. 
+- `writing-plans` — when the plan introduces a non-obvious choice not already covered by an ADR. +- `adversarial-design-review` — recommended when an Important finding is accepted by the author with reasoning. +- Manual — any contributor recording a decision retroactively. + +**Calls:** none. ADRs are leaves; they record state, not next steps. + +## Anti-patterns + +- **ADR for every task.** ADRs are for choices, not for documenting work. If the entry is just "we built feature X", it belongs in the design / plan / commit message, not in `decisions/`. +- **ADR as future tense.** ADRs record decisions that have been made, not proposals. Use the design doc for proposals; promote to an ADR after the choice is locked. +- **Editing accepted ADRs.** Accepted ADRs are immutable except for status changes (Accepted → Superseded by NNNN). To change the decision, write a new ADR that supersedes the old one. The old one stays in the repo; future contributors need to see the history. +- **Skipping the alternatives.** "Decision: use Postgres" with no rejected alternatives is a non-ADR. The Nygard format demands the trade-off — without it, it's a glorified README entry. + +## Why this skill is light by design + +ADRs work when they're cheap to write and hard to dodge. This skill is intentionally short: a numbering rule, a template, a four-condition trigger, and a commit convention. The heavy lifting (figuring out what to write) lives in the design and adversarial-review process. This skill is the storage protocol. diff --git a/skills/using-superpowers/SKILL.md b/skills/using-superpowers/SKILL.md index 06cf7e6..a7776b5 100644 --- a/skills/using-superpowers/SKILL.md +++ b/skills/using-superpowers/SKILL.md @@ -81,7 +81,9 @@ When multiple skills could apply, use this order: 1. **Process skills first** (brainstorming, debugging) - these determine HOW to approach the task 2. **Implementation skills second** (frontend-design, mcp-builder) - these guide execution 3. 
**Pipeline skills auto-chain** — these invoke each other automatically in the autonomous pipeline: - brainstorming → adversarial-design-review (design phase) → writing-plans → adversarial-design-review (plan phase) → alignment-check → subagent-driven-development → finishing-a-development-branch → pr-monitoring + brainstorming → adversarial-design-review (design phase) → writing-plans → adversarial-design-review (plan phase) → alignment-check → subagent-driven-development → finishing-a-development-branch → pr-monitoring → post-merge-retrospective + + Cross-cutting skills invoked from within the pipeline when conditions trigger: `recording-decisions` (when designs/plans make non-trivial trade-offs). "Let's build X" → brainstorming first, then the pipeline runs autonomously after design approval. "Fix this bug" → debugging first, then domain-specific skills. diff --git a/skills/writing-plans/SKILL.md b/skills/writing-plans/SKILL.md index 4ecd12d..2aace4d 100644 --- a/skills/writing-plans/SKILL.md +++ b/skills/writing-plans/SKILL.md @@ -100,6 +100,12 @@ Every plan task must include the verification step appropriate to its change cla The rollback note exists so that adversarial-design-review (plan phase) can verify the design's rollback story is actually wired into the plan, not orphaned in a paragraph. Plans without rollback notes for runtime-affecting tasks will fail adversarial review. +## Recording decisions + +If the plan introduces a non-trivial choice that wasn't already captured by an ADR cited in the design (e.g., a library pick, a sync-vs-async choice, a polling-vs-webhook decision made at plan time rather than at design time), invoke `skills/recording-decisions/SKILL.md` to add an ADR in `decisions/` and cite it from the relevant task. ADRs are how the *why* survives renames and refactors; the design and plan answer *what*. + +If every decision in the plan is already covered by ADRs cited from the design, skip this step. 
+ The plan author writes the expected output literally — not "passes tests" but "logs `engine ready` within 10 seconds and `/healthz` returns 200". ## Bite-Sized Task Granularity diff --git a/tests/cross-llm-coverage.md b/tests/cross-llm-coverage.md index 0957c03..526e8b1 100644 --- a/tests/cross-llm-coverage.md +++ b/tests/cross-llm-coverage.md @@ -7,6 +7,8 @@ host-neutral. Updated whenever a skill changes. |---|---|---|---|---|---| | adversarial-design-review | host-conditional | host-conditional | host-conditional | host-conditional | Agent dispatch block in `<host: claude-code>`; inline-execution prose in `<host: codex, opencode, cursor>` | | alignment-check | host-conditional | host-conditional | host-conditional | host-conditional | spawn block in `<host: claude-code>`; prose fallback outside | +| post-merge-retrospective | host-conditional | host-conditional | host-conditional | host-conditional | Inline-vs-subagent decision block in `<host: claude-code>`; inline-only prose in `<host: codex, opencode, cursor>` | +| recording-decisions | host-neutral | host-neutral | host-neutral | host-neutral | ADR storage protocol; no host-specific tooling | | brainstorming | host-conditional | host-conditional | host-conditional | host-conditional | `AskUserQuestion` in `<host: claude-code>`; numbered-list fallback in `<host: codex, opencode, cursor>` | | dispatching-parallel-agents | host-neutral | host-neutral | host-neutral | host-neutral | generic parallel-dispatch pattern; no tool-specific refs | | executing-plans | host-conditional | host-conditional | host-conditional | host-conditional | tool-use block in `<host: claude-code>`; prose fallback in `<host: codex, opencode, cursor>` | diff --git a/tests/skill-activation-audit.sh b/tests/skill-activation-audit.sh new file mode 100755 index 0000000..07b8ff4 --- /dev/null +++ b/tests/skill-activation-audit.sh @@ -0,0 +1,210 @@ +#!/usr/bin/env bash +# tests/skill-activation-audit.sh +# Reads 
.claude/superpowers-state/in-progress.jsonl and reports which +# superpowers skills / agents fired during the recorded session(s), +# plus a heuristic check for "expected but not invoked" pipeline gates. +# +# This is strictly local — it never transmits anything off the machine. +# Use it post-hoc to confirm whether the autonomous pipeline ran as +# expected, or to identify where it stopped. +# +# Usage: +# ./tests/skill-activation-audit.sh # default state file +# ./tests/skill-activation-audit.sh /path/to/jsonl # explicit path +# ./tests/skill-activation-audit.sh --quiet # only flag gaps +# +# Exit codes: +# 0 — audit completed; no expected-but-missing pipeline gates detected +# 2 — one or more expected pipeline gates did not fire +# 3 — state file unreadable or malformed +# 4 — usage error + +set -euo pipefail + +QUIET=0 +STATE_FILE="" + +for arg in "$@"; do + case "$arg" in + --quiet|-q) QUIET=1 ;; + --help|-h) + sed -n '2,18p' "$0" | sed 's/^# //; s/^#//' + exit 0 + ;; + -*) + printf 'unknown option: %s\n' "$arg" >&2 + exit 4 + ;; + *) + if [ -n "$STATE_FILE" ]; then + printf 'unexpected positional argument: %s\n' "$arg" >&2 + exit 4 + fi + STATE_FILE="$arg" + ;; + esac +done + +# Default: look up from CWD into .claude/superpowers-state/in-progress.jsonl +if [ -z "$STATE_FILE" ]; then + STATE_FILE="${PWD}/.claude/superpowers-state/in-progress.jsonl" +fi + +if [ ! -r "$STATE_FILE" ]; then + printf 'No state file at %s\n' "$STATE_FILE" >&2 + printf '\nThis is normal if:\n' >&2 + printf ' - the PostToolUse activity hook is not installed in this checkout\n' >&2 + printf ' - the session has not invoked any Skill / Agent / Task* tool yet\n' >&2 + printf ' - this host does not write the state file (Codex / OpenCode / Cursor)\n' >&2 + exit 3 +fi + +# Pipeline gates we expect for an autonomous run, in order. The pipeline +# is the canonical chain documented in skills/using-superpowers/SKILL.md. 
+PIPELINE_GATES=( + brainstorming + adversarial-design-review + writing-plans + alignment-check + subagent-driven-development + finishing-a-development-branch + pr-monitoring +) + +# Optional gates — present only when conditions trigger them. Reported +# but their absence is NOT a failure. +OPTIONAL_GATES=( + recording-decisions + post-merge-retrospective + using-git-worktrees + test-driven-development + systematic-debugging + receiving-code-review + requesting-code-review + runtime-launch-validation +) + +# --- Parse JSONL ---------------------------------------------------------- + +# Each line is {"ts":"...","tool":"...","detail":"skill=foo args=..."} or +# {"ts":"...","tool":"Agent","detail":"agent=... desc=\"...\" bg=..."}. +# We tolerate jq missing — fall back to grep if jq isn't installed. + +extract_skills() { + if command -v jq >/dev/null 2>&1; then + # detail is a free-form string; pull `skill=<name>` from it. + jq -r 'select(.tool=="Skill") | .detail' "$STATE_FILE" 2>/dev/null \ + | sed -nE 's/.*skill=([A-Za-z0-9_:-]+).*/\1/p' \ + | sed -E 's/^superpowers://' + else + grep -E '"tool":"Skill"' "$STATE_FILE" 2>/dev/null \ + | sed -nE 's/.*skill=([A-Za-z0-9_:-]+).*/\1/p' \ + | sed -E 's/^superpowers://' + fi +} + +extract_agents() { + if command -v jq >/dev/null 2>&1; then + jq -r 'select(.tool=="Agent" or .tool=="Task") | .detail' "$STATE_FILE" 2>/dev/null \ + | sed -nE 's/.*agent=([A-Za-z0-9_-]+).*/\1/p' + else + grep -E '"tool":"(Agent|Task)"' "$STATE_FILE" 2>/dev/null \ + | sed -nE 's/.*agent=([A-Za-z0-9_-]+).*/\1/p' + fi +} + +skills_seen=$(extract_skills | sort | uniq -c | sort -rn || true) +agents_seen=$(extract_agents | sort | uniq -c | sort -rn || true) +all_seen_skills=$(extract_skills | sort -u || true) + +if [ "$QUIET" -eq 0 ]; then + printf '=== Skill activation audit ===\n' + printf 'State file: %s\n' "$STATE_FILE" + printf 'Total entries: %s\n' "$(wc -l < "$STATE_FILE" 2>/dev/null | tr -d ' ' || echo 0)" + printf '\n--- Skill invocations 
(count, name) ---\n' + if [ -n "$skills_seen" ]; then + printf '%s\n' "$skills_seen" + else + printf '(none)\n' + fi + printf '\n--- Agent / Task dispatches (count, agent_type) ---\n' + if [ -n "$agents_seen" ]; then + printf '%s\n' "$agents_seen" + else + printf '(none)\n' + fi +fi + +# --- Pipeline gap check --------------------------------------------------- + +# A gate is "expected" only if at least one preceding pipeline gate fired. +# Otherwise this run wasn't an autonomous pipeline run and we don't expect +# any of these gates to have fired. + +any_pipeline_seen=0 +for gate in "${PIPELINE_GATES[@]}"; do + if printf '%s\n' "$all_seen_skills" | grep -qx "$gate"; then + any_pipeline_seen=1 + break + fi +done + +missing_gates=() +if [ "$any_pipeline_seen" -eq 1 ]; then + # We saw at least one pipeline gate; check what's missing AFTER the + # earliest gate we saw. Reports gates that are "downstream of where + # we got to" — the user can compare against where they expected to stop. + earliest_idx=-1 + for i in "${!PIPELINE_GATES[@]}"; do + gate="${PIPELINE_GATES[$i]}" + if printf '%s\n' "$all_seen_skills" | grep -qx "$gate"; then + earliest_idx="$i" + break + fi + done + + for i in "${!PIPELINE_GATES[@]}"; do + [ "$i" -lt "$earliest_idx" ] && continue + gate="${PIPELINE_GATES[$i]}" + if ! 
printf '%s\n' "$all_seen_skills" | grep -qx "$gate"; then + missing_gates+=("$gate") + fi + done +fi + +if [ "$QUIET" -eq 0 ]; then + printf '\n--- Expected pipeline gates ---\n' + if [ "$any_pipeline_seen" -eq 0 ]; then + printf '(no autonomous-pipeline skills observed; nothing to check)\n' + else + for gate in "${PIPELINE_GATES[@]}"; do + if printf '%s\n' "$all_seen_skills" | grep -qx "$gate"; then + printf ' [x] %s\n' "$gate" + else + printf ' [ ] %s\n' "$gate" + fi + done + fi + + printf '\n--- Optional gates (not failures if absent) ---\n' + for gate in "${OPTIONAL_GATES[@]}"; do + if printf '%s\n' "$all_seen_skills" | grep -qx "$gate"; then + printf ' [x] %s\n' "$gate" + else + printf ' [ ] %s\n' "$gate" + fi + done +fi + +if [ "${#missing_gates[@]}" -gt 0 ]; then + printf '\nMISSING pipeline gates after first observed gate:\n' >&2 + for gate in "${missing_gates[@]}"; do + printf ' - %s\n' "$gate" >&2 + done + printf '\nIf the run intentionally stopped earlier (e.g., --design-only,\n' >&2 + printf 'manual interruption, or escalation to user), this is expected.\n' >&2 + printf 'Otherwise the pipeline did not complete; investigate.\n' >&2 + exit 2 +fi + +exit 0 diff --git a/tests/skill-cross-refs.sh b/tests/skill-cross-refs.sh new file mode 100755 index 0000000..3ab5f7c --- /dev/null +++ b/tests/skill-cross-refs.sh @@ -0,0 +1,176 @@ +#!/usr/bin/env bash +# tests/skill-cross-refs.sh +# Verifies that cross-skill references in skills/ and agents/ markdown +# resolve to existing targets. Catches silent-rot when a skill is renamed +# or a step is renumbered. +# +# Two classes of references are checked: +# 1. Skill / agent references — `<name>/SKILL.md` paths and +# `superpowers:<name>` strings. Verifies the target exists either as +# skills/<name>/SKILL.md or as agents/<name>.md. +# 2. Step references — `<skill> Step N[a-z]?` mentions in prose. 
+# Verifies that the cited skill's SKILL.md contains a heading or +# bold-line whose label is `Step <N>` / `Step <N><letter>`. +# +# Fenced code blocks (``` … ```) are skipped, mirroring the discipline of +# tests/skill-content-grep.sh — placeholder examples like `path/SKILL.md` +# inside ```code``` are not real references. +# +# Exit codes: +# 0 — no broken references +# 1 — one or more broken references +# 3 — script error (missing tools, etc.) + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +cd "$REPO_ROOT" + +failures=0 +tmp_failures="$(mktemp)" +trap 'rm -f "$tmp_failures"' EXIT + +# Build the set of known skill names and agent names from the filesystem. +known_skills="$(find skills -mindepth 1 -maxdepth 1 -type d -printf '%f\n' | sort -u)" +known_agents="$(find agents -mindepth 1 -maxdepth 1 -type f -name '*.md' -printf '%f\n' | sed -E 's|\.md$||' | sort -u)" + +# Helper: is the name a known skill or agent? +is_known_target() { + local name="$1" + printf '%s\n' "$known_skills" | grep -qx "$name" && return 0 + printf '%s\n' "$known_agents" | grep -qx "$name" && return 0 + return 1 +} + +# Strip fenced code blocks from a file, emitting "LINENO:CONTENT" for +# every non-fenced line. Mirrors the AWK in tests/skill-content-grep.sh. +strip_fences() { + awk ' + BEGIN { fence_width = 0; ln = 0 } + { + ln++ + stripped = $0 + sub(/^[[:space:]]*/, "", stripped) + if (stripped ~ /^```/) { + n = 0 + s = stripped + while (length(s) > 0 && substr(s, 1, 1) == "`") { n++; s = substr(s, 2) } + if (s ~ /^[a-zA-Z0-9_+-]*[[:space:]]*$/) { + if (fence_width == 0) { fence_width = n; next } + if (n == fence_width) { fence_width = 0; next } + } + } + if (fence_width > 0) { next } + print ln ":" $0 + } + ' "$1" +} + +# Files to scan. Exclude *creation-log* / changelog-style files where the +# point is to record historical names that no longer exist. +mapfile -t scan_files < <(find skills agents -type f -name '*.md' \ + ! 
-iname 'CREATION-LOG.md' | sort) + +# --- 1. Skill / agent references ---------------------------------------- + +for f in "${scan_files[@]}"; do + annotated="$(strip_fences "$f")" + + # Pattern 1: bare `<slug>/SKILL.md` references + while IFS=: read -r line_no line; do + [ -z "${line_no:-}" ] && continue + name="$(printf '%s' "$line" | grep -oE '[a-z][a-z0-9-]+/SKILL\.md' \ + | head -1 | sed -E 's|/SKILL\.md$||' || true)" + [ -z "$name" ] && continue + case "$line" in + *"skills/${name}/SKILL.md"*) ;; + *) + if [ ! -f "skills/${name}/SKILL.md" ]; then + printf '%s:%s: skill reference "%s/SKILL.md" — no such skill\n' \ + "$f" "$line_no" "$name" >> "$tmp_failures" + fi + ;; + esac + done < <(printf '%s\n' "$annotated" | grep -E '[a-z][a-z0-9-]+/SKILL\.md' || true) + + # Pattern 2: `skills/<name>/SKILL.md` paths + while IFS=: read -r line_no line; do + [ -z "${line_no:-}" ] && continue + name="$(printf '%s' "$line" | grep -oE 'skills/[a-z][a-z0-9-]+/SKILL\.md' \ + | head -1 | sed -E 's|^skills/||; s|/SKILL\.md$||' || true)" + [ -z "$name" ] && continue + if [ ! -f "skills/${name}/SKILL.md" ]; then + printf '%s:%s: path reference "skills/%s/SKILL.md" — file missing\n' \ + "$f" "$line_no" "$name" >> "$tmp_failures" + fi + done < <(printf '%s\n' "$annotated" | grep -E 'skills/[a-z][a-z0-9-]+/SKILL\.md' || true) + + # Pattern 3: `superpowers:<name>` mentions — must resolve to a skill OR agent + while IFS=: read -r line_no line; do + [ -z "${line_no:-}" ] && continue + # A line may have multiple superpowers:<name> mentions; check each. + for name in $(printf '%s' "$line" \ + | grep -oE 'superpowers:[a-z][a-z0-9-]+' \ + | sed -E 's|^superpowers:||' | sort -u); do + if ! is_known_target "$name"; then + printf '%s:%s: "superpowers:%s" — no such skill or agent\n' \ + "$f" "$line_no" "$name" >> "$tmp_failures" + fi + done + done < <(printf '%s\n' "$annotated" | grep -E 'superpowers:[a-z][a-z0-9-]+' || true) +done + +# --- 2. 
Step references -------------------------------------------------- + +# Match patterns like "<skill-name> Step 1b" or "<skill-name>'s Step 1b". +# Reference is valid if the cited SKILL.md contains a markdown heading or +# bold-line whose label starts with "Step <N>" / "Step <N><letter>". + +# Helper: does skills/<skill>/SKILL.md contain a "Step <id>" heading or +# bold-line? +has_step() { + local skill="$1" step="$2" + local file="skills/${skill}/SKILL.md" + [ -f "$file" ] || return 1 + # Match contexts where "Step N" is a label, not just prose: + # - markdown heading: lines starting with #'s + # - bold: **Step N...** or **Step N:** + # - list-item label: `- Step N:` + # Word-boundary on the trailing side prevents "Step 1" matching "Step 11". + grep -qE "(^#|^\*\*|\*\*Step|^- )(\*\*)?Step[[:space:]]+${step}([^0-9a-zA-Z]|$)" "$file" \ + || grep -qE "Step[[:space:]]+${step}[[:space:]]*[:.]" "$file" +} + +for f in "${scan_files[@]}"; do + annotated="$(strip_fences "$f")" + + while IFS=: read -r line_no line; do + [ -z "${line_no:-}" ] && continue + while read -r skill step; do + [ -z "${skill:-}" ] && continue + [ -z "${step:-}" ] && continue + # Only check known skills — ignores false positives like "Project Step 1" + if ! printf '%s\n' "$known_skills" | grep -qx "$skill"; then + continue + fi + if ! has_step "$skill" "$step"; then + printf '%s:%s: "%s Step %s" — label not found in skills/%s/SKILL.md\n' \ + "$f" "$line_no" "$skill" "$step" "$skill" >> "$tmp_failures" + fi + done < <(printf '%s\n' "$line" \ + | grep -oE "[a-z][a-z0-9-]+(['s]*)?[[:space:]]+Step[[:space:]]+[0-9]+[a-z]?" \ + | sed -E "s/^([a-z][a-z0-9-]+)[a-z']*[[:space:]]+Step[[:space:]]+([0-9]+[a-z]?).*/\1 \2/") + done < <(printf '%s\n' "$annotated" \ + | grep -E "[a-z][a-z0-9-]+[a-z']*[[:space:]]+Step[[:space:]]+[0-9]+[a-z]?" 
|| true) +done + +# --- Report -------------------------------------------------------------- + +if [ -s "$tmp_failures" ]; then + echo "FAIL: broken cross-skill references:" + sort -u "$tmp_failures" + exit 1 +fi + +echo "PASS: all cross-skill references resolve." +exit 0 From b51698a6987ca838970a5664b407f4abc95833c2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 1 May 2026 03:15:08 +0000 Subject: [PATCH 3/7] fix: address code-review feedback (mktemp guard, marker-based help, sort guidance, audit dedup note) Agent-Logs-Url: https://github.com/GoCodeAlone/claude-superpowers/sessions/1e87f44a-1b33-4b41-a3fc-f111acd5069e Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> --- skills/post-merge-retrospective/SKILL.md | 4 ++-- skills/recording-decisions/SKILL.md | 6 +++++- tests/skill-activation-audit.sh | 9 ++++++++- tests/skill-cross-refs.sh | 2 +- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/skills/post-merge-retrospective/SKILL.md b/skills/post-merge-retrospective/SKILL.md index a83bc98..32a7803 100644 --- a/skills/post-merge-retrospective/SKILL.md +++ b/skills/post-merge-retrospective/SKILL.md @@ -47,9 +47,9 @@ If the PR was opened ad-hoc (no design / plan in `docs/plans/`), this skill exit For each unique CI failure on the branch, ask: was this caught by `verification-before-completion` / `runtime-launch-validation` / something else, or did it slip past every local gate? Slips are gate misses too. 5. **Score skill activations.** - Read `.claude/superpowers-state/in-progress.jsonl` (if present in the repo's `.claude/` directory) and verify the expected pipeline ran: + Read `.claude/superpowers-state/in-progress.jsonl` (if present in the repo's `.claude/` directory) and verify the expected pipeline ran. 
The canonical chain documented in `skills/using-superpowers/SKILL.md` is: `brainstorming → adversarial-design-review (design) → writing-plans → adversarial-design-review (plan) → alignment-check → subagent-driven-development → finishing-a-development-branch → pr-monitoring → post-merge-retrospective`. - For each gate that was *expected* to fire and didn't, that's a missed-activation. Use `tests/skill-activation-audit.sh` (this repo) to confirm what fired. + For each gate that was *expected* to fire and didn't, that's a missed-activation. Use `tests/skill-activation-audit.sh` (this repo) to confirm what fired — note that the audit script's pipeline-gate check is presence-based: a skill counts as fired once it appears at all, even when it must run twice (e.g., adversarial-design-review for both phases), so cross-check phase counts against the JSONL `args=--phase=<design|plan>` entries when both phases are required. 6. **Write the retro.** Save to `docs/retros/YYYY-MM-DD-<feature>-retro.md` using the format below. Commit it. diff --git a/skills/recording-decisions/SKILL.md b/skills/recording-decisions/SKILL.md index 4bdacff..dbf45c4 100644 --- a/skills/recording-decisions/SKILL.md +++ b/skills/recording-decisions/SKILL.md @@ -26,7 +26,11 @@ If none of the four conditions hold, an ADR is not required. ADRs are not for ev ## Process -1. **Pick the next free number.** ADRs are numbered sequentially: `0000-template.md`, `0001-...md`, `0002-...md`. Run `ls decisions/ | grep -E '^[0-9]{4}-' | sort -n | tail -1` (or just `ls`) and increment. +1. **Pick the next free number.** ADRs are numbered sequentially: `0000-template.md`, `0001-...md`, `0002-...md`. With four-digit zero-padded prefixes, lexicographic sort is equivalent to numeric sort: + ```bash + ls decisions/ | grep -E '^[0-9]{4}-' | sort | tail -1 + ``` + Take the prefix of the result and add 1. 2.
**Copy the template.** ```bash cp decisions/0000-template.md decisions/NNNN-<short-slug>.md diff --git a/tests/skill-activation-audit.sh b/tests/skill-activation-audit.sh index 07b8ff4..dc948e5 100755 --- a/tests/skill-activation-audit.sh +++ b/tests/skill-activation-audit.sh @@ -28,7 +28,14 @@ for arg in "$@"; do case "$arg" in --quiet|-q) QUIET=1 ;; --help|-h) - sed -n '2,18p' "$0" | sed 's/^# //; s/^#//' + # Print the leading comment block (everything from line 2 up to the + # first non-comment line). Marker-based so the help text stays in + # sync if the header is edited. + awk ' + NR==1 { next } # shebang + /^#/ { sub(/^# ?/, ""); print; next } + { exit } + ' "$0" exit 0 ;; -*) diff --git a/tests/skill-cross-refs.sh b/tests/skill-cross-refs.sh index 3ab5f7c..f8c18f1 100755 --- a/tests/skill-cross-refs.sh +++ b/tests/skill-cross-refs.sh @@ -27,7 +27,7 @@ REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" cd "$REPO_ROOT" failures=0 -tmp_failures="$(mktemp)" +tmp_failures="$(mktemp)" || { echo "ERROR: mktemp failed" >&2; exit 3; } trap 'rm -f "$tmp_failures"' EXIT # Build the set of known skill names and agent names from the filesystem. 
From f591b700bc97c025897e13f055066d6620a6f28c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 1 May 2026 03:30:33 +0000 Subject: [PATCH 4/7] feat: scope-lock + strict-interpretation gates against agent rescoping (v5.6.0) Agent-Logs-Url: https://github.com/GoCodeAlone/claude-superpowers/sessions/1e9d2946-d531-4ae1-a466-1a9a285d6230 Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> --- .claude-plugin/marketplace.json | 2 +- .claude-plugin/plugin.json | 2 +- .cursor-plugin/plugin.json | 2 +- README.md | 33 +- RELEASE-NOTES.md | 68 ++++ skills/alignment-check/SKILL.md | 19 +- .../finishing-a-development-branch/SKILL.md | 39 +- skills/recording-decisions/SKILL.md | 3 +- skills/scope-lock/SKILL.md | 158 ++++++++ skills/subagent-driven-development/SKILL.md | 13 +- skills/using-superpowers/SKILL.md | 24 +- skills/writing-plans/SKILL.md | 36 ++ tests/cross-llm-coverage.md | 1 + tests/plan-scope-check.sh | 340 ++++++++++++++++++ 14 files changed, 715 insertions(+), 25 deletions(-) create mode 100644 skills/scope-lock/SKILL.md create mode 100755 tests/plan-scope-check.sh diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 404ed21..a13ebe4 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -9,7 +9,7 @@ { "name": "superpowers", "description": "Core skills library for Claude Code: TDD, debugging, collaboration patterns, and proven techniques", - "version": "5.5.0", + "version": "5.6.0", "source": "./", "author": { "name": "Jesse Vincent", diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 744ef6e..c15462a 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "superpowers", "description": "Core skills library for Claude Code: TDD, debugging, collaboration patterns, and proven techniques", - "version": "5.5.0", + "version": "5.6.0", "author": { "name": "Jesse 
Vincent", "email": "jesse@fsck.com" diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json index 693e62a..41d7936 100644 --- a/.cursor-plugin/plugin.json +++ b/.cursor-plugin/plugin.json @@ -2,7 +2,7 @@ "name": "superpowers", "displayName": "Superpowers", "description": "Core skills library: TDD, debugging, collaboration patterns, and proven techniques", - "version": "5.5.0", + "version": "5.6.0", "author": { "name": "Jesse Vincent", "email": "jesse@fsck.com" diff --git a/README.md b/README.md index 9e08d97..dbf36db 100644 --- a/README.md +++ b/README.md @@ -109,23 +109,25 @@ Per-skill host-conditional audit: [tests/cross-llm-coverage.md](tests/cross-llm- 4. **using-git-worktrees** - Activates after design approval. Creates isolated workspace on new branch, runs project setup, verifies clean test baseline. -5. **writing-plans** - Activates with approved design. Breaks work into bite-sized tasks (2-5 minutes each). Every task has exact file paths, complete code, verification steps. Runtime-affecting tasks include rollback notes. +5. **writing-plans** - Activates with approved design. Breaks work into bite-sized tasks (2-5 minutes each). Every task has exact file paths, complete code, verification steps. Runtime-affecting tasks include rollback notes. Plan MUST contain a `## Scope Manifest` block declaring PR Count, Tasks, Out-of-scope items, and a per-PR grouping table — this is the contract `scope-lock` enforces. 6. **adversarial-design-review (plan phase)** - Activates after plan doc is committed. Inherits the design checklist plus plan-specific scans: task granularity, verification-class match, hidden serial dependencies, rollback wiring. -7. **alignment-check** - Activates after adversarial review of plan passes. Narrowly structural: every design requirement maps to a plan task; every plan task traces to a design requirement. +7. **alignment-check** - Activates after adversarial review of plan passes. 
Narrowly structural: every design requirement maps to a plan task; every plan task traces to a design requirement; the Scope Manifest is well-formed (forward + reverse + manifest trace via `tests/plan-scope-check.sh`). -8. **subagent-driven-development** or **executing-plans** - Activates with plan. Dispatches fresh subagent per task with two-stage review (spec compliance, then code quality), or executes in batches with human checkpoints. +8. **scope-lock** - Activates immediately after `alignment-check` PASS. Stamps the plan with `Locked <timestamp>`, computes the manifest's sha256 into `<plan>.scope-lock`, commits both. From this point until completion (or an explicit user-approved unlock), the task list, PR count, and feature scope are immutable. `subagent-driven-development` re-checks the lock between tasks; `finishing-a-development-branch` re-checks before any PR is created. -9. **test-driven-development** - Activates during implementation. Enforces RED-GREEN-REFACTOR: write failing test, watch it fail, write minimal code, watch it pass, commit. Deletes code written before tests. +9. **subagent-driven-development** or **executing-plans** - Activates with a locked plan. Dispatches fresh subagent per task with two-stage review (spec compliance, then code quality). Between tasks, re-runs the scope-lock check; on lock drift, stops the line and surfaces the discrepancy. -10. **requesting-code-review** - Activates between tasks. Reviews against plan, reports issues by severity. Critical issues block progress. +10. **test-driven-development** - Activates during implementation. Enforces RED-GREEN-REFACTOR: write failing test, watch it fail, write minimal code, watch it pass, commit. Deletes code written before tests. -11. **finishing-a-development-branch** - Activates when tasks complete. Verifies tests, presents options (merge/PR/keep/discard), cleans up worktree. +11. **requesting-code-review** - Activates between tasks. 
Reviews against plan, reports issues by severity. Critical issues block progress. -12. **pr-monitoring** - Activates after autonomous PR creation. Watches CI and review comments; fixes failures and responds to feedback until green. +12. **finishing-a-development-branch** - Activates when tasks complete. Step 1d (Scope Completeness Check) verifies every manifest task has implementing commits and that the autonomous run produces the planned number of PRs (no silent collapse). Verifies tests, presents options (merge/PR/keep/discard), cleans up worktree. -13. **post-merge-retrospective** - Activates after `pr-monitoring` exits successfully on a merged PR with green CI. Reads the design, plan, adversarial-review reports, code-review threads, and CI history; produces a short retro in `docs/retros/` scoring each adversarial finding (Prescient / Resolved upfront / False positive / Inconclusive), naming gate misses, and surfacing plugin-level follow-ups when patterns emerge across retros. +13. **pr-monitoring** - Activates after autonomous PR creation (one monitor per PR in the manifest). Watches CI and review comments; fixes failures and responds to feedback until green. + +14. **post-merge-retrospective** - Activates after `pr-monitoring` exits successfully on a merged PR with green CI. Reads the design, plan, adversarial-review reports, code-review threads, and CI history; produces a short retro in `docs/retros/` scoring each adversarial finding (Prescient / Resolved upfront / False positive / Inconclusive), naming gate misses, and surfacing plugin-level follow-ups when patterns emerge across retros. **The agent checks for relevant skills before any task.** Mandatory workflows, not suggestions. @@ -135,6 +137,12 @@ Per-skill host-conditional audit: [tests/cross-llm-coverage.md](tests/cross-llm- `tests/skill-cross-refs.sh` verifies that cross-skill references inside `skills/` and `agents/` markdown resolve (skill names, `Step N` references, `superpowers:<name>` mentions). 
Run it before committing any skill edit that renames a skill or renumbers a step. +`tests/plan-scope-check.sh` verifies the Scope Manifest invariant. Three modes: `--plan <path>` (well-formedness — PR Count matches the grouping table; every task in the body appears in the table; etc.), `--verify-lock <path>` (manifest sha256 matches the `.scope-lock` file written at alignment time), and `--against-branch <path>` (planned branches in the manifest exist locally or on origin). The autonomous pipeline runs all three at the appropriate gates; CI can run `--plan` against every plan in `docs/plans/`. + +## Strict-interpretation invariant + +Once a plan is locked, ambiguous user phrases — "reorder as needed", "create a PR", "test locally", "ship a demo", "be quick" — do NOT authorize rescoping, PR collapse, or partial-scope shipping. The agent picks the most-faithful-to-the-locked-manifest interpretation; if multiple strict readings remain plausible, it stops and asks. See the table in `skills/using-superpowers/SKILL.md` § "Strict-interpretation invariant" for the full mapping and the unlock path. 
+ ## What's Inside ### Skills Library @@ -149,15 +157,16 @@ Per-skill host-conditional audit: [tests/cross-llm-coverage.md](tests/cross-llm- **Collaboration** - **brainstorming** - Socratic design refinement (with assumption-listing, self-challenge round, and a 5-batch question budget) - **adversarial-design-review** - Adversarial attack on design and plan ideas before execution (two phases: design, plan) -- **recording-decisions** - ADRs in `decisions/` for non-trivial trade-offs and rejected alternatives -- **writing-plans** - Detailed implementation plans +- **recording-decisions** - ADRs in `decisions/` for non-trivial trade-offs, rejected alternatives, and user-approved scope reductions +- **writing-plans** - Detailed implementation plans (with mandatory Scope Manifest) - **executing-plans** - Batch execution with checkpoints -- **alignment-check** - Structural design ↔ plan trace (forward + reverse) +- **alignment-check** - Structural design ↔ plan trace (forward + reverse + manifest) +- **scope-lock** - Once a plan passes alignment, the task list, PR count, and feature scope are immutable until completion or explicit user-approved reduction - **dispatching-parallel-agents** - Concurrent subagent workflows - **requesting-code-review** - Pre-review checklist - **receiving-code-review** - Responding to feedback - **using-git-worktrees** - Parallel development branches -- **finishing-a-development-branch** - Merge/PR decision workflow +- **finishing-a-development-branch** - Merge/PR decision workflow (with Step 1d Scope Completeness Check) - **pr-monitoring** - Watches CI and reviews after autonomous PR creation - **post-merge-retrospective** - Closes the loop on merged PRs; scores each adversarial finding and surfaces gate misses - **subagent-driven-development** - Fast iteration with two-stage review (spec compliance, then code quality) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index f7f3b38..07fc877 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md 
@@ -1,5 +1,73 @@ # Superpowers Release Notes +## v5.6.0 (2026-05-01) + +### Why this release exists + +A user reported the agent going off the rails: told to "continue autonomously, create a PR, test locally, reorder as needed", the agent (a) reinterpreted "reorder as needed" as license to rescope, (b) collapsed a 6-PR plan into 1 PR, (c) shipped partial scope as a "demo". Each step looked plausible in isolation. Cumulatively, the contract was lost. + +This release adds the gates that make each of those steps individually visible and individually blockable. + +### New skill: `scope-lock` + +Once `alignment-check` returns PASS, the plan's task list, PR count, and feature scope are **locked**. The lock is enforced by: + +- A required `## Scope Manifest` section in every plan, declaring `**PR Count:**`, `**Tasks:**`, `**Out of scope:**`, and a `**PR Grouping:**` table mapping tasks → PRs → branches. +- A `Status:` line stamped `Locked <UTC ISO-8601 timestamp>` after alignment passes. +- A `<plan>.scope-lock` file containing the sha256 of the manifest section, committed alongside the locked plan. +- A re-check of the lock at every per-task checkpoint in `subagent-driven-development` and before any PR creation in `finishing-a-development-branch`. + +Unlock is heavyweight and explicit: the user must approve the specific tasks/PRs being dropped, an ADR is written via `recording-decisions`, the manifest is updated, and `alignment-check` re-runs against the reduced plan. Cheap unlock = no lock at all. + +There is no "demo mode". Either the locked manifest ships, or the unlock path runs. 
+ +### New test: `tests/plan-scope-check.sh` + +Three modes: + +- `--plan <path>` — manifest well-formedness: `**PR Count:**` matches the PR Grouping table row count; every Task ID in the table exists as a `### Task N:` heading in the body; every body task appears in exactly one PR row; `**Out of scope:**` is present (legacy plans without any manifest are grandfathered unless `--strict` is passed). +- `--verify-lock <path>` — verifies the manifest's current sha256 matches `<path>.scope-lock`. Catches post-lock tampering. +- `--against-branch <path>` — verifies every branch listed in the PR Grouping table exists locally or on origin. Catches the "collapsed N PRs into 1" failure mode at PR-creation time. + +Exit codes: `0` clean / `1` failures / `3` usage error. Wirable into CI. + +### New invariant: strict-interpretation rule (in `using-superpowers`) + +When the autonomous pipeline is running and a user instruction is ambiguous, the agent MUST pick the **most-faithful-to-the-locked-manifest** interpretation. A table mapping common ambiguous phrases to their forbidden-loose and mandated-strict readings: + +| Phrase | ❌ Loose | ✅ Strict | +|---|---|---| +| "reorder as needed" | rescope, drop tasks | reorder tasks within the same PR | +| "create a PR" | one PR for whatever subset | the number of PRs in the manifest | +| "test locally" | skip CI | run every plan task's verification | +| "ship a demo" | partial scope, happy-path | no demo mode; ship locked manifest | +| "be efficient" | drop tests/reviews/tasks | speed comes from parallelism, not skipping | + +When multiple strict interpretations remain plausible, the agent stops and asks. Picking one and proceeding is forbidden. + +### Wired into existing skills + +- **`writing-plans`** — every plan MUST start with the `## Scope Manifest` block; `**Base branch:**` added to the header. The PR Grouping table is the contract `scope-lock` enforces. Authoring rules added to prevent empty `Out of scope:` and orphan tasks. 
+- **`alignment-check`** — third trace added (manifest trace) on top of forward and reverse. Runs `tests/plan-scope-check.sh --plan` as part of the gate. After PASS, invokes `scope-lock` to stamp and hash. Drift items now include `MANIFEST DRIFT`, `UNSCOPED`, `COUNT MISMATCH`. +- **`subagent-driven-development`** — Sequential Mode adds Step 0 "scope-lock checkpoint" before each task dispatch. Red-flags expanded with explicit prohibitions on dropping/adding tasks, collapsing PRs, and skipping the per-task scope check. +- **`finishing-a-development-branch`** — new Step 1d "Scope Completeness Check" verifies every manifest task has implementing commits and that the manifest's PR count matches reality. Autonomous mode now creates one PR per row in the PR Grouping table; collapsing is a stop-the-line error. PR body template includes a Scope Manifest section. +- **`recording-decisions`** — fifth trigger condition added: user-approved scope reduction. ADR is cited from the manifest's `Status: Reduced …` line and from each PR body shipped under the reduced manifest. +- **`pr-monitoring`** — autonomous mode spawns one monitor per PR (manifest-driven), not one monitor per branch. +- **`using-superpowers`** — pipeline auto-chain extended with explicit `scope-lock` step between alignment-check and subagent-driven-development. Strict-interpretation invariant added. + +### Documentation + +- `README.md` workflow extended to 14 stages (scope-lock inserted at 8); new "Strict-interpretation invariant" section. +- `tests/cross-llm-coverage.md` — row added for `scope-lock` (host-neutral; pure markdown + shell). + +### Versioning + +5.5.0 → 5.6.0 across `.claude-plugin/plugin.json`, `.claude-plugin/marketplace.json`, `.cursor-plugin/plugin.json`. + +### Backward compatibility + +Plans created before v5.6.0 do not have a `## Scope Manifest` section. 
`tests/plan-scope-check.sh` grandfathers them by default; pass `--strict` to require the manifest on all plans (e.g., for CI on a fresh-start repo). New plans created via `writing-plans` from v5.6.0 onward always include the manifest. + ## v5.5.0 (2026-05-01) ### New Features diff --git a/skills/alignment-check/SKILL.md b/skills/alignment-check/SKILL.md index 8d54b50..082407e 100644 --- a/skills/alignment-check/SKILL.md +++ b/skills/alignment-check/SKILL.md @@ -43,6 +43,13 @@ For each task in the plan: - Find the design requirement it satisfies - If no requirement justifies it: flag as SCOPE CREEP +**Manifest trace (design → manifest, manifest → plan body):** +- Verify the plan contains a `## Scope Manifest` section (see `skills/scope-lock/SKILL.md` for the format). +- For each design requirement, find the PR row(s) in the manifest that ship it; flag UNSCOPED if none. +- For each PR row in the manifest, verify every Task ID it lists exists as a `### Task N:` heading in the plan body; flag MANIFEST DRIFT otherwise. +- Verify `**PR Count:** N` matches the row count of the PR Grouping table; flag COUNT MISMATCH otherwise. +- This is enforced programmatically by `tests/plan-scope-check.sh --plan <plan>` — alignment-check MUST run that script and fail if it returns non-zero. + **Report format:** ### Alignment Report @@ -120,10 +127,18 @@ Re-run alignment check after revision. **Max 2 revision cycles** before escalati ## On PASS -Proceed to execution: +After alignment passes, **lock the plan's scope** so subsequent execution cannot silently rescope. Invoke `superpowers:scope-lock` with the plan path. The scope-lock skill: + +1. Stamps the plan's `**Status:**` line with `Locked <UTC ISO-8601 timestamp>`. +2. Computes the manifest's sha256 and writes `<plan-path>.scope-lock`. +3. Commits both files (`chore: lock scope for <feature> (alignment passed)`). 
+ +After the lock is in place, proceed to execution: - If autonomous mode: invoke `subagent-driven-development` (which uses Agent Teams) - If manual mode: return control to user +If the plan does NOT contain a `## Scope Manifest` section, alignment-check fails before the lock step. The manifest is mandatory for autonomous-pipeline plans (see `skills/scope-lock/SKILL.md` for the format and `skills/writing-plans/SKILL.md` for the authoring rules). + ## Integration **Called by:** @@ -132,4 +147,6 @@ Proceed to execution: **Calls:** - `writing-plans` (on FAIL) — for plan revision +- `superpowers:scope-lock` (on PASS) — to apply the post-alignment lock that prevents silent rescoping during execution - `subagent-driven-development` (on PASS, autonomous mode) — to begin execution +- `tests/plan-scope-check.sh --plan <plan>` (during the manifest trace) — programmatic check that the plan's Scope Manifest is well-formed diff --git a/skills/finishing-a-development-branch/SKILL.md b/skills/finishing-a-development-branch/SKILL.md index c162ab4..2a3cec7 100644 --- a/skills/finishing-a-development-branch/SKILL.md +++ b/skills/finishing-a-development-branch/SKILL.md @@ -18,8 +18,9 @@ Guide completion of development work by presenting clear options and handling ch When running in the autonomous pipeline (invoked from subagent-driven-development in autonomous mode): 1. **Verify tests pass** — same as manual mode, abort if failing -2. **Skip option presentation** — go directly to PR creation -3. **Auto-push and create PR:** +2. **Run Step 1d (Scope Completeness Check)** — see below. This is a mandatory gate in autonomous mode. The agent MUST NOT silently collapse N planned PRs into 1, nor declare success on a partial scope. If Step 1d surfaces a failure, the autonomous pipeline halts and asks the user. +3. **Skip option presentation** — go directly to PR creation +4. **For every PR row in the manifest's PR Grouping table, create one PR.** The manifest is the contract. 
If the table has 3 rows, the autonomous run produces 3 PRs, each pointing at the branch named in the row. Do NOT collapse rows — collapsing is the exact failure mode `skills/scope-lock/SKILL.md` defends against. Per-PR steps: ```bash feature_branch="<feature-branch>" feature_name="<feature-name>" @@ -50,6 +51,9 @@ When running in the autonomous pipeline (invoked from subagent-driven-developmen ## Implementation Plan See: docs/plans/YYYY-MM-DD-<feature>.md + ## Scope Manifest + <copy the **PR Count**, **Tasks**, **Status** lines + this PR's row from the PR Grouping table> + ## Changes <per-task summary of what was implemented> @@ -57,8 +61,8 @@ When running in the autonomous pipeline (invoked from subagent-driven-developmen EOF )" ``` -4. **Invoke pr-monitoring** — spawn background agent to monitor CI and reviews -5. **Report PR URL** — output the PR link for the user +5. **Invoke pr-monitoring** — spawn one background monitor per PR created +6. **Report PR URLs** — output every PR link for the user (one per row in the manifest's PR Grouping table) **Do NOT:** - Present the 4-option menu in autonomous mode @@ -121,7 +125,32 @@ Action: ``` 4. Resolve before merging — bump the lagging pin, OR state explicitly why the skew is intentional and safe. -If NOT triggered: skip this step and continue to Step 2. +If NOT triggered: skip this step and continue to Step 1d. + +### Step 1d: Scope Completeness Check (mandatory) + +**Trigger:** always. This step is the gate that prevents the agent from declaring victory on a partial-scope solution. + +Action: + +1. Identify the plan: `docs/plans/YYYY-MM-DD-<feature>.md`. If there is no plan in `docs/plans/` for this branch (manual/ad-hoc work), skip this step. +2. Run `bash tests/plan-scope-check.sh --plan <plan-path> --verify-lock <plan-path>`. The script verifies the manifest is well-formed and the locked hash still matches. +3. 
**For every `### Task N:` heading in the plan body**, verify that a commit on the feature branch implements that task. Use the task's `**Files:**` block (`Create:` / `Modify:` / `Test:`) to map files to the task; `git log --oneline <base>..HEAD -- <file>` should show at least one commit per task. +4. **Compute the actual PR count** for autonomous mode: count distinct branches in the manifest's PR Grouping `Branch` column that have commits ahead of base. This must equal `**PR Count:**` in the manifest. + +**On any failure of Step 1d:** + +- **Missing tasks:** stop. Do NOT create any PR. Report exactly which task(s) have no implementing commits, and ask the user one of: + > Tasks <list> have no implementing commits on this branch. Options: + > 1. Implement the missing tasks (preferred). + > 2. Approve a scope reduction — I will invoke `recording-decisions` to write an ADR removing those tasks from the manifest, then re-run `alignment-check` against the reduced design+plan. + > 3. Abort the PR creation; keep the branch as-is for inspection. + > + > Which option? +- **PR count mismatch (autonomous mode):** if the manifest expects N PRs but the branch layout produced fewer, the agent must split the branch via `git rebase --onto` per the manifest's grouping table — NOT collapse the manifest. Collapsing N planned PRs into 1 is exactly the failure mode `scope-lock` exists to prevent. +- **Locked-hash mismatch:** the manifest has been edited after the lock. Surface the diff and stop. The user must either revert the edit or go through the unlock path (`recording-decisions` + re-run alignment-check). + +Do not proceed past Step 1d on any failure without explicit user direction. There is no "demo mode" — see the anti-patterns in `skills/scope-lock/SKILL.md`. 
### Step 2: Determine Base Branch diff --git a/skills/recording-decisions/SKILL.md b/skills/recording-decisions/SKILL.md index dbf45c4..ac5cfd5 100644 --- a/skills/recording-decisions/SKILL.md +++ b/skills/recording-decisions/SKILL.md @@ -21,8 +21,9 @@ Invoke this skill when **any** of these conditions hold: 2. **Non-trivial trade-off.** The design weighs ≥2 plausible approaches and picks one for reasons that won't be obvious from reading the code. A flat file vs. SQLite vs. Postgres choice. A library-pin floor (`>=X.Y`) vs. exact pin (`==X.Y.Z`) decision. Sync vs. async. Polling vs. webhook. 3. **Adversarial-review override.** The design author accepted an adversarial-review finding as "yes, but here's why" rather than fixing it. The acceptance reasoning belongs in an ADR so future contributors don't re-litigate. 4. **Cross-skill structural change.** Any change that affects multiple skills' integration (e.g., introducing a new gate in the autonomous pipeline, renaming a step that other skills cite). +5. **User-approved scope reduction.** The user explicitly approved removing tasks or PRs from a locked manifest (see `skills/scope-lock/SKILL.md`'s unlock path). The ADR records which tasks/PRs were dropped, why, and what carries over to a future plan. This ADR is then cited from the manifest's `**Status:** Reduced …` line and from the PR body of every PR shipped under the reduced manifest. -If none of the four conditions hold, an ADR is not required. ADRs are not for every commit — they are for choices that future contributors will read the code and ask "why is it like this?" about. +If none of the five conditions hold, an ADR is not required. ADRs are not for every commit — they are for choices that future contributors will read the code and ask "why is it like this?" about. 
## Process diff --git a/skills/scope-lock/SKILL.md b/skills/scope-lock/SKILL.md new file mode 100644 index 0000000..9ad7ec3 --- /dev/null +++ b/skills/scope-lock/SKILL.md @@ -0,0 +1,158 @@ +--- +name: scope-lock +description: Use whenever the autonomous pipeline reaches alignment-check PASS - locks the plan's task list, PR count, and feature scope so the executing agent cannot silently rescope, collapse PRs, or ship partial work as a "demo" without explicit user approval recorded as an ADR +--- + +# Scope Lock + +## Overview + +After `alignment-check` passes, the implementation plan is **locked**: task list, PR count, and feature scope are immutable until the work completes (or the user explicitly approves a reduction). This skill defines what "locked" means, what unlocks it, and how the rest of the pipeline must behave under the lock. + +**Why this skill exists:** observed failure mode — an agent told to "continue autonomously, create a PR, also test locally, reorder as needed" interpreted "reorder as needed" as license to rescope, collapsed a 6-PR plan into a single PR, and shipped a partial-scope solution as a "demo". Each step looked plausible in isolation. Cumulatively the agent went off the rails. The lock makes each of those steps individually visible and individually blockable. + +**Core principle:** the plan is the contract. Once `alignment-check` says it covers the design and only the design, the pipeline executes the contract — it does not renegotiate it. + +## When to use + +Invoked automatically by `alignment-check` immediately after it returns PASS. Also invoked manually by any subsequent skill (`subagent-driven-development`, `finishing-a-development-branch`) before performing an action that depends on the locked manifest (a task transition, a PR creation, a completion claim). + +Manual invocation: + +- **At lock time** (after alignment passes): stamp the plan and record the manifest hash. 
+- **At execution checkpoints** (between tasks): verify reality still matches the lock. +- **At completion time** (before PR creation): assert manifest is fully satisfied. +- **At unlock time** (user-approved scope reduction): record the reduction as an ADR, update the manifest, re-stamp. + +## The Scope Manifest + +The manifest is a section the plan author writes during `writing-plans`. After `alignment-check` PASS, it becomes immutable. Every plan MUST contain it. Plans without it fail the alignment check and `tests/plan-scope-check.sh`. + +```markdown +## Scope Manifest + +**PR Count:** N +**Tasks:** N +**Estimated Lines of Change:** ~N (informational; not enforced) + +**Out of scope:** +- <explicit non-goal> +- <explicit non-goal> + +**PR Grouping:** + +| PR # | Title | Tasks | Branch | +|------|-------|-------|--------| +| 1 | <PR title> | Task 1, Task 2 | feat/<slug>-1 | +| 2 | <PR title> | Task 3, Task 4 | feat/<slug>-2 | +| ... | ... | ... | ... | + +**Status:** Draft | Locked YYYY-MM-DDTHH:MM:SSZ | Reduced YYYY-MM-DDTHH:MM:SSZ (see decisions/NNNN) +``` + +Every plan task ID listed under `Tasks` in the table must exist in the plan body. Every task in the plan body must appear in exactly one PR row. + +If the work is genuinely a single PR, the table has one row — the row still has to exist. Single-PR plans are not exempt from the manifest; they are exempt only from the multi-PR PR-count assertion. + +## Lock state machine + +``` + alignment-check PASS + Draft ─────────────────────────────────► Locked + ▲ │ + │ │ user approves scope reduction; + │ alignment-check FAIL → revise │ recording-decisions writes ADR; + │ ▼ + │ Reduced + │ │ + │ │ re-run alignment-check on the reduced plan + └──────────────────────────────────────────┘ +``` + +- **Draft**: the plan author is still revising. No execution is permitted. +- **Locked**: alignment passed. The manifest hash is recorded. Execution is permitted; renegotiation is not. 
+- **Reduced**: the user explicitly approved a scope reduction; an ADR was written; the manifest was updated; alignment was re-run on the reduced plan, which produced a new Locked stamp. The original Locked stamp is preserved in the ADR's Context for audit. + +There is no "Expanded" state by design. Adding scope mid-flight requires going back to Draft (re-do brainstorming for the new scope). This is intentional friction. + +## What the lock prohibits + +While `Status: Locked …`, the following are **stop-the-line errors** for any pipeline skill: + +1. **Dropping a task.** A task in the manifest cannot be skipped. If the agent encounters a task that turns out to be infeasible, it MUST surface this to the user and request a scope reduction (which goes through the unlock path, not a unilateral skip). +2. **Adding a task not in the manifest.** Discovering "we also need to do X" mid-execution is not a license to silently add X. Either X is already implied by an existing task (then it goes under that task) or X is new scope (then it goes through brainstorming + a new design + a new plan or an explicit manifest amendment). +3. **Collapsing PRs.** If the manifest has 3 PR rows, the autonomous pipeline must produce 3 PRs. Collapsing into 1 PR is a stop-the-line error even if "all the code is the same". +4. **Splitting a PR.** Same rule in reverse. The grouping table is the contract. +5. **Moving a task across PRs.** Re-ordering tasks within the same PR is allowed; re-ordering tasks across PRs is **not** allowed without an unlock — it changes which task ships in which PR, which changes review boundaries. +6. **"Reorder as needed", "create a PR", "test locally", and similar imperative-but-vague user phrases do NOT authorize any of the above.** These phrases speak to *how* the agent runs the manifest, not to *what* is in the manifest. See the strict-interpretation rule in `using-superpowers`. 
+ +## Unlock path (user-approved scope reduction) + +If during execution the agent or the user determines that a task or PR should be removed: + +1. **Stop the line.** Pause execution; do not commit or push anything that depends on the dropped scope. +2. **Surface the proposed reduction explicitly.** State which tasks would be removed, which PRs are affected, and why. Do not paraphrase a vague user phrase as approval. +3. **Wait for explicit user confirmation.** "Yes, remove tasks 4 and 5" — exact tasks named, exact reduction acknowledged. Anything less than this MUST be treated as not-yet-approved. +4. **Invoke `recording-decisions`** with reduction-specific context: which tasks/PRs are dropped, what was rejected, what carries over (or gets re-planned). The ADR is the audit record. +5. **Update the manifest in place.** Remove the dropped tasks from the PR Grouping table (and any PR row left with no tasks); update `**PR Count:**` and `**Tasks:**`; flip status to `Reduced YYYY-MM-DDTHH:MM:SSZ (see decisions/NNNN-...md)`. +6. **Re-run `alignment-check`** on the reduced plan. The reduced manifest must still cover every requirement in the (possibly also reduced) design. If the design was not also reduced, alignment will fail — that's the correct behavior. The user must reduce the design first via a new `brainstorming --design-only` pass. +7. **On alignment PASS,** the lock re-engages with a new `Locked` stamp. + +The unlock path is intentionally heavyweight. Cheap unlock = no lock at all. + +## Lock enforcement at each pipeline stage + +**`alignment-check` (pre-lock and re-lock):** +- After PASS, edit the plan's `**Status:**` line to `Locked <UTC ISO-8601 timestamp>`. +- Compute `sha256` of the manifest section (from `## Scope Manifest` to the next `##` heading) and write it to `<plan-path>.scope-lock`. This is the lock file. +- Commit both files in the same commit: `chore: lock scope for <feature> (alignment passed)`. 
+ +**`subagent-driven-development` (per-task checkpoint):** +- Before dispatching the next task, run `bash tests/plan-scope-check.sh --plan <plan-path> --verify-lock <plan-path>` to verify (a) the plan's manifest hash still matches `<plan-path>.scope-lock`, (b) every commit on the feature branch traces to a task in the manifest, (c) no manifest task is missing. +- On any FAIL, stop dispatching new work; surface the discrepancy to the user. +- After all tasks complete, run the same check before invoking `finishing-a-development-branch`. + +**`finishing-a-development-branch` (Step 1d, see that skill):** +- Before any PR is created, assert the manifest is fully satisfied: every task's verification step has run; every task's commit is on the branch. +- In autonomous mode, the number of PRs created MUST equal the manifest's `**PR Count:**`. The branch layout MUST match the per-PR grouping table. +- If the actual layout doesn't match (e.g., all work is on a single branch but the manifest planned 3), the agent must split the branch via `git rebase --onto` per the grouping table — NOT collapse the manifest to match what was implemented. The manifest is the contract. + +**`pr-monitoring` (already wired):** +- Reads the per-PR grouping table to know which monitor instance handles which PR. + +**`post-merge-retrospective` (already wired):** +- Reads the manifest and stamps to know what was promised vs. what shipped. A reduced manifest is fine; an undocumented reduction is a gate miss. + +## Anti-patterns + +- **"The plan is just a guide."** No. After alignment PASS, the plan is the contract. Treating the plan as advisory after lock is the failure mode this skill exists to prevent. +- **Collapsing PRs because "they're all related."** Relatedness does not justify collapse. The PR grouping table reflects review-friendliness and rollback granularity, not just code locality. +- **Treating user vagueness as license.** "Reorder as needed" does not mean "rescope as needed". 
When in doubt, the agent picks the strictest interpretation and surfaces it. See `using-superpowers` strict-interpretation invariant. +- **Silently dropping a task because it turned out to be hard.** That's the unlock path's job. A unilateral skip is a contract breach. +- **"Demo" framing.** Once the manifest is locked, there is no demo mode. Either you ship the contract or you go through the unlock path. "Let me just get something working" is exactly the rationalization this skill blocks. + +## Integration + +**Called by:** +- `alignment-check` — to apply the initial lock after PASS. +- `subagent-driven-development` — to verify the lock at each task checkpoint. +- `finishing-a-development-branch` — to verify the lock before PR creation. +- Manual — when a user asks "are we still on plan?" the agent runs the check. + +**Calls:** +- `recording-decisions` — when the user explicitly approves a scope reduction. +- `tests/plan-scope-check.sh` — for the programmatic verification. + +**Reads:** +- `docs/plans/<plan>.md` — the plan and its manifest. +- `docs/plans/<plan>.md.scope-lock` — the manifest hash recorded at lock time. +- `git log --oneline <base>..HEAD` — actual commits to compare against the manifest. + +**Writes:** +- `docs/plans/<plan>.md` — the `**Status:**` line, on lock or reduce. +- `docs/plans/<plan>.md.scope-lock` — the manifest hash file. +- (via `recording-decisions`) `decisions/NNNN-scope-reduction-<feature>.md`. + +## Why a separate skill + +`alignment-check` is "does this plan cover this design?" — a one-shot structural test at hand-off. `scope-lock` is "is the plan still being honored?" — a recurring runtime invariant. Keeping them separate keeps each skill's responsibility focused. Alignment runs once; the lock is checked at every checkpoint. 
diff --git a/skills/subagent-driven-development/SKILL.md b/skills/subagent-driven-development/SKILL.md index d830a13..f2b2d3d 100644 --- a/skills/subagent-driven-development/SKILL.md +++ b/skills/subagent-driven-development/SKILL.md @@ -253,6 +253,13 @@ parallel agents. One subagent handles one task at a time; reviews happen between For each task in the plan: +0. **Scope-lock checkpoint** — before dispatching the next task, run + `bash tests/plan-scope-check.sh --plan <plan-path> --verify-lock <plan-path>`. If it + exits non-zero, stop the line: the manifest has drifted from the locked hash, or + reality has drifted from the manifest. Surface the specific discrepancy to the user + and wait for instruction. Do NOT silently re-lock or proceed. See + `skills/scope-lock/SKILL.md` for the unlock path (which requires explicit user + approval and an ADR via `recording-decisions`). 1. **Dispatch implementer subagent** — provide the full task text, the design doc path, and the working directory in the prompt. Use `./implementer-prompt.md` as the base template. 2. **Answer questions** if the implementer surfaces blockers. @@ -264,7 +271,7 @@ For each task in the plan: 7. If quality issues found → implementer fixes → re-review until approved. 8. Mark task complete and move to the next. -After all tasks: invoke `superpowers:finishing-a-development-branch`. +After all tasks: run `bash tests/plan-scope-check.sh --plan <plan-path> --verify-lock <plan-path>` one final time, then invoke `superpowers:finishing-a-development-branch`. If the scope check fails, do not invoke finishing — surface the discrepancy first. <host: codex> @@ -290,6 +297,10 @@ Codex subagents do not share a task list. 
Use these conventions instead: **Never:** - Start implementation on main/master without explicit user consent - Skip reviews (spec compliance OR code quality) +- Skip the scope-lock checkpoint between tasks (Step 0 in Sequential Mode; equivalent watchdog cadence in Agent Teams Mode — see Resilience section) +- Drop a task because it turned out to be hard. Surface the obstruction to the user; the unlock path in `skills/scope-lock/SKILL.md` is the only sanctioned way to remove a task from the manifest. +- Add a task that isn't in the manifest. Discovering "we also need X" mid-execution is not a license to silently add it. Either it's already covered by an existing task, or it's a manifest amendment (which requires going back through brainstorming for the new scope). +- Collapse PRs. The manifest's PR Grouping table is a contract; if it has 3 rows, the work ships as 3 PRs. - Proceed with unfixed issues - Make subagents/teammates read plan files — provide full text in the prompt instead - Skip scene-setting context in any subagent prompt diff --git a/skills/using-superpowers/SKILL.md b/skills/using-superpowers/SKILL.md index a7776b5..182e37d 100644 --- a/skills/using-superpowers/SKILL.md +++ b/skills/using-superpowers/SKILL.md @@ -81,9 +81,9 @@ When multiple skills could apply, use this order: 1. **Process skills first** (brainstorming, debugging) - these determine HOW to approach the task 2. **Implementation skills second** (frontend-design, mcp-builder) - these guide execution 3. 
**Pipeline skills auto-chain** — these invoke each other automatically in the autonomous pipeline: - brainstorming → adversarial-design-review (design phase) → writing-plans → adversarial-design-review (plan phase) → alignment-check → subagent-driven-development → finishing-a-development-branch → pr-monitoring → post-merge-retrospective + brainstorming → adversarial-design-review (design phase) → writing-plans → adversarial-design-review (plan phase) → alignment-check → **scope-lock** → subagent-driven-development → finishing-a-development-branch → pr-monitoring → post-merge-retrospective - Cross-cutting skills invoked from within the pipeline when conditions trigger: `recording-decisions` (when designs/plans make non-trivial trade-offs). + Cross-cutting skills invoked from within the pipeline when conditions trigger: `recording-decisions` (when designs/plans make non-trivial trade-offs, including user-approved scope reductions); `scope-lock` (re-checked at every per-task checkpoint and before PR creation). "Let's build X" → brainstorming first, then the pipeline runs autonomously after design approval. "Fix this bug" → debugging first, then domain-specific skills. @@ -99,3 +99,23 @@ The skill itself tells you which. ## User Instructions Instructions say WHAT, not HOW. "Add X" or "Fix Y" doesn't mean skip workflows. + +### Strict-interpretation invariant (autonomous mode) + +When the autonomous pipeline is running and a user instruction is **ambiguous**, the agent MUST pick the **most-faithful-to-the-locked-plan** interpretation. Picking the looser interpretation in the name of "being helpful" is the failure mode this rule exists to prevent. 
+ +| Phrase | ❌ Loose interpretation (forbidden) | ✅ Strict interpretation (mandated) | +|---|---|---| +| "reorder as needed" | rescope, drop tasks, change PR count | reorder tasks within the same PR; manifest unchanged | +| "create a PR" | create one PR for whatever subset is convenient | create the number of PRs in the manifest's PR Grouping table | +| "test locally" | skip CI; ship something that "works on my machine" | run the verification steps every plan task declares; CI still runs at the end | +| "make it work" / "just get something working" | trim scope until the partial result runs | implement the full manifest; if blocked, surface the blocker | +| "ship a demo" | partial scope + happy-path-only tests | there is no demo mode; either ship the locked manifest or invoke the unlock path | +| "do whatever you think is best" | unilaterally restructure plan | do the locked manifest; surface choices not covered by the manifest | +| "be efficient" / "be quick" | drop tests, drop reviews, drop tasks | run the pipeline at full discipline; speed comes from parallelism, not from skipping | + +**When multiple strict interpretations remain plausible**, the agent stops and asks. Picking one and proceeding is not allowed. The cheapest place to catch a misinterpretation is before any commit; the most expensive is after a PR is opened. + +**Locked plans are inviolate.** If the user phrase appears to conflict with the locked manifest in `docs/plans/<feature>.md`, the locked manifest wins until the user goes through the unlock path defined in `skills/scope-lock/SKILL.md`. "I told you to reorder" does not retroactively authorize rescoping; "yes, drop tasks 4 and 5" does (and triggers `recording-decisions`). + +This rule is **rigid**, not flexible. Do not adapt it. The whole point is that ambiguity is resolved upward, never sideways. 
diff --git a/skills/writing-plans/SKILL.md b/skills/writing-plans/SKILL.md index 2aace4d..8c77727 100644 --- a/skills/writing-plans/SKILL.md +++ b/skills/writing-plans/SKILL.md @@ -132,9 +132,45 @@ The plan author writes the expected output literally — not "passes tests" but **Tech Stack:** [Key technologies/libraries] +**Base branch:** [main | develop | other] + --- ``` +## Scope Manifest (REQUIRED) + +Every plan MUST contain a `## Scope Manifest` section immediately after the header. This section is the **scope contract** between the plan author and the execution pipeline. After `alignment-check` passes, the manifest is locked (see `skills/scope-lock/SKILL.md`). Without the manifest, alignment-check fails and execution cannot start. + +```markdown +## Scope Manifest + +**PR Count:** N +**Tasks:** N +**Estimated Lines of Change:** ~N (informational; not enforced) + +**Out of scope:** +- <explicit non-goal> +- <explicit non-goal — if truly nothing, write "(none)" — empty bullets are not allowed> + +**PR Grouping:** + +| PR # | Title | Tasks | Branch | +|------|-------|-------|--------| +| 1 | <PR title> | Task 1, Task 2 | feat/<slug>-1 | +| 2 | <PR title> | Task 3, Task 4 | feat/<slug>-2 | +| ... | ... | ... | ... | + +**Status:** Draft +``` + +**Authoring rules:** + +- **PR Count must match the PR Grouping table row count.** If the work fits in a single PR, the table has one row — write it anyway. The downstream gate enforces this. +- **Tasks must match the count of `### Task N:` headings in the plan body.** Every task in the body appears in exactly one PR row. No orphan tasks; no phantom tasks. +- **`**Out of scope:**` is not optional.** If the design genuinely has no non-goals, write `(none)`. Forcing the author to think about what is *not* being built reduces the chance the executing agent will helpfully add it. 
+- **Each PR is independently reviewable and revertible.** If you can't say what each PR delivers in one sentence, the grouping is wrong — go back to the design. +- **Status starts as `Draft`.** `alignment-check` flips it to `Locked …` after PASS. Manual edits to the Status line by anyone other than alignment-check / scope-lock are a contract violation. + ## Task Structure ````markdown diff --git a/tests/cross-llm-coverage.md b/tests/cross-llm-coverage.md index 526e8b1..97e4924 100644 --- a/tests/cross-llm-coverage.md +++ b/tests/cross-llm-coverage.md @@ -9,6 +9,7 @@ host-neutral. Updated whenever a skill changes. | alignment-check | host-conditional | host-conditional | host-conditional | host-conditional | spawn block in `<host: claude-code>`; prose fallback outside | | post-merge-retrospective | host-conditional | host-conditional | host-conditional | host-conditional | Inline-vs-subagent decision block in `<host: claude-code>`; inline-only prose in `<host: codex, opencode, cursor>` | | recording-decisions | host-neutral | host-neutral | host-neutral | host-neutral | ADR storage protocol; no host-specific tooling | +| scope-lock | host-neutral | host-neutral | host-neutral | host-neutral | Pure markdown + shell-script invariant; no host-specific tooling | | brainstorming | host-conditional | host-conditional | host-conditional | host-conditional | `AskUserQuestion` in `<host: claude-code>`; numbered-list fallback in `<host: codex, opencode, cursor>` | | dispatching-parallel-agents | host-neutral | host-neutral | host-neutral | host-neutral | generic parallel-dispatch pattern; no tool-specific refs | | executing-plans | host-conditional | host-conditional | host-conditional | host-conditional | tool-use block in `<host: claude-code>`; prose fallback in `<host: codex, opencode, cursor>` | diff --git a/tests/plan-scope-check.sh b/tests/plan-scope-check.sh new file mode 100755 index 0000000..c5beb30 --- /dev/null +++ b/tests/plan-scope-check.sh @@ -0,0 +1,340 @@ 
+#!/usr/bin/env bash
+# tests/plan-scope-check.sh
+# Verifies the Scope Manifest invariant defined by skills/scope-lock/SKILL.md.
+#
+# Modes:
+#   --plan <path>           Verify the manifest is well-formed (required block
+#                           present, PR Count consistent with PR Grouping table,
+#                           every Task ID appears in the body of the plan).
+#                           Plans without any "## Scope Manifest" section are
+#                           grandfathered (pre-scope-lock plans); pass --strict
+#                           to require the manifest on all plans.
+#   --verify-lock <path>    Verify the manifest section's sha256 matches
+#                           <path>.scope-lock (only meaningful after the plan is
+#                           in Locked status).
+#   --against-branch <plan> Verify that every branch named in the PR Grouping
+#                           table exists locally or on origin. (Checking that
+#                           every commit since the merge-base with the plan's
+#                           base branch is reachable from a listed branch is
+#                           not implemented yet.)
+#
+# Multiple modes can be combined. With no flags, runs --plan on every plan in
+# docs/plans/*.md (skipping *-design.md and README.md).
+#
+# Exit codes:
+#   0 — all checks passed
+#   1 — one or more checks failed
+#   3 — usage error or environment problem
+#
+# This script is intentionally conservative: when something is ambiguous, it
+# reports it and exits non-zero. The Scope Manifest is the contract; ambiguity
+# in the contract is a failure.
+
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"  # parent of the directory holding this script
+cd "$REPO_ROOT"  # relative plan paths are therefore resolved against the repo root
+
+usage() {
+  awk 'NR==1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"  # help text = the header comment above, minus the shebang
+}
+
+# --- Helpers --------------------------------------------------------------
+
+# Extract the Scope Manifest section from a plan file. Prints lines from the
+# `## Scope Manifest` heading through (but not including) the next H2 heading
+# at start of line. Empty output if the section is absent.
+extract_manifest() {
+  awk '
+    /^## Scope Manifest[[:space:]]*$/ { in_section = 1; print; next }
+    in_section && /^## / { in_section = 0 }
+    in_section { print }
+  ' "$1"
+}
+
+# Compute sha256 of stdin in a portable way (sha256sum if available, otherwise
+# shasum -a 256). Outputs only the hex digest; returns 3 if neither tool exists.
+sha256_stdin() {
+  if command -v sha256sum >/dev/null 2>&1; then
+    sha256sum | awk '{print $1}'
+  elif command -v shasum >/dev/null 2>&1; then
+    shasum -a 256 | awk '{print $1}'
+  else
+    echo "error: need sha256sum or shasum installed" >&2
+    return 3
+  fi
+}
+
+# Check the manifest is well-formed. Args: plan path. Echoes problems to stdout
+# and returns 1 if any were found, 0 otherwise. A plan with no
+# "## Scope Manifest" section is treated as legacy and skipped unless STRICT=1
+# (--strict). New plans created by writing-plans always include the section,
+# so this only matters for grandfathering historical plans pre-dating the
+# scope-lock skill.
+check_manifest_wellformed() {
+  local plan="$1"
+  local manifest
+  manifest="$(extract_manifest "$plan")"
+
+  if [ -z "$manifest" ]; then
+    # Legacy / pre-scope-lock plan. Skip silently unless --strict is set.
+    if [ "${STRICT:-0}" = "1" ]; then
+      printf '%s: missing "## Scope Manifest" section (--strict)\n' "$plan"
+      return 1
+    fi
+    return 0
+  fi
+
+  local pr_count tasks_count status_line
+  pr_count="$(printf '%s\n' "$manifest" \
+    | grep -E '^\*\*PR Count:\*\*[[:space:]]*[0-9]+' \
+    | head -1 \
+    | sed -E 's/.*\*\*PR Count:\*\*[[:space:]]*([0-9]+).*/\1/' || true)"
+  tasks_count="$(printf '%s\n' "$manifest" \
+    | grep -E '^\*\*Tasks:\*\*[[:space:]]*[0-9]+' \
+    | head -1 \
+    | sed -E 's/.*\*\*Tasks:\*\*[[:space:]]*([0-9]+).*/\1/' || true)"
+  status_line="$(printf '%s\n' "$manifest" \
+    | grep -E '^\*\*Status:\*\*' \
+    | head -1 || true)"
+
+  local rc=0
+
+  if [ -z "$pr_count" ]; then
+    printf '%s: manifest missing **PR Count:** N\n' "$plan"
+    rc=1
+  fi
+  if [ -z "$tasks_count" ]; then
+    printf '%s: manifest missing **Tasks:** N\n' "$plan"
+    rc=1
+  fi
+  if [ -z "$status_line" ]; then
+    printf '%s: manifest missing **Status:** field\n' "$plan"
+    rc=1
+  fi
+
+  # Out of scope MUST appear, even if it's "(none)".
+  if ! printf '%s\n' "$manifest" | grep -qE '^\*\*Out of scope:\*\*'; then
+    printf '%s: manifest missing **Out of scope:** section\n' "$plan"
+    rc=1
+  fi
+
+  # PR Grouping table: a markdown table whose header row starts with a "PR #"
+  # column. Count the data rows (lines starting with `|` followed by an
+  # integer-only first column).
+  local grouping_rows
+  grouping_rows="$(printf '%s\n' "$manifest" \
+    | awk '
+      /^\| *PR *# *\|/ { in_table = 1; next }
+      in_table && /^\|[- :|]+$/ { next }  # separator row (only -, :, space, |)
+      in_table && /^\| *[0-9]+ *\|/ { print; next }
+      in_table && /^[^|]/ { in_table = 0 }
+    ' || true)"
+  local grouping_count
+  grouping_count="$(printf '%s\n' "$grouping_rows" | grep -cE '^\| *[0-9]+ *\|' || true)"
+
+  if [ -z "$grouping_count" ] || [ "$grouping_count" -eq 0 ]; then
+    printf '%s: manifest missing or empty **PR Grouping** table\n' "$plan"
+    rc=1
+  elif [ -n "$pr_count" ] && [ "$grouping_count" -ne "$pr_count" ]; then
+    printf '%s: PR Count (%s) disagrees with PR Grouping table rows (%s)\n' \
+      "$plan" "$pr_count" "$grouping_count"
+    rc=1
+  fi
+
+  # Verify every Task ID referenced in the grouping table appears in the body
+  # of the plan as a `### Task N` heading (followed by `:`, a space, or EOL).
+  if [ -n "$grouping_rows" ]; then
+    # Extract all `Task N` mentions from column 3 of the table (the Tasks col).
+    # With `|` as the field separator that is awk field $4: the text between
+    # the 3rd and 4th `|`. `Task N` substrings are then grepped out of it.
+    local task_ids
+    task_ids="$(printf '%s\n' "$grouping_rows" \
+      | awk -F'|' '{print $4}' \
+      | grep -oE 'Task[[:space:]]+[0-9]+' \
+      | sed -E 's/[[:space:]]+/ /g' \
+      | sort -u || true)"
+    while read -r task_ref; do
+      [ -z "$task_ref" ] && continue
+      local n
+      n="$(printf '%s\n' "$task_ref" | sed -E 's/Task +//')"
+      if ! grep -qE "^### Task ${n}([: ]|\$)" "$plan"; then
+        printf '%s: PR Grouping references "%s" but plan body has no "### Task %s:" heading\n' \
+          "$plan" "$task_ref" "$n"
+        rc=1
+      fi
+    done < <(printf '%s\n' "$task_ids")
+
+    # Also: verify every `### Task N` heading found anywhere in the plan file
+    # appears in the grouping table (no orphan tasks without a PR home).
+    local body_tasks
+    body_tasks="$(grep -oE '^### Task [0-9]+' "$plan" | sed -E 's/^### //' | sort -u || true)"
+    while read -r task_ref; do
+      [ -z "$task_ref" ] && continue
+      if ! printf '%s\n' "$task_ids" | grep -qx "$task_ref"; then
+        printf '%s: plan body has "%s" but the PR Grouping table does not include it\n' \
+          "$plan" "$task_ref"
+        rc=1
+      fi
+    done < <(printf '%s\n' "$body_tasks")
+
+    # Tasks count consistency: **Tasks:** N vs. the number of distinct headings.
+    local body_task_count
+    body_task_count="$(printf '%s\n' "$body_tasks" | grep -cE '^Task [0-9]+' || true)"
+    if [ -n "$tasks_count" ] && [ "$body_task_count" -ne "$tasks_count" ]; then
+      printf '%s: **Tasks:** %s disagrees with body task count (%s "### Task N:" headings)\n' \
+        "$plan" "$tasks_count" "$body_task_count"
+      rc=1
+    fi
+  fi
+
+  return "$rc"
+}
+
+# Verify the manifest's sha256 equals the first non-blank, non-# line of <plan>.scope-lock. Args: plan path.
+check_lock_hash() {
+  local plan="$1"
+  local lock="${plan}.scope-lock"
+  if [ ! -f "$lock" ]; then
+    printf '%s: lock file %s not found (manifest is not locked)\n' "$plan" "$lock"
+    return 1
+  fi
+  local expected actual
+  expected="$(awk 'NF && !/^#/ {print; exit}' "$lock")"
+  actual="$(extract_manifest "$plan" | sha256_stdin)"
+  if [ "$expected" != "$actual" ]; then
+    printf '%s: manifest hash mismatch (lock=%s, current=%s)\n' \
+      "$plan" "${expected:0:12}…" "${actual:0:12}…"
+    return 1
+  fi
+  return 0
+}
+
+# Check that every branch named in the PR Grouping table exists locally or on
+# origin. Args: plan path. The base branch is parsed from `**Base branch:**`
+# (default `main`), but no check below uses it yet.
+check_against_branch() {
+  local plan="$1"
+  local base
+  base="$(grep -E '^\*\*Base branch:\*\*' "$plan" | head -1 \
+    | sed -E 's/.*\*\*Base branch:\*\*[[:space:]]*([A-Za-z0-9._/-]+).*/\1/' || true)"
+  [ -z "$base" ] && base="main"
+
+  local rc=0
+
+  # Grouping branch column = 5th `|`-delimited field.
+  local manifest
+  manifest="$(extract_manifest "$plan")"
+  local branches
+  branches="$(printf '%s\n' "$manifest" \
+    | awk -F'|' '
+      /^\| *PR *# *\|/ { in_table = 1; next }
+      in_table && /^\|[- :|]+$/ { next }
+      in_table && /^\| *[0-9]+ *\|/ {
+        gsub(/^ +| +$/, "", $5)
+        if ($5 != "") print $5
+        next
+      }
+      in_table && /^[^|]/ { in_table = 0 }
+    ' \
+    | sort -u || true)"
+
+  if [ -z "$branches" ]; then
+    printf '%s: PR Grouping table has no Branch column entries\n' "$plan"
+    return 1
+  fi
+
+  while read -r br; do
+    [ -z "$br" ] && continue
+    if ! git rev-parse --verify "refs/heads/${br}" >/dev/null 2>&1 \
+      && ! git rev-parse --verify "refs/remotes/origin/${br}" >/dev/null 2>&1; then
+      printf '%s: planned branch %s does not exist locally or on origin\n' "$plan" "$br"
+      rc=1
+    fi
+  done < <(printf '%s\n' "$branches")
+
+  return "$rc"
+}
+
+# --- Argument parsing -----------------------------------------------------
+
+MODE_PLAN=()
+MODE_VERIFY_LOCK=()
+MODE_AGAINST_BRANCH=()
+STRICT=0
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --plan)
+      [ -n "${2:-}" ] || { usage; exit 3; }
+      MODE_PLAN+=("$2"); shift 2 ;;
+    --verify-lock)
+      [ -n "${2:-}" ] || { usage; exit 3; }
+      MODE_VERIFY_LOCK+=("$2"); shift 2 ;;
+    --against-branch)
+      [ -n "${2:-}" ] || { usage; exit 3; }
+      MODE_AGAINST_BRANCH+=("$2"); shift 2 ;;
+    --strict)
+      STRICT=1; shift ;;
+    --help|-h)
+      usage; exit 0 ;;
+    *)
+      printf 'unknown argument: %s\n\n' "$1" >&2
+      usage >&2
+      exit 3 ;;
+  esac
+done
+export STRICT
+
+# If no explicit --plan/--verify-lock/--against-branch was given, default
+# to scanning docs/plans/*.md (minus *-design.md and README.md) for well-formedness.
+if [ "${#MODE_PLAN[@]}" -eq 0 ] \
+  && [ "${#MODE_VERIFY_LOCK[@]}" -eq 0 ] \
+  && [ "${#MODE_AGAINST_BRANCH[@]}" -eq 0 ]; then
+  while IFS= read -r f; do
+    MODE_PLAN+=("$f")
+  done < <(find docs/plans -maxdepth 1 -name '*.md' \
+    ! -name '*-design.md' ! -name 'README.md' 2>/dev/null | sort)
+  if [ "${#MODE_PLAN[@]}" -eq 0 ]; then
+    echo "No plans found in docs/plans/. Nothing to check."
+    exit 0
+  fi
+fi
+
+# --- Run ------------------------------------------------------------------
+
+failures=0
+
+for plan in "${MODE_PLAN[@]:-}"; do  # ":-" yields one empty entry when the array is empty; skipped on the next line
+  [ -z "$plan" ] && continue
+  if [ ! -f "$plan" ]; then
+    printf '%s: file not found\n' "$plan" >&2
+    failures=$((failures + 1))
+    continue
+  fi
+  if ! check_manifest_wellformed "$plan"; then
+    failures=$((failures + 1))
+  fi
+done
+
+for plan in "${MODE_VERIFY_LOCK[@]:-}"; do
+  [ -z "$plan" ] && continue
+  if ! check_lock_hash "$plan"; then
+    failures=$((failures + 1))
+  fi
+done
+
+for plan in "${MODE_AGAINST_BRANCH[@]:-}"; do
+  [ -z "$plan" ] && continue
+  if ! check_against_branch "$plan"; then
+    failures=$((failures + 1))
+  fi
+done
+
+if [ "$failures" -gt 0 ]; then
+  printf '\nFAIL: %s scope-manifest check(s) failed.\n' "$failures" >&2
+  exit 1
+fi
+
+echo "PASS: scope-manifest checks succeeded."
+exit 0

From 60a147a2acf8694c69ab77b9dc0edca78523d812 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 1 May 2026 04:07:17 +0000
Subject: [PATCH 5/7] feat: add 5 scope-enforcement hooks (PreToolUse, UserPromptSubmit, Stop, SubagentStop, PreCompact)

Agent-Logs-Url: https://github.com/GoCodeAlone/claude-superpowers/sessions/eb1c005b-4d4e-4c5d-9f2f-07785768e3f7
Co-authored-by: intel352 <77607+intel352@users.noreply.github.com>
---
 hooks/completion-claim-guard | 86 +++++++++++++++++
 hooks/hooks.json | 60 ++++++++++++
 hooks/pre-compact-snapshot | 109 +++++++++++++++++++++
 hooks/pre-tool-scope-guard | 136 ++++++++++++++++++++++++++
 hooks/prompt-strict-interpretation | 147 +++++++++++++++++++++++++++++
 hooks/subagent-scope-guard | 106 +++++++++++++++++++++
 skills/using-superpowers/SKILL.md | 2 +
 7 files changed, 646 insertions(+)
 create mode 100755 hooks/completion-claim-guard
 create mode 100755 hooks/pre-compact-snapshot
 create mode 100755 hooks/pre-tool-scope-guard
 create mode 100755 hooks/prompt-strict-interpretation
 create mode 100755
hooks/subagent-scope-guard
diff --git a/hooks/completion-claim-guard b/hooks/completion-claim-guard
new file mode 100755
index 0000000..3f5d564
--- /dev/null
+++ b/hooks/completion-claim-guard
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# hooks/completion-claim-guard
+# Stop hook: blocks the agent from declaring success while a locked plan's
+# manifest hash fails verification — which indicates the plan was tampered with
+# directly (bypassing the unlock path) or the scope-lock file was corrupted.
+#
+# Also blocks if the plan-scope-check script reports structural problems with
+# any locked plan's manifest (inconsistent PR count, missing task IDs, etc.).
+#
+# To prevent infinite loops the hook honours the stop_hook_active flag: once
+# the agent has been told to continue and tries to stop again, the hook exits 0
+# and lets the stop proceed (the agent has had one chance to self-correct).
+#
+# Global opt-out: set SUPERPOWERS_HOOKS_DISABLE=1
+
+set -euo pipefail
+
+[ "${SUPERPOWERS_HOOKS_DISABLE:-}" = "1" ] && exit 0
+
+# Require stdin (Stop always sends a JSON payload).
+[ -t 0 ] && exit 0
+command -v jq >/dev/null 2>&1 || exit 0
+
+hook_input=$(cat || true)
+[ -z "$hook_input" ] && exit 0
+
+cwd_dir=$(printf '%s' "$hook_input" | jq -r '.cwd // empty' 2>/dev/null || true)
+[ -z "$cwd_dir" ] && cwd_dir="${PWD}"
+
+# Honour stop_hook_active to avoid infinite loops.
+# When true the agent was already told to continue once; let the stop proceed.
+stop_hook_active=$(printf '%s' "$hook_input" | jq -r '.stop_hook_active // false' 2>/dev/null || echo "false")
+[ "$stop_hook_active" = "true" ] && exit 0
+
+# Emit a block decision and exit 2. The reason is written to stderr as well as
+# into the stdout JSON, because on exit 2 Claude Code relays stderr to the agent.
+block() {
+  local reason="$1"
+  printf '%s\n' "$reason" >&2
+  printf '{"decision":"block","reason":%s}\n' \
+    "$(printf '%s' "$reason" | jq -Rs .)"
+  exit 2
+}
+
+# ── Find locked plans ────────────────────────────────────────────────────────
+plans_dir="${cwd_dir}/docs/plans"
+[ -d "$plans_dir" ] || exit 0
+
+locked_plans=$(grep -rl '\*\*Status:\*\* Locked' "$plans_dir" 2>/dev/null \
+  | grep '\.md$' | grep -v '\.scope-lock' || true)
+
+[ -z "$locked_plans" ] && exit 0  # no locked plans — nothing to guard
+
+checker="${cwd_dir}/tests/plan-scope-check.sh"
+[ -x "$checker" ] || exit 0  # no executable checker — nothing can be verified
+
+failures=""
+
+while IFS= read -r plan; do
+  [ -z "$plan" ] && continue
+  plan_name=$(basename "$plan")
+
+  # ── Hash verification: was the manifest edited behind the lock? ──────────
+  if ! bash "$checker" --verify-lock "$plan" >/dev/null 2>&1; then
+    failures="${failures}  • ${plan_name}: Scope Manifest hash does not match the lock file (.scope-lock). The manifest was edited after it was locked, bypassing the unlock path."$'\n'
+  fi
+
+  # ── Structural well-formedness: is the manifest internally consistent? ───
+  if ! bash "$checker" --plan "$plan" >/dev/null 2>&1; then
+    failures="${failures}  • ${plan_name}: Scope Manifest is structurally inconsistent (PR count, task list, or table mismatch)."$'\n'
+  fi
+done <<< "$locked_plans"
+
+[ -z "$failures" ] && exit 0
+
+block "Completion blocked — scope-manifest integrity problems detected before stop:
+
+${failures}
+Resolve these before declaring done:
+  1. If the manifest was edited directly, go through the unlock path:
+     recording-decisions → update manifest → re-run alignment-check.
+  2. If the structural check failed, fix the manifest to match the plan body.
+  3. After resolving, run: tests/plan-scope-check.sh --verify-lock <plan>
+     and confirm it exits 0 before stopping.
+
+See skills/scope-lock/SKILL.md for the unlock path."
diff --git a/hooks/hooks.json b/hooks/hooks.json
index 49087bd..219bb35 100644
--- a/hooks/hooks.json
+++ b/hooks/hooks.json
@@ -12,6 +12,18 @@
         ]
       }
     ],
+    "PreToolUse": [
+      {
+        "matcher": "Bash|Write|Edit|MultiEdit",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "'${CLAUDE_PLUGIN_ROOT}/hooks/run-hook.cmd' pre-tool-scope-guard",
+            "timeout": 15
+          }
+        ]
+      }
+    ],
     "PostToolUse": [
       {
         "matcher": "Skill|Agent|Task.*",
@@ -23,6 +35,54 @@
           }
         ]
       }
+    ],
+    "UserPromptSubmit": [
+      {
+        "matcher": ".*",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "'${CLAUDE_PLUGIN_ROOT}/hooks/run-hook.cmd' prompt-strict-interpretation",
+            "timeout": 10
+          }
+        ]
+      }
+    ],
+    "Stop": [
+      {
+        "matcher": ".*",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "'${CLAUDE_PLUGIN_ROOT}/hooks/run-hook.cmd' completion-claim-guard",
+            "timeout": 30
+          }
+        ]
+      }
+    ],
+    "SubagentStop": [
+      {
+        "matcher": ".*",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "'${CLAUDE_PLUGIN_ROOT}/hooks/run-hook.cmd' subagent-scope-guard",
+            "timeout": 15
+          }
+        ]
+      }
+    ],
+    "PreCompact": [
+      {
+        "matcher": ".*",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "'${CLAUDE_PLUGIN_ROOT}/hooks/run-hook.cmd' pre-compact-snapshot",
+            "timeout": 10
+          }
+        ]
+      }
     ]
   }
 }
diff --git a/hooks/pre-compact-snapshot b/hooks/pre-compact-snapshot
new file mode 100755
index 0000000..1d5ca82
--- /dev/null
+++ b/hooks/pre-compact-snapshot
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+# hooks/pre-compact-snapshot
+# PreCompact hook: snapshots the current lock state before a context compaction.
+#
+# Writes each locked plan's name and manifest hash to the superpowers-state
+# file so that the SessionStart hook can detect lock-state changes after the
+# compacted context is restored (e.g., if a plan was locked or unlocked during
+# a long session and then the context was compacted, the resumption context will
+# include the lock state so the agent does not re-derive intent).
+#
+# Also injects the snapshot as additional_context so the compacted transcript
+# itself retains the lock state.
+#
+# Global opt-out: set SUPERPOWERS_HOOKS_DISABLE=1
+
+set -euo pipefail
+
+[ "${SUPERPOWERS_HOOKS_DISABLE:-}" = "1" ] && exit 0
+
+[ -t 0 ] && exit 0
+command -v jq >/dev/null 2>&1 || exit 0
+
+hook_input=$(cat || true)
+[ -z "$hook_input" ] && exit 0
+
+cwd_dir=$(printf '%s' "$hook_input" | jq -r '.cwd // empty' 2>/dev/null || true)
+[ -z "$cwd_dir" ] && cwd_dir="${PWD}"
+
+# ── Portable sha256 of a file; prints "unavailable" when no sha tool exists ──
+sha256_file() {
+  if command -v sha256sum >/dev/null 2>&1; then
+    sha256sum "$1" 2>/dev/null | awk '{print $1}'
+  elif command -v shasum >/dev/null 2>&1; then
+    shasum -a 256 "$1" 2>/dev/null | awk '{print $1}'
+  else
+    echo "unavailable"
+  fi
+}
+
+# ── Collect one status line per plan file (locked or not) ────────────────────
+plans_dir="${cwd_dir}/docs/plans"
+state_section=""
+
+if [ -d "$plans_dir" ]; then
+  while IFS= read -r plan; do
+    [ -z "$plan" ] && continue
+    plan_name=$(basename "$plan")
+    lock_file="${plan}.scope-lock"
+
+    # Last **Status:** line in the plan, with the label stripped
+    status_line=$(grep '\*\*Status:\*\*' "$plan" 2>/dev/null | tail -1 | sed 's/.*\*\*Status:\*\*[[:space:]]*//' || true)
+
+    # sha256 of the .scope-lock file itself (not of the manifest section)
+    if [ -f "$lock_file" ]; then
+      lock_hash=$(sha256_file "$lock_file")
+      state_section="${state_section}  ${plan_name}: ${status_line} (lock-file sha256: ${lock_hash})"$'\n'
+    else
+      state_section="${state_section}  ${plan_name}: ${status_line} (no lock file — not yet locked or lock file missing)"$'\n'
+    fi
+  done < <(find "$plans_dir" -maxdepth 1 -name '*.md' \
+    ! -name '*-design.md' ! -name 'README.md' 2>/dev/null \
+    | grep -v '\.scope-lock' | sort || true)
+fi
+
+if [ -z "$state_section" ]; then
+  # No plans found — nothing to snapshot. Exit silently.
+  exit 0
+fi
+
+# ── Append to superpowers-state file (best effort; failures are ignored) ─────
+STATE_DIR="${cwd_dir}/.claude/superpowers-state"
+mkdir -p "$STATE_DIR" 2>/dev/null || true
+STATE_FILE="${STATE_DIR}/in-progress.jsonl"
+LOCK_FILE="${STATE_DIR}/.in-progress.lock"  # not referenced below
+
+ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+entry=$(jq -nc \
+  --arg ts "$ts" \
+  --arg tool "PreCompact" \
+  --arg detail "lock-snapshot: ${state_section}" \
+  '{ts: $ts, tool: $tool, detail: $detail}' 2>/dev/null || true)
+
+if [ -n "$entry" ]; then
+  printf '%s\n' "$entry" >> "$STATE_FILE" 2>/dev/null || true
+fi
+
+# ── Inject as additional_context ─────────────────────────────────────────────
+snapshot_text="<superpowers-lock-snapshot ts=\"${ts}\">
+Pre-compaction scope-lock state (verify these after resuming):
+${state_section}
+If any plan shows Locked status, treat the locked manifest as the authoritative
+contract. Do not re-derive scope from the conversation history. Use
+tests/plan-scope-check.sh --verify-lock <plan> to confirm the hash is intact.
+</superpowers-lock-snapshot>"
+
+escape_for_json() {
+  local s="$1"
+  s="${s//\\/\\\\}"  # backslashes first, so later escapes are not doubled
+  s="${s//\"/\\\"}"
+  s="${s//$'\n'/\\n}"
+  s="${s//$'\r'/\\r}"
+  s="${s//$'\t'/\\t}"
+  printf '%s' "$s"
+}
+
+snapshot_escaped=$(escape_for_json "$snapshot_text")
+
+printf '{"additional_context":"%s"}\n' "$snapshot_escaped"
+exit 0
diff --git a/hooks/pre-tool-scope-guard b/hooks/pre-tool-scope-guard
new file mode 100755
index 0000000..a55a13c
--- /dev/null
+++ b/hooks/pre-tool-scope-guard
@@ -0,0 +1,136 @@
+#!/usr/bin/env bash
+# hooks/pre-tool-scope-guard
+# PreToolUse hook: blocks destructive git operations and protected-file edits
+# that could violate a locked scope manifest.
+# +# Blocks: +# Bash tool — git push --force / --force-with-lease / -f (always; rewrites remote history) +# — git rebase -i / git reset --hard (always; rewrites local history) +# — git push / gh pr create when locked plan hash mismatches +# — git push / git commit to main or master (unless SUPERPOWERS_ALLOW_DEFAULT_BRANCH=1) +# Write / Edit / MultiEdit tool +# — *.scope-lock files (unless SUPERPOWERS_SCOPE_LOCK_WRITE=1) +# — docs/plans/*.md when plan is Locked (unless SUPERPOWERS_PLAN_LOCK_WRITE=1) +# +# Global opt-out: set SUPERPOWERS_HOOKS_DISABLE=1 + +set -euo pipefail + +[ "${SUPERPOWERS_HOOKS_DISABLE:-}" = "1" ] && exit 0 + +# Require stdin (PreToolUse always sends a JSON payload). +[ -t 0 ] && exit 0 +command -v jq >/dev/null 2>&1 || exit 0 + +hook_input=$(cat || true) +[ -z "$hook_input" ] && exit 0 + +tool_name=$(printf '%s' "$hook_input" | jq -r '.tool_name // empty' 2>/dev/null || true) +cwd_dir=$(printf '%s' "$hook_input" | jq -r '.cwd // empty' 2>/dev/null || true) +[ -z "$cwd_dir" ] && cwd_dir="${PWD}" + +# Output a block decision and exit 2 (the exit code Claude Code uses for blocks). +block() { + local reason="$1" + printf '{"decision":"block","reason":%s}\n' \ + "$(printf '%s' "$reason" | jq -Rs .)" + exit 2 +} + +# Return the path of the first locked plan found under docs/plans/, or empty. +find_locked_plan() { + local plans_dir="${cwd_dir}/docs/plans" + [ -d "$plans_dir" ] || return 0 + grep -rl '\*\*Status:\*\* Locked' "$plans_dir" 2>/dev/null \ + | grep '\.md$' | grep -v '\.scope-lock' | head -1 || true +} + +# Run plan-scope-check --verify-lock; returns 0 if hash matches. 
+verify_lock() { + local plan="$1" + local checker="${cwd_dir}/tests/plan-scope-check.sh" + [ -x "$checker" ] || return 0 # no checker present — pass through + bash "$checker" --verify-lock "$plan" >/dev/null 2>&1 +} + +case "$tool_name" in + Bash) + cmd=$(printf '%s' "$hook_input" | jq -r '.tool_input.command // empty' 2>/dev/null || true) + [ -z "$cmd" ] && exit 0 + + # ── 1. Force push (always blocked) ────────────────────────────────────── + # Catches: --force, --force-with-lease, -f flag + # These overwrite remote refs and lose commits for anyone who already pulled. + if printf '%s' "$cmd" | grep -q 'git push' && \ + printf '%s' "$cmd" | grep -qE '(--force-with-lease|--force| -f( |$)| -f$)'; then + block "Force push blocked — rewrites remote history and permanently discards commits that others may have pulled. During autonomous pipeline execution this is never acceptable. If the remote branch genuinely needs correction, stop and get explicit user approval. Unlock path: ask the user, then proceed manually with SUPERPOWERS_HOOKS_DISABLE=1 scoped to that one command." + fi + + # ── 2. Local history rewrites (always blocked) ────────────────────────── + # git rebase -i / --interactive: interactive rebase rewrites commit SHAs. The flag must be its own token within the same command, so refs such as feature-init do not match. + # git reset --hard: destructively discards work-tree and commit history. + if printf '%s' "$cmd" | grep -qE 'git (rebase[[:space:]]+([^;&|]*[[:space:]])?(-i|--interactive)([[:space:];&|]|$)|reset[[:space:]]+--hard)'; then + block "Destructive history rewrite blocked (git rebase -i or git reset --hard). These commands alter commit history and can discard work irreversibly. Stop and ask the user whether this is genuinely intended before proceeding." + fi + + # ── 3. push / PR creation while locked plan hash mismatches ───────────── + # Catches post-lock plan tampering before anything reaches the remote. + if printf '%s' "$cmd" | grep -qE '(git push|gh pr create|gh pr merge)'; then + locked_plan=$(find_locked_plan) + if [ -n "$locked_plan" ]; then + if !
verify_lock "$locked_plan"; then + block "Locked plan hash mismatch: $(basename "$locked_plan") — the Scope Manifest has been modified since it was locked. This means either the plan was edited directly (bypassing the unlock path) or the scope-lock file was corrupted. Resolve via the scope-lock unlock path (recording-decisions → update manifest → re-run alignment-check) before pushing or creating PRs." + fi + fi + fi + + # ── 4. Push/commit to default branch (blocked unless opt-out) ─────────── + if printf '%s' "$cmd" | grep -qE 'git (push|commit)' && \ + [ "${SUPERPOWERS_ALLOW_DEFAULT_BRANCH:-}" != "1" ]; then + # Explicit main/master push target (the name must end the ref token, so branches like main-fix or mainline are not caught) + if printf '%s' "$cmd" | grep -qE 'git push.+(origin[[:space:]]+(main|master)|HEAD:(main|master)|(main|master):[[:space:]]*(main|master))([[:space:]:;&|]|$)'; then + block "Pushing directly to main or master is blocked during autonomous pipeline execution. Work on a feature branch and open a PR for review. If this is genuinely intentional, set SUPERPOWERS_ALLOW_DEFAULT_BRANCH=1 in your environment and retry the command manually." + fi + # Bare 'git push' or 'git push origin' when current branch is main/master + if printf '%s' "$cmd" | grep -qE 'git push[[:space:]]*(origin[[:space:]]*)?$'; then + current_branch=$(cd "$cwd_dir" && git rev-parse --abbrev-ref HEAD 2>/dev/null || true) + if [ "$current_branch" = "main" ] || [ "$current_branch" = "master" ]; then + block "Current branch is '${current_branch}' (the default branch). Pushing directly to main or master is blocked during autonomous pipeline execution. Switch to a feature branch first. To override, set SUPERPOWERS_ALLOW_DEFAULT_BRANCH=1." + fi + fi + fi + ;; + + Write|Edit|MultiEdit) + # Collect all file paths involved in the operation.
+ if [ "$tool_name" = "MultiEdit" ]; then + file_paths=$(printf '%s' "$hook_input" \ + | jq -r '(.tool_input.file_path // empty), (.tool_input.edits[]?.file_path // empty)' 2>/dev/null || true) # MultiEdit carries one top-level file_path; per-edit paths are read too in case a host supplies them + else + file_paths=$(printf '%s' "$hook_input" \ + | jq -r '.tool_input.file_path // .tool_input.path // empty' 2>/dev/null || true) + fi + + while IFS= read -r fpath; do + [ -z "$fpath" ] && continue + + # ── 5. scope-lock files (always blocked unless sentinel env var) ───── + if printf '%s' "$fpath" | grep -qE '\.scope-lock$'; then + if [ "${SUPERPOWERS_SCOPE_LOCK_WRITE:-}" != "1" ]; then + block "Writing to '$(basename "$fpath")' is blocked — .scope-lock files are written exclusively by the scope-lock skill during alignment-check PASS. Direct edits break the manifest integrity guarantee and allow silent scope tampering. To update the lock legitimately: go through the unlock path (recording-decisions → update manifest → re-run alignment-check), which will regenerate the lock file." + fi + fi + + # ── 6. Locked plan files (blocked unless sentinel env var) ────────── + if printf '%s' "$fpath" | grep -qE 'docs/plans/[^/]+\.md$'; then + if [ "${SUPERPOWERS_PLAN_LOCK_WRITE:-}" != "1" ] && [ -f "$fpath" ]; then + if grep -q '\*\*Status:\*\* Locked' "$fpath" 2>/dev/null; then + block "Editing '$(basename "$fpath")' is blocked — this plan is Locked. Direct plan edits bypass the unlock path and silently break the manifest hash. To modify the plan: (1) invoke recording-decisions to document the approved scope reduction as an ADR, (2) follow the scope-lock unlock path (update manifest → re-stamp lock). The unlock path is defined in skills/scope-lock/SKILL.md."
+ fi + fi + fi + done <<< "$file_paths" + ;; +esac + +exit 0 diff --git a/hooks/prompt-strict-interpretation b/hooks/prompt-strict-interpretation new file mode 100755 index 0000000..b9d0bfb --- /dev/null +++ b/hooks/prompt-strict-interpretation @@ -0,0 +1,147 @@ +#!/usr/bin/env bash +# hooks/prompt-strict-interpretation +# UserPromptSubmit hook: injects a strict-interpretation reminder when the +# user's prompt contains phrases that agents have historically used as license +# to rescope, collapse PRs, or skip pipeline discipline — and a locked plan +# exists in the current workspace. +# +# Does NOT block the prompt. Injects additional_context so the model sees the +# reminder before it starts planning its response. +# +# Global opt-out: set SUPERPOWERS_HOOKS_DISABLE=1 + +set -euo pipefail + +[ "${SUPERPOWERS_HOOKS_DISABLE:-}" = "1" ] && exit 0 + +# Require stdin (UserPromptSubmit always sends a JSON payload). +[ -t 0 ] && exit 0 +command -v jq >/dev/null 2>&1 || exit 0 + +hook_input=$(cat || true) +[ -z "$hook_input" ] && exit 0 + +prompt=$(printf '%s' "$hook_input" | jq -r '.prompt // empty' 2>/dev/null || true) +[ -z "$prompt" ] && exit 0 + +cwd_dir=$(printf '%s' "$hook_input" | jq -r '.cwd // empty' 2>/dev/null || true) +[ -z "$cwd_dir" ] && cwd_dir="${PWD}" + +# ── Check for trigger phrases ──────────────────────────────────────────────── +# These are phrases that agents have used as license to rescope on a locked plan. +# Matching is case-insensitive. The list is intentionally broad because the cost +# of a false positive (an extra reminder) is lower than the cost of a missed hit +# (silent rescoping).
+ +matched_phrase="" + +check_phrase() { # $1: case-insensitive ERE tested against the prompt; $2: label stored in matched_phrase on a hit (the last matching call wins) + local pattern="$1" + local label="$2" + if printf '%s' "$prompt" | grep -qiE "$pattern"; then + matched_phrase="$label" + return 0 + fi + return 1 +} + +# Scope/task reordering +check_phrase 'reorder[[:space:]]+(as[[:space:]]+needed|the[[:space:]]+(tasks?|steps?|prs?|work))' \ + '"reorder as needed"' || true +# PR creation shortcuts +check_phrase '(create|open|submit|make|push|ship)[[:space:]]+(a[[:space:]]+)?pr\b' \ + '"create a PR"' || true +# Local-only testing as a skip +check_phrase 'test[[:space:]]+local(ly)?|local[[:space:]]+(test|testing|run)' \ + '"test locally"' || true +# Demo-mode shortcuts +check_phrase 'ship[[:space:]]+(a[[:space:]]+)?demo|demo[[:space:]]+(mode|it|this|version)|quick[[:space:]]+demo' \ + '"ship a demo"' || true +# Speed shortcuts +check_phrase '\b(be[[:space:]]+(quick|fast|efficient|brief)|hurry|rush)\b' \ + '"be quick/efficient"' || true +# Vague delegation / scope bypass +check_phrase '(make[[:space:]]+it[[:space:]]+work|just[[:space:]]+(get[[:space:]]+)?something[[:space:]]+(working|done|shipped))' \ + '"make it work"' || true +check_phrase '(do[[:space:]]+whatever[[:space:]]+you[[:space:]]+(think|want)|whatever[[:space:]]+you[[:space:]]+(think|want|feel)[[:space:]]+is[[:space:]]+best)' \ + '"do whatever you think is best"' || true +# Autonomous / auto-mode (new) — these phrases suggest the agent should +# operate without discipline checks, which they explicitly do NOT authorize. The auto mode / auto pilot pattern accepts a space, a hyphen, or nothing between the two words. +check_phrase '(go|run|continue|proceed|work|operate)[[:space:]]+(fully[[:space:]]+)?autonomous(ly)?' \ + '"run autonomously"' || true +check_phrase 'auto[[:space:]-]?mode|auto[[:space:]-]?pilot' \ + '"auto mode"' || true +check_phrase '\bautonomous(ly)?\b' \ + '"autonomously"' || true +# "go ahead" / "go ahead and" — often used to grant blanket permission +check_phrase 'go[[:space:]]+ahead([[:space:]]+and)?'
\ + '"go ahead"' || true +# "just do it" / "just continue" — vague authorization +check_phrase 'just[[:space:]]+(do|continue|proceed|execute|run)[[:space:]]+(it|this|that|everything|the[[:space:]]+rest)?' \ + '"just do it"' || true +# Explicit requests to skip checkpoints +check_phrase '(without[[:space:]]+(stopping|pausing|checking|asking)|no[[:space:]]+(need[[:space:]]+to[[:space:]]+(stop|pause|check|ask|confirm))|don.?t[[:space:]]+(stop|pause|ask|check|confirm))' \ + '"skip checkpoints"' || true + +# No trigger phrase found — nothing to inject. +[ -z "$matched_phrase" ] && exit 0 + +# ── Check for a locked plan ────────────────────────────────────────────────── +plans_dir="${cwd_dir}/docs/plans" +locked_plan="" +locked_plan_name="" + +if [ -d "$plans_dir" ]; then + locked_plan=$(grep -rl '\*\*Status:\*\* Locked' "$plans_dir" 2>/dev/null \ + | grep '\.md$' | grep -v '\.scope-lock' | head -1 || true) +fi + +# No locked plan — the invariant only applies under a lock. +[ -z "$locked_plan" ] && exit 0 + +locked_plan_name=$(basename "$locked_plan") + +# ── Inject strict-interpretation reminder ──────────────────────────────────── +reminder=$(cat <<REMINDER +⚠️ STRICT-INTERPRETATION GATE — LOCKED PLAN IN EFFECT + +Detected phrase: ${matched_phrase} +Locked plan: ${locked_plan_name} + +This phrase has historically been used as license to rescope, collapse PRs, or +skip pipeline discipline. Under a Locked plan it authorizes NONE of those things.
+ +Mandated interpretations (from skills/using-superpowers/SKILL.md): + + "reorder as needed" → reorder tasks within the same PR only; manifest unchanged + "create a PR" → create the number of PRs in the Scope Manifest's PR Grouping table + "test locally" → run every task's declared verification steps; CI still runs + "make it work" → implement the full manifest; surface blockers, do not trim scope + "ship a demo" → no demo mode exists; ship the locked manifest or invoke unlock path + "be quick / be efficient" → parallelism, not skipping + "go autonomous / auto mode"→ no discipline bypass; pipeline runs in full regardless + "go ahead / just do it" → proceed with the locked manifest; gates still apply + +If the user's intent genuinely conflicts with the locked manifest, STOP and ask. +Do not pick a looser interpretation and proceed. The locked manifest wins until +the user goes through the unlock path (recording-decisions + re-run alignment-check). + +See skills/scope-lock/SKILL.md for the unlock path. +REMINDER +) + +# Escape for JSON embedding. +escape_for_json() { + local s="$1" + s="${s//\\/\\\\}" + s="${s//\"/\\\"}" + s="${s//$'\n'/\\n}" + s="${s//$'\r'/\\r}" + s="${s//$'\t'/\\t}" + printf '%s' "$s" +} + +reminder_escaped=$(escape_for_json "$reminder") + +printf '{"additional_context":"%s"}\n' "$reminder_escaped" +exit 0 diff --git a/hooks/subagent-scope-guard b/hooks/subagent-scope-guard new file mode 100755 index 0000000..a000298 --- /dev/null +++ b/hooks/subagent-scope-guard @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +# hooks/subagent-scope-guard +# SubagentStop hook: checks that a subagent did not modify protected files +# (*.scope-lock, locked docs/plans/*.md) as part of its work. +# +# A subagent should never write scope-lock files directly (only alignment-check +# via the scope-lock skill does that) and should never edit a locked plan file +# (only the unlock path does that).
If either is detected the subagent's stop +# is blocked so the lead agent is alerted before the subagent's changes are +# accepted. +# +# Honours the stop_hook_active flag to avoid infinite loops — the guard fires once. +# +# Global opt-out: set SUPERPOWERS_HOOKS_DISABLE=1 + +set -euo pipefail + +[ "${SUPERPOWERS_HOOKS_DISABLE:-}" = "1" ] && exit 0 + +[ -t 0 ] && exit 0 +command -v jq >/dev/null 2>&1 || exit 0 + +hook_input=$(cat || true) +[ -z "$hook_input" ] && exit 0 + +cwd_dir=$(printf '%s' "$hook_input" | jq -r '.cwd // empty' 2>/dev/null || true) +[ -z "$cwd_dir" ] && cwd_dir="${PWD}" + +stop_hook_active=$(printf '%s' "$hook_input" | jq -r '.stop_hook_active // false' 2>/dev/null || echo "false") +[ "$stop_hook_active" = "true" ] && exit 0 + +block() { # Emit the block decision as JSON on stdout and as plain text on stderr (the channel read on exit 2), then exit 2. + local reason="$1" + printf '{"decision":"block","reason":%s}\n' \ + "$(printf '%s' "$reason" | jq -Rs .)" + printf '%s\n' "$reason" >&2; exit 2 +} + +# ── Detect uncommitted or recently committed protected-file changes ─────────── +# We check both the working tree (uncommitted) and the last commit on HEAD +# (just committed but not yet reviewed by the lead). Each violation is appended as one newline-terminated bullet. +violations="" + +# Working-tree and index modifications to protected files +if command -v git >/dev/null 2>&1; then + ( + cd "$cwd_dir" 2>/dev/null || exit 0 + + # Uncommitted changes to scope-lock files + scope_lock_dirty=$(git status --porcelain 2>/dev/null \ + | awk '{print $2}' | grep '\.scope-lock$' || true) + if [ -n "$scope_lock_dirty" ]; then + while IFS= read -r f; do + violations="${violations} • Uncommitted change to scope-lock file: ${f}"$'\n' + done <<< "$scope_lock_dirty" + fi + + # Uncommitted changes to docs/plans/*.md + plan_dirty=$(git status --porcelain 2>/dev/null \ + | awk '{print $2}' | grep '^docs/plans/.*\.md$' | grep -v '\.scope-lock' || true) + if [ -n "$plan_dirty" ]; then + while IFS= read -r f; do + violations="${violations} • Uncommitted change to plan file: ${f}"$'\n' + done <<< "$plan_dirty" + fi + + # Last commit: did it touch scope-lock or locked plan files?
+ if git rev-parse --verify HEAD >/dev/null 2>&1; then + last_commit_files=$(git diff-tree --no-commit-id -r --name-only HEAD 2>/dev/null || true) + + lock_in_commit=$(printf '%s' "$last_commit_files" | grep '\.scope-lock$' || true) + if [ -n "$lock_in_commit" ]; then + while IFS= read -r f; do + violations="${violations} • Last commit modified scope-lock file: ${f}"$'\n' + done <<< "$lock_in_commit" + fi + + plans_in_commit=$(printf '%s' "$last_commit_files" \ + | grep '^docs/plans/.*\.md$' | grep -v '\.scope-lock' || true) + if [ -n "$plans_in_commit" ]; then + while IFS= read -r f; do + # Only flag if the plan is currently Locked + if [ -f "$f" ] && grep -q '\*\*Status:\*\* Locked' "$f" 2>/dev/null; then + violations="${violations} • Last commit modified a Locked plan file: ${f}"$'\n' + fi + done <<< "$plans_in_commit" + fi + fi + ) +fi + +[ -z "$violations" ] && exit 0 + +block "Subagent stop blocked — protected files were modified: + +${violations} +Subagents must not write .scope-lock files or edit Locked plan files directly. +These files are controlled exclusively by the scope-lock skill via the unlock path. + +Before this subagent's changes are accepted: + 1. Revert the protected-file changes (git checkout -- <file>). + 2. If a scope change is genuinely needed, surface it to the lead agent and + go through the unlock path: recording-decisions → update manifest → + re-run alignment-check. + +See skills/scope-lock/SKILL.md for the unlock path."
diff --git a/skills/using-superpowers/SKILL.md b/skills/using-superpowers/SKILL.md index 182e37d..72474b1 100644 --- a/skills/using-superpowers/SKILL.md +++ b/skills/using-superpowers/SKILL.md @@ -113,6 +113,8 @@ When the autonomous pipeline is running and a user instruction is **ambiguous**, | "ship a demo" | partial scope + happy-path-only tests | there is no demo mode; either ship the locked manifest or invoke the unlock path | | "do whatever you think is best" | unilaterally restructure plan | do the locked manifest; surface choices not covered by the manifest | | "be efficient" / "be quick" | drop tests, drop reviews, drop tasks | run the pipeline at full discipline; speed comes from parallelism, not from skipping | +| "go autonomous" / "run autonomously" / "auto mode" / "auto-mode" | treat as license to rescope, collapse PRs, skip pipeline discipline, or remove checkpoints | "autonomous" means running without interrupting the user — it does not relax any pipeline rule; all gates, checkpoints, and manifest constraints still apply | +| "go ahead" / "continue" / "just do it" | proceed without pipeline gates; treat as blanket permission to rescope or collapse | proceed with the locked manifest as defined; if the next step is ambiguous under the lock, stop and ask rather than picking the looser reading | **When multiple strict interpretations remain plausible**, the agent stops and asks. Picking one and proceeding is not allowed. The cheapest place to catch a misinterpretation is before any commit; the most expensive is after a PR is opened. 
From f96d5fbe761d11b2b6f2c19b425b098c1cf9ce02 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 1 May 2026 04:15:39 +0000 Subject: [PATCH 6/7] fix: block agent self-bypass via SUPERPOWERS_* env var assignment in Bash commands Agent-Logs-Url: https://github.com/GoCodeAlone/claude-superpowers/sessions/333dd4a3-c249-43c5-911a-56193578cb9c Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> --- hooks/pre-tool-scope-guard | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/hooks/pre-tool-scope-guard b/hooks/pre-tool-scope-guard index a55a13c..5d57384 100755 --- a/hooks/pre-tool-scope-guard +++ b/hooks/pre-tool-scope-guard @@ -4,7 +4,8 @@ # that could violate a locked scope manifest. # # Blocks: -# Bash tool — git push --force / --force-with-lease / -f (always; rewrites remote history) +# Bash tool — SUPERPOWERS_* env var assignment (self-bypass prevention; always, before disable check) +# — git push --force / --force-with-lease / -f (always; rewrites remote history) # — git rebase -i / git reset --hard (always; rewrites local history) # — git push / gh pr create when locked plan hash mismatches # — git push / git commit to main or master (unless SUPERPOWERS_ALLOW_DEFAULT_BRANCH=1) @@ -12,13 +13,14 @@ # — *.scope-lock files (unless SUPERPOWERS_SCOPE_LOCK_WRITE=1) # — docs/plans/*.md when plan is Locked (unless SUPERPOWERS_PLAN_LOCK_WRITE=1) # -# Global opt-out: set SUPERPOWERS_HOOKS_DISABLE=1 +# Global opt-out: set SUPERPOWERS_HOOKS_DISABLE=1 in the *operator's* terminal +# environment before starting the session. The agent cannot set this itself. set -euo pipefail -[ "${SUPERPOWERS_HOOKS_DISABLE:-}" = "1" ] && exit 0 - # Require stdin (PreToolUse always sends a JSON payload). +# Read input BEFORE the SUPERPOWERS_HOOKS_DISABLE check so the self-bypass +# detector can fire regardless of the current disable-flag state. 
[ -t 0 ] && exit 0 command -v jq >/dev/null 2>&1 || exit 0 @@ -58,6 +60,21 @@ case "$tool_name" in cmd=$(printf '%s' "$hook_input" | jq -r '.tool_input.command // empty' 2>/dev/null || true) [ -z "$cmd" ] && exit 0 + # ── 0. Self-bypass prevention (ALWAYS first; NOT gated by SUPERPOWERS_HOOKS_DISABLE) ── + # An agent setting a SUPERPOWERS_* env var poisons its own shell session for + # subsequent tool calls, allowing it to bypass every downstream gate. + # Patterns caught: + # export SUPERPOWERS_HOOKS_DISABLE=1 + # SUPERPOWERS_ALLOW_DEFAULT_BRANCH=1 git push origin main + # env SUPERPOWERS_SCOPE_LOCK_WRITE=1 bash -c '...' + # SUPERPOWERS_PLAN_LOCK_WRITE=1; git commit ... (before semicolon) + if printf '%s' "$cmd" | grep -qE '(^|[;&|[:space:]])(export[[:space:]]+)?SUPERPOWERS_[A-Z_]+='; then + block "Self-bypass attempt blocked: setting a SUPERPOWERS_* environment variable from inside a Bash tool call is not permitted. These variables control pipeline enforcement gates and may only be set by the human operator in their terminal environment before starting the agent session. The agent cannot grant itself permission to bypass its own constraints — that would make the gates meaningless." + fi + + # ── Global opt-out (operator-level only, checked after self-bypass guard) ── + [ "${SUPERPOWERS_HOOKS_DISABLE:-}" = "1" ] && exit 0 + # ── 1. Force push (always blocked) ────────────────────────────────────── # Catches: --force, --force-with-lease, -f flag # These overwrite remote refs and lose commits for anyone who already pulled. @@ -102,6 +119,9 @@ case "$tool_name" in ;; Write|Edit|MultiEdit) + # Operator-level global opt-out for file-write guards. + [ "${SUPERPOWERS_HOOKS_DISABLE:-}" = "1" ] && exit 0 + # Collect all file paths involved in the operation. 
if [ "$tool_name" = "MultiEdit" ]; then file_paths=$(printf '%s' "$hook_input" \ From ee9ea537a59fa2c49067f1e0a035908441bcabfc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 1 May 2026 04:34:17 +0000 Subject: [PATCH 7/7] fix: address all 9 PR review feedback items Agent-Logs-Url: https://github.com/GoCodeAlone/claude-superpowers/sessions/333f45fb-54f9-4471-a162-9d3bdd01cb0e Co-authored-by: intel352 <77607+intel352@users.noreply.github.com> --- hooks/pre-compact-snapshot | 19 +++++-------------- hooks/pre-tool-scope-guard | 9 ++++++++- hooks/subagent-scope-guard | 8 +++++--- skills/scope-lock/SKILL.md | 2 +- tests/plan-scope-check.sh | 13 +++++-------- tests/skill-activation-audit.sh | 15 ++++++++++++--- tests/skill-cross-refs.sh | 19 +++++++++++-------- 7 files changed, 47 insertions(+), 38 deletions(-) diff --git a/hooks/pre-compact-snapshot b/hooks/pre-compact-snapshot index 1d5ca82..18a179d 100755 --- a/hooks/pre-compact-snapshot +++ b/hooks/pre-compact-snapshot @@ -26,17 +26,6 @@ hook_input=$(cat || true) cwd_dir=$(printf '%s' "$hook_input" | jq -r '.cwd // empty' 2>/dev/null || true) [ -z "$cwd_dir" ] && cwd_dir="${PWD}" -# ── Portable sha256 (same helper as tests/plan-scope-check.sh) ─────────────── -sha256_file() { - if command -v sha256sum >/dev/null 2>&1; then - sha256sum "$1" 2>/dev/null | awk '{print $1}' - elif command -v shasum >/dev/null 2>&1; then - shasum -a 256 "$1" 2>/dev/null | awk '{print $1}' - else - echo "unavailable" - fi -} - # ── Find locked plans and collect their state ───────────────────────────────── plans_dir="${cwd_dir}/docs/plans" state_section="" @@ -50,10 +39,12 @@ if [ -d "$plans_dir" ]; then # Extract Status line status_line=$(grep '\*\*Status:\*\*' "$plan" 2>/dev/null | tail -1 | sed 's/.*\*\*Status:\*\*[[:space:]]*//' || true) - # Hash of the lock file (the manifest hash stored at lock time) + # Read the manifest hash stored inside the lock file (first 
non-comment, + # non-blank line). This is the same value tests/plan-scope-check.sh + # --verify-lock compares against the current manifest sha256. if [ -f "$lock_file" ]; then - lock_hash=$(sha256_file "$lock_file") - state_section="${state_section} ${plan_name}: ${status_line} (lock-file sha256: ${lock_hash})\n" + manifest_hash=$(awk 'NF && !/^#/ {print; exit}' "$lock_file" 2>/dev/null || true) + state_section="${state_section} ${plan_name}: ${status_line} (manifest-sha256: ${manifest_hash:-unknown})\n" else state_section="${state_section} ${plan_name}: ${status_line} (no lock file — not yet locked or lock file missing)\n" fi diff --git a/hooks/pre-tool-scope-guard b/hooks/pre-tool-scope-guard index 5d57384..22d9515 100755 --- a/hooks/pre-tool-scope-guard +++ b/hooks/pre-tool-scope-guard @@ -68,7 +68,14 @@ case "$tool_name" in # SUPERPOWERS_ALLOW_DEFAULT_BRANCH=1 git push origin main # env SUPERPOWERS_SCOPE_LOCK_WRITE=1 bash -c '...' # SUPERPOWERS_PLAN_LOCK_WRITE=1; git commit ... (before semicolon) - if printf '%s' "$cmd" | grep -qE '(^|[;&|[:space:]])(export[[:space:]]+)?SUPERPOWERS_[A-Z_]+='; then + # Not blocked (benign debugging/grepping): + # echo "SUPERPOWERS_HOOKS_DISABLE=1" + # grep 'SUPERPOWERS_.*=' file + # Best-effort: strip single- and double-quoted strings before checking so + # that mentions of SUPERPOWERS_* inside quoted arguments don't trigger a + # false-positive block. + cmd_no_quotes=$(printf '%s' "$cmd" | sed "s/\"[^\"]*\"//g; s/'[^']*'//g") + if printf '%s' "$cmd_no_quotes" | grep -qE '(^|[;&|[:space:]])(export[[:space:]]+)?SUPERPOWERS_[A-Z_]+='; then block "Self-bypass attempt blocked: setting a SUPERPOWERS_* environment variable from inside a Bash tool call is not permitted. These variables control pipeline enforcement gates and may only be set by the human operator in their terminal environment before starting the agent session. The agent cannot grant itself permission to bypass its own constraints — that would make the gates meaningless." 
fi diff --git a/hooks/subagent-scope-guard b/hooks/subagent-scope-guard index a000298..601b281 100755 --- a/hooks/subagent-scope-guard +++ b/hooks/subagent-scope-guard @@ -43,8 +43,8 @@ violations="" # Working-tree and index modifications to protected files if command -v git >/dev/null 2>&1; then - ( - cd "$cwd_dir" 2>/dev/null || exit 0 + _saved_pwd="${PWD}" + if cd "$cwd_dir" 2>/dev/null; then # Uncommitted changes to scope-lock files scope_lock_dirty=$(git status --porcelain 2>/dev/null \ @@ -86,7 +86,9 @@ if command -v git >/dev/null 2>&1; then done <<< "$plans_in_commit" fi fi - ) + + cd "$_saved_pwd" + fi fi [ -z "$violations" ] && exit 0 diff --git a/skills/scope-lock/SKILL.md b/skills/scope-lock/SKILL.md index 9ad7ec3..2634c64 100644 --- a/skills/scope-lock/SKILL.md +++ b/skills/scope-lock/SKILL.md @@ -108,7 +108,7 @@ The unlock path is intentionally heavyweight. Cheap unlock = no lock at all. - Commit both files in the same commit: `chore: lock scope for <feature> (alignment passed)`. **`subagent-driven-development` (per-task checkpoint):** -- Before dispatching the next task, run `tests/plan-scope-check.sh --plan <plan-path>` to verify (a) the plan's manifest hash still matches `<plan-path>.scope-lock`, (b) every commit on the feature branch traces to a task in the manifest, (c) no manifest task is missing. +- Before dispatching the next task, run `tests/plan-scope-check.sh --verify-lock <plan-path>` to verify (a) the plan's manifest hash still matches `<plan-path>.scope-lock`, (b) every commit on the feature branch traces to a task in the manifest, (c) no manifest task is missing. - On any FAIL, stop dispatching new work; surface the discrepancy to the user. - After all tasks complete, run the same check before invoking `finishing-a-development-branch`. 
diff --git a/tests/plan-scope-check.sh b/tests/plan-scope-check.sh index c5beb30..009d396 100755 --- a/tests/plan-scope-check.sh +++ b/tests/plan-scope-check.sh @@ -13,9 +13,7 @@ # <path>.scope-lock (only meaningful after the plan is # in Locked status). # --against-branch <plan> Verify the actual git branch layout matches the -# PR Grouping table: every commit since the merge-base -# with the plan's base branch is reachable from a -# branch listed in the table; every branch in the +# PR Grouping table: every branch listed in the # table exists locally or on origin. # # Multiple modes can be combined. With no flags, runs --plan on every plan in @@ -66,11 +64,10 @@ sha256_stdin() { } # Check the manifest is well-formed. Args: plan path. Echoes problems to stdout. -# Legacy plans (no manifest section AND no `# scope-manifest: required` marker -# in a hidden HTML comment) are skipped — only plans that opt into the format -# are enforced. New plans created by writing-plans always include the section, -# so this only matters for grandfathering historical plans pre-dating the -# scope-lock skill. +# Legacy plans (no manifest section) are skipped — only plans that opt into the +# format are enforced. New plans created by writing-plans always include the +# section, so this only matters for grandfathering historical plans pre-dating +# the scope-lock skill. check_manifest_wellformed() { local plan="$1" local manifest diff --git a/tests/skill-activation-audit.sh b/tests/skill-activation-audit.sh index dc948e5..ae8809d 100755 --- a/tests/skill-activation-audit.sh +++ b/tests/skill-activation-audit.sh @@ -67,15 +67,24 @@ if [ ! -r "$STATE_FILE" ]; then fi # Pipeline gates we expect for an autonomous run, in order. The pipeline -# is the canonical chain documented in skills/using-superpowers/SKILL.md. 
+# is the canonical chain documented in skills/using-superpowers/SKILL.md: +# brainstorming → adversarial-design-review (design) → writing-plans → +# adversarial-design-review (plan) → alignment-check → scope-lock → +# subagent-driven-development → finishing-a-development-branch → +# pr-monitoring → post-merge-retrospective +# Note: adversarial-design-review appears twice (design and plan phases); +# this list de-dupes it — the audit reports the count seen so gaps can be +# identified but cannot distinguish the two phases without --phase= args. PIPELINE_GATES=( brainstorming adversarial-design-review writing-plans alignment-check + scope-lock subagent-driven-development finishing-a-development-branch pr-monitoring + post-merge-retrospective ) # Optional gates — present only when conditions trigger them. Reported @@ -112,10 +121,10 @@ extract_skills() { extract_agents() { if command -v jq >/dev/null 2>&1; then - jq -r 'select(.tool=="Agent" or .tool=="Task") | .detail' "$STATE_FILE" 2>/dev/null \ + jq -r 'select(.tool=="Agent" or (.tool | type=="string" and startswith("Task"))) | .detail' "$STATE_FILE" 2>/dev/null \ | sed -nE 's/.*agent=([A-Za-z0-9_-]+).*/\1/p' else - grep -E '"tool":"(Agent|Task)"' "$STATE_FILE" 2>/dev/null \ + grep -E '"tool":"(Agent|Task[^"]*)"' "$STATE_FILE" 2>/dev/null \ | sed -nE 's/.*agent=([A-Za-z0-9_-]+).*/\1/p' fi } diff --git a/tests/skill-cross-refs.sh b/tests/skill-cross-refs.sh index f8c18f1..e5b8bc0 100755 --- a/tests/skill-cross-refs.sh +++ b/tests/skill-cross-refs.sh @@ -31,8 +31,8 @@ tmp_failures="$(mktemp)" || { echo "ERROR: mktemp failed" >&2; exit 3; } trap 'rm -f "$tmp_failures"' EXIT # Build the set of known skill names and agent names from the filesystem. 
-known_skills="$(find skills -mindepth 1 -maxdepth 1 -type d -printf '%f\n' | sort -u)" -known_agents="$(find agents -mindepth 1 -maxdepth 1 -type f -name '*.md' -printf '%f\n' | sed -E 's|\.md$||' | sort -u)" +known_skills="$(find skills -mindepth 1 -maxdepth 1 -type d | sed -E 's|.*/||' | sort -u)" +known_agents="$(find agents -mindepth 1 -maxdepth 1 -type f -name '*.md' | sed -E 's|.*/||; s|\.md$||' | sort -u)" # Helper: is the name a known skill or agent? is_known_target() { @@ -68,12 +68,14 @@ strip_fences() { # Files to scan. Exclude *creation-log* / changelog-style files where the # point is to record historical names that no longer exist. -mapfile -t scan_files < <(find skills agents -type f -name '*.md' \ - ! -iname 'CREATION-LOG.md' | sort) +# Use a newline-separated string for portability (no mapfile / Bash 4+). +scan_files_list="$(find skills agents -type f -name '*.md' \ + ! -iname 'CREATION-LOG.md' | sort)" # --- 1. Skill / agent references ---------------------------------------- -for f in "${scan_files[@]}"; do +while IFS= read -r f; do + [ -z "$f" ] && continue annotated="$(strip_fences "$f")" # Pattern 1: bare `<slug>/SKILL.md` references @@ -118,7 +120,7 @@ for f in "${scan_files[@]}"; do fi done done < <(printf '%s\n' "$annotated" | grep -E 'superpowers:[a-z][a-z0-9-]+' || true) -done +done <<< "$scan_files_list" # --- 2. Step references -------------------------------------------------- @@ -141,7 +143,8 @@ has_step() { || grep -qE "Step[[:space:]]+${step}[[:space:]]*[:.]" "$file" } -for f in "${scan_files[@]}"; do +while IFS= read -r f; do + [ -z "$f" ] && continue annotated="$(strip_fences "$f")" while IFS=: read -r line_no line; do @@ -162,7 +165,7 @@ for f in "${scan_files[@]}"; do | sed -E "s/^([a-z][a-z0-9-]+)[a-z']*[[:space:]]+Step[[:space:]]+([0-9]+[a-z]?).*/\1 \2/") done < <(printf '%s\n' "$annotated" \ | grep -E "[a-z][a-z0-9-]+[a-z']*[[:space:]]+Step[[:space:]]+[0-9]+[a-z]?" 
|| true) -done +done <<< "$scan_files_list" # --- Report --------------------------------------------------------------