diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 661e34a..15b81bb 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -9,7 +9,7 @@ { "name": "autodev", "description": "Autonomous development workflow skills for coding agents", - "version": "6.1.5", + "version": "6.2.0", "source": "./", "author": { "name": "Jon Langevin", diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 5aa00dc..5b5d32b 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "autodev", "description": "Autonomous development workflow skills for coding agents: design, review, planning, execution, monitoring, and retrospectives", - "version": "6.1.5", + "version": "6.2.0", "author": { "name": "Jon Langevin", "email": "jon@gocodealone.com" diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json index 9aa7827..28f47aa 100644 --- a/.cursor-plugin/plugin.json +++ b/.cursor-plugin/plugin.json @@ -2,7 +2,7 @@ "name": "autodev", "displayName": "Autonomous Dev Kit", "description": "Autonomous development workflow skills for coding agents", - "version": "6.1.5", + "version": "6.2.0", "author": { "name": "Jon Langevin", "email": "jon@gocodealone.com" diff --git a/README.md b/README.md index 67a3656..559d29e 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,7 @@ adversarial review challenges it explicitly. 
**Testing** - **test-driven-development** - RED-GREEN-REFACTOR cycle (includes testing anti-patterns reference) +- **demonstration-fidelity** - A demo/example/showcase must execute the real artifact — no reimplementation, hard-coded output, or different-language fake **Debugging** - **systematic-debugging** - 4-phase root cause process (includes root-cause-tracing, defense-in-depth, condition-based-waiting techniques) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 47c23c1..c671f3a 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -1,5 +1,14 @@ # Autonomous Dev Kit Release Notes +## v6.2.0 — 2026-05-29 + +New skill **demonstration-fidelity** + an advisory write-time hook, closing a verification-theater gap: an agent writes real code, then "demonstrates" it with a demo that never executes the real artifact — reimplementing the logic, hard-coding the output, or rewriting it in another language. The demo proves nothing yet is presented as proof. + +- **`skills/demonstration-fidelity/SKILL.md`** (host-neutral, load-bearing on every harness): a demonstration MUST execute the real artifact and show output produced by that run. Forbids reimplementation, hard-coded output, stubbing the artifact-under-demonstration, and detached prototypes — regardless of language. Allows substituting a *dependency* at a real interface seam **with disclosure**. Establishes "fidelity, not language sameness" (a real cross-language client crossing a real interface is valid), a 3-question fidelity test, a fake-vs-faithful example, and a rationalization table seeded from RED-baseline transcripts. +- **`hooks/pretool-demo-fidelity-guard`** (advisory, NEVER blocks; Claude + Codex + Cursor via `hooks.json`): on a Write/Edit to a demo-like path, injects a fidelity reminder pointing at the skill. 
Heuristic is anchored to path *segments* (`demos`/`examples`) + basename prefixes (`demo*`/`example*`/`showcase*`/`quickstart*`) with segment/suffix exclusions (`test`/`spec`/`testdata`/`fixtures`/`vendor` segments, `*_test.*`/`*.spec.*` basenames) — so `example_test.go`/`testdata/` are skipped while `examples/latest-feature-demo.py` still fires. Session dedup keyed on `basename(transcript_path)`; fails **open** (fires) on state I/O failure; honors `SUPERPOWERS_HOOKS_DISABLE=1`. +- **Pipeline wiring:** new `runtime-launch-validation` "Demonstration / example / showcase" change-class row (carving out artifact-stub-forbidden vs. disclosed-dependency-seam-allowed so it does not contradict RLV's "no stub on either end"); a `verification-before-completion` `demo/example works` claim-matrix row; a `finishing-a-development-branch` Step 1b demo note; `using-autodev` cross-cutting listing; README + `tests/cross-llm-coverage.md` rows. +- **Tests:** 22 `tests/hook-contracts.sh` assertions for the new guard (fires/silent/excluded/dedup/fail-open/disable-env/malformed-stdin/never-blocks). Skill is host-neutral (`skill-content-grep.sh`) and cross-refs resolve (`skill-cross-refs.sh`). + ## v6.1.5 — 2026-05-28 SessionStart time-based dedup as defense in depth. diff --git a/docs/plans/2026-05-29-demonstration-fidelity-design.md b/docs/plans/2026-05-29-demonstration-fidelity-design.md new file mode 100644 index 0000000..d1b24e9 --- /dev/null +++ b/docs/plans/2026-05-29-demonstration-fidelity-design.md @@ -0,0 +1,363 @@ +# Demonstration Fidelity — Design + +**Date:** 2026-05-29 +**Branch:** feat/demonstration-fidelity-2026-05-29T1128 +**Author:** autonomous pipeline (dogfood) +**Status:** Draft (rev 3 — post design-phase adversarial review, cycle 2) + +## Problem + +An agent implements a real artifact, then produces a "demo" intended to show +that artifact working — but the demo does **not** execute the artifact. 
Observed +failure (production, daily Claude + Codex use): agent wrote the feature in one +language, then built a demo in a *different* language that **hard-coded** the +outputs for presentation. The demo proved nothing about the real code, yet was +presented as proof it works. "Fake code." + +This is a verification-theater failure specific to demonstration artifacts. It +slips past every existing gate: + +| Existing gate | Why it misses this | +|---|---| +| `scope-lock` ("there is no demo mode") | Kills *partial-scope* work shipped as a demo. Says nothing about a *full-scope* demo that fakes its output. | +| `runtime-launch-validation` | Triggered by change-class (build/deploy/migration…), not by "I'm writing a demo." Its "Library/SDK → tiny consumer program" row never forbids that consumer being a reimplementation or printing literals. | +| `verification-before-completion` | "Evidence before assertion," but its claim matrix has no `demo/example works` row, so a fabricated demo never gets challenged. | + +**Gap:** nothing in the kit owns the invariant *a demonstration must execute the +real artifact.* + +## RED baseline (run before writing the skill — TDD Iron Law) + +Two pressure scenarios were dispatched to fresh subagents **before** any skill +text was written. Verbatim transcripts harvested; see also the production report +above (primary RED evidence). + +- **Baseline #1** — Go library fn, easy to run, *with* a self-report request. + Agent built a faithful Go demo (`main.go` importing the real package), ran it, + and explicitly rejected hard-coding: *"a demo that prints hard-coded strings + would look identical on screen but prove nothing — so I imported and called + it."* **Observer effect:** asking for a self-report primed good behavior. +- **Baseline #2** — Go HTTP service needing Postgres + auth (hard to run), *no* + self-report priming, strong "no time to stand up DB" pressure. 
Agent built a + faithful demo via `httptest` + the **real handler** + an in-memory store + implementing the **real `Store` interface**; made minor honest substitutions + (`chi.URLParam`→`r.PathValue`, auth omitted, in-memory store) and disclosed + them: *"what's real vs. faked … so you can answer the room honestly."* Did + **not** hard-code output. + +**Learnings that shape the skill:** + +1. Capable models often resist *full* fabrication — but the production report + proves it still happens (weaker model / stronger pressure / genuine + cross-language confusion). The skill must make fidelity the explicit default + and give a checklist that catches the severe case when an agent *is* tempted. +2. Baseline #2 exposes the real **gray zone**: faking the *dependency seam*. The + line is **not** "never substitute." It is: substitute only at a **real + interface seam** (e.g., a `Store` interface, an HTTP boundary), **disclose** + every substitution, and **never hand-author the output**. The output shown + must be produced by executing the real artifact's real code path. +3. The severe end (different language + hard-coded output presented as real) is + **absolutely forbidden**, no disclosure cures it — it executes nothing. + +## Invariant (the teaching) + +A demonstration / example / showcase / sample / quickstart / "proof it works" +artifact MUST exercise the real artifact through its real public interface, and +the output it shows MUST be produced by that execution. + +Forbidden **regardless of language**: + +- **Reimplementation / transliteration** — re-coding the logic for the demo + instead of calling it. +- **Hard-coded output** — hand-authoring the "expected" output and presenting it + as produced output. +- **Stub/mock substitution of the artifact-under-demonstration** — wiring the + demo to a fake *in place of the thing being demonstrated*. +- **Detached prototype** — building a parallel throwaway instead of invoking the + shipped entry point. 
+ +Allowed, **with mandatory disclosure**: + +- Substituting a *dependency* of the artifact at a **real interface seam** (data + store, external service, clock) so the demo runs locally — provided the + artifact's own code path executes unchanged, and the substitution is stated + plainly ("data source is an in-memory fixture; the handler is the real one"). + Precedent: `runtime-launch-validation`'s *Database migration* row (apply + against an *ephemeral* DB) and its *Fall-back when local launch is infeasible* + section both sanction running the real artifact against a stand-in dependency. + +**Reconciling with RLV's "no stub on either end" (important — these must not +contradict):** RLV's "exercise a real interaction … not a mock or stub on either +end" rule governs the **two ends of the boundary being demonstrated**. When the +*artifact* is the boundary under demonstration, stubbing *it* is forbidden — that +is the whole point. A *dependency sitting behind* the artifact (a `Store` the +handler calls) is **not** an end of the demonstrated boundary; substituting it at +a real interface seam, with disclosure, leaves the artifact's own end real. The +forbidden case is stubbing the **artifact-under-demonstration**; the allowed case +is substituting a **dependency** behind it. The RLV change-class row this design +adds (Components §2) states this carve-out explicitly so the two skills agree. + +**Critical nuance (target fidelity, not language sameness):** cross-language is +*not* the crime. A real client written in another language that crosses a real +interface into the running artifact — e.g., a Python client making real HTTP +calls to a running Go service — is a *valid* demo, **provided that crossing is +actually exercised** (both ends of *that* boundary are real — no stub on either +end of the client↔service interaction). The rule keys on *did the real code run +to produce this output*, never on *is the demo in the same language*. 
+ +## Approaches considered + +- **A. New skill `demonstration-fidelity` + pipeline wiring + advisory hook + (CHOSEN).** Discoverable at demo-writing time (its own trigger), harness-agnostic + teaching, plus a write-time backstop on Claude/Codex/Cursor. Defense in depth. +- **B. Extend `runtime-launch-validation` + `verification-before-completion` + only.** Lower sprawl, but an agent mid-demo does not think "runtime launch + validation"; weak discoverability at the moment of failure. +- **C. Skill only, no hook.** Simplest; loses the write-time reminder. +- **D. Blocking Stop-hook interceptor on "this demonstrates X" claims** + (raised by adversarial review). Catches the *presentation moment* directly. + **Rejected / accepted-as-out-of-scope** because: (1) the user explicitly chose + advisory-never-blocks and rejected hard-block-on-completion; (2) a Stop hook + must `decision:block` to have any effect (a non-blocking Stop nudge is a no-op + once the agent has stopped), so "advisory Stop hook" is not a real option; (3) + the completion-moment is instead covered **harness-agnostically** by the new + `verification-before-completion` claim-matrix row (the agent's own pre-stop + discipline challenges "demo works"), which needs no blocking hook. Recorded + here as an explicitly-considered alternative. + +User selected **A**, advisory (never-blocking) hook, single PR. 
+ +## Defense-in-depth layering (which layer owns which failure mode) + +| Failure mode | Owning layer | +|---|---| +| **Dominant:** fake demo in a normally-named file / README block / inline / cross-language, presented as proof | **The skill** (applies to *any* proof artifact, any language, any location) **+** the `verification-before-completion` claim-matrix row (challenges the "demo works" claim at completion time, harness-agnostic) | +| Demo written to a *filename-detectable* path (`demo/`, `examples/`, `demo_*.py`) | the advisory PreToolUse hook **nudge** (best-effort bonus only) | +| Partial-scope work mislabeled "demo" | existing `scope-lock` | + +The skill is **load-bearing**; the hook is a **bonus**. The design does NOT rely +on filename detection to catch the dominant failure mode — the skill and the +completion-claim discipline do. + +## Components + +1. **`skills/demonstration-fidelity/SKILL.md`** — universal, host-neutral. The + load-bearing layer (every harness reads skill markdown). Applies to **any** + proof artifact regardless of filename, location, or language. Contains: + overview + invariant, when-to-use triggers, a 3-question fidelity test, the + allowed seam-substitution + mandatory-disclosure rule, the valid + cross-interface pattern, one fake-vs-faithful example pair, a rationalization + table seeded from the RED baseline, red-flags, common mistakes, cross-refs to + `runtime-launch-validation` / `verification-before-completion` / `scope-lock`. + No Claude-only tokens (passes `tests/skill-content-grep.sh`). + + **Draft CSO description** (symptom-first, per writing-skills): *"Use when + creating a demo, example, quickstart, showcase, or any artifact meant to + prove an implementation works — before writing it, to ensure it executes the + real code instead of reimplementing it, hard-coding output, or faking it in + another language."* + +2. 
**Pipeline wiring (cross-refs):** + - `runtime-launch-validation`: new change-class row + a "See also" entry. + **Exact row wording (so it does not contradict RLV's existing "no stub on + either end" boundary row):** + `| Demonstration / example / showcase artifact (anything built to show a + change working) | The real artifact, invoked through its real entry point; + output captured from that run | Output is produced by the real code path, + not literals; the artifact-under-demonstration is NOT stubbed; any + substituted *dependency* sits behind a real interface seam and is disclosed. + See \`demonstration-fidelity\`. |` + - `verification-before-completion`: claim-matrix row + `demo/example works | the real artifact executed via the demo produced the + shown output | hand-written/hard-coded output, a reimplementation`. **This + is the harness-agnostic completion-time catch for the dominant failure + mode.** + - `finishing-a-development-branch`: Step 1b note — if the change shipped any + demo/example artifact, `demonstration-fidelity` applies before merge. + - `using-autodev`: add to the skill listing / red-flags so it is discoverable. + - `README.md` skills library + `tests/cross-llm-coverage.md` row (host-neutral). + +3. **`hooks/pretool-demo-fidelity-guard`** — advisory, **never blocks**. + PreToolUse on `Write|Edit`. **Best-effort nudge only — not the primary + defense.** Emits `hookSpecificOutput.additionalContext` with a one-line + fidelity reminder pointing at the skill when the target path looks like a + *demo* artifact. + + **Tightened heuristic — anchored to path semantics, NOT bare substrings** + (substrings `test`/`spec` would wrongly eat `latest`/`contest`/`attestation`/ + `inspector`/`spectrum`/`retrospective` demos — empirically confirmed by the + reviewer). Split the path on `/` into segments. 
+ + **Fire only when** (trigger): + - a path **segment** is exactly `demos` or `examples`, **or** + - the **basename starts with** `demo`, `example`, `showcase`, or `quickstart` + (e.g. `demo_*.py`, `quickstart.md`), + + **and NOT excluded.** Exclude only when (anchored, never bare-substring): + - any path **segment** ∈ {`test`, `tests`, `spec`, `specs`, `testdata`, + `fixtures`, `vendor`, `node_modules`, `.git`}, **or** + - the **basename** matches `*_test.*`, `*.test.*`, or `*.spec.*`. + + Verified outcomes: excludes `example_test.go` (basename `*_test.*`), + `sample_config.yaml` (`sample` is not a trigger), `testdata/foo.json` + (segment `testdata`); **keeps** `examples/latest-feature-demo.py`, + `examples/attestation-demo.go`, `demo_inspector.py` (no excluded segment, + basename not a test/spec suffix). FN by design: inline/README demos and demos + in normally-named files (owned by the skill). Residual FP is low and + advisory-only — a single ignorable line. + + **Dedup:** session-scoped, keyed by `<session_id>:<path>` appended to + `.claude/autodev-state/demo-fidelity-seen.jsonl` (one reminder per path per + session). **Fail-open = fire:** if the state dir/file is unreadable or + unwritable, the hook emits the reminder rather than silently suppressing it + (a write failure must never silence the nudge). Honors + `SUPERPOWERS_HOOKS_DISABLE=1`. Emits a *static* reminder string only — never + echoes file contents (no leakage). Any parse error → exit 0 silently (cannot + wedge a session). Registered in `hooks/hooks.json` under the existing + `Write|Edit` PreToolUse group. + + **Precedent divergence noted:** `pretool-pr-review-reminder` has no dedup + because `gh pr create` is rare; demo-file writes/edits are frequent, so + per-path session dedup is justified to prevent reminder fatigue. + +4. 
**Tests:** + - `tests/hook-contracts.sh`: add cases — fires `additionalContext` on a demo + path; silent on a non-demo path and on excluded test/fixture paths; never + blocks; respects the disable env; emits valid JSON; dedups within a session; + fail-open fires when state is unwritable. + - **Discoverability check** (addresses the untested-CSO finding): a subagent + scenario — agent told to "build a demo of X," skill present but NOT named — + observe whether the description triggers a skill load. Recorded in the plan's + verification, not a CI gate (behavioral, best-effort). + - Keep `tests/skill-content-grep.sh`, `tests/skill-cross-refs.sh` green. + +## Global Design Guidance + +`Guidance: none found as docs/design-guidance.md; constraints cited from canon +equivalents (README §Cross-LLM, docs/plans/2026-04-25-cross-llm-portability-design.md, +skills/writing-skills).` + +| guidance (source) | design response | +|---|---| +| Harness-agnostic / host-neutral first (README §Cross-LLM; cross-llm-portability design) | Skill is host-neutral and load-bearing for *all* harnesses; hook reaches Claude+Codex+Cursor; no Claude-only tokens; coverage table row added. | +| Skills = judgment calls; mechanical constraints = automate (writing-skills "Don't create for mechanical constraints") | Fidelity is a *judgment* call (cross-language can be valid; seam-substitution can be valid) → the skill is primary; the hook is advisory-only, not a regex gate that would false-positive on valid demos. | +| TDD Iron Law for skills (writing-skills) | RED baseline run (2 scenarios) before the skill is written; rationalization table seeded from harvested transcripts; plan gates skill-writing on baseline completion (Task 0). | +| Token efficiency (writing-skills) | Skill core kept lean; condensed phrasing. | +| One excellent example, not multi-language dilution (writing-skills) | Exactly one fake-vs-faithful example pair. 
| +| Scope-lock discipline | Single-PR Scope Manifest; explicitly out-of-scope: a general "anti-fabrication" skill, a blocking Stop interceptor (Option D). | + +## Security Review + +- **Auth/secrets/PII:** none introduced. Hook reads only the tool-input file + path from stdin JSON and appends a small dedup marker under + `.claude/autodev-state/` (same mechanism existing hooks use). No network, no + secrets, no PII. +- **Least privilege / abuse:** hook never executes the file under write, never + echoes file contents (emits a fixed reminder string only — no content leak), + never blocks. Honors `SUPERPOWERS_HOOKS_DISABLE=1`. Fails open (any parse error + → exit 0 silent; any state I/O failure → fire the reminder) so it can neither + wedge a session nor silently self-disable. +- **Trust boundary:** advisory `additionalContext` is model-facing text only; it + cannot alter files or run commands. + +## Infrastructure Impact + +None. Plugin-only change; no cloud resources, deploys, migrations, or cost. +`hooks/hooks.json` gains one PreToolUse entry — a plugin-loading-path change, +which is itself a `runtime-launch-validation` trigger (validated by running +`tests/hook-contracts.sh` + a manual stdin invocation of the hook). + +## Multi-Component Validation + +- **Hook ↔ harness boundary:** `tests/hook-contracts.sh` feeds real stdin JSON + to the real hook script and asserts the emitted JSON contract (real boundary, + not a mock). +- **hooks.json ↔ dispatcher:** registration parsed; `run-hook.cmd` dispatch path + exercised by the contract suite. +- **Skill ↔ cross-refs:** `tests/skill-cross-refs.sh` resolves the new references + across `skills/` + README. +- **Skill ↔ grep guard:** `tests/skill-content-grep.sh` confirms host-neutrality. +- **Behavioral (best-effort, not CI):** discoverability subagent scenario above — + acknowledged weakest link; recorded for the retro's fair-comparison baseline. + +## Assumptions + +1. 
Agents load a skill by its description when about to write a demo (CSO). + *Fragile* — mitigated by (a) a symptom-first description, (b) cross-refs from + RLV/finishing/verification, (c) the write-time hook reminder, and (d) a + discoverability test in the plan. +2. The `hookSpecificOutput.additionalContext` schema is consumed by Claude **and** + Codex (verified — daily use on both). +3. The dominant failure mode (inline / normally-named / cross-language fake demos) + is **owned by the skill + the completion-claim-matrix row**, not the hook. The + hook intentionally covers only the filename-detectable subset; this is a + labeled bonus, not a coverage gap in the primary defense. +4. Advisory `additionalContext` on PreToolUse is non-blocking and won't disrupt + flow. + +## Rollback + +Change classes touched: plugin-loading path (new hook + `hooks.json` entry). +Rollback = revert the PR (removes skill, wiring, hook, hooks.json entry, version +bump 6.1.5→6.2.0). No state migration. Dedup files under +`.claude/autodev-state/demo-fidelity-seen.jsonl` are untracked by git and benign +if left on disk after rollback. Safe, single-step. + +**Granular neutralization (no full revert needed):** if the advisory hook proves +noisy in production, it can be disabled *without* touching the load-bearing skill +or the `verification-before-completion` row — either remove only its +`hooks.json` PreToolUse entry, or set `SUPERPOWERS_HOOKS_DISABLE=1`. The skill + +claim-matrix row (the dominant-mode defense) survive independently. This is why +bundling the hook in the same PR is low-risk. + +## Self-challenge / adversarial-review resolutions + +- **TDD Iron Law (was Critical):** baseline now actually run (2 scenarios, + above); plan gates skill-writing on baseline (Task 0). Resolved. 
+- **User-intent drift (was Critical):** dominant failure mode reassigned to the + skill + completion-claim-matrix row (harness-agnostic); hook explicitly demoted + to best-effort bonus; Option D recorded as considered-and-out-of-scope per + user's advisory-only choice. Resolved. +- **Hook FP rate (was Important):** heuristic tightened (segment/prefix match + + test/fixture exclusions); `sample` dropped as a trigger; FP/FN documented. + Resolved. +- **Dedup lifecycle (was Important):** file scheme, session keying, fail-open-to- + fire, and untracked/ignorable lifecycle specified. Resolved. +- **Discoverability untested (was Important):** discoverability subagent scenario + added to the plan; CSO description drafted above. Resolved. +- **Single-PR justification (was Important):** user decision; the 9 files are one + cohesive feature; recorded as accepted. Resolved. (Plus granular-neutralization + note in Rollback so a noisy hook need not force a full revert.) + +### Backport 2026-05-29 (plan-phase adversarial review) + +- **Failed assumption:** dedup keyed on `<session_id>`. **Evidence:** PreToolUse + payloads carry no `session_id` (only `session-start` reads it); the established + PreToolUse session-key idiom is `basename(transcript_path)` — `hooks/pre-tool-scope-guard:39-41`. + **Corrected behavior:** dedup key = `sha256(basename(transcript_path):path)`; + empty transcript_path → per-path dedup (advisory-acceptable). State I/O wrapped + `|| true` so `set -euo pipefail` fails **open (fire)**, never closed. + **Manifest scope:** unchanged (no task/PR/scope delta) — lock hash unaffected. + +### Backport 2026-05-29 (code-review I-1) + +- **Gap:** exclusion suffix globs `*_test.*|*.test.*|*.spec.*` missed RSpec + `*_spec.rb` → `examples/widget_spec.rb` fired spuriously. **Fix:** added + `*_spec.*`. Also hardened seg loops with `"${segs[@]:-}"` (bash 3.2 `set -u`) + and added a one-clause RLV overlap note. **Manifest scope:** unchanged. 
+ +### Cycle-2 resolutions (rev 3) + +- **Hook exclusion over-excluded (NEW Important):** substring `test`/`spec` + exclusions replaced with path-**segment**-exact + basename-**suffix**-glob + anchoring. Keeps `examples/latest-*-demo.py` etc.; still excludes + `example_test.go`/`testdata/`. Resolved. +- **RLV "no stub on either end" contradiction (NEW Important):** added an explicit + reconciliation paragraph in the Invariant + the **exact** RLV change-class row + wording in Components §2 carving out artifact-stub (forbidden) vs. disclosed + dependency-seam substitution (allowed); fixed the imprecise "ephemeral/local + instance" citation to RLV's DB-migration row + Fall-back section. Resolved. +- **Discoverability non-gating (Minor):** accepted for this PR; the skill's plan + adds a one-time discoverability subagent check, and a follow-up to add a + periodic discoverability re-check to the audit cadence is noted (not blocking). +- **Rollback granularity (Minor):** granular-neutralization note added. Resolved. diff --git a/docs/plans/2026-05-29-demonstration-fidelity.md b/docs/plans/2026-05-29-demonstration-fidelity.md new file mode 100644 index 0000000..770ee90 --- /dev/null +++ b/docs/plans/2026-05-29-demonstration-fidelity.md @@ -0,0 +1,288 @@ +# Demonstration Fidelity Implementation Plan + +> **For the implementing agent:** REQUIRED SUB-SKILL: Use autodev:executing-plans to implement this plan task-by-task. + +**Goal:** Add a harness-agnostic `demonstration-fidelity` skill + advisory write-time hook + pipeline wiring so agents stop shipping fake demos (reimplementation / hard-coded output / artifact-stub) that don't execute the real code. + +**Architecture:** Skill markdown is the universal load-bearing layer (all harnesses). An advisory, never-blocking PreToolUse hook is a best-effort write-time nudge (Claude/Codex/Cursor). Cross-refs wire it into RLV, verification-before-completion, finishing, using-autodev, README, coverage table. 
Design: `docs/plans/2026-05-29-demonstration-fidelity-design.md` (adversarial-review PASS rev3). + +**Tech Stack:** Bash hooks (jq), Markdown skills, existing `tests/*.sh` harness. + +**Base branch:** main + +--- + +## Scope Manifest + +**PR Count:** 1 +**Tasks:** 8 +**Estimated Lines of Change:** ~420 (informational) + +**Out of scope:** +- A general "anti-fabrication / fake-evidence" skill beyond demonstrations (YAGNI; the reported failure is demos). +- A blocking Stop-hook interceptor on completion claims (Option D — user chose advisory-only; a non-blocking Stop hook is a no-op). +- OpenCode per-tool hook port (OpenCode has no PreToolUse equivalent today; skill markdown still covers it). +- Editing user CLAUDE.md/AGENTS.md. + +**PR Grouping:** + +| PR # | Title | Tasks | Branch | +|------|-------|-------|--------| +| 1 | feat: demonstration-fidelity skill + advisory hook + wiring (v6.2.0) | Task 1, Task 2, Task 3, Task 4, Task 5, Task 6, Task 7, Task 8 | feat/demonstration-fidelity-2026-05-29T1128 | + +**Status:** Locked 2026-05-29T11:52:19Z + +--- + +### Task 1: RED baseline gate (Iron Law — already satisfied) + +writing-skills Iron Law: no skill without a failing test first. The RED baseline +is already run + documented in the design (2 subagent scenarios + production +report). This task is the gate that proves it before any skill text is written. + +**Files:** +- Read: `docs/plans/2026-05-29-demonstration-fidelity-design.md` (§ "RED baseline") + +**Step 1:** Verify the baseline is recorded. +Run: `grep -c "Baseline #" docs/plans/2026-05-29-demonstration-fidelity-design.md` +Expected: `>= 2` (two baseline scenarios documented). + +**Step 2:** Confirm learnings shaped the invariant (seam-substitution carve-out traces to Baseline #2). +Run: `grep -n "dependency seam\|seam-substitution\|Baseline #2" docs/plans/2026-05-29-demonstration-fidelity-design.md` +Expected: non-empty. + +No commit (gate only). Proceed only if both pass. 
+ +--- + +### Task 2: Advisory hook `hooks/pretool-demo-fidelity-guard` (TDD) + +**Change class:** Hook/trigger/event-handler + plugin-loading path. Verify by +firing the real hook via stdin (hook-contracts.sh) AND a manual stdin invocation +(runtime-launch-validation of the loading path). **Rollback:** revert commit + +remove the `hooks.json` entry (Task 3); or `SUPERPOWERS_HOOKS_DISABLE=1`. + +**Files:** +- Create: `hooks/pretool-demo-fidelity-guard` +- Test: `tests/hook-contracts.sh` (add a `demo-fidelity` case block) + +**Step 1 (RED): add failing contract cases to `tests/hook-contracts.sh`.** Cases: +- demo path `examples/foo-demo.py` (+ `transcript_path` set) → stdout JSON has `hookSpecificOutput.additionalContext` matching `demonstration-fidelity`; exit 0; no `decision`/`block`. +- excluded `pkg/example_test.go` → empty stdout (silent); exit 0. +- excluded `testdata/example.json` → silent. +- excluded `examples/testdata/demo.py` → silent (excluded segment `testdata` wins over trigger segment `examples`). +- kept `examples/latest-feature-demo.py` → fires (rev2-regression guard: `latest` contains the substring `test`, but exclusion is segment/suffix-anchored, not substring-based, so this path must still fire). +- kept `examples/Showcase.go` (capitalized) → fires (path lowercased before matching). +- non-demo `internal/server.go` → silent. +- `SUPERPOWERS_HOOKS_DISABLE=1` + demo path → silent. +- malformed/empty stdin → exit 0, no crash. +- dedup: same demo path twice with the **same** `transcript_path` → fires once (second is suppressed). +- fail-open: state file path forced unwritable (e.g. point `cwd` at a dir where `.claude/autodev-state` cannot be created) → still **fires** (fail-open = fire, never silent). + +**Step 2 (RED run):** `bash tests/hook-contracts.sh 2>&1 | tail -20` +Expected: FAIL (hook script does not exist yet). 
+ +**Step 3 (GREEN): implement `hooks/pretool-demo-fidelity-guard`.** Model on +`hooks/pretool-pr-review-reminder` (same `emit_additional_context` shape). Logic: +- `set -euo pipefail`; `[ "${SUPERPOWERS_HOOKS_DISABLE:-}" = "1" ] && exit 0`. +- `[ -t 0 ] && exit 0`; require `jq`; read stdin; empty → exit 0. +- `tool_name` ∈ {`Write`,`Edit`,`MultiEdit`} else exit 0. +- path = `.tool_input.file_path`; empty → exit 0. +- **lowercase** path for matching (handles `Examples/`, `Demo*`). +- Split on `/`. Trigger iff: a segment == `demos`|`examples`, OR basename starts with `demo`|`example`|`showcase`|`quickstart`. +- Exclude iff: a segment ∈ {`test`,`tests`,`spec`,`specs`,`testdata`,`fixtures`,`vendor`,`node_modules`,`.git`}, OR basename matches `*_test.*`|`*.test.*`|`*.spec.*`. Excluded → exit 0. +- **Session key (NOT `session_id`):** `transcript_path=$(printf '%s' "$hook_input" | jq -r '.transcript_path // empty')`; `session_key=$(basename "$transcript_path" 2>/dev/null || echo "")`. PreToolUse payloads carry `transcript_path`, **not** `session_id` — verified at `hooks/pre-tool-scope-guard:39-41`, which uses exactly this idiom. Empty `transcript_path` → `session_key=""` (degrades to per-path dedup for that harness; acceptable for an advisory nudge). +- Dedup: `key=$(printf '%s' "${session_key}:${file_path}" | sha256sum | cut -d" " -f1)` (or `shasum -a 256` fallback); state file `${cwd}/.claude/autodev-state/demo-fidelity-seen` (one key per line). If `grep -qxF "$key" "$state" 2>/dev/null` → exit 0 (already nudged this session). Else append + emit. +- **Fail-open guard (critical with `set -euo pipefail`):** wrap every state I/O so a failure CANNOT fail-closed — `mkdir -p "$dir" 2>/dev/null || true`, `grep ... || true`, `printf '%s\n' "$key" >> "$state" 2>/dev/null || true`. A read/write failure must fall through to **emit** (fire), never to a silent exit. (A naive unguarded `>>` under `errexit` would fail-CLOSED — the bug this guard prevents.) 
+- Emit static `additionalContext` reminder (no file contents) via `emit_additional_context "PreToolUse" "$reminder"`; exit 0. +- Any unexpected error path → exit 0 silently (cannot wedge a session). Note: "fail-open = fire" applies specifically to *state I/O* failures; a malformed-payload parse failure still exits 0 silent. + +Reminder string (static): +``` + +You appear to be writing a demonstration/example artifact. A demo MUST execute the +real artifact and show its actual output. Do NOT reimplement the logic, hard-code +the output, or stub the thing being demonstrated. Substituting a *dependency* at a +real interface seam is allowed only if disclosed. See autodev:demonstration-fidelity. + +``` +`chmod +x hooks/pretool-demo-fidelity-guard`. + +**Step 4 (GREEN run):** `bash tests/hook-contracts.sh 2>&1 | tail -20` +Expected: PASS (all cases). + +**Step 5:** Manual runtime-launch-validation (plugin-loading path): +Run: `printf '{"tool_name":"Write","tool_input":{"file_path":"examples/demo_main.go"},"cwd":"'$PWD'"}' | bash hooks/pretool-demo-fidelity-guard` +Expected: JSON with `additionalContext` containing `demonstration-fidelity`; capture for PR body. + +**Step 6:** Commit. `git add hooks/pretool-demo-fidelity-guard tests/hook-contracts.sh && git commit -m "feat(hooks): advisory demo-fidelity write-time guard"` + +--- + +### Task 3: Register hook in `hooks/hooks.json` + +**Change class:** plugin-loading path. **Rollback:** revert commit (hook becomes inert). + +**Files:** Modify: `hooks/hooks.json` (PreToolUse array). + +**Step 1:** Add a **new, separate** element to the `PreToolUse` array (do NOT merge into the existing `Bash|Write|Edit|MultiEdit` scope-guard block — that would alter scope-guard's matcher). Exact element: +```json +{ + "matcher": "Write|Edit|MultiEdit", + "hooks": [ + { + "type": "command", + "command": "\"${CLAUDE_PLUGIN_ROOT}/hooks/run-hook.cmd\" pretool-demo-fidelity-guard", + "timeout": 10 + } + ] +} +``` + +**Step 2 (verify):** `jq . 
hooks/hooks.json >/dev/null && echo VALID` +Expected: `VALID`. + +**Step 3 (verify registration via contracts):** `bash tests/hook-contracts.sh 2>&1 | tail -5` +Expected: PASS (includes hooks.json well-formedness + new hook wiring). + +**Step 4:** Commit. `git add hooks/hooks.json && git commit -m "feat(hooks): register pretool-demo-fidelity-guard"` + +--- + +### Task 4: Write `skills/demonstration-fidelity/SKILL.md` (GREEN) + +**Change class:** Documentation (skill). Verify: host-neutral grep + word count + cross-refs. + +**Files:** Create: `skills/demonstration-fidelity/SKILL.md`. + +**Step 1:** Write the skill. Frontmatter `name: demonstration-fidelity`, symptom-first description (from design). Body (host-neutral, no forbidden tokens): +- Overview + the Invariant (execute the real artifact; output produced by that run). +- Forbidden-regardless-of-language list (reimplementation, hard-coded output, artifact-stub, detached prototype). +- Allowed-with-disclosure: dependency-seam substitution (cite RLV DB-migration + Fall-back). +- Fidelity-not-language-sameness nuance (valid cross-language client demo). +- 3-question fidelity test. +- One fake-vs-faithful example pair (single language, no multi-language dilution). +- Rationalization table (seeded from RED baseline — "tooling finicky so I'll just print expected output", "looks identical on screen", "no time to stand up the DB"). +- Red flags + Common mistakes. +- Cross-refs: runtime-launch-validation, verification-before-completion, scope-lock (skill-name form, no `@`). + +**Step 2 (verify host-neutral):** `bash tests/skill-content-grep.sh 2>&1 | tail -5` +Expected: PASS (no Claude-only tokens). + +**Step 3 (verify cross-refs resolve):** `bash tests/skill-cross-refs.sh 2>&1 | tail -5` +Expected: PASS. + +**Step 4 (token budget):** `wc -w skills/demonstration-fidelity/SKILL.md` +Expected: < 800 words (lean; target ~500 core). + +**Step 5:** Commit. 
`git add skills/demonstration-fidelity && git commit -m "feat(skills): demonstration-fidelity skill"` + +--- + +### Task 5: Wire cross-refs into existing skills + README + coverage + +**Change class:** Documentation. Verify: cross-refs resolve + grep each edit. + +**Files:** +- Modify: `skills/runtime-launch-validation/SKILL.md` (add the exact Demonstration change-class row from design §2 + a "See also" line). +- Modify: `skills/verification-before-completion/SKILL.md` (claim-matrix row `demo/example works | real artifact executed via the demo produced the shown output | hand-written/hard-coded output, a reimplementation`). +- Modify: `skills/finishing-a-development-branch/SKILL.md` (Step 1b: note — if the diff ships a demo/example artifact, `demonstration-fidelity` applies). +- Modify: `skills/using-autodev/SKILL.md` (add to skill discovery / red-flags so it loads at demo time). +- Modify: `README.md` (Skills Library → Testing group: `demonstration-fidelity`). +- Modify: `tests/cross-llm-coverage.md` (host-neutral row). + +**Step 1:** Apply all six edits. + +**Step 2 (verify):** `bash tests/skill-cross-refs.sh && bash tests/skill-content-grep.sh 2>&1 | tail -8` +Expected: both PASS. + +**Step 3 (verify all six wiring edits present):** +Run: `grep -n "demonstration-fidelity" skills/runtime-launch-validation/SKILL.md skills/verification-before-completion/SKILL.md skills/finishing-a-development-branch/SKILL.md skills/using-autodev/SKILL.md README.md tests/cross-llm-coverage.md` +Expected: at least one hit in each of the six files. + +**Step 4:** Commit. `git commit -am "feat(wiring): cross-ref demonstration-fidelity into RLV/VBC/finishing/using-autodev/README/coverage"` + +--- + +### Task 6: GREEN behavioral verification (writing-skills) + discoverability + +**Change class:** Skill test (behavioral; best-effort, not CI-gating). + +**Step 1:** Dispatch a subagent WITH the skill available, given the same fake-demo +pressure scenario as RED Baseline #2 (hard-to-run artifact), and the skill named.
+Expected: agent applies fidelity — runs the real artifact (or substitutes only a +disclosed dependency seam), never hard-codes output. Capture summary. + +**Step 2 (discoverability):** Dispatch a second subagent given "build a demo of X," +skill present but NOT named, autodev loaded. Observe whether the symptom-first +description triggers a skill load / fidelity behavior. +Expected: skill loads or fidelity behavior emerges (best-effort; record outcome). + +**Step 3:** Record both outcomes in the PR body. No commit (verification only). + +**GATE (writing-skills Iron Law GREEN — blocks Task 7):** Step 1 MUST show fidelity +behavior — the agent runs the real artifact (or substitutes only a disclosed +dependency seam) and does NOT hard-code output or reimplement. If the agent still +fakes the demo with the skill present, the skill's GREEN test FAILED: return to +Task 4, revise the skill to close the rationalization, re-run Step 1. Do NOT +proceed to Task 7 (version bump / release) on a failing GREEN. A skill whose GREEN +test fails is an untested skill and must not ship. (Step 2 discoverability is +best-effort and non-gating; only Step 1 fidelity gates.) + +--- + +### Task 7: Version bump + release notes + +**Change class:** Version pin (plugin manifest). **Rollback:** revert commit. + +**Files:** +- Modify: `.claude-plugin/plugin.json` (`"version": "6.1.5"` → `"6.2.0"`). +- Modify: `.cursor-plugin/plugin.json` (`6.1.5`→`6.2.0` — it carries a version; `tests/version-check.sh` requires all manifests agree, so this bump is mandatory, not conditional). +- Modify: `RELEASE-NOTES.md` (prepend v6.2.0 entry: new skill + advisory hook + wiring). + +**Step 1:** Apply bumps. (New feature → minor bump 6.1.5→6.2.0.) + +**Step 2 (verify):** `jq -r .version .claude-plugin/plugin.json` → `6.2.0`; `bash tests/version-check.sh 2>&1 | tail -5` → PASS. + +**Step 3:** Commit. 
`git commit -am "chore: bump version to 6.2.0"` + +--- + +### Task 8: Full suite + scope-lock verify (pre-PR gate) + +> **Lock ordering:** the `.scope-lock` sidecar is written by `scope-lock-apply` +> at lock time — i.e. after `alignment-check` PASS and **before** Task-1 +> execution begins (`alignment-check` invokes `scope-lock`). By the time Task 8 +> runs, `docs/plans/2026-05-29-demonstration-fidelity.md.scope-lock` exists, so +> `--verify-lock` below is valid. If the lock file is missing here, scope-lock +> was skipped — stop and run `bash hooks/scope-lock-apply <plan-path>` before the PR. + +**Step 1:** Run the full local suite: +``` +bash tests/hook-contracts.sh && bash tests/skill-content-grep.sh && \ +bash tests/skill-cross-refs.sh && bash tests/version-check.sh && \ +bash tests/plan-scope-check.sh --plan docs/plans/2026-05-29-demonstration-fidelity.md +``` +Expected: all PASS. + +**Step 2:** Verify scope-lock hash still matches: +`bash tests/plan-scope-check.sh --verify-lock docs/plans/2026-05-29-demonstration-fidelity.md` +Expected: PASS (manifest unchanged since lock). + +**Step 3:** Hand off to `finishing-a-development-branch` (Step 1b runtime-launch transcript already captured in Task 2 Step 5). + +--- + +## Global Design Guidance + +Inherits the design's `## Global Design Guidance` (cited canon: README §Cross-LLM, +cross-llm-portability design, writing-skills). Mapped to tasks: host-neutrality → +Task 4/5 grep gate; TDD Iron Law → Task 1 gate + Task 2/4 RED→GREEN; one-example +rule → Task 4; scope discipline → Scope Manifest + Task 8 verify. + +## Rollback summary + +Single-step PR revert removes skill + hook + hooks.json entry + wiring + version +bump. Hook independently neutralizable (drop `hooks.json` entry or +`SUPERPOWERS_HOOKS_DISABLE=1`) without reverting the skill. Dedup state file untracked
diff --git a/docs/plans/2026-05-29-demonstration-fidelity.md.scope-lock b/docs/plans/2026-05-29-demonstration-fidelity.md.scope-lock new file mode 100644 index 0000000..7ad4d77 --- /dev/null +++ b/docs/plans/2026-05-29-demonstration-fidelity.md.scope-lock @@ -0,0 +1 @@ +661d9faf234fdc5e4c8e2de72c6e4db95a0af91c49c47334dd5580ee079cc00f diff --git a/hooks/hooks.json b/hooks/hooks.json index 1201ad7..567694b 100644 --- a/hooks/hooks.json +++ b/hooks/hooks.json @@ -32,6 +32,16 @@ "timeout": 10 } ] + }, + { + "matcher": "Write|Edit|MultiEdit", + "hooks": [ + { + "type": "command", + "command": "\"${CLAUDE_PLUGIN_ROOT}/hooks/run-hook.cmd\" pretool-demo-fidelity-guard", + "timeout": 10 + } + ] } ], "PostToolUse": [ diff --git a/hooks/pretool-demo-fidelity-guard b/hooks/pretool-demo-fidelity-guard new file mode 100755 index 0000000..53c6126 --- /dev/null +++ b/hooks/pretool-demo-fidelity-guard @@ -0,0 +1,129 @@ +#!/usr/bin/env bash +# hooks/pretool-demo-fidelity-guard +# PreToolUse hook (advisory, NEVER blocks): when an agent is about to write a +# demonstration/example artifact, remind it that a demo MUST execute the real +# artifact — no reimplementation, no hard-coded output, no stubbing the thing +# being demonstrated. Substituting a *dependency* at a real interface seam is +# allowed only if disclosed. See skills/demonstration-fidelity/SKILL.md. +# +# This is a best-effort, filename-detectable nudge only. The load-bearing +# defense is the demonstration-fidelity skill (which covers inline / README / +# normally-named / cross-language demos) plus the verification-before-completion +# "demo/example works" claim-matrix row. This hook never blocks and never reads +# file contents. +# +# Detection is anchored to path SEGMENTS + basename suffix globs — never bare +# substrings — so `latest`/`contest`/`attestation`/`inspector`/`spectrum` +# demos are not wrongly excluded while `example_test.go`/`testdata/`/`vendor/` +# are. 
+# +# Global opt-out: set SUPERPOWERS_HOOKS_DISABLE=1 + +set -euo pipefail + +[ "${SUPERPOWERS_HOOKS_DISABLE:-}" = "1" ] && exit 0 + +# Require stdin (PreToolUse always sends a JSON payload). +[ -t 0 ] && exit 0 +command -v jq >/dev/null 2>&1 || exit 0 + +hook_input=$(cat || true) +[ -z "$hook_input" ] && exit 0 + +tool_name=$(printf '%s' "$hook_input" | jq -r '.tool_name // empty' 2>/dev/null || true) +case "$tool_name" in + Write|Edit|MultiEdit) ;; + *) exit 0 ;; +esac + +file_path=$(printf '%s' "$hook_input" | jq -r '.tool_input.file_path // empty' 2>/dev/null || true) +[ -z "$file_path" ] && exit 0 + +# Lowercase so Examples/, Demo*, Showcase match case-insensitively. +lc_path=$(printf '%s' "$file_path" | tr '[:upper:]' '[:lower:]') +base=${lc_path##*/} + +# Split into path segments. +IFS='/' read -r -a segs <<< "$lc_path" || true + +# ── Exclusion (segment-exact + basename suffix globs; never bare substrings) ─ +# "${segs[@]:-}" form so an (unreachable) empty array can't trip `set -u` under +# bash 3.2 (macOS system bash, which run-hook.cmd may exec). +for seg in "${segs[@]:-}"; do + case "$seg" in + test|tests|spec|specs|testdata|fixtures|vendor|node_modules|.git) exit 0 ;; + esac +done +case "$base" in + *_test.*|*.test.*|*.spec.*|*_spec.*) exit 0 ;; +esac + +# ── Trigger (segment-exact demos/examples, or basename prefix) ─────────────── +fire=0 +for seg in "${segs[@]:-}"; do + case "$seg" in + demos|examples) fire=1; break ;; + esac +done +if [ "$fire" -eq 0 ]; then + case "$base" in + demo*|example*|showcase*|quickstart*) fire=1 ;; + esac +fi +[ "$fire" -eq 0 ] && exit 0 + +# ── Session-scoped dedup ───────────────────────────────────────────────────── +# PreToolUse payloads carry transcript_path, NOT session_id (cf. +# hooks/pre-tool-scope-guard); derive the session key the same way. 
+transcript_path=$(printf '%s' "$hook_input" | jq -r '.transcript_path // empty' 2>/dev/null || true) +session_key="" +[ -n "$transcript_path" ] && session_key=$(basename "$transcript_path" 2>/dev/null || echo "") + +cwd_dir=$(printf '%s' "$hook_input" | jq -r '.cwd // empty' 2>/dev/null || true) +[ -z "$cwd_dir" ] && cwd_dir="${PWD}" + +dedup_key="" +if command -v sha256sum >/dev/null 2>&1; then + dedup_key=$(printf '%s' "${session_key}:${file_path}" | sha256sum 2>/dev/null | cut -d' ' -f1 || true) +elif command -v shasum >/dev/null 2>&1; then + dedup_key=$(printf '%s' "${session_key}:${file_path}" | shasum -a 256 2>/dev/null | cut -d' ' -f1 || true) +fi + +state_dir="${cwd_dir}/.claude/autodev-state" +state_file="${state_dir}/demo-fidelity-seen" + +if [ -n "$dedup_key" ]; then + # Already nudged this session for this path → stay silent. + # (grep used as an `if` condition; errexit does not trip on conditions, so it + # is NOT wrapped in `|| true` — wrapping it would force the condition true.) + if [ -f "$state_file" ] && grep -qxF "$dedup_key" "$state_file" 2>/dev/null; then + exit 0 + fi + # Record. Fail-OPEN: any state I/O failure must fall through to EMIT, never + # suppress. Guarded so `set -euo pipefail` cannot fail-CLOSED on an unwritable + # state dir, and so a failed `>>` redirection cannot leak to stderr (the + # group redirect applies before the inner append is attempted). + if mkdir -p "$state_dir" 2>/dev/null; then + { printf '%s\n' "$dedup_key" >> "$state_file"; } 2>/dev/null || true + fi +fi + +reminder=$(cat <<'REMINDER' + +You appear to be writing a demonstration/example artifact. A demo MUST execute the +real artifact and show its actual output. Do NOT reimplement the logic, hard-code +the output, or stub the thing being demonstrated. Substituting a *dependency* at a +real interface seam is allowed only if disclosed. See autodev:demonstration-fidelity. 
+ +REMINDER +) + +emit_additional_context() { + local event_name="$1" + local context="$2" + jq -n --arg event "$event_name" --arg context "$context" \ + '{hookSpecificOutput:{hookEventName:$event,additionalContext:$context}}' +} + +emit_additional_context "PreToolUse" "$reminder" +exit 0 diff --git a/skills/demonstration-fidelity/SKILL.md b/skills/demonstration-fidelity/SKILL.md new file mode 100644 index 0000000..dadeb08 --- /dev/null +++ b/skills/demonstration-fidelity/SKILL.md @@ -0,0 +1,92 @@ +--- +name: demonstration-fidelity +description: Use when creating a demo, example, quickstart, showcase, sample, or any artifact meant to prove an implementation works — before writing it. Triggers when about to "show it working", build a proof-of-concept, or generate sample output, especially under time pressure or when the real code is awkward to run. Catches fake demos that reimplement the logic, hard-code the output, or rewrite it in another language instead of executing the real artifact. +--- + +> Condensed format: load `autodev:condensed-pipeline-writing` to expand shorthand. + +# Demonstration Fidelity + +## Iron Law + +**A demonstration must execute the real artifact, and the output it shows must be produced by that execution.** + +A demo, example, quickstart, showcase, screenshot, or "here's it working" proof is a *claim that the code works*. If it doesn't run the real code, the claim is fabricated — however convincing the output looks. This operationalizes `autodev:verification-before-completion` for demo artifacts and is a sibling of `autodev:runtime-launch-validation`. + +## Forbidden — regardless of language + +- **Reimplementation** — re-coding the artifact's logic in the demo instead of calling it. +- **Hard-coded output** — hand-authoring "expected" output and presenting it as produced output. +- **Stubbing the artifact-under-demonstration** — wiring the demo to a fake *in place of the thing you are demonstrating*. 
+- **Detached prototype** — a parallel throwaway instead of the shipped entry point. + +These prove nothing. They are fake code. + +## Allowed — with disclosure + +Substituting a **dependency** at a **real interface seam** (data store, external service, clock) so the demo runs locally — **provided** the artifact's own code path runs unchanged (you stubbed a *dependency*, not the artifact) **and** you state it plainly ("data source is an in-memory fixture; the handler is the real one"). This is the `autodev:runtime-launch-validation` posture (ephemeral DB row; Fall-back section). Disclosed seam-substitution is honest; faking the artifact is not. + +## Fidelity, not language sameness + +Cross-language is **not** the crime. A real client in another language crossing a **real interface** into the running artifact — e.g. a Python client making real HTTP calls to a running Go service — is valid, as long as that crossing is exercised (no stub on either end of *that* boundary). The question is always **"did the real code run to produce this output?"** — never "same language?". + +## The 3-question fidelity test + +1. **Execution:** does the demo call/import/invoke the real artifact — not a copy of it? +2. **Provenance:** was every value shown produced by that run and captured — not typed by you? +3. **Seams:** if you substituted anything, was it a *dependency* (not the artifact), and did you disclose it? + +Any "no" → the demo is fake. Fix it before presenting. + +## Example — fake vs. faithful + +Artifact: Go `text.Dedupe(s string) string`. 
+ +**Fake** (different language, hard-coded — proves nothing): + +```python +# demo.py — DO NOT DO THIS +print("BEFORE:\n a\n a\n b") +print("AFTER:\n a\n b") # hand-typed; Dedupe never ran +``` + +**Faithful** (runs the real function, prints its real return value): + +```go +// demo/main.go +package main + +import ("fmt"; "example.com/app/text") + +func main() { + in := "a\n a\n b" + fmt.Printf("AFTER:\n%s\n", text.Dedupe(in)) // real output, captured by running it +} +``` + +If the module tooling is awkward, sidestep the *tooling* (throwaway module, ephemeral dependency) — never sidestep *execution*. + +## Rationalizations — STOP + +| Excuse | Reality | +|---|---| +| "Build/DB tooling is finicky — I'll just print the expected output." | Sidestep the tooling, not the execution. A throwaway module / in-memory dependency runs the real code; printed literals run nothing. | +| "A hard-coded demo looks identical on screen." | Looking identical is the trap. The value of a demo is that the real code produced it. | +| "Quicker to rewrite it in Python/bash for the demo." | Fine only if that script actually calls/crosses into the real artifact. A script printing literals is fake in any language. | +| "The real thing needs a DB/service I can't stand up." | Substitute the *dependency* at a real seam and disclose it; run the real artifact. Never fake the artifact. | +| "It's just for the meeting / illustrative." | A demo presented as proof is a claim — `autodev:verification-before-completion` applies. | +| "I simplified the logic for clarity." | A simplified reimplementation is a different program. Demo the real one. | + +## Red flags + +- The demo imports nothing from the module under demonstration. +- You typed or pasted the "output" instead of capturing a run. +- The demo is in another language and never crosses a real interface into the artifact. +- "Simulated" / "for demonstration purposes" / "pretend" appears in the demo. 
+- You have not actually run it and watched the output. + +## See also + +- `autodev:verification-before-completion` — evidence before any "works/done" claim (its claim matrix has a `demo/example works` row). +- `autodev:runtime-launch-validation` — launch the built artifact; its "Demonstration / example / showcase" change-class row points here. +- `autodev:scope-lock` — "there is no demo mode" for *partial scope* (distinct from fidelity). diff --git a/skills/finishing-a-development-branch/SKILL.md b/skills/finishing-a-development-branch/SKILL.md index b17e5cb..e055713 100644 --- a/skills/finishing-a-development-branch/SKILL.md +++ b/skills/finishing-a-development-branch/SKILL.md @@ -112,6 +112,8 @@ If NOT triggered (pure logic refactor, doc-only, test-only): skip this step. **The launch transcript is required in the PR body when this step triggers.** Without it, the PR is not ready for merge — even if all unit tests pass. +**Demonstration artifacts:** if the change ships any demo/example/showcase/quickstart artifact (in this diff or the PR body), `autodev:demonstration-fidelity` applies — confirm the demo executes the real artifact (no reimplementation, hard-coded output, or different-language fake) before merge. + ### Step 1c: Version-Skew Audit (conditional) **Trigger:** the diff updates a non-dev-only version pin (any "version: vX.Y.Z", "image: foo:vX.Y.Z", or `@vX.Y.Z`) — excludes dev-only tooling pins (linters, formatters) where skew is generally benign. 
diff --git a/skills/runtime-launch-validation/SKILL.md b/skills/runtime-launch-validation/SKILL.md index 497c429..3fc55e6 100644 --- a/skills/runtime-launch-validation/SKILL.md +++ b/skills/runtime-launch-validation/SKILL.md @@ -45,6 +45,9 @@ Triggered NOT by: | Library / SDK | Import into a tiny consumer program, exercise the new public surface | Output, behavior matches docs | | Plugin / extension | Load it into the host application, exercise a representative call | Host doesn't crash on load; representative call returns | | Interface boundary change (new method, field, event type, or hook — see `agents/boundary-classes.md` for the canonical boundary-class list) | Launch both sides/participants as applicable; exercise a real interaction across the boundary — not a mock or stub on either end | The receiving side correctly processes the new data/method/event/hook; no fallback silently swallows the new path; failure-signature scrape clean on all participating sides | +| Demonstration / example / showcase artifact (anything built to show a change working) | The real artifact, invoked through its real entry point; capture output from that run | Output is produced by the real code path, not literals; the artifact-under-demonstration is NOT stubbed; any substituted *dependency* sits behind a real interface seam and is disclosed. See `autodev:demonstration-fidelity`. | + +When a demonstration *also* exercises a new boundary, both this row and the "Interface boundary change" row apply: stub neither the artifact nor the boundary under test — only a disclosed *dependency* behind the artifact may be substituted. ## Failure-signature scrape @@ -95,6 +98,7 @@ The constraint is not an excuse to skip; it's a request for help. 
## See also - `skills/verification-before-completion/SKILL.md` — general evidence-before-assertion principle +- `autodev:demonstration-fidelity` — demo/example/showcase artifacts must execute the real artifact (the "Demonstration" change-class row above) - `skills/finishing-a-development-branch/SKILL.md` — Step 1b invokes this skill - `skills/writing-plans/SKILL.md` — related planning guidance for per-change-class verification - `agents/boundary-classes.md` — canonical definition of interface boundary classes (producer→consumer, caller→callee, sender→handler, plugin→host) diff --git a/skills/using-autodev/SKILL.md b/skills/using-autodev/SKILL.md index 9260ff3..4d151df 100644 --- a/skills/using-autodev/SKILL.md +++ b/skills/using-autodev/SKILL.md @@ -83,7 +83,7 @@ When multiple skills could apply, use this order: 3. **Pipeline skills auto-chain** — these invoke each other automatically in the autonomous pipeline: brainstorming → adversarial-design-review (design phase) → writing-plans → adversarial-design-review (plan phase) → alignment-check → **scope-lock** → subagent-driven-development → finishing-a-development-branch → pr-monitoring → post-merge-retrospective - Cross-cutting skills invoked from within the pipeline when conditions trigger: `project-design-guidance` (before designs/plans and during retros when durable guidance changes); `recording-decisions` (when designs/plans make non-trivial trade-offs, including user-approved manifest amendments); `scope-lock` (re-checked at every per-task checkpoint and before PR creation); `condensed-pipeline-writing` (for dense internal design/review/plan artifacts). 
+ Cross-cutting skills invoked from within the pipeline when conditions trigger: `project-design-guidance` (before designs/plans and during retros when durable guidance changes); `recording-decisions` (when designs/plans make non-trivial trade-offs, including user-approved manifest amendments); `scope-lock` (re-checked at every per-task checkpoint and before PR creation); `condensed-pipeline-writing` (for dense internal design/review/plan artifacts); `demonstration-fidelity` (before writing any demo/example/showcase/proof artifact — it must execute the real code, not fake it). "Let's build X" → brainstorming first, then the pipeline runs autonomously after design approval. "Fix this bug" → debugging first, then domain-specific skills. diff --git a/skills/verification-before-completion/SKILL.md b/skills/verification-before-completion/SKILL.md index 04c9a45..4931986 100644 --- a/skills/verification-before-completion/SKILL.md +++ b/skills/verification-before-completion/SKILL.md @@ -37,6 +37,7 @@ Skip step = unverified claim. | agent completed | inspect diff + verify | agent report | | requirements met | checklist vs plan/design | tests alone | | lint clean (Go-repo PR) | `golangci-lint run` exit 0 | tests green alone | +| demo/example works | the real artifact executed via the demo produced the shown output (see `autodev:demonstration-fidelity`) | hand-written/hard-coded output, a reimplementation, a different-language fake | ## Red Flags diff --git a/tests/cross-llm-coverage.md b/tests/cross-llm-coverage.md index e6fc73e..935c48e 100644 --- a/tests/cross-llm-coverage.md +++ b/tests/cross-llm-coverage.md @@ -24,6 +24,7 @@ host-neutral. Updated whenever a skill changes. 
| using-git-worktrees | host-neutral | host-neutral | host-neutral | host-neutral | already portable (Group I) | | using-autodev | host-neutral | host-neutral | host-neutral | host-neutral | host-access phrasing is prose-based ("In Claude Code: … In other environments: …"); no forbidden tokens | | verification-before-completion | host-neutral | host-neutral | host-neutral | host-neutral | already portable (Group I) | +| demonstration-fidelity | host-neutral | host-neutral | host-neutral | host-neutral | pure markdown; no host-specific tooling. Advisory backstop is the separate `hooks/pretool-demo-fidelity-guard` (Claude+Codex+Cursor via hooks.json) | | writing-plans | host-neutral | host-neutral | host-neutral | host-neutral | Plan Mode reference is prose-based ("If you are running in Claude Code…"); no `` blocks needed | | writing-skills | host-conditional | host-conditional | host-conditional | host-conditional | `TodoWrite` checklist and tier-brand names wrapped in `` blocks | diff --git a/tests/hook-contracts.sh b/tests/hook-contracts.sh index 0c1302c..1cf1b80 100755 --- a/tests/hook-contracts.sh +++ b/tests/hook-contracts.sh @@ -1518,6 +1518,117 @@ JSONL pass "skill-activation-audit: reads compact state rows" } +# ── pretool-demo-fidelity-guard (advisory, never blocks) ───────────────────── +demo_fidelity_payload() { + # $1 = file_path, $2 = transcript_path, $3 = cwd + printf '{"tool_name":"Write","tool_input":{"file_path":"%s"},"cwd":"%s","transcript_path":"%s"}' \ + "$1" "$3" "$2" +} + +test_demo_fidelity_fires_and_never_blocks() { + local tmp transcript output + tmp="$(mktemp -d)"; transcript="${tmp}/sessionA.jsonl"; : > "$transcript" + output="$(run_hook pretool-demo-fidelity-guard "$(demo_fidelity_payload "examples/foo-demo.py" "$transcript" "$tmp")")" + assert_hook_context_json "demo-fidelity:fires" "PreToolUse" "$output" + if printf '%s' "$output" | grep -q 'demonstration-fidelity'; then + pass "demo-fidelity: reminder references the skill" + else + 
fail "demo-fidelity: reminder must reference demonstration-fidelity: ${output}" + fi + if printf '%s' "$output" | jq -e 'has("decision")' >/dev/null 2>&1; then + fail "demo-fidelity: advisory hook must never emit decision/block: ${output}" + else + pass "demo-fidelity: never blocks (no decision key)" + fi + rm -rf "$tmp" +} + +test_demo_fidelity_fires_on_legit_demos() { + local tmp transcript output p + tmp="$(mktemp -d)" + # Capitalized + names containing test/spec as substrings (NOT segments) must still fire. + for p in "examples/latest-feature-demo.py" "examples/attestation-demo.go" "examples/Showcase.go" "demo_runner.go" "quickstart.md"; do + transcript="${tmp}/$(printf '%s' "$p" | tr '/.' '__').jsonl"; : > "$transcript" + output="$(run_hook pretool-demo-fidelity-guard "$(demo_fidelity_payload "$p" "$transcript" "$tmp")")" + if printf '%s' "$output" | jq -e '.hookSpecificOutput.additionalContext | length > 0' >/dev/null 2>&1; then + pass "demo-fidelity: fires on ${p}" + else + fail "demo-fidelity: must fire on legit demo ${p}: ${output}" + fi + done + rm -rf "$tmp" +} + +test_demo_fidelity_silent_on_excluded_and_nondemo() { + local tmp transcript output p + tmp="$(mktemp -d)"; transcript="${tmp}/s.jsonl"; : > "$transcript" + for p in "pkg/example_test.go" "testdata/example.json" "examples/testdata/demo.py" "internal/server.go" "config/sample_config.yaml" "vendor/example/demo.go" "app/spec/example_helper.rb" "examples/widget_spec.rb" "demo_service.spec.ts"; do + output="$(run_hook pretool-demo-fidelity-guard "$(demo_fidelity_payload "$p" "$transcript" "$tmp")")" + if [ -z "$output" ]; then + pass "demo-fidelity: silent on ${p}" + else + fail "demo-fidelity: must be silent on ${p}: ${output}" + fi + done + rm -rf "$tmp" +} + +test_demo_fidelity_silent_on_non_write_tool() { + local tmp transcript output + tmp="$(mktemp -d)"; transcript="${tmp}/s.jsonl"; : > "$transcript" + output="$(printf '{"tool_name":"Bash","tool_input":{"command":"echo hi > 
examples/foo-demo.py"},"cwd":"%s","transcript_path":"%s"}' "$tmp" "$transcript" | env LC_ALL=C LANG=C LC_CTYPE=C hooks/pretool-demo-fidelity-guard || true)" + if [ -z "$output" ]; then pass "demo-fidelity: silent on non-Write tool"; else fail "demo-fidelity: must ignore non-Write tools: ${output}"; fi + rm -rf "$tmp" +} + +test_demo_fidelity_respects_disable_env() { + local tmp transcript output + tmp="$(mktemp -d)"; transcript="${tmp}/s.jsonl"; : > "$transcript" + output="$(printf '{"tool_name":"Write","tool_input":{"file_path":"examples/foo-demo.py"},"cwd":"%s","transcript_path":"%s"}' "$tmp" "$transcript" | env SUPERPOWERS_HOOKS_DISABLE=1 LC_ALL=C LANG=C LC_CTYPE=C hooks/pretool-demo-fidelity-guard || true)" + if [ -z "$output" ]; then pass "demo-fidelity: respects SUPERPOWERS_HOOKS_DISABLE"; else fail "demo-fidelity: must be silent when disabled: ${output}"; fi + rm -rf "$tmp" +} + +test_demo_fidelity_handles_malformed_stdin() { + local output + output="$(printf '%s' 'not json {{{' | env LC_ALL=C LANG=C LC_CTYPE=C hooks/pretool-demo-fidelity-guard || true)" + if [ -z "$output" ]; then pass "demo-fidelity: silent + no crash on malformed stdin"; else fail "demo-fidelity: malformed stdin must not emit: ${output}"; fi + output="$(printf '%s' '' | env LC_ALL=C LANG=C LC_CTYPE=C hooks/pretool-demo-fidelity-guard || true)" + if [ -z "$output" ]; then pass "demo-fidelity: silent on empty stdin"; else fail "demo-fidelity: empty stdin must not emit: ${output}"; fi +} + +test_demo_fidelity_dedups_within_session() { + local tmp transcript out1 out2 + tmp="$(mktemp -d)"; transcript="${tmp}/sessionDedup.jsonl"; : > "$transcript" + out1="$(run_hook pretool-demo-fidelity-guard "$(demo_fidelity_payload "examples/foo-demo.py" "$transcript" "$tmp")")" + out2="$(run_hook pretool-demo-fidelity-guard "$(demo_fidelity_payload "examples/foo-demo.py" "$transcript" "$tmp")")" + if printf '%s' "$out1" | jq -e '.hookSpecificOutput.additionalContext | length > 0' >/dev/null 2>&1; then + 
pass "demo-fidelity: first write fires" + else + fail "demo-fidelity: first write must fire: ${out1}" + fi + if [ -z "$out2" ]; then + pass "demo-fidelity: dedups second write of same path in same session" + else + fail "demo-fidelity: second write of same path must be suppressed: ${out2}" + fi + rm -rf "$tmp" +} + +test_demo_fidelity_fail_open_when_state_unwritable() { + local tmp transcript output + tmp="$(mktemp -d)"; transcript="${tmp}/s.jsonl"; : > "$transcript" + # Make .claude a regular file so mkdir -p .claude/autodev-state cannot succeed. + printf '' > "${tmp}/.claude" + output="$(run_hook pretool-demo-fidelity-guard "$(demo_fidelity_payload "examples/foo-demo.py" "$transcript" "$tmp")")" + if printf '%s' "$output" | jq -e '.hookSpecificOutput.additionalContext | length > 0' >/dev/null 2>&1; then + pass "demo-fidelity: fail-open — fires when dedup state is unwritable" + else + fail "demo-fidelity: must fire (fail-open) when state unwritable: ${output}" + fi + rm -rf "$tmp" +} + require_jq test_session_start_json test_session_start_time_dedup_suppresses_rapid_refires @@ -1569,6 +1680,14 @@ test_e2e_abandon_then_no_nag test_e2e_fresh_session_no_claim_no_nag test_record_activity_compact_state test_skill_activation_audit_reads_compact_state +test_demo_fidelity_fires_and_never_blocks +test_demo_fidelity_fires_on_legit_demos +test_demo_fidelity_silent_on_excluded_and_nondemo +test_demo_fidelity_silent_on_non_write_tool +test_demo_fidelity_respects_disable_env +test_demo_fidelity_handles_malformed_stdin +test_demo_fidelity_dedups_within_session +test_demo_fidelity_fail_open_when_state_unwritable if [ "$failures" -ne 0 ]; then printf '\n%d hook contract test(s) failed.\n' "$failures" >&2