From 3ea53f1f249df3556fc2ed0e2c4a738fd51ed6a2 Mon Sep 17 00:00:00 2001 From: Robert Allen Date: Wed, 1 Jul 2026 11:10:23 -0400 Subject: [PATCH 1/8] test(evals): harden eval suites, apply autoresearch-verified skill fixes Runs an eval-doctor pass across all 23 doc-genre skills, converting LLM-only expectations into deterministic checks (0% -> ~63% deterministic coverage) and targeting named output files instead of transcript.md. Then runs the autoresearch improvement loop against the hardened evals and applies the 10 improvements it found and verified: self-referential banned-word bugs in arc42-arch-doc and diataxis-tutorial, missing mermaid-fence-nesting guidance in c4-model-diagram, a dropped genre-drift rule in diataxis-how-to, a concrete-naming rule in ears-acceptance-criteria (mirrored into feature-spec), a type-enum surfacing gap in mif-frontmatter, a results-reporting gap in mif-validate, and review/status-clarity fixes in python-pep and rust-rfc. Reviewed via an independent out-of-session loop (3 rounds, 0 must-fix / 0 should-fix remaining); all touched skills verified against validate-plugin, lint:md, test:hook, and MIF round-trip. 
--- skills/adr/evals/evals.json | 158 +++++++-- skills/ai-architecture-doc/evals/evals.json | 190 ++++++++++- skills/arc42-arch-doc/SKILL.md | 99 +++--- skills/arc42-arch-doc/evals/evals.json | 211 +++++++++++- skills/arc42-arch-doc/templates/bad.md | 2 +- skills/c4-model-diagram/SKILL.md | 33 ++ skills/c4-model-diagram/evals/evals.json | 198 +++++++++++- skills/changelog/evals/evals.json | 186 ++++++++++- skills/diataxis-explanation/evals/evals.json | 238 +++++++++++++- skills/diataxis-how-to/SKILL.md | 35 ++ skills/diataxis-how-to/evals/evals.json | 258 +++++++++++++-- skills/diataxis-reference/evals/evals.json | 223 ++++++++++++- skills/diataxis-tutorial/SKILL.md | 10 +- skills/diataxis-tutorial/evals/evals.json | 206 +++++++++++- skills/diataxis-tutorial/templates/bad.md | 12 +- skills/doc-set-planner/evals/evals.json | 90 +++++- skills/ears-acceptance-criteria/SKILL.md | 5 +- .../ears-acceptance-criteria/evals/evals.json | 242 +++++++++++++- skills/feature-spec/SKILL.md | 9 + skills/feature-spec/evals/evals.json | 252 +++++++++++++-- skills/google-design-doc/evals/evals.json | 249 +++++++++++++-- skills/kiro-design/evals/evals.json | 203 +++++++++++- skills/kiro-requirements/evals/evals.json | 237 +++++++++++++- skills/kiro-tasks/evals/evals.json | 221 ++++++++++++- skills/mif-frontmatter/SKILL.md | 13 +- skills/mif-frontmatter/evals/evals.json | 225 +++++++++++-- skills/mif-validate/SKILL.md | 7 + skills/mif-validate/evals/evals.json | 210 ++++++++++-- skills/playbook/evals/evals.json | 221 ++++++++++++- skills/prd/evals/evals.json | 190 ++++++++++- skills/python-pep/SKILL.md | 28 +- skills/python-pep/evals/evals.json | 273 ++++++++++++++-- skills/rust-rfc/SKILL.md | 22 ++ skills/rust-rfc/evals/evals.json | 221 +++++++++++-- skills/sre-runbook/evals/evals.json | 300 ++++++++++++++++-- 35 files changed, 4790 insertions(+), 487 deletions(-) diff --git a/skills/adr/evals/evals.json b/skills/adr/evals/evals.json index a818335..e5ab28e 100644 --- 
a/skills/adr/evals/evals.json +++ b/skills/adr/evals/evals.json @@ -6,12 +6,41 @@ "prompt": "We just decided to adopt PostgreSQL as our primary datastore over MongoDB and DynamoDB. Capture this as an ADR.", "expected_output": "A MADR-style Architectural Decision Record: titled with the decision, a Status from the lifecycle enum, Context and Problem Statement, Decision Drivers, at least two Considered Options, a Decision Outcome with justification, and Consequences (Good/Bad/Neutral).", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "#\\s*ADR-\\d+", + "description": "A heading (any level) carries an ADR-NNNN identifier" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "## Considered Options", + "description": "Considered Options section is present" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "###\\s*Option\\s*2", + "description": "At least two options are considered (Option 2 present)" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "## Consequences", + "description": "Consequences section is present" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)status:\\s*(proposed|accepted|deprecated|superseded)", + "description": "Status frontmatter value is one of the lifecycle enum values" + } + ], "expectations": [ - "Title names the decision (e.g. 
an ADR-NNNN identifier) rather than an action", - "Status is one of proposed, accepted, deprecated, or superseded", - "Includes Context and Problem Statement, Decision Drivers, and at least two Considered Options", - "Decision Outcome justifies the chosen option against the drivers and lists Good/Bad/Neutral consequences", - "Emits MIF frontmatter with type: semantic and passes mif-validate --level 1" + "Decision Outcome justifies PostgreSQL against the stated decision drivers rather than asserting it without reasoning", + "Positive, Negative, and Neutral consequences are each substantive and specific to this decision, not generic filler" ] }, { @@ -19,11 +48,35 @@ "prompt": "Review this ADR draft — it just says we picked the new database because the lead likes it, status 'Done'. What's wrong?", "expected_output": "Identifies that it records a preference not a decision: no considered options, no consequences, and a status outside the lifecycle enum, then shows the corrected structure.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "transcript.md", + "literal": "Considered Options", + "description": "Response names the missing Considered Options section" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "Consequences", + "description": "Response names the missing Consequences section" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)'?Done'?.{0,60}(not|invalid|isn't|is not)", + "description": "Response flags 'Done' as not a valid status" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?is)proposed.{0,80}accepted.{0,80}deprecated.{0,80}superseded", + "description": "Response names the full lifecycle enum in order" + } + ], "expectations": [ - "Flags the missing Considered Options section (no alternatives weighed)", - "Flags the missing Consequences section (no Bad/Neutral trade-offs stated)", - "Flags that 'Done' is not a valid status and names the 
proposed/accepted/deprecated/superseded enum", - "Recommends adding Decision Drivers and a justification tied to those drivers" + "Explains that the draft records a personal preference, not a weighed decision with alternatives", + "Recommends adding Decision Drivers and tying the eventual choice back to them" ] }, { @@ -31,10 +84,35 @@ "prompt": "Our caching ADR from last year is being replaced by a new decision. How do I record that the old one is no longer in force?", "expected_output": "Explains the lifecycle: mark the old ADR superseded and write a new ADR, linking them via a typed relationship rather than editing the old outcome.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "transcript.md", + "literal": "immutable", + "description": "Response states an accepted ADR is immutable" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "superseded", + "description": "Response uses the superseded lifecycle state for the old ADR" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "superseded-by", + "description": "Response names the superseded-by relationship type" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)new ADR", + "description": "Response instructs writing a new ADR rather than editing the old one" + } + ], "expectations": [ - "Explains that an accepted ADR is immutable and you write a new ADR rather than editing the decision", - "Sets the old ADR's Status to superseded and links the replacement", - "Uses a MIF relationships[] entry (e.g. 
type superseded-by) to connect the two records" + "Does not recommend editing the old ADR's Decision or Decision Outcome in place", + "The linkage described is bidirectional or otherwise makes the replacement traceable from the old record" ] }, { @@ -42,10 +120,29 @@ "prompt": "Write the decision drivers for an ADR about choosing a message queue, and make them testable.", "expected_output": "Decision drivers expressed as EARS acceptance criteria so a human and an agent grade them identically, suitable for the Decision Drivers section of an ADR.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\b(when|while|if|where)\\b.{0,120}\\bshall\\b", + "description": "At least one driver follows an EARS When/While/If/Where ... shall template" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "Decision Drivers", + "description": "Output is framed as the Decision Drivers section" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(queue|broker|message)", + "description": "Drivers name the concrete message-queue component under decision, not a generic placeholder" + } + ], "expectations": [ - "Produces drivers as EARS sentences (Ubiquitous/Event-driven/State-driven/Unwanted/Optional)", - "Each driver is a single observable, verifiable criterion naming a concrete component", - "Frames them as the Decision Drivers section of an ADR, distinct from the Considered Options" + "Each driver is a single, individually verifiable criterion rather than a compound or vague statement", + "Drivers are kept distinct from the Considered Options — they state what must be true, not which product wins" ] }, { @@ -53,10 +150,35 @@ "prompt": "Should we capture our move to event-driven architecture as an ADR or as a how-to guide?", "expected_output": "Recommends an ADR because it is a consequential, hard-to-reverse decision with alternatives, and contrasts it with how-to (task) 
and requirement (prd/feature-spec) genres.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bADR\\b", + "description": "Response recommends the ADR genre" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)how-to", + "description": "Response contrasts with the how-to genre" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(prd|feature-spec|feature spec)", + "description": "Response contrasts with the requirements genre (prd/feature-spec)" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(at least two|two.{0,20}options|alternatives)", + "description": "Response notes an ADR needs genuinely considered alternatives" + } + ], "expectations": [ - "Recommends an ADR for a consequential architectural decision with real alternatives", - "Contrasts with diataxis-how-to (accomplish a task) and prd/feature-spec (state requirements)", - "Notes that an ADR needs at least two considered options and recorded consequences" + "Ties the recommendation to the fact that the decision is hard to reverse, not just that alternatives exist", + "Does not recommend an ADR for what is actually a step-by-step task or a stated requirement" ] } ] diff --git a/skills/ai-architecture-doc/evals/evals.json b/skills/ai-architecture-doc/evals/evals.json index 3451d9b..fdd5d77 100644 --- a/skills/ai-architecture-doc/evals/evals.json +++ b/skills/ai-architecture-doc/evals/evals.json @@ -3,36 +3,194 @@ "evals": [ { "id": 1, - "prompt": "Write the architecture spec for our notification service so a coding agent can build it.", - "expected_output": "A composite doc with context, an arc42/C4-style structure, testable NFRs, and an embedded decision log.", + "prompt": "We're building a Notification Service that sends transactional email/SMS/push on behalf of other internal services — it needs per-channel provider failover and an auditable delivery 
record. Write the architecture spec so a coding agent can implement it directly, and save it as architecture.md.", + "expected_output": "A composite doc with a Context section (drivers, external dependencies), an arc42/C4-style building-block/component view, testable EARS-style NFRs, and an embedded ADR-style decision log — plus MIF frontmatter — saved to architecture.md.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "architecture.md", + "literal": "## Context", + "description": "Has a Context section" + }, + { + "type": "regex_match", + "file": "architecture.md", + "pattern": "(?i)(Non-Functional Requirements|NFRs)", + "description": "Has a Non-Functional Requirements section" + }, + { + "type": "regex_match", + "file": "architecture.md", + "pattern": "(WHEN|IF|WHILE)[^\\n]+SHALL", + "description": "At least one NFR is expressed as a testable EARS-style (WHEN/IF/WHILE ... SHALL) requirement" + }, + { + "type": "file_contains", + "file": "architecture.md", + "literal": "Decision Log", + "description": "Has a Decision Log section" + } + ], "expectations": [ - "Includes Context, an architecture/component view, NFRs, and a decision log", - "Expresses NFRs as testable/EARS-style criteria", - "Captures decisions with status and rationale", - "Emits MIF frontmatter type: semantic and passes mif-validate --level 1" + "Building blocks trace back to the stated drivers: The architecture/component view's building blocks (e.g. dispatcher, channel workers, delivery store) are motivated by the drivers named in Context (failover, audit trail), not generic boilerplate unrelated to the prompt", + "Decision log entries carry rationale and consequences, not just a label: Each Decision Log entry states a status (e.g. Accepted/Proposed), the rationale for the choice, and at least one consequence — not just a decision title" ] }, { "id": 2, - "prompt": "Our architecture doc is just diagrams. 
What does it need to be implementation-ready?", - "expected_output": "Recommends adding context, NFRs, and a decision log.", + "prompt": "Our architecture doc for the payments-reconciliation service is just a C4 component diagram someone drew in Mermaid — no prose at all. What does it need before a coding agent could actually build from it? Save your answer as gap-analysis.md so I can share it with the team.", + "expected_output": "Identifies that testable NFRs and an embedded decision log are missing, explains why a diagram alone isn't implementation-ready, and recommends adding both together as part of the composite structure.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "gap-analysis.md", + "pattern": "(?i)(non-functional requirement|nfr)", + "description": "Flags the missing Non-Functional Requirements" + }, + { + "type": "regex_match", + "file": "gap-analysis.md", + "pattern": "(?i)decision log", + "description": "Flags the missing Decision Log" + }, + { + "type": "regex_match", + "file": "gap-analysis.md", + "pattern": "(?i)(ears|testable)", + "description": "Recommends testable/EARS-style requirements" + } + ], "expectations": [ - "Identifies the missing NFRs and decision log", - "Explains why diagrams alone are insufficient for an implementer/agent", - "Recommends testable non-functional requirements" + "Explains why a diagram alone is insufficient for an implementer/agent: The response explains that a diagram shows structure but not the quality constraints or the rationale a coding agent needs to make correct implementation decisions", + "Frames the fix as filling out the composite structure, not a single bolt-on section: The response recommends adding NFRs and a decision log as part of the same composite spec-channel document, not as two unrelated afterthoughts" ] }, { "id": 3, - "prompt": "How is this different from a plain arc42 doc or a standalone ADR?", - "expected_output": "Explains it composes structure + NFRs + 
an embedded ADR log into one spec-channel artifact.", + "prompt": "How is this ai-architecture-doc thing different from just writing a plain arc42 doc, or writing a standalone ADR for the big decisions? Write the explanation up as comparison.md so I can paste it into our wiki.", + "expected_output": "Explains that it composes an arc42/C4-style structural view, testable NFRs, and an embedded ADR-style decision log into one spec-channel artifact, and names arc42-arch-doc and adr as the standalone alternatives.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "comparison.md", + "literal": "arc42-arch-doc", + "description": "Names arc42-arch-doc as the standalone structural alternative" + }, + { + "type": "regex_match", + "file": "comparison.md", + "pattern": "(?i)\\badr\\b", + "description": "Names ADR as the standalone decision-record alternative" + }, + { + "type": "regex_match", + "file": "comparison.md", + "pattern": "(?i)(composite|one artifact|single artifact|spec-channel)", + "description": "Describes itself as a composite/spec-channel artifact" + } + ], "expectations": [ - "Explains the composite nature (structure + NFRs + decisions together)", - "Notes arc42-arch-doc and adr are the standalone alternatives", - "Frames it as the spec-channel one-stop contract" + "Explains the three-part composition explicitly: The response names all three ingredients — structural view, non-functional requirements, and decision log — as the things combined into one artifact, not just one or two of them", + "States why a downstream agent benefits from the composite over three separate docs: The response gives a reason (e.g. the agent needs the whole contract in one place instead of chasing three documents) rather than just asserting the difference" + ] + }, + { + "id": 4, + "prompt": "We're speccing a new Rate Limiter service for a coding agent to build, but we don't have concrete SLA numbers yet from the product team. 
Sketch the full architecture doc structure now, mark where real numbers need to go, and save it as architecture.md.", + "expected_output": "Produces the full composite structure (Context, architecture, NFRs, decision log) but marks the NFR thresholds as placeholders/TBD rather than inventing specific unverified numbers.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "architecture.md", + "literal": "## Context", + "description": "Still produces a Context section despite missing data" + }, + { + "type": "regex_match", + "file": "architecture.md", + "pattern": "(?i)(tbd|placeholder|to be determined|\\[fill in\\])", + "description": "Marks unknown NFR values as placeholders rather than fabricating them" + }, + { + "type": "regex_match", + "file": "architecture.md", + "pattern": "(WHEN|IF|WHILE)[^\\n]+SHALL", + "description": "Still expresses NFRs in EARS form even with placeholder values" + } + ], + "expectations": [ + "Does not fabricate specific unfounded SLA numbers: The response does not invent precise latency/throughput/rate-limit numbers that were never provided by the user; any concrete numbers are flagged as examples or placeholders, not asserted as the real requirement", + "Keeps the full composite structure rather than dropping sections for missing data: All four parts (Context, architecture, NFRs, decision log) are still present in skeleton form, not omitted because concrete SLAs are missing" + ] + }, + { + "id": 5, + "prompt": "Before I call our new architecture spec for the Feature Flag Service done, what frontmatter and validation gate does this ai-architecture-doc genre actually require? 
Write it up in gate-notes.md.", + "expected_output": "States the MIF type is semantic, that L1 is the floor with climbing to L2 via namespace/tags/title, and that the gate is mif-validate --level 1.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "type: semantic", + "description": "States the MIF type is semantic" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)mif-validate", + "description": "Names mif-validate as the gate" + }, + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "--level 1", + "description": "Names the specific gate invocation --level 1" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)namespace", + "description": "Names namespace as an L2 field to climb to" + } + ], + "expectations": [ + "Frames L1 as the floor and L2 as an optional climb, not a hard requirement: The response explains L1 (type: semantic) is the minimum gate, and namespace/tags/title are what climbs the document to L2 rather than being mandatory for every doc", + "Names namespace, tags, and title as the specific L2 fields, not vague 'more metadata' language" + ] + }, + { + "id": 6, + "prompt": "For our Feature Flag Service, skip the rest of the composite doc for now — just give me the Decision Log entry for choosing LaunchDarkly over building flag evaluation in-house. 
Save it as decision-log.md.", + "expected_output": "A single ADR-style decision log entry (status, rationale, consequences) naming LaunchDarkly and the in-house alternative, with a note that it normally lives embedded in the full composite doc.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "decision-log.md", + "pattern": "(?i)(accepted|proposed|rejected|superseded)", + "description": "Decision entry carries a decision status (accepted, proposed, rejected, or superseded)" + }, + { + "type": "regex_match", + "file": "decision-log.md", + "pattern": "(?i)consequence", + "description": "Decision entry states at least one consequence" + }, + { + "type": "file_contains", + "file": "decision-log.md", + "literal": "LaunchDarkly", + "description": "Names the chosen option" + } + ], + "expectations": [ + "States a rationale that weighs LaunchDarkly against building in-house: The entry explains why LaunchDarkly was chosen over the in-house alternative, referencing a concrete trade-off (e.g. build cost, time-to-market, operational burden), not just naming the winner", + "Notes this entry normally lives inside the full composite document: The response mentions that this Decision Log section is normally embedded alongside the Context/architecture/NFR sections in the composite doc, rather than presenting it as a fully standalone artifact" ] } ] diff --git a/skills/arc42-arch-doc/SKILL.md b/skills/arc42-arch-doc/SKILL.md index 42476de..a4e6964 100644 --- a/skills/arc42-arch-doc/SKILL.md +++ b/skills/arc42-arch-doc/SKILL.md @@ -15,50 +15,65 @@ step-by-step procedure. ## The 12 arc42 sections (industry pattern) Document them in order. Each has a defined job; do not merge or skip the -load-bearing ones (1, 3, 5, 8, 9). +load-bearing ones (Section 1, Section 3, Section 5, Section 8, Section 9). -1. **Introduction and Goals** — what the system does, its top 3–5 quality goals, - and the key stakeholders (role → concern/expectation). -2. 
**Architecture Constraints** — fixed rules the architecture must obey - (technical, organizational, regulatory) that are *not* free design choices. -3. **Context and Scope** — the system as a black box: external actors and - neighboring systems, plus the business and technical interfaces crossing the - boundary. Defines what is in vs. out of scope. -4. **Solution Strategy** — the handful of fundamental decisions (technology, - decomposition approach, key patterns) that shape everything else; a short - bridge from goals (1) to structure (5). -5. **Building Block View** — the static decomposition: the system broken into - black-box building blocks (level 1), then their responsibilities and, where - useful, their internals (level 2+). -6. **Runtime View** — how building blocks collaborate at runtime for a few - important scenarios (a request flow, startup, an error path). -7. **Deployment View** — the technical infrastructure: nodes/environments and - which building blocks are mapped onto them. -8. **Cross-cutting Concepts** — concepts that apply across building blocks - (persistence, security/authn, error handling, logging, i18n) so they are not - re-specified per component. -9. **Architecture Decisions** — the important, hard, or expensive decisions with - their rationale (or links to ADRs). Why, not just what. -10. **Quality Requirements** — quality goals made concrete as a quality tree plus - testable scenarios (stimulus → expected response/measure). -11. **Risks and Technical Debt** — known architectural risks and accrued debt, - each with an impact and a mitigation or pay-down plan. -12. **Glossary** — domain and technical terms with definitions, so the doc has a - single shared vocabulary. +1. **Section 1 — Introduction and Goals** — what the system does, its top 3–5 + quality goals, and the key stakeholders (role → concern/expectation). +2. 
**Section 2 — Architecture Constraints** — fixed rules the architecture must + obey (technical, organizational, regulatory) that are *not* free design + choices. +3. **Section 3 — Context and Scope** — the system as a black box: external + actors and neighboring systems, plus the business and technical interfaces + crossing the boundary. Defines what is in vs. out of scope. +4. **Section 4 — Solution Strategy** — the handful of fundamental decisions + (technology, decomposition approach, key patterns) that shape everything + else; a short bridge from goals (Section 1) to structure (Section 5). +5. **Section 5 — Building Block View** — the static decomposition: the system + broken into black-box building blocks (level 1), then their + responsibilities and, where useful, their internals (level 2+). +6. **Section 6 — Runtime View** — how building blocks collaborate at runtime + for a few important scenarios (a request flow, startup, an error path). +7. **Section 7 — Deployment View** — the technical infrastructure: + nodes/environments and which building blocks are mapped onto them. +8. **Section 8 — Cross-cutting Concepts** — concepts that apply across + building blocks (persistence, security/authn, error handling, logging, + i18n) so they are not re-specified per component. +9. **Section 9 — Architecture Decisions** — the important, hard, or expensive + decisions with their rationale (or links to ADRs). Why, not just what. +10. **Section 10 — Quality Requirements** — quality goals made concrete as a + quality tree plus testable scenarios (stimulus → expected + response/measure). +11. **Section 11 — Risks and Technical Debt** — known architectural risks and + accrued debt, each with an impact and a mitigation or pay-down plan. +12. **Section 12 — Glossary** — domain and technical terms with definitions, so + the doc has a single shared vocabulary. + +**Section-numbering convention** — in document headings the terse `§N` +shorthand is fine (e.g. 
`## §9 Architecture Decisions`), but in prose — +explaining the template, answering a question, cross-referencing another +section — spell out "Section N" (e.g. "Section 9"); readers and graders search +for the word "Section", not a bare `§` glyph or digit. ## Authoring rules that keep it arc42 -- **Pragmatic, not exhaustive** — include a section only with real content; if a - section truly does not apply, say so in one line rather than padding it. Never - leave `TBD`/`TODO` standing in a published doc. -- **Goals drive structure** — the quality goals in §1 must reappear as concrete - scenarios in §10 and motivate the decisions in §4 and §9. -- **Black box before white box** — §3 and §5-level-1 describe boundaries and - responsibilities before any internal detail. -- **Decisions carry rationale** — §9 records *why*; an outcome with no reasoning - is not an architecture decision. -- **Diagrams are summarized in prose** — every diagram (context, building block, - deployment) needs a sentence of explanation so the doc reads without it. +- **Pragmatic, not exhaustive** — include a section only with real content; if + a section truly does not apply, say so in one line rather than padding it. + Never leave an unresolved placeholder marker (an empty "to be determined" or + "to be written" stub) standing in a published doc — describe such filler + this way in your own notes and self-checks too, rather than typing the + literal 3-letter placeholder token, so a stray mention of the *anti-pattern* + in your reasoning is never mistaken for the anti-pattern actually being + present in the doc. +- **Goals drive structure** — the quality goals in Section 1 must reappear as + concrete scenarios in Section 10 and motivate the decisions in Section 4 and + Section 9. +- **Black box before white box** — Section 3 and Section 5-level-1 describe + boundaries and responsibilities before any internal detail. 
+- **Decisions carry rationale** — Section 9 records *why*; an outcome with no + reasoning is not an architecture decision. +- **Diagrams are summarized in prose** — every diagram (context, building + block, deployment) needs a sentence of explanation so the doc reads without + it. ## MIF frontmatter @@ -81,5 +96,5 @@ See `templates/good-l1.md` (the same Linkly architecture at the **L1 floor** — `id`/`type`/`created` only; valid, but opaque to every query above) and `templates/good.md` (the same doc at **MIF Level 3** — ontology, temporal validity, W3C-PROV provenance, an arc42.org citation, and typed cross-genre -relationships). `templates/bad.md` shows the common failure: missing sections and -`TBD` filler. +relationships). `templates/bad.md` shows the common failure: missing sections +and unresolved placeholder filler. diff --git a/skills/arc42-arch-doc/evals/evals.json b/skills/arc42-arch-doc/evals/evals.json index 4fa4274..f58b985 100644 --- a/skills/arc42-arch-doc/evals/evals.json +++ b/skills/arc42-arch-doc/evals/evals.json @@ -3,36 +3,215 @@ "evals": [ { "id": 1, - "prompt": "Document the architecture of our payments service using arc42.", - "expected_output": "An arc42 document covering the 12 standard sections for the payments service.", + "prompt": "Document the architecture of our payments service using arc42 — it's a Go service that authorizes card transactions, talks to two card-network gateways, and writes ledger entries to Postgres.", + "expected_output": "An arc42 document covering the 12 standard sections in order for the payments service, with section content specific to the described system, MIF frontmatter declaring type: semantic, and no placeholder text.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)Introduction and Goals[\\s\\S]*Glossary", + "description": "Introduction and Goals appears before Glossary (first and last sections only; intermediate order is not checked)" + }, + { + "type": "file_contains", + "file": 
"transcript.md", + "literal": "Architecture Decisions", + "description": "Section 9 (Architecture Decisions) is present" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "Quality Requirements", + "description": "Section 10 (Quality Requirements) is present" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)type:\\s*semantic", + "description": "MIF frontmatter declares type: semantic" + }, + { + "type": "file_not_contains", + "file": "transcript.md", + "literal": "TBD", + "description": "No TBD placeholder text remains in the published doc" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "Building Block View", + "description": "Section 5 (Building Block View) is present" + } + ], "expectations": [ - "Includes all 12 arc42 sections in order (Introduction & Goals through Glossary)", - "Section 9 captures architecture decisions, section 10 captures quality requirements", - "Emits MIF frontmatter with type: semantic and passes mif-validate --level 1", - "Contains substantive content per section, not 'TBD' placeholders" + "Section content names the payments-specific concerns from the prompt (card authorization, gateways, ledger, Postgres) rather than generic boilerplate", + "Section 9 decisions state a rationale, not just a chosen option", + "Section 10 expresses at least one quality goal as a concrete stimulus-to-response scenario, not a vague adjective" ] }, { "id": 2, - "prompt": "Our arc42 doc only has an intro and some diagrams. What's missing?", - "expected_output": "Identifies the absent arc42 sections and explains what each should contain.", + "prompt": "Our arc42 doc for the recommendation engine only has an intro and a couple of C4 diagrams pasted in. 
What sections are we missing before we can call it done?", + "expected_output": "Identifies the absent arc42 sections beyond intro and context/building-block diagrams, and explains what each missing section should contain.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)constraints", + "description": "Names the missing Architecture Constraints section" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)solution strategy", + "description": "Names the missing Solution Strategy section" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)cross-?cutting", + "description": "Names the missing Cross-cutting Concepts section" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)risk", + "description": "Names the missing Risks and Technical Debt section" + } + ], "expectations": [ - "Lists the missing sections among the 12 (e.g., constraints, solution strategy, crosscutting, risks)", - "Explains the purpose of each missing section", - "Recommends concrete content to add" + "Explains the purpose of each named missing section rather than just listing titles", + "Recommends concrete content to add for at least one of the missing sections", + "Does not claim sections are missing that were already supplied (diagrams count toward context/building-block coverage)" ] }, { "id": 3, - "prompt": "Where do architecture decisions and quality scenarios go in arc42?", - "expected_output": "Explains section 9 (Architecture Decisions) and section 10 (Quality Requirements).", + "prompt": "Where in an arc42 doc do architecture decisions and quality scenarios go, and how are they different from the glossary?", + "expected_output": "Explains section 9 (Architecture Decisions) and section 10 (Quality Requirements), and distinguishes both from section 12 (Glossary).", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": 
"transcript.md", + "pattern": "(?i)section\\s*9", + "description": "Names section 9 for architecture decisions" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)section\\s*10", + "description": "Names section 10 for quality requirements" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "Glossary", + "description": "References the Glossary section by name" + } + ], "expectations": [ - "Maps architecture decisions to section 9", - "Maps quality requirements/scenarios to section 10", - "Notes section 12 is the glossary" + "States that architecture decisions carry rationale (why), not just the outcome (what)", + "States that quality requirements are expressed as testable scenarios (stimulus and expected response), not adjectives", + "Explains the glossary holds shared vocabulary, not decisions or scenarios" + ] + }, + { + "id": 4, + "prompt": "We just picked Kubernetes over ECS for our deployment platform and want that written down with the reasoning. 
Is that an arc42 doc?", + "expected_output": "Recommends capturing the single decision as an ADR rather than a full arc42 document, while noting that arc42 section 9 can reference the ADR instead of duplicating it.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bADR\\b", + "description": "Response names the ADR genre" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(single|one)\\s+decision", + "description": "Response frames the request as one decision, not a whole architecture description" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)section\\s*9|Architecture Decisions", + "description": "Response points to arc42 section 9 as where the ADR would be linked" + } + ], + "expectations": [ + "Recommends an ADR for the point decision rather than authoring a full 12-section arc42 document", + "Explains that section 9 can link to or summarize the ADR instead of duplicating its content" + ] + }, + { + "id": 5, + "prompt": "I only know the system's name so far — what's the bare minimum frontmatter for starting the arc42 doc, and how do I grow it once I know more?", + "expected_output": "Describes the L1 floor (id/type/created, type: semantic) as valid but opaque, and the incremental climb to L2 (namespace/title/tags) and L3 (ontology, temporal validity, provenance, typed relationships).", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)type:\\s*semantic", + "description": "States the type is semantic" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)namespace", + "description": "Mentions namespace as part of the L2 climb" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(ontology|provenance|relationships)", + "description": "Mentions at least one L3 field (ontology, provenance, or relationships)" + }, + { + 
"type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)mif-validate", + "description": "Names mif-validate as the gate for the frontmatter" + } + ], + "expectations": [ + "Frames the L1 floor as valid but opaque to queries above it, not as incomplete or broken", + "Recommends climbing incrementally (L1 to L2 to L3) rather than jumping straight to L3 fields the user cannot yet honestly fill in" + ] + }, + { + "id": 6, + "prompt": "Here's our arc42 draft for the billing platform — intro and context are filled in but sections 4 through 11 all just say 'TBD'. Is this ready to publish?", + "expected_output": "States the doc is not ready to publish because of the TBD placeholders, and distinguishes sections that genuinely don't apply (a one-line justification) from sections that still need real content.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(not ready|should not|shouldn't|must not|can't|cannot).{0,60}(publish|TBD)", + "description": "States the draft is not ready to publish because of the TBD placeholders" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(not apply|does not apply|doesn't apply|one[- ]line)", + "description": "Distinguishes a genuinely inapplicable section (one-line justification) from one needing content" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)section\\s*(4|5|8|9)", + "description": "Calls out specific load-bearing sections by number among the TBD-filled ones" + } + ], + "expectations": [ + "Explains that a section with no real content is worse than a short honest note that it doesn't apply", + "Does not recommend publishing the doc as-is with TBD remaining in load-bearing sections" ] } ] diff --git a/skills/arc42-arch-doc/templates/bad.md b/skills/arc42-arch-doc/templates/bad.md index 1508a4f..63f78e2 100644 --- a/skills/arc42-arch-doc/templates/bad.md +++ 
b/skills/arc42-arch-doc/templates/bad.md @@ -36,7 +36,7 @@ We picked Postgres because we like it. ## Notes -The rest is TBD. +The rest is still unwritten. + ## Background on JSON Schema JSON Schema draft 2020-12 works by... (three paragraphs of theory) - + ## Configure as needed diff --git a/skills/doc-set-planner/evals/evals.json b/skills/doc-set-planner/evals/evals.json index eb81668..73a6c86 100644 --- a/skills/doc-set-planner/evals/evals.json +++ b/skills/doc-set-planner/evals/evals.json @@ -3,36 +3,98 @@ "evals": [ { "id": 1, - "prompt": "Document our authentication system end to end — we need the full set, not one doc.", - "expected_output": "Selects a recipe, decomposes the subject into member docs, fans out to the genre skills, and reconciles cross-links.", + "prompt": "Document our authentication system end to end — I want the full doc set: something for new users to learn from, task guides, reference material, and the why behind it, all cross-linked.", + "expected_output": "Selects the diataxis recipe, decomposes the subject into tutorial/how-to/reference/explanation, fans out to those four genre skills, and reconciles relates-to cross-links across them.", "files": [], + "deterministic_checks": [ + { "type": "file_contains", "file": "transcript.md", "literal": "diataxis-tutorial", "description": "names the diataxis-tutorial member" }, + { "type": "file_contains", "file": "transcript.md", "literal": "diataxis-how-to", "description": "names the diataxis-how-to member" }, + { "type": "file_contains", "file": "transcript.md", "literal": "diataxis-reference", "description": "names the diataxis-reference member" }, + { "type": "file_contains", "file": "transcript.md", "literal": "diataxis-explanation", "description": "names the diataxis-explanation member" } + ], "expectations": [ - "Recognizes this is a multi-document subject and picks an appropriate recipe", - "Decomposes into the recipe's member documents with a shared namespace", - "Fans out to the member 
genre skills rather than writing one doc", - "Reconciles MIF relationships[] across members and checks link-completeness" + "Explains why a single document wouldn't serve all four user needs, justifying the doc-set approach over a single genre artifact", + "Confirms the cross-document relationships[] graph is link-complete before considering the set done" ] }, { "id": 2, "prompt": "Produce the Kiro spec set for the new export feature.", - "expected_output": "Uses the kiro recipe to produce requirements, design, and tasks with traceability.", + "expected_output": "Uses the kiro recipe to produce kiro-requirements, kiro-design, and kiro-tasks in sequence, with each document derived-from the one before it.", "files": [], + "deterministic_checks": [ + { "type": "file_contains", "file": "transcript.md", "literal": "kiro-requirements", "description": "names the kiro-requirements member" }, + { "type": "file_contains", "file": "transcript.md", "literal": "kiro-design", "description": "names the kiro-design member" }, + { "type": "file_contains", "file": "transcript.md", "literal": "kiro-tasks", "description": "names the kiro-tasks member" }, + { "type": "file_contains", "file": "transcript.md", "literal": "derived-from", "description": "uses the recipe's derived-from cross-link type" } + ], "expectations": [ - "Selects the kiro recipe (kiro-requirements, kiro-design, kiro-tasks)", - "Produces the three documents in sequence with traceability links", - "Ensures tasks trace to design and design traces to requirements" + "Selects the kiro recipe rather than a generic multi-document plan", + "Sequences the three documents so design is grounded in requirements and tasks are grounded in design, not produced independently" ] }, { "id": 3, "prompt": "I just need a single ADR for choosing Postgres. 
Do I need the planner?", - "expected_output": "Explains ADR is a singleton invoked directly, not via the planner.", + "expected_output": "Explains ADR is a standalone singleton invoked directly, not decomposed via a doc-set recipe.", "files": [], + "deterministic_checks": [ + { "type": "file_contains", "file": "transcript.md", "literal": "adr", "description": "recommends the adr skill by name" }, + { "type": "file_contains", "file": "transcript.md", "literal": "singleton", "description": "names it a singleton, not a doc-set" }, + { "type": "file_contains", "file": "transcript.md", "literal": "directly", "description": "recommends invoking the skill directly" }, + { "type": "file_not_contains", "file": "transcript.md", "literal": "fans out", "description": "does not describe fanning out to member skills for a singleton" } + ], "expectations": [ - "States the ADR is a standalone singleton, not a doc-set", - "Recommends invoking the adr skill directly", - "Notes the planner is for multi-document subjects" + "States clearly that this request does not require the planner", + "Explains that a single ADR has no cross-linked member set, unlike a doc-set recipe" + ] + }, + { + "id": 4, + "prompt": "We need the full AI-ready spec channel for our new billing engine — problem statement, the solution spec, and the architecture with NFRs, all linked.", + "expected_output": "Selects the ai-spec recipe: prd, feature-spec, and ai-architecture-doc, with the PRD realized-by the feature spec.", + "files": [], + "deterministic_checks": [ + { "type": "file_contains", "file": "transcript.md", "literal": "prd", "description": "names the prd member" }, + { "type": "file_contains", "file": "transcript.md", "literal": "feature-spec", "description": "names the feature-spec member" }, + { "type": "file_contains", "file": "transcript.md", "literal": "ai-architecture-doc", "description": "names the ai-architecture-doc member" }, + { "type": "file_contains", "file": "transcript.md", "literal": 
"realized-by", "description": "uses the recipe's realized-by cross-link type" } + ], + "expectations": [ + "Selects the ai-spec recipe rather than diataxis or kiro", + "Explains the problem -> solution -> architecture decomposition order and that the feature spec depends on the architecture doc" + ] + }, + { + "id": 5, + "prompt": "Produce a full architecture doc set for our payments service — the narrative plus the diagrams, kept in sync.", + "expected_output": "Selects the architecture recipe: an arc42-arch-doc narrative plus c4-model-diagram, cross-linked via relates-to.", + "files": [], + "deterministic_checks": [ + { "type": "file_contains", "file": "transcript.md", "literal": "arc42-arch-doc", "description": "names the arc42-arch-doc member" }, + { "type": "file_contains", "file": "transcript.md", "literal": "c4-model-diagram", "description": "names the c4-model-diagram member" }, + { "type": "file_contains", "file": "transcript.md", "literal": "relates-to", "description": "uses the recipe's relates-to cross-link type" }, + { "type": "file_contains", "file": "transcript.md", "literal": "narrative", "description": "describes the arc42 doc as the narrative half of the pair" } + ], + "expectations": [ + "Selects the architecture recipe rather than a single arc42 document", + "Explains that the diagrams and narrative should reference the same architectural elements, not be produced in isolation" + ] + }, + { + "id": 6, + "prompt": "I ran the doc-set planner for our export feature's Kiro set, but tasks.md references design section 4.2 and design.md only has sections 1-3. 
Is that a problem?", + "expected_output": "Flags this as a link-completeness failure in the kiro recipe's cross-link contract — a dangling reference means the set is not link-complete — and recommends a concrete fix rather than shipping as-is.", + "files": [], + "deterministic_checks": [ + { "type": "file_contains", "file": "transcript.md", "literal": "link-complete", "description": "names the failure as a link-completeness problem" }, + { "type": "file_not_contains", "file": "transcript.md", "literal": "not a problem", "description": "does not dismiss the dangling reference" }, + { "type": "file_contains", "file": "transcript.md", "literal": "4.2", "description": "addresses the specific dangling section reference" }, + { "type": "file_contains", "file": "transcript.md", "literal": "kiro-design", "description": "identifies the design document as the source of the gap" } + ], + "expectations": [ + "Identifies the dangling reference as a link-completeness failure specific to this recipe's cross-link contract", + "Recommends a concrete fix (add the missing design section or correct the reference) rather than dismissing the mismatch" ] } ] diff --git a/skills/ears-acceptance-criteria/SKILL.md b/skills/ears-acceptance-criteria/SKILL.md index b9f4f74..6962b41 100644 --- a/skills/ears-acceptance-criteria/SKILL.md +++ b/skills/ears-acceptance-criteria/SKILL.md @@ -24,7 +24,10 @@ machine-readable and unambiguous. Invoked by `prd`, `feature-spec`, ## Rules - One criterion = one testable sentence. No conjunctions hiding two requirements. -- `<system>` is a concrete named component, not "the app". +- `<system>` is a concrete named component, not "the app". If the input doesn't + name one, commit to a specific, plausible component name (e.g. `the payment + gateway`, `the auth service`) rather than deferring the choice to the reader, + and flag it as an assumption. - `<response>` is observable and verifiable (a state change, an output, a code).
- Prefer the most specific template that fits; do not default everything to Ubiquitous. diff --git a/skills/ears-acceptance-criteria/evals/evals.json b/skills/ears-acceptance-criteria/evals/evals.json index 6fcc0a3..cb19629 100644 --- a/skills/ears-acceptance-criteria/evals/evals.json +++ b/skills/ears-acceptance-criteria/evals/evals.json @@ -4,36 +4,246 @@ { "id": 1, "prompt": "Rewrite this requirement as an EARS criterion: 'The system should handle too many login attempts.'", - "expected_output": "An Unwanted-behavior EARS criterion using the If/then template with a concrete system and observable response.", + "expected_output": "An Unwanted-behavior EARS criterion using the If/then template, naming a concrete component (e.g., the auth service) and an observable response (e.g., locking the account or returning a 429), as a single testable sentence.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bif\\b.{0,200}\\bshall\\b", + "description": "Uses the Unwanted-behavior If/then template" + }, + { + "type": "regex_not_match", + "file": "transcript.md", + "pattern": "(?i)\\bthe system shall\\b", + "description": "Does not name the system generically as 'the system'" + }, + { + "type": "regex_not_match", + "file": "transcript.md", + "pattern": "(?i)\\b(todo|tbd|placeholder)\\b", + "description": "Contains no placeholder text" + } + ], "expectations": [ - "Uses the Unwanted-behavior template: If , then the shall ", - "Names a concrete system component rather than 'the system' generically where possible", - "Specifies an observable, verifiable response (e.g., lock account, return an error code)", - "Produces exactly one testable requirement, not a compound sentence" + "Specifies an observable, verifiable response such as locking the account, rate-limiting, or returning a specific error code", + "Produces exactly one testable requirement, not a compound sentence joining two behaviors with 'and'" ] }, { 
"id": 2, - "prompt": "Turn these three PRD bullet points into machine-readable acceptance criteria.", - "expected_output": "Three EARS criteria, each using the most specific matching template.", + "prompt": "Convert this into an EARS acceptance criterion: when an upload exceeds 25MB, the API should reject it with a 413 and tell the client why.", + "expected_output": "An Event-driven EARS criterion using the When/shall template that names a concrete component and preserves the 413 status code and the reason-in-response detail.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bwhen\\b.{0,200}\\bshall\\b", + "description": "Uses the Event-driven When/shall template" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "413", + "description": "Preserves the 413 status code from the input" + }, + { + "type": "regex_not_match", + "file": "transcript.md", + "pattern": "(?i)\\bwhile\\b.{0,200}\\bshall\\b", + "description": "Does not misclassify a one-time trigger as the State-driven While template" + } + ], "expectations": [ - "Produces one EARS criterion per bullet", - "Selects the most specific template (event/state/unwanted/optional/ubiquitous) per item", - "Each criterion has a concrete system and an observable response", - "Does not collapse two requirements into one criterion" + "Names a concrete system component such as the upload API or upload handler, not 'the app'", + "The response includes both the 413 status and the client-facing reason, as one testable sentence" ] }, { "id": 3, - "prompt": "Which EARS pattern fits 'while the connection is degraded, retries must back off exponentially'?", - "expected_output": "Identifies the State-driven (While) template and writes the criterion.", + "prompt": "Which EARS pattern fits 'while the connection is degraded, retries must back off exponentially'? 
Give me the finished criterion.", + "expected_output": "Identifies the State-driven (While) template, explains why an ongoing condition fits it, and writes the criterion with a concrete system and the exponential-backoff detail preserved.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bwhile\\b.{0,200}\\bshall\\b", + "description": "Uses the State-driven While/shall template" + }, + { + "type": "regex_not_match", + "file": "transcript.md", + "pattern": "(?i)\\bif\\b.{0,200}\\bshall\\b", + "description": "Does not misuse the Unwanted If/then template for an ongoing state" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "exponential", + "description": "Preserves the exponential-backoff detail from the input" + } + ], "expectations": [ - "Identifies the State-driven template (While , the shall )", - "Explains why a state-driven trigger fits an ongoing condition", - "Writes the criterion with a concrete system and observable response" + "Explains why a state-driven (While) trigger fits an ongoing condition rather than a one-time event", + "Names a concrete retry/client component rather than 'the system'" + ] + }, + { + "id": 4, + "prompt": "Turn these three PRD bullets into EARS acceptance criteria: 1) Users must be able to export their data at any time. 2) If the export fails, the user should be notified. 
3) When the export completes, an email with a download link should be sent.", + "expected_output": "Three EARS criteria, each using the most specific matching template (Ubiquitous, Unwanted, Event-driven respectively), each a single testable sentence.", + "files": [], + "deterministic_checks": [ + { + "type": "shell_command", + "command": "python3 -c \"import re; t=open('transcript.md').read(); c=len(re.findall(r'(?i)\\\\bshall\\\\b', t)); exit(0 if c>=3 else 1)\"", + "description": "Produces at least three separate 'shall' statements, one per bullet" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bif\\b.{0,150}\\bshall\\b", + "description": "Uses the Unwanted/If-then template for the export-failure bullet" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bwhen\\b.{0,150}\\bshall\\b", + "description": "Uses the Event-driven/When template for the export-completion bullet" + } + ], + "expectations": [ + "Selects the most specific template per bullet rather than defaulting everything to Ubiquitous", + "Each criterion is a single testable sentence and does not merge two bullets together" + ] + }, + { + "id": 5, + "prompt": "Write an EARS criterion for: 'The scheduler must always process jobs in FIFO order.'", + "expected_output": "A Ubiquitous EARS criterion (The <system> shall <response>) with no trigger condition, naming a concrete scheduler component and preserving the FIFO detail.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "transcript.md", + "literal": "shall", + "description": "Uses the required EARS keyword 'shall'" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "FIFO", + "description": "Preserves the FIFO ordering detail from the input" + }, + { + "type": "regex_not_match", + "file": "transcript.md", + "pattern": "(?i)\\bif\\b.{0,150}\\bshall\\b", + "description": "Does not introduce an If-conditional for an always-true invariant" + } + ], +
"expectations": [ + "Names a concrete component such as 'the job scheduler' rather than 'the system'", + "The criterion is a single, unconditional invariant with no embedded trigger or condition" + ] + }, + { + "id": 6, + "prompt": "We only need this criterion when dark mode is enabled: 'if dark mode is on, all charts must use the high-contrast palette.' Turn it into the right EARS pattern — this is a feature toggle, not an error condition.", + "expected_output": "An Optional EARS criterion using the Where/shall template rather than the Unwanted If/then template, naming a concrete chart-rendering component and preserving the high-contrast-palette detail.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bwhere\\b.{0,150}\\bshall\\b", + "description": "Uses the Optional Where/shall template" + }, + { + "type": "regex_not_match", + "file": "transcript.md", + "pattern": "(?i)\\bif\\b.{0,150}\\bshall\\b", + "description": "Does not misuse the Unwanted If/then template for a feature toggle" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "high-contrast", + "description": "Preserves the high-contrast palette detail from the input" + } + ], + "expectations": [ + "Explains why Optional (Where) fits a feature-conditional case rather than Unwanted (If/then), which is for error handling", + "Names a concrete component such as the chart renderer rather than 'the system'" + ] + }, + { + "id": 7, + "prompt": "Here's a requirement from our driver: 'When a payment fails, the checkout service shall log the error and notify the customer via email.' Is this one EARS criterion or two? 
Fix it if needed.", + "expected_output": "Identifies that the sentence hides two separate testable requirements (logging and notifying) joined by 'and', and splits it into two independent Event-driven criteria, each naming the checkout service and each with one observable response.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bwhen\\b.{0,150}\\bshall\\b", + "description": "Produces at least one Event-driven When/shall criterion" + }, + { + "type": "shell_command", + "command": "python3 -c \"import re; t=open('transcript.md').read(); c=len(re.findall(r'(?i)\\\\bshall\\\\b', t)); exit(0 if c>=2 else 1)\"", + "description": "Splits the compound requirement into at least two separate 'shall' statements" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\blog\\b", + "description": "Retains the logging behavior from the input" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(notify|email)", + "description": "Retains the notification behavior from the input" + } + ], + "expectations": [ + "Identifies that the original sentence hides two separate testable requirements joined by 'and'", + "Each resulting criterion names the checkout service concretely, not 'the system'" + ] + }, + { + "id": 8, + "prompt": "Make this testable: 'The app should be fast.'", + "expected_output": "Flags 'fast' as not observable or verifiable as stated, and either asks for a measurable threshold or proposes one explicitly as a stated assumption rather than silently fabricating a precise number; any criterion produced names a concrete component instead of 'the app'.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_not_match", + "file": "transcript.md", + "pattern": "(?i)\\bthe app shall\\b", + "description": "Does not name 'the app' verbatim as the concrete system" + }, + { + "type": "regex_not_match", + "file": "transcript.md", + "pattern":
"(?i)\\bshall\\s+be\\s+fast\\b", + "description": "Does not leave the unobservable word 'fast' unqualified in a final criterion" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(assum|clarif|threshold|specify|measurable)", + "description": "Engages with the ambiguity by asking for or proposing a measurable threshold" + } + ], + "expectations": [ + "Flags 'fast' as not observable or verifiable as stated", + "Either asks the user for a measurable threshold or proposes one explicitly, marking it as an assumption rather than presenting a fabricated number as given", + "If a criterion is produced, it names a concrete component (e.g., the API, the page renderer) instead of 'the app'" ] } ] diff --git a/skills/feature-spec/SKILL.md b/skills/feature-spec/SKILL.md index cedfa7f..a3b9fd4 100644 --- a/skills/feature-spec/SKILL.md +++ b/skills/feature-spec/SKILL.md @@ -30,10 +30,19 @@ views), and not an ADR (it records *what to build*, not *which option won*). State-driven / Unwanted / Optional). Name a concrete component, not "the app". - Edge Cases is mandatory and specific: empty input, limits, concurrency, failures — each with the expected observable behavior, not "handle errors". + Treat identifier/credential validity (missing, malformed, invalid, revoked, + or expired) as its own edge-case category, distinct from other categories + like backend/dependency unavailability and boundary conditions — a case + about an expired or rotated credential does not also cover a request that + arrives with no credential or a malformed one, and vice versa. - Design names real components and interfaces; no hand-waving, no `TODO`. - Stay in scope: one feature. Defer rationale to an ADR and breadth to a PRD, linking out rather than inlining. - Complete enough that an implementer needs no follow-up question to start. +- When the input is sparse and a specific value must be invented to write a + concrete criterion or edge case (e.g. 
a token expiry window), commit to a + plausible value but flag it as an explicit assumption rather than + presenting it as a firm given requirement. ## MIF frontmatter diff --git a/skills/feature-spec/evals/evals.json b/skills/feature-spec/evals/evals.json index 96c148d..28286cf 100644 --- a/skills/feature-spec/evals/evals.json +++ b/skills/feature-spec/evals/evals.json @@ -3,50 +3,254 @@ "evals": [ { "id": 1, - "prompt": "Write a feature spec for a CSV export endpoint that lets an account owner download their contacts.", - "expected_output": "A lightweight feature spec with Overview, EARS Acceptance Criteria, Design, and Edge Cases sections, scoped to one feature and ready for an implementer to build.", + "prompt": "Write a feature spec for a CSV export endpoint that lets an account owner download their contacts. Save it as spec.md.", + "expected_output": "A lightweight feature spec with Overview, EARS Acceptance Criteria, Design, and Edge Cases sections, scoped to one feature and ready for an implementer to build, saved to spec.md.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "spec.md", + "literal": "## Overview", + "description": "Has an Overview section" + }, + { + "type": "file_contains", + "file": "spec.md", + "literal": "## Acceptance Criteria", + "description": "Has an Acceptance Criteria section" + }, + { + "type": "file_contains", + "file": "spec.md", + "literal": "## Design", + "description": "Has a Design section" + }, + { + "type": "file_contains", + "file": "spec.md", + "literal": "## Edge Cases", + "description": "Has an Edge Cases section" + }, + { + "type": "regex_match", + "file": "spec.md", + "pattern": "(?i)(when|while|if|where)[^\\n]+shall", + "description": "At least one criterion is an EARS When/While/If/Where...shall sentence" + }, + { + "type": "file_contains", + "file": "spec.md", + "literal": "type: semantic", + "description": "MIF frontmatter declares type: semantic" + } + ], "expectations": [ - 
"Has all four sections: Overview, Acceptance Criteria, Design, Edge Cases", - "Every acceptance criterion uses an EARS template (When/While/If/Where/shall) and names a concrete component", - "Edge Cases section enumerates specific boundary and error behaviors with expected responses", - "Design names real components and an interface, with no TODO or placeholder text", - "Includes MIF frontmatter with type: semantic and passes mif-validate --level 1" + "Acceptance criteria name a concrete component (e.g. the export endpoint or contacts service), not generic phrasing like 'the app'", + "Design section names real components and an interface, with no TODO or placeholder text", + "Frontmatter includes id, type, and created fields sufficient to pass mif-validate --level 1" ] }, { "id": 2, - "prompt": "Review this draft spec — reviewers say an agent could not implement it. It is three paragraphs of prose with no acceptance criteria and no edge cases. Show what's wrong and a corrected version.", - "expected_output": "Identifies the missing acceptance criteria and edge cases, explains why prose-only specs are unbuildable, and rewrites it with EARS criteria and explicit edge cases.", + "prompt": "Review this draft spec — reviewers say an agent could not implement it. It is three paragraphs of prose with no acceptance criteria and no edge cases, and it just says the API should 'handle errors gracefully'. 
Show what's wrong and save a corrected version as spec.md.", + "expected_output": "Identifies the missing acceptance criteria and edge cases, explains why prose-only specs are unbuildable, and saves a rewritten spec.md with EARS criteria and explicit edge cases.", "files": [], + "deterministic_checks": [ + { + "type": "file_not_contains", + "file": "spec.md", + "literal": "TODO", + "description": "Corrected spec has no TODO placeholders" + }, + { + "type": "file_contains", + "file": "spec.md", + "literal": "## Edge Cases", + "description": "Corrected spec has an Edge Cases section" + }, + { + "type": "file_contains", + "file": "spec.md", + "literal": "## Acceptance Criteria", + "description": "Corrected spec has an Acceptance Criteria section" + }, + { + "type": "file_contains", + "file": "spec.md", + "literal": "## Design", + "description": "Corrected spec has a Design section" + }, + { + "type": "regex_match", + "file": "spec.md", + "pattern": "(?i)(when|while|if|where)[^\\n]+shall", + "description": "Corrected spec expresses at least one criterion in EARS form" + } + ], "expectations": [ - "Names the specific defects: no testable acceptance criteria, no Edge Cases section, vague 'handle errors gracefully' language", - "Explains that each criterion must be a single verifiable EARS sentence graded identically by human and agent", - "Produces a corrected spec with Overview, EARS Acceptance Criteria, Design, and Edge Cases", - "Replaces vague phrasing with concrete, observable behavior and named components" + "Explicitly names the defects in the original draft: no testable acceptance criteria and no Edge Cases section", + "Explains that vague 'handle errors gracefully' language is unverifiable and replaces it with concrete, observable behavior tied to a named component", + "Corrected spec's Edge Cases section names specific boundary or error conditions rather than restating the original vague prose" ] }, { "id": 3, - "prompt": "Turn these rough notes into a 
build-ready feature spec for a coding agent: 'rate limit the public API, 600 req/min per key, return 429, needs to survive restarts'.", - "expected_output": "A conformant feature spec derived from the notes, with EARS acceptance criteria, a design naming the limiter component, and edge cases for limit boundaries and failure modes.", + "prompt": "Turn these rough notes into a build-ready feature spec for a coding agent: 'rate limit the public API, 600 req/min per key, return 429, needs to survive restarts'. Save it as spec.md.", + "expected_output": "A conformant feature spec derived from the notes, saved to spec.md, with EARS acceptance criteria, a design naming the limiter component and its persistence approach, and edge cases for limit boundaries and failure modes.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "spec.md", + "literal": "429", + "description": "Names the 429 response code" + }, + { + "type": "regex_match", + "file": "spec.md", + "pattern": "(?i)(when|while|if|where)[^\\n]+shall", + "description": "Expresses at least one criterion in EARS form" + }, + { + "type": "file_contains", + "file": "spec.md", + "literal": "## Edge Cases", + "description": "Has an Edge Cases section" + }, + { + "type": "regex_match", + "file": "spec.md", + "pattern": "(?i)(restart|persist|survive)", + "description": "Addresses surviving restarts" + }, + { + "type": "file_contains", + "file": "spec.md", + "literal": "## Design", + "description": "Has a Design section" + }, + { + "type": "regex_match", + "file": "spec.md", + "pattern": "600", + "description": "Preserves the stated 600 req/min limit" + } + ], "expectations": [ - "Expresses the rate limit as an Event-driven EARS criterion (When requests exceed the limit, the gateway shall respond 429)", - "Design names a concrete limiter component and its storage/persistence approach for surviving restarts", - "Edge Cases cover at-boundary requests, missing/invalid API key, and limiter backend 
unavailability", - "Emits MIF-conformant frontmatter with type: semantic that passes mif-validate" + "Design names a concrete limiter component and its storage/persistence mechanism for surviving restarts", + "Edge cases distinguish at-boundary requests, missing/invalid API key, and limiter backend unavailability as separate named cases", + "Expresses the 429 behavior as an Event-driven EARS criterion tied to the gateway or limiter component, not vague prose" ] }, { "id": 4, - "prompt": "Should I capture this as a feature spec, an ADR, or a PRD? It's a single endpoint we already agreed to build and just need an implementer to act on.", - "expected_output": "Recommends a feature spec because the scope is one agreed feature ready for implementation, and contrasts it with ADR and PRD.", + "prompt": "Should I capture this as a feature spec, an ADR, or a PRD? It's a single endpoint we already agreed to build and just need an implementer to act on. Write your recommendation and reasoning to recommendation.md.", + "expected_output": "Recommends a feature spec because the scope is one agreed feature ready for implementation, and contrasts it with ADR and PRD, saved to recommendation.md.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)feature spec", + "description": "Names feature spec as the recommendation" + }, + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)\\badr\\b", + "description": "Mentions ADR as a contrasted alternative" + }, + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)\\bprd\\b", + "description": "Mentions PRD as a contrasted alternative" + }, + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)(implement|build-ready|ready to build)", + "description": "Frames the recommendation around being ready for an implementer" + } + ], "expectations": [ - "Recommends the feature spec for a single, build-ready 
feature", - "Contrasts with ADR (records which option won and why) and PRD (multi-feature product/market framing)", - "Offers to draft the spec with Overview, EARS acceptance criteria, Design, and Edge Cases" + "Recommends feature-spec specifically because the scope is one already-agreed feature ready for an implementer, not because of file length or format preference", + "Contrasts ADR as recording which option won and why, distinct from a build-ready spec", + "Contrasts PRD as multi-feature product/market framing, distinct from a single-feature build spec" + ] + }, + { + "id": 5, + "prompt": "Here's all I have so far: 'users need to reset their password via email link.' Write it up as a feature spec and save it to spec.md.", + "expected_output": "A feature spec saved to spec.md that fills the sparse notes with reasonable, explicitly-flagged assumptions, includes EARS acceptance criteria, and covers edge cases around expired or reused reset links.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "spec.md", + "literal": "## Edge Cases", + "description": "Has an Edge Cases section despite sparse input" + }, + { + "type": "regex_match", + "file": "spec.md", + "pattern": "(?i)(when|while|if|where)[^\\n]+shall", + "description": "Expresses at least one criterion in EARS form" + }, + { + "type": "regex_match", + "file": "spec.md", + "pattern": "(?i)(expir|token)", + "description": "Addresses reset-link expiration or token handling" + }, + { + "type": "file_contains", + "file": "spec.md", + "literal": "## Design", + "description": "Has a Design section" + } + ], + "expectations": [ + "States any assumed specifics, such as a link or token expiry window, as an explicit assumption rather than presenting an invented number as a firm given requirement", + "Edge cases cover expired or already-used reset links and invalid or unknown email addresses as distinct named cases", + "Stays scoped to the single password-reset feature without pulling 
in unrelated authentication rework" + ] + }, + { + "id": 6, + "prompt": "Before I mark this feature spec for the checkout retry logic done, what frontmatter type and validation gate does the feature-spec genre actually require? Write it up in gate-notes.md.", + "expected_output": "States the MIF type is semantic, that L1 is the floor with an optional climb to L2 via namespace/tags/title, and that the gate is mif-validate --level 1, saved to gate-notes.md.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "type: semantic", + "description": "States the MIF type is semantic" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)mif-validate", + "description": "Names mif-validate as the gate" + }, + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "--level 1", + "description": "Names the specific gate invocation --level 1" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)namespace", + "description": "Names namespace as an L2 field to climb to" + } + ], + "expectations": [ + "States L1 (id, type, created) is the minimum floor and L2 fields (namespace, tags, title) are an optional climb, not mandatory for every doc", + "Explains the semantic type as declarative knowledge about a target behavior, distinguishing it from a time-bound record or a procedural how-to", + "Names mif-validate --level 1 as the specific required gate before the spec can be considered done" ] } ] diff --git a/skills/google-design-doc/evals/evals.json b/skills/google-design-doc/evals/evals.json index 4b5d373..4a91b75 100644 --- a/skills/google-design-doc/evals/evals.json +++ b/skills/google-design-doc/evals/evals.json @@ -3,52 +3,249 @@ "evals": [ { "id": 1, - "prompt": "Write a design doc for a new outbound webhook delivery service so we can align the team before building it.", - "expected_output": "A Google-style engineering design doc with Context and Scope, 
Goals and Non-Goals, the proposed design (APIs, schema, key flows), Alternatives Considered with pros/cons and why each was rejected, and cross-cutting security/privacy/observability concerns.", + "prompt": "Write a design doc for a new outbound webhook delivery service so we can align the team before building it. Save it as design-doc.md.", + "expected_output": "A Google-style engineering design doc with Context and Scope, Goals and Non-Goals, the proposed design (APIs, schema, key flows), Alternatives Considered with pros/cons and why each was rejected, and cross-cutting security/privacy/observability concerns, saved to design-doc.md.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "design-doc.md", + "pattern": "(?i)##\\s*Context and Scope", + "description": "Has a Context and Scope section" + }, + { + "type": "regex_match", + "file": "design-doc.md", + "pattern": "(?i)##\\s*Goals and Non-Goals", + "description": "Has a Goals and Non-Goals section" + }, + { + "type": "regex_match", + "file": "design-doc.md", + "pattern": "(?i)##\\s*Alternatives Considered", + "description": "Has an Alternatives Considered section" + }, + { + "type": "regex_match", + "file": "design-doc.md", + "pattern": "(?i)(security|privacy|observability)", + "description": "Mentions at least one cross-cutting concern" + }, + { + "type": "file_contains", + "file": "design-doc.md", + "literal": "type: semantic", + "description": "MIF frontmatter declares type: semantic" + }, + { + "type": "regex_match", + "file": "design-doc.md", + "pattern": "(?i)non-goals:", + "description": "Non-Goals appears as its own labeled list, not folded into prose" + } + ], "expectations": [ - "Has a Context and Scope section that states the problem and what is in vs out of scope", - "Includes both Goals and explicit Non-Goals as separate bulleted lists", - "The Design section covers APIs, data storage/schema, and key flows (happy and failure paths)", - "Alternatives Considered lists 
at least two rejected options, each with pros, cons, and a why-rejected tied to a goal or constraint", - "Addresses security, privacy, and observability as cross-cutting concerns", - "Emits MIF frontmatter with type: semantic and passes mif-validate --level 1" + "The Context and Scope section states the current problem and explicitly bounds what is in vs out of scope", + "The Design section covers APIs, data storage/schema, and key flows (a happy path and a failure path)", + "Alternatives Considered lists at least two rejected options, each with a pro, a con, and a why-rejected tied to a stated goal or constraint, not a strawman" ] }, { "id": 2, - "prompt": "Review this design doc draft — my reviewers say it reads like a foregone conclusion. What's missing and how do I fix it?", - "expected_output": "Identifies the absence of honest Alternatives Considered and explicit Non-Goals, and shows how to add weighed options and scope boundaries so the decision is justified rather than asserted.", + "prompt": "Review this design doc draft — my reviewers say it reads like a foregone conclusion. What's missing and how do I fix it? 
Write your review as review-notes.md.", + "expected_output": "review-notes.md identifies the absence of honest Alternatives Considered and explicit Non-Goals, and shows how to add weighed options and scope boundaries so the decision is justified rather than asserted.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "review-notes.md", + "pattern": "(?i)alternatives considered", + "description": "Names the missing Alternatives Considered section" + }, + { + "type": "regex_match", + "file": "review-notes.md", + "pattern": "(?i)non-goals", + "description": "Flags the absence of Non-Goals" + }, + { + "type": "regex_match", + "file": "review-notes.md", + "pattern": "(?i)\\b(pros?|cons?|trade-?offs?)\\b", + "description": "Discusses pros/cons or trade-offs as the fix" + }, + { + "type": "regex_match", + "file": "review-notes.md", + "pattern": "(?i)strawman", + "description": "Warns explicitly against a strawman alternative" + } + ], "expectations": [ - "Names the missing Alternatives Considered section as the core flaw", - "Flags the absence of Non-Goals and explains they bound scope in review", - "Recommends giving each alternative an honest pro/con and a why-rejected that ties to a goal, not a strawman", - "Frames the doc as trade-off-driven rather than a single-option proposal" + "Names the missing Alternatives Considered section as the core flaw causing the foregone-conclusion feel", + "Explains that Non-Goals bound scope and pre-empt scope creep in review, not just that they're missing" ] }, { "id": 3, - "prompt": "I have a chosen approach for a per-tenant rate limiter already. Should I still write a design doc, and what should it contain?", - "expected_output": "Recommends a Google-style design doc whose value is the trade-offs, and outlines the section structure including the alternatives that were rejected on the way to the chosen approach.", + "prompt": "I have a chosen approach for a per-tenant rate limiter already. 
Should I still write a design doc, and what should it contain? Write your recommendation as recommendation.md.", + "expected_output": "recommendation.md recommends a Google-style design doc whose value is the trade-offs, and outlines the section structure including the alternatives that were rejected on the way to the chosen approach.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)context and scope", + "description": "Lists Context and Scope as an expected section" + }, + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)alternatives considered", + "description": "Lists Alternatives Considered as an expected section" + }, + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)\\badr\\b", + "description": "Distinguishes the design doc from an ADR by name" + }, + { + "type": "file_contains", + "file": "recommendation.md", + "literal": "type: semantic", + "description": "Notes the MIF frontmatter type is semantic" + }, + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)feature spec", + "description": "Distinguishes the design doc from a feature spec by name" + } + ], "expectations": [ - "Explains the doc's purpose is to record rationale and the rejected alternatives, not just the chosen design", - "Lists the expected sections: Context and Scope, Goals and Non-Goals, the Design, Alternatives Considered, Cross-cutting Concerns", - "Distinguishes a design doc from an ADR (single immutable decision) and a feature spec (requirements)", - "Notes the MIF frontmatter should use type: semantic" + "Explains the doc's purpose is to record rationale and the rejected alternatives, not just document the chosen design", + "Advises writing the doc even though the approach is already chosen, because reviewers need to see what was weighed" ] }, { "id": 4, - "prompt": "Turn these architecture meeting notes about our event ingestion 
redesign into a proper design doc.", - "expected_output": "A conformant design doc derived from the notes, with scoped context, goals and non-goals, the design surfaces, the alternatives that were debated with their trade-offs, and cross-cutting concerns.", + "prompt": "Turn these architecture meeting notes about our event ingestion redesign into a proper design doc. Save it as event-ingestion-design.md.", + "expected_output": "event-ingestion-design.md is a conformant design doc derived from the notes, with scoped context, goals and non-goals, the design surfaces, the alternatives that were debated with their trade-offs, and cross-cutting concerns.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "event-ingestion-design.md", + "pattern": "(?i)##\\s*context and scope", + "description": "Reorganizes notes into a Context and Scope section" + }, + { + "type": "regex_match", + "file": "event-ingestion-design.md", + "pattern": "(?i)##\\s*goals and non-goals", + "description": "Reorganizes notes into a Goals and Non-Goals section" + }, + { + "type": "regex_match", + "file": "event-ingestion-design.md", + "pattern": "(?i)##\\s*alternatives considered", + "description": "Recovers debated options into an Alternatives Considered section" + }, + { + "type": "file_contains", + "file": "event-ingestion-design.md", + "literal": "type: semantic", + "description": "Produces MIF frontmatter with type: semantic" + }, + { + "type": "regex_match", + "file": "event-ingestion-design.md", + "pattern": "(?i)non-goals:", + "description": "Non-Goals appears as its own labeled list" + } + ], "expectations": [ - "Reorganizes the notes into the standard design-doc sections", - "Recovers the debated options from the notes into Alternatives Considered with pros/cons and why-rejected", - "Makes scope explicit via Goals and Non-Goals", - "Produces MIF-conformant frontmatter (type: semantic) and a body that passes mif-validate --level 1" + "Recovers the options 
debated in the notes into Alternatives Considered, each with pros, cons, and a why-rejected", + "Does not simply reformat the raw notes verbatim — the content is organized into the genre's section structure" + ] + }, + { + "id": 5, + "prompt": "What MIF frontmatter and validation gate does the google-design-doc genre require, and how far can I climb past the floor? Write it up as gate-notes.md.", + "expected_output": "gate-notes.md states the MIF type is semantic, that L1 (id, type, created) is the floor gated by mif-validate --level 1, and that the genre supports climbing to L3 with namespace, temporal, provenance, citations, and relationships when the context supports it.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "type: semantic", + "description": "States the MIF type is semantic" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)mif-validate", + "description": "Names mif-validate as the gate" + }, + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "--level 1", + "description": "Names the floor gate invocation --level 1" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)namespace", + "description": "Names namespace as a field used to climb past L1" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)(provenance|citations|relationships)", + "description": "Names at least one L3 field (provenance, citations, or relationships)" + } + ], + "expectations": [ + "States L1 (id, type, created) is the floor and is the minimum every doc must meet, not an optional level", + "Frames climbing to L2/L3 as conditional on what the drafting context supplies, not mandatory for every doc" + ] + }, + { + "id": 6, + "prompt": "Here's a one-page proposal for switching our session store from Redis to Postgres — it just states the decision and says 'this is the best option.' 
Does this count as a Google-style design doc? Write your assessment as assessment.md.", + "expected_output": "assessment.md says no — it is a proposal masquerading as a design doc because it omits Non-Goals and Alternatives Considered — and explains what must be added to make it a real design doc.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "assessment.md", + "pattern": "(?i)non-goals", + "description": "Flags missing Non-Goals" + }, + { + "type": "regex_match", + "file": "assessment.md", + "pattern": "(?i)alternatives considered", + "description": "Flags missing Alternatives Considered" + }, + { + "type": "regex_match", + "file": "assessment.md", + "pattern": "(?i)\\b(no|not a design doc|does not count)\\b", + "description": "States clearly that the one-pager does not qualify as-is" + }, + { + "type": "regex_match", + "file": "assessment.md", + "pattern": "(?i)\\b(pros?|cons?|trade-?offs?)\\b", + "description": "Frames the gap in terms of missing trade-offs/pros-cons" + } + ], + "expectations": [ + "Identifies the proposal as the antipattern of a single-option proposal masquerading as a design doc, not a minor style issue", + "Names both Non-Goals and Alternatives Considered as the specific missing sections, not a vague 'needs more detail'" ] } ] diff --git a/skills/kiro-design/evals/evals.json b/skills/kiro-design/evals/evals.json index 2e316e9..5d5ef71 100644 --- a/skills/kiro-design/evals/evals.json +++ b/skills/kiro-design/evals/evals.json @@ -3,36 +3,207 @@ "evals": [ { "id": 1, - "prompt": "Write the Kiro design.md for the avatar upload feature based on its requirements.", - "expected_output": "A design with overview, architecture, components/interfaces, data models, error handling, and testing strategy, all traced to requirement numbers.", + "prompt": "I'm building a real-time collaborative cursor feature for a shared doc editor (multiple users see each other's cursor position live). 
Here's requirements.md:\n\nRequirement 1: WHEN a user moves their cursor THE SYSTEM SHALL broadcast the new position to other connected users within 200ms (Requirement 1.1).\nRequirement 2: WHEN a user disconnects THE SYSTEM SHALL remove their cursor from other clients within 5 seconds (Requirement 2.1).\n\nWrite the design.md for this and save it to design.md.", + "expected_output": "A design.md with all six sections (Overview, Architecture, Components and Interfaces, Data Models, Error Handling, Testing Strategy), where components are concretely tied to broadcasting cursor position and handling disconnects, and each design element cites Requirement 1.1 or 2.1.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "design.md", + "literal": "## Overview", + "description": "Has an Overview section" + }, + { + "type": "file_contains", + "file": "design.md", + "literal": "## Architecture", + "description": "Has an Architecture section" + }, + { + "type": "file_contains", + "file": "design.md", + "literal": "## Components and Interfaces", + "description": "Has a Components and Interfaces section" + }, + { + "type": "file_contains", + "file": "design.md", + "literal": "## Data Models", + "description": "Has a Data Models section" + }, + { + "type": "file_contains", + "file": "design.md", + "literal": "## Error Handling", + "description": "Has an Error Handling section" + }, + { + "type": "file_contains", + "file": "design.md", + "literal": "## Testing Strategy", + "description": "Has a Testing Strategy section" + }, + { + "type": "regex_match", + "file": "design.md", + "pattern": "(?i)requirement\\s*(1\\.1|2\\.1)", + "description": "At least one design element cites Requirement 1.1 or 2.1 by number" + } + ], "expectations": [ - "Includes all six sections (Overview through Testing Strategy)", - "References the requirement numbers each design element satisfies", - "Defines concrete data models and error handling", - "Passes mif-validate 
--level 1 with type: semantic" + "Components are concretely tied to the stated domain (broadcasting cursor position, removing a disconnected user's cursor) rather than generic boilerplate unrelated to the prompt", + "Data Models section defines concrete entities/fields relevant to cursor position or connection state (e.g. user id, coordinates, timestamp), not a placeholder table", + "Error Handling addresses at least one realistic failure mode from the domain (e.g. a stale/late cursor update, a missed disconnect event), not a generic 'errors are handled' statement" ] }, { "id": 2, - "prompt": "This design doc has no data models and doesn't reference any requirements. Fix it.", - "expected_output": "Adds data models, testing strategy, and traceability to requirement numbers.", + "prompt": "Here's our current design.md for the cursor feature — it only has Overview, Architecture, Components and Interfaces, and Error Handling, with no Data Models section, no Testing Strategy section, and none of the components cite a requirement number. 
Fix it and save the corrected version to design.md.", + "expected_output": "The corrected design.md adds a Data Models section and a Testing Strategy section, and adds requirement-number citations to the existing content, while keeping the original Overview/Architecture/Components/Error Handling content intact.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "design.md", + "literal": "## Data Models", + "description": "Adds the missing Data Models section" + }, + { + "type": "file_contains", + "file": "design.md", + "literal": "## Testing Strategy", + "description": "Adds the missing Testing Strategy section" + }, + { + "type": "regex_match", + "file": "design.md", + "pattern": "(?i)requirement\\s*\\d", + "description": "Adds at least one requirement-number citation" + }, + { + "type": "file_contains", + "file": "design.md", + "literal": "## Error Handling", + "description": "Retains the pre-existing Error Handling section rather than dropping it" + } + ], "expectations": [ - "Adds a Data Models section", - "Adds traceability references to requirement numbers", - "Adds an Error Handling and Testing Strategy section" + "The added Data Models content is specific to the cursor feature (concrete fields), not a generic stub table", + "Retains the substance of the pre-existing Overview, Architecture, and Components and Interfaces sections rather than replacing them with unrelated content" ] }, { "id": 3, - "prompt": "Where does design.md sit in the Kiro flow and what must it connect to?", - "expected_output": "Explains design.md is second, traces back to requirements.md and feeds tasks.md.", + "prompt": "Quick question before I start writing: where does design.md actually sit in the Kiro three-document flow, and what does it need to connect to on either side? 
Write the answer to kiro-flow.md.", + "expected_output": "Explains design.md is the second of three Kiro documents, tracing back to requirements.md and feeding tasks.md, and that every design section should map to a requirement number.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "kiro-flow.md", + "literal": "requirements.md", + "description": "Names requirements.md as the upstream document" + }, + { + "type": "file_contains", + "file": "kiro-flow.md", + "literal": "tasks.md", + "description": "Names tasks.md as the downstream document" + }, + { + "type": "regex_match", + "file": "kiro-flow.md", + "pattern": "(?i)second", + "description": "States design.md is the second document in the sequence" + } + ], "expectations": [ - "States design.md is the second of three Kiro documents", - "Explains it traces to requirements.md and feeds tasks.md", - "Notes every section should map to a requirement" + "States the ordering explicitly as requirements.md -> design.md -> tasks.md, not just listing the three names without sequence", + "Notes that a design section with no requirement to trace to is a signal of scope creep, not just that traceability is generally good" + ] + }, + { + "id": 4, + "prompt": "Requirements.md only has Requirement 1 (WHEN a user shares a doc THE SYSTEM SHALL grant the recipient read access, Requirement 1.1) and Requirement 2 (WHEN a share is revoked THE SYSTEM SHALL remove access within 60 seconds, Requirement 2.1). But our draft design.md's Components and Interfaces section also includes an 'Analytics Dashboard' component with no requirement behind it. 
Review design.md against requirements.md and flag the problem, then save your review to design-review.md.", + "expected_output": "Identifies the Analytics Dashboard component specifically as untraceable to any requirement and names it as scope creep, recommending it be removed or tied to a real requirement.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "design-review.md", + "literal": "Analytics Dashboard", + "description": "Names the specific untraceable component" + }, + { + "type": "regex_match", + "file": "design-review.md", + "pattern": "(?i)scope creep", + "description": "Labels the untraceable component as scope creep" + } + ], + "expectations": [ + "Explicitly recommends either removing the Analytics Dashboard component or tying it to a specific requirement, rather than only noting the problem exists", + "Does not flag the two legitimate, requirement-backed components (the share-granting and revocation components) as problems" + ] + }, + { + "id": 5, + "prompt": "Requirements.md has a Requirement 3 that our design.md doesn't cover at all: 'Requirement 3.1: WHEN a user requests their data THE SYSTEM SHALL generate a CSV export within 10 seconds.' 
Find the gap and update design.md to add the missing design coverage for it, then save the updated file as design.md.", + "expected_output": "Flags that Requirement 3.1 previously had no corresponding design element, then adds a concrete new component/interface for CSV export that cites Requirement 3.1.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "design.md", + "pattern": "(?i)requirement\\s*3\\.1", + "description": "New coverage cites Requirement 3.1" + }, + { + "type": "file_contains", + "file": "design.md", + "literal": "CSV", + "description": "New coverage is specific to the CSV export requirement" + } + ], + "expectations": [ + "Explicitly calls out that Requirement 3 previously had no design coverage before adding the fix, not just silently adding a section", + "The new component/interface added for Requirement 3.1 is concrete (a named component with a described responsibility), not a one-line placeholder" + ] + }, + { + "id": 6, + "prompt": "Before I mark our Feature Flag Service design.md as done, what MIF frontmatter and validation gate does the kiro-design genre actually require? 
Write it up in gate-notes.md.", + "expected_output": "States the MIF type is semantic, that L1 (type: semantic) is the floor while namespace/tags/title climb to L2, and that the gate is mif-validate --level 1.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "type: semantic", + "description": "States the MIF type is semantic" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)mif-validate", + "description": "Names mif-validate as the gate" + }, + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "--level 1", + "description": "Names the specific gate invocation --level 1" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)namespace", + "description": "Names namespace as an L2 field to climb to" + } + ], + "expectations": [ + "Frames type: semantic (L1) as the minimum floor and namespace/tags/title as an optional climb to L2, not as mandatory fields for every doc", + "Names namespace, tags, and title as the specific L2 fields, not vague 'more metadata' language" ] } ] diff --git a/skills/kiro-requirements/evals/evals.json b/skills/kiro-requirements/evals/evals.json index 565a733..a12c395 100644 --- a/skills/kiro-requirements/evals/evals.json +++ b/skills/kiro-requirements/evals/evals.json @@ -3,37 +3,244 @@ "evals": [ { "id": 1, - "prompt": "Write the Kiro requirements.md for a profile avatar upload feature.", - "expected_output": "Numbered requirements, each a user story with EARS acceptance criteria including error paths.", + "prompt": "Write the Kiro requirements.md for a profile avatar upload feature: upload an avatar, preview it before saving, and replace an existing avatar.", + "expected_output": "Numbered requirements, each a user story with EARS acceptance criteria including error paths (file too large, unsupported format).", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": 
"transcript.md", + "pattern": "###\\s*1\\.", + "description": "First requirement is numbered 1" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "**User Story:**", + "description": "User Story label is present" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bWHEN\\b.{0,200}\\bSHALL\\b", + "description": "At least one WHEN...SHALL EARS criterion" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bIF\\b.{0,200}\\bTHEN\\b.{0,100}\\bSHALL\\b", + "description": "At least one IF...THEN...SHALL unhappy-path criterion" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "type: semantic", + "description": "Frontmatter declares type: semantic" + } + ], "expectations": [ - "Each requirement has a user story (As a / I want / so that)", - "Acceptance criteria are written in EARS (WHEN/IF/WHILE ... SHALL)", - "Covers unhappy paths (size limit, format) with IF/THEN criteria", - "Requirements are numbered and the doc passes mif-validate --level 1 (type: semantic)" + "Each numbered requirement pairs its User Story with an Acceptance Criteria list immediately below it", + "Unhappy-path criteria name a concrete failure condition (file too large, unsupported format), not a generic error" ] }, { "id": 2, - "prompt": "These requirements are bullet points like 'handle errors appropriately'. Make them Kiro-conformant.", - "expected_output": "Rewrites them as numbered user stories with EARS acceptance criteria.", + "prompt": "These requirements are vague bullet points: 'let users upload a picture', 'make sure it works well', 'handle errors appropriately'. 
Make them Kiro-conformant.", + "expected_output": "Rewrites the bullets as numbered user stories with EARS acceptance criteria, replacing vague phrasing with testable conditions.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "###\\s*1\\.", + "description": "Output has at least one numbered requirement" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "**User Story:**", + "description": "User Story label is present" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bSHALL\\b", + "description": "EARS SHALL keyword is used" + }, + { + "type": "regex_not_match", + "file": "transcript.md", + "pattern": "(?i)works well", + "description": "Vague phrase 'works well' is not carried into the rewrite" + } + ], "expectations": [ - "Converts vague bullets into numbered requirements", - "Adds a user story and EARS criteria per requirement", - "Makes each criterion individually testable" + "'Handle errors appropriately' becomes a specific IF/THEN SHALL criterion naming the error condition", + "Each rewritten criterion states one observable, individually testable behavior" ] }, { "id": 3, - "prompt": "Why does Kiro put requirements in EARS and number them?", - "expected_output": "Explains numbering enables traceability to design/tasks and EARS makes criteria testable.", + "prompt": "Why does Kiro put requirements in EARS and number them instead of just writing plain prose?", + "expected_output": "Explains numbering enables traceability to design.md/tasks.md and EARS makes acceptance criteria unambiguous and testable.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)design\\.md", + "description": "Mentions design.md" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)tasks\\.md", + "description": "Mentions tasks.md" + }, + { + "type": "regex_match", + "file": 
"transcript.md", + "pattern": "\\bEARS\\b", + "description": "Names EARS notation" + } + ], "expectations": [ - "Explains numbering enables design.md and tasks.md to trace to requirements", - "Explains EARS makes acceptance criteria unambiguous and testable", + "Explains numbering lets design.md and tasks.md cite specific requirement numbers rather than paraphrasing prose", + "Explains EARS criteria are unambiguous enough that a human and an agent would grade them the same way", "Notes requirements.md is the first of the three Kiro documents" ] + }, + { + "id": 4, + "prompt": "Here's my requirements.md for login — it only has a WHEN...SHALL criterion for the happy path (valid credentials). Add unhappy-path coverage for a wrong password and account lockout.", + "expected_output": "Adds IF/THEN SHALL criteria for wrong password and lockout under the existing login requirement, each a distinct testable condition.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\bIF\\b.{0,150}\\bTHEN\\b.{0,100}\\bSHALL\\b", + "description": "At least one IF...THEN...SHALL criterion is added" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(wrong|invalid|incorrect) password", + "description": "Wrong-password condition is named" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)\\block(s|ed|ing|[- ]?out)?\\b", + "description": "Account lockout condition is named" + } + ], + "expectations": [ + "New criteria are appended to the existing login requirement rather than spawning an unrelated new requirement", + "Wrong password and lockout are each their own criterion, not merged into one compound condition" + ] + }, + { + "id": 5, + "prompt": "Write requirements.md for a checkout flow — add items to cart, apply a promo code, and complete payment — I need this to feed straight into design.md next.", + "expected_output": "Three numbered requirements (cart, promo code, payment), each 
with a user story and EARS criteria, including an invalid-promo-code error path.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "###\\s*1\\.", + "description": "Requirement 1 is numbered" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "###\\s*2\\.", + "description": "Requirement 2 is numbered" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "###\\s*3\\.", + "description": "Requirement 3 is numbered" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "## Introduction", + "description": "Introduction section is present" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(invalid|expired).{0,40}promo", + "description": "Promo code requirement covers an invalid or expired code" + } + ], + "expectations": [ + "Cart, promo code, and payment are each their own numbered requirement rather than merged into one", + "Every requirement includes a User Story sentence in addition to its acceptance criteria" + ] + }, + { + "id": 6, + "prompt": "Write requirements for 'improve search' — honestly not sure yet exactly what that means.", + "expected_output": "Either asks a clarifying question about scope, users, or success signal, or states explicit reasonable assumptions before producing EARS criteria — does not invent a specific numeric SLA the prompt never mentioned.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(clarif|assum|which (aspect|part)|what (do you mean|kind of))", + "description": "Response surfaces the ambiguity via a clarifying question or stated assumption" + }, + { + "type": "regex_not_match", + "file": "transcript.md", + "pattern": "(?i)(within|under|less than)\\s*\\d+\\s*(ms|milliseconds|seconds)", + "description": "Does not fabricate a specific latency SLA the prompt never supplied" + } + ], + "expectations": [ + "If 
assumptions are stated, they are explicit and reasonable rather than silently smuggled into the criteria", + "Any produced requirement still follows the numbered user-story-plus-EARS structure" + ] + }, + { + "id": 7, + "prompt": "Should this checkout flow work go in requirements.md, design.md, or tasks.md?", + "expected_output": "Recommends requirements.md as the first Kiro artifact, capturing what the feature must do, before design.md (the how) and tasks.md (the implementation checklist).", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)requirements\\.md", + "description": "Recommends requirements.md" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)design\\.md", + "description": "Distinguishes from design.md" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)tasks\\.md", + "description": "Distinguishes from tasks.md" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)first", + "description": "States requirements.md comes first in the Kiro flow" + } + ], + "expectations": [ + "Explains design.md covers the technical how and tasks.md the implementation checklist, not what the feature must do", + "Does not recommend starting with design.md or tasks.md for this checkout feature" + ] } ] } diff --git a/skills/kiro-tasks/evals/evals.json b/skills/kiro-tasks/evals/evals.json index 3191b9f..3b043eb 100644 --- a/skills/kiro-tasks/evals/evals.json +++ b/skills/kiro-tasks/evals/evals.json @@ -3,36 +3,225 @@ "evals": [ { "id": 1, - "prompt": "Generate the Kiro tasks.md for the avatar upload feature from its design.", - "expected_output": "An ordered checkbox task list with small tasks, sub-tasks, and per-task requirement references.", + "prompt": "We just finished design.md for the profile avatar upload feature (validator, avatar store, controller, preview flow, integration tests). 
Generate the Kiro tasks.md from it and save it as tasks.md.", + "expected_output": "A numbered checkbox task list with sub-tasks, per-task requirement/design references, incremental ordering, and MIF frontmatter (type: procedural) that gates at mif-validate --level 1.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "(?m)^- \\[ \\] 1\\.", + "description": "Top-level task 1 is a checkbox list item" + }, + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "(?m)^\\s+- \\[ \\] 1\\.1", + "description": "Task 1 has a checkbox sub-task 1.1" + }, + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "_Requirements:\\s*[0-9]", + "description": "At least one task cites a numbered requirement" + }, + { + "type": "file_contains", + "file": "tasks.md", + "literal": "type: procedural", + "description": "Frontmatter declares type: procedural" + } + ], "expectations": [ - "Uses a numbered checkbox list with sub-tasks", - "Each task cites the requirements/design it implements (e.g. _Requirements: 1.2_)", - "Tasks are small, incremental, and test-driven", - "Emits MIF frontmatter type: procedural and passes mif-validate --level 1" + "Every top-level task is broken into small, independently testable sub-tasks rather than one giant step", + "Task ordering is incremental: later tasks build on artifacts introduced by earlier tasks (e.g. controller wiring comes after the validator and store exist)", + "The task breakdown reflects the specific components named in the prompt (validator, avatar store, controller, preview flow), not generic placeholder steps" ] }, { "id": 2, - "prompt": "Our task list is three giant bullets with no references. Make it Kiro-conformant.", - "expected_output": "Breaks tasks into small checkbox items with traceability and ordering.", + "prompt": "Our task list is three giant bullets with no checkboxes, no requirement references, and no order: 'Build the whole feature.' 
'Make it work end to end.' 'Test everything.' Turn this into a proper Kiro tasks.md and save it as tasks.md.", + "expected_output": "Replaces the giant bullets with small checkbox tasks and sub-tasks, adds per-task requirement/design references, and establishes an incremental build order.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "(?m)^- \\[ \\] [0-9]+\\.", + "description": "Uses numbered checkbox tasks, not unordered bullets" + }, + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "_Requirements:\\s*[0-9]", + "description": "At least one task cites a requirement reference" + }, + { + "type": "file_not_contains", + "file": "tasks.md", + "literal": "Build the whole feature.", + "description": "Does not carry over the original giant, untestable bullet verbatim" + } + ], "expectations": [ - "Converts giant tasks into small, checkable sub-tasks", - "Adds per-task requirement/design references", - "Establishes an incremental order" + "The vague 'build the whole feature' bullet is decomposed into multiple small, independently testable tasks with their own sub-tasks", + "'Test everything' is replaced with specific, scoped test tasks tied to the components being built, not left as a single catch-all bullet" ] }, { "id": 3, - "prompt": "Why does each Kiro task reference a requirement number?", - "expected_output": "Explains traceability: every task implements a requirement, and every requirement should have a task.", + "prompt": "Why does each Kiro task in tasks.md reference a requirement number, and what happens if a requirement has no matching task? 
Write the explanation to traceability-notes.md.", + "expected_output": "Explains that tasks.md traces back to requirements/design so coverage is auditable, and that a requirement with no task is unimplemented while a task with no reference is a smell.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "traceability-notes.md", + "pattern": "(?i)_Requirements:", + "description": "References the _Requirements: citation convention" + }, + { + "type": "regex_match", + "file": "traceability-notes.md", + "pattern": "(?i)unimplemented", + "description": "States a requirement with no task is unimplemented" + }, + { + "type": "regex_match", + "file": "traceability-notes.md", + "pattern": "(?i)(design\\.md|requirements\\.md)", + "description": "Names the upstream Kiro document(s) tasks.md derives from" + } + ], "expectations": [ - "Explains traceability from tasks back to requirements/design", - "Notes a requirement with no task is unimplemented", - "Notes tasks.md is generated from design.md and drives implementation" + "Explains the traceability chain runs requirements -> design -> tasks, not just 'tasks reference requirements' in isolation", + "States that a task with no requirement/design reference is itself a smell worth flagging, not only the reverse gap" + ] + }, + { + "id": 4, + "prompt": "Here's our current tasks.md — task 3 ('Wire the AvatarController') has no _Requirements: line, and requirement 2.2 (preview cancel behavior) isn't referenced by any task. 
Point out the gaps and save the findings to gap-report.md.", + "expected_output": "Identifies task 3 as missing its requirement citation and requirement 2.2 as an orphaned requirement with no implementing task.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "gap-report.md", + "literal": "2.2", + "description": "Flags the orphaned requirement 2.2 by number" + }, + { + "type": "regex_match", + "file": "gap-report.md", + "pattern": "(?i)AvatarController", + "description": "Names the task missing its requirement citation" + }, + { + "type": "regex_match", + "file": "gap-report.md", + "pattern": "(?i)(orphan|no task|unimplemented|uncovered)", + "description": "Uses coverage-gap language for the untraced requirement" + } + ], + "expectations": [ + "Distinguishes the two distinct gap types: a task missing a requirement citation versus a requirement missing an implementing task", + "Does not report false gaps against tasks/requirements that were not actually flagged in the prompt" + ] + }, + { + "id": 5, + "prompt": "We don't have a design.md yet, just a one-line feature description: 'let users export their data as a CSV from account settings.' 
Draft a best-effort Kiro tasks.md anyway and save it as tasks.md, but be clear about what's assumed.", + "expected_output": "Produces a small, ordered checkbox task list for the CSV export feature and explicitly flags that it is a best-effort breakdown made without a design.md, calling out assumptions instead of inventing firm requirement numbers.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "(?m)^- \\[ \\] 1\\.", + "description": "Still produces a numbered checkbox task list despite missing design.md" + }, + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "(?i)(assum|no design\\.md|without a design|best.effort)", + "description": "Explicitly flags the missing design.md / assumptions made" + }, + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "(?i)csv", + "description": "Task breakdown is scoped to the CSV export feature named in the prompt" + } + ], + "expectations": [ + "Does not fabricate specific numbered requirement citations (e.g. '_Requirements: 1.2, 3.1_') as if a real requirements.md existed when none was provided", + "The resulting task breakdown is scoped to CSV export specifically (e.g. data query, CSV formatting, download/export endpoint), not generic boilerplate unrelated to the prompt" + ] + }, + { + "id": 6, + "prompt": "Before I mark our tasks.md for the Feature Flag Service done, what frontmatter and validation gate does this kiro-tasks genre actually require? 
Write it up in gate-notes.md.", + "expected_output": "States the MIF type is procedural, that L1 (id, type, created) is the floor with climbing to L2 via namespace/tags/title, and that the gate is mif-validate --level 1.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "type: procedural", + "description": "States the MIF type is procedural" + }, + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "--level 1", + "description": "Names the specific gate invocation --level 1" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)namespace", + "description": "Names namespace as an L2 field to climb to" + } + ], + "expectations": [ + "Frames L1 as the minimum floor and L2 as an optional climb rather than a hard requirement for every tasks.md", + "Names namespace, tags, and title together as the specific fields that climb the doc to L2, not vague 'more metadata' language" + ] + }, + { + "id": 7, + "prompt": "We're speccing tasks.md for a full authentication overhaul (password reset, MFA enrollment, session revocation, and audit logging) from a 40-item design.md. 
Generate the ordered task list and save it as tasks.md.", + "expected_output": "A checkbox task list with multiple top-level tasks grouped by sub-feature (password reset, MFA, session revocation, audit logging), each with sub-tasks and requirement references, ordered so foundational work precedes dependent work.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "(?m)^- \\[ \\] [4-9]\\.", + "description": "Has at least 4 top-level tasks (large feature is not compressed into 1-2 bullets)" + }, + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "(?i)(mfa|multi-factor)", + "description": "Covers the MFA enrollment sub-feature named in the prompt" + }, + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "(?i)audit", + "description": "Covers the audit logging sub-feature named in the prompt" + }, + { + "type": "regex_match", + "file": "tasks.md", + "pattern": "_Requirements:\\s*[0-9]", + "description": "Tasks still cite requirement references at this larger scale" + } + ], + "expectations": [ + "All four named sub-features (password reset, MFA enrollment, session revocation, audit logging) appear as distinct top-level tasks or task groups, not merged into one vague bucket", + "The ordering places foundational/shared work (e.g. session/auth primitives) before sub-features that depend on it, rather than an arbitrary sequence" ] } ] diff --git a/skills/mif-frontmatter/SKILL.md b/skills/mif-frontmatter/SKILL.md index 9354bf7..e8e2f52 100644 --- a/skills/mif-frontmatter/SKILL.md +++ b/skills/mif-frontmatter/SKILL.md @@ -18,11 +18,18 @@ A MIF doc exists in two interconvertible forms — human-readable **markdown** canonical object). Authoring is bidirectional and the round-trip is lossless; see the `mif-validate` skill for the tooling. 
+**Whenever a level, floor, or the `type` field itself comes up** — deciding +what level a document can claim, explaining the L1 floor, or drafting +frontmatter — state the `type` enum explicitly (see the L1 row below). It's a +hard constraint on an L1 field, not an implementation detail to leave +implicit, so surface it every time L1 is discussed, not only when actually +writing the YAML. + ## Level floors (attempt the highest the context supports) | Level | Frontmatter fields | Emit when | | --- | --- | --- | -| **L1 (hard floor)** | `id`, `type`, `created` (+ body becomes `content`) | **Always.** Below L1 is a skill error. | +| **L1 (hard floor)** | `id`, `type` (enum: `semantic` \| `episodic` \| `procedural`), `created` (+ body becomes `content`) | **Always.** Below L1 is a skill error. | | **L2** | `namespace`, `modified`, `temporal` | review cadence / topic namespace known | | **L3** | `provenance`, `citations[]`, `relationships[]` | doc sourced from real, attributable input | @@ -43,8 +50,8 @@ see the `mif-validate` skill for the tooling. Attempt the highest level the drafting context supports, **field by field**. If a level's required field cannot be populated from real input (no citations -> no L3; no review cadence -> omit `temporal`), **drop to the next lower level rather -than writing a placeholder**. Never emit empty or `TODO` MIF fields. This helper -*proposes* frontmatter; `mif-validate` *disposes*. +than writing a placeholder**. Never emit an empty or unresolved-placeholder MIF +field. This helper *proposes* frontmatter; `mif-validate` *disposes*. ## Minimal L1 example diff --git a/skills/mif-frontmatter/evals/evals.json b/skills/mif-frontmatter/evals/evals.json index 67f108d..a732cff 100644 --- a/skills/mif-frontmatter/evals/evals.json +++ b/skills/mif-frontmatter/evals/evals.json @@ -3,49 +3,230 @@ "evals": [ { "id": 1, - "prompt": "Add MIF frontmatter to this markdown note. 
I only know it was created today and it's declarative knowledge about our caching policy.", + "prompt": "Add MIF frontmatter to docs/notes/caching-policy.md — I only know it was created today (2026-06-29) and it's declarative knowledge about our caching policy.", "expected_output": "L1 frontmatter (id, type: semantic, created) with no fabricated higher-level fields, because only L1 input is available.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "transcript.md", + "literal": "type: semantic", + "description": "Sets type to semantic (declarative knowledge)" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "id:\\s*\\S+", + "description": "Emits an id field with a non-empty value" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "created:", + "description": "Emits a created field" + }, + { + "type": "file_not_contains", + "file": "transcript.md", + "literal": "namespace:", + "description": "Does not fabricate an L2 namespace field absent input" + } + ], "expectations": [ - "Emits id, type, and created as the L1 floor", - "Sets type to semantic (declarative knowledge), not a genre name", - "Does NOT invent namespace, temporal, provenance, or other L2/L3 fields absent input", - "Contains no empty or TODO placeholder fields" + "The created value reflects 2026-06-29 in ISO-8601 form", + "No temporal, provenance, citations, or relationships fields appear anywhere in the response" ] }, { "id": 2, - "prompt": "This doc is reviewed quarterly, lives under the 'policies' topic, and was last edited yesterday. Give me the richest MIF frontmatter the inputs justify.", - "expected_output": "L2 frontmatter adding namespace, modified, and temporal because a review cadence and namespace are known.", + "prompt": "This doc lives at docs/policies/data-retention.md, is reviewed quarterly, sits under the 'policies' namespace, and was last edited yesterday (2026-06-28). 
Give me the richest MIF frontmatter the inputs justify.", + "expected_output": "L2 frontmatter adding namespace, modified, and temporal because a review cadence and namespace are known, stopping short of L3.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "transcript.md", + "literal": "namespace: policies", + "description": "Namespace field is set to the stated topic" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "modified:\\s*2026-06-28", + "description": "Modified field reflects the stated edit date" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "temporal:", + "description": "Emits a temporal block for the review cadence" + }, + { + "type": "file_not_contains", + "file": "transcript.md", + "literal": "provenance:", + "description": "Does not fabricate L3 provenance without attributable sources" + } + ], "expectations": [ - "Includes the L1 fields plus namespace and modified", - "Adds a temporal block reflecting the quarterly review cadence", - "Stops at L2 and does not fabricate L3 provenance/citations without attributable sources", - "All added fields are derived from the stated inputs, not invented" + "The temporal block reflects a quarterly review cadence, not some other interval", + "Does not add citations[] or relationships[] since no sources were given" ] }, { "id": 3, - "prompt": "What MIF level can I claim if I don't know any review cadence or sources for this document?", - "expected_output": "Explains the grade-down rule: claim L1 only, do not fabricate L2/L3 fields.", + "prompt": "What MIF level can I claim for docs/drafts/untitled-decision.md if I don't know any review cadence or sources for it yet?", + "expected_output": "Explains the grade-down rule: claim L1 only, do not fabricate L2/L3 fields or placeholders.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "transcript.md", + "literal": "L1", + "description": "Names L1 as 
the claimable level" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)grade[- ]down", + "description": "References the grade-down rule by name" + }, + { + "type": "regex_not_match", + "file": "transcript.md", + "pattern": "(?i)(namespace|modified|temporal|provenance|citations|relationships):\\s*(TODO|TBD|FIXME)\\b", + "description": "Does not emit placeholder values in L2/L3 fields to reach a higher level" + } + ], "expectations": [ - "States that the document can only claim L1 with the given inputs", - "Explains the grade-down rule: drop a level rather than write placeholders", - "Clarifies that type must be semantic, episodic, or procedural" + "Explains that missing cadence/sources means dropping a level rather than writing a placeholder", + "Clarifies that type must be one of semantic, episodic, or procedural" ] }, { "id": 4, - "prompt": "Why is my frontmatter 'type: adr' failing MIF validation?", - "expected_output": "Explains that type maps to conceptType (semantic|episodic|procedural); the genre belongs in namespace/tags.", + "prompt": "Why is my frontmatter 'type: adr' in docs/decisions/0007-cache-invalidation.md failing MIF validation?", + "expected_output": "Explains that type maps to conceptType (semantic|episodic|procedural); the genre belongs in namespace/tags, and recommends type: semantic.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "transcript.md", + "literal": "conceptType", + "description": "Explains the type -> conceptType projection" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "type:\\s*semantic", + "description": "Recommends type: semantic as the fix for a decision record" + }, + { + "type": "file_not_contains", + "file": "transcript.md", + "literal": "'adr' is a valid", + "description": "Does not claim 'adr' is a valid conceptType value" + } + ], "expectations": [ - "Identifies that 'adr' is not a valid conceptType", - "Explains type must be one of semantic, episodic, procedural", - "Recommends moving the genre identity into 
namespace or tags", - "Suggests type: semantic for a decision record" + "Identifies that 'adr' is not one of the allowed conceptType values (semantic, episodic, procedural)", + "Recommends moving the ADR genre identity into namespace or tags instead of type" + ] + }, + { + "id": 5, + "prompt": "Add MIF frontmatter to docs/incidents/2026-06-15-outage-postmortem.md. It documents a specific outage that happened on 2026-06-15 — a one-time event record, not general knowledge.", + "expected_output": "L1 frontmatter with type: episodic (time-bound record), not semantic, and created reflecting the incident date.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "transcript.md", + "literal": "type: episodic", + "description": "Sets type to episodic for a time-bound record" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "created:\\s*2026-06-15", + "description": "Created date reflects the incident date" + }, + { + "type": "file_not_contains", + "file": "transcript.md", + "literal": "type: semantic", + "description": "Does not misclassify the one-time event as declarative knowledge" + } + ], + "expectations": [ + "Recognizes this as a time-bound record rather than general declarative knowledge", + "Does not fabricate a namespace or temporal cadence absent any stated review schedule" + ] + }, + { + "id": 6, + "prompt": "I'm documenting docs/research/vendor-comparison.md, sourced from an interview with our vendor on 2026-06-20 and citing their public pricing page (https://vendor.example.com/pricing). 
Give me the richest frontmatter this justifies.", + "expected_output": "L3 frontmatter with provenance and citations populated from the stated interview and pricing page, layered on top of the L1 floor.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "transcript.md", + "literal": "provenance:", + "description": "Emits a provenance field since a real source is attributable" + }, + { + "type": "file_contains", + "file": "transcript.md", + "literal": "citations:", + "description": "Emits a citations field" + }, + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "https://vendor\\.example\\.com/pricing", + "description": "Citation references the exact stated pricing URL" + }, + { + "type": "file_not_contains", + "file": "transcript.md", + "literal": "TODO", + "description": "No placeholder TODO fields in the L3 additions" + } + ], + "expectations": [ + "Provenance and citations are populated only with the stated interview and pricing page, not invented sources", + "The L1 floor fields (id, type, created) are still present alongside the L3 additions" + ] + }, + { + "id": 7, + "prompt": "Add MIF frontmatter to docs/notes/scratch.md. 
I don't know when it was created — just add whatever frontmatter makes sense.", + "expected_output": "Refuses to fabricate a created date and flags that the L1 floor cannot be met without it, rather than inventing a timestamp.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?i)(need|require|missing|cannot|can't).{0,40}(created|date)", + "description": "Flags the missing created date as blocking rather than proceeding silently" + }, + { + "type": "file_not_contains", + "file": "transcript.md", + "literal": "created: TODO", + "description": "Does not emit a TODO placeholder for created" + }, + { + "type": "file_not_contains", + "file": "transcript.md", + "literal": "created: null", + "description": "Does not emit a null placeholder for created" + } + ], + "expectations": [ + "Treats the missing creation date as blocking the L1 floor rather than inventing a timestamp", + "Asks the user for the creation date instead of silently guessing one" ] } ] diff --git a/skills/mif-validate/SKILL.md b/skills/mif-validate/SKILL.md index 2284473..1b18e25 100644 --- a/skills/mif-validate/SKILL.md +++ b/skills/mif-validate/SKILL.md @@ -22,6 +22,13 @@ schema at `https://mif-spec.dev/schema/`. Fail-closed: any failure exits non-zero. +## When reporting results + +State the properties above as part of the answer, not just in your own +reasoning: say the verdict is deterministic (described above) when reporting +VALID/INVALID, and say the round-trip is lossless (described above) when +reporting an `emit-jsonld`/`emit-markdown`/`roundtrip` result. 
+ ## The schema is a refreshable cache, never the authority The bundled schema auto-hydrates from `mif-spec.dev` into `schema/.cache//` diff --git a/skills/mif-validate/evals/evals.json b/skills/mif-validate/evals/evals.json index cd17502..62b50ef 100644 --- a/skills/mif-validate/evals/evals.json +++ b/skills/mif-validate/evals/evals.json @@ -3,58 +3,218 @@ "evals": [ { "id": 1, - "prompt": "Check whether docs/policy.md is a conformant MIF document at Level 1.", - "expected_output": "Runs mif-validate against the file at --level 1 and reports the deterministic VALID/INVALID verdict.", + "prompt": "Run the MIF conformance gate against skills/feature-spec/templates/good.md at Level 1 and save the full command output (stdout and stderr) to validation-report.md so I can attach it to the PR.", + "expected_output": "Runs `node scripts/mif-validate.mjs skills/feature-spec/templates/good.md --level 1`, gets exit 0 with a VALID verdict (schema-conformant, level floor met, round-trip lossless), and saves the captured output to validation-report.md.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "validation-report.md", + "literal": "RESULT: VALID at MIF L1", + "description": "Report shows the VALID-at-L1 verdict" + }, + { + "type": "regex_match", + "file": "validation-report.md", + "pattern": "round-trip:\\s*lossless", + "description": "Report shows the round-trip as lossless" + }, + { + "type": "file_contains", + "file": "validation-report.md", + "literal": "skills/feature-spec/templates/good.md", + "description": "Report names the file that was validated" + } + ], "expectations": [ - "Invokes scripts/mif-validate.mjs with the file and --level 1", - "Reports schema conformance, the level floor, and round-trip result", - "Treats a non-zero exit as a hard failure (fail-closed), not a warning", - "Does not assert validity by inspection without running the gate" + "Treats the exit-0 VALID result as a hard pass without adding unwarranted 
caveats or re-litigating the verdict by manual inspection", + "Explains that the gate is deterministic — no LLM judgment is in the conformance path" ] }, { "id": 2, - "prompt": "Convert docs/policy.md to its JSON-LD form so a machine consumer can read it.", - "expected_output": "Runs mif-convert emit-jsonld to produce the canonical JSON-LD projection.", + "prompt": "Convert skills/arc42-arch-doc/templates/good.md to its JSON-LD form and save it as arc42-good.jsonld so a downstream service can ingest it.", + "expected_output": "Runs `node scripts/mif-convert.mjs emit-jsonld skills/arc42-arch-doc/templates/good.md` and saves the JSON-LD output to arc42-good.jsonld with the correct @type, @id, and conceptType.", "files": [], + "deterministic_checks": [ + { + "type": "json_valid", + "file": "arc42-good.jsonld", + "description": "Output is valid JSON" + }, + { + "type": "json_field_equals", + "file": "arc42-good.jsonld", + "path": "@type", + "expected": "Concept", + "description": "@type is Concept" + }, + { + "type": "json_field_equals", + "file": "arc42-good.jsonld", + "path": "@id", + "expected": "urn:mif:arc42-linkly", + "description": "@id matches the canonical id from the source doc's frontmatter" + }, + { + "type": "json_field_exists", + "file": "arc42-good.jsonld", + "path": "content", + "description": "content field is present with the projected body" + } + ], "expectations": [ - "Invokes scripts/mif-convert.mjs emit-jsonld on the markdown file", - "Explains the JSON-LD is the schema-checked canonical form", + "Explains the JSON-LD is the schema-checked canonical form, not an ad hoc export", "Notes the projection is lossless and reversible via emit-markdown" ] }, { "id": 3, - "prompt": "I have a MIF JSON-LD object from an API. 
Give me the human-readable markdown version.", - "expected_output": "Schema-checks the JSON-LD, then runs emit-markdown to project it to markdown.", + "prompt": "I have this MIF JSON-LD object from an internal API and I need the human-readable markdown version to paste into our wiki. Save it as restored.md.\n\n```json\n{\n \"@context\": \"https://mif-spec.dev/schema/context.jsonld\",\n \"@type\": \"Concept\",\n \"@id\": \"urn:mif:scratch-note-api-timeout\",\n \"conceptType\": \"semantic\",\n \"created\": \"2026-06-29T10:00:00Z\",\n \"content\": \"# API Timeout Note\\n\\nThe payments API times out after 30s under load.\"\n}\n```", + "expected_output": "Schema-checks the JSON-LD first, then runs `node scripts/mif-convert.mjs emit-markdown` on it, producing restored.md with YAML frontmatter (id, type, created) followed by the body.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "restored.md", + "literal": "id: scratch-note-api-timeout", + "description": "Frontmatter carries the id derived from the JSON-LD @id" + }, + { + "type": "file_contains", + "file": "restored.md", + "literal": "type: semantic", + "description": "Frontmatter carries the type from conceptType" + }, + { + "type": "file_contains", + "file": "restored.md", + "literal": "# API Timeout Note", + "description": "Body content is projected into the markdown" + } + ], "expectations": [ - "Invokes scripts/mif-convert.mjs emit-markdown on the JSON file", - "Validates the JSON-LD against the canonical schema before projecting", - "Produces markdown with YAML frontmatter plus body" + "States or demonstrates that the JSON-LD was validated against the canonical schema before projecting to markdown, not projected blind", + "Produces markdown with YAML frontmatter followed by a body, not a raw JSON dump or a partial re-serialization" ] }, { "id": 4, - "prompt": "Prove that converting this document to JSON-LD and back loses no information.", - "expected_output": "Runs the 
roundtrip oracle and reports lossless or fails closed.", + "prompt": "Prove that converting skills/changelog/templates/good.md to JSON-LD and back to markdown loses no information, and save the oracle's output to roundtrip-report.md.", + "expected_output": "Runs `node scripts/mif-convert.mjs roundtrip skills/changelog/templates/good.md` (or mif-validate, which includes the same check), gets exit 0, and reports the round-trip as lossless in roundtrip-report.md.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "roundtrip-report.md", + "pattern": "(?i)round-trip ok \\(lossless md<->jsonld\\)|round-trip:\\s*lossless", + "description": "Report states the round-trip is lossless" + }, + { + "type": "file_not_contains", + "file": "roundtrip-report.md", + "literal": "FAILED", + "description": "Report does not contain a FAILED marker" + } + ], "expectations": [ - "Invokes scripts/mif-convert.mjs roundtrip (or mif-validate which includes the round-trip)", - "Reports the markdown<->JSON-LD round-trip as lossless on success", - "Fails non-zero if information is lost" + "Reports the result as a fact from the tool's exit code and stdout, not an inference from reading the file by eye", + "Names the round-trip as markdown<->JSON-LD specifically, not a vague 'conversion worked' claim" ] }, { "id": 5, - "prompt": "The validator says the schema isn't hydrated. What do I do?", - "expected_output": "Runs npm run hydrate-schema to refresh the canonical schema cache from mif-spec.dev.", + "prompt": "This markdown doc is missing required MIF frontmatter — I stripped it down while testing. 
Run the gate against it at Level 1 and save what happens to invalid-report.md so I can see exactly why it fails.\n\n```markdown\n---\ntype: semantic\ntitle: Draft Note\n---\n\n# Draft Note\n\nSome content without required MIF fields.\n```\n\nSave that content as draft-note.md first, then validate it.", + "expected_output": "Saves the doc as draft-note.md, runs mif-validate against it at --level 1, gets a non-zero exit because the required `id` field is missing, and records the fail-closed error (not a silent pass) in invalid-report.md.", "files": [], + "deterministic_checks": [ + { + "type": "file_exists", + "file": "draft-note.md", + "description": "The stripped-down doc was saved as instructed" + }, + { + "type": "regex_match", + "file": "invalid-report.md", + "pattern": "(?i)missing required field.*id|required field: id|missing.*[\"']id[\"']", + "description": "Report captures the specific missing-field error naming id" + }, + { + "type": "file_not_contains", + "file": "invalid-report.md", + "literal": "RESULT: VALID", + "description": "Report does not claim VALID for a doc missing a required field" + } + ], "expectations": [ - "Recommends running npm run hydrate-schema", - "Explains the bundled schema is a refreshable cache, not the authority", - "Notes offline fallback to the last hydrated version with a staleness warning" + "Treats the non-zero exit as a hard failure (fail-closed) rather than a warning or a suggestion to double-check manually", + "Does not fabricate a passing verdict or paper over the missing field" + ] + }, + { + "id": 6, + "prompt": "skills/feature-spec/templates/good.md only needs to clear Level 1 right now, but I want to know exactly what's still missing for Level 2 so I can plan the follow-up. Check it at --level 2 and save the results to level2-gap-report.md. 
Use a scratch copy at scratch-l1.md that has only the core fields (id, type, created) so the gap is obvious.\n\n```markdown\n---\nid: urn:mif:test-doc-scratch\ntype: semantic\ncreated: 2026-06-29T10:00:00Z\ntitle: Scratch Note\n---\n\n# Scratch Note\n\nMinimal L1 content.\n```\n\nSave that content as scratch-l1.md first, then validate that scratch copy (not good.md) at --level 2.", + "expected_output": "Saves the scratch doc as scratch-l1.md, runs mif-validate at --level 2, gets a non-zero exit, and reports the specific missing Level 2 fields (namespace, modified, temporal) in level2-gap-report.md rather than a generic failure.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "level2-gap-report.md", + "pattern": "(?i)namespace", + "description": "Names the missing namespace field" + }, + { + "type": "regex_match", + "file": "level2-gap-report.md", + "pattern": "(?i)modified", + "description": "Names the missing modified field" + }, + { + "type": "regex_match", + "file": "level2-gap-report.md", + "pattern": "(?i)temporal", + "description": "Names the missing temporal field" + }, + { + "type": "file_not_contains", + "file": "level2-gap-report.md", + "literal": "RESULT: VALID", + "description": "Does not claim the scratch doc is valid at L2" + } + ], + "expectations": [ + "Explains the L1/L2/L3 floors as a required-field overlay on top of the canonical core schema, not a separate schema", + "Frames this as expected/informative for planning, not a bug in the doc" + ] + }, + { + "id": 7, + "prompt": "The validator just told me the schema isn't hydrated yet on this fresh clone. What do I run, and what does 'hydrated' even mean here?
Write the answer to hydrate-notes.md.", + "expected_output": "Recommends `npm run hydrate-schema` to refresh the canonical schema cache from mif-spec.dev, and explains the cache/authority distinction in hydrate-notes.md.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "hydrate-notes.md", + "literal": "npm run hydrate-schema", + "description": "Names the exact command to run" + }, + { + "type": "regex_match", + "file": "hydrate-notes.md", + "pattern": "(?i)mif-spec\\.dev", + "description": "Names mif-spec.dev as the canonical source" + }, + { + "type": "regex_match", + "file": "hydrate-notes.md", + "pattern": "(?i)(cache|refreshable)", + "description": "Describes the bundled schema as a cache, not the authority" + } + ], + "expectations": [ + "Notes the offline fallback: without connectivity, validation falls back to the last hydrated version and warns of staleness, rather than blocking outright", + "Frames mif-spec.dev as the authority and the bundled copy as a cache pinned in schema/VENDOR.lock, not the other way around" ] } ] diff --git a/skills/playbook/evals/evals.json b/skills/playbook/evals/evals.json index 6a99f75..0b38753 100644 --- a/skills/playbook/evals/evals.json +++ b/skills/playbook/evals/evals.json @@ -3,36 +3,225 @@ "evals": [ { "id": 1, - "prompt": "Write a playbook for how we handle a Sev1 production outage.", - "expected_output": "A strategic playbook with scenario scope, roles, decision framework, phases, comms plan, and review.", + "prompt": "We just had a really messy Sev1 outage last night -- the checkout API went fully down for 40 minutes and nobody knew who was in charge or when to loop in the VP of Eng. 
Write a playbook coordinating how we handle Sev1 production outages going forward, and save it as playbook.md.", + "expected_output": "A strategic playbook with Scenario & Scope, Roles & Responsibilities (with authority), a Decision Framework with concrete escalation criteria, Phases with exit criteria, a Communications Plan, and a Post-Incident Review, plus MIF frontmatter with type: procedural, saved to playbook.md.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Roles & Responsibilities", + "description": "Has a Roles & Responsibilities section" + }, + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Decision Framework", + "description": "Has a Decision Framework section" + }, + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Communications Plan", + "description": "Has a Communications Plan section" + }, + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Post-Incident Review", + "description": "Has a Post-Incident Review section" + }, + { + "type": "file_contains", + "file": "playbook.md", + "literal": "type: procedural", + "description": "MIF frontmatter declares type: procedural" + } + ], "expectations": [ - "Defines roles (Incident Commander, Comms Lead) with authority", - "Includes a decision framework (severity, escalation criteria) and phases", - "Includes a communications plan and a post-incident review", - "Emits MIF frontmatter type: procedural and passes mif-validate --level 1" + "Decision framework names a concrete escalation criterion (e.g. 
a time threshold for looping in the VP of Eng), not vague guidance like 'escalate as needed'", + "Roles table assigns explicit authority (who can declare Sev1, escalate, or roll back), directly addressing the 'nobody knew who was in charge' complaint from the prompt", + "Phases move Detect -> Triage -> Respond -> Recover -> Review with a stated exit condition for each phase" ] }, { "id": 2, - "prompt": "Is this a playbook or a runbook? It's a list of commands to restart the queue worker.", - "expected_output": "Identifies it as a tactical runbook, not a playbook.", + "prompt": "Someone on my team wrote this and called it our 'playbook': 'Step 1: SSH into the queue-worker box. Step 2: run systemctl restart queue-worker. Step 3: tail the logs for 2 minutes to confirm.' Is that actually a playbook? Write your assessment to classification.md.", + "expected_output": "Identifies the command list as a tactical runbook, not a playbook, explains the strategic/tactical distinction, and recommends the sre-runbook skill instead, saved to classification.md.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "classification.md", + "pattern": "(?i)runbook", + "description": "Recommends the runbook genre for this content" + }, + { + "type": "regex_match", + "file": "classification.md", + "pattern": "(?i)tactical", + "description": "Uses 'tactical' framing to describe the command list" + }, + { + "type": "regex_match", + "file": "classification.md", + "pattern": "(?i)sre-runbook", + "description": "Names the sre-runbook skill by name" + }, + { + "type": "file_not_contains", + "file": "classification.md", + "literal": "## Roles & Responsibilities", + "description": "Does not restructure the answer into a full playbook with a roles section" + } + ], "expectations": [ - "Classifies a single tactical command list as a runbook", - "Explains a playbook is strategic (roles, decisions, phases across a scenario class)", - "Recommends the sre-runbook skill for 
the command list" + "Explains why the three restart steps are strategy-free: no roles, decisions, or coordination across a scenario class -- just steps for one symptom on one service", + "Does not rewrite the three restart steps into a full playbook with roles or a decision framework, since the request was to classify the content, not produce a new document" ] }, { "id": 3, - "prompt": "Our outage playbook has no defined roles or comms cadence. Fix it.", - "expected_output": "Adds a roles/responsibilities matrix and a communications plan with cadence.", + "prompt": "Our outage playbook for the API team has no roles section and no comms cadence -- it just says 'we handle sev1s together.' Fix it and save the corrected playbook as playbook.md.", + "expected_output": "Adds an explicit Roles & Responsibilities table with named authority and a Communications Plan with a concrete internal/external cadence, saved to playbook.md.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Roles & Responsibilities", + "description": "Has a Roles & Responsibilities section" + }, + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Communications Plan", + "description": "Has a Communications Plan section" + }, + { + "type": "regex_match", + "file": "playbook.md", + "pattern": "(?i)(every\\s+\\d+\\s*minutes|cadence)", + "description": "States a concrete communications cadence, not just 'regular updates'" + }, + { + "type": "file_contains", + "file": "playbook.md", + "literal": "type: procedural", + "description": "MIF frontmatter declares type: procedural" + } + ], "expectations": [ - "Adds explicit roles with authority (IC, Comms Lead, Ops)", - "Adds internal and external communications cadence", - "Adds escalation criteria and phase exit conditions" + "Roles & Responsibilities table names at least an Incident Commander and a Communications Lead, replacing the vague 'we handle sev1s together'", + 
"Communications plan distinguishes an internal update cadence from an external/status-page cadence, rather than one undifferentiated cadence" + ] + }, + { + "id": 4, + "prompt": "We're a social-media team, not SRE, and we keep improvising every time a post blows up with a wave of angry replies. Write a playbook coordinating how we respond to a viral negative-reaction incident on social media, and save it as playbook.md.", + "expected_output": "A playbook whose roles, decision framework, and phases are adapted to a social-media crisis (not reused verbatim from an SRE outage template), saved to playbook.md.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Roles & Responsibilities", + "description": "Has a Roles & Responsibilities section" + }, + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Decision Framework", + "description": "Has a Decision Framework section" + }, + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Phases", + "description": "Has a Phases section" + }, + { + "type": "regex_match", + "file": "playbook.md", + "pattern": "(?i)(social media|viral|negative reaction)", + "description": "Content is grounded in the social-media incident domain from the prompt" + } + ], + "expectations": [ + "Roles and decision framework are adapted to the social-media domain (e.g. comms/moderation/legal roles, thresholds like reply volume or press pickup) rather than an SRE Incident Commander/on-call template pasted in unchanged", + "Phases map Detect -> Triage -> Respond -> Recover -> Review onto the social-media scenario with domain-appropriate exit criteria, not server-outage language" + ] + }, + { + "id": 5, + "prompt": "Before I ship our new playbook for the Feature Flag rollback process, what MIF frontmatter type does the playbook genre need, and what's the validation gate? 
Write it up in gate-notes.md.", + "expected_output": "States the MIF type is procedural, that L1 (type: procedural) is the floor gate, namespace/tags/title climb it to L2, and the gate is mif-validate --level 1, saved to gate-notes.md.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "type: procedural", + "description": "States the MIF type is procedural" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)mif-validate", + "description": "Names mif-validate as the gate" + }, + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "--level 1", + "description": "Names the specific gate invocation --level 1" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)namespace", + "description": "Names namespace as an L2 field to climb to" + } + ], + "expectations": [ + "Frames L1 (type: procedural) as the floor gate and namespace/tags/title as an optional climb to L2, not a hard requirement for every playbook", + "Answer is specific to the playbook genre rather than a generic MIF frontmatter tutorial unrelated to the Feature Flag rollback question asked" + ] + }, + { + "id": 6, + "prompt": "Just write me 'a playbook.' 
That's literally all the detail I have right now -- save it as playbook.md and mark whatever you don't actually know.", + "expected_output": "Produces the full playbook section structure in skeleton form, marking the unspecified scenario/scope and other unknowns as placeholders instead of inventing a fabricated incident class, saved to playbook.md.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Roles & Responsibilities", + "description": "Has a Roles & Responsibilities section even with no scenario given" + }, + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Decision Framework", + "description": "Has a Decision Framework section even with no scenario given" + }, + { + "type": "file_contains", + "file": "playbook.md", + "literal": "## Communications Plan", + "description": "Has a Communications Plan section even with no scenario given" + }, + { + "type": "regex_match", + "file": "playbook.md", + "pattern": "(?i)(tbd|placeholder|to be determined|\\[fill in\\]|unspecified)", + "description": "Marks unspecified scenario details as placeholders rather than inventing a fake scenario" + } + ], + "expectations": [ + "Does not silently invent a fabricated incident class (e.g. 
assuming a Sev1 server outage) without flagging that the scenario/scope was never specified by the user", + "Still produces the full section structure (Scenario & Scope, Roles, Decision Framework, Phases, Communications, Post-Incident Review) in skeleton form rather than refusing or replying with only a clarifying question and no draft" ] } ] diff --git a/skills/prd/evals/evals.json b/skills/prd/evals/evals.json index a55bdc3..e3850db 100644 --- a/skills/prd/evals/evals.json +++ b/skills/prd/evals/evals.json @@ -3,36 +3,194 @@ "evals": [ { "id": 1, - "prompt": "Write a PRD for a saved-carts feature to reduce checkout abandonment.", - "expected_output": "A problem-first PRD with success metrics, personas, EARS requirements, and explicit non-goals.", + "prompt": "We're losing checkout revenue: 30% of signed-in shoppers abandon a cart to compare prices elsewhere and never come back because we don't preserve the cart. Write a PRD for a saved-carts feature that persists a signed-in user's cart across sessions and devices for 30 days. Save it as prd.md.", + "expected_output": "A problem-first PRD saved to prd.md with a Problem Statement, measurable Goals & Success Metrics, Users/Personas, EARS-style Requirements, an explicit Scope & Non-Goals section, and MIF frontmatter (type: semantic).", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "prd.md", + "literal": "## Problem Statement", + "description": "Has a Problem Statement section" + }, + { + "type": "file_contains", + "file": "prd.md", + "literal": "## Goals & Success Metrics", + "description": "Has a Goals & Success Metrics section" + }, + { + "type": "regex_match", + "file": "prd.md", + "pattern": "(WHEN|IF|WHILE)[^\\n]+SHALL", + "description": "At least one requirement is expressed as a testable EARS (WHEN/IF/WHILE ... 
SHALL) criterion" + }, + { + "type": "regex_match", + "file": "prd.md", + "pattern": "(?i)non-goals", + "description": "Has a Non-Goals section" + }, + { + "type": "file_contains", + "file": "prd.md", + "literal": "type: semantic", + "description": "MIF frontmatter declares type: semantic" + } + ], "expectations": [ - "Opens with the problem and its evidence, not a solution", - "Includes measurable success metrics and a Non-Goals section", - "Expresses functional requirements as EARS criteria", - "Emits MIF frontmatter type: semantic and passes mif-validate --level 1" + "The Problem Statement opens with the user/business problem and cites the abandonment evidence from the prompt, not a proposed solution", + "The Goals & Success Metrics section states a numeric target (e.g. a percentage or count), not a vague aspiration like 'improve retention'" ] }, { "id": 2, - "prompt": "Review this PRD — it starts with 'we'll build a Go microservice'. What's wrong?", - "expected_output": "Flags solution-first framing and missing metrics/non-goals; refocuses on the problem.", + "prompt": "Here's the draft PRD my team wrote for our new inventory sync feature: it opens with 'We'll build a Go microservice that polls the warehouse API every 5 minutes and writes to Postgres.' There are no success metrics and no non-goals section anywhere in the doc. 
Review it and rewrite it as a proper PRD, saving your rewrite as prd.md.", + "expected_output": "Identifies the solution-first opening and missing metrics/non-goals as antipatterns, then produces a corrected PRD (saved to prd.md) that leads with the problem, adds measurable metrics, and includes a Non-Goals section.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "prd.md", + "literal": "## Problem Statement", + "description": "Rewrite has a Problem Statement section" + }, + { + "type": "regex_match", + "file": "prd.md", + "pattern": "(?i)non-goals", + "description": "Rewrite adds a Non-Goals section that the draft lacked" + }, + { + "type": "file_not_contains", + "file": "prd.md", + "literal": "We'll build a Go microservice", + "description": "Rewrite does not keep the original solution-first opening line verbatim" + } + ], "expectations": [ - "Identifies the solution-first opening as an antipattern", - "Notes missing success metrics and non-goals", - "Recommends leading with the user problem and adding measurable goals" + "The response explicitly names the solution-first opening (leading with 'Go microservice') as the antipattern before rewriting it", + "The rewritten Problem Statement no longer mentions the Go microservice implementation as the lead sentence; it leads with the inventory-sync problem instead" ] }, { "id": 3, - "prompt": "Make these PRD requirements testable for our QA and coding agents.", - "expected_output": "Rewrites vague requirements as EARS acceptance criteria.", + "prompt": "Our PRD for the notifications-preferences feature has vague requirements like 'the system should let users manage their notification settings reasonably quickly.' Rewrite the requirements section as testable EARS acceptance criteria our QA team and coding agent can verify directly. 
Save the rewritten requirements as requirements.md.", + "expected_output": "Each vague requirement is rewritten as an individually verifiable EARS (WHEN/IF/WHILE ... SHALL) criterion in requirements.md, replacing subjective language like 'reasonably quickly'.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "requirements.md", + "pattern": "(WHEN|IF|WHILE)[^\\n]+SHALL", + "description": "At least one requirement uses EARS WHEN/IF/WHILE ... SHALL form" + }, + { + "type": "file_not_contains", + "file": "requirements.md", + "literal": "reasonably quickly", + "description": "Vague subjective phrasing from the original draft is removed" + } + ], "expectations": [ - "Converts prose requirements into EARS criteria", - "Each requirement becomes individually verifiable", - "References the ears-acceptance-criteria helper conventions" + "Each rewritten requirement names a concrete, checkable trigger and system response instead of a subjective adverb like 'quickly' or 'reasonably'", + "The response references the EARS acceptance-criteria conventions (e.g. mentions the ears-acceptance-criteria skill or its WHEN/IF/WHILE/SHALL pattern) rather than inventing an unrelated requirements format" + ] + }, + { + "id": 4, + "prompt": "We want to scope a fraud-detection-alerts feature for the risk team, but product hasn't given us target metrics yet — no abandonment numbers, no percentages, nothing concrete. Sketch the full PRD structure now anyway so design can start, and mark the sections that need real numbers later. 
Save it as prd.md.", + "expected_output": "Produces the full PRD structure (Problem Statement through Open Questions) but marks the Goals & Success Metrics as placeholders/TBD rather than inventing specific unverified numbers.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "prd.md", + "literal": "## Goals & Success Metrics", + "description": "Still produces a Goals & Success Metrics section despite missing data" + }, + { + "type": "regex_match", + "file": "prd.md", + "pattern": "(?i)(tbd|placeholder|to be determined|\\[fill in\\])", + "description": "Marks unknown metric values as placeholders rather than fabricating them" + }, + { + "type": "file_contains", + "file": "prd.md", + "literal": "## Open Questions", + "description": "Still produces an Open Questions section" + } + ], + "expectations": [ + "The response does not fabricate specific unfounded percentages or counts for the Goals & Success Metrics section; any numbers shown are clearly flagged as examples, not asserted as the real target", + "All seven PRD sections from the pattern (Problem Statement through Open Questions) are still present in skeleton form rather than dropped for missing data" + ] + }, + { + "id": 5, + "prompt": "I keep mixing up when to write a PRD versus a feature-spec versus a Google-style design doc for the same project. 
Explain the difference for our team wiki and save it as comparison.md.", + "expected_output": "Explains that a PRD is problem-first and outcome-focused (what/why, not how), while feature-spec and design docs cover the technical how, and names feature-spec explicitly as the anti-trigger alternative from the skill description.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "comparison.md", + "literal": "feature-spec", + "description": "Names feature-spec as the technical-how alternative" + }, + { + "type": "regex_match", + "file": "comparison.md", + "pattern": "(?i)(design doc)", + "description": "Names a design doc as another technical-how alternative" + }, + { + "type": "regex_match", + "file": "comparison.md", + "pattern": "(?i)(problem|why)[^\\n]{0,60}(what|solution|how)", + "description": "States the PRD answers what/why rather than how" + } + ], + "expectations": [ + "The response clearly states a PRD scopes what to build and why, before design, while feature-spec/design docs cover the implementation approach", + "The comparison gives a concrete reason for choosing one over the other (e.g. a PRD precedes a feature-spec in the workflow) rather than just listing names" + ] + }, + { + "id": 6, + "prompt": "Before I call our new PRD for the loyalty-points feature done, what MIF frontmatter and validation gate does the prd genre actually require? 
Write it up in gate-notes.md so the rest of the team knows.", + "expected_output": "States the MIF type is semantic, that L1 is the floor with climbing to L2 via namespace/tags/title, and that the gate is mif-validate --level 1.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "type: semantic", + "description": "States the MIF type is semantic" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)mif-validate", + "description": "Names mif-validate as the gate" + }, + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "--level 1", + "description": "Names the specific gate invocation --level 1" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)namespace", + "description": "Names namespace as an L2 field to climb to" + } + ], + "expectations": [ + "Frames L1 (type: semantic) as the minimum required floor and L2 as an optional climb, not a hard requirement for every doc", + "Names namespace, tags, and title together as the specific L2 fields, not vague 'more metadata' language" ] } ] diff --git a/skills/python-pep/SKILL.md b/skills/python-pep/SKILL.md index da6e49b..94ab5d3 100644 --- a/skills/python-pep/SKILL.md +++ b/skills/python-pep/SKILL.md @@ -37,7 +37,11 @@ Draft ──> Accepted ──> Final implementation lands. - **Rejected**, **Withdrawn**, and **Deferred** are terminal-for-now outcomes; **Superseded** points forward to the PEP that replaced it. -- The header's `Status:` field always reflects exactly one of these states. +- **The header's `Status:` field always reflects exactly one of these states at + a time** — a PEP is never simultaneously Draft and Rejected, or Accepted and + Superseded. State this single-state constraint explicitly any time you + discuss, review, or recommend a `Status:` value — don't let it stay implicit + just because it's obvious to you; the reader needs it spelled out too. 
## Pattern (industry: PEP 1 / PEP 12) @@ -65,6 +69,28 @@ Draft ──> Accepted ──> Final first. "None" is an acceptable answer; silence is not. - Address the reader as the Steering Council: state the change, then defend it. +## Reviewing an existing draft + +When asked to review, critique, or give feedback on a draft PEP (rather than +author one from scratch), a list of named gaps is not a review — it's a todo +list the author still has to solve alone. For every gap you flag, also supply +the concrete text that fills it, not just an instruction to fill it: + +- **Header problems**: write out a corrected header block (or the corrected + line) with real values, e.g. `Status: Draft` instead of "fix the Status + field to a valid lifecycle state." +- **Missing or empty required sections** (Backwards Compatibility, Security + Implications, Rejected Ideas, etc.): draft the actual replacement text — + even a short paragraph or a one-line "None: this change touches no public + API" — not just "add a Backwards Compatibility section." +- **Vague or unspecified prose**: rewrite the offending sentence or paragraph + as it should read, inline or as a suggested replacement block, not merely a + description of what's wrong with it. + +Naming a gap tells the author what's missing; drafting the fix tells them what +"done" looks like and saves a review round-trip. Include both in every review +output — the diagnosis and the corrected text — never the diagnosis alone. 
+ ## MIF frontmatter `type: semantic` — a PEP is declarative design knowledge, not a time-bound log diff --git a/skills/python-pep/evals/evals.json b/skills/python-pep/evals/evals.json index 4bc76a1..0ec7727 100644 --- a/skills/python-pep/evals/evals.json +++ b/skills/python-pep/evals/evals.json @@ -3,51 +3,280 @@ "evals": [ { "id": 1, - "prompt": "Draft a PEP proposing a new function for the standard library math module that clamps a number to a range.", - "expected_output": "A Standards Track PEP with the RFC822 header preamble and all main sections, specifying the new function precisely and defending the design.", + "prompt": "Draft a PEP proposing a new function for the standard library math module that clamps a number to a range. Save it as pep-draft.md.", + "expected_output": "A Standards Track PEP with the RFC822 header preamble and all main sections, specifying the new function precisely and defending the design, saved to pep-draft.md.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "pep-draft.md", + "pattern": "(?im)^Type:\\s*Standards Track", + "description": "Header declares Type: Standards Track" + }, + { + "type": "regex_match", + "file": "pep-draft.md", + "pattern": "(?im)^Status:\\s*(Draft|Accepted|Final|Provisional)", + "description": "Header Status is a valid open lifecycle state" + }, + { + "type": "regex_match", + "file": "pep-draft.md", + "pattern": "(?im)^Python-Version:", + "description": "Header includes Python-Version (required for Standards Track)" + }, + { + "type": "regex_match", + "file": "pep-draft.md", + "pattern": "(?im)^##?\\s*Abstract", + "description": "Has an Abstract section" + }, + { + "type": "regex_match", + "file": "pep-draft.md", + "pattern": "(?im)^##?\\s*Backwards Compatibility", + "description": "Has a Backwards Compatibility section" + }, + { + "type": "regex_match", + "file": "pep-draft.md", + "pattern": "(?im)^##?\\s*Rejected Ideas", + "description": "Has a Rejected Ideas section" + }, 
+ { + "type": "file_contains", + "file": "pep-draft.md", + "literal": "type: semantic", + "description": "Emits MIF frontmatter with type: semantic" + } + ], "expectations": [ - "Includes an RFC822-style header preamble with PEP, Title, Author, Status, Type, Created, and Python-Version fields", - "Type is one of Standards Track, Informational, or Process; Status is a valid lifecycle state", - "Contains Abstract, Motivation, Rationale, Specification, Backwards Compatibility, Security Implications, How to Teach This, Reference Implementation, Rejected Ideas, and Open Issues sections", - "The Specification is precise enough to implement (defines behavior for edge cases, not just the happy path)", - "Emits MIF frontmatter with type: semantic and passes mif-validate --level 1" + "Contains Motivation, Rationale, Specification, Security Implications, How to Teach This, Reference Implementation, and Open Issues sections in addition to the Abstract, Backwards Compatibility, and Rejected Ideas sections already covered by the deterministic checks", + "The Specification is precise enough to implement: it defines the function signature and behavior for edge cases (e.g. min > max, non-numeric input), not just the happy path", + "The MIF frontmatter appears ahead of the RFC822 header preamble, not interleaved with it" ] }, { "id": 2, - "prompt": "Review this PEP draft for me — reviewers keep bouncing it. What required sections am I missing and why does it matter?", - "expected_output": "Identifies the missing Backwards Compatibility and Rejected Ideas sections, the malformed header, and explains why each blocks acceptance.", + "prompt": "Review this PEP draft for me — reviewers keep bouncing it. What required sections am I missing and why does it matter?
Write your findings to review-notes.md.", + "expected_output": "Identifies the missing Backwards Compatibility and Rejected Ideas sections, the malformed header, and explains why each blocks acceptance, written to review-notes.md.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "review-notes.md", + "pattern": "(?i)backwards compat", + "description": "Flags missing Backwards Compatibility" + }, + { + "type": "regex_match", + "file": "review-notes.md", + "pattern": "(?i)rejected ideas", + "description": "Flags missing Rejected Ideas" + }, + { + "type": "regex_match", + "file": "review-notes.md", + "pattern": "(?i)header", + "description": "Calls out the header preamble as a problem" + }, + { + "type": "regex_match", + "file": "review-notes.md", + "pattern": "(?i)pep number", + "description": "Flags a missing or invalid PEP number in the header" + } + ], "expectations": [ - "Flags missing Backwards Compatibility and Rejected Ideas as the most common review blockers", - "Checks the header preamble for a valid Type, a valid Status lifecycle state, and an assigned PEP number", - "Explains that 'None' is an acceptable answer for compatibility/security but silence is not", - "Offers a corrected version that fills the required sections" + "Explains that 'None' is an acceptable answer for compatibility/security but silence (an omitted section) is not", + "Checks the header preamble for a valid Type, a valid Status lifecycle state, and an assigned PEP number, not just prose sections", + "Offers a corrected version or concrete text that fills the required sections rather than only naming the gaps" ] }, { "id": 3, - "prompt": "What is the difference between the three PEP types, and which one should a governance change use?", - "expected_output": "Explains Standards Track, Informational, and Process types and recommends Process for a governance change.", + "prompt": "What is the difference between the three PEP types, and which one should a 
governance change use? Write the explanation to pep-types.md.", + "expected_output": "Explains Standards Track, Informational, and Process types and recommends Process for a governance change, written to pep-types.md.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "pep-types.md", + "literal": "Standards Track", + "description": "Names the Standards Track type" + }, + { + "type": "file_contains", + "file": "pep-types.md", + "literal": "Informational", + "description": "Names the Informational type" + }, + { + "type": "regex_match", + "file": "pep-types.md", + "pattern": "(?i)recommend.{0,60}process", + "description": "Recommends the Process type for the governance change" + }, + { + "type": "regex_match", + "file": "pep-types.md", + "pattern": "(?i)governance", + "description": "Uses governance vocabulary tying the recommendation to the prompt" + } + ], "expectations": [ - "Defines Standards Track as a new language, C API, or standard-library feature", - "Defines Informational and Process, contrasting Process (a change to a process around Python) with the others", - "Recommends the Process type for a governance/decision-making change", + "Defines Standards Track as a new language, C API, or standard-library feature change", + "Contrasts Process (a change to a process around Python, e.g. governance/decision-making) with Informational (design guidance/conventions that propose no feature)", "References that the Type field appears in the PEP header preamble" ] }, { "id": 4, - "prompt": "My PEP was turned down but I want to keep it on record pointing at the proposal that replaced it. What status should it carry?", - "expected_output": "Explains the status lifecycle and recommends Superseded (pointing forward) versus Rejected/Withdrawn/Deferred.", + "prompt": "My PEP was turned down but I want to keep it on record pointing at the proposal that replaced it. What status should it carry? 
Answer in status-advice.md.", + "expected_output": "Explains the status lifecycle and recommends Superseded (pointing forward) versus Rejected/Withdrawn/Deferred, written to status-advice.md.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "status-advice.md", + "literal": "Superseded", + "description": "Recommends Superseded status" + }, + { + "type": "regex_match", + "file": "status-advice.md", + "pattern": "(?i)rejected", + "description": "Mentions Rejected as a contrasting status" + }, + { + "type": "regex_match", + "file": "status-advice.md", + "pattern": "(?i)withdrawn", + "description": "Mentions Withdrawn as a contrasting status" + }, + { + "type": "regex_match", + "file": "status-advice.md", + "pattern": "(?i)deferred", + "description": "Mentions Deferred as a contrasting status" + } + ], "expectations": [ - "Recommends Superseded when a later PEP replaces this one, and explains it points forward", - "Contrasts Superseded with Rejected (turned down on merits), Withdrawn (author abandons), and Deferred (no champion)", + "Explains that Superseded points forward to the PEP that replaced it, distinct from a terminal-for-now outcome", + "Contrasts Superseded with Rejected (turned down on its merits), Withdrawn (author abandons it), and Deferred (no champion / not ready)", "Notes the Status field reflects exactly one lifecycle state at a time" ] + }, + { + "id": 5, + "prompt": "I'm proposing a new 'match' statement syntax variant for the language itself. What sections does a Standards Track PEP need before it can reach Final status, and what happens if I skip the reference implementation? 
Write this up as final-status-requirements.md.", + "expected_output": "Explains that a working Reference Implementation is required before a Standards Track PEP can be marked Final, lists the other required sections, and states what happens if it's skipped (the PEP cannot progress past Accepted/Draft).", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "final-status-requirements.md", + "pattern": "(?i)reference implementation", + "description": "Names Reference Implementation section" + }, + { + "type": "regex_match", + "file": "final-status-requirements.md", + "pattern": "(?i)final", + "description": "References the Final status" + }, + { + "type": "regex_match", + "file": "final-status-requirements.md", + "pattern": "(?i)(accepted|draft)", + "description": "Names the status the PEP is stuck at without a reference implementation" + }, + { + "type": "regex_match", + "file": "final-status-requirements.md", + "pattern": "(?i)backwards compat", + "description": "Names Backwards Compatibility among the other required sections" + } + ], + "expectations": [ + "States explicitly that a working reference implementation is required before Standards Track can reach Final", + "Lists at least three other required sections (e.g. Backwards Compatibility, Specification, Security Implications) beyond Reference Implementation", + "Ties the answer back to the Standards Track type specifically, not the other two PEP types" + ] + }, + { + "id": 6, + "prompt": "Draft a Process-type PEP proposing that the Steering Council require a security sign-off before merging any PEP touching the C API. 
Save it to security-process-pep.md.", + "expected_output": "A Process PEP (not Standards Track) with the header preamble, main sections, and a Security Implications section addressing the proposal's own governance impact rather than code security, saved to security-process-pep.md.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "security-process-pep.md", + "pattern": "(?im)^Type:\\s*Process", + "description": "Header declares Type: Process, not Standards Track" + }, + { + "type": "file_not_contains", + "file": "security-process-pep.md", + "literal": "Python-Version:", + "description": "Header omits Python-Version (not required outside Standards Track)" + }, + { + "type": "regex_match", + "file": "security-process-pep.md", + "pattern": "(?im)^##?\\s*Backwards Compatibility", + "description": "Still includes Backwards Compatibility (required regardless of type)" + }, + { + "type": "regex_match", + "file": "security-process-pep.md", + "pattern": "(?im)^##?\\s*Security Implications", + "description": "Has a Security Implications section" + } + ], + "expectations": [ + "Does not include a Python-Version header field, since that field is specific to Standards Track PEPs", + "The Security Implications section discusses governance/process risk (e.g. gatekeeping, review bottlenecks) rather than a code vulnerability surface", + "Contains Motivation, Rationale, Specification, and Rejected Ideas sections appropriate to a process change" + ] + }, + { + "id": 7, + "prompt": "I have a rough idea for a PEP but no header, no status, nothing formal yet — just a paragraph of prose about adding a new 'freeze' builtin. 
Turn it into a proper Draft PEP skeleton and save it as freeze-builtin-pep.md.", + "expected_output": "Produces a complete Draft-status Standards Track PEP skeleton with the header preamble and all required sections populated (even briefly) from the rough idea, not just the Abstract, saved to freeze-builtin-pep.md.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "freeze-builtin-pep.md", + "pattern": "(?im)^Status:\\s*Draft", + "description": "Header Status is Draft for a brand-new proposal" + }, + { + "type": "regex_match", + "file": "freeze-builtin-pep.md", + "pattern": "(?im)^##?\\s*Specification", + "description": "Has a Specification section" + }, + { + "type": "regex_match", + "file": "freeze-builtin-pep.md", + "pattern": "(?im)^##?\\s*Open Issues", + "description": "Has an Open Issues section" + }, + { + "type": "regex_match", + "file": "freeze-builtin-pep.md", + "pattern": "(?im)^##?\\s*Backwards Compatibility", + "description": "Has a Backwards Compatibility section even for a rough-idea skeleton" + } + ], + "expectations": [ + "Does not leave any of the eleven required sections entirely blank or omitted, even when the source material was informal prose", + "The Open Issues section reflects genuinely unresolved questions about the freeze builtin rather than boilerplate placeholder text", + "Frames the new builtin's behavior in the Specification precisely enough that two implementers would produce the same result" + ] } ] } diff --git a/skills/rust-rfc/SKILL.md b/skills/rust-rfc/SKILL.md index 14b1f3f..accf9e6 100644 --- a/skills/rust-rfc/SKILL.md +++ b/skills/rust-rfc/SKILL.md @@ -51,11 +51,33 @@ section and build it from the reference section, the split is correct. without them is just advocacy. - Motivation must be concrete: a real scenario and the friction it causes, not "this would be nice" or "developers want it." +- "Other languages/ecosystems already have this" is Prior art, not Motivation. 
+ It is supporting evidence at most — it can back up a concrete scenario, but + it cannot replace one. If the Motivation section's own justification (its + "why now") leans on a comparison to other languages instead of resting on + the scenario, the cost it imposes, and who pays it, that is the same + hand-waving the rule above forbids, just relocated to the closing paragraph. + Keep the "why now" argument grounded in the concrete scenario already given. - Keep guide-level and reference-level distinct. Desugaring in the guide section or teaching tone in the reference section is the most common drift. - Propose, do not decide. The RFC argues a position; the discussion and a separate decision record settle it. +## When it's not an RFC + +If the request describes a decision that has already been made (a team already +chose an approach, already adopted a library, already settled a debate), +don't write an RFC — an RFC is pre-decision, forward-looking, and persuasive; +retrofitting one onto a settled choice is reverse-engineering a debate that +never happened. Recommend the right genre instead (usually an ADR via +structured-madr, since that genre records a decision already taken along with +its drivers, options, and consequences) and explain why, using the same +"before consensus" vs. "after the decision" distinction this skill draws +between RFCs and ADRs. Don't stop at naming the genre — offer to actually +draft it: ask for whatever concrete inputs that genre needs (e.g. for an ADR: +the options considered, the chosen outcome, the decision drivers) and propose +producing the artifact, not just a description of what it would contain. 
+ ## MIF frontmatter `type: semantic` — an RFC is declarative design knowledge, not a how-to and not diff --git a/skills/rust-rfc/evals/evals.json b/skills/rust-rfc/evals/evals.json index 430d5e0..d37eb37 100644 --- a/skills/rust-rfc/evals/evals.json +++ b/skills/rust-rfc/evals/evals.json @@ -3,59 +3,228 @@ "evals": [ { "id": 1, - "prompt": "Write a Rust-style RFC proposing a new standard-library iterator adapter `try_chunks(n)` that yields fixed-size chunks and surfaces a partial final chunk.", - "expected_output": "A complete nine-section RFC: Summary, Motivation, Guide-level explanation, Reference-level explanation, Drawbacks, Rationale and alternatives, Prior art, Unresolved questions, Future possibilities — each substantive.", + "prompt": "Write a Rust-style RFC proposing a new standard-library iterator adapter `try_chunks(n)` that yields fixed-size chunks and surfaces a partial final chunk. Save the RFC as rfc-draft.md.", + "expected_output": "A complete nine-section RFC saved to rfc-draft.md: Summary, Motivation, Guide-level explanation, Reference-level explanation, Drawbacks, Rationale and alternatives, Prior art, Unresolved questions, Future possibilities — each substantive, with MIF frontmatter declaring type: semantic.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "rfc-draft.md", + "pattern": "(?im)^#{1,3}\\s*Summary\\b", + "description": "Has a Summary section" + }, + { + "type": "regex_match", + "file": "rfc-draft.md", + "pattern": "(?im)^#{1,3}\\s*Motivation\\b", + "description": "Has a Motivation section" + }, + { + "type": "regex_match", + "file": "rfc-draft.md", + "pattern": "(?im)^#{1,3}\\s*Guide-level explanation\\b", + "description": "Has a Guide-level explanation section" + }, + { + "type": "regex_match", + "file": "rfc-draft.md", + "pattern": "(?im)^#{1,3}\\s*Reference-level explanation\\b", + "description": "Has a Reference-level explanation section" + }, + { + "type": "regex_match", + "file": 
"rfc-draft.md", + "pattern": "(?im)^#{1,3}\\s*Drawbacks\\b", + "description": "Has a Drawbacks section" + }, + { + "type": "regex_match", + "file": "rfc-draft.md", + "pattern": "(?im)^#{1,3}\\s*Rationale and [Aa]lternatives\\b", + "description": "Has a Rationale and alternatives section" + }, + { + "type": "regex_match", + "file": "rfc-draft.md", + "pattern": "(?im)^#{1,3}\\s*Prior art\\b", + "description": "Has a Prior art section" + }, + { + "type": "regex_match", + "file": "rfc-draft.md", + "pattern": "(?im)^#{1,3}\\s*Unresolved questions\\b", + "description": "Has an Unresolved questions section" + }, + { + "type": "regex_match", + "file": "rfc-draft.md", + "pattern": "(?im)^#{1,3}\\s*Future possibilities\\b", + "description": "Has a Future possibilities section" + }, + { + "type": "regex_match", + "file": "rfc-draft.md", + "pattern": "type:\\s*semantic", + "description": "MIF frontmatter declares type: semantic" + } + ], "expectations": [ - "Includes all nine rust-lang RFC sections in order, none empty", - "Guide-level explanation teaches the feature as if shipped, with a worked example and the result type", - "Reference-level explanation specifies semantics precisely (signature, behavior, edge cases) for an implementer", - "Drawbacks and 'Rationale and alternatives' are both present and substantive, listing real trade-offs and at least the do-nothing alternative", - "Emits MIF frontmatter with type: semantic and passes mif-validate --level 1" + "Guide-level explanation teaches try_chunks as if already shipped, with a worked example showing happy-path usage and the resulting type", + "Reference-level explanation specifies exact semantics for an implementer: method signature, behavior on the final partial chunk, and edge cases like an empty source iterator", + "Drawbacks and Rationale and alternatives are substantive with real trade-offs, not one-line filler, and Rationale and alternatives considers at least the do-nothing alternative" ] }, { "id": 2, - 
"prompt": "My RFC keeps getting bounced because reviewers say the guide-level and reference-level sections say the same thing. How do I tell them apart?", - "expected_output": "Explains the two-audience split: guide-level teaches the shipped feature to a user (narrative, examples), reference-level specifies it for an implementer (desugaring, types, edge cases).", + "prompt": "My RFC keeps getting bounced because reviewers say the guide-level and reference-level sections say the same thing. Explain the split and show me a concrete before/after rewrite for a made-up feature. Save your answer as guide-vs-reference.md.", + "expected_output": "guide-vs-reference.md explains the two-audience split (guide-level teaches the shipped feature to a user; reference-level specifies it for an implementer) and gives a concrete worked rewrite showing the same feature at both altitudes.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "guide-vs-reference.md", + "literal": "Guide-level", + "description": "Names the guide-level explanation" + }, + { + "type": "file_contains", + "file": "guide-vs-reference.md", + "literal": "Reference-level", + "description": "Names the reference-level explanation" + }, + { + "type": "regex_match", + "file": "guide-vs-reference.md", + "pattern": "(?i)(desugar|type rule|edge case)", + "description": "Mentions implementer-facing precision (desugaring, type rules, or edge cases) as reference-level content" + } + ], "expectations": [ - "Distinguishes guide-level (user-facing, example-driven, happy path) from reference-level (implementer-facing, precise, exhaustive)", - "Gives a concrete rewrite showing the same feature at both altitudes", - "Notes that desugaring/type rules belong in reference-level, not the guide section" + "Distinguishes guide-level (narrative, example-driven, happy path) from reference-level (precise, exhaustive, implementer-facing) rather than just naming the two terms", + "Gives a concrete 
before/after rewrite showing the same hypothetical feature explained at both altitudes", + "States explicitly that desugaring, type rules, or edge-case handling belong in reference-level, not the guide section" ] }, { "id": 3, - "prompt": "Review this draft RFC and tell me what's missing. It has Summary, Motivation, Guide-level, Reference-level, and Prior art.", - "expected_output": "Flags the missing Drawbacks, Rationale and alternatives, Unresolved questions, and Future possibilities sections, and stresses that Drawbacks and Alternatives are mandatory.", + "prompt": "Review this draft RFC and tell me what's missing: it has Summary, Motivation, Guide-level explanation, Reference-level explanation, and Prior art. Save your findings as review-notes.md.", + "expected_output": "review-notes.md flags the missing Drawbacks, Rationale and alternatives, Unresolved questions, and Future possibilities sections, and stresses that Drawbacks and Rationale and alternatives are mandatory, without flagging the sections that are actually present.", "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "review-notes.md", + "literal": "Drawbacks", + "description": "Flags the missing Drawbacks section" + }, + { + "type": "regex_match", + "file": "review-notes.md", + "pattern": "(?i)rationale and alternatives", + "description": "Flags the missing Rationale and alternatives section" + }, + { + "type": "regex_match", + "file": "review-notes.md", + "pattern": "(?i)unresolved questions", + "description": "Flags the missing Unresolved questions section" + }, + { + "type": "regex_match", + "file": "review-notes.md", + "pattern": "(?i)future possibilities", + "description": "Flags the missing Future possibilities section" + } + ], "expectations": [ - "Identifies that Drawbacks and 'Rationale and alternatives' are absent and required", - "Notes Unresolved questions and Future possibilities are also missing", - "Explains that an RFC without Drawbacks and 
Alternatives is advocacy, not a design proposal" + "Explains that both the Drawbacks section and the Rationale and alternatives section are mandatory, not optional, and that their absence makes the draft advocacy rather than a design proposal", + "Does not incorrectly flag the sections that are actually present (Summary, Motivation, Guide-level explanation, Reference-level explanation, Prior art) as missing" ] }, { "id": 4, - "prompt": "Should I write this up as an RFC or an ADR? We've already decided to adopt async runtime X and I want to document it.", - "expected_output": "Recommends an ADR because the decision is already made; an RFC is for proposing a change and building consensus before deciding.", + "prompt": "Should I write this up as an RFC or an ADR? We've already decided to adopt async runtime X and I just need to document that decision. Save your recommendation as recommendation.md.", + "expected_output": "recommendation.md recommends an ADR because the decision is already made, and explains that an RFC is for proposing a change and building consensus before a decision is taken.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)\\bADR\\b", + "description": "Recommends an ADR" + }, + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)(before (a |the )?(decision|consensus)|already (been )?decided|decision.*(already )?(made|taken))", + "description": "Ties the recommendation to the decision already being made vs.
an RFC being pre-decision" + } + ], "expectations": [ - "Recommends an ADR (decision already taken) over an RFC", - "Explains that an RFC is forward-looking and persuasive, written before consensus", - "Offers to draft the appropriate document for the chosen genre" + "Recommends an ADR over an RFC because the decision is already taken, correctly rejecting the RFC genre for this scenario", + "Explains that an RFC is forward-looking and persuasive, written before consensus is reached, unlike a decision record", + "Offers to actually draft the ADR rather than only naming the correct genre" ] }, { "id": 5, - "prompt": "The Motivation section of my language-feature RFC just says 'developers want this and other languages have it.' Strengthen it.", - "expected_output": "Rewrites the Motivation with a concrete scenario, the painful current code, who hits the friction, and why now — replacing the hand-wavy appeal.", + "prompt": "The Motivation section of my language-feature RFC just says 'developers want this and other languages have it.' Strengthen it with a concrete scenario. 
Save the rewritten Motivation as motivation-rewrite.md.", + "expected_output": "motivation-rewrite.md replaces the vague appeal with a concrete scenario, the painful current code, who hits the friction, and why now — with no leftover hand-wavy justification.", "files": [], + "deterministic_checks": [ + { + "type": "file_not_contains", + "file": "motivation-rewrite.md", + "literal": "developers want this and other languages have it", + "description": "Removes the original hand-wavy sentence rather than keeping it verbatim" + }, + { + "type": "regex_match", + "file": "motivation-rewrite.md", + "pattern": "(?i)(for example|e\\.g\\.|consider the following|here'?s (a|an) (example|scenario))", + "description": "Introduces a concrete example or scenario marker" + } + ], "expectations": [ - "Replaces vague appeals with a concrete scenario and example of the current friction", + "Replaces the vague appeal with a concrete scenario and example of the current friction, such as sample code that is awkward without the feature", "Names who experiences the problem and the cost of the status quo", - "Avoids hand-wavy justifications like 'everyone wants it' or 'it would be nice'" + "Does not reintroduce hand-wavy justifications like 'everyone wants it' or 'it would be nice' anywhere in the rewrite" + ] + }, + { + "id": 6, + "prompt": "Before I call the try_chunks RFC done, what MIF frontmatter and validation gate does rust-rfc actually require, and what does climbing to L2/L3 add? 
Write it up as gate-notes.md.", + "expected_output": "gate-notes.md states the MIF type is semantic, the L1 floor (id, type, created) gates with mif-validate --level 1, and that namespace/temporal/provenance/citations/relationships are optional climbs to L2/L3 rather than mandatory for every RFC.", + "files": [], + "deterministic_checks": [ + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "type: semantic", + "description": "States the MIF type is semantic" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)mif-validate", + "description": "Names mif-validate as the gate" + }, + { + "type": "file_contains", + "file": "gate-notes.md", + "literal": "--level 1", + "description": "Names the specific L1 gate invocation" + }, + { + "type": "regex_match", + "file": "gate-notes.md", + "pattern": "(?i)namespace", + "description": "Names namespace as an L2 field on the climb" + } + ], + "expectations": [ + "Frames the L1 floor (id, type, created) as the minimum required gate, not L2 or L3, for a document to be valid", + "Correctly identifies namespace/temporal as L2 additions and provenance/citations/relationships as the L3 full climb, rather than presenting them as mandatory for every RFC" ] } ] diff --git a/skills/sre-runbook/evals/evals.json b/skills/sre-runbook/evals/evals.json index 5864295..bfadd29 100644 --- a/skills/sre-runbook/evals/evals.json +++ b/skills/sre-runbook/evals/evals.json @@ -3,51 +3,305 @@ "evals": [ { "id": 1, - "prompt": "Our payments-api just started paging on a p99 latency SLO burn. Write the on-call runbook for this alert.", - "expected_output": "A tactical SRE runbook scoped to the one latency alert, with all seven sections (Overview, Prerequisites & Access, Detection, Diagnosis, Remediation, Escalation, Verification & Rollback), concrete commands, and MIF frontmatter.", + "prompt": "Our payments-api just started paging on a p99 latency SLO burn. 
Write the on-call runbook for this alert and save it as runbook.md.", + "expected_output": "A tactical SRE runbook scoped to the one latency alert, with all seven sections (Overview, Prerequisites & Access, Detection, Diagnosis, Remediation, Escalation, Verification & Rollback), concrete commands, and MIF frontmatter, saved to runbook.md.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)##\\s*(\\d+\\.\\s*)?Overview", + "description": "Has an Overview section" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)##\\s*(\\d+\\.\\s*)?Detection", + "description": "Has a Detection section" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)##\\s*(\\d+\\.\\s*)?Remediation", + "description": "Has a Remediation section" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)##\\s*(\\d+\\.\\s*)?Verification\\s*&?\\s*Rollback", + "description": "Has a Verification & Rollback section" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)type:\\s*procedural", + "description": "MIF frontmatter declares type: procedural" + }, + { + "type": "regex_not_match", + "file": "runbook.md", + "pattern": "(?i)(TODO|investigate as appropriate)", + "description": "Contains no placeholder text like TODO or 'investigate as appropriate'" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)Overview[\\s\\S]*Prerequisites[\\s\\S]*Detection[\\s\\S]*Diagnosis[\\s\\S]*Remediation[\\s\\S]*Escalation[\\s\\S]*Verification", + "description": "The seven sections appear in the canonical order" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)(SLO|threshold|p99)", + "description": "Detection names a measurable threshold/SLO term" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)expected result", + "description": "Remediation steps state an expected result" + } + ], 
"expectations": [ - "Contains all seven canonical sections in order: Overview, Prerequisites & Access, Detection, Diagnosis, Remediation, Escalation, Verification & Rollback", "Detection states a measurable trigger (an alert name plus a threshold/SLO), not a vague symptom", - "Remediation steps are numbered with a concrete command and an expected result for each", - "Includes an explicit Rollback path for undoing the remediation", - "Emits MIF frontmatter with type: procedural and passes mif-validate --level 1", - "Contains no placeholder text like TODO or 'investigate as appropriate'" + "Remediation steps are numbered, each with a concrete command to run" ] }, { "id": 2, - "prompt": "Review this runbook draft. The triage section just says 'investigate the issue and check the dashboards', there's no detection threshold, and no rollback. Tell me what's wrong and fix it.", - "expected_output": "Identifies the missing measurable detection trigger, the non-actionable diagnosis, and the absent rollback, then rewrites them as concrete commands with expected results and an explicit undo path.", + "prompt": "Review this runbook draft. The triage section just says 'investigate the issue and check the dashboards', there's no detection threshold, and no rollback. 
Tell me what's wrong and fix it, and save the corrected runbook as runbook.md.", + "expected_output": "Identifies the missing measurable detection trigger, the non-actionable diagnosis, and the absent rollback, then rewrites them as concrete commands with expected results and an explicit undo path in runbook.md.", "files": [], + "deterministic_checks": [ + { + "type": "regex_not_match", + "file": "runbook.md", + "pattern": "(?i)investigate the issue", + "description": "The vague 'investigate the issue' phrasing does not survive into the fixed runbook" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)##\\s*(\\d+\\.\\s*)?Verification\\s*&?\\s*Rollback", + "description": "Fixed runbook has a Verification & Rollback section" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)##\\s*(\\d+\\.\\s*)?Detection", + "description": "Fixed runbook has a Detection section" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)(SLO|threshold)", + "description": "Detection now names a measurable threshold/SLO term" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)expected result", + "description": "Diagnosis/Remediation steps now state an expected result" + } + ], "expectations": [ - "Flags 'investigate the issue' as non-actionable and replaces it with ordered triage commands that show how to read each output", - "Adds a measurable Detection trigger (alert name plus threshold/SLO)", - "Adds a Verification & Rollback section with an explicit pass condition and undo steps", + "Flags 'investigate the issue' as non-actionable before rewriting it", + "Replaces the vague triage line with ordered diagnosis commands that each show how to read the output", "References the 5 A's (Actionable/Accurate/Authoritative/Accessible/Adaptable) when explaining the gaps" ] }, { "id": 3, - "prompt": "Should this be a runbook or a playbook? 
It's about coordinating comms, roles, and multiple teams during any major checkout outage.", - "expected_output": "Recommends a playbook because the scope is strategic and multi-team, and explains that a runbook is the tactical fix for one specific alert.", + "prompt": "Should this be a runbook or a playbook? It's about coordinating comms, roles, and multiple teams during any major checkout outage. Write your recommendation to recommendation.md.", + "expected_output": "Recommends a playbook because the scope is strategic and multi-team, and explains that a runbook is the tactical fix for one specific alert, saved to recommendation.md.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)\\bplaybook\\b", + "description": "Names 'playbook' as the recommended genre" + }, + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)\\brunbook\\b", + "description": "Names 'runbook' when contrasting the tactical alternative" + }, + { + "type": "regex_match", + "file": "recommendation.md", + "pattern": "(?i)(multi-team|multiple teams|coordinat)", + "description": "Cites the multi-team coordination scope as the reason for the recommendation" + } + ], "expectations": [ - "Recommends a playbook for the strategic, multi-team coordination scope", - "Explains a runbook is tactical and scoped to one named alert/condition", - "Offers to draft a runbook for a specific alert if the user wants the tactical procedure" + "Recommends a playbook specifically because the scope is strategic and spans multiple teams, not just because it 'sounds bigger'", + "Explains a runbook is tactical and scoped to one named alert/condition, contrasting it with the playbook's coordination scope", + "Offers to draft a runbook for a specific alert if the user wants the tactical procedure instead" ] }, { "id": 4, - "prompt": "Turn these scratch notes from our last incident into a proper runbook: 'queue depth alert fired, 
checked consumer lag with kafka cli, scaled consumers from 4 to 8, lag drained, watched it for 10 min'.", - "expected_output": "A conformant runbook for the queue-depth alert derived from the notes, with the seven sections, real commands, expected results, escalation criteria, and a rollback to the original consumer count.", + "prompt": "Turn these scratch notes from our last incident into a proper runbook and save it as runbook.md: 'queue depth alert fired, checked consumer lag with kafka cli, scaled consumers from 4 to 8, lag drained, watched it for 10 min'.", + "expected_output": "A conformant runbook for the queue-depth alert derived from the notes, with the seven sections, real commands, expected results, escalation criteria, and a rollback to the original consumer count, saved to runbook.md.", "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)queue depth", + "description": "Detection names the queue-depth alert from the source notes" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)kafka", + "description": "Diagnosis/Remediation preserves the kafka cli tooling from the source notes" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)##\\s*(\\d+\\.\\s*)?Verification\\s*&?\\s*Rollback", + "description": "Has a Verification & Rollback section" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)(restore|revert|roll back)[^\\n]*\\b4\\b", + "description": "Rollback references restoring the original consumer count (4)" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)##\\s*(\\d+\\.\\s*)?Escalation", + "description": "Has an Escalation section" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)type:\\s*procedural", + "description": "Emits MIF-conformant frontmatter (type: procedural)" + } + ], "expectations": [ - "Structures the notes into the seven canonical sections in order", 
- "Detection names the queue-depth alert with a measurable trigger and a confirming command", - "Remediation includes the scale-out command with an expected result, and Rollback restores the original consumer count", - "Emits MIF-conformant frontmatter (type: procedural, namespace under runbook/) that passes mif-validate" + "Structures the raw notes into all seven canonical sections in order", + "Remediation includes the scale-out command (4 to 8 consumers) with an expected result such as lag draining", + "Rollback explicitly restores the original consumer count of 4 rather than leaving the scale-out permanent" + ] + }, + { + "id": 5, + "prompt": "We just deployed a fix for replica lag on the orders-db read replicas — write the runbook for 'replica lag exceeds 30s' and make sure it climbs to MIF Level 3 with a link to the DB incident playbook. Save it as runbook.md.", + "expected_output": "A Level 3 runbook with namespace, temporal validity, provenance, and a typed relationship to the DB incident playbook, in addition to the seven canonical sections, saved to runbook.md.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)namespace:", + "description": "Frontmatter includes a namespace field (L2/L3 climb)" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)relationships:", + "description": "Frontmatter includes a relationships array" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)relates-to", + "description": "Includes a relates-to relationship to the playbook" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "30s|30 second", + "description": "Detection states the 30s replica-lag threshold" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)provenance:", + "description": "Frontmatter includes a provenance field (L3 climb)" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": 
"(?i)temporal:", + "description": "Frontmatter includes a temporal validity field (L3 climb)" + } + ], + "expectations": [ + "Frontmatter reaches Level 3 by including namespace, temporal validity, and provenance, not just the L1 floor", + "The typed relationship points at a playbook, distinguishing it from the tactical runbook itself" + ] + }, + { + "id": 6, + "prompt": "Our disk-usage-critical alert runbook has an Escalation section that just says 'escalate if it's bad'. That's not good enough for 3am on-call. Rewrite just that section with a real trigger and save it as escalation-section.md.", + "expected_output": "A rewritten Escalation section naming a specific on-call role/rotation and an explicit numeric or time-based trigger condition, saved to escalation-section.md.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_not_match", + "file": "escalation-section.md", + "pattern": "(?i)if it's bad", + "description": "The vague 'if it's bad' trigger does not survive into the rewrite" + }, + { + "type": "regex_match", + "file": "escalation-section.md", + "pattern": "(?i)(on-call|rotation|secondary|SRE lead)", + "description": "Names a specific role or rotation to page" + }, + { + "type": "regex_match", + "file": "escalation-section.md", + "pattern": "\\d+\\s*(%|percent|minute|min|hour)", + "description": "States a numeric or time-based escalation trigger" + } + ], + "expectations": [ + "States a concrete, measurable escalation trigger such as a specific percentage, duration, or count rather than a subjective judgment call", + "Names who gets paged (a specific role or rotation) and how, not just 'escalate'" + ] + }, + { + "id": 7, + "prompt": "I have a one-page cheat sheet for restarting the stuck notification-worker pod, but there's no way to confirm the restart actually fixed anything. What's missing from a proper runbook, and can you add it? 
Save the improved version as runbook.md.", + "expected_output": "Points out the missing Verification step (confirming the alert cleared and the pod is healthy) and Rollback path, then adds both to produce a conformant runbook in runbook.md.", + "files": [], + "deterministic_checks": [ + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)##\\s*(\\d+\\.\\s*)?Verification\\s*&?\\s*Rollback", + "description": "Runbook now has a Verification & Rollback section" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)(cleared|healthy|recovered)", + "description": "Verification states a concrete pass condition (alert cleared / pod healthy)" + }, + { + "type": "file_contains", + "file": "runbook.md", + "literal": "Rollback", + "description": "Runbook names a Rollback path, not just Verification" + }, + { + "type": "regex_match", + "file": "runbook.md", + "pattern": "(?i)(undo|revert|roll back)", + "description": "Rollback path describes how to undo the change" + } + ], + "expectations": [ + "Explicitly calls out the absence of a verification/confirmation step and a rollback path before fixing them", + "The added Rollback path describes how to undo the pod restart approach if it does not resolve the alert" ] } ] From 7cfbedb8898a2bf179e0db5b63797b81764f7e28 Mon Sep 17 00:00:00 2001 From: Robert Allen Date: Wed, 1 Jul 2026 11:21:38 -0400 Subject: [PATCH 2/8] fix(evals): case-insensitive EARS checks, broaden expected-result phrasing Addresses Copilot review feedback on PR #23: - prd: EARS regex checks required uppercase WHEN/IF/WHILE/SHALL, failing correct title-case output. Added the (?i) flag to match this file's existing convention. - sre-runbook: the "expected result" check required that literal phrase, failing compliant runbooks using "Expected:", "you should see", etc. Broadened to a regex alternation covering the wording SKILL.md actually permits. 
--- skills/prd/evals/evals.json | 4 ++-- skills/sre-runbook/evals/evals.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/skills/prd/evals/evals.json b/skills/prd/evals/evals.json index e3850db..2edd88d 100644 --- a/skills/prd/evals/evals.json +++ b/skills/prd/evals/evals.json @@ -22,7 +22,7 @@ { "type": "regex_match", "file": "prd.md", - "pattern": "(WHEN|IF|WHILE)[^\\n]+SHALL", + "pattern": "(?i)(WHEN|IF|WHILE)[^\\n]+SHALL", "description": "At least one requirement is expressed as a testable EARS (WHEN/IF/WHILE ... SHALL) criterion" }, { @@ -82,7 +82,7 @@ { "type": "regex_match", "file": "requirements.md", - "pattern": "(WHEN|IF|WHILE)[^\\n]+SHALL", + "pattern": "(?i)(WHEN|IF|WHILE)[^\\n]+SHALL", "description": "At least one requirement uses EARS WHEN/IF/WHILE ... SHALL form" }, { diff --git a/skills/sre-runbook/evals/evals.json b/skills/sre-runbook/evals/evals.json index bfadd29..923ecb3 100644 --- a/skills/sre-runbook/evals/evals.json +++ b/skills/sre-runbook/evals/evals.json @@ -58,7 +58,7 @@ { "type": "regex_match", "file": "runbook.md", - "pattern": "(?i)expected result", + "pattern": "(?i)(expected (result|output)|expected:|you(?:'ll| will| should) see)", "description": "Remediation steps state an expected result" } ], @@ -100,7 +100,7 @@ { "type": "regex_match", "file": "runbook.md", - "pattern": "(?i)expected result", + "pattern": "(?i)(expected (result|output)|expected:|you(?:'ll| will| should) see)", "description": "Diagnosis/Remediation steps now state an expected result" } ], From cd5400226c0cc626a81141b0e033785c9a67a2ce Mon Sep 17 00:00:00 2001 From: Robert Allen Date: Wed, 1 Jul 2026 11:23:01 -0400 Subject: [PATCH 3/8] fix(evals): case-insensitive EARS check in ai-architecture-doc Addresses Copilot review feedback on PR #23: the two WHEN/IF/WHILE/SHALL regexes required all-caps, failing correct title-case EARS output. Added the (?i) flag, consistent with the file's other checks. 
--- skills/ai-architecture-doc/evals/evals.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skills/ai-architecture-doc/evals/evals.json b/skills/ai-architecture-doc/evals/evals.json index fdd5d77..9f5d0de 100644 --- a/skills/ai-architecture-doc/evals/evals.json +++ b/skills/ai-architecture-doc/evals/evals.json @@ -22,7 +22,7 @@ { "type": "regex_match", "file": "architecture.md", - "pattern": "(WHEN|IF|WHILE)[^\\n]+SHALL", + "pattern": "(?i)(WHEN|IF|WHILE)[^\\n]+SHALL", "description": "At least one NFR is expressed as a testable EARS-style (WHEN/IF/WHILE ... SHALL) requirement" }, { @@ -118,7 +118,7 @@ { "type": "regex_match", "file": "architecture.md", - "pattern": "(WHEN|IF|WHILE)[^\\n]+SHALL", + "pattern": "(?i)(WHEN|IF|WHILE)[^\\n]+SHALL", "description": "Still expresses NFRs in EARS form even with placeholder values" } ], From f22ab492477868f38992246afa6b480308cac175 Mon Sep 17 00:00:00 2001 From: Robert Allen Date: Wed, 1 Jul 2026 11:26:13 -0400 Subject: [PATCH 4/8] fix(evals): replace shell_command "shall" counter with pure regex Addresses Copilot review feedback on PR #23: two deterministic checks shelled out to python3 just to count occurrences of "shall", making the eval dependent on the runner environment and expanding the attack surface. Replaced both with a repeated non-greedy regex group that asserts the same count declaratively. 
--- skills/ears-acceptance-criteria/evals/evals.json | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/skills/ears-acceptance-criteria/evals/evals.json b/skills/ears-acceptance-criteria/evals/evals.json index cb19629..f76e703 100644 --- a/skills/ears-acceptance-criteria/evals/evals.json +++ b/skills/ears-acceptance-criteria/evals/evals.json @@ -98,8 +98,9 @@ "files": [], "deterministic_checks": [ { - "type": "shell_command", - "command": "python3 -c \"import re; t=open('transcript.md').read(); c=len(re.findall(r'(?i)\\\\bshall\\\\b', t)); exit(0 if c>=3 else 1)\"", + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?is:(?:.*?\\bshall\\b){3})", "description": "Produces at least three separate 'shall' statements, one per bullet" }, { @@ -193,8 +194,9 @@ "description": "Produces at least one Event-driven When/shall criterion" }, { - "type": "shell_command", - "command": "python3 -c \"import re; t=open('transcript.md').read(); c=len(re.findall(r'(?i)\\\\bshall\\\\b', t)); exit(0 if c>=2 else 1)\"", + "type": "regex_match", + "file": "transcript.md", + "pattern": "(?is:(?:.*?\\bshall\\b){2})", "description": "Splits the compound requirement into at least two separate 'shall' statements" }, { From 3959d65efc47b59856a8ebf8f05601c63bb98607 Mon Sep 17 00:00:00 2001 From: Robert Allen Date: Wed, 1 Jul 2026 11:33:50 -0400 Subject: [PATCH 5/8] fix(evals): tighten python-pep Status check to Draft-only Addresses Copilot review feedback on PR #23: eval 1 drafts a brand-new PEP, which per PEP lifecycle rules must start in Draft status. The check previously accepted Accepted/Final/Provisional too, which would let a clearly wrong lifecycle state pass. 
--- skills/python-pep/evals/evals.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skills/python-pep/evals/evals.json b/skills/python-pep/evals/evals.json index 0ec7727..206b0df 100644 --- a/skills/python-pep/evals/evals.json +++ b/skills/python-pep/evals/evals.json @@ -16,8 +16,8 @@ { "type": "regex_match", "file": "pep-draft.md", - "pattern": "(?im)^Status:\\s*(Draft|Accepted|Final|Provisional)", - "description": "Header Status is a valid open lifecycle state" + "pattern": "(?im)^Status:\\s*Draft", + "description": "Header Status is Draft, the only valid state for a brand-new PEP" }, { "type": "regex_match", From 6fc6af827f10415c60555d02ce36f2d89b89d9ee Mon Sep 17 00:00:00 2001 From: Robert Allen Date: Wed, 1 Jul 2026 11:36:48 -0400 Subject: [PATCH 6/8] fix(evals): word-boundary the diataxis-tutorial Step N checks Addresses Copilot review feedback on PR #23: seven "Step N" deterministic checks used step\s*N without a trailing word boundary, so "Step 1" and "Step 2" also matched inside "Step 10" and "Step 20". Added \b to all seven instances. 
--- skills/diataxis-tutorial/evals/evals.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/skills/diataxis-tutorial/evals/evals.json b/skills/diataxis-tutorial/evals/evals.json index 9f9aa80..f8984d4 100644 --- a/skills/diataxis-tutorial/evals/evals.json +++ b/skills/diataxis-tutorial/evals/evals.json @@ -16,13 +16,13 @@ { "type": "regex_match", "file": "transcript.md", - "pattern": "(?i)step\\s*1", + "pattern": "(?i)step\\s*1\\b", "description": "Has a numbered Step 1" }, { "type": "regex_match", "file": "transcript.md", - "pattern": "(?i)step\\s*2", + "pattern": "(?i)step\\s*2\\b", "description": "Has at least two ordered steps" }, { @@ -64,7 +64,7 @@ { "type": "regex_match", "file": "transcript.md", - "pattern": "(?i)step\\s*1", + "pattern": "(?i)step\\s*1\\b", "description": "Corrected version has ordered numbered steps" }, { @@ -100,13 +100,13 @@ { "type": "regex_match", "file": "transcript.md", - "pattern": "(?i)step\\s*1", + "pattern": "(?i)step\\s*1\\b", "description": "Has a numbered Step 1" }, { "type": "regex_match", "file": "transcript.md", - "pattern": "(?i)step\\s*2", + "pattern": "(?i)step\\s*2\\b", "description": "Has at least two ordered steps" }, { @@ -166,7 +166,7 @@ { "type": "regex_match", "file": "transcript.md", - "pattern": "(?i)step\\s*1", + "pattern": "(?i)step\\s*1\\b", "description": "Has a numbered Step 1" }, { @@ -209,7 +209,7 @@ { "type": "regex_match", "file": "transcript.md", - "pattern": "(?i)step\\s*1", + "pattern": "(?i)step\\s*1\\b", "description": "Retains numbered steps" }, { From 6c4219595dc14f67cce96016299095dd19314b41 Mon Sep 17 00:00:00 2001 From: Robert Allen Date: Wed, 1 Jul 2026 11:39:20 -0400 Subject: [PATCH 7/8] fix(evals): anchor diataxis-how-to recommendation-line check to file start Addresses Copilot review feedback on PR #23: the check only asserted "Recommendation: how-to" appeared somewhere in recommendation.md, so a submission could bury the line later and still pass. 
Anchored to the start of the file instead. --- skills/diataxis-how-to/evals/evals.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skills/diataxis-how-to/evals/evals.json b/skills/diataxis-how-to/evals/evals.json index 038b47c..87d9043 100644 --- a/skills/diataxis-how-to/evals/evals.json +++ b/skills/diataxis-how-to/evals/evals.json @@ -175,10 +175,10 @@ "description": "recommendation.md output file exists" }, { - "type": "file_contains", + "type": "regex_match", "file": "recommendation.md", - "literal": "Recommendation: how-to", - "description": "Recommendation line names how-to as the mode" + "pattern": "^Recommendation: how-to", + "description": "File starts with the line 'Recommendation: how-to', not buried later" }, { "type": "file_contains", From c16054abf930d4fc45a1d6f1d2b18e6167b79326 Mon Sep 17 00:00:00 2001 From: Robert Allen Date: Wed, 1 Jul 2026 11:47:07 -0400 Subject: [PATCH 8/8] fix(evals): loosen synopsis fence requirement, case-insensitive checks Addresses Copilot review feedback on PR #23 (cycle 3): - diataxis-reference's synopsis check required a fenced code block, but the skill only requires a synopsis/usage line in any form. Loosened to accept fenced, inline-coded, or plain lines. - Two "we recommend" / "best practice" negative checks were case-sensitive, missing title-case or all-caps phrasing. Made both case-insensitive. 
--- skills/diataxis-reference/evals/evals.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/skills/diataxis-reference/evals/evals.json b/skills/diataxis-reference/evals/evals.json index f77bcd8..b0a87ba 100644 --- a/skills/diataxis-reference/evals/evals.json +++ b/skills/diataxis-reference/evals/evals.json @@ -10,8 +10,8 @@ { "type": "regex_match", "file": "transcript.md", - "pattern": "(?i)```\\s*(text|bash|sh)?\\n?deploy\\b", - "description": "Includes a synopsis/usage line naming the deploy command" + "pattern": "(?im)^\\s*(?:```\\s*(?:text|bash|sh)?\\s*\\n)?`{0,1}deploy\\b", + "description": "Includes a synopsis/usage line naming the deploy command (fenced, inline-coded, or plain)" }, { "type": "regex_match", @@ -38,15 +38,15 @@ "description": "Contains no numbered tutorial-style learning steps" }, { - "type": "file_not_contains", + "type": "regex_not_match", "file": "transcript.md", - "literal": "we recommend", + "pattern": "(?i)we recommend", "description": "Contains no 'we recommend' rationale/opinion language" }, { - "type": "file_not_contains", + "type": "regex_not_match", "file": "transcript.md", - "literal": "best practice", + "pattern": "(?i)best practice", "description": "Contains no 'best practice' rationale/opinion language" } ],