From ad41c5d8534f3b486b15a25e1df50a5e4abdb841 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 17 Mar 2026 11:28:03 -0700 Subject: [PATCH 1/3] feat: diff-based test selection for E2E and LLM-judge evals Each test declares file dependencies in a TOUCHFILES map. The test runner checks git diff against the base branch and only runs tests whose dependencies were modified. Global touchfiles (session-runner, eval-store, gen-skill-docs) trigger all tests. New scripts: test:e2e:all, test:evals:all, eval:select Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 13 +- package.json | 5 +- scripts/eval-select.ts | 86 +++++++++++++ test/helpers/touchfiles.ts | 171 ++++++++++++++++++++++++ test/skill-e2e.test.ts | 112 +++++++++++----- test/skill-llm-eval.test.ts | 62 +++++++-- test/touchfiles.test.ts | 250 ++++++++++++++++++++++++++++++++++++ 7 files changed, 650 insertions(+), 49 deletions(-) create mode 100644 scripts/eval-select.ts create mode 100644 test/helpers/touchfiles.ts create mode 100644 test/touchfiles.test.ts diff --git a/CLAUDE.md b/CLAUDE.md index 27523c7b..60859d37 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -5,8 +5,11 @@ ```bash bun install # install dependencies bun test # run free tests (browse + snapshot + skill validation) -bun run test:evals # run paid evals: LLM judge + E2E (~$4/run) -bun run test:e2e # run E2E tests only (~$3.85/run) +bun run test:evals # run paid evals: LLM judge + E2E (diff-based, ~$4/run max) +bun run test:evals:all # run ALL paid evals regardless of diff +bun run test:e2e # run E2E tests only (diff-based, ~$3.85/run max) +bun run test:e2e:all # run ALL E2E tests regardless of diff +bun run eval:select # show which tests would run based on current diff bun run dev # run CLI in dev mode, e.g. bun run dev goto https://example.com bun run build # gen docs + compile binaries bun run gen:skill-docs # regenerate SKILL.md files from templates @@ -21,6 +24,12 @@ bun run eval:summary # aggregate stats across all eval runs (tool-by-tool via `--output-format stream-json --verbose`). Results are persisted to `~/.gstack-dev/evals/` with auto-comparison against the previous run. +**Diff-based test selection:** `test:evals` and `test:e2e` auto-select tests based +on `git diff` against the base branch. Each test declares its file dependencies in +`test/helpers/touchfiles.ts`. Changes to global touchfiles (session-runner, eval-store, +llm-judge, gen-skill-docs) trigger all tests. Use `EVALS_ALL=1` or the `:all` script +variants to force all tests. Run `eval:select` to preview which tests would run. + ## Project structure ``` diff --git a/package.json b/package.json index a5044b7d..e725b4ab 100644 --- a/package.json +++ b/package.json @@ -14,14 +14,17 @@ "server": "bun run browse/src/server.ts", "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts", "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts", + "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts", "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts", + "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts", "skill:check": "bun run scripts/skill-check.ts", "dev:skill": "bun run scripts/dev-skill.ts", "start": "bun run browse/src/server.ts", "eval:list": "bun run scripts/eval-list.ts", "eval:compare": "bun run scripts/eval-compare.ts", "eval:summary": "bun run scripts/eval-summary.ts", - "eval:watch": "bun run scripts/eval-watch.ts" + "eval:watch": "bun run scripts/eval-watch.ts", + "eval:select": "bun run scripts/eval-select.ts" }, "dependencies": { "playwright": "^1.58.2", diff --git a/scripts/eval-select.ts b/scripts/eval-select.ts new file mode 100644 index 00000000..cdbdcc84 --- /dev/null +++ b/scripts/eval-select.ts @@ -0,0 +1,86 @@ +#!/usr/bin/env bun +/** + * Show which E2E and LLM-judge tests would run based on the current git diff. + * + * Usage: + * bun run eval:select # human-readable output + * bun run eval:select --json # machine-readable JSON + * bun run eval:select --base main # override base branch + */ + +import * as path from 'path'; +import { + selectTests, + detectBaseBranch, + getChangedFiles, + E2E_TOUCHFILES, + LLM_JUDGE_TOUCHFILES, + GLOBAL_TOUCHFILES, +} from '../test/helpers/touchfiles'; + +const ROOT = path.resolve(import.meta.dir, '..'); +const args = process.argv.slice(2); +const jsonMode = args.includes('--json'); +const baseIdx = args.indexOf('--base'); +const baseOverride = baseIdx >= 0 ? args[baseIdx + 1] : undefined; + +// Detect base branch +const baseBranch = baseOverride || detectBaseBranch(ROOT) || 'main'; +const changedFiles = getChangedFiles(baseBranch, ROOT); + +if (changedFiles.length === 0) { + if (jsonMode) { + console.log(JSON.stringify({ base: baseBranch, changed_files: 0, e2e: 'all', llm_judge: 'all', reason: 'no diff — would run all tests' })); + } else { + console.log(`Base: ${baseBranch}`); + console.log('No changed files detected — all tests would run.'); + } + process.exit(0); +} + +const e2eSelection = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES); +const llmSelection = selectTests(changedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES); + +if (jsonMode) { + console.log(JSON.stringify({ + base: baseBranch, + changed_files: changedFiles, + e2e: { + selected: e2eSelection.selected, + skipped: e2eSelection.skipped, + reason: e2eSelection.reason, + count: `${e2eSelection.selected.length}/${Object.keys(E2E_TOUCHFILES).length}`, + }, + llm_judge: { + selected: llmSelection.selected, + skipped: llmSelection.skipped, + reason: llmSelection.reason, + count: `${llmSelection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length}`, + }, + }, null, 2)); +} else { + console.log(`Base: ${baseBranch}`); + console.log(`Changed files: ${changedFiles.length}`); + console.log(); + + console.log(`E2E (${e2eSelection.reason}): ${e2eSelection.selected.length}/${Object.keys(E2E_TOUCHFILES).length} tests`); + if (e2eSelection.selected.length > 0 && e2eSelection.selected.length < Object.keys(E2E_TOUCHFILES).length) { + console.log(` Selected: ${e2eSelection.selected.join(', ')}`); + console.log(` Skipped: ${e2eSelection.skipped.join(', ')}`); + } else if (e2eSelection.selected.length === 0) { + console.log(' No E2E tests affected.'); + } else { + console.log(' All E2E tests selected.'); + } + console.log(); + + console.log(`LLM-judge (${llmSelection.reason}): ${llmSelection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length} tests`); + if (llmSelection.selected.length > 0 && llmSelection.selected.length < Object.keys(LLM_JUDGE_TOUCHFILES).length) { + console.log(` Selected: ${llmSelection.selected.join(', ')}`); + console.log(` Skipped: ${llmSelection.skipped.join(', ')}`); + } else if (llmSelection.selected.length === 0) { + console.log(' No LLM-judge tests affected.'); + } else { + console.log(' All LLM-judge tests selected.'); + } +} diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts new file mode 100644 index 00000000..21bf07f1 --- /dev/null +++ b/test/helpers/touchfiles.ts @@ -0,0 +1,171 @@ +/** + * Diff-based test selection for E2E and LLM-judge evals. + * + * Each test declares which source files it depends on ("touchfiles"). + * The test runner checks `git diff` and only runs tests whose + * dependencies were modified. Override with EVALS_ALL=1 to run everything. + */ + +import { spawnSync } from 'child_process'; + +// --- Glob matching --- + +/** + * Match a file path against a glob pattern. + * Supports: + * ** — match any number of path segments + * * — match within a single segment (no /) + */ +export function matchGlob(file: string, pattern: string): boolean { + const regexStr = pattern + .replace(/\./g, '\\.') + .replace(/\*\*/g, '{{GLOBSTAR}}') + .replace(/\*/g, '[^/]*') + .replace(/\{\{GLOBSTAR\}\}/g, '.*'); + return new RegExp(`^${regexStr}$`).test(file); +} + +// --- Touchfile maps --- + +/** + * E2E test touchfiles — keyed by testName (the string passed to runSkillTest). + * Each test lists the file patterns that, if changed, require the test to run. + */ +export const E2E_TOUCHFILES: Record = { + // Browse core + 'browse-basic': ['browse/src/**'], + 'browse-snapshot': ['browse/src/**'], + + // SKILL.md setup + preamble (depend on ROOT SKILL.md only) + 'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'], + 'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'], + 'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'], + 'contributor-mode': ['SKILL.md', 'SKILL.md.tmpl'], + 'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'], + + // QA + 'qa-quick': ['qa/**', 'browse/src/**'], + 'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'], + 'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'], + 'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'], + 'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'], + 'qa-fix-loop': ['qa/**', 'browse/src/**'], + + // Review + 'review-sql-injection': ['review/**', 'test/fixtures/review-eval-vuln.rb'], + 'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'], + 'review-base-branch': ['review/**'], + + // Plan reviews + 'plan-ceo-review': ['plan-ceo-review/**'], + 'plan-eng-review': ['plan-eng-review/**'], + 'plan-eng-review-artifact': ['plan-eng-review/**'], + + // Ship + 'ship-base-branch': ['ship/**'], + + // Retro + 'retro': ['retro/**'], + 'retro-base-branch': ['retro/**'], + + // Document-release + 'document-release': ['document-release/**'], + + // Design + 'design-consultation-core': ['design-consultation/**'], + 'design-consultation-research': ['design-consultation/**'], + 'design-consultation-existing': ['design-consultation/**'], + 'design-consultation-preview': ['design-consultation/**'], + 'plan-design-review-audit': ['plan-design-review/**'], + 'plan-design-review-export': ['plan-design-review/**'], + 'qa-design-review-fix': ['qa-design-review/**', 'browse/src/**'], +}; + +/** + * LLM-judge test touchfiles — keyed by test description string. + */ +export const LLM_JUDGE_TOUCHFILES: Record = { + 'command reference table': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts'], + 'snapshot flags reference': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/snapshot.ts'], + 'browse/SKILL.md reference': ['browse/SKILL.md', 'browse/SKILL.md.tmpl', 'browse/src/**'], + 'setup block': ['SKILL.md', 'SKILL.md.tmpl'], + 'regression vs baseline': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'], + 'qa/SKILL.md workflow': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'], + 'qa/SKILL.md health rubric': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'], + 'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'], + 'baseline score pinning': ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'], +}; + +/** + * Changes to any of these files trigger ALL tests (both E2E and LLM-judge). + */ +export const GLOBAL_TOUCHFILES = [ + 'test/helpers/session-runner.ts', + 'test/helpers/eval-store.ts', + 'test/helpers/llm-judge.ts', + 'scripts/gen-skill-docs.ts', + 'test/helpers/touchfiles.ts', + 'browse/test/test-server.ts', +]; + +// --- Base branch detection --- + +/** + * Detect the base branch by trying refs in order. + * Returns the first valid ref, or null if none found. + */ +export function detectBaseBranch(cwd: string): string | null { + for (const ref of ['origin/main', 'origin/master', 'main', 'master']) { + const result = spawnSync('git', ['rev-parse', '--verify', ref], { + cwd, stdio: 'pipe', timeout: 3000, + }); + if (result.status === 0) return ref; + } + return null; +} + +/** + * Get list of files changed between base branch and HEAD. + */ +export function getChangedFiles(baseBranch: string, cwd: string): string[] { + const result = spawnSync('git', ['diff', '--name-only', `${baseBranch}...HEAD`], { + cwd, stdio: 'pipe', timeout: 5000, + }); + if (result.status !== 0) return []; + return result.stdout.toString().trim().split('\n').filter(Boolean); +} + +// --- Test selection --- + +/** + * Select tests to run based on changed files. + * + * Algorithm: + * 1. If any changed file matches a global touchfile → run ALL tests + * 2. Otherwise, for each test, check if any changed file matches its patterns + * 3. Return selected + skipped lists with reason + */ +export function selectTests( + changedFiles: string[], + touchfiles: Record, + globalTouchfiles: string[] = GLOBAL_TOUCHFILES, +): { selected: string[]; skipped: string[]; reason: string } { + const allTestNames = Object.keys(touchfiles); + + // Global touchfile hit → run all + for (const file of changedFiles) { + if (globalTouchfiles.some(g => matchGlob(file, g))) { + return { selected: allTestNames, skipped: [], reason: `global: ${file}` }; + } + } + + // Per-test matching + const selected: string[] = []; + const skipped: string[] = []; + for (const [testName, patterns] of Object.entries(touchfiles)) { + const hit = changedFiles.some(f => patterns.some(p => matchGlob(f, p))); + (hit ? selected : skipped).push(testName); + } + + return { selected, skipped, reason: 'diff' }; +} diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index 4378c322..53f898b4 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -1,10 +1,11 @@ import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; import { runSkillTest } from './helpers/session-runner'; import type { SkillTestResult } from './helpers/session-runner'; -import { outcomeJudge } from './helpers/llm-judge'; +import { outcomeJudge, callJudge } from './helpers/llm-judge'; import { EvalCollector, judgePassed } from './helpers/eval-store'; import type { EvalTestEntry } from './helpers/eval-store'; import { startTestServer } from '../browse/test/test-server'; +import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles'; import { spawnSync } from 'child_process'; import * as fs from 'fs'; import * as path from 'path'; @@ -21,6 +22,41 @@ const ROOT = path.resolve(import.meta.dir, '..'); const evalsEnabled = !!process.env.EVALS; const describeE2E = evalsEnabled ? describe : describe.skip; +// --- Diff-based test selection --- +// When EVALS_ALL is not set, only run tests whose touchfiles were modified. +// Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch. +let selectedTests: string[] | null = null; // null = run all + +if (evalsEnabled && !process.env.EVALS_ALL) { + const baseBranch = process.env.EVALS_BASE + || detectBaseBranch(ROOT) + || 'main'; + const changedFiles = getChangedFiles(baseBranch, ROOT); + + if (changedFiles.length > 0) { + const selection = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES); + selectedTests = selection.selected; + process.stderr.write(`\nE2E selection (${selection.reason}): ${selection.selected.length}/${Object.keys(E2E_TOUCHFILES).length} tests\n`); + if (selection.skipped.length > 0) { + process.stderr.write(` Skipped: ${selection.skipped.join(', ')}\n`); + } + process.stderr.write('\n'); + } + // If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all +} + +/** Wrap a describe block to skip entirely if none of its tests are selected. */ +function describeIfSelected(name: string, testNames: string[], fn: () => void) { + const anySelected = selectedTests === null || testNames.some(t => selectedTests!.includes(t)); + (anySelected ? describeE2E : describe.skip)(name, fn); +} + +/** Skip an individual test if not selected (for multi-test describe blocks). */ +function testIfSelected(testName: string, fn: () => Promise, timeout: number) { + const shouldRun = selectedTests === null || selectedTests.includes(testName); + (shouldRun ? test : test.skip)(testName, fn, timeout); +} + // Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null; @@ -133,7 +169,10 @@ if (evalsEnabled) { } } -describeE2E('Skill E2E tests', () => { +describeIfSelected('Skill E2E tests', [ + 'browse-basic', 'browse-snapshot', 'skillmd-setup-discovery', + 'skillmd-no-local-binary', 'skillmd-outside-git', 'contributor-mode', 'session-awareness', +], () => { beforeAll(() => { testServer = startTestServer(); tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-')); @@ -145,7 +184,7 @@ describeE2E('Skill E2E tests', () => { try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {} }); - test('browse basic commands work without errors', async () => { + testIfSelected('browse-basic', async () => { const result = await runSkillTest({ prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence: 1. $B goto ${testServer.url} @@ -166,7 +205,7 @@ Report the results of each command.`, expect(result.exitReason).toBe('success'); }, 90_000); - test('browse snapshot flags all work', async () => { + testIfSelected('browse-snapshot', async () => { const result = await runSkillTest({ prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run: 1. $B goto ${testServer.url} @@ -191,7 +230,7 @@ Report what each command returned.`, expect(result.exitReason).toBe('success'); }, 90_000); - test('agent discovers browse binary via SKILL.md setup block', async () => { + testIfSelected('skillmd-setup-discovery', async () => { const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); const setupStart = skillMd.indexOf('## SETUP'); const setupEnd = skillMd.indexOf('## IMPORTANT'); @@ -220,7 +259,7 @@ Report whether it worked.`, expect(result.exitReason).toBe('success'); }, 90_000); - test('SKILL.md setup block handles missing local binary gracefully', async () => { + testIfSelected('skillmd-no-local-binary', async () => { // Create a tmpdir with no browse binary — no local .claude/skills/gstack/browse/dist/browse const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-')); @@ -255,7 +294,7 @@ Report the exact output. Do NOT try to fix or install anything — just report w try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {} }, 60_000); - test('SKILL.md setup block works outside git repo', async () => { + testIfSelected('skillmd-outside-git', async () => { // Create a tmpdir outside any git repo const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-')); @@ -286,7 +325,7 @@ Report the exact output — either "READY: " or "NEEDS_SETUP".`, try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {} }, 60_000); - test('contributor mode files a report on gstack error', async () => { + testIfSelected('contributor-mode', async () => { const contribDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-contrib-')); const logsDir = path.join(contribDir, 'contributor-logs'); fs.mkdirSync(logsDir, { recursive: true }); @@ -342,7 +381,7 @@ File a contributor report about this issue. Then tell me what you filed.`, try { fs.rmSync(contribDir, { recursive: true, force: true }); } catch {} }, 90_000); - test('session awareness adds ELI16 context when _SESSIONS >= 3', async () => { + testIfSelected('session-awareness', async () => { const sessionDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-session-')); // Set up a git repo so there's project/branch context to reference @@ -413,7 +452,7 @@ Remember: _SESSIONS=4, so ELI16 mode is active. The user is juggling multiple wi // --- B4: QA skill E2E --- -describeE2E('QA skill E2E', () => { +describeIfSelected('QA skill E2E', ['qa-quick'], () => { let qaDir: string; beforeAll(() => { @@ -468,7 +507,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`, // --- B5: Review skill E2E --- -describeE2E('Review skill E2E', () => { +describeIfSelected('Review skill E2E', ['review-sql-injection'], () => { let reviewDir: string; beforeAll(() => { @@ -527,7 +566,7 @@ Write your review findings to ${reviewDir}/review-output.md`, // --- Review: Enum completeness E2E --- -describeE2E('Review enum completeness E2E', () => { +describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'], () => { let enumDir: string; beforeAll(() => { @@ -603,7 +642,10 @@ The diff adds a new "returned" status to the Order model. Your job is to check i const hasApiKey = !!process.env.ANTHROPIC_API_KEY; const describeOutcome = (evalsEnabled && hasApiKey) ? describe : describe.skip; -describeOutcome('Planted-bug outcome evals', () => { +// Wrap describeOutcome with selection — skip if no planted-bug tests are selected +const outcomeTestNames = ['qa-b6-static', 'qa-b7-spa', 'qa-b8-checkout']; +const anyOutcomeSelected = selectedTests === null || outcomeTestNames.some(t => selectedTests!.includes(t)); +(anyOutcomeSelected ? describeOutcome : describe.skip)('Planted-bug outcome evals', () => { let outcomeDir: string; beforeAll(() => { @@ -767,7 +809,7 @@ CRITICAL RULES: // --- Plan CEO Review E2E --- -describeE2E('Plan CEO Review E2E', () => { +describeIfSelected('Plan CEO Review E2E', ['plan-ceo-review'], () => { let planDir: string; beforeAll(() => { @@ -854,7 +896,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and // --- Plan Eng Review E2E --- -describeE2E('Plan Eng Review E2E', () => { +describeIfSelected('Plan Eng Review E2E', ['plan-eng-review'], () => { let planDir: string; beforeAll(() => { @@ -948,7 +990,7 @@ Focus on architecture, code quality, tests, and performance sections.`, // --- Retro E2E --- -describeE2E('Retro E2E', () => { +describeIfSelected('Retro E2E', ['retro'], () => { let retroDir: string; beforeAll(() => { @@ -1034,7 +1076,7 @@ Analyze the git history and produce the narrative report as described in the SKI // --- QA-Only E2E (report-only, no fixes) --- -describeE2E('QA-Only skill E2E', () => { +describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => { let qaOnlyDir: string; beforeAll(() => { @@ -1120,7 +1162,7 @@ Write your report to ${qaOnlyDir}/qa-reports/qa-only-report.md`, // --- QA Fix Loop E2E --- -describeE2E('QA Fix Loop E2E', () => { +describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => { let qaFixDir: string; let qaFixServer: ReturnType | null = null; @@ -1234,7 +1276,7 @@ This is a test+fix loop: find bugs, fix them in the source code, commit each fix // --- Plan-Eng-Review Test-Plan Artifact E2E --- -describeE2E('Plan-Eng-Review Test-Plan Artifact E2E', () => { +describeIfSelected('Plan-Eng-Review Test-Plan Artifact E2E', ['plan-eng-review-artifact'], () => { let planDir: string; let projectDir: string; @@ -1361,7 +1403,7 @@ Write your review to ${planDir}/review-output.md`, // --- Base branch detection smoke tests --- -describeE2E('Base branch detection', () => { +describeIfSelected('Base branch detection', ['review-base-branch', 'ship-base-branch', 'retro-base-branch'], () => { let baseBranchDir: string; const run = (cmd: string, args: string[], cwd: string) => spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 }); @@ -1374,7 +1416,7 @@ describeE2E('Base branch detection', () => { try { fs.rmSync(baseBranchDir, { recursive: true, force: true }); } catch {} }); - test('/review detects base branch and diffs against it', async () => { + testIfSelected('review-base-branch', async () => { const dir = path.join(baseBranchDir, 'review-base'); fs.mkdirSync(dir, { recursive: true }); @@ -1427,7 +1469,7 @@ Write your findings to ${dir}/review-output.md`, expect(usedGitDiff).toBe(true); }, 120_000); - test('/ship Step 0-1 detects base branch without destructive actions', async () => { + testIfSelected('ship-base-branch', async () => { const dir = path.join(baseBranchDir, 'ship-base'); fs.mkdirSync(dir, { recursive: true }); @@ -1489,7 +1531,7 @@ Write a summary of what you detected to ${dir}/ship-preflight.md including: expect(destructiveTools).toHaveLength(0); }, 90_000); - test('/retro detects default branch for git queries', async () => { + testIfSelected('retro-base-branch', async () => { const dir = path.join(baseBranchDir, 'retro-base'); fs.mkdirSync(dir, { recursive: true }); @@ -1548,7 +1590,7 @@ Write your retrospective to ${dir}/retro-output.md`, // --- Document-Release skill E2E --- -describeE2E('Document-Release skill E2E', () => { +describeIfSelected('Document-Release skill E2E', ['document-release'], () => { let docReleaseDir: string; beforeAll(() => { @@ -1652,6 +1694,7 @@ IMPORTANT: // --- Deferred skill E2E tests (destructive or require interactive UI) --- +// Deferred tests — only test.todo entries, no selection needed describeE2E('Deferred skill E2E', () => { // Ship is destructive: pushes to remote, creates PRs, modifies VERSION/CHANGELOG test.todo('/ship completes full workflow'); @@ -1689,7 +1732,10 @@ ${designMd} Return JSON: { "passed": true/false, "reasoning": "one paragraph explaining your evaluation" }`); } -describeE2E('Design Consultation E2E', () => { +describeIfSelected('Design Consultation E2E', [ + 'design-consultation-core', 'design-consultation-research', + 'design-consultation-existing', 'design-consultation-preview', +], () => { let designDir: string; beforeAll(() => { @@ -1733,7 +1779,7 @@ A civic tech data platform for government employees to access, visualize, and sh try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {} }); - test('Test 1: core flow produces valid DESIGN.md + CLAUDE.md', async () => { + testIfSelected('design-consultation-core', async () => { const result = await runSkillTest({ prompt: `Read design-consultation/SKILL.md for the design consultation workflow. @@ -1793,7 +1839,7 @@ Write DESIGN.md and CLAUDE.md (or update it) in the working directory.`, } }, 420_000); - test('Test 2: research integration uses WebSearch', async () => { + testIfSelected('design-consultation-research', async () => { // Clean up from previous test try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {} try { fs.unlinkSync(path.join(designDir, 'CLAUDE.md')); } catch {} @@ -1850,7 +1896,7 @@ Write DESIGN.md to the working directory.`, expect(designExists).toBe(true); }, 420_000); - test('Test 3: handles existing DESIGN.md', async () => { + testIfSelected('design-consultation-existing', async () => { // Pre-create a minimal DESIGN.md fs.writeFileSync(path.join(designDir, 'DESIGN.md'), `# Design System — CivicPulse @@ -1896,7 +1942,7 @@ Skip research. Skip font preview. Skip any AskUserQuestion calls — this is non } }, 420_000); - test('Test 4: generates font + color preview HTML', async () => { + testIfSelected('design-consultation-preview', async () => { // Clean up try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {} @@ -1960,7 +2006,7 @@ Skip research. Skip any AskUserQuestion calls — this is non-interactive. Gener // --- Plan Design Review E2E --- -describeE2E('Plan Design Review E2E', () => { +describeIfSelected('Plan Design Review E2E', ['plan-design-review-audit', 'plan-design-review-export'], () => { let reviewDir: string; beforeAll(() => { @@ -1991,7 +2037,7 @@ describeE2E('Plan Design Review E2E', () => { try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {} }); - test('Test 5: /plan-design-review produces audit report', async () => { + testIfSelected('plan-design-review-audit', async () => { const result = await runSkillTest({ prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly. @@ -2030,7 +2076,7 @@ Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Ski } }, 420_000); - test('Test 6: /plan-design-review exports DESIGN.md', async () => { + testIfSelected('plan-design-review-export', async () => { // Clean up previous test artifacts try { fs.unlinkSync(path.join(reviewDir, 'design-audit.md')); } catch {} @@ -2078,7 +2124,7 @@ Review ${testServer.url} with --quick mode. Skip any AskUserQuestion calls — t // --- QA Design Review E2E --- -describeE2E('QA Design Review E2E', () => { +describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => { let qaDesignDir: string; let qaDesignServer: ReturnType | null = null; diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts index ba635613..c3e1aef2 100644 --- a/test/skill-llm-eval.test.ts +++ b/test/skill-llm-eval.test.ts @@ -17,6 +17,7 @@ import * as path from 'path'; import { callJudge, judge } from './helpers/llm-judge'; import type { JudgeScore } from './helpers/llm-judge'; import { EvalCollector } from './helpers/eval-store'; +import { selectTests, detectBaseBranch, getChangedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles'; const ROOT = path.resolve(import.meta.dir, '..'); // Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env) @@ -26,8 +27,43 @@ const describeEval = evalsEnabled ? describe : describe.skip; // Eval result collector const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null; -describeEval('LLM-as-judge quality evals', () => { - test('command reference table scores >= 4 on all dimensions', async () => { +// --- Diff-based test selection --- +let selectedTests: string[] | null = null; + +if (evalsEnabled && !process.env.EVALS_ALL) { + const baseBranch = process.env.EVALS_BASE + || detectBaseBranch(ROOT) + || 'main'; + const changedFiles = getChangedFiles(baseBranch, ROOT); + + if (changedFiles.length > 0) { + const selection = selectTests(changedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES); + selectedTests = selection.selected; + process.stderr.write(`\nLLM-judge selection (${selection.reason}): ${selection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length} tests\n`); + if (selection.skipped.length > 0) { + process.stderr.write(` Skipped: ${selection.skipped.join(', ')}\n`); + } + process.stderr.write('\n'); + } +} + +/** Wrap a describe block to skip if none of its tests are selected. */ +function describeIfSelected(name: string, testNames: string[], fn: () => void) { + const anySelected = selectedTests === null || testNames.some(t => selectedTests!.includes(t)); + (anySelected ? describeEval : describe.skip)(name, fn); +} + +/** Skip an individual test if not selected (for multi-test describe blocks). */ +function testIfSelected(testName: string, fn: () => Promise, timeout: number) { + const shouldRun = selectedTests === null || selectedTests.includes(testName); + (shouldRun ? test : test.skip)(testName, fn, timeout); +} + +describeIfSelected('LLM-as-judge quality evals', [ + 'command reference table', 'snapshot flags reference', + 'browse/SKILL.md reference', 'setup block', 'regression vs baseline', +], () => { + testIfSelected('command reference table', async () => { const t0 = Date.now(); const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); const start = content.indexOf('## Command Reference'); @@ -53,7 +89,7 @@ describeEval('LLM-as-judge quality evals', () => { expect(scores.actionability).toBeGreaterThanOrEqual(4); }, 30_000); - test('snapshot flags section scores >= 4 on all dimensions', async () => { + testIfSelected('snapshot flags reference', async () => { const t0 = Date.now(); const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); const start = content.indexOf('## Snapshot System'); @@ -79,7 +115,7 @@ describeEval('LLM-as-judge quality evals', () => { expect(scores.actionability).toBeGreaterThanOrEqual(4); }, 30_000); - test('browse/SKILL.md overall scores >= 4', async () => { + testIfSelected('browse/SKILL.md reference', async () => { const t0 = Date.now(); const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8'); const start = content.indexOf('## Snapshot Flags'); @@ -104,7 +140,7 @@ describeEval('LLM-as-judge quality evals', () => { expect(scores.actionability).toBeGreaterThanOrEqual(4); }, 30_000); - test('setup block scores >= 3 on actionability and clarity', async () => { + testIfSelected('setup block', async () => { const t0 = Date.now(); const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); const setupStart = content.indexOf('## SETUP'); @@ -131,7 +167,7 @@ describeEval('LLM-as-judge quality evals', () => { expect(scores.clarity).toBeGreaterThanOrEqual(3); }, 30_000); - test('regression check: compare branch vs baseline quality', async () => { + testIfSelected('regression vs baseline', async () => { const t0 = Date.now(); const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); const genStart = generated.indexOf('## Command Reference'); @@ -220,10 +256,10 @@ Scores are 1-5 overall quality.`, // --- Part 7: QA skill quality evals (C6) --- -describeEval('QA skill quality evals', () => { +describeIfSelected('QA skill quality evals', ['qa/SKILL.md workflow', 'qa/SKILL.md health rubric'], () => { const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8'); - test('qa/SKILL.md workflow quality scores >= 4', async () => { + testIfSelected('qa/SKILL.md workflow', async () => { const t0 = Date.now(); const start = qaContent.indexOf('## Workflow'); const end = qaContent.indexOf('## Health Score Rubric'); @@ -266,7 +302,7 @@ ${section}`); expect(scores.actionability).toBeGreaterThanOrEqual(4); }, 30_000); - test('qa/SKILL.md health score rubric is unambiguous', async () => { + testIfSelected('qa/SKILL.md health rubric', async () => { const t0 = Date.now(); const start = qaContent.indexOf('## Health Score Rubric'); const section = qaContent.slice(start); @@ -310,8 +346,8 @@ ${section}`); // --- Part 7: Cross-skill consistency judge (C7) --- -describeEval('Cross-skill consistency evals', () => { - test('greptile-history patterns are consistent across all skills', async () => { +describeIfSelected('Cross-skill consistency evals', ['cross-skill greptile consistency'], () => { + testIfSelected('cross-skill greptile consistency', async () => { const t0 = Date.now(); const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); @@ -375,10 +411,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`); // --- Part 7: Baseline score pinning (C9) --- -describeEval('Baseline score pinning', () => { +describeIfSelected('Baseline score pinning', ['baseline score pinning'], () => { const baselinesPath = path.join(ROOT, 'test', 'fixtures', 'eval-baselines.json'); - test('LLM eval scores do not regress below baselines', async () => { + testIfSelected('baseline score pinning', async () => { const t0 = Date.now(); if (!fs.existsSync(baselinesPath)) { console.log('No baseline file found — skipping pinning check'); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts new file mode 100644 index 00000000..7880c229 --- /dev/null +++ b/test/touchfiles.test.ts @@ -0,0 +1,250 @@ +/** + * Unit tests for diff-based test selection. + * Free (no API calls), runs with `bun test`. + */ + +import { describe, test, expect } from 'bun:test'; +import { spawnSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { + matchGlob, + selectTests, + detectBaseBranch, + E2E_TOUCHFILES, + LLM_JUDGE_TOUCHFILES, + GLOBAL_TOUCHFILES, +} from './helpers/touchfiles'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +// --- matchGlob --- + +describe('matchGlob', () => { + test('** matches any depth of path segments', () => { + expect(matchGlob('browse/src/commands.ts', 'browse/src/**')).toBe(true); + expect(matchGlob('browse/src/deep/nested/file.ts', 'browse/src/**')).toBe(true); + expect(matchGlob('browse/src/cli.ts', 'browse/src/**')).toBe(true); + }); + + test('** does not match unrelated paths', () => { + expect(matchGlob('browse/src/commands.ts', 'qa/**')).toBe(false); + expect(matchGlob('review/SKILL.md', 'qa/**')).toBe(false); + }); + + test('exact match works', () => { + expect(matchGlob('SKILL.md', 'SKILL.md')).toBe(true); + expect(matchGlob('SKILL.md.tmpl', 'SKILL.md')).toBe(false); + expect(matchGlob('qa/SKILL.md', 'SKILL.md')).toBe(false); + }); + + test('* matches within a single segment', () => { + expect(matchGlob('test/fixtures/review-eval-enum.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(true); + expect(matchGlob('test/fixtures/review-eval-enum-diff.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(true); + expect(matchGlob('test/fixtures/review-eval-vuln.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(false); + }); + + test('dots in patterns are escaped correctly', () => { + expect(matchGlob('SKILL.md', 'SKILL.md')).toBe(true); + expect(matchGlob('SKILLxmd', 'SKILL.md')).toBe(false); + }); + + test('** at end matches files in the directory', () => { + expect(matchGlob('qa/SKILL.md', 'qa/**')).toBe(true); + expect(matchGlob('qa/SKILL.md.tmpl', 'qa/**')).toBe(true); + expect(matchGlob('qa/templates/report.md', 'qa/**')).toBe(true); + }); +}); + +// --- selectTests --- + +describe('selectTests', () => { + test('browse/src change selects browse and qa tests', () => { + const result = selectTests(['browse/src/commands.ts'], E2E_TOUCHFILES); + expect(result.selected).toContain('browse-basic'); + expect(result.selected).toContain('browse-snapshot'); + expect(result.selected).toContain('qa-quick'); + expect(result.selected).toContain('qa-fix-loop'); + expect(result.selected).toContain('qa-design-review-fix'); + expect(result.reason).toBe('diff'); + // Should NOT include unrelated tests + expect(result.selected).not.toContain('plan-ceo-review'); + expect(result.selected).not.toContain('retro'); + expect(result.selected).not.toContain('document-release'); + }); + + test('skill-specific change selects only that skill', () => { + const result = selectTests(['plan-ceo-review/SKILL.md'], E2E_TOUCHFILES); + expect(result.selected).toEqual(['plan-ceo-review']); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 1); + }); + + test('global touchfile triggers ALL tests', () => { + const result = selectTests(['test/helpers/session-runner.ts'], E2E_TOUCHFILES); + expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length); + expect(result.skipped.length).toBe(0); + expect(result.reason).toContain('global'); + }); + + test('gen-skill-docs.ts is a global touchfile', () => { + const result = selectTests(['scripts/gen-skill-docs.ts'], E2E_TOUCHFILES); + expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length); + expect(result.reason).toContain('global'); + }); + + test('unrelated file selects nothing', () => { + const result = selectTests(['README.md'], E2E_TOUCHFILES); + expect(result.selected).toEqual([]); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length); + }); + + test('empty changed files selects nothing', () => { + const result = selectTests([], E2E_TOUCHFILES); + expect(result.selected).toEqual([]); + }); + + test('multiple changed files union their selections', () => { + const result = selectTests( + ['plan-ceo-review/SKILL.md', 'retro/SKILL.md.tmpl'], + E2E_TOUCHFILES, + ); + expect(result.selected).toContain('plan-ceo-review'); + expect(result.selected).toContain('retro'); + expect(result.selected).toContain('retro-base-branch'); + expect(result.selected.length).toBe(3); + }); + + test('works with LLM_JUDGE_TOUCHFILES', () => { + const result = selectTests(['qa/SKILL.md'], LLM_JUDGE_TOUCHFILES); + expect(result.selected).toContain('qa/SKILL.md workflow'); + expect(result.selected).toContain('qa/SKILL.md health rubric'); + expect(result.selected.length).toBe(2); + }); + + test('SKILL.md.tmpl root template only selects root-dependent tests', () => { + const result = selectTests(['SKILL.md.tmpl'], E2E_TOUCHFILES); + // Should select the 7 tests that depend on root SKILL.md + expect(result.selected).toContain('skillmd-setup-discovery'); + expect(result.selected).toContain('contributor-mode'); + expect(result.selected).toContain('session-awareness'); + // Should NOT select unrelated tests + expect(result.selected).not.toContain('plan-ceo-review'); + expect(result.selected).not.toContain('retro'); + }); + + test('global touchfiles work for LLM-judge tests too', () => { + const result = selectTests(['scripts/gen-skill-docs.ts'], LLM_JUDGE_TOUCHFILES); + expect(result.selected.length).toBe(Object.keys(LLM_JUDGE_TOUCHFILES).length); + }); +}); + +// --- detectBaseBranch --- + +describe('detectBaseBranch', () => { + test('detects local main branch', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-')); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + fs.writeFileSync(path.join(dir, 'test.txt'), 'hello\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'init']); + + const result = detectBaseBranch(dir); + // Should find 'main' (or 'master' depending on git default) + expect(result).toMatch(/^(main|master)$/); + + try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} + }); + + test('returns null for empty repo with no branches', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-')); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init']); + // No commits = no branches + const result = detectBaseBranch(dir); + expect(result).toBeNull(); + + try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} + }); + + test('returns null for non-git directory', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-')); + const result = detectBaseBranch(dir); + expect(result).toBeNull(); + + try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} + }); +}); + +// --- Completeness: every testName in skill-e2e.test.ts has a TOUCHFILES entry --- + +describe('TOUCHFILES completeness', () => { + test('every E2E testName has a TOUCHFILES entry', () => { + const e2eContent = fs.readFileSync( + path.join(ROOT, 'test', 'skill-e2e.test.ts'), + 'utf-8', + ); + + // Extract all testName: 'value' entries + const testNameRegex = /testName:\s*['"`]([^'"`]+)['"`]/g; + const testNames: string[] = []; + let match; + while ((match = testNameRegex.exec(e2eContent)) !== null) { + let name = match[1]; + // Handle template literals like `qa-${label}` — these expand to + // qa-b6-static, qa-b7-spa, qa-b8-checkout + if (name.includes('${')) continue; // skip template literals, check expanded forms below + testNames.push(name); + } + + // Add the template-expanded testNames from runPlantedBugEval calls + const plantedBugRegex = /runPlantedBugEval\([^,]+,\s*[^,]+,\s*['"`]([^'"`]+)['"`]\)/g; + while ((match = plantedBugRegex.exec(e2eContent)) !== null) { + testNames.push(`qa-${match[1]}`); + } + + expect(testNames.length).toBeGreaterThan(0); + + const missing = testNames.filter(name => !(name in E2E_TOUCHFILES)); + if (missing.length > 0) { + throw new Error( + `E2E tests missing TOUCHFILES entries: ${missing.join(', ')}\n` + + `Add these to E2E_TOUCHFILES in test/helpers/touchfiles.ts`, + ); + } + }); + + test('every LLM-judge test has a TOUCHFILES entry', () => { + const llmContent = fs.readFileSync( + path.join(ROOT, 'test', 'skill-llm-eval.test.ts'), + 'utf-8', + ); + + // Extract test names from addTest({ name: '...' }) calls + const nameRegex = /name:\s*['"`]([^'"`]+)['"`]/g; + const testNames: string[] = []; + let match; + while ((match = nameRegex.exec(llmContent)) !== null) { + testNames.push(match[1]); + } + + // Deduplicate (some tests call addTest with the same name) + const unique = [...new Set(testNames)]; + expect(unique.length).toBeGreaterThan(0); + + const missing = unique.filter(name => !(name in LLM_JUDGE_TOUCHFILES)); + if (missing.length > 0) { + throw new Error( + `LLM-judge tests missing TOUCHFILES entries: ${missing.join(', ')}\n` + + `Add these to LLM_JUDGE_TOUCHFILES in test/helpers/touchfiles.ts`, + ); + } + }); +}); From 7fed990f9e1bb962d40f2ad07718178746b69542 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 17 Mar 2026 11:30:23 -0700 Subject: [PATCH 2/3] chore: bump version and changelog (v0.6.1.0) Co-Authored-By: Claude Opus 4.6 --- CHANGELOG.md | 13 +++++++++++++ VERSION | 2 +- test/helpers/touchfiles.ts | 13 ++++++++++--- test/skill-e2e.test.ts | 6 +++--- test/touchfiles.test.ts | 11 +++++++---- 5 files changed, 34 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4593d3f1..96fc4c0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## [0.6.1.0] - 2026-03-17 + +### Added + +- **E2E and LLM-judge tests now only run what you changed.** Each test declares which source files it depends on. When you run `bun run test:e2e`, it checks your diff and skips tests whose dependencies weren't touched. A branch that only changes `/retro` now runs 2 tests instead of 31. Use `bun run test:e2e:all` to force everything. +- **`bun run eval:select` previews which tests would run.** See exactly which tests your diff triggers before spending API credits. Supports `--json` for scripting and `--base ` to override the base branch. +- **Completeness guardrail catches forgotten test entries.** A free unit test validates that every `testName` in the E2E and LLM-judge test files has a corresponding entry in the TOUCHFILES map. New tests without entries fail `bun test` immediately — no silent always-run degradation. + +### Changed + +- `test:evals` and `test:e2e` now auto-select based on diff (was: all-or-nothing) +- New `test:evals:all` and `test:e2e:all` scripts for explicit full runs + ## 0.6.0.1 — 2026-03-17 - **`/gstack-upgrade` now catches stale vendored copies automatically.** If your global gstack is up to date but the vendored copy in your project is behind, `/gstack-upgrade` detects the mismatch and syncs it. No more manually asking "did we vendor it?" — it just tells you and offers to update. diff --git a/VERSION b/VERSION index 758efdb4..44e7f9a2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.6.0.1 +0.6.1.0 diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 21bf07f1..30a15579 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -57,9 +57,10 @@ export const E2E_TOUCHFILES: Record = { 'review-base-branch': ['review/**'], // Plan reviews - 'plan-ceo-review': ['plan-ceo-review/**'], - 'plan-eng-review': ['plan-eng-review/**'], - 'plan-eng-review-artifact': ['plan-eng-review/**'], + 'plan-ceo-review': ['plan-ceo-review/**'], + 'plan-ceo-review-selective': ['plan-ceo-review/**'], + 'plan-eng-review': ['plan-eng-review/**'], + 'plan-eng-review-artifact': ['plan-eng-review/**'], // Ship 'ship-base-branch': ['ship/**'], @@ -71,6 +72,12 @@ export const E2E_TOUCHFILES: Record = { // Document-release 'document-release': ['document-release/**'], + // QA bootstrap + 'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'], + + // Ship coverage audit + 'ship-coverage-audit': ['ship/**'], + // Design 'design-consultation-core': ['design-consultation/**'], 'design-consultation-research': ['design-consultation/**'], diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index 1b750701..1d311bdf 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -896,7 +896,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and // --- Plan CEO Review (SELECTIVE EXPANSION) E2E --- -describeE2E('Plan CEO Review SELECTIVE EXPANSION E2E', () => { +describeIfSelected('Plan CEO Review SELECTIVE EXPANSION E2E', ['plan-ceo-review-selective'], () => { let planDir: string; beforeAll(() => { @@ -2346,7 +2346,7 @@ Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion call // --- Test Bootstrap E2E --- -describeE2E('Test Bootstrap E2E', () => { +describeIfSelected('Test Bootstrap E2E', ['qa-bootstrap'], () => { let bootstrapDir: string; let bootstrapServer: ReturnType; @@ -2483,7 +2483,7 @@ This is a test+fix loop: find bugs, fix them, write regression tests, commit eac // --- Test Coverage Audit E2E --- -describeE2E('Test Coverage Audit E2E', () => { +describeIfSelected('Test Coverage Audit E2E', ['ship-coverage-audit'], () => { let coverageDir: string; beforeAll(() => { diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index 7880c229..e666bb3d 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -74,10 +74,12 @@ describe('selectTests', () => { expect(result.selected).not.toContain('document-release'); }); - test('skill-specific change selects only that skill', () => { + test('skill-specific change selects only that skill and related tests', () => { const result = selectTests(['plan-ceo-review/SKILL.md'], E2E_TOUCHFILES); - expect(result.selected).toEqual(['plan-ceo-review']); - expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 1); + expect(result.selected).toContain('plan-ceo-review'); + expect(result.selected).toContain('plan-ceo-review-selective'); + expect(result.selected.length).toBe(2); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 2); }); test('global touchfile triggers ALL tests', () => { @@ -110,9 +112,10 @@ describe('selectTests', () => { E2E_TOUCHFILES, ); expect(result.selected).toContain('plan-ceo-review'); + expect(result.selected).toContain('plan-ceo-review-selective'); expect(result.selected).toContain('retro'); expect(result.selected).toContain('retro-base-branch'); - expect(result.selected.length).toBe(3); + expect(result.selected.length).toBe(4); }); test('works with LLM_JUDGE_TOUCHFILES', () => { From 64bbbb21984e1e399afe81fd427c10fdcb9740d0 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 17 Mar 2026 14:41:13 -0700 Subject: [PATCH 3/3] =?UTF-8?q?fix:=20plan-design-review-audit=20eval=20?= =?UTF-8?q?=E2=80=94=20bump=20turns=20to=2030,=20add=20efficiency=20hints?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test was flaky at 20 turns because the agent reads a 300-line SKILL.md, navigates, extracts design data, and writes a report. Added hints to skip preamble/batch commands/write early while still testing the real SKILL.md. Now completes in ~13 turns consistently. Co-Authored-By: Claude Opus 4.6 --- test/skill-e2e.test.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index 1d311bdf..34baafbc 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -2128,9 +2128,11 @@ B="${browseBin}" Read plan-design-review/SKILL.md for the design review workflow. -Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md.`, +Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md. + +EFFICIENCY: Skip the preamble bash block. Combine multiple browse commands into single bash blocks (e.g. run all Phase 2 JS extractions in one block). Write the report as soon as you have enough data — do not over-explore.`, workingDirectory: reviewDir, - maxTurns: 20, + maxTurns: 30, timeout: 360_000, testName: 'plan-design-review-audit', runId,