From ad41c5d8534f3b486b15a25e1df50a5e4abdb841 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Tue, 17 Mar 2026 11:28:03 -0700
Subject: [PATCH 1/3] feat: diff-based test selection for E2E and LLM-judge
 evals

Each test declares file dependencies in a TOUCHFILES map. The test runner
checks git diff against the base branch and only runs tests whose
dependencies were modified. Global touchfiles (session-runner, eval-store,
gen-skill-docs) trigger all tests.

New scripts: test:e2e:all, test:evals:all, eval:select

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 CLAUDE.md                   |  13 +-
 package.json                |   5 +-
 scripts/eval-select.ts      |  86 +++++++++++++
 test/helpers/touchfiles.ts  | 171 ++++++++++++++++++++++++
 test/skill-e2e.test.ts      | 112 +++++++++++-----
 test/skill-llm-eval.test.ts |  62 +++++++--
 test/touchfiles.test.ts     | 250 ++++++++++++++++++++++++++++++++++++
 7 files changed, 650 insertions(+), 49 deletions(-)
 create mode 100644 scripts/eval-select.ts
 create mode 100644 test/helpers/touchfiles.ts
 create mode 100644 test/touchfiles.test.ts
diff --git a/CLAUDE.md b/CLAUDE.md
index 27523c7b..60859d37 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -5,8 +5,11 @@
 ```bash
 bun install          # install dependencies
 bun test             # run free tests (browse + snapshot + skill validation)
-bun run test:evals   # run paid evals: LLM judge + E2E (~$4/run)
-bun run test:e2e     # run E2E tests only (~$3.85/run)
+bun run test:evals   # run paid evals: LLM judge + E2E (diff-based, ~$4/run max)
+bun run test:evals:all  # run ALL paid evals regardless of diff
+bun run test:e2e     # run E2E tests only (diff-based, ~$3.85/run max)
+bun run test:e2e:all # run ALL E2E tests regardless of diff
+bun run eval:select  # show which tests would run based on current diff
 bun run dev <cmd>    # run CLI in dev mode, e.g. bun run dev goto https://example.com
 bun run build        # gen docs + compile binaries
 bun run gen:skill-docs  # regenerate SKILL.md files from templates
@@ -21,6 +24,12 @@ bun run eval:summary # aggregate stats across all eval runs
 (tool-by-tool via `--output-format stream-json --verbose`). Results are persisted
 to `~/.gstack-dev/evals/` with auto-comparison against the previous run.
 
+**Diff-based test selection:** `test:evals` and `test:e2e` auto-select tests based
+on `git diff` against the base branch. Each test declares its file dependencies in
+`test/helpers/touchfiles.ts`. Changes to global touchfiles (session-runner, eval-store,
+llm-judge, gen-skill-docs) trigger all tests. Use `EVALS_ALL=1` or the `:all` script
+variants to force all tests. Run `eval:select` to preview which tests would run.
+
 ## Project structure
 
 ```
diff --git a/package.json b/package.json
index a5044b7d..e725b4ab 100644
--- a/package.json
+++ b/package.json
@@ -14,14 +14,17 @@
     "server": "bun run browse/src/server.ts",
     "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts",
     "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts",
+    "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts",
     "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts",
+    "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts",
     "skill:check": "bun run scripts/skill-check.ts",
     "dev:skill": "bun run scripts/dev-skill.ts",
     "start": "bun run browse/src/server.ts",
     "eval:list": "bun run scripts/eval-list.ts",
     "eval:compare": "bun run scripts/eval-compare.ts",
     "eval:summary": "bun run scripts/eval-summary.ts",
-    "eval:watch": "bun run scripts/eval-watch.ts"
+    "eval:watch": "bun run scripts/eval-watch.ts",
+    "eval:select": "bun run scripts/eval-select.ts"
   },
   "dependencies": {
     "playwright": "^1.58.2",
diff --git a/scripts/eval-select.ts b/scripts/eval-select.ts
new file mode 100644
index 00000000..cdbdcc84
--- /dev/null
+++ b/scripts/eval-select.ts
@@ -0,0 +1,86 @@
+#!/usr/bin/env bun
+/**
+ * Show which E2E and LLM-judge tests would run based on the current git diff.
+ *
+ * Usage:
+ *   bun run eval:select              # human-readable output
+ *   bun run eval:select --json       # machine-readable JSON
+ *   bun run eval:select --base main  # override base branch
+ */
+
+import * as path from 'path';
+import {
+  selectTests,
+  detectBaseBranch,
+  getChangedFiles,
+  E2E_TOUCHFILES,
+  LLM_JUDGE_TOUCHFILES,
+  GLOBAL_TOUCHFILES,
+} from '../test/helpers/touchfiles';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+const args = process.argv.slice(2);
+const jsonMode = args.includes('--json');
+const baseIdx = args.indexOf('--base');
+const baseOverride = baseIdx >= 0 ? args[baseIdx + 1] : undefined;
+
+// Detect base branch
+const baseBranch = baseOverride || detectBaseBranch(ROOT) || 'main';
+const changedFiles = getChangedFiles(baseBranch, ROOT);
+
+if (changedFiles.length === 0) {
+  if (jsonMode) {
+    console.log(JSON.stringify({ base: baseBranch, changed_files: 0, e2e: 'all', llm_judge: 'all', reason: 'no diff — would run all tests' }));
+  } else {
+    console.log(`Base: ${baseBranch}`);
+    console.log('No changed files detected — all tests would run.');
+  }
+  process.exit(0);
+}
+
+const e2eSelection = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
+const llmSelection = selectTests(changedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES);
+
+if (jsonMode) {
+  console.log(JSON.stringify({
+    base: baseBranch,
+    changed_files: changedFiles,
+    e2e: {
+      selected: e2eSelection.selected,
+      skipped: e2eSelection.skipped,
+      reason: e2eSelection.reason,
+      count: `${e2eSelection.selected.length}/${Object.keys(E2E_TOUCHFILES).length}`,
+    },
+    llm_judge: {
+      selected: llmSelection.selected,
+      skipped: llmSelection.skipped,
+      reason: llmSelection.reason,
+      count: `${llmSelection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length}`,
+    },
+  }, null, 2));
+} else {
+  console.log(`Base: ${baseBranch}`);
+  console.log(`Changed files: ${changedFiles.length}`);
+  console.log();
+
+  console.log(`E2E (${e2eSelection.reason}): ${e2eSelection.selected.length}/${Object.keys(E2E_TOUCHFILES).length} tests`);
+  if (e2eSelection.selected.length > 0 && e2eSelection.selected.length < Object.keys(E2E_TOUCHFILES).length) {
+    console.log(`  Selected: ${e2eSelection.selected.join(', ')}`);
+    console.log(`  Skipped:  ${e2eSelection.skipped.join(', ')}`);
+  } else if (e2eSelection.selected.length === 0) {
+    console.log('  No E2E tests affected.');
+  } else {
+    console.log('  All E2E tests selected.');
+  }
+  console.log();
+
+  console.log(`LLM-judge (${llmSelection.reason}): ${llmSelection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length} tests`);
+  if (llmSelection.selected.length > 0 && llmSelection.selected.length < Object.keys(LLM_JUDGE_TOUCHFILES).length) {
+    console.log(`  Selected: ${llmSelection.selected.join(', ')}`);
+    console.log(`  Skipped:  ${llmSelection.skipped.join(', ')}`);
+  } else if (llmSelection.selected.length === 0) {
+    console.log('  No LLM-judge tests affected.');
+  } else {
+    console.log('  All LLM-judge tests selected.');
+  }
+}
diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts
new file mode 100644
index 00000000..21bf07f1
--- /dev/null
+++ b/test/helpers/touchfiles.ts
@@ -0,0 +1,171 @@
+/**
+ * Diff-based test selection for E2E and LLM-judge evals.
+ *
+ * Each test declares which source files it depends on ("touchfiles").
+ * The test runner checks `git diff` and only runs tests whose
+ * dependencies were modified. Override with EVALS_ALL=1 to run everything.
+ */
+
+import { spawnSync } from 'child_process';
+
+// --- Glob matching ---
+
+/**
+ * Match a file path against a glob pattern.
+ * Supports:
+ *   ** — match any number of path segments
+ *   *  — match within a single segment (no /)
+ */
+export function matchGlob(file: string, pattern: string): boolean {
+  const regexStr = pattern
+    .replace(/\./g, '\\.')
+    .replace(/\*\*/g, '{{GLOBSTAR}}')
+    .replace(/\*/g, '[^/]*')
+    .replace(/\{\{GLOBSTAR\}\}/g, '.*');
+  return new RegExp(`^${regexStr}$`).test(file);
+}
+
+// --- Touchfile maps ---
+
+/**
+ * E2E test touchfiles — keyed by testName (the string passed to runSkillTest).
+ * Each test lists the file patterns that, if changed, require the test to run.
+ */
+export const E2E_TOUCHFILES: Record<string, string[]> = {
+  // Browse core
+  'browse-basic':    ['browse/src/**'],
+  'browse-snapshot': ['browse/src/**'],
+
+  // SKILL.md setup + preamble (depend on ROOT SKILL.md only)
+  'skillmd-setup-discovery':  ['SKILL.md', 'SKILL.md.tmpl'],
+  'skillmd-no-local-binary':  ['SKILL.md', 'SKILL.md.tmpl'],
+  'skillmd-outside-git':      ['SKILL.md', 'SKILL.md.tmpl'],
+  'contributor-mode':         ['SKILL.md', 'SKILL.md.tmpl'],
+  'session-awareness':        ['SKILL.md', 'SKILL.md.tmpl'],
+
+  // QA
+  'qa-quick':       ['qa/**', 'browse/src/**'],
+  'qa-b6-static':   ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
+  'qa-b7-spa':      ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
+  'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
+  'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
+  'qa-fix-loop':    ['qa/**', 'browse/src/**'],
+
+  // Review
+  'review-sql-injection':     ['review/**', 'test/fixtures/review-eval-vuln.rb'],
+  'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'],
+  'review-base-branch':       ['review/**'],
+
+  // Plan reviews
+  'plan-ceo-review':          ['plan-ceo-review/**'],
+  'plan-eng-review':          ['plan-eng-review/**'],
+  'plan-eng-review-artifact': ['plan-eng-review/**'],
+
+  // Ship
+  'ship-base-branch': ['ship/**'],
+
+  // Retro
+  'retro':             ['retro/**'],
+  'retro-base-branch': ['retro/**'],
+
+  // Document-release
+  'document-release': ['document-release/**'],
+
+  // Design
+  'design-consultation-core':     ['design-consultation/**'],
+  'design-consultation-research': ['design-consultation/**'],
+  'design-consultation-existing': ['design-consultation/**'],
+  'design-consultation-preview':  ['design-consultation/**'],
+  'plan-design-review-audit':     ['plan-design-review/**'],
+  'plan-design-review-export':    ['plan-design-review/**'],
+  'qa-design-review-fix':         ['qa-design-review/**', 'browse/src/**'],
+};
+
+/**
+ * LLM-judge test touchfiles — keyed by test description string.
+ */
+export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
+  'command reference table':          ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts'],
+  'snapshot flags reference':         ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/snapshot.ts'],
+  'browse/SKILL.md reference':        ['browse/SKILL.md', 'browse/SKILL.md.tmpl', 'browse/src/**'],
+  'setup block':                      ['SKILL.md', 'SKILL.md.tmpl'],
+  'regression vs baseline':           ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'],
+  'qa/SKILL.md workflow':             ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
+  'qa/SKILL.md health rubric':        ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
+  'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'],
+  'baseline score pinning':           ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'],
+};
+
+/**
+ * Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
+ */
+export const GLOBAL_TOUCHFILES = [
+  'test/helpers/session-runner.ts',
+  'test/helpers/eval-store.ts',
+  'test/helpers/llm-judge.ts',
+  'scripts/gen-skill-docs.ts',
+  'test/helpers/touchfiles.ts',
+  'browse/test/test-server.ts',
+];
+
+// --- Base branch detection ---
+
+/**
+ * Detect the base branch by trying refs in order.
+ * Returns the first valid ref, or null if none found.
+ */
+export function detectBaseBranch(cwd: string): string | null {
+  for (const ref of ['origin/main', 'origin/master', 'main', 'master']) {
+    const result = spawnSync('git', ['rev-parse', '--verify', ref], {
+      cwd, stdio: 'pipe', timeout: 3000,
+    });
+    if (result.status === 0) return ref;
+  }
+  return null;
+}
+
+/**
+ * Get list of files changed between base branch and HEAD.
+ */
+export function getChangedFiles(baseBranch: string, cwd: string): string[] {
+  const result = spawnSync('git', ['diff', '--name-only', `${baseBranch}...HEAD`], {
+    cwd, stdio: 'pipe', timeout: 5000,
+  });
+  if (result.status !== 0) return [];
+  return result.stdout.toString().trim().split('\n').filter(Boolean);
+}
+
+// --- Test selection ---
+
+/**
+ * Select tests to run based on changed files.
+ *
+ * Algorithm:
+ * 1. If any changed file matches a global touchfile → run ALL tests
+ * 2. Otherwise, for each test, check if any changed file matches its patterns
+ * 3. Return selected + skipped lists with reason
+ */
+export function selectTests(
+  changedFiles: string[],
+  touchfiles: Record<string, string[]>,
+  globalTouchfiles: string[] = GLOBAL_TOUCHFILES,
+): { selected: string[]; skipped: string[]; reason: string } {
+  const allTestNames = Object.keys(touchfiles);
+
+  // Global touchfile hit → run all
+  for (const file of changedFiles) {
+    if (globalTouchfiles.some(g => matchGlob(file, g))) {
+      return { selected: allTestNames, skipped: [], reason: `global: ${file}` };
+    }
+  }
+
+  // Per-test matching
+  const selected: string[] = [];
+  const skipped: string[] = [];
+  for (const [testName, patterns] of Object.entries(touchfiles)) {
+    const hit = changedFiles.some(f => patterns.some(p => matchGlob(f, p)));
+    (hit ? selected : skipped).push(testName);
+  }
+
+  return { selected, skipped, reason: 'diff' };
+}
diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index 4378c322..53f898b4 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -1,10 +1,11 @@
 import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
 import { runSkillTest } from './helpers/session-runner';
 import type { SkillTestResult } from './helpers/session-runner';
-import { outcomeJudge } from './helpers/llm-judge';
+import { outcomeJudge, callJudge } from './helpers/llm-judge';
 import { EvalCollector, judgePassed } from './helpers/eval-store';
 import type { EvalTestEntry } from './helpers/eval-store';
 import { startTestServer } from '../browse/test/test-server';
+import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
 import { spawnSync } from 'child_process';
 import * as fs from 'fs';
 import * as path from 'path';
@@ -21,6 +22,41 @@ const ROOT = path.resolve(import.meta.dir, '..');
 const evalsEnabled = !!process.env.EVALS;
 const describeE2E = evalsEnabled ? describe : describe.skip;
 
+// --- Diff-based test selection ---
+// When EVALS_ALL is not set, only run tests whose touchfiles were modified.
+// Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch.
+let selectedTests: string[] | null = null; // null = run all
+
+if (evalsEnabled && !process.env.EVALS_ALL) {
+  const baseBranch = process.env.EVALS_BASE
+    || detectBaseBranch(ROOT)
+    || 'main';
+  const changedFiles = getChangedFiles(baseBranch, ROOT);
+
+  if (changedFiles.length > 0) {
+    const selection = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
+    selectedTests = selection.selected;
+    process.stderr.write(`\nE2E selection (${selection.reason}): ${selection.selected.length}/${Object.keys(E2E_TOUCHFILES).length} tests\n`);
+    if (selection.skipped.length > 0) {
+      process.stderr.write(`  Skipped: ${selection.skipped.join(', ')}\n`);
+    }
+    process.stderr.write('\n');
+  }
+  // If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all
+}
+
+/** Wrap a describe block to skip entirely if none of its tests are selected. */
+function describeIfSelected(name: string, testNames: string[], fn: () => void) {
+  const anySelected = selectedTests === null || testNames.some(t => selectedTests!.includes(t));
+  (anySelected ? describeE2E : describe.skip)(name, fn);
+}
+
+/** Skip an individual test if not selected (for multi-test describe blocks). */
+function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
+  const shouldRun = selectedTests === null || selectedTests.includes(testName);
+  (shouldRun ? test : test.skip)(testName, fn, timeout);
+}
+
 // Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize
 const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null;
 
@@ -133,7 +169,10 @@ if (evalsEnabled) {
   }
 }
 
-describeE2E('Skill E2E tests', () => {
+describeIfSelected('Skill E2E tests', [
+  'browse-basic', 'browse-snapshot', 'skillmd-setup-discovery',
+  'skillmd-no-local-binary', 'skillmd-outside-git', 'contributor-mode', 'session-awareness',
+], () => {
   beforeAll(() => {
     testServer = startTestServer();
     tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
@@ -145,7 +184,7 @@ describeE2E('Skill E2E tests', () => {
     try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
   });
 
-  test('browse basic commands work without errors', async () => {
+  testIfSelected('browse-basic', async () => {
     const result = await runSkillTest({
       prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence:
 1. $B goto ${testServer.url}
@@ -166,7 +205,7 @@ Report the results of each command.`,
     expect(result.exitReason).toBe('success');
   }, 90_000);
 
-  test('browse snapshot flags all work', async () => {
+  testIfSelected('browse-snapshot', async () => {
     const result = await runSkillTest({
       prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run:
 1. $B goto ${testServer.url}
@@ -191,7 +230,7 @@ Report what each command returned.`,
     expect(result.exitReason).toBe('success');
   }, 90_000);
 
-  test('agent discovers browse binary via SKILL.md setup block', async () => {
+  testIfSelected('skillmd-setup-discovery', async () => {
     const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
     const setupStart = skillMd.indexOf('## SETUP');
     const setupEnd = skillMd.indexOf('## IMPORTANT');
@@ -220,7 +259,7 @@ Report whether it worked.`,
     expect(result.exitReason).toBe('success');
   }, 90_000);
 
-  test('SKILL.md setup block handles missing local binary gracefully', async () => {
+  testIfSelected('skillmd-no-local-binary', async () => {
     // Create a tmpdir with no browse binary — no local .claude/skills/gstack/browse/dist/browse
     const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
 
@@ -255,7 +294,7 @@ Report the exact output. Do NOT try to fix or install anything — just report w
     try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
   }, 60_000);
 
-  test('SKILL.md setup block works outside git repo', async () => {
+  testIfSelected('skillmd-outside-git', async () => {
     // Create a tmpdir outside any git repo
     const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
 
@@ -286,7 +325,7 @@ Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
     try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
   }, 60_000);
 
-  test('contributor mode files a report on gstack error', async () => {
+  testIfSelected('contributor-mode', async () => {
     const contribDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-contrib-'));
     const logsDir = path.join(contribDir, 'contributor-logs');
     fs.mkdirSync(logsDir, { recursive: true });
@@ -342,7 +381,7 @@ File a contributor report about this issue. Then tell me what you filed.`,
     try { fs.rmSync(contribDir, { recursive: true, force: true }); } catch {}
   }, 90_000);
 
-  test('session awareness adds ELI16 context when _SESSIONS >= 3', async () => {
+  testIfSelected('session-awareness', async () => {
     const sessionDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-session-'));
 
     // Set up a git repo so there's project/branch context to reference
@@ -413,7 +452,7 @@ Remember: _SESSIONS=4, so ELI16 mode is active. The user is juggling multiple wi
 
 // --- B4: QA skill E2E ---
 
-describeE2E('QA skill E2E', () => {
+describeIfSelected('QA skill E2E', ['qa-quick'], () => {
   let qaDir: string;
 
   beforeAll(() => {
@@ -468,7 +507,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`,
 
 // --- B5: Review skill E2E ---
 
-describeE2E('Review skill E2E', () => {
+describeIfSelected('Review skill E2E', ['review-sql-injection'], () => {
   let reviewDir: string;
 
   beforeAll(() => {
@@ -527,7 +566,7 @@ Write your review findings to ${reviewDir}/review-output.md`,
 
 // --- Review: Enum completeness E2E ---
 
-describeE2E('Review enum completeness E2E', () => {
+describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'], () => {
   let enumDir: string;
 
   beforeAll(() => {
@@ -603,7 +642,10 @@ The diff adds a new "returned" status to the Order model. Your job is to check i
 const hasApiKey = !!process.env.ANTHROPIC_API_KEY;
 const describeOutcome = (evalsEnabled && hasApiKey) ? describe : describe.skip;
 
-describeOutcome('Planted-bug outcome evals', () => {
+// Wrap describeOutcome with selection — skip if no planted-bug tests are selected
+const outcomeTestNames = ['qa-b6-static', 'qa-b7-spa', 'qa-b8-checkout'];
+const anyOutcomeSelected = selectedTests === null || outcomeTestNames.some(t => selectedTests!.includes(t));
+(anyOutcomeSelected ? describeOutcome : describe.skip)('Planted-bug outcome evals', () => {
   let outcomeDir: string;
 
   beforeAll(() => {
@@ -767,7 +809,7 @@ CRITICAL RULES:
 
 // --- Plan CEO Review E2E ---
 
-describeE2E('Plan CEO Review E2E', () => {
+describeIfSelected('Plan CEO Review E2E', ['plan-ceo-review'], () => {
   let planDir: string;
 
   beforeAll(() => {
@@ -854,7 +896,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
 
 // --- Plan Eng Review E2E ---
 
-describeE2E('Plan Eng Review E2E', () => {
+describeIfSelected('Plan Eng Review E2E', ['plan-eng-review'], () => {
   let planDir: string;
 
   beforeAll(() => {
@@ -948,7 +990,7 @@ Focus on architecture, code quality, tests, and performance sections.`,
 
 // --- Retro E2E ---
 
-describeE2E('Retro E2E', () => {
+describeIfSelected('Retro E2E', ['retro'], () => {
   let retroDir: string;
 
   beforeAll(() => {
@@ -1034,7 +1076,7 @@ Analyze the git history and produce the narrative report as described in the SKI
 
 // --- QA-Only E2E (report-only, no fixes) ---
 
-describeE2E('QA-Only skill E2E', () => {
+describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => {
   let qaOnlyDir: string;
 
   beforeAll(() => {
@@ -1120,7 +1162,7 @@ Write your report to ${qaOnlyDir}/qa-reports/qa-only-report.md`,
 
 // --- QA Fix Loop E2E ---
 
-describeE2E('QA Fix Loop E2E', () => {
+describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => {
   let qaFixDir: string;
   let qaFixServer: ReturnType<typeof Bun.serve> | null = null;
 
@@ -1234,7 +1276,7 @@ This is a test+fix loop: find bugs, fix them in the source code, commit each fix
 
 // --- Plan-Eng-Review Test-Plan Artifact E2E ---
 
-describeE2E('Plan-Eng-Review Test-Plan Artifact E2E', () => {
+describeIfSelected('Plan-Eng-Review Test-Plan Artifact E2E', ['plan-eng-review-artifact'], () => {
   let planDir: string;
   let projectDir: string;
 
@@ -1361,7 +1403,7 @@ Write your review to ${planDir}/review-output.md`,
 
 // --- Base branch detection smoke tests ---
 
-describeE2E('Base branch detection', () => {
+describeIfSelected('Base branch detection', ['review-base-branch', 'ship-base-branch', 'retro-base-branch'], () => {
   let baseBranchDir: string;
   const run = (cmd: string, args: string[], cwd: string) =>
     spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });
@@ -1374,7 +1416,7 @@ describeE2E('Base branch detection', () => {
     try { fs.rmSync(baseBranchDir, { recursive: true, force: true }); } catch {}
   });
 
-  test('/review detects base branch and diffs against it', async () => {
+  testIfSelected('review-base-branch', async () => {
     const dir = path.join(baseBranchDir, 'review-base');
     fs.mkdirSync(dir, { recursive: true });
 
@@ -1427,7 +1469,7 @@ Write your findings to ${dir}/review-output.md`,
     expect(usedGitDiff).toBe(true);
   }, 120_000);
 
-  test('/ship Step 0-1 detects base branch without destructive actions', async () => {
+  testIfSelected('ship-base-branch', async () => {
     const dir = path.join(baseBranchDir, 'ship-base');
     fs.mkdirSync(dir, { recursive: true });
 
@@ -1489,7 +1531,7 @@ Write a summary of what you detected to ${dir}/ship-preflight.md including:
     expect(destructiveTools).toHaveLength(0);
   }, 90_000);
 
-  test('/retro detects default branch for git queries', async () => {
+  testIfSelected('retro-base-branch', async () => {
     const dir = path.join(baseBranchDir, 'retro-base');
     fs.mkdirSync(dir, { recursive: true });
 
@@ -1548,7 +1590,7 @@ Write your retrospective to ${dir}/retro-output.md`,
 
 // --- Document-Release skill E2E ---
 
-describeE2E('Document-Release skill E2E', () => {
+describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
   let docReleaseDir: string;
 
   beforeAll(() => {
@@ -1652,6 +1694,7 @@ IMPORTANT:
 
 // --- Deferred skill E2E tests (destructive or require interactive UI) ---
 
+// Deferred tests — only test.todo entries, no selection needed
 describeE2E('Deferred skill E2E', () => {
   // Ship is destructive: pushes to remote, creates PRs, modifies VERSION/CHANGELOG
   test.todo('/ship completes full workflow');
@@ -1689,7 +1732,10 @@ ${designMd}
 Return JSON: { "passed": true/false, "reasoning": "one paragraph explaining your evaluation" }`);
 }
 
-describeE2E('Design Consultation E2E', () => {
+describeIfSelected('Design Consultation E2E', [
+  'design-consultation-core', 'design-consultation-research',
+  'design-consultation-existing', 'design-consultation-preview',
+], () => {
   let designDir: string;
 
   beforeAll(() => {
@@ -1733,7 +1779,7 @@ A civic tech data platform for government employees to access, visualize, and sh
     try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
   });
 
-  test('Test 1: core flow produces valid DESIGN.md + CLAUDE.md', async () => {
+  testIfSelected('design-consultation-core', async () => {
     const result = await runSkillTest({
       prompt: `Read design-consultation/SKILL.md for the design consultation workflow.
 
@@ -1793,7 +1839,7 @@ Write DESIGN.md and CLAUDE.md (or update it) in the working directory.`,
     }
   }, 420_000);
 
-  test('Test 2: research integration uses WebSearch', async () => {
+  testIfSelected('design-consultation-research', async () => {
     // Clean up from previous test
     try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {}
     try { fs.unlinkSync(path.join(designDir, 'CLAUDE.md')); } catch {}
@@ -1850,7 +1896,7 @@ Write DESIGN.md to the working directory.`,
     expect(designExists).toBe(true);
   }, 420_000);
 
-  test('Test 3: handles existing DESIGN.md', async () => {
+  testIfSelected('design-consultation-existing', async () => {
     // Pre-create a minimal DESIGN.md
     fs.writeFileSync(path.join(designDir, 'DESIGN.md'), `# Design System — CivicPulse
 
@@ -1896,7 +1942,7 @@ Skip research. Skip font preview. Skip any AskUserQuestion calls — this is non
     }
   }, 420_000);
 
-  test('Test 4: generates font + color preview HTML', async () => {
+  testIfSelected('design-consultation-preview', async () => {
     // Clean up
     try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {}
 
@@ -1960,7 +2006,7 @@ Skip research. Skip any AskUserQuestion calls — this is non-interactive. Gener
 
 // --- Plan Design Review E2E ---
 
-describeE2E('Plan Design Review E2E', () => {
+describeIfSelected('Plan Design Review E2E', ['plan-design-review-audit', 'plan-design-review-export'], () => {
   let reviewDir: string;
 
   beforeAll(() => {
@@ -1991,7 +2037,7 @@ describeE2E('Plan Design Review E2E', () => {
     try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
   });
 
-  test('Test 5: /plan-design-review produces audit report', async () => {
+  testIfSelected('plan-design-review-audit', async () => {
     const result = await runSkillTest({
       prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
 
@@ -2030,7 +2076,7 @@ Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Ski
     }
   }, 420_000);
 
-  test('Test 6: /plan-design-review exports DESIGN.md', async () => {
+  testIfSelected('plan-design-review-export', async () => {
     // Clean up previous test artifacts
     try { fs.unlinkSync(path.join(reviewDir, 'design-audit.md')); } catch {}
 
@@ -2078,7 +2124,7 @@ Review ${testServer.url} with --quick mode. Skip any AskUserQuestion calls — t
 
 // --- QA Design Review E2E ---
 
-describeE2E('QA Design Review E2E', () => {
+describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => {
   let qaDesignDir: string;
   let qaDesignServer: ReturnType<typeof Bun.serve> | null = null;
 
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index ba635613..c3e1aef2 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -17,6 +17,7 @@ import * as path from 'path';
 import { callJudge, judge } from './helpers/llm-judge';
 import type { JudgeScore } from './helpers/llm-judge';
 import { EvalCollector } from './helpers/eval-store';
+import { selectTests, detectBaseBranch, getChangedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
 
 const ROOT = path.resolve(import.meta.dir, '..');
 // Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
@@ -26,8 +27,43 @@ const describeEval = evalsEnabled ? describe : describe.skip;
 // Eval result collector
 const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;
 
-describeEval('LLM-as-judge quality evals', () => {
-  test('command reference table scores >= 4 on all dimensions', async () => {
+// --- Diff-based test selection ---
+let selectedTests: string[] | null = null;
+
+if (evalsEnabled && !process.env.EVALS_ALL) {
+  const baseBranch = process.env.EVALS_BASE
+    || detectBaseBranch(ROOT)
+    || 'main';
+  const changedFiles = getChangedFiles(baseBranch, ROOT);
+
+  if (changedFiles.length > 0) {
+    const selection = selectTests(changedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES);
+    selectedTests = selection.selected;
+    process.stderr.write(`\nLLM-judge selection (${selection.reason}): ${selection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length} tests\n`);
+    if (selection.skipped.length > 0) {
+      process.stderr.write(`  Skipped: ${selection.skipped.join(', ')}\n`);
+    }
+    process.stderr.write('\n');
+  }
+}
+
+/** Wrap a describe block to skip if none of its tests are selected. */
+function describeIfSelected(name: string, testNames: string[], fn: () => void) {
+  const anySelected = selectedTests === null || testNames.some(t => selectedTests!.includes(t));
+  (anySelected ? describeEval : describe.skip)(name, fn);
+}
+
+/** Skip an individual test if not selected (for multi-test describe blocks). */
+function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
+  const shouldRun = selectedTests === null || selectedTests.includes(testName);
+  (shouldRun ? test : test.skip)(testName, fn, timeout);
+}
+
+describeIfSelected('LLM-as-judge quality evals', [
+  'command reference table', 'snapshot flags reference',
+  'browse/SKILL.md reference', 'setup block', 'regression vs baseline',
+], () => {
+  testIfSelected('command reference table', async () => {
     const t0 = Date.now();
     const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
     const start = content.indexOf('## Command Reference');
@@ -53,7 +89,7 @@ describeEval('LLM-as-judge quality evals', () => {
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
-  test('snapshot flags section scores >= 4 on all dimensions', async () => {
+  testIfSelected('snapshot flags reference', async () => {
     const t0 = Date.now();
     const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
     const start = content.indexOf('## Snapshot System');
@@ -79,7 +115,7 @@ describeEval('LLM-as-judge quality evals', () => {
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
-  test('browse/SKILL.md overall scores >= 4', async () => {
+  testIfSelected('browse/SKILL.md reference', async () => {
     const t0 = Date.now();
     const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8');
     const start = content.indexOf('## Snapshot Flags');
@@ -104,7 +140,7 @@ describeEval('LLM-as-judge quality evals', () => {
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
-  test('setup block scores >= 3 on actionability and clarity', async () => {
+  testIfSelected('setup block', async () => {
     const t0 = Date.now();
     const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
     const setupStart = content.indexOf('## SETUP');
@@ -131,7 +167,7 @@ describeEval('LLM-as-judge quality evals', () => {
     expect(scores.clarity).toBeGreaterThanOrEqual(3);
   }, 30_000);
 
-  test('regression check: compare branch vs baseline quality', async () => {
+  testIfSelected('regression vs baseline', async () => {
     const t0 = Date.now();
     const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
     const genStart = generated.indexOf('## Command Reference');
@@ -220,10 +256,10 @@ Scores are 1-5 overall quality.`,
 
 // --- Part 7: QA skill quality evals (C6) ---
 
-describeEval('QA skill quality evals', () => {
+describeIfSelected('QA skill quality evals', ['qa/SKILL.md workflow', 'qa/SKILL.md health rubric'], () => {
   const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
 
-  test('qa/SKILL.md workflow quality scores >= 4', async () => {
+  testIfSelected('qa/SKILL.md workflow', async () => {
     const t0 = Date.now();
     const start = qaContent.indexOf('## Workflow');
     const end = qaContent.indexOf('## Health Score Rubric');
@@ -266,7 +302,7 @@ ${section}`);
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
-  test('qa/SKILL.md health score rubric is unambiguous', async () => {
+  testIfSelected('qa/SKILL.md health rubric', async () => {
     const t0 = Date.now();
     const start = qaContent.indexOf('## Health Score Rubric');
     const section = qaContent.slice(start);
@@ -310,8 +346,8 @@ ${section}`);
 
 // --- Part 7: Cross-skill consistency judge (C7) ---
 
-describeEval('Cross-skill consistency evals', () => {
-  test('greptile-history patterns are consistent across all skills', async () => {
+describeIfSelected('Cross-skill consistency evals', ['cross-skill greptile consistency'], () => {
+  testIfSelected('cross-skill greptile consistency', async () => {
     const t0 = Date.now();
     const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
     const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
@@ -375,10 +411,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`);
 
 // --- Part 7: Baseline score pinning (C9) ---
 
-describeEval('Baseline score pinning', () => {
+describeIfSelected('Baseline score pinning', ['baseline score pinning'], () => {
   const baselinesPath = path.join(ROOT, 'test', 'fixtures', 'eval-baselines.json');
 
-  test('LLM eval scores do not regress below baselines', async () => {
+  testIfSelected('baseline score pinning', async () => {
     const t0 = Date.now();
     if (!fs.existsSync(baselinesPath)) {
       console.log('No baseline file found — skipping pinning check');
diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts
new file mode 100644
index 00000000..7880c229
--- /dev/null
+++ b/test/touchfiles.test.ts
@@ -0,0 +1,250 @@
+/**
+ * Unit tests for diff-based test selection.
+ * Free (no API calls), runs with `bun test`.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  matchGlob,
+  selectTests,
+  detectBaseBranch,
+  E2E_TOUCHFILES,
+  LLM_JUDGE_TOUCHFILES,
+  GLOBAL_TOUCHFILES,
+} from './helpers/touchfiles';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+
+// --- matchGlob ---
+
+describe('matchGlob', () => {
+  test('** matches any depth of path segments', () => {
+    expect(matchGlob('browse/src/commands.ts', 'browse/src/**')).toBe(true);
+    expect(matchGlob('browse/src/deep/nested/file.ts', 'browse/src/**')).toBe(true);
+    expect(matchGlob('browse/src/cli.ts', 'browse/src/**')).toBe(true);
+  });
+
+  test('** does not match unrelated paths', () => {
+    expect(matchGlob('browse/src/commands.ts', 'qa/**')).toBe(false);
+    expect(matchGlob('review/SKILL.md', 'qa/**')).toBe(false);
+  });
+
+  test('exact match works', () => {
+    expect(matchGlob('SKILL.md', 'SKILL.md')).toBe(true);
+    expect(matchGlob('SKILL.md.tmpl', 'SKILL.md')).toBe(false);
+    expect(matchGlob('qa/SKILL.md', 'SKILL.md')).toBe(false);
+  });
+
+  test('* matches within a single segment', () => {
+    expect(matchGlob('test/fixtures/review-eval-enum.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(true);
+    expect(matchGlob('test/fixtures/review-eval-enum-diff.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(true);
+    expect(matchGlob('test/fixtures/review-eval-vuln.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(false);
+  });
+
+  test('dots in patterns are escaped correctly', () => {
+    expect(matchGlob('SKILL.md', 'SKILL.md')).toBe(true);
+    expect(matchGlob('SKILLxmd', 'SKILL.md')).toBe(false);
+  });
+
+  test('** at end matches files in the directory', () => {
+    expect(matchGlob('qa/SKILL.md', 'qa/**')).toBe(true);
+    expect(matchGlob('qa/SKILL.md.tmpl', 'qa/**')).toBe(true);
+    expect(matchGlob('qa/templates/report.md', 'qa/**')).toBe(true);
+  });
+});
+
+// --- selectTests ---
+
+describe('selectTests', () => {
+  test('browse/src change selects browse and qa tests', () => {
+    const result = selectTests(['browse/src/commands.ts'], E2E_TOUCHFILES);
+    expect(result.selected).toContain('browse-basic');
+    expect(result.selected).toContain('browse-snapshot');
+    expect(result.selected).toContain('qa-quick');
+    expect(result.selected).toContain('qa-fix-loop');
+    expect(result.selected).toContain('qa-design-review-fix');
+    expect(result.reason).toBe('diff');
+    // Should NOT include unrelated tests
+    expect(result.selected).not.toContain('plan-ceo-review');
+    expect(result.selected).not.toContain('retro');
+    expect(result.selected).not.toContain('document-release');
+  });
+
+  test('skill-specific change selects only that skill', () => {
+    const result = selectTests(['plan-ceo-review/SKILL.md'], E2E_TOUCHFILES);
+    expect(result.selected).toEqual(['plan-ceo-review']);
+    expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 1);
+  });
+
+  test('global touchfile triggers ALL tests', () => {
+    const result = selectTests(['test/helpers/session-runner.ts'], E2E_TOUCHFILES);
+    expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length);
+    expect(result.skipped.length).toBe(0);
+    expect(result.reason).toContain('global');
+  });
+
+  test('gen-skill-docs.ts is a global touchfile', () => {
+    const result = selectTests(['scripts/gen-skill-docs.ts'], E2E_TOUCHFILES);
+    expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length);
+    expect(result.reason).toContain('global');
+  });
+
+  test('unrelated file selects nothing', () => {
+    const result = selectTests(['README.md'], E2E_TOUCHFILES);
+    expect(result.selected).toEqual([]);
+    expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length);
+  });
+
+  test('empty changed files selects nothing', () => {
+    const result = selectTests([], E2E_TOUCHFILES);
+    expect(result.selected).toEqual([]);
+  });
+
+  test('multiple changed files union their selections', () => {
+    const result = selectTests(
+      ['plan-ceo-review/SKILL.md', 'retro/SKILL.md.tmpl'],
+      E2E_TOUCHFILES,
+    );
+    expect(result.selected).toContain('plan-ceo-review');
+    expect(result.selected).toContain('retro');
+    expect(result.selected).toContain('retro-base-branch');
+    expect(result.selected.length).toBe(3);
+  });
+
+  test('works with LLM_JUDGE_TOUCHFILES', () => {
+    const result = selectTests(['qa/SKILL.md'], LLM_JUDGE_TOUCHFILES);
+    expect(result.selected).toContain('qa/SKILL.md workflow');
+    expect(result.selected).toContain('qa/SKILL.md health rubric');
+    expect(result.selected.length).toBe(2);
+  });
+
+  test('SKILL.md.tmpl root template only selects root-dependent tests', () => {
+    const result = selectTests(['SKILL.md.tmpl'], E2E_TOUCHFILES);
+    // Should select the 7 tests that depend on root SKILL.md
+    expect(result.selected).toContain('skillmd-setup-discovery');
+    expect(result.selected).toContain('contributor-mode');
+    expect(result.selected).toContain('session-awareness');
+    // Should NOT select unrelated tests
+    expect(result.selected).not.toContain('plan-ceo-review');
+    expect(result.selected).not.toContain('retro');
+  });
+
+  test('global touchfiles work for LLM-judge tests too', () => {
+    const result = selectTests(['scripts/gen-skill-docs.ts'], LLM_JUDGE_TOUCHFILES);
+    expect(result.selected.length).toBe(Object.keys(LLM_JUDGE_TOUCHFILES).length);
+  });
+});
+
+// --- detectBaseBranch ---
+
+describe('detectBaseBranch', () => {
+  test('detects local main branch', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-'));
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+    fs.writeFileSync(path.join(dir, 'test.txt'), 'hello\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'init']);
+
+    const result = detectBaseBranch(dir);
+    // Should find 'main' (or 'master' depending on git default)
+    expect(result).toMatch(/^(main|master)$/);
+
+    try { fs.rmSync(dir, { recursive: true, force: true }); } catch {}
+  });
+
+  test('returns null for empty repo with no branches', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-'));
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init']);
+    // No commits = no branches
+    const result = detectBaseBranch(dir);
+    expect(result).toBeNull();
+
+    try { fs.rmSync(dir, { recursive: true, force: true }); } catch {}
+  });
+
+  test('returns null for non-git directory', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-'));
+    const result = detectBaseBranch(dir);
+    expect(result).toBeNull();
+
+    try { fs.rmSync(dir, { recursive: true, force: true }); } catch {}
+  });
+});
+
+// --- Completeness: every testName in skill-e2e.test.ts has a TOUCHFILES entry ---
+
+describe('TOUCHFILES completeness', () => {
+  test('every E2E testName has a TOUCHFILES entry', () => {
+    const e2eContent = fs.readFileSync(
+      path.join(ROOT, 'test', 'skill-e2e.test.ts'),
+      'utf-8',
+    );
+
+    // Extract all testName: 'value' entries
+    const testNameRegex = /testName:\s*['"`]([^'"`]+)['"`]/g;
+    const testNames: string[] = [];
+    let match;
+    while ((match = testNameRegex.exec(e2eContent)) !== null) {
+      let name = match[1];
+      // Handle template literals like `qa-${label}` — these expand to
+      // qa-b6-static, qa-b7-spa, qa-b8-checkout
+      if (name.includes('${')) continue; // skip template literals, check expanded forms below
+      testNames.push(name);
+    }
+
+    // Add the template-expanded testNames from runPlantedBugEval calls
+    const plantedBugRegex = /runPlantedBugEval\([^,]+,\s*[^,]+,\s*['"`]([^'"`]+)['"`]\)/g;
+    while ((match = plantedBugRegex.exec(e2eContent)) !== null) {
+      testNames.push(`qa-${match[1]}`);
+    }
+
+    expect(testNames.length).toBeGreaterThan(0);
+
+    const missing = testNames.filter(name => !(name in E2E_TOUCHFILES));
+    if (missing.length > 0) {
+      throw new Error(
+        `E2E tests missing TOUCHFILES entries: ${missing.join(', ')}\n` +
+        `Add these to E2E_TOUCHFILES in test/helpers/touchfiles.ts`,
+      );
+    }
+  });
+
+  test('every LLM-judge test has a TOUCHFILES entry', () => {
+    const llmContent = fs.readFileSync(
+      path.join(ROOT, 'test', 'skill-llm-eval.test.ts'),
+      'utf-8',
+    );
+
+    // Extract test names from addTest({ name: '...' }) calls
+    const nameRegex = /name:\s*['"`]([^'"`]+)['"`]/g;
+    const testNames: string[] = [];
+    let match;
+    while ((match = nameRegex.exec(llmContent)) !== null) {
+      testNames.push(match[1]);
+    }
+
+    // Deduplicate (some tests call addTest with the same name)
+    const unique = [...new Set(testNames)];
+    expect(unique.length).toBeGreaterThan(0);
+
+    const missing = unique.filter(name => !(name in LLM_JUDGE_TOUCHFILES));
+    if (missing.length > 0) {
+      throw new Error(
+        `LLM-judge tests missing TOUCHFILES entries: ${missing.join(', ')}\n` +
+        `Add these to LLM_JUDGE_TOUCHFILES in test/helpers/touchfiles.ts`,
+      );
+    }
+  });
+});

From 7fed990f9e1bb962d40f2ad07718178746b69542 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Tue, 17 Mar 2026 11:30:23 -0700
Subject: [PATCH 2/3] chore: bump version and changelog (v0.6.1.0)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 CHANGELOG.md               | 13 +++++++++++++
 VERSION                    |  2 +-
 test/helpers/touchfiles.ts | 13 ++++++++++---
 test/skill-e2e.test.ts     |  6 +++---
 test/touchfiles.test.ts    | 11 +++++++----
 5 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4593d3f1..96fc4c0f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # Changelog
 
+## [0.6.1.0] - 2026-03-17
+
+### Added
+
+- **E2E and LLM-judge tests now only run what you changed.** Each test declares which source files it depends on. When you run `bun run test:e2e`, it checks your diff and skips tests whose dependencies weren't touched. A branch that only changes `/retro` now runs 2 tests instead of 31. Use `bun run test:e2e:all` to force everything.
+- **`bun run eval:select` previews which tests would run.** See exactly which tests your diff triggers before spending API credits. Supports `--json` for scripting and `--base <branch>` to override the base branch.
+- **Completeness guardrail catches forgotten test entries.** A free unit test validates that every `testName` in the E2E and LLM-judge test files has a corresponding entry in the TOUCHFILES map. New tests without entries fail `bun test` immediately — no silent always-run degradation.
+
+### Changed
+
+- `test:evals` and `test:e2e` now auto-select based on diff (was: all-or-nothing)
+- New `test:evals:all` and `test:e2e:all` scripts for explicit full runs
+
 ## 0.6.0.1 — 2026-03-17
 
 - **`/gstack-upgrade` now catches stale vendored copies automatically.** If your global gstack is up to date but the vendored copy in your project is behind, `/gstack-upgrade` detects the mismatch and syncs it. No more manually asking "did we vendor it?" — it just tells you and offers to update.
diff --git a/VERSION b/VERSION
index 758efdb4..44e7f9a2 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.6.0.1
+0.6.1.0
diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts
index 21bf07f1..30a15579 100644
--- a/test/helpers/touchfiles.ts
+++ b/test/helpers/touchfiles.ts
@@ -57,9 +57,10 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
   'review-base-branch':       ['review/**'],
 
   // Plan reviews
-  'plan-ceo-review':          ['plan-ceo-review/**'],
-  'plan-eng-review':          ['plan-eng-review/**'],
-  'plan-eng-review-artifact': ['plan-eng-review/**'],
+  'plan-ceo-review':           ['plan-ceo-review/**'],
+  'plan-ceo-review-selective': ['plan-ceo-review/**'],
+  'plan-eng-review':           ['plan-eng-review/**'],
+  'plan-eng-review-artifact':  ['plan-eng-review/**'],
 
   // Ship
   'ship-base-branch': ['ship/**'],
@@ -71,6 +72,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
   // Document-release
   'document-release': ['document-release/**'],
 
+  // QA bootstrap
+  'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],
+
+  // Ship coverage audit
+  'ship-coverage-audit': ['ship/**'],
+
   // Design
   'design-consultation-core':     ['design-consultation/**'],
   'design-consultation-research': ['design-consultation/**'],
diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index 1b750701..1d311bdf 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -896,7 +896,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and
 
 // --- Plan CEO Review (SELECTIVE EXPANSION) E2E ---
 
-describeE2E('Plan CEO Review SELECTIVE EXPANSION E2E', () => {
+describeIfSelected('Plan CEO Review SELECTIVE EXPANSION E2E', ['plan-ceo-review-selective'], () => {
   let planDir: string;
 
   beforeAll(() => {
@@ -2346,7 +2346,7 @@ Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion call
 
 // --- Test Bootstrap E2E ---
 
-describeE2E('Test Bootstrap E2E', () => {
+describeIfSelected('Test Bootstrap E2E', ['qa-bootstrap'], () => {
   let bootstrapDir: string;
   let bootstrapServer: ReturnType<typeof Bun.serve>;
 
@@ -2483,7 +2483,7 @@ This is a test+fix loop: find bugs, fix them, write regression tests, commit eac
 
 // --- Test Coverage Audit E2E ---
 
-describeE2E('Test Coverage Audit E2E', () => {
+describeIfSelected('Test Coverage Audit E2E', ['ship-coverage-audit'], () => {
   let coverageDir: string;
 
   beforeAll(() => {
diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts
index 7880c229..e666bb3d 100644
--- a/test/touchfiles.test.ts
+++ b/test/touchfiles.test.ts
@@ -74,10 +74,12 @@ describe('selectTests', () => {
     expect(result.selected).not.toContain('document-release');
   });
 
-  test('skill-specific change selects only that skill', () => {
+  test('skill-specific change selects only that skill and related tests', () => {
     const result = selectTests(['plan-ceo-review/SKILL.md'], E2E_TOUCHFILES);
-    expect(result.selected).toEqual(['plan-ceo-review']);
-    expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 1);
+    expect(result.selected).toContain('plan-ceo-review');
+    expect(result.selected).toContain('plan-ceo-review-selective');
+    expect(result.selected.length).toBe(2);
+    expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 2);
   });
 
   test('global touchfile triggers ALL tests', () => {
@@ -110,9 +112,10 @@ describe('selectTests', () => {
       E2E_TOUCHFILES,
     );
     expect(result.selected).toContain('plan-ceo-review');
+    expect(result.selected).toContain('plan-ceo-review-selective');
     expect(result.selected).toContain('retro');
     expect(result.selected).toContain('retro-base-branch');
-    expect(result.selected.length).toBe(3);
+    expect(result.selected.length).toBe(4);
   });
 
   test('works with LLM_JUDGE_TOUCHFILES', () => {

From 64bbbb21984e1e399afe81fd427c10fdcb9740d0 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Tue, 17 Mar 2026 14:41:13 -0700
Subject: [PATCH 3/3] =?UTF-8?q?fix:=20plan-design-review-audit=20eval=20?=
 =?UTF-8?q?=E2=80=94=20bump=20turns=20to=2030,=20add=20efficiency=20hints?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test was flaky at 20 turns because the agent reads a 300-line SKILL.md,
navigates, extracts design data, and writes a report. Added hints to skip
preamble/batch commands/write early while still testing the real SKILL.md.
Now completes in ~13 turns consistently.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 test/skill-e2e.test.ts | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index 1d311bdf..34baafbc 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -2128,9 +2128,11 @@ B="${browseBin}"
 
 Read plan-design-review/SKILL.md for the design review workflow.
 
-Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md.`,
+Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md.
+
+EFFICIENCY: Skip the preamble bash block. Combine multiple browse commands into single bash blocks (e.g. run all Phase 2 JS extractions in one block). Write the report as soon as you have enough data — do not over-explore.`,
       workingDirectory: reviewDir,
-      maxTurns: 20,
+      maxTurns: 30,
       timeout: 360_000,
       testName: 'plan-design-review-audit',
       runId,