Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
# Changelog

## [0.6.1.0] - 2026-03-17

### Added

- **E2E and LLM-judge tests now only run what you changed.** Each test declares which source files it depends on. When you run `bun run test:e2e`, it checks your diff and skips tests whose dependencies weren't touched. A branch that only changes `/retro` now runs 2 tests instead of 31. Use `bun run test:e2e:all` to force everything.
- **`bun run eval:select` previews which tests would run.** See exactly which tests your diff triggers before spending API credits. Supports `--json` for scripting and `--base <branch>` to override the base branch.
- **Completeness guardrail catches forgotten test entries.** A free unit test validates that every `testName` in the E2E and LLM-judge test files has a corresponding entry in the TOUCHFILES map. New tests without entries fail `bun test` immediately — no silent always-run degradation.

### Changed

- `test:evals` and `test:e2e` now auto-select based on diff (was: all-or-nothing)
- New `test:evals:all` and `test:e2e:all` scripts for explicit full runs

## 0.6.1 — 2026-03-17 — Boil the Lake

Every gstack skill now follows the **Completeness Principle**: always recommend the
Expand Down
13 changes: 11 additions & 2 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
```bash
bun install # install dependencies
bun test # run free tests (browse + snapshot + skill validation)
bun run test:evals # run paid evals: LLM judge + E2E (~$4/run)
bun run test:e2e # run E2E tests only (~$3.85/run)
bun run test:evals # run paid evals: LLM judge + E2E (diff-based, ~$4/run max)
bun run test:evals:all # run ALL paid evals regardless of diff
bun run test:e2e # run E2E tests only (diff-based, ~$3.85/run max)
bun run test:e2e:all # run ALL E2E tests regardless of diff
bun run eval:select # show which tests would run based on current diff
bun run dev <cmd> # run CLI in dev mode, e.g. bun run dev goto https://example.com
bun run build # gen docs + compile binaries
bun run gen:skill-docs # regenerate SKILL.md files from templates
Expand All @@ -21,6 +24,12 @@ bun run eval:summary # aggregate stats across all eval runs
(tool-by-tool via `--output-format stream-json --verbose`). Results are persisted
to `~/.gstack-dev/evals/` with auto-comparison against the previous run.

**Diff-based test selection:** `test:evals` and `test:e2e` auto-select tests based
on `git diff` against the base branch. Each test declares its file dependencies in
`test/helpers/touchfiles.ts`. Changes to global touchfiles (session-runner, eval-store,
llm-judge, gen-skill-docs, touchfiles, browse test-server) trigger all tests. Use `EVALS_ALL=1` or the `:all` script
variants to force all tests. Run `eval:select` to preview which tests would run.

## Project structure

```
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.6.1
0.6.1.0
5 changes: 4 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@
"server": "bun run browse/src/server.ts",
"test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts",
"test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts",
"test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts",
"test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts",
"test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts",
"skill:check": "bun run scripts/skill-check.ts",
"dev:skill": "bun run scripts/dev-skill.ts",
"start": "bun run browse/src/server.ts",
"eval:list": "bun run scripts/eval-list.ts",
"eval:compare": "bun run scripts/eval-compare.ts",
"eval:summary": "bun run scripts/eval-summary.ts",
"eval:watch": "bun run scripts/eval-watch.ts"
"eval:watch": "bun run scripts/eval-watch.ts",
"eval:select": "bun run scripts/eval-select.ts"
},
"dependencies": {
"playwright": "^1.58.2",
Expand Down
86 changes: 86 additions & 0 deletions scripts/eval-select.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env bun
/**
* Show which E2E and LLM-judge tests would run based on the current git diff.
*
* Usage:
* bun run eval:select # human-readable output
* bun run eval:select --json # machine-readable JSON
* bun run eval:select --base main # override base branch
*/

import * as path from 'path';
import {
selectTests,
detectBaseBranch,
getChangedFiles,
E2E_TOUCHFILES,
LLM_JUDGE_TOUCHFILES,
GLOBAL_TOUCHFILES,
} from '../test/helpers/touchfiles';

const ROOT = path.resolve(import.meta.dir, '..');
const args = process.argv.slice(2);
const jsonMode = args.includes('--json');
const baseIdx = args.indexOf('--base');
// When `--base` is the last argument there is no value to read; the lookup
// yields undefined and we fall through to auto-detection below.
const baseOverride = baseIdx >= 0 ? args[baseIdx + 1] : undefined;

// Base ref precedence: explicit --base value, then auto-detected ref, then 'main'.
const baseBranch = baseOverride || detectBaseBranch(ROOT) || 'main';
const changedFiles = getChangedFiles(baseBranch, ROOT);

// An empty diff is reported as "all tests would run".
const NO_DIFF_REASON = 'no diff — would run all tests';
const noDiff = changedFiles.length === 0;

type Selection = ReturnType<typeof selectTests>;

/**
 * Compute the selection for one touchfile map.
 * With an empty diff every test in the map is reported as selected; otherwise
 * the decision is delegated to selectTests.
 */
function resolveSelection(touchfiles: Record<string, string[]>): Selection {
  if (noDiff) {
    return { selected: Object.keys(touchfiles), skipped: [], reason: NO_DIFF_REASON };
  }
  return selectTests(changedFiles, touchfiles, GLOBAL_TOUCHFILES);
}

/** Shape one selection for --json output; `count` is "<selected>/<total>". */
function toJsonEntry(selection: Selection, total: number) {
  return {
    selected: selection.selected,
    skipped: selection.skipped,
    reason: selection.reason,
    count: `${selection.selected.length}/${total}`,
  };
}

/** Print the human-readable summary for one test category. */
function printSelection(label: string, selection: Selection, total: number): void {
  const picked = selection.selected.length;
  console.log(`${label} (${selection.reason}): ${picked}/${total} tests`);
  if (picked > 0 && picked < total) {
    console.log(`  Selected: ${selection.selected.join(', ')}`);
    console.log(`  Skipped:  ${selection.skipped.join(', ')}`);
  } else if (picked === 0) {
    console.log(`  No ${label} tests affected.`);
  } else {
    console.log(`  All ${label} tests selected.`);
  }
}

const e2eTotal = Object.keys(E2E_TOUCHFILES).length;
const llmTotal = Object.keys(LLM_JUDGE_TOUCHFILES).length;
const e2eSelection = resolveSelection(E2E_TOUCHFILES);
const llmSelection = resolveSelection(LLM_JUDGE_TOUCHFILES);

if (jsonMode) {
  // One schema for every case: `changed_files` is always an array and
  // `e2e` / `llm_judge` are always objects, so consumers never have to branch
  // on the value's type. The top-level `reason` is kept for the no-diff case.
  console.log(JSON.stringify({
    base: baseBranch,
    changed_files: changedFiles,
    e2e: toJsonEntry(e2eSelection, e2eTotal),
    llm_judge: toJsonEntry(llmSelection, llmTotal),
    ...(noDiff ? { reason: NO_DIFF_REASON } : {}),
  }, null, 2));
} else if (noDiff) {
  console.log(`Base: ${baseBranch}`);
  console.log('No changed files detected — all tests would run.');
} else {
  console.log(`Base: ${baseBranch}`);
  console.log(`Changed files: ${changedFiles.length}`);
  console.log();

  printSelection('E2E', e2eSelection, e2eTotal);
  console.log();

  printSelection('LLM-judge', llmSelection, llmTotal);
}
178 changes: 178 additions & 0 deletions test/helpers/touchfiles.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/**
* Diff-based test selection for E2E and LLM-judge evals.
*
* Each test declares which source files it depends on ("touchfiles").
* The test runner checks `git diff` and only runs tests whose
* dependencies were modified. Override with EVALS_ALL=1 to run everything.
*/

import { spawnSync } from 'child_process';

// --- Glob matching ---

/**
 * Match a file path against a glob pattern.
 *
 * Supported wildcards:
 *   ** — matches any run of characters, including `/` (spans path segments)
 *   *  — matches any run of characters inside one segment (never `/`)
 *
 * Every other character in the pattern is matched literally, including
 * characters that are special in regular expressions (`+ ? ( ) [ ] { } | ^ $ \`).
 * The match is anchored: the whole path must match the whole pattern.
 *
 * @param file    Path to test, e.g. `browse/src/commands.ts`.
 * @param pattern Glob pattern, e.g. `test/fixtures/review-eval-enum*.rb`.
 * @returns `true` when `file` matches `pattern` in full.
 */
export function matchGlob(file: string, pattern: string): boolean {
  // Escape regex metacharacters so literal path text cannot be read as regex
  // syntax. `*` is intentionally absent: it is consumed by the splits below.
  const escapeLiteral = (text: string): string =>
    text.replace(/[.+?^${}()|[\]\\]/g, '\\$&');

  // Split on `**` first so the remaining single `*` can be handled per chunk
  // without needing a placeholder token.
  const regexStr = pattern
    .split('**')
    .map(chunk => chunk.split('*').map(escapeLiteral).join('[^/]*'))
    .join('.*');
  return new RegExp(`^${regexStr}$`).test(file);
}

// --- Touchfile maps ---

/**
 * E2E test touchfiles — keyed by testName (the string passed to runSkillTest).
 * Each test lists the file patterns that, if changed, require the test to run.
 *
 * Patterns use the glob syntax understood by matchGlob (`*` within a segment,
 * `**` across segments) and are compared against the paths reported by
 * `git diff --name-only`.
 *
 * Key order is significant for output only: selectTests reports selected and
 * skipped tests in the iteration order of this object.
 */
export const E2E_TOUCHFILES: Record<string, string[]> = {
// Browse core
'browse-basic': ['browse/src/**'],
'browse-snapshot': ['browse/src/**'],

// SKILL.md setup + preamble (depend on ROOT SKILL.md only)
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'],
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'],
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'],
'contributor-mode': ['SKILL.md', 'SKILL.md.tmpl'],
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'],

// QA
'qa-quick': ['qa/**', 'browse/src/**'],
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
'qa-fix-loop': ['qa/**', 'browse/src/**'],

// Review
'review-sql-injection': ['review/**', 'test/fixtures/review-eval-vuln.rb'],
'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'],
'review-base-branch': ['review/**'],

// Plan reviews
'plan-ceo-review': ['plan-ceo-review/**'],
'plan-ceo-review-selective': ['plan-ceo-review/**'],
'plan-eng-review': ['plan-eng-review/**'],
'plan-eng-review-artifact': ['plan-eng-review/**'],

// Ship
'ship-base-branch': ['ship/**'],

// Retro
'retro': ['retro/**'],
'retro-base-branch': ['retro/**'],

// Document-release
'document-release': ['document-release/**'],

// QA bootstrap
'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],

// Ship coverage audit
'ship-coverage-audit': ['ship/**'],

// Design
'design-consultation-core': ['design-consultation/**'],
'design-consultation-research': ['design-consultation/**'],
'design-consultation-existing': ['design-consultation/**'],
'design-consultation-preview': ['design-consultation/**'],
'plan-design-review-audit': ['plan-design-review/**'],
'plan-design-review-export': ['plan-design-review/**'],
'qa-design-review-fix': ['qa-design-review/**', 'browse/src/**'],
};

/**
 * LLM-judge test touchfiles — keyed by test description string.
 *
 * Each value lists the file patterns (matchGlob syntax) that, if changed,
 * require that judge test to run. Entries here name specific files or use
 * `browse/src/**`; a path that matches no entry leaves the test skipped.
 */
export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
'command reference table': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts'],
'snapshot flags reference': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/snapshot.ts'],
'browse/SKILL.md reference': ['browse/SKILL.md', 'browse/SKILL.md.tmpl', 'browse/src/**'],
'setup block': ['SKILL.md', 'SKILL.md.tmpl'],
'regression vs baseline': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'],
'qa/SKILL.md workflow': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
'qa/SKILL.md health rubric': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'baseline score pinning': ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'],
};

/**
 * Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
 *
 * This is the default third argument of selectTests. Entries are passed
 * through matchGlob, so glob patterns would also work here, although every
 * current entry is an exact path. This module lists itself so that edits to
 * the touchfile maps re-run everything.
 */
export const GLOBAL_TOUCHFILES = [
'test/helpers/session-runner.ts',
'test/helpers/eval-store.ts',
'test/helpers/llm-judge.ts',
'scripts/gen-skill-docs.ts',
'test/helpers/touchfiles.ts',
'browse/test/test-server.ts',
];

// --- Base branch detection ---

/**
 * Find a ref to diff against.
 *
 * Probes `origin/main`, `origin/master`, `main`, `master` in that order with
 * `git rev-parse --verify` and stops at the first one git accepts.
 *
 * @param cwd Directory in which git is run.
 * @returns The first ref that resolves, or `null` when none does (including
 *          when git cannot be run or a probe exceeds its 3-second timeout).
 */
export function detectBaseBranch(cwd: string): string | null {
  const candidates = ['origin/main', 'origin/master', 'main', 'master'];

  // A ref is usable only when rev-parse exits with status 0.
  const resolves = (ref: string): boolean => {
    const probe = spawnSync('git', ['rev-parse', '--verify', ref], {
      cwd,
      stdio: 'pipe',
      timeout: 3000,
    });
    return probe.status === 0;
  };

  return candidates.find(resolves) ?? null;
}

/**
 * List the files that differ between the base branch and HEAD.
 *
 * Runs `git diff --name-only <baseBranch>...HEAD` (three-dot range) and returns
 * one entry per non-empty output line.
 *
 * @param baseBranch Ref to diff against, e.g. `origin/main`.
 * @param cwd        Directory in which git is run.
 * @returns Changed file paths, or an empty array when git exits non-zero,
 *          cannot be run, or exceeds the 5-second timeout. Callers therefore
 *          cannot tell "no changes" apart from "diff failed".
 */
export function getChangedFiles(baseBranch: string, cwd: string): string[] {
  const range = `${baseBranch}...HEAD`;
  const diff = spawnSync('git', ['diff', '--name-only', range], {
    cwd,
    stdio: 'pipe',
    timeout: 5000,
  });

  if (diff.status === 0) {
    const listing = diff.stdout.toString().trim();
    // filter(Boolean) drops the single empty string produced by empty output.
    return listing.split('\n').filter(Boolean);
  }
  return [];
}

// --- Test selection ---

/**
 * Decide which tests a set of changed files should trigger.
 *
 * Steps:
 *   1. The first changed file that matches any global touchfile selects every
 *      test; the reason is `global: <that file>`.
 *   2. Otherwise a test is selected when at least one changed file matches at
 *      least one of its patterns; the reason is `diff`.
 *
 * @param changedFiles     Paths to evaluate.
 * @param touchfiles       Map of test name to the glob patterns it depends on.
 * @param globalTouchfiles Patterns that force a full run (defaults to GLOBAL_TOUCHFILES).
 * @returns `selected` and `skipped` test names in the key order of `touchfiles`,
 *          plus the `reason` for the decision.
 */
export function selectTests(
  changedFiles: string[],
  touchfiles: Record<string, string[]>,
  globalTouchfiles: string[] = GLOBAL_TOUCHFILES,
): { selected: string[]; skipped: string[]; reason: string } {
  const everyTest = Object.keys(touchfiles);
  const matchesAny = (file: string, patterns: string[]): boolean =>
    patterns.some(pattern => matchGlob(file, pattern));

  // A single global hit short-circuits everything else.
  const globalHit = changedFiles.find(file => matchesAny(file, globalTouchfiles));
  if (globalHit !== undefined) {
    return { selected: everyTest, skipped: [], reason: `global: ${globalHit}` };
  }

  // Partition tests by whether any changed file touches their patterns.
  const selected: string[] = [];
  const skipped: string[] = [];
  Object.entries(touchfiles).forEach(([testName, patterns]) => {
    const affected = changedFiles.some(file => matchesAny(file, patterns));
    if (affected) {
      selected.push(testName);
    } else {
      skipped.push(testName);
    }
  });

  return { selected, skipped, reason: 'diff' };
}
Loading