Skip to content

Commit 82fe18f

Browse files
author
StackMemory Bot (CLI)
committed
feat(bench): harness benchmarks with SWE-bench baselines
- Add baselines.ts with SWE-bench Verified leaderboard reference scores (Claude Code 70.4%, Devin 55.1%, OpenHands 53.5%, Aider 48.9%) - Define internal harness targets: plan latency P95, first-pass approval rate, edit success rate, fuzzy fallback rate, context budget - Instrument harness.ts runSpike() with timing (plan + total latency) and persist HarnessRunMetrics to harness-metrics.jsonl - Add `stackmemory bench` CLI: shows local metrics vs targets vs online baselines. Supports --json, --baselines, --days flags. - summarizeRuns() computes percentiles, rates, and target pass/fail
1 parent 6701978 commit 82fe18f

5 files changed

Lines changed: 552 additions & 1 deletion

File tree

src/cli/commands/bench.ts

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
/**
2+
* Bench Command for StackMemory CLI
3+
*
4+
* Runs harness benchmarks and compares against online baselines
5+
* (SWE-bench Verified, internal targets).
6+
*/
7+
8+
import { Command } from 'commander';
9+
import { existsSync, readFileSync, readdirSync } from 'fs';
10+
import { join } from 'path';
11+
import {
12+
SWE_BENCH_BASELINES,
13+
HARNESS_TARGETS,
14+
summarizeRuns,
15+
} from '../../orchestrators/multimodal/baselines.js';
16+
import type { HarnessRunMetrics } from '../../orchestrators/multimodal/baselines.js';
17+
18+
/**
 * Load harness run metrics from
 * `<projectRoot>/.stackmemory/build/harness-metrics.jsonl`.
 *
 * The file is read as JSON Lines: one JSON document per non-blank line.
 * A line is kept only when it parses to a plain object carrying a finite
 * numeric `timestamp`; everything else (unparseable text, `null`, scalars,
 * arrays, objects without a usable timestamp) is skipped so that callers can
 * safely read `run.timestamp` on every returned entry.
 *
 * @param projectRoot - Directory that contains the `.stackmemory` folder.
 * @returns Accepted records in file order, or an empty array when the
 *   metrics file does not exist.
 */
function loadRunMetrics(projectRoot: string): HarnessRunMetrics[] {
  const metricsFile = join(
    projectRoot,
    '.stackmemory',
    'build',
    'harness-metrics.jsonl'
  );
  if (!existsSync(metricsFile)) return [];

  const runs: HarnessRunMetrics[] = [];
  // Split on LF or CRLF so files written on Windows parse the same way.
  for (const line of readFileSync(metricsFile, 'utf-8').split(/\r?\n/)) {
    if (!line.trim()) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      // skip malformed
      continue;
    }

    // `null`, numbers, strings and arrays are valid JSON but not run records.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      continue;
    }
    const timestamp = (parsed as { timestamp?: unknown }).timestamp;
    if (typeof timestamp !== 'number' || !Number.isFinite(timestamp)) continue;

    // Only the timestamp is validated here; the remaining fields are trusted.
    runs.push(parsed as HarnessRunMetrics);
  }
  return runs;
}
40+
41+
/**
 * Load spike audit files from `<projectRoot>/.stackmemory/build`.
 *
 * Candidate files are those named `spike-*.json`. They are ordered by file
 * name in descending lexicographic order and capped at 20 entries *before*
 * parsing, so a file that cannot be read or parsed is dropped without being
 * replaced by an older one.
 *
 * @param projectRoot - Directory that contains the `.stackmemory` folder.
 * @returns One `{ file, data }` entry per successfully parsed audit, where
 *   `data` is the raw `JSON.parse` result (not validated). Empty when the
 *   build directory does not exist.
 */
function loadSpikeAudits(
  projectRoot: string
): Array<{ file: string; data: any }> {
  const dir = join(projectRoot, '.stackmemory', 'build');
  if (!existsSync(dir)) return [];

  const candidates = readdirSync(dir)
    .filter((f) => f.startsWith('spike-') && f.endsWith('.json'))
    .sort()
    .reverse()
    .slice(0, 20);

  // Accumulate directly instead of mapping to `null` and casting the
  // filtered result; the array type is then checked by the compiler.
  const audits: Array<{ file: string; data: any }> = [];
  for (const file of candidates) {
    try {
      audits.push({
        file,
        data: JSON.parse(readFileSync(join(dir, file), 'utf-8')),
      });
    } catch {
      // Unreadable or malformed audit file: skip it (best effort).
    }
  }
  return audits;
}
64+
65+
/** Look-back window (days) used when `--days` is missing or unusable. */
const DEFAULT_WINDOW_DAYS = 30;

/** Width of the horizontal rules drawn in the human-readable report. */
const RULE_WIDTH = 60;

/** Narrow an unknown value to a non-null object so its keys can be read. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}

/**
 * Parse the `--days` option. Non-numeric, zero and negative values fall back
 * to the default window, because a non-positive window would place the
 * cutoff at or after "now" and silently hide every run.
 */
function resolveWindowDays(raw: unknown): number {
  const parsed = Number.parseInt(String(raw), 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_WINDOW_DAYS;
}

/** Print the first `limit` SWE-bench baselines under the given header. */
function printOnlineBaselines(header: string, limit: number): void {
  console.log(header);
  for (const b of SWE_BENCH_BASELINES.slice(0, limit)) {
    console.log(
      ` ${b.agent.padEnd(16)} ${(b.resolveRate * 100).toFixed(1)}%`
    );
  }
}

/** Handle `--baselines`: print reference baselines and internal targets only. */
function printBaselinesOnly(asJson: boolean): void {
  if (asJson) {
    console.log(
      JSON.stringify(
        { baselines: SWE_BENCH_BASELINES, targets: HARNESS_TARGETS },
        null,
        2
      )
    );
    return;
  }

  const rule = '─'.repeat(RULE_WIDTH);
  console.log('\nOnline Benchmark Baselines (SWE-bench Verified)');
  console.log(rule);
  console.log(
    `${'Agent'.padEnd(20)} ${'Model'.padEnd(20)} ${'Resolve'.padStart(8)}`
  );
  console.log(rule);
  for (const b of SWE_BENCH_BASELINES) {
    console.log(
      `${b.agent.padEnd(20)} ${b.model.padEnd(20)} ${(b.resolveRate * 100).toFixed(1).padStart(7)}%`
    );
  }
  console.log(rule);

  console.log('\nInternal Harness Targets');
  console.log(rule);
  console.log(` Plan latency P95: ${HARNESS_TARGETS.planLatencyP95Ms}ms`);
  console.log(` Total latency P95: ${HARNESS_TARGETS.totalLatencyP95Ms}ms`);
  console.log(
    ` First-pass approval: ${(HARNESS_TARGETS.firstPassApprovalRate * 100).toFixed(0)}%`
  );
  console.log(
    ` Edit success rate: ${(HARNESS_TARGETS.editSuccessRate * 100).toFixed(0)}%`
  );
  console.log(
    ` Fuzzy fallback rate: <${(HARNESS_TARGETS.editFuzzyFallbackRate * 100).toFixed(0)}%`
  );
  console.log(` Context token budget: ${HARNESS_TARGETS.contextTokenBudget}`);
  console.log('');
}

/** Print the aggregated harness metrics and the per-target pass/fail list. */
function printSummary(summary: ReturnType<typeof summarizeRuns>): void {
  console.log('\nHarness Metrics:');
  console.log(` Total runs: ${summary.totalRuns}`);
  console.log(` Approval rate: ${(summary.approvalRate * 100).toFixed(1)}%`);
  console.log(` First-pass rate: ${(summary.firstPassRate * 100).toFixed(1)}%`);
  console.log(` Avg iterations: ${summary.avgIterations.toFixed(1)}`);
  console.log(` Plan latency (avg): ${Math.round(summary.avgPlanLatencyMs)}ms`);
  console.log(` Plan latency (P95): ${Math.round(summary.p95PlanLatencyMs)}ms`);
  console.log(
    ` Total latency (avg): ${Math.round(summary.avgTotalLatencyMs)}ms`
  );
  console.log(
    ` Total latency (P95): ${Math.round(summary.p95TotalLatencyMs)}ms`
  );
  console.log(
    ` Edit success rate: ${(summary.editSuccessRate * 100).toFixed(1)}%`
  );
  console.log(
    ` Fuzzy fallback rate: ${(summary.editFuzzyRate * 100).toFixed(1)}%`
  );
  console.log(` Context tokens (avg): ${Math.round(summary.avgContextTokens)}`);

  console.log('\nTarget Comparison:');
  for (const [key, passes] of Object.entries(summary.passesTargets)) {
    const icon = passes ? 'PASS' : 'FAIL';
    console.log(` [${icon}] ${key}`);
  }
}

/**
 * Render one audit as a status line. The audit payload is unvalidated JSON,
 * so every level is narrowed before use: a missing or non-string task prints
 * as "(unknown)", and a non-array `iterations` counts as "not approved"
 * instead of throwing.
 */
function formatAuditLine(data: unknown): string {
  const input = isRecord(data) ? data.input : undefined;
  const rawTask = isRecord(input) ? input.task : undefined;
  const task = typeof rawTask === 'string' && rawTask ? rawTask : '(unknown)';

  const iterations =
    isRecord(data) && Array.isArray(data.iterations) ? data.iterations : [];
  const approved = iterations.some(
    (it: unknown) =>
      isRecord(it) && isRecord(it.critique) && Boolean(it.critique.approved)
  );

  const icon = approved ? 'OK' : '--';
  return ` [${icon}] ${task.slice(0, 50)}`;
}

/**
 * Build the `bench` CLI command.
 *
 * Options:
 * - `--json`       emit machine-readable JSON instead of the text report
 * - `-d, --days`   only include runs from the last N days (default 30)
 * - `--baselines`  print the reference baselines and targets, then exit
 *
 * The action reads run metrics and spike audits relative to the current
 * working directory and writes its report to stdout.
 *
 * @returns The configured commander `Command`, ready for `program.addCommand`.
 */
export function createBenchCommand(): Command {
  const bench = new Command('bench')
    .description(
      'Harness benchmarks — compare local runs against SWE-bench baselines'
    )
    .option('--json', 'Output as JSON', false)
    .option('-d, --days <n>', 'Only include runs from last N days', '30')
    .option('--baselines', 'Show online benchmark baselines only', false)
    .action(async (options) => {
      const projectRoot = process.cwd();

      // Baselines-only mode
      if (options.baselines) {
        printBaselinesOnly(Boolean(options.json));
        return;
      }

      // Load local run data
      const days = resolveWindowDays(options.days);
      const cutoff = Date.now() - days * 86400_000;
      const allRuns = loadRunMetrics(projectRoot);
      const runs = allRuns.filter((r) => r.timestamp >= cutoff);
      const audits = loadSpikeAudits(projectRoot);

      if (options.json) {
        console.log(
          JSON.stringify(
            {
              summary: summarizeRuns(runs),
              baselines: SWE_BENCH_BASELINES,
              targets: HARNESS_TARGETS,
              runsInWindow: runs.length,
              totalRuns: allRuns.length,
              recentAudits: audits.length,
            },
            null,
            2
          )
        );
        return;
      }

      // Human output
      console.log(`\nHarness Benchmark Report (last ${days} days)`);
      console.log('═'.repeat(RULE_WIDTH));

      if (runs.length === 0) {
        console.log('\nNo harness runs recorded yet.');
        console.log('Run: stackmemory build "your task" --execute');
        console.log('Or: stackmemory mm-spike -t "task" --execute\n');

        // Still show baselines for context
        printOnlineBaselines('Online Baselines (SWE-bench Verified):', 3);
        console.log('');
        return;
      }

      printSummary(summarizeRuns(runs));

      // Online baseline comparison
      printOnlineBaselines('\nOnline Baselines (SWE-bench Verified):', 4);

      // Recent audits
      if (audits.length > 0) {
        console.log(`\nRecent Spike Audits (${audits.length}):`);
        for (const a of audits.slice(0, 5)) {
          console.log(formatAuditLine(a.data));
        }
      }

      console.log('');
    });

  return bench;
}

src/cli/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ import { registerSetupCommands } from './commands/setup.js';
6060
import { createPingCommand } from './commands/ping.js';
6161
import { createAuditCommand } from './commands/audit.js';
6262
import { createStatsCommand } from './commands/stats.js';
63+
import { createBenchCommand } from './commands/bench.js';
6364
import chalk from 'chalk';
6465
import * as fs from 'fs';
6566
import * as path from 'path';
@@ -748,6 +749,7 @@ program.addCommand(createDiscoveryCommands());
748749
program.addCommand(createModelCommand());
749750
program.addCommand(createAuditCommand());
750751
program.addCommand(createStatsCommand());
752+
program.addCommand(createBenchCommand());
751753

752754
// Register setup and diagnostic commands
753755
registerSetupCommands(program);
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import { describe, it, expect } from 'vitest';
2+
import {
3+
SWE_BENCH_BASELINES,
4+
HARNESS_TARGETS,
5+
summarizeRuns,
6+
} from '../baselines.js';
7+
import type { HarnessRunMetrics } from '../baselines.js';
8+
9+
// Sanity checks on the static reference data exported by baselines.js.
describe('baselines', () => {
  it('SWE-bench baselines are valid and sorted by resolve rate', () => {
    expect(SWE_BENCH_BASELINES.length).toBeGreaterThanOrEqual(3);

    // Every entry: rate in (0, 1] and an http(s) source link.
    SWE_BENCH_BASELINES.forEach((entry) => {
      expect(entry.resolveRate).toBeGreaterThan(0);
      expect(entry.resolveRate).toBeLessThanOrEqual(1);
      expect(entry.source).toMatch(/^https?:\/\//);
    });

    // Each entry must be no better than the one before it (descending order).
    SWE_BENCH_BASELINES.slice(1).forEach((current, offset) => {
      const previous = SWE_BENCH_BASELINES[offset];
      expect(previous.resolveRate).toBeGreaterThanOrEqual(current.resolveRate);
    });
  });

  it('harness targets are sensible', () => {
    const { planLatencyP95Ms, editSuccessRate, contextTokenBudget } =
      HARNESS_TARGETS;

    expect(planLatencyP95Ms).toBeLessThan(30_000);
    expect(editSuccessRate).toBeGreaterThan(0.5);
    expect(contextTokenBudget).toBeGreaterThan(1000);
  });
});
31+
32+
// Behavioural checks for summarizeRuns: the empty case and a three-run sample.
describe('summarizeRuns', () => {
  it('returns zeros for empty input', () => {
    const summary = summarizeRuns([]);

    expect(summary.totalRuns).toBe(0);
    expect(summary.approvalRate).toBe(0);
  });

  it('computes correct stats from synthetic runs', () => {
    const now = Date.now();

    // All fixtures share the timestamp and the planner/reviewer models;
    // only the per-run fields are spelled out below.
    const makeRun = (
      fields: Omit<
        HarnessRunMetrics,
        'timestamp' | 'plannerModel' | 'reviewerModel'
      >
    ): HarnessRunMetrics => ({
      timestamp: now,
      plannerModel: 'sonnet',
      reviewerModel: 'sonnet',
      ...fields,
    });

    const runs: HarnessRunMetrics[] = [
      makeRun({
        task: 'test-1',
        implementer: 'codex',
        planLatencyMs: 2000,
        totalLatencyMs: 10000,
        iterations: 1,
        approved: true,
        editAttempts: 5,
        editSuccesses: 5,
        editFuzzyFallbacks: 0,
        contextTokens: 4000,
      }),
      makeRun({
        task: 'test-2',
        implementer: 'codex',
        planLatencyMs: 3000,
        totalLatencyMs: 20000,
        iterations: 2,
        approved: true,
        editAttempts: 10,
        editSuccesses: 8,
        editFuzzyFallbacks: 2,
        contextTokens: 5000,
      }),
      makeRun({
        task: 'test-3',
        implementer: 'claude',
        planLatencyMs: 5000,
        totalLatencyMs: 30000,
        iterations: 2,
        approved: false,
        editAttempts: 3,
        editSuccesses: 1,
        editFuzzyFallbacks: 1,
        contextTokens: 6000,
      }),
    ];

    const summary = summarizeRuns(runs);

    expect(summary.totalRuns).toBe(3);
    expect(summary.approvalRate).toBeCloseTo(2 / 3, 2);
    expect(summary.firstPassRate).toBeCloseTo(1 / 3, 2);
    expect(summary.avgIterations).toBeCloseTo(5 / 3, 2);
    expect(summary.editSuccessRate).toBeCloseTo(14 / 18, 2);
    expect(summary.editFuzzyRate).toBeCloseTo(3 / 14, 2);
    expect(summary.avgContextTokens).toBe(5000);

    // Target checks: only the presence of each verdict key is asserted.
    for (const key of ['planLatency', 'editSuccess', 'contextBudget']) {
      expect(summary.passesTargets).toHaveProperty(key);
    }
  });
});

0 commit comments

Comments
 (0)