stackmemoryai
diff --git a/‎src/cli/commands/bench.ts‎
Lines changed: 240 additions & 0 deletions b/‎src/cli/commands/bench.ts‎
Lines changed: 240 additions & 0 deletions
diff --git a/‎src/cli/index.ts‎
Lines changed: 2 additions & 0 deletions b/‎src/cli/index.ts‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/orchestrators/multimodal/__tests__/baselines.test.ts‎
Lines changed: 103 additions & 0 deletions b/‎src/orchestrators/multimodal/__tests__/baselines.test.ts‎
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,240 @@
+/**
+ * Bench Command for StackMemory CLI
+ *
+ * Runs harness benchmarks and compares against online baselines
+ * (SWE-bench Verified, internal targets).
+ */
+
+import { Command } from 'commander';
+import { existsSync, readFileSync, readdirSync } from 'fs';
+import { join } from 'path';
+import {
+  SWE_BENCH_BASELINES,
+  HARNESS_TARGETS,
+  summarizeRuns,
+} from '../../orchestrators/multimodal/baselines.js';
+import type { HarnessRunMetrics } from '../../orchestrators/multimodal/baselines.js';
+
+/**
+ * Load recorded harness runs from `.stackmemory/build/harness-metrics.jsonl`.
+ *
+ * The file is read as JSON Lines: blank lines are ignored and lines that fail
+ * to parse are skipped. Lines that parse to something other than a plain
+ * object (`null`, a number, a string, an array) are skipped as well, because
+ * callers read fields such as `timestamp` off every entry and a `null` entry
+ * would throw there.
+ *
+ * @param projectRoot - Directory that contains the `.stackmemory` folder.
+ * @returns Parsed entries in file order; empty when the file does not exist.
+ */
+function loadRunMetrics(projectRoot: string): HarnessRunMetrics[] {
+  const metricsFile = join(
+    projectRoot,
+    '.stackmemory',
+    'build',
+    'harness-metrics.jsonl'
+  );
+  if (!existsSync(metricsFile)) return [];
+
+  const runs: HarnessRunMetrics[] = [];
+  for (const line of readFileSync(metricsFile, 'utf-8').split('\n')) {
+    if (!line.trim()) continue;
+
+    let parsed: unknown;
+    try {
+      parsed = JSON.parse(line);
+    } catch {
+      // Malformed line: skip it and keep the rest of the file.
+      continue;
+    }
+
+    // Valid JSON is not necessarily a record; keep plain objects only.
+    if (
+      typeof parsed === 'object' &&
+      parsed !== null &&
+      !Array.isArray(parsed)
+    ) {
+      // Individual fields are not validated here.
+      runs.push(parsed as HarnessRunMetrics);
+    }
+  }
+  return runs;
+}
+
+/**
+ * Read spike audit files from `.stackmemory/build`.
+ *
+ * Candidates are entries named `spike-*.json`. They are ordered by file name,
+ * descending, and only the first 20 are opened. A file that cannot be read or
+ * parsed is dropped, so fewer than 20 results may come back.
+ *
+ * @param projectRoot - Directory that contains the `.stackmemory` folder.
+ * @returns One `{ file, data }` pair per parsed audit; empty when the build
+ *   directory does not exist.
+ */
+function loadSpikeAudits(
+  projectRoot: string
+): Array<{ file: string; data: any }> {
+  const buildDir = join(projectRoot, '.stackmemory', 'build');
+  if (!existsSync(buildDir)) {
+    return [];
+  }
+
+  const auditNames: string[] = [];
+  for (const name of readdirSync(buildDir)) {
+    if (name.startsWith('spike-') && name.endsWith('.json')) {
+      auditNames.push(name);
+    }
+  }
+  auditNames.sort();
+  auditNames.reverse();
+
+  const audits: Array<{ file: string; data: any }> = [];
+  // The cap is applied before parsing, exactly like the name filter above.
+  for (const name of auditNames.slice(0, 20)) {
+    try {
+      const data = JSON.parse(readFileSync(join(buildDir, name), 'utf-8'));
+      audits.push({ file: name, data });
+    } catch {
+      // Unreadable or invalid JSON: leave this audit out.
+    }
+  }
+  return audits;
+}
+
+/**
+ * Build the `bench` CLI command.
+ *
+ * The action has three output paths, all written with `console.log`:
+ * - `--baselines`: print only the static SWE-bench baselines and the internal
+ *   harness targets (as JSON when `--json` is also set), then return.
+ * - `--json`: print one JSON document with the summary of the runs inside the
+ *   day window, the baselines, the targets and a few counts.
+ * - default: print a human-readable report of the same data, followed by up
+ *   to five recent spike audits.
+ *
+ * Local data is read relative to the current working directory through
+ * `loadRunMetrics` and `loadSpikeAudits`.
+ *
+ * @returns The configured commander `Command`, ready to be added to a parent
+ *   program.
+ */
+export function createBenchCommand(): Command {
+  const bench = new Command('bench')
+    .description(
+      'Harness benchmarks — compare local runs against SWE-bench baselines'
+    )
+    .option('--json', 'Output as JSON', false)
+    .option('-d, --days <n>', 'Only include runs from last N days', '30')
+    .option('--baselines', 'Show online benchmark baselines only', false)
+    .action(async (options) => {
+      const projectRoot = process.cwd();
+
+      // Baselines-only mode
+      if (options.baselines) {
+        if (options.json) {
+          console.log(
+            JSON.stringify(
+              { baselines: SWE_BENCH_BASELINES, targets: HARNESS_TARGETS },
+              null,
+              2
+            )
+          );
+          return;
+        }
+        console.log('\nOnline Benchmark Baselines (SWE-bench Verified)');
+        console.log('─'.repeat(60));
+        console.log(
+          `${'Agent'.padEnd(20)} ${'Model'.padEnd(20)} ${'Resolve'.padStart(8)}`
+        );
+        console.log('─'.repeat(60));
+        for (const b of SWE_BENCH_BASELINES) {
+          console.log(
+            `${b.agent.padEnd(20)} ${b.model.padEnd(20)} ${(b.resolveRate * 100).toFixed(1).padStart(7)}%`
+          );
+        }
+        console.log('─'.repeat(60));
+
+        console.log('\nInternal Harness Targets');
+        console.log('─'.repeat(60));
+        console.log(
+          `  Plan latency P95:        ${HARNESS_TARGETS.planLatencyP95Ms}ms`
+        );
+        console.log(
+          `  Total latency P95:       ${HARNESS_TARGETS.totalLatencyP95Ms}ms`
+        );
+        console.log(
+          `  First-pass approval:     ${(HARNESS_TARGETS.firstPassApprovalRate * 100).toFixed(0)}%`
+        );
+        console.log(
+          `  Edit success rate:       ${(HARNESS_TARGETS.editSuccessRate * 100).toFixed(0)}%`
+        );
+        console.log(
+          `  Fuzzy fallback rate:     <${(HARNESS_TARGETS.editFuzzyFallbackRate * 100).toFixed(0)}%`
+        );
+        console.log(
+          `  Context token budget:    ${HARNESS_TARGETS.contextTokenBudget}`
+        );
+        console.log('');
+        return;
+      }
+
+      // Load local run data.
+      // `--days` arrives as a string. Anything that does not parse to a
+      // positive number (non-numeric text, 0, a negative value) falls back to
+      // 30, so the cutoff can never land in the future.
+      const parsedDays = Number.parseInt(options.days, 10);
+      const days = parsedDays > 0 ? parsedDays : 30;
+      const cutoff = Date.now() - days * 86400_000;
+      const allRuns = loadRunMetrics(projectRoot);
+      const runs = allRuns.filter((r) => r.timestamp >= cutoff);
+      const audits = loadSpikeAudits(projectRoot);
+
+      if (options.json) {
+        const summary = summarizeRuns(runs);
+        console.log(
+          JSON.stringify(
+            {
+              summary,
+              baselines: SWE_BENCH_BASELINES,
+              targets: HARNESS_TARGETS,
+              runsInWindow: runs.length,
+              totalRuns: allRuns.length,
+              recentAudits: audits.length,
+            },
+            null,
+            2
+          )
+        );
+        return;
+      }
+
+      // Human output
+      console.log(`\nHarness Benchmark Report (last ${days} days)`);
+      console.log('═'.repeat(60));
+
+      if (runs.length === 0) {
+        console.log('\nNo harness runs recorded yet.');
+        console.log('Run: stackmemory build "your task" --execute');
+        console.log('Or:  stackmemory mm-spike -t "task" --execute\n');
+
+        // Still show baselines for context
+        console.log('Online Baselines (SWE-bench Verified):');
+        for (const b of SWE_BENCH_BASELINES.slice(0, 3)) {
+          console.log(
+            `  ${b.agent.padEnd(16)} ${(b.resolveRate * 100).toFixed(1)}%`
+          );
+        }
+        console.log('');
+        return;
+      }
+
+      const summary = summarizeRuns(runs);
+
+      // Harness metrics
+      console.log('\nHarness Metrics:');
+      console.log(`  Total runs:            ${summary.totalRuns}`);
+      console.log(
+        `  Approval rate:         ${(summary.approvalRate * 100).toFixed(1)}%`
+      );
+      console.log(
+        `  First-pass rate:       ${(summary.firstPassRate * 100).toFixed(1)}%`
+      );
+      console.log(
+        `  Avg iterations:        ${summary.avgIterations.toFixed(1)}`
+      );
+      console.log(
+        `  Plan latency (avg):    ${Math.round(summary.avgPlanLatencyMs)}ms`
+      );
+      console.log(
+        `  Plan latency (P95):    ${Math.round(summary.p95PlanLatencyMs)}ms`
+      );
+      console.log(
+        `  Total latency (avg):   ${Math.round(summary.avgTotalLatencyMs)}ms`
+      );
+      console.log(
+        `  Total latency (P95):   ${Math.round(summary.p95TotalLatencyMs)}ms`
+      );
+      console.log(
+        `  Edit success rate:     ${(summary.editSuccessRate * 100).toFixed(1)}%`
+      );
+      console.log(
+        `  Fuzzy fallback rate:   ${(summary.editFuzzyRate * 100).toFixed(1)}%`
+      );
+      console.log(
+        `  Context tokens (avg):  ${Math.round(summary.avgContextTokens)}`
+      );
+
+      // Target comparison
+      console.log('\nTarget Comparison:');
+      const checks = summary.passesTargets;
+      for (const [key, passes] of Object.entries(checks)) {
+        const icon = passes ? 'PASS' : 'FAIL';
+        console.log(`  [${icon}] ${key}`);
+      }
+
+      // Online baseline comparison
+      console.log('\nOnline Baselines (SWE-bench Verified):');
+      for (const b of SWE_BENCH_BASELINES.slice(0, 4)) {
+        console.log(
+          `  ${b.agent.padEnd(16)} ${(b.resolveRate * 100).toFixed(1)}%`
+        );
+      }
+
+      // Recent audits
+      if (audits.length > 0) {
+        console.log(`\nRecent Spike Audits (${audits.length}):`);
+        for (const a of audits.slice(0, 5)) {
+          // Audit files are untyped JSON, so check shapes before calling
+          // string/array methods; a stray number or object must not abort
+          // the whole report.
+          const rawTask: unknown = a.data?.input?.task;
+          const task =
+            typeof rawTask === 'string' && rawTask !== ''
+              ? rawTask
+              : '(unknown)';
+          const iterations: unknown = a.data?.iterations;
+          const approved =
+            Array.isArray(iterations) &&
+            iterations.some((it: any) => it?.critique?.approved);
+          const icon = approved ? 'OK' : '--';
+          console.log(`  [${icon}] ${task.slice(0, 50)}`);
+        }
+      }
+
+      console.log('');
+    });
+
+  return bench;
+}
@@ -60,6 +60,7 @@ import { registerSetupCommands } from './commands/setup.js';
 import { createPingCommand } from './commands/ping.js';
 import { createAuditCommand } from './commands/audit.js';
 import { createStatsCommand } from './commands/stats.js';
+import { createBenchCommand } from './commands/bench.js';
 import chalk from 'chalk';
 import * as fs from 'fs';
 import * as path from 'path';
@@ -748,6 +749,7 @@ program.addCommand(createDiscoveryCommands());
 program.addCommand(createModelCommand());
 program.addCommand(createAuditCommand());
 program.addCommand(createStatsCommand());
+program.addCommand(createBenchCommand());
 
 // Register setup and diagnostic commands
 registerSetupCommands(program);
 
@@ -0,0 +1,103 @@
+import { describe, it, expect } from 'vitest';
+import {
+  SWE_BENCH_BASELINES,
+  HARNESS_TARGETS,
+  summarizeRuns,
+} from '../baselines.js';
+import type { HarnessRunMetrics } from '../baselines.js';
+
+// Sanity checks for the static benchmark tables exported by baselines.
+describe('baselines', () => {
+  it('SWE-bench baselines are valid and sorted by resolve rate', () => {
+    expect(SWE_BENCH_BASELINES.length).toBeGreaterThanOrEqual(3);
+
+    // Each rate is a fraction in (0, 1] and each source is an http(s) URL.
+    SWE_BENCH_BASELINES.forEach((entry) => {
+      expect(entry.resolveRate).toBeGreaterThan(0);
+      expect(entry.resolveRate).toBeLessThanOrEqual(1);
+      expect(entry.source).toMatch(/^https?:\/\//);
+    });
+
+    // Every entry must be no greater than the one before it.
+    SWE_BENCH_BASELINES.slice(1).forEach((entry, idx) => {
+      // `idx` counts from the second element, so it indexes the predecessor.
+      expect(SWE_BENCH_BASELINES[idx].resolveRate).toBeGreaterThanOrEqual(
+        entry.resolveRate
+      );
+    });
+  });
+
+  it('harness targets are sensible', () => {
+    const { planLatencyP95Ms, editSuccessRate, contextTokenBudget } =
+      HARNESS_TARGETS;
+    expect(planLatencyP95Ms).toBeLessThan(30_000);
+    expect(editSuccessRate).toBeGreaterThan(0.5);
+    expect(contextTokenBudget).toBeGreaterThan(1000);
+  });
+});
+
+// Checks the aggregate statistics that summarizeRuns derives from run records.
+describe('summarizeRuns', () => {
+  it('returns zeros for empty input', () => {
+    const { totalRuns, approvalRate } = summarizeRuns([]);
+    expect(totalRuns).toBe(0);
+    expect(approvalRate).toBe(0);
+  });
+
+  it('computes correct stats from synthetic runs', () => {
+    const now = Date.now();
+
+    // All fixtures share a timestamp and the planner/reviewer models; only
+    // the remaining fields vary per run.
+    const makeRun = (
+      fields: Omit<
+        HarnessRunMetrics,
+        'timestamp' | 'plannerModel' | 'reviewerModel'
+      >
+    ): HarnessRunMetrics => ({
+      timestamp: now,
+      plannerModel: 'sonnet',
+      reviewerModel: 'sonnet',
+      ...fields,
+    });
+
+    const runs: HarnessRunMetrics[] = [
+      // Approved on the first iteration, every edit applied cleanly.
+      makeRun({
+        task: 'test-1',
+        implementer: 'codex',
+        planLatencyMs: 2000,
+        totalLatencyMs: 10000,
+        iterations: 1,
+        approved: true,
+        editAttempts: 5,
+        editSuccesses: 5,
+        editFuzzyFallbacks: 0,
+        contextTokens: 4000,
+      }),
+      // Approved on the second iteration.
+      makeRun({
+        task: 'test-2',
+        implementer: 'codex',
+        planLatencyMs: 3000,
+        totalLatencyMs: 20000,
+        iterations: 2,
+        approved: true,
+        editAttempts: 10,
+        editSuccesses: 8,
+        editFuzzyFallbacks: 2,
+        contextTokens: 5000,
+      }),
+      // Never approved.
+      makeRun({
+        task: 'test-3',
+        implementer: 'claude',
+        planLatencyMs: 5000,
+        totalLatencyMs: 30000,
+        iterations: 2,
+        approved: false,
+        editAttempts: 3,
+        editSuccesses: 1,
+        editFuzzyFallbacks: 1,
+        contextTokens: 6000,
+      }),
+    ];
+
+    const s = summarizeRuns(runs);
+    expect(s.totalRuns).toBe(3);
+    expect(s.approvalRate).toBeCloseTo(2 / 3, 2);
+    expect(s.firstPassRate).toBeCloseTo(1 / 3, 2);
+    expect(s.avgIterations).toBeCloseTo(5 / 3, 2);
+    expect(s.editSuccessRate).toBeCloseTo(14 / 18, 2);
+    expect(s.editFuzzyRate).toBeCloseTo(3 / 14, 2);
+    expect(s.avgContextTokens).toBe(5000);
+
+    // Target checks: only the presence of each key is asserted.
+    for (const key of ['planLatency', 'editSuccess', 'contextBudget']) {
+      expect(s.passesTargets).toHaveProperty(key);
+    }
+  });
+});