|
| 1 | +#!/usr/bin/env bun |
| 2 | +/** |
| 3 | + * benchmark-hooks.ts — Replay action-stream through hooks, measure effectiveness |
| 4 | + * |
| 5 | + * Reads ~/.stackmemory/desire-paths/action-stream.jsonl and simulates |
| 6 | + * what each hook would have caught/suggested, producing a before/after report. |
| 7 | + * |
| 8 | + * Usage: bun run scripts/benchmark-hooks.ts [--output docs/benchmark-report.md] |
| 9 | + */ |
| 10 | + |
| 11 | +import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs'; |
| 12 | +import { join } from 'path'; |
| 13 | +import { execSync } from 'child_process'; |
| 14 | + |
// --- Paths & CLI options ---
// Falls back to /tmp when HOME is unset or empty.
const HOME = process.env.HOME || '/tmp';
// JSONL log of recorded tool calls that this script replays.
const STREAM_FILE = join(HOME, '.stackmemory/desire-paths/action-stream.jsonl');
// Position of the `--output` flag in argv, or -1 when the flag was not given.
const OUTPUT_FLAG = process.argv.indexOf('--output');
const outputArg =
  OUTPUT_FLAG !== -1 ? process.argv[OUTPUT_FLAG + 1] : undefined;
if (OUTPUT_FLAG !== -1 && !outputArg) {
  // Fail fast: without this check a trailing `--output` only surfaces as a
  // TypeError from writeFileSync after the whole analysis has already run.
  console.error('Error: --output requires a file path argument');
  process.exit(1);
}
// Report destination: the explicit `--output` value, otherwise
// docs/benchmark-report.md one level above this script's directory.
const OUTPUT_PATH: string =
  outputArg ?? join(import.meta.dir, '..', 'docs', 'benchmark-report.md');
| 22 | + |
/** One recorded tool call, parsed from a single line of the action stream. */
interface Entry {
  /** Timestamp field; not read by this script. */
  ts: string;
  /** Session identifier; calls are grouped and counted per `sid`. */
  sid: string;
  /** Tool name, compared against values such as 'Read', 'Bash', 'WebFetch'. */
  tool: string;
  /** Tool argument: the key for repeated-Read detection, or the Bash command line. */
  target: string;
  /** Optional duration field; not read by this script. */
  dur?: number;
}

/**
 * Minimal shape check for a parsed JSONL line.
 *
 * Only guarantees a non-null object with a string `tool`, which is the one
 * field the later passes dereference without a fallback. The remaining
 * fields are NOT validated.
 */
function isEntry(value: unknown): value is Entry {
  return (
    typeof value === 'object' &&
    value !== null &&
    typeof (value as { tool?: unknown }).tool === 'string'
  );
}

// --- Load action stream ---
if (!existsSync(STREAM_FILE)) {
  // Report a readable error instead of an uncaught ENOENT stack trace.
  console.error(`Action stream not found: ${STREAM_FILE}`);
  process.exit(1);
}
const raw = readFileSync(STREAM_FILE, 'utf-8');
const entries: Entry[] = [];
let skippedLines = 0;
for (const line of raw.split('\n')) {
  if (!line.trim()) continue; // blank lines are not data
  let parsed: unknown;
  try {
    parsed = JSON.parse(line);
  } catch {
    skippedLines++;
    continue;
  }
  // Valid JSON that is not a tool-call record (e.g. `5`, `"x"`, `{}`) would
  // otherwise be counted as an entry and crash later passes.
  if (isEntry(parsed)) entries.push(parsed);
  else skippedLines++;
}
if (skippedLines > 0) {
  console.warn(`Skipped ${skippedLines} malformed line(s) in ${STREAM_FILE}`);
}

console.log(
  `Loaded ${entries.length} entries from ${new Set(entries.map((e) => e.sid)).size} sessions`
);
| 48 | + |
// --- Metrics ---
// Distinct session ids seen in the stream; only the count is kept.
const distinctSessionIds = new Set<string>();
for (const entry of entries) distinctSessionIds.add(entry.sid);

// Accumulator for every counter used in the report. All counters start at
// zero and are filled in by the simulation passes that follow.
const metrics = {
  total: entries.length,
  sessions: distinctSessionIds.size,

  // Dedup analysis (repeated Read calls within one session)
  reads: 0,
  duplicateReads: 0,
  wouldWarn3x: 0,
  wouldStop5x: 0,

  // Auto-route analysis (Bash calls classified by leading command)
  bashTotal: 0,
  bashAsRead: 0,
  bashAsGlob: 0,
  bashAsGrep: 0,
  bashGit: 0,
  bashLegit: 0,

  // Script-suggest analysis
  gitSequences: 0,
  ghRunCalls: 0,
  webFetchCalls: 0,
  webSearchCalls: 0,
  scriptSuggestions: 0,

  // Prewarm analysis
  toolSearchCalls: 0,
  uniqueDeferredTools: new Set<string>(),
  prewarmWouldCatch: 0,
};
| 80 | + |
// --- Dedup simulation ---
// sessionReads[sid][target] = number of Read calls for that target so far.
// Both levels are null-prototype objects: `sid` and `target` come straight
// from the log, so keys such as "constructor" or "__proto__" must not
// resolve to members of Object.prototype.
const sessionReads: Record<string, Record<string, number>> = Object.create(
  null
);

for (const e of entries) {
  if (e.tool !== 'Read') continue;
  metrics.reads++;

  let perTarget = sessionReads[e.sid];
  if (!perTarget) {
    perTarget = Object.create(null) as Record<string, number>;
    sessionReads[e.sid] = perTarget;
  }

  const count = (perTarget[e.target] ?? 0) + 1;
  perTarget[e.target] = count;

  // The 5th and later reads count as hard stops; the 3rd and 4th as warnings.
  if (count >= 5) metrics.wouldStop5x++;
  else if (count >= 3) metrics.wouldWarn3x++;
  // Every read after the first of the same target is a duplicate.
  if (count > 1) metrics.duplicateReads++;
}
| 96 | + |
// --- Auto-route simulation ---
// Ordered classification rules for Bash commands. The first pattern that
// matches the start of the command decides which counter is bumped; a
// command matching none of them is counted as a legitimate Bash call.
const bashRoutes = [
  { pattern: /^(cat|head|tail|sed\s+-n|nl)\s/, counter: 'bashAsRead' },
  { pattern: /^(ls|find)\s/, counter: 'bashAsGlob' },
  { pattern: /^(grep|rg|ag)\s/, counter: 'bashAsGrep' },
  { pattern: /^git\s/, counter: 'bashGit' },
] as const;

for (const entry of entries.filter((candidate) => candidate.tool === 'Bash')) {
  metrics.bashTotal += 1;
  const command = entry.target || '';
  const matched = bashRoutes.find((route) => route.pattern.test(command));
  metrics[matched ? matched.counter : 'bashLegit'] += 1;
}
| 115 | + |
// --- Script-suggest simulation ---
// Group calls by session, preserving their order in the stream.
// Null-prototype object: `sid` comes from the log, and a value such as
// "constructor" would otherwise resolve to Object.prototype and make
// `.push` throw.
const sessionTools: Record<string, Entry[]> = Object.create(null);
for (const e of entries) {
  let calls = sessionTools[e.sid];
  if (!calls) {
    calls = [];
    sessionTools[e.sid] = calls;
  }
  calls.push(e);
}

// Hoisted so the patterns are not rebuilt for every call.
const GIT_COMMAND_RE = /^git\s/;
const GH_RUN_RE = /gh\s+run\s/;

for (const tools of Object.values(sessionTools)) {
  // Length of the current run of consecutive `git ...` Bash calls.
  let gitStreak = 0;
  for (const t of tools) {
    const bashCommand = t.tool === 'Bash' ? t.target || '' : null;

    if (bashCommand !== null && GIT_COMMAND_RE.test(bashCommand)) {
      gitStreak++;
      // Count each run once, at the moment it reaches three commands.
      if (gitStreak === 3) metrics.gitSequences++;
    } else {
      gitStreak = 0;
    }

    if (bashCommand !== null && GH_RUN_RE.test(bashCommand)) {
      metrics.ghRunCalls++;
    }
    if (t.tool === 'WebFetch') metrics.webFetchCalls++;
    if (t.tool === 'WebSearch') metrics.webSearchCalls++;
  }
}

// Total number of times a script suggestion would have fired.
metrics.scriptSuggestions =
  metrics.gitSequences +
  metrics.ghRunCalls +
  metrics.webFetchCalls +
  metrics.webSearchCalls;
| 144 | + |
// --- Prewarm simulation ---
// A tool whose name starts with one of these prefixes is counted as deferred.
const DEFERRED_PREFIXES = [
  'mcp__',
  'TaskCreate',
  'TaskUpdate',
  'TaskGet',
  'WebFetch',
  'WebSearch',
];
// Number of deferred tools the prewarm step is assumed to pre-fetch.
const PREWARM_TOOL_LIMIT = 8;
// Assumed share of sessions that use each pre-warmed tool (~30%).
const PREWARM_SESSION_SHARE = 0.3;

for (const e of entries) {
  // Records without a string `tool` would make startsWith throw; skip them.
  if (typeof e.tool !== 'string') continue;
  if (e.tool === 'ToolSearch') metrics.toolSearchCalls++;
  if (DEFERRED_PREFIXES.some((p) => e.tool.startsWith(p))) {
    metrics.uniqueDeferredTools.add(e.tool);
  }
}

// If we pre-warm up to PREWARM_TOOL_LIMIT tools, how many ToolSearch calls
// would we avoid? Estimate: each deferred tool needs one ToolSearch fetch in
// the sessions that use it.
// NOTE: these are the first distinct deferred tools in encounter order, not
// ranked by call frequency; only the list length feeds the estimate.
const topTools = [...metrics.uniqueDeferredTools].slice(0, PREWARM_TOOL_LIMIT);
// Capped at the number of ToolSearch calls actually observed. The value may
// be fractional; it is rounded where it is displayed.
metrics.prewarmWouldCatch = Math.min(
  metrics.toolSearchCalls,
  topTools.length * metrics.sessions * PREWARM_SESSION_SHARE
);
| 167 | + |
// --- Generate report ---
// Bash calls that a dedicated tool (Read / Glob / Grep) could have handled.
const replaceable =
  metrics.bashAsRead + metrics.bashAsGlob + metrics.bashAsGrep;

/**
 * Formats `part / whole` as a percentage with one decimal place.
 * Returns "0.0" when `whole` is zero so an empty stream does not print
 * "NaN" or "Infinity" in the report.
 */
function pct(part: number, whole: number): string {
  return whole > 0 ? ((part / whole) * 100).toFixed(1) : '0.0';
}

/** Formats a token count as whole thousands, e.g. 12000 -> "12". */
function kTokens(tokens: number): string {
  return (tokens / 1000).toFixed(0);
}

// Estimated tokens saved per hook. The per-call constants are the script's
// own rough estimates: 200 per prevented re-read, 50 per re-routed Bash
// call, 150 per avoided ToolSearch, and 4 calls x 200 per script suggestion.
const dedupSavings = metrics.wouldStop5x * 200;
const autoRouteSavings = replaceable * 50;
const prewarmSavings = metrics.prewarmWouldCatch * 150;
const scriptSavings = metrics.scriptSuggestions * 4 * 200;
const totalSavings =
  dedupSavings + autoRouteSavings + prewarmSavings + scriptSavings;
// Baseline assumes ~200 tokens per recorded tool call.
const baselineTokens = metrics.total * 200;
const dedupCatches = metrics.wouldWarn3x + metrics.wouldStop5x;

const report = `# StackMemory Hook Benchmark Report

> Generated: ${new Date().toISOString()}
> Data: ${metrics.total} tool calls across ${metrics.sessions} sessions

## Baseline (before hooks)

| Metric | Value | % of total |
|--------|------:|----------:|
| Total tool calls | ${metrics.total} | 100% |
| Read calls | ${metrics.reads} | ${pct(metrics.reads, metrics.total)}% |
| Duplicate reads | ${metrics.duplicateReads} | ${pct(metrics.duplicateReads, metrics.total)}% |
| Bash calls | ${metrics.bashTotal} | ${pct(metrics.bashTotal, metrics.total)}% |
| Bash → should be Glob | ${metrics.bashAsGlob} | ${pct(metrics.bashAsGlob, metrics.total)}% |
| Bash → should be Read | ${metrics.bashAsRead} | ${pct(metrics.bashAsRead, metrics.total)}% |
| Bash → should be Grep | ${metrics.bashAsGrep} | ${pct(metrics.bashAsGrep, metrics.total)}% |
| Bash (git) | ${metrics.bashGit} | ${pct(metrics.bashGit, metrics.total)}% |
| Bash (legit) | ${metrics.bashLegit} | ${pct(metrics.bashLegit, metrics.total)}% |
| ToolSearch calls | ${metrics.toolSearchCalls} | ${pct(metrics.toolSearchCalls, metrics.total)}% |

## Hook Effectiveness (projected)

### 1. Dedup Reads (escalation at 3x soft / 5x STOP)
- Would warn (3-4x): **${metrics.wouldWarn3x}** calls
- Would STOP (5x+): **${metrics.wouldStop5x}** calls
- Combined catch: **${dedupCatches}** / ${metrics.reads} reads = **${pct(dedupCatches, metrics.reads)}%**
- Token savings estimate: ~${kTokens(dedupSavings)}K tokens (STOP prevents re-read)

### 2. Auto-Route (Bash → dedicated tools)
- Replaceable calls caught: **${replaceable}** / ${metrics.bashTotal} Bash calls = **${pct(replaceable, metrics.bashTotal)}%**
- Breakdown: ${metrics.bashAsGlob} ls/find → Glob, ${metrics.bashAsRead} cat/head → Read, ${metrics.bashAsGrep} grep → Grep
- Token savings estimate: ~${kTokens(autoRouteSavings)}K tokens (reduced overhead per call)

### 3. Prewarm (pre-fetch deferred tool schemas)
- ToolSearch calls observed: **${metrics.toolSearchCalls}**
- Unique deferred tools: **${metrics.uniqueDeferredTools.size}**
- Top 8 tools cover: ~${topTools.length} tools
- Estimated catches: **~${Math.round(metrics.prewarmWouldCatch)}** avoided ToolSearch calls
- Token savings estimate: ~${kTokens(prewarmSavings)}K tokens

### 4. Script-Suggest (pattern → script)
- Git sequences (3+ cmds): **${metrics.gitSequences}** → git-ops.ts
- gh run calls: **${metrics.ghRunCalls}** → build-status.ts
- WebFetch calls: **${metrics.webFetchCalls}** → web-fetch.ts
- WebSearch calls: **${metrics.webSearchCalls}** → web-search.ts
- Total suggestions would fire: **${metrics.scriptSuggestions}**
- Token savings estimate: ~${kTokens(scriptSavings)}K tokens (each script replaces ~4 calls)

## Summary

| Hook | Catches | Est. token savings |
|------|--------:|------------------:|
| Dedup STOP | ${metrics.wouldStop5x} reads | ~${kTokens(dedupSavings)}K |
| Auto-route | ${replaceable} Bash calls | ~${kTokens(autoRouteSavings)}K |
| Prewarm | ~${Math.round(metrics.prewarmWouldCatch)} ToolSearch | ~${kTokens(prewarmSavings)}K |
| Script-suggest | ${metrics.scriptSuggestions} patterns | ~${kTokens(scriptSavings)}K |
| **Total** | | **~${kTokens(totalSavings)}K** |

Baseline total estimated tokens: ~${kTokens(baselineTokens)}K
Projected waste reduction: **${pct(totalSavings, baselineTokens)}%**
`;
| 232 | + |
// Write report
// Create the directory that will actually contain OUTPUT_PATH. Joining with
// '..' strips the file name lexically (e.g. 'out/r.md' -> 'out'), so a custom
// --output location works, not only the default docs/ directory.
const dir = join(OUTPUT_PATH, '..');
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
writeFileSync(OUTPUT_PATH, report);
console.log(`\nReport written to: ${OUTPUT_PATH}`);

// Console recap of the headline numbers from the report.
const projectedSavedTokens =
  metrics.wouldStop5x * 200 +
  replaceable * 50 +
  metrics.prewarmWouldCatch * 150 +
  metrics.scriptSuggestions * 4 * 200;
console.log(`\n--- Quick Summary ---`);
console.log(
  `Dedup would catch: ${metrics.wouldWarn3x + metrics.wouldStop5x} reads (${metrics.wouldStop5x} hard-stopped)`
);
console.log(`Auto-route would catch: ${replaceable} Bash calls`);
console.log(`Script-suggest would fire: ${metrics.scriptSuggestions} times`);
console.log(
  `Projected total savings: ~${(projectedSavedTokens / 1000).toFixed(0)}K tokens`
);
0 commit comments