From 11746d801a0aacc68c20e7bff10180e25c44cbfc Mon Sep 17 00:00:00 2001 From: Sanjay Singh Date: Wed, 24 Jun 2026 19:14:42 +0200 Subject: [PATCH 1/3] perf(parser): fix O(n^2) JSONL streaming that made Sync take >2min The cold-parse JSONL readers accumulated `pending += chunk` and then re-ran `pending.split('\n')` over the entire growing buffer on every 1MB read. Copilot CLI events.jsonl lines embed tens of MB of inline base64, so a single long line spanning many chunks cost O(n^2) CPU plus heavy GC. CPU-profiling a real 616-session / 132-workspace parse attributed 154.5s of 198s to forEachJsonlLineAsync self-time and 29.8s to GC. Replacing the repeated whole-buffer split with emitChunkLines(), which scans only each freshly read chunk via indexOf, makes the work linear with identical line semantics: phase-2 parse dropped from ~197s to ~14s (~14x) with the same parsed result. Also adds local-only Sync timing telemetry to the VS Code Output channel (sync-timing, worker-phase-timing, cold-parse-breakdown, phase2-attribution) to attribute future regressions. No remote telemetry or network calls. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/core/parser-shared.ts | 45 ++++++++++++++++++++++++++++ src/core/parser-vscode-files.test.ts | 21 +++++++++++++ src/core/parser-vscode-files.ts | 43 +++++++++++++++++++------- src/core/parser-vscode.ts | 8 ++++- src/core/parser-worker-host.test.ts | 24 +++++++++++---- src/core/parser-worker-host.ts | 18 +++++++++++ src/core/parser.ts | 35 ++++++++++++++++++++-- src/webview/panel.ts | 21 ++++++++++++- 8 files changed, 196 insertions(+), 19 deletions(-) diff --git a/src/core/parser-shared.ts b/src/core/parser-shared.ts index c50d14e6..618a3791 100644 --- a/src/core/parser-shared.ts +++ b/src/core/parser-shared.ts @@ -172,9 +172,54 @@ export function maybeForceGc(): void { if (now - lastForcedGcAt < FORCE_GC_MIN_INTERVAL_MS) return; if (process.memoryUsage().rss < FORCE_GC_RSS_THRESHOLD) return; lastForcedGcAt = now; + parseTiming.forcedGc++; try { gc(); } catch { /* gc unavailable */ } } +/* ---- Cold-parse sub-phase attribution (issue #106 follow-up) ---- + * The end-to-end `sync-timing` line proved phase-2 parse is ~99% of sync wall-clock. These + * counters break that phase down by sub-step so a single re-run shows where the time actually + * goes (chat-file parse vs edit-state parse vs CLI events vs forced-GC pauses) instead of guessing. + * Pure measurement: never alters parsed output. Reset per parse run. */ +export interface ParseTiming { + /** Cumulative ms spent in parseSessionFile (read + strip + JSON.parse + request build). */ + chatMs: number; + /** Cumulative ms spent parsing chatEditingSessions state files. */ + editMs: number; + /** Cumulative ms spent parsing CLI events.jsonl files. */ + cliMs: number; + /** Number of chat session files parsed. */ + chatFiles: number; + /** Number of edit-state files parsed. */ + editFiles: number; + /** Number of times a proactive full GC was actually forced. */ + forcedGc: number; +} + +const parseTiming: ParseTiming = { chatMs: 0, editMs: 0, cliMs: 0, chatFiles: 0, editFiles: 0, forcedGc: 0 }; + +/** Accumulate elapsed time (ms) for a cold-parse sub-step. */ +export function addParseTiming(kind: 'chat' | 'edit' | 'cli', ms: number): void { + if (kind === 'chat') { parseTiming.chatMs += ms; parseTiming.chatFiles++; } + else if (kind === 'edit') { parseTiming.editMs += ms; parseTiming.editFiles++; } + else parseTiming.cliMs += ms; +} + +/** Reset all sub-phase counters at the start of a parse run. */ +export function resetParseTiming(): void { + parseTiming.chatMs = 0; + parseTiming.editMs = 0; + parseTiming.cliMs = 0; + parseTiming.chatFiles = 0; + parseTiming.editFiles = 0; + parseTiming.forcedGc = 0; +} + +/** Snapshot the current sub-phase counters for logging. */ +export function getParseTiming(): ParseTiming { + return { ...parseTiming }; +} + export const CODE_BLOCK_RE = /```(\w+)?\n([\s\S]*?)```/g; const MAX_STORED_MESSAGE_CHARS = 16_000; diff --git a/src/core/parser-vscode-files.test.ts b/src/core/parser-vscode-files.test.ts index addd5729..02b9e1ae 100644 --- a/src/core/parser-vscode-files.test.ts +++ b/src/core/parser-vscode-files.test.ts @@ -392,6 +392,17 @@ describe('forEachJsonlLine (sync streaming reader)', () => { expect(lines[1]).toBe('b'); }); }); + + it('emits a single line that spans many read chunks intact', () => { + // A Copilot CLI events.jsonl line can embed tens of MB of inline base64. The reader must scan + // only each freshly read chunk for newlines rather than re-splitting the whole growing buffer, + // or one long line costs O(n^2) CPU (issue #106 follow-up). This line spans ~3.5 read chunks. + const big = 'x'.repeat(READ_CHUNK * 3 + 512); + withTempFile('huge.jsonl', `first\n${big}\nlast\n`, (fp) => { + const lines = collectLines((cb) => forEachJsonlLine(fp, cb)); + expect(lines).toEqual(['first', big, 'last']); + }); + }); }); describe('forEachJsonlLineAsync (async streaming reader)', () => { @@ -404,6 +415,16 @@ describe('forEachJsonlLineAsync (async streaming reader)', () => { expect(lines).toEqual(['one', 'two', 'last']); }); + it('emits a single line that spans many read chunks intact', async () => { + // Guards the same O(n^2) regression as the sync reader: the async cold-parse path streams the + // large CLI events.jsonl files, where a single inline-base64 line can span many chunks. + const big = 'y'.repeat(READ_CHUNK * 3 + 512); + const fp = makeTempFile('huge-async.jsonl', `first\n${big}\nlast\n`); + const lines: string[] = []; + await forEachJsonlLineAsync(fp, (line) => lines.push(line)); + expect(lines).toEqual(['first', big, 'last']); + }); + it('reports final byte progress equal to the file size', async () => { const fp = makeTempFile('a.jsonl', 'one\ntwo\n'); const total = fs.statSync(fp).size; diff --git a/src/core/parser-vscode-files.ts b/src/core/parser-vscode-files.ts index 98b9dcbf..06a2e578 100644 --- a/src/core/parser-vscode-files.ts +++ b/src/core/parser-vscode-files.ts @@ -29,6 +29,37 @@ export function readFile(fpath: string): string { // the longest single line, instead of the whole file (issue #106). const JSONL_READ_CHUNK = 1024 * 1024; +/** + * Emit every complete newline-delimited line contained in `pending + chunk`, returning the new + * trailing partial line (carried into the next chunk). + * + * Scans only the freshly decoded `chunk` with `indexOf`, joining the carried `pending` only when + * the first newline of the chunk is found. The earlier implementation did `pending += chunk` and + * `pending.split('\n')` on every read; when a single JSONL line spans many chunks — Copilot CLI + * `events.jsonl` lines can embed tens of MB of inline base64 — that re-scanned and re-allocated + * the entire growing buffer on each 1 MB read, making one long line cost O(n²) CPU plus heavy GC. + * Searching only the new chunk keeps total work linear in the file size while producing the exact + * same sequence of lines. + */ +function emitChunkLines(pending: string, chunk: string, onLine: (line: string) => void): string { + let nl = chunk.indexOf('\n'); + if (nl < 0) { + // No line boundary in this chunk: accumulate (V8 keeps this as a cheap cons-string, flattened + // once when the line finally completes) and keep reading. + return pending + chunk; + } + // The first newline completes the line carried over from previous chunks. + onLine(pending + chunk.slice(0, nl)); + let start = nl + 1; + nl = chunk.indexOf('\n', start); + while (nl >= 0) { + onLine(chunk.slice(start, nl)); + start = nl + 1; + nl = chunk.indexOf('\n', start); + } + return chunk.slice(start); +} + /** * Invoke `onLine` for each newline-delimited line in a (possibly very large) JSONL file, reading * the file in fixed-size chunks rather than loading it whole. @@ -66,11 +97,7 @@ export function forEachJsonlLine(fpath: string, onLine: (line: string) => void): let pending = ''; let bytesRead: number; while ((bytesRead = fs.readSync(fd, buf, 0, JSONL_READ_CHUNK, null)) > 0) { - pending += decoder.write(buf.subarray(0, bytesRead)); - const parts = pending.split('\n'); - // The last element is an incomplete line (or '') — carry it to the next chunk. - pending = parts.pop() ?? ''; - for (const line of parts) onLine(line); + pending = emitChunkLines(pending, decoder.write(buf.subarray(0, bytesRead)), onLine); } pending += decoder.end(); if (pending.length > 0) onLine(pending); @@ -126,11 +153,7 @@ export async function forEachJsonlLineAsync( while ((bytesRead = fs.readSync(fd, buf, 0, JSONL_READ_CHUNK, null)) > 0) { totalRead += bytesRead; sinceYield += bytesRead; - pending += decoder.write(buf.subarray(0, bytesRead)); - const parts = pending.split('\n'); - // The last element is an incomplete line (or '') — carry it to the next chunk. - pending = parts.pop() ?? ''; - for (const line of parts) onLine(line); + pending = emitChunkLines(pending, decoder.write(buf.subarray(0, bytesRead)), onLine); if (sinceYield >= JSONL_YIELD_BYTES) { sinceYield = 0; onProgress?.(totalRead, totalBytes); diff --git a/src/core/parser-vscode.ts b/src/core/parser-vscode.ts index 6795d9e9..dfabe5d6 100644 --- a/src/core/parser-vscode.ts +++ b/src/core/parser-vscode.ts @@ -8,7 +8,7 @@ import * as fs from 'fs'; import * as path from 'path'; import { Session } from './types'; -import { createSession, detectDevcontainerFromRequests, ParseContext, prefetchCache, stripSingleSession, maybeForceGc } from './parser-shared'; +import { createSession, detectDevcontainerFromRequests, ParseContext, prefetchCache, stripSingleSession, maybeForceGc, addParseTiming } from './parser-shared'; import { debugCore, warnCore } from './log'; import { canonicalizeReasoningEffort } from './helpers'; import { parseRawRequest, normalizeSessionMode, type RawRequest } from './parser-vscode-request'; @@ -353,7 +353,9 @@ export async function processWorkspaceEntryAsync( let completed = 0; for (let i = 0; i < chatFiles.length; i++) { + const tChat = Date.now(); const session = parseSessionFile(chatFiles[i], wsId, wsName, harness, customInstructionsBytes); + addParseTiming('chat', Date.now() - tChat); if (session) { // Strip heavy text the moment a session is parsed so a workspace with many large // sessions can't accumulate its full text before the workspace finishes (issue #106). @@ -385,7 +387,9 @@ export async function processWorkspaceEntryAsync( } const eventsFile = path.join(entryPath, 'events.jsonl'); + const tCli = Date.now(); const cliSession = parseCLIEventsFile(eventsFile, wsId, wsName, customInstructionsBytes); + addParseTiming('cli', Date.now() - tCli); if (cliSession) { stripSingleSession(cliSession); sessions.push(cliSession); @@ -399,7 +403,9 @@ export async function processWorkspaceEntryAsync( } for (let i = 0; i < editStateFiles.length; i++) { + const tEdit = Date.now(); parseEditStateFile(editStateFiles[i], editLocIndex); + addParseTiming('edit', Date.now() - tEdit); completed++; if (shouldReportChunk(i, editStateFiles.length, editEvery)) { onProgress?.({ diff --git a/src/core/parser-worker-host.test.ts b/src/core/parser-worker-host.test.ts index d5b3fb2f..2680af7f 100644 --- a/src/core/parser-worker-host.test.ts +++ b/src/core/parser-worker-host.test.ts @@ -11,8 +11,12 @@ import { EventEmitter } from 'events'; import { describe, it, expect, vi } from 'vitest'; import { parseAllLogsViaWorker, isRetryableWorkerError } from './parser-worker-host'; import { createSession, createRequest } from './parser-shared'; +import { runtimeDebug } from './runtime-debug'; import type { WorkerChunkPayload, WorkerDonePayload } from './parse-chunking'; +vi.mock('./runtime-debug', () => ({ runtimeDebug: vi.fn() })); +const runtimeDebugMock = vi.mocked(runtimeDebug); + /** Minimal stand-in for the forked child process: an EventEmitter with spied send/kill. */ class FakeChild extends EventEmitter { send = vi.fn(); @@ -166,14 +170,24 @@ describe('parseAllLogsViaWorker IPC orchestration', () => { expect(result.parseWarnings?.skippedFiles).toBe(1); }); - it('does not retry after a non-retryable failure', async () => { + it('emits a per-phase sync-timing summary on done', async () => { + runtimeDebugMock.mockClear(); const { fork, children } = queuedFork(); const promise = parseAllLogsViaWorker(['dir'], undefined, { fork: fork as never }); + const child = children[0]; - children[0].emit('message', { type: 'error', message: 'parse worker timeout (10m)' }); + child.emit('message', { type: 'progress', progress: { phase: 1, pct: 5, detail: 'fingerprints' } }); + child.emit('message', { type: 'progress', progress: { phase: 2, pct: 40, detail: 'cold parse' } }); + child.emit('message', doneMessage()); + await promise; - await expect(promise).rejects.toThrow('parse worker timeout (10m)'); - // No second attempt was forked. - expect(children).toHaveLength(1); + const timingCall = runtimeDebugMock.mock.calls.find( + (call) => call[0] === 'parser' && call[1] === 'worker-phase-timing', + ); + expect(timingCall).toBeDefined(); + const detail = typeof timingCall?.[2] === 'string' ? timingCall[2] : ''; + expect(detail).toMatch(/totalMs=\d+/); + expect(detail).toMatch(/p1=\d+ms/); + expect(detail).toMatch(/p2=\d+ms/); }); }); diff --git a/src/core/parser-worker-host.ts b/src/core/parser-worker-host.ts index 62efa1a3..47cdeca5 100644 --- a/src/core/parser-worker-host.ts +++ b/src/core/parser-worker-host.ts @@ -90,6 +90,16 @@ export async function parseAllLogsViaWorker( let lastWorkspaceLogged = 0; let settled = false; + // Per-phase wall-clock accounting for the local sync-timing summary (issue #106 follow-up). + const attemptStart = Date.now(); + let phaseStart = attemptStart; + const phaseDurations = new Map(); + const recordPhaseEnd = (phase: number): void => { + if (phase < 0) return; + phaseDurations.set(phase, (phaseDurations.get(phase) ?? 0) + (Date.now() - phaseStart)); + phaseStart = Date.now(); + }; + // Chunked-IPC assembler (issue #106, S1). Declared per-attempt so a retry starts fresh. const assembler = new ChunkAssembler(); @@ -113,6 +123,7 @@ export async function parseAllLogsViaWorker( child.on('message', (msg: { type: 'progress'; progress: LoadProgress } | { type: 'chunk'; seq: number; payload: WorkerChunkPayload } | { type: 'done'; payload: WorkerDoneMessagePayload } | { type: 'error'; message?: string }) => { if (msg.type === 'progress') { if (msg.progress.phase !== lastPhase) { + recordPhaseEnd(lastPhase); lastPhase = msg.progress.phase; runtimeDebug('parser', 'child-progress-phase', `attempt=${attempt} phase=${msg.progress.phase} detail=${msg.progress.detail || ''}`); } @@ -143,8 +154,15 @@ export async function parseAllLogsViaWorker( } if (msg.type === 'done') { + recordPhaseEnd(lastPhase); const assembled = assembler.finish(msg.payload); runtimeDebug('parser', 'child-done', `attempt=${attempt} chunks=${assembler.chunkCount} workspaces=${msg.payload.workspaces.length} sessions=${assembled.sessions.length}`); + const phaseBreakdown = [...phaseDurations.entries()] + .sort((a, b) => a[0] - b[0]) + .map(([phase, ms]) => `p${phase}=${ms}ms`) + .join(' '); + runtimeDebug('parser', 'worker-phase-timing', + `attempt=${attempt} totalMs=${Date.now() - attemptStart} ${phaseBreakdown}`); logParseWarnings(msg.payload.parseWarnings); finish(() => { const result: ParseResult = { diff --git a/src/core/parser.ts b/src/core/parser.ts index 8a37c238..fd0c90a2 100644 --- a/src/core/parser.ts +++ b/src/core/parser.ts @@ -8,7 +8,7 @@ import * as path from 'path'; import * as fs from 'fs'; import { Workspace } from './types'; -import { ParseContext, prefetchCache, stripSingleSession, maybeForceGc, recordFailedFile, resetParseWarnings } from './parser-shared'; +import { ParseContext, prefetchCache, stripSingleSession, maybeForceGc, recordFailedFile, resetParseWarnings, resetParseTiming, getParseTiming } from './parser-shared'; import { getMemoryCache, setMemoryCache, computeDirMetasAsync, loadCacheData, saveCacheData, findStaleDirs, clearCache, stripSessionsForMemory } from './cache'; import type { DirMetas, ParseResult, SessionSource } from './cache'; import { findVsCodeDirs, scanVsCodeDirs, processWorkspaceEntry, processWorkspaceEntryAsync, harnessFromPath } from './parser-vscode'; @@ -16,6 +16,7 @@ import { computeSessionTotals, createRunningTotals, type SessionTotals } from '. import { findXcodeDirs, parseXcodeDatabases, parseXcodeDatabasesAsync } from './parser-xcode'; import { collectExternalHarnessesAsync, collectExternalHarnessesSync, EXTERNAL_HARNESS_SET } from './parser-harnesses'; import { warnCore } from './log'; +import { runtimeDebug } from './runtime-debug'; export type { ParseResult }; export { clearCache }; @@ -357,9 +358,17 @@ async function processWorkspaces( ? Math.min(COLD_PARSE_MAX_PREFETCH_FILES, MAX_PREFETCH_FILES) : MAX_PREFETCH_FILES; const effectiveMaxPrefetchBytes = isColdParse ? COLD_PARSE_MAX_PREFETCH_BYTES : MAX_PREFETCH_BYTES; + const tWorkList = Date.now(); const work = await buildWorkspaceWorkList(entries); + const workListMs = Date.now() - tWorkList; const planItems = buildWorkspacePlan(work); + // Phase-2 gap attribution (issue #106 follow-up). The cold-parse-breakdown showed the actual + // file parsing is a small fraction of phase 2; these accumulators reveal where the rest of the + // wall-clock goes (workspace listing vs prefetch I/O wait vs per-workspace processing). + let prefetchWaitMs = 0; + let entryWallMs = 0; + // Build the workspace-level loading plan in processing order. if (onProgress && planItems.length > 0) { onProgress({ @@ -386,7 +395,11 @@ async function processWorkspaces( const batch = work.slice(i, i + BATCH_SIZE); const nextBatch = work.slice(i + BATCH_SIZE, i + BATCH_SIZE * 2); - if (i === 0) await prefetchBatch(batch, effectiveMaxPrefetch, effectiveMaxPrefetchBytes); + if (i === 0) { + const tPre = Date.now(); + await prefetchBatch(batch, effectiveMaxPrefetch, effectiveMaxPrefetchBytes); + prefetchWaitMs += Date.now() - tPre; + } const nextPrefetch = nextBatch.length > 0 ? prefetchBatch(nextBatch, effectiveMaxPrefetch, effectiveMaxPrefetchBytes) : Promise.resolve(); @@ -414,6 +427,7 @@ async function processWorkspaces( recordFailedFile('parser', logsDir, e); } const elapsed = Date.now() - start; + entryWallMs += elapsed; // Incrementally compute stats from newly added sessions foldNewSessions(); @@ -436,7 +450,9 @@ async function processWorkspaces( ); } + const tWait = Date.now(); await nextPrefetch; + prefetchWaitMs += Date.now() - tWait; await yieldToLoop(); // Backstop: reclaim any transient batch garbage before reading the next batch, keeping RSS // under Electron's ~2GB allocator OOM ceiling during large cold parses (issue #106). @@ -445,6 +461,9 @@ async function processWorkspaces( } finally { prefetchCache.clear(); } + runtimeDebug('parser', 'phase2-attribution', + `workspaces=${work.length} workListMs=${workListMs} prefetchWaitMs=${prefetchWaitMs} ` + + `entryWallMs=${entryWallMs} totalMs=${Date.now() - tWorkList}`); } async function collectXcode( @@ -532,6 +551,16 @@ export async function parseAllLogsAsyncDetailed( // Clear any warnings from a previous parse in this process (the worker is fresh per run, but // the in-process path can be invoked repeatedly). resetParseWarnings(); + resetParseTiming(); + + // Emit the cold-parse sub-phase attribution once, just before returning, so a single run shows + // exactly where phase-2 wall-clock went (issue #106 follow-up). Pure measurement. + const logParseBreakdown = (mode: string): void => { + const t = getParseTiming(); + runtimeDebug('parser', 'cold-parse-breakdown', + `mode=${mode} chatMs=${t.chatMs} editMs=${t.editMs} cliMs=${t.cliMs} ` + + `chatFiles=${t.chatFiles} editFiles=${t.editFiles} forcedGc=${t.forcedGc}`); + }; report({ phase: 1, detail: 'Computing directory fingerprints' }); await yieldToLoop(); @@ -620,6 +649,7 @@ export async function parseAllLogsAsyncDetailed( stripSessionsForMemory(result.sessions); setMemoryCache(result, currentMetas); saveCacheData(result, currentMetas); + logParseBreakdown('incremental'); return { result, dirMetas: currentMetas }; } @@ -642,6 +672,7 @@ export async function parseAllLogsAsyncDetailed( stripSessionsForMemory(result.sessions); setMemoryCache(result, currentMetas); saveCacheData(result, currentMetas); + logParseBreakdown('cold'); return { result, dirMetas: currentMetas }; } diff --git a/src/webview/panel.ts b/src/webview/panel.ts index d780d5fb..5ccf0275 100644 --- a/src/webview/panel.ts +++ b/src/webview/panel.ts @@ -203,6 +203,8 @@ export class DashboardPanel { this.updateSidebarStats(); this.dataReady = true; safePost({ type: 'dataReady', currentWorkspace: vscode.workspace.name || '', ...this.skippedCounts() }); + runtimeDebug('panel', 'sync-timing', + `result=warm-cache totalMs=${Date.now() - t0} sessions=${this.parseResult.sessions.length}`); return; } @@ -210,7 +212,9 @@ export class DashboardPanel { await flush(); if (this.disposed) return; + const tDiscover = Date.now(); const dirs = findLogsDirs(); + const discoverMs = Date.now() - tDiscover; const hasExternal = hasExternalHarnessSources(); runtimeDebug('panel', 'logs-dirs-found', `count=${dirs.length} external=${hasExternal}`); // External harnesses (Claude Code, Codex, OpenCode) are collected by the @@ -225,7 +229,9 @@ export class DashboardPanel { return; } + const tParse = Date.now(); this.parseResult = await parseAllLogsViaWorker(dirs, progress => sendProgress(progress)); + const parseMs = Date.now() - tParse; if (this.disposed) return; runtimeDebug('panel', 'parse-complete', `sessions=${this.parseResult.sessions.length} workspaces=${this.parseResult.workspaces.size}`); const sessionCount = this.parseResult.sessions.length; @@ -234,7 +240,9 @@ export class DashboardPanel { await flush(); if (this.disposed) return; + const tAnalyzer = Date.now(); this.analyzer = new Analyzer(this.parseResult.sessions, this.parseResult.editLocIndex, this.parseResult.workspaces); + const analyzerMs = Date.now() - tAnalyzer; runtimeDebug('panel', 'analyzer-built', `elapsedMs=${Date.now() - t0}`); sendProgress({ phase: 5, detail: 'Ready', pct: 100, sessions: sessionCount }); @@ -248,15 +256,26 @@ export class DashboardPanel { // are handled immediately instead of being queued behind warmUp(). this.dataReady = true; + const dataReadyMs = Date.now() - t0; safePost({ type: 'dataReady', currentWorkspace: vscode.workspace.name || '', ...this.skippedCounts() }); - runtimeDebug('panel', 'data-ready-sent', `elapsedMs=${Date.now() - t0}`); + runtimeDebug('panel', 'data-ready-sent', `elapsedMs=${dataReadyMs}`); + const tWarmUp = Date.now(); try { await this.analyzer.warmUp(); } catch (error) { runtimeDebug('panel', 'warmUp-failed', error); } + const warmUpMs = Date.now() - tWarmUp; runtimeDebug('panel', 'warmUp-done', `elapsedMs=${Date.now() - t0}`); + + // Local-only sync timing summary (issue #106 follow-up). Surfaces a single, parseable + // breakdown of where a cold Sync spends wall-clock time in the "AI Engineer Coach" + // output channel. Never leaves the machine — routed through the same runtimeDebug hook. + runtimeDebug('panel', 'sync-timing', + `result=cold totalMs=${Date.now() - t0} dataReadyMs=${dataReadyMs} ` + + `discoverMs=${discoverMs} parseMs=${parseMs} analyzerMs=${analyzerMs} warmUpMs=${warmUpMs} ` + + `sessions=${sessionCount} dirs=${dirs.length}`); if (this.disposed) return; for (const message of this.pendingMessages) { From 171781eccfddcd7b5de8ea14c68aa335078a520d Mon Sep 17 00:00:00 2001 From: Sanjay Singh Date: Thu, 25 Jun 2026 06:53:09 +0200 Subject: [PATCH 2/3] test(parity): add cross-tree parsed-output parity check Adds `npm run parity`, which builds the parser from two source trees (this branch and a main checkout), runs both against the same real local logs, and compares a canonical NDJSON serialization of the ParseResult by SHA-256. This is how we prove the O(n^2) JSONL streaming fix changed performance only, not output: there is no committed golden snapshot, and the harness itself lives only on the branch, so it diffs branch-vs-main builds directly instead of against a baseline. Verified: both trees emit an identical sha256 (sessions=616 workspaces=132 editLoc=1771 sources=599). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 2 + package.json | 1 + scripts/parity-parse-entry.ts | 85 +++++++++++++++++++++++ scripts/parse-parity.ts | 122 ++++++++++++++++++++++++++++++++++ 4 files changed, 210 insertions(+) create mode 100644 scripts/parity-parse-entry.ts create mode 100644 scripts/parse-parity.ts diff --git a/.gitignore b/.gitignore index 0f38381a..529efec3 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ docs/proposals/ local/ test_output.txt scripts/verify-counts.ts +scripts/.parity-out/ +scripts/.parity-tmp-entry.ts marketing/ # Playwright test results diff --git a/package.json b/package.json index 43768091..9a6b7965 100644 --- a/package.json +++ b/package.json @@ -303,6 +303,7 @@ "analyze:token-inventory": "npx tsx scripts/token-inventory.ts --save", "analyze:context-composition": "npx tsx scripts/explore-context-composition.ts", "analyze:e2e-real-data": "npx tsx scripts/e2e-real-data.ts", + "parity": "npx tsx scripts/parse-parity.ts", "analyze:catalog-fetch": "node scripts/test-catalog-fetch.mjs", "smoke-test": "node scripts/smoke-test.mjs", "gen-synth-logs": "npx tsx scripts/generate-synthetic-logs.ts", diff --git a/scripts/parity-parse-entry.ts b/scripts/parity-parse-entry.ts new file mode 100644 index 00000000..76cf36d1 --- /dev/null +++ b/scripts/parity-parse-entry.ts @@ -0,0 +1,85 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +/* + * Parse-parity entry. Bundled and run once per source tree by scripts/parse-parity.ts. + * + * It parses the real local logs with `parseAllLogs` and writes a deterministic, canonical + * representation of the ParseResult (stable key order, sorted collections) as NDJSON, plus a + * SHA-256 of that text. Two trees that produce the same SHA produced byte-for-byte identical + * parsed output — the check this repo uses to prove a refactor (e.g. the O(n^2) JSONL streaming + * fix) changed performance only, not the parsed result. + * + * Usage (driven by parse-parity.ts, not run directly): + * node .cjs + */ + +import * as crypto from 'crypto'; +import * as fs from 'fs'; +import { findLogsDirs, parseAllLogs } from '../src/core/parser'; + +/** Recursively sort object keys so serialization is independent of insertion order. */ +function sortValue(v: unknown): unknown { + if (Array.isArray(v)) return v.map(sortValue); + if (v && typeof v === 'object') { + const out: Record = {}; + for (const k of Object.keys(v as Record).sort()) { + out[k] = sortValue((v as Record)[k]); + } + return out; + } + return v; +} + +function stable(v: unknown): string { + return JSON.stringify(sortValue(v)); +} + +function main(): void { + const outPath = process.argv[2]; + if (!outPath) throw new Error('usage: .cjs '); + + const dirs = findLogsDirs(); + const result = parseAllLogs(dirs); + + const lines: string[] = []; + + // Workspaces — keyed and sorted by id. + for (const id of [...result.workspaces.keys()].sort()) { + lines.push(`WS\t${id}\t${stable(result.workspaces.get(id))}`); + } + + // Sessions — sorted by sessionId so readdir ordering between trees cannot cause a false diff. + const sessions = [...result.sessions].sort((a, b) => + a.sessionId < b.sessionId ? -1 : a.sessionId > b.sessionId ? 1 : 0, + ); + for (const s of sessions) lines.push(`SESSION\t${s.sessionId}\t${stable(s)}`); + + // Edit-location index — Map>. + for (const reqId of [...result.editLocIndex.keys()].sort()) { + const inner = result.editLocIndex.get(reqId)!; + const obj: Record = {}; + for (const uri of [...inner.keys()].sort()) obj[uri] = inner.get(uri)!; + lines.push(`EDIT\t${reqId}\t${stable(obj)}`); + } + + // Session-source index. + for (const sid of [...result.sessionSourceIndex.keys()].sort()) { + lines.push(`SRC\t${sid}\t${stable(result.sessionSourceIndex.get(sid))}`); + } + + const body = lines.join('\n'); + const sha = crypto.createHash('sha256').update(body).digest('hex'); + fs.writeFileSync(outPath, body); + fs.writeFileSync(`${outPath}.sha256`, sha); + + // eslint-disable-next-line no-console + console.log( + ` sessions=${result.sessions.length} workspaces=${result.workspaces.size} ` + + `editLoc=${result.editLocIndex.size} sources=${result.sessionSourceIndex.size} sha256=${sha}`, + ); +} + +main(); diff --git a/scripts/parse-parity.ts b/scripts/parse-parity.ts new file mode 100644 index 00000000..29360211 --- /dev/null +++ b/scripts/parse-parity.ts @@ -0,0 +1,122 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +/* + * Parsed-output parity check between two source trees (typically this branch vs. main). + * + * Why this exists: a performance refactor of the parser (e.g. the O(n^2) JSONL streaming fix in + * parser-vscode-files.ts) must change speed only, not the parsed result. There is no golden + * snapshot in the repo, and the harness itself only lives on the feature branch — so instead of + * comparing against a committed baseline, it builds the parser from BOTH trees and diffs their + * canonical output against the same real local logs. + * + * How: for each tree it copies scripts/parity-parse-entry.ts in, esbuild-bundles it against that + * tree's own src/, runs it to emit canonical NDJSON + a SHA-256, then compares the two hashes. + * Equal hash => byte-for-byte identical parsed output. + * + * Run: + * npm run parity # compares branch vs. the default main checkout + * npm run parity -- --main C:\path\to\main-checkout + * + * Notes: + * - The "main" tree must be a checkout of this repo with node_modules installed. + * - The main side runs the unoptimized parser, so a large real log set can take a couple of + * minutes there — that slowness is exactly what the branch fixes. + * - Output artifacts land in scripts/.parity-out/ (gitignored). + */ + +import { spawnSync } from 'child_process'; +import { build } from 'esbuild'; +import * as fs from 'fs'; +import * as path from 'path'; + +const branchRoot = path.resolve(__dirname, '..'); + +function parseMainRoot(): string { + const args = process.argv.slice(2); + const i = args.indexOf('--main'); + if (i >= 0 && args[i + 1]) return path.resolve(args[i + 1]); + if (process.env.PARITY_MAIN) return path.resolve(process.env.PARITY_MAIN); + return 'C:\\dev\\AI-Engineering-Coach'; +} + +const mainRoot = parseMainRoot(); +const outDir = path.join(branchRoot, 'scripts', '.parity-out'); +const entrySrc = fs.readFileSync(path.join(branchRoot, 'scripts', 'parity-parse-entry.ts'), 'utf8'); + +async function runFor(label: string, root: string): Promise<{ sha: string; ndjson: string }> { + if (!fs.existsSync(path.join(root, 'src', 'core', 'parser.ts'))) { + throw new Error(`${label}: no src/core/parser.ts under ${root}`); + } + // Inject the identical entry + canonicalizer into the target tree so ONLY the parser differs. + const tmpEntry = path.join(root, 'scripts', '.parity-tmp-entry.ts'); + fs.writeFileSync(tmpEntry, entrySrc); + const bundle = path.join(outDir, `${label}.cjs`); + const ndjson = path.join(outDir, `${label}.ndjson`); + try { + await build({ + entryPoints: [tmpEntry], + bundle: true, + platform: 'node', + format: 'cjs', + outfile: bundle, + absWorkingDir: root, // resolve ../src and node_modules from the target tree + logLevel: 'error', + }); + // eslint-disable-next-line no-console + console.log(`\n[${label}] parsing real logs (${root})...`); + const res = spawnSync(process.execPath, [bundle, ndjson], { stdio: 'inherit' }); + if (res.status !== 0) throw new Error(`${label}: parse exited with code ${res.status}`); + const sha = fs.readFileSync(`${ndjson}.sha256`, 'utf8').trim(); + return { sha, ndjson }; + } finally { + fs.rmSync(tmpEntry, { force: true }); + } +} + +function reportFirstDiff(branchNdjson: string, mainNdjson: string): void { + const bl = fs.readFileSync(branchNdjson, 'utf8').split('\n'); + const ml = fs.readFileSync(mainNdjson, 'utf8').split('\n'); + const n = Math.max(bl.length, ml.length); + for (let i = 0; i < n; i++) { + if (bl[i] !== ml[i]) { + const cut = (s: string | undefined): string => (s === undefined ? '' : s.slice(0, 400)); + // eslint-disable-next-line no-console + console.log(`\nFirst difference at line ${i + 1}:`); + // eslint-disable-next-line no-console + console.log(` branch: ${cut(bl[i])}`); + // eslint-disable-next-line no-console + console.log(` main: ${cut(ml[i])}`); + break; + } + } + // eslint-disable-next-line no-console + console.log(`\n(branch lines=${bl.length}, main lines=${ml.length})`); +} + +async function run(): Promise { + fs.mkdirSync(outDir, { recursive: true }); + /* eslint-disable no-console */ + console.log('Parse-parity check'); + console.log(` branch tree: ${branchRoot}`); + console.log(` main tree: ${mainRoot}`); + + const branch = await runFor('branch', branchRoot); + const main = await runFor('main', mainRoot); + + console.log(`\nbranch sha256 = ${branch.sha}`); + console.log(`main sha256 = ${main.sha}`); + + if (branch.sha === main.sha) { + console.log('\nPARITY OK — parsed output is byte-for-byte identical across both trees.'); + process.exit(0); + } + console.log('\nPARITY FAILED — parsed output differs between the two trees.'); + reportFirstDiff(branch.ndjson, main.ndjson); + process.exit(1); + /* eslint-enable no-console */ +} + +void run(); From be4fc35137c5284273c8c44866c6ff43c69acd9a Mon Sep 17 00:00:00 2001 From: Sanjay Singh Date: Thu, 25 Jun 2026 10:50:02 +0200 Subject: [PATCH 3/3] fix(parity,parser): address review feedback on PR #161 - parse-parity: replace the hard-coded Windows-only main checkout path with git-based discovery (git worktree list -> the main branch's working tree), so `npm run parity` works on any OS and checkout layout, and fails with an actionable message when no main checkout is found. - parser-shared(maybeForceGc): increment parseTiming.forcedGc only after gc() returns, so the counter reflects GCs actually forced rather than attempted. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- scripts/parse-parity.ts | 27 ++++++++++++++++++++++++++- src/core/parser-shared.ts | 8 ++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/scripts/parse-parity.ts b/scripts/parse-parity.ts index 29360211..6460a83e 100644 --- a/scripts/parse-parity.ts +++ b/scripts/parse-parity.ts @@ -34,12 +34,37 @@ import * as path from 'path'; const branchRoot = path.resolve(__dirname, '..'); +/** Locate the `main` branch's working tree via `git worktree list`. Works regardless of where the + * main checkout lives on disk (it need not be a sibling of this worktree) and is cross-platform. */ +function findMainWorktree(): string | undefined { + const res = spawnSync('git', ['worktree', 'list', '--porcelain'], { + cwd: branchRoot, + encoding: 'utf8', + }); + if (res.status !== 0 || !res.stdout) return undefined; + let current: string | undefined; + for (const line of res.stdout.split('\n')) { + if (line.startsWith('worktree ')) current = line.slice('worktree '.length).trim(); + else if (line.trim() === 'branch refs/heads/main' && current) return path.resolve(current); + } + return undefined; +} + function parseMainRoot(): string { const args = process.argv.slice(2); const i = args.indexOf('--main'); if (i >= 0 && args[i + 1]) return path.resolve(args[i + 1]); if (process.env.PARITY_MAIN) return path.resolve(process.env.PARITY_MAIN); - return 'C:\\dev\\AI-Engineering-Coach'; + + // Default: discover the main worktree from git rather than hard-coding a path, so this runs on + // any OS and any checkout layout. Fail with an actionable message if it cannot be found. + const fromGit = findMainWorktree(); + if (fromGit && fs.existsSync(path.join(fromGit, 'src', 'core', 'parser.ts'))) return fromGit; + + throw new Error( + 'parse-parity: could not locate a main checkout to compare against. Pass --main , set ' + + 'PARITY_MAIN, or add a worktree for the main branch (git worktree add main).', + ); } const mainRoot = parseMainRoot(); diff --git a/src/core/parser-shared.ts b/src/core/parser-shared.ts index 618a3791..9de2126a 100644 --- a/src/core/parser-shared.ts +++ b/src/core/parser-shared.ts @@ -172,8 +172,12 @@ export function maybeForceGc(): void { if (now - lastForcedGcAt < FORCE_GC_MIN_INTERVAL_MS) return; if (process.memoryUsage().rss < FORCE_GC_RSS_THRESHOLD) return; lastForcedGcAt = now; - parseTiming.forcedGc++; - try { gc(); } catch { /* gc unavailable */ } + // Count only a GC that actually ran, so `forcedGc` matches its documented meaning even if the + // engine's gc() ever throws (the !gc guard above already handles the unavailable case). + try { + gc(); + parseTiming.forcedGc++; + } catch { /* gc unavailable */ } } /* ---- Cold-parse sub-phase attribution (issue #106 follow-up) ----