From b4e5de07a6adcd5ad3bd6172b27be0753cddb373 Mon Sep 17 00:00:00 2001 From: Griffen Fargo <3642037+gfargo@users.noreply.github.com> Date: Thu, 14 May 2026 08:36:14 -0400 Subject: [PATCH] refactor(parser): structural parser registry foundation (#933 phase 1.0) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First piece of the tree-sitter integration. Introduces the abstraction layer that future phases plug into; no behavior change to the existing regex extractors. Why a registry instead of inline dispatch: the upgrade path (regex → tree-sitter → tree-sitter-with-better-grammar) needs a shape that supports "multiple parsers per language, tried in order, with graceful fallback on error". Hard-coded dispatch would need to grow `try { } catch { fallthrough }` branches in every case; the registry makes that part of the iteration loop and keeps per-language modules focused on "given this diff, produce a summary". Concretely: - New `StructuralParser` interface — sync OR async; returns a templated summary or undefined to surrender to the next parser in the chain. Throwing also surrenders (failure observability via logger hook lands in phase 1.1+). - New `REGISTRY: Record` — per-language chain in priority order. Today every entry is a regex parser wrapping the existing per-language summarizer (`summarizeTsStructuralDiff`, `summarizePythonStructuralDiff`, etc.). Phase 1.1 will prepend the tree-sitter parser for `ts` and `js` without touching call sites. - New `dispatchStructuralParser(language, fileDiff)` walks the chain and returns the first non-undefined summary. Errors thrown by any parser are swallowed so the chain continues. - `summarizeLargeFiles.ts` refactored to await the new dispatcher instead of calling the per-language switch directly. The language-detection helper stays where it is. Strategy / packaging decisions for #933 captured in the issue comment: bundle TS/JS, lazy-load Python/Rust/Go from a cache at `~/.cache/coco/tree-sitter/`, WASM (`web-tree-sitter`) over native bindings. Phasing: - Phase 1.0 (THIS PR): abstraction + registry, no tree-sitter yet - Phase 1.1: bundle web-tree-sitter + TS parser .wasm, implement the tree-sitter side - Phase 2: bundle JS parser - Phase 3: lazy-load infra (cache, download, prompt) - Phase 4–6: Python / Rust / Go via lazy-load - Phase 7: polish + eval-harness comparison vs. regex Tests (6 new): - Registry shape — regex parser registered for every supported language; every chain non-empty; only valid kind identifiers - Dispatcher — returns regex summary for TS + Python diffs with structural signal; undefined for body-only diffs; undefined for unregistered languages Validation: - npx tsc --noEmit → 0 errors - npx jest → 1631/1631 pass (6 new) - npx eslint on touched files → clean Refs #933. --- .../utils/structuralParserRegistry.test.ts | 80 ++++++++++ .../default/utils/structuralParserRegistry.ts | 140 ++++++++++++++++++ .../default/utils/summarizeLargeFiles.ts | 44 ++---- 3 files changed, 235 insertions(+), 29 deletions(-) create mode 100644 src/lib/parsers/default/utils/structuralParserRegistry.test.ts create mode 100644 src/lib/parsers/default/utils/structuralParserRegistry.ts diff --git a/src/lib/parsers/default/utils/structuralParserRegistry.test.ts b/src/lib/parsers/default/utils/structuralParserRegistry.test.ts new file mode 100644 index 00000000..db3187b5 --- /dev/null +++ b/src/lib/parsers/default/utils/structuralParserRegistry.test.ts @@ -0,0 +1,80 @@ +import type { FileDiff } from '../../../types' +import { + _registrySnapshotForTesting, + dispatchStructuralParser, +} from './structuralParserRegistry' + +function fileDiff(file: string, diff: string): FileDiff { + return { file, diff, summary: '', tokenCount: Math.ceil(diff.length / 4) } +} + +describe('structuralParserRegistry', () => { + describe('registry shape', () => { + it('registers a regex parser for every supported language', () => { + const snapshot = _registrySnapshotForTesting() + expect(snapshot.ts).toContain('regex') + expect(snapshot.js).toContain('regex') + expect(snapshot.py).toContain('regex') + expect(snapshot.rs).toContain('regex') + expect(snapshot.go).toContain('regex') + }) + + it('keeps each chain non-empty (no language is unreachable)', () => { + const snapshot = _registrySnapshotForTesting() + for (const [lang, chain] of Object.entries(snapshot)) { + expect(chain.length).toBeGreaterThan(0) + // Sanity: identifiers are valid kinds. + for (const id of chain) { + expect(['regex', 'tree-sitter']).toContain(id) + } + // Linter satisfaction. + expect(typeof lang).toBe('string') + } + }) + }) + + describe('dispatchStructuralParser', () => { + it('returns the regex parser\'s summary for a TS diff with structural signal', async () => { + const diff = [ + '@@ -1,1 +1,1 @@', + '-export function legacyParse() {}', + '+export function parseRequest(input: string) {}', + ].join('\n') + const result = await dispatchStructuralParser('ts', fileDiff('src/p.ts', diff)) + expect(result).toBeDefined() + expect(result).toContain('Updated TypeScript `src/p.ts`') + }) + + it('returns the regex parser\'s summary for a Python diff with structural signal', async () => { + const diff = [ + '@@ -1,1 +1,1 @@', + '-def parse(input):', + '+def parse(input, schema):', + ].join('\n') + const result = await dispatchStructuralParser('py', fileDiff('src/p.py', diff)) + expect(result).toBeDefined() + expect(result).toContain('Updated Python `src/p.py`') + expect(result).toContain('signature change: parse()') + }) + + it('returns undefined for a body-only TS diff (no parser in the chain handles it)', async () => { + const diff = [ + '@@ -1,3 +1,3 @@', + ' export function parse() {', + '- return 1', + '+ return 2', + ' }', + ].join('\n') + const result = await dispatchStructuralParser('ts', fileDiff('src/p.ts', diff)) + expect(result).toBeUndefined() + }) + + it('returns undefined for a language with no registered chain', async () => { + // Languages outside the StructuralLanguageId union won't compile, + // so we test the runtime fallthrough via a typed cast through unknown. + const language = 'lua' as unknown as 'ts' + const result = await dispatchStructuralParser(language, fileDiff('a.lua', '+x')) + expect(result).toBeUndefined() + }) + }) +}) diff --git a/src/lib/parsers/default/utils/structuralParserRegistry.ts b/src/lib/parsers/default/utils/structuralParserRegistry.ts new file mode 100644 index 00000000..6b64962a --- /dev/null +++ b/src/lib/parsers/default/utils/structuralParserRegistry.ts @@ -0,0 +1,140 @@ +/** + * Structural parser registry (#933 phase 1.0). + * + * Each language can have multiple parsers in priority order — e.g. + * `[treeSitterTs, regexTs]` means "try tree-sitter first; if it + * isn't available or it can't handle this diff shape, fall through + * to the regex extractor". The dispatcher walks the list until one + * returns a summary or the list is exhausted; on exhaustion the + * file falls through to the LLM as before. + * + * This module is the foundation for the tree-sitter integration + * landing in phase 1.1. Today every entry is a regex parser + * wrapping the existing per-language summarizers — pure refactor, + * zero behavior change. Phase 1.1 prepends the tree-sitter parser + * for TS / JS without touching the rest of the call sites. + * + * Why a registry instead of a switch / dispatcher: the upgrade path + * (regex → tree-sitter → tree-sitter-with-better-grammar) needs a + * shape that supports "multiple parsers per language, tried in + * order, with graceful fallback on error". Hard-coded dispatch + * would need to grow `try { } catch { fallthrough }` branches in + * every language case; the registry makes that part of the + * iteration loop and keeps the per-language modules focused on + * "given this diff, produce a summary". + */ + +import type { FileDiff } from '../../../types' +import { summarizeGoStructuralDiff } from './goStructuralDiff' +import { summarizePythonStructuralDiff } from './pythonStructuralDiff' +import { summarizeRustStructuralDiff } from './rustStructuralDiff' +import { summarizeTsStructuralDiff } from './tsStructuralDiff' + +/** Identifier reported by each parser for telemetry / debugging. */ +export type StructuralParserKind = 'regex' | 'tree-sitter' + +/** Language identifier used as the registry key. */ +export type StructuralLanguageId = 'ts' | 'js' | 'py' | 'rs' | 'go' + +/** + * A structural parser is a strategy for producing a templated + * summary from a unified-diff `FileDiff`. Returns undefined when: + * + * - The diff body is empty / unchanged + * - The diff has no recognizable structural signal (paragraph- + * only edits, body-only changes, etc.) — the LLM is the + * better summarizer for these + * - This parser specifically can't handle the input (e.g. tree- + * sitter parser not loaded yet, or the AST shape is something + * the extractor doesn't recognize) + * + * Returning undefined surrenders to the next parser in the + * registry list; throwing also surrenders, with the error logged + * for telemetry. The contract is "best-effort summary or surrender"; + * the caller composes the fallthrough chain. + * + * Sync OR async — tree-sitter parser init is async, but the regex + * parsers are sync. The dispatcher awaits the result either way so + * the per-parser signatures can match their actual cost model. + */ +export interface StructuralParser { + readonly id: StructuralParserKind + summarize(fileDiff: FileDiff): Promise | string | undefined +} + +/** + * Regex-based parser shim. Adapts the existing per-language + * `summarize*StructuralDiff` functions to the parser interface so + * they can live in the registry alongside future tree-sitter + * parsers. Stateless, no init cost, sync. + */ +function regexParser( + summarize: (fileDiff: FileDiff) => string | undefined, +): StructuralParser { + return { + id: 'regex', + summarize, + } +} + +const regexTs = regexParser(summarizeTsStructuralDiff) +const regexJs = regexTs // same extractor; the language detector inside + // `summarizeTsStructuralDiff` accepts both +const regexPy = regexParser(summarizePythonStructuralDiff) +const regexRs = regexParser(summarizeRustStructuralDiff) +const regexGo = regexParser(summarizeGoStructuralDiff) + +/** + * Per-language parser chains, in priority order. Phase 1.0 has + * only regex parsers; phase 1.1 will prepend tree-sitter parsers + * for `ts` and `js`. Later phases add tree-sitter for the lazy- + * loaded languages. + */ +const REGISTRY: Record = { + ts: [regexTs], + js: [regexJs], + py: [regexPy], + rs: [regexRs], + go: [regexGo], +} + +/** + * Walk the parser chain for the given language and return the + * first non-undefined summary. Errors thrown by any parser are + * swallowed so the chain continues — telemetry hook is reserved + * for phase 1.1+ where tree-sitter failures need observability. + * + * Exported as the single public entry point; consumers should not + * read REGISTRY directly so the registry shape can evolve without + * leaking to call sites. + */ +export async function dispatchStructuralParser( + language: StructuralLanguageId, + fileDiff: FileDiff, +): Promise { + const chain = REGISTRY[language] + if (!chain) return undefined + for (const parser of chain) { + try { + const result = await parser.summarize(fileDiff) + if (result !== undefined) return result + } catch { + // Parser surrendered via throw. Continue to the next in the + // chain. Phase 1.1 wires a logger hook here so tree-sitter + // failures are observable without spamming the user. + } + } + return undefined +} + +/** + * Test seam: returns a shallow snapshot of the per-language chain. + * Used by registry-shape assertions in the eval / unit tests. + * NOT a public API — phase 1.1 may rearrange the registry's + * internal shape. + */ +export function _registrySnapshotForTesting(): Record { + return Object.fromEntries( + Object.entries(REGISTRY).map(([lang, chain]) => [lang, chain.map((p) => p.id)]) + ) as Record +} diff --git a/src/lib/parsers/default/utils/summarizeLargeFiles.ts b/src/lib/parsers/default/utils/summarizeLargeFiles.ts index da5b20f3..bedeee1e 100644 --- a/src/lib/parsers/default/utils/summarizeLargeFiles.ts +++ b/src/lib/parsers/default/utils/summarizeLargeFiles.ts @@ -10,21 +10,24 @@ import { writeDiffSummary, } from './diffSummaryCache' import { summarizeMarkdownDiff } from './markdownDiff' -import { summarizeGoStructuralDiff, isGoFile } from './goStructuralDiff' -import { summarizePythonStructuralDiff, isPythonFile } from './pythonStructuralDiff' -import { summarizeRustStructuralDiff, isRustFile } from './rustStructuralDiff' -import { detectTsLanguage, summarizeTsStructuralDiff } from './tsStructuralDiff' +import { isGoFile } from './goStructuralDiff' +import { isPythonFile } from './pythonStructuralDiff' +import { isRustFile } from './rustStructuralDiff' +import { + dispatchStructuralParser, + type StructuralLanguageId, +} from './structuralParserRegistry' +import { detectTsLanguage } from './tsStructuralDiff' import { summarizeTrivialDiff } from './trivialDiff' /** - * Language identifier shared by the `service.fastPath.languageAware` - * config knob and the dispatcher below. Adding a new language is two - * lines: append the identifier to this union (mirrored in - * `lib/langchain/types.ts` for schema generation), and add a case to - * `dispatchStructuralSummary`. + * Map a file path to the language identifier used by the + * `service.fastPath.languageAware` config knob and the parser + * registry. Adding a new language: append the identifier to the + * union (mirrored in `lib/langchain/types.ts` for schema + * generation) and register a parser chain entry in + * `structuralParserRegistry.ts`. */ -type StructuralLanguageId = 'ts' | 'js' | 'py' | 'rs' | 'go' - function detectStructuralLanguageId(path: string): StructuralLanguageId | undefined { const ts = detectTsLanguage(path) if (ts) return ts @@ -34,23 +37,6 @@ function detectStructuralLanguageId(path: string): StructuralLanguageId | undefi return undefined } -function dispatchStructuralSummary( - language: StructuralLanguageId, - fileDiff: import('../../../types').FileDiff, -): string | undefined { - switch (language) { - case 'ts': - case 'js': - return summarizeTsStructuralDiff(fileDiff) - case 'py': - return summarizePythonStructuralDiff(fileDiff) - case 'rs': - return summarizeRustStructuralDiff(fileDiff) - case 'go': - return summarizeGoStructuralDiff(fileDiff) - } -} - /** * Cache opt-out: COCO_NO_CACHE=1 disables both reads and writes * for the diff-summary cache (#845, PR 5). Default is enabled. @@ -178,7 +164,7 @@ async function summarizeFileDiff( const languageEnabled = language !== undefined && (!allowed || allowed.length === 0 || allowed.includes(language)) if (languageEnabled) { - const structuralSummary = dispatchStructuralSummary(language, fileDiff) + const structuralSummary = await dispatchStructuralParser(language, fileDiff) if (structuralSummary !== undefined) { logger.verbose( ` - ${fileDiff.file}: language-aware fast-path skip (no LLM call)`,