Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions src/lib/parsers/default/utils/structuralParserRegistry.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import type { FileDiff } from '../../../types'
import {
_registrySnapshotForTesting,
dispatchStructuralParser,
} from './structuralParserRegistry'

/**
 * Build a minimal `FileDiff` fixture for the tests below.
 *
 * @param file - Path reported for the diffed file.
 * @param diff - Unified-diff text for that file.
 * @returns A `FileDiff` with an empty summary and a token count
 *   estimated as one token per four characters of diff, rounded up.
 */
function fileDiff(file: string, diff: string): FileDiff {
  const tokenCount = Math.ceil(diff.length / 4)
  return { file, diff, summary: '', tokenCount }
}

/**
 * Unit tests for the structural parser registry.
 *
 * - "registry shape" inspects the per-language chains through the
 *   `_registrySnapshotForTesting` seam.
 * - "dispatchStructuralParser" drives the public dispatcher with small
 *   unified diffs and checks the summary (or the `undefined` surrender).
 */
describe('structuralParserRegistry', () => {
  describe('registry shape', () => {
    it('registers a regex parser for every supported language', () => {
      const snapshot = _registrySnapshotForTesting()
      expect(snapshot.ts).toContain('regex')
      expect(snapshot.js).toContain('regex')
      expect(snapshot.py).toContain('regex')
      expect(snapshot.rs).toContain('regex')
      expect(snapshot.go).toContain('regex')
    })

    it('keeps each chain non-empty (no language is unreachable)', () => {
      const entries = Object.entries(_registrySnapshotForTesting())

      // Collect offenders by language so a failure names the broken
      // chain instead of reporting a bare "expected 0 > 0".
      const emptyChains = entries
        .filter(([, chain]) => chain.length === 0)
        .map(([lang]) => lang)
      expect(emptyChains).toEqual([])

      // Sanity: identifiers are valid kinds.
      const validKinds: readonly string[] = ['regex', 'tree-sitter']
      const invalidIds = entries.flatMap(([lang, chain]) =>
        chain.filter((id) => !validKinds.includes(id)).map((id) => `${lang}: ${id}`),
      )
      expect(invalidIds).toEqual([])
    })
  })

  describe('dispatchStructuralParser', () => {
    it('returns the regex parser\'s summary for a TS diff with structural signal', async () => {
      const diff = [
        '@@ -1,1 +1,1 @@',
        '-export function legacyParse() {}',
        '+export function parseRequest(input: string) {}',
      ].join('\n')
      const result = await dispatchStructuralParser('ts', fileDiff('src/p.ts', diff))
      expect(result).toBeDefined()
      expect(result).toContain('Updated TypeScript `src/p.ts`')
    })

    it('returns the regex parser\'s summary for a Python diff with structural signal', async () => {
      const diff = [
        '@@ -1,1 +1,1 @@',
        '-def parse(input):',
        '+def parse(input, schema):',
      ].join('\n')
      const result = await dispatchStructuralParser('py', fileDiff('src/p.py', diff))
      expect(result).toBeDefined()
      expect(result).toContain('Updated Python `src/p.py`')
      expect(result).toContain('signature change: parse()')
    })

    it('returns undefined for a body-only TS diff (no parser in the chain handles it)', async () => {
      const diff = [
        '@@ -1,3 +1,3 @@',
        ' export function parse() {',
        '- return 1',
        '+ return 2',
        ' }',
      ].join('\n')
      const result = await dispatchStructuralParser('ts', fileDiff('src/p.ts', diff))
      expect(result).toBeUndefined()
    })

    it('returns undefined for a language with no registered chain', async () => {
      // Languages outside the StructuralLanguageId union won't compile,
      // so we test the runtime fallthrough via a typed cast through unknown.
      const language = 'lua' as unknown as 'ts'
      const result = await dispatchStructuralParser(language, fileDiff('a.lua', '+x'))
      expect(result).toBeUndefined()
    })
  })
})
140 changes: 140 additions & 0 deletions src/lib/parsers/default/utils/structuralParserRegistry.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/**
* Structural parser registry (#933 phase 1.0).
*
* Each language can have multiple parsers in priority order — e.g.
* `[treeSitterTs, regexTs]` means "try tree-sitter first; if it
* isn't available or it can't handle this diff shape, fall through
* to the regex extractor". The dispatcher walks the list until one
* returns a summary or the list is exhausted; on exhaustion the
* file falls through to the LLM as before.
*
* This module is the foundation for the tree-sitter integration
* landing in phase 1.1. Today every entry is a regex parser
* wrapping the existing per-language summarizers — pure refactor,
* zero behavior change. Phase 1.1 prepends the tree-sitter parser
* for TS / JS without touching the rest of the call sites.
*
* Why a registry instead of a switch / dispatcher: the upgrade path
* (regex → tree-sitter → tree-sitter-with-better-grammar) needs a
* shape that supports "multiple parsers per language, tried in
* order, with graceful fallback on error". Hard-coded dispatch
* would need to grow `try { } catch { fallthrough }` branches in
* every language case; the registry makes that part of the
* iteration loop and keeps the per-language modules focused on
* "given this diff, produce a summary".
*/

import type { FileDiff } from '../../../types'
import { summarizeGoStructuralDiff } from './goStructuralDiff'
import { summarizePythonStructuralDiff } from './pythonStructuralDiff'
import { summarizeRustStructuralDiff } from './rustStructuralDiff'
import { summarizeTsStructuralDiff } from './tsStructuralDiff'

/**
 * Identifier reported by each parser for telemetry / debugging.
 * Exposed as `StructuralParser.id`. Every parser registered in this
 * module today is `'regex'`; `'tree-sitter'` is declared ahead of the
 * phase 1.1 parsers.
 */
export type StructuralParserKind = 'regex' | 'tree-sitter'

/**
 * Language identifier used as the registry key: one parser chain is
 * registered per identifier.
 */
export type StructuralLanguageId = 'ts' | 'js' | 'py' | 'rs' | 'go'

/**
 * A structural parser is a strategy for producing a templated
 * summary from a unified-diff `FileDiff`. Returns undefined when:
 *
 * - The diff body is empty / unchanged
 * - The diff has no recognizable structural signal (paragraph-
 * only edits, body-only changes, etc.) — the LLM is the
 * better summarizer for these
 * - This parser specifically can't handle the input (e.g. tree-
 * sitter parser not loaded yet, or the AST shape is something
 * the extractor doesn't recognize)
 *
 * Returning undefined surrenders to the next parser in the
 * registry list; throwing also surrenders. NOTE: the dispatcher in
 * this module currently swallows thrown errors without logging
 * them — its catch block reserves a logger hook for phase 1.1.
 * The contract is "best-effort summary or surrender";
 * the caller composes the fallthrough chain.
 *
 * Sync OR async — tree-sitter parser init is async, but the regex
 * parsers are sync. The dispatcher awaits the result either way so
 * the per-parser signatures can match their actual cost model.
 */
export interface StructuralParser {
/** Kind tag for this parser; reported by `_registrySnapshotForTesting`. */
readonly id: StructuralParserKind
/**
 * Produce a summary for `fileDiff`, or undefined (directly or via a
 * Promise) to surrender to the next parser in the chain.
 */
summarize(fileDiff: FileDiff): Promise<string | undefined> | string | undefined
}

/**
 * Wrap one of the existing per-language `summarize*StructuralDiff`
 * functions in the `StructuralParser` shape so it can sit in a
 * registry chain next to other parser kinds.
 *
 * The adapter holds no state: it tags the function with the `'regex'`
 * kind and exposes it unchanged as `summarize`.
 *
 * @param summarize - Synchronous summarizer to expose through the parser interface.
 * @returns A parser whose `id` is `'regex'` and whose `summarize` is the given function.
 */
function regexParser(
  summarize: (fileDiff: FileDiff) => string | undefined,
): StructuralParser {
  const adapter: StructuralParser = { id: 'regex', summarize }
  return adapter
}

/**
 * Regex parser shared by the `ts` and `js` chains. One instance serves
 * both because the language detector inside `summarizeTsStructuralDiff`
 * accepts both.
 */
const regexTsFamily = regexParser(summarizeTsStructuralDiff)

/**
 * Parser chains keyed by language, highest priority first. In phase
 * 1.0 every chain holds a single regex parser. Phase 1.1 is planned to
 * prepend tree-sitter parsers for `ts` and `js`, with the lazy-loaded
 * languages following in later phases.
 */
const REGISTRY: Record<StructuralLanguageId, StructuralParser[]> = {
  ts: [regexTsFamily],
  js: [regexTsFamily],
  py: [regexParser(summarizePythonStructuralDiff)],
  rs: [regexParser(summarizeRustStructuralDiff)],
  go: [regexParser(summarizeGoStructuralDiff)],
}

/**
 * Walk the parser chain for the given language and return the
 * first non-undefined summary.
 *
 * Errors thrown (or rejections returned) by a parser are swallowed so
 * the chain continues with the next parser; a telemetry hook is
 * reserved for phase 1.1+ where tree-sitter failures need
 * observability.
 *
 * Exported as the single public entry point; consumers should not
 * read REGISTRY directly so the registry shape can evolve without
 * leaking to call sites.
 *
 * @param language - Registry key selecting the parser chain. Values
 *   that are not registered at runtime (including names inherited
 *   from `Object.prototype`, such as `'toString'`) yield `undefined`.
 * @param fileDiff - The diff handed to each parser in turn.
 * @returns The first summary produced by the chain, or `undefined`
 *   when no chain is registered or every parser surrendered.
 */
export async function dispatchStructuralParser(
  language: StructuralLanguageId,
  fileDiff: FileDiff,
): Promise<string | undefined> {
  // Own-property check rather than a truthiness test on the lookup:
  // a plain object also resolves inherited members ('toString',
  // 'constructor', '__proto__'), which are truthy but not iterable and
  // would make the loop below throw instead of falling through.
  if (!Object.prototype.hasOwnProperty.call(REGISTRY, language)) return undefined
  const chain = REGISTRY[language]
  for (const parser of chain) {
    try {
      // `await` covers both sync and async parsers.
      const result = await parser.summarize(fileDiff)
      if (result !== undefined) return result
    } catch {
      // Parser surrendered via throw. Continue to the next in the
      // chain. Phase 1.1 wires a logger hook here so tree-sitter
      // failures are observable without spamming the user.
    }
  }
  return undefined
}

/**
 * Test seam: reports, for every registered language, the ordered list
 * of parser kinds in its chain. Each call builds fresh arrays of ids,
 * so the parsers themselves are never exposed.
 *
 * Used by registry-shape assertions in the eval / unit tests.
 * NOT a public API — phase 1.1 may rearrange the registry's
 * internal shape.
 *
 * @returns A new object mapping each language id to its chain's parser kinds.
 */
export function _registrySnapshotForTesting(): Record<StructuralLanguageId, StructuralParserKind[]> {
  const kindsByLanguage = Object.entries(REGISTRY).map(
    ([language, parsers]): [string, StructuralParserKind[]] => {
      const kinds = parsers.map((parser) => parser.id)
      return [language, kinds]
    },
  )
  return Object.fromEntries(kindsByLanguage) as Record<StructuralLanguageId, StructuralParserKind[]>
}
44 changes: 15 additions & 29 deletions src/lib/parsers/default/utils/summarizeLargeFiles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,24 @@ import {
writeDiffSummary,
} from './diffSummaryCache'
import { summarizeMarkdownDiff } from './markdownDiff'
import { summarizeGoStructuralDiff, isGoFile } from './goStructuralDiff'
import { summarizePythonStructuralDiff, isPythonFile } from './pythonStructuralDiff'
import { summarizeRustStructuralDiff, isRustFile } from './rustStructuralDiff'
import { detectTsLanguage, summarizeTsStructuralDiff } from './tsStructuralDiff'
import { isGoFile } from './goStructuralDiff'
import { isPythonFile } from './pythonStructuralDiff'
import { isRustFile } from './rustStructuralDiff'
import {
dispatchStructuralParser,
type StructuralLanguageId,
} from './structuralParserRegistry'
import { detectTsLanguage } from './tsStructuralDiff'
import { summarizeTrivialDiff } from './trivialDiff'

/**
* Language identifier shared by the `service.fastPath.languageAware`
* config knob and the dispatcher below. Adding a new language is two
* lines: append the identifier to this union (mirrored in
* `lib/langchain/types.ts` for schema generation), and add a case to
* `dispatchStructuralSummary`.
* Map a file path to the language identifier used by the
* `service.fastPath.languageAware` config knob and the parser
* registry. Adding a new language: append the identifier to the
* union (mirrored in `lib/langchain/types.ts` for schema
* generation) and register a parser chain entry in
* `structuralParserRegistry.ts`.
*/
type StructuralLanguageId = 'ts' | 'js' | 'py' | 'rs' | 'go'

function detectStructuralLanguageId(path: string): StructuralLanguageId | undefined {
const ts = detectTsLanguage(path)
if (ts) return ts
Expand All @@ -34,23 +37,6 @@ function detectStructuralLanguageId(path: string): StructuralLanguageId | undefi
return undefined
}

/**
 * Route a file diff to the structural summarizer for its language.
 * `ts` and `js` share the TypeScript summarizer; `py`, `rs` and `go`
 * each have their own.
 *
 * @param language - Language identifier selecting the summarizer.
 * @param fileDiff - Diff passed through unchanged to that summarizer.
 * @returns Whatever the selected summarizer returns; `undefined` if
 *   `language` matches none of the known identifiers at runtime.
 */
function dispatchStructuralSummary(
  language: StructuralLanguageId,
  fileDiff: import('../../../types').FileDiff,
): string | undefined {
  if (language === 'ts' || language === 'js') {
    return summarizeTsStructuralDiff(fileDiff)
  }
  if (language === 'py') {
    return summarizePythonStructuralDiff(fileDiff)
  }
  if (language === 'rs') {
    return summarizeRustStructuralDiff(fileDiff)
  }
  if (language === 'go') {
    return summarizeGoStructuralDiff(fileDiff)
  }
  return undefined
}

/**
* Cache opt-out: COCO_NO_CACHE=1 disables both reads and writes
* for the diff-summary cache (#845, PR 5). Default is enabled.
Expand Down Expand Up @@ -178,7 +164,7 @@ async function summarizeFileDiff(
const languageEnabled = language !== undefined &&
(!allowed || allowed.length === 0 || allowed.includes(language))
if (languageEnabled) {
const structuralSummary = dispatchStructuralSummary(language, fileDiff)
const structuralSummary = await dispatchStructuralParser(language, fileDiff)
if (structuralSummary !== undefined) {
logger.verbose(
` - ${fileDiff.file}: language-aware fast-path skip (no LLM call)`,
Expand Down
Loading