diff --git a/__tests__/extraction.test.ts b/__tests__/extraction.test.ts index 92717759..a8313c63 100644 --- a/__tests__/extraction.test.ts +++ b/__tests__/extraction.test.ts @@ -93,6 +93,12 @@ describe('Language Detection', () => { expect(detectLanguage('main.dart')).toBe('dart'); }); + it('should detect Markdown files', () => { + expect(detectLanguage('README.md')).toBe('markdown'); + expect(detectLanguage('docs/guide.markdown')).toBe('markdown'); + expect(detectLanguage('docs/page.mdx')).toBe('markdown'); + }); + it('should return unknown for unsupported extensions', () => { expect(detectLanguage('styles.css')).toBe('unknown'); expect(detectLanguage('data.json')).toBe('unknown'); @@ -121,6 +127,133 @@ describe('Language Support', () => { expect(languages).toContain('swift'); expect(languages).toContain('kotlin'); expect(languages).toContain('dart'); + expect(languages).toContain('markdown'); + }); +}); + +describe('Markdown Extraction', () => { + it('should extract headings, links, and shell script references', () => { + const markdown = `# Project Guide + +See [Setup](docs/setup.md#install) and scripts/release.mjs. + +## Release + +\`\`\`bash +npm run build +node scripts/release.mjs +\`\`\` +`; + + const result = extractFromSource('README.md', markdown); + + const fileNode = result.nodes.find((n) => n.kind === 'file'); + expect(fileNode).toMatchObject({ + name: 'README.md', + language: 'markdown', + }); + + const headings = result.nodes.filter((n) => n.kind === 'module'); + expect(headings.map((n) => n.name)).toContain('Project Guide'); + expect(headings.map((n) => n.name)).toContain('Release'); + + const commandNode = result.nodes.find((n) => n.kind === 'function' && n.signature === 'node scripts/release.mjs'); + expect(commandNode).toBeDefined(); + + expect(result.unresolvedReferences).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + referenceName: 'docs/setup.md#install', + referenceKind: 'imports', + language: 'markdown', + }), + expect.objectContaining({ + referenceName: 'scripts/release.mjs', + referenceKind: 'calls', + language: 'markdown', + }), + ]) + ); + }); + + it('should extract structured table rows and file-symbol references from Markdown', () => { + const markdown = `# Maintenance Guide + +## Phase 4 + +| Template | CLI Entry | Dispatcher | Implementation | +| --- | --- | --- | --- | +| P4-S1 | \`python "{script_path}" p4 "{csv_file}" s1 "{conditions_or_-}" "{probe_cols}"\` | \`scripts/csv_search.py::run_p4\` | \`scripts/csv_search.py::_p4_stage1\` | +| P4-S2 | \`python "{script_path}" p4 "{csv_file}" s2 "{stage1_rows}" "{condition_or_-}" "{detail_cols}"\` | \`scripts/csv_search.py::run_p4\` | \`scripts/csv_search.py::_p4_stage2\` | + +- P4-FLOW changes must inspect \`scripts/csv_search.py::run_p4\`. +`; + + const result = extractFromSource('phases/phase4.md', markdown); + + const tableRows = result.nodes.filter((n) => n.kind === 'constant' && n.qualifiedName.includes('table-row')); + expect(tableRows.map((n) => n.name)).toEqual(expect.arrayContaining(['P4-S1', 'P4-S2'])); + + const p4s1 = tableRows.find((n) => n.name === 'P4-S1'); + expect(p4s1?.signature).toContain('Template: P4-S1'); + expect(p4s1?.signature).toContain('Dispatcher: scripts/csv_search.py::run_p4'); + + const commandNode = result.nodes.find((n) => + n.kind === 'function' && + n.language === 'markdown' && + n.signature?.includes('python "{script_path}" p4') + ); + expect(commandNode).toBeDefined(); + + expect(result.unresolvedReferences).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + referenceName: 'phases/scripts/csv_search.py::run_p4', + referenceKind: 'references', + language: 'markdown', + }), + expect.objectContaining({ + referenceName: 'phases/scripts/csv_search.py::_p4_stage1', + referenceKind: 'references', + language: 'markdown', + }), + ]) + ); + }); +}); + +describe('Code to Markdown Reference Extraction', () => { + it('should extract Markdown path references from code string literals', () => { + const code = ` +export const GUIDE = '../docs/guide.md'; + +export function loadDocs() { + return fs.readFileSync('../docs/guide.md#install', 'utf8'); +} +`; + + const result = extractFromSource('src/load-docs.ts', code); + const loadDocs = result.nodes.find((n) => n.kind === 'function' && n.name === 'loadDocs'); + const guideConstant = result.nodes.find((n) => n.kind === 'constant' && n.name === 'GUIDE'); + + expect(loadDocs).toBeDefined(); + expect(guideConstant).toBeDefined(); + expect(result.unresolvedReferences).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + fromNodeId: loadDocs!.id, + referenceName: 'docs/guide.md#install', + referenceKind: 'references', + language: 'typescript', + }), + expect.objectContaining({ + fromNodeId: guideConstant!.id, + referenceName: 'docs/guide.md', + referenceKind: 'references', + language: 'typescript', + }), + ]) + ); }); }); diff --git a/__tests__/integration/full-pipeline.test.ts b/__tests__/integration/full-pipeline.test.ts index cb01aa5c..faac5e28 100644 --- a/__tests__/integration/full-pipeline.test.ts +++ b/__tests__/integration/full-pipeline.test.ts @@ -82,6 +82,97 @@ describe('Integration: full pipeline', () => { cleanupTempDir(tempDir); }); + it('indexes Markdown headings and resolves Markdown links to script files', async () => { + fs.mkdirSync(path.join(tempDir, 'docs'), { recursive: true }); + fs.mkdirSync(path.join(tempDir, 'scripts'), { recursive: true }); + fs.writeFileSync( + path.join(tempDir, 'README.md'), + `# Project Guide + +See [Setup](docs/setup.md#install). + +## Release + +\`\`\`bash +node scripts/release.mjs +\`\`\` +` + ); + fs.writeFileSync(path.join(tempDir, 'docs', 'setup.md'), '# Install\n'); + fs.writeFileSync(path.join(tempDir, 'scripts', 'release.mjs'), 'export function release() { return true; }\n'); + + const cg = await CodeGraph.init(tempDir); + try { + await cg.indexAll(); + + const guide = cg.searchNodes('Project Guide').find((r) => r.node.language === 'markdown'); + expect(guide).toBeDefined(); + + const releaseCommand = cg + .searchNodes('release.mjs') + .find((r) => r.node.language === 'markdown' && r.node.kind === 'function'); + expect(releaseCommand).toBeDefined(); + + const guideEdges = cg.getOutgoingEdges(guide!.node.id).filter((e) => e.kind === 'imports'); + const guideTargets = guideEdges.map((e) => cg.getNode(e.target)); + const setupHeading = guideTargets.find((n) => n?.qualifiedName === 'docs/setup.md#install'); + expect(setupHeading).toMatchObject({ + kind: 'module', + name: 'Install', + filePath: 'docs/setup.md', + startLine: 1, + }); + + const commandEdges = cg.getOutgoingEdges(releaseCommand!.node.id).filter((e) => e.kind === 'calls'); + const commandTargets = commandEdges.map((e) => cg.getNode(e.target)?.filePath); + expect(commandTargets).toContain('scripts/release.mjs'); + } finally { + cg.destroy(); + } + }); + + it('indexes Markdown template tables and resolves file-symbol references to implementation functions', async () => { + fs.mkdirSync(path.join(tempDir, 'phases'), { recursive: true }); + fs.mkdirSync(path.join(tempDir, 'scripts'), { recursive: true }); + fs.writeFileSync( + path.join(tempDir, 'phases', 'phase4.md'), + `# Phase 4 + +## Fixed Script Templates + +| Template | CLI Entry | Dispatcher | Implementation | +| --- | --- | --- | --- | +| P4-S1 | \`python "{script_path}" p4 "{csv_file}" s1 "{conditions_or_-}" "{probe_cols}"\` | \`scripts/csv_search.py::run_p4\` | \`scripts/csv_search.py::_p4_stage1\` | +| P4-S2 | \`python "{script_path}" p4 "{csv_file}" s2 "{stage1_rows}" "{condition_or_-}" "{detail_cols}"\` | \`scripts/csv_search.py::run_p4\` | \`scripts/csv_search.py::_p4_stage2\` | +` + ); + fs.writeFileSync( + path.join(tempDir, 'scripts', 'csv_search.py'), + `def _p4_stage1(filepath, condition_spec, probe_cols_spec):\n return 's1'\n\n` + + `def _p4_stage2(filepath, stage1_rows, condition_spec, detail_cols_spec):\n return 's2'\n\n` + + `def run_p4(filepath, args):\n return _p4_stage1(filepath, '-', 'MPN')\n` + ); + + const cg = await CodeGraph.init(tempDir); + try { + await cg.indexAll(); + + const p4s1Row = cg.searchNodes('P4-S1').find((r) => r.node.language === 'markdown'); + expect(p4s1Row?.node.kind).toBe('constant'); + + const edges = cg.getOutgoingEdges(p4s1Row!.node.id).filter((e) => e.kind === 'references'); + const targets = edges.map((e) => cg.getNode(e.target)); + expect(targets).toEqual( + expect.arrayContaining([ + expect.objectContaining({ name: 'run_p4', filePath: 'scripts/csv_search.py' }), + expect.objectContaining({ name: '_p4_stage1', filePath: 'scripts/csv_search.py' }), + ]) + ); + } finally { + cg.destroy(); + } + }); + it('runs init → index → resolve → search → callers → context → sync', async () => { const MODULE_COUNT = 120; generateSyntheticProject(tempDir, MODULE_COUNT); @@ -241,4 +332,46 @@ describe('Integration: full pipeline', () => { cg.destroy(); } }, 30_000); + + it('resolves code string references to Markdown headings', async () => { + fs.mkdirSync(path.join(tempDir, 'docs'), { recursive: true }); + fs.mkdirSync(path.join(tempDir, 'scripts'), { recursive: true }); + fs.writeFileSync( + path.join(tempDir, 'docs', 'guide.md'), + `# Guide + +## Install + +Run the setup command. +` + ); + fs.writeFileSync( + path.join(tempDir, 'scripts', 'load_docs.py'), + `GUIDE = "docs/guide.md"\n\n` + + `def load_docs():\n` + + ` return open("docs/guide.md#install", encoding="utf-8").read()\n` + ); + + const cg = await CodeGraph.init(tempDir); + try { + await cg.indexAll(); + + const loadDocs = cg.searchNodes('load_docs').find((r) => r.node.language === 'python'); + expect(loadDocs).toBeDefined(); + + const edges = cg.getOutgoingEdges(loadDocs!.node.id).filter((e) => e.kind === 'references'); + const targets = edges.map((e) => cg.getNode(e.target)); + expect(targets).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + kind: 'module', + name: 'Install', + qualifiedName: 'docs/guide.md#install', + }), + ]) + ); + } finally { + cg.destroy(); + } + }); }); diff --git a/__tests__/resolution.test.ts b/__tests__/resolution.test.ts index 1ca3a3f8..086c55d0 100644 --- a/__tests__/resolution.test.ts +++ b/__tests__/resolution.test.ts @@ -267,6 +267,137 @@ describe('Resolution Module', () => { expect(result).not.toBeNull(); expect(result?.targetNodeId).toBe('method:user.ts:User.save:15'); }); + + it('should resolve Markdown file references by filename, path, and anchor suffix', () => { + const mockNodes: Node[] = [ + { + id: 'file:README.md', + kind: 'file', + name: 'README.md', + qualifiedName: 'README.md', + filePath: 'README.md', + language: 'markdown', + startLine: 1, + endLine: 10, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }, + { + id: 'file:docs/setup.md', + kind: 'file', + name: 'setup.md', + qualifiedName: 'docs/setup.md', + filePath: 'docs/setup.md', + language: 'markdown', + startLine: 1, + endLine: 10, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }, + { + id: 'module:docs/setup.md:install:1', + kind: 'module', + name: 'Install', + qualifiedName: 'docs/setup.md#install', + filePath: 'docs/setup.md', + language: 'markdown', + startLine: 1, + endLine: 10, + startColumn: 0, + endColumn: 9, + updatedAt: Date.now(), + }, + ]; + + const context: ResolutionContext = { + getNodesInFile: () => mockNodes, + getNodesByName: (name) => mockNodes.filter((n) => n.name === name), + getNodesByQualifiedName: (qualifiedName) => mockNodes.filter((n) => n.qualifiedName === qualifiedName), + getNodesByKind: () => [], + fileExists: () => true, + readFile: () => null, + getProjectRoot: () => '/test', + getAllFiles: () => ['README.md', 'docs/setup.md'], + getNodesByLowerName: () => [], + getImportMappings: () => [], + }; + + const readmeRef = { + fromNodeId: 'module:docs/setup.md:install:1', + referenceName: 'README.md', + referenceKind: 'imports' as const, + line: 1, + column: 0, + filePath: 'docs/setup.md', + language: 'markdown' as const, + }; + const setupRef = { + ...readmeRef, + referenceName: 'docs/setup.md#install', + filePath: 'README.md', + }; + + expect(matchReference(readmeRef, context)?.targetNodeId).toBe('file:README.md'); + expect(matchReference(setupRef, context)?.targetNodeId).toBe('module:docs/setup.md:install:1'); + }); + + it('should resolve Markdown file-symbol references to symbols in the referenced file', () => { + const mockNodes: Node[] = [ + { + id: 'file:scripts/csv_search.py', + kind: 'file', + name: 'csv_search.py', + qualifiedName: 'scripts/csv_search.py', + filePath: 'scripts/csv_search.py', + language: 'python', + startLine: 1, + endLine: 100, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }, + { + id: 'function:scripts/csv_search.py:run_p4:40', + kind: 'function', + name: 'run_p4', + qualifiedName: 'scripts/csv_search.py::run_p4', + filePath: 'scripts/csv_search.py', + language: 'python', + startLine: 40, + endLine: 55, + startColumn: 0, + endColumn: 0, + updatedAt: Date.now(), + }, + ]; + + const context: ResolutionContext = { + getNodesInFile: (filePath) => mockNodes.filter((n) => n.filePath === filePath), + getNodesByName: (name) => mockNodes.filter((n) => n.name === name), + getNodesByQualifiedName: (qualifiedName) => mockNodes.filter((n) => n.qualifiedName === qualifiedName), + getNodesByKind: () => [], + fileExists: () => true, + readFile: () => null, + getProjectRoot: () => '/test', + getAllFiles: () => ['scripts/csv_search.py'], + getNodesByLowerName: () => [], + getImportMappings: () => [], + }; + + const ref = { + fromNodeId: 'constant:phases/phase4.md:P4-S1:10', + referenceName: 'phases/scripts/csv_search.py::run_p4', + referenceKind: 'references' as const, + line: 10, + column: 20, + filePath: 'phases/phase4.md', + language: 'markdown' as const, + }; + + expect(matchReference(ref, context)?.targetNodeId).toBe('function:scripts/csv_search.py:run_p4:40'); + }); }); describe('Import Resolver', () => { diff --git a/src/extraction/grammars.ts b/src/extraction/grammars.ts index c78c52ce..cabb9c36 100644 --- a/src/extraction/grammars.ts +++ b/src/extraction/grammars.ts @@ -10,7 +10,7 @@ import * as path from 'path'; import { Parser, Language as WasmLanguage } from 'web-tree-sitter'; import { Language } from '../types'; -export type GrammarLanguage = Exclude; +export type GrammarLanguage = Exclude; /** * WASM filename map — maps each language to its .wasm grammar file @@ -73,6 +73,9 @@ export const EXTENSION_MAP: Record = { '.yaml': 'yaml', // Twig templates (file-level tracking only, no symbol extraction) '.twig': 'twig', + '.md': 'markdown', + '.mdx': 'markdown', + '.markdown': 'markdown', '.rb': 'ruby', '.rake': 'ruby', '.swift': 'swift', @@ -238,6 +241,7 @@ export function isLanguageSupported(language: Language): boolean { if (language === 'liquid') return true; // custom regex extractor if (language === 'yaml') return true; // file-level tracking only; Drupal routing extraction via framework resolver if (language === 'twig') return true; // file-level tracking only + if (language === 'markdown') return true; // custom documentation extractor if (language === 'unknown') return false; return language in WASM_GRAMMAR_FILES; } @@ -246,7 +250,7 @@ export function isLanguageSupported(language: Language): boolean { * Check if a grammar has been loaded and is ready for parsing. */ export function isGrammarLoaded(language: Language): boolean { - if (language === 'svelte' || language === 'vue' || language === 'liquid') return true; + if (language === 'svelte' || language === 'vue' || language === 'liquid' || language === 'markdown') return true; if (language === 'yaml' || language === 'twig') return true; // no WASM grammar needed return languageCache.has(language); } @@ -255,7 +259,7 @@ export function isGrammarLoaded(language: Language): boolean { * Get all supported languages (those with grammar definitions). */ export function getSupportedLanguages(): Language[] { - return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'vue', 'liquid']; + return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'vue', 'liquid', 'markdown']; } /** @@ -327,6 +331,7 @@ export function getLanguageDisplayName(language: Language): string { luau: 'Luau', yaml: 'YAML', twig: 'Twig', + markdown: 'Markdown', unknown: 'Unknown', }; return names[language] || language; diff --git a/src/extraction/markdown-extractor.ts b/src/extraction/markdown-extractor.ts new file mode 100644 index 00000000..8b257949 --- /dev/null +++ b/src/extraction/markdown-extractor.ts @@ -0,0 +1,833 @@ +import * as path from 'path'; +import { Node, Edge, ExtractionResult, ExtractionError, UnresolvedReference, EdgeKind } from '../types'; +import { generateNodeId } from './tree-sitter-helpers'; + +interface HeadingInfo { + id: string; + level: number; + title: string; + slug: string; + line: number; + endLine: number; + node: Node; +} + +interface FenceState { + marker: string; + language: string; + startLine: number; + lines: Array<{ text: string; line: number }>; +} + +/** + * Lightweight Markdown extractor. + * + * Markdown is indexed as documentation structure rather than code syntax: + * headings become searchable module nodes, links become import nodes, and + * shell-like fenced blocks create command nodes plus file-path references. + */ +export class MarkdownExtractor { + private filePath: string; + private lines: string[]; + private nodes: Node[] = []; + private edges: Edge[] = []; + private unresolvedReferences: UnresolvedReference[] = []; + private errors: ExtractionError[] = []; + private headings: HeadingInfo[] = []; + private referenceKeys = new Set(); + + constructor(filePath: string, source: string) { + this.filePath = normalizeRelativePath(filePath); + this.lines = source.split('\n'); + } + + extract(): ExtractionResult { + const startTime = Date.now(); + + try { + const fileNode = this.createFileNode(); + this.extractHeadings(fileNode); + this.extractStructuredBlocks(fileNode); + this.extractLinksAndCommands(fileNode); + } catch (error) { + this.errors.push({ + message: `Markdown extraction error: ${error instanceof Error ? error.message : String(error)}`, + filePath: this.filePath, + severity: 'error', + code: 'parse_error', + }); + } + + return { + nodes: this.nodes, + edges: this.edges, + unresolvedReferences: this.unresolvedReferences, + errors: this.errors, + durationMs: Date.now() - startTime, + }; + } + + private createFileNode(): Node { + const id = generateNodeId(this.filePath, 'file', this.filePath, 1); + const fileNode: Node = { + id, + kind: 'file', + name: path.posix.basename(this.filePath), + qualifiedName: this.filePath, + filePath: this.filePath, + language: 'markdown', + startLine: 1, + endLine: Math.max(1, this.lines.length), + startColumn: 0, + endColumn: this.lines[this.lines.length - 1]?.length || 0, + docstring: this.buildDocstring(1, Math.min(this.lines.length, 40)), + updatedAt: Date.now(), + }; + + this.nodes.push(fileNode); + return fileNode; + } + + private extractHeadings(fileNode: Node): void { + const rawHeadings: Array<{ level: number; title: string; line: number; column: number }> = []; + + for (let i = 0; i < this.lines.length; i++) { + const line = this.lines[i]!; + const match = /^(#{1,6})\s+(.+?)\s*#*\s*$/.exec(line); + if (!match) continue; + + rawHeadings.push({ + level: match[1]!.length, + title: stripInlineMarkdown(match[2]!.trim()), + line: i + 1, + column: line.indexOf('#'), + }); + } + + const slugCounts = new Map(); + for (let i = 0; i < rawHeadings.length; i++) { + const heading = rawHeadings[i]!; + const baseSlug = slugifyHeading(heading.title); + const seen = slugCounts.get(baseSlug) ?? 0; + slugCounts.set(baseSlug, seen + 1); + const slug = seen === 0 ? baseSlug : `${baseSlug}-${seen}`; + + let endLine = this.lines.length; + for (let j = i + 1; j < rawHeadings.length; j++) { + if (rawHeadings[j]!.level <= heading.level) { + endLine = rawHeadings[j]!.line - 1; + break; + } + } + + const nodeId = generateNodeId(this.filePath, 'module', `${slug}:${heading.line}`, heading.line); + const node: Node = { + id: nodeId, + kind: 'module', + name: heading.title, + qualifiedName: `${this.filePath}#${slug}`, + filePath: this.filePath, + language: 'markdown', + signature: `${'#'.repeat(heading.level)} ${heading.title}`, + docstring: this.buildDocstring(heading.line + 1, endLine), + startLine: heading.line, + endLine, + startColumn: heading.column, + endColumn: this.lines[heading.line - 1]?.length || 0, + updatedAt: Date.now(), + }; + + this.nodes.push(node); + this.headings.push({ + id: nodeId, + level: heading.level, + title: heading.title, + slug, + line: heading.line, + endLine, + node, + }); + } + + const stack: HeadingInfo[] = []; + for (const heading of this.headings) { + while (stack.length > 0 && stack[stack.length - 1]!.level >= heading.level) { + stack.pop(); + } + const parent = stack[stack.length - 1]; + this.edges.push({ + source: parent?.id ?? fileNode.id, + target: heading.id, + kind: 'contains', + provenance: 'heuristic', + }); + stack.push(heading); + } + } + + private extractLinksAndCommands(fileNode: Node): void { + let fence: FenceState | null = null; + + for (let i = 0; i < this.lines.length; i++) { + const line = this.lines[i]!; + const lineNumber = i + 1; + const fenceMatch = /^(\s*)(`{3,}|~{3,})\s*([A-Za-z0-9_+.-]*)/.exec(line); + + if (fence) { + if (line.trimStart().startsWith(fence.marker)) { + this.extractCommandsFromFence(fence, fileNode); + fence = null; + } else { + fence.lines.push({ text: line, line: lineNumber }); + } + continue; + } + + if (fenceMatch) { + fence = { + marker: fenceMatch[2]![0]!.repeat(fenceMatch[2]!.length), + language: (fenceMatch[3] ?? '').toLowerCase(), + startLine: lineNumber, + lines: [], + }; + continue; + } + + const owner = this.findOwnerForLine(lineNumber) ?? fileNode; + this.extractMarkdownLinks(line, lineNumber, owner); + if (!isTableRowLine(line)) { + this.extractFileSymbolMentions(line, lineNumber, owner); + this.extractPathMentions(line, lineNumber, owner); + } + } + + if (fence) { + this.extractCommandsFromFence(fence, fileNode); + } + } + + private extractStructuredBlocks(fileNode: Node): void { + this.extractTables(fileNode); + this.extractListItems(fileNode); + } + + private extractTables(fileNode: Node): void { + let inFence: string | null = null; + + for (let i = 0; i < this.lines.length - 1; i++) { + const line = this.lines[i]!; + const fenceMatch = /^(\s*)(`{3,}|~{3,})/.exec(line); + if (fenceMatch) { + const marker = fenceMatch[2]![0]!.repeat(fenceMatch[2]!.length); + inFence = inFence === marker ? null : marker; + continue; + } + if (inFence) continue; + + const separator = this.lines[i + 1]!; + if (!isTableHeaderLine(line) || !isTableSeparatorLine(separator)) continue; + + const headers = parseTableCells(line); + if (headers.length < 2) continue; + + let rowIndex = i + 2; + while (rowIndex < this.lines.length && isTableRowLine(this.lines[rowIndex]!)) { + const rowLine = this.lines[rowIndex]!; + const cells = parseTableCells(rowLine); + const lineNumber = rowIndex + 1; + if (shouldIndexTableRow(headers, cells)) { + this.createTableRowNode(headers, cells, rowLine, lineNumber, fileNode); + } + rowIndex++; + } + + i = rowIndex - 1; + } + } + + private extractListItems(fileNode: Node): void { + let inFence: string | null = null; + + for (let i = 0; i < this.lines.length; i++) { + const line = this.lines[i]!; + const fenceMatch = /^(\s*)(`{3,}|~{3,})/.exec(line); + if (fenceMatch) { + const marker = fenceMatch[2]![0]!.repeat(fenceMatch[2]!.length); + inFence = inFence === marker ? null : marker; + continue; + } + if (inFence) continue; + + const match = /^\s*[-*+]\s+(?:\[[ xX]\]\s+)?(.+?)\s*$/.exec(line); + if (!match) continue; + + const text = stripInlineMarkdown(match[1]!.trim()); + if (!shouldIndexListItem(text)) continue; + + const lineNumber = i + 1; + const owner = this.findOwnerForLine(lineNumber) ?? fileNode; + const name = stableIdentifier(text) ?? truncateForName(text, 80); + const nodeId = generateNodeId(this.filePath, 'constant', `list:${lineNumber}:${name}`, lineNumber); + const node: Node = { + id: nodeId, + kind: 'constant', + name, + qualifiedName: `${this.filePath}::list-item:${slugifyHeading(name)}:${lineNumber}`, + filePath: this.filePath, + language: 'markdown', + signature: `list item: ${truncateForSignature(text, 180)}`, + docstring: text, + startLine: lineNumber, + endLine: lineNumber, + startColumn: Math.max(0, line.indexOf(match[1]!)), + endColumn: line.length, + updatedAt: Date.now(), + }; + + this.nodes.push(node); + this.edges.push({ source: owner.id, target: nodeId, kind: 'contains', provenance: 'heuristic' }); + this.extractStructuredReferences(text, lineNumber, node); + } + } + + private createTableRowNode( + headers: string[], + cells: string[], + rowLine: string, + lineNumber: number, + fileNode: Node + ): void { + const owner = this.findOwnerForLine(lineNumber) ?? fileNode; + const rowText = cells.join(' | '); + const name = stableIdentifier(rowText) ?? truncateForName(firstNonEmptyCell(cells) || `row ${lineNumber}`, 80); + const signature = summarizeTableRow(headers, cells); + const nodeId = generateNodeId(this.filePath, 'constant', `table:${lineNumber}:${name}`, lineNumber); + const node: Node = { + id: nodeId, + kind: 'constant', + name, + qualifiedName: `${this.filePath}::table-row:${slugifyHeading(name)}:${lineNumber}`, + filePath: this.filePath, + language: 'markdown', + signature, + docstring: signature, + startLine: lineNumber, + endLine: lineNumber, + startColumn: Math.max(0, rowLine.indexOf(firstNonEmptyCell(cells) || '|')), + endColumn: rowLine.length, + updatedAt: Date.now(), + }; + + this.nodes.push(node); + this.edges.push({ source: owner.id, target: nodeId, kind: 'contains', provenance: 'heuristic' }); + + this.extractStructuredReferences(rowText, lineNumber, node); + this.extractCommandsFromStructuredCells(cells, rowLine, lineNumber, node); + } + + private extractCommandsFromStructuredCells(cells: string[], rowLine: string, lineNumber: number, owner: Node): void { + for (const cell of cells) { + const candidates = extractCodeSpans(cell); + if (candidates.length === 0) candidates.push(stripInlineMarkdown(cell)); + + for (const candidate of candidates) { + const command = extractCommandFromText(candidate); + if (!command) continue; + const column = Math.max(0, rowLine.indexOf(candidate)); + this.createCommandNode(command, lineNumber, column, owner); + } + } + } + + private createCommandNode(command: string, line: number, column: number, owner: Node): void { + const nodeId = generateNodeId(this.filePath, 'function', `command:${line}:${column}:${command}`, line); + const node: Node = { + id: nodeId, + kind: 'function', + name: commandName(command), + qualifiedName: `${this.filePath}::command:${line}:${column}:${command}`, + filePath: this.filePath, + language: 'markdown', + signature: command, + startLine: line, + endLine: line, + startColumn: column, + endColumn: column + command.length, + updatedAt: Date.now(), + }; + + this.nodes.push(node); + this.edges.push({ source: owner.id, target: nodeId, kind: 'contains', provenance: 'heuristic' }); + + const scriptPath = extractScriptPath(command); + if (scriptPath) { + const normalizedTarget = this.normalizeReference(scriptPath); + if (normalizedTarget) { + this.addReference(nodeId, normalizedTarget, 'calls', line, column); + } + } + } + + private extractMarkdownLinks(line: string, lineNumber: number, owner: Node): void { + const linkRegex = /!?\[([^\]]*)\]\(([^)\s]+)(?:\s+["'][^"']*["'])?\)/g; + let match: RegExpExecArray | null; + + while ((match = linkRegex.exec(line)) !== null) { + const target = match[2]!; + if (!isLocalReference(target)) continue; + + const normalizedTarget = this.normalizeReference(target); + if (!normalizedTarget) continue; + + const column = match.index; + const label = stripInlineMarkdown(match[1] || target) || target; + const displayName = path.posix.basename(stripAnchor(normalizedTarget)) || label; + const nodeId = generateNodeId(this.filePath, 'import', `${normalizedTarget}:${lineNumber}:${column}`, lineNumber); + const node: Node = { + id: nodeId, + kind: 'import', + name: displayName, + qualifiedName: `${this.filePath}::link:${normalizedTarget}:${lineNumber}:${column}`, + filePath: this.filePath, + language: 'markdown', + signature: match[0], + docstring: label, + startLine: lineNumber, + endLine: lineNumber, + startColumn: column, + endColumn: column + match[0].length, + updatedAt: Date.now(), + }; + + this.nodes.push(node); + this.edges.push({ source: owner.id, target: nodeId, kind: 'contains', provenance: 'heuristic' }); + this.addReference(owner.id, normalizedTarget, 'imports', lineNumber, column); + } + } + + private extractPathMentions(line: string, lineNumber: number, owner: Node): void { + const pathRegex = /(?= best.level) best = heading; + } + } + return best?.node ?? null; + } + + private addReference( + fromNodeId: string, + referenceName: string, + referenceKind: EdgeKind, + line: number, + column: number + ): void { + const key = `${fromNodeId}:${referenceKind}:${referenceName}:${line}:${column}`; + if (this.referenceKeys.has(key)) return; + this.referenceKeys.add(key); + this.unresolvedReferences.push({ + fromNodeId, + referenceName, + referenceKind, + line, + column, + filePath: this.filePath, + language: 'markdown', + }); + } + + private normalizeReference(target: string): string | null { + const [pathAndSymbol, anchorPart] = splitAnchor(target.trim()); + const [pathPart, symbolPart] = splitFileSymbol(pathAndSymbol); + const cleanPath = decodePath(pathPart.split(/[?#]/)[0] ?? ''); + const anchor = anchorPart ? `#${slugifyHeading(anchorPart)}` : ''; + + if (!cleanPath && anchor) { + return `${this.filePath}${anchor}`; + } + if (!cleanPath) return null; + + const withoutLeadingSlash = cleanPath.startsWith('/') ? cleanPath.slice(1) : cleanPath; + const baseDir = path.posix.dirname(this.filePath); + const normalized = cleanPath.startsWith('/') || baseDir === '.' + ? path.posix.normalize(withoutLeadingSlash) + : path.posix.normalize(path.posix.join(baseDir, withoutLeadingSlash)); + + if (normalized.startsWith('../') || normalized === '..') return null; + return `${normalized}${symbolPart ? `::${symbolPart}` : ''}${anchor}`; + } + + private buildDocstring(startLine: number, endLine: number): string | undefined { + const text = this.lines + .slice(Math.max(0, startLine - 1), Math.max(0, endLine)) + .map((line) => stripInlineMarkdown(line.replace(/^#{1,6}\s+/, '').trim())) + .filter((line) => line && !/^(```|~~~)/.test(line)) + .join('\n') + .trim(); + + if (!text) return undefined; + return text.length > 600 ? `${text.slice(0, 600)}...` : text; + } +} + +function normalizeRelativePath(filePath: string): string { + return filePath.replace(/\\/g, '/'); +} + +function stripInlineMarkdown(value: string): string { + const codeSpans: string[] = []; + const withPlaceholders = value.replace(/`([^`]+)`/g, (_match, code: string) => { + const token = `\uE000${codeSpans.length}\uE001`; + codeSpans.push(code); + return token; + }); + + return withPlaceholders + .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') + .replace(/[*~]/g, '') + .replace(/\uE000(\d+)\uE001/g, (_match, index: string) => codeSpans[Number(index)] ?? '') + .trim(); +} + +function slugifyHeading(value: string): string { + const slug = stripInlineMarkdown(value) + .toLowerCase() + .replace(/<[^>]+>/g, '') + .replace(/[^\p{L}\p{N}\s-]/gu, '') + .trim() + .replace(/\s+/g, '-'); + return slug || 'section'; +} + +function isLocalReference(target: string): boolean { + const lower = target.toLowerCase(); + return !( + lower.startsWith('http://') || + lower.startsWith('https://') || + lower.startsWith('mailto:') || + lower.startsWith('tel:') || + lower.startsWith('data:') + ); +} + +function splitAnchor(target: string): [string, string] { + const hash = target.indexOf('#'); + if (hash < 0) return [target, '']; + return [target.slice(0, hash), target.slice(hash + 1)]; +} + +function stripAnchor(target: string): string { + return splitAnchor(target)[0]; +} + +function splitFileSymbol(target: string): [string, string] { + const marker = target.indexOf('::'); + if (marker < 0) return [target, '']; + return [target.slice(0, marker), target.slice(marker + 2)]; +} + +function decodePath(value: string): string { + try { + return decodeURIComponent(value).replace(/\\/g, '/'); + } catch { + return value.replace(/\\/g, '/'); + } +} + +function isShellLikeFence(language: string): boolean { + return [ + '', + 'sh', + 'shell', + 'bash', + 'zsh', + 'fish', + 'console', + 'terminal', + 'powershell', + 'ps1', + 'pwsh', + 'cmd', + 'bat', + ].includes(language); +} + +function isTableHeaderLine(line: string): boolean { + return isTableRowLine(line) && parseTableCells(line).some(Boolean); +} + +function isTableSeparatorLine(line: string): boolean { + const cells = parseTableCells(line); + return cells.length >= 2 && cells.every((cell) => /^:?-{3,}:?$/.test(cell.trim())); +} + +function isTableRowLine(line: string): boolean { + return line.trim().startsWith('|') && line.includes('|'); +} + +function parseTableCells(line: string): string[] { + const trimmed = line.trim(); + const withoutOuter = trimmed.startsWith('|') && trimmed.endsWith('|') + ? trimmed.slice(1, -1) + : trimmed; + const cells: string[] = []; + let current = ''; + let inCode = false; + + for (let i = 0; i < withoutOuter.length; i++) { + const char = withoutOuter[i]!; + if (char === '`') { + inCode = !inCode; + current += char; + continue; + } + if (char === '|' && !inCode) { + cells.push(current.trim()); + current = ''; + continue; + } + current += char; + } + + cells.push(current.trim()); + return cells; +} + +function shouldIndexTableRow(headers: string[], cells: string[]): boolean { + const text = `${headers.join(' ')} ${cells.join(' ')}`; + return Boolean( + stableIdentifier(text) || + /(?:^|[\s`])(?:python|python3|py|node|tsx|ts-node|deno|bun|ruby|bash|sh|pwsh|powershell)\s+/i.test(text) || + /[\w.-]+\.(?:md|markdown|mdx|ts|tsx|js|mjs|cjs|py|sh|ps1|rb|go|rs|java|cs|php|swift|kt|kts|dart|vue|svelte|liquid|ya?ml|jsonc?|csv|toml)(?:[#:]|[\s`]|$)/i.test(text) + ); +} + +function shouldIndexListItem(text: string): boolean { + return Boolean( + stableIdentifier(text) || + /[\w.-]+\.(?:md|markdown|mdx|ts|tsx|js|mjs|cjs|py|sh|ps1|rb|go|rs|java|cs|php|swift|kt|kts|dart|vue|svelte|liquid|ya?ml|jsonc?|csv|toml)::[A-Za-z_$][\w$]*/.test(text) + ); +} + +function stableIdentifier(text: string): string | null { + const matches = text.match(/\b[A-Z][A-Z0-9]{1,}(?:-[A-Z0-9]+)+\b|\b[A-Z]{2,}[0-9]+(?:-[A-Z0-9]+)*\b/g); + return matches?.[0] ?? null; +} + +function firstNonEmptyCell(cells: string[]): string | null { + for (const cell of cells) { + const stripped = stripInlineMarkdown(cell); + if (stripped) return stripped; + } + return null; +} + +function summarizeTableRow(headers: string[], cells: string[]): string { + const parts: string[] = []; + for (let i = 0; i < Math.min(headers.length, cells.length); i++) { + const header = stripInlineMarkdown(headers[i] ?? ''); + const value = stripInlineMarkdown(cells[i] ?? ''); + if (!header || !value) continue; + parts.push(`${header}: ${value}`); + } + return `table row: ${truncateForSignature(parts.join('; '), 240)}`; +} + +function truncateForName(text: string, max: number): string { + const stripped = stripInlineMarkdown(text).replace(/\s+/g, ' ').trim(); + if (stripped.length <= max) return stripped; + return stripped.slice(0, max - 1).trimEnd(); +} + +function truncateForSignature(text: string, max: number): string { + const stripped = stripInlineMarkdown(text).replace(/\s+/g, ' ').trim(); + if (stripped.length <= max) return stripped; + return `${stripped.slice(0, max - 3).trimEnd()}...`; +} + +function extractCodeSpans(text: string): string[] { + const spans: string[] = []; + const regex = /`([^`]+)`/g; + let match: RegExpExecArray | null; + while ((match = regex.exec(text)) !== null) { + if (match[1]) spans.push(match[1]); + } + return spans; +} + +function extractCommandFromText(text: string): string | null { + const command = normalizeCommandLine(stripInlineMarkdown(text)); + if (!command) return null; + return extractScriptPath(command) || + /^(?:python|python3|py|node|tsx|ts-node|deno|bun|ruby|bash|sh|pwsh|powershell)\s+["']?\{script_path\}["']?/i.test(command) || + /^(?:npm|pnpm|yarn)\s+(?:run\s+)?[\w:.-]+/.test(command) + ? command + : null; +} + +function splitSymbolList(value: string): string[] { + return value + .split(/[,\s]+/) + .map((part) => part.trim().replace(/^::/, '')) + .filter((part) => /^[A-Za-z_$][\w$]*(?:[./][A-Za-z_$][\w$]*)*$/.test(part)); +} + +function looksLikeCodeSymbol(value: string): boolean { + if (!value || value.length > 140 || /\s/.test(value)) return false; + if (/^(?:true|false|null|undefined|none|yes|no)$/i.test(value)) return false; + if (/^[A-Z0-9_-]+$/.test(value) && value.includes('-')) return false; + return /^[A-Za-z_$][\w$]*(?:(?:\.|::)[A-Za-z_$][\w$]*)+$/.test(value); +} + +function normalizeCommandLine(line: string): string | null { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith('#') || trimmed.startsWith('//')) return null; + + const withoutPrompt = trimmed + .replace(/^(?:\$|>)\s+/, '') + .replace(/^PS\s+[A-Z]:\\[^>]*>\s*/i, '') + .replace(/^PS>\s*/i, ''); + + if (!withoutPrompt || /^(cd|echo|export|set)\b/.test(withoutPrompt)) return null; + return withoutPrompt; +} + +function commandName(command: string): string { + const npm = /^(?:npm|pnpm|yarn)\s+(?:run\s+)?([\w:.-]+)/.exec(command); + if (npm) return `${command.split(/\s+/)[0]} ${npm[1]}`; + + const script = extractScriptPath(command); + if (script) return path.posix.basename(script.replace(/\\/g, '/')); + + return command.split(/\s+/).slice(0, 3).join(' '); +} + +function extractScriptPath(command: string): string | null { + const patterns = [ + /^(?:node|tsx|ts-node|bun)\s+(?:run\s+)?(?:--[\w=-]+\s+)*(['"]?)([^'"\s]+\.(?:js|mjs|cjs|ts|tsx))\1/, + /^(?:deno)\s+run\s+(?:--[\w=-]+\s+)*(['"]?)([^'"\s]+\.(?:js|mjs|ts|tsx))\1/, + /^(?:python|python3|py|ruby|bash|sh|pwsh|powershell)\s+(?:-[\w-]+\s+)*(['"]?)([^'"\s]+\.(?:py|rb|sh|ps1))\1/, + ]; + + for (const pattern of patterns) { + const match = pattern.exec(command); + if (match?.[2]) return match[2]; + } + + return null; +} diff --git a/src/extraction/tree-sitter.ts b/src/extraction/tree-sitter.ts index 28022409..23e97be7 100644 --- a/src/extraction/tree-sitter.ts +++ b/src/extraction/tree-sitter.ts @@ -23,6 +23,7 @@ import { LiquidExtractor } from './liquid-extractor'; import { SvelteExtractor } from './svelte-extractor'; import { DfmExtractor } from './dfm-extractor'; import { VueExtractor } from './vue-extractor'; +import { MarkdownExtractor } from './markdown-extractor'; import { getAllFrameworkResolvers, getApplicableFrameworks, @@ -136,6 +137,7 @@ export class TreeSitterExtractor { private errors: ExtractionError[] = []; private extractor: LanguageExtractor | null = null; private nodeStack: string[] = []; // Stack of parent node IDs + private referenceKeys: Set = new Set(); private methodIndex: Map | null = null; // lookup key → node ID for Pascal defProc lookup constructor(filePath: string, source: string, language?: Language) { @@ -265,6 +267,8 @@ export class TreeSitterExtractor { if (handled) return; } + this.extractMarkdownPathReferencesFromStringNode(node); + // Pascal-specific AST handling if (this.language === 'pascal') { skipChildren = this.visitPascalNode(node); @@ -493,6 +497,103 @@ export class TreeSitterExtractor { }; } + /** + * Extract Markdown path references from code string literals. + * + * This creates code -> documentation edges such as: + * open("docs/guide.md#install") -> docs/guide.md#install + * fs.readFileSync("../README.md") -> README.md + */ + private extractMarkdownPathReferencesFromStringNode(node: SyntaxNode, ownerId?: string): void { + if (!isMarkdownPathStringNode(node.type)) return; + + const fromNodeId = ownerId ?? this.currentReferenceOwnerId(); + if (!fromNodeId) return; + + const text = getNodeText(node, this.source); + for (const candidate of extractMarkdownPathCandidates(text)) { + const normalized = this.normalizeMarkdownPathReference(candidate.referenceName); + if (!normalized) continue; + this.addReference( + fromNodeId, + normalized, + 'references', + node.startPosition.row + 1, + node.startPosition.column + candidate.column + ); + } + } + + private extractMarkdownPathReferencesFromSubtree(node: SyntaxNode | null | undefined, ownerId: string): void { + if (!node) return; + this.extractMarkdownPathReferencesFromStringNode(node, ownerId); + + for (let i = 0; i < node.namedChildCount; i++) { + const child = node.namedChild(i); + if (child) { + this.extractMarkdownPathReferencesFromSubtree(child, ownerId); + } + } + } + + private currentReferenceOwnerId(): string | null { + if (this.nodeStack.length > 0) { + return this.nodeStack[this.nodeStack.length - 1] ?? null; + } + return this.nodes.find((n) => n.kind === 'file')?.id ?? null; + } + + private addReference( + fromNodeId: string, + referenceName: string, + referenceKind: Edge['kind'], + line: number, + column: number + ): void { + const key = `${fromNodeId}:${referenceKind}:${referenceName}:${line}:${column}`; + if (this.referenceKeys.has(key)) return; + this.referenceKeys.add(key); + this.unresolvedReferences.push({ + fromNodeId, + referenceName, + referenceKind, + line, + column, + filePath: this.filePath, + language: this.language, + }); + } + + private normalizeMarkdownPathReference(referenceName: string): string | null { + const trimmed = referenceName.trim().replace(/\\/g, '/'); + if (!trimmed || /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed)) return null; + + const hashIndex = trimmed.indexOf('#'); + const pathPartWithQuery = hashIndex >= 0 ? trimmed.slice(0, hashIndex) : trimmed; + const anchor = hashIndex >= 0 ? trimmed.slice(hashIndex) : ''; + const queryIndex = pathPartWithQuery.indexOf('?'); + const rawPath = queryIndex >= 0 ? pathPartWithQuery.slice(0, queryIndex) : pathPartWithQuery; + const cleanPath = decodePath(rawPath); + + if (!/\.(md|mdx|markdown)$/i.test(cleanPath)) return null; + + const baseDir = path.posix.dirname(this.filePath.replace(/\\/g, '/')); + let normalizedPath: string; + if (cleanPath.startsWith('/')) { + normalizedPath = path.posix.normalize(cleanPath.replace(/^\/+/, '')); + } else if (cleanPath.startsWith('./') || cleanPath.startsWith('../')) { + normalizedPath = path.posix.normalize(path.posix.join(baseDir === '.' ? '' : baseDir, cleanPath)); + } else { + normalizedPath = path.posix.normalize(cleanPath); + } + + if (!normalizedPath || normalizedPath === '.' || normalizedPath === '..' || normalizedPath.startsWith('../')) { + return null; + } + + return `${normalizedPath}${anchor}`; + } + /** * Check if the current node stack indicates we are inside a class-like node * (class, struct, interface, trait). File nodes do not count as class-like. @@ -901,6 +1002,7 @@ export class TreeSitterExtractor { // decorator->target relationship for class properties too. if (propNode) { this.extractDecoratorsFor(node, propNode.id); + this.extractMarkdownPathReferencesFromSubtree(node, propNode.id); } } @@ -946,12 +1048,15 @@ export class TreeSitterExtractor { if (!nameNode) continue; const name = getNodeText(nameNode, this.source); const signature = typeText ? `${typeText} $${name}` : `$${name}`; - this.createNode('field', name, elem, { + const fieldNode = this.createNode('field', name, elem, { docstring, signature, visibility, isStatic, }); + if (fieldNode) { + this.extractMarkdownPathReferencesFromSubtree(elem, fieldNode.id); + } } return; } @@ -983,7 +1088,10 @@ export class TreeSitterExtractor { }); // Java/Kotlin annotations / TS field decorators sit on the // outer field_declaration, not on the individual declarator. - if (fieldNode) this.extractDecoratorsFor(node, fieldNode.id); + if (fieldNode) { + this.extractDecoratorsFor(node, fieldNode.id); + this.extractMarkdownPathReferencesFromSubtree(decl, fieldNode.id); + } } } else { // Fallback: try to find an identifier child directly @@ -991,11 +1099,14 @@ export class TreeSitterExtractor { || node.namedChildren.find(c => c.type === 'identifier'); if (nameNode) { const name = getNodeText(nameNode, this.source); - this.createNode('field', name, node, { + const fieldNode = this.createNode('field', name, node, { docstring, visibility, isStatic, }); + if (fieldNode) { + this.extractMarkdownPathReferencesFromSubtree(node, fieldNode.id); + } } } } @@ -1056,6 +1167,7 @@ export class TreeSitterExtractor { // Extract type annotation references (e.g., const x: ITextModel = ...) if (varNode) { this.extractVariableTypeAnnotation(child, varNode.id); + this.extractMarkdownPathReferencesFromSubtree(valueNode, varNode.id); } } } @@ -1072,10 +1184,13 @@ export class TreeSitterExtractor { const initValue = right ? getNodeText(right, this.source).slice(0, 100) : undefined; const initSignature = initValue ? `= ${initValue}${initValue.length >= 100 ? '...' : ''}` : undefined; - this.createNode(kind, name, node, { + const varNode = this.createNode(kind, name, node, { docstring, signature: initSignature, }); + if (varNode) { + this.extractMarkdownPathReferencesFromSubtree(right, varNode.id); + } } } else if (this.language === 'go') { // Go: var_declaration, short_var_declaration, const_declaration @@ -1092,10 +1207,13 @@ export class TreeSitterExtractor { const initValue = valueNode ? getNodeText(valueNode, this.source).slice(0, 100) : undefined; const initSignature = initValue ? `= ${initValue}${initValue.length >= 100 ? '...' : ''}` : undefined; - this.createNode(node.type === 'const_declaration' ? 'constant' : 'variable', name, spec, { + const varNode = this.createNode(node.type === 'const_declaration' ? 'constant' : 'variable', name, spec, { docstring, signature: initSignature, }); + if (varNode) { + this.extractMarkdownPathReferencesFromSubtree(valueNode, varNode.id); + } } } @@ -1115,10 +1233,13 @@ export class TreeSitterExtractor { const initValue = right ? getNodeText(right, this.source).slice(0, 100) : undefined; const initSignature = initValue ? `= ${initValue}${initValue.length >= 100 ? '...' : ''}` : undefined; - this.createNode('variable', name, node, { + const varNode = this.createNode('variable', name, node, { docstring, signature: initSignature, }); + if (varNode) { + this.extractMarkdownPathReferencesFromSubtree(right, varNode.id); + } } } } @@ -1137,7 +1258,10 @@ export class TreeSitterExtractor { const valueNode = values[i]; const initValue = valueNode ? getNodeText(valueNode, this.source).slice(0, 100) : undefined; const initSignature = initValue ? `= ${initValue}${initValue.length >= 100 ? '...' : ''}` : undefined; - this.createNode(kind, name, nameNode, { docstring, signature: initSignature, isExported }); + const varNode = this.createNode(kind, name, nameNode, { docstring, signature: initSignature, isExported }); + if (varNode) { + this.extractMarkdownPathReferencesFromSubtree(valueNode, varNode.id); + } }); } else { // Generic fallback for other languages @@ -1150,10 +1274,13 @@ export class TreeSitterExtractor { : extractName(child, this.source, this.extractor); if (name && name !== '') { - this.createNode(kind, name, child, { + const varNode = this.createNode(kind, name, child, { docstring, isExported, }); + if (varNode) { + this.extractMarkdownPathReferencesFromSubtree(child, varNode.id); + } } } } @@ -1678,6 +1805,8 @@ export class TreeSitterExtractor { } } + this.extractMarkdownPathReferencesFromStringNode(node); + // Extract structural nodes found inside function bodies. // Each extract method visits its own children, so we return after extracting. if (this.extractor!.classTypes.includes(nodeType)) { @@ -2504,6 +2633,48 @@ export class TreeSitterExtractor { } } +const MARKDOWN_PATH_STRING_NODE_TYPES = new Set([ + 'string', + 'string_literal', + 'template_string', + 'raw_string_literal', + 'interpreted_string_literal', + 'interpolated_string_expression', +]); + +function isMarkdownPathStringNode(nodeType: string): boolean { + return MARKDOWN_PATH_STRING_NODE_TYPES.has(nodeType); +} + +function extractMarkdownPathCandidates(text: string): Array<{ referenceName: string; column: number }> { + const candidates: Array<{ referenceName: string; column: number }> = []; + const pattern = /((?:\.{1,2}[\\/]+|[A-Za-z0-9_.@-]+[\\/]+|[\\/]+)?(?:[A-Za-z0-9_.@-]+[\\/]+)*[A-Za-z0-9_.@-]+\.(?:md|mdx|markdown)(?:\?[^'"`\s)>,;]*)?(?:#[^'"`\s)>,;]*)?)/gi; + let match: RegExpExecArray | null; + + while ((match = pattern.exec(text)) !== null) { + const referenceName = match[1]; + if (!referenceName) continue; + + const prefix = text.slice(Math.max(0, match.index - 16), match.index); + if (/:\/{2}$/i.test(prefix)) continue; + + candidates.push({ + referenceName, + column: match.index, + }); + } + + return candidates; +} + +function decodePath(value: string): string { + try { + return decodeURIComponent(value); + } catch { + return value; + } +} + /** * Extract nodes and edges from source code. @@ -2535,6 +2706,10 @@ export function extractFromSource( // Use custom extractor for Liquid const extractor = new LiquidExtractor(filePath, source); result = extractor.extract(); + } else if (detectedLanguage === 'markdown') { + // Use custom extractor for Markdown documentation structure and references + const extractor = new MarkdownExtractor(filePath, source); + result = extractor.extract(); } else if (detectedLanguage === 'yaml' || detectedLanguage === 'twig') { // No symbol extraction — file is tracked at the file-record level only. // Framework extractors (e.g. Drupal routing resolver) run below and may diff --git a/src/resolution/index.ts b/src/resolution/index.ts index 2ae85ccb..e12a207c 100644 --- a/src/resolution/index.ts +++ b/src/resolution/index.ts @@ -428,9 +428,11 @@ export class ReferenceResolver { */ private hasAnyPossibleMatch(name: string): boolean { if (!this.knownNames) return true; // no pre-filter available + const pathName = name.replace(/\\/g, '/').split('#')[0] ?? name; // Direct name match if (this.knownNames.has(name)) return true; + if (pathName !== name && this.knownNames.has(pathName)) return true; // For qualified names like "obj.method" or "Class::method", check the parts const dotIdx = name.indexOf('.'); @@ -450,11 +452,12 @@ export class ReferenceResolver { } // For path-like references (e.g., "snippets/drawer-menu.liquid"), check the filename - const slashIdx = name.lastIndexOf('/'); + const slashIdx = pathName.lastIndexOf('/'); if (slashIdx > 0) { - const fileName = name.substring(slashIdx + 1); + const fileName = pathName.substring(slashIdx + 1); if (this.knownNames.has(fileName)) return true; } + if (slashIdx < 0 && /\.[A-Za-z0-9]+$/.test(pathName) && this.knownNames.has(pathName)) return true; return false; } diff --git a/src/resolution/name-matcher.ts b/src/resolution/name-matcher.ts index 997a4437..23b29ba2 100644 --- a/src/resolution/name-matcher.ts +++ b/src/resolution/name-matcher.ts @@ -15,10 +15,13 @@ export function matchByFilePath( ref: UnresolvedRef, context: ResolutionContext ): ResolvedRef | null { - if (!ref.referenceName.includes('/')) return null; + const normalizedName = ref.referenceName.replace(/\\/g, '/'); + const [pathAndSymbol, anchor] = splitAnchor(normalizedName); + const [pathWithoutAnchor, symbolName] = splitFileSymbol(pathAndSymbol); + if (!/\.[A-Za-z0-9]+$/.test(pathWithoutAnchor)) return null; // Extract the filename from the path - const fileName = ref.referenceName.split('/').pop(); + const fileName = pathWithoutAnchor.split('/').pop(); if (!fileName) return null; // Search for file nodes with this name @@ -27,8 +30,32 @@ export function matchByFilePath( if (fileNodes.length === 0) return null; + if (symbolName) { + const symbol = findSymbolInReferencedFile(pathWithoutAnchor, symbolName, fileNodes, context); + if (symbol) { + return { + original: ref, + targetNodeId: symbol.id, + confidence: 0.99, + resolvedBy: 'file-path', + }; + } + } + + if (anchor) { + const anchored = findAnchoredMarkdownSection(pathWithoutAnchor, anchor, fileNodes, context); + if (anchored) { + return { + original: ref, + targetNodeId: anchored.id, + confidence: 0.98, + resolvedBy: 'file-path', + }; + } + } + // Prefer exact path match on qualified_name - const exactMatch = fileNodes.find(n => n.qualifiedName === ref.referenceName || n.filePath === ref.referenceName); + const exactMatch = fileNodes.find(n => n.qualifiedName === pathWithoutAnchor || n.filePath === pathWithoutAnchor); if (exactMatch) { return { original: ref, @@ -39,7 +66,7 @@ export function matchByFilePath( } // Fall back to suffix match (e.g., ref="snippets/foo.liquid" matches "src/snippets/foo.liquid") - const suffixMatch = fileNodes.find(n => n.qualifiedName.endsWith(ref.referenceName) || n.filePath.endsWith(ref.referenceName)); + const suffixMatch = fileNodes.find(n => n.qualifiedName.endsWith(pathWithoutAnchor) || n.filePath.endsWith(pathWithoutAnchor)); if (suffixMatch) { return { original: ref, @@ -62,6 +89,97 @@ export function matchByFilePath( return null; } +function splitAnchor(referenceName: string): [string, string | null] { + const hash = referenceName.indexOf('#'); + if (hash < 0) return [referenceName, null]; + return [referenceName.slice(0, hash), referenceName.slice(hash + 1)]; +} + +function splitFileSymbol(referenceName: string): [string, string | null] { + const marker = referenceName.indexOf('::'); + if (marker < 0) return [referenceName, null]; + return [referenceName.slice(0, marker), referenceName.slice(marker + 2)]; +} + +function findSymbolInReferencedFile( + pathWithoutAnchor: string, + symbolName: string, + fileNodes: Node[], + context: ResolutionContext +): Node | null { + const candidateFiles = fileNodes.filter( + (n) => n.qualifiedName === pathWithoutAnchor || n.filePath === pathWithoutAnchor || + n.qualifiedName.endsWith(pathWithoutAnchor) || n.filePath.endsWith(pathWithoutAnchor) + ); + const filesToSearch = candidateFiles.length > 0 ? candidateFiles : fileNodes.length === 1 ? fileNodes : []; + + for (const fileNode of filesToSearch) { + const nodesInFile = context.getNodesInFile(fileNode.filePath); + const normalizedSymbol = symbolName.replace(/\//g, '.'); + const exact = nodesInFile.find((n) => + n.name === normalizedSymbol || + n.qualifiedName === `${fileNode.filePath}::${normalizedSymbol}` || + n.qualifiedName.endsWith(`::${normalizedSymbol}`) || + n.qualifiedName.endsWith(`.${normalizedSymbol}`) + ); + if (exact) return exact; + + const lastPart = normalizedSymbol.split(/[.:]/).pop(); + if (!lastPart) continue; + const byLastPart = nodesInFile.find((n) => + n.name === lastPart && + (n.kind === 'function' || n.kind === 'method' || n.kind === 'class' || n.kind === 'module' || + n.kind === 'constant' || n.kind === 'variable') + ); + if (byLastPart) return byLastPart; + } + + return null; +} + +function findAnchoredMarkdownSection( + pathWithoutAnchor: string, + anchor: string, + fileNodes: Node[], + context: ResolutionContext +): Node | null { + const normalizedAnchor = normalizeMarkdownAnchor(anchor); + const candidateFiles = fileNodes.filter( + (n) => n.qualifiedName === pathWithoutAnchor || n.filePath === pathWithoutAnchor || + n.qualifiedName.endsWith(pathWithoutAnchor) || n.filePath.endsWith(pathWithoutAnchor) + ); + + for (const fileNode of candidateFiles) { + const sectionQualifiedName = `${fileNode.filePath}#${normalizedAnchor}`; + const exact = context + .getNodesByQualifiedName(sectionQualifiedName) + .find((n) => n.kind === 'module' && n.language === 'markdown'); + if (exact) return exact; + + const section = context + .getNodesInFile(fileNode.filePath) + .find((n) => n.kind === 'module' && n.language === 'markdown' && n.qualifiedName === sectionQualifiedName); + if (section) return section; + } + + return null; +} + +function normalizeMarkdownAnchor(anchor: string): string { + try { + anchor = decodeURIComponent(anchor); + } catch { + // Keep the original anchor if it is not valid URI encoding. + } + + return anchor + .toLowerCase() + .replace(/<[^>]+>/g, '') + .replace(/[^\p{L}\p{N}\s-]/gu, '') + .trim() + .replace(/\s+/g, '-') || 'section'; +} + /** * Try to resolve a reference by exact name match */ diff --git a/src/types.ts b/src/types.ts index 0168665d..2e180f51 100644 --- a/src/types.ts +++ b/src/types.ts @@ -89,6 +89,7 @@ export const LANGUAGES = [ 'luau', 'yaml', 'twig', + 'markdown', 'unknown', ] as const;