Skip to content

Commit b470709

Browse files
committed
feat: use tree-sitter symbols in generic analyzer
- extract symbol components via Tree-sitter when available, fall back to legacy parsing
- expand language/extension detection to cover more common languages
- sync pnpm lockfile for tree-sitter runtime deps
- update internal-docs submodule pointer and simplify releasing notes
1 parent 2e479e4 commit b470709

File tree

5 files changed

+95
-27
lines changed

5 files changed

+95
-27
lines changed

RELEASING.md

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,23 +8,6 @@ This repo publishes `codebase-context` to npm.
88
- Release Please bot opens/updates a release PR automatically
99
- When you merge the release PR, it publishes to npm with provenance
1010

11-
## Setup (one-time)
12-
13-
**1. Configure npm Trusted Publisher:**
14-
- Go to https://www.npmjs.com/package/codebase-context/access
15-
- Add GitHub Actions trusted publisher:
16-
- Organization: `PatrickSys`
17-
- Repository: `codebase-context`
18-
- Workflow: `release-please.yml`
19-
- Environment: (leave empty)
20-
21-
That's it! No tokens, no rotation, just OIDC.
22-
23-
**2. Allow Release Please to work:**
24-
- GitHub Settings > Actions > General
25-
- Enable "Read and write permissions"
26-
- Enable "Allow GitHub Actions to create and approve pull requests"
27-
2811
## Releasing
2912

3013
1. Merge PRs to master
@@ -42,6 +25,7 @@ npm view codebase-context@X.Y.Z --json | jq .dist.attestations
4225
## Troubleshooting
4326

4427
If publish fails, check:
28+
4529
- Node 24+ in workflow (required for npm Trusted Publishers)
4630
- `id-token: write` permission in workflow
4731
- `registry-url` is set in setup-node

internal-docs

pnpm-lock.yaml

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/analyzers/generic/index.ts

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import {
1818
} from '../../types/index.js';
1919
import { createChunksFromCode } from '../../utils/chunking.js';
2020
import { detectLanguage } from '../../utils/language-detection.js';
21+
import { extractTreeSitterSymbols, type TreeSitterSymbol } from '../../utils/tree-sitter.js';
2122
import {
2223
detectWorkspaceType,
2324
scanWorkspacePackageJsons,
@@ -100,28 +101,58 @@ export class GenericAnalyzer implements FrameworkAnalyzer {
100101
let components: CodeComponent[] = [];
101102
let imports: ImportStatement[] = [];
102103
let exports: ExportStatement[] = [];
104+
let treeSitterGrammar: string | undefined;
105+
let usesTreeSitterSymbols = false;
103106

104107
try {
108+
const treeSitterResult = await extractTreeSitterSymbols(content, language);
109+
if (treeSitterResult && treeSitterResult.symbols.length > 0) {
110+
components = this.convertTreeSitterSymbolsToComponents(treeSitterResult.symbols);
111+
treeSitterGrammar = treeSitterResult.grammarFile;
112+
usesTreeSitterSymbols = true;
113+
}
114+
105115
if (language === 'typescript' || language === 'javascript') {
106116
const parsed = await this.parseJSTSFile(filePath, content, language);
107-
components = parsed.components;
108117
imports = parsed.imports;
109118
exports = parsed.exports;
119+
120+
// Keep legacy parser as fallback if Tree-sitter produced nothing.
121+
if (components.length === 0) {
122+
components = parsed.components;
123+
usesTreeSitterSymbols = false;
124+
treeSitterGrammar = undefined;
125+
}
110126
} else {
111-
// For other languages, use basic line-based parsing
112-
components = this.parseGenericFile(content);
127+
// For other languages, use regex fallback if Tree-sitter produced nothing.
128+
if (components.length === 0) {
129+
components = this.parseGenericFile(content);
130+
}
113131
}
114132
} catch (error) {
115133
console.warn(`Failed to parse ${filePath}:`, error);
116134
}
117135

136+
const metadata: Record<string, any> = {
137+
analyzer: this.name,
138+
fileSize: content.length,
139+
lineCount: content.split('\n').length,
140+
chunkStrategy: usesTreeSitterSymbols ? 'tree-sitter-symbol' : 'line-or-component'
141+
};
142+
143+
if (usesTreeSitterSymbols && treeSitterGrammar) {
144+
metadata.treeSitterGrammar = treeSitterGrammar;
145+
metadata.symbolAware = true;
146+
}
147+
118148
// Create chunks
119149
const chunks = await createChunksFromCode(
120150
content,
121151
filePath,
122152
relativePath,
123153
language,
124-
components
154+
components,
155+
metadata
125156
);
126157

127158
return {
@@ -131,15 +162,27 @@ export class GenericAnalyzer implements FrameworkAnalyzer {
131162
imports,
132163
exports,
133164
dependencies: [],
134-
metadata: {
135-
analyzer: this.name,
136-
fileSize: content.length,
137-
lineCount: content.split('\n').length
138-
},
165+
metadata,
139166
chunks
140167
};
141168
}
142169

170+
private convertTreeSitterSymbolsToComponents(symbols: TreeSitterSymbol[]): CodeComponent[] {
171+
return symbols.map((symbol) => ({
172+
name: symbol.name,
173+
type: symbol.kind,
174+
componentType: symbol.kind,
175+
startLine: symbol.startLine,
176+
endLine: symbol.endLine,
177+
metadata: {
178+
extraction: 'tree-sitter',
179+
nodeType: symbol.nodeType,
180+
startIndex: symbol.startIndex,
181+
endIndex: symbol.endIndex
182+
}
183+
}));
184+
}
185+
143186
async detectCodebaseMetadata(rootPath: string): Promise<CodebaseMetadata> {
144187
let projectName = path.basename(rootPath);
145188
let dependencies: Dependency[] = [];

src/utils/language-detection.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,21 @@ const extensionToLanguage: Record<string, string> = {
4646
'.zsh': 'shellscript',
4747
'.ps1': 'powershell',
4848
'.py': 'python',
49+
'.pyi': 'python',
4950
'.rb': 'ruby',
51+
'.php': 'php',
5052
'.java': 'java',
53+
'.kt': 'kotlin',
54+
'.kts': 'kotlin',
5155
'.go': 'go',
5256
'.rs': 'rust',
57+
'.cs': 'csharp',
58+
'.swift': 'swift',
59+
'.scala': 'scala',
5360
'.c': 'c',
5461
'.cpp': 'cpp',
62+
'.cc': 'cpp',
63+
'.cxx': 'cpp',
5564
'.h': 'c',
5665
'.hpp': 'cpp'
5766
};
@@ -121,13 +130,24 @@ const codeExtensions = new Set([
121130
'.mdx',
122131
'.graphql',
123132
'.gql',
133+
'.toml',
134+
'.xml',
124135
'.py',
136+
'.pyi',
125137
'.rb',
138+
'.php',
126139
'.java',
140+
'.kt',
141+
'.kts',
127142
'.go',
128143
'.rs',
144+
'.cs',
145+
'.swift',
146+
'.scala',
129147
'.c',
130148
'.cpp',
149+
'.cc',
150+
'.cxx',
131151
'.h',
132152
'.hpp'
133153
]);

0 commit comments

Comments
 (0)