Skip to content

Commit 32a900d

Browse files
Copilot and Mte90 authored
Fix tree-sitter-language-pack compatibility in llama_chunker (#18)
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1 parent 00014bb commit 32a900d

File tree

1 file changed

+21
-4
lines changed

1 file changed

+21
-4
lines changed

ai/llama_chunker.py

Lines changed: 21 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,11 @@
66
from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
77
from llama_index.core.schema import Document
88

9+
try:
10+
import tree_sitter_language_pack as tslp
11+
except ImportError:
12+
tslp = None
13+
914
from utils.logger import get_logger
1015

1116
logger = get_logger(__name__)
@@ -29,11 +34,13 @@ def chunk_with_llama_index(
2934
Returns:
3035
List of text chunks
3136
"""
32-
# Map language names to llama-index language identifiers
37+
# Map language names to tree-sitter-language-pack identifiers
3338
language_map = {
3439
"python": "python",
35-
"javascript": "js",
36-
"typescript": "ts",
40+
"javascript": "javascript",
41+
"js": "javascript",
42+
"typescript": "typescript",
43+
"ts": "typescript",
3744
"java": "java",
3845
"go": "go",
3946
"rust": "rust",
@@ -47,12 +54,22 @@ def chunk_with_llama_index(
4754
llama_lang = language_map.get(language.lower())
4855

4956
if llama_lang:
57+
# Create parser using tree_sitter_language_pack if available
58+
parser = None
59+
if tslp is not None:
60+
try:
61+
parser = tslp.get_parser(llama_lang)
62+
logger.debug(f"Created parser for language: {llama_lang}")
63+
except Exception as e:
64+
logger.warning(f"Could not create parser for {llama_lang}: {e}")
65+
5066
# Use CodeSplitter for code
5167
splitter = CodeSplitter(
5268
language=llama_lang,
5369
chunk_lines=40, # Target lines per chunk (approximation)
5470
chunk_lines_overlap=5, # Overlap in lines
55-
max_chars=chunk_size
71+
max_chars=chunk_size,
72+
parser=parser # Pass parser explicitly to avoid tree_sitter_languages dependency
5673
)
5774
logger.debug(f"Using CodeSplitter for language: {llama_lang}")
5875
else:

0 commit comments

Comments
 (0)