File tree Expand file tree Collapse file tree 1 file changed +21
-4
lines changed
Expand file tree Collapse file tree 1 file changed +21
-4
lines changed Original file line number Diff line number Diff line change 66from llama_index .core .node_parser import CodeSplitter , SentenceSplitter
77from llama_index .core .schema import Document
88
9+ try :
10+ import tree_sitter_language_pack as tslp
11+ except ImportError :
12+ tslp = None
13+
914from utils .logger import get_logger
1015
1116logger = get_logger (__name__ )
@@ -29,11 +34,13 @@ def chunk_with_llama_index(
2934 Returns:
3035 List of text chunks
3136 """
32- # Map language names to llama-index language identifiers
37+ # Map language names to tree-sitter-language-pack identifiers
3338 language_map = {
3439 "python" : "python" ,
35- "javascript" : "js" ,
36- "typescript" : "ts" ,
40+ "javascript" : "javascript" ,
41+ "js" : "javascript" ,
42+ "typescript" : "typescript" ,
43+ "ts" : "typescript" ,
3744 "java" : "java" ,
3845 "go" : "go" ,
3946 "rust" : "rust" ,
@@ -47,12 +54,22 @@ def chunk_with_llama_index(
4754 llama_lang = language_map .get (language .lower ())
4855
4956 if llama_lang :
57+ # Create parser using tree_sitter_language_pack if available
58+ parser = None
59+ if tslp is not None :
60+ try :
61+ parser = tslp .get_parser (llama_lang )
62+ logger .debug (f"Created parser for language: { llama_lang } " )
63+ except Exception as e :
64+ logger .warning (f"Could not create parser for { llama_lang } : { e } " )
65+
5066 # Use CodeSplitter for code
5167 splitter = CodeSplitter (
5268 language = llama_lang ,
5369 chunk_lines = 40 , # Target lines per chunk (approximation)
5470 chunk_lines_overlap = 5 , # Overlap in lines
55- max_chars = chunk_size
71+ max_chars = chunk_size ,
72+ parser = parser # Pass parser explicitly to avoid tree_sitter_languages dependency
5673 )
5774 logger .debug (f"Using CodeSplitter for language: { llama_lang } " )
5875 else :
You can’t perform that action at this time.
0 commit comments