GML-2040 Fix CJK bug

chengbiao-jin · chengbiao-jin · commit 93b40bfe6dfb · 2026-03-04T11:14:17.000-08:00
diff --git a/common/chunkers/markdown_chunker.py b/common/chunkers/markdown_chunker.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from common.chunkers.base_chunker import BaseChunker
+from common.chunkers.separators import TEXT_SEPARATORS
 from langchain_text_splitters.markdown import ExperimentalMarkdownSyntaxTextSplitter
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
@@ -34,7 +35,11 @@ def chunk(self, input_string):
         md_chunks = []
 
         if self.chunk_size > 0:
-            recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+            recursive_splitter = RecursiveCharacterTextSplitter(
+                separators=TEXT_SEPARATORS,
+                chunk_size=self.chunk_size,
+                chunk_overlap=self.chunk_overlap,
+            )
 
             if any(len(chunk) > self.chunk_size for chunk in initial_chunks):
                 for chunk in initial_chunks:
diff --git a/common/chunkers/recursive_chunker.py b/common/chunkers/recursive_chunker.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from common.chunkers.base_chunker import BaseChunker
+from common.chunkers.separators import TEXT_SEPARATORS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
@@ -23,7 +24,7 @@ def __init__(self, chunk_size=1024, overlap_size=0):
 
     def chunk(self, input_string):
         text_splitter = RecursiveCharacterTextSplitter(
-            separators=["\n\n", "\n", " ", ""],
+            separators=TEXT_SEPARATORS,
             chunk_size=self.chunk_size,
             chunk_overlap=self.overlap_size,
             length_function=len
diff --git a/common/chunkers/separators.py b/common/chunkers/separators.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2024-2026 TigerGraph, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+TEXT_SEPARATORS = [  # separators= list shared by the markdown and recursive chunkers
+    "\n\n",  # two consecutive newlines
+    "\n",  # single newline
+    " ",  # ASCII space
+    "\u3002",  # CJK full stop (。)
+    "\uff0c",  # CJK comma (，)
+    "\u3001",  # CJK enumeration comma (、)
+    "\uff1b",  # CJK semicolon (；)
+    "\uff01",  # CJK exclamation mark (！)
+    "\uff1f",  # CJK question mark (？)
+    "",  # empty string, kept as the final entry as in the previous ["\n\n", "\n", " ", ""] list
+]
diff --git a/ecc/app/graphrag/graph_rag.py b/ecc/app/graphrag/graph_rag.py
@@ -101,7 +101,7 @@ async def stream_chunks(
                         "StreamChunkContent",
                         params={"chunk": c},
                     )
-                content = res[0]["ChunkContent"][0]["attributes"]["text"].encode('utf-8').decode('unicode_escape')
+                content = res[0]["ChunkContent"][0]["attributes"]["text"].encode('raw_unicode_escape').decode('unicode_escape')
                 logger.info("chunk writes to extract_chan")
                 await extract_chan.put((content, c))
 
diff --git a/ecc/app/graphrag/workers.py b/ecc/app/graphrag/workers.py
@@ -100,7 +100,7 @@ async def chunk_doc(
         # For images, get_chunker returns SingleChunker which preserves markdown image references
         chunker = ecc_util.get_chunker(chunker_type)
         # decode the text return from tigergraph as it was encoded when written into jsonl file for uploading
-        chunks = chunker.chunk(doc["attributes"]["text"].encode('utf-8').decode('unicode_escape'))
+        chunks = chunker.chunk(doc["attributes"]["text"].encode('raw_unicode_escape').decode('unicode_escape'))
        
         logger.info(f"Chunking {v_id} into {len(chunks)} chunk(s)")
         for i, chunk in enumerate(chunks):

Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,7 @@ async def stream_chunks(`
`101`	`101`	`"StreamChunkContent",`
`102`	`102`	`params={"chunk": c},`
`103`	`103`	`)`
`104`		`- content = res[0]["ChunkContent"][0]["attributes"]["text"].encode('utf-8').decode('unicode_escape')`
	`104`	`+ content = res[0]["ChunkContent"][0]["attributes"]["text"].encode('raw_unicode_escape').decode('unicode_escape')`
`105`	`105`	`logger.info("chunk writes to extract_chan")`
`106`	`106`	`await extract_chan.put((content, c))`
`107`	`107`