Skip to content

Commit 93b40bf

Browse files
committed
GML-2040 Fix CJK bug
1 parent edda09a commit 93b40bf

5 files changed

Lines changed: 36 additions & 4 deletions

File tree

common/chunkers/markdown_chunker.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
from common.chunkers.base_chunker import BaseChunker
16+
from common.chunkers.separators import TEXT_SEPARATORS
1617
from langchain_text_splitters.markdown import ExperimentalMarkdownSyntaxTextSplitter
1718
from langchain.text_splitter import RecursiveCharacterTextSplitter
1819

@@ -34,7 +35,11 @@ def chunk(self, input_string):
3435
md_chunks = []
3536

3637
if self.chunk_size > 0:
37-
recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
38+
recursive_splitter = RecursiveCharacterTextSplitter(
39+
separators=TEXT_SEPARATORS,
40+
chunk_size=self.chunk_size,
41+
chunk_overlap=self.chunk_overlap,
42+
)
3843

3944
if any(len(chunk) > self.chunk_size for chunk in initial_chunks):
4045
for chunk in initial_chunks:

common/chunkers/recursive_chunker.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
from common.chunkers.base_chunker import BaseChunker
16+
from common.chunkers.separators import TEXT_SEPARATORS
1617
from langchain.text_splitter import RecursiveCharacterTextSplitter
1718

1819

@@ -23,7 +24,7 @@ def __init__(self, chunk_size=1024, overlap_size=0):
2324

2425
def chunk(self, input_string):
2526
text_splitter = RecursiveCharacterTextSplitter(
26-
separators=["\n\n", "\n", " ", ""],
27+
separators=TEXT_SEPARATORS,
2728
chunk_size=self.chunk_size,
2829
chunk_overlap=self.overlap_size,
2930
length_function=len

common/chunkers/separators.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Copyright (c) 2024-2026 TigerGraph, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
TEXT_SEPARATORS = [
16+
"\n\n",
17+
"\n",
18+
" ",
19+
"\u3002", # CJK full stop (。)
20+
"\uff0c", # CJK comma (,)
21+
"\u3001", # CJK enumeration comma (、)
22+
"\uff1b", # CJK semicolon (;)
23+
"\uff01", # CJK exclamation mark (!)
24+
"\uff1f", # CJK question mark (?)
25+
"",
26+
]

ecc/app/graphrag/graph_rag.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ async def stream_chunks(
101101
"StreamChunkContent",
102102
params={"chunk": c},
103103
)
104-
content = res[0]["ChunkContent"][0]["attributes"]["text"].encode('utf-8').decode('unicode_escape')
104+
content = res[0]["ChunkContent"][0]["attributes"]["text"].encode('raw_unicode_escape').decode('unicode_escape')
105105
logger.info("chunk writes to extract_chan")
106106
await extract_chan.put((content, c))
107107

ecc/app/graphrag/workers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ async def chunk_doc(
100100
# For images, get_chunker returns SingleChunker which preserves markdown image references
101101
chunker = ecc_util.get_chunker(chunker_type)
102102
# decode the text return from tigergraph as it was encoded when written into jsonl file for uploading
103-
chunks = chunker.chunk(doc["attributes"]["text"].encode('utf-8').decode('unicode_escape'))
103+
chunks = chunker.chunk(doc["attributes"]["text"].encode('raw_unicode_escape').decode('unicode_escape'))
104104

105105
logger.info(f"Chunking {v_id} into {len(chunks)} chunk(s)")
106106
for i, chunk in enumerate(chunks):

0 commit comments

Comments
 (0)