Skip to content

Commit 679ef3c

Browse files
committed
✨ Add support for new filetypes
1 parent 0fae592 commit 679ef3c

1 file changed

Lines changed: 19 additions & 5 deletions

File tree

sdk/nexent/data_process/json_chunk_processor.py

Lines changed: 19 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,12 @@ def dump(v): return orjson.dumps(v).decode("utf-8")
8282
def _split_plain(self, text: str) -> List[str]:
8383
"""
8484
Split plain text by max length, preferring punctuation boundaries.
85-
Avoid cutting inside escape sequences (e.g., don't cut after a lone backslash).
85+
86+
Args:
87+
text: Input text
88+
89+
Returns:
90+
List of text chunks
8691
"""
8792
out: List[str] = []
8893
all_punct = set(string.punctuation)
@@ -118,7 +123,12 @@ def _split_plain(self, text: str) -> List[str]:
118123
def _split_json_text(self, text: str) -> List[str]:
119124
"""
120125
Split JSON-derived text while preserving top-level key-value integrity.
121-
Falls back to plain splitting if no safe top-level boundary is found.
126+
127+
Args:
128+
text: JSON-derived string
129+
130+
Returns:
131+
List of text chunks
122132
"""
123133
out: List[str] = []
124134
cur = text
@@ -140,10 +150,14 @@ def _split_json_text(self, text: str) -> List[str]:
140150

141151
def _find_last_top_kv(self, text: str, max_len: int) -> int | None:
142152
"""
143-
Find the rightmost position <= max_len where a top-level key-value pair ends.
153+
Find the split position of the last top-level key-value pair.
144154
145-
A top-level KV ends at a comma when depth == 1 and outside any string.
146-
Additionally, ensures the cut does not leave an unescaped backslash at end.
155+
Args:
156+
text: JSON substring (prefix)
157+
158+
Returns:
159+
Index after the last complete top-level KV pair,
160+
or None if no safe split point exists.
147161
"""
148162
depth = 0
149163
in_str = False

0 commit comments

Comments
 (0)