-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtext_utils.py
More file actions
29 lines (22 loc) · 972 Bytes
/
text_utils.py
File metadata and controls
29 lines (22 loc) · 972 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
"""
Text utility functions for cleaning and normalizing text data.
Provides functions to handle control characters, newlines, and Unicode normalization.
"""
# Standard library imports
import unicodedata
# The C0 control characters (code points 0-31) other than tab (9), line feed
# (10) and carriage return (13), followed by DEL (127).
CONTROL_CHARS = "".join(
    chr(code) for code in (*range(32), 127) if code not in (9, 10, 13)
)
# Deletion table for str.translate(): every character above maps to None.
CONTROL_CHAR_TABLE = str.maketrans(dict.fromkeys(CONTROL_CHARS))
def clean_text(text):
    """Return *text* with control characters removed and whitespace collapsed.

    Non-string values are returned unchanged.

    For strings, the steps are:

    1. Delete every character listed in ``CONTROL_CHAR_TABLE``. Tab, line
       feed and carriage return are not in that table, so they survive this
       step and still separate words in step 2 instead of gluing them
       together.
    2. Collapse each run of whitespace (spaces, tabs, newlines, carriage
       returns, ...) into a single space and drop leading and trailing
       whitespace.
    3. Normalize the result to Unicode NFC.

    Args:
        text: The value to clean. Only ``str`` instances are modified.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    # Strip control characters before splitting, so that they are deleted
    # outright rather than being treated as separators by str.split().
    stripped = text.translate(CONTROL_CHAR_TABLE)

    # str.split() with no separator splits on any run of whitespace --
    # including "\n", "\r", "\r\n" and "\t" -- and discards empty leading and
    # trailing pieces, so no separate newline-replacement pass is needed.
    collapsed = " ".join(stripped.split())

    return unicodedata.normalize("NFC", collapsed)