diff --git a/scripts/parser/ocr_correct.py b/scripts/parser/ocr_correct.py index 6ff3465..13f724e 100644 --- a/scripts/parser/ocr_correct.py +++ b/scripts/parser/ocr_correct.py @@ -34,6 +34,9 @@ (re.compile(r'ff'), 'ff'), (re.compile(r'\u00a0'), ' '), # Non-breaking space → regular space + # Double periods from OCR/tokenization artifacts, while preserving ellipsis + (re.compile(r'(?