From 156554e0d15125d45dd887fe33c49068723b4d41 Mon Sep 17 00:00:00 2001 From: aqilaziz Date: Wed, 6 May 2026 07:06:24 +0700 Subject: [PATCH] Fix OCR double period artifacts --- scripts/parser/ocr_correct.py | 3 +++ scripts/parser/test_ocr_correct.py | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 scripts/parser/test_ocr_correct.py diff --git a/scripts/parser/ocr_correct.py b/scripts/parser/ocr_correct.py index 6ff3465..13f724e 100644 --- a/scripts/parser/ocr_correct.py +++ b/scripts/parser/ocr_correct.py @@ -34,6 +34,9 @@ (re.compile(r'ff'), 'ff'), (re.compile(r'\u00a0'), ' '), # Non-breaking space → regular space + # Double periods from OCR/tokenization artifacts, while preserving ellipsis + (re.compile(r'(?