Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions scripts/parser/ocr_correct.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
(re.compile(r'ff'), 'ff'),
(re.compile(r'\u00a0'), ' '), # Non-breaking space → regular space

# Double periods from OCR/tokenization artifacts, while preserving ellipsis
(re.compile(r'(?<![.])\.{2}(?![.])'), '.'),

# Common scanner artifacts
(re.compile(r'^[;,.]$', re.MULTILINE), ''), # Lone punctuation on a line
(re.compile(r'^\s*[-_]{3,}\s*$', re.MULTILINE), ''), # Horizontal rules from scan lines
Expand Down
21 changes: 21 additions & 0 deletions scripts/parser/test_ocr_correct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""Tests for deterministic OCR correction patterns."""

from parser.ocr_correct import correct_ocr_errors


def test_double_periods_are_collapsed():
    """A sentence ending in exactly two periods comes back with one."""
    expected = "Setiap orang wajib mematuhi ketentuan."

    corrected = correct_ocr_errors("Setiap orang wajib mematuhi ketentuan..")

    assert corrected == expected


def test_double_periods_after_list_markers_are_collapsed():
    """Doubled periods after list markers ("i..", "a..") are collapsed on every line."""
    raw_lines = "i.. melakukan pengawasan;\na.. menyusun standar;"
    expected_lines = "i. melakukan pengawasan;\na. menyusun standar;"

    # Both markers sit on separate lines of one input string, so a single
    # call must fix each occurrence independently.
    assert correct_ocr_errors(raw_lines) == expected_lines


def test_ellipsis_is_preserved():
    """A three-period ellipsis must pass through unchanged."""
    with_ellipsis = "Ketentuan ini berlaku..."

    # Input and expected output are the same string: correction is a no-op here.
    result = correct_ocr_errors(with_ellipsis)

    assert result == "Ketentuan ini berlaku..."