From 60c622e8978734ae109020e669406e616ad25d8c Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Fri, 17 Apr 2026 16:14:29 +0200 Subject: [PATCH 1/4] feat: add operators, nb normalizer and replacements --- normalization/languages/__init__.py | 12 +- normalization/languages/dutch/__init__.py | 7 + .../languages/dutch/number_normalizer.py | 455 ++++++++++++++++++ normalization/languages/dutch/operators.py | 111 +++++ normalization/languages/dutch/replacements.py | 28 ++ .../languages/dutch/sentence_replacements.py | 9 + 6 files changed, 620 insertions(+), 2 deletions(-) create mode 100644 normalization/languages/dutch/__init__.py create mode 100644 normalization/languages/dutch/number_normalizer.py create mode 100644 normalization/languages/dutch/operators.py create mode 100644 normalization/languages/dutch/replacements.py create mode 100644 normalization/languages/dutch/sentence_replacements.py diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py index 18e07a6..905c82f 100644 --- a/normalization/languages/__init__.py +++ b/normalization/languages/__init__.py @@ -1,7 +1,15 @@ -from . import english, french, german, italian, spanish +from . 
import dutch, english, french, german, italian, spanish from .base import LanguageOperators from .registry import get_language_registry, register_language register_language(LanguageOperators) -__all__ = ["english", "french", "german", "italian", "spanish", "get_language_registry"] +__all__ = [ + "dutch", + "english", + "french", + "german", + "italian", + "spanish", + "get_language_registry", +] diff --git a/normalization/languages/dutch/__init__.py b/normalization/languages/dutch/__init__.py new file mode 100644 index 0000000..8be3be6 --- /dev/null +++ b/normalization/languages/dutch/__init__.py @@ -0,0 +1,7 @@ +from .operators import DutchOperators +from .replacements import DUTCH_REPLACEMENTS + +__all__ = [ + "DutchOperators", + "DUTCH_REPLACEMENTS", +] diff --git a/normalization/languages/dutch/number_normalizer.py b/normalization/languages/dutch/number_normalizer.py new file mode 100644 index 0000000..9df42a5 --- /dev/null +++ b/normalization/languages/dutch/number_normalizer.py @@ -0,0 +1,455 @@ +import re +from fractions import Fraction +from typing import Iterator, Match + +""" +Dutch number normalizer: spelled-out numbers to digits. + +- Dutch compound order: ones + "en" + tens (e.g. "een en twintig" -> 21). +- Vocabulary: nul, een, twee, ..., tien, elf, twaalf, ..., twintig, dertig, ... +- Multipliers: honderd, duizend, miljoen, miljard, biljoen. +- Handles currency (euro, dollar, pond, cent), percent (procent), and decimal (komma). +- Currency output follows Dutch word order: amount then unit (e.g. €10 and "tien euro" -> "10 euros"). 
+""" + + +class DutchNumberNormalizer: + def __init__(self, currency_symbol_to_word: dict[str, str] | None = None): + self.zeros = {"nul"} + self.ones: dict[str, int] = { + name: i + for i, name in enumerate( + [ + "een", + "twee", + "drie", + "vier", + "vijf", + "zes", + "zeven", + "acht", + "negen", + "tien", + "elf", + "twaalf", + "dertien", + "veertien", + "vijftien", + "zestien", + "zeventien", + "achttien", + "negentien", + ], + start=1, + ) + } + self.ones_plural = {} + self.ones_ordinal = { + "eerste": (1, "e"), + "tweede": (2, "e"), + "derde": (3, "e"), + "vierde": (4, "e"), + "vijfde": (5, "e"), + "zesde": (6, "e"), + "zevende": (7, "e"), + "achtste": (8, "e"), + "negende": (9, "e"), + "tiende": (10, "e"), + **{ + name + "de": (value, "e") + for name, value in self.ones.items() + if value > 10 and value < 20 + }, + } + self.ones_suffixed: dict[str, tuple[int, str]] = { + **self.ones_plural, + **self.ones_ordinal, + } + + self.tens = { + "twintig": 20, + "dertig": 30, + "veertig": 40, + "vijftig": 50, + "zestig": 60, + "zeventig": 70, + "tachtig": 80, + "negentig": 90, + } + self.tens_plural = { + name + "en": (value, "en") for name, value in self.tens.items() + } + self.tens_ordinal = { + name + "ste": (value, "e") for name, value in self.tens.items() + } + self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal} + + self.multipliers = { + "honderd": 100, + "duizend": 1_000, + "miljoen": 1_000_000, + "miljard": 1_000_000_000, + "biljoen": 1_000_000_000_000, + } + self.multipliers_plural = { + name + "en": (value, "en") for name, value in self.multipliers.items() + } + self.multipliers_ordinal = { + name + "ste": (value, "e") for name, value in self.multipliers.items() + } + self.multipliers_suffixed = { + **self.multipliers_plural, + **self.multipliers_ordinal, + } + self.decimals = {*self.ones, *self.tens, *self.zeros} + + self.preceding_prefixers = { + "min": "-", + "minus": "-", + "negatief": "-", + "plus": "+", + "positief": "+", + "nul": "0", + 
} + self.following_prefixers = { + "pond": "£", + "ponden": "£", + "euro": "€", + "euro's": "€", + "euros": "€", + "yen": "¥", + "yens": "¥", + "dollar": "$", + "dollars": "$", + "cent": "¢", + "centen": "¢", + "cents": "¢", + } + self.prefixes = set( + list(self.preceding_prefixers.values()) + + list(self.following_prefixers.values()) + ) + self.suffixers = { + "procent": "%", + } + self.specials = {"en", "dubbel", "drievoudig", "komma"} + + self._currency_trailing = currency_symbol_to_word or {} + + self.words = { + key + for mapping in [ + self.zeros, + self.ones, + self.ones_suffixed, + self.tens, + self.tens_suffixed, + self.multipliers, + self.multipliers_suffixed, + self.preceding_prefixers, + self.following_prefixers, + self.suffixers, + self.specials, + ] + for key in mapping + } + self.literal_words = {"een"} + + def process_words(self, words: list[str]) -> Iterator[str]: # noqa: C901 + prefix: str | None = None + value: str | int | None = None + pending_ones: int | None = None # Dutch: "een en twintig" -> 21 + skip = False + + def to_fraction(s: str | float): + try: + return Fraction(s) + except ValueError: + return None + + def output(result: str | int): + nonlocal prefix, value, pending_ones + result = str(result) + if prefix is not None: + trailing = self._currency_trailing.get(prefix) + if trailing is not None: + result = f"{result} {trailing}" + else: + result = prefix + result + value = None + pending_ones = None + prefix = None + return result + + if len(words) == 0: + return + + for i, current in enumerate(words): + prev = words[i - 1] if i != 0 else None + next_word = words[i + 1] if i != len(words) - 1 else None + if skip: + skip = False + continue + + current_lower = current.lower() + prev_lower = prev.lower() if prev is not None else None + next_lower = next_word.lower() if next_word is not None else None + + next_is_numeric = next_word is not None and re.match( + r"^\d+(\.\d+)?$", next_word + ) + + if re.match(r"^\d+$", current): + if value 
is not None: + yield output(value) + if pending_ones is not None: + yield output(pending_ones) + yield output(current) + continue + + has_prefix = current[0] in self.prefixes + current_without_prefix = current[1:] if has_prefix else current + if re.match(r"^\d+(\.\d+)?$", current_without_prefix): + f = to_fraction(current_without_prefix) + if f is None: + raise ValueError("Converting the fraction failed") + + if value is not None: + if isinstance(value, str) and value.endswith("."): + value = str(value) + str(current) + continue + else: + yield output(value) + if pending_ones is not None: + yield output(pending_ones) + + prefix = current[0] if has_prefix else prefix + if f.denominator == 1: + value = f.numerator + else: + value = current_without_prefix + elif current_lower not in self.words: + if value is not None: + yield output(value) + if pending_ones is not None: + yield output(pending_ones) + yield output(current) + elif current_lower in self.zeros: + value = str(value or "") + "0" + elif current_lower in self.ones: + ones = self.ones[current_lower] + + if ( + next_lower == "en" + and next_word is not None + and i + 2 < len(words) + and words[i + 2].lower() in self.tens + ): + pending_ones = ones + skip = True + elif value is None and pending_ones is None: + value = ones + elif isinstance(value, str) or prev_lower in self.ones: + if prev_lower in self.tens and ones < 10: + value = value[:-1] + str(ones) # type: ignore + else: + value = str(value) + str(ones) + elif ones < 10: + if value is not None and value % 10 == 0: + value += ones + else: + value = str(value or "") + str(ones) + else: + if value is not None and value % 100 == 0: + value += ones + else: + value = str(value or "") + str(ones) + elif current_lower in self.ones_suffixed: + ones, suffix = self.ones_suffixed[current_lower] + if value is None and pending_ones is None: + yield output(str(ones) + suffix) + elif isinstance(value, str) or prev_lower in self.ones: + if prev_lower in self.tens and ones 
< 10: + yield output(value[:-1] + str(ones) + suffix) # type: ignore + else: + yield output(str(value) + str(ones) + suffix) + elif ones < 10 and value is not None: + if value % 10 == 0: + yield output(str(value + ones) + suffix) + else: + yield output(str(value) + str(ones) + suffix) + else: + if value is not None and value % 100 == 0: + yield output(str(value + ones) + suffix) + else: + yield output(str(value or "") + str(ones) + suffix) + value = None + pending_ones = None + elif current_lower in self.tens: + tens = self.tens[current_lower] + if pending_ones is not None: + value = tens + pending_ones + pending_ones = None + elif value is None: + value = tens + elif isinstance(value, str): + value = str(value) + str(tens) + else: + if value % 100 == 0: + value += tens + else: + value = str(value) + str(tens) + elif current_lower in self.tens_suffixed: + tens, suffix = self.tens_suffixed[current_lower] + if pending_ones is not None: + yield output(str(tens + pending_ones) + suffix) + pending_ones = None + elif value is None: + yield output(str(tens) + suffix) + elif isinstance(value, str): + yield output(str(value) + str(tens) + suffix) + else: + if value % 100 == 0: + yield output(str(value + tens) + suffix) + else: + yield output(str(value) + str(tens) + suffix) + value = None + elif current_lower in self.multipliers: + multiplier = self.multipliers[current_lower] + if pending_ones is not None: + yield output(pending_ones) + pending_ones = None + if value is None: + value = multiplier + elif isinstance(value, str) or value == 0: + f = to_fraction(value) + p = f * multiplier if f is not None else None + if p is not None and p.denominator == 1: + value = p.numerator + else: + yield output(value) + value = multiplier + else: + before = value // 1000 * 1000 + residual = value % 1000 + value = before + residual * multiplier + elif current_lower in self.multipliers_suffixed: + multiplier, suffix = self.multipliers_suffixed[current_lower] + if pending_ones is not None: 
+ yield output(pending_ones) + pending_ones = None + if value is None: + yield output(str(multiplier) + suffix) + elif isinstance(value, str): + f = to_fraction(value) + p = f * multiplier if f is not None else None + if p is not None and p.denominator == 1: + yield output(str(p.numerator) + suffix) + else: + yield output(value) + yield output(str(multiplier) + suffix) + else: + before = value // 1000 * 1000 + residual = value % 1000 + value = before + residual * multiplier + yield output(str(value) + suffix) + value = None + elif current_lower in self.preceding_prefixers: + if value is not None: + yield output(value) + if pending_ones is not None: + yield output(pending_ones) + + if next_lower in self.words or next_is_numeric: + prefix = self.preceding_prefixers[current_lower] + else: + yield output(current) + elif current_lower in self.following_prefixers: + if value is not None: + prefix = self.following_prefixers[current_lower] + yield output(value) + elif pending_ones is not None: + yield output(pending_ones) + yield output(current) + else: + yield output(current) + elif current_lower in self.suffixers: + if value is not None: + suffix = self.suffixers[current_lower] + yield output(str(value) + suffix) + elif pending_ones is not None: + yield output(str(pending_ones) + self.suffixers[current_lower]) + else: + yield output(current) + value = None + pending_ones = None + elif current_lower in self.specials: + if next_lower not in self.words and not next_is_numeric: + if value is not None: + yield output(value) + if pending_ones is not None: + yield output(pending_ones) + yield output(current) + elif current_lower == "en": + if prev_lower not in self.multipliers: + if value is not None: + yield output(value) + if pending_ones is not None: + yield output(pending_ones) + yield output(current) + elif current_lower == "dubbel" or current_lower == "drievoudig": + if next_lower in self.ones or next_lower in self.zeros: + repeats = 2 if current_lower == "dubbel" else 3 
+ ones = self.ones.get(next_lower, 0) + value = str(value or "") + str(ones) * repeats + skip = True + else: + if value is not None: + yield output(value) + if pending_ones is not None: + yield output(pending_ones) + yield output(current) + elif current_lower == "komma": + if next_lower in self.decimals or next_is_numeric: + value = str(value or "") + "." + else: + raise ValueError(f"Unexpected token: {current}") + else: + raise ValueError(f"Unexpected token: {current}") + + if value is not None: + yield output(value) + if pending_ones is not None: + yield output(pending_ones) + + def preprocess(self, s: str) -> str: + s = re.sub(r"([a-z])([0-9])", r"\1 \2", s) + s = re.sub(r"([0-9])([a-z])", r"\1 \2", s) + s = re.sub(r"([0-9])\s+(e|en|ste)\b", r"\1\2", s) + return s + + def postprocess(self, s: str) -> str: + def combine_cents(m: Match): + try: + currency = m.group(1) + integer = m.group(2) + cents = int(m.group(3)) + return f"{currency}{integer}.{cents:02d}" + except ValueError: + return m.string + + def extract_cents(m: Match): + try: + return f"¢{int(m.group(1))}" + except ValueError: + return m.string + + s = re.sub(r"([€£$¥])([0-9]+) (?:en )?¢([0-9]{1,2})\b", combine_cents, s) + s = re.sub(r"[€£$¥]0.([0-9]{1,2})\b", extract_cents, s) + return s + + def __call__(self, s: str) -> str: + s = self.preprocess(s) + s = " ".join(word for word in self.process_words(s.split()) if word is not None) + s = self.postprocess(s) + return s diff --git a/normalization/languages/dutch/operators.py b/normalization/languages/dutch/operators.py new file mode 100644 index 0000000..1d3fe20 --- /dev/null +++ b/normalization/languages/dutch/operators.py @@ -0,0 +1,111 @@ +import re + +from normalization.languages.base import ( + LanguageConfig, + LanguageOperators, +) +from normalization.languages.dutch.number_normalizer import DutchNumberNormalizer +from normalization.languages.dutch.sentence_replacements import ( + DUTCH_SENTENCE_REPLACEMENTS, +) +from 
normalization.languages.registry import register_language + +# Flemish apostrophe clitics (straight or typographic apostrophe). (?": "groter dan", + "<": "kleiner dan", + "°": "graden", + "°C": "graden celsius", + "°F": "graden fahrenheit", + "%": "procent", + }, + currency_symbol_to_word={ + "€": "euros", + "$": "dollars", + "£": "ponden", + "¢": "cent", + "¥": "yens", + }, + filler_words=[ + "ah", + "allee", + "alee", + "eh", + "ehm", + "hé", + "hè", + "he", + "hm", + "hmm", + "mm", + "mmm", + "mhm", + "nou", + "o", + "oke", + "okee", + "oké", + "uh", + ], + sentence_replacements=DUTCH_SENTENCE_REPLACEMENTS, +) + + +@register_language +class DutchOperators(LanguageOperators): + def __init__(self): + super().__init__(DUTCH_CONFIG) + self._number_normalizer = DutchNumberNormalizer( + DUTCH_CONFIG.currency_symbol_to_word, + ) + + def expand_written_numbers(self, text: str) -> str: + return self._number_normalizer(text) + + def expand_contractions(self, text: str) -> str: + def _temporal_sub(m: re.Match[str]) -> str: + return f"des{m.group(1)}{m.group(2).lower()}" + + text = _RE_TEMPORAL_S.sub(_temporal_sub, text) + text = _RE_CLITIC_S.sub("is", text) + + def _clitic_sub(m: re.Match[str]) -> str: + return _CLITIC_LETTER_TO_WORD[m.group(1).lower()] + + text = _RE_CLITIC_TRNKM.sub(_clitic_sub, text) + return text + + def get_word_replacements(self) -> dict[str, str]: + from normalization.languages.dutch.replacements import DUTCH_REPLACEMENTS + + return DUTCH_REPLACEMENTS diff --git a/normalization/languages/dutch/replacements.py b/normalization/languages/dutch/replacements.py new file mode 100644 index 0000000..af45bed --- /dev/null +++ b/normalization/languages/dutch/replacements.py @@ -0,0 +1,28 @@ +"""Single-token Flemish / colloquial → standard Dutch (canonical for WER).""" + +DUTCH_REPLACEMENTS: dict[str, str] = { + # Flemish dialect → standard Dutch + "ge": "je", + "da": "dat", + "ne": "een", + "efkes": "even", + "effe": "even", + "awel": "wel", + "den": "de", + 
"mijne": "mijn", + "gij": "jij", + "zij": "ze", + "zijne": "zijn", + # Bare clitics (apostrophe dropped by ASR) + "t": "het", + "s": "is", + "r": "er", + "k": "ik", + # Formal / informal pronoun conflation (Flemish customer service) + # ref uses formal u/uw; models transcribe je — normalise to je + "u": "je", + "uw": "je", + # Spelling variants → canonical + "okee": "oke", # oke is already in filler_words; okee must map to it + "euro": "euros", # collapse singular/plural +} diff --git a/normalization/languages/dutch/sentence_replacements.py b/normalization/languages/dutch/sentence_replacements.py new file mode 100644 index 0000000..fd5cab9 --- /dev/null +++ b/normalization/languages/dutch/sentence_replacements.py @@ -0,0 +1,9 @@ +"""Multi-word and phrase-level normalization for Dutch (incl. Flemish variants).""" + +DUTCH_SENTENCE_REPLACEMENTS: dict[str, str] = { + "fifty fifty": "5050", + "fiftyfifty": "5050", + "checks": "cheques", + "goeiemiddag": "goedemiddag", + "kollega": "collega", +} From b4587497722c8b2cdc2fb50598929dedfe51406d Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Fri, 17 Apr 2026 16:16:37 +0200 Subject: [PATCH 2/4] test: add tests for dutch normalizer --- tests/e2e/files/gladia-3/nl.csv | 25 ++++++++ tests/unit/languages/dutch_operators_test.py | 62 ++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tests/e2e/files/gladia-3/nl.csv create mode 100644 tests/unit/languages/dutch_operators_test.py diff --git a/tests/e2e/files/gladia-3/nl.csv b/tests/e2e/files/gladia-3/nl.csv new file mode 100644 index 0000000..0f41e50 --- /dev/null +++ b/tests/e2e/files/gladia-3/nl.csv @@ -0,0 +1,25 @@ +input,expected +tien euro,10 euros +2 < 5,2 kleiner dan 5 +50°C,50 graden celsius +ca kost €50,ca kost 50 euros +"1.234,56",1234 komma 56 +dertien appels,13 appels +kollega zegt hallo,collega zegt hallo +ge weet da,je weet dat +ik zeg 't zo,ik zeg het zo +honderd euro,100 euros +vijf dollar,5 dollars +honderd euro's,100 euros +"3,14",3 komma 
14 +192.168.1.1,192 punt 168 punt 1 punt 1 +test@example.com,test apenstaartje example punt com +www.example.com,w w w punt example punt com +x = 5,x gelijk aan 5 +Het woord [inaudible] is hier,het woord inaudible is hier +hallo eh daar,hallo daar +mein naam is Bob,mein naam is bob +twee duizend,2000 +'s ochtends vroeg,des ochtends vroeg +ping pong,ping pong +vijf en twintig euro,25 euros diff --git a/tests/unit/languages/dutch_operators_test.py b/tests/unit/languages/dutch_operators_test.py new file mode 100644 index 0000000..280e1b1 --- /dev/null +++ b/tests/unit/languages/dutch_operators_test.py @@ -0,0 +1,62 @@ +import pytest + +from normalization.languages.dutch.operators import DutchOperators +from normalization.languages.registry import get_language_registry + + +@pytest.fixture +def operators(): + return DutchOperators() + + +def test_dutch_is_registered(): + assert "nl" in get_language_registry() + + +def test_dutch_registry_produces_dutch_operators(): + instance = get_language_registry()["nl"]() + assert isinstance(instance, DutchOperators) + + +def test_expand_flemish_clitics(operators): + assert operators.expand_contractions("ik zeg 't zo") == "ik zeg het zo" + assert operators.expand_contractions("zeg ’t zo") == "zeg het zo" + assert operators.expand_contractions("'k kom morgen") == "ik kom morgen" + assert operators.expand_contractions("is 'r nog") == "is er nog" + assert operators.expand_contractions("'n beetje") == "een beetje" + assert operators.expand_contractions("zie je 'm") == "zie je hem" + assert operators.expand_contractions("dat 's goed") == "dat is goed" + + +def test_expand_clitic_s_not_possessive_after_word(operators): + assert operators.expand_contractions("Jan's auto") == "Jan's auto" + + +def test_expand_temporal_s_to_des(operators): + assert operators.expand_contractions("'s ochtends vroeg") == "des ochtends vroeg" + assert operators.expand_contractions("'S Avonds laat") == "des avonds laat" + + +def 
test_config_sentence_replacements(operators): + assert operators.config.sentence_replacements is not None + assert operators.config.sentence_replacements["kollega"] == "collega" + + +def test_word_replacements(operators): + assert operators.get_word_replacements()["ge"] == "je" + assert operators.get_word_replacements()["da"] == "dat" + assert operators.get_word_replacements()["u"] == "je" + assert operators.get_word_replacements()["uw"] == "je" + assert operators.get_word_replacements()["okee"] == "oke" + assert operators.get_word_replacements()["euro"] == "euros" + + +def test_expand_written_numbers_euro_after_amount_dutch_order(operators): + assert operators.expand_written_numbers("tien euro") == "10 euros" + assert operators.expand_written_numbers("€10") == "10 euros" + assert operators.expand_written_numbers("honderd euro's") == "100 euros" + + +def test_expand_written_numbers_other_currency_trailing_words(operators): + assert operators.expand_written_numbers("vijf dollar") == "5 dollars" + assert operators.expand_written_numbers("$3.50") == "3.50 dollars" From f7a6b529f72b3e76b176b90d9e214032dc1734bd Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Tue, 21 Apr 2026 17:01:54 +0200 Subject: [PATCH 3/4] fix: fit alpha2digit for dutch normalizer --- .../languages/dutch/number_normalizer.py | 577 ++++-------------- normalization/languages/dutch/operators.py | 5 + 2 files changed, 138 insertions(+), 444 deletions(-) diff --git a/normalization/languages/dutch/number_normalizer.py b/normalization/languages/dutch/number_normalizer.py index 9df42a5..73653ee 100644 --- a/normalization/languages/dutch/number_normalizer.py +++ b/normalization/languages/dutch/number_normalizer.py @@ -1,455 +1,144 @@ -import re -from fractions import Fraction -from typing import Iterator, Match - -""" -Dutch number normalizer: spelled-out numbers to digits. +"""Dutch number normalizer using text2num's alpha2digit. -- Dutch compound order: ones + "en" + tens (e.g. "een en twintig" -> 21). 
-- Vocabulary: nul, een, twee, ..., tien, elf, twaalf, ..., twintig, dertig, ... -- Multipliers: honderd, duizend, miljoen, miljard, biljoen. -- Handles currency (euro, dollar, pond, cent), percent (procent), and decimal (komma). -- Currency output follows Dutch word order: amount then unit (e.g. €10 and "tien euro" -> "10 euros"). +Converts spelled-out numbers to digits (e.g. vijf en twintig → 25) and handles +mixed digit+word forms (e.g. 3 miljard → drie miljard) before conversion so +alpha2digit does not misinterpret them. Optionally rewrites currency symbols to +amount + spoken singular unit, then restores plural trailing words from config. """ +import re -class DutchNumberNormalizer: - def __init__(self, currency_symbol_to_word: dict[str, str] | None = None): - self.zeros = {"nul"} - self.ones: dict[str, int] = { - name: i - for i, name in enumerate( - [ - "een", - "twee", - "drie", - "vier", - "vijf", - "zes", - "zeven", - "acht", - "negen", - "tien", - "elf", - "twaalf", - "dertien", - "veertien", - "vijftien", - "zestien", - "zeventien", - "achttien", - "negentien", - ], - start=1, +from text_to_num import alpha2digit + +# Digit-to-Dutch-word mapping for normalizing "3 miljard" → "drie miljard". +_DIGIT_TO_DUTCH: dict[str, str] = { + "0": "nul", + "1": "een", + "2": "twee", + "3": "drie", + "4": "vier", + "5": "vijf", + "6": "zes", + "7": "zeven", + "8": "acht", + "9": "negen", +} + +# Pattern: digit(s) followed by Dutch large-number multipliers. +_RE_MIXED_NUMBER = re.compile( + r"\b(\d+)\s+(miljoen|miljoenen|miljard|miljarden|biljoen|biljoenen)\b", + re.IGNORECASE, +) + + +def _normalize_mixed_numbers(text: str) -> str: + """Convert '3 miljard' → 'drie miljard' so alpha2digit yields 3e9, not '3 1000000000'. + + alpha2digit may concatenate a lone digit with the following word; converting + the digit to a word avoids that (e.g. 'drie miljard' → 3000000000). 
+ """ + + def replace(match: re.Match) -> str: + number = match.group(1) + multiplier = match.group(2) + if len(number) == 1 and number in _DIGIT_TO_DUTCH: + return f"{_DIGIT_TO_DUTCH[number]} {multiplier}" + # Multi-digit: keep as-is; alpha2digit will handle or leave unchanged + return match.group(0) + + return _RE_MIXED_NUMBER.sub(replace, text) + + +def _singular_spoken_unit(trailing_word: str) -> str: + """Map ``currency_symbol_to_word`` value to a spoken singular alpha2digit accepts.""" + t = trailing_word.lower() + if t == "euros": + return "euro" + if t == "dollars": + return "dollar" + if t == "ponden": + return "pond" + if t == "yens": + return "yen" + return trailing_word + + +def _normalize_currency_symbols( + text: str, + currency_symbol_to_word: dict[str, str] | None, +) -> str: + if not currency_symbol_to_word: + return text + num = r"\d+(?:[.,]\d+)?" + for symbol, trailing in currency_symbol_to_word.items(): + singular = _singular_spoken_unit(trailing) + esc = re.escape(symbol) + text = re.sub(rf"{esc}\s*({num})", rf"\1 {singular}", text, flags=re.IGNORECASE) + text = re.sub(rf"({num})\s*{esc}", rf"\1 {singular}", text, flags=re.IGNORECASE) + return text + + +def _currency_plural_fix_patterns( + currency_symbol_to_word: dict[str, str] | None, +) -> tuple[tuple[re.Pattern[str], str], ...]: + """Build (pattern, replacement) pairs so digit + alpha2digit singular → config trailing word.""" + if not currency_symbol_to_word: + return () + amount = r"(\d+(?:[.,]\d+)?)" + seen: set[str] = set() + out: list[tuple[re.Pattern[str], str]] = [] + for _symbol, trailing in currency_symbol_to_word.items(): + tl = trailing.lower() + if tl in seen: + continue + seen.add(tl) + singular = _singular_spoken_unit(trailing) + if singular.lower() == tl: + continue + if tl == "euros": + pat = re.compile(rf"\b{amount}\s+euro(?:'s)?\b", re.IGNORECASE) + out.append((pat, rf"\1 {trailing}")) + else: + pat = re.compile( + rf"\b{amount}\s+{re.escape(singular)}\b", + re.IGNORECASE, 
) - } - self.ones_plural = {} - self.ones_ordinal = { - "eerste": (1, "e"), - "tweede": (2, "e"), - "derde": (3, "e"), - "vierde": (4, "e"), - "vijfde": (5, "e"), - "zesde": (6, "e"), - "zevende": (7, "e"), - "achtste": (8, "e"), - "negende": (9, "e"), - "tiende": (10, "e"), - **{ - name + "de": (value, "e") - for name, value in self.ones.items() - if value > 10 and value < 20 - }, - } - self.ones_suffixed: dict[str, tuple[int, str]] = { - **self.ones_plural, - **self.ones_ordinal, - } - - self.tens = { - "twintig": 20, - "dertig": 30, - "veertig": 40, - "vijftig": 50, - "zestig": 60, - "zeventig": 70, - "tachtig": 80, - "negentig": 90, - } - self.tens_plural = { - name + "en": (value, "en") for name, value in self.tens.items() - } - self.tens_ordinal = { - name + "ste": (value, "e") for name, value in self.tens.items() - } - self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal} - - self.multipliers = { - "honderd": 100, - "duizend": 1_000, - "miljoen": 1_000_000, - "miljard": 1_000_000_000, - "biljoen": 1_000_000_000_000, - } - self.multipliers_plural = { - name + "en": (value, "en") for name, value in self.multipliers.items() - } - self.multipliers_ordinal = { - name + "ste": (value, "e") for name, value in self.multipliers.items() - } - self.multipliers_suffixed = { - **self.multipliers_plural, - **self.multipliers_ordinal, - } - self.decimals = {*self.ones, *self.tens, *self.zeros} - - self.preceding_prefixers = { - "min": "-", - "minus": "-", - "negatief": "-", - "plus": "+", - "positief": "+", - "nul": "0", - } - self.following_prefixers = { - "pond": "£", - "ponden": "£", - "euro": "€", - "euro's": "€", - "euros": "€", - "yen": "¥", - "yens": "¥", - "dollar": "$", - "dollars": "$", - "cent": "¢", - "centen": "¢", - "cents": "¢", - } - self.prefixes = set( - list(self.preceding_prefixers.values()) - + list(self.following_prefixers.values()) - ) - self.suffixers = { - "procent": "%", - } - self.specials = {"en", "dubbel", "drievoudig", "komma"} - - 
self._currency_trailing = currency_symbol_to_word or {} - - self.words = { - key - for mapping in [ - self.zeros, - self.ones, - self.ones_suffixed, - self.tens, - self.tens_suffixed, - self.multipliers, - self.multipliers_suffixed, - self.preceding_prefixers, - self.following_prefixers, - self.suffixers, - self.specials, - ] - for key in mapping - } - self.literal_words = {"een"} - - def process_words(self, words: list[str]) -> Iterator[str]: # noqa: C901 - prefix: str | None = None - value: str | int | None = None - pending_ones: int | None = None # Dutch: "een en twintig" -> 21 - skip = False - - def to_fraction(s: str | float): - try: - return Fraction(s) - except ValueError: - return None - - def output(result: str | int): - nonlocal prefix, value, pending_ones - result = str(result) - if prefix is not None: - trailing = self._currency_trailing.get(prefix) - if trailing is not None: - result = f"{result} {trailing}" - else: - result = prefix + result - value = None - pending_ones = None - prefix = None - return result + out.append((pat, rf"\1 {trailing}")) + return tuple(out) - if len(words) == 0: - return - for i, current in enumerate(words): - prev = words[i - 1] if i != 0 else None - next_word = words[i + 1] if i != len(words) - 1 else None - if skip: - skip = False - continue +def _apply_currency_plural_fixes( + text: str, + fixers: tuple[tuple[re.Pattern[str], str], ...], +) -> str: + for pattern, repl in fixers: + text = pattern.sub(repl, text) + return text - current_lower = current.lower() - prev_lower = prev.lower() if prev is not None else None - next_lower = next_word.lower() if next_word is not None else None - next_is_numeric = next_word is not None and re.match( - r"^\d+(\.\d+)?$", next_word +class DutchNumberNormalizer: + """Convert Dutch spelled-out numbers to digits via text2num.alpha2digit. + + Applies pre-passes for currency symbols (when configured) and mixed digit+word + forms (e.g. 
3 miljard) before calling alpha2digit, then normalizes currency + words to the plural forms in ``currency_symbol_to_word``. + """ + + def __init__(self, currency_symbol_to_word: dict[str, str] | None = None) -> None: + if alpha2digit is None: + raise ImportError( + "Dutch number normalization requires the text2num package. " + "Install it with: uv add text2num" ) + self._alpha2digit = alpha2digit + self._currency_symbol_to_word = currency_symbol_to_word + self._currency_plural_fixes = _currency_plural_fix_patterns( + currency_symbol_to_word, + ) - if re.match(r"^\d+$", current): - if value is not None: - yield output(value) - if pending_ones is not None: - yield output(pending_ones) - yield output(current) - continue - - has_prefix = current[0] in self.prefixes - current_without_prefix = current[1:] if has_prefix else current - if re.match(r"^\d+(\.\d+)?$", current_without_prefix): - f = to_fraction(current_without_prefix) - if f is None: - raise ValueError("Converting the fraction failed") - - if value is not None: - if isinstance(value, str) and value.endswith("."): - value = str(value) + str(current) - continue - else: - yield output(value) - if pending_ones is not None: - yield output(pending_ones) - - prefix = current[0] if has_prefix else prefix - if f.denominator == 1: - value = f.numerator - else: - value = current_without_prefix - elif current_lower not in self.words: - if value is not None: - yield output(value) - if pending_ones is not None: - yield output(pending_ones) - yield output(current) - elif current_lower in self.zeros: - value = str(value or "") + "0" - elif current_lower in self.ones: - ones = self.ones[current_lower] - - if ( - next_lower == "en" - and next_word is not None - and i + 2 < len(words) - and words[i + 2].lower() in self.tens - ): - pending_ones = ones - skip = True - elif value is None and pending_ones is None: - value = ones - elif isinstance(value, str) or prev_lower in self.ones: - if prev_lower in self.tens and ones < 10: - 
value = value[:-1] + str(ones) # type: ignore - else: - value = str(value) + str(ones) - elif ones < 10: - if value is not None and value % 10 == 0: - value += ones - else: - value = str(value or "") + str(ones) - else: - if value is not None and value % 100 == 0: - value += ones - else: - value = str(value or "") + str(ones) - elif current_lower in self.ones_suffixed: - ones, suffix = self.ones_suffixed[current_lower] - if value is None and pending_ones is None: - yield output(str(ones) + suffix) - elif isinstance(value, str) or prev_lower in self.ones: - if prev_lower in self.tens and ones < 10: - yield output(value[:-1] + str(ones) + suffix) # type: ignore - else: - yield output(str(value) + str(ones) + suffix) - elif ones < 10 and value is not None: - if value % 10 == 0: - yield output(str(value + ones) + suffix) - else: - yield output(str(value) + str(ones) + suffix) - else: - if value is not None and value % 100 == 0: - yield output(str(value + ones) + suffix) - else: - yield output(str(value or "") + str(ones) + suffix) - value = None - pending_ones = None - elif current_lower in self.tens: - tens = self.tens[current_lower] - if pending_ones is not None: - value = tens + pending_ones - pending_ones = None - elif value is None: - value = tens - elif isinstance(value, str): - value = str(value) + str(tens) - else: - if value % 100 == 0: - value += tens - else: - value = str(value) + str(tens) - elif current_lower in self.tens_suffixed: - tens, suffix = self.tens_suffixed[current_lower] - if pending_ones is not None: - yield output(str(tens + pending_ones) + suffix) - pending_ones = None - elif value is None: - yield output(str(tens) + suffix) - elif isinstance(value, str): - yield output(str(value) + str(tens) + suffix) - else: - if value % 100 == 0: - yield output(str(value + tens) + suffix) - else: - yield output(str(value) + str(tens) + suffix) - value = None - elif current_lower in self.multipliers: - multiplier = self.multipliers[current_lower] - if 
pending_ones is not None: - yield output(pending_ones) - pending_ones = None - if value is None: - value = multiplier - elif isinstance(value, str) or value == 0: - f = to_fraction(value) - p = f * multiplier if f is not None else None - if p is not None and p.denominator == 1: - value = p.numerator - else: - yield output(value) - value = multiplier - else: - before = value // 1000 * 1000 - residual = value % 1000 - value = before + residual * multiplier - elif current_lower in self.multipliers_suffixed: - multiplier, suffix = self.multipliers_suffixed[current_lower] - if pending_ones is not None: - yield output(pending_ones) - pending_ones = None - if value is None: - yield output(str(multiplier) + suffix) - elif isinstance(value, str): - f = to_fraction(value) - p = f * multiplier if f is not None else None - if p is not None and p.denominator == 1: - yield output(str(p.numerator) + suffix) - else: - yield output(value) - yield output(str(multiplier) + suffix) - else: - before = value // 1000 * 1000 - residual = value % 1000 - value = before + residual * multiplier - yield output(str(value) + suffix) - value = None - elif current_lower in self.preceding_prefixers: - if value is not None: - yield output(value) - if pending_ones is not None: - yield output(pending_ones) - - if next_lower in self.words or next_is_numeric: - prefix = self.preceding_prefixers[current_lower] - else: - yield output(current) - elif current_lower in self.following_prefixers: - if value is not None: - prefix = self.following_prefixers[current_lower] - yield output(value) - elif pending_ones is not None: - yield output(pending_ones) - yield output(current) - else: - yield output(current) - elif current_lower in self.suffixers: - if value is not None: - suffix = self.suffixers[current_lower] - yield output(str(value) + suffix) - elif pending_ones is not None: - yield output(str(pending_ones) + self.suffixers[current_lower]) - else: - yield output(current) - value = None - pending_ones = None 
- elif current_lower in self.specials: - if next_lower not in self.words and not next_is_numeric: - if value is not None: - yield output(value) - if pending_ones is not None: - yield output(pending_ones) - yield output(current) - elif current_lower == "en": - if prev_lower not in self.multipliers: - if value is not None: - yield output(value) - if pending_ones is not None: - yield output(pending_ones) - yield output(current) - elif current_lower == "dubbel" or current_lower == "drievoudig": - if next_lower in self.ones or next_lower in self.zeros: - repeats = 2 if current_lower == "dubbel" else 3 - ones = self.ones.get(next_lower, 0) - value = str(value or "") + str(ones) * repeats - skip = True - else: - if value is not None: - yield output(value) - if pending_ones is not None: - yield output(pending_ones) - yield output(current) - elif current_lower == "komma": - if next_lower in self.decimals or next_is_numeric: - value = str(value or "") + "." - else: - raise ValueError(f"Unexpected token: {current}") - else: - raise ValueError(f"Unexpected token: {current}") - - if value is not None: - yield output(value) - if pending_ones is not None: - yield output(pending_ones) - - def preprocess(self, s: str) -> str: - s = re.sub(r"([a-z])([0-9])", r"\1 \2", s) - s = re.sub(r"([0-9])([a-z])", r"\1 \2", s) - s = re.sub(r"([0-9])\s+(e|en|ste)\b", r"\1\2", s) - return s - - def postprocess(self, s: str) -> str: - def combine_cents(m: Match): - try: - currency = m.group(1) - integer = m.group(2) - cents = int(m.group(3)) - return f"{currency}{integer}.{cents:02d}" - except ValueError: - return m.string - - def extract_cents(m: Match): - try: - return f"¢{int(m.group(1))}" - except ValueError: - return m.string - - s = re.sub(r"([€£$¥])([0-9]+) (?:en )?¢([0-9]{1,2})\b", combine_cents, s) - s = re.sub(r"[€£$¥]0.([0-9]{1,2})\b", extract_cents, s) - return s - - def __call__(self, s: str) -> str: - s = self.preprocess(s) - s = " ".join(word for word in 
self.process_words(s.split()) if word is not None) - s = self.postprocess(s) - return s + def __call__(self, text: str) -> str: + text = _normalize_currency_symbols(text, self._currency_symbol_to_word) + text = _normalize_mixed_numbers(text) + text = self._alpha2digit(text, "nl") + text = _apply_currency_plural_fixes(text, self._currency_plural_fixes) + return text diff --git a/normalization/languages/dutch/operators.py b/normalization/languages/dutch/operators.py index 1d3fe20..2f7b556 100644 --- a/normalization/languages/dutch/operators.py +++ b/normalization/languages/dutch/operators.py @@ -90,6 +90,11 @@ def __init__(self): ) def expand_written_numbers(self, text: str) -> str: + """Convert Dutch spelled-out numbers to digits (vijf en twintig → 25). + + Uses DutchNumberNormalizer, which normalizes currency symbols and mixed forms + (3 miljard → drie miljard), then text2num.alpha2digit. + """ return self._number_normalizer(text) def expand_contractions(self, text: str) -> str: From eb6f0a4f0ade2ec7372ab4c1ea6be45599a73d86 Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Tue, 21 Apr 2026 17:05:09 +0200 Subject: [PATCH 4/4] test: dutch operatos and number normalizer --- .../languages/dutch_number_normalizer_test.py | 68 +++++++++++++++++++ tests/unit/languages/dutch_operators_test.py | 11 --- tests/unit/steps/text/conftest.py | 6 ++ 3 files changed, 74 insertions(+), 11 deletions(-) create mode 100644 tests/unit/languages/dutch_number_normalizer_test.py diff --git a/tests/unit/languages/dutch_number_normalizer_test.py b/tests/unit/languages/dutch_number_normalizer_test.py new file mode 100644 index 0000000..d039ed0 --- /dev/null +++ b/tests/unit/languages/dutch_number_normalizer_test.py @@ -0,0 +1,68 @@ +import pytest + +from normalization.languages.dutch.number_normalizer import DutchNumberNormalizer +from normalization.languages.dutch.operators import DUTCH_CONFIG + + +@pytest.fixture +def normalizer() -> DutchNumberNormalizer: + return 
DutchNumberNormalizer(DUTCH_CONFIG.currency_symbol_to_word) + + +@pytest.fixture +def normalizer_no_currency() -> DutchNumberNormalizer: + return DutchNumberNormalizer(None) + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("vijf en twintig", "25"), + ("tweehonderd eenendertig", "231"), + ("drie miljard", "3000000000"), + ("3 miljard", "3000000000"), + ("2 miljoen", "2000000"), + ("2 MILJOEN", "2000000"), + ], +) +def test_alpha2digit_dutch_spelling_and_large_numbers( + normalizer: DutchNumberNormalizer, text: str, expected: str +): + assert normalizer(text) == expected + + +def test_multi_digit_then_miljoen_not_fully_merged(normalizer: DutchNumberNormalizer): + """Multi-digit + multiplier is left for alpha2digit; digit is not rewritten to a word.""" + assert normalizer("12 miljoen") == "12 1000000" + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("tien euro", "10 euros"), + ("honderd euro's", "100 euros"), + ("€10", "10 euros"), + ("10 €", "10 euros"), + ("vijf dollar", "5 dollars"), + ("$3.50", "3.50 dollars"), + ("£5", "5 ponden"), + ("¥200", "200 yens"), + ], +) +def test_currency_symbols_and_plural_trailing_words( + normalizer: DutchNumberNormalizer, text: str, expected: str +): + assert normalizer(text) == expected + + +def test_without_currency_config_leaves_currency_symbol( + normalizer_no_currency: DutchNumberNormalizer, +): + assert normalizer_no_currency("vijf en twintig") == "25" + assert normalizer_no_currency("€10") == "€10" + assert normalizer_no_currency("3 miljard") == "3000000000" + + +def test_non_numeric_text_unchanged(normalizer: DutchNumberNormalizer): + text = "dit is gewone tekst" + assert normalizer(text) == text diff --git a/tests/unit/languages/dutch_operators_test.py b/tests/unit/languages/dutch_operators_test.py index 280e1b1..d639092 100644 --- a/tests/unit/languages/dutch_operators_test.py +++ b/tests/unit/languages/dutch_operators_test.py @@ -49,14 +49,3 @@ def test_word_replacements(operators): assert 
operators.get_word_replacements()["uw"] == "je" assert operators.get_word_replacements()["okee"] == "oke" assert operators.get_word_replacements()["euro"] == "euros" - - -def test_expand_written_numbers_euro_after_amount_dutch_order(operators): - assert operators.expand_written_numbers("tien euro") == "10 euros" - assert operators.expand_written_numbers("€10") == "10 euros" - assert operators.expand_written_numbers("honderd euro's") == "100 euros" - - -def test_expand_written_numbers_other_currency_trailing_words(operators): - assert operators.expand_written_numbers("vijf dollar") == "5 dollars" - assert operators.expand_written_numbers("$3.50") == "3.50 dollars" diff --git a/tests/unit/steps/text/conftest.py b/tests/unit/steps/text/conftest.py index 81c4f15..e515c55 100644 --- a/tests/unit/steps/text/conftest.py +++ b/tests/unit/steps/text/conftest.py @@ -1,6 +1,7 @@ import pytest from normalization.languages.base import LanguageOperators +from normalization.languages.dutch import DutchOperators from normalization.languages.english import EnglishOperators from normalization.languages.french import FrenchOperators from normalization.steps import get_step_registry @@ -21,6 +22,11 @@ def french_operators(): return FrenchOperators() +@pytest.fixture +def dutch_operators(): + return DutchOperators() + + def assert_text_step_registered(step_cls): """Verify a text step is properly registered under its name.""" registry = get_step_registry()