gladiaio · Karamouche · Apr 21, 2026 · Apr 21, 2026 · coderabbitai · Apr 21, 2026
diff --git a/normalization/languages/dutch/operators.py b/normalization/languages/dutch/operators.py
@@ -32,6 +32,19 @@
     "m": "hem",
 }
 
+# Dutch words for the digits 0-9, keyed by word. Each word's position in the
+# tuple below is the digit it stands for, so the dict runs "nul" .. "negen".
+_DUTCH_DIGIT_WORDS: dict[str, str] = {
+    word: str(value)
+    for value, word in enumerate(
+        (
+            "nul", "een", "twee", "drie", "vier",
+            "vijf", "zes", "zeven", "acht", "negen",
+        )
+    )
+}
+
 DUTCH_CONFIG = LanguageConfig(
     code="nl",
     decimal_separator=",",
@@ -78,6 +91,37 @@
         "uh",
     ],
     sentence_replacements=DUTCH_SENTENCE_REPLACEMENTS,
+    digit_words=_DUTCH_DIGIT_WORDS,
+    number_words=[
+        *_DUTCH_DIGIT_WORDS,
+        "tien",
+        "elf",
+        "twaalf",
+        "dertien",
+        "veertien",
+        "vijftien",
+        "zestien",
+        "zeventien",
+        "achttien",
+        "negentien",
+        "twintig",
+        "dertig",
+        "veertig",
+        "vijftig",
+        "zestig",
+        "zeventig",
+        "tachtig",
+        "negentig",
+        "honderd",
+        "duizend",
+        "miljoen",
+        "miljoenen",
+        "miljard",
+        "miljarden",
+        "biljoen",
+        "biljoenen",
+    ],
+    plus_word="plus",
 )
 
 

diff --git a/normalization/languages/german/number_normalizer.py b/normalization/languages/german/number_normalizer.py
@@ -0,0 +1,67 @@
+"""German number normalizer using text2num's alpha2digit.
+
+Converts spelled-out numbers to digits (e.g. zwanzig → 20) and handles
+mixed digit+word forms (e.g. 2 hundert → zwei hundert) before conversion
+so alpha2digit does not misinterpret them.
+
+A post-pass replaces words alpha2digit leaves unconverted in isolation:
+- 'null' → '0' (alpha2digit skips it standalone)
+- 'zwei' → '2' (alpha2digit skips it standalone and in plain noun phrases)
+'ein'/'eins' are intentionally excluded — 'ein' is the German indefinite
+article and cannot be safely replaced without context.
+"""
+
+import re
+
+from text_to_num import alpha2digit
+
+# Single digits spelled out in German, used to rewrite "<digit> <multiplier>"
+# before alpha2digit runs. "1" maps to "ein" (not "eins"), which yields
+# e.g. "ein hundert".
+_DIGIT_TO_GERMAN: dict[str, str] = {
+    "0": "null",
+    "1": "ein",
+    "2": "zwei",
+    "3": "drei",
+    "4": "vier",
+    "5": "fünf",
+    "6": "sechs",
+    "7": "sieben",
+    "8": "acht",
+    "9": "neun",
+}
+
+# A run of digits followed by a multiplier word. Group 1 is the digits, group 2
+# the multiplier. Singular and plural are both accepted: million/millionen,
+# milliarde/milliarden, billion/billionen. The "en" suffix has to be optional
+# as a unit -- "millionen?" only makes the trailing "n" optional and therefore
+# never matches the singular "million".
+_RE_MIXED_NUMBER = re.compile(
+    r"\b(\d+)\s+(hundert|tausend|million(?:en)?|milliarden?|billion(?:en)?)\b",
+    re.IGNORECASE,
+)
+
+# Whole-word matchers for the post-pass that runs after alpha2digit.
+_RE_ZWEI = re.compile(r"\bzwei\b", re.IGNORECASE)
+_RE_NULL = re.compile(r"\bnull\b", re.IGNORECASE)
+
+
+def _normalize_mixed_numbers(text: str) -> str:
+    """Spell out a lone digit that precedes a multiplier word.
+
+    '2 hundert' becomes 'zwei hundert', which lets alpha2digit produce 200
+    rather than '2 100'. Matches whose numeric part is longer than one
+    character, or is not a key of _DIGIT_TO_GERMAN, are returned unchanged.
+    """
+
+    def _spell_out(hit: re.Match) -> str:
+        digits, unit = hit.group(1, 2)
+        # Guard first: only a single mapped digit is rewritten.
+        if len(digits) != 1 or digits not in _DIGIT_TO_GERMAN:
+            return hit.group(0)
+        return f"{_DIGIT_TO_GERMAN[digits]} {unit}"
+
+    return _RE_MIXED_NUMBER.sub(_spell_out, text)
+
+
+def _fix_remaining_words(text: str) -> str:
+    """Post-pass: turn whole-word 'zwei' into '2', then 'null' into '0'."""
+    return _RE_NULL.sub("0", _RE_ZWEI.sub("2", text))
+
+
+class GermanNumberNormalizer:
+    """Callable that rewrites German number words as digits via text2num."""
+
+    def __call__(self, text: str) -> str:
+        """Run the pre-pass, alpha2digit("de"), then the post-pass on text."""
+        prepared = _normalize_mixed_numbers(text)
+        converted = alpha2digit(prepared, "de")
+        return _fix_remaining_words(converted)
diff --git a/normalization/languages/german/operators.py b/normalization/languages/german/operators.py
@@ -1,10 +1,25 @@
 from normalization.languages.base import LanguageConfig, LanguageOperators
+from normalization.languages.german.number_normalizer import GermanNumberNormalizer
 from normalization.languages.german.replacements import GERMAN_REPLACEMENTS
 from normalization.languages.german.sentence_replacements import (
     GERMAN_SENTENCE_REPLACEMENTS,
 )
 from normalization.languages.registry import register_language
 
+# German words for the digits 0-9, keyed by word. "ein" is listed explicitly
+# as a second spelling of 1; the tuple then supplies "eins" .. "neun" in
+# order, numbered from 1.
+_GERMAN_DIGIT_WORDS: dict[str, str] = {
+    "null": "0",
+    "ein": "1",
+    **{
+        word: str(value)
+        for value, word in enumerate(
+            (
+                "eins", "zwei", "drei", "vier", "fünf",
+                "sechs", "sieben", "acht", "neun",
+            ),
+            start=1,
+        )
+    },
+}
+
 GERMAN_CONFIG = LanguageConfig(
     code="de",
     decimal_separator=",",
@@ -31,13 +46,48 @@
     },
     filler_words=["äh", "ähm", "hm", "also", "naja", "halt"],
     sentence_replacements=GERMAN_SENTENCE_REPLACEMENTS,
+    digit_words=_GERMAN_DIGIT_WORDS,
+    number_words=[
+        *_GERMAN_DIGIT_WORDS,
+        "zehn",
+        "elf",
+        "zwölf",
+        "dreizehn",
+        "vierzehn",
+        "fünfzehn",
+        "sechzehn",
+        "siebzehn",
+        "achtzehn",
+        "neunzehn",
+        "zwanzig",
+        "dreißig",
+        "vierzig",
+        "fünfzig",
+        "sechzig",
+        "siebzig",
+        "achtzig",
+        "neunzig",
+        "hundert",
+        "tausend",
+        "million",
+        "millionen",
+        "milliarde",
+        "milliarden",
+        "billion",
+        "billionen",
+    ],
+    plus_word="plus",
 )
 
 
 @register_language
 class GermanOperators(LanguageOperators):
+    """German language operators, initialised from GERMAN_CONFIG.
+
+    Registered through the register_language decorator. Written-number
+    expansion is delegated to a GermanNumberNormalizer instance.
+    """
+
     def __init__(self):
         super().__init__(GERMAN_CONFIG)
+        # Built once here and reused by every expand_written_numbers call.
+        self._number_normalizer = GermanNumberNormalizer()
 
     def get_word_replacements(self) -> dict[str, str]:
+        """Return the GERMAN_REPLACEMENTS mapping."""
         return GERMAN_REPLACEMENTS
+
+    def expand_written_numbers(self, text: str) -> str:
+        """Return text after running it through the German number normalizer."""
+        return self._number_normalizer(text)
diff --git a/normalization/languages/italian/number_normalizer.py b/normalization/languages/italian/number_normalizer.py
@@ -0,0 +1,58 @@
+"""Italian number normalizer using text2num's alpha2digit.
+
+Converts spelled-out numbers to digits (e.g. venti → 20) and handles
+mixed digit+word forms (e.g. 2 cento → due cento) before conversion
+so alpha2digit does not misinterpret them.
+
+A post-pass replaces words alpha2digit leaves unconverted in isolation:
+- 'uno' → '1'
+- 'due' → '2'
+"""
+
+import re
+
+from text_to_num import alpha2digit
+
+_RE_MIXED_NUMBER = re.compile(
+    r"\b(\d+)\s+(cento|mila?|milioni?|miliardi?)\b",
+    re.IGNORECASE,
+)
-_RE_MIXED_NUMBER = re.compile(
-    r"\b(\d+)\s+(cento|mila?|milioni?|miliardi?)\b",
-    re.IGNORECASE,
-)
+_RE_MIXED_NUMBER = re.compile(
+    r"\b(\d+)\s+(cento|mille|mila|milioni?|miliardi?)\b",
+    re.IGNORECASE,
+)
-_RE_MIXED_NUMBER = re.compile(
-    r"\b(\d+)\s+(cento|mila?|milioni?|miliardi?)\b",
-    re.IGNORECASE,
-)
+# A run of digits followed by an Italian multiplier word. Group 1 is the
+# digits, group 2 the multiplier. Singular and plural are both accepted:
+# mille/mila, milione/milioni, miliardo/miliardi. The final vowel is a
+# character class because "milioni?" only makes the "i" optional and so never
+# matches the singular "milione" (likewise "miliardi?" and "miliardo").
+_RE_MIXED_NUMBER = re.compile(
+    r"\b(\d+)\s+(cento|mille|mila|milion[ei]|miliard[oi])\b",
+    re.IGNORECASE,
+)
+
+# Whole-word matchers for the post-pass that runs after alpha2digit.
+_RE_UNO = re.compile(r"\buno\b", re.IGNORECASE)
+_RE_DUE = re.compile(r"\bdue\b", re.IGNORECASE)
+
+
+def _fix_remaining_words(text: str) -> str:
+    """Post-pass: turn whole-word 'uno' into '1', then 'due' into '2'."""
+    return _RE_DUE.sub("2", _RE_UNO.sub("1", text))
+
+
+class ItalianNumberNormalizer:
+    """Callable that rewrites Italian number words as digits via text2num.
+
+    The constructor receives a word -> digit mapping (digit_words from
+    LanguageConfig) and inverts it; the inverted digit -> word table drives
+    the mixed-form pre-pass (e.g. '2 cento' -> 'due cento').
+    """
+
+    def __init__(self, digit_words: dict[str, str]) -> None:
+        # Inversion: if several words share a digit, the last one listed wins.
+        self._digit_to_word = {digit: word for word, digit in digit_words.items()}
+
+    def _normalize_mixed_numbers(self, text: str) -> str:
+        """Spell out a lone digit that precedes a multiplier word.
+
+        '2 cento' becomes 'due cento', which lets alpha2digit produce 200
+        rather than '2 100'. Any other match is returned unchanged.
+        """
+
+        def _spell_out(hit: re.Match) -> str:
+            digits, unit = hit.group(1, 2)
+            # Guard first: only a single mapped digit is rewritten.
+            if len(digits) != 1 or digits not in self._digit_to_word:
+                return hit.group(0)
+            return f"{self._digit_to_word[digits]} {unit}"
+
+        return _RE_MIXED_NUMBER.sub(_spell_out, text)
+
+    def __call__(self, text: str) -> str:
+        """Run the pre-pass, alpha2digit("it"), then the post-pass on text."""
+        prepared = self._normalize_mixed_numbers(text)
+        converted = alpha2digit(prepared, "it")
+        return _fix_remaining_words(converted)
diff --git a/normalization/languages/italian/operators.py b/normalization/languages/italian/operators.py
@@ -1,10 +1,12 @@
-import re
-
 from normalization.languages.base import LanguageConfig, LanguageOperators
+from normalization.languages.italian.number_normalizer import ItalianNumberNormalizer
 from normalization.languages.italian.replacements import ITALIAN_REPLACEMENTS
+from normalization.languages.italian.sentence_replacements import (
+    ITALIAN_SENTENCE_REPLACEMENTS,
+)
 from normalization.languages.registry import register_language
 
-# Single digits 1–9: shared by digit_words and any future time/compound helpers.
+# Single digits 1-9: shared by digit_words and any future time/compound helpers.
 _ONE_TO_NINE: dict[str, str] = {
     "uno": "1",
     "due": "2",
@@ -17,11 +19,6 @@
     "nove": "9",
 }
 
-ITALIAN_SENTENCE_REPLACEMENTS: dict[str, str] = {
-    # Spoken percentages (“dieci per cento”) → one canonical form aligned with “%” → percento
-    "per cento": "percento",
-}
-
 ITALIAN_CONFIG = LanguageConfig(
     code="it",
     decimal_separator=",",
@@ -101,16 +98,12 @@
 class ItalianOperators(LanguageOperators):
     def __init__(self):
         super().__init__(ITALIAN_CONFIG)
-
-    def fix_one_word_in_numeric_contexts(self, text: str) -> str:
-        text = re.sub(r"(\d+)\s+uno\s+uno\b", r"\1 1 1", text)
-        text = re.sub(r"\buno\s+uno\s+(\d)", r"1 1 \1", text)
-        text = re.sub(r"(\d+)\s+uno\s+(\d)", r"\1 1 \2", text)
-        text = re.sub(r"(\d+)\s+uno\b", r"\1 1", text)
-        text = re.sub(r"\b(\d+)uno\b", r"\1 1", text)
-        text = re.sub(r"\buno\s+(\d)", r"1 \1", text)
-        text = re.sub(r"^uno\s+(?=[a-z])", "1 ", text)
-        return text
+        self._number_normalizer = ItalianNumberNormalizer(
+            ITALIAN_CONFIG.digit_words or {}
+        )
 
     def get_word_replacements(self) -> dict[str, str]:
         return ITALIAN_REPLACEMENTS
+
+    def expand_written_numbers(self, text: str) -> str:
+        return self._number_normalizer(text)
diff --git a/normalization/languages/italian/sentence_replacements.py b/normalization/languages/italian/sentence_replacements.py
@@ -0,0 +1,3 @@
+# Phrase-level replacements for Italian.
+ITALIAN_SENTENCE_REPLACEMENTS: dict[str, str] = {
+    # Spoken percentages ("dieci per cento") -> one canonical form, "percento".
+    # NOTE(review): presumably this matches how "%" is spelled out elsewhere
+    # in the Italian config -- confirm.
+    "per cento": "percento",
+}
diff --git a/tests/e2e/files/gladia-3/de.csv b/tests/e2e/files/gladia-3/de.csv
@@ -30,3 +30,16 @@ halt mal so,mal so
 st. petersburg,st petersburg
 6 tage krieg,sechstagekrieg
 kreuzungs punkt,kreuzungspunkt
+£100,100 pounds
+¥500,500 yens
+$20 und $30,20 dollars und 30 dollars
+zwei,2
+drei,3
+zehn,10
+zwanzig,20
+dreizehn,13
+hundert,100
+tausend,1000
+drei euro,3 euro
+hundert euro,100 euro
+zwanzig apfel,20 apfel
diff --git a/tests/e2e/files/gladia-3/en.csv b/tests/e2e/files/gladia-3/en.csv
@@ -120,3 +120,13 @@ x = 5,x equals 5
 ø in Danish,o in danish
 €20 or €30,20 euros or 30 euros
 my  name is bob,my name is bob
+thirteen dogs,13 dogs
+fifteen items,15 items
+forty people,40 people
+sixty items,60 items
+seventy two,72
+eighty nine,89
+four hundred,400
+five thousand dollars,5000 dollars
+three thousand five hundred,3500
+two billion people,2000000000 people
diff --git a/tests/e2e/files/gladia-3/es.csv b/tests/e2e/files/gladia-3/es.csv
@@ -28,3 +28,11 @@ www.gladia.io,w w w punto gladia punto io
 ¢25,25 céntimos
 £50,50 libras
 ¥1000,1000 yenes
+cinco manzanas,5 manzanas
+cero errores,0 errores
+quince personas,15 personas
+treinta,30
+cuarenta y cinco,45
+setenta y ocho,78
+quinientos,500
+quince mil,15000
diff --git a/tests/e2e/files/gladia-3/fr.csv b/tests/e2e/files/gladia-3/fr.csv
@@ -44,3 +44,11 @@ x = 5,x egal a 5
 test@example.com,test arobase example point com
 bonjour (euh) ami,bonjour ami
 ça date d'hier,ca date d hier
+seize,16
+douze pommes,12 pommes
+quarante,40
+deux cents,200
+trois mille,3000
+dix-neuf,19
+quatre-vingt-dix,90
+soixante quinze,75