diff --git a/normalization/languages/dutch/operators.py b/normalization/languages/dutch/operators.py index 2f7b556..b2d470b 100644 --- a/normalization/languages/dutch/operators.py +++ b/normalization/languages/dutch/operators.py @@ -32,6 +32,19 @@ "m": "hem", } +_DUTCH_DIGIT_WORDS: dict[str, str] = { + "nul": "0", + "een": "1", + "twee": "2", + "drie": "3", + "vier": "4", + "vijf": "5", + "zes": "6", + "zeven": "7", + "acht": "8", + "negen": "9", +} + DUTCH_CONFIG = LanguageConfig( code="nl", decimal_separator=",", @@ -78,6 +91,37 @@ "uh", ], sentence_replacements=DUTCH_SENTENCE_REPLACEMENTS, + digit_words=_DUTCH_DIGIT_WORDS, + number_words=[ + *_DUTCH_DIGIT_WORDS, + "tien", + "elf", + "twaalf", + "dertien", + "veertien", + "vijftien", + "zestien", + "zeventien", + "achttien", + "negentien", + "twintig", + "dertig", + "veertig", + "vijftig", + "zestig", + "zeventig", + "tachtig", + "negentig", + "honderd", + "duizend", + "miljoen", + "miljoenen", + "miljard", + "miljarden", + "biljoen", + "biljoenen", + ], + plus_word="plus", ) diff --git a/normalization/languages/german/number_normalizer.py b/normalization/languages/german/number_normalizer.py new file mode 100644 index 0000000..badc27e --- /dev/null +++ b/normalization/languages/german/number_normalizer.py @@ -0,0 +1,67 @@ +"""German number normalizer using text2num's alpha2digit. + +Converts spelled-out numbers to digits (e.g. zwanzig → 20) and handles +mixed digit+word forms (e.g. 2 hundert → zwei hundert) before conversion +so alpha2digit does not misinterpret them. + +A post-pass replaces words alpha2digit leaves unconverted in isolation: +- 'null' → '0' (alpha2digit skips it standalone) +- 'zwei' → '2' (alpha2digit skips it standalone and in plain noun phrases) +'ein'/'eins' are intentionally excluded — 'ein' is the German indefinite +article and cannot be safely replaced without context. 
# Digit -> German number word, used to spell out the digit in
# "<digit> <multiplier>" forms before the text is handed to alpha2digit.
_DIGIT_TO_GERMAN: dict[str, str] = {
    "0": "null",
    "1": "ein",
    "2": "zwei",
    "3": "drei",
    "4": "vier",
    "5": "fünf",
    "6": "sechs",
    "7": "sieben",
    "8": "acht",
    "9": "neun",
}

# One digit group followed by a multiplier word.
# - The lookbehind rejects digits glued to a preceding digit, '.' or ',' so the
#   fractional part of a decimal ("2,5 millionen") is never rewritten.
# - Singular and plural multipliers are both accepted: million/millionen,
#   milliarde/milliarden, billion/billionen.
_RE_MIXED_NUMBER = re.compile(
    r"(?<![\d.,])\b(\d+)\s+"
    r"(hundert|tausend|million(?:en)?|milliarden?|billion(?:en)?)\b",
    re.IGNORECASE,
)

# Number words replaced by the post-pass in _fix_remaining_words.
_RE_ZWEI = re.compile(r"\bzwei\b", re.IGNORECASE)
_RE_NULL = re.compile(r"\bnull\b", re.IGNORECASE)


def _normalize_mixed_numbers(text: str) -> str:
    """Spell out a single digit that directly precedes a multiplier word.

    Converts '2 hundert' → 'zwei hundert' so alpha2digit yields 200, not
    '2 100'. Multi-digit numbers ('12 tausend') and digits that belong to a
    decimal or grouped number ('2,5 millionen') are returned unchanged.

    Args:
        text: Input text, possibly containing '<digit> <multiplier>' forms.

    Returns:
        The text with every matching single digit replaced by its German word;
        the multiplier keeps its original casing.
    """

    def replace(match: re.Match) -> str:
        number, multiplier = match.groups()
        # Only single digits are keys of the table, so longer numbers fall
        # through and the match is left exactly as it was.
        word = _DIGIT_TO_GERMAN.get(number)
        if word is None:
            return match.group(0)
        return f"{word} {multiplier}"

    return _RE_MIXED_NUMBER.sub(replace, text)


def _fix_remaining_words(text: str) -> str:
    """Replace number words alpha2digit did not convert.

    Only 'zwei' and 'null' are handled, matched as whole words regardless of
    case. 'ein'/'eins' are deliberately not touched here.
    """
    text = _RE_ZWEI.sub("2", text)
    text = _RE_NULL.sub("0", text)
    return text


class GermanNumberNormalizer:
    """Convert German spelled-out numbers to digits via text2num.alpha2digit."""

    def __call__(self, text: str) -> str:
        """Return *text* with German number words rewritten as digits.

        Runs three passes in order: the mixed-form pre-pass, alpha2digit with
        language code "de", then the post-pass for leftover words.
        """
        text = _normalize_mixed_numbers(text)
        text = alpha2digit(text, "de")
        text = _fix_remaining_words(text)
        return text
register_language +_GERMAN_DIGIT_WORDS: dict[str, str] = { + "null": "0", + "ein": "1", + "eins": "1", + "zwei": "2", + "drei": "3", + "vier": "4", + "fünf": "5", + "sechs": "6", + "sieben": "7", + "acht": "8", + "neun": "9", +} + GERMAN_CONFIG = LanguageConfig( code="de", decimal_separator=",", @@ -31,6 +46,37 @@ }, filler_words=["äh", "ähm", "hm", "also", "naja", "halt"], sentence_replacements=GERMAN_SENTENCE_REPLACEMENTS, + digit_words=_GERMAN_DIGIT_WORDS, + number_words=[ + *_GERMAN_DIGIT_WORDS, + "zehn", + "elf", + "zwölf", + "dreizehn", + "vierzehn", + "fünfzehn", + "sechzehn", + "siebzehn", + "achtzehn", + "neunzehn", + "zwanzig", + "dreißig", + "vierzig", + "fünfzig", + "sechzig", + "siebzig", + "achtzig", + "neunzig", + "hundert", + "tausend", + "million", + "millionen", + "milliarde", + "milliarden", + "billion", + "billionen", + ], + plus_word="plus", ) @@ -38,6 +84,10 @@ class GermanOperators(LanguageOperators): def __init__(self): super().__init__(GERMAN_CONFIG) + self._number_normalizer = GermanNumberNormalizer() def get_word_replacements(self) -> dict[str, str]: return GERMAN_REPLACEMENTS + + def expand_written_numbers(self, text: str) -> str: + return self._number_normalizer(text) diff --git a/normalization/languages/italian/number_normalizer.py b/normalization/languages/italian/number_normalizer.py new file mode 100644 index 0000000..7fb9208 --- /dev/null +++ b/normalization/languages/italian/number_normalizer.py @@ -0,0 +1,58 @@ +"""Italian number normalizer using text2num's alpha2digit. + +Converts spelled-out numbers to digits (e.g. venti → 20) and handles +mixed digit+word forms (e.g. 2 cento → due cento) before conversion +so alpha2digit does not misinterpret them. 
# One digit group followed by a multiplier word.
# - The lookbehind rejects digits glued to a preceding digit, '.' or ',' so the
#   fractional part of a decimal ("2,3 milioni") is never rewritten.
# - Singular and plural multipliers are both accepted: milione/milioni,
#   miliardo/miliardi.
_RE_MIXED_NUMBER = re.compile(
    r"(?<![\d.,])\b(\d+)\s+(cento|mila|milion[ei]|miliard[oi])\b",
    re.IGNORECASE,
)

# Number words replaced by the post-pass in _fix_remaining_words.
_RE_UNO = re.compile(r"\buno\b", re.IGNORECASE)
_RE_DUE = re.compile(r"\bdue\b", re.IGNORECASE)


def _fix_remaining_words(text: str) -> str:
    """Replace number words alpha2digit did not convert.

    Only 'uno' and 'due' are handled, matched as whole words regardless of
    case.
    """
    text = _RE_UNO.sub("1", text)
    text = _RE_DUE.sub("2", text)
    return text


class ItalianNumberNormalizer:
    """Convert Italian spelled-out numbers to digits via text2num.alpha2digit.

    Accepts digit_words (word→digit mapping from LanguageConfig) to derive
    the digit→word mapping used for mixed-form pre-passes (e.g. '2 cento' → 'due cento').
    """

    def __init__(self, digit_words: dict[str, str]) -> None:
        """Build the digit→word lookup by inverting *digit_words*.

        If several words map to the same digit, the one listed last wins.
        """
        self._digit_to_word = {v: k for k, v in digit_words.items()}

    def _normalize_mixed_numbers(self, text: str) -> str:
        """Spell out a single digit that directly precedes a multiplier word.

        Converts '2 cento' → 'due cento' so alpha2digit yields 200, not
        '2 100'. Multi-digit numbers, digits missing from the lookup, and
        digits that belong to a decimal or grouped number ('2,3 milioni') are
        returned unchanged.
        """

        def replace(match: re.Match) -> str:
            number, multiplier = match.groups()
            # Restrict to single digits even if the lookup holds longer keys.
            if len(number) != 1 or number not in self._digit_to_word:
                return match.group(0)
            return f"{self._digit_to_word[number]} {multiplier}"

        return _RE_MIXED_NUMBER.sub(replace, text)

    def __call__(self, text: str) -> str:
        """Return *text* with Italian number words rewritten as digits.

        Runs three passes in order: the mixed-form pre-pass, alpha2digit with
        language code "it", then the post-pass for leftover words.
        """
        text = self._normalize_mixed_numbers(text)
        text = alpha2digit(text, "it")
        text = _fix_remaining_words(text)
        return text
normalization.languages.italian.replacements import ITALIAN_REPLACEMENTS +from normalization.languages.italian.sentence_replacements import ( + ITALIAN_SENTENCE_REPLACEMENTS, +) from normalization.languages.registry import register_language -# Single digits 1–9: shared by digit_words and any future time/compound helpers. +# Single digits 1-9: shared by digit_words and any future time/compound helpers. _ONE_TO_NINE: dict[str, str] = { "uno": "1", "due": "2", @@ -17,11 +19,6 @@ "nove": "9", } -ITALIAN_SENTENCE_REPLACEMENTS: dict[str, str] = { - # Spoken percentages (“dieci per cento”) → one canonical form aligned with “%” → percento - "per cento": "percento", -} - ITALIAN_CONFIG = LanguageConfig( code="it", decimal_separator=",", @@ -101,16 +98,12 @@ class ItalianOperators(LanguageOperators): def __init__(self): super().__init__(ITALIAN_CONFIG) - - def fix_one_word_in_numeric_contexts(self, text: str) -> str: - text = re.sub(r"(\d+)\s+uno\s+uno\b", r"\1 1 1", text) - text = re.sub(r"\buno\s+uno\s+(\d)", r"1 1 \1", text) - text = re.sub(r"(\d+)\s+uno\s+(\d)", r"\1 1 \2", text) - text = re.sub(r"(\d+)\s+uno\b", r"\1 1", text) - text = re.sub(r"\b(\d+)uno\b", r"\1 1", text) - text = re.sub(r"\buno\s+(\d)", r"1 \1", text) - text = re.sub(r"^uno\s+(?=[a-z])", "1 ", text) - return text + self._number_normalizer = ItalianNumberNormalizer( + ITALIAN_CONFIG.digit_words or {} + ) def get_word_replacements(self) -> dict[str, str]: return ITALIAN_REPLACEMENTS + + def expand_written_numbers(self, text: str) -> str: + return self._number_normalizer(text) diff --git a/normalization/languages/italian/sentence_replacements.py b/normalization/languages/italian/sentence_replacements.py new file mode 100644 index 0000000..3e83de7 --- /dev/null +++ b/normalization/languages/italian/sentence_replacements.py @@ -0,0 +1,3 @@ +ITALIAN_SENTENCE_REPLACEMENTS: dict[str, str] = { + "per cento": "percento", +} diff --git a/tests/e2e/files/gladia-3/de.csv b/tests/e2e/files/gladia-3/de.csv index 
91e265f..5b63973 100644 --- a/tests/e2e/files/gladia-3/de.csv +++ b/tests/e2e/files/gladia-3/de.csv @@ -30,3 +30,16 @@ halt mal so,mal so st. petersburg,st petersburg 6 tage krieg,sechstagekrieg kreuzungs punkt,kreuzungspunkt +£100,100 pounds +¥500,500 yens +$20 und $30,20 dollars und 30 dollars +zwei,2 +drei,3 +zehn,10 +zwanzig,20 +dreizehn,13 +hundert,100 +tausend,1000 +drei euro,3 euro +hundert euro,100 euro +zwanzig apfel,20 apfel diff --git a/tests/e2e/files/gladia-3/en.csv b/tests/e2e/files/gladia-3/en.csv index d37c965..f6428b0 100644 --- a/tests/e2e/files/gladia-3/en.csv +++ b/tests/e2e/files/gladia-3/en.csv @@ -120,3 +120,13 @@ x = 5,x equals 5 ø in Danish,o in danish €20 or €30,20 euros or 30 euros my name is bob,my name is bob +thirteen dogs,13 dogs +fifteen items,15 items +forty people,40 people +sixty items,60 items +seventy two,72 +eighty nine,89 +four hundred,400 +five thousand dollars,5000 dollars +three thousand five hundred,3500 +two billion people,2000000000 people diff --git a/tests/e2e/files/gladia-3/es.csv b/tests/e2e/files/gladia-3/es.csv index bda514e..f847184 100644 --- a/tests/e2e/files/gladia-3/es.csv +++ b/tests/e2e/files/gladia-3/es.csv @@ -28,3 +28,11 @@ www.gladia.io,w w w punto gladia punto io ¢25,25 céntimos £50,50 libras ¥1000,1000 yenes +cinco manzanas,5 manzanas +cero errores,0 errores +quince personas,15 personas +treinta,30 +cuarenta y cinco,45 +setenta y ocho,78 +quinientos,500 +quince mil,15000 diff --git a/tests/e2e/files/gladia-3/fr.csv b/tests/e2e/files/gladia-3/fr.csv index cd713cb..bb23b64 100644 --- a/tests/e2e/files/gladia-3/fr.csv +++ b/tests/e2e/files/gladia-3/fr.csv @@ -44,3 +44,11 @@ x = 5,x egal a 5 test@example.com,test arobase example point com bonjour (euh) ami,bonjour ami ça date d'hier,ca date d hier +seize,16 +douze pommes,12 pommes +quarante,40 +deux cents,200 +trois mille,3000 +dix-neuf,19 +quatre-vingt-dix,90 +soixante quinze,75 diff --git a/tests/e2e/files/gladia-3/it.csv 
b/tests/e2e/files/gladia-3/it.csv index c383407..7a47087 100644 --- a/tests/e2e/files/gladia-3/it.csv +++ b/tests/e2e/files/gladia-3/it.csv @@ -6,7 +6,7 @@ Costa €50,costa 50 euro 2 < 5,2 minore di 5 5 > 3,5 maggiore di 3 50°C,50 gradi celsius -dieci per cento,dieci percento +dieci per cento,10 percento vs milan,contro milan dott rossi,dottor rossi dr rossi,dottor rossi @@ -31,3 +31,42 @@ Contattami a john@example.com per favore,contattami a john chiocciola example pu La parola [inaudible] è qui,la parola inaudible e qui $5 e $10,5 dollari e 10 dollari mi chiamo bob,mi chiamo bob +sei sette otto,678 +due tre quattro cinque,2345 +venti,20 +trenta,30 +tredici,13 +cento,100 +mille,1000 +venti euro,20 euro +trenta mele,30 mele +novanta nove,99 +tre mele,3 mele +cinque euro,5 euro +sette persone,7 persone +dodici,12 +quattro,4 +uno,1 +due,2 +due mele,2 mele +ventuno,21 +quarantasei,46 +cinquanta e tre,53 +duecento,200 +duecentocinquanta,250 +due cento,200 +tre cento sessanta cinque,365 +duemila,2000 +millecinquecento,1500 +milleduecento,1200 +duemila e cinquecento,2500 +mille e duecento,1200 +novantanove mila,99000 +due milioni,2000000 +tre miliardi,3000000000 +2 miliardi,2000000000 +2 milioni,2000000 +3 milioni e duecento mila,3200000 +due cento persone,200 persone +milleduecento euro,1200 euro +novantanove mila persone,99000 persone diff --git a/tests/e2e/files/gladia-3/nl.csv b/tests/e2e/files/gladia-3/nl.csv index 0f41e50..f35a86c 100644 --- a/tests/e2e/files/gladia-3/nl.csv +++ b/tests/e2e/files/gladia-3/nl.csv @@ -23,3 +23,13 @@ twee duizend,2000 's ochtends vroeg,des ochtends vroeg ping pong,ping pong vijf en twintig euro,25 euros +drie appels,3 appels +twintig,20 +twee honderd,200 +duizend,1000 +negen en negentig,99 +zes en zestig,66 +drie miljoen,3000000 +drie miljard,3000000000 +tweehonderd eenendertig,231 +2 miljoen,2000000