From 99c4ddd0f676b13ad01c46b2639b96438f2125a4 Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Fri, 10 Apr 2026 10:45:01 +0200 Subject: [PATCH 01/10] feat: added italian language --- normalization/languages/__init__.py | 4 +- normalization/languages/italian/__init__.py | 7 ++ normalization/languages/italian/operators.py | 116 ++++++++++++++++++ .../languages/italian/replacements.py | 11 ++ 4 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 normalization/languages/italian/__init__.py create mode 100644 normalization/languages/italian/operators.py create mode 100644 normalization/languages/italian/replacements.py diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py index 5674874..838b5ad 100644 --- a/normalization/languages/__init__.py +++ b/normalization/languages/__init__.py @@ -1,7 +1,7 @@ -from . import english, french +from . import english, french, italian from .base import LanguageOperators from .registry import get_language_registry, register_language register_language(LanguageOperators) -__all__ = ["english", "french", "get_language_registry"] +__all__ = ["english", "french", "italian", "get_language_registry"] diff --git a/normalization/languages/italian/__init__.py b/normalization/languages/italian/__init__.py new file mode 100644 index 0000000..9278e42 --- /dev/null +++ b/normalization/languages/italian/__init__.py @@ -0,0 +1,7 @@ +from .operators import ItalianOperators +from .replacements import ITALIAN_REPLACEMENTS + +__all__ = [ + "ItalianOperators", + "ITALIAN_REPLACEMENTS", +] diff --git a/normalization/languages/italian/operators.py b/normalization/languages/italian/operators.py new file mode 100644 index 0000000..ed48304 --- /dev/null +++ b/normalization/languages/italian/operators.py @@ -0,0 +1,116 @@ +import re + +from normalization.languages.base import LanguageConfig, LanguageOperators +from normalization.languages.italian.replacements import ITALIAN_REPLACEMENTS +from normalization.languages.registry import register_language + +# Single digits 1–9: shared by digit_words and any future time/compound helpers. +_ONE_TO_NINE: dict[str, str] = { + "uno": "1", + "due": "2", + "tre": "3", + "quattro": "4", + "cinque": "5", + "sei": "6", + "sette": "7", + "otto": "8", + "nove": "9", +} + +ITALIAN_SENTENCE_REPLACEMENTS: dict[str, str] = { + # Spoken percentages (“dieci per cento”) → one canonical form aligned with “%” → percento + "per cento": "percento", +} + +ITALIAN_CONFIG = LanguageConfig( + code="it", + decimal_separator=",", + decimal_word="virgola", + thousand_separator=".", + symbols_to_words={ + "@": "chiocciola", + ".": "punto", + "+": "più", + "=": "uguale a", + ">": "maggiore di", + "<": "minore di", + "°": "grado", + "°C": "gradi celsius", + "°F": "gradi fahrenheit", + "%": "percento", + }, + currency_symbol_to_word={ + "€": "euro", + "$": "dollari", + "£": "sterline", + "¢": "centesimi", + "¥": "yen", + }, + filler_words=[ + "eh", + "ehm", + "mm", + "mh", + "cioè", + "cioe", + "tipo", + "insomma", + "allora", + "beh", + "bah", + "dunque", + "magari", + "praticamente", + ], + sentence_replacements=ITALIAN_SENTENCE_REPLACEMENTS, + digit_words={"zero": "0", **_ONE_TO_NINE}, + number_words=[ + "zero", + *_ONE_TO_NINE, + "dieci", + "undici", + "dodici", + "tredici", + "quattordici", + "quindici", + "sedici", + "diciassette", + "diciotto", + "diciannove", + "venti", + "trenta", + "quaranta", + "cinquanta", + "sessanta", + "settanta", + "ottanta", + "novanta", + "cento", + "mille", + "mila", + "milione", + "milioni", + "miliardo", + "miliardi", + ], + plus_word="più", +) + + +@register_language +class ItalianOperators(LanguageOperators): + def __init__(self): + super().__init__(ITALIAN_CONFIG) + + def fix_one_word_in_numeric_contexts(self, text: str) -> str: + text = re.sub(r"(\d+)\s+uno\s+uno\b", r"\1 1 1", text) + text = re.sub(r"\buno\s+uno\s+(\d)", r"1 1 \1", text) + text = re.sub(r"(\d+)\s+uno\s+(\d)", r"\1 1 \2", text) + text = re.sub(r"(\d+)\s+uno\b", r"\1 1", text) + text = re.sub(r"\b(\d+)uno\b", r"\1 1", text) + text = re.sub(r"\buno\s+(\d)", r"1 \1", text) + text = re.sub(r"^uno\s+(?=[a-z])", "1 ", text) + return text + + def get_word_replacements(self) -> dict[str, str]: + return ITALIAN_REPLACEMENTS diff --git a/normalization/languages/italian/replacements.py b/normalization/languages/italian/replacements.py new file mode 100644 index 0000000..12c5e72 --- /dev/null +++ b/normalization/languages/italian/replacements.py @@ -0,0 +1,11 @@ +ITALIAN_REPLACEMENTS: dict[str, str] = { + "avv": "avvocato", + "dott": "dottor", + "dr": "dottor", + "ecc": "eccetera", + "etc": "eccetera", + "prof": "professore", + "tel": "telefono", + "versus": "contro", + "vs": "contro", +} From ac743b238086cd74aeba045267a94d5730924b37 Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Fri, 10 Apr 2026 10:45:42 +0200 Subject: [PATCH 02/10] test: added italian language tests --- tests/e2e/files/gladia-3.csv | 38 +++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/tests/e2e/files/gladia-3.csv b/tests/e2e/files/gladia-3.csv index 85c749b..0096bd3 100644 --- a/tests/e2e/files/gladia-3.csv +++ b/tests/e2e/files/gladia-3.csv @@ -123,4 +123,40 @@ x = 5,x equals 5,en ¥1000,1000 yens,en ø in Danish,o in danish,en €20 or €30,20 euros or 30 euros,en -my name is bob,my name is bob,en \ No newline at end of file +my name is bob,my name is bob,en +#1 spot,1 spot,it +It costs €50,it costs 50 euro,it +"3,14",3 virgola 14,it +"1.234,56",1234 virgola 56,it +2 < 5,2 minore di 5,it +5 > 3,5 maggiore di 3,it +50°C,50 gradi celsius,it +dieci per cento,dieci percento,it +vs milan,contro milan,it +dott rossi,dottor rossi,it +dr rossi,dottor rossi,it +ehm tipo insomma ciao,ciao,it +admin+tag@example.com,admin tag chiocciola example punto com,it +test@example.com,test chiocciola example punto com,it +uno apple,1 apple,it +x = 5,x uguale a 5,it +"The price is 99,99",the price is 99 virgola 99,it +francois.dupont@gladia.io,francois punto dupont chiocciola gladia punto io,it +www.example.com,w w w punto example punto com,it +Version 1.0.0 released,version 1 punto 0 punto 0 released,it +api.endpoint.v2,api punto endpoint punto v2,it +fail-safe,fail safe,it +U.S.A.,u s a,it +prof bianchi,professore bianchi,it +avv verdi,avvocato verdi,it +versus inter,contro inter,it +tel 123,telefono 123,it +ecc.,eccetera,it +etc subito,eccetera subito,it ++1 (619) 981-0181,+16199810181,it +10:12 pm,10:12 pm,it +Contact me at john@example.com please,contact me at john chiocciola example punto com please,it +"Temperature is 98,6 degrees",temperature is 98 virgola 6 degrees,it +The word [inaudible] is here,the word inaudible is here,it +$5 and $10,5 dollari and 10 dollari,it +my name is bob,my name is bob,it \ No newline at end of file From a6958ed6921e65651689ebe1790556957e7f6b03 Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Fri, 10 Apr 2026 11:01:06 +0200 Subject: [PATCH 03/10] feat: add german language --- normalization/languages/__init__.py | 4 +- normalization/languages/german/__init__.py | 7 +++ normalization/languages/german/operators.py | 43 +++++++++++++++++++ .../languages/german/replacements.py | 11 +++++ .../languages/german/sentence_replacements.py | 16 +++++++ 5 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 normalization/languages/german/__init__.py create mode 100644 normalization/languages/german/operators.py create mode 100644 normalization/languages/german/replacements.py create mode 100644 normalization/languages/german/sentence_replacements.py diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py index 838b5ad..706540b 100644 --- a/normalization/languages/__init__.py +++ b/normalization/languages/__init__.py @@ -1,7 +1,7 @@ -from . import english, french, italian +from . import english, french, german, italian from .base import LanguageOperators from .registry import get_language_registry, register_language register_language(LanguageOperators) -__all__ = ["english", "french", "italian", "get_language_registry"] +__all__ = ["english", "french", "german", "italian", "get_language_registry"] diff --git a/normalization/languages/german/__init__.py b/normalization/languages/german/__init__.py new file mode 100644 index 0000000..838e83b --- /dev/null +++ b/normalization/languages/german/__init__.py @@ -0,0 +1,7 @@ +from .operators import GermanOperators +from .replacements import GERMAN_REPLACEMENTS + +__all__ = [ + "GermanOperators", + "GERMAN_REPLACEMENTS", +] diff --git a/normalization/languages/german/operators.py b/normalization/languages/german/operators.py new file mode 100644 index 0000000..ca3397a --- /dev/null +++ b/normalization/languages/german/operators.py @@ -0,0 +1,43 @@ +from normalization.languages.base import LanguageConfig, LanguageOperators +from normalization.languages.german.replacements import GERMAN_REPLACEMENTS +from normalization.languages.german.sentence_replacements import ( + GERMAN_SENTENCE_REPLACEMENTS, +) +from normalization.languages.registry import register_language + +GERMAN_CONFIG = LanguageConfig( + code="de", + decimal_separator=",", + decimal_word="komma", + thousand_separator=".", + symbols_to_words={ + "@": "at", + ".": "punkt", + "+": "plus", + "=": "gleich", + ">": "größer als", + "<": "kleiner als", + "°": "grad", + "°C": "grad celsius", + "°F": "grad fahrenheit", + "%": "prozent", + }, + currency_symbol_to_word={ + "€": "euros", + "$": "dollars", + "£": "pounds", + "¢": "cents", + "¥": "yens", + }, + filler_words=["äh", "ähm", "hm", "also", "naja", "halt"], + sentence_replacements=GERMAN_SENTENCE_REPLACEMENTS, +) + + +@register_language +class GermanOperators(LanguageOperators): + def __init__(self): + super().__init__(GERMAN_CONFIG) + + def get_word_replacements(self) -> dict[str, str]: + return GERMAN_REPLACEMENTS diff --git a/normalization/languages/german/replacements.py b/normalization/languages/german/replacements.py new file mode 100644 index 0000000..4b220b1 --- /dev/null +++ b/normalization/languages/german/replacements.py @@ -0,0 +1,11 @@ +GERMAN_REPLACEMENTS: dict[str, str] = { + "u.": "unter", + "chr.": "christus", + "rissströmungen": "riss-strömungen", + "kilometer": "km", + "xdrtb": "xdr-tb", + "dualradio": "dual-radio", + "st.": "sankt", + "bis": "-", + "maubewegung": "mau-bewegung", +} diff --git a/normalization/languages/german/sentence_replacements.py b/normalization/languages/german/sentence_replacements.py new file mode 100644 index 0000000..31086fa --- /dev/null +++ b/normalization/languages/german/sentence_replacements.py @@ -0,0 +1,16 @@ +GERMAN_SENTENCE_REPLACEMENTS: dict[str, str] = { + "regimeet kritischen": "regimekritischen", + "cannabis joints": "cannabisjoints", + "kampf handlungen": "kampfhandlungen", + "erwachsenen pornografie": "erwachsenenpornographie", + "standbild format": "standbildformat", + "internet radio seite": "internetradioseite", + "alt gedienten": "altgedienten", + "6 tage krieg": "sechstagekrieg", + "kreuzungs punkt": "kreuzungspunkt", + "wild card": "wildcard", + "national parks": "nationalparks", + "internet suche": "internetsuche", + "gleichgewicht geschlechtliche": "gleichgeschlechtlichen", + "welt kulturerbegebiete": "weltkulturerbegebiete", +} From 49339e469175380b27da412015968901627e653d Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Fri, 10 Apr 2026 11:04:14 +0200 Subject: [PATCH 04/10] test: added german language tests --- tests/e2e/files/gladia-3.csv | 38 +++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/tests/e2e/files/gladia-3.csv b/tests/e2e/files/gladia-3.csv index 0096bd3..d91c3c6 100644 --- a/tests/e2e/files/gladia-3.csv +++ b/tests/e2e/files/gladia-3.csv @@ -159,4 +159,40 @@ Contact me at john@example.com please,contact me at john chiocciola example punt "Temperature is 98,6 degrees",temperature is 98 virgola 6 degrees,it The word [inaudible] is here,the word inaudible is here,it $5 and $10,5 dollari and 10 dollari,it -my name is bob,my name is bob,it \ No newline at end of file +my name is bob,my name is bob,it +#1 spot,1 spot,de +It costs €50,it costs 50 euros,de +"3,14",3 komma 14,de +"1.234,56",1234 komma 56,de +2 < 5,2 kleiner als 5,de +5 > 3,5 grosser als 3,de +50°C,50 grad celsius,de +admin+tag@example.com,admin tag at example punkt com,de +test@example.com,test at example punkt com,de +x = 5,x gleich 5,de +"The price is 99,99",the price is 99 komma 99,de +francois.dupont@gladia.io,francois punkt dupont at gladia punkt io,de +www.example.com,w w w punkt example punkt com,de +Version 1.0.0 released,version 1 punkt 0 punkt 0 released,de +api.endpoint.v2,api punkt endpoint punkt v2,de +fail-safe,fail safe,de +U.S.A.,u s a,de ++1 (619) 981-0181,+16199810181,de +10:12 pm,10:12 pm,de +Contact me at john@example.com please,contact me at john at example punkt com please,de +"Temperature is 98,6 degrees",temperature is 98 komma 6 degrees,de +The word [inaudible] is here,the word inaudible is here,de +$5 and $10,5 dollars and 10 dollars,de +my name is bob,my name is bob,de +5 bis 10,5 - 10,de +kilometer weg,km weg,de +internet suche heute,internetsuche heute,de +wild card spiel,wildcard spiel,de +national parks tour,nationalparks tour,de +also naja hallo,hallo,de +äh ähm hallo,ah ahm hallo,de +hm okay,okay,de +halt mal so,mal so,de +st. petersburg,st petersburg,de +6 tage krieg,sechstagekrieg,de +kreuzungs punkt,kreuzungspunkt,de \ No newline at end of file From 8318ca2b6f5078ecfc123bd31d04b58a83dd1e71 Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Fri, 10 Apr 2026 11:28:33 +0200 Subject: [PATCH 05/10] feat: added spanish language --- normalization/languages/__init__.py | 4 +- normalization/languages/spanish/__init__.py | 7 + .../languages/spanish/number_normalizer.py | 196 ++++++++++++++++++ normalization/languages/spanish/operators.py | 139 +++++++++++++ .../languages/spanish/replacements.py | 30 +++ 5 files changed, 374 insertions(+), 2 deletions(-) create mode 100644 normalization/languages/spanish/__init__.py create mode 100644 normalization/languages/spanish/number_normalizer.py create mode 100644 normalization/languages/spanish/operators.py create mode 100644 normalization/languages/spanish/replacements.py diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py index 706540b..18e07a6 100644 --- a/normalization/languages/__init__.py +++ b/normalization/languages/__init__.py @@ -1,7 +1,7 @@ -from . import english, french, german, italian +from . import english, french, german, italian, spanish from .base import LanguageOperators from .registry import get_language_registry, register_language register_language(LanguageOperators) -__all__ = ["english", "french", "german", "italian", "get_language_registry"] +__all__ = ["english", "french", "german", "italian", "spanish", "get_language_registry"] diff --git a/normalization/languages/spanish/__init__.py b/normalization/languages/spanish/__init__.py new file mode 100644 index 0000000..1df78ad --- /dev/null +++ b/normalization/languages/spanish/__init__.py @@ -0,0 +1,7 @@ +from .operators import SpanishOperators +from .replacements import SPANISH_REPLACEMENTS + +__all__ = [ + "SpanishOperators", + "SPANISH_REPLACEMENTS", +] diff --git a/normalization/languages/spanish/number_normalizer.py b/normalization/languages/spanish/number_normalizer.py new file mode 100644 index 0000000..632a3fc --- /dev/null +++ b/normalization/languages/spanish/number_normalizer.py @@ -0,0 +1,196 @@ +"""Convert common Spanish spelled-out numbers to digits (STT-oriented). + +Covers 0–999, ``mil`` compounds, and informal ``veinte tres`` → ``23``. +Accepts spellings with or without accents (common in transcripts). +""" + +from __future__ import annotations + +import unicodedata + + +def _fold(s: str) -> str: + s = s.lower() + return "".join( + c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn" + ) + + +def _get(table: dict[str, int], word: str) -> int | None: + fw = _fold(word) + for k, v in table.items(): + if _fold(k) == fw: + return v + return None + + +_ONES_1_9: dict[str, int] = { + "uno": 1, + "dos": 2, + "tres": 3, + "cuatro": 4, + "cinco": 5, + "seis": 6, + "siete": 7, + "ocho": 8, + "nueve": 9, +} + +_TEENS: dict[str, int] = { + "diez": 10, + "once": 11, + "doce": 12, + "trece": 13, + "catorce": 14, + "quince": 15, + "dieciseis": 16, + "diecisiete": 17, + "dieciocho": 18, + "diecinueve": 19, +} + +_VEINTI: dict[str, int] = { + "veintiuno": 21, + "veintidos": 22, + "veintitres": 23, + "veinticuatro": 24, + "veinticinco": 25, + "veintiseis": 26, + "veintisiete": 27, + "veintiocho": 28, + "veintinueve": 29, +} + +_TENS: dict[str, int] = { + "treinta": 30, + "cuarenta": 40, + "cincuenta": 50, + "sesenta": 60, + "setenta": 70, + "ochenta": 80, + "noventa": 90, +} + +_HUNDREDS: dict[str, int] = { + "cien": 100, + "ciento": 100, + "doscientos": 200, + "trescientos": 300, + "cuatrocientos": 400, + "quinientos": 500, + "seiscientos": 600, + "setecientos": 700, + "ochocientos": 800, + "novecientos": 900, +} + + +class SpanishNumberNormalizer: + def __call__(self, text: str) -> str: + if not text.strip(): + return text + words = text.split() + out: list[str] = [] + i = 0 + n = len(words) + while i < n: + parsed = self._parse_number(words, i, n) + if parsed is not None: + end, value = parsed + out.append(str(value)) + i = end + else: + out.append(words[i]) + i += 1 + return " ".join(out) + + def _parse_number(self, words: list[str], i: int, n: int) -> tuple[int, int] | None: + """If words[i:] start with a spelled number, return (exclusive_end_index, value).""" + if i >= n: + return None + + fw = _fold(words[i]) + + if fw == "cero": + return i + 1, 0 + + # --- Optional leading hundred block (cien/ciento/ doscientos …) --- + h = _get(_HUNDREDS, words[i]) + if h is not None: + j = i + 1 + if j < n and _fold(words[j]) == "mil": + base = h * 1000 + j += 1 + tail = self._parse_number(words, j, n) + if tail is not None: + end, v2 = tail + return end, base + v2 + return j, base + if h == 100: + sub = self._parse_0_99(words, j, n) + if sub is not None: + je, v = sub + return je, 100 + v + return j, 100 + sub = self._parse_0_99(words, j, n) + if sub is not None: + je, v = sub + return je, h + v + return j, h + + # --- 0–99 or leading multiplier for "mil" --- + sub99 = self._parse_0_99(words, i, n) + if sub99 is None: + return None + j, v = sub99 + if j < n and _fold(words[j]) == "mil": + j += 1 + if j >= n: + return j, v * 1000 + tail = self._parse_number(words, j, n) + if tail is not None: + end, v2 = tail + return end, v * 1000 + v2 + return j, v * 1000 + return j, v + + def _parse_0_99(self, words: list[str], i: int, n: int) -> tuple[int, int] | None: + if i >= n: + return None + w = words[i] + fw = _fold(w) + + if fw == "veinte": + if i + 1 < n: + o = _get(_ONES_1_9, words[i + 1]) + if o is not None: + return i + 2, 20 + o + return i + 1, 20 + + v = _get(_VEINTI, w) + if v is not None: + return i + 1, v + + v = _get(_TEENS, w) + if v is not None: + return i + 1, v + + v = _get(_ONES_1_9, w) + if v is not None: + return i + 1, v + + tens = _get(_TENS, w) + if tens is None: + return None + j = i + 1 + if j < n and _fold(words[j]) == "y": + j += 1 + if j < n: + o = _get(_ONES_1_9, words[j]) + if o is not None: + return j + 1, tens + o + if i + 1 < n and tens >= 30: + o = _get(_ONES_1_9, words[i + 1]) + if o is not None: + return i + 2, tens + o + return i + 1, tens diff --git a/normalization/languages/spanish/operators.py b/normalization/languages/spanish/operators.py new file mode 100644 index 0000000..61c0dbd --- /dev/null +++ b/normalization/languages/spanish/operators.py @@ -0,0 +1,139 @@ +import re + +from normalization.languages.base import LanguageConfig, LanguageOperators +from normalization.languages.registry import register_language +from normalization.languages.spanish.number_normalizer import SpanishNumberNormalizer +from normalization.languages.spanish.replacements import SPANISH_REPLACEMENTS + +_ONE_TO_NINE: dict[str, str] = { + "uno": "1", + "dos": "2", + "tres": "3", + "cuatro": "4", + "cinco": "5", + "seis": "6", + "siete": "7", + "ocho": "8", + "nueve": "9", +} + +SPANISH_CONFIG = LanguageConfig( + code="es", + decimal_separator=",", + decimal_word="punto", + thousand_separator=".", + symbols_to_words={ + "@": "arroba", + ".": "punto", + "+": "más", + "=": "igual a", + ">": "mayor que", + "<": "menor que", + "°": "grado", + "°C": "grados celsius", + "°F": "grados fahrenheit", + "%": "por ciento", + }, + currency_symbol_to_word={ + "€": "euros", + "$": "dólares", + "£": "libras", + "¢": "céntimos", + "¥": "yenes", + }, + filler_words=[ + "eh", + "ehm", + "mm", + "mh", + "bueno", + "pues", + "o sea", + "tipo", + "vale", + "vaya", + "mira", + "hombre", + "mujer", + "digo", + "entonces", + "claro", + "vamos", + "este", + "esta", + ], + sentence_replacements=None, + digit_words={"cero": "0", **_ONE_TO_NINE}, + number_words=[ + "cero", + *_ONE_TO_NINE, + "diez", + "once", + "doce", + "trece", + "catorce", + "quince", + "dieciséis", + "dieciseis", + "diecisiete", + "dieciocho", + "diecinueve", + "veinte", + "veintiuno", + "veintidos", + "veintitres", + "veinticuatro", + "veinticinco", + "veintiseis", + "veintisiete", + "veintiocho", + "veintinueve", + "treinta", + "cuarenta", + "cincuenta", + "sesenta", + "setenta", + "ochenta", + "noventa", + "cien", + "ciento", + "doscientos", + "trescientos", + "cuatrocientos", + "quinientos", + "seiscientos", + "setecientos", + "ochocientos", + "novecientos", + "mil", + "millón", + "millones", + "mil millones", + "billón", + "billones", + ], + plus_word="más", +) + + +@register_language +class SpanishOperators(LanguageOperators): + def __init__(self): + super().__init__(SPANISH_CONFIG) + self._number_normalizer = SpanishNumberNormalizer() + + def fix_one_word_in_numeric_contexts(self, text: str) -> str: + text = re.sub(r"(\d+)\s+uno\s+uno\b", r"\1 1 1", text) + text = re.sub(r"\buno\s+uno\s+(\d)", r"1 1 \1", text) + text = re.sub(r"(\d+)\s+uno\s+(\d)", r"\1 1 \2", text) + text = re.sub(r"(\d+)\s+uno\b", r"\1 1", text) + text = re.sub(r"\b(\d+)uno\b", r"\1 1", text) + text = re.sub(r"\buno\s+(\d)", r"1 \1", text) + text = re.sub(r"^uno\s+(?=[a-z])", "1 ", text) + return text + + def get_word_replacements(self) -> dict[str, str]: + return SPANISH_REPLACEMENTS + + def expand_written_numbers(self, text: str) -> str: + return self._number_normalizer(text) diff --git a/normalization/languages/spanish/replacements.py b/normalization/languages/spanish/replacements.py new file mode 100644 index 0000000..a4eda19 --- /dev/null +++ b/normalization/languages/spanish/replacements.py @@ -0,0 +1,30 @@ +SPANISH_REPLACEMENTS: dict[str, str] = { + "aprox": "aproximadamente", + "av": "avenida", + "cta": "cuenta", + "d": "don", + "da": "doña", + "dept": "departamento", + "depto": "departamento", + "doc": "documento", + "dr": "doctor", + "dra": "doctora", + "etc": "etcétera", + "ej": "ejemplo", + "ext": "extensión", + "hab": "habitación", + "ing": "ingeniero", + "núm": "número", + "pag": "página", + "prof": "profesor", + "profa": "profesora", + "pza": "plaza", + "tel": "teléfono", + "tfno": "teléfono", + "ud": "usted", + "uds": "ustedes", + "vd": "usted", + "vds": "ustedes", + "versus": "versus", + "vs": "versus", +} From a6029dcf0bc4bcf8dd6206e8b15d05071eaf1a64 Mon Sep 17 00:00:00 2001 From: karamouche Date: Mon, 13 Apr 2026 17:18:49 -0400 Subject: [PATCH 06/10] refactor: restructure normalization tests to group by language --- tests/e2e/normalization_test.py | 43 +++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/tests/e2e/normalization_test.py b/tests/e2e/normalization_test.py index daa1b3d..5663fd3 100644 --- a/tests/e2e/normalization_test.py +++ b/tests/e2e/normalization_test.py @@ -32,7 +32,16 @@ def _load_tests_from_csv(csv_path: Path) -> list[NormalizationTest]: def _case_ids(cases: list[NormalizationTest]) -> list[str]: - return [f"{test.language}:{test.input[:60]}" for test in cases] + return [test.input[:60] for test in cases] + + +def _group_by_language( + tests: list[NormalizationTest], +) -> dict[str, list[NormalizationTest]]: + groups: dict[str, list[NormalizationTest]] = {} + for t in tests: + groups.setdefault(t.language, []).append(t) + return groups def _load_pipeline(preset_name_or_path: str, language: str) -> NormalizationPipeline: @@ -53,16 +62,24 @@ def _load_pipeline(preset_name_or_path: str, language: str) -> NormalizationPipe _GLADIA_3_PIPELINES: dict[str, NormalizationPipeline] = {} -@pytest.mark.parametrize( - "test", - _GLADIA_3_TESTS, - ids=_case_ids(_GLADIA_3_TESTS), -) -def test_gladia_3(test: NormalizationTest) -> None: - pipeline = _load_pipeline("gladia-3", test.language) - result = pipeline.normalize(test.input) - assert result == test.expected, ( - f"\n input: {test.input!r}" - f"\n expected: {test.expected!r}" - f"\n got: {result!r}" +def _make_gladia_3_test(language: str, cases: list[NormalizationTest]): + @pytest.mark.parametrize("test", cases, ids=_case_ids(cases)) + def _test(test: NormalizationTest) -> None: + pipeline = _load_pipeline("gladia-3", language) + result = pipeline.normalize(test.input) + assert result == test.expected, ( + f"\n input: {test.input!r}" + f"\n expected: {test.expected!r}" + f"\n got: {result!r}" + ) + + _test.__name__ = f"test_gladia_3_{language}" + return _test + + +_GLADIA_3_BY_LANGUAGE = _group_by_language(_GLADIA_3_TESTS) + +for _language in sorted(_GLADIA_3_BY_LANGUAGE): + globals()[f"test_gladia_3_{_language}"] = _make_gladia_3_test( + _language, _GLADIA_3_BY_LANGUAGE[_language] ) From 18ebc5ac7224788f161216a595181ac0ccb0e7a4 Mon Sep 17 00:00:00 2001 From: karamouche Date: Mon, 13 Apr 2026 17:39:57 -0400 Subject: [PATCH 07/10] refactor: test files structure for language normalization --- .github/pull_request_template.md | 2 +- .gitignore | 1 + AGENTS.md | 2 +- CONTRIBUTING.md | 2 +- docs/contributing-guide.md | 52 +++--- tests/e2e/files/gladia-3.csv | 251 --------------------------- tests/e2e/files/gladia-3/de.csv | 37 ++++ tests/e2e/files/gladia-3/default.csv | 9 + tests/e2e/files/gladia-3/en.csv | 126 ++++++++++++++ tests/e2e/files/gladia-3/fr.csv | 46 +++++ tests/e2e/files/gladia-3/it.csv | 37 ++++ tests/e2e/normalization_test.py | 43 +++-- 12 files changed, 311 insertions(+), 297 deletions(-) delete mode 100644 tests/e2e/files/gladia-3.csv create mode 100644 tests/e2e/files/gladia-3/de.csv create mode 100644 tests/e2e/files/gladia-3/default.csv create mode 100644 tests/e2e/files/gladia-3/en.csv create mode 100644 tests/e2e/files/gladia-3/fr.csv create mode 100644 tests/e2e/files/gladia-3/it.csv diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 92570a6..efcc3e9 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -20,7 +20,7 @@ - [ ] Decorated operators class with `@register_language` - [ ] Added one import line to `languages/__init__.py` - [ ] Added unit tests in `tests/unit/languages/` -- [ ] Added e2e test rows in `tests/e2e/files/` +- [ ] Added a per-language CSV in `tests/e2e/files/{preset}/` (e.g. `tests/e2e/files/gladia-3/fr.csv`) ### New step diff --git a/.gitignore b/.gitignore index e5bab35..4d7ac38 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ wheels/ *.egg-info .ruff_cache/ .pytest_cache/ +.DS_Store # Virtual environments diff --git a/AGENTS.md b/AGENTS.md index 0c769fe..74de346 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -144,7 +144,7 @@ Never modify a published preset YAML. Never let a preset reference a step that h - [ ] Decorate the class with `@register_language` - [ ] Add one import to `languages/__init__.py` - [ ] Add tests in `tests/unit/languages/` -- [ ] Add test rows to `tests/e2e/files/` for the new language +- [ ] Add a CSV file `tests/e2e/files/{preset}/{language_code}.csv` for each relevant preset (e.g. `tests/e2e/files/gladia-3/fr.csv`) ## Adding a new step — checklist diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index fcc3211..2448a49 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,7 @@ A new language requires: 2. Put all word-level substitutions in `replacements.py` 3. Instantiate a `LanguageConfig` and subclass `LanguageOperators` in `operators.py` 4. Decorate with `@register_language` and add one import to `normalization/languages/__init__.py` -5. Add tests under `tests/unit/languages/` and e2e fixture rows in `tests/e2e/files/` +5. Add tests under `tests/unit/languages/` and a per-language CSV in `tests/e2e/files/{preset}/` (e.g. `tests/e2e/files/gladia-3/fr.csv`) See [docs/contributing-guide.md](docs/contributing-guide.md) for the full checklist and design rules. diff --git a/docs/contributing-guide.md b/docs/contributing-guide.md index 5c038a9..488f5ec 100644 --- a/docs/contributing-guide.md +++ b/docs/contributing-guide.md @@ -26,7 +26,7 @@ This ordering is a hard constraint — some steps depend on earlier steps having - [ ] Decorate the class with `@register_language` - [ ] Add one import to `languages/__init__.py` - [ ] Add tests in `tests/unit/languages/` -- [ ] Add test rows to `tests/e2e/files/` for the new language +- [ ] Add a CSV file `tests/e2e/files/{preset}/{language_code}.csv` for each relevant preset (e.g. `tests/e2e/files/gladia-3/fr.csv`) ### Language data vs. language behavior @@ -159,42 +159,46 @@ def test_my_step_with_english(english_operators): ### E2E tests for a preset -E2E tests validate the full pipeline (preset + language) against a CSV fixture. The test runner lives in `tests/e2e/normalization_test.py` and CSV files go in `tests/e2e/files/`. +E2E tests validate the full pipeline (preset + language) against CSV fixtures. The test runner lives in `tests/e2e/normalization_test.py` and CSV files are organized under `tests/e2e/files/`. -**CSV format** — three columns, no quoting needed unless the value contains a comma: +**Directory structure** — one folder per preset, one CSV per language: ``` -input,expected,language -$1,000,000,1000000 dollars,en -hello world,hello world,fr +tests/e2e/files/ + gladia-3/ + default.csv + de.csv + en.csv + fr.csv + it.csv ``` -Each row is one test case. The `language` column must match a registered language code (or `default`). +**CSV format** — two columns (`input,expected`), no quoting needed unless the value contains a comma: -**Registering a new CSV** — add a block to `normalization_test.py` following the existing pattern: +``` +input,expected +"$1,000,000",1000000 dollars +hello world,hello world +``` + +The language is derived from the filename (e.g. `fr.csv` → language code `fr`). Use `default.csv` for the language-agnostic fallback. + +**Adding test cases for an existing preset** — drop rows into the appropriate `{language_code}.csv` file, or create a new CSV if the language isn't covered yet. Tests are discovered automatically. + +**Registering a new preset** — add a block to `normalization_test.py` following the existing pattern: ```python -_MY_PRESET_CSV = _FILES_DIR / "my-preset.csv" -_MY_PRESET_TESTS = _load_tests_from_csv(_MY_PRESET_CSV) if _MY_PRESET_CSV.exists() else [] +_MY_PRESET_DIR = _FILES_DIR / "my-preset" +_MY_PRESET_BY_LANGUAGE = _discover_preset_tests(_MY_PRESET_DIR) _MY_PRESET_PIPELINES: dict[str, NormalizationPipeline] = {} - -@pytest.mark.parametrize( - "test", - _MY_PRESET_TESTS, - ids=_case_ids(_MY_PRESET_TESTS), -) -def test_my_preset(test: NormalizationTest) -> None: - pipeline = _load_pipeline("my-preset", test.language) - result = pipeline.normalize(test.input) - assert result == test.expected, ( - f"\n input: {test.input!r}" - f"\n expected: {test.expected!r}" - f"\n got: {result!r}" +for _language in sorted(_MY_PRESET_BY_LANGUAGE): + globals()[f"test_my_preset_{_language}"] = _make_test( + "my-preset", _language, _MY_PRESET_BY_LANGUAGE[_language], _MY_PRESET_PIPELINES ) ``` -Pipelines are cached per language inside `_MY_PRESET_PIPELINES` to avoid reloading for each parametrized case — follow the `_load_pipeline` helper pattern already in the file. +Pipelines are cached per language to avoid reloading for each parametrized case. --- diff --git a/tests/e2e/files/gladia-3.csv b/tests/e2e/files/gladia-3.csv deleted file mode 100644 index 1381186..0000000 --- a/tests/e2e/files/gladia-3.csv +++ /dev/null @@ -1,251 +0,0 @@ -input,expected,language -#1 spot,1 spot,en -"$1,000,000",1000000 dollars,en -$5 and $10,5 dollars and 10 dollars,en -$50.75 total,50 point 75 dollars total,en -+1 (619) 981-0181,+16199810181,en -05 45 pm,5:45 pm,en -05:45pm,05:45 pm,en -"1,234.56",1234 point 56,en -1.1.1.1,1 dot 1 dot 1 dot 1,en -10 a m,10 am,en -1012 am,10:12 am,en -10:00 pm,10 pm,en -10:12 pm,10:12 pm,en -10:54 a m,10:54 am,en -11 a m,11 am,en -1145 pm,11:45 pm,en -12 p m,12 pm,en -12:34 p m,12:34 pm,en -Let's meet at noon o'clock,let us meet at 12:00,en -Appointment is at 3 o'clock,appointment is at 3:00,en -192.168.1.1,192 dot 168 dot 1 dot 1,en -2 < 5,2 less than 5,en -3.14,3 point 14,en -5 45 p m,5:45 pm,en -5 > 3,5 greater than 3,en -5 a m,5 am,en -5.45 p.m.,5:45 pm,en -50°C,50 degree celsius,en -545 pm,5:45 pm,en -6 p m,6 pm,en -602 am,6:02 am,en -6:00 am,6 am,en -6:24 am,6:24 am,en -9.8 m/s,9 point 8 m/s,en -About ¢25,about 25 cents,en -About ¢25 cents only,about 25 cents only,en -About ¥1000 yen total,about 1000 yens total,en -admin+tag@example.com,admin tag at example dot com,en -api.endpoint.v2,api dot endpoint dot v2,en -at eleven twenty-five a.m,at 11:25 am,en -at French numbers plus three three oh six six two seven three two six four three,at french numbers +330662732643,en -at ten o'clock,at 10:00,en -at two p.m,at 2 pm,en -at two thirty p.m,at 2:30 pm,en -bob b-o-b dupov d-u-p-o-v,bob b o b dupov d u p o v,en -CAPS@EXAMPLE.COM,c a p s at e x a m p l e dot c o m,en -Contact john.doe@company.co.uk now,contact john dot doe at company dot co dot uk now,en -Contact me at john@example.com please,contact me at john at example dot com please,en -example.com,example dot com,en -fail-safe,fail safe,en -file.txt,file dot txt,en -first.last+tag@subdomain.example.com,1st dot last tag at subdomain dot example dot com,en -francois.dupont@gladia.io,francois dot dupont at gladia dot io,en -good bye,goodbye,en -he ain't gonna,he is not going to,en -hello (yeah) there,hello there,en -hello (yeah) there,hello there,en -I have $20 dollars here,i have 20 dollars here,en -is +16209113040,is +16209113040,en -is one 620 911 3040,is 16209113040,en -is one 620 911 3040,is 16209113040,en -is plus 33 6 80 63 10 00,is +33680631000,en -is plus 330662732643,is +330662732643,en -is plus one 620 911 3040,is +16209113040,en -It costs £30 pounds,it costs 30 pounds,en -It costs €50,it costs 50 euros,en -It's 12.5 dollars,it is 12 point 5 dollars,en -j o h a n n,j o h a n n,en -jane_smith@test.co.uk,jane smith at test dot co dot uk,en -john j-o-h-n doe d-o-e,john j o h n doe d o e,en -my name is B.O.B,my name is b o b,en -john.doe@company.org,john dot doe at company dot org,en -lemme see,let me see,en -my phone number is o 4 5 o 6 4 3 2 1 1 and,my phone number is 0450643211 and,en -ninety nine items,99 items,en -o 4 5 o 6 4 3 2 1 1,0450643211,en -o 4 5 o 6 4 3 2 1 6,0450643216,en -o 4 5 o 6 4 3 2 1 o,0450643210,en -one apple,1 apple,en -one billion dollars,1000000000 dollars,en -one hundred,100,en -one hundred people,100 people,en -one hundred thousand items,100000 items,en -one million dollars,1000000 dollars,en -One million dollars total,1000000 dollars total,en -one thousand dollars,1000 dollars,en -one thousand three hundred and thirty seven,1337,en -"one, two, three",123,en -Plus 1 16 plus equals one.,+116 plus equals one,en -plus 1-619-981-0181,+16199810181,en -she ain't gonna,she is not going to,en -Temperature is 98.6 degrees,temperature is 98 point 6 degrees,en -ten thousand people,10000 people,en -test@example.com,test at example dot com,en -test@example.com.,test at example dot com,en -test@gladia.io.,test at gladia dot io,en -That's £100,that is 100 pounds,en -The code is ABC123 here,the code is a b c 123 here,en -The price is $99,the price is 99 dollars,en -The price is 99.99,the price is 99 point 99,en -The price is €50 euros,the price is 50 euros,en -The word [inaudible] is here,the word inaudible is here,en -twenty one dogs,21 dogs,en -twenty three people,23 people,en -twenty two items,22 items,en -two hundred items,200 items,en -two million people,2000000 people,en -two thousand,2000,en -two thousand items,2000 items,en -U.S.A.,u s a,en -user.name@domain.com,user dot name at domain dot com,en -user@domain.com,user at domain dot com,en -user@sub.domain.example.org,user at sub dot domain dot example dot org,en -Version 1.0.0 released,version 1 dot 0 dot 0 released,en -version 2.5.1,version 2 dot 5 dot 1,en -Visit www.example.com today,visit w w w dot example dot com today,en -Look at my v12 motor,look at my v12 motor,en -www.example.com,w w w dot example dot com,en -x = 5,x equals 5,en -"zip code 92103, U.S.",zip code 92103 u s,en -£5.50,5 point 50 pounds,en -¥1000,1000 yens,en -ø in Danish,o in danish,en -€20 or €30,20 euros or 30 euros,en -my name is bob,my name is bob,en -#1 spot,1 spot,it -It costs €50,it costs 50 euro,it -"3,14",3 virgola 14,it -"1.234,56",1234 virgola 56,it -2 < 5,2 minore di 5,it -5 > 3,5 maggiore di 3,it -50°C,50 gradi celsius,it -dieci per cento,dieci percento,it -vs milan,contro milan,it -dott rossi,dottor rossi,it -dr rossi,dottor rossi,it -ehm tipo insomma ciao,ciao,it -admin+tag@example.com,admin tag chiocciola example punto com,it -test@example.com,test chiocciola example punto com,it -uno apple,1 apple,it -x = 5,x uguale a 5,it -"The price is 99,99",the price is 99 virgola 99,it -francois.dupont@gladia.io,francois punto dupont chiocciola gladia punto io,it -www.example.com,w w w punto example punto com,it -Version 1.0.0 released,version 1 punto 0 punto 0 released,it -api.endpoint.v2,api punto endpoint punto v2,it -fail-safe,fail safe,it -U.S.A.,u s a,it -prof bianchi,professore bianchi,it -avv verdi,avvocato verdi,it -versus inter,contro inter,it -tel 123,telefono 123,it -ecc.,eccetera,it -etc subito,eccetera subito,it -+1 (619) 981-0181,+16199810181,it -10:12 pm,10:12 pm,it -Contact me at john@example.com please,contact me at john chiocciola example punto com please,it -"Temperature is 98,6 degrees",temperature is 98 virgola 6 degrees,it -The word [inaudible] is here,the word inaudible is here,it -$5 and $10,5 dollari and 10 dollari,it -my name is bob,my name is bob,it -#1 spot,1 spot,de -It costs €50,it costs 50 euros,de -"3,14",3 komma 14,de -"1.234,56",1234 komma 56,de -2 < 5,2 kleiner als 5,de -5 > 3,5 grosser als 3,de -50°C,50 grad celsius,de -admin+tag@example.com,admin tag at example punkt com,de -test@example.com,test at example punkt com,de -x = 5,x gleich 5,de -"The price is 99,99",the price is 99 komma 99,de -francois.dupont@gladia.io,francois punkt dupont at gladia punkt io,de -www.example.com,w w w punkt example punkt com,de -Version 1.0.0 released,version 1 punkt 0 punkt 0 released,de -api.endpoint.v2,api punkt endpoint punkt v2,de -fail-safe,fail safe,de -U.S.A.,u s a,de -+1 (619) 981-0181,+16199810181,de -10:12 pm,10:12 pm,de -Contact me at john@example.com please,contact me at john at example punkt com please,de -"Temperature is 98,6 degrees",temperature is 98 komma 6 degrees,de -The word [inaudible] is here,the word inaudible is here,de -$5 and $10,5 dollars and 10 dollars,de -my name is bob,my name is bob,de -5 bis 10,5 - 10,de -kilometer weg,km weg,de -internet suche heute,internetsuche heute,de -wild card spiel,wildcard spiel,de -national parks tour,nationalparks tour,de -also naja hallo,hallo,de -äh ähm hallo,ah ahm hallo,de -hm okay,okay,de -halt mal so,mal so,de -st. petersburg,st petersburg,de -6 tage krieg,sechstagekrieg,de -kreuzungs punkt,kreuzungspunkt,de -j'ai dit c'est bien,j ai dit c est bien,fr -vingt trois pommes,23 pommes,fr -3 milliards d euros,3000000000 d euros,fr -euh alors hein bah oui,alors oui,fr -"12,5 degrés",12 virgule 5 degres,fr -pour 100 de réduction,pourcent de reduction,fr -pour cent de réduction,pourcent de reduction,fr -"Hello, world!",hello world,default -ça va?!,ca va,default -$100,$100,default -80 €,80 €,default -test@example.com,test@example.com,default -+1234567890,+1234567890,default -one two three,one two three,default -5:30 pm,5:30 pm,default -d'accord,d accord,fr -qu'il vient,qu il vient,fr -n'est pas,n est pas,fr -l'ordinateur,l ordinateur,fr -m'appelle,m appelle,fr -s'il vous plait,s il vous plait,fr -t'as vu,t as vu,fr -cent euros,100 euros,fr -mille deux cents,1200,fr -cinquante trois,53,fr -contact@exemple.fr,contact arobase exemple point fr,fr -"2 < 5",2 plus petit que 5,fr -50°C,50 degres celsius,fr -ca coute €50,ca coute 50 euros,fr -euh bonjour hein,bonjour,fr -mme dupont,madame dupont,fr -mlle dubois,mademoiselle dubois,fr -dr martin,docteur martin,fr -prof dupont,professeur dupont,fr -st jean,saint jean,fr -ping pong,pingpong,fr -volley ball,volleyball,fr -basket ball,basketball,fr -hand ball,handball,fr -water polo,waterpolo,fr -t shirt,tshirt,fr -cd rom,cdrom,fr -super predateur,superpredateur,fr -"3,14 pi",3 virgule 14 pi,fr -soixante-dix,70,fr -quatre-vingts,80,fr -quatre-vingt-un,81,fr -nonante-neuf,99,fr -septante et un,71,fr -x = 5,x egal a 5,fr -test@example.com,test arobase example point com,fr -bonjour (euh) ami,bonjour ami,fr -ça date d'hier,ca date d hier,fr diff --git a/tests/e2e/files/gladia-3/de.csv b/tests/e2e/files/gladia-3/de.csv new file mode 100644 index 0000000..4021dc8 --- /dev/null +++ b/tests/e2e/files/gladia-3/de.csv @@ -0,0 +1,37 @@ +input,expected +#1 spot,1 spot +It costs €50,it costs 50 euros +"3,14",3 komma 14 +"1.234,56",1234 komma 56 +2 < 5,2 kleiner als 5 +5 > 3,5 grosser als 3 +50°C,50 grad celsius +admin+tag@example.com,admin tag at example punkt com +test@example.com,test at example punkt com +x = 5,x gleich 5 +"The price is 99,99",the price is 99 komma 99 +francois.dupont@gladia.io,francois punkt dupont at gladia punkt io +www.example.com,w w w punkt example punkt com +Version 1.0.0 released,version 1 punkt 0 punkt 0 released +api.endpoint.v2,api punkt endpoint punkt v2 +fail-safe,fail safe +U.S.A.,u s a ++1 (619) 981-0181,+16199810181 +10:12 pm,10:12 pm +Contact me at john@example.com please,contact me at john at example punkt com please +"Temperature is 98,6 degrees",temperature is 98 komma 6 degrees +The word [inaudible] is here,the word inaudible is here +$5 and $10,5 dollars and 10 dollars +my name is bob,my name is bob +5 bis 10,5 - 10 +kilometer weg,km weg +internet suche heute,internetsuche heute +wild card spiel,wildcard spiel +national parks tour,nationalparks tour +also naja hallo,hallo +äh ähm hallo,ah ahm hallo +hm okay,okay +halt mal so,mal so +st. petersburg,st petersburg +6 tage krieg,sechstagekrieg +kreuzungs punkt,kreuzungspunkt \ No newline at end of file diff --git a/tests/e2e/files/gladia-3/default.csv b/tests/e2e/files/gladia-3/default.csv new file mode 100644 index 0000000..4258ba0 --- /dev/null +++ b/tests/e2e/files/gladia-3/default.csv @@ -0,0 +1,9 @@ +input,expected +"Hello, world!",hello world +ça va?!,ca va +$100,$100 +80 €,80 € +test@example.com,test@example.com ++1234567890,+1234567890 +one two three,one two three +5:30 pm,5:30 pm \ No newline at end of file diff --git a/tests/e2e/files/gladia-3/en.csv b/tests/e2e/files/gladia-3/en.csv new file mode 100644 index 0000000..25ca0d6 --- /dev/null +++ b/tests/e2e/files/gladia-3/en.csv @@ -0,0 +1,126 @@ +input,expected +#1 spot,1 spot +"$1,000,000",1000000 dollars +$5 and $10,5 dollars and 10 dollars +$50.75 total,50 point 75 dollars total ++1 (619) 981-0181,+16199810181 +05 45 pm,5:45 pm +05:45pm,05:45 pm +"1,234.56",1234 point 56 +1.1.1.1,1 dot 1 dot 1 dot 1 +10 a m,10 am +1012 am,10:12 am +10:00 pm,10 pm +10:12 pm,10:12 pm +10:54 a m,10:54 am +11 a m,11 am +1145 pm,11:45 pm +12 p m,12 pm +12:34 p m,12:34 pm +Let's meet at noon o'clock,let us meet at 12:00 +Appointment is at 3 o'clock,appointment is at 3:00 +192.168.1.1,192 dot 168 dot 1 dot 1 +2 < 5,2 less than 5 +3.14,3 point 14 +5 45 p m,5:45 pm +5 > 3,5 greater than 3 +5 a m,5 am +5.45 p.m.,5:45 pm +50°C,50 degree celsius +545 pm,5:45 pm +6 p m,6 pm +602 am,6:02 am +6:00 am,6 am +6:24 am,6:24 am +9.8 m/s,9 point 8 m/s +About ¢25,about 25 cents +About ¢25 cents only,about 25 cents only +About ¥1000 yen total,about 1000 yens total +admin+tag@example.com,admin tag at example dot com +api.endpoint.v2,api dot endpoint dot v2 +at eleven twenty-five a.m,at 11:25 am +at French numbers plus three three oh six six two seven three two six four three,at french numbers +330662732643 +at ten o'clock,at 10:00 +at two p.m,at 2 pm +at two thirty p.m,at 2:30 pm +bob b-o-b dupov d-u-p-o-v,bob b o b dupov d u p o v +CAPS@EXAMPLE.COM,c a p s at e x a m p l e dot c o m +Contact john.doe@company.co.uk now,contact john dot doe at company dot co dot uk now +Contact me at john@example.com please,contact me at john at example dot com please +example.com,example dot com +fail-safe,fail safe +file.txt,file dot txt +first.last+tag@subdomain.example.com,1st dot last tag at subdomain dot example dot com +francois.dupont@gladia.io,francois dot dupont at gladia dot io +good bye,goodbye +he ain't gonna,he is not going to +hello (yeah) there,hello there +hello (yeah) there,hello there +I have $20 dollars here,i have 20 dollars here +is +16209113040,is +16209113040 +is one 620 911 3040,is 16209113040 +is one 620 911 3040,is 16209113040 +is plus 33 6 80 63 10 00,is +33680631000 +is plus 330662732643,is +330662732643 +is plus one 620 911 3040,is +16209113040 +It costs £30 pounds,it costs 30 pounds +It costs €50,it costs 50 euros +It's 12.5 dollars,it is 12 point 5 dollars +j o h a n n,j o h a n n +jane_smith@test.co.uk,jane smith at test dot co dot uk +john j-o-h-n doe d-o-e,john j o h n doe d o e +my name is B.O.B,my name is b o b +john.doe@company.org,john dot doe at company dot org +lemme see,let me see +my phone number is o 4 5 o 6 4 3 2 1 1 and,my phone number is 0450643211 and +ninety nine items,99 items +o 4 5 o 6 4 3 2 1 1,0450643211 +o 4 5 o 6 4 3 2 1 6,0450643216 +o 4 5 o 6 4 3 2 1 o,0450643210 +one apple,1 apple +one billion dollars,1000000000 dollars +one hundred,100 +one hundred people,100 people +one hundred thousand items,100000 items +one million dollars,1000000 dollars +One million dollars total,1000000 dollars total +one thousand dollars,1000 dollars +one thousand three hundred and thirty seven,1337 +"one, two, three",123 +Plus 1 16 plus equals one.,+116 plus equals one +plus 1-619-981-0181,+16199810181 +she ain't gonna,she is not going to +Temperature is 98.6 degrees,temperature is 98 point 6 degrees +ten thousand people,10000 people +test@example.com,test at example dot com +test@example.com.,test at example dot com +test@gladia.io.,test at gladia dot io +That's £100,that is 100 pounds +The code is ABC123 here,the code is a b c 123 here +The price is $99,the price is 99 dollars +The price is 99.99,the price is 99 point 99 +The price is €50 euros,the price is 50 euros +The word [inaudible] is here,the word inaudible is here +twenty one dogs,21 dogs +twenty three people,23 people +twenty two items,22 items +two hundred items,200 items +two million people,2000000 people +two thousand,2000 +two thousand items,2000 items +U.S.A.,u s a +user.name@domain.com,user dot name at domain dot com +user@domain.com,user at domain dot com +user@sub.domain.example.org,user at sub dot domain dot example dot org +Version 1.0.0 released,version 1 dot 0 dot 0 released +version 2.5.1,version 2 dot 5 dot 1 +Visit www.example.com today,visit w w w dot example dot com today +Look at my v12 motor,look at my v12 motor +www.example.com,w w w dot example dot com +x = 5,x equals 5 +"zip code 92103, U.S.",zip code 92103 u s +£5.50,5 point 50 pounds +¥1000,1000 yens +ø in Danish,o in danish +€20 or €30,20 euros or 30 euros +my name is bob,my name is bob \ No newline at end of file diff --git a/tests/e2e/files/gladia-3/fr.csv b/tests/e2e/files/gladia-3/fr.csv new file mode 100644 index 0000000..da16c21 --- /dev/null +++ b/tests/e2e/files/gladia-3/fr.csv @@ -0,0 +1,46 @@ +input,expected +j'ai dit c'est bien,j ai dit c est bien +vingt trois pommes,23 pommes +3 milliards d euros,3000000000 d euros +euh alors hein bah oui,alors oui +"12,5 degrés",12 virgule 5 degres +pour 100 de réduction,pourcent de reduction +pour cent de réduction,pourcent de reduction +d'accord,d accord +qu'il vient,qu il vient +n'est pas,n est pas +l'ordinateur,l ordinateur +m'appelle,m appelle +s'il vous plait,s il vous plait +t'as vu,t as vu +cent euros,100 euros +mille deux cents,1200 +cinquante trois,53 +contact@exemple.fr,contact arobase exemple point fr +"2 < 5",2 plus petit que 5 +50°C,50 degres celsius +ca coute €50,ca coute 50 euros +euh bonjour hein,bonjour +mme dupont,madame dupont +mlle dubois,mademoiselle dubois +dr martin,docteur martin +prof dupont,professeur dupont +st jean,saint jean +ping pong,pingpong +volley ball,volleyball +basket ball,basketball +hand ball,handball +water polo,waterpolo +t shirt,tshirt +cd rom,cdrom +super predateur,superpredateur +"3,14 pi",3 virgule 14 pi +soixante-dix,70 +quatre-vingts,80 +quatre-vingt-un,81 +nonante-neuf,99 +septante et un,71 +x = 5,x egal a 5 +test@example.com,test arobase example point com +bonjour (euh) ami,bonjour ami +ça date d'hier,ca date d hier \ No newline at end of file diff --git a/tests/e2e/files/gladia-3/it.csv b/tests/e2e/files/gladia-3/it.csv new file mode 100644 index 0000000..895daae --- /dev/null +++ b/tests/e2e/files/gladia-3/it.csv @@ -0,0 +1,37 @@ +input,expected +#1 spot,1 spot +It costs €50,it costs 50 euro +"3,14",3 virgola 14 +"1.234,56",1234 virgola 56 +2 < 5,2 minore di 5 +5 > 3,5 maggiore di 3 +50°C,50 gradi celsius +dieci per cento,dieci percento +vs milan,contro milan +dott rossi,dottor rossi +dr rossi,dottor rossi +ehm tipo insomma ciao,ciao +admin+tag@example.com,admin tag chiocciola example punto com +test@example.com,test chiocciola example punto com +uno apple,1 apple +x = 5,x uguale a 5 +"The price is 99,99",the price is 99 virgola 99 +francois.dupont@gladia.io,francois punto dupont chiocciola gladia punto io +www.example.com,w w w punto example punto com +Version 1.0.0 released,version 1 punto 0 punto 0 released +api.endpoint.v2,api punto endpoint punto v2 +fail-safe,fail safe +U.S.A.,u s a +prof bianchi,professore bianchi +avv verdi,avvocato verdi +versus inter,contro inter +tel 123,telefono 123 +ecc.,eccetera +etc subito,eccetera subito ++1 (619) 981-0181,+16199810181 +10:12 pm,10:12 pm +Contact me at john@example.com please,contact me at john chiocciola example punto com please +"Temperature is 98,6 degrees",temperature is 98 virgola 6 degrees +The word [inaudible] is here,the word inaudible is here +$5 and $10,5 dollari and 10 dollari +my name is bob,my name is bob \ No newline at end of file diff --git a/tests/e2e/normalization_test.py b/tests/e2e/normalization_test.py index 5663fd3..666b284 100644 --- a/tests/e2e/normalization_test.py +++ b/tests/e2e/normalization_test.py @@ -12,7 +12,6 @@ @dataclass class NormalizationTest: - language: str input: str expected: str @@ -23,7 +22,6 @@ def _load_tests_from_csv(csv_path: Path) -> list[NormalizationTest]: for row in csv.DictReader(f): rows.append( NormalizationTest( - language=row["language"], input=row["input"], expected=row["expected"], ) @@ -35,13 +33,31 @@ def _case_ids(cases: list[NormalizationTest]) -> list[str]: return [test.input[:60] for test in cases] -def _group_by_language( - tests: list[NormalizationTest], +def _discover_preset_tests( + preset_dir: Path, ) -> dict[str, list[NormalizationTest]]: - groups: dict[str, list[NormalizationTest]] = {} - for t in tests: - groups.setdefault(t.language, []).append(t) - return groups + """Scan a preset directory for per-language CSV files. + + Returns a dict mapping language code (filename stem) to test cases. + """ + tests: dict[str, list[NormalizationTest]] = {} + if not preset_dir.is_dir(): + return tests + for csv_path in sorted(preset_dir.glob("*.csv")): + language = csv_path.stem + cases = _load_tests_from_csv(csv_path) + if cases: + tests[language] = cases + return tests + + +# --------------------------------------------------------------------------- +# gladia_3 +# --------------------------------------------------------------------------- + +_GLADIA_3_DIR = _FILES_DIR / "gladia-3" +_GLADIA_3_BY_LANGUAGE = _discover_preset_tests(_GLADIA_3_DIR) +_GLADIA_3_PIPELINES: dict[str, NormalizationPipeline] = {} def _load_pipeline(preset_name_or_path: str, language: str) -> NormalizationPipeline: @@ -53,15 +69,6 @@ def _load_pipeline(preset_name_or_path: str, language: str) -> NormalizationPipe return _GLADIA_3_PIPELINES[language] -# --------------------------------------------------------------------------- -# gladia_3 -# --------------------------------------------------------------------------- - -_GLADIA_3_CSV = _FILES_DIR / "gladia-3.csv" -_GLADIA_3_TESTS = _load_tests_from_csv(_GLADIA_3_CSV) if _GLADIA_3_CSV.exists() else [] -_GLADIA_3_PIPELINES: dict[str, NormalizationPipeline] = {} - - def _make_gladia_3_test(language: str, cases: list[NormalizationTest]): @pytest.mark.parametrize("test", cases, ids=_case_ids(cases)) def _test(test: NormalizationTest) -> None: @@ -77,8 +84,6 @@ def _test(test: NormalizationTest) -> None: return _test -_GLADIA_3_BY_LANGUAGE = _group_by_language(_GLADIA_3_TESTS) - for _language in sorted(_GLADIA_3_BY_LANGUAGE): globals()[f"test_gladia_3_{_language}"] = _make_gladia_3_test( _language, _GLADIA_3_BY_LANGUAGE[_language] From 693499210f9a9c02007e1536b5daf0a2c60d4a70 Mon Sep 17 00:00:00 2001 From: karamouche Date: Mon, 13 Apr 2026 18:24:27 -0400 Subject: [PATCH 08/10] test: update language normalization tests for German, French, Italian and Spanish languages --- tests/e2e/files/gladia-3/de.csv | 24 ++++++++++------------ tests/e2e/files/gladia-3/default.csv | 4 +++- tests/e2e/files/gladia-3/en.csv | 6 +----- tests/e2e/files/gladia-3/es.csv | 30 ++++++++++++++++++++++++++++ tests/e2e/files/gladia-3/fr.csv | 2 +- tests/e2e/files/gladia-3/it.csv | 24 ++++++++++------------ 6 files changed, 55 insertions(+), 35 deletions(-) create mode 100644 tests/e2e/files/gladia-3/es.csv diff --git a/tests/e2e/files/gladia-3/de.csv b/tests/e2e/files/gladia-3/de.csv index 4021dc8..9704616 100644 --- a/tests/e2e/files/gladia-3/de.csv +++ b/tests/e2e/files/gladia-3/de.csv @@ -1,6 +1,6 @@ input,expected -#1 spot,1 spot -It costs €50,it costs 50 euros +#1 Platz,1 platz +Es kostet €50,es kostet 50 euros "3,14",3 komma 14 "1.234,56",1234 komma 56 2 < 5,2 kleiner als 5 @@ -9,20 +9,16 @@ It costs €50,it costs 50 euros admin+tag@example.com,admin tag at example punkt com test@example.com,test at example punkt com x = 5,x gleich 5 -"The price is 99,99",the price is 99 komma 99 +"Der Preis ist 99,99",der preis ist 99 komma 99 francois.dupont@gladia.io,francois punkt dupont at gladia punkt io www.example.com,w w w punkt example punkt com -Version 1.0.0 released,version 1 punkt 0 punkt 0 released +Version 1.0.0 veröffentlicht,version 1 punkt 0 punkt 0 veroffentlicht api.endpoint.v2,api punkt endpoint punkt v2 -fail-safe,fail safe -U.S.A.,u s a -+1 (619) 981-0181,+16199810181 -10:12 pm,10:12 pm -Contact me at john@example.com please,contact me at john at example punkt com please -"Temperature is 98,6 degrees",temperature is 98 komma 6 degrees -The word [inaudible] is here,the word inaudible is here -$5 and $10,5 dollars and 10 dollars -my name is bob,my name is bob +Kontaktiere mich bei john@example.com bitte,kontaktiere mich bei john at example punkt com bitte +"Die Temperatur ist 98,6 Grad",die temperatur ist 98 komma 6 grad +Das Wort [inaudible] ist hier,das wort inaudible ist hier +$5 und $10,5 dollars und 10 dollars +mein Name ist Bob,mein name ist bob 5 bis 10,5 - 10 kilometer weg,km weg internet suche heute,internetsuche heute @@ -34,4 +30,4 @@ hm okay,okay halt mal so,mal so st. petersburg,st petersburg 6 tage krieg,sechstagekrieg -kreuzungs punkt,kreuzungspunkt \ No newline at end of file +kreuzungs punkt,kreuzungspunkt diff --git a/tests/e2e/files/gladia-3/default.csv b/tests/e2e/files/gladia-3/default.csv index 4258ba0..3c369cf 100644 --- a/tests/e2e/files/gladia-3/default.csv +++ b/tests/e2e/files/gladia-3/default.csv @@ -6,4 +6,6 @@ $100,$100 test@example.com,test@example.com +1234567890,+1234567890 one two three,one two three -5:30 pm,5:30 pm \ No newline at end of file +5:30 pm,5:30 pm +fail-safe,fail safe +U.S.A.,u s a diff --git a/tests/e2e/files/gladia-3/en.csv b/tests/e2e/files/gladia-3/en.csv index 25ca0d6..d37c965 100644 --- a/tests/e2e/files/gladia-3/en.csv +++ b/tests/e2e/files/gladia-3/en.csv @@ -48,18 +48,15 @@ CAPS@EXAMPLE.COM,c a p s at e x a m p l e dot c o m Contact john.doe@company.co.uk now,contact john dot doe at company dot co dot uk now Contact me at john@example.com please,contact me at john at example dot com please example.com,example dot com -fail-safe,fail safe file.txt,file dot txt first.last+tag@subdomain.example.com,1st dot last tag at subdomain dot example dot com francois.dupont@gladia.io,francois dot dupont at gladia dot io good bye,goodbye he ain't gonna,he is not going to hello (yeah) there,hello there -hello (yeah) there,hello there I have $20 dollars here,i have 20 dollars here is +16209113040,is +16209113040 is one 620 911 3040,is 16209113040 -is one 620 911 3040,is 16209113040 is plus 33 6 80 63 10 00,is +33680631000 is plus 330662732643,is +330662732643 is plus one 620 911 3040,is +16209113040 @@ -108,7 +105,6 @@ two hundred items,200 items two million people,2000000 people two thousand,2000 two thousand items,2000 items -U.S.A.,u s a user.name@domain.com,user dot name at domain dot com user@domain.com,user at domain dot com user@sub.domain.example.org,user at sub dot domain dot example dot org @@ -123,4 +119,4 @@ x = 5,x equals 5 ¥1000,1000 yens ø in Danish,o in danish €20 or €30,20 euros or 30 euros -my name is bob,my name is bob \ No newline at end of file +my name is bob,my name is bob diff --git a/tests/e2e/files/gladia-3/es.csv b/tests/e2e/files/gladia-3/es.csv new file mode 100644 index 0000000..2162d6d --- /dev/null +++ b/tests/e2e/files/gladia-3/es.csv @@ -0,0 +1,30 @@ +input,expected +$99,99 dólares +"100,50 €",100 punto 50 +9.8 m/s,9 punto 8 m/s +admin+tag@example.com,admin tag arroba example punto com +cien mil items,100000 items +"Cuesta 12,5 euros",cuesta 12 punto 5 euros +diez mil personas,10000 personas +dos mil items,2000 items +dos millones de personas,2 millones de personas +"El precio es 99,99 €",el precio es 99 punto 99 +Escribe a test@ejemplo.es por favor,escribe a test arroba ejemplo punto es por favor +first.last+tag@subdomain.example.com,first punto last tag arroba subdomain punto example punto com +hola (bien) aquí,hola bien aqui +jane_smith@test.co.uk,jane smith arroba test punto co punto uk +john.doe@company.org,john punto doe arroba company punto org +mil millones de dólares,mil millones de dolares +noventa y nueve items,99 items +Señal%Marca,senal%marca +Tengo 20€ euros aquí,tengo 20 euros euros aqui +un millón de dólares,un millon de dolares +user@domain.com,user arroba domain punto com +veintidós items,22 items +veintitrés personas,23 personas +Veintitrés personas aquí,23 personas aqui +Visita www.ejemplo.com ahora,visita w w w punto ejemplo punto com ahora +www.gladia.io,w w w punto gladia punto io +¢25,25 céntimos +£50,50 libras +¥1000,1000 yenes diff --git a/tests/e2e/files/gladia-3/fr.csv b/tests/e2e/files/gladia-3/fr.csv index da16c21..cd713cb 100644 --- a/tests/e2e/files/gladia-3/fr.csv +++ b/tests/e2e/files/gladia-3/fr.csv @@ -43,4 +43,4 @@ septante et un,71 x = 5,x egal a 5 test@example.com,test arobase example point com bonjour (euh) ami,bonjour ami -ça date d'hier,ca date d hier \ No newline at end of file +ça date d'hier,ca date d hier diff --git a/tests/e2e/files/gladia-3/it.csv b/tests/e2e/files/gladia-3/it.csv index 895daae..c383407 100644 --- a/tests/e2e/files/gladia-3/it.csv +++ b/tests/e2e/files/gladia-3/it.csv @@ -1,6 +1,6 @@ input,expected -#1 spot,1 spot -It costs €50,it costs 50 euro +#1 posto,1 posto +Costa €50,costa 50 euro "3,14",3 virgola 14 "1.234,56",1234 virgola 56 2 < 5,2 minore di 5 @@ -13,25 +13,21 @@ dr rossi,dottor rossi ehm tipo insomma ciao,ciao admin+tag@example.com,admin tag chiocciola example punto com test@example.com,test chiocciola example punto com -uno apple,1 apple +uno mela,1 mela x = 5,x uguale a 5 -"The price is 99,99",the price is 99 virgola 99 +"Il prezzo è 99,99",il prezzo e 99 virgola 99 francois.dupont@gladia.io,francois punto dupont chiocciola gladia punto io www.example.com,w w w punto example punto com -Version 1.0.0 released,version 1 punto 0 punto 0 released +Versione 1.0.0 rilasciata,versione 1 punto 0 punto 0 rilasciata api.endpoint.v2,api punto endpoint punto v2 -fail-safe,fail safe -U.S.A.,u s a prof bianchi,professore bianchi avv verdi,avvocato verdi versus inter,contro inter tel 123,telefono 123 ecc.,eccetera etc subito,eccetera subito -+1 (619) 981-0181,+16199810181 -10:12 pm,10:12 pm -Contact me at john@example.com please,contact me at john chiocciola example punto com please -"Temperature is 98,6 degrees",temperature is 98 virgola 6 degrees -The word [inaudible] is here,the word inaudible is here -$5 and $10,5 dollari and 10 dollari -my name is bob,my name is bob \ No newline at end of file +Contattami a john@example.com per favore,contattami a john chiocciola example punto com per favore +"La temperatura è di 98,6 gradi",la temperatura e di 98 virgola 6 gradi +La parola [inaudible] è qui,la parola inaudible e qui +$5 e $10,5 dollari e 10 dollari +mi chiamo bob,mi chiamo bob From f33ddb38f0138c11d835d9ddd4ab89337edb395c Mon Sep 17 00:00:00 2001 From: karamouche Date: Tue, 14 Apr 2026 09:09:19 -0400 Subject: [PATCH 09/10] fix(spanish): correct decimal word from "punto" to "coma" in Spanish language configuration --- normalization/languages/spanish/operators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/normalization/languages/spanish/operators.py b/normalization/languages/spanish/operators.py index 61c0dbd..e6c9a45 100644 --- a/normalization/languages/spanish/operators.py +++ b/normalization/languages/spanish/operators.py @@ -20,7 +20,7 @@ SPANISH_CONFIG = LanguageConfig( code="es", decimal_separator=",", - decimal_word="punto", + decimal_word="coma", thousand_separator=".", symbols_to_words={ "@": "arroba", From 8568b7f562f145dbaf071b71e9ef0d23c76f4934 Mon Sep 17 00:00:00 2001 From: karamouche Date: Tue, 14 Apr 2026 14:21:25 -0400 Subject: [PATCH 10/10] fix(german): remove incorrect replacement for "bis" in German language normalization --- normalization/languages/german/replacements.py | 1 - tests/e2e/files/gladia-3/de.csv | 1 - tests/e2e/files/gladia-3/es.csv | 8 ++++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/normalization/languages/german/replacements.py b/normalization/languages/german/replacements.py index 4b220b1..804c528 100644 --- a/normalization/languages/german/replacements.py +++ b/normalization/languages/german/replacements.py @@ -6,6 +6,5 @@ "xdrtb": "xdr-tb", "dualradio": "dual-radio", "st.": "sankt", - "bis": "-", "maubewegung": "mau-bewegung", } diff --git a/tests/e2e/files/gladia-3/de.csv b/tests/e2e/files/gladia-3/de.csv index 9704616..91e265f 100644 --- a/tests/e2e/files/gladia-3/de.csv +++ b/tests/e2e/files/gladia-3/de.csv @@ -19,7 +19,6 @@ Kontaktiere mich bei john@example.com bitte,kontaktiere mich bei john at example Das Wort [inaudible] ist hier,das wort inaudible ist hier $5 und $10,5 dollars und 10 dollars mein Name ist Bob,mein name ist bob -5 bis 10,5 - 10 kilometer weg,km weg internet suche heute,internetsuche heute wild card spiel,wildcard spiel diff --git a/tests/e2e/files/gladia-3/es.csv b/tests/e2e/files/gladia-3/es.csv index 2162d6d..bda514e 100644 --- a/tests/e2e/files/gladia-3/es.csv +++ b/tests/e2e/files/gladia-3/es.csv @@ -1,14 +1,14 @@ input,expected $99,99 dólares -"100,50 €",100 punto 50 -9.8 m/s,9 punto 8 m/s +"100,50 €",100 coma 50 +9.8 m/s,9 coma 8 m/s admin+tag@example.com,admin tag arroba example punto com cien mil items,100000 items -"Cuesta 12,5 euros",cuesta 12 punto 5 euros +"Cuesta 12,5 euros",cuesta 12 coma 5 euros diez mil personas,10000 personas dos mil items,2000 items dos millones de personas,2 millones de personas -"El precio es 99,99 €",el precio es 99 punto 99 +"El precio es 99,99 €",el precio es 99 coma 99 Escribe a test@ejemplo.es por favor,escribe a test arroba ejemplo punto es por favor first.last+tag@subdomain.example.com,first punto last tag arroba subdomain punto example punto com hola (bien) aquí,hola bien aqui