diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 92570a6..efcc3e9 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -20,7 +20,7 @@ - [ ] Decorated operators class with `@register_language` - [ ] Added one import line to `languages/__init__.py` - [ ] Added unit tests in `tests/unit/languages/` -- [ ] Added e2e test rows in `tests/e2e/files/` +- [ ] Added a per-language CSV in `tests/e2e/files/{preset}/` (e.g. `tests/e2e/files/gladia-3/fr.csv`) ### New step diff --git a/.gitignore b/.gitignore index e5bab35..4d7ac38 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ wheels/ *.egg-info .ruff_cache/ .pytest_cache/ +.DS_Store # Virtual environments diff --git a/AGENTS.md b/AGENTS.md index 0c769fe..74de346 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -144,7 +144,7 @@ Never modify a published preset YAML. Never let a preset reference a step that h - [ ] Decorate the class with `@register_language` - [ ] Add one import to `languages/__init__.py` - [ ] Add tests in `tests/unit/languages/` -- [ ] Add test rows to `tests/e2e/files/` for the new language +- [ ] Add a CSV file `tests/e2e/files/{preset}/{language_code}.csv` for each relevant preset (e.g. `tests/e2e/files/gladia-3/fr.csv`) ## Adding a new step — checklist diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index fcc3211..2448a49 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,7 @@ A new language requires: 2. Put all word-level substitutions in `replacements.py` 3. Instantiate a `LanguageConfig` and subclass `LanguageOperators` in `operators.py` 4. Decorate with `@register_language` and add one import to `normalization/languages/__init__.py` -5. Add tests under `tests/unit/languages/` and e2e fixture rows in `tests/e2e/files/` +5. Add tests under `tests/unit/languages/` and a per-language CSV in `tests/e2e/files/{preset}/` (e.g. 
`tests/e2e/files/gladia-3/fr.csv`) See [docs/contributing-guide.md](docs/contributing-guide.md) for the full checklist and design rules. diff --git a/docs/contributing-guide.md b/docs/contributing-guide.md index 5c038a9..488f5ec 100644 --- a/docs/contributing-guide.md +++ b/docs/contributing-guide.md @@ -26,7 +26,7 @@ This ordering is a hard constraint — some steps depend on earlier steps having - [ ] Decorate the class with `@register_language` - [ ] Add one import to `languages/__init__.py` - [ ] Add tests in `tests/unit/languages/` -- [ ] Add test rows to `tests/e2e/files/` for the new language +- [ ] Add a CSV file `tests/e2e/files/{preset}/{language_code}.csv` for each relevant preset (e.g. `tests/e2e/files/gladia-3/fr.csv`) ### Language data vs. language behavior @@ -159,42 +159,46 @@ def test_my_step_with_english(english_operators): ### E2E tests for a preset -E2E tests validate the full pipeline (preset + language) against a CSV fixture. The test runner lives in `tests/e2e/normalization_test.py` and CSV files go in `tests/e2e/files/`. +E2E tests validate the full pipeline (preset + language) against CSV fixtures. The test runner lives in `tests/e2e/normalization_test.py` and CSV files are organized under `tests/e2e/files/`. -**CSV format** — three columns, no quoting needed unless the value contains a comma: +**Directory structure** — one folder per preset, one CSV per language: ``` -input,expected,language -$1,000,000,1000000 dollars,en -hello world,hello world,fr +tests/e2e/files/ + gladia-3/ + default.csv + de.csv + en.csv + fr.csv + it.csv ``` -Each row is one test case. The `language` column must match a registered language code (or `default`). 
+**CSV format** — two columns (`input,expected`), no quoting needed unless the value contains a comma: -**Registering a new CSV** — add a block to `normalization_test.py` following the existing pattern: +``` +input,expected +"$1,000,000",1000000 dollars +hello world,hello world +``` + +The language is derived from the filename (e.g. `fr.csv` → language code `fr`). Use `default.csv` for the language-agnostic fallback. + +**Adding test cases for an existing preset** — drop rows into the appropriate `{language_code}.csv` file, or create a new CSV if the language isn't covered yet. Tests are discovered automatically. + +**Registering a new preset** — add a block to `normalization_test.py` following the existing pattern: ```python -_MY_PRESET_CSV = _FILES_DIR / "my-preset.csv" -_MY_PRESET_TESTS = _load_tests_from_csv(_MY_PRESET_CSV) if _MY_PRESET_CSV.exists() else [] +_MY_PRESET_DIR = _FILES_DIR / "my-preset" +_MY_PRESET_BY_LANGUAGE = _discover_preset_tests(_MY_PRESET_DIR) _MY_PRESET_PIPELINES: dict[str, NormalizationPipeline] = {} - -@pytest.mark.parametrize( - "test", - _MY_PRESET_TESTS, - ids=_case_ids(_MY_PRESET_TESTS), -) -def test_my_preset(test: NormalizationTest) -> None: - pipeline = _load_pipeline("my-preset", test.language) - result = pipeline.normalize(test.input) - assert result == test.expected, ( - f"\n input: {test.input!r}" - f"\n expected: {test.expected!r}" - f"\n got: {result!r}" +for _language in sorted(_MY_PRESET_BY_LANGUAGE): + globals()[f"test_my_preset_{_language}"] = _make_test( + "my-preset", _language, _MY_PRESET_BY_LANGUAGE[_language], _MY_PRESET_PIPELINES ) ``` -Pipelines are cached per language inside `_MY_PRESET_PIPELINES` to avoid reloading for each parametrized case — follow the `_load_pipeline` helper pattern already in the file. +Pipelines are cached per language to avoid reloading for each parametrized case. 
--- diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py index 5674874..18e07a6 100644 --- a/normalization/languages/__init__.py +++ b/normalization/languages/__init__.py @@ -1,7 +1,7 @@ -from . import english, french +from . import english, french, german, italian, spanish from .base import LanguageOperators from .registry import get_language_registry, register_language register_language(LanguageOperators) -__all__ = ["english", "french", "get_language_registry"] +__all__ = ["english", "french", "german", "italian", "spanish", "get_language_registry"] diff --git a/normalization/languages/german/__init__.py b/normalization/languages/german/__init__.py new file mode 100644 index 0000000..838e83b --- /dev/null +++ b/normalization/languages/german/__init__.py @@ -0,0 +1,7 @@ +from .operators import GermanOperators +from .replacements import GERMAN_REPLACEMENTS + +__all__ = [ + "GermanOperators", + "GERMAN_REPLACEMENTS", +] diff --git a/normalization/languages/german/operators.py b/normalization/languages/german/operators.py new file mode 100644 index 0000000..ca3397a --- /dev/null +++ b/normalization/languages/german/operators.py @@ -0,0 +1,43 @@ +from normalization.languages.base import LanguageConfig, LanguageOperators +from normalization.languages.german.replacements import GERMAN_REPLACEMENTS +from normalization.languages.german.sentence_replacements import ( + GERMAN_SENTENCE_REPLACEMENTS, +) +from normalization.languages.registry import register_language + +GERMAN_CONFIG = LanguageConfig( + code="de", + decimal_separator=",", + decimal_word="komma", + thousand_separator=".", + symbols_to_words={ + "@": "at", + ".": "punkt", + "+": "plus", + "=": "gleich", + ">": "größer als", + "<": "kleiner als", + "°": "grad", + "°C": "grad celsius", + "°F": "grad fahrenheit", + "%": "prozent", + }, + currency_symbol_to_word={ + "€": "euros", + "$": "dollars", + "£": "pounds", + "¢": "cents", + "¥": "yens", + }, + filler_words=["äh", "ähm", 
"hm", "also", "naja", "halt"], + sentence_replacements=GERMAN_SENTENCE_REPLACEMENTS, +) + + +@register_language +class GermanOperators(LanguageOperators): + def __init__(self): + super().__init__(GERMAN_CONFIG) + + def get_word_replacements(self) -> dict[str, str]: + return GERMAN_REPLACEMENTS diff --git a/normalization/languages/german/replacements.py b/normalization/languages/german/replacements.py new file mode 100644 index 0000000..804c528 --- /dev/null +++ b/normalization/languages/german/replacements.py @@ -0,0 +1,10 @@ +GERMAN_REPLACEMENTS: dict[str, str] = { + "u.": "unter", + "chr.": "christus", + "rissströmungen": "riss-strömungen", + "kilometer": "km", + "xdrtb": "xdr-tb", + "dualradio": "dual-radio", + "st.": "sankt", + "maubewegung": "mau-bewegung", +} diff --git a/normalization/languages/german/sentence_replacements.py b/normalization/languages/german/sentence_replacements.py new file mode 100644 index 0000000..31086fa --- /dev/null +++ b/normalization/languages/german/sentence_replacements.py @@ -0,0 +1,16 @@ +GERMAN_SENTENCE_REPLACEMENTS: dict[str, str] = { + "regimeet kritischen": "regimekritischen", + "cannabis joints": "cannabisjoints", + "kampf handlungen": "kampfhandlungen", + "erwachsenen pornografie": "erwachsenenpornographie", + "standbild format": "standbildformat", + "internet radio seite": "internetradioseite", + "alt gedienten": "altgedienten", + "6 tage krieg": "sechstagekrieg", + "kreuzungs punkt": "kreuzungspunkt", + "wild card": "wildcard", + "national parks": "nationalparks", + "internet suche": "internetsuche", + "gleichgewicht geschlechtliche": "gleichgeschlechtlichen", + "welt kulturerbegebiete": "weltkulturerbegebiete", +} diff --git a/normalization/languages/italian/__init__.py b/normalization/languages/italian/__init__.py new file mode 100644 index 0000000..9278e42 --- /dev/null +++ b/normalization/languages/italian/__init__.py @@ -0,0 +1,7 @@ +from .operators import ItalianOperators +from .replacements import 
ITALIAN_REPLACEMENTS + +__all__ = [ + "ItalianOperators", + "ITALIAN_REPLACEMENTS", +] diff --git a/normalization/languages/italian/operators.py b/normalization/languages/italian/operators.py new file mode 100644 index 0000000..ed48304 --- /dev/null +++ b/normalization/languages/italian/operators.py @@ -0,0 +1,116 @@ +import re + +from normalization.languages.base import LanguageConfig, LanguageOperators +from normalization.languages.italian.replacements import ITALIAN_REPLACEMENTS +from normalization.languages.registry import register_language + +# Single digits 1–9: shared by digit_words and any future time/compound helpers. +_ONE_TO_NINE: dict[str, str] = { + "uno": "1", + "due": "2", + "tre": "3", + "quattro": "4", + "cinque": "5", + "sei": "6", + "sette": "7", + "otto": "8", + "nove": "9", +} + +ITALIAN_SENTENCE_REPLACEMENTS: dict[str, str] = { + # Spoken percentages (“dieci per cento”) → one canonical form aligned with “%” → percento + "per cento": "percento", +} + +ITALIAN_CONFIG = LanguageConfig( + code="it", + decimal_separator=",", + decimal_word="virgola", + thousand_separator=".", + symbols_to_words={ + "@": "chiocciola", + ".": "punto", + "+": "più", + "=": "uguale a", + ">": "maggiore di", + "<": "minore di", + "°": "grado", + "°C": "gradi celsius", + "°F": "gradi fahrenheit", + "%": "percento", + }, + currency_symbol_to_word={ + "€": "euro", + "$": "dollari", + "£": "sterline", + "¢": "centesimi", + "¥": "yen", + }, + filler_words=[ + "eh", + "ehm", + "mm", + "mh", + "cioè", + "cioe", + "tipo", + "insomma", + "allora", + "beh", + "bah", + "dunque", + "magari", + "praticamente", + ], + sentence_replacements=ITALIAN_SENTENCE_REPLACEMENTS, + digit_words={"zero": "0", **_ONE_TO_NINE}, + number_words=[ + "zero", + *_ONE_TO_NINE, + "dieci", + "undici", + "dodici", + "tredici", + "quattordici", + "quindici", + "sedici", + "diciassette", + "diciotto", + "diciannove", + "venti", + "trenta", + "quaranta", + "cinquanta", + "sessanta", + "settanta", + "ottanta", 
+ "novanta", + "cento", + "mille", + "mila", + "milione", + "milioni", + "miliardo", + "miliardi", + ], + plus_word="più", +) + + +@register_language +class ItalianOperators(LanguageOperators): + def __init__(self): + super().__init__(ITALIAN_CONFIG) + + def fix_one_word_in_numeric_contexts(self, text: str) -> str: + text = re.sub(r"(\d+)\s+uno\s+uno\b", r"\1 1 1", text) + text = re.sub(r"\buno\s+uno\s+(\d)", r"1 1 \1", text) + text = re.sub(r"(\d+)\s+uno\s+(\d)", r"\1 1 \2", text) + text = re.sub(r"(\d+)\s+uno\b", r"\1 1", text) + text = re.sub(r"\b(\d+)uno\b", r"\1 1", text) + text = re.sub(r"\buno\s+(\d)", r"1 \1", text) + text = re.sub(r"^uno\s+(?=[a-z])", "1 ", text) + return text + + def get_word_replacements(self) -> dict[str, str]: + return ITALIAN_REPLACEMENTS diff --git a/normalization/languages/italian/replacements.py b/normalization/languages/italian/replacements.py new file mode 100644 index 0000000..12c5e72 --- /dev/null +++ b/normalization/languages/italian/replacements.py @@ -0,0 +1,11 @@ +ITALIAN_REPLACEMENTS: dict[str, str] = { + "avv": "avvocato", + "dott": "dottor", + "dr": "dottor", + "ecc": "eccetera", + "etc": "eccetera", + "prof": "professore", + "tel": "telefono", + "versus": "contro", + "vs": "contro", +} diff --git a/normalization/languages/spanish/__init__.py b/normalization/languages/spanish/__init__.py new file mode 100644 index 0000000..1df78ad --- /dev/null +++ b/normalization/languages/spanish/__init__.py @@ -0,0 +1,7 @@ +from .operators import SpanishOperators +from .replacements import SPANISH_REPLACEMENTS + +__all__ = [ + "SpanishOperators", + "SPANISH_REPLACEMENTS", +] diff --git a/normalization/languages/spanish/number_normalizer.py b/normalization/languages/spanish/number_normalizer.py new file mode 100644 index 0000000..632a3fc --- /dev/null +++ b/normalization/languages/spanish/number_normalizer.py @@ -0,0 +1,196 @@ +"""Convert common Spanish spelled-out numbers to digits (STT-oriented). 
def _fold(s: str) -> str:
    """Return *s* lower-cased with combining accent marks removed.

    The text is NFD-decomposed so that accents become separate combining
    characters (Unicode category ``Mn``), which are then dropped.
    """
    decomposed = unicodedata.normalize("NFD", s.lower())
    return "".join(c for c in decomposed if unicodedata.category(c) != "Mn")


def _get(table: dict[str, int], word: str) -> int | None:
    """Return the value stored for *word* in *table*, ignoring case and accents.

    Keys of *table* must already be folded (lowercase, no accents). Every
    table in this module satisfies that, so one hash lookup on the folded
    word is equivalent to scanning and re-folding each key.
    """
    return table.get(_fold(word))


_ONES_1_9: dict[str, int] = {
    "uno": 1,
    "dos": 2,
    "tres": 3,
    "cuatro": 4,
    "cinco": 5,
    "seis": 6,
    "siete": 7,
    "ocho": 8,
    "nueve": 9,
}

_TEENS: dict[str, int] = {
    "diez": 10,
    "once": 11,
    "doce": 12,
    "trece": 13,
    "catorce": 14,
    "quince": 15,
    "dieciseis": 16,
    "diecisiete": 17,
    "dieciocho": 18,
    "diecinueve": 19,
}

_VEINTI: dict[str, int] = {
    "veintiuno": 21,
    "veintidos": 22,
    "veintitres": 23,
    "veinticuatro": 24,
    "veinticinco": 25,
    "veintiseis": 26,
    "veintisiete": 27,
    "veintiocho": 28,
    "veintinueve": 29,
}

_TENS: dict[str, int] = {
    "treinta": 30,
    "cuarenta": 40,
    "cincuenta": 50,
    "sesenta": 60,
    "setenta": 70,
    "ochenta": 80,
    "noventa": 90,
}

_HUNDREDS: dict[str, int] = {
    "cien": 100,
    "ciento": 100,
    "doscientos": 200,
    "trescientos": 300,
    "cuatrocientos": 400,
    "quinientos": 500,
    "seiscientos": 600,
    "setecientos": 700,
    "ochocientos": 800,
    "novecientos": 900,
}


class SpanishNumberNormalizer:
    """Replace spelled-out Spanish numbers in a text with digits.

    The text is split on whitespace and scanned left to right; each run of
    words that forms a number is replaced by its decimal value and every
    other word is kept as is. A bare ``mil`` with no leading multiplier is
    deliberately left untouched.
    """

    def __call__(self, text: str) -> str:
        """Return *text* with number words converted to digits.

        Blank input is returned unchanged. Otherwise the output words are
        re-joined with single spaces.
        """
        if not text.strip():
            return text
        words = text.split()
        n = len(words)
        out: list[str] = []
        i = 0
        while i < n:
            parsed = self._parse_number(words, i, n)
            if parsed is None:
                out.append(words[i])
                i += 1
            else:
                i, value = parsed
                out.append(str(value))
        return " ".join(out)

    def _parse_number(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
        """If words[i:] start with a spelled number, return (exclusive_end_index, value).

        Accepts ``cero``, any value in 1-999, and ``<1-999> mil [<number>]``.
        Returns ``None`` when ``words[i]`` does not start a number.
        """
        if i >= n:
            return None
        if _fold(words[i]) == "cero":
            return i + 1, 0

        head = self._parse_0_999(words, i, n)
        if head is None:
            return None
        j, value = head

        # The "mil" check runs after the *whole* 1-999 group has been read,
        # so "ciento veinte mil" yields 120000 rather than "120 mil".
        if j < n and _fold(words[j]) == "mil":
            j += 1
            value *= 1000
            tail = self._parse_number(words, j, n)
            if tail is not None:
                end, rest = tail
                return end, value + rest
        return j, value

    def _parse_0_999(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
        """Parse an optional hundreds word followed by an optional 1-99 group."""
        if i >= n:
            return None
        hundreds = _get(_HUNDREDS, words[i])
        if hundreds is None:
            return self._parse_0_99(words, i, n)
        rest = self._parse_0_99(words, i + 1, n)
        if rest is None:
            return i + 1, hundreds
        end, value = rest
        return end, hundreds + value

    def _parse_0_99(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
        """Parse a value in 1-99 starting at ``words[i]``.

        Handles single-word forms (units, teens, ``veinti*``), ``veinte``
        optionally followed by a unit (informal "veinte tres"), and a tens
        word optionally followed by ``y`` and/or a unit.
        """
        if i >= n:
            return None
        word = words[i]

        if _fold(word) == "veinte":
            if i + 1 < n:
                unit = _get(_ONES_1_9, words[i + 1])
                if unit is not None:
                    return i + 2, 20 + unit
            return i + 1, 20

        for table in (_VEINTI, _TEENS, _ONES_1_9):
            value = _get(table, word)
            if value is not None:
                return i + 1, value

        tens = _get(_TENS, word)
        if tens is None:
            return None

        j = i + 1
        if j < n and _fold(words[j]) == "y":
            j += 1  # skip the conjunction in "treinta y cinco"
        if j < n:
            unit = _get(_ONES_1_9, words[j])
            if unit is not None:
                return j + 1, tens + unit
        # No unit followed: a skipped "y" is not consumed.
        return i + 1, tens
file mode 100644 index 0000000..e6c9a45 --- /dev/null +++ b/normalization/languages/spanish/operators.py @@ -0,0 +1,139 @@ +import re + +from normalization.languages.base import LanguageConfig, LanguageOperators +from normalization.languages.registry import register_language +from normalization.languages.spanish.number_normalizer import SpanishNumberNormalizer +from normalization.languages.spanish.replacements import SPANISH_REPLACEMENTS + +_ONE_TO_NINE: dict[str, str] = { + "uno": "1", + "dos": "2", + "tres": "3", + "cuatro": "4", + "cinco": "5", + "seis": "6", + "siete": "7", + "ocho": "8", + "nueve": "9", +} + +SPANISH_CONFIG = LanguageConfig( + code="es", + decimal_separator=",", + decimal_word="coma", + thousand_separator=".", + symbols_to_words={ + "@": "arroba", + ".": "punto", + "+": "más", + "=": "igual a", + ">": "mayor que", + "<": "menor que", + "°": "grado", + "°C": "grados celsius", + "°F": "grados fahrenheit", + "%": "por ciento", + }, + currency_symbol_to_word={ + "€": "euros", + "$": "dólares", + "£": "libras", + "¢": "céntimos", + "¥": "yenes", + }, + filler_words=[ + "eh", + "ehm", + "mm", + "mh", + "bueno", + "pues", + "o sea", + "tipo", + "vale", + "vaya", + "mira", + "hombre", + "mujer", + "digo", + "entonces", + "claro", + "vamos", + "este", + "esta", + ], + sentence_replacements=None, + digit_words={"cero": "0", **_ONE_TO_NINE}, + number_words=[ + "cero", + *_ONE_TO_NINE, + "diez", + "once", + "doce", + "trece", + "catorce", + "quince", + "dieciséis", + "dieciseis", + "diecisiete", + "dieciocho", + "diecinueve", + "veinte", + "veintiuno", + "veintidos", + "veintitres", + "veinticuatro", + "veinticinco", + "veintiseis", + "veintisiete", + "veintiocho", + "veintinueve", + "treinta", + "cuarenta", + "cincuenta", + "sesenta", + "setenta", + "ochenta", + "noventa", + "cien", + "ciento", + "doscientos", + "trescientos", + "cuatrocientos", + "quinientos", + "seiscientos", + "setecientos", + "ochocientos", + "novecientos", + "mil", + "millón", + 
"millones", + "mil millones", + "billón", + "billones", + ], + plus_word="más", +) + + +@register_language +class SpanishOperators(LanguageOperators): + def __init__(self): + super().__init__(SPANISH_CONFIG) + self._number_normalizer = SpanishNumberNormalizer() + + def fix_one_word_in_numeric_contexts(self, text: str) -> str: + text = re.sub(r"(\d+)\s+uno\s+uno\b", r"\1 1 1", text) + text = re.sub(r"\buno\s+uno\s+(\d)", r"1 1 \1", text) + text = re.sub(r"(\d+)\s+uno\s+(\d)", r"\1 1 \2", text) + text = re.sub(r"(\d+)\s+uno\b", r"\1 1", text) + text = re.sub(r"\b(\d+)uno\b", r"\1 1", text) + text = re.sub(r"\buno\s+(\d)", r"1 \1", text) + text = re.sub(r"^uno\s+(?=[a-z])", "1 ", text) + return text + + def get_word_replacements(self) -> dict[str, str]: + return SPANISH_REPLACEMENTS + + def expand_written_numbers(self, text: str) -> str: + return self._number_normalizer(text) diff --git a/normalization/languages/spanish/replacements.py b/normalization/languages/spanish/replacements.py new file mode 100644 index 0000000..a4eda19 --- /dev/null +++ b/normalization/languages/spanish/replacements.py @@ -0,0 +1,30 @@ +SPANISH_REPLACEMENTS: dict[str, str] = { + "aprox": "aproximadamente", + "av": "avenida", + "cta": "cuenta", + "d": "don", + "da": "doña", + "dept": "departamento", + "depto": "departamento", + "doc": "documento", + "dr": "doctor", + "dra": "doctora", + "etc": "etcétera", + "ej": "ejemplo", + "ext": "extensión", + "hab": "habitación", + "ing": "ingeniero", + "núm": "número", + "pag": "página", + "prof": "profesor", + "profa": "profesora", + "pza": "plaza", + "tel": "teléfono", + "tfno": "teléfono", + "ud": "usted", + "uds": "ustedes", + "vd": "usted", + "vds": "ustedes", + "versus": "versus", + "vs": "versus", +} diff --git a/tests/e2e/files/gladia-3.csv b/tests/e2e/files/gladia-3.csv deleted file mode 100644 index b0b689b..0000000 --- a/tests/e2e/files/gladia-3.csv +++ /dev/null @@ -1,179 +0,0 @@ -input,expected,language -#1 spot,1 spot,en 
-"$1,000,000",1000000 dollars,en -$5 and $10,5 dollars and 10 dollars,en -$50.75 total,50 point 75 dollars total,en -+1 (619) 981-0181,+16199810181,en -05 45 pm,5:45 pm,en -05:45pm,05:45 pm,en -"1,234.56",1234 point 56,en -1.1.1.1,1 dot 1 dot 1 dot 1,en -10 a m,10 am,en -1012 am,10:12 am,en -10:00 pm,10 pm,en -10:12 pm,10:12 pm,en -10:54 a m,10:54 am,en -11 a m,11 am,en -1145 pm,11:45 pm,en -12 p m,12 pm,en -12:34 p m,12:34 pm,en -Let's meet at noon o'clock,let us meet at 12:00,en -Appointment is at 3 o'clock,appointment is at 3:00,en -192.168.1.1,192 dot 168 dot 1 dot 1,en -2 < 5,2 less than 5,en -3.14,3 point 14,en -5 45 p m,5:45 pm,en -5 > 3,5 greater than 3,en -5 a m,5 am,en -5.45 p.m.,5:45 pm,en -50°C,50 degree celsius,en -545 pm,5:45 pm,en -6 p m,6 pm,en -602 am,6:02 am,en -6:00 am,6 am,en -6:24 am,6:24 am,en -9.8 m/s,9 point 8 m/s,en -About ¢25,about 25 cents,en -About ¢25 cents only,about 25 cents only,en -About ¥1000 yen total,about 1000 yens total,en -admin+tag@example.com,admin tag at example dot com,en -api.endpoint.v2,api dot endpoint dot v2,en -at eleven twenty-five a.m,at 11:25 am,en -at French numbers plus three three oh six six two seven three two six four three,at french numbers +330662732643,en -at ten o'clock,at 10:00,en -at two p.m,at 2 pm,en -at two thirty p.m,at 2:30 pm,en -bob b-o-b dupov d-u-p-o-v,bob b o b dupov d u p o v,en -CAPS@EXAMPLE.COM,c a p s at e x a m p l e dot c o m,en -Contact john.doe@company.co.uk now,contact john dot doe at company dot co dot uk now,en -Contact me at john@example.com please,contact me at john at example dot com please,en -example.com,example dot com,en -fail-safe,fail safe,en -file.txt,file dot txt,en -first.last+tag@subdomain.example.com,1st dot last tag at subdomain dot example dot com,en -francois.dupont@gladia.io,francois dot dupont at gladia dot io,en -good bye,goodbye,en -he ain't gonna,he is not going to,en -hello (yeah) there,hello there,en -hello (yeah) there,hello there,en -I have $20 dollars 
here,i have 20 dollars here,en -is +16209113040,is +16209113040,en -is one 620 911 3040,is 16209113040,en -is one 620 911 3040,is 16209113040,en -is plus 33 6 80 63 10 00,is +33680631000,en -is plus 330662732643,is +330662732643,en -is plus one 620 911 3040,is +16209113040,en -It costs £30 pounds,it costs 30 pounds,en -It costs €50,it costs 50 euros,en -It's 12.5 dollars,it is 12 point 5 dollars,en -j o h a n n,j o h a n n,en -jane_smith@test.co.uk,jane smith at test dot co dot uk,en -john j-o-h-n doe d-o-e,john j o h n doe d o e,en -my name is B.O.B,my name is b o b,en -john.doe@company.org,john dot doe at company dot org,en -lemme see,let me see,en -my phone number is o 4 5 o 6 4 3 2 1 1 and,my phone number is 0450643211 and,en -ninety nine items,99 items,en -o 4 5 o 6 4 3 2 1 1,0450643211,en -o 4 5 o 6 4 3 2 1 6,0450643216,en -o 4 5 o 6 4 3 2 1 o,0450643210,en -one apple,1 apple,en -one billion dollars,1000000000 dollars,en -one hundred,100,en -one hundred people,100 people,en -one hundred thousand items,100000 items,en -one million dollars,1000000 dollars,en -One million dollars total,1000000 dollars total,en -one thousand dollars,1000 dollars,en -one thousand three hundred and thirty seven,1337,en -"one, two, three",123,en -Plus 1 16 plus equals one.,+116 plus equals one,en -plus 1-619-981-0181,+16199810181,en -she ain't gonna,she is not going to,en -Temperature is 98.6 degrees,temperature is 98 point 6 degrees,en -ten thousand people,10000 people,en -test@example.com,test at example dot com,en -test@example.com.,test at example dot com,en -test@gladia.io.,test at gladia dot io,en -That's £100,that is 100 pounds,en -The code is ABC123 here,the code is a b c 123 here,en -The price is $99,the price is 99 dollars,en -The price is 99.99,the price is 99 point 99,en -The price is €50 euros,the price is 50 euros,en -The word [inaudible] is here,the word inaudible is here,en -twenty one dogs,21 dogs,en -twenty three people,23 people,en -twenty two items,22 items,en 
-two hundred items,200 items,en -two million people,2000000 people,en -two thousand,2000,en -two thousand items,2000 items,en -U.S.A.,u s a,en -user.name@domain.com,user dot name at domain dot com,en -user@domain.com,user at domain dot com,en -user@sub.domain.example.org,user at sub dot domain dot example dot org,en -Version 1.0.0 released,version 1 dot 0 dot 0 released,en -version 2.5.1,version 2 dot 5 dot 1,en -Visit www.example.com today,visit w w w dot example dot com today,en -Look at my v12 motor,look at my v12 motor,en -www.example.com,w w w dot example dot com,en -x = 5,x equals 5,en -"zip code 92103, U.S.",zip code 92103 u s,en -£5.50,5 point 50 pounds,en -¥1000,1000 yens,en -ø in Danish,o in danish,en -€20 or €30,20 euros or 30 euros,en -my name is bob,my name is bob,en -j'ai dit c'est bien,j ai dit c est bien,fr -vingt trois pommes,23 pommes,fr -3 milliards d euros,3000000000 d euros,fr -euh alors hein bah oui,alors oui,fr -"12,5 degrés",12 virgule 5 degres,fr -pour 100 de réduction,pourcent de reduction,fr -pour cent de réduction,pourcent de reduction,fr -"Hello, world!",hello world,default -ça va?!,ca va,default -$100,$100,default -80 €,80 €,default -test@example.com,test@example.com,default -+1234567890,+1234567890,default -one two three,one two three,default -5:30 pm,5:30 pm,default -d'accord,d accord,fr -qu'il vient,qu il vient,fr -n'est pas,n est pas,fr -l'ordinateur,l ordinateur,fr -m'appelle,m appelle,fr -s'il vous plait,s il vous plait,fr -t'as vu,t as vu,fr -cent euros,100 euros,fr -mille deux cents,1200,fr -cinquante trois,53,fr -contact@exemple.fr,contact arobase exemple point fr,fr -"2 < 5",2 plus petit que 5,fr -50°C,50 degres celsius,fr -ca coute €50,ca coute 50 euros,fr -euh bonjour hein,bonjour,fr -mme dupont,madame dupont,fr -mlle dubois,mademoiselle dubois,fr -dr martin,docteur martin,fr -prof dupont,professeur dupont,fr -st jean,saint jean,fr -ping pong,pingpong,fr -volley ball,volleyball,fr -basket ball,basketball,fr -hand 
ball,handball,fr -water polo,waterpolo,fr -t shirt,tshirt,fr -cd rom,cdrom,fr -super predateur,superpredateur,fr -"3,14 pi",3 virgule 14 pi,fr -soixante-dix,70,fr -quatre-vingts,80,fr -quatre-vingt-un,81,fr -nonante-neuf,99,fr -septante et un,71,fr -x = 5,x egal a 5,fr -test@example.com,test arobase example point com,fr -bonjour (euh) ami,bonjour ami,fr -ça date d'hier,ca date d hier,fr \ No newline at end of file diff --git a/tests/e2e/files/gladia-3/de.csv b/tests/e2e/files/gladia-3/de.csv new file mode 100644 index 0000000..91e265f --- /dev/null +++ b/tests/e2e/files/gladia-3/de.csv @@ -0,0 +1,32 @@ +input,expected +#1 Platz,1 platz +Es kostet €50,es kostet 50 euros +"3,14",3 komma 14 +"1.234,56",1234 komma 56 +2 < 5,2 kleiner als 5 +5 > 3,5 grosser als 3 +50°C,50 grad celsius +admin+tag@example.com,admin tag at example punkt com +test@example.com,test at example punkt com +x = 5,x gleich 5 +"Der Preis ist 99,99",der preis ist 99 komma 99 +francois.dupont@gladia.io,francois punkt dupont at gladia punkt io +www.example.com,w w w punkt example punkt com +Version 1.0.0 veröffentlicht,version 1 punkt 0 punkt 0 veroffentlicht +api.endpoint.v2,api punkt endpoint punkt v2 +Kontaktiere mich bei john@example.com bitte,kontaktiere mich bei john at example punkt com bitte +"Die Temperatur ist 98,6 Grad",die temperatur ist 98 komma 6 grad +Das Wort [inaudible] ist hier,das wort inaudible ist hier +$5 und $10,5 dollars und 10 dollars +mein Name ist Bob,mein name ist bob +kilometer weg,km weg +internet suche heute,internetsuche heute +wild card spiel,wildcard spiel +national parks tour,nationalparks tour +also naja hallo,hallo +äh ähm hallo,ah ahm hallo +hm okay,okay +halt mal so,mal so +st. 
petersburg,st petersburg +6 tage krieg,sechstagekrieg +kreuzungs punkt,kreuzungspunkt diff --git a/tests/e2e/files/gladia-3/default.csv b/tests/e2e/files/gladia-3/default.csv new file mode 100644 index 0000000..3c369cf --- /dev/null +++ b/tests/e2e/files/gladia-3/default.csv @@ -0,0 +1,11 @@ +input,expected +"Hello, world!",hello world +ça va?!,ca va +$100,$100 +80 €,80 € +test@example.com,test@example.com ++1234567890,+1234567890 +one two three,one two three +5:30 pm,5:30 pm +fail-safe,fail safe +U.S.A.,u s a diff --git a/tests/e2e/files/gladia-3/en.csv b/tests/e2e/files/gladia-3/en.csv new file mode 100644 index 0000000..d37c965 --- /dev/null +++ b/tests/e2e/files/gladia-3/en.csv @@ -0,0 +1,122 @@ +input,expected +#1 spot,1 spot +"$1,000,000",1000000 dollars +$5 and $10,5 dollars and 10 dollars +$50.75 total,50 point 75 dollars total ++1 (619) 981-0181,+16199810181 +05 45 pm,5:45 pm +05:45pm,05:45 pm +"1,234.56",1234 point 56 +1.1.1.1,1 dot 1 dot 1 dot 1 +10 a m,10 am +1012 am,10:12 am +10:00 pm,10 pm +10:12 pm,10:12 pm +10:54 a m,10:54 am +11 a m,11 am +1145 pm,11:45 pm +12 p m,12 pm +12:34 p m,12:34 pm +Let's meet at noon o'clock,let us meet at 12:00 +Appointment is at 3 o'clock,appointment is at 3:00 +192.168.1.1,192 dot 168 dot 1 dot 1 +2 < 5,2 less than 5 +3.14,3 point 14 +5 45 p m,5:45 pm +5 > 3,5 greater than 3 +5 a m,5 am +5.45 p.m.,5:45 pm +50°C,50 degree celsius +545 pm,5:45 pm +6 p m,6 pm +602 am,6:02 am +6:00 am,6 am +6:24 am,6:24 am +9.8 m/s,9 point 8 m/s +About ¢25,about 25 cents +About ¢25 cents only,about 25 cents only +About ¥1000 yen total,about 1000 yens total +admin+tag@example.com,admin tag at example dot com +api.endpoint.v2,api dot endpoint dot v2 +at eleven twenty-five a.m,at 11:25 am +at French numbers plus three three oh six six two seven three two six four three,at french numbers +330662732643 +at ten o'clock,at 10:00 +at two p.m,at 2 pm +at two thirty p.m,at 2:30 pm +bob b-o-b dupov d-u-p-o-v,bob b o b dupov d u p o v 
+CAPS@EXAMPLE.COM,c a p s at e x a m p l e dot c o m +Contact john.doe@company.co.uk now,contact john dot doe at company dot co dot uk now +Contact me at john@example.com please,contact me at john at example dot com please +example.com,example dot com +file.txt,file dot txt +first.last+tag@subdomain.example.com,1st dot last tag at subdomain dot example dot com +francois.dupont@gladia.io,francois dot dupont at gladia dot io +good bye,goodbye +he ain't gonna,he is not going to +hello (yeah) there,hello there +I have $20 dollars here,i have 20 dollars here +is +16209113040,is +16209113040 +is one 620 911 3040,is 16209113040 +is plus 33 6 80 63 10 00,is +33680631000 +is plus 330662732643,is +330662732643 +is plus one 620 911 3040,is +16209113040 +It costs £30 pounds,it costs 30 pounds +It costs €50,it costs 50 euros +It's 12.5 dollars,it is 12 point 5 dollars +j o h a n n,j o h a n n +jane_smith@test.co.uk,jane smith at test dot co dot uk +john j-o-h-n doe d-o-e,john j o h n doe d o e +my name is B.O.B,my name is b o b +john.doe@company.org,john dot doe at company dot org +lemme see,let me see +my phone number is o 4 5 o 6 4 3 2 1 1 and,my phone number is 0450643211 and +ninety nine items,99 items +o 4 5 o 6 4 3 2 1 1,0450643211 +o 4 5 o 6 4 3 2 1 6,0450643216 +o 4 5 o 6 4 3 2 1 o,0450643210 +one apple,1 apple +one billion dollars,1000000000 dollars +one hundred,100 +one hundred people,100 people +one hundred thousand items,100000 items +one million dollars,1000000 dollars +One million dollars total,1000000 dollars total +one thousand dollars,1000 dollars +one thousand three hundred and thirty seven,1337 +"one, two, three",123 +Plus 1 16 plus equals one.,+116 plus equals one +plus 1-619-981-0181,+16199810181 +she ain't gonna,she is not going to +Temperature is 98.6 degrees,temperature is 98 point 6 degrees +ten thousand people,10000 people +test@example.com,test at example dot com +test@example.com.,test at example dot com +test@gladia.io.,test at gladia dot io +That's 
£100,that is 100 pounds +The code is ABC123 here,the code is a b c 123 here +The price is $99,the price is 99 dollars +The price is 99.99,the price is 99 point 99 +The price is €50 euros,the price is 50 euros +The word [inaudible] is here,the word inaudible is here +twenty one dogs,21 dogs +twenty three people,23 people +twenty two items,22 items +two hundred items,200 items +two million people,2000000 people +two thousand,2000 +two thousand items,2000 items +user.name@domain.com,user dot name at domain dot com +user@domain.com,user at domain dot com +user@sub.domain.example.org,user at sub dot domain dot example dot org +Version 1.0.0 released,version 1 dot 0 dot 0 released +version 2.5.1,version 2 dot 5 dot 1 +Visit www.example.com today,visit w w w dot example dot com today +Look at my v12 motor,look at my v12 motor +www.example.com,w w w dot example dot com +x = 5,x equals 5 +"zip code 92103, U.S.",zip code 92103 u s +£5.50,5 point 50 pounds +¥1000,1000 yens +ø in Danish,o in danish +€20 or €30,20 euros or 30 euros +my name is bob,my name is bob diff --git a/tests/e2e/files/gladia-3/es.csv b/tests/e2e/files/gladia-3/es.csv new file mode 100644 index 0000000..bda514e --- /dev/null +++ b/tests/e2e/files/gladia-3/es.csv @@ -0,0 +1,30 @@ +input,expected +$99,99 dólares +"100,50 €",100 coma 50 +9.8 m/s,9 coma 8 m/s +admin+tag@example.com,admin tag arroba example punto com +cien mil items,100000 items +"Cuesta 12,5 euros",cuesta 12 coma 5 euros +diez mil personas,10000 personas +dos mil items,2000 items +dos millones de personas,2 millones de personas +"El precio es 99,99 €",el precio es 99 coma 99 +Escribe a test@ejemplo.es por favor,escribe a test arroba ejemplo punto es por favor +first.last+tag@subdomain.example.com,first punto last tag arroba subdomain punto example punto com +hola (bien) aquí,hola bien aqui +jane_smith@test.co.uk,jane smith arroba test punto co punto uk +john.doe@company.org,john punto doe arroba company punto org +mil millones de dólares,mil 
millones de dolares +noventa y nueve items,99 items +Señal%Marca,senal%marca +Tengo 20€ euros aquí,tengo 20 euros euros aqui +un millón de dólares,un millon de dolares +user@domain.com,user arroba domain punto com +veintidós items,22 items +veintitrés personas,23 personas +Veintitrés personas aquí,23 personas aqui +Visita www.ejemplo.com ahora,visita w w w punto ejemplo punto com ahora +www.gladia.io,w w w punto gladia punto io +¢25,25 céntimos +£50,50 libras +¥1000,1000 yenes diff --git a/tests/e2e/files/gladia-3/fr.csv b/tests/e2e/files/gladia-3/fr.csv new file mode 100644 index 0000000..cd713cb --- /dev/null +++ b/tests/e2e/files/gladia-3/fr.csv @@ -0,0 +1,46 @@ +input,expected +j'ai dit c'est bien,j ai dit c est bien +vingt trois pommes,23 pommes +3 milliards d euros,3000000000 d euros +euh alors hein bah oui,alors oui +"12,5 degrés",12 virgule 5 degres +pour 100 de réduction,pourcent de reduction +pour cent de réduction,pourcent de reduction +d'accord,d accord +qu'il vient,qu il vient +n'est pas,n est pas +l'ordinateur,l ordinateur +m'appelle,m appelle +s'il vous plait,s il vous plait +t'as vu,t as vu +cent euros,100 euros +mille deux cents,1200 +cinquante trois,53 +contact@exemple.fr,contact arobase exemple point fr +"2 < 5",2 plus petit que 5 +50°C,50 degres celsius +ca coute €50,ca coute 50 euros +euh bonjour hein,bonjour +mme dupont,madame dupont +mlle dubois,mademoiselle dubois +dr martin,docteur martin +prof dupont,professeur dupont +st jean,saint jean +ping pong,pingpong +volley ball,volleyball +basket ball,basketball +hand ball,handball +water polo,waterpolo +t shirt,tshirt +cd rom,cdrom +super predateur,superpredateur +"3,14 pi",3 virgule 14 pi +soixante-dix,70 +quatre-vingts,80 +quatre-vingt-un,81 +nonante-neuf,99 +septante et un,71 +x = 5,x egal a 5 +test@example.com,test arobase example point com +bonjour (euh) ami,bonjour ami +ça date d'hier,ca date d hier diff --git a/tests/e2e/files/gladia-3/it.csv b/tests/e2e/files/gladia-3/it.csv new file mode 
100644 index 0000000..c383407 --- /dev/null +++ b/tests/e2e/files/gladia-3/it.csv @@ -0,0 +1,33 @@ +input,expected +#1 posto,1 posto +Costa €50,costa 50 euro +"3,14",3 virgola 14 +"1.234,56",1234 virgola 56 +2 < 5,2 minore di 5 +5 > 3,5 maggiore di 3 +50°C,50 gradi celsius +dieci per cento,dieci percento +vs milan,contro milan +dott rossi,dottor rossi +dr rossi,dottor rossi +ehm tipo insomma ciao,ciao +admin+tag@example.com,admin tag chiocciola example punto com +test@example.com,test chiocciola example punto com +uno mela,1 mela +x = 5,x uguale a 5 +"Il prezzo è 99,99",il prezzo e 99 virgola 99 +francois.dupont@gladia.io,francois punto dupont chiocciola gladia punto io +www.example.com,w w w punto example punto com +Versione 1.0.0 rilasciata,versione 1 punto 0 punto 0 rilasciata +api.endpoint.v2,api punto endpoint punto v2 +prof bianchi,professore bianchi +avv verdi,avvocato verdi +versus inter,contro inter +tel 123,telefono 123 +ecc.,eccetera +etc subito,eccetera subito +Contattami a john@example.com per favore,contattami a john chiocciola example punto com per favore +"La temperatura è di 98,6 gradi",la temperatura e di 98 virgola 6 gradi +La parola [inaudible] è qui,la parola inaudible e qui +$5 e $10,5 dollari e 10 dollari +mi chiamo bob,mi chiamo bob diff --git a/tests/e2e/normalization_test.py b/tests/e2e/normalization_test.py index daa1b3d..666b284 100644 --- a/tests/e2e/normalization_test.py +++ b/tests/e2e/normalization_test.py @@ -12,7 +12,6 @@ @dataclass class NormalizationTest: - language: str input: str expected: str @@ -23,7 +22,6 @@ def _load_tests_from_csv(csv_path: Path) -> list[NormalizationTest]: for row in csv.DictReader(f): rows.append( NormalizationTest( - language=row["language"], input=row["input"], expected=row["expected"], ) @@ -32,7 +30,34 @@ def _load_tests_from_csv(csv_path: Path) -> list[NormalizationTest]: def _case_ids(cases: list[NormalizationTest]) -> list[str]: - return [f"{test.language}:{test.input[:60]}" for test in cases] + 
return [test.input[:60] for test in cases] + + +def _discover_preset_tests( + preset_dir: Path, +) -> dict[str, list[NormalizationTest]]: + """Scan a preset directory for per-language CSV files. + + Returns a dict mapping language code (filename stem) to test cases. + """ + tests: dict[str, list[NormalizationTest]] = {} + if not preset_dir.is_dir(): + return tests + for csv_path in sorted(preset_dir.glob("*.csv")): + language = csv_path.stem + cases = _load_tests_from_csv(csv_path) + if cases: + tests[language] = cases + return tests + + +# --------------------------------------------------------------------------- +# gladia_3 +# --------------------------------------------------------------------------- + +_GLADIA_3_DIR = _FILES_DIR / "gladia-3" +_GLADIA_3_BY_LANGUAGE = _discover_preset_tests(_GLADIA_3_DIR) +_GLADIA_3_PIPELINES: dict[str, NormalizationPipeline] = {} def _load_pipeline(preset_name_or_path: str, language: str) -> NormalizationPipeline: @@ -44,25 +69,22 @@ def _load_pipeline(preset_name_or_path: str, language: str) -> NormalizationPipe return _GLADIA_3_PIPELINES[language] -# --------------------------------------------------------------------------- -# gladia_3 -# --------------------------------------------------------------------------- +def _make_gladia_3_test(language: str, cases: list[NormalizationTest]): + @pytest.mark.parametrize("test", cases, ids=_case_ids(cases)) + def _test(test: NormalizationTest) -> None: + pipeline = _load_pipeline("gladia-3", language) + result = pipeline.normalize(test.input) + assert result == test.expected, ( + f"\n input: {test.input!r}" + f"\n expected: {test.expected!r}" + f"\n got: {result!r}" + ) -_GLADIA_3_CSV = _FILES_DIR / "gladia-3.csv" -_GLADIA_3_TESTS = _load_tests_from_csv(_GLADIA_3_CSV) if _GLADIA_3_CSV.exists() else [] -_GLADIA_3_PIPELINES: dict[str, NormalizationPipeline] = {} + _test.__name__ = f"test_gladia_3_{language}" + return _test -@pytest.mark.parametrize( - "test", - _GLADIA_3_TESTS, - 
ids=_case_ids(_GLADIA_3_TESTS), -) -def test_gladia_3(test: NormalizationTest) -> None: - pipeline = _load_pipeline("gladia-3", test.language) - result = pipeline.normalize(test.input) - assert result == test.expected, ( - f"\n input: {test.input!r}" - f"\n expected: {test.expected!r}" - f"\n got: {result!r}" +for _language in sorted(_GLADIA_3_BY_LANGUAGE): + globals()[f"test_gladia_3_{_language}"] = _make_gladia_3_test( + _language, _GLADIA_3_BY_LANGUAGE[_language] )