gladiaio · Karamouche · Apr 14, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -20,7 +20,7 @@
 - [ ] Decorated operators class with `@register_language`
 - [ ] Added one import line to `languages/__init__.py`
 - [ ] Added unit tests in `tests/unit/languages/`
-- [ ] Added e2e test rows in `tests/e2e/files/`
+- [ ] Added a per-language CSV in `tests/e2e/files/{preset}/` (e.g. `tests/e2e/files/gladia-3/fr.csv`)
 
 ### New step
 

diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,7 @@ wheels/
 *.egg-info
 .ruff_cache/
 .pytest_cache/
+.DS_Store
 
 
 # Virtual environments

diff --git a/AGENTS.md b/AGENTS.md
@@ -144,7 +144,7 @@ Never modify a published preset YAML. Never let a preset reference a step that h
 - [ ] Decorate the class with `@register_language`
 - [ ] Add one import to `languages/__init__.py`
 - [ ] Add tests in `tests/unit/languages/`
-- [ ] Add test rows to `tests/e2e/files/` for the new language
+- [ ] Add a CSV file `tests/e2e/files/{preset}/{language_code}.csv` for each relevant preset (e.g. `tests/e2e/files/gladia-3/fr.csv`)
 
 ## Adding a new step — checklist
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -18,7 +18,7 @@ A new language requires:
 2. Put all word-level substitutions in `replacements.py`
 3. Instantiate a `LanguageConfig` and subclass `LanguageOperators` in `operators.py`
 4. Decorate with `@register_language` and add one import to `normalization/languages/__init__.py`
-5. Add tests under `tests/unit/languages/` and e2e fixture rows in `tests/e2e/files/`
+5. Add tests under `tests/unit/languages/` and a per-language CSV in `tests/e2e/files/{preset}/` (e.g. `tests/e2e/files/gladia-3/fr.csv`)
 
 See [docs/contributing-guide.md](docs/contributing-guide.md) for the full checklist and design rules.
 

diff --git a/docs/contributing-guide.md b/docs/contributing-guide.md
@@ -26,7 +26,7 @@ This ordering is a hard constraint — some steps depend on earlier steps having
 - [ ] Decorate the class with `@register_language`
 - [ ] Add one import to `languages/__init__.py`
 - [ ] Add tests in `tests/unit/languages/`
-- [ ] Add test rows to `tests/e2e/files/` for the new language
+- [ ] Add a CSV file `tests/e2e/files/{preset}/{language_code}.csv` for each relevant preset (e.g. `tests/e2e/files/gladia-3/fr.csv`)
 
 ### Language data vs. language behavior
 
@@ -159,42 +159,46 @@ def test_my_step_with_english(english_operators):
 
 ### E2E tests for a preset
 
-E2E tests validate the full pipeline (preset + language) against a CSV fixture. The test runner lives in `tests/e2e/normalization_test.py` and CSV files go in `tests/e2e/files/`.
+E2E tests validate the full pipeline (preset + language) against CSV fixtures. The test runner lives in `tests/e2e/normalization_test.py` and CSV files are organized under `tests/e2e/files/`.
 
-**CSV format** — three columns, no quoting needed unless the value contains a comma:
+**Directory structure** — one folder per preset, one CSV per language:
 
 ```
-input,expected,language
-$1,000,000,1000000 dollars,en
-hello world,hello world,fr
+tests/e2e/files/
+  gladia-3/
+    default.csv
+    de.csv
+    en.csv
+    fr.csv
+    it.csv
 ```
 
-Each row is one test case. The `language` column must match a registered language code (or `default`).
+**CSV format** — two columns (`input,expected`); wrap a value in double quotes only when it contains a comma:
 
-**Registering a new CSV** — add a block to `normalization_test.py` following the existing pattern:
+```
+input,expected
+"$1,000,000",1000000 dollars
+hello world,hello world
+```
+
+The language is derived from the filename (e.g. `fr.csv` → language code `fr`). Use `default.csv` for the language-agnostic fallback.
+
+**Adding test cases for an existing preset** — append rows to the matching `tests/e2e/files/{preset}/{language_code}.csv`, or create that file if the language isn't covered yet. Tests are discovered automatically, so no change to `normalization_test.py` is needed.
+
+**Registering a new preset** — add a block to `normalization_test.py` following the existing pattern:
 
 ```python
-_MY_PRESET_CSV = _FILES_DIR / "my-preset.csv"
-_MY_PRESET_TESTS = _load_tests_from_csv(_MY_PRESET_CSV) if _MY_PRESET_CSV.exists() else []
+_MY_PRESET_DIR = _FILES_DIR / "my-preset"
+_MY_PRESET_BY_LANGUAGE = _discover_preset_tests(_MY_PRESET_DIR)
 _MY_PRESET_PIPELINES: dict[str, NormalizationPipeline] = {}
 
-
-@pytest.mark.parametrize(
-    "test",
-    _MY_PRESET_TESTS,
-    ids=_case_ids(_MY_PRESET_TESTS),
-)
-def test_my_preset(test: NormalizationTest) -> None:
-    pipeline = _load_pipeline("my-preset", test.language)
-    result = pipeline.normalize(test.input)
-    assert result == test.expected, (
-        f"\n  input:    {test.input!r}"
-        f"\n  expected: {test.expected!r}"
-        f"\n  got:      {result!r}"
+for _language in sorted(_MY_PRESET_BY_LANGUAGE):
+    globals()[f"test_my_preset_{_language}"] = _make_test(
+        "my-preset", _language, _MY_PRESET_BY_LANGUAGE[_language], _MY_PRESET_PIPELINES
     )
 ```
 
-Pipelines are cached per language inside `_MY_PRESET_PIPELINES` to avoid reloading for each parametrized case — follow the `_load_pipeline` helper pattern already in the file.
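+Pipelines are cached per language in the preset's `_MY_PRESET_PIPELINES` dict (passed to `_make_test`), so a pipeline is loaded once per language rather than once per parametrized case.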
 
 ---
 

diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py
@@ -1,7 +1,7 @@
-from . import english, french
+from . import english, french, german, italian, spanish
 from .base import LanguageOperators
 from .registry import get_language_registry, register_language
 
 register_language(LanguageOperators)
 
-__all__ = ["english", "french", "get_language_registry"]
+__all__ = ["english", "french", "german", "italian", "spanish", "get_language_registry"]
diff --git a/normalization/languages/german/__init__.py b/normalization/languages/german/__init__.py
@@ -0,0 +1,7 @@
+from .operators import GermanOperators
+from .replacements import GERMAN_REPLACEMENTS
+
+__all__ = [
+    "GermanOperators",
+    "GERMAN_REPLACEMENTS",
+]
diff --git a/normalization/languages/german/operators.py b/normalization/languages/german/operators.py
@@ -0,0 +1,43 @@
+from normalization.languages.base import LanguageConfig, LanguageOperators
+from normalization.languages.german.replacements import GERMAN_REPLACEMENTS
+from normalization.languages.german.sentence_replacements import (
+    GERMAN_SENTENCE_REPLACEMENTS,
+)
+from normalization.languages.registry import register_language
+
+GERMAN_CONFIG = LanguageConfig(
+    code="de",
+    decimal_separator=",",
+    decimal_word="komma",
+    thousand_separator=".",
+    symbols_to_words={
+        "@": "at",
+        ".": "punkt",
+        "+": "plus",
+        "=": "gleich",
+        ">": "größer als",
+        "<": "kleiner als",
+        "°": "grad",
+        "°C": "grad celsius",
+        "°F": "grad fahrenheit",
+        "%": "prozent",
+    },
+    currency_symbol_to_word={
+        "€": "euros",
+        "$": "dollars",
+        "£": "pounds",
+        "¢": "cents",
+        "¥": "yens",
+    },
+    filler_words=["äh", "ähm", "hm", "also", "naja", "halt"],
+    sentence_replacements=GERMAN_SENTENCE_REPLACEMENTS,
+)
+
+
+@register_language
+class GermanOperators(LanguageOperators):
+    def __init__(self):
+        super().__init__(GERMAN_CONFIG)
+
+    def get_word_replacements(self) -> dict[str, str]:
+        return GERMAN_REPLACEMENTS
diff --git a/normalization/languages/german/replacements.py b/normalization/languages/german/replacements.py
@@ -0,0 +1,10 @@
+GERMAN_REPLACEMENTS: dict[str, str] = {
+    "u.": "unter",
+    "chr.": "christus",
+    "rissströmungen": "riss-strömungen",
+    "kilometer": "km",
+    "xdrtb": "xdr-tb",
+    "dualradio": "dual-radio",
+    "st.": "sankt",
+    "maubewegung": "mau-bewegung",
+}
diff --git a/normalization/languages/german/sentence_replacements.py b/normalization/languages/german/sentence_replacements.py
@@ -0,0 +1,16 @@
+GERMAN_SENTENCE_REPLACEMENTS: dict[str, str] = {
+    "regimeet kritischen": "regimekritischen",
+    "cannabis joints": "cannabisjoints",
+    "kampf handlungen": "kampfhandlungen",
+    "erwachsenen pornografie": "erwachsenenpornographie",
+    "standbild format": "standbildformat",
+    "internet radio seite": "internetradioseite",
+    "alt gedienten": "altgedienten",
+    "6 tage krieg": "sechstagekrieg",
+    "kreuzungs punkt": "kreuzungspunkt",
+    "wild card": "wildcard",
+    "national parks": "nationalparks",
+    "internet suche": "internetsuche",
+    "gleichgewicht geschlechtliche": "gleichgeschlechtlichen",
+    "welt kulturerbegebiete": "weltkulturerbegebiete",
+}
diff --git a/normalization/languages/italian/__init__.py b/normalization/languages/italian/__init__.py
@@ -0,0 +1,7 @@
+from .operators import ItalianOperators
+from .replacements import ITALIAN_REPLACEMENTS
+
+__all__ = [
+    "ItalianOperators",
+    "ITALIAN_REPLACEMENTS",
+]
diff --git a/normalization/languages/italian/operators.py b/normalization/languages/italian/operators.py
@@ -0,0 +1,116 @@
+import re
+
+from normalization.languages.base import LanguageConfig, LanguageOperators
+from normalization.languages.italian.replacements import ITALIAN_REPLACEMENTS
+from normalization.languages.registry import register_language
+
+# Single digits 1–9: shared by digit_words and number_words below, and by any future time/compound helpers.
+_ONE_TO_NINE: dict[str, str] = {
+    "uno": "1",
+    "due": "2",
+    "tre": "3",
+    "quattro": "4",
+    "cinque": "5",
+    "sei": "6",
+    "sette": "7",
+    "otto": "8",
+    "nove": "9",
+}
+
+ITALIAN_SENTENCE_REPLACEMENTS: dict[str, str] = {
+    # Collapse spoken “per cento” (e.g. “dieci per cento”) into “percento”, the same word the “%” symbol maps to.
+    "per cento": "percento",
+}
+
+ITALIAN_CONFIG = LanguageConfig(
+    code="it",
+    decimal_separator=",",
+    decimal_word="virgola",
+    thousand_separator=".",
+    symbols_to_words={
+        "@": "chiocciola",
+        ".": "punto",
+        "+": "più",
+        "=": "uguale a",
+        ">": "maggiore di",
+        "<": "minore di",
+        "°": "grado",
+        "°C": "gradi celsius",
+        "°F": "gradi fahrenheit",
+        "%": "percento",
+    },
+    currency_symbol_to_word={
+        "€": "euro",
+        "$": "dollari",
+        "£": "sterline",
+        "¢": "centesimi",
+        "¥": "yen",
+    },
+    filler_words=[
+        "eh",
+        "ehm",
+        "mm",
+        "mh",
+        "cioè",
+        "cioe",
+        "tipo",
+        "insomma",
+        "allora",
+        "beh",
+        "bah",
+        "dunque",
+        "magari",
+        "praticamente",
+    ],
+    sentence_replacements=ITALIAN_SENTENCE_REPLACEMENTS,
+    digit_words={"zero": "0", **_ONE_TO_NINE},
+    number_words=[
+        "zero",
+        *_ONE_TO_NINE,
+        "dieci",
+        "undici",
+        "dodici",
+        "tredici",
+        "quattordici",
+        "quindici",
+        "sedici",
+        "diciassette",
+        "diciotto",
+        "diciannove",
+        "venti",
+        "trenta",
+        "quaranta",
+        "cinquanta",
+        "sessanta",
+        "settanta",
+        "ottanta",
+        "novanta",
+        "cento",
+        "mille",
+        "mila",
+        "milione",
+        "milioni",
+        "miliardo",
+        "miliardi",
+    ],
+    plus_word="più",
+)
+
+
+@register_language
+class ItalianOperators(LanguageOperators):
+    def __init__(self):
+        super().__init__(ITALIAN_CONFIG)
+
+    def fix_one_word_in_numeric_contexts(self, text: str) -> str:
+        text = re.sub(r"(\d+)\s+uno\s+uno\b", r"\1 1 1", text)
+        text = re.sub(r"\buno\s+uno\s+(\d)", r"1 1 \1", text)
+        text = re.sub(r"(\d+)\s+uno\s+(\d)", r"\1 1 \2", text)
+        text = re.sub(r"(\d+)\s+uno\b", r"\1 1", text)
+        text = re.sub(r"\b(\d+)uno\b", r"\1 1", text)
+        text = re.sub(r"\buno\s+(\d)", r"1 \1", text)
+        text = re.sub(r"^uno\s+(?=[a-z])", "1 ", text)
+        return text
+
+    def get_word_replacements(self) -> dict[str, str]:
+        return ITALIAN_REPLACEMENTS
diff --git a/normalization/languages/italian/replacements.py b/normalization/languages/italian/replacements.py
@@ -0,0 +1,11 @@
+ITALIAN_REPLACEMENTS: dict[str, str] = {
+    "avv": "avvocato",
+    "dott": "dottor",
+    "dr": "dottor",
+    "ecc": "eccetera",
+    "etc": "eccetera",
+    "prof": "professore",
+    "tel": "telefono",
+    "versus": "contro",
+    "vs": "contro",
+}
diff --git a/normalization/languages/spanish/__init__.py b/normalization/languages/spanish/__init__.py
@@ -0,0 +1,7 @@
+from .operators import SpanishOperators
+from .replacements import SPANISH_REPLACEMENTS
+
+__all__ = [
+    "SpanishOperators",
+    "SPANISH_REPLACEMENTS",
+]