From f259b26d797c0e7b014104661d75a466e039ede0 Mon Sep 17 00:00:00 2001
From: egenthon-cmd <egenthon@gladia.io>
Date: Thu, 23 Apr 2026 13:10:31 +0200
Subject: [PATCH 1/2] feat: init finnish language

---
 normalization/languages/__init__.py           |   3 +-
 normalization/languages/finnish/__init__.py   |   7 +
 .../languages/finnish/number_normalizer.py    | 457 ++++++++++++++++++
 normalization/languages/finnish/operators.py  | 104 ++++
 .../languages/finnish/replacements.py         |  49 ++
 tests/e2e/files/gladia-3/fi.csv               |  17 +
 .../finnish_number_normalizer_test.py         |  58 +++
 .../unit/languages/finnish_operators_test.py  |  29 ++
 8 files changed, 723 insertions(+), 1 deletion(-)
 create mode 100644 normalization/languages/finnish/__init__.py
 create mode 100644 normalization/languages/finnish/number_normalizer.py
 create mode 100644 normalization/languages/finnish/operators.py
 create mode 100644 normalization/languages/finnish/replacements.py
 create mode 100644 tests/e2e/files/gladia-3/fi.csv
 create mode 100644 tests/unit/languages/finnish_number_normalizer_test.py
 create mode 100644 tests/unit/languages/finnish_operators_test.py

diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py
index 905c82f..7c0e931 100644
--- a/normalization/languages/__init__.py
+++ b/normalization/languages/__init__.py
@@ -1,4 +1,4 @@
-from . import dutch, english, french, german, italian, spanish
+from . import dutch, english, finnish, french, german, italian, spanish
 from .base import LanguageOperators
 from .registry import get_language_registry, register_language
 
@@ -7,6 +7,7 @@
 __all__ = [
     "dutch",
     "english",
+    "finnish",
     "french",
     "german",
     "italian",
diff --git a/normalization/languages/finnish/__init__.py b/normalization/languages/finnish/__init__.py
new file mode 100644
index 0000000..05fa8a1
--- /dev/null
+++ b/normalization/languages/finnish/__init__.py
@@ -0,0 +1,7 @@
+from .operators import FinnishOperators
+from .replacements import FINNISH_REPLACEMENTS
+
+__all__ = [
+    "FinnishOperators",
+    "FINNISH_REPLACEMENTS",
+]
diff --git a/normalization/languages/finnish/number_normalizer.py b/normalization/languages/finnish/number_normalizer.py
new file mode 100644
index 0000000..20e151f
--- /dev/null
+++ b/normalization/languages/finnish/number_normalizer.py
@@ -0,0 +1,457 @@
+"""Finnish number normalizer (STT-oriented).
+
+Finnish is not supported by ``text2num.alpha2digit``, so this module parses
+common transcript-style cardinals: 0–999 (including ``kymmentä`` tens),
+``tuhat`` / ``tuhatta``, and large multipliers (``miljoona``, ``miljardi``,
+``biljoona``). Optionally rewrites currency symbols like the Dutch/Swedish
+normalizers, then restores plural currency words from config.
+"""
+
+from __future__ import annotations
+
+import re
+
+
+def _fold(s: str) -> str:
+    return s.lower()
+
+
+def _get(table: dict[str, int], word: str) -> int | None:
+    fw = _fold(word)
+    for k, v in table.items():
+        if _fold(k) == fw:
+            return v
+    return None
+
+
+_ONES_2_9: dict[str, int] = {
+    "kaksi": 2,
+    "kolme": 3,
+    "neljä": 4,
+    "nelja": 4,
+    "viisi": 5,
+    "kuusi": 6,
+    "seitsemän": 7,
+    "seitseman": 7,
+    "kahdeksan": 8,
+    "yhdeksän": 9,
+    "yhdeksan": 9,
+}
+
+_ONES_1_9: dict[str, int] = {
+    "yksi": 1,
+    **_ONES_2_9,
+}
+
+_MULT_BEFORE_KYM: dict[str, int] = {
+    "yksi": 1,
+    "kaksi": 2,
+    "kolme": 3,
+    "neljä": 4,
+    "nelja": 4,
+    "viisi": 5,
+    "kuusi": 6,
+    "seitsemän": 7,
+    "seitseman": 7,
+    "kahdeksan": 8,
+    "yhdeksän": 9,
+    "yhdeksan": 9,
+}
+
+_TEENS: dict[str, int] = {
+    "kymmenen": 10,
+    "yksitoista": 11,
+    "kaksitoista": 12,
+    "kolmetoista": 13,
+    "neljätoista": 14,
+    "neljatoista": 14,
+    "viisitoista": 15,
+    "kuusitoista": 16,
+    "seitsemäntoista": 17,
+    "seitsemantoista": 17,
+    "kahdeksantoista": 18,
+    "yhdeksäntoista": 19,
+    "yhdeksantoista": 19,
+}
+
+_KYMENTTA = "kymmentä"
+_KYMENTTA_ASCII = "kymmenta"
+
+
+def _parse_glued_kymmenta(word: str) -> tuple[int, int] | None:
+    """Parse a single token like ``kaksikymmentäviisi`` → (25, consumed)."""
+    fw = _fold(word)
+    key = _KYMENTTA
+    if key not in fw:
+        key = _KYMENTTA_ASCII
+        if key not in fw:
+            return None
+    idx = fw.index(key)
+    left = fw[:idx]
+    right = fw[idx + len(key) :]
+    tens_m = _get(_MULT_BEFORE_KYM, left)
+    if tens_m is None:
+        return None
+    base = tens_m * 10
+    if not right:
+        return base, 1
+    unit = _get(_ONES_1_9, right)
+    if unit is None:
+        return None
+    return base + unit, 1
+
+
+_DIGIT_TO_FINNISH: dict[str, str] = {
+    "0": "nolla",
+    "1": "yksi",
+    "2": "kaksi",
+    "3": "kolme",
+    "4": "neljä",
+    "5": "viisi",
+    "6": "kuusi",
+    "7": "seitsemän",
+    "8": "kahdeksan",
+    "9": "yhdeksän",
+}
+
+_RE_MIXED_NUMBER = re.compile(
+    r"\b(\d+)\s+("
+    r"miljoona|miljoonaa|miljoonan|"
+    r"miljardi|miljardia|miljardin|"
+    r"biljoona|biljoonaa|biljoonan|"
+    r"tuhat|tuhatta"
+    r")\b",
+    re.IGNORECASE,
+)
+
+_BIG_MULT: dict[str, int] = {
+    "tuhat": 1000,
+    "tuhatta": 1000,
+    "miljoona": 1_000_000,
+    "miljoonaa": 1_000_000,
+    "miljoonan": 1_000_000,
+    "miljardi": 1_000_000_000,
+    "miljardia": 1_000_000_000,
+    "miljardin": 1_000_000_000,
+    "biljoona": 1_000_000_000_000,
+    "biljoonaa": 1_000_000_000_000,
+    "biljoonan": 1_000_000_000_000,
+}
+
+
+def _normalize_mixed_numbers(text: str) -> str:
+    """Convert ``3 miljardi`` → ``kolme miljardi`` so the word parser yields 3e9."""
+
+    def replace(match: re.Match[str]) -> str:
+        number = match.group(1)
+        multiplier = match.group(2)
+        if len(number) == 1 and number in _DIGIT_TO_FINNISH:
+            return f"{_DIGIT_TO_FINNISH[number]} {multiplier}"
+        return match.group(0)
+
+    return _RE_MIXED_NUMBER.sub(replace, text)
+
+
+def _singular_spoken_unit(trailing_word: str) -> str:
+    t = trailing_word.lower()
+    if t == "euros":
+        return "euro"
+    if t == "dollars":
+        return "dollar"
+    if t == "pounds":
+        return "pound"
+    if t == "yens":
+        return "yen"
+    return trailing_word
+
+
+def _normalize_currency_symbols(
+    text: str,
+    currency_symbol_to_word: dict[str, str] | None,
+) -> str:
+    if not currency_symbol_to_word:
+        return text
+    num = r"\d+(?:[.,]\d+)?"
+    for symbol, trailing in currency_symbol_to_word.items():
+        singular = _singular_spoken_unit(trailing)
+        esc = re.escape(symbol)
+        sym = rf"\b{esc}\b" if len(symbol) > 1 else esc
+        text = re.sub(rf"{sym}\s*({num})", rf"\1 {singular}", text, flags=re.IGNORECASE)
+        text = re.sub(rf"({num})\s*{sym}", rf"\1 {singular}", text, flags=re.IGNORECASE)
+    return text
+
+
+def _currency_plural_fix_patterns(
+    currency_symbol_to_word: dict[str, str] | None,
+) -> tuple[tuple[re.Pattern[str], str], ...]:
+    if not currency_symbol_to_word:
+        return ()
+    amount = r"(\d+(?:[.,]\d+)?)"
+    seen: set[str] = set()
+    out: list[tuple[re.Pattern[str], str]] = []
+    for _symbol, trailing in currency_symbol_to_word.items():
+        tl = trailing.lower()
+        if tl in seen:
+            continue
+        seen.add(tl)
+        singular = _singular_spoken_unit(trailing)
+        if singular.lower() == tl:
+            continue
+        if tl == "euros":
+            pat = re.compile(rf"\b{amount}\s+euro(?:'s)?\b", re.IGNORECASE)
+            out.append((pat, rf"\1 {trailing}"))
+        else:
+            pat = re.compile(
+                rf"\b{amount}\s+{re.escape(singular)}\b",
+                re.IGNORECASE,
+            )
+            out.append((pat, rf"\1 {trailing}"))
+    return tuple(out)
+
+
+def _apply_currency_plural_fixes(
+    text: str,
+    fixers: tuple[tuple[re.Pattern[str], str], ...],
+) -> str:
+    for pattern, repl in fixers:
+        text = pattern.sub(repl, text)
+    return text
+
+
+def _hundred_multiplier(word: str) -> int | None:
+    if _fold(word) == "yksi":
+        return 1
+    return _get(_ONES_2_9, word)
+
+
+class FinnishNumberNormalizer:
+    """Convert Finnish spelled-out numbers to digits."""
+
+    def __init__(self, currency_symbol_to_word: dict[str, str] | None = None) -> None:
+        self._currency_symbol_to_word = currency_symbol_to_word
+        self._currency_plural_fixes = _currency_plural_fix_patterns(
+            currency_symbol_to_word,
+        )
+
+    def __call__(self, text: str) -> str:
+        if not text.strip():
+            return text
+        text = _normalize_currency_symbols(text, self._currency_symbol_to_word)
+        text = _normalize_mixed_numbers(text)
+        words = text.split()
+        out: list[str] = []
+        i = 0
+        n = len(words)
+        while i < n:
+            parsed = self._parse_number(words, i, n)
+            if parsed is not None:
+                end, value = parsed
+                out.append(str(value))
+                i = end
+            else:
+                out.append(words[i])
+                i += 1
+        text = " ".join(out)
+        text = _apply_currency_plural_fixes(text, self._currency_plural_fixes)
+        return text
+
+    def _parse_number(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
+        if i >= n:
+            return None
+
+        fw = _fold(words[i])
+
+        if fw in ("tuhat", "tuhatta"):
+            j = i + 1
+            tail = self._parse_number(words, j, n)
+            if tail is not None:
+                end, v2 = tail
+                return end, 1000 + v2
+            return j, 1000
+
+        if i + 1 < n and fw == "yksi" and _fold(words[i + 1]) in ("tuhat", "tuhatta"):
+            j = i + 2
+            tail = self._parse_number(words, j, n)
+            base = 1000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return j, base
+
+        if i + 1 < n and fw == "yksi" and _fold(words[i + 1]) == "miljoona":
+            j = i + 2
+            tail = self._parse_number(words, j, n)
+            base = 1_000_000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return j, base
+
+        if (
+            i + 1 < n
+            and fw == "yksi"
+            and _fold(words[i + 1])
+            in (
+                "miljardi",
+                "miljardia",
+            )
+        ):
+            j = i + 2
+            tail = self._parse_number(words, j, n)
+            base = 1_000_000_000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return j, base
+
+        if (
+            i + 1 < n
+            and fw == "yksi"
+            and _fold(words[i + 1])
+            in (
+                "biljoona",
+                "biljoonaa",
+            )
+        ):
+            j = i + 2
+            tail = self._parse_number(words, j, n)
+            base = 1_000_000_000_000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return j, base
+
+        sub999 = self._parse_0_999(words, i, n)
+        if sub999 is None:
+            return None
+        j, v = sub999
+        if j >= n:
+            return j, v
+
+        next_fw = _fold(words[j])
+        if next_fw in ("tuhat", "tuhatta"):
+            j += 1
+            prod = v * 1000
+            if j >= n:
+                return j, prod
+            tail = self._parse_number(words, j, n)
+            if tail is not None:
+                end, v2 = tail
+                return end, prod + v2
+            return j, prod
+
+        mult = _BIG_MULT.get(next_fw)
+        if mult is not None and mult >= 1_000_000:
+            j += 1
+            prod = v * mult
+            if j >= n:
+                return j, prod
+            tail = self._parse_number(words, j, n)
+            if tail is not None:
+                end, v2 = tail
+                return end, prod + v2
+            return j, prod
+
+        return j, v
+
+    def _parse_0_999(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
+        if i >= n:
+            return None
+
+        if _fold(words[i]) == "nolla":
+            if i + 1 < n and self._continues_number(words[i + 1]):
+                return None
+            return i + 1, 0
+
+        if _fold(words[i]) == "sata":
+            tail = self._parse_0_99(words, i + 1, n)
+            if tail is not None:
+                je, tv = tail
+                return je, 100 + tv
+            return i + 1, 100
+
+        if i + 1 < n and _fold(words[i + 1]) in ("sata", "sataa"):
+            m = _hundred_multiplier(words[i])
+            if m is None:
+                m = _get(_ONES_2_9, words[i])
+            if m is None:
+                return None
+            next_w = _fold(words[i + 1])
+            if m == 1 and next_w == "sata":
+                base = 100
+            else:
+                base = m * 100
+            j = i + 2
+            tail = self._parse_0_99(words, j, n)
+            if tail is not None:
+                je, tv = tail
+                return je, base + tv
+            return j, base
+
+        return self._parse_0_99(words, i, n)
+
+    def _continues_number(self, word: str) -> bool:
+        fw = _fold(word)
+        if fw in ("sata", "sataa", "tuhat", "tuhatta"):
+            return True
+        if fw in _BIG_MULT:
+            return True
+        if _get(_TEENS, word) is not None:
+            return True
+        if _get(_MULT_BEFORE_KYM, word) is not None:
+            return True
+        if fw in (_KYMENTTA, _KYMENTTA_ASCII):
+            return True
+        if _get(_ONES_2_9, word) is not None:
+            return True
+        if fw == "yksi":
+            return True
+        if _parse_glued_kymmenta(word) is not None:
+            return True
+        return False
+
+    def _parse_ones_1_9(
+        self, words: list[str], i: int, n: int
+    ) -> tuple[int, int] | None:
+        if i >= n:
+            return None
+        v = _get(_ONES_1_9, words[i])
+        if v is None or v == 0:
+            return None
+        return i + 1, v
+
+    def _parse_0_99(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
+        if i >= n:
+            return None
+
+        glued = _parse_glued_kymmenta(words[i])
+        if glued is not None:
+            val, consumed = glued
+            return i + consumed, val
+
+        v = _get(_TEENS, words[i])
+        if v is not None:
+            return i + 1, v
+
+        fw = _fold(words[i])
+        if i + 1 < n:
+            nxt = _fold(words[i + 1])
+            if nxt in (_KYMENTTA, _KYMENTTA_ASCII):
+                tens_m = _get(_MULT_BEFORE_KYM, words[i])
+                if tens_m is not None:
+                    base = tens_m * 10
+                    j = i + 2
+                    ones = self._parse_ones_1_9(words, j, n)
+                    if ones is not None:
+                        je, ov = ones
+                        return je, base + ov
+                    return j, base
+
+        v = _get(_ONES_2_9, words[i])
+        if v is not None:
+            return i + 1, v
+
+        if fw == "yksi":
+            return i + 1, 1
+
+        return None
diff --git a/normalization/languages/finnish/operators.py b/normalization/languages/finnish/operators.py
new file mode 100644
index 0000000..ef19481
--- /dev/null
+++ b/normalization/languages/finnish/operators.py
@@ -0,0 +1,104 @@
+from normalization.languages.base import LanguageConfig, LanguageOperators
+from normalization.languages.finnish.number_normalizer import FinnishNumberNormalizer
+from normalization.languages.registry import register_language
+
+_FINNISH_DIGIT_WORDS: dict[str, str] = {
+    "nolla": "0",
+    "yksi": "1",
+    "kaksi": "2",
+    "kolme": "3",
+    "neljä": "4",
+    "viisi": "5",
+    "kuusi": "6",
+    "seitsemän": "7",
+    "kahdeksan": "8",
+    "yhdeksän": "9",
+}
+
+FINNISH_CONFIG = LanguageConfig(
+    code="fi",
+    decimal_separator=",",
+    decimal_word="pilkku",
+    thousand_separator=" ",
+    symbols_to_words={
+        "@": "at merkki",
+        ".": "piste",
+        "+": "plus",
+        "=": "yhtä kuin",
+        ">": "suurempi kuin",
+        "<": "pienempi kuin",
+        "°": "astetta",
+        "°C": "astetta celsius",
+        "°F": "astetta fahrenheit",
+        "%": "prosenttia",
+    },
+    currency_symbol_to_word={
+        "€": "euros",
+        "$": "dollars",
+        "£": "pounds",
+        "¢": "cent",
+        "¥": "yens",
+    },
+    filler_words=[
+        "eh",
+        "öö",
+        "hm",
+        "hmm",
+        "mm",
+        "mhm",
+        "tota",
+        "tuota",
+        "niinkun",
+        "tavallaan",
+        "aha",
+        "aa",
+        "niin",
+    ],
+    digit_words=_FINNISH_DIGIT_WORDS,
+    number_words=[
+        *_FINNISH_DIGIT_WORDS,
+        "kymmenen",
+        "yksitoista",
+        "kaksitoista",
+        "kolmetoista",
+        "neljätoista",
+        "viisitoista",
+        "kuusitoista",
+        "seitsemäntoista",
+        "kahdeksantoista",
+        "yhdeksäntoista",
+        "kymmentä",
+        "sata",
+        "sataa",
+        "tuhat",
+        "tuhatta",
+        "miljoona",
+        "miljoonaa",
+        "miljoonan",
+        "miljardi",
+        "miljardia",
+        "miljardin",
+        "biljoona",
+        "biljoonaa",
+        "biljoonan",
+    ],
+    plus_word="plus",
+)
+
+
+@register_language
+class FinnishOperators(LanguageOperators):
+    def __init__(self) -> None:
+        super().__init__(FINNISH_CONFIG)
+        self._number_normalizer = FinnishNumberNormalizer(
+            FINNISH_CONFIG.currency_symbol_to_word,
+        )
+
+    def expand_written_numbers(self, text: str) -> str:
+        """Convert Finnish spelled-out numbers to digits (e.g. kaksi kymmentä viisi → 25)."""
+        return self._number_normalizer(text)
+
+    def get_word_replacements(self) -> dict[str, str]:
+        from normalization.languages.finnish.replacements import FINNISH_REPLACEMENTS
+
+        return FINNISH_REPLACEMENTS
diff --git a/normalization/languages/finnish/replacements.py b/normalization/languages/finnish/replacements.py
new file mode 100644
index 0000000..54162c1
--- /dev/null
+++ b/normalization/languages/finnish/replacements.py
@@ -0,0 +1,49 @@
+"""Colloquial / spoken variants → standard Finnish (canonical for WER).
+
+Keys and values use the same ASCII-folded surface form as the rest of the gladia-3
+pipeline after ``casefold_text`` and ``remove_diacritics``.
+"""
+
+FINNISH_REPLACEMENTS: dict[str, str] = {
+    "ma": "mina",
+    "maa": "mina",
+    "mulle": "minulle",
+    "mulla": "minulla",
+    "mua": "minua",
+    "mun": "minun",
+    "sa": "sina",
+    "sulle": "sinulle",
+    "sulla": "sinulla",
+    "sua": "sinua",
+    "sun": "sinun",
+    "toi": "tuo",
+    "ton": "tuon",
+    "tossa": "tuossa",
+    "tosta": "tuosta",
+    "tohon": "tuohon",
+    "taa": "tama",
+    "naa": "nama",
+    "olis": "olisi",
+    "ois": "olisi",
+    "oo": "ole",
+    "ollu": "ollut",
+    "onks": "onko",
+    "oliks": "oliko",
+    "oisko": "olisiko",
+    "vois": "voisi",
+    "katotaan": "katsotaan",
+    "kattoa": "katsoa",
+    "mut": "mutta",
+    "sit": "sitten",
+    "sitte": "sitten",
+    "et": "etta",
+    "sillon": "silloin",
+    "viimeks": "viimeksi",
+    "elikka": "eli",
+    "juu": "joo",
+    "jes": "joo",
+    "ok": "okei",
+    "bank": "pankki",
+    "bankin": "pankin",
+    "euro": "euros",
+}
diff --git a/tests/e2e/files/gladia-3/fi.csv b/tests/e2e/files/gladia-3/fi.csv
new file mode 100644
index 0000000..fdfc511
--- /dev/null
+++ b/tests/e2e/files/gladia-3/fi.csv
@@ -0,0 +1,17 @@
+input,expected
+kaksi pienempi kuin viisi,2 pienempi kuin 5
+2 < 5,2 pienempi kuin 5
+50 astetta celsius,50 astetta celsius
+Se maksaa €50,se maksaa 50 euros
+kymmenen euroa,10 euroa
+kolme miljoonaa,3000000
+test@example.com,test at merkki example piste com
+www.example.com,w w w piste example piste com
+x = 5,x yhta kuin 5
+192.168.1.1,192 piste 168 piste 1 piste 1
+juu ok,joo okei
+"3,14",3 pilkku 14
+"1.234,56",1234 pilkku 56
+"8,75%","8 pilkku 75%"
+ping pong,ping pong
+tama on hyva,tama on hyva
diff --git a/tests/unit/languages/finnish_number_normalizer_test.py b/tests/unit/languages/finnish_number_normalizer_test.py
new file mode 100644
index 0000000..3b42a45
--- /dev/null
+++ b/tests/unit/languages/finnish_number_normalizer_test.py
@@ -0,0 +1,58 @@
+import pytest
+
+from normalization.languages.finnish.number_normalizer import FinnishNumberNormalizer
+from normalization.languages.finnish.operators import FINNISH_CONFIG
+
+
+@pytest.fixture
+def normalizer() -> FinnishNumberNormalizer:
+    return FinnishNumberNormalizer(FINNISH_CONFIG.currency_symbol_to_word)
+
+
+@pytest.fixture
+def normalizer_no_currency() -> FinnishNumberNormalizer:
+    return FinnishNumberNormalizer(None)
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("kaksi kymmenta viisi", "25"),
+        ("kaksi kymmentä viisi", "25"),
+        ("sata", "100"),
+        ("tuhat", "1000"),
+        ("yksi tuhat", "1000"),
+        ("kolme miljoonaa", "3000000"),
+        ("yksi miljoona", "1000000"),
+    ],
+)
+def test_finnish_spelled_numbers(
+    normalizer: FinnishNumberNormalizer, text: str, expected: str
+) -> None:
+    assert normalizer(text) == expected
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("kymmenen euroa", "10 euroa"),
+        ("€50", "50 euros"),
+        ("50 €", "50 euros"),
+    ],
+)
+def test_currency_and_spoken_units(
+    normalizer: FinnishNumberNormalizer, text: str, expected: str
+) -> None:
+    assert normalizer(text) == expected
+
+
+def test_without_currency_config_leaves_currency_symbol(
+    normalizer_no_currency: FinnishNumberNormalizer,
+) -> None:
+    assert normalizer_no_currency("kaksi kymmenta viisi") == "25"
+    assert normalizer_no_currency("€10") == "€10"
+
+
+def test_non_numeric_text_unchanged(normalizer: FinnishNumberNormalizer) -> None:
+    text = "tama on tavallista tekstia"
+    assert normalizer(text) == text
diff --git a/tests/unit/languages/finnish_operators_test.py b/tests/unit/languages/finnish_operators_test.py
new file mode 100644
index 0000000..19d6ca2
--- /dev/null
+++ b/tests/unit/languages/finnish_operators_test.py
@@ -0,0 +1,29 @@
+import pytest
+
+from normalization.languages.finnish.operators import FinnishOperators
+from normalization.languages.registry import get_language_registry
+
+
+@pytest.fixture
+def operators() -> FinnishOperators:
+    return FinnishOperators()
+
+
+def test_finnish_is_registered() -> None:
+    assert "fi" in get_language_registry()
+
+
+def test_finnish_registry_produces_finnish_operators() -> None:
+    instance = get_language_registry()["fi"]()
+    assert isinstance(instance, FinnishOperators)
+
+
+def test_config_code(operators: FinnishOperators) -> None:
+    assert operators.config.code == "fi"
+
+
+def test_word_replacements(operators: FinnishOperators) -> None:
+    assert operators.get_word_replacements()["ma"] == "mina"
+    assert operators.get_word_replacements()["ok"] == "okei"
+    assert operators.get_word_replacements()["juu"] == "joo"
+    assert operators.get_word_replacements()["euro"] == "euros"

From 761ca8733aa0fe74cd25e8cede11aafbae3048f2 Mon Sep 17 00:00:00 2001
From: egenthon-cmd <egenthon@gladia.io>
Date: Mon, 4 May 2026 12:10:32 -0400
Subject: [PATCH 2/2] fix: affirmations and negation removed from filler words

---
 normalization/languages/finnish/operators.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/normalization/languages/finnish/operators.py b/normalization/languages/finnish/operators.py
index ef19481..538a587 100644
--- a/normalization/languages/finnish/operators.py
+++ b/normalization/languages/finnish/operators.py
@@ -52,7 +52,6 @@
         "tavallaan",
         "aha",
         "aa",
-        "niin",
     ],
     digit_words=_FINNISH_DIGIT_WORDS,
     number_words=[