From b119c36e1e6be1e318ce7f570078cea4ffdf0400 Mon Sep 17 00:00:00 2001
From: egenthon-cmd <egenthon@gladia.io>
Date: Thu, 23 Apr 2026 12:04:37 +0200
Subject: [PATCH 1/4] feat: init norwegian language

Made-with: Cursor
---
 docs/steps.md                                 |  30 +-
 normalization/languages/__init__.py           |   3 +-
 .../languages/base/language_config.py         |  10 +
 normalization/languages/norwegian/__init__.py |   7 +
 .../languages/norwegian/number_normalizer.py  | 428 ++++++++++++++++++
 .../languages/norwegian/operators.py          | 124 +++++
 .../languages/norwegian/replacements.py       |  14 +
 .../text/convert_roman_numerals_to_digits.py  |  27 +-
 .../steps/text/expand_alphanumeric_codes.py   |  12 +-
 .../remove_standalone_currency_symbols.py     |  59 ++-
 normalization/steps/text/remove_symbols.py    |  18 +-
 normalization/steps/text/replace_currency.py  |  22 +-
 tests/e2e/files/gladia-3/no.csv               |  20 +
 .../norwegian_number_normalizer_test.py       |  74 +++
 .../languages/norwegian_operators_test.py     |  29 ++
 .../convert_roman_numerals_to_digits_test.py  |  31 ++
 .../text/expand_alphanumeric_codes_test.py    |  38 ++
 ...remove_standalone_currency_symbols_test.py |  29 ++
 tests/unit/steps/text/remove_symbols_test.py  |  30 ++
 .../steps/text/replace_currency_kr_test.py    |  14 +
 20 files changed, 990 insertions(+), 29 deletions(-)
 create mode 100644 normalization/languages/norwegian/__init__.py
 create mode 100644 normalization/languages/norwegian/number_normalizer.py
 create mode 100644 normalization/languages/norwegian/operators.py
 create mode 100644 normalization/languages/norwegian/replacements.py
 create mode 100644 tests/e2e/files/gladia-3/no.csv
 create mode 100644 tests/unit/languages/norwegian_number_normalizer_test.py
 create mode 100644 tests/unit/languages/norwegian_operators_test.py
 create mode 100644 tests/unit/steps/text/expand_alphanumeric_codes_test.py
 create mode 100644 tests/unit/steps/text/remove_standalone_currency_symbols_test.py
 create mode 100644 tests/unit/steps/text/remove_symbols_test.py
 create mode 100644 tests/unit/steps/text/replace_currency_kr_test.py

diff --git a/docs/steps.md b/docs/steps.md
index 5b5bb43..b3eef58 100644
--- a/docs/steps.md
+++ b/docs/steps.md
@@ -75,6 +75,10 @@ Runs before expand_alphanumeric_codes to prevent 'VIII' -> 'V I I I'.
 Only converts ii-ix to avoid false positives with single letters like 'I'.
 Skips 'v' when adjacent to digits (version-like contexts: v2, v 12).
 
+When ``operators.config.roman_numerals_uppercase_only`` is True, multi-letter
+numerals match only in ALL CAPS (so Swedish/Norwegian ``vi`` / ``Vi`` are not
+read as 6). Standalone ``V`` still matches as 5 for titles like ``Louis V``.
+
 ### `convert_word_based_time_patterns`
 
 **Base class:** `TextStep`
@@ -86,14 +90,19 @@ operators.config.pm_word, operators.config.oclock_word, and
 operators.get_compound_minutes().
 No-op when required config is None.
 
+Regex patterns are compiled once per operators config instance and cached
+on the step to avoid recompilation on every call.
+
 ### `expand_alphanumeric_codes`
 
 **Base class:** `TextStep`
 
 Space out uppercase words and alphanumeric codes.
 
-'ABC123' -> 'A B C 1 2 3', 'CNN' -> 'C N N'.
-Skips pure numbers, ordinals (1st, 2nd), and protection markers. Must run before casefold_text.
+'ABC123' -> 'A B C 1 2 3'. When ``operators.config.expand_all_caps_letter_by_letter``
+is False, pure letter ALL-CAPS tokens (e.g. SMS) are left intact for Nordic-style
+acronym handling. Skips pure numbers, ordinals (1st, 2nd), and protection markers.
+Must run before casefold_text.
 
 ### `expand_contractions`
 
@@ -329,13 +338,21 @@ Handles ¤ markers by processing segments separately.
 
 Remove currency symbols that are not adjacent to numbers.
 
+Single-character symbols use the classic between/start/end patterns (not
+between two digits). Multi-character keys (e.g. ``kr``) are matched only as
+whole tokens (``\b...\b``) and are skipped when a digit is nearby with
+only whitespace in between, so ordinary words are not corrupted.
+
 ### `remove_symbols`
 
 **Base class:** `TextStep`
 
 Replace markers, symbols, and punctuation with spaces.
 
-Preserves letters, digits, and all placeholder characters.
+Preserves letters, digits, and all placeholder characters. When
+``symbols_to_words`` defines a word for ``%``, expands ``%`` only when it
+follows a decimal or integer literal (e.g. ``8,75%``); any other ``%`` is
+not expanded into a word.
 
 ### `remove_thousand_separators`
 
@@ -376,7 +393,12 @@ No-op when either is None.
 
 **Base class:** `TextStep`
 
-Replace currency symbols with their corresponding words.
+Replace currency symbols with their corresponding words next to amounts.
+
+For each entry in ``operators.config.currency_symbol_to_word``, substitutes
+the symbol before or after a numeric literal (including placeholder decimals).
+Alphanumeric symbols (e.g. ``kr``) use word boundaries so a token like
+``kroner`` is not treated as ``kr`` plus a suffix.
 
 ### `restore_decimal_separator_with_word`
 
diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py
index 905c82f..a05d033 100644
--- a/normalization/languages/__init__.py
+++ b/normalization/languages/__init__.py
@@ -1,4 +1,4 @@
-from . import dutch, english, french, german, italian, spanish
+from . import dutch, english, french, german, italian, norwegian, spanish
 from .base import LanguageOperators
 from .registry import get_language_registry, register_language
 
@@ -10,6 +10,7 @@
     "french",
     "german",
     "italian",
+    "norwegian",
     "spanish",
     "get_language_registry",
 ]
diff --git a/normalization/languages/base/language_config.py b/normalization/languages/base/language_config.py
index b4a2495..3387cfc 100644
--- a/normalization/languages/base/language_config.py
+++ b/normalization/languages/base/language_config.py
@@ -75,6 +75,16 @@ class LanguageConfig:
     ordinal_suffixes: list[str] | None = None
     """Ordinal number suffixes for this language (e.g. ["st", "nd", "rd", "th"] for English).
     Used by steps that need to detect ordinal numbers. None = ordinal detection is skipped."""
+    roman_numerals_uppercase_only: bool = False
+    """When True, only treat Roman numerals as digits if they appear in ALL CAPS (e.g. VI, VIII).
+
+    Avoids collisions with Nordic pronouns spelled ``vi``/``Vi``. Default False preserves
+    legacy case-insensitive matching for other languages."""
+    expand_all_caps_letter_by_letter: bool = True
+    """When False, pure letter ALL-CAPS tokens (e.g. SMS) are not spaced into letters.
+
+    Nordic STT hypotheses often keep acronyms as one word; default True preserves
+    English-style letter-by-letter expansion for CAPS-only tokens."""
     am_word: str | None = None
     """Canonical AM time designator (e.g. "am" for English).
     Used by am/pm time formatting steps. None = am/pm steps are skipped."""
diff --git a/normalization/languages/norwegian/__init__.py b/normalization/languages/norwegian/__init__.py
new file mode 100644
index 0000000..2b87c45
--- /dev/null
+++ b/normalization/languages/norwegian/__init__.py
@@ -0,0 +1,7 @@
+from .operators import NorwegianOperators
+from .replacements import NORWEGIAN_REPLACEMENTS
+
+__all__ = [
+    "NorwegianOperators",
+    "NORWEGIAN_REPLACEMENTS",
+]
diff --git a/normalization/languages/norwegian/number_normalizer.py b/normalization/languages/norwegian/number_normalizer.py
new file mode 100644
index 0000000..d1c1a67
--- /dev/null
+++ b/normalization/languages/norwegian/number_normalizer.py
@@ -0,0 +1,428 @@
+"""Norwegian (Bokmål) number normalizer (STT-oriented).
+
+``text2num.alpha2digit`` does not cover Norwegian well for transcript-style
+cardinals, so this module mirrors the Swedish approach: 0–999, ``tusen``
+compounds, and large multipliers (``million``, ``milliard``, ``billion``).
+Optionally rewrites currency symbols, then restores plural currency words from
+config. Supports optional ``og`` between number parts (e.g. ``tjue og fem``).
+"""
+
+from __future__ import annotations
+
+import re
+
+
+def _fold(s: str) -> str:
+    return s.lower()
+
+
+def _get(table: dict[str, int], word: str) -> int | None:
+    fw = _fold(word)
+    for k, v in table.items():
+        if _fold(k) == fw:
+            return v
+    return None
+
+
+def _skip_optional_og(words: list[str], j: int, n: int) -> int:
+    if j < n and _fold(words[j]) == "og":
+        return j + 1
+    return j
+
+
+_ONES_2_9: dict[str, int] = {
+    "to": 2,
+    "tre": 3,
+    "fire": 4,
+    "fem": 5,
+    "seks": 6,
+    "sju": 7,
+    "syv": 7,
+    "åtte": 8,
+    "atte": 8,
+    "ni": 9,
+}
+
+_TEENS: dict[str, int] = {
+    "ti": 10,
+    "elleve": 11,
+    "tolv": 12,
+    "tretten": 13,
+    "fjorten": 14,
+    "femten": 15,
+    "seksten": 16,
+    "sytten": 17,
+    "atten": 18,
+    "nitten": 19,
+}
+
+_TENS: dict[str, int] = {
+    "tjue": 20,
+    "tyve": 20,
+    "tretti": 30,
+    "førti": 40,
+    "forti": 40,
+    "femti": 50,
+    "seksti": 60,
+    "sytti": 70,
+    "åtti": 80,
+    "atti": 80,
+    "nitti": 90,
+}
+
+_TENS_PREFIXES: tuple[tuple[str, int], ...] = tuple(
+    sorted(_TENS.items(), key=lambda kv: len(kv[0]), reverse=True)
+)
+
+_ONES_AFTER_TENS: dict[str, int] = {"ett": 1, "en": 1, "ein": 1, **_ONES_2_9}
+
+_DIGIT_TO_NORWEGIAN: dict[str, str] = {
+    "0": "null",
+    "1": "en",
+    "2": "to",
+    "3": "tre",
+    "4": "fire",
+    "5": "fem",
+    "6": "seks",
+    "7": "sju",
+    "8": "åtte",
+    "9": "ni",
+}
+
+_RE_MIXED_NUMBER = re.compile(
+    # (?<!\S): the digit must be a whole token so "2,5 millioner" stays intact.
+    r"(?<!\S)(\d+)\s+("
+    r"million|millioner|milliard|milliarder|billion|billioner|tusen)\b",
+    re.IGNORECASE,
+)
+
+_BIG_MULT: dict[str, int] = {
+    "tusen": 1000,
+    "million": 1_000_000,
+    "millioner": 1_000_000,
+    "milliard": 1_000_000_000,
+    "milliarder": 1_000_000_000,
+    "billion": 1_000_000_000_000,
+    "billioner": 1_000_000_000_000,
+}
+
+
+def _normalize_mixed_numbers(text: str) -> str:
+    """Convert ``3 milliard`` → ``tre milliard`` so the word parser yields 3e9."""
+
+    def replace(match: re.Match[str]) -> str:
+        number = match.group(1)
+        multiplier = match.group(2)
+        if len(number) == 1 and number in _DIGIT_TO_NORWEGIAN:
+            return f"{_DIGIT_TO_NORWEGIAN[number]} {multiplier}"
+        return match.group(0)
+
+    return _RE_MIXED_NUMBER.sub(replace, text)
+
+
+def _singular_spoken_unit(trailing_word: str) -> str:
+    """Map a plural currency word to its singular spoken form.
+
+    The lookup is case-insensitive; any other word is returned unchanged.
+    """
+    singular_by_plural = {
+        "euros": "euro",
+        "dollars": "dollar",
+        "pounds": "pound",
+        "kroner": "krone",
+        "yens": "yen",
+    }
+    return singular_by_plural.get(trailing_word.lower(), trailing_word)
+
+
+def _normalize_currency_symbols(
+    text: str,
+    currency_symbol_to_word: dict[str, str] | None,
+) -> str:
+    if not currency_symbol_to_word:
+        return text
+    num = r"\d+(?:[.,]\d+)?"
+    for symbol, trailing in currency_symbol_to_word.items():
+        unit = _singular_spoken_unit(trailing)
+        # Outer-edge \b only: digits are \w, so an inner \b would reject "10kr".
+        esc, wb = re.escape(symbol), (r"\b" if len(symbol) > 1 else "")
+        text = re.sub(rf"{wb}{esc}\s*({num})", rf"\1 {unit}", text, flags=re.IGNORECASE)
+        text = re.sub(rf"({num})\s*{esc}{wb}", rf"\1 {unit}", text, flags=re.IGNORECASE)
+    return text
+
+
+def _currency_plural_fix_patterns(
+    currency_symbol_to_word: dict[str, str] | None,
+) -> tuple[tuple[re.Pattern[str], str], ...]:
+    if not currency_symbol_to_word:
+        return ()
+    amount = r"(\d+(?:[.,]\d+)?)"
+    seen: set[str] = set()
+    out: list[tuple[re.Pattern[str], str]] = []
+    for _symbol, trailing in currency_symbol_to_word.items():
+        tl = trailing.lower()
+        if tl in seen:
+            continue
+        seen.add(tl)
+        singular = _singular_spoken_unit(trailing)
+        if singular.lower() == tl:
+            continue
+        if tl == "euros":
+            pat = re.compile(rf"\b{amount}\s+euro(?:'s)?\b", re.IGNORECASE)
+            out.append((pat, rf"\1 {trailing}"))
+        elif tl == "kroner":
+            pat = re.compile(rf"\b{amount}\s+krone\b", re.IGNORECASE)
+            out.append((pat, rf"\1 {trailing}"))
+        else:
+            pat = re.compile(
+                rf"\b{amount}\s+{re.escape(singular)}\b",
+                re.IGNORECASE,
+            )
+            out.append((pat, rf"\1 {trailing}"))
+    return tuple(out)
+
+
+def _apply_currency_plural_fixes(
+    text: str,
+    fixers: tuple[tuple[re.Pattern[str], str], ...],
+) -> str:
+    for pattern, repl in fixers:
+        text = pattern.sub(repl, text)
+    return text
+
+
+def _hundred_multiplier(word: str) -> int | None:
+    if _fold(word) in ("en", "ett", "ein"):
+        return 1
+    return _get(_ONES_2_9, word)
+
+
+class NorwegianNumberNormalizer:
+    """Convert Norwegian spelled-out numbers to digits."""
+
+    def __init__(self, currency_symbol_to_word: dict[str, str] | None = None) -> None:
+        self._currency_symbol_to_word = currency_symbol_to_word
+        self._currency_plural_fixes = _currency_plural_fix_patterns(
+            currency_symbol_to_word,
+        )
+
+    def __call__(self, text: str) -> str:
+        if not text.strip():
+            return text
+        text = _normalize_currency_symbols(text, self._currency_symbol_to_word)
+        text = _normalize_mixed_numbers(text)
+        words = text.split()
+        out: list[str] = []
+        i = 0
+        n = len(words)
+        while i < n:
+            parsed = self._parse_number(words, i, n)
+            if parsed is not None:
+                end, value = parsed
+                out.append(str(value))
+                i = end
+            else:
+                out.append(words[i])
+                i += 1
+        text = " ".join(out)
+        text = _apply_currency_plural_fixes(text, self._currency_plural_fixes)
+        return text
+
+    def _parse_number(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
+        """Parse a number phrase that starts at ``words[i]``.
+
+        Returns ``(end, value)``, ``end`` being the index of the first word after
+        the phrase, or ``None`` when no number starts at ``i``. A connective
+        ``og`` is consumed only when another number part follows it, so in
+        ``fem tusen og han`` the phrase ends after ``tusen`` and ``og`` survives.
+        """
+        if i >= n:
+            return None
+
+        fw = _fold(words[i])
+
+        if fw == "tusen":
+            j = _skip_optional_og(words, i + 1, n)
+            tail = self._parse_number(words, j, n)
+            if tail is not None:
+                end, v2 = tail
+                return end, 1000 + v2
+            return i + 1, 1000
+
+        if i + 1 < n and fw in ("en", "ett", "ein") and _fold(words[i + 1]) == "tusen":
+            j = i + 2
+            j = _skip_optional_og(words, j, n)
+            tail = self._parse_number(words, j, n)
+            base = 1000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return i + 2, base
+
+        if (
+            i + 1 < n
+            and fw in ("en", "ett", "ein")
+            and _fold(words[i + 1]) == "million"
+        ):
+            j = i + 2
+            j = _skip_optional_og(words, j, n)
+            tail = self._parse_number(words, j, n)
+            base = 1_000_000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return i + 2, base
+
+        if (
+            i + 1 < n
+            and fw in ("en", "ett", "ein")
+            and _fold(words[i + 1]) in ("milliard", "milliarder")
+        ):
+            j = i + 2
+            j = _skip_optional_og(words, j, n)
+            tail = self._parse_number(words, j, n)
+            base = 1_000_000_000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return i + 2, base
+
+        if (
+            i + 1 < n
+            and fw in ("en", "ett", "ein")
+            and _fold(words[i + 1]) in ("billion", "billioner")
+        ):
+            j = i + 2
+            j = _skip_optional_og(words, j, n)
+            tail = self._parse_number(words, j, n)
+            base = 1_000_000_000_000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return i + 2, base
+
+        sub999 = self._parse_0_999(words, i, n)
+        if sub999 is None:
+            return None
+        j, v = sub999
+        if j >= n:
+            return j, v
+
+        next_fw = _fold(words[j])
+        if next_fw == "tusen":
+            # Phrase end to report when a following "og" does not join a number.
+            stop = j + 1
+            j = _skip_optional_og(words, stop, n)
+            prod = v * 1000
+            if j >= n:
+                return stop, prod
+            tail = self._parse_number(words, j, n)
+            if tail is not None:
+                end, v2 = tail
+                return end, prod + v2
+            return stop, prod
+
+        mult = _BIG_MULT.get(next_fw)
+        if mult is not None and mult >= 1_000_000:
+            stop = j + 1
+            j = _skip_optional_og(words, stop, n)
+            prod = v * mult
+            if j >= n:
+                return stop, prod
+            tail = self._parse_number(words, j, n)
+            if tail is not None:
+                end, v2 = tail
+                return end, prod + v2
+            return stop, prod
+
+        return j, v
+
+    def _parse_0_999(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
+        """Parse ``null`` or a sub-1000 phrase; return ``(end, value)`` or ``None``."""
+        if i >= n:
+            return None
+
+        if _fold(words[i]) == "null":
+            if i + 1 < n and self._continues_number(words[i + 1]):
+                return None
+            return i + 1, 0
+
+        if _fold(words[i]) == "hundre":
+            j = _skip_optional_og(words, i + 1, n)
+            tail = self._parse_0_99(words, j, n)
+            if tail is not None:
+                je, tv = tail
+                return je, 100 + tv
+            return i + 1, 100
+
+        if i + 1 < n and _fold(words[i + 1]) == "hundre":
+            m = _hundred_multiplier(words[i])
+            if m is None:
+                return None
+            j = _skip_optional_og(words, i + 2, n)
+            tail = self._parse_0_99(words, j, n)
+            if tail is not None:
+                je, tv = tail
+                return je, m * 100 + tv
+            # Stop right after "hundre": a dangling "og" is not part of the number.
+            return i + 2, m * 100
+
+        return self._parse_0_99(words, i, n)
+
+    def _continues_number(self, word: str) -> bool:
+        fw = _fold(word)
+        if fw == "og":
+            return True
+        if fw == "hundre" or fw == "tusen":
+            return True
+        if fw in _BIG_MULT:
+            return True
+        if _get(_TEENS, word) is not None:
+            return True
+        if _get(_TENS, word) is not None:
+            return True
+        if _get(_ONES_2_9, word) is not None:
+            return True
+        if fw in ("en", "ett", "ein"):
+            return True
+        return False
+
+    def _parse_0_99(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
+        if i >= n:
+            return None
+
+        fw = _fold(words[i])
+
+        v = _get(_TEENS, words[i])
+        if v is not None:
+            return i + 1, v
+
+        for prefix, tval in _TENS_PREFIXES:
+            pl = len(prefix)
+            if fw.startswith(prefix) and len(fw) > pl:
+                rest = fw[pl:]
+                unit = _get(_ONES_AFTER_TENS, rest)
+                if unit is not None:
+                    return i + 1, tval + unit
+
+        tens = _get(_TENS, words[i])
+        if tens is not None:
+            j = i + 1
+            j = _skip_optional_og(words, j, n)
+            if j < n:
+                nfw = _fold(words[j])
+                if nfw in ("ett", "en", "ein"):
+                    return j + 1, tens + 1
+                o = _get(_ONES_2_9, words[j])
+                if o is not None:
+                    return j + 1, tens + o
+            return i + 1, tens
+
+        o = _get(_ONES_2_9, words[i])
+        if o is not None:
+            return i + 1, o
+
+        # Everything else is rejected, bare "en"/"ett"/"ein" included.
+        # NOTE(review): presumably because those are usually articles rather
+        # than the number 1 - confirm.
+        return None
diff --git a/normalization/languages/norwegian/operators.py b/normalization/languages/norwegian/operators.py
new file mode 100644
index 0000000..7a87278
--- /dev/null
+++ b/normalization/languages/norwegian/operators.py
@@ -0,0 +1,124 @@
+from normalization.languages.base import LanguageConfig, LanguageOperators
+from normalization.languages.norwegian.number_normalizer import (
+    NorwegianNumberNormalizer,
+)
+from normalization.languages.registry import register_language
+
+_NORWEGIAN_DIGIT_WORDS: dict[str, str] = {
+    "null": "0",
+    "en": "1",
+    "ett": "1",
+    "et": "1",
+    "ein": "1",
+    "to": "2",
+    "tre": "3",
+    "fire": "4",
+    "fem": "5",
+    "seks": "6",
+    "sju": "7",
+    "syv": "7",
+    "åtte": "8",
+    "ni": "9",
+}
+
+NORWEGIAN_CONFIG = LanguageConfig(
+    code="no",
+    roman_numerals_uppercase_only=True,
+    expand_all_caps_letter_by_letter=False,
+    decimal_separator=",",
+    decimal_word="komma",
+    thousand_separator=" ",
+    symbols_to_words={
+        "@": "krollalfa",
+        ".": "punkt",
+        "+": "plus",
+        "=": "er lik med",
+        ">": "storre enn",
+        "<": "mindre enn",
+        "°": "grader",
+        "°C": "grader celsius",
+        "°F": "grader fahrenheit",
+        "%": "prosent",
+    },
+    currency_symbol_to_word={
+        "€": "euros",
+        "$": "dollars",
+        "£": "pounds",
+        "¢": "cent",
+        "¥": "yens",
+        "kr": "kroner",
+    },
+    filler_words=[
+        "eh",
+        "øh",
+        "hm",
+        "hmm",
+        "mm",
+        "mhm",
+        "altså",
+        "liksom",
+        "bare",
+        "nå",
+        "ja",
+        "jo",
+        "nei",
+        "a",
+        "aa",
+        "mmm",
+        "akkurat",
+    ],
+    digit_words=_NORWEGIAN_DIGIT_WORDS,
+    number_words=[
+        *_NORWEGIAN_DIGIT_WORDS,
+        "ti",
+        "elleve",
+        "tolv",
+        "tretten",
+        "fjorten",
+        "femten",
+        "seksten",
+        "sytten",
+        "atten",
+        "nitten",
+        "tjue",
+        "tyve",
+        "tretti",
+        "førti",
+        "forti",
+        "femti",
+        "seksti",
+        "sytti",
+        "åtti",
+        "atti",
+        "nitti",
+        "hundre",
+        "tusen",
+        "million",
+        "millioner",
+        "milliard",
+        "milliarder",
+        "billion",
+        "billioner",
+    ],
+    plus_word="plus",
+)
+
+
+@register_language
+class NorwegianOperators(LanguageOperators):
+    def __init__(self) -> None:
+        super().__init__(NORWEGIAN_CONFIG)
+        self._number_normalizer = NorwegianNumberNormalizer(
+            NORWEGIAN_CONFIG.currency_symbol_to_word,
+        )
+
+    def expand_written_numbers(self, text: str) -> str:
+        """Convert Norwegian spelled-out numbers to digits (e.g. tjue fem → 25)."""
+        return self._number_normalizer(text)
+
+    def get_word_replacements(self) -> dict[str, str]:
+        from normalization.languages.norwegian.replacements import (
+            NORWEGIAN_REPLACEMENTS,
+        )
+
+        return NORWEGIAN_REPLACEMENTS
diff --git a/normalization/languages/norwegian/replacements.py b/normalization/languages/norwegian/replacements.py
new file mode 100644
index 0000000..3f850ae
--- /dev/null
+++ b/normalization/languages/norwegian/replacements.py
@@ -0,0 +1,14 @@
+"""Colloquial / spelling variants → standard Bokmål (canonical for WER)."""
+
+NORWEGIAN_REPLACEMENTS: dict[str, str] = {
+    "dom": "de",
+    "ke": "ikke",
+    "kor": "hvor",
+    "ska": "skal",
+    "euro": "euros",
+    "krone": "kroner",
+    "ok": "okei",
+    "kreditkort": "kredittkort",
+    "kreditkortet": "kredittkortet",
+    "derre": "der",
+}
diff --git a/normalization/steps/text/convert_roman_numerals_to_digits.py b/normalization/steps/text/convert_roman_numerals_to_digits.py
index 135cbf6..6f1062a 100644
--- a/normalization/steps/text/convert_roman_numerals_to_digits.py
+++ b/normalization/steps/text/convert_roman_numerals_to_digits.py
@@ -23,19 +23,34 @@ class ConvertRomanNumeralsToDigitsStep(TextStep):
     Runs before expand_alphanumeric_codes to prevent 'VIII' -> 'V I I I'.
     Only converts ii-ix to avoid false positives with single letters like 'I'.
     Skips 'v' when adjacent to digits (version-like contexts: v2, v 12).
+
+    When ``operators.config.roman_numerals_uppercase_only`` is True, multi-letter
+    numerals match only in ALL CAPS (so Swedish/Norwegian ``vi`` / ``Vi`` are not
+    read as 6). Standalone ``V`` still matches as 5 for titles like ``Louis V``.
     """
 
     name = "convert_roman_numerals_to_digits"
 
     def __call__(self, text: str, operators: LanguageOperators) -> str:
+        upper_only = operators.config.roman_numerals_uppercase_only
         for roman, arabic in _ROMAN_REPLACEMENTS.items():
             if roman == "v":
-                text = re.sub(
-                    r"(?<!\d )(?<!\d)\bv\b(?!\s*\d)",
-                    arabic,
-                    text,
-                    flags=re.IGNORECASE,
-                )
+                if upper_only:
+                    text = re.sub(
+                        r"(?<!\d )(?<!\d)\bV\b(?!\s*\d)",
+                        arabic,
+                        text,
+                    )
+                else:
+                    text = re.sub(
+                        r"(?<!\d )(?<!\d)\bv\b(?!\s*\d)",
+                        arabic,
+                        text,
+                        flags=re.IGNORECASE,
+                    )
+            elif upper_only:
+                token = roman.upper()
+                text = re.sub(rf"\b{re.escape(token)}\b", arabic, text)
             else:
                 text = re.sub(rf"\b{roman}\b", arabic, text, flags=re.IGNORECASE)
         return text
diff --git a/normalization/steps/text/expand_alphanumeric_codes.py b/normalization/steps/text/expand_alphanumeric_codes.py
index 048da1a..054cf39 100644
--- a/normalization/steps/text/expand_alphanumeric_codes.py
+++ b/normalization/steps/text/expand_alphanumeric_codes.py
@@ -17,8 +17,10 @@
 class ExpandAlphanumericCodesStep(TextStep):
     """Space out uppercase words and alphanumeric codes.
 
-    'ABC123' -> 'A B C 1 2 3', 'CNN' -> 'C N N'.
-    Skips pure numbers, ordinals (1st, 2nd), and protection markers. Must run before casefold_text.
+    'ABC123' -> 'A B C 1 2 3'. When ``operators.config.expand_all_caps_letter_by_letter``
+    is False, pure letter ALL-CAPS tokens (e.g. SMS) are left intact for Nordic-style
+    acronym handling. Skips pure numbers, ordinals (1st, 2nd), and protection markers.
+    Must run before casefold_text.
     """
 
     name = "expand_alphanumeric_codes"
@@ -56,6 +58,12 @@ def _should_process(match: re.Match) -> str:
                 return word
 
             has_digit = any(c.isdigit() for c in word)
+            if (
+                not operators.config.expand_all_caps_letter_by_letter
+                and word.isupper()
+                and not has_digit
+            ):
+                return word
             if word.isupper() or has_digit:
                 return _expand_word(match)
 
diff --git a/normalization/steps/text/remove_standalone_currency_symbols.py b/normalization/steps/text/remove_standalone_currency_symbols.py
index c6bd60e..937ad89 100644
--- a/normalization/steps/text/remove_standalone_currency_symbols.py
+++ b/normalization/steps/text/remove_standalone_currency_symbols.py
@@ -5,9 +5,24 @@
 from normalization.steps.registry import register_step
 
 
-def _make_standalone_patterns(
+def _currency_touching_digit(text: str, start: int, end: int) -> bool:
+    """True if a digit is next to this span, allowing only whitespace in between."""
+    i = start - 1
+    while i >= 0 and text[i].isspace():
+        i -= 1
+    if i >= 0 and text[i].isdigit():
+        return True
+    i = end
+    while i < len(text) and text[i].isspace():
+        i += 1
+    if i < len(text) and text[i].isdigit():
+        return True
+    return False
+
+
+def _make_single_char_patterns(
     symbols: frozenset[str],
-) -> tuple[re.Pattern, re.Pattern, re.Pattern, re.Pattern]:
+) -> tuple[re.Pattern[str], re.Pattern[str], re.Pattern[str], re.Pattern[str]]:
     char_class = "[" + re.escape("".join(symbols)) + "]"
     between = re.compile(rf"([^0-9]){char_class}([^0-9])")
     start = re.compile(rf"^{char_class}([^0-9])")
@@ -18,18 +33,44 @@ def _make_standalone_patterns(
 
 @register_step
 class RemoveStandaloneCurrencySymbolsStep(TextStep):
-    """Remove currency symbols that are not adjacent to numbers."""
+    """Remove currency symbols that are not adjacent to numbers.
+
+    Single-character symbols use the classic between/start/end patterns (not
+    between two digits). Multi-character keys (e.g. ``kr``) are matched only as
+    whole tokens (``\\b...\\b``) and are skipped when a digit is nearby with
+    only whitespace in between, so ordinary words are not corrupted.
+    """
 
     name = "remove_standalone_currency_symbols"
 
     def __call__(self, text: str, operators: LanguageOperators) -> str:
-        symbols = frozenset(operators.config.currency_symbol_to_word.keys())
+        symbols = tuple(operators.config.currency_symbol_to_word.keys())
         if not symbols:
             return text
 
-        between, start, end, standalone = _make_standalone_patterns(symbols)
-        text = between.sub(r"\1 \2", text)
-        text = start.sub(r" \1", text)
-        text = end.sub(r"\1 ", text)
-        text = standalone.sub(" ", text)
+        singles = frozenset(s for s in symbols if len(s) == 1)
+
+        for sym in sorted(
+            (s for s in symbols if len(s) > 1),
+            key=len,
+            reverse=True,
+        ):
+            esc = re.escape(str(sym))
+            pat = re.compile(rf"\b{esc}\b", re.IGNORECASE)
+            cur_text = text
+
+            def repl(m: re.Match[str]) -> str:
+                if _currency_touching_digit(cur_text, m.start(), m.end()):
+                    return m.group(0)
+                return ""
+
+            text = pat.sub(repl, cur_text)
+
+        if singles:
+            between, start, end, standalone = _make_single_char_patterns(singles)
+            text = between.sub(r"\1 \2", text)
+            text = start.sub(r" \1", text)
+            text = end.sub(r"\1 ", text)
+            text = standalone.sub(" ", text)
+
         return text
diff --git a/normalization/steps/text/remove_symbols.py b/normalization/steps/text/remove_symbols.py
index 639a681..e31bbc1 100644
--- a/normalization/steps/text/remove_symbols.py
+++ b/normalization/steps/text/remove_symbols.py
@@ -1,3 +1,4 @@
+import re
 import unicodedata
 
 from normalization.constants.protectors import ProtectPlaceholder
@@ -12,13 +13,26 @@
 class RemoveSymbolsStep(TextStep):
     """Replace markers, symbols, and punctuation with spaces.
 
-    Preserves letters, digits, and all placeholder characters.
+    Preserves letters, digits, and all placeholder characters. When
+    ``symbols_to_words`` defines a word for ``%``, expands ``%`` only when it
+    follows a decimal or integer literal (e.g. ``8,75%``); any other ``%`` is
+    not expanded into a word.
     """
 
     name = "remove_symbols"
 
     def __call__(self, text: str, operators: LanguageOperators) -> str:
+        text = unicodedata.normalize("NFKC", text)
+        pct_word = operators.config.symbols_to_words.get("%")
+        if pct_word:
+            # Only expand ``%`` after numeric literals (e.g. 8,75%) so brand-style
+            # strings like ``Signal%%Mark`` are not given a spurious percent word.
+            text = re.sub(
+                rf"(\d+(?:[.,]\d+)?)\s*{re.escape('%')}",
+                rf"\1 {pct_word}",
+                text,
+            )
         return "".join(
             c if c in _KEEP_CHARS else " " if unicodedata.category(c)[0] in "MSP" else c
-            for c in unicodedata.normalize("NFKC", text)
+            for c in text
         )
diff --git a/normalization/steps/text/replace_currency.py b/normalization/steps/text/replace_currency.py
index 7e59cf5..f7d5ef5 100644
--- a/normalization/steps/text/replace_currency.py
+++ b/normalization/steps/text/replace_currency.py
@@ -8,17 +8,34 @@
 _CURRENCY_NUM = rf"\d+(?:{ProtectPlaceholder.DECIMAL_SEPARATOR.value}\d+)?"
 
 
-def _make_currency_patterns(symbol: str) -> tuple[re.Pattern, re.Pattern]:
+def _make_currency_patterns(
+    symbol: str,
+) -> tuple[re.Pattern[str], re.Pattern[str]]:
+    """Build the ``(before, after)`` patterns for one currency symbol.
+
+    ``before`` matches the symbol followed by an amount and ``after`` matches
+    an amount followed by the symbol; both capture the amount in group 1.
+    """
     escaped = re.escape(symbol)
+    # Alphanumeric codes (e.g. "kr") must not touch other letters, so "kr" inside
+    # "kroner" is skipped. ``\b`` is not used because digits are word characters
+    # too, which would stop glued forms such as "10kr" or "kr10" from matching.
+    if symbol.isalnum():
+        escaped = rf"(?<![^\W\d]){escaped}(?![^\W\d])"
     before = re.compile(rf"{escaped}\s*({_CURRENCY_NUM})", re.IGNORECASE)
     after = re.compile(rf"({_CURRENCY_NUM})\s*{escaped}", re.IGNORECASE)
     return before, after
 
 
 @register_step
 class ReplaceCurrencyStep(TextStep):
-    """
-    Replace currency symbols with their corresponding words.
+    """Replace currency symbols with their corresponding words next to amounts.
+
+    For each entry in ``operators.config.currency_symbol_to_word``, substitutes
+    the symbol before or after a numeric literal (including placeholder decimals).
+    Alphanumeric symbols (e.g. ``kr``) must not be adjacent to letters, so a
+    token like ``kroner`` is not treated as ``kr`` plus a suffix, while glued
+    amounts such as ``10kr`` are still replaced.
     """
 
     name = "replace_currency"
diff --git a/tests/e2e/files/gladia-3/no.csv b/tests/e2e/files/gladia-3/no.csv
new file mode 100644
index 0000000..4cb16f4
--- /dev/null
+++ b/tests/e2e/files/gladia-3/no.csv
@@ -0,0 +1,20 @@
+input,expected
+ti euro,10 euros
+2 < 5,2 mindre enn 5
+2 > 5,2 storre enn 5
+50°C,50 grader celsius
+Det koster €50,det koster 50 euros
+tjue fem kroner,25 kroner
+10 kr,10 kroner
+dom heter Anna,de heter anna
+hallo eh der,hallo der
+test@example.com,test krollalfa example punkt com
+www.example.com,w w w punkt example punkt com
+x = 5,x er lik med 5
+Ordet [inaudible] er her,ordet inaudible er her
+"1.234,56",1234 komma 56
+"3,14",3 komma 14
+192.168.1.1,192 punkt 168 punkt 1 punkt 1
+ping pong,ping pong
+"8,75%","8 komma 75 prosent"
+ok da,okei da
diff --git a/tests/unit/languages/norwegian_number_normalizer_test.py b/tests/unit/languages/norwegian_number_normalizer_test.py
new file mode 100644
index 0000000..461c6e7
--- /dev/null
+++ b/tests/unit/languages/norwegian_number_normalizer_test.py
@@ -0,0 +1,81 @@
+import pytest
+
+from normalization.languages.norwegian.number_normalizer import (
+    NorwegianNumberNormalizer,
+)
+from normalization.languages.norwegian.operators import NORWEGIAN_CONFIG
+
+
+@pytest.fixture
+def normalizer() -> NorwegianNumberNormalizer:
+    """Normalizer configured with the Norwegian currency symbol map."""
+    return NorwegianNumberNormalizer(NORWEGIAN_CONFIG.currency_symbol_to_word)
+
+
+@pytest.fixture
+def normalizer_no_currency() -> NorwegianNumberNormalizer:
+    """Normalizer built with ``None`` instead of a currency symbol map."""
+    return NorwegianNumberNormalizer(None)
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("tjue fem", "25"),
+        ("tjue og fem", "25"),
+        ("tjueen", "21"),
+        ("fem hundre femti", "550"),
+        ("fem hundre og femti", "550"),
+        ("en million", "1000000"),
+        ("tre milliarder", "3000000000"),
+        ("3 milliard", "3000000000"),
+        ("tjue tusen fem", "20005"),
+        ("tjue tusen og fem", "20005"),
+        ("null", "0"),
+        ("femten", "15"),
+    ],
+)
+def test_norwegian_spelled_numbers(
+    normalizer: NorwegianNumberNormalizer, text: str, expected: str
+) -> None:
+    """Spelled-out Norwegian numbers are converted to digit strings."""
+    assert normalizer(text) == expected
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("ti euro", "10 euros"),
+        ("hundre kroner", "100 kroner"),
+        ("€10", "10 euros"),
+        ("10 kr", "10 kroner"),
+        ("fem dollar", "5 dollars"),
+    ],
+)
+def test_currency_symbols_and_plural_trailing_words(
+    normalizer: NorwegianNumberNormalizer, text: str, expected: str
+) -> None:
+    """Currency symbols and trailing currency words are normalized."""
+    assert normalizer(text) == expected
+
+
+def test_without_currency_config_leaves_currency_symbol(
+    normalizer_no_currency: NorwegianNumberNormalizer,
+) -> None:
+    """Without a currency map, numbers still convert but ``€`` is left as is."""
+    assert normalizer_no_currency("tjue fem") == "25"
+    assert normalizer_no_currency("€10") == "€10"
+    assert normalizer_no_currency("3 milliard") == "3000000000"
+
+
+def test_non_numeric_text_unchanged(normalizer: NorwegianNumberNormalizer) -> None:
+    """Text without number words passes through untouched."""
+    text = "dette er vanlig tekst"
+    assert normalizer(text) == text
+
+
+def test_kroner_word_not_treated_as_currency_suffix(
+    normalizer: NorwegianNumberNormalizer,
+) -> None:
+    """An already expanded ``kroner`` after digits is not rewritten."""
+    assert normalizer("25 kroner") == "25 kroner"
diff --git a/tests/unit/languages/norwegian_operators_test.py b/tests/unit/languages/norwegian_operators_test.py
new file mode 100644
index 0000000..6a243dc
--- /dev/null
+++ b/tests/unit/languages/norwegian_operators_test.py
@@ -0,0 +1,34 @@
+import pytest
+
+from normalization.languages.norwegian.operators import NorwegianOperators
+from normalization.languages.registry import get_language_registry
+
+
+@pytest.fixture
+def operators() -> NorwegianOperators:
+    """Fresh ``NorwegianOperators`` instance."""
+    return NorwegianOperators()
+
+
+def test_norwegian_is_registered() -> None:
+    """The ``no`` code is present in the language registry."""
+    assert "no" in get_language_registry()
+
+
+def test_norwegian_registry_produces_norwegian_operators() -> None:
+    """Calling the ``no`` registry entry yields a ``NorwegianOperators``."""
+    instance = get_language_registry()["no"]()
+    assert isinstance(instance, NorwegianOperators)
+
+
+def test_config_code(operators: NorwegianOperators) -> None:
+    """The operators config reports the ``no`` language code."""
+    assert operators.config.code == "no"
+
+
+def test_word_replacements(operators: NorwegianOperators) -> None:
+    """Spot-check a few entries of the word replacement map."""
+    assert operators.get_word_replacements()["dom"] == "de"
+    assert operators.get_word_replacements()["ke"] == "ikke"
+    assert operators.get_word_replacements()["ok"] == "okei"
+    assert operators.get_word_replacements()["euro"] == "euros"
diff --git a/tests/unit/steps/text/convert_roman_numerals_to_digits_test.py b/tests/unit/steps/text/convert_roman_numerals_to_digits_test.py
index 1769ec9..840c551 100644
--- a/tests/unit/steps/text/convert_roman_numerals_to_digits_test.py
+++ b/tests/unit/steps/text/convert_roman_numerals_to_digits_test.py
@@ -1,4 +1,7 @@
+import pytest
+
 from normalization.languages.base import LanguageOperators
+from normalization.languages.norwegian.operators import NorwegianOperators
 from normalization.steps.text.convert_roman_numerals_to_digits import (
     ConvertRomanNumeralsToDigitsStep,
 )
@@ -70,3 +73,35 @@ def test_v_not_converted_when_preceded_by_digit(
     text = "12 v motor"
     converted_text = ConvertRomanNumeralsToDigitsStep()(text, operators)
     assert converted_text == "12 v motor"
+
+
+@pytest.fixture
+def uppercase_roman_operators() -> NorwegianOperators:
+    """Norwegian operators, used for the case-sensitive Roman numeral tests."""
+    return NorwegianOperators()
+
+
+def test_nordic_lowercase_vi_not_roman(
+    uppercase_roman_operators: NorwegianOperators,
+) -> None:
+    """Lowercase ``vi`` in a sentence is left unchanged."""
+    text = "Men vi vet ikke"
+    assert ConvertRomanNumeralsToDigitsStep()(text, uppercase_roman_operators) == text
+
+
+def test_nordic_title_case_vi_not_roman(
+    uppercase_roman_operators: NorwegianOperators,
+) -> None:
+    """Title-case ``Vi`` at the start of a sentence is left unchanged."""
+    text = "Vi skal se der"
+    assert ConvertRomanNumeralsToDigitsStep()(text, uppercase_roman_operators) == text
+
+
+def test_nordic_all_caps_vi_is_roman_six(
+    uppercase_roman_operators: NorwegianOperators,
+) -> None:
+    """All-caps ``VI`` is converted to ``6``."""
+    text = "KAPITEL VI i loven"
+    assert ConvertRomanNumeralsToDigitsStep()(text, uppercase_roman_operators) == (
+        "KAPITEL 6 i loven"
+    )
diff --git a/tests/unit/steps/text/expand_alphanumeric_codes_test.py b/tests/unit/steps/text/expand_alphanumeric_codes_test.py
new file mode 100644
index 0000000..6f0f83f
--- /dev/null
+++ b/tests/unit/steps/text/expand_alphanumeric_codes_test.py
@@ -0,0 +1,43 @@
+import pytest
+
+from normalization.languages.base import LanguageOperators
+from normalization.languages.norwegian.operators import NorwegianOperators
+from normalization.steps.text.expand_alphanumeric_codes import (
+    ExpandAlphanumericCodesStep,
+)
+
+from .conftest import assert_text_step_registered
+
+
+def test_step_is_registered() -> None:
+    """The step passes the shared ``assert_text_step_registered`` check."""
+    assert_text_step_registered(ExpandAlphanumericCodesStep)
+
+
+def test_pure_letter_all_caps_not_spaced_when_disabled(
+    nordic_acronym_operators: NorwegianOperators,
+) -> None:
+    """Norwegian operators leave all-caps letter-only tokens unspaced."""
+    step = ExpandAlphanumericCodesStep()
+    assert step("SMS til deg", nordic_acronym_operators) == "SMS til deg"
+    assert step("CNN", nordic_acronym_operators) == "CNN"
+
+
+def test_pure_letter_all_caps_spaced_when_enabled(
+    operators: LanguageOperators,
+) -> None:
+    """The shared ``operators`` fixture spaces out all-caps letter tokens."""
+    step = ExpandAlphanumericCodesStep()
+    assert step("CNN", operators) == "C N N"
+
+
+def test_mixed_alphanumeric_still_expanded(operators: LanguageOperators) -> None:
+    """Mixed letter/digit codes are split into single characters."""
+    step = ExpandAlphanumericCodesStep()
+    assert step("ABC123", operators) == "A B C 1 2 3"
+
+
+@pytest.fixture
+def nordic_acronym_operators() -> NorwegianOperators:
+    """Norwegian operators, used for the unspaced-acronym expectations."""
+    return NorwegianOperators()
diff --git a/tests/unit/steps/text/remove_standalone_currency_symbols_test.py b/tests/unit/steps/text/remove_standalone_currency_symbols_test.py
new file mode 100644
index 0000000..cd18b86
--- /dev/null
+++ b/tests/unit/steps/text/remove_standalone_currency_symbols_test.py
@@ -0,0 +1,33 @@
+from normalization.languages.norwegian.operators import NorwegianOperators
+from normalization.steps.text.remove_standalone_currency_symbols import (
+    RemoveStandaloneCurrencySymbolsStep,
+)
+
+from .conftest import assert_text_step_registered
+
+
+def test_step_is_registered() -> None:
+    """The step passes the shared ``assert_text_step_registered`` check."""
+    assert_text_step_registered(RemoveStandaloneCurrencySymbolsStep)
+
+
+def test_multi_char_kr_does_not_match_letters_inside_words() -> None:
+    """Words that merely contain the letters ``k`` or ``r`` are left untouched."""
+    ops = NorwegianOperators()
+    step = RemoveStandaloneCurrencySymbolsStep()
+    assert step("punkt", ops) == "punkt"
+    assert step("euros", ops) == "euros"
+
+
+def test_multi_char_kr_kept_when_touching_digit() -> None:
+    """A ``kr`` that follows an amount is not removed."""
+    ops = NorwegianOperators()
+    step = RemoveStandaloneCurrencySymbolsStep()
+    assert step("10 kr", ops) == "10 kr"
+
+
+def test_standalone_kr_token_removed_when_not_near_digits() -> None:
+    """A ``kr`` token with no amount nearby is removed, leaving both spaces."""
+    ops = NorwegianOperators()
+    step = RemoveStandaloneCurrencySymbolsStep()
+    assert step("pris er kr i dag", ops) == "pris er  i dag"
diff --git a/tests/unit/steps/text/remove_symbols_test.py b/tests/unit/steps/text/remove_symbols_test.py
new file mode 100644
index 0000000..aaf663c
--- /dev/null
+++ b/tests/unit/steps/text/remove_symbols_test.py
@@ -0,0 +1,34 @@
+from normalization.languages.base import LanguageOperators
+from normalization.languages.english import EnglishOperators
+from normalization.languages.norwegian.operators import NorwegianOperators
+from normalization.steps.text.remove_symbols import RemoveSymbolsStep
+
+from .conftest import assert_text_step_registered
+
+
+def test_step_is_registered() -> None:
+    """The step passes the shared ``assert_text_step_registered`` check."""
+    assert_text_step_registered(RemoveSymbolsStep)
+
+
+def test_percent_becomes_word_before_symbol_strip(
+    english_operators: EnglishOperators,
+) -> None:
+    """With English operators, ``%`` after a number becomes ``percent``."""
+    text = RemoveSymbolsStep()("8.75% done", english_operators)
+    assert "percent" in text
+    assert "%" not in text
+
+
+def test_percent_skipped_when_not_configured(operators: LanguageOperators) -> None:
+    """With the shared ``operators`` fixture, ``%`` is left in the text."""
+    text = RemoveSymbolsStep()("5%", operators)
+    assert "%" in text
+
+
+def test_percent_becomes_norwegian_word_after_numeric_literal() -> None:
+    """With Norwegian operators, ``%`` after ``8,75`` becomes ``prosent``."""
+    ops = NorwegianOperators()
+    text = RemoveSymbolsStep()("8,75% ferdig", ops)
+    assert "prosent" in text
+    assert "%" not in text
diff --git a/tests/unit/steps/text/replace_currency_kr_test.py b/tests/unit/steps/text/replace_currency_kr_test.py
new file mode 100644
index 0000000..d95b208
--- /dev/null
+++ b/tests/unit/steps/text/replace_currency_kr_test.py
@@ -0,0 +1,16 @@
+from normalization.languages.norwegian.operators import NorwegianOperators
+from normalization.steps.text.replace_currency import ReplaceCurrencyStep
+
+
+def test_kr_not_matched_inside_kroner() -> None:
+    """The ``kr`` code does not fire on the leading letters of ``kroner``."""
+    ops = NorwegianOperators()
+    step = ReplaceCurrencyStep()
+    assert step("10 kroner", ops) == "10 kroner"
+
+
+def test_kr_after_amount_still_replaced() -> None:
+    """A standalone ``kr`` after an amount becomes ``kroner``."""
+    ops = NorwegianOperators()
+    step = ReplaceCurrencyStep()
+    assert step("10 kr", ops) == "10 kroner"

From 0a7332c1438c7e813608b20285b48a615236f95c Mon Sep 17 00:00:00 2001
From: egenthon-cmd <egenthon@gladia.io>
Date: Mon, 4 May 2026 12:04:35 -0400
Subject: [PATCH 2/4] fix: affirmations and negation removed from filler words

---
 normalization/languages/norwegian/operators.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/normalization/languages/norwegian/operators.py b/normalization/languages/norwegian/operators.py
index 7a87278..f9b147a 100644
--- a/normalization/languages/norwegian/operators.py
+++ b/normalization/languages/norwegian/operators.py
@@ -59,13 +59,9 @@
         "liksom",
         "bare",
         "nå",
-        "ja",
-        "jo",
-        "nei",
         "a",
         "aa",
         "mmm",
-        "akkurat",
     ],
     digit_words=_NORWEGIAN_DIGIT_WORDS,
     number_words=[

From 3c84e61ffd6d41d580b021246644f40ac717ef19 Mon Sep 17 00:00:00 2001
From: karamouche <hugo4ibt@gmail.com>
Date: Tue, 5 May 2026 13:21:51 -0400
Subject: [PATCH 3/4] test: update Finnish and Swedish test for percentage
 formatting

---
 tests/e2e/files/gladia-3/fi.csv | 2 +-
 tests/e2e/files/gladia-3/sv.csv | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/files/gladia-3/fi.csv b/tests/e2e/files/gladia-3/fi.csv
index fdfc511..08733c6 100644
--- a/tests/e2e/files/gladia-3/fi.csv
+++ b/tests/e2e/files/gladia-3/fi.csv
@@ -12,6 +12,6 @@ x = 5,x yhta kuin 5
 juu ok,joo okei
 "3,14",3 pilkku 14
 "1.234,56",1234 pilkku 56
-"8,75%","8 pilkku 75%"
+"8,75%","8 pilkku 75 prosenttia"
 ping pong,ping pong
 tama on hyva,tama on hyva
diff --git a/tests/e2e/files/gladia-3/sv.csv b/tests/e2e/files/gladia-3/sv.csv
index ded9eb0..458a9f9 100644
--- a/tests/e2e/files/gladia-3/sv.csv
+++ b/tests/e2e/files/gladia-3/sv.csv
@@ -15,3 +15,4 @@ Het woord [inaudible] is hier,het woord inaudible is hier
 "3,14",3 komma 14
 192.168.1.1,192 punkt 168 punkt 1 punkt 1
 ping pong,ping pong
+"8,75%",8 komma 75 procent

From a0204f67246246bd331527945f37fa57b2633e05 Mon Sep 17 00:00:00 2001
From: karamouche <hugo4ibt@gmail.com>
Date: Tue, 5 May 2026 13:23:01 -0400
Subject: [PATCH 4/4] docs: update README to include Finnish and Norwegian
 languages in supported languages list

---
 README.md | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 724e1fd..25d7c84 100644
--- a/README.md
+++ b/README.md
@@ -110,15 +110,17 @@ Pipelines are defined declaratively in **YAML presets**. Each preset lists the s
 
 ## Supported languages
 
-| Code | Language |
-| ---- | -------- |
-| `en` | English  |
-| `fr` | French   |
-| `de` | German   |
-| `it` | Italian  |
-| `es` | Spanish  |
-| `nl` | Dutch    |
-| `sv` | Swedish  |
+| Code | Language  |
+| ---- | --------- |
+| `en` | English   |
+| `fr` | French    |
+| `de` | German    |
+| `it` | Italian   |
+| `es` | Spanish   |
+| `nl` | Dutch     |
+| `sv` | Swedish   |
+| `fi` | Finnish   |
+| `no` | Norwegian |
 
 Unsupported language codes fall back to a safe default that applies language-independent normalization only.