From b119c36e1e6be1e318ce7f570078cea4ffdf0400 Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Thu, 23 Apr 2026 12:04:37 +0200 Subject: [PATCH 1/4] feat: init norwegian language Made-with: Cursor --- docs/steps.md | 30 +- normalization/languages/__init__.py | 3 +- .../languages/base/language_config.py | 10 + normalization/languages/norwegian/__init__.py | 7 + .../languages/norwegian/number_normalizer.py | 428 ++++++++++++++++++ .../languages/norwegian/operators.py | 124 +++++ .../languages/norwegian/replacements.py | 14 + .../text/convert_roman_numerals_to_digits.py | 27 +- .../steps/text/expand_alphanumeric_codes.py | 12 +- .../remove_standalone_currency_symbols.py | 59 ++- normalization/steps/text/remove_symbols.py | 18 +- normalization/steps/text/replace_currency.py | 22 +- tests/e2e/files/gladia-3/no.csv | 20 + .../norwegian_number_normalizer_test.py | 74 +++ .../languages/norwegian_operators_test.py | 29 ++ .../convert_roman_numerals_to_digits_test.py | 31 ++ .../text/expand_alphanumeric_codes_test.py | 38 ++ ...remove_standalone_currency_symbols_test.py | 29 ++ tests/unit/steps/text/remove_symbols_test.py | 30 ++ .../steps/text/replace_currency_kr_test.py | 14 + 20 files changed, 990 insertions(+), 29 deletions(-) create mode 100644 normalization/languages/norwegian/__init__.py create mode 100644 normalization/languages/norwegian/number_normalizer.py create mode 100644 normalization/languages/norwegian/operators.py create mode 100644 normalization/languages/norwegian/replacements.py create mode 100644 tests/e2e/files/gladia-3/no.csv create mode 100644 tests/unit/languages/norwegian_number_normalizer_test.py create mode 100644 tests/unit/languages/norwegian_operators_test.py create mode 100644 tests/unit/steps/text/expand_alphanumeric_codes_test.py create mode 100644 tests/unit/steps/text/remove_standalone_currency_symbols_test.py create mode 100644 tests/unit/steps/text/remove_symbols_test.py create mode 100644 tests/unit/steps/text/replace_currency_kr_test.py diff --git a/docs/steps.md b/docs/steps.md index 5b5bb43..b3eef58 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -75,6 +75,10 @@ Runs before expand_alphanumeric_codes to prevent 'VIII' -> 'V I I I'. Only converts ii-ix to avoid false positives with single letters like 'I'. Skips 'v' when adjacent to digits (version-like contexts: v2, v 12). +When ``operators.config.roman_numerals_uppercase_only`` is True, multi-letter +numerals match only in ALL CAPS (so Swedish/Norwegian ``vi`` / ``Vi`` are not +read as 6). Standalone ``V`` still matches as 5 for titles like ``Louis V``. + ### `convert_word_based_time_patterns` **Base class:** `TextStep` @@ -86,14 +90,19 @@ operators.config.pm_word, operators.config.oclock_word, and operators.get_compound_minutes(). No-op when required config is None. +Regex patterns are compiled once per operators config instance and cached +on the step to avoid recompilation on every call. + ### `expand_alphanumeric_codes` **Base class:** `TextStep` Space out uppercase words and alphanumeric codes. -'ABC123' -> 'A B C 1 2 3', 'CNN' -> 'C N N'. -Skips pure numbers, ordinals (1st, 2nd), and protection markers. Must run before casefold_text. +'ABC123' -> 'A B C 1 2 3'. When ``operators.config.expand_all_caps_letter_by_letter`` +is False, pure letter ALL-CAPS tokens (e.g. SMS) are left intact for Nordic-style +acronym handling. Skips pure numbers, ordinals (1st, 2nd), and protection markers. +Must run before casefold_text. ### `expand_contractions` @@ -329,13 +338,21 @@ Handles ¤ markers by processing segments separately. Remove currency symbols that are not adjacent to numbers. +Single-character symbols use the classic between/start/end patterns (not +between two digits). Multi-character keys (e.g. ``kr``) are matched only as +whole tokens (``\b...\b``) and are skipped when a digit is nearby with +only whitespace in between, so ordinary words are not corrupted. + ### `remove_symbols` **Base class:** `TextStep` Replace markers, symbols, and punctuation with spaces. -Preserves letters, digits, and all placeholder characters. +Preserves letters, digits, and all placeholder characters. When +``symbols_to_words`` defines a word for ``%``, expands ``%`` only when it +follows a decimal or integer literal (e.g. ``8,75%``), so other ``%`` uses +stay unchanged. ### `remove_thousand_separators` @@ -376,7 +393,12 @@ No-op when either is None. **Base class:** `TextStep` -Replace currency symbols with their corresponding words. +Replace currency symbols with their corresponding words next to amounts. + +For each entry in ``operators.config.currency_symbol_to_word``, substitutes +the symbol before or after a numeric literal (including placeholder decimals). +Alphanumeric symbols (e.g. ``kr``) use word boundaries so a token like +``kroner`` is not treated as ``kr`` plus a suffix. ### `restore_decimal_separator_with_word` diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py index 905c82f..a05d033 100644 --- a/normalization/languages/__init__.py +++ b/normalization/languages/__init__.py @@ -1,4 +1,4 @@ -from . import dutch, english, french, german, italian, spanish +from . import dutch, english, french, german, italian, norwegian, spanish from .base import LanguageOperators from .registry import get_language_registry, register_language @@ -10,6 +10,7 @@ "french", "german", "italian", + "norwegian", "spanish", "get_language_registry", ] diff --git a/normalization/languages/base/language_config.py b/normalization/languages/base/language_config.py index b4a2495..3387cfc 100644 --- a/normalization/languages/base/language_config.py +++ b/normalization/languages/base/language_config.py @@ -75,6 +75,16 @@ class LanguageConfig: ordinal_suffixes: list[str] | None = None """Ordinal number suffixes for this language (e.g. ["st", "nd", "rd", "th"] for English). Used by steps that need to detect ordinal numbers. None = ordinal detection is skipped.""" + roman_numerals_uppercase_only: bool = False + """When True, only treat Roman numerals as digits if they appear in ALL CAPS (e.g. VI, VIII). + + Avoids collisions with Nordic pronouns spelled ``vi``/``Vi``. Default False preserves + legacy case-insensitive matching for other languages.""" + expand_all_caps_letter_by_letter: bool = True + """When False, pure letter ALL-CAPS tokens (e.g. SMS) are not spaced into letters. + + Nordic STT hypotheses often keep acronyms as one word; default True preserves + English-style letter-by-letter expansion for CAPS-only tokens.""" am_word: str | None = None """Canonical AM time designator (e.g. "am" for English). Used by am/pm time formatting steps. None = am/pm steps are skipped.""" diff --git a/normalization/languages/norwegian/__init__.py b/normalization/languages/norwegian/__init__.py new file mode 100644 index 0000000..2b87c45 --- /dev/null +++ b/normalization/languages/norwegian/__init__.py @@ -0,0 +1,7 @@ +from .operators import NorwegianOperators +from .replacements import NORWEGIAN_REPLACEMENTS + +__all__ = [ + "NorwegianOperators", + "NORWEGIAN_REPLACEMENTS", +] diff --git a/normalization/languages/norwegian/number_normalizer.py b/normalization/languages/norwegian/number_normalizer.py new file mode 100644 index 0000000..d1c1a67 --- /dev/null +++ b/normalization/languages/norwegian/number_normalizer.py @@ -0,0 +1,428 @@ +"""Norwegian (Bokmål) number normalizer (STT-oriented). + +``text2num.alpha2digit`` does not cover Norwegian well for transcript-style +cardinals, so this module mirrors the Swedish approach: 0–999, ``tusen`` +compounds, and large multipliers (``million``, ``milliard``, ``billion``). +Optionally rewrites currency symbols, then restores plural currency words from +config. Supports optional ``og`` between number parts (e.g. ``tjue og fem``). +""" + +from __future__ import annotations + +import re + + +def _fold(s: str) -> str: + return s.lower() + + +def _get(table: dict[str, int], word: str) -> int | None: + fw = _fold(word) + for k, v in table.items(): + if _fold(k) == fw: + return v + return None + + +def _skip_optional_og(words: list[str], j: int, n: int) -> int: + if j < n and _fold(words[j]) == "og": + return j + 1 + return j + + +_ONES_2_9: dict[str, int] = { + "to": 2, + "tre": 3, + "fire": 4, + "fem": 5, + "seks": 6, + "sju": 7, + "syv": 7, + "åtte": 8, + "atte": 8, + "ni": 9, +} + +_TEENS: dict[str, int] = { + "ti": 10, + "elleve": 11, + "tolv": 12, + "tretten": 13, + "fjorten": 14, + "femten": 15, + "seksten": 16, + "sytten": 17, + "atten": 18, + "nitten": 19, +} + +_TENS: dict[str, int] = { + "tjue": 20, + "tyve": 20, + "tretti": 30, + "førti": 40, + "forti": 40, + "femti": 50, + "seksti": 60, + "sytti": 70, + "åtti": 80, + "atti": 80, + "nitti": 90, +} + +_TENS_PREFIXES: tuple[tuple[str, int], ...] = tuple( + sorted(_TENS.items(), key=lambda kv: len(kv[0]), reverse=True) +) + +_ONES_AFTER_TENS: dict[str, int] = {"ett": 1, "en": 1, "ein": 1, **_ONES_2_9} + +_DIGIT_TO_NORWEGIAN: dict[str, str] = { + "0": "null", + "1": "en", + "2": "to", + "3": "tre", + "4": "fire", + "5": "fem", + "6": "seks", + "7": "sju", + "8": "åtte", + "9": "ni", +} + +_RE_MIXED_NUMBER = re.compile( + r"\b(\d+)\s+(" + r"million|millioner|milliard|milliarder|billion|billioner|tusen" + r")\b", + re.IGNORECASE, +) + +_BIG_MULT: dict[str, int] = { + "tusen": 1000, + "million": 1_000_000, + "millioner": 1_000_000, + "milliard": 1_000_000_000, + "milliarder": 1_000_000_000, + "billion": 1_000_000_000_000, + "billioner": 1_000_000_000_000, +} + + +def _normalize_mixed_numbers(text: str) -> str: + """Convert ``3 milliard`` → ``tre milliard`` so the word parser yields 3e9.""" + + def replace(match: re.Match[str]) -> str: + number = match.group(1) + multiplier = match.group(2) + if len(number) == 1 and number in _DIGIT_TO_NORWEGIAN: + return f"{_DIGIT_TO_NORWEGIAN[number]} {multiplier}" + return match.group(0) + + return _RE_MIXED_NUMBER.sub(replace, text) + + +def _singular_spoken_unit(trailing_word: str) -> str: + t = trailing_word.lower() + if t == "euros": + return "euro" + if t == "dollars": + return "dollar" + if t == "pounds": + return "pound" + if t == "kroner": + return "krone" + if t == "yens": + return "yen" + return trailing_word + + +def _normalize_currency_symbols( + text: str, + currency_symbol_to_word: dict[str, str] | None, +) -> str: + if not currency_symbol_to_word: + return text + num = r"\d+(?:[.,]\d+)?" + for symbol, trailing in currency_symbol_to_word.items(): + singular = _singular_spoken_unit(trailing) + esc = re.escape(symbol) + sym = rf"\b{esc}\b" if len(symbol) > 1 else esc + text = re.sub(rf"{sym}\s*({num})", rf"\1 {singular}", text, flags=re.IGNORECASE) + text = re.sub(rf"({num})\s*{sym}", rf"\1 {singular}", text, flags=re.IGNORECASE) + return text + + +def _currency_plural_fix_patterns( + currency_symbol_to_word: dict[str, str] | None, +) -> tuple[tuple[re.Pattern[str], str], ...]: + if not currency_symbol_to_word: + return () + amount = r"(\d+(?:[.,]\d+)?)" + seen: set[str] = set() + out: list[tuple[re.Pattern[str], str]] = [] + for _symbol, trailing in currency_symbol_to_word.items(): + tl = trailing.lower() + if tl in seen: + continue + seen.add(tl) + singular = _singular_spoken_unit(trailing) + if singular.lower() == tl: + continue + if tl == "euros": + pat = re.compile(rf"\b{amount}\s+euro(?:'s)?\b", re.IGNORECASE) + out.append((pat, rf"\1 {trailing}")) + elif tl == "kroner": + pat = re.compile(rf"\b{amount}\s+krone\b", re.IGNORECASE) + out.append((pat, rf"\1 {trailing}")) + else: + pat = re.compile( + rf"\b{amount}\s+{re.escape(singular)}\b", + re.IGNORECASE, + ) + out.append((pat, rf"\1 {trailing}")) + return tuple(out) + + +def _apply_currency_plural_fixes( + text: str, + fixers: tuple[tuple[re.Pattern[str], str], ...], +) -> str: + for pattern, repl in fixers: + text = pattern.sub(repl, text) + return text + + +def _hundred_multiplier(word: str) -> int | None: + if _fold(word) in ("en", "ett", "ein"): + return 1 + return _get(_ONES_2_9, word) + + +class NorwegianNumberNormalizer: + """Convert Norwegian spelled-out numbers to digits.""" + + def __init__(self, currency_symbol_to_word: dict[str, str] | None = None) -> None: + self._currency_symbol_to_word = currency_symbol_to_word + self._currency_plural_fixes = _currency_plural_fix_patterns( + currency_symbol_to_word, + ) + + def __call__(self, text: str) -> str: + if not text.strip(): + return text + text = _normalize_currency_symbols(text, self._currency_symbol_to_word) + text = _normalize_mixed_numbers(text) + words = text.split() + out: list[str] = [] + i = 0 + n = len(words) + while i < n: + parsed = self._parse_number(words, i, n) + if parsed is not None: + end, value = parsed + out.append(str(value)) + i = end + else: + out.append(words[i]) + i += 1 + text = " ".join(out) + text = _apply_currency_plural_fixes(text, self._currency_plural_fixes) + return text + + def _parse_number(self, words: list[str], i: int, n: int) -> tuple[int, int] | None: + if i >= n: + return None + + fw = _fold(words[i]) + + if fw == "tusen": + j = _skip_optional_og(words, i + 1, n) + tail = self._parse_number(words, j, n) + if tail is not None: + end, v2 = tail + return end, 1000 + v2 + return j, 1000 + + if i + 1 < n and fw in ("en", "ett", "ein") and _fold(words[i + 1]) == "tusen": + j = i + 2 + j = _skip_optional_og(words, j, n) + tail = self._parse_number(words, j, n) + base = 1000 + if tail is not None: + end, v2 = tail + return end, base + v2 + return j, base + + if ( + i + 1 < n + and fw in ("en", "ett", "ein") + and _fold(words[i + 1]) == "million" + ): + j = i + 2 + j = _skip_optional_og(words, j, n) + tail = self._parse_number(words, j, n) + base = 1_000_000 + if tail is not None: + end, v2 = tail + return end, base + v2 + return j, base + + if ( + i + 1 < n + and fw in ("en", "ett", "ein") + and _fold(words[i + 1]) + in ( + "milliard", + "milliarder", + ) + ): + j = i + 2 + j = _skip_optional_og(words, j, n) + tail = self._parse_number(words, j, n) + base = 1_000_000_000 + if tail is not None: + end, v2 = tail + return end, base + v2 + return j, base + + if ( + i + 1 < n + and fw in ("en", "ett", "ein") + and _fold(words[i + 1]) + in ( + "billion", + "billioner", + ) + ): + j = i + 2 + j = _skip_optional_og(words, j, n) + tail = self._parse_number(words, j, n) + base = 1_000_000_000_000 + if tail is not None: + end, v2 = tail + return end, base + v2 + return j, base + + sub999 = self._parse_0_999(words, i, n) + if sub999 is None: + return None + j, v = sub999 + if j >= n: + return j, v + + next_fw = _fold(words[j]) + if next_fw == "tusen": + j += 1 + j = _skip_optional_og(words, j, n) + prod = v * 1000 + if j >= n: + return j, prod + tail = self._parse_number(words, j, n) + if tail is not None: + end, v2 = tail + return end, prod + v2 + return j, prod + + mult = _BIG_MULT.get(next_fw) + if mult is not None and mult >= 1_000_000: + j += 1 + j = _skip_optional_og(words, j, n) + prod = v * mult + if j >= n: + return j, prod + tail = self._parse_number(words, j, n) + if tail is not None: + end, v2 = tail + return end, prod + v2 + return j, prod + + return j, v + + def _parse_0_999(self, words: list[str], i: int, n: int) -> tuple[int, int] | None: + if i >= n: + return None + + if _fold(words[i]) == "null": + if i + 1 < n and self._continues_number(words[i + 1]): + return None + return i + 1, 0 + + if _fold(words[i]) == "hundre": + j = _skip_optional_og(words, i + 1, n) + tail = self._parse_0_99(words, j, n) + if tail is not None: + je, tv = tail + return je, 100 + tv + return i + 1, 100 + + if i + 1 < n and _fold(words[i + 1]) == "hundre": + m = _hundred_multiplier(words[i]) + if m is None: + return None + base = m * 100 + j = i + 2 + j = _skip_optional_og(words, j, n) + tail = self._parse_0_99(words, j, n) + if tail is not None: + je, tv = tail + return je, base + tv + return j, base + + return self._parse_0_99(words, i, n) + + def _continues_number(self, word: str) -> bool: + fw = _fold(word) + if fw == "og": + return True + if fw == "hundre" or fw == "tusen": + return True + if fw in _BIG_MULT: + return True + if _get(_TEENS, word) is not None: + return True + if _get(_TENS, word) is not None: + return True + if _get(_ONES_2_9, word) is not None: + return True + if fw in ("en", "ett", "ein"): + return True + return False + + def _parse_0_99(self, words: list[str], i: int, n: int) -> tuple[int, int] | None: + if i >= n: + return None + + fw = _fold(words[i]) + + v = _get(_TEENS, words[i]) + if v is not None: + return i + 1, v + + for prefix, tval in _TENS_PREFIXES: + pl = len(prefix) + if fw.startswith(prefix) and len(fw) > pl: + rest = fw[pl:] + unit = _get(_ONES_AFTER_TENS, rest) + if unit is not None: + return i + 1, tval + unit + + tens = _get(_TENS, words[i]) + if tens is not None: + j = i + 1 + j = _skip_optional_og(words, j, n) + if j < n: + nfw = _fold(words[j]) + if nfw in ("ett", "en", "ein"): + return j + 1, tens + 1 + o = _get(_ONES_2_9, words[j]) + if o is not None: + return j + 1, tens + o + return i + 1, tens + + o = _get(_ONES_2_9, words[i]) + if o is not None: + return i + 1, o + + if fw in ("en", "ett", "ein"): + return None + + return None diff --git a/normalization/languages/norwegian/operators.py b/normalization/languages/norwegian/operators.py new file mode 100644 index 0000000..7a87278 --- /dev/null +++ b/normalization/languages/norwegian/operators.py @@ -0,0 +1,124 @@ +from normalization.languages.base import LanguageConfig, LanguageOperators +from normalization.languages.norwegian.number_normalizer import ( + NorwegianNumberNormalizer, +) +from normalization.languages.registry import register_language + +_NORWEGIAN_DIGIT_WORDS: dict[str, str] = { + "null": "0", + "en": "1", + "ett": "1", + "et": "1", + "ein": "1", + "to": "2", + "tre": "3", + "fire": "4", + "fem": "5", + "seks": "6", + "sju": "7", + "syv": "7", + "åtte": "8", + "ni": "9", +} + +NORWEGIAN_CONFIG = LanguageConfig( + code="no", + roman_numerals_uppercase_only=True, + expand_all_caps_letter_by_letter=False, + decimal_separator=",", + decimal_word="komma", + thousand_separator=" ", + symbols_to_words={ + "@": "krollalfa", + ".": "punkt", + "+": "plus", + "=": "er lik med", + ">": "storre enn", + "<": "mindre enn", + "°": "grader", + "°C": "grader celsius", + "°F": "grader fahrenheit", + "%": "prosent", + }, + currency_symbol_to_word={ + "€": "euros", + "$": "dollars", + "£": "pounds", + "¢": "cent", + "¥": "yens", + "kr": "kroner", + }, + filler_words=[ + "eh", + "øh", + "hm", + "hmm", + "mm", + "mhm", + "altså", + "liksom", + "bare", + "nå", + "ja", + "jo", + "nei", + "a", + "aa", + "mmm", + "akkurat", + ], + digit_words=_NORWEGIAN_DIGIT_WORDS, + number_words=[ + *_NORWEGIAN_DIGIT_WORDS, + "ti", + "elleve", + "tolv", + "tretten", + "fjorten", + "femten", + "seksten", + "sytten", + "atten", + "nitten", + "tjue", + "tyve", + "tretti", + "førti", + "forti", + "femti", + "seksti", + "sytti", + "åtti", + "atti", + "nitti", + "hundre", + "tusen", + "million", + "millioner", + "milliard", + "milliarder", + "billion", + "billioner", + ], + plus_word="plus", +) + + +@register_language +class NorwegianOperators(LanguageOperators): + def __init__(self) -> None: + super().__init__(NORWEGIAN_CONFIG) + self._number_normalizer = NorwegianNumberNormalizer( + NORWEGIAN_CONFIG.currency_symbol_to_word, + ) + + def expand_written_numbers(self, text: str) -> str: + """Convert Norwegian spelled-out numbers to digits (e.g. tjue fem → 25).""" + return self._number_normalizer(text) + + def get_word_replacements(self) -> dict[str, str]: + from normalization.languages.norwegian.replacements import ( + NORWEGIAN_REPLACEMENTS, + ) + + return NORWEGIAN_REPLACEMENTS diff --git a/normalization/languages/norwegian/replacements.py b/normalization/languages/norwegian/replacements.py new file mode 100644 index 0000000..3f850ae --- /dev/null +++ b/normalization/languages/norwegian/replacements.py @@ -0,0 +1,14 @@ +"""Colloquial / spelling variants → standard Bokmål (canonical for WER).""" + +NORWEGIAN_REPLACEMENTS: dict[str, str] = { + "dom": "de", + "ke": "ikke", + "kor": "hvor", + "ska": "skal", + "euro": "euros", + "krone": "kroner", + "ok": "okei", + "kreditkort": "kredittkort", + "kreditkortet": "kredittkortet", + "derre": "der", +} diff --git a/normalization/steps/text/convert_roman_numerals_to_digits.py b/normalization/steps/text/convert_roman_numerals_to_digits.py index 135cbf6..6f1062a 100644 --- a/normalization/steps/text/convert_roman_numerals_to_digits.py +++ b/normalization/steps/text/convert_roman_numerals_to_digits.py @@ -23,19 +23,34 @@ class ConvertRomanNumeralsToDigitsStep(TextStep): Runs before expand_alphanumeric_codes to prevent 'VIII' -> 'V I I I'. Only converts ii-ix to avoid false positives with single letters like 'I'. Skips 'v' when adjacent to digits (version-like contexts: v2, v 12). + + When ``operators.config.roman_numerals_uppercase_only`` is True, multi-letter + numerals match only in ALL CAPS (so Swedish/Norwegian ``vi`` / ``Vi`` are not + read as 6). Standalone ``V`` still matches as 5 for titles like ``Louis V``. """ name = "convert_roman_numerals_to_digits" def __call__(self, text: str, operators: LanguageOperators) -> str: + upper_only = operators.config.roman_numerals_uppercase_only for roman, arabic in _ROMAN_REPLACEMENTS.items(): if roman == "v": - text = re.sub( - r"(? 'A B C 1 2 3', 'CNN' -> 'C N N'. - Skips pure numbers, ordinals (1st, 2nd), and protection markers. Must run before casefold_text. + 'ABC123' -> 'A B C 1 2 3'. When ``operators.config.expand_all_caps_letter_by_letter`` + is False, pure letter ALL-CAPS tokens (e.g. SMS) are left intact for Nordic-style + acronym handling. Skips pure numbers, ordinals (1st, 2nd), and protection markers. + Must run before casefold_text. """ name = "expand_alphanumeric_codes" @@ -56,6 +58,12 @@ def _should_process(match: re.Match) -> str: return word has_digit = any(c.isdigit() for c in word) + if ( + not operators.config.expand_all_caps_letter_by_letter + and word.isupper() + and not has_digit + ): + return word if word.isupper() or has_digit: return _expand_word(match) diff --git a/normalization/steps/text/remove_standalone_currency_symbols.py b/normalization/steps/text/remove_standalone_currency_symbols.py index c6bd60e..937ad89 100644 --- a/normalization/steps/text/remove_standalone_currency_symbols.py +++ b/normalization/steps/text/remove_standalone_currency_symbols.py @@ -5,9 +5,24 @@ from normalization.steps.registry import register_step -def _make_standalone_patterns( +def _currency_touching_digit(text: str, start: int, end: int) -> bool: + """True if a digit is next to this span, allowing only whitespace in between.""" + i = start - 1 + while i >= 0 and text[i].isspace(): + i -= 1 + if i >= 0 and text[i].isdigit(): + return True + i = end + while i < len(text) and text[i].isspace(): + i += 1 + if i < len(text) and text[i].isdigit(): + return True + return False + + +def _make_single_char_patterns( symbols: frozenset[str], -) -> tuple[re.Pattern, re.Pattern, re.Pattern, re.Pattern]: +) -> tuple[re.Pattern[str], re.Pattern[str], re.Pattern[str], re.Pattern[str]]: char_class = "[" + re.escape("".join(symbols)) + "]" between = re.compile(rf"([^0-9]){char_class}([^0-9])") start = re.compile(rf"^{char_class}([^0-9])") @@ -18,18 +33,44 @@ def _make_standalone_patterns( @register_step class RemoveStandaloneCurrencySymbolsStep(TextStep): - """Remove currency symbols that are not adjacent to numbers.""" + """Remove currency symbols that are not adjacent to numbers. + + Single-character symbols use the classic between/start/end patterns (not + between two digits). Multi-character keys (e.g. ``kr``) are matched only as + whole tokens (``\\b...\\b``) and are skipped when a digit is nearby with + only whitespace in between, so ordinary words are not corrupted. + """ name = "remove_standalone_currency_symbols" def __call__(self, text: str, operators: LanguageOperators) -> str: - symbols = frozenset(operators.config.currency_symbol_to_word.keys()) + symbols = tuple(operators.config.currency_symbol_to_word.keys()) if not symbols: return text - between, start, end, standalone = _make_standalone_patterns(symbols) - text = between.sub(r"\1 \2", text) - text = start.sub(r" \1", text) - text = end.sub(r"\1 ", text) - text = standalone.sub(" ", text) + singles = frozenset(s for s in symbols if len(s) == 1) + + for sym in sorted( + (s for s in symbols if len(s) > 1), + key=len, + reverse=True, + ): + esc = re.escape(str(sym)) + pat = re.compile(rf"\b{esc}\b", re.IGNORECASE) + cur_text = text + + def repl(m: re.Match[str]) -> str: + if _currency_touching_digit(cur_text, m.start(), m.end()): + return m.group(0) + return "" + + text = pat.sub(repl, cur_text) + + if singles: + between, start, end, standalone = _make_single_char_patterns(singles) + text = between.sub(r"\1 \2", text) + text = start.sub(r" \1", text) + text = end.sub(r"\1 ", text) + text = standalone.sub(" ", text) + return text diff --git a/normalization/steps/text/remove_symbols.py b/normalization/steps/text/remove_symbols.py index 639a681..e31bbc1 100644 --- a/normalization/steps/text/remove_symbols.py +++ b/normalization/steps/text/remove_symbols.py @@ -1,3 +1,4 @@ +import re import unicodedata from normalization.constants.protectors import ProtectPlaceholder @@ -12,13 +13,26 @@ class RemoveSymbolsStep(TextStep): """Replace markers, symbols, and punctuation with spaces. - Preserves letters, digits, and all placeholder characters. + Preserves letters, digits, and all placeholder characters. When + ``symbols_to_words`` defines a word for ``%``, expands ``%`` only when it + follows a decimal or integer literal (e.g. ``8,75%``), so other ``%`` uses + stay unchanged. """ name = "remove_symbols" def __call__(self, text: str, operators: LanguageOperators) -> str: + text = unicodedata.normalize("NFKC", text) + pct_word = operators.config.symbols_to_words.get("%") + if pct_word: + # Only expand ``%`` after numeric literals (e.g. 8,75%) so brand-style + # strings like ``Signal%%Mark`` stay intact. + text = re.sub( + rf"(\d+(?:[.,]\d+)?)\s*{re.escape('%')}", + rf"\1 {pct_word}", + text, + ) return "".join( c if c in _KEEP_CHARS else " " if unicodedata.category(c)[0] in "MSP" else c - for c in unicodedata.normalize("NFKC", text) + for c in text ) diff --git a/normalization/steps/text/replace_currency.py b/normalization/steps/text/replace_currency.py index 7e59cf5..f7d5ef5 100644 --- a/normalization/steps/text/replace_currency.py +++ b/normalization/steps/text/replace_currency.py @@ -8,17 +8,29 @@ _CURRENCY_NUM = rf"\d+(?:{ProtectPlaceholder.DECIMAL_SEPARATOR.value}\d+)?" -def _make_currency_patterns(symbol: str) -> tuple[re.Pattern, re.Pattern]: +def _make_currency_patterns( + symbol: str, +) -> tuple[re.Pattern[str], re.Pattern[str]]: escaped = re.escape(symbol) - before = re.compile(rf"{escaped}\s*({_CURRENCY_NUM})", re.IGNORECASE) - after = re.compile(rf"({_CURRENCY_NUM})\s*{escaped}", re.IGNORECASE) + # Alphanumeric codes (e.g. "kr") must be whole tokens so we do not match + # "kr" inside "kroner" after another step has already expanded the amount. + if symbol.isalnum(): + before = re.compile(rf"\b{escaped}\b\s*({_CURRENCY_NUM})", re.IGNORECASE) + after = re.compile(rf"({_CURRENCY_NUM})\s*\b{escaped}\b", re.IGNORECASE) + else: + before = re.compile(rf"{escaped}\s*({_CURRENCY_NUM})", re.IGNORECASE) + after = re.compile(rf"({_CURRENCY_NUM})\s*{escaped}", re.IGNORECASE) return before, after @register_step class ReplaceCurrencyStep(TextStep): - """ - Replace currency symbols with their corresponding words. + """Replace currency symbols with their corresponding words next to amounts. + + For each entry in ``operators.config.currency_symbol_to_word``, substitutes + the symbol before or after a numeric literal (including placeholder decimals). + Alphanumeric symbols (e.g. ``kr``) use word boundaries so a token like + ``kroner`` is not treated as ``kr`` plus a suffix. """ name = "replace_currency" diff --git a/tests/e2e/files/gladia-3/no.csv b/tests/e2e/files/gladia-3/no.csv new file mode 100644 index 0000000..4cb16f4 --- /dev/null +++ b/tests/e2e/files/gladia-3/no.csv @@ -0,0 +1,20 @@ +input,expected +ti euro,10 euros +2 < 5,2 mindre enn 5 +2 > 5,2 storre enn 5 +50°C,50 grader celsius +Det koster €50,det koster 50 euros +tjue fem kroner,25 kroner +10 kr,10 kroner +dom heter Anna,de heter anna +hallo eh der,hallo der +test@example.com,test krollalfa example punkt com +www.example.com,w w w punkt example punkt com +x = 5,x er lik med 5 +Ordet [inaudible] er her,ordet inaudible er her +"1.234,56",1234 komma 56 +"3,14",3 komma 14 +192.168.1.1,192 punkt 168 punkt 1 punkt 1 +ping pong,ping pong +"8,75%","8 komma 75 prosent" +ok da,okei da diff --git a/tests/unit/languages/norwegian_number_normalizer_test.py b/tests/unit/languages/norwegian_number_normalizer_test.py new file mode 100644 index 0000000..461c6e7 --- /dev/null +++ b/tests/unit/languages/norwegian_number_normalizer_test.py @@ -0,0 +1,74 @@ +import pytest + +from normalization.languages.norwegian.number_normalizer import ( + NorwegianNumberNormalizer, +) +from normalization.languages.norwegian.operators import NORWEGIAN_CONFIG + + +@pytest.fixture +def normalizer() -> NorwegianNumberNormalizer: + return NorwegianNumberNormalizer(NORWEGIAN_CONFIG.currency_symbol_to_word) + + +@pytest.fixture +def normalizer_no_currency() -> NorwegianNumberNormalizer: + return NorwegianNumberNormalizer(None) + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("tjue fem", "25"), + ("tjue og fem", "25"), + ("tjueen", "21"), + ("fem hundre femti", "550"), + ("fem hundre og femti", "550"), + ("en million", "1000000"), + ("tre milliarder", "3000000000"), + ("3 milliard", "3000000000"), + ("tjue tusen fem", "20005"), + ("tjue tusen og fem", "20005"), + ("null", "0"), + ("femten", "15"), + ], +) +def test_norwegian_spelled_numbers( + normalizer: NorwegianNumberNormalizer, text: str, expected: str +) -> None: + assert normalizer(text) == expected + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("ti euro", "10 euros"), + ("hundre kroner", "100 kroner"), + ("€10", "10 euros"), + ("10 kr", "10 kroner"), + ("fem dollar", "5 dollars"), + ], +) +def test_currency_symbols_and_plural_trailing_words( + normalizer: NorwegianNumberNormalizer, text: str, expected: str +) -> None: + assert normalizer(text) == expected + + +def test_without_currency_config_leaves_currency_symbol( + normalizer_no_currency: NorwegianNumberNormalizer, +) -> None: + assert normalizer_no_currency("tjue fem") == "25" + assert normalizer_no_currency("€10") == "€10" + assert normalizer_no_currency("3 milliard") == "3000000000" + + +def test_non_numeric_text_unchanged(normalizer: NorwegianNumberNormalizer) -> None: + text = "dette er vanlig tekst" + assert normalizer(text) == text + + +def test_kroner_word_not_treated_as_currency_suffix( + normalizer: NorwegianNumberNormalizer, +) -> None: + assert normalizer("25 kroner") == "25 kroner" diff --git a/tests/unit/languages/norwegian_operators_test.py b/tests/unit/languages/norwegian_operators_test.py new file mode 100644 index 0000000..6a243dc --- /dev/null +++ b/tests/unit/languages/norwegian_operators_test.py @@ -0,0 +1,29 @@ +import pytest + +from normalization.languages.norwegian.operators import NorwegianOperators +from normalization.languages.registry import get_language_registry + + +@pytest.fixture +def operators() -> NorwegianOperators: + return NorwegianOperators() + + +def test_norwegian_is_registered() -> None: + assert "no" in get_language_registry() + + +def test_norwegian_registry_produces_norwegian_operators() -> None: + instance = get_language_registry()["no"]() + assert isinstance(instance, NorwegianOperators) + + +def test_config_code(operators: NorwegianOperators) -> None: + assert operators.config.code == "no" + + +def test_word_replacements(operators: NorwegianOperators) -> None: + assert operators.get_word_replacements()["dom"] == "de" + assert operators.get_word_replacements()["ke"] == "ikke" + assert operators.get_word_replacements()["ok"] == "okei" + assert operators.get_word_replacements()["euro"] == "euros" diff --git a/tests/unit/steps/text/convert_roman_numerals_to_digits_test.py b/tests/unit/steps/text/convert_roman_numerals_to_digits_test.py index 1769ec9..840c551 100644 --- a/tests/unit/steps/text/convert_roman_numerals_to_digits_test.py +++ b/tests/unit/steps/text/convert_roman_numerals_to_digits_test.py @@ -1,4 +1,7 @@ +import pytest + from normalization.languages.base import LanguageOperators +from normalization.languages.norwegian.operators import NorwegianOperators from normalization.steps.text.convert_roman_numerals_to_digits import ( ConvertRomanNumeralsToDigitsStep, ) @@ -70,3 +73,31 @@ def test_v_not_converted_when_preceded_by_digit( text = "12 v motor" converted_text = ConvertRomanNumeralsToDigitsStep()(text, operators) assert converted_text == "12 v motor" + + +@pytest.fixture +def uppercase_roman_operators() -> NorwegianOperators: + return NorwegianOperators() + + +def test_nordic_lowercase_vi_not_roman( + uppercase_roman_operators: NorwegianOperators, +) -> None: + text = "Men vi vet ikke" + assert ConvertRomanNumeralsToDigitsStep()(text, uppercase_roman_operators) == text + + +def test_nordic_title_case_vi_not_roman( + uppercase_roman_operators: NorwegianOperators, +) -> None: + text = "Vi skal se der" + assert ConvertRomanNumeralsToDigitsStep()(text, uppercase_roman_operators) == text + + +def test_nordic_all_caps_vi_is_roman_six( + uppercase_roman_operators: NorwegianOperators, +) -> None: + text = "KAPITEL VI i loven" + assert ConvertRomanNumeralsToDigitsStep()(text, uppercase_roman_operators) == ( + "KAPITEL 6 i loven" + ) diff --git a/tests/unit/steps/text/expand_alphanumeric_codes_test.py b/tests/unit/steps/text/expand_alphanumeric_codes_test.py new file mode 100644 index 0000000..6f0f83f --- /dev/null +++ b/tests/unit/steps/text/expand_alphanumeric_codes_test.py @@ -0,0 +1,38 @@ +import pytest + +from normalization.languages.base import LanguageOperators +from normalization.languages.norwegian.operators import NorwegianOperators +from normalization.steps.text.expand_alphanumeric_codes import ( + ExpandAlphanumericCodesStep, +) + +from .conftest import assert_text_step_registered + + +def test_step_is_registered() -> None: + assert_text_step_registered(ExpandAlphanumericCodesStep) + + +def test_pure_letter_all_caps_not_spaced_when_disabled( + nordic_acronym_operators: NorwegianOperators, +) -> None: + step = ExpandAlphanumericCodesStep() + assert step("SMS til deg", nordic_acronym_operators) == "SMS til deg" + assert step("CNN", nordic_acronym_operators) == "CNN" + + +def test_pure_letter_all_caps_spaced_when_enabled( + operators: LanguageOperators, +) -> None: + step = ExpandAlphanumericCodesStep() + assert step("CNN", operators) == "C N N" + + +def test_mixed_alphanumeric_still_expanded(operators: LanguageOperators) -> None: + step = ExpandAlphanumericCodesStep() + assert step("ABC123", operators) == "A B C 1 2 3" + + +@pytest.fixture +def nordic_acronym_operators() -> NorwegianOperators: + return NorwegianOperators() diff --git a/tests/unit/steps/text/remove_standalone_currency_symbols_test.py b/tests/unit/steps/text/remove_standalone_currency_symbols_test.py new file mode 100644 index 0000000..cd18b86 --- /dev/null +++ b/tests/unit/steps/text/remove_standalone_currency_symbols_test.py @@ -0,0 +1,29 @@ +from normalization.languages.norwegian.operators import NorwegianOperators +from normalization.steps.text.remove_standalone_currency_symbols import ( + RemoveStandaloneCurrencySymbolsStep, +) + +from .conftest import assert_text_step_registered + + +def test_step_is_registered() -> None: + assert_text_step_registered(RemoveStandaloneCurrencySymbolsStep) + + +def test_multi_char_kr_does_not_match_letters_inside_words() -> None: + ops = NorwegianOperators() + step = RemoveStandaloneCurrencySymbolsStep() + assert step("punkt", ops) == "punkt" + assert step("euros", ops) == "euros" + + +def test_multi_char_kr_kept_when_touching_digit() -> None: + ops = NorwegianOperators() + step = RemoveStandaloneCurrencySymbolsStep() + assert step("10 kr", ops) == "10 kr" + + +def test_standalone_kr_token_removed_when_not_near_digits() -> None: + ops = NorwegianOperators() + step = RemoveStandaloneCurrencySymbolsStep() + assert step("pris er kr i dag", ops) == "pris er i dag" diff --git a/tests/unit/steps/text/remove_symbols_test.py b/tests/unit/steps/text/remove_symbols_test.py new file mode 100644 index 0000000..aaf663c --- /dev/null +++ b/tests/unit/steps/text/remove_symbols_test.py @@ -0,0 +1,30 @@ +from normalization.languages.base import LanguageOperators +from normalization.languages.english import EnglishOperators +from normalization.languages.norwegian.operators import NorwegianOperators +from normalization.steps.text.remove_symbols import RemoveSymbolsStep + +from .conftest import assert_text_step_registered + + +def test_step_is_registered() -> None: + assert_text_step_registered(RemoveSymbolsStep) + + +def test_percent_becomes_word_before_symbol_strip( + english_operators: EnglishOperators, +) -> None: + text = RemoveSymbolsStep()("8.75% done", english_operators) + assert "percent" in text + assert "%" not in text + + +def test_percent_skipped_when_not_configured(operators: LanguageOperators) -> None: + text = RemoveSymbolsStep()("5%", operators) + assert "%" in text + + +def test_percent_becomes_norwegian_word_after_numeric_literal() -> None: + ops = NorwegianOperators() + text = RemoveSymbolsStep()("8,75% ferdig", ops) + assert "prosent" in text + assert "%" not in text diff --git a/tests/unit/steps/text/replace_currency_kr_test.py b/tests/unit/steps/text/replace_currency_kr_test.py new file mode 100644 index 0000000..d95b208 --- /dev/null +++ b/tests/unit/steps/text/replace_currency_kr_test.py @@ -0,0 +1,14 @@ +from normalization.languages.norwegian.operators import NorwegianOperators +from normalization.steps.text.replace_currency import ReplaceCurrencyStep + + +def test_kr_not_matched_inside_kroner() -> None: + ops = NorwegianOperators() + step = ReplaceCurrencyStep() + assert step("10 kroner", ops) == "10 kroner" + + +def test_kr_after_amount_still_replaced() -> None: + ops = NorwegianOperators() + step = ReplaceCurrencyStep() + assert step("10 kr", ops) == "10 kroner" From 0a7332c1438c7e813608b20285b48a615236f95c Mon Sep 17 00:00:00 2001 From: egenthon-cmd Date: Mon, 4 May 2026 12:04:35 -0400 Subject: [PATCH 2/4] fix: affirmations and negation removed from filler words --- normalization/languages/norwegian/operators.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/normalization/languages/norwegian/operators.py b/normalization/languages/norwegian/operators.py index 7a87278..f9b147a 100644 --- a/normalization/languages/norwegian/operators.py +++ b/normalization/languages/norwegian/operators.py @@ -59,13 +59,9 @@ "liksom", "bare", "nå", - "ja", - "jo", - "nei", "a", "aa", "mmm", - "akkurat", ], digit_words=_NORWEGIAN_DIGIT_WORDS, number_words=[ From 3c84e61ffd6d41d580b021246644f40ac717ef19 Mon Sep 17 00:00:00 2001 From: karamouche Date: Tue, 5 May 2026 13:21:51 -0400 Subject: [PATCH 3/4] test: update Finnish and Swedish test for percentage formatting --- tests/e2e/files/gladia-3/fi.csv | 2 +- tests/e2e/files/gladia-3/sv.csv | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/e2e/files/gladia-3/fi.csv b/tests/e2e/files/gladia-3/fi.csv index fdfc511..08733c6 100644 --- a/tests/e2e/files/gladia-3/fi.csv +++ b/tests/e2e/files/gladia-3/fi.csv @@ -12,6 +12,6 @@ x = 5,x yhta kuin 5 juu ok,joo okei "3,14",3 pilkku 14 "1.234,56",1234 pilkku 56 -"8,75%","8 pilkku 75%" +"8,75%","8 pilkku 75 prosenttia" ping pong,ping pong tama on hyva,tama on hyva diff --git a/tests/e2e/files/gladia-3/sv.csv b/tests/e2e/files/gladia-3/sv.csv index ded9eb0..458a9f9 100644 --- a/tests/e2e/files/gladia-3/sv.csv +++ b/tests/e2e/files/gladia-3/sv.csv @@ -15,3 +15,4 @@ Het woord [inaudible] is hier,het woord inaudible is hier "3,14",3 komma 14 192.168.1.1,192 punkt 168 punkt 1 punkt 1 ping pong,ping pong +"8,75%",8 komma 75 procent From a0204f67246246bd331527945f37fa57b2633e05 Mon Sep 17 00:00:00 2001 From: karamouche Date: Tue, 5 May 2026 13:23:01 -0400 Subject: [PATCH 4/4] docs: update README to include Finnish and Norwegian languages in supported languages list --- README.md | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 724e1fd..25d7c84 100644 --- a/README.md +++ b/README.md @@ -110,15 +110,17 @@ Pipelines are defined declaratively in **YAML presets**. Each preset lists the s ## Supported languages -| Code | Language | -| ---- | -------- | -| `en` | English | -| `fr` | French | -| `de` | German | -| `it` | Italian | -| `es` | Spanish | -| `nl` | Dutch | -| `sv` | Swedish | +| Code | Language | +| ---- | --------- | +| `en` | English | +| `fr` | French | +| `de` | German | +| `it` | Italian | +| `es` | Spanish | +| `nl` | Dutch | +| `sv` | Swedish | +| `fi` | Finnish | +| `no` | Norwegian | Unsupported language codes fall back to a safe default that applies language-independent normalization only.