diff --git a/README.md b/README.md index d7c0566..724e1fd 100644 --- a/README.md +++ b/README.md @@ -118,10 +118,11 @@ Pipelines are defined declaratively in **YAML presets**. Each preset lists the s | `it` | Italian | | `es` | Spanish | | `nl` | Dutch | +| `sv` | Swedish | Unsupported language codes fall back to a safe default that applies language-independent normalization only. -Adding a new language is self-contained — create a folder, register it with a decorator, done. See [Contributing](#adding-a-new-language). +Adding a new language is self-contained — create a folder, register it with a decorator, done. See [Contributing](CONTRIBUTING.md#add-support-for-a-new-language). ## Custom presets diff --git a/docs/contributing-guide.md b/docs/contributing-guide.md index 488f5ec..e271b87 100644 --- a/docs/contributing-guide.md +++ b/docs/contributing-guide.md @@ -169,8 +169,11 @@ tests/e2e/files/ default.csv de.csv en.csv + es.csv fr.csv it.csv + nl.csv + sv.csv ``` **CSV format** — two columns (`input,expected`), no quoting needed unless the value contains a comma: diff --git a/docs/steps.md b/docs/steps.md index 5b5bb43..727a854 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -86,6 +86,9 @@ operators.config.pm_word, operators.config.oclock_word, and operators.get_compound_minutes(). No-op when required config is None. +Regex patterns are compiled once per operators config instance and cached +on the step to avoid recompilation on every call. + ### `expand_alphanumeric_codes` **Base class:** `TextStep` @@ -329,6 +332,10 @@ Handles ¤ markers by processing segments separately. Remove currency symbols that are not adjacent to numbers. +Single-character symbols use the between/start/end patterns. Each +multi-character key (e.g. ``kr``) is stripped only when it appears as its own +token (``\b...\b``), so it is not confused with a substring inside a word. + ### `remove_symbols` **Base class:** `TextStep` @@ -376,7 +383,11 @@ No-op when either is None. 
"""Swedish number normalizer (STT-oriented).

``text2num.alpha2digit`` does not support Swedish, so this module implements
spelled-out cardinal parsing for common transcript patterns: 0–999 (plus
"teen hundreds" such as ``nitton hundra``), ``tusen`` compounds, and large
multipliers (``miljon``, ``miljard``, ``biljon``).

Optionally rewrites currency symbols like the Dutch normalizer, then restores
plural currency words from config.
"""

from __future__ import annotations

import re


def _fold(s: str) -> str:
    """Return *s* lower-cased; all lookup tables below use lower-case keys."""
    return s.lower()


def _get(table: dict[str, int], word: str) -> int | None:
    """Look up *word* case-insensitively in *table*.

    *table* must use lower-case keys (true for every table in this module),
    which makes this a single O(1) dictionary probe.
    """
    return table.get(_fold(word))


_ONES_2_9: dict[str, int] = {
    "två": 2,
    "tre": 3,
    "fyra": 4,
    "fem": 5,
    "sex": 6,
    "sju": 7,
    "åtta": 8,
    "atta": 8,
    "nio": 9,
}

_TEENS: dict[str, int] = {
    "tio": 10,
    "elva": 11,
    "tolv": 12,
    "tretton": 13,
    "fjorton": 14,
    "femton": 15,
    "sexton": 16,
    "sjutton": 17,
    "arton": 18,
    "aderton": 18,
    "nitton": 19,
}

_TENS: dict[str, int] = {
    "tjugo": 20,
    "trettio": 30,
    "fyrtio": 40,
    "femtio": 50,
    "sextio": 60,
    "sjuttio": 70,
    "åttio": 80,
    "attio": 80,
    "nittio": 90,
}

_TENS_PREFIXES: tuple[tuple[str, int], ...] = tuple(_TENS.items())

# "en"/"ett" only count as 1 in specific positions (after a tens word, or in
# front of "hundra"/a multiplier); on their own they are left untouched.
_ONE_WORDS: frozenset[str] = frozenset({"en", "ett"})

_ONES_AFTER_TENS: dict[str, int] = {"ett": 1, "en": 1, **_ONES_2_9}

# Words accepted directly in front of "hundra": 1–9 and 10–19
# ("nitton hundra" = 1900).
_HUNDRED_MULTIPLIERS: dict[str, int] = {**_ONES_AFTER_TENS, **_TEENS}

_DIGIT_TO_SWEDISH: dict[str, str] = {
    "0": "noll",
    "1": "ett",
    "2": "två",
    "3": "tre",
    "4": "fyra",
    "5": "fem",
    "6": "sex",
    "7": "sju",
    "8": "åtta",
    "9": "nio",
}

# A lone digit 1–9 followed by a multiplier word.  The look-behind rejects
# digits that belong to a longer number or a decimal ("1,5 miljoner",
# "13 tusen"); "0" is excluded because "noll <multiplier>" is never parsed
# back into digits.
_RE_MIXED_NUMBER = re.compile(
    r"(?<![\d.,])\b([1-9])\s+("
    r"miljon|miljoner|miljard|miljarder|biljon|biljoner|tusen"
    r")\b",
    re.IGNORECASE,
)

_BIG_MULT: dict[str, int] = {
    "tusen": 1000,
    "miljon": 1_000_000,
    "miljoner": 1_000_000,
    "miljard": 1_000_000_000,
    "miljarder": 1_000_000_000,
    "biljon": 1_000_000_000_000,
    "biljoner": 1_000_000_000_000,
}

# Words after which "noll" is not treated as a standalone zero.
_NUMBER_CONTINUATIONS: frozenset[str] = frozenset(
    {"hundra", *_ONE_WORDS, *_BIG_MULT, *_TEENS, *_TENS, *_ONES_2_9},
)

# Plural currency word (as configured) -> singular spoken form.
_PLURAL_TO_SINGULAR: dict[str, str] = {
    "euros": "euro",
    "dollars": "dollar",
    "pounds": "pound",
    "kronor": "krona",
    "yens": "yen",
}

# Digit tokens longer than this are never used as a multiplier head; keeps
# ``int()`` away from pathological inputs.
_MAX_DIGIT_HEAD_LEN = 12


def _normalize_mixed_numbers(text: str) -> str:
    """Convert ``3 miljard`` → ``tre miljard`` so the word parser yields 3e9.

    Only a free-standing single digit 1–9 is rewritten.  Digits that are part
    of a decimal or a longer number are left alone (multi-digit heads are
    handled by the parser itself).
    """

    def replace(match: re.Match[str]) -> str:
        return f"{_DIGIT_TO_SWEDISH[match.group(1)]} {match.group(2)}"

    return _RE_MIXED_NUMBER.sub(replace, text)


def _singular_spoken_unit(trailing_word: str) -> str:
    """Return the singular form of a known plural currency word.

    Unknown words are returned unchanged (original casing preserved).
    """
    return _PLURAL_TO_SINGULAR.get(trailing_word.lower(), trailing_word)


def _symbol_pattern(symbol: str) -> str:
    """Return a regex fragment matching *symbol* as a currency marker.

    Single characters are matched literally.  Multi-character symbols (e.g.
    ``kr``) must not touch a letter on either side, so ``kronor`` is never
    read as ``kr`` + ``onor``, while ``100kr`` and ``kr100`` still match
    (``\\b`` would reject those because digits are word characters too).
    """
    escaped = re.escape(symbol)
    if len(symbol) > 1:
        return rf"(?<![^\W\d_]){escaped}(?![^\W\d_])"
    return escaped


def _normalize_currency_symbols(
    text: str,
    currency_symbol_to_word: dict[str, str] | None,
) -> str:
    """Rewrite ``<symbol><amount>`` / ``<amount><symbol>`` as ``<amount> <unit>``.

    The unit written is the *singular* spoken form; the plural configured in
    *currency_symbol_to_word* is restored later by the plural fixers.
    Returns *text* unchanged when no mapping is configured.
    """
    if not currency_symbol_to_word:
        return text
    num = r"\d+(?:[.,]\d+)?"
    for symbol, trailing in currency_symbol_to_word.items():
        singular = _singular_spoken_unit(trailing)
        sym = _symbol_pattern(symbol)

        # A callable replacement keeps the unit literal (no template escapes).
        def repl(match: re.Match[str], unit: str = singular) -> str:
            return f"{match.group(1)} {unit}"

        text = re.sub(rf"{sym}\s*({num})", repl, text, flags=re.IGNORECASE)
        text = re.sub(rf"({num})\s*{sym}", repl, text, flags=re.IGNORECASE)
    return text


def _currency_plural_fix_patterns(
    currency_symbol_to_word: dict[str, str] | None,
) -> tuple[tuple[re.Pattern[str], str], ...]:
    """Build ``(pattern, replacement)`` pairs turning ``<amount> <singular>``
    into ``<amount> <configured plural>``.

    One pair is produced per distinct configured word whose singular differs
    from it; words without a known singular (e.g. ``cent``) yield nothing.
    """
    if not currency_symbol_to_word:
        return ()
    amount = r"(\d+(?:[.,]\d+)?)"
    seen: set[str] = set()
    out: list[tuple[re.Pattern[str], str]] = []
    for trailing in currency_symbol_to_word.values():
        lowered = trailing.lower()
        if lowered in seen:
            continue
        seen.add(lowered)
        singular = _singular_spoken_unit(trailing)
        if singular.lower() == lowered:
            continue
        # "euro's" is additionally accepted for euros; every other unit is
        # matched on its plain singular form.
        unit = r"euro(?:'s)?" if lowered == "euros" else re.escape(singular)
        pattern = re.compile(rf"\b{amount}\s+{unit}\b", re.IGNORECASE)
        out.append((pattern, rf"\1 {trailing}"))
    return tuple(out)


def _apply_currency_plural_fixes(
    text: str,
    fixers: tuple[tuple[re.Pattern[str], str], ...],
) -> str:
    """Apply every ``(pattern, replacement)`` pair in *fixers* to *text*, in order."""
    for pattern, repl in fixers:
        text = pattern.sub(repl, text)
    return text


def _hundred_multiplier(word: str) -> int | None:
    """Return the factor *word* contributes in front of ``hundra``.

    Accepts ``en``/``ett`` (1), 2–9 and the teens 10–19; ``None`` otherwise.
    """
    return _HUNDRED_MULTIPLIERS.get(_fold(word))


class SwedishNumberNormalizer:
    """Convert Swedish spelled-out numbers to digits.

    Calling the instance with a string returns the string with number words
    replaced by digits.  Tokens are split on whitespace and re-joined with a
    single space, so runs of whitespace are collapsed.
    """

    def __init__(self, currency_symbol_to_word: dict[str, str] | None = None) -> None:
        """Store the currency mapping and pre-compile the plural fixers.

        Args:
            currency_symbol_to_word: Maps a currency symbol to its (plural)
                spoken word, or ``None`` to disable currency handling.
        """
        self._currency_symbol_to_word = currency_symbol_to_word
        self._currency_plural_fixes = _currency_plural_fix_patterns(
            currency_symbol_to_word,
        )

    def __call__(self, text: str) -> str:
        """Return *text* with Swedish number words converted to digits."""
        if not text.strip():
            return text
        text = _normalize_currency_symbols(text, self._currency_symbol_to_word)
        text = _normalize_mixed_numbers(text)
        words = text.split()
        out: list[str] = []
        i = 0
        n = len(words)
        while i < n:
            parsed = self._parse_number(words, i, n)
            if parsed is None:
                out.append(words[i])
                i += 1
            else:
                i, value = parsed
                out.append(str(value))
        text = " ".join(out)
        return _apply_currency_plural_fixes(text, self._currency_plural_fixes)

    def _parse_number(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
        """Parse the longest number starting at ``words[i]``.

        Returns ``(end, value)`` where ``end`` is the index of the first
        unconsumed word, or ``None`` when ``words[i]`` does not start a number.
        """
        if i >= n:
            return None

        fw = _fold(words[i])

        # A bare "tusen" means one thousand.
        if fw == "tusen":
            return self._add_tail(words, i + 1, n, 1000, 1000)

        # "en"/"ett" is only a number when it directly precedes a multiplier.
        if fw in _ONE_WORDS and i + 1 < n:
            mult = _BIG_MULT.get(_fold(words[i + 1]))
            if mult is not None:
                return self._add_tail(words, i + 2, n, mult, mult)

        head = self._parse_0_999(words, i, n)
        if head is None:
            head = self._parse_digit_head(words, i, n)
        if head is None:
            return None

        j, value = head
        if j < n:
            mult = _BIG_MULT.get(_fold(words[j]))
            if mult is not None:
                return self._add_tail(words, j + 1, n, value * mult, mult)
        return j, value

    def _add_tail(
        self,
        words: list[str],
        start: int,
        n: int,
        base: int,
        limit: int,
    ) -> tuple[int, int]:
        """Add the number following a multiplier to *base*, if it belongs to it.

        The remainder is only merged when it is smaller than *limit* (the
        multiplier just consumed).  Otherwise it is a separate number:
        ``två tusen tre tusen`` is 2000 and 3000, not 5000.
        """
        tail = self._parse_number(words, start, n)
        if tail is not None and tail[1] < limit:
            return tail[0], base + tail[1]
        return start, base

    def _parse_digit_head(
        self,
        words: list[str],
        i: int,
        n: int,
    ) -> tuple[int, int] | None:
        """Accept a plain digit token as the head of ``<digits> <multiplier>``.

        ``12 miljoner`` then parses like ``tolv miljoner``.  Digit tokens that
        are not directly followed by a multiplier word are not numbers to
        rewrite and yield ``None``; so does a zero head.
        """
        token = words[i]
        if not (token.isascii() and token.isdecimal()):
            return None
        if len(token) > _MAX_DIGIT_HEAD_LEN:
            return None
        if i + 1 >= n or _fold(words[i + 1]) not in _BIG_MULT:
            return None
        value = int(token)
        if value == 0:
            return None
        return i + 1, value

    def _parse_0_999(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
        """Parse a number below one thousand (or a teen-hundred up to 1999).

        Returns ``(end, value)`` or ``None`` when ``words[i]`` starts no such
        number.
        """
        if i >= n:
            return None

        fw = _fold(words[i])

        if fw == "noll":
            # "noll" directly followed by another number word is left as text.
            # NOTE(review): presumably to avoid reading "noll fem" as one
            # number — confirm the intended output for digit sequences.
            if i + 1 < n and self._continues_number(words[i + 1]):
                return None
            return i + 1, 0

        if fw == "hundra":
            base, j = 100, i + 1
        elif i + 1 < n and _fold(words[i + 1]) == "hundra":
            factor = _hundred_multiplier(words[i])
            if factor is None:
                # Not a valid hundreds factor: still convert the word itself
                # when it is a number ("tjugo hundra" -> "20 100").
                return self._parse_0_99(words, i, n)
            base, j = factor * 100, i + 2
        else:
            return self._parse_0_99(words, i, n)

        tail = self._parse_0_99(words, j, n)
        if tail is not None:
            end, below_hundred = tail
            return end, base + below_hundred
        return j, base

    def _continues_number(self, word: str) -> bool:
        """Return True when *word* is a number word that can follow another."""
        return _fold(word) in _NUMBER_CONTINUATIONS

    def _parse_0_99(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
        """Parse a number in 2–99 starting at ``words[i]``.

        Handles teens, fused compounds (``tjugoett``), spaced compounds
        (``tjugo fem``), bare tens and bare units 2–9.  A bare ``en``/``ett``
        is deliberately not a number here and yields ``None``.
        """
        if i >= n:
            return None

        fw = _fold(words[i])

        teen = _TEENS.get(fw)
        if teen is not None:
            return i + 1, teen

        # Fused compound such as "tjugoett" / "trettiofem".
        for prefix, tens_value in _TENS_PREFIXES:
            if fw.startswith(prefix) and len(fw) > len(prefix):
                unit = _ONES_AFTER_TENS.get(fw[len(prefix):])
                if unit is not None:
                    return i + 1, tens_value + unit

        tens = _TENS.get(fw)
        if tens is not None:
            if i + 1 < n:
                unit = _ONES_AFTER_TENS.get(_fold(words[i + 1]))
                if unit is not None:
                    return i + 2, tens + unit
            return i + 1, tens

        ones = _ONES_2_9.get(fw)
        if ones is not None:
            return i + 1, ones

        return None
"asså", + "alltså", + "liksom", + "typ", + "ba", + "va", + "nå", + ], + digit_words=_SWEDISH_DIGIT_WORDS, + number_words=[ + *_SWEDISH_DIGIT_WORDS, + "tio", + "elva", + "tolv", + "tretton", + "fjorton", + "femton", + "sexton", + "sjutton", + "arton", + "aderton", + "nitton", + "tjugo", + "trettio", + "fyrtio", + "femtio", + "sextio", + "sjuttio", + "åttio", + "attio", + "nittio", + "hundra", + "tusen", + "miljon", + "miljoner", + "miljard", + "miljarder", + "biljon", + "biljoner", + ], + plus_word="plus", +) + + +@register_language +class SwedishOperators(LanguageOperators): + def __init__(self) -> None: + super().__init__(SWEDISH_CONFIG) + self._number_normalizer = SwedishNumberNormalizer( + SWEDISH_CONFIG.currency_symbol_to_word, + ) + + def expand_written_numbers(self, text: str) -> str: + """Convert Swedish spelled-out numbers to digits (e.g. tjugo fem → 25).""" + return self._number_normalizer(text) + + def get_word_replacements(self) -> dict[str, str]: + from normalization.languages.swedish.replacements import SWEDISH_REPLACEMENTS + + return SWEDISH_REPLACEMENTS diff --git a/normalization/languages/swedish/replacements.py b/normalization/languages/swedish/replacements.py new file mode 100644 index 0000000..962dd96 --- /dev/null +++ b/normalization/languages/swedish/replacements.py @@ -0,0 +1,14 @@ +"""Single-token colloquial / spelling variants → standard Swedish (canonical for WER).""" + +SWEDISH_REPLACEMENTS: dict[str, str] = { + "mej": "mig", + "dej": "dig", + "dom": "de", + "nåt": "något", + "nånting": "någonting", + "sån": "sådan", + "sånt": "sådant", + "såna": "sådana", + "euro": "euros", + "krona": "kronor", +} diff --git a/normalization/steps/text/remove_standalone_currency_symbols.py b/normalization/steps/text/remove_standalone_currency_symbols.py index c6bd60e..71fd17d 100644 --- a/normalization/steps/text/remove_standalone_currency_symbols.py +++ b/normalization/steps/text/remove_standalone_currency_symbols.py @@ -7,7 +7,13 @@ def 
def _strip_standalone_multi_char_symbol(text: str, symbol: str) -> str:
    """Replace every standalone occurrence of ``symbol`` in ``text`` with a space.

    An occurrence is standalone when it touches no word character (letter,
    digit or underscore) on either side.  ``kr`` is therefore kept inside
    ``kronor`` and in ``10kr``, and removed in ``pris i kr idag``.

    Look-arounds are used instead of ``\\b`` because ``\\b`` next to a
    non-word character requires a word character on the other side.  A symbol
    such as ``R$`` followed by a space or the end of the text would never
    match with ``\\b``.  For symbols that start and end with a word character
    the two forms are equivalent.  Matching is case-insensitive.

    Args:
        text: Text to clean.
        symbol: Multi-character currency symbol to strip.

    Returns:
        ``text`` with each standalone ``symbol`` replaced by one space.
    """
    esc = re.escape(symbol)
    return re.sub(rf"(?<!\w){esc}(?!\w)", " ", text, flags=re.IGNORECASE)
def _make_currency_patterns(
    symbol: str,
) -> tuple[re.Pattern[str], re.Pattern[str]]:
    """Compile the ``symbol amount`` and ``amount symbol`` matchers for ``symbol``.

    Single-character symbols are matched literally.  A multi-character symbol
    (e.g. ``kr``) must not touch a *letter* on either side, so an amount
    already written as ``25 kronor`` is not parsed as ``25 kr`` + ``onor``.
    Digits are allowed to touch the symbol, which ``\\b`` would forbid:
    ``100kr`` and ``kr100`` match, and so does a symbol ending in a non-word
    character followed by a space (``R$ 10``).

    Args:
        symbol: Currency symbol as configured for the language.

    Returns:
        ``(before, after)``: ``before`` matches the symbol followed by an
        amount, ``after`` matches an amount followed by the symbol.  Both
        capture the amount as group 1 and are case-insensitive.
    """
    escaped = re.escape(symbol)
    if len(symbol) > 1:
        # [^\W\d_] is "any letter": a word character that is neither a digit
        # nor an underscore.
        sym = rf"(?<![^\W\d_]){escaped}(?![^\W\d_])"
    else:
        sym = escaped
    before = re.compile(rf"{sym}\s*({_CURRENCY_NUM})", re.IGNORECASE)
    after = re.compile(rf"({_CURRENCY_NUM})\s*{sym}", re.IGNORECASE)
    return before, after
Multi-character symbols + (e.g. ``kr``) are matched with word boundaries so amounts already written as + ``… kronor`` are not parsed as ``… kr`` + ``onor``. """ name = "replace_currency" diff --git a/tests/e2e/files/gladia-3/sv.csv b/tests/e2e/files/gladia-3/sv.csv new file mode 100644 index 0000000..ded9eb0 --- /dev/null +++ b/tests/e2e/files/gladia-3/sv.csv @@ -0,0 +1,17 @@ +input,expected +tio euro,10 euros +2 < 5,2 mindre an 5 +50°C,50 grader celsius +Det kostar €50,det kostar 50 euros +tjugo fem kronor,25 kronor +10 kr,10 kronor +mej heter Anna,mig heter anna +hallo eh där,hallo dar +test@example.com,test snabel a example punkt com +www.example.com,w w w punkt example punkt com +x = 5,x ar lika med 5 +Het woord [inaudible] is hier,het woord inaudible is hier +"1.234,56",1234 komma 56 +"3,14",3 komma 14 +192.168.1.1,192 punkt 168 punkt 1 punkt 1 +ping pong,ping pong diff --git a/tests/unit/languages/swedish_number_normalizer_test.py b/tests/unit/languages/swedish_number_normalizer_test.py new file mode 100644 index 0000000..b05477d --- /dev/null +++ b/tests/unit/languages/swedish_number_normalizer_test.py @@ -0,0 +1,69 @@ +import pytest + +from normalization.languages.swedish.number_normalizer import SwedishNumberNormalizer +from normalization.languages.swedish.operators import SWEDISH_CONFIG + + +@pytest.fixture +def normalizer() -> SwedishNumberNormalizer: + return SwedishNumberNormalizer(SWEDISH_CONFIG.currency_symbol_to_word) + + +@pytest.fixture +def normalizer_no_currency() -> SwedishNumberNormalizer: + return SwedishNumberNormalizer(None) + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("tjugo fem", "25"), + ("tjugoett", "21"), + ("tre hundra femtio", "350"), + ("en miljon", "1000000"), + ("tre miljarder", "3000000000"), + ("3 miljard", "3000000000"), + ("tjugo tusen fem", "20005"), + ("noll", "0"), + ("femton", "15"), + ], +) +def test_swedish_spelled_numbers( + normalizer: SwedishNumberNormalizer, text: str, expected: str +) -> None: + 
assert normalizer(text) == expected + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("tio euro", "10 euros"), + ("hundra kronor", "100 kronor"), + ("€10", "10 euros"), + ("10 kr", "10 kronor"), + ("fem dollar", "5 dollars"), + ], +) +def test_currency_symbols_and_plural_trailing_words( + normalizer: SwedishNumberNormalizer, text: str, expected: str +) -> None: + assert normalizer(text) == expected + + +def test_without_currency_config_leaves_currency_symbol( + normalizer_no_currency: SwedishNumberNormalizer, +) -> None: + assert normalizer_no_currency("tjugo fem") == "25" + assert normalizer_no_currency("€10") == "€10" + assert normalizer_no_currency("3 miljard") == "3000000000" + + +def test_non_numeric_text_unchanged(normalizer: SwedishNumberNormalizer) -> None: + text = "det här är vanlig text" + assert normalizer(text) == text + + +def test_kronor_word_not_treated_as_currency_suffix( + normalizer: SwedishNumberNormalizer, +) -> None: + assert normalizer("25 kronor") == "25 kronor" diff --git a/tests/unit/languages/swedish_operators_test.py b/tests/unit/languages/swedish_operators_test.py new file mode 100644 index 0000000..0807326 --- /dev/null +++ b/tests/unit/languages/swedish_operators_test.py @@ -0,0 +1,28 @@ +import pytest + +from normalization.languages.registry import get_language_registry +from normalization.languages.swedish.operators import SwedishOperators + + +@pytest.fixture +def operators() -> SwedishOperators: + return SwedishOperators() + + +def test_swedish_is_registered() -> None: + assert "sv" in get_language_registry() + + +def test_swedish_registry_produces_swedish_operators() -> None: + instance = get_language_registry()["sv"]() + assert isinstance(instance, SwedishOperators) + + +def test_config_code(operators: SwedishOperators) -> None: + assert operators.config.code == "sv" + + +def test_word_replacements(operators: SwedishOperators) -> None: + assert operators.get_word_replacements()["mej"] == "mig" + assert 
operators.get_word_replacements()["dom"] == "de" + assert operators.get_word_replacements()["euro"] == "euros" diff --git a/tests/unit/steps/text/remove_standalone_currency_symbols_test.py b/tests/unit/steps/text/remove_standalone_currency_symbols_test.py new file mode 100644 index 0000000..30cb2cd --- /dev/null +++ b/tests/unit/steps/text/remove_standalone_currency_symbols_test.py @@ -0,0 +1,16 @@ +from normalization.languages.swedish.operators import SwedishOperators +from normalization.steps.text.remove_standalone_currency_symbols import ( + RemoveStandaloneCurrencySymbolsStep, +) + +from .conftest import assert_text_step_registered + + +def test_step_is_registered() -> None: + assert_text_step_registered(RemoveStandaloneCurrencySymbolsStep) + + +def test_multi_char_kr_not_stripped_from_kronor() -> None: + step = RemoveStandaloneCurrencySymbolsStep() + op = SwedishOperators() + assert step("25 kronor", op) == "25 kronor" diff --git a/tests/unit/steps/text/replace_currency_test.py b/tests/unit/steps/text/replace_currency_test.py index 4d85be9..facc8b9 100644 --- a/tests/unit/steps/text/replace_currency_test.py +++ b/tests/unit/steps/text/replace_currency_test.py @@ -1,10 +1,11 @@ from normalization.languages.english import EnglishOperators +from normalization.languages.swedish.operators import SwedishOperators from normalization.steps.text.replace_currency import ReplaceCurrencyStep from .conftest import assert_text_step_registered -def test_step_is_registered(): +def test_step_is_registered() -> None: assert_text_step_registered(ReplaceCurrencyStep) @@ -17,6 +18,14 @@ def test_replace_currency_step_replaces_currency(english_operators: EnglishOpera assert replaced_text == "100 euros" +def test_replace_currency_kr_uses_word_boundaries() -> None: + """``kr`` must not match the prefix of ``kronor``.""" + step = ReplaceCurrencyStep() + op = SwedishOperators() + assert step("25 kronor", op) == "25 kronor" + assert step("25 kr", op) == "25 kronor" + + def 
test_replace_currency_step_replaces_currency_with_decimal_separator( english_operators: EnglishOperators, ):