From 6e105fe52b61458c159ffd616bed36ef959d207e Mon Sep 17 00:00:00 2001 From: karamouche Date: Wed, 15 Apr 2026 14:45:29 -0400 Subject: [PATCH 1/4] feat: add CLI for text normalization with language and preset options --- README.md | 22 ++++++++++++ normalization/__main__.py | 3 ++ normalization/cli.py | 73 +++++++++++++++++++++++++++++++++++++++ pyproject.toml | 3 ++ 4 files changed, 101 insertions(+) create mode 100644 normalization/__main__.py create mode 100644 normalization/cli.py diff --git a/README.md b/README.md index 6231cdb..96fc928 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,28 @@ pipeline.normalize("It's $50 at 3:00PM") # => "it is 50 dollars at 3 pm" ``` +### CLI + +```bash +# Normalize a single text +normalize "It's $50 at 3:00PM" --language en + +# Pipe from stdin +echo "she spent twenty dollars" | normalize --language fr + +# Use a custom preset +normalize "some text" --preset path/to/my-preset.yaml --language en + +# Inspect the pipeline +normalize --describe --language en +``` + +If you don't want a permanent installation, run it directly with `uvx`: + +```bash +uvx gladia-normalization "It's $50 at 3:00PM" --language en +``` + ## How it works Every pipeline runs exactly **three stages**, always in this order: diff --git a/normalization/__main__.py b/normalization/__main__.py new file mode 100644 index 0000000..5c728b0 --- /dev/null +++ b/normalization/__main__.py @@ -0,0 +1,3 @@ +from normalization.cli import main + +main() diff --git a/normalization/cli.py b/normalization/cli.py new file mode 100644 index 0000000..67a44ed --- /dev/null +++ b/normalization/cli.py @@ -0,0 +1,73 @@ +import argparse +import json +import sys + +from normalization import load_pipeline +from normalization.languages import get_language_registry +from normalization.pipeline.loader import _PRESETS_DIR + + +def _available_languages() -> list[str]: + import normalization.languages # noqa: F401 — triggers @register_language decorators + + return 
sorted(get_language_registry().keys()) + + +def _available_presets() -> list[str]: + return sorted(p.stem for p in _PRESETS_DIR.glob("*.yaml")) + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="normalize", + description="Normalize STT transcription text for fair WER comparison.", + ) + parser.add_argument( + "text", + nargs="?", + help="Text to normalize. Reads from stdin if omitted.", + ) + parser.add_argument( + "--language", + "-l", + default="en", + metavar="CODE", + help="Language code (default: en). Available: %(choices)s.", + choices=_available_languages(), + ) + parser.add_argument( + "--preset", + "-p", + default="gladia-3", + metavar="PRESET", + help="Built-in preset name or path to a YAML file (default: gladia-3).", + ) + parser.add_argument( + "--describe", + action="store_true", + help="Print the pipeline description as JSON and exit.", + ) + + args = parser.parse_args() + + try: + pipeline = load_pipeline(args.preset, args.language) + except FileNotFoundError as exc: + parser.error(str(exc)) + + if args.describe: + print(json.dumps(pipeline.describe(), indent=2)) + return + + if args.text is not None: + text = args.text + elif not sys.stdin.isatty(): + text = sys.stdin.read().strip() + else: + parser.error("Provide text as an argument or pipe it via stdin.") + + print(pipeline.normalize(text)) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index fcae854..267c571 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,9 @@ dependencies = [ [tool.setuptools.package-data] normalization = ["presets/*.yaml"] +[project.scripts] +normalize = "normalization.cli:main" + [project.urls] Homepage = "https://github.com/gladiaio/normalization" Repository = "https://github.com/gladiaio/normalization" From e032c2a58713abcba153e57c06f5fcbae8da58f4 Mon Sep 17 00:00:00 2001 From: karamouche Date: Wed, 15 Apr 2026 15:37:55 -0400 Subject: [PATCH 2/4] feat: add support for normalizing text files via CLI --- 
README.md | 3 +++ normalization/cli.py | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 96fc928..4d4afeb 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,9 @@ pipeline.normalize("It's $50 at 3:00PM") # Normalize a single text normalize "It's $50 at 3:00PM" --language en +# Normalize a text file +normalize --file transcript.txt --language en + # Pipe from stdin echo "she spent twenty dollars" | normalize --language fr diff --git a/normalization/cli.py b/normalization/cli.py index 67a44ed..cf04399 100644 --- a/normalization/cli.py +++ b/normalization/cli.py @@ -42,6 +42,12 @@ def main() -> None: metavar="PRESET", help="Built-in preset name or path to a YAML file (default: gladia-3).", ) + parser.add_argument( + "--file", + "-f", + metavar="PATH", + help="Path to a text file to normalize. Mutually exclusive with positional text.", + ) parser.add_argument( "--describe", action="store_true", @@ -50,6 +56,9 @@ def main() -> None: args = parser.parse_args() + if args.text is not None and args.file is not None: + parser.error("Provide either positional text or --file, not both.") + try: pipeline = load_pipeline(args.preset, args.language) except FileNotFoundError as exc: @@ -59,12 +68,18 @@ def main() -> None: print(json.dumps(pipeline.describe(), indent=2)) return - if args.text is not None: + if args.file is not None: + try: + with open(args.file) as fh: + text = fh.read().strip() + except OSError as exc: + parser.error(str(exc)) + elif args.text is not None: text = args.text elif not sys.stdin.isatty(): text = sys.stdin.read().strip() else: - parser.error("Provide text as an argument or pipe it via stdin.") + parser.error("Provide text as an argument, --file, or pipe it via stdin.") print(pipeline.normalize(text)) From ca5947de65a3c2e08e210718cafb539d70d61fea Mon Sep 17 00:00:00 2001 From: karamouche Date: Thu, 16 Apr 2026 08:53:49 -0400 Subject: [PATCH 3/4] refactor: rename CLI command to 
'gladia-normalization' for consistency --- README.md | 10 +++++----- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4d4afeb..53798c9 100644 --- a/README.md +++ b/README.md @@ -75,19 +75,19 @@ pipeline.normalize("It's $50 at 3:00PM") ```bash # Normalize a single text -normalize "It's $50 at 3:00PM" --language en +gladia-normalization "It's $50 at 3:00PM" --language en # Normalize a text file -normalize --file transcript.txt --language en +gladia-normalization --file transcript.txt --language en # Pipe from stdin -echo "she spent twenty dollars" | normalize --language fr +echo "she spent twenty dollars" | gladia-normalization --language fr # Use a custom preset -normalize "some text" --preset path/to/my-preset.yaml --language en +gladia-normalization "some text" --preset path/to/my-preset.yaml --language en # Inspect the pipeline -normalize --describe --language en +gladia-normalization --describe --language en ``` If you don't want a permanent installation, run it directly with `uvx`: diff --git a/pyproject.toml b/pyproject.toml index 267c571..eac2b9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ normalization = ["presets/*.yaml"] [project.scripts] -normalize = "normalization.cli:main" +gladia-normalization = "normalization.cli:main" [project.urls] Homepage = "https://github.com/gladiaio/normalization" From f5288f87bc9597840e3c61efc4dab8ee2ab7f317 Mon Sep 17 00:00:00 2001 From: karamouche Date: Thu, 16 Apr 2026 09:03:56 -0400 Subject: [PATCH 4/4] docs: update examples in README to avoid using $ --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 53798c9..bca2733 100644 --- a/README.md +++ b/README.md @@ -75,16 +75,16 @@ pipeline.normalize("It's $50 at 3:00PM") ```bash # Normalize a single text -gladia-normalization "It's $50 at 3:00PM" --language en +gladia-normalization "A tea cost £2 at 3:30PM" 
--language en # Normalize a text file gladia-normalization --file transcript.txt --language en # Pipe from stdin -echo "she spent twenty dollars" | gladia-normalization --language fr +echo "A tea cost £2 at 3:30PM" | gladia-normalization --language en # Use a custom preset -gladia-normalization "some text" --preset path/to/my-preset.yaml --language en +gladia-normalization "A tea cost £2 at 3:30PM" --preset path/to/my-preset.yaml --language en # Inspect the pipeline gladia-normalization --describe --language en @@ -93,7 +93,7 @@ gladia-normalization --describe --language en If you don't want a permanent installation, run it directly with `uvx`: ```bash -uvx gladia-normalization "It's $50 at 3:00PM" --language en +uvx gladia-normalization "A tea cost £2 at 3:30PM" --language en ``` ## How it works