diff --git a/README.md b/README.md
index 6231cdb..bca2733 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,31 @@ pipeline.normalize("It's $50 at 3:00PM")
 # => "it is 50 dollars at 3 pm"
 ```
 
+### CLI
+
+```bash
+# Normalize a single text
+gladia-normalization "A tea cost £2 at 3:30PM" --language en
+
+# Normalize a text file
+gladia-normalization --file transcript.txt --language en
+
+# Pipe from stdin
+echo "A tea cost £2 at 3:30PM" | gladia-normalization --language fr
+
+# Use a custom preset
+gladia-normalization "A tea cost £2 at 3:30PM" --preset path/to/my-preset.yaml --language en
+
+# Inspect the pipeline
+gladia-normalization --describe --language en
+```
+
+If you don't want a permanent installation, run it directly with `uvx`:
+
+```bash
+uvx gladia-normalization "A tea cost £2 at 3:30PM" --language en
+```
+
 ## How it works
 
 Every pipeline runs exactly **three stages**, always in this order:
diff --git a/normalization/__main__.py b/normalization/__main__.py
new file mode 100644
index 0000000..5c728b0
--- /dev/null
+++ b/normalization/__main__.py
@@ -0,0 +1,3 @@
+from normalization.cli import main
+
+main()
diff --git a/normalization/cli.py b/normalization/cli.py
new file mode 100644
index 0000000..cf04399
--- /dev/null
+++ b/normalization/cli.py
@@ -0,0 +1,108 @@
+"""Command-line interface for the normalization pipeline."""
+
+import argparse
+import json
+import sys
+
+from normalization import load_pipeline
+from normalization.languages import get_language_registry
+from normalization.pipeline.loader import _PRESETS_DIR
+
+
+def _available_languages() -> list[str]:
+    """Return the registered language codes in sorted order."""
+    import normalization.languages  # noqa: F401 — triggers @register_language decorators
+
+    return sorted(get_language_registry().keys())
+
+
+def _available_presets() -> list[str]:
+    """Return the stems of the ``*.yaml`` files in the presets directory, sorted."""
+    return sorted(p.stem for p in _PRESETS_DIR.glob("*.yaml"))
+
+
+def main() -> None:
+    """Run the CLI: normalize one text, or describe the pipeline with ``--describe``.
+
+    The text comes from the positional argument, from ``--file``, or from piped
+    stdin. Usage errors go through ``parser.error``, which prints the message to
+    stderr and exits with status 2.
+    """
+    # "%" is doubled because argparse %-formats help strings.
+    presets = ", ".join(_available_presets()).replace("%", "%%") or "none found"
+
+    parser = argparse.ArgumentParser(
+        # Same name as the console script declared in pyproject.toml.
+        prog="gladia-normalization",
+        description="Normalize STT transcription text for fair WER comparison.",
+    )
+    parser.add_argument(
+        "text",
+        nargs="?",
+        help="Text to normalize. Reads from stdin if omitted.",
+    )
+    parser.add_argument(
+        "--language",
+        "-l",
+        default="en",
+        metavar="CODE",
+        help="Language code (default: en). Available: %(choices)s.",
+        choices=_available_languages(),
+    )
+    parser.add_argument(
+        "--preset",
+        "-p",
+        default="gladia-3",
+        metavar="PRESET",
+        help=(
+            "Built-in preset name or path to a YAML file (default: gladia-3). "
+            f"Built-in: {presets}."
+        ),
+    )
+    parser.add_argument(
+        "--file",
+        "-f",
+        metavar="PATH",
+        help="Path to a text file to normalize. Mutually exclusive with positional text.",
+    )
+    parser.add_argument(
+        "--describe",
+        action="store_true",
+        help="Print the pipeline description as JSON and exit.",
+    )
+
+    args = parser.parse_args()
+
+    if args.text is not None and args.file is not None:
+        parser.error("Provide either positional text or --file, not both.")
+
+    try:
+        pipeline = load_pipeline(args.preset, args.language)
+    except FileNotFoundError as exc:
+        parser.error(str(exc))
+
+    if args.describe:
+        print(json.dumps(pipeline.describe(), indent=2))
+        return
+
+    if args.file is not None:
+        try:
+            # Decode as UTF-8 explicitly instead of relying on the platform locale.
+            with open(args.file, encoding="utf-8") as fh:
+                text = fh.read().strip()
+        except (OSError, UnicodeDecodeError) as exc:
+            # UnicodeDecodeError is a ValueError, not an OSError; catch it so a
+            # wrongly encoded file yields a usage error rather than a traceback.
+            parser.error(str(exc))
+    elif args.text is not None:
+        text = args.text
+    elif not sys.stdin.isatty():
+        text = sys.stdin.read().strip()
+    else:
+        parser.error("Provide text as an argument, --file, or pipe it via stdin.")
+
+    print(pipeline.normalize(text))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index fcae854..eac2b9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,9 @@ dependencies = [
 [tool.setuptools.package-data]
 normalization = ["presets/*.yaml"]
 
+[project.scripts]
+gladia-normalization = "normalization.cli:main"
+
 [project.urls]
 Homepage = "https://github.com/gladiaio/normalization"
 Repository = "https://github.com/gladiaio/normalization"