Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,31 @@ pipeline.normalize("It's $50 at 3:00PM")
# => "it is 50 dollars at 3 pm"
```

### CLI

```bash
# Normalize a single piece of text
gladia-normalization "A tea cost £2 at 3:30PM" --language en

# Normalize a text file
gladia-normalization --file transcript.txt --language en

# Pipe from stdin
echo "A tea cost £2 at 3:30PM" | gladia-normalization --language fr

# Use a custom preset
gladia-normalization "A tea cost £2 at 3:30PM" --preset path/to/my-preset.yaml --language en

# Inspect the pipeline
gladia-normalization --describe --language en
```

If you don't want a permanent installation, run it directly with `uvx`:

```bash
uvx gladia-normalization "A tea cost £2 at 3:30PM" --language en
```
Comment thread
Karamouche marked this conversation as resolved.

## How it works

Every pipeline runs exactly **three stages**, always in this order:
Expand Down
3 changes: 3 additions & 0 deletions normalization/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from normalization.cli import main

# Executed when the package is run with `python -m normalization`;
# delegates straight to the CLI entry point.
main()
88 changes: 88 additions & 0 deletions normalization/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import argparse
import json
import sys

from normalization import load_pipeline
from normalization.languages import get_language_registry
from normalization.pipeline.loader import _PRESETS_DIR


def _available_languages() -> list[str]:
    """Return the registered language codes in sorted order."""
    # Importing the package is what runs the @register_language decorators,
    # so it must happen before the registry is queried.
    import normalization.languages  # noqa: F401

    language_codes = get_language_registry().keys()
    return sorted(language_codes)


def _available_presets() -> list[str]:
    """Return the sorted names (file stems) of the YAML files in the presets directory."""
    preset_files = _PRESETS_DIR.glob("*.yaml")
    return sorted(preset_file.stem for preset_file in preset_files)


def main() -> None:
    """Run the normalization command-line interface.

    Parses the command-line arguments, loads the pipeline for the requested
    preset and language, then either prints the pipeline description as JSON
    (``--describe``) or prints the normalized text.

    The input text comes from exactly one of: the positional ``text``
    argument, ``--file``, or piped stdin. Usage errors, a preset that cannot
    be found, and unreadable or undecodable input files are reported through
    ``parser.error``, which prints the message to stderr and exits.
    """
    parser = argparse.ArgumentParser(
        # Must match the console script declared in pyproject.toml so that
        # usage and error messages show the command users actually type.
        prog="gladia-normalization",
        description="Normalize STT transcription text for fair WER comparison.",
    )
    parser.add_argument(
        "text",
        nargs="?",
        help="Text to normalize. Reads from stdin if omitted.",
    )
    parser.add_argument(
        "--language",
        "-l",
        default="en",
        metavar="CODE",
        help="Language code (default: en). Available: %(choices)s.",
        choices=_available_languages(),
    )
    parser.add_argument(
        "--preset",
        "-p",
        default="gladia-3",
        metavar="PRESET",
        help="Built-in preset name or path to a YAML file (default: gladia-3).",
    )
    parser.add_argument(
        "--file",
        "-f",
        metavar="PATH",
        help="Path to a text file to normalize. Mutually exclusive with positional text.",
    )
    parser.add_argument(
        "--describe",
        action="store_true",
        help="Print the pipeline description as JSON and exit.",
    )

    args = parser.parse_args()

    if args.text is not None and args.file is not None:
        parser.error("Provide either positional text or --file, not both.")

    try:
        pipeline = load_pipeline(args.preset, args.language)
    except FileNotFoundError as exc:
        parser.error(str(exc))

    # --describe needs no input text, so handle it before resolving the input.
    if args.describe:
        print(json.dumps(pipeline.describe(), indent=2))
        return

    if args.file is not None:
        try:
            # Read as UTF-8 explicitly: the locale default would mis-decode
            # non-ASCII transcripts (e.g. "£") on non-UTF-8 platforms.
            with open(args.file, encoding="utf-8") as fh:
                text = fh.read().strip()
        except (OSError, UnicodeDecodeError) as exc:
            # UnicodeDecodeError is a ValueError, not an OSError; catch it too
            # so a badly encoded file yields a clean CLI error, not a traceback.
            parser.error(str(exc))
    elif args.text is not None:
        text = args.text
    elif not sys.stdin.isatty():
        # stdin is a pipe or redirect rather than an interactive terminal.
        text = sys.stdin.read().strip()
    else:
        parser.error("Provide text as an argument, --file, or pipe it via stdin.")

    print(pipeline.normalize(text))


# Allow this module to be run directly as a script.
if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ dependencies = [
[tool.setuptools.package-data]
normalization = ["presets/*.yaml"]

[project.scripts]
gladia-normalization = "normalization.cli:main"

[project.urls]
Homepage = "https://github.com/gladiaio/normalization"
Repository = "https://github.com/gladiaio/normalization"
Expand Down