Skip to content

Commit f7519ee

Browse files
authored
Add Segment Any Text (SaT) model (#7)
* segment any text model * Update README to mention PyTorch version * Improve formatting in README.md for commands Updated README formatting for clarity. * add hugging face link to download model * changed directory location
1 parent db44950 commit f7519ee

6 files changed

Lines changed: 1665 additions & 0 deletions

File tree

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Segment Any Text CoreML
2+
Segment Any Text is state-of-the-art sentence segmentation with 3 Transformer layers. A PyTorch version of the model is used in [wtpsplit](https://github.com/segment-any-text/wtpsplit) and additional details can be found in this [paper](https://arxiv.org/abs/2406.16678).
3+
4+
If you wish to skip the CoreML conversion, you can download a precompiled `SaT.mlmodelc` from [Hugging Face](https://huggingface.co/smdesai/SaT).
5+
6+
7+
# CoreML Conversion
8+
9+
## Environment Setup
10+
11+
1. Install [uv](https://github.com/astral-sh/uv) if it is not already available.
12+
2. Sync the project environment.
13+
```bash
14+
uv sync
15+
```
16+
3. Activate the virtual environment:
17+
```bash
18+
source .venv/bin/activate
19+
```
20+
21+
## Converting the Model
22+
23+
Run the conversion script to create the SaT Core ML package:
24+
25+
```bash
26+
python convert-sat.py --model-id segment-any-text/sat-3l-sm --output-dir sat_coreml
27+
```
28+
29+
This produces `SaT.mlpackage` in the `sat_coreml` directory.
30+
31+
Here is the complete usage:
32+
```bash
33+
Usage: convert-sat.py [OPTIONS]
34+
35+
Options
36+
--model-id TEXT Model identifier to download
37+
from HuggingFace model hub
38+
[default:
39+
segment-any-text/sat-3l-sm]
40+
--output-dir PATH Directory to write mlpackage and metadata
41+
[default: sat_coreml]
42+
--conversion-type -c TEXT Conversion methods to apply to
43+
the model. Repeat the option to
44+
chain conversions (allowed:
45+
none, prune, quantize,
46+
palettize; default: none).
47+
[default: None]
48+
```
49+
50+
## Compiling the Model
51+
52+
Run the following to compile the model.
53+
```bash
54+
python compile_mlmodelc.py --coreml-dir sat_coreml
55+
```
56+
57+
This produces `SaT.mlmodelc` in the `compiled` directory.
58+
59+
Here is the complete usage:
60+
```bash
61+
Usage: compile_mlmodelc.py [OPTIONS]
62+
63+
Options
64+
--coreml-dir PATH Directory where mlpackages and metadata are written
65+
[default: sat_coreml]
66+
```
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
from __future__ import annotations
2+
3+
import shutil
4+
import subprocess
5+
import sys
6+
import typer
7+
from pathlib import Path
8+
9+
BASE_DIR = Path(__file__).resolve().parent
10+
OUTPUT_ROOT = BASE_DIR / "compiled"
11+
12+
app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
13+
14+
15+
def ensure_coremlcompiler() -> None:
    """Verify that ``xcrun`` exists and can locate ``coremlcompiler``; exit(1) if not."""
    xcrun = shutil.which("xcrun")
    if xcrun is None:
        print("Error: 'xcrun' not found on PATH. Install Xcode command line tools.", file=sys.stderr)
        sys.exit(1)

    # Probe for the compiler; output is captured so nothing is printed on success.
    probe = [xcrun, "--find", "coremlcompiler"]
    try:
        subprocess.run(probe, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError:
        print("Error: 'coremlcompiler' not found via xcrun. Check your Xcode installation.", file=sys.stderr)
        sys.exit(1)
31+
32+
33+
def gather_packages(dir: str | Path) -> list[Path]:
    """Return every ``*.mlpackage`` bundle found under ``BASE_DIR / dir``.

    Args:
        dir: Directory to search recursively, resolved relative to this
            script. The caller passes a ``Path``, so the annotation accepts
            both ``str`` and ``Path`` (the original hint said ``str`` only).
            NOTE: the name shadows the ``dir`` builtin; kept so existing
            keyword callers are not broken.

    Returns:
        List of discovered package paths; empty (with a warning on stderr)
        when the directory does not exist.
    """
    packages: list[Path] = []
    source = BASE_DIR / dir
    if not source.exists():
        print(f"Warning: {source.relative_to(BASE_DIR)} does not exist; skipping", file=sys.stderr)
        return packages
    packages.extend(source.rglob("*.mlpackage"))
    return packages
42+
43+
44+
def compile_package(package: Path) -> None:
    """Compile one ``.mlpackage`` into ``OUTPUT_ROOT`` via ``xcrun coremlcompiler``.

    Any pre-existing ``<name>.mlmodelc`` in the output directory is removed
    first so the compiler always writes a fresh bundle.

    Args:
        package: Path to the ``.mlpackage`` bundle (must live under BASE_DIR).

    Raises:
        subprocess.CalledProcessError: if the compiler exits non-zero.
    """
    relative_pkg = package.relative_to(BASE_DIR)
    # All compiled bundles land flat in OUTPUT_ROOT; the source tree layout
    # is intentionally not mirrored. (Removed a commented-out alternative.)
    output_dir = OUTPUT_ROOT
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{package.stem}.mlmodelc"

    if output_path.exists():
        shutil.rmtree(output_path)

    cmd = [
        "xcrun",
        "coremlcompiler",
        "compile",
        str(package),
        str(output_dir),
    ]

    print(f"Compiling {relative_pkg} -> {output_path.relative_to(BASE_DIR)}")
    subprocess.run(cmd, check=True)
65+
66+
67+
@app.command()
def compile(
    coreml_dir: Path = typer.Option(
        Path("sat_coreml"),
        help="Directory where mlpackages and metadata are written",
    ),
):
    # CLI entry point: compile every mlpackage found under *coreml_dir*.
    # (Comment rather than docstring so typer's --help output is unchanged.)
    ensure_coremlcompiler()
    packages = gather_packages(coreml_dir)

    if not packages:
        print("No .mlpackage bundles found to compile.")
        return

    for pkg in packages:
        try:
            compile_package(pkg)
        except subprocess.CalledProcessError as err:
            # Stop at the first failure and propagate the compiler's exit code.
            print(f"Failed to compile {pkg}: {err}", file=sys.stderr)
            sys.exit(err.returncode)

    print(f"Finished compiling {len(packages)} package(s) into {OUTPUT_ROOT.relative_to(BASE_DIR)}.")


if __name__ == "__main__":
    app()
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import json
import os
from enum import IntEnum
from pathlib import Path

import coremltools.optimize.coreml as cto_coreml
6+
7+
8+
class Conversion(IntEnum):
    """Weight-optimization strategies that can be applied to a Core ML package."""

    NONE = 0       # leave weights untouched
    PRUNE = 1      # zero small-magnitude weights
    QUANTIZE = 2   # linear weight quantization
    PALETTIZE = 3  # cluster weights into a palette (LUT)
13+
14+
15+
def update_manifest_model_name(manifest_path: str, new_name: str) -> None:
16+
with open(manifest_path, "r") as file:
17+
manifest = json.load(file)
18+
19+
for key, value in manifest["itemInfoEntries"].items():
20+
if value["name"] == "model.mlmodel":
21+
value["name"] = f"{new_name}.mlmodel"
22+
value["path"] = f"com.apple.CoreML/{new_name}.mlmodel"
23+
24+
with open(manifest_path, "w") as file:
25+
json.dump(manifest, file, indent=4)
26+
27+
print(f"Manifest updated. Model name changed to {new_name}.mlmodel")
28+
29+
old_model_path = os.path.join(
30+
os.path.dirname(manifest_path), "Data/com.apple.CoreML/model.mlmodel"
31+
)
32+
new_model_path = os.path.join(
33+
os.path.dirname(manifest_path),
34+
f"Data/com.apple.CoreML/{new_name}.mlmodel",
35+
)
36+
if os.path.exists(old_model_path):
37+
os.rename(old_model_path, new_model_path)
38+
print(f"Model file renamed from model.mlmodel to {new_name}.mlmodel")
39+
else:
40+
print("Warning: model.mlmodel not found. Only manifest was updated.")
41+
42+
43+
def palettize_model(mlpackage, *, bits: int = 8, weight_threshold: int = 512):
    """Cluster model weights into a ``bits``-bit palette.

    Returns the palettized model, or ``None`` if palettization fails.
    """
    print(f"\nApplying {bits}-bit palettization...")
    try:
        palettizer = cto_coreml.OpPalettizerConfig(
            nbits=bits,
            weight_threshold=weight_threshold,
        )
        return cto_coreml.palettize_weights(
            mlpackage, cto_coreml.OptimizationConfig(palettizer)
        )
    except Exception as e:
        print(f"Error palettization failed: {e}")
        return None
55+
56+
57+
def prune_model(mlpackage, *, threshold: float = 0.01):
    """Zero out weights whose magnitude falls below *threshold*.

    Args:
        mlpackage: Core ML model (mlprogram) to prune.
        threshold: Magnitude below which weights are pruned.

    Returns:
        The pruned model, or ``None`` if pruning fails.
    """
    # Plain string: the original used an f-string with no placeholders (F541).
    print("\nApplying pruning quantization...")
    try:
        config = cto_coreml.OptimizationConfig(
            global_config=cto_coreml.OpThresholdPrunerConfig(threshold=threshold)
        )
        return cto_coreml.prune_weights(mlpackage, config)
    except Exception as e:
        print(f"Error pruning failed: {e}")
        return None
67+
68+
69+
def quantize_model(mlpackage, *, dtype: str = "int8", mode: str = "linear_symmetric"):
    """Linearly quantize model weights (per-block, block size 32).

    Args:
        mlpackage: Core ML model (mlprogram) to quantize.
        dtype: Target weight dtype (e.g. ``"int8"``).
        mode: Quantization mode (e.g. ``"linear"`` or ``"linear_symmetric"``).

    Returns:
        The quantized model, or ``None`` if quantization fails.
    """
    # Bug fix: the original wrote ``if str == "linear":`` — comparing the
    # builtin ``str`` type to a string literal, which is always False. The
    # intent was clearly to branch on the ``mode`` argument.
    if mode == "linear":
        print(f"\nApplying {dtype} quantization...")
    else:
        print("\nApplying mixed precision quantization...")

    try:
        op_config = cto_coreml.OpLinearQuantizerConfig(
            mode=mode,
            dtype=dtype,
            granularity="per_block",
            block_size=32,
        )
        config = cto_coreml.OptimizationConfig(global_config=op_config)
        return cto_coreml.linear_quantize_weights(mlpackage, config)
    except Exception as e:
        # Fixed message: the original hardcoded "INT8" even for other dtypes.
        print(f"{dtype} quantization failed: {e}")
        return None
88+
89+
90+
def apply_conversion(mlpackage, conversion_type: Conversion):
    """Dispatch *conversion_type* to the matching optimization routine.

    Raises:
        ValueError: if *conversion_type* is not a known Conversion member.
    """
    handlers = {
        Conversion.NONE: lambda pkg: pkg,
        Conversion.PRUNE: prune_model,
        Conversion.QUANTIZE: quantize_model,
        Conversion.PALETTIZE: palettize_model,
    }
    try:
        handler = handlers[conversion_type]
    except KeyError:
        raise ValueError(f"Unsupported conversion type: {conversion_type}") from None
    return handler(mlpackage)
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
from __future__ import annotations
2+
3+
import os
4+
from pathlib import Path
5+
6+
import coremltools as ct
7+
import numpy as np
8+
import torch
9+
import typer
10+
11+
from transformers import AutoModelForTokenClassification, AutoTokenizer
12+
import wtpsplit.models # registers SubwordXLM config/model types
13+
14+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
15+
16+
DEFAULT_MODEL_ID = "segment-any-text/sat-3l-sm"
17+
18+
19+
from conversion_utils import (
20+
Conversion,
21+
apply_conversion,
22+
update_manifest_model_name,
23+
)
24+
25+
app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
26+
27+
28+
def parse_conversion_type(value: str | None) -> Conversion:
    """Map one CLI string to a ``Conversion`` member.

    ``None`` or a blank/whitespace-only string means ``Conversion.NONE``.

    Raises:
        typer.BadParameter: if the string is not a valid conversion name.
    """
    cleaned = (value or "").strip()
    if not cleaned:
        return Conversion.NONE

    try:
        return Conversion[cleaned.upper()]
    except KeyError as exc:
        raise typer.BadParameter(
            f"Invalid conversion type '{value}'. "
            "Choose from 'none', 'prune', 'quantize', or 'palettize'."
        ) from exc
42+
43+
44+
def parse_conversion_types(
    values: tuple[str, ...] | list[str] | None,
) -> list[Conversion]:
    """Parse every CLI conversion string into ``Conversion`` members.

    An empty or ``None`` sequence defaults to ``[Conversion.NONE]``.

    Raises:
        typer.BadParameter: if any entry is not a valid conversion name.
    """
    if not values:
        return [Conversion.NONE]

    # Idiom: comprehension instead of the original manual append loop.
    return [parse_conversion_type(item) for item in values]
54+
55+
56+
@app.command()
def convert(
    model_id: str = typer.Option(
        DEFAULT_MODEL_ID,
        "--model-id",
        help="Model identifier to download from HuggingFace's model hub",
    ),
    output_dir: Path = typer.Option(
        Path("sat_coreml"),
        help="Directory where mlpackages and metadata will be written",
    ),
    conversion_types: list[str] = typer.Option(
        None,
        "--conversion-type",
        "-c",
        help=(
            "Conversion methods to apply to the model. "
            "Repeat the option to chain conversions "
            "(allowed: none, prune, quantize, palettize; default: none)."
        ),
    ),
):
    """Trace the SaT model, convert it to a Core ML mlprogram, and save it.

    Downloads the model and tokenizer, traces with a fixed 512-token input,
    converts via coremltools, applies any requested weight optimizations in
    order, then writes ``SaT.mlpackage`` (with a renamed manifest entry) to
    *output_dir*.
    """
    conversions_to_apply = parse_conversion_types(conversion_types)

    # torchscript=True / return_dict=False make the forward pass traceable.
    model = AutoModelForTokenClassification.from_pretrained(
        model_id,
        return_dict=False,
        torchscript=True,
        trust_remote_code=True,
    ).eval()

    tokenizer = AutoTokenizer.from_pretrained("facebookAI/xlm-roberta-base")
    tokenized = tokenizer(
        ["Sample input text to trace the model."],
        return_tensors="pt",
        max_length=512,  # token sequence length
        padding="max_length",
    )

    traced_model = torch.jit.trace(
        model,
        (tokenized["input_ids"], tokenized["attention_mask"]),
    )

    outputs = [ct.TensorType(name="output")]

    mlpackage = ct.convert(
        traced_model,
        convert_to="mlprogram",
        inputs=[
            # Fix: pass the tokenizer key directly as the ``name`` keyword;
            # the original wrapped it in a redundant f-string positional arg.
            ct.TensorType(
                name=name,
                shape=tensor.shape,
                dtype=np.int32,
            )
            for name, tensor in tokenized.items()
        ],
        outputs=outputs,
        compute_units=ct.ComputeUnit.ALL,
        minimum_deployment_target=ct.target.iOS18,
    )

    try:
        new_model = mlpackage
        for conversion in conversions_to_apply:
            new_model = apply_conversion(new_model, conversion)
            if new_model is None:
                # The optimization helpers return None on failure; bail out
                # with a message instead of crashing on ``None.save`` below.
                print(f"Conversion {conversion.name.lower()} failed; aborting.")
                return
    except ValueError as e:
        print(e)
        return

    saved_name = "SaT"
    output_dir.mkdir(parents=True, exist_ok=True)  # ensure destination exists
    saved_path = output_dir / f"{saved_name}.mlpackage"
    new_model.save(saved_path)

    manifest_file = saved_path / "Manifest.json"
    update_manifest_model_name(manifest_file, saved_name)


if __name__ == "__main__":
    app()

0 commit comments

Comments
 (0)