Skip to content
8 changes: 8 additions & 0 deletions docs/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,14 @@ Remove-Item -Recurse -Force "$env:USERPROFILE\.cache\winml"

The next `winml build` will re-create the cache as needed. Use `--rebuild` to force a full rebuild without relying on cached intermediates.

When a build runs out of disk space mid-write, `winml` now stops with a clear message instead of a misleading downstream error:

```text
ONNXSaveError: Insufficient disk space — unable to write ONNX model to <path>. Free up disk space and try again.
```

The partially written file is removed automatically, so a later stage never reads a truncated model. (Previously this surfaced much later as a confusing `ValueError: Failed to find proper ai.onnx domain` during quantization.) Free up space using the command above and re-run the build.

---

## General Tips
Expand Down
8 changes: 7 additions & 1 deletion src/winml/modelkit/commands/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,8 +810,14 @@ def _patch_device(cfg: WinMLBuildConfig) -> None:

# Map common errors to actionable hints
err_str = str(e)
err_lower = err_str.lower()
hint = None
if "Quantization failed" in err_str:
if "disk space" in err_lower or "no space left" in err_lower:
hint = (
"Free up disk space (e.g. clear the HuggingFace cache or "
"~/.cache/winml) and rebuild."
)
elif "Quantization failed" in err_str:
hint = "Try: --no-quant to skip quantization"
elif "Compilation failed" in err_str:
hint = "Try: --no-compile to skip compilation"
Expand Down
12 changes: 12 additions & 0 deletions src/winml/modelkit/commands/quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ def _run_multi_precision(
"""Execute a multi-pass quantization pipeline from ordered precision strings."""
from ..config.precision import extract_weight_bits
from ..quant import Quantizer, WinMLQuantizationConfig, expand_precision
from ..quant.quantizer import _check_input_model_opset

modes = [_cli_precision_to_mode(p) for p in precision]
has_calibration_pass = any(m == "static" for m in modes)
Expand Down Expand Up @@ -387,6 +388,17 @@ def _run_multi_precision(

try:
console.print(f"\n[bold]Running pipeline: {label}...[/bold]")
# Mirror quantize_onnx's input guard: the multi-precision path drives the
# Quantizer pipeline directly (bypassing quantize_onnx), so surface a
# clear disk-full/corruption error here too instead of ORT's opaque
# "Failed to find proper ai.onnx domain" deep inside a pass. A missing
# file is left to Quantizer.run(), which reports "Model not found".
opset_error = _check_input_model_opset(model) if model.exists() else None
if opset_error is not None:
console.print("\n[bold red]Pipeline failed:[/bold red]")
console.print(f" {opset_error}")
raise click.ClickException("Pipeline failed")

result = Quantizer(passes).run(model, output)

if result.success:
Expand Down
3 changes: 2 additions & 1 deletion src/winml/modelkit/onnx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from .external_data import copy_onnx_model, get_onnx_model_hash
from .io import InputTensorSpec, OutputTensorSpec, generate_inputs_from_onnx, get_io_config
from .metadata import capture_metadata, restore_metadata
from .persistence import cleanup_onnx, load_onnx, save_onnx
from .persistence import ONNXSaveError, cleanup_onnx, load_onnx, save_onnx
from .shape import infer_onnx_shapes, infer_shapes
from .utils import EXTERNAL_DATA_THRESHOLD, check_onnx_model, get_model_size

Expand All @@ -29,6 +29,7 @@
"EXTERNAL_DATA_THRESHOLD",
"InputTensorSpec",
"ONNXDomain",
"ONNXSaveError",
"OutputTensorSpec",
"SupportedONNXType",
"capture_metadata",
Expand Down
43 changes: 25 additions & 18 deletions src/winml/modelkit/onnx/external_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import onnx
from onnx import external_data_helper

from .persistence import load_onnx, save_onnx
from .persistence import _cleanup_partial_save, _raise_save_error, load_onnx, save_onnx


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -219,23 +219,30 @@ def copy_onnx_model(
dst.parent.mkdir(parents=True, exist_ok=True)

try:
external_files = get_external_data_files(src)
except Exception:
# Not a valid ONNX file or can't parse — fall back to simple copy
shutil.copy2(src, dst)
return

if not external_files:
# No external data — simple copy
shutil.copy2(src, dst)
return

if len(external_files) == 1:
# Single external data file — copy .data + patch .onnx
_copy_single_external(src, dst, external_files[0])
else:
# Multiple files — consolidate into one
_copy_consolidate(src, dst)
try:
external_files = get_external_data_files(src)
except Exception:
# Not a valid ONNX file or can't parse — fall back to simple copy
shutil.copy2(src, dst)
return

if not external_files:
# No external data — simple copy
shutil.copy2(src, dst)
return

if len(external_files) == 1:
# Single external data file — copy .data + patch .onnx
_copy_single_external(src, dst, external_files[0])
else:
# Multiple files — consolidate into one
_copy_consolidate(src, dst)
except OSError as e:
# A failed copy (commonly disk-full) can leave a truncated destination
# and/or .data sidecar behind. Remove them and surface a clear error
# instead of letting a later stage load the corrupt model.
_cleanup_partial_save(dst, dst.parent / f"{dst.name}.data")
_raise_save_error(e, dst)

logger.debug(
"Copied ONNX model with external data: %s -> %s (%d data files)",
Expand Down
123 changes: 111 additions & 12 deletions src/winml/modelkit/onnx/persistence.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@

from __future__ import annotations

import errno
import logging
import os
from pathlib import Path
from typing import NoReturn

import onnx
from onnx.external_data_helper import _get_all_tensors, uses_external_data
Expand All @@ -25,6 +27,92 @@
logger = logging.getLogger(__name__)


# Windows ERROR_DISK_FULL, compared against ``OSError.winerror``. Python usually
# maps this to errno.ENOSPC via the CRT, but we check the raw winerror too so a
# disk-full write is always recognised.
_WINDOWS_ERROR_DISK_FULL = 112


class ONNXSaveError(OSError):
    """Signal that an ONNX model could not be written to disk.

    The class derives from :class:`OSError`, so handlers written as
    ``except OSError`` continue to catch it, and the originating error code
    stays available through ``errno``. Its purpose is to replace a bare OS
    error with a clear, actionable message. That is most valuable when the
    disk is full: otherwise a failed write leaves a truncated or zero-byte
    ``.onnx`` behind and the real cause only appears much later as an opaque
    opset-parsing error in a downstream stage.

    Note:
        ``OSError.__init__`` fills in ``errno`` only for the 2-argument
        ``(errno, strerror)`` form, and that form also turns ``str(self)``
        into ``"[Errno N] <message>"``. The constructor therefore passes the
        message alone and assigns ``errno`` afterwards, which keeps the
        message clean while still serving callers that inspect ``e.errno``.

    Attributes:
        path: Destination path that could not be written.
        disk_full: ``True`` when the failure was caused by insufficient disk
            space (``errno.ENOSPC`` / Windows ``ERROR_DISK_FULL``).
        errno: The originating OS error code, when known (inherited from
            :class:`OSError`).
    """

    def __init__(
        self,
        message: str,
        *,
        path: str | Path | None = None,
        disk_full: bool = False,
        errno_code: int | None = None,
    ) -> None:
        # Message-only construction: see the Note in the class docstring.
        super().__init__(message)
        self.path = path
        self.disk_full = disk_full
        # The base initializer left errno as None; restore the real code so
        # OSError-style callers can still branch on e.errno.
        if errno_code is not None:
            self.errno = errno_code


def _is_disk_full_error(error: OSError) -> bool:
    """Tell whether *error* was caused by running out of disk space.

    Two signals are accepted: the portable ``errno.ENOSPC`` code, and the raw
    Windows ``winerror`` value matching ``_WINDOWS_ERROR_DISK_FULL``.
    """
    if error.errno == errno.ENOSPC:
        return True
    # Read winerror defensively so a missing attribute counts as "no match".
    windows_code = getattr(error, "winerror", None)
    return windows_code == _WINDOWS_ERROR_DISK_FULL


def _cleanup_partial_save(*paths: Path | None) -> None:
    """Delete leftovers of a failed write, ignoring anything that cannot be removed.

    When ``onnx.save_model`` or a copy fails part-way, a zero-byte or truncated
    ``.onnx`` (and its ``.data`` sidecar) may remain on disk. Deleting them
    stops a later stage from loading a corrupt model and reporting a
    misleading error.

    Args:
        *paths: Candidate artifacts to delete. ``None`` entries are skipped and
            paths that do not exist are not an error.
    """
    leftovers = [candidate for candidate in paths if candidate is not None]
    for leftover in leftovers:
        try:
            Path(leftover).unlink(missing_ok=True)
        except OSError:
            # Best effort only: a cleanup failure must not mask the original
            # write error, so it is logged at debug level and ignored.
            logger.debug("Could not remove partial artifact: %s", leftover, exc_info=True)


def _raise_save_error(error: OSError, path: Path) -> NoReturn:
    """Translate a write ``OSError`` into a clear :class:`ONNXSaveError`.

    Args:
        error: The OS-level error raised while writing or copying the model.
        path: Destination path that was being written; used in the message and
            stored on the raised exception.

    Raises:
        ONNXSaveError: Always. When *error* is already an ``ONNXSaveError`` it
            is re-raised unchanged; otherwise a new one is raised, chained to
            *error*, carrying its ``errno`` and a ``disk_full`` flag.
    """
    if isinstance(error, ONNXSaveError):
        # ONNXSaveError is itself an OSError, so an outer ``except OSError``
        # wrapped around a nested save helper lands here a second time.
        # Re-raise as-is to avoid a doubled "Failed to write ...: Failed to
        # write ..." message and to keep the original path/disk_full values.
        raise error

    disk_full = _is_disk_full_error(error)
    if disk_full:
        message = (
            f"Insufficient disk space — unable to write ONNX model to {path}. "
            "Free up disk space and try again."
        )
    else:
        message = f"Failed to write ONNX model to {path}: {error}"
    raise ONNXSaveError(
        message, path=path, disk_full=disk_full, errno_code=error.errno
    ) from error


def load_onnx(
path: str | Path,
*,
Expand Down Expand Up @@ -127,20 +215,31 @@ def save_onnx(
# path.parent is guaranteed to exist: mkdir() was called above.
original_cwd = Path.cwd()
try:
os.chdir(path.parent)
onnx.save_model(
model,
path.name,
save_as_external_data=True,
all_tensors_to_one_file=True,
location=ext_location,
size_threshold=1024,
)
finally:
os.chdir(original_cwd)
try:
os.chdir(path.parent)
onnx.save_model(
model,
path.name,
save_as_external_data=True,
all_tensors_to_one_file=True,
location=ext_location,
size_threshold=1024,
)
finally:
os.chdir(original_cwd)
except OSError as e:
# A failed external-data write can leave a truncated .onnx and/or
# .data sidecar behind; remove them so a later stage never loads a
# corrupt model and reports a misleading error.
_cleanup_partial_save(path, ext_path)
_raise_save_error(e, path)
else:
logger.debug("Saving ONNX model inline to %s", path)
onnx.save_model(model, str(path))
try:
onnx.save_model(model, str(path))
except OSError as e:
_cleanup_partial_save(path)
_raise_save_error(e, path)


def cleanup_onnx(path: str | Path) -> list[Path]:
Expand Down
72 changes: 70 additions & 2 deletions src/winml/modelkit/quant/quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,57 @@ def _merge_results(base: QuantizeResult, new: QuantizeResult) -> QuantizeResult:
)


def _check_input_model_opset(model_path: Path) -> str | None:
    """Return a clear error message if *model_path* is empty/corrupt, else None.

    Mirrors ORT's ``get_opset_version`` requirement: a usable model must declare
    a default (``""`` / ``ai.onnx``) opset import. A zero-byte or truncated file
    parses into an (almost) empty ModelProto with no such opset import — the
    signature of a previous stage that failed to finish writing (most commonly
    because it ran out of disk space). Detecting it here lets us surface the
    real cause instead of ORT's opaque "Failed to find proper ai.onnx domain".

    Checks run cheapest first:

    1. A ``stat`` call: a zero-byte file (the most common disk-full artefact)
       is reported immediately, before ``onnx`` is imported or anything is
       parsed.
    2. A graph-only parse via ``onnx.load_model(load_external_data=False)``.
       External weights are not read, so a missing ``.data`` sidecar does not
       matter here. Any parse failure is reported as a truncated/corrupt file.
    3. The parsed model must declare a default opset import.

    Note that every non-empty model, including a healthy one, goes through the
    parse in step 2; only the zero-byte case is short-circuited.

    Args:
        model_path: Path to the input ONNX model. Callers are expected to have
            checked that it exists.

    Returns:
        A human-readable error message when the model is unusable, otherwise
        ``None``.
    """
    # Fast path: a zero-byte output is the most common disk-full artefact.
    try:
        is_empty = model_path.stat().st_size == 0
    except OSError:
        # stat() failing is unexpected (existence was already checked); fall
        # through to the full parse, which surfaces a clear error either way.
        is_empty = False
    if is_empty:
        return (
            f"Input ONNX model is empty (zero bytes): {model_path}. "
            "A previous build stage may have run out of disk space. "
            "Free up disk space and rebuild."
        )

    # Imported only once the cheap check has passed, so the zero-byte path
    # never pays for importing onnx. Kept outside the try below so an import
    # problem is not misreported as a corrupt model.
    from onnx import load_model

    try:
        model = load_model(str(model_path), load_external_data=False)
    except Exception as e:
        # Deliberately broad: whatever the parser raises on a damaged file is
        # turned into one actionable message for the caller.
        return (
            f"Input ONNX model could not be parsed: {model_path} ({e}). "
            "The file may be truncated or corrupt — for example, a previous "
            "build stage may have run out of disk space. Free up disk space "
            "and rebuild."
        )

    has_default_opset = any(opset.domain in ("", "ai.onnx") for opset in model.opset_import)
    if not has_default_opset:
        return (
            f"Input ONNX model is empty or corrupt (no ai.onnx opset import): "
            f"{model_path}. It may have been truncated by a previous failed "
            "write (e.g. insufficient disk space). Free up disk space and rebuild."
        )
    return None


def quantize_onnx(
model_path: str | Path,
output_path: str | Path | None = None,
Expand Down Expand Up @@ -257,6 +308,25 @@ def quantize_onnx(
output_path = model_path.parent / f"{model_path.stem}_quantized.onnx"

use_external_data: bool = kwargs.pop("use_external_data", True)
if kwargs:
raise TypeError(f"quantize_onnx() got unexpected keyword arguments: {sorted(kwargs)}")

# Guard against an empty/corrupt input model before building the pipeline.
# A previous stage that ran out of disk space can leave a truncated/zero-byte
# .onnx behind; without this check a pass fails deep inside ORT with the
# opaque "Failed to find proper ai.onnx domain". Surface the real cause
# instead, and catch it before the model-type finalizer reads the model. A
# missing file is left to Quantizer.run(), which reports a clear
# "Model not found".
if model_path.exists():
opset_error = _check_input_model_opset(model_path)
if opset_error is not None:
return QuantizeResult(
success=False,
output_path=None,
errors=[opset_error],
)

# Apply model-type-specific quant finalizer if registered. Some model types
# finalize calibration reader / nodes-to-exclude / dtypes only once the
# exported ONNX exists.
Expand All @@ -267,7 +337,5 @@ def quantize_onnx(
if finalizer is not None:
config = finalizer.finalize(config, onnx_path=model_path, model_id=config.model_id)

if kwargs:
raise TypeError(f"quantize_onnx() got unexpected keyword arguments: {sorted(kwargs)}")
passes = expand_precision(config=config)
return Quantizer(passes).run(model_path, output_path, use_external_data=use_external_data)
Loading
Loading