Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/winml/modelkit/config/precision.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
# Default auto-precision mapping: device -> precision
_AUTO_PRECISION: dict[str, str] = {
"npu": "w8a16",
"gpu": "fp16",
"cpu": "fp16",
"gpu": "fp32",
"cpu": "fp32",
}

# Precision -> weight/activation type mapping (named presets)
Expand Down Expand Up @@ -407,7 +407,7 @@ def resolve_precision(
# GPU + LLM: warn about w4a16 recommendation
if resolved_device == "gpu" and task in _LLM_TASKS:
logger.warning(
"GPU + LLM task '%s': auto-precision is fp16 (no quantization). "
"GPU + LLM task '%s': auto-precision is fp32 (no conversion). "
"For better performance, consider w4a16 quantization manually.",
task,
)
Expand Down
40 changes: 40 additions & 0 deletions tests/e2e/test_config_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,3 +499,43 @@ def test_module_flag_returns_list(self) -> None:
for cfg in data:
assert "loader" in cfg
assert "export" in cfg

# --- auto-precision behaviour (PR #998 regression guard) -------------
def test_cpu_auto_precision_no_quant(self) -> None:
    """CPU with auto precision must leave the model unconverted.

    Regression guard: ``_AUTO_PRECISION`` used to map cpu -> fp16, so a
    CPU build without an explicit ``--precision`` flag was silently
    converted to FP16. Auto precision on CPU now resolves to fp32, which
    is expected to yield a config without a ``quant`` section.
    """
    cli_args = ("-m", self.MODEL, "-t", self.TASK, "-d", "cpu")
    data = _run_config(*cli_args)
    _assert_hf_config_structure(data)

    quant_section = data.get("quant")
    assert quant_section is None, (
        f"cpu + auto precision should resolve to fp32 (no quant). Got: {data.get('quant')}"
    )

def test_gpu_auto_precision_no_quant(self) -> None:
    """GPU with auto precision must leave the model unconverted.

    Regression guard: ``_AUTO_PRECISION`` used to map gpu -> fp16, which
    broke the AMD (MIGraphX) eval tests because MIGraphX was handed an
    FP16 model it did not expect. Auto precision on GPU now resolves to
    fp32, which is expected to yield a config without a ``quant`` section.
    """
    cli_args = ("-m", self.MODEL, "-t", self.TASK, "-d", "gpu")
    data = _run_config(*cli_args)
    _assert_hf_config_structure(data)

    quant_section = data.get("quant")
    assert quant_section is None, (
        f"gpu + auto precision should resolve to fp32 (no quant). Got: {data.get('quant')}"
    )

def test_explicit_fp16_still_triggers_quant(self) -> None:
    """An explicit ``--precision fp16`` request still yields an fp16 quant config.

    Only the *auto* default moved to fp32; a user who asks for FP16 by
    name must keep getting a ``quant`` section whose mode is ``fp16``.
    """
    cli_args = ("-m", self.MODEL, "-t", self.TASK, "-d", "cpu", "-p", "fp16")
    data = _run_config(*cli_args)
    _assert_hf_config_structure(data)

    quant_cfg = data.get("quant")
    assert quant_cfg is not None, "Explicit --precision fp16 should produce a quant config"
    assert quant_cfg.get("mode") == "fp16"
49 changes: 21 additions & 28 deletions tests/unit/config/test_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -1972,10 +1972,10 @@ def _mock_deps(
("npu", "auto", True, "uint8", "uint16", "qnn"),
("npu", "fp16", True, "uint8", "uint8", "qnn"), # fp16 algorithm quant config
("npu", "int8", True, "uint8", "uint8", "qnn"),
("gpu", "auto", True, None, None, None), # auto on gpu -> fp16 algorithm
("gpu", "auto", False, None, None, None), # auto on gpu -> fp32 (no-op)
("gpu", "int8", True, "uint8", "uint8", None),
("gpu", "fp16", True, None, None, None), # fp16 algorithm quant config
("cpu", "auto", True, None, None, None), # auto on cpu -> fp16 algorithm
("cpu", "auto", False, None, None, None), # auto on cpu -> fp32 (no-op)
("cpu", "int8", True, "uint8", "uint8", None),
("cpu", "int16", True, "int16", "uint16", None),
("cpu", "fp16", True, None, None, None), # fp16 algorithm quant config
Expand Down Expand Up @@ -2418,7 +2418,7 @@ def test_raw_onnx_full_pipeline(self, tmp_path) -> None:
assert config.compile.ep_config.provider == "qnn"

def test_raw_onnx_cpu(self, tmp_path) -> None:
"""Raw ONNX + device=cpu resolves to an fp16 algorithm quant config, compile=None."""
"""Raw ONNX + device=cpu with auto-precision resolves to fp32 (no-op), compile=None."""
onnx_file = tmp_path / "model.onnx"
onnx_file.write_bytes(b"fake")

Expand All @@ -2433,8 +2433,7 @@ def test_raw_onnx_cpu(self, tmp_path) -> None:
config = generate_onnx_build_config(str(onnx_file), device="cpu")

assert config.export is None
assert config.quant is not None
assert config.quant.mode == "fp16"
assert config.quant is None
assert config.compile is None

def test_quantized_onnx_skips_quant(self, tmp_path) -> None:
Expand Down Expand Up @@ -2761,10 +2760,10 @@ def test_onnx_path_as_pathlib(self, tmp_path) -> None:
assert config.export is None

def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
"""device=auto + precision=auto (defaults) resolves to fp16 on CPU.
"""device=auto + precision=auto (defaults) resolves to fp32 on CPU.

resolve_check_device_ep returns device="auto" but resolve_precision
resolves the EP to pick a concrete device, yielding an fp16 algorithm quant config.
resolves the EP to pick a concrete device, yielding fp32 (no-op, no conversion).
"""
onnx_file = tmp_path / "model.onnx"
onnx_file.write_bytes(b"fake")
Expand All @@ -2779,9 +2778,8 @@ def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
):
config = generate_onnx_build_config(str(onnx_file))

# EP resolves to CPU, auto-precision=fp16 → fp16 algorithm quant config
assert config.quant is not None
assert config.quant.mode == "fp16"
# EP resolves to CPU, auto-precision=fp32 → no quantization, no compile
assert config.quant is None
assert config.compile is None

def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
Expand All @@ -2801,7 +2799,7 @@ def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
mock_resolve.assert_not_called()

def test_raw_onnx_with_gpu(self, tmp_path) -> None:
"""Raw ONNX + device=gpu resolves to an fp16 algorithm quant config, compile=None."""
"""Raw ONNX + device=gpu with auto-precision resolves to fp32 (no-op), compile=None."""
onnx_file = tmp_path / "model.onnx"
onnx_file.write_bytes(b"fake")

Expand All @@ -2815,10 +2813,8 @@ def test_raw_onnx_with_gpu(self, tmp_path) -> None:
):
config = generate_onnx_build_config(str(onnx_file), device="gpu")

# GPU auto-precision is fp16 -> fp16 algorithm quant config, no
# compile (DML has no offline step)
assert config.quant is not None
assert config.quant.mode == "fp16"
# GPU auto-precision is fp32 → no quantization, no compile (DML has no offline step)
assert config.quant is None
assert config.compile is None

def test_ep_override_forwarded(self, tmp_path) -> None:
Expand Down Expand Up @@ -2856,21 +2852,20 @@ class TestResolveQuantCompileConfig:
the HF and ONNX build config paths.
"""

def test_auto_auto_returns_fp16_algorithm(self) -> None:
"""device=auto + precision=auto resolves to an fp16 algorithm quant config.
def test_auto_auto_returns_no_quant(self) -> None:
"""device=auto + precision=auto resolves to fp32 (no quantization, no conversion).

When resolve_check_device_ep returns device="auto" but the EP
resolves to a concrete device, resolve_precision picks auto-precision
(fp16 for CPU), yielding an fp16 algorithm quant config.
(fp32 for CPU), yielding no quant config.
"""
with patch(
"winml.modelkit.sysinfo.resolve_check_device_ep",
return_value=("auto", ["npu", "gpu", "cpu"], ["CPUExecutionProvider"]),
):
quant, compile_cfg = resolve_quant_compile_config()

assert isinstance(quant, WinMLQuantizationConfig)
assert quant.mode == "fp16"
assert quant is None
assert compile_cfg is None

def test_npu_returns_quant_and_compile(self) -> None:
Expand All @@ -2887,28 +2882,26 @@ def test_npu_returns_quant_and_compile(self) -> None:
assert isinstance(compile_cfg, WinMLCompileConfig)
assert compile_cfg.ep_config.provider == "qnn"

def test_gpu_returns_fp16_quant_and_none_compile(self) -> None:
"""device=gpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
def test_gpu_returns_no_quant_and_none_compile(self) -> None:
"""device=gpu returns (None, None) — auto-precision is fp32 (no conversion)."""
with patch(
"winml.modelkit.sysinfo.resolve_check_device_ep",
return_value=("gpu", ["gpu", "cpu"], ["DmlExecutionProvider"]),
):
quant, compile_cfg = resolve_quant_compile_config(device="gpu")

assert isinstance(quant, WinMLQuantizationConfig)
assert quant.mode == "fp16"
assert quant is None
assert compile_cfg is None

def test_cpu_returns_fp16_quant_and_none_compile(self) -> None:
"""device=cpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
def test_cpu_returns_no_quant_and_none_compile(self) -> None:
"""device=cpu returns (None, None) — auto-precision is fp32 (no conversion)."""
with patch(
"winml.modelkit.sysinfo.resolve_check_device_ep",
return_value=("cpu", ["cpu"], ["CPUExecutionProvider"]),
):
quant, compile_cfg = resolve_quant_compile_config(device="cpu")

assert isinstance(quant, WinMLQuantizationConfig)
assert quant.mode == "fp16"
assert quant is None
assert compile_cfg is None

def test_ep_override_changes_provider(self) -> None:
Expand Down
36 changes: 15 additions & 21 deletions tests/unit/config/test_build_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def test_raw_onnx_full_pipeline(self, tmp_path) -> None:
assert config.compile.ep_config.provider == "qnn"

def test_raw_onnx_cpu(self, tmp_path) -> None:
"""Raw ONNX + device=cpu resolves to an fp16 algorithm quant config, compile=None."""
"""Raw ONNX + device=cpu with auto-precision resolves to fp32 (no-op), compile=None."""
onnx_file = tmp_path / "model.onnx"
onnx_file.write_bytes(b"fake")

Expand All @@ -235,8 +235,7 @@ def test_raw_onnx_cpu(self, tmp_path) -> None:
config = generate_onnx_build_config(str(onnx_file), device="cpu")

assert config.export is None
assert config.quant is not None
assert config.quant.mode == "fp16"
assert config.quant is None
assert config.compile is None

def test_quantized_onnx_skips_quant(self, tmp_path) -> None:
Expand Down Expand Up @@ -563,9 +562,9 @@ def test_onnx_path_as_pathlib(self, tmp_path) -> None:
assert config.export is None

def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
"""device=auto + precision=auto resolves to fp16 on CPU.
"""device=auto + precision=auto resolves to fp32 on CPU.

resolve_precision resolves the EP to a concrete device, yielding the fp16 algorithm.
resolve_precision resolves the EP to a concrete device, yielding fp32 (no-op).
"""
onnx_file = tmp_path / "model.onnx"
onnx_file.write_bytes(b"fake")
Expand All @@ -580,8 +579,7 @@ def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
):
config = generate_onnx_build_config(str(onnx_file))

assert config.quant is not None
assert config.quant.mode == "fp16"
assert config.quant is None
assert config.compile is None

def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
Expand All @@ -601,7 +599,7 @@ def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
mock_resolve.assert_not_called()

def test_raw_onnx_with_gpu(self, tmp_path) -> None:
"""Raw ONNX + device=gpu resolves to an fp16 algorithm quant config, compile=None.
"""Raw ONNX + device=gpu with auto-precision resolves to fp32 (no-op), compile=None.

DML has enable_ep_context=False so for_provider("dml") returns None —
no offline compile step is needed.
Expand All @@ -619,9 +617,8 @@ def test_raw_onnx_with_gpu(self, tmp_path) -> None:
):
config = generate_onnx_build_config(str(onnx_file), device="gpu")

# GPU auto-precision is fp16 -> fp16 algorithm quant config; DML has no EPContext step
assert config.quant is not None
assert config.quant.mode == "fp16"
# GPU auto-precision is fp32 → no quantization; DML has no EPContext step
assert config.quant is None
assert config.compile is None

def test_ep_override_forwarded(self, tmp_path) -> None:
Expand Down Expand Up @@ -661,16 +658,15 @@ class TestResolveQuantCompileConfig:
the HF and ONNX build config paths.
"""

def test_auto_auto_returns_fp16_algorithm(self) -> None:
"""device=auto + precision=auto resolves to an fp16 algorithm quant config."""
def test_auto_auto_returns_no_quant(self) -> None:
"""device=auto + precision=auto resolves to fp32 (no quantization, no conversion)."""
with patch(
"winml.modelkit.sysinfo.resolve_check_device_ep",
return_value=("auto", ["npu", "gpu", "cpu"], ["CPUExecutionProvider"]),
):
quant, compile_cfg = resolve_quant_compile_config()

assert isinstance(quant, WinMLQuantizationConfig)
assert quant.mode == "fp16"
assert quant is None
assert compile_cfg is None

def test_npu_returns_quant_and_compile(self) -> None:
Expand All @@ -688,27 +684,25 @@ def test_npu_returns_quant_and_compile(self) -> None:
assert compile_cfg.ep_config.provider == "qnn"

def test_gpu_returns_no_quant_and_none_compile(self) -> None:
"""device=gpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
"""device=gpu returns (None, None) — auto-precision is fp32 (no conversion)."""
with patch(
"winml.modelkit.sysinfo.resolve_check_device_ep",
return_value=("gpu", ["gpu", "cpu"], ["DmlExecutionProvider"]),
):
quant, compile_cfg = resolve_quant_compile_config(device="gpu")

assert isinstance(quant, WinMLQuantizationConfig)
assert quant.mode == "fp16"
assert quant is None
assert compile_cfg is None

def test_cpu_returns_no_quant_and_none_compile(self) -> None:
"""device=cpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
"""device=cpu returns (None, None) — auto-precision is fp32 (no conversion)."""
with patch(
"winml.modelkit.sysinfo.resolve_check_device_ep",
return_value=("cpu", ["cpu"], ["CPUExecutionProvider"]),
):
quant, compile_cfg = resolve_quant_compile_config(device="cpu")

assert isinstance(quant, WinMLQuantizationConfig)
assert quant.mode == "fp16"
assert quant is None
assert compile_cfg is None

def test_ep_override_changes_provider(self) -> None:
Expand Down
8 changes: 4 additions & 4 deletions tests/unit/config/test_precision.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,13 @@ class TestResolvePrecision:
("npu", "w8a16", "npu", "w8a16", "uint8", "uint16", "QNNExecutionProvider"),
("npu", "w8a8", "npu", "w8a8", "uint8", "uint8", "QNNExecutionProvider"),
("npu", "w16a16", "npu", "w16a16", "int16", "uint16", "QNNExecutionProvider"),
("gpu", "auto", "gpu", "fp16", None, None, "DmlExecutionProvider"),
("gpu", "auto", "gpu", "fp32", None, None, "DmlExecutionProvider"),
("gpu", "w8a16", "gpu", "w8a16", "uint8", "uint16", "DmlExecutionProvider"),
("gpu", "int8", "gpu", "int8", "uint8", "uint8", "DmlExecutionProvider"),
("gpu", "int16", "gpu", "int16", "int16", "uint16", "DmlExecutionProvider"),
("gpu", "fp16", "gpu", "fp16", None, None, "DmlExecutionProvider"),
("gpu", "fp32", "gpu", "fp32", None, None, "DmlExecutionProvider"),
("cpu", "auto", "cpu", "fp16", None, None, None),
("cpu", "auto", "cpu", "fp32", None, None, None),
("cpu", "int8", "cpu", "int8", "uint8", "uint8", None),
("cpu", "int16", "cpu", "int16", "int16", "uint16", None),
("cpu", "fp16", "cpu", "fp16", None, None, None),
Expand Down Expand Up @@ -144,15 +144,15 @@ def test_gpu_llm_warning(self, caplog) -> None:
policy = resolve_precision(device="gpu", task="text-generation")

assert policy.device == "gpu"
assert policy.precision == "fp16"
assert policy.precision == "fp32"
assert any("w4a16" in record.message for record in caplog.records)

def test_gpu_non_llm_no_warning(self, caplog) -> None:
"""GPU + image-classification does NOT log w4a16 warning."""
with caplog.at_level(logging.WARNING, logger="winml.modelkit.config.precision"):
policy = resolve_precision(device="gpu", task="image-classification")

assert policy.precision == "fp16"
assert policy.precision == "fp32"
assert not any("w4a16" in record.message for record in caplog.records)

def test_gpu_text2text_warning(self, caplog) -> None:
Expand Down
Loading