From 3de96a9c8875a87c457d5d601a1ecda3352edf19 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 29 Jun 2026 17:35:29 +0800 Subject: [PATCH 1/2] fix: auto-precision on GPU/CPU defaults to fp32 instead of fp16 Previously _AUTO_PRECISION mapped 'gpu' and 'cpu' to 'fp16', causing resolve_quant_compile_config to trigger an unintended FP16 model conversion whenever a user ran without --precision on a GPU/CPU machine (including AMD/MIGraphX). This broke eval tests because the model was silently converted. Fix: change the mapping to 'fp32' (no-op) for both gpu and cpu. FP16 conversion now only happens when the user explicitly passes --precision fp16. Fixes AMD eval failure reported against PR #872. --- src/winml/modelkit/config/precision.py | 6 ++-- tests/unit/config/test_build.py | 49 +++++++++++--------------- tests/unit/config/test_build_onnx.py | 36 ++++++++----------- tests/unit/config/test_precision.py | 8 ++--- 4 files changed, 43 insertions(+), 56 deletions(-) diff --git a/src/winml/modelkit/config/precision.py b/src/winml/modelkit/config/precision.py index d686ff463..afd7052c7 100644 --- a/src/winml/modelkit/config/precision.py +++ b/src/winml/modelkit/config/precision.py @@ -34,8 +34,8 @@ # Default auto-precision mapping: device -> precision _AUTO_PRECISION: dict[str, str] = { "npu": "w8a16", - "gpu": "fp16", - "cpu": "fp16", + "gpu": "fp32", + "cpu": "fp32", } # Precision -> weight/activation type mapping (named presets) @@ -407,7 +407,7 @@ def resolve_precision( # GPU + LLM: warn about w4a16 recommendation if resolved_device == "gpu" and task in _LLM_TASKS: logger.warning( - "GPU + LLM task '%s': auto-precision is fp16 (no quantization). " + "GPU + LLM task '%s': auto-precision is fp32 (no conversion). 
" "For better performance, consider w4a16 quantization manually.", task, ) diff --git a/tests/unit/config/test_build.py b/tests/unit/config/test_build.py index 90f807bc8..f799937b7 100644 --- a/tests/unit/config/test_build.py +++ b/tests/unit/config/test_build.py @@ -1972,10 +1972,10 @@ def _mock_deps( ("npu", "auto", True, "uint8", "uint16", "qnn"), ("npu", "fp16", True, "uint8", "uint8", "qnn"), # fp16 algorithm quant config ("npu", "int8", True, "uint8", "uint8", "qnn"), - ("gpu", "auto", True, None, None, None), # auto on gpu -> fp16 algorithm + ("gpu", "auto", False, None, None, None), # auto on gpu -> fp32 (no-op) ("gpu", "int8", True, "uint8", "uint8", None), ("gpu", "fp16", True, None, None, None), # fp16 algorithm quant config - ("cpu", "auto", True, None, None, None), # auto on cpu -> fp16 algorithm + ("cpu", "auto", False, None, None, None), # auto on cpu -> fp32 (no-op) ("cpu", "int8", True, "uint8", "uint8", None), ("cpu", "int16", True, "int16", "uint16", None), ("cpu", "fp16", True, None, None, None), # fp16 algorithm quant config @@ -2418,7 +2418,7 @@ def test_raw_onnx_full_pipeline(self, tmp_path) -> None: assert config.compile.ep_config.provider == "qnn" def test_raw_onnx_cpu(self, tmp_path) -> None: - """Raw ONNX + device=cpu resolves to an fp16 algorithm quant config, compile=None.""" + """Raw ONNX + device=cpu with auto-precision resolves to fp32 (no-op), compile=None.""" onnx_file = tmp_path / "model.onnx" onnx_file.write_bytes(b"fake") @@ -2433,8 +2433,7 @@ def test_raw_onnx_cpu(self, tmp_path) -> None: config = generate_onnx_build_config(str(onnx_file), device="cpu") assert config.export is None - assert config.quant is not None - assert config.quant.mode == "fp16" + assert config.quant is None assert config.compile is None def test_quantized_onnx_skips_quant(self, tmp_path) -> None: @@ -2761,10 +2760,10 @@ def test_onnx_path_as_pathlib(self, tmp_path) -> None: assert config.export is None def test_auto_device_auto_precision_defaults(self, 
tmp_path) -> None: - """device=auto + precision=auto (defaults) resolves to fp16 on CPU. + """device=auto + precision=auto (defaults) resolves to fp32 on CPU. resolve_check_device_ep returns device="auto" but resolve_precision - resolves the EP to pick a concrete device, yielding an fp16 algorithm quant config. + resolves the EP to pick a concrete device, yielding fp32 (no-op, no conversion). """ onnx_file = tmp_path / "model.onnx" onnx_file.write_bytes(b"fake") @@ -2779,9 +2778,8 @@ def test_auto_device_auto_precision_defaults(self, tmp_path) -> None: ): config = generate_onnx_build_config(str(onnx_file)) - # EP resolves to CPU, auto-precision=fp16 → fp16 algorithm quant config - assert config.quant is not None - assert config.quant.mode == "fp16" + # EP resolves to CPU, auto-precision=fp32 → no quantization, no compile + assert config.quant is None assert config.compile is None def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None: @@ -2801,7 +2799,7 @@ def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None: mock_resolve.assert_not_called() def test_raw_onnx_with_gpu(self, tmp_path) -> None: - """Raw ONNX + device=gpu resolves to an fp16 algorithm quant config, compile=None.""" + """Raw ONNX + device=gpu with auto-precision resolves to fp32 (no-op), compile=None.""" onnx_file = tmp_path / "model.onnx" onnx_file.write_bytes(b"fake") @@ -2815,10 +2813,8 @@ def test_raw_onnx_with_gpu(self, tmp_path) -> None: ): config = generate_onnx_build_config(str(onnx_file), device="gpu") - # GPU auto-precision is fp16 -> fp16 algorithm quant config, no - # compile (DML has no offline step) - assert config.quant is not None - assert config.quant.mode == "fp16" + # GPU auto-precision is fp32 → no quantization, no compile (DML has no offline step) + assert config.quant is None assert config.compile is None def test_ep_override_forwarded(self, tmp_path) -> None: @@ -2856,12 +2852,12 @@ class TestResolveQuantCompileConfig: the HF and 
ONNX build config paths. """ - def test_auto_auto_returns_fp16_algorithm(self) -> None: - """device=auto + precision=auto resolves to an fp16 algorithm quant config. + def test_auto_auto_returns_no_quant(self) -> None: + """device=auto + precision=auto resolves to fp32 (no quantization, no conversion). When resolve_check_device_ep returns device="auto" but the EP resolves to a concrete device, resolve_precision picks auto-precision - (fp16 for CPU), yielding an fp16 algorithm quant config. + (fp32 for CPU), yielding no quant config. """ with patch( "winml.modelkit.sysinfo.resolve_check_device_ep", @@ -2869,8 +2865,7 @@ def test_auto_auto_returns_fp16_algorithm(self) -> None: ): quant, compile_cfg = resolve_quant_compile_config() - assert isinstance(quant, WinMLQuantizationConfig) - assert quant.mode == "fp16" + assert quant is None assert compile_cfg is None def test_npu_returns_quant_and_compile(self) -> None: @@ -2887,28 +2882,26 @@ def test_npu_returns_quant_and_compile(self) -> None: assert isinstance(compile_cfg, WinMLCompileConfig) assert compile_cfg.ep_config.provider == "qnn" - def test_gpu_returns_fp16_quant_and_none_compile(self) -> None: - """device=gpu returns (fp16 algorithm quant config, None) — auto-precision is fp16.""" + def test_gpu_returns_no_quant_and_none_compile(self) -> None: + """device=gpu returns (None, None) — auto-precision is fp32 (no conversion).""" with patch( "winml.modelkit.sysinfo.resolve_check_device_ep", return_value=("gpu", ["gpu", "cpu"], ["DmlExecutionProvider"]), ): quant, compile_cfg = resolve_quant_compile_config(device="gpu") - assert isinstance(quant, WinMLQuantizationConfig) - assert quant.mode == "fp16" + assert quant is None assert compile_cfg is None - def test_cpu_returns_fp16_quant_and_none_compile(self) -> None: - """device=cpu returns (fp16 algorithm quant config, None) — auto-precision is fp16.""" + def test_cpu_returns_no_quant_and_none_compile(self) -> None: + """device=cpu returns (None, None) — auto-precision 
is fp32 (no conversion).""" with patch( "winml.modelkit.sysinfo.resolve_check_device_ep", return_value=("cpu", ["cpu"], ["CPUExecutionProvider"]), ): quant, compile_cfg = resolve_quant_compile_config(device="cpu") - assert isinstance(quant, WinMLQuantizationConfig) - assert quant.mode == "fp16" + assert quant is None assert compile_cfg is None def test_ep_override_changes_provider(self) -> None: diff --git a/tests/unit/config/test_build_onnx.py b/tests/unit/config/test_build_onnx.py index 57805c619..7efe47bf8 100644 --- a/tests/unit/config/test_build_onnx.py +++ b/tests/unit/config/test_build_onnx.py @@ -220,7 +220,7 @@ def test_raw_onnx_full_pipeline(self, tmp_path) -> None: assert config.compile.ep_config.provider == "qnn" def test_raw_onnx_cpu(self, tmp_path) -> None: - """Raw ONNX + device=cpu resolves to an fp16 algorithm quant config, compile=None.""" + """Raw ONNX + device=cpu with auto-precision resolves to fp32 (no-op), compile=None.""" onnx_file = tmp_path / "model.onnx" onnx_file.write_bytes(b"fake") @@ -235,8 +235,7 @@ def test_raw_onnx_cpu(self, tmp_path) -> None: config = generate_onnx_build_config(str(onnx_file), device="cpu") assert config.export is None - assert config.quant is not None - assert config.quant.mode == "fp16" + assert config.quant is None assert config.compile is None def test_quantized_onnx_skips_quant(self, tmp_path) -> None: @@ -563,9 +562,9 @@ def test_onnx_path_as_pathlib(self, tmp_path) -> None: assert config.export is None def test_auto_device_auto_precision_defaults(self, tmp_path) -> None: - """device=auto + precision=auto resolves to fp16 on CPU. + """device=auto + precision=auto resolves to fp32 on CPU. - resolve_precision resolves the EP to a concrete device, yielding the fp16 algorithm. + resolve_precision resolves the EP to a concrete device, yielding fp32 (no-op). 
""" onnx_file = tmp_path / "model.onnx" onnx_file.write_bytes(b"fake") @@ -580,8 +579,7 @@ def test_auto_device_auto_precision_defaults(self, tmp_path) -> None: ): config = generate_onnx_build_config(str(onnx_file)) - assert config.quant is not None - assert config.quant.mode == "fp16" + assert config.quant is None assert config.compile is None def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None: @@ -601,7 +599,7 @@ def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None: mock_resolve.assert_not_called() def test_raw_onnx_with_gpu(self, tmp_path) -> None: - """Raw ONNX + device=gpu resolves to an fp16 algorithm quant config, compile=None. + """Raw ONNX + device=gpu with auto-precision resolves to fp32 (no-op), compile=None. DML has enable_ep_context=False so for_provider("dml") returns None — no offline compile step is needed. @@ -619,9 +617,8 @@ def test_raw_onnx_with_gpu(self, tmp_path) -> None: ): config = generate_onnx_build_config(str(onnx_file), device="gpu") - # GPU auto-precision is fp16 -> fp16 algorithm quant config; DML has no EPContext step - assert config.quant is not None - assert config.quant.mode == "fp16" + # GPU auto-precision is fp32 → no quantization; DML has no EPContext step + assert config.quant is None assert config.compile is None def test_ep_override_forwarded(self, tmp_path) -> None: @@ -661,16 +658,15 @@ class TestResolveQuantCompileConfig: the HF and ONNX build config paths. 
""" - def test_auto_auto_returns_fp16_algorithm(self) -> None: - """device=auto + precision=auto resolves to an fp16 algorithm quant config.""" + def test_auto_auto_returns_no_quant(self) -> None: + """device=auto + precision=auto resolves to fp32 (no quantization, no conversion).""" with patch( "winml.modelkit.sysinfo.resolve_check_device_ep", return_value=("auto", ["npu", "gpu", "cpu"], ["CPUExecutionProvider"]), ): quant, compile_cfg = resolve_quant_compile_config() - assert isinstance(quant, WinMLQuantizationConfig) - assert quant.mode == "fp16" + assert quant is None assert compile_cfg is None def test_npu_returns_quant_and_compile(self) -> None: @@ -688,27 +684,25 @@ def test_npu_returns_quant_and_compile(self) -> None: assert compile_cfg.ep_config.provider == "qnn" def test_gpu_returns_fp16_quant_and_none_compile(self) -> None: - """device=gpu returns (fp16 algorithm quant config, None) — auto-precision is fp16.""" + """device=gpu returns (None, None) — auto-precision is fp32 (no conversion).""" with patch( "winml.modelkit.sysinfo.resolve_check_device_ep", return_value=("gpu", ["gpu", "cpu"], ["DmlExecutionProvider"]), ): quant, compile_cfg = resolve_quant_compile_config(device="gpu") - assert isinstance(quant, WinMLQuantizationConfig) - assert quant.mode == "fp16" + assert quant is None assert compile_cfg is None def test_cpu_returns_fp16_quant_and_none_compile(self) -> None: - """device=cpu returns (fp16 algorithm quant config, None) — auto-precision is fp16.""" + """device=cpu returns (None, None) — auto-precision is fp32 (no conversion).""" with patch( "winml.modelkit.sysinfo.resolve_check_device_ep", return_value=("cpu", ["cpu"], ["CPUExecutionProvider"]), ): quant, compile_cfg = resolve_quant_compile_config(device="cpu") - assert isinstance(quant, WinMLQuantizationConfig) - assert quant.mode == "fp16" + assert quant is None assert compile_cfg is None def test_ep_override_changes_provider(self) -> None: diff --git a/tests/unit/config/test_precision.py 
b/tests/unit/config/test_precision.py index 4403cbee0..a141df272 100644 --- a/tests/unit/config/test_precision.py +++ b/tests/unit/config/test_precision.py @@ -51,13 +51,13 @@ class TestResolvePrecision: ("npu", "w8a16", "npu", "w8a16", "uint8", "uint16", "QNNExecutionProvider"), ("npu", "w8a8", "npu", "w8a8", "uint8", "uint8", "QNNExecutionProvider"), ("npu", "w16a16", "npu", "w16a16", "int16", "uint16", "QNNExecutionProvider"), - ("gpu", "auto", "gpu", "fp16", None, None, "DmlExecutionProvider"), + ("gpu", "auto", "gpu", "fp32", None, None, "DmlExecutionProvider"), ("gpu", "w8a16", "gpu", "w8a16", "uint8", "uint16", "DmlExecutionProvider"), ("gpu", "int8", "gpu", "int8", "uint8", "uint8", "DmlExecutionProvider"), ("gpu", "int16", "gpu", "int16", "int16", "uint16", "DmlExecutionProvider"), ("gpu", "fp16", "gpu", "fp16", None, None, "DmlExecutionProvider"), ("gpu", "fp32", "gpu", "fp32", None, None, "DmlExecutionProvider"), - ("cpu", "auto", "cpu", "fp16", None, None, None), + ("cpu", "auto", "cpu", "fp32", None, None, None), ("cpu", "int8", "cpu", "int8", "uint8", "uint8", None), ("cpu", "int16", "cpu", "int16", "int16", "uint16", None), ("cpu", "fp16", "cpu", "fp16", None, None, None), @@ -144,7 +144,7 @@ def test_gpu_llm_warning(self, caplog) -> None: policy = resolve_precision(device="gpu", task="text-generation") assert policy.device == "gpu" - assert policy.precision == "fp16" + assert policy.precision == "fp32" assert any("w4a16" in record.message for record in caplog.records) def test_gpu_non_llm_no_warning(self, caplog) -> None: @@ -152,7 +152,7 @@ def test_gpu_non_llm_no_warning(self, caplog) -> None: with caplog.at_level(logging.WARNING, logger="winml.modelkit.config.precision"): policy = resolve_precision(device="gpu", task="image-classification") - assert policy.precision == "fp16" + assert policy.precision == "fp32" assert not any("w4a16" in record.message for record in caplog.records) def test_gpu_text2text_warning(self, caplog) -> None: From 
3d61ff616442c4641abd879bec481b3bc9425081 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 29 Jun 2026 17:46:16 +0800 Subject: [PATCH 2/2] test(e2e): add regression guard for auto-precision GPU/CPU fix Add three e2e tests in TestConfigFlagVariations to guard against regression of the auto-precision GPU/CPU bug fixed in #998: - test_cpu_auto_precision_no_quant: device=cpu + precision=auto must resolve to fp32 (no quant config), not fp16. - test_gpu_auto_precision_no_quant: device=gpu + precision=auto must resolve to fp32 (no quant config); the old fp16 default broke AMD/MIGraphX eval. - test_explicit_fp16_still_triggers_quant: --precision fp16 (explicit) must still produce an fp16 quant config, ensuring the fix didn't regress intentional FP16 conversion. All 41 e2e config tests pass. --- tests/e2e/test_config_e2e.py | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/e2e/test_config_e2e.py b/tests/e2e/test_config_e2e.py index 7be700737..a8a1cb9c0 100644 --- a/tests/e2e/test_config_e2e.py +++ b/tests/e2e/test_config_e2e.py @@ -499,3 +499,43 @@ def test_module_flag_returns_list(self) -> None: for cfg in data: assert "loader" in cfg assert "export" in cfg + + # --- auto-precision behaviour (PR #998 regression guard) ------------- + def test_cpu_auto_precision_no_quant(self) -> None: + """device=cpu + precision=auto must NOT trigger FP16 conversion. + + Before the fix, _AUTO_PRECISION mapped cpu→fp16 which silently + converted every model on CPU when no --precision flag was passed. + After the fix, cpu auto-precision resolves to fp32 (no-op). + """ + data = _run_config("-m", self.MODEL, "-t", self.TASK, "-d", "cpu") + _assert_hf_config_structure(data) + assert data.get("quant") is None, ( + f"cpu + auto precision should resolve to fp32 (no quant). Got: {data.get('quant')}" + ) + + def test_gpu_auto_precision_no_quant(self) -> None: + """device=gpu + precision=auto must NOT trigger FP16 conversion.
+ + Before the fix, _AUTO_PRECISION mapped gpu→fp16, which broke AMD + (MIGraphX) eval tests because MIGraphX received an FP16 model it + wasn't expecting. After the fix, gpu auto-precision resolves to + fp32 (no-op). + """ + data = _run_config("-m", self.MODEL, "-t", self.TASK, "-d", "gpu") + _assert_hf_config_structure(data) + assert data.get("quant") is None, ( + f"gpu + auto precision should resolve to fp32 (no quant). Got: {data.get('quant')}" + ) + + def test_explicit_fp16_still_triggers_quant(self) -> None: + """--precision fp16 (explicit) must still produce an fp16 quant config. + + The fix must not regress explicit FP16 requests — only auto-precision + should default to fp32. + """ + data = _run_config("-m", self.MODEL, "-t", self.TASK, "-d", "cpu", "-p", "fp16") + _assert_hf_config_structure(data) + quant = data.get("quant") + assert quant is not None, "Explicit --precision fp16 should produce a quant config" + assert quant.get("mode") == "fp16"