From 3de96a9c8875a87c457d5d601a1ecda3352edf19 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Mon, 29 Jun 2026 17:35:29 +0800
Subject: [PATCH 1/2] fix: auto-precision on GPU/CPU defaults to fp32 instead
 of fp16

Previously _AUTO_PRECISION mapped 'gpu' and 'cpu' to 'fp16', causing
resolve_quant_compile_config to trigger an unintended FP16 model
conversion whenever a user ran without --precision on a GPU/CPU machine
(including AMD/MIGraphX). This broke eval tests because the model was
silently converted.

Fix: change the mapping to 'fp32' (no-op) for both gpu and cpu.
FP16 conversion now only happens when the user explicitly passes
--precision fp16.

Fixes AMD eval failure reported against PR #872.
---
 src/winml/modelkit/config/precision.py |  6 ++--
 tests/unit/config/test_build.py        | 49 +++++++++++---------------
 tests/unit/config/test_build_onnx.py   | 36 ++++++++-----------
 tests/unit/config/test_precision.py    |  8 ++---
 4 files changed, 43 insertions(+), 56 deletions(-)

diff --git a/src/winml/modelkit/config/precision.py b/src/winml/modelkit/config/precision.py
index d686ff463..afd7052c7 100644
--- a/src/winml/modelkit/config/precision.py
+++ b/src/winml/modelkit/config/precision.py
@@ -34,8 +34,8 @@
 # Default auto-precision mapping: device -> precision
 _AUTO_PRECISION: dict[str, str] = {
     "npu": "w8a16",
-    "gpu": "fp16",
-    "cpu": "fp16",
+    "gpu": "fp32",
+    "cpu": "fp32",
 }
 
 # Precision -> weight/activation type mapping (named presets)
@@ -407,7 +407,7 @@ def resolve_precision(
         # GPU + LLM: warn about w4a16 recommendation
         if resolved_device == "gpu" and task in _LLM_TASKS:
             logger.warning(
-                "GPU + LLM task '%s': auto-precision is fp16 (no quantization). "
+                "GPU + LLM task '%s': auto-precision is fp32 (no conversion). "
                 "For better performance, consider w4a16 quantization manually.",
                 task,
             )
diff --git a/tests/unit/config/test_build.py b/tests/unit/config/test_build.py
index 90f807bc8..f799937b7 100644
--- a/tests/unit/config/test_build.py
+++ b/tests/unit/config/test_build.py
@@ -1972,10 +1972,10 @@ def _mock_deps(
             ("npu", "auto", True, "uint8", "uint16", "qnn"),
             ("npu", "fp16", True, "uint8", "uint8", "qnn"),  # fp16 algorithm quant config
             ("npu", "int8", True, "uint8", "uint8", "qnn"),
-            ("gpu", "auto", True, None, None, None),  # auto on gpu -> fp16 algorithm
+            ("gpu", "auto", False, None, None, None),  # auto on gpu -> fp32 (no-op)
             ("gpu", "int8", True, "uint8", "uint8", None),
             ("gpu", "fp16", True, None, None, None),  # fp16 algorithm quant config
-            ("cpu", "auto", True, None, None, None),  # auto on cpu -> fp16 algorithm
+            ("cpu", "auto", False, None, None, None),  # auto on cpu -> fp32 (no-op)
             ("cpu", "int8", True, "uint8", "uint8", None),
             ("cpu", "int16", True, "int16", "uint16", None),
             ("cpu", "fp16", True, None, None, None),  # fp16 algorithm quant config
@@ -2418,7 +2418,7 @@ def test_raw_onnx_full_pipeline(self, tmp_path) -> None:
         assert config.compile.ep_config.provider == "qnn"
 
     def test_raw_onnx_cpu(self, tmp_path) -> None:
-        """Raw ONNX + device=cpu resolves to an fp16 algorithm quant config, compile=None."""
+        """Raw ONNX + device=cpu with auto-precision resolves to fp32 (no-op), compile=None."""
         onnx_file = tmp_path / "model.onnx"
         onnx_file.write_bytes(b"fake")
 
@@ -2433,8 +2433,7 @@ def test_raw_onnx_cpu(self, tmp_path) -> None:
             config = generate_onnx_build_config(str(onnx_file), device="cpu")
 
         assert config.export is None
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        assert config.quant is None
         assert config.compile is None
 
     def test_quantized_onnx_skips_quant(self, tmp_path) -> None:
@@ -2761,10 +2760,10 @@ def test_onnx_path_as_pathlib(self, tmp_path) -> None:
         assert config.export is None
 
     def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
-        """device=auto + precision=auto (defaults) resolves to fp16 on CPU.
+        """device=auto + precision=auto (defaults) resolves to fp32 on CPU.
 
         resolve_check_device_ep returns device="auto" but resolve_precision
-        resolves the EP to pick a concrete device, yielding an fp16 algorithm quant config.
+        resolves the EP to pick a concrete device, yielding fp32 (no-op, no conversion).
         """
         onnx_file = tmp_path / "model.onnx"
         onnx_file.write_bytes(b"fake")
@@ -2779,9 +2778,8 @@ def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
         ):
             config = generate_onnx_build_config(str(onnx_file))
 
-        # EP resolves to CPU, auto-precision=fp16 → fp16 algorithm quant config
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        # EP resolves to CPU, auto-precision=fp32 → no quantization, no compile
+        assert config.quant is None
         assert config.compile is None
 
     def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
@@ -2801,7 +2799,7 @@ def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
         mock_resolve.assert_not_called()
 
     def test_raw_onnx_with_gpu(self, tmp_path) -> None:
-        """Raw ONNX + device=gpu resolves to an fp16 algorithm quant config, compile=None."""
+        """Raw ONNX + device=gpu with auto-precision resolves to fp32 (no-op), compile=None."""
         onnx_file = tmp_path / "model.onnx"
         onnx_file.write_bytes(b"fake")
 
@@ -2815,10 +2813,8 @@ def test_raw_onnx_with_gpu(self, tmp_path) -> None:
         ):
             config = generate_onnx_build_config(str(onnx_file), device="gpu")
 
-        # GPU auto-precision is fp16 -> fp16 algorithm quant config, no
-        # compile (DML has no offline step)
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        # GPU auto-precision is fp32 → no quantization, no compile (DML has no offline step)
+        assert config.quant is None
         assert config.compile is None
 
     def test_ep_override_forwarded(self, tmp_path) -> None:
@@ -2856,12 +2852,12 @@ class TestResolveQuantCompileConfig:
     the HF and ONNX build config paths.
     """
 
-    def test_auto_auto_returns_fp16_algorithm(self) -> None:
-        """device=auto + precision=auto resolves to an fp16 algorithm quant config.
+    def test_auto_auto_returns_no_quant(self) -> None:
+        """device=auto + precision=auto resolves to fp32 (no quantization, no conversion).
 
         When resolve_check_device_ep returns device="auto" but the EP
         resolves to a concrete device, resolve_precision picks auto-precision
-        (fp16 for CPU), yielding an fp16 algorithm quant config.
+        (fp32 for CPU), yielding no quant config.
         """
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
@@ -2869,8 +2865,7 @@ def test_auto_auto_returns_fp16_algorithm(self) -> None:
         ):
             quant, compile_cfg = resolve_quant_compile_config()
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None
         assert compile_cfg is None
 
     def test_npu_returns_quant_and_compile(self) -> None:
@@ -2887,28 +2882,26 @@ def test_npu_returns_quant_and_compile(self) -> None:
         assert isinstance(compile_cfg, WinMLCompileConfig)
         assert compile_cfg.ep_config.provider == "qnn"
 
-    def test_gpu_returns_fp16_quant_and_none_compile(self) -> None:
-        """device=gpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
+    def test_gpu_returns_no_quant_and_none_compile(self) -> None:
+        """device=gpu returns (None, None) — auto-precision is fp32 (no conversion)."""
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
             return_value=("gpu", ["gpu", "cpu"], ["DmlExecutionProvider"]),
         ):
             quant, compile_cfg = resolve_quant_compile_config(device="gpu")
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None
         assert compile_cfg is None
 
-    def test_cpu_returns_fp16_quant_and_none_compile(self) -> None:
-        """device=cpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
+    def test_cpu_returns_no_quant_and_none_compile(self) -> None:
+        """device=cpu returns (None, None) — auto-precision is fp32 (no conversion)."""
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
             return_value=("cpu", ["cpu"], ["CPUExecutionProvider"]),
         ):
             quant, compile_cfg = resolve_quant_compile_config(device="cpu")
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None
         assert compile_cfg is None
 
     def test_ep_override_changes_provider(self) -> None:
diff --git a/tests/unit/config/test_build_onnx.py b/tests/unit/config/test_build_onnx.py
index 57805c619..7efe47bf8 100644
--- a/tests/unit/config/test_build_onnx.py
+++ b/tests/unit/config/test_build_onnx.py
@@ -220,7 +220,7 @@ def test_raw_onnx_full_pipeline(self, tmp_path) -> None:
         assert config.compile.ep_config.provider == "qnn"
 
     def test_raw_onnx_cpu(self, tmp_path) -> None:
-        """Raw ONNX + device=cpu resolves to an fp16 algorithm quant config, compile=None."""
+        """Raw ONNX + device=cpu with auto-precision resolves to fp32 (no-op), compile=None."""
         onnx_file = tmp_path / "model.onnx"
         onnx_file.write_bytes(b"fake")
 
@@ -235,8 +235,7 @@ def test_raw_onnx_cpu(self, tmp_path) -> None:
             config = generate_onnx_build_config(str(onnx_file), device="cpu")
 
         assert config.export is None
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        assert config.quant is None
         assert config.compile is None
 
     def test_quantized_onnx_skips_quant(self, tmp_path) -> None:
@@ -563,9 +562,9 @@ def test_onnx_path_as_pathlib(self, tmp_path) -> None:
         assert config.export is None
 
     def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
-        """device=auto + precision=auto resolves to fp16 on CPU.
+        """device=auto + precision=auto resolves to fp32 on CPU.
 
-        resolve_precision resolves the EP to a concrete device, yielding the fp16 algorithm.
+        resolve_precision resolves the EP to a concrete device, yielding fp32 (no-op).
         """
         onnx_file = tmp_path / "model.onnx"
         onnx_file.write_bytes(b"fake")
@@ -580,8 +579,7 @@ def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
         ):
             config = generate_onnx_build_config(str(onnx_file))
 
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        assert config.quant is None
         assert config.compile is None
 
     def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
@@ -601,7 +599,7 @@ def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
         mock_resolve.assert_not_called()
 
     def test_raw_onnx_with_gpu(self, tmp_path) -> None:
-        """Raw ONNX + device=gpu resolves to an fp16 algorithm quant config, compile=None.
+        """Raw ONNX + device=gpu with auto-precision resolves to fp32 (no-op), compile=None.
 
         DML has enable_ep_context=False so for_provider("dml") returns None —
         no offline compile step is needed.
@@ -619,9 +617,8 @@ def test_raw_onnx_with_gpu(self, tmp_path) -> None:
         ):
             config = generate_onnx_build_config(str(onnx_file), device="gpu")
 
-        # GPU auto-precision is fp16 -> fp16 algorithm quant config; DML has no EPContext step
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        # GPU auto-precision is fp32 → no quantization; DML has no EPContext step
+        assert config.quant is None
         assert config.compile is None
 
     def test_ep_override_forwarded(self, tmp_path) -> None:
@@ -661,16 +658,15 @@ class TestResolveQuantCompileConfig:
     the HF and ONNX build config paths.
     """
 
-    def test_auto_auto_returns_fp16_algorithm(self) -> None:
-        """device=auto + precision=auto resolves to an fp16 algorithm quant config."""
+    def test_auto_auto_returns_no_quant(self) -> None:
+        """device=auto + precision=auto resolves to fp32 (no quantization, no conversion)."""
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
             return_value=("auto", ["npu", "gpu", "cpu"], ["CPUExecutionProvider"]),
         ):
             quant, compile_cfg = resolve_quant_compile_config()
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None
         assert compile_cfg is None
 
     def test_npu_returns_quant_and_compile(self) -> None:
@@ -688,27 +684,25 @@ def test_npu_returns_quant_and_compile(self) -> None:
         assert compile_cfg.ep_config.provider == "qnn"
 
     def test_gpu_returns_fp16_quant_and_none_compile(self) -> None:
-        """device=gpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
+        """device=gpu returns (None, None) — auto-precision is fp32 (no conversion)."""
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
             return_value=("gpu", ["gpu", "cpu"], ["DmlExecutionProvider"]),
         ):
             quant, compile_cfg = resolve_quant_compile_config(device="gpu")
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None
         assert compile_cfg is None
 
     def test_cpu_returns_fp16_quant_and_none_compile(self) -> None:
-        """device=cpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
+        """device=cpu returns (None, None) — auto-precision is fp32 (no conversion)."""
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
             return_value=("cpu", ["cpu"], ["CPUExecutionProvider"]),
         ):
             quant, compile_cfg = resolve_quant_compile_config(device="cpu")
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None
         assert compile_cfg is None
 
     def test_ep_override_changes_provider(self) -> None:
diff --git a/tests/unit/config/test_precision.py b/tests/unit/config/test_precision.py
index 4403cbee0..a141df272 100644
--- a/tests/unit/config/test_precision.py
+++ b/tests/unit/config/test_precision.py
@@ -51,13 +51,13 @@ class TestResolvePrecision:
             ("npu", "w8a16", "npu", "w8a16", "uint8", "uint16", "QNNExecutionProvider"),
             ("npu", "w8a8", "npu", "w8a8", "uint8", "uint8", "QNNExecutionProvider"),
             ("npu", "w16a16", "npu", "w16a16", "int16", "uint16", "QNNExecutionProvider"),
-            ("gpu", "auto", "gpu", "fp16", None, None, "DmlExecutionProvider"),
+            ("gpu", "auto", "gpu", "fp32", None, None, "DmlExecutionProvider"),
             ("gpu", "w8a16", "gpu", "w8a16", "uint8", "uint16", "DmlExecutionProvider"),
             ("gpu", "int8", "gpu", "int8", "uint8", "uint8", "DmlExecutionProvider"),
             ("gpu", "int16", "gpu", "int16", "int16", "uint16", "DmlExecutionProvider"),
             ("gpu", "fp16", "gpu", "fp16", None, None, "DmlExecutionProvider"),
             ("gpu", "fp32", "gpu", "fp32", None, None, "DmlExecutionProvider"),
-            ("cpu", "auto", "cpu", "fp16", None, None, None),
+            ("cpu", "auto", "cpu", "fp32", None, None, None),
             ("cpu", "int8", "cpu", "int8", "uint8", "uint8", None),
             ("cpu", "int16", "cpu", "int16", "int16", "uint16", None),
             ("cpu", "fp16", "cpu", "fp16", None, None, None),
@@ -144,7 +144,7 @@ def test_gpu_llm_warning(self, caplog) -> None:
             policy = resolve_precision(device="gpu", task="text-generation")
 
         assert policy.device == "gpu"
-        assert policy.precision == "fp16"
+        assert policy.precision == "fp32"
         assert any("w4a16" in record.message for record in caplog.records)
 
     def test_gpu_non_llm_no_warning(self, caplog) -> None:
@@ -152,7 +152,7 @@ def test_gpu_non_llm_no_warning(self, caplog) -> None:
         with caplog.at_level(logging.WARNING, logger="winml.modelkit.config.precision"):
             policy = resolve_precision(device="gpu", task="image-classification")
 
-        assert policy.precision == "fp16"
+        assert policy.precision == "fp32"
         assert not any("w4a16" in record.message for record in caplog.records)
 
     def test_gpu_text2text_warning(self, caplog) -> None:

From 3d61ff616442c4641abd879bec481b3bc9425081 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Mon, 29 Jun 2026 17:46:16 +0800
Subject: [PATCH 2/2] test(e2e): add regression guard for auto-precision
 GPU/CPU fix

Add three e2e tests in TestConfigFlagVariations to guard against
regression of the auto-precision GPU/CPU bug fixed in #998:

- test_cpu_auto_precision_no_quant: device=cpu + precision=auto
  must resolve to fp32 (no quant config), not fp16.
- test_gpu_auto_precision_no_quant: device=gpu + precision=auto
  must resolve to fp32 (no quant config), not fp16; the old fp16
  default is what broke the AMD/MIGraphX eval tests.
- test_explicit_fp16_still_triggers_quant: --precision fp16 (explicit)
  must still produce an fp16 quant config, ensuring the fix didn't
  regress intentional FP16 conversion.

All 41 e2e config tests pass.
---
 tests/e2e/test_config_e2e.py | 40 ++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tests/e2e/test_config_e2e.py b/tests/e2e/test_config_e2e.py
index 7be700737..a8a1cb9c0 100644
--- a/tests/e2e/test_config_e2e.py
+++ b/tests/e2e/test_config_e2e.py
@@ -499,3 +499,43 @@ def test_module_flag_returns_list(self) -> None:
         for cfg in data:
             assert "loader" in cfg
             assert "export" in cfg
+
+    # --- auto-precision behaviour (PR #998 regression guard) -------------
+    def test_cpu_auto_precision_no_quant(self) -> None:
+        """device=cpu + precision=auto must NOT trigger FP16 conversion.
+
+        Before the fix, _AUTO_PRECISION mapped cpu→fp16 which silently
+        converted every model on CPU when no --precision flag was passed.
+        After the fix, cpu auto-precision resolves to fp32 (no-op).
+        """
+        data = _run_config("-m", self.MODEL, "-t", self.TASK, "-d", "cpu")
+        _assert_hf_config_structure(data)
+        assert data.get("quant") is None, (
+            f"cpu + auto precision should resolve to fp32 (no quant). Got: {data.get('quant')}"
+        )
+
+    def test_gpu_auto_precision_no_quant(self) -> None:
+        """device=gpu + precision=auto must NOT trigger FP16 conversion.
+
+        Before the fix, _AUTO_PRECISION mapped gpu→fp16, which broke AMD
+        (MIGraphX) eval tests because MIGraphX received an FP16 model it
+        wasn't expecting. After the fix, gpu auto-precision resolves to
+        fp32 (no-op).
+        """
+        data = _run_config("-m", self.MODEL, "-t", self.TASK, "-d", "gpu")
+        _assert_hf_config_structure(data)
+        assert data.get("quant") is None, (
+            f"gpu + auto precision should resolve to fp32 (no quant). Got: {data.get('quant')}"
+        )
+
+    def test_explicit_fp16_still_triggers_quant(self) -> None:
+        """--precision fp16 (explicit) must still produce an fp16 quant config.
+
+        The fix must not regress explicit FP16 requests — only auto-precision
+        should default to fp32.
+        """
+        data = _run_config("-m", self.MODEL, "-t", self.TASK, "-d", "cpu", "-p", "fp16")
+        _assert_hf_config_structure(data)
+        quant = data.get("quant")
+        assert quant is not None, "Explicit --precision fp16 should produce a quant config"
+        assert quant.get("mode") == "fp16"