microsoft · DingmaomaoBJTU · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026
@@ -34,8 +34,8 @@
 # Default auto-precision mapping: device -> precision
 _AUTO_PRECISION: dict[str, str] = {
     "npu": "w8a16",
-    "gpu": "fp16",
-    "cpu": "fp16",
+    "gpu": "fp32",  # auto = no conversion; fp16 must be requested explicitly
+    "cpu": "fp32",  # auto = no conversion; fp16 must be requested explicitly
 }
 
 # Precision -> weight/activation type mapping (named presets)
@@ -407,7 +407,7 @@ def resolve_precision(
         # GPU + LLM: warn about w4a16 recommendation
         if resolved_device == "gpu" and task in _LLM_TASKS:
             logger.warning(
-                "GPU + LLM task '%s': auto-precision is fp16 (no quantization). "
+                "GPU + LLM task '%s': auto-precision is fp32 (no conversion). "
                 "For better performance, consider w4a16 quantization manually.",
                 task,
             )

@@ -499,3 +499,43 @@ def test_module_flag_returns_list(self) -> None:
         for cfg in data:
             assert "loader" in cfg
             assert "export" in cfg
+
+    # --- auto-precision behaviour (PR #998 regression guard) -------------
+    def test_cpu_auto_precision_no_quant(self) -> None:
+        """``-d cpu`` with no ``-p`` flag must yield a config whose ``quant`` is None.
+
+        Before the fix, _AUTO_PRECISION mapped cpu→fp16, which silently
+        converted every model on CPU when no --precision flag was passed.
+        After the fix, cpu auto-precision resolves to fp32 (no-op).
+        """
+        data = _run_config("-m", self.MODEL, "-t", self.TASK, "-d", "cpu")
+        _assert_hf_config_structure(data)
+        assert data.get("quant") is None, (
+            f"cpu + auto precision should resolve to fp32 (no quant). Got: {data.get('quant')}"
+        )
+
+    def test_gpu_auto_precision_no_quant(self) -> None:
+        """``-d gpu`` with no ``-p`` flag must yield a config whose ``quant`` is None.
+
+        Before the fix, _AUTO_PRECISION mapped gpu→fp16, which broke AMD
+        (MIGraphX) eval tests because MIGraphX received an FP16 model it
+        wasn't expecting. After the fix, gpu auto-precision resolves to
+        fp32 (no-op).
+        """
+        data = _run_config("-m", self.MODEL, "-t", self.TASK, "-d", "gpu")
+        _assert_hf_config_structure(data)
+        assert data.get("quant") is None, (
+            f"gpu + auto precision should resolve to fp32 (no quant). Got: {data.get('quant')}"
+        )
+
+    def test_explicit_fp16_still_triggers_quant(self) -> None:
+        """``-p fp16`` (explicit) must still produce a quant config whose mode is "fp16".
+
+        The fix must not regress explicit FP16 requests — only auto-precision
+        should default to fp32.
+        """
+        data = _run_config("-m", self.MODEL, "-t", self.TASK, "-d", "cpu", "-p", "fp16")
+        _assert_hf_config_structure(data)
+        quant = data.get("quant")
+        assert quant is not None, "Explicit --precision fp16 should produce a quant config"
+        assert quant.get("mode") == "fp16"  # explicit request is honoured on cpu
@@ -1972,10 +1972,10 @@ def _mock_deps(
             ("npu", "auto", True, "uint8", "uint16", "qnn"),
             ("npu", "fp16", True, "uint8", "uint8", "qnn"),  # fp16 algorithm quant config
             ("npu", "int8", True, "uint8", "uint8", "qnn"),
-            ("gpu", "auto", True, None, None, None),  # auto on gpu -> fp16 algorithm
+            ("gpu", "auto", False, None, None, None),  # auto on gpu -> fp32 (no-op)
             ("gpu", "int8", True, "uint8", "uint8", None),
             ("gpu", "fp16", True, None, None, None),  # fp16 algorithm quant config
-            ("cpu", "auto", True, None, None, None),  # auto on cpu -> fp16 algorithm
+            ("cpu", "auto", False, None, None, None),  # auto on cpu -> fp32 (no-op)
             ("cpu", "int8", True, "uint8", "uint8", None),
             ("cpu", "int16", True, "int16", "uint16", None),
             ("cpu", "fp16", True, None, None, None),  # fp16 algorithm quant config
@@ -2418,7 +2418,7 @@ def test_raw_onnx_full_pipeline(self, tmp_path) -> None:
         assert config.compile.ep_config.provider == "qnn"
 
     def test_raw_onnx_cpu(self, tmp_path) -> None:
-        """Raw ONNX + device=cpu resolves to an fp16 algorithm quant config, compile=None."""
+        """Raw ONNX + device=cpu with auto-precision resolves to fp32 (no-op), compile=None."""
         onnx_file = tmp_path / "model.onnx"
         onnx_file.write_bytes(b"fake")
 
@@ -2433,8 +2433,7 @@ def test_raw_onnx_cpu(self, tmp_path) -> None:
             config = generate_onnx_build_config(str(onnx_file), device="cpu")
 
         assert config.export is None
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        assert config.quant is None
         assert config.compile is None
 
     def test_quantized_onnx_skips_quant(self, tmp_path) -> None:
@@ -2761,10 +2760,10 @@ def test_onnx_path_as_pathlib(self, tmp_path) -> None:
         assert config.export is None
 
     def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
-        """device=auto + precision=auto (defaults) resolves to fp16 on CPU.
+        """device=auto + precision=auto (defaults) resolves to fp32 on CPU.
 
         resolve_check_device_ep returns device="auto" but resolve_precision
-        resolves the EP to pick a concrete device, yielding an fp16 algorithm quant config.
+        resolves the EP to pick a concrete device, yielding fp32 (no-op, no conversion).
         """
         onnx_file = tmp_path / "model.onnx"
         onnx_file.write_bytes(b"fake")
@@ -2779,9 +2778,8 @@ def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
         ):
             config = generate_onnx_build_config(str(onnx_file))
 
-        # EP resolves to CPU, auto-precision=fp16 → fp16 algorithm quant config
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        # EP resolves to CPU, auto-precision=fp32 → no quantization, no compile
+        assert config.quant is None
         assert config.compile is None
 
     def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
@@ -2801,7 +2799,7 @@ def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
         mock_resolve.assert_not_called()
 
     def test_raw_onnx_with_gpu(self, tmp_path) -> None:
-        """Raw ONNX + device=gpu resolves to an fp16 algorithm quant config, compile=None."""
+        """Raw ONNX + device=gpu with auto-precision resolves to fp32 (no-op), compile=None."""
         onnx_file = tmp_path / "model.onnx"
         onnx_file.write_bytes(b"fake")
 
@@ -2815,10 +2813,8 @@ def test_raw_onnx_with_gpu(self, tmp_path) -> None:
         ):
             config = generate_onnx_build_config(str(onnx_file), device="gpu")
 
-        # GPU auto-precision is fp16 -> fp16 algorithm quant config, no
-        # compile (DML has no offline step)
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        # GPU auto-precision is fp32 → no quantization, no compile (DML has no offline step)
+        assert config.quant is None
         assert config.compile is None
 
     def test_ep_override_forwarded(self, tmp_path) -> None:
@@ -2856,21 +2852,20 @@ class TestResolveQuantCompileConfig:
     the HF and ONNX build config paths.
     """
 
-    def test_auto_auto_returns_fp16_algorithm(self) -> None:
-        """device=auto + precision=auto resolves to an fp16 algorithm quant config.
+    def test_auto_auto_returns_no_quant(self) -> None:
+        """device=auto + precision=auto (all defaults) resolves to fp32: no quant, no compile.
 
         When resolve_check_device_ep returns device="auto" but the EP
         resolves to a concrete device, resolve_precision picks auto-precision
-        (fp16 for CPU), yielding an fp16 algorithm quant config.
+        (fp32 for CPU), so the returned quant config is None.
         """
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
             return_value=("auto", ["npu", "gpu", "cpu"], ["CPUExecutionProvider"]),
         ):
             quant, compile_cfg = resolve_quant_compile_config()
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None  # fp32 auto-precision: no quant config is produced
         assert compile_cfg is None
 
     def test_npu_returns_quant_and_compile(self) -> None:
@@ -2887,28 +2882,26 @@ def test_npu_returns_quant_and_compile(self) -> None:
         assert isinstance(compile_cfg, WinMLCompileConfig)
         assert compile_cfg.ep_config.provider == "qnn"
 
-    def test_gpu_returns_fp16_quant_and_none_compile(self) -> None:
-        """device=gpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
+    def test_gpu_returns_no_quant_and_none_compile(self) -> None:
+        """device=gpu with default precision returns (None, None) — auto-precision is fp32."""
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
             return_value=("gpu", ["gpu", "cpu"], ["DmlExecutionProvider"]),
         ):
             quant, compile_cfg = resolve_quant_compile_config(device="gpu")
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None  # fp32 auto-precision: no quant config is produced
         assert compile_cfg is None
 
-    def test_cpu_returns_fp16_quant_and_none_compile(self) -> None:
-        """device=cpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
+    def test_cpu_returns_no_quant_and_none_compile(self) -> None:
+        """device=cpu with default precision returns (None, None) — auto-precision is fp32."""
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
             return_value=("cpu", ["cpu"], ["CPUExecutionProvider"]),
         ):
             quant, compile_cfg = resolve_quant_compile_config(device="cpu")
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None  # fp32 auto-precision: no quant config is produced
         assert compile_cfg is None
 
     def test_ep_override_changes_provider(self) -> None:

@@ -220,7 +220,7 @@ def test_raw_onnx_full_pipeline(self, tmp_path) -> None:
         assert config.compile.ep_config.provider == "qnn"
 
     def test_raw_onnx_cpu(self, tmp_path) -> None:
-        """Raw ONNX + device=cpu resolves to an fp16 algorithm quant config, compile=None."""
+        """Raw ONNX + device=cpu with auto-precision resolves to fp32 (no-op), compile=None."""
         onnx_file = tmp_path / "model.onnx"
         onnx_file.write_bytes(b"fake")
 
@@ -235,8 +235,7 @@ def test_raw_onnx_cpu(self, tmp_path) -> None:
             config = generate_onnx_build_config(str(onnx_file), device="cpu")
 
         assert config.export is None
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        assert config.quant is None
         assert config.compile is None
 
     def test_quantized_onnx_skips_quant(self, tmp_path) -> None:
@@ -563,9 +562,9 @@ def test_onnx_path_as_pathlib(self, tmp_path) -> None:
         assert config.export is None
 
     def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
-        """device=auto + precision=auto resolves to fp16 on CPU.
+        """device=auto + precision=auto resolves to fp32 on CPU.
 
-        resolve_precision resolves the EP to a concrete device, yielding the fp16 algorithm.
+        resolve_precision resolves the EP to a concrete device, yielding fp32 (no-op).
         """
         onnx_file = tmp_path / "model.onnx"
         onnx_file.write_bytes(b"fake")
@@ -580,8 +579,7 @@ def test_auto_device_auto_precision_defaults(self, tmp_path) -> None:
         ):
             config = generate_onnx_build_config(str(onnx_file))
 
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        assert config.quant is None
         assert config.compile is None
 
     def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
@@ -601,7 +599,7 @@ def test_compiled_does_not_call_resolve_quant_compile(self, tmp_path) -> None:
         mock_resolve.assert_not_called()
 
     def test_raw_onnx_with_gpu(self, tmp_path) -> None:
-        """Raw ONNX + device=gpu resolves to an fp16 algorithm quant config, compile=None.
+        """Raw ONNX + device=gpu with auto-precision resolves to fp32 (no-op), compile=None.
 
         DML has enable_ep_context=False so for_provider("dml") returns None —
         no offline compile step is needed.
@@ -619,9 +617,8 @@ def test_raw_onnx_with_gpu(self, tmp_path) -> None:
         ):
             config = generate_onnx_build_config(str(onnx_file), device="gpu")
 
-        # GPU auto-precision is fp16 -> fp16 algorithm quant config; DML has no EPContext step
-        assert config.quant is not None
-        assert config.quant.mode == "fp16"
+        # GPU auto-precision is fp32 → no quantization; DML has no EPContext step
+        assert config.quant is None
         assert config.compile is None
 
     def test_ep_override_forwarded(self, tmp_path) -> None:
@@ -661,16 +658,15 @@ class TestResolveQuantCompileConfig:
     the HF and ONNX build config paths.
     """
 
-    def test_auto_auto_returns_fp16_algorithm(self) -> None:
-        """device=auto + precision=auto resolves to an fp16 algorithm quant config."""
+    def test_auto_auto_returns_no_quant(self) -> None:
+        """device=auto + precision=auto (all defaults) resolves to fp32: no quant, no compile."""
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
             return_value=("auto", ["npu", "gpu", "cpu"], ["CPUExecutionProvider"]),
         ):
             quant, compile_cfg = resolve_quant_compile_config()
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None  # fp32 auto-precision: no quant config is produced
         assert compile_cfg is None
 
     def test_npu_returns_quant_and_compile(self) -> None:
@@ -688,27 +684,25 @@ def test_npu_returns_quant_and_compile(self) -> None:
         assert compile_cfg.ep_config.provider == "qnn"
 
     def test_gpu_returns_fp16_quant_and_none_compile(self) -> None:
-        """device=gpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
+        """device=gpu returns (None, None) — auto-precision is fp32 (no conversion)."""
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
             return_value=("gpu", ["gpu", "cpu"], ["DmlExecutionProvider"]),
         ):
             quant, compile_cfg = resolve_quant_compile_config(device="gpu")
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None
         assert compile_cfg is None
 
     def test_cpu_returns_fp16_quant_and_none_compile(self) -> None:
-        """device=cpu returns (fp16 algorithm quant config, None) — auto-precision is fp16."""
+        """device=cpu returns (None, None) — auto-precision is fp32 (no conversion)."""
         with patch(
             "winml.modelkit.sysinfo.resolve_check_device_ep",
             return_value=("cpu", ["cpu"], ["CPUExecutionProvider"]),
         ):
             quant, compile_cfg = resolve_quant_compile_config(device="cpu")
 
-        assert isinstance(quant, WinMLQuantizationConfig)
-        assert quant.mode == "fp16"
+        assert quant is None
         assert compile_cfg is None
 
     def test_ep_override_changes_provider(self) -> None:

@@ -51,13 +51,13 @@ class TestResolvePrecision:
             ("npu", "w8a16", "npu", "w8a16", "uint8", "uint16", "QNNExecutionProvider"),
             ("npu", "w8a8", "npu", "w8a8", "uint8", "uint8", "QNNExecutionProvider"),
             ("npu", "w16a16", "npu", "w16a16", "int16", "uint16", "QNNExecutionProvider"),
-            ("gpu", "auto", "gpu", "fp16", None, None, "DmlExecutionProvider"),
+            ("gpu", "auto", "gpu", "fp32", None, None, "DmlExecutionProvider"),
             ("gpu", "w8a16", "gpu", "w8a16", "uint8", "uint16", "DmlExecutionProvider"),
             ("gpu", "int8", "gpu", "int8", "uint8", "uint8", "DmlExecutionProvider"),
             ("gpu", "int16", "gpu", "int16", "int16", "uint16", "DmlExecutionProvider"),
             ("gpu", "fp16", "gpu", "fp16", None, None, "DmlExecutionProvider"),
             ("gpu", "fp32", "gpu", "fp32", None, None, "DmlExecutionProvider"),
-            ("cpu", "auto", "cpu", "fp16", None, None, None),
+            ("cpu", "auto", "cpu", "fp32", None, None, None),
             ("cpu", "int8", "cpu", "int8", "uint8", "uint8", None),
             ("cpu", "int16", "cpu", "int16", "int16", "uint16", None),
             ("cpu", "fp16", "cpu", "fp16", None, None, None),
@@ -144,15 +144,15 @@ def test_gpu_llm_warning(self, caplog) -> None:
             policy = resolve_precision(device="gpu", task="text-generation")
 
         assert policy.device == "gpu"
-        assert policy.precision == "fp16"
+        assert policy.precision == "fp32"
         assert any("w4a16" in record.message for record in caplog.records)
 
     def test_gpu_non_llm_no_warning(self, caplog) -> None:
         """GPU + image-classification does NOT log w4a16 warning."""
         with caplog.at_level(logging.WARNING, logger="winml.modelkit.config.precision"):
             policy = resolve_precision(device="gpu", task="image-classification")
 
-        assert policy.precision == "fp16"
+        assert policy.precision == "fp32"  # precision not passed: gpu auto default applies
         assert not any("w4a16" in record.message for record in caplog.records)
 
     def test_gpu_text2text_warning(self, caplog) -> None: