From 0cce439437dd4368fe95d3e415dd0d8586229c80 Mon Sep 17 00:00:00 2001
From: "He, Xin3"
Date: Fri, 13 Feb 2026 11:01:30 +0800
Subject: [PATCH 1/2] fix FP8_STATIC loading regression

Signed-off-by: He, Xin3
---
 auto_round/export/export_to_autoround/export_to_fp8.py | 3 +++
 auto_round/formats.py                                   | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py
index 38bea79f3..82fcdcf20 100644
--- a/auto_round/export/export_to_autoround/export_to_fp8.py
+++ b/auto_round/export/export_to_autoround/export_to_fp8.py
@@ -155,6 +155,7 @@ def save_quantized_as_autoround(
     tokenizer: Callable = None,
     layer_config: dict = None,
     inplace: bool = True,
+    backend: str = None,
     device: Union[str, torch.device] = "cpu",
     serialization_dict: dict = None,
     **kwargs,
@@ -165,6 +166,8 @@ def save_quantized_as_autoround(
     quantization_config = serialization_dict
     quantization_config["block_name_to_quantize"] = quantization_config.pop("to_quant_block_names", None)
     quantization_config["quant_method"] = "auto-round"
+    if backend:
+        quantization_config["packing_format"] = backend
     if "e5m2" in serialization_dict.get("data_type", "fp8"):
         quantization_config["fmt"] = "e5m2"
     else:
diff --git a/auto_round/formats.py b/auto_round/formats.py
index 6fe957c61..e6fc7f56f 100644
--- a/auto_round/formats.py
+++ b/auto_round/formats.py
@@ -1086,7 +1086,7 @@ def save_quantized(
     elif serialization_dict.get("data_type", "int") == "fp" and serialization_dict.get("bits", 16) == 8:
         from auto_round.export.export_to_autoround.export_to_fp8 import save_quantized_as_autoround

-        backend = "auto_round"
+        backend = "auto_round:fp8_static" if serialization_dict.get("act_bits", 16) == 8 else None
         export_func = save_quantized_as_autoround
     else:
         from auto_round.export.export_to_autoround.export import save_quantized_as_autoround

From fa4f92cf2200b65319cb27c0f3ec5ab31cd4a842 Mon Sep 17 00:00:00 2001
From: "He, Xin3"
Date: Fri, 13 Feb 2026 11:29:58 +0800
Subject: [PATCH 2/2] enhance UT to cover reloading

Signed-off-by: He, Xin3
---
 test/test_cuda/schemes/test_scheme.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/test/test_cuda/schemes/test_scheme.py b/test/test_cuda/schemes/test_scheme.py
index 255fdc80e..8ab78756b 100644
--- a/test/test_cuda/schemes/test_scheme.py
+++ b/test/test_cuda/schemes/test_scheme.py
@@ -2,6 +2,7 @@
 import shutil

 import pytest
+import transformers
 from packaging import version

 from auto_round import AutoRound
@@ -36,12 +37,16 @@ def test_gguf(self, tiny_qwen_model_path):
     def test_w4a16(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1)
         assert ar.bits == 4
-        ar.quantize()
+        ar.quantize_and_save()
+        model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
+        assert model is not None, "Model loading failed after quantization with W4A16 scheme"

     def test_w2a16(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=1)
         assert ar.bits == 2
-        ar.quantize()
+        ar.quantize_and_save()
+        model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
+        assert model is not None, "Model loading failed after quantization with W2A16 scheme"

     def test_mxfp4(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="MXFP4_RCEIL", nsamples=1, iters=1)
@@ -49,7 +54,9 @@ def test_mxfp4(self, tiny_opt_model_path):
         assert ar.act_bits == 4
         assert ar.data_type == "mx_fp"
         assert ar.act_data_type == "mx_fp_rceil"
-        ar.quantize()
+        ar.quantize_and_save()
+        model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
+        assert model is not None, "Model loading failed after quantization with MXFP4 scheme"

     def test_fp8_static(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=1)
@@ -59,13 +66,15 @@ def test_fp8_static(self, tiny_opt_model_path):
         assert ar.act_data_type == "fp"
         assert ar.group_size == -1
         assert ar.act_dynamic is False
-        ar.quantize()
+        ar.quantize_and_save()
+        model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
+        assert model is not None, "Model loading failed after quantization with FP8_STATIC scheme"

     ## RTN tests
     def test_w2a16_rtn(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0)
         assert ar.bits == 2
-        ar.quantize()
+        ar.quantize_and_save()

     def test_mxfp4_rtn(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=0)
@@ -73,7 +82,7 @@ def test_mxfp4_rtn(self, tiny_opt_model_path):
         assert ar.act_bits == 4
         assert ar.data_type == "mx_fp"
         assert ar.act_data_type == "mx_fp"
-        ar.quantize()
+        ar.quantize_and_save()

     def test_fp8_static_rtn(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=0)
@@ -83,7 +92,7 @@ def test_fp8_static_rtn(self, tiny_opt_model_path):
         assert ar.act_data_type == "fp"
         assert ar.group_size == -1
         assert ar.act_dynamic is False
-        ar.quantize()
+        ar.quantize_and_save()

     def test_scheme_in_layer_config(self):
         model_path = get_model_path("facebook/opt-125m")
@@ -94,7 +103,9 @@ def test_scheme_in_layer_config(self):
         }

         ar = AutoRound(model_path, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config)
-        ar.quantize()
+        ar.quantize_and_save()
+        model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
+        assert model is not None, "Model loading failed after quantization with layer-specific schemes"
         for n, m in ar.model.named_modules():
             if n == "model.decoder.layers.2.self_attn.q_proj":
                 assert m.bits == 2
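
How to verify: a minimal sketch mirroring what the new unit test exercises. The model identifier "facebook/opt-125m" and the "tmp_autoround" output directory are assumptions based on quantize_and_save()'s defaults, not part of the patch itself.

    # Quantize a small model with the FP8_STATIC scheme, then confirm the
    # exported checkpoint reloads through transformers (the regression fixed here).
    import transformers
    from auto_round import AutoRound

    ar = AutoRound("facebook/opt-125m", scheme="FP8_STATIC", nsamples=1, iters=1)
    ar.quantize_and_save()  # exports to "tmp_autoround" by default; with this patch the
                            # config carries packing_format="auto_round:fp8_static"
    model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
    assert model is not None, "FP8_STATIC checkpoint failed to reload"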