From 0cce439437dd4368fe95d3e415dd0d8586229c80 Mon Sep 17 00:00:00 2001
From: "He, Xin3"
Date: Fri, 13 Feb 2026 11:01:30 +0800
Subject: [PATCH 1/2] fix FP8_STATIC loading regression

Signed-off-by: He, Xin3
---
 auto_round/export/export_to_autoround/export_to_fp8.py | 3 +++
 auto_round/formats.py                                   | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py
index 38bea79f3..82fcdcf20 100644
--- a/auto_round/export/export_to_autoround/export_to_fp8.py
+++ b/auto_round/export/export_to_autoround/export_to_fp8.py
@@ -155,6 +155,7 @@ def save_quantized_as_autoround(
     tokenizer: Callable = None,
     layer_config: dict = None,
     inplace: bool = True,
+    backend: str = None,
     device: Union[str, torch.device] = "cpu",
     serialization_dict: dict = None,
     **kwargs,
@@ -165,6 +166,8 @@ def save_quantized_as_autoround(
     quantization_config = serialization_dict
     quantization_config["block_name_to_quantize"] = quantization_config.pop("to_quant_block_names", None)
     quantization_config["quant_method"] = "auto-round"
+    if backend:
+        quantization_config["packing_format"] = backend
     if "e5m2" in serialization_dict.get("data_type", "fp8"):
         quantization_config["fmt"] = "e5m2"
     else:
diff --git a/auto_round/formats.py b/auto_round/formats.py
index 6fe957c61..e6fc7f56f 100644
--- a/auto_round/formats.py
+++ b/auto_round/formats.py
@@ -1086,7 +1086,7 @@ def save_quantized(
     elif serialization_dict.get("data_type", "int") == "fp" and serialization_dict.get("bits", 16) == 8:
         from auto_round.export.export_to_autoround.export_to_fp8 import save_quantized_as_autoround

-        backend = "auto_round"
+        backend = "auto_round:fp8_static" if serialization_dict.get("act_bits", 16) == 8 else None
         export_func = save_quantized_as_autoround
     else:
         from auto_round.export.export_to_autoround.export import save_quantized_as_autoround

From fa4f92cf2200b65319cb27c0f3ec5ab31cd4a842 Mon Sep 17 00:00:00 2001
From: "He, Xin3"
Date: Fri, 13 Feb 2026 11:29:58 +0800
Subject: [PATCH 2/2] enhance UT to cover reloading

Signed-off-by: He, Xin3
---
 test/test_cuda/schemes/test_scheme.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/test/test_cuda/schemes/test_scheme.py b/test/test_cuda/schemes/test_scheme.py
index 255fdc80e..8ab78756b 100644
--- a/test/test_cuda/schemes/test_scheme.py
+++ b/test/test_cuda/schemes/test_scheme.py
@@ -2,6 +2,7 @@
 import shutil

 import pytest
+import transformers
 from packaging import version

 from auto_round import AutoRound
@@ -36,12 +37,16 @@ def test_gguf(self, tiny_qwen_model_path):
     def test_w4a16(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1)
         assert ar.bits == 4
-        ar.quantize()
+        ar.quantize_and_save()
+        model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
+        assert model is not None, "Model loading failed after quantization with W4A16 scheme"

     def test_w2a16(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=1)
         assert ar.bits == 2
-        ar.quantize()
+        ar.quantize_and_save()
+        model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
+        assert model is not None, "Model loading failed after quantization with W2A16 scheme"

     def test_mxfp4(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="MXFP4_RCEIL", nsamples=1, iters=1)
@@ -49,7 +54,9 @@ def test_mxfp4(self, tiny_opt_model_path):
         assert ar.act_bits == 4
         assert ar.data_type == "mx_fp"
         assert ar.act_data_type == "mx_fp_rceil"
-        ar.quantize()
+        ar.quantize_and_save()
+        model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
+        assert model is not None, "Model loading failed after quantization with MXFP4 scheme"

     def test_fp8_static(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=1)
@@ -59,13 +66,15 @@ def test_fp8_static(self, tiny_opt_model_path):
         assert ar.act_data_type == "fp"
         assert ar.group_size == -1
         assert ar.act_dynamic is False
-        ar.quantize()
+        ar.quantize_and_save()
+        model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
+        assert model is not None, "Model loading failed after quantization with FP8_STATIC scheme"

     ## RTN tests
     def test_w2a16_rtn(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0)
         assert ar.bits == 2
-        ar.quantize()
+        ar.quantize_and_save()

     def test_mxfp4_rtn(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=0)
@@ -73,7 +82,7 @@ def test_mxfp4_rtn(self, tiny_opt_model_path):
         assert ar.act_bits == 4
         assert ar.data_type == "mx_fp"
         assert ar.act_data_type == "mx_fp"
-        ar.quantize()
+        ar.quantize_and_save()

     def test_fp8_static_rtn(self, tiny_opt_model_path):
         ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=0)
@@ -83,7 +92,7 @@ def test_fp8_static_rtn(self, tiny_opt_model_path):
         assert ar.act_data_type == "fp"
         assert ar.group_size == -1
         assert ar.act_dynamic is False
-        ar.quantize()
+        ar.quantize_and_save()

     def test_scheme_in_layer_config(self):
         model_path = get_model_path("facebook/opt-125m")
@@ -94,7 +103,9 @@ def test_scheme_in_layer_config(self):
         }

         ar = AutoRound(model_path, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config)
-        ar.quantize()
+        ar.quantize_and_save()
+        model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
+        assert model is not None, "Model loading failed after quantization with layer-specific schemes"
         for n, m in ar.model.named_modules():
             if n == "model.decoder.layers.2.self_attn.q_proj":
                 assert m.bits == 2
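
How to verify: a minimal sketch mirroring what the new unit test exercises. The model identifier "facebook/opt-125m" and the "tmp_autoround" output directory are assumptions based on quantize_and_save()'s defaults, not part of the patch itself.

    # Quantize a small model with the FP8_STATIC scheme, then confirm the
    # exported checkpoint reloads through transformers (the regression fixed here).
    import transformers
    from auto_round import AutoRound

    ar = AutoRound("facebook/opt-125m", scheme="FP8_STATIC", nsamples=1, iters=1)
    ar.quantize_and_save()  # exports to "tmp_autoround" by default; with this patch the
                            # config carries packing_format="auto_round:fp8_static"
    model = transformers.AutoModelForCausalLM.from_pretrained("tmp_autoround", trust_remote_code=True)
    assert model is not None, "FP8_STATIC checkpoint failed to reload"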