From a84a4b67830ba18ccb76cff2c41df71b946f4f81 Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Mon, 26 Jan 2026 11:04:15 +0800
Subject: [PATCH 01/14] refactor init of compressor

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 auto_round/compressors/base.py | 90 +++++++++++++++++++++-------------
 1 file changed, 55 insertions(+), 35 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 3232c4470..345ca0eea 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -280,15 +280,8 @@ def __init__(
         self.shared_cache_keys = get_shared_keys(self.model)
 
         self.layer_config = layer_config
-
-        # should be set after loading model and set layer_config, cause some special scheme need these.
-        self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(scheme, kwargs)
-
-        gguf_scheme_name = get_gguf_scheme(self.scheme)
-        # GGUF uses fp32 scale dtype as default
-        scale_dtype = kwargs.pop("scale_dtype", None)
-        if scale_dtype is None:
-            scale_dtype = "fp32" if gguf_scheme_name else "fp16"
+        self.scheme = scheme
+        self.scale_dtype = kwargs.pop("scale_dtype", None)
 
         # Extra/legacy kwargs for backward compatibility
         # Major version releases may pack them with extra configuration options
@@ -314,21 +307,12 @@ def __init__(
             platform = "model_scope"
         self.platform = platform
         self.quant_lm_head = kwargs.pop("quant_lm_head", False)
-
         self.ignore_layers = kwargs.pop("ignore_layers", "")
-        predefined_ignore_layers = get_predefined_ignore_layers(self.model)
 
-        if predefined_ignore_layers:
-            logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}")
-            tmp_str = ",".join(predefined_ignore_layers)
-            if self.ignore_layers == "":
-                self.ignore_layers = tmp_str
-            else:
-                self.ignore_layers += "," + tmp_str
         self.supported_types = SUPPORTED_LAYER_TYPES
         self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES
-        self.scale_dtype = convert_dtype_str2torch(scale_dtype)
         self.low_cpu_mem_usage = low_cpu_mem_usage
+        self.block_forward = block_forward
 
         if kwargs:
             logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.")
@@ -360,16 +344,10 @@ def __init__(
         self.device_map = device_map
         if isinstance(self.device_map, str):
             self.device_map = self.device_map.replace(" ", "")
-
-        self.device_list = parse_available_devices(device_map)
-
-        # Set device, must place after model loading
-        self.device = get_major_device(device_map)
-        set_non_auto_device_map(self.model, self.device_map)
+        self.device = get_major_device(self.device_map)
 
         # Tuning hyperparameters
         self.seed = seed
-        set_seed(self.seed)
         self.amp = amp
         self.enable_quanted_input = enable_quanted_input
         self.enable_minmax_tuning = enable_minmax_tuning
@@ -448,7 +426,6 @@ def __init__(
         if self.static_attention_dtype is not None:
             logger.warning("The static attention dtype is experimental and currently has limited support.")
 
-        self._set_amp_dtype()
         self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device
         if self.act_bits <= 8 and self.amp_dtype == torch.float16:
             logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization")
@@ -466,23 +443,18 @@ def __init__(
 
         # after setting iters
         self.enable_torch_compile = enable_torch_compile
-        self._adjust_torch_compile(enable_torch_compile)
 
-        self.block_forward = compile_func(block_forward, self.device) if self.enable_torch_compile else block_forward
+        self.attention_mask = []
+        self.wrapper_block = wrapper_block
+
         self._check_configs()
         torch.set_printoptions(precision=3, sci_mode=True)
 
-        if isinstance(scheme, AutoScheme):
-            self.layer_config = self._gen_auto_scheme(model, scheme, dataset, self.device_map)
-
         if is_hpex_available():
             logger.info("habana_frameworks is available, import htcore explicitly.")
             import habana_frameworks.torch.core as htcore  # pylint: disable=E0401
             import habana_frameworks.torch.hpu as hthpu  # pylint: disable=E0401]
 
-        self.attention_mask = []
-
-        self.wrapper_block = wrapper_block
         if self.enable_alg_ext:
             try:
                 logger.warning_once("using algorithm extension for quantization.")
@@ -492,6 +464,48 @@ def __init__(
             except (ImportError, ModuleNotFoundError):
                 logger.error("algorithm extension import error, fallback to default mode")
 
+        self._post_inited = False
+
+    def _post_init(self) -> None:
+        """Post-initialization for AutoRound."""
+        if self._post_inited:
+            return
+
+        # should be set after loading model and set layer_config, cause some special scheme need these.
+        self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(self.scheme, {})
+
+        # GGUF uses fp32 scale dtype as default
+        if self.scale_dtype is None:
+            gguf_scheme_name = get_gguf_scheme(self.scheme)
+            scale_dtype = "fp32" if gguf_scheme_name else "fp16"
+        self.scale_dtype = convert_dtype_str2torch(scale_dtype)
+
+        predefined_ignore_layers = get_predefined_ignore_layers(self.model)
+
+        if predefined_ignore_layers:
+            logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}")
+            tmp_str = ",".join(predefined_ignore_layers)
+            if self.ignore_layers == "":
+                self.ignore_layers = tmp_str
+            else:
+                self.ignore_layers += "," + tmp_str
+
+        # Set device, must place after model loading
+        self._set_device(self.device_map)
+        set_non_auto_device_map(self.model, self.device_map)
+        self.device_list = parse_available_devices(self.device_map)
+
+        set_seed(self.seed)
+        self._set_amp_dtype()
+        self._adjust_torch_compile(self.enable_torch_compile)
+        if self.enable_torch_compile:
+            self.block_forward = compile_func(self.block_forward, self.device)
+
+        if isinstance(self.scheme, AutoScheme):
+            self.layer_config = self._gen_auto_scheme(self.model, self.scheme, self.dataset, self.device_map)
+
+        self._post_inited = True
+
     def _gen_auto_scheme(
         self, model: torch.nn.Module, scheme: AutoScheme, dataset: str, device_map: Union[str, int, dict, torch.device]
     ) -> dict[str, dict]:
@@ -865,6 +879,9 @@ def quantize_and_save(
         Raises:
             ValueError: If an unsupported format is specified.
         """
+        # post init
+        self._post_init()
+
         # Validate and process the specified formats
         self.orig_output_dir = output_dir
 
@@ -3118,6 +3135,9 @@ def save_quantized(
         Returns:
             object: The compressed model object.
         """
+        # post init
+        self._post_init()
+
         self.orig_output_dir = output_dir
         if isinstance(format, str) and getattr(self, "formats", None) is None:
             formats = get_formats(format, self)

From 1b5749de7ab5ff2fb0f146c16940dfea5f91c9cd Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Mon, 26 Jan 2026 14:38:36 +0800
Subject: [PATCH 02/14] move export dir naming into quantize_and_save, defer more init to _post_init

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 auto_round/__main__.py         |  27 +-------
 auto_round/compressors/base.py | 115 ++++++++++++++++++++-------------
 2 files changed, 72 insertions(+), 70 deletions(-)

diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index 1540d254d..2ffee486d 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -678,32 +678,7 @@ def tune(args):
         trust_remote_code=not args.disable_trust_remote_code,
     )
 
-    model_name = args.model.rstrip("/")
-
-    if model_name.split("/")[-1].strip(".") == "" and "gguf" not in args.format:
-        if autoround.group_size <= 0:
-            if "fp" in autoround.act_data_type:
-                suffix = f"afp{autoround.act_bits}"
-            else:
-                suffix = f"a{autoround.act_bits}"
-        else:
-            suffix = f"g{autoround.group_size}"
-        export_dir = os.path.join(args.output_dir, f"w{autoround.bits}{suffix}")
-    elif model_name.split("/")[-1].strip(".") == "" and "gguf" in args.format:
-        export_dir = args.output_dir
-    elif model_name.split("./")[-1].strip("./") != "" and "gguf" in args.format:
-        export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + "-gguf")
-    else:
-        if autoround.group_size <= 0:
-            if "fp" in autoround.act_data_type:
-                suffix = f"afp{autoround.act_bits}"
-            else:
-                suffix = f"a{autoround.act_bits}"
-        else:
-            suffix = f"g{autoround.group_size}"
-        export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + f"-w{autoround.bits}{suffix}")
-
-    model, folders = autoround.quantize_and_save(export_dir, format=args.format)  # pylint: disable=E1101
+    model, folders = autoround.quantize_and_save(args.output_dir, format=args.format)  # pylint: disable=E1101
     tokenizer = autoround.tokenizer  # pylint: disable=E1101
 
     model.eval()
diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 345ca0eea..f4ac257d8 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -382,24 +382,7 @@ def __init__(
         if enable_opt_rtn:
             disable_opt_rtn = False
         self.orig_disable_opt_rtn = disable_opt_rtn
-
-        if self.iters != 0 and self.orig_disable_opt_rtn is not None:
-            logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.")
-            disable_opt_rtn = True
-        if (
-            self.bits >= 8
-            and self.act_bits >= 16
-            and self.iters == 0
-            and self.data_type == "int"
-            and disable_opt_rtn is None
-        ):
-            logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.")
-            disable_opt_rtn = True
-        if disable_opt_rtn is None and self.iters == 0:
-            logger.info(
-                "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy."
-            )
-            disable_opt_rtn = False
+        self.disable_opt_rtn = disable_opt_rtn
 
         # Important Note! This is not very robust, do NOT rely on it to do high risky thing
         self.is_moe_model = is_moe_model(self.model)
@@ -410,7 +393,6 @@ def __init__(
         self.dynamic_max_gap = dynamic_max_gap
         self.lr_scheduler = lr_scheduler
         self.optimizer = self._get_optimizer(None)
-        self.disable_opt_rtn = disable_opt_rtn
 
         # Whether to pack the layer immediately after tuning
         self.is_immediate_packing = False
@@ -427,17 +409,7 @@ def __init__(
             logger.warning("The static attention dtype is experimental and currently has limited support.")
 
         self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device
-        if self.act_bits <= 8 and self.amp_dtype == torch.float16:
-            logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization")
-            self.amp_dtype = torch.bfloat16
-            if self.model.dtype != torch.bfloat16:  # keep the model's buffer dtype unchanged
-                self.model = self.model.to(torch.bfloat16)
-        else:
-            logger.info(f"using {self.model.dtype} for quantization tuning")
 
-        # Some helpers
-        if "hpu" in str(self.device):
-            self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear")
         self.batch_dim = None
         self.infer_bs_coeff = 1
 
@@ -447,23 +419,8 @@ def __init__(
         self.attention_mask = []
         self.wrapper_block = wrapper_block
 
-        self._check_configs()
         torch.set_printoptions(precision=3, sci_mode=True)
 
-        if is_hpex_available():
-            logger.info("habana_frameworks is available, import htcore explicitly.")
-            import habana_frameworks.torch.core as htcore  # pylint: disable=E0401
-            import habana_frameworks.torch.hpu as hthpu  # pylint: disable=E0401]
-
-        if self.enable_alg_ext:
-            try:
-                logger.warning_once("using algorithm extension for quantization.")
-                from auto_round.alg_ext import wrapper_autoround
-
-                wrapper_autoround(self)
-            except (ImportError, ModuleNotFoundError):
-                logger.error("algorithm extension import error, fallback to default mode")
-
         self._post_inited = False
 
     def _post_init(self) -> None:
@@ -495,15 +452,60 @@ def _post_init(self) -> None:
         set_non_auto_device_map(self.model, self.device_map)
         self.device_list = parse_available_devices(self.device_map)
 
+        if self.iters != 0 and self.orig_disable_opt_rtn is not None:
+            logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.")
+            self.disable_opt_rtn = True
+        if (
+            self.bits >= 8
+            and self.act_bits >= 16
+            and self.iters == 0
+            and self.data_type == "int"
+            and self.disable_opt_rtn is None
+        ):
+            logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.")
+            self.disable_opt_rtn = True
+        if self.disable_opt_rtn is None and self.iters == 0:
+            logger.info(
+                "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy."
+            )
+            self.disable_opt_rtn = False
+
         set_seed(self.seed)
         self._set_amp_dtype()
         self._adjust_torch_compile(self.enable_torch_compile)
         if self.enable_torch_compile:
             self.block_forward = compile_func(self.block_forward, self.device)
 
+        if self.act_bits <= 8 and self.amp_dtype == torch.float16:
+            logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization")
+            self.amp_dtype = torch.bfloat16
+            if self.model.dtype != torch.bfloat16:  # keep the model's buffer dtype unchanged
+                self.model = self.model.to(torch.bfloat16)
+        else:
+            logger.info(f"using {self.model.dtype} for quantization tuning")
+
+        # Some helpers
+        if "hpu" in str(self.device):
+            self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear")
+
+        self._check_configs()
+
         if isinstance(self.scheme, AutoScheme):
             self.layer_config = self._gen_auto_scheme(self.model, self.scheme, self.dataset, self.device_map)
 
+        if is_hpex_available():
+            logger.info("habana_frameworks is available, import htcore explicitly.")
+            import habana_frameworks.torch.core as htcore  # pylint: disable=E0401
+            import habana_frameworks.torch.hpu as hthpu  # pylint: disable=E0401]
+
+        if self.enable_alg_ext:
+            try:
+                logger.warning_once("using algorithm extension for quantization.")
+                from auto_round.alg_ext import wrapper_autoround
+
+                wrapper_autoround(self)
+            except (ImportError, ModuleNotFoundError):
+                logger.error("algorithm extension import error, fallback to default mode")
         self._post_inited = True
 
     def _gen_auto_scheme(
@@ -882,6 +884,31 @@ def quantize_and_save(
         # post init
         self._post_init()
 
+        model_name = self.model.name_or_path.rstrip("/")
+        if model_name.split("/")[-1].strip(".") == "" and "gguf" not in format:
+            if self.group_size <= 0:
+                if "fp" in self.act_data_type:
+                    suffix = f"afp{self.act_bits}"
+                else:
+                    suffix = f"a{self.act_bits}"
+            else:
+                suffix = f"g{self.group_size}"
+            export_dir = os.path.join(output_dir, f"w{self.bits}{suffix}")
+        elif model_name.split("/")[-1].strip(".") == "" and "gguf" in format:
+            export_dir = output_dir
+        elif model_name.split("/")[-1].strip(".") != "" and "gguf" in format:
+            export_dir = os.path.join(output_dir, model_name.split("/")[-1] + "-gguf")
+        else:
+            if self.group_size <= 0:
+                if "fp" in self.act_data_type:
+                    suffix = f"afp{self.act_bits}"
+                else:
+                    suffix = f"a{self.act_bits}"
+            else:
+                suffix = f"g{self.group_size}"
+            export_dir = os.path.join(output_dir, model_name.split("/")[-1] + f"-w{self.bits}{suffix}")
+
+        output_dir = export_dir
         # Validate and process the specified formats
         self.orig_output_dir = output_dir
 

From 6f9620f4fc5dfa2de4f84e02afa3395e913d2f55 Mon Sep 17 00:00:00 2001
From: Heng Guo <heng.guo@intel.com>
Date: Mon, 26 Jan 2026 14:44:48 +0800
Subject: [PATCH 03/14] fix undefined scale_dtype in _post_init when scale_dtype is user-provided

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 auto_round/compressors/base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index f2802fccd..c06b1b71f 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -435,6 +435,8 @@ def _post_init(self) -> None:
         if self.scale_dtype is None:
             gguf_scheme_name = get_gguf_scheme(self.scheme)
             scale_dtype = "fp32" if gguf_scheme_name else "fp16"
+        else:
+            scale_dtype = self.scale_dtype
         self.scale_dtype = convert_dtype_str2torch(scale_dtype)
 
         predefined_ignore_layers = get_predefined_ignore_layers(self.model)

From 0d28e39c786fec81285dabf560510929efab553b Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Mon, 26 Jan 2026 14:44:52 +0800
Subject: [PATCH 04/14] call _post_init from quantize instead of save_quantized

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 auto_round/compressors/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index f4ac257d8..f2802fccd 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -1644,6 +1644,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:
         Returns:
         The quantized model and layer configurations.
         """
+        # post init
+        self._post_init()
 
         self._check_compatibility()
         formats = self.formats if hasattr(self, "formats") else None
@@ -3162,8 +3164,6 @@ def save_quantized(
         Returns:
             object: The compressed model object.
         """
-        # post init
-        self._post_init()
 
         self.orig_output_dir = output_dir
         if isinstance(format, str) and getattr(self, "formats", None) is None:

From 3a5995a71965fad17a840447669de0bc9e7e9887 Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Mon, 26 Jan 2026 14:53:01 +0800
Subject: [PATCH 05/14] refactor export dir naming in quantize_and_save

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 auto_round/compressors/base.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index f2802fccd..58aa9bdda 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -884,29 +884,24 @@ def quantize_and_save(
         # post init
         self._post_init()
 
-        model_name = self.model.name_or_path.rstrip("/")
-        if model_name.split("/")[-1].strip(".") == "" and "gguf" not in format:
+        name_or_path = self.model.name_or_path.rstrip("/")
+        model_name = name_or_path.split("/")[-1]
+        if model_name.strip(".") == "" and "gguf" not in format:
             if self.group_size <= 0:
-                if "fp" in self.act_data_type:
-                    suffix = f"afp{self.act_bits}"
-                else:
-                    suffix = f"a{self.act_bits}"
+                suffix = f"afp{self.act_bits}" if "fp" in self.act_data_type else f"a{self.act_bits}"
             else:
                 suffix = f"g{self.group_size}"
             export_dir = os.path.join(output_dir, f"w{self.bits}{suffix}")
-        elif model_name.split("/")[-1].strip(".") == "" and "gguf" in format:
+        elif model_name.strip(".") == "" and "gguf" in format:
             export_dir = output_dir
-        elif model_name.split("/")[-1].strip(".") != "" and "gguf" in format:
-            export_dir = os.path.join(output_dir, model_name.split("/")[-1] + "-gguf")
+        elif model_name.strip(".") != "" and "gguf" in format:
+            export_dir = os.path.join(output_dir, model_name + "-gguf")
         else:
             if self.group_size <= 0:
-                if "fp" in self.act_data_type:
-                    suffix = f"afp{self.act_bits}"
-                else:
-                    suffix = f"a{self.act_bits}"
+                suffix = f"afp{self.act_bits}" if "fp" in self.act_data_type else f"a{self.act_bits}"
             else:
                 suffix = f"g{self.group_size}"
-            export_dir = os.path.join(output_dir, model_name.split("/")[-1] + f"-w{self.bits}{suffix}")
+            export_dir = os.path.join(output_dir, model_name + f"-w{self.bits}{suffix}")
 
         output_dir = export_dir
         # Validate and process the specified formats

From 13c418e263c0b64b094df40c14a5330cd3cd33ce Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Tue, 27 Jan 2026 10:00:27 +0800
Subject: [PATCH 06/14] fix is_auto_scheme and None act_bits handling, use returned save path in tests

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 auto_round/compressors/base.py                |  1 +
 auto_round/wrapper.py                         |  2 +-
 test/test_ark/test_model.py                   |  5 +-
 test/test_cpu/backends/test_torch_backend.py  | 14 +++--
 test/test_cpu/core/test_autoround.py          | 16 ++++--
 test/test_cpu/export/test_export.py           | 27 ++++++---
 test/test_cpu/export/test_gguf_format.py      | 56 +++++++++++-------
 .../integrations/test_llmcompressor.py        | 10 ++--
 test/test_cpu/models/test_mllm.py             |  9 ++-
 test/test_cpu/models/test_moe_model.py        |  8 +--
 .../quantization/test_act_quantization.py     | 12 ++--
 test/test_cpu/quantization/test_mix_bits.py   | 19 +++++--
 test/test_cpu/quantization/test_mxfp_nvfp.py  | 21 +++++--
 .../quantization/test_mxfp_save_load.py       |  3 +-
 test/test_cpu/utils/test_generation.py        |  8 ++-
 test/test_cuda/backends/test_torch_backend.py | 10 +++-
 .../test_cuda/backends/test_triton_backend.py | 57 +++++++++++--------
 .../export/test_auto_round_format.py          | 36 +++++++-----
 test/test_cuda/export/test_export.py          | 16 ++++--
 test/test_cuda/export/test_gguf.py            | 14 +++--
 .../integrations/test_transformers.py         |  7 ++-
 test/test_cuda/models/test_moe_model.py       |  3 +-
 test/test_cuda/quantization/test_2_3bits.py   | 10 +++-
 test/test_cuda/quantization/test_mix_bits.py  | 20 +++++--
 .../quantization/test_mxfp_and_nvfp_quant.py  |  3 +-
 test/test_cuda/quantization/test_mxfp_nvfp.py | 10 +++-
 test/test_cuda/schemes/test_scheme.py         |  3 +-
 test/test_xpu/test_autoround.py               |  8 ++-
 28 files changed, 274 insertions(+), 134 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index e23f8917e..aae8e6305 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -281,6 +281,7 @@ def __init__(
 
         self.layer_config = layer_config
         self.scheme = scheme
+        self.is_auto_scheme = True if isinstance(scheme, AutoScheme) else False
         self.scale_dtype = kwargs.pop("scale_dtype", None)
 
         # Extra/legacy kwargs for backward compatibility
diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py
index 24836d85b..5c426f7c0 100644
--- a/auto_round/wrapper.py
+++ b/auto_round/wrapper.py
@@ -117,7 +117,7 @@ def __init__(
         self.enable_round_tuning = enable_round_tuning
         self.enable_torch_compile = enable_torch_compile
         self.enable_norm_bias_tuning = enable_norm_bias_tuning and (orig_layer.bias is not None)
-        self.enable_act_quant = self.orig_layer.act_bits <= 8
+        self.enable_act_quant = self.orig_layer.act_bits <= 8 if self.orig_layer.act_bits is not None else False
         self.weight_global_scale = getattr(self.orig_layer, "weight_global_scale", None)
         if is_nv_fp(self.orig_layer.data_type) and self.weight_global_scale is None:
             from auto_round.data_type.nvfp import calculate_gparam
diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py
index 361f1bdf9..5d347e90d 100644
--- a/test/test_ark/test_model.py
+++ b/test/test_ark/test_model.py
@@ -40,7 +40,10 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t
         else:
             autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path, format=format)  ##will convert to gptq model
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format=format
+        )  ##will convert to gptq model
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="ark")
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py
index 5c70f7e99..78f5efb11 100644
--- a/test/test_cpu/backends/test_torch_backend.py
+++ b/test/test_cpu/backends/test_torch_backend.py
@@ -38,7 +38,10 @@ def test_torch_4bits_asym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:gptqmodel"
+        )
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
@@ -79,17 +82,20 @@ def test_torch_4bits_sym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")  ##will convert to gptq model
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round"
+        )  ##will convert to gptq model
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000)
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.28
         torch.cuda.empty_cache()
-        shutil.rmtree(self.save_folder, ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py
index 79458e237..c3cea9d77 100644
--- a/test/test_cpu/core/test_autoround.py
+++ b/test/test_cpu/core/test_autoround.py
@@ -377,16 +377,17 @@ def test_rtn(self, tiny_opt_model_path):
         bits, group_size, sym = 4, 128, True
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, nsamples=1)
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        quantized_model_path = quantized_model_path[0]
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder,
+            quantized_model_path,
             torch_dtype=torch.float16,
             device_map="auto",
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
-        shutil.rmtree(self.save_folder, ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     def test_embed_quant(self, tiny_opt_model_path, dataloader):
         bits, group_size, sym = 4, 128, True
@@ -432,7 +433,10 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader):
         autoround.quantize()
         quantized_model_path = self.save_folder
 
-        autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True)
+        _, quantized_model_path = autoround.save_quantized(
+            output_dir=quantized_model_path, format="auto_round", inplace=True
+        )
+        quantized_model_path = quantized_model_path[0]
         quantization_config = AutoRoundConfig(backend="ipex")
 
         model = AutoModelForCausalLM.from_pretrained(
@@ -442,7 +446,7 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader):
         text = "There is a girl who likes adventure,"
         inputs = tokenizer(text, return_tensors="pt").to(model.device)
         res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0])
-        shutil.rmtree(self.save_folder, ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     def test_not_convert_modules(self):
         import requests
diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py
index cb6033de9..fce28e19b 100644
--- a/test/test_cpu/export/test_export.py
+++ b/test/test_cpu/export/test_export.py
@@ -49,7 +49,8 @@ def test_autogptq_format(self, dataloader):
             )
 
             quantized_model_path = "./saved"
-            autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+            _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+            quantized_model_path = quantized_model_path[0]
 
             if group_size == -1:
                 shutil.rmtree("./saved", ignore_errors=True)
@@ -78,7 +79,8 @@ def test_autoround_format(self, dataloader):
                 dataset=dataloader,
             )
             quantized_model_path = "./saved"
-            autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+            _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+            quantized_model_path = quantized_model_path[0]
 
             if group_size == -1:
                 shutil.rmtree("./saved", ignore_errors=True)
@@ -105,7 +107,10 @@ def test_autoround_awq_format(self, dataloader):
             )
             quantized_model_path = "./saved"
 
-            autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
+            _, quantized_model_path = autoround.quantize_and_save(
+                output_dir=quantized_model_path, format="auto_round:auto_awq"
+            )
+            quantized_model_path = quantized_model_path[0]
 
             # quantization_config = AutoRoundConfig(
             #     backend="cpu"
@@ -223,7 +228,8 @@ def test_static_afp8_export(self, static_kv_dtype):
             static_kv_dtype=static_kv_dtype,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        quantized_model_path = quantized_model_path[0]
         f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt")
         assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys()
         assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys()
@@ -281,7 +287,8 @@ def test_static_afp8_export(self, static_kv_dtype):
             act_group_size=0,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        quantized_model_path = quantized_model_path[0]
 
         f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt")
         assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys()
@@ -341,7 +348,10 @@ def test_awq_lmhead_export(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_awq"
+        )
+        quantized_model_path = quantized_model_path[0]
         lm_head = compressed_model.lm_head
         from auto_round.export.export_to_awq.utils import WQLinear_GEMM
 
@@ -368,7 +378,10 @@ def test_gptq_lmhead_export(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_gptq"
+        )
+        quantized_model_path = quantized_model_path[0]
         lm_head = compressed_model.lm_head
         assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer"
         quantization_config = AutoRoundConfig()
diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py
index 2e3cde6f0..4672db4fb 100644
--- a/test/test_cpu/export/test_gguf_format.py
+++ b/test/test_cpu/export/test_gguf_format.py
@@ -58,14 +58,17 @@ def test_q4_0(self):
         )
         quantized_model_path = "./saved"
 
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, inplace=False, format="gguf:q4_0"
+        )
+        quantized_model_path = quantized_model_path[0]
         gguf_file = os.listdir(quantized_model_path)[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
         text = "There is a girl who likes adventure,"
         inputs = self.tokenizer(text, return_tensors="pt").to(model.device)
         print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]))
 
-        shutil.rmtree("./saved", ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     def test_func(self):
         bits, group_size, sym = 4, 128, True
@@ -77,15 +80,18 @@ def test_func(self):
             # data_type="int"
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_1")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, inplace=False, format="gguf:q*_1"
+        )
+        quantized_model_path = quantized_model_path[0]
         assert autoround.group_size == 32
         assert not autoround.sym
-        gguf_file = os.listdir("saved")[0]
+        gguf_file = os.listdir(quantized_model_path)[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
         text = "There is a girl who likes adventure,"
         inputs = self.tokenizer(text, return_tensors="pt").to(model.device)
         print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]))
-        shutil.rmtree("./saved", ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     def test_gguf_baseline(self):
         model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct")
@@ -105,12 +111,15 @@ def test_gguf_baseline(self):
             disable_opt_rtn=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="fake")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, inplace=False, format="fake"
+        )
+        quantized_model_path = quantized_model_path[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
         text = "There is a girl who likes adventure,"
         inputs = self.tokenizer(text, return_tensors="pt").to(model.device)
         print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]))
-        shutil.rmtree("./saved", ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     def test_q4_k_m(self, dataloader):
         model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct")
@@ -140,7 +149,10 @@ def test_q4_k_m(self, dataloader):
             disable_opt_rtn=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="gguf:q4_k_m,fake"
+        )
+        quantized_model_path = quantized_model_path[0]
         assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16
         assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq"
         assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq"
@@ -151,13 +163,16 @@ def test_q4_k_m(self, dataloader):
         assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3
         assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8
         assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0"
-        shutil.rmtree("./saved", ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
         model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
         autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False)
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake")
-        shutil.rmtree("./saved", ignore_errors=True)
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="gguf:q4_k_m,fake"
+        )
+        quantized_model_path = quantized_model_path[0]
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     def test_all_format(self, tiny_qwen_model_path):
         model_name = tiny_qwen_model_path
@@ -204,15 +219,16 @@ def test_vlm_gguf(self):
             quant_nontext_module=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
-        assert "mmproj-model.gguf" in os.listdir("./saved")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
+        quantized_model_path = quantized_model_path[0]
+        assert "mmproj-model.gguf" in os.listdir(quantized_model_path)
         for file_name in os.listdir(quantized_model_path):
             file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2
             if file_name == "mmproj-model.gguf":
                 assert abs(file_size - 56) < 5.0
             else:
                 assert abs(file_size - 264) < 5.0
-        shutil.rmtree("./saved", ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
         shutil.rmtree(tiny_model_path, ignore_errors=True)
 
     def test_vlm_gguf_wo_quant_nontext_module(self):
@@ -230,15 +246,16 @@ def test_vlm_gguf_wo_quant_nontext_module(self):
             quant_nontext_module=False,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
-        assert "mmproj-model.gguf" in os.listdir("./saved")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
+        quantized_model_path = quantized_model_path[0]
+        assert "mmproj-model.gguf" in os.listdir(quantized_model_path)
         for file_name in os.listdir(quantized_model_path):
             file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2
             if file_name == "mmproj-model.gguf":
                 assert abs(file_size - 361) < 5.0
             else:
                 assert abs(file_size - 264) < 5.0
-        shutil.rmtree("./saved", ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
         shutil.rmtree(tiny_model_path, ignore_errors=True)
 
     def test_qtype_setting(self):
@@ -328,7 +345,8 @@ def test_q2k_mixed(self):
             disable_opt_rtn=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed")
+        quantized_model_path = quantized_model_path[0]
         gguf_file = os.listdir(quantized_model_path)[0]
         file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2
         assert abs(file_size - 1362) < 5.0
@@ -340,5 +358,5 @@ def test_q2k_mixed(self):
         assert gguf_model.get_tensor(10).name == "blk.0.ffn_up_exps.weight"
         assert gguf_model.get_tensor(10).tensor_type.name == "Q2_K"
 
-        shutil.rmtree("./saved", ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
         shutil.rmtree(saved_tiny_model_path, ignore_errors=True)
diff --git a/test/test_cpu/integrations/test_llmcompressor.py b/test/test_cpu/integrations/test_llmcompressor.py
index c2966cedd..2c41406fe 100644
--- a/test/test_cpu/integrations/test_llmcompressor.py
+++ b/test/test_cpu/integrations/test_llmcompressor.py
@@ -50,7 +50,8 @@ def test_llmcompressor_fp8(self):
             nsamples=2,
             iters=0,
         )
-        autoround.quantize_and_save("./saved", format="llm_compressor")
+        _, quantized_model_path = autoround.quantize_and_save("./saved", format="llm_compressor")
+        quantized_model_path = quantized_model_path[0]
         # from vllm import LLM
         # model = LLM("./saved")
         # result = model.generate("Hello my name is")
@@ -58,7 +59,7 @@ def test_llmcompressor_fp8(self):
 
         import json
 
-        config = json.load(open("./saved/config.json"))
+        config = json.load(open(f"{quantized_model_path}/config.json"))
         assert "group_0" in config["quantization_config"]["config_groups"]
         assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8
         assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel"
@@ -75,11 +76,12 @@ def test_autoround_llmcompressor_fp8(self):
             nsamples=2,
             iters=0,
         )
-        autoround.quantize_and_save("./saved", format="auto_round:llm_compressor")
+        _, quantized_model_path = autoround.quantize_and_save("./saved", format="auto_round:llm_compressor")
+        quantized_model_path = quantized_model_path[0]
 
         import json
 
-        config = json.load(open("./saved/config.json"))
+        config = json.load(open(f"{quantized_model_path}/config.json"))
         assert "group_0" in config["quantization_config"]["config_groups"]
         assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8
         assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor"
diff --git a/test/test_cpu/models/test_mllm.py b/test/test_cpu/models/test_mllm.py
index 87e140a26..9b2796b92 100644
--- a/test/test_cpu/models/test_mllm.py
+++ b/test/test_cpu/models/test_mllm.py
@@ -220,15 +220,18 @@ def test_qwen2_5(self, tiny_qwen_2_5_vl_model_path):
             processor=processor,
             image_processor=image_processor,
         )
-        autoround.quantize_and_save("./saved/", format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save("./saved/", format="auto_round")
+        quantized_model_path = quantized_model_path[0]
 
         import requests
         from PIL import Image
         from transformers import AutoProcessor, AutoTokenizer, Qwen2_5_VLForConditionalGeneration
 
-        model = Qwen2_5_VLForConditionalGeneration.from_pretrained("./saved", torch_dtype="auto", device_map="auto")
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            quantized_model_path, torch_dtype="auto", device_map="auto"
+        )
         image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
-        processor = AutoProcessor.from_pretrained("./saved")
+        processor = AutoProcessor.from_pretrained(quantized_model_path)
         messages = [
             {
                 "role": "user",
diff --git a/test/test_cpu/models/test_moe_model.py b/test/test_cpu/models/test_moe_model.py
index 4934d619e..167f1409e 100644
--- a/test/test_cpu/models/test_moe_model.py
+++ b/test/test_cpu/models/test_moe_model.py
@@ -68,7 +68,7 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0):
         ignore_layers="self_attn,router,lm_head,mlp.gate",
     )
     quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
-    return quantized_model
+    return quantized_model, save_folder[0]
 
 
 def count_modules_by_type(model, target_module_name_or_class):
@@ -92,7 +92,7 @@ def test_gptoss(setup_gpt_oss, scheme):
     # Remove it to avoid mismatch during quantized model loading
     delattr(model.config, "layer_types")
 
-    quantized_model = quantize_model(model, tokenizer, output_dir, scheme)
+    quantized_model, output_dir = quantize_model(model, tokenizer, output_dir, scheme)
 
     # Ensure the quantized model is not None
     assert quantized_model is not None, "Quantized model should not be None."
@@ -130,7 +130,7 @@ def test_llama4(setup_llama4):
     delattr(model.config.text_config, "moe_layers")
     delattr(model.config.text_config, "layer_types")
 
-    quantized_model = quantize_model(model, tokenizer, output_dir, "MXFP4")
+    quantized_model, output_dir = quantize_model(model, tokenizer, output_dir, "MXFP4")
 
     # Ensure the quantized model is not None
     assert quantized_model is not None, "Quantized model should not be None."
@@ -156,7 +156,7 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe):
         iters=1,
         ignore_layers="self_attn,lm_head,mlp.gate",
     )
-    quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
+    quantized_model, (output_dir, *_) = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
     assert quantized_model is not None, "Quantized model should not be None."
     loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir, device_map="cpu")
 
diff --git a/test/test_cpu/quantization/test_act_quantization.py b/test/test_cpu/quantization/test_act_quantization.py
index 47cda3599..6e2d91f29 100644
--- a/test/test_cpu/quantization/test_act_quantization.py
+++ b/test/test_cpu/quantization/test_act_quantization.py
@@ -113,7 +113,8 @@ def test_act_config_MXFP4_saving(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        quantized_model_path = quantized_model_path[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         assert "lm_head" not in model.config.quantization_config.extra_config
 
@@ -141,7 +142,8 @@ def test_act_config_NVFP4_saving(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        quantized_model_path = quantized_model_path[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         kproj_config = model.config.quantization_config.extra_config["model.decoder.layers.1.self_attn.k_proj"]
         assert "act_data_type" in kproj_config.keys() and kproj_config["act_data_type"] == "nv_fp4_with_static_gs"
@@ -167,7 +169,8 @@ def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        quantized_model_path = quantized_model_path[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         extra_config = model.config.quantization_config.extra_config
 
@@ -198,7 +201,8 @@ def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        quantized_model_path = quantized_model_path[0]
         from transformers import AutoConfig
 
         extra_config = AutoConfig.from_pretrained(quantized_model_path).quantization_config["extra_config"]
diff --git a/test/test_cpu/quantization/test_mix_bits.py b/test/test_cpu/quantization/test_mix_bits.py
index 5db3053cb..99437436e 100644
--- a/test/test_cpu/quantization/test_mix_bits.py
+++ b/test/test_cpu/quantization/test_mix_bits.py
@@ -55,7 +55,8 @@ def test_mixed_gptqmodel(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        quantized_model_path = quantized_model_path[0]
         # test original GPTQModel inference
         from gptqmodel import GPTQModel
 
@@ -84,7 +85,8 @@ def test_mixed_gptqmodel_convert_to_ar(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        quantized_model_path = quantized_model_path[0]
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="cpu", quantization_config=quantization_config
@@ -112,7 +114,10 @@ def test_mixed_autoround_format(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "./saved"
-        compressed_model = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round"
+        )
+        quantized_model_path = quantized_model_path[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8
         assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3
@@ -136,7 +141,8 @@ def test_fallback_regex_for_awq_format(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        quantized_model_path = quantized_model_path[0]
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="cpu", quantization_config=quantization_config
@@ -223,7 +229,10 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, inplace=False, format="auto_round"
+        )
+        quantized_model_path = quantized_model_path[0]
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path,
             torch_dtype="auto",
diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py
index f5044bc73..2200ecd11 100644
--- a/test/test_cpu/quantization/test_mxfp_nvfp.py
+++ b/test/test_cpu/quantization/test_mxfp_nvfp.py
@@ -77,7 +77,10 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader):
             dataset=dataloader,
             layer_config=layer_config,
         )
-        compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=self.save_dir, inplace=True, format="auto_round"
+        )
+        quantized_model_path = quantized_model_path[0]
         lm_head = compressed_model.lm_head
         assert (
             hasattr(lm_head, "weight_scale")
@@ -86,7 +89,6 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader):
             and lm_head.weight_packed.dtype is torch.uint8
             and lm_head.weight_scale.dtype is torch.float8_e4m3fn
         ), "Illegal NVFP4 packing for lm_head layer"
-        quantized_model_path = self.save_dir
         assert is_model_outputs_similar(model_name, quantized_model_path)
         shutil.rmtree(self.save_dir, ignore_errors=True)
 
@@ -211,7 +213,10 @@ def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="llm_compressor"
+        )
+        quantized_model_path = quantized_model_path[0]
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (
             hasattr(tmp_layer, "weight_scale")
@@ -248,7 +253,10 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="llm_compressor"
+        )
+        quantized_model_path = quantized_model_path[0]
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (
             hasattr(tmp_layer, "weight_scale")
@@ -285,7 +293,10 @@ def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round"
+        )
+        quantized_model_path = quantized_model_path[0]
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (
             hasattr(tmp_layer, "weight_scale")
diff --git a/test/test_cpu/quantization/test_mxfp_save_load.py b/test/test_cpu/quantization/test_mxfp_save_load.py
index 396c47735..5d86a99ee 100644
--- a/test/test_cpu/quantization/test_mxfp_save_load.py
+++ b/test/test_cpu/quantization/test_mxfp_save_load.py
@@ -60,7 +60,8 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type):
 
         # Quantize and save the model to the temporary directory
         quantized_model_path = f"{temp_dir}/tmp_autoround"
-        autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path)
+        quantized_model_path = quantized_model_path[0]
 
         # Perform inference with the quantized model
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cpu/utils/test_generation.py b/test/test_cpu/utils/test_generation.py
index 6bf7e1020..fed11dbbe 100644
--- a/test/test_cpu/utils/test_generation.py
+++ b/test/test_cpu/utils/test_generation.py
@@ -39,7 +39,10 @@ def test_4bits_sym(self, dataloader):
         )
         quantized_model_path = self.save_folder
 
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round", inplace=False)
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round", inplace=False
+        )
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="ipex")
         model = AutoModelForCausalLM.from_pretrained(
@@ -79,7 +82,8 @@ def test_autoround_sym(self, dataloader):
             )
             quantized_model_path = "./saved"
 
-            autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+            _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+            quantized_model_path = quantized_model_path[0]
 
             model = AutoModelForCausalLM.from_pretrained(
                 quantized_model_path, device_map="auto", trust_remote_code=True
diff --git a/test/test_cuda/backends/test_torch_backend.py b/test/test_cuda/backends/test_torch_backend.py
index 4594667d9..27583d04a 100644
--- a/test/test_cuda/backends/test_torch_backend.py
+++ b/test/test_cuda/backends/test_torch_backend.py
@@ -44,7 +44,10 @@ def test_torch_4bits_asym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:gptqmodel"
+        )
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
@@ -86,7 +89,10 @@ def test_torch_4bits_sym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")  ##will convert to gptq model
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round"
+        )  ##will convert to gptq model
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cuda/backends/test_triton_backend.py b/test/test_cuda/backends/test_triton_backend.py
index f51e8aeba..4e88e9def 100644
--- a/test/test_cuda/backends/test_triton_backend.py
+++ b/test/test_cuda/backends/test_triton_backend.py
@@ -38,14 +38,17 @@ def test_tritonv2_4bits_asym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:gptqmodel"
+        )
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -53,10 +56,10 @@ def test_tritonv2_4bits_asym(self, dataloader):
         torch.cuda.empty_cache()
 
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -71,14 +74,15 @@ def test_tritonv2_2bits_asym(self):
         bits, group_size, sym = 2, 32, False
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -86,10 +90,10 @@ def test_tritonv2_2bits_asym(self):
         torch.cuda.empty_cache()
 
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -113,14 +117,15 @@ def test_tritonv2_4bits_sym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
@@ -128,10 +133,10 @@ def test_tritonv2_4bits_sym(self, dataloader):
         torch.cuda.empty_cache()
 
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
@@ -147,14 +152,15 @@ def test_tritonv2_8bits_sym(self):
         bits, group_size, sym = 4, 256, True
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, nsamples=1, iters=1)
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -162,10 +168,10 @@ def test_tritonv2_8bits_sym(self):
         torch.cuda.empty_cache()
 
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
@@ -186,14 +192,15 @@ def test_tritonv2_2bits_sym(self):
             sym=sym,
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -201,10 +208,10 @@ def test_tritonv2_2bits_sym(self):
         torch.cuda.empty_cache()
 
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
diff --git a/test/test_cuda/export/test_auto_round_format.py b/test/test_cuda/export/test_auto_round_format.py
index a2753605b..9e10368f2 100644
--- a/test/test_cuda/export/test_auto_round_format.py
+++ b/test/test_cuda/export/test_auto_round_format.py
@@ -52,7 +52,8 @@ def test_autoround_asym(self, tiny_opt_model_path, dataloader):
             )
             quantized_model_path = self.save_dir
 
-            autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+            _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+            quantized_model_path = quantized_model_path[0]
 
             model = AutoModelForCausalLM.from_pretrained(
                 quantized_model_path, device_map="cuda:0", trust_remote_code=True
@@ -79,13 +80,14 @@ def test_mixed_precision(self):
         bits, group_size, sym = 4, 128, True
         autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config)
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        quantized_model_path = quantized_model_path[0]
         quantization_config = AutoRoundConfig(backend="auto")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -105,14 +107,17 @@ def test_awq_backend(self):
             sym=sym,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:auto_awq"
+        )
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="auto")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -120,12 +125,12 @@ def test_awq_backend(self):
         torch.cuda.empty_cache()
 
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
-        shutil.rmtree(self.save_dir, ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     @require_greater_than_050
     def test_tritonv2_bf16(self):
@@ -154,7 +159,8 @@ def test_autoround_gptq_sym_format(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = "./saved"
 
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
+        quantized_model_path = quantized_model_path[0]
 
         from transformers import AutoRoundConfig
 
@@ -206,7 +212,10 @@ def test_autoround_awq_sym_format(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = "./saved"
 
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:auto_awq"
+        )
+        quantized_model_path = quantized_model_path[0]
 
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
@@ -243,7 +252,8 @@ def test_autoround_sym(self, tiny_opt_model_path, dataloader):
             )
             quantized_model_path = "./saved"
 
-            autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+            _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+            quantized_model_path = quantized_model_path[0]
 
             model = AutoModelForCausalLM.from_pretrained(
                 quantized_model_path, device_map="auto", trust_remote_code=True
diff --git a/test/test_cuda/export/test_export.py b/test/test_cuda/export/test_export.py
index efd519a51..8a6c71e86 100644
--- a/test/test_cuda/export/test_export.py
+++ b/test/test_cuda/export/test_export.py
@@ -116,7 +116,8 @@ def test_autogptq_format_qsave_ignore_layers(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        quantized_model_path = quantized_model_path[0]
 
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
@@ -218,7 +219,8 @@ def test_autoawq_format_fp_qsave_layers(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "./saved/test_export"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        quantized_model_path = quantized_model_path[0]
         from auto_round import AutoRoundConfig
 
         model = AutoModelForCausalLM.from_pretrained(
@@ -314,7 +316,10 @@ def test_awq_lmhead_export(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_awq"
+        )
+        quantized_model_path = quantized_model_path[0]
         lm_head = compressed_model.lm_head
         from auto_round.export.export_to_awq.utils import WQLinear_GEMM
 
@@ -341,7 +346,10 @@ def test_gptq_lmhead_export(self, tiny_qwen_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_gptq"
+        )
+        quantized_model_path = quantized_model_path[0]
         lm_head = compressed_model.lm_head
         assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer"
         quantization_config = AutoRoundConfig()
diff --git a/test/test_cuda/export/test_gguf.py b/test/test_cuda/export/test_gguf.py
index 09e7451c3..d144241a3 100644
--- a/test/test_cuda/export/test_gguf.py
+++ b/test/test_cuda/export/test_gguf.py
@@ -154,7 +154,8 @@ def test_special_model(self):
             disable_opt_rtn=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
+        quantized_model_path = quantized_model_path[0]
         file_name = os.listdir(quantized_model_path)[0]
         file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2
         assert abs(file_size - 307) < 5.0
@@ -178,11 +179,12 @@ def test_vlm_gguf(self):
             quant_nontext_module=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
-        assert "mmproj-model.gguf" in os.listdir("./saved")
-        for file in os.listdir("./saved"):
-            print(f"{file}: {os.path.getsize(os.path.join('./saved', file)) / 1024**2} MB")
-            file_size = os.path.getsize(os.path.join("./saved", file)) / 1024**2
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
+        quantized_model_path = quantized_model_path[0]
+        assert "mmproj-model.gguf" in os.listdir(quantized_model_path)
+        for file in os.listdir(quantized_model_path):
+            print(f"{file}: {os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2} MB")
+            file_size = os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2
             if "mmproj-model.gguf" in file:
                 assert abs(file_size - 75) < 5.0
             else:
diff --git a/test/test_cuda/integrations/test_transformers.py b/test/test_cuda/integrations/test_transformers.py
index 638cb7396..dc6411ab0 100644
--- a/test/test_cuda/integrations/test_transformers.py
+++ b/test/test_cuda/integrations/test_transformers.py
@@ -200,8 +200,11 @@ def test_mixed_bits(self):
 
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config)
         with tempfile.TemporaryDirectory() as tmpdirname:
-            autoround.quantize_and_save(output_dir=tmpdirname)
-            model = AutoModelForCausalLM.from_pretrained(tmpdirname, torch_dtype=torch.float16, device_map="cuda")
+            _, quantized_model_path = autoround.quantize_and_save(output_dir=tmpdirname)
+            quantized_model_path = quantized_model_path[0]
+            model = AutoModelForCausalLM.from_pretrained(
+                quantized_model_path, torch_dtype=torch.float16, device_map="cuda"
+            )
             text = "There is a girl who likes adventure,"
             inputs = tokenizer(text, return_tensors="pt").to(model.device)
             tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0])
diff --git a/test/test_cuda/models/test_moe_model.py b/test/test_cuda/models/test_moe_model.py
index 66b0dc827..478ae49f0 100644
--- a/test/test_cuda/models/test_moe_model.py
+++ b/test/test_cuda/models/test_moe_model.py
@@ -179,7 +179,8 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe):
         iters=1,
         ignore_layers="self_attn,lm_head,mlp.gate",
     )
-    quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
+    quantized_model, output_dir = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
+    output_dir = output_dir[0]
     assert quantized_model is not None, "Quantized model should not be None."
     loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir)
     loaded_model.to("cuda")
diff --git a/test/test_cuda/quantization/test_2_3bits.py b/test/test_cuda/quantization/test_2_3bits.py
index 12ed75faa..e3aa891ba 100644
--- a/test/test_cuda/quantization/test_2_3bits.py
+++ b/test/test_cuda/quantization/test_2_3bits.py
@@ -47,7 +47,10 @@ def test_3bits_autoround(self):
         model_name = get_model_path("facebook/opt-125m")
         autoround = AutoRound(model_name, bits=3)
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")  ##will convert to gptq model
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round"
+        )  ##will convert to gptq model
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
@@ -65,8 +68,9 @@ def test_3bits_asym_autoround(self):
         model_name = get_model_path("facebook/opt-125m")
         bits, sym = 3, False
         autoround = AutoRound(model_name, bits=bits, sym=sym)
-        autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False)
+        quantized_model_path = quantized_model_path[0]
+        model_args = f"pretrained={quantized_model_path}"
         res = simple_evaluate(
             model="hf",
             model_args=model_args,
diff --git a/test/test_cuda/quantization/test_mix_bits.py b/test/test_cuda/quantization/test_mix_bits.py
index 614a5d2b1..b53a33d39 100644
--- a/test/test_cuda/quantization/test_mix_bits.py
+++ b/test/test_cuda/quantization/test_mix_bits.py
@@ -52,7 +52,8 @@ def test_mixed_gptqmodel(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        quantized_model_path = quantized_model_path[0]
         from gptqmodel import GPTQModel
 
         model = GPTQModel.load(quantized_model_path)
@@ -79,7 +80,8 @@ def test_mixed_gptqmodel_convert_to_ar(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        quantized_model_path = quantized_model_path[0]
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", quantization_config=quantization_config
@@ -107,7 +109,8 @@ def test_mixed_autoround_format(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "self.save_dir"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        quantized_model_path = quantized_model_path[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
         assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8
         assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3
@@ -133,7 +136,8 @@ def test_fallback_regex_for_awq_format(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "self.save_dir"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        quantized_model_path = quantized_model_path[0]
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", quantization_config=quantization_config
@@ -221,7 +225,10 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, inplace=False, format="auto_round"
+        )
+        quantized_model_path = quantized_model_path[0]
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path,
             torch_dtype="auto",
@@ -289,9 +296,10 @@ def test_mixed_llmcompressor_format_vllm(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        compressed, _ = autoround.quantize_and_save(
+        compressed, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, inplace=False, format="llm_compressor"
         )
+        quantized_model_path = quantized_model_path[0]
         from vllm import LLM, SamplingParams
 
         # Sample prompts.
diff --git a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
index d76b556e3..407cf7e42 100644
--- a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
+++ b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
@@ -52,7 +52,8 @@ def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path):
 
         # Quantize and save the model to the temporary directory
         quantized_model_path = f"{temp_dir}/tmp_autoround_{scheme}"
-        autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path)
+        quantized_model_path = quantized_model_path[0]
 
         # Perform inference with the quantized model
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cuda/quantization/test_mxfp_nvfp.py b/test/test_cuda/quantization/test_mxfp_nvfp.py
index 922cb9f9b..5e20c66a1 100644
--- a/test/test_cuda/quantization/test_mxfp_nvfp.py
+++ b/test/test_cuda/quantization/test_mxfp_nvfp.py
@@ -64,7 +64,10 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="llm_compressor"
+        )
+        quantized_model_path = quantized_model_path[0]
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (
             hasattr(tmp_layer, "weight_scale")
@@ -150,7 +153,10 @@ def test_qwen_moe_quant_infer(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, inplace=True, format="auto_round"
+        )
+        quantized_model_path = quantized_model_path[0]
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto")
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         from auto_round.eval.evaluation import simple_evaluate_user_model
diff --git a/test/test_cuda/schemes/test_scheme.py b/test/test_cuda/schemes/test_scheme.py
index 315c5984b..c5ba7d4fc 100644
--- a/test/test_cuda/schemes/test_scheme.py
+++ b/test/test_cuda/schemes/test_scheme.py
@@ -120,7 +120,8 @@ def test_q2k_mixed(self):
             disable_opt_rtn=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed")
+        quantized_model_path = quantized_model_path[0]
         gguf_file = os.listdir(quantized_model_path)[0]
         file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2
         assert abs(file_size - 1236) < 5.0
diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py
index 6c9d65c83..77089103e 100644
--- a/test/test_xpu/test_autoround.py
+++ b/test/test_xpu/test_autoround.py
@@ -42,7 +42,8 @@ def test_gptq_format(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="auto")
 
@@ -74,7 +75,10 @@ def test_awq_format(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:auto_awq"
+        )
+        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="auto")
         # device_map="auto" doesn't work, must use "xpu"

From 0ff6480ef89bd1b04fffb4c667e2bb153f7a6b4d Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Tue, 27 Jan 2026 14:53:16 +0800
Subject: [PATCH 07/14] fix merge

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 auto_round/compressors/base.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 6cca95c34..39f10fd78 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -497,7 +497,7 @@ def _post_init(self) -> None:
         self._check_configs()
 
         if isinstance(self.scheme, AutoScheme):
-            self.layer_config = self._gen_auto_scheme(self.model, self.scheme, self.dataset, self.device_map)
+            self.layer_config = self._gen_auto_scheme(self.model, self.orig_scheme, self.dataset, self.device_map)
 
         if is_hpex_available():
             logger.info("habana_frameworks is available, import htcore explicitly.")
@@ -1570,9 +1570,6 @@ def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True)
                 else:
                     self.ignore_layers += "," + tmp_str
 
-        if self.is_auto_scheme:
-            self.layer_config = self._gen_auto_scheme(self.model, self.orig_scheme, self.dataset, self.device_map)
-
         fill_default_value = True
         if self.is_auto_scheme:
             fill_default_value = False

From 151710a1a8ac8e3d0ad18b28e4a9ecfebcb644ae Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Wed, 28 Jan 2026 14:58:52 +0800
Subject: [PATCH 08/14] fix

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 auto_round/compressors/base.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 39f10fd78..04a6cd738 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -287,6 +287,11 @@ def __init__(
 
         # Extra/legacy kwargs for backward compatibility
         # Major version releases may pack them with extra configuration options
+        scheme_keys = [f.name for f in fields(QuantizationScheme)]
+        for key in scheme_keys:
+            if key in kwargs and kwargs[key] is not None:
+                setattr(self, key, kwargs.pop(key))
+
         amp = kwargs.pop("amp", True)
         lr = kwargs.pop("lr", None)
         enable_minmax_tuning = kwargs.pop("enable_minmax_tuning", True)
@@ -433,7 +438,7 @@ def _post_init(self) -> None:
         # should be set after loading model and set layer_config, cause some special scheme need these.
         # Preserve the original, unparsed scheme for later use in auto scheme generation
         # within `configure_layer_config` (which may need the raw value instead of `self.scheme`).
-        self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(self.scheme, {})
+        self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(self.scheme)
 
         # GGUF uses fp32 scale dtype as default
         if self.scale_dtype is None:
@@ -619,18 +624,18 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None:
             raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}")
 
     def _parse_and_set_scheme(
-        self, scheme: Union[str, dict, QuantizationScheme], kwargs
+        self,
+        scheme: Union[str, dict, QuantizationScheme],
     ) -> tuple[QuantizationScheme, bool]:
         """Parse and set the quantization scheme."""
 
-        def _parse_and_set(scheme, kwargs):
-            if kwargs.get("data_type", None) and kwargs["data_type"].endswith("_dq") and not scheme.startswith("gguf"):
-                if "bits" not in kwargs:
-                    data_type = kwargs["data_type"]
+        def _parse_and_set(scheme):
+            if self.data_type and self.data_type.endswith("_dq") and not scheme.startswith("gguf"):
+                if not hasattr(self, "bits") or self.bits is None:
                     raise KeyError(
-                        f"please set bits when setting data_type={data_type}, or using scheme as an alternative."
+                        f"please set bits when setting data_type={self.data_type}, or using scheme as an alternative."
                     )
-                bits = kwargs["bits"]
+                bits = self.bits
                 scheme = f"gguf:q{bits}_k" if bits == 6 else f"gguf:q{bits}_k_s"
             res = None
             if isinstance(scheme, QuantizationScheme):
@@ -648,11 +653,10 @@ def _parse_and_set(scheme, kwargs):
                 scheme = asdict(preset_name_to_scheme(scheme))
             scheme_keys = [f.name for f in fields(QuantizationScheme)]
             for key in scheme_keys:
-                if key in kwargs and kwargs[key] is not None:
-                    setattr(self, key, kwargs[key])
+                if hasattr(self, key) and getattr(self, key) is not None:
+                    continue
                 else:
                     setattr(self, key, scheme.get(key, None))
-                # kwargs.pop(key, None)
             if self.act_dynamic is None:
                 self.act_dynamic = True
 
@@ -708,7 +712,7 @@ def _parse_and_set(scheme, kwargs):
                 raise ValueError("options of AutoScheme must not be empty")
             options = []
             for option in scheme.options:
-                new_option = _parse_and_set(option, kwargs)
+                new_option = _parse_and_set(option)
                 options.append(new_option)
             scheme.options = options
             for opt in options:
@@ -720,15 +724,13 @@ def _parse_and_set(scheme, kwargs):
                 self.scheme = opt  # Choose the first one that not 16 bits
                 break
             # apply scheme to set default bits
-            scheme = _parse_and_set(self.scheme, kwargs)
+            scheme = _parse_and_set(self.scheme)
             is_auto_scheme = True
         else:
-            scheme = _parse_and_set(scheme, kwargs)
+            scheme = _parse_and_set(scheme)
             is_auto_scheme = False
 
         scheme_keys = [f.name for f in fields(QuantizationScheme)]
-        for key in scheme_keys:
-            kwargs.pop(key, None)
 
         return scheme, is_auto_scheme
 

From 84f9db8aee2f94f046cb0d1e0499672272c2aae9 Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Wed, 28 Jan 2026 15:33:00 +0800
Subject: [PATCH 09/14] fix

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 auto_round/compressors/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 04a6cd738..90ebc39c8 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -630,7 +630,7 @@ def _parse_and_set_scheme(
         """Parse and set the quantization scheme."""
 
         def _parse_and_set(scheme):
-            if self.data_type and self.data_type.endswith("_dq") and not scheme.startswith("gguf"):
+            if getattr(self, "data_type", None) and self.data_type.endswith("_dq") and not scheme.startswith("gguf"):
                 if not hasattr(self, "bits") or self.bits is None:
                     raise KeyError(
                         f"please set bits when setting data_type={self.data_type}, or using scheme as an alternative."

From 464a5896cbddb6451c042e2d6022953bf33e7e2b Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Wed, 28 Jan 2026 16:30:57 +0800
Subject: [PATCH 10/14] fix

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 auto_round/compressors/base.py                |  4 ++--
 auto_round/eval/evaluation.py                 |  2 +-
 test/test_ark/test_model.py                   |  1 -
 test/test_cpu/backends/test_torch_backend.py  |  4 +---
 test/test_cpu/core/test_autoround.py          |  4 ++--
 test/test_cpu/core/test_init.py               |  2 ++
 test/test_cpu/export/test_export.py           | 19 +++++++++++--------
 test/test_cpu/export/test_gguf_format.py      | 16 ++++++++--------
 .../integrations/test_llmcompressor.py        |  4 ++--
 test/test_cpu/models/test_mllm.py             |  3 ++-
 test/test_cpu/models/test_moe_model.py        |  2 +-
 .../quantization/test_act_quantization.py     |  8 ++++----
 test/test_cpu/quantization/test_mix_bits.py   | 11 ++++++-----
 test/test_cpu/quantization/test_mxfp_nvfp.py  | 13 +++++++------
 .../quantization/test_mxfp_save_load.py       |  1 -
 test/test_cpu/schemes/test_scheme.py          |  9 +++++----
 test/test_cpu/utils/test_generation.py        |  3 +--
 test/test_cuda/backends/test_torch_backend.py |  2 --
 .../test_cuda/backends/test_triton_backend.py |  6 +-----
 .../export/test_auto_round_format.py          |  7 +------
 test/test_cuda/export/test_export.py          |  7 +++----
 test/test_cuda/export/test_gguf.py            |  4 ++--
 .../integrations/test_transformers.py         |  2 +-
 test/test_cuda/quantization/test_2_3bits.py   |  3 +--
 test/test_cuda/quantization/test_mix_bits.py  | 12 ++++++------
 .../quantization/test_mxfp_and_nvfp_quant.py  |  1 -
 test/test_cuda/quantization/test_mxfp_nvfp.py |  4 ++--
 test/test_cuda/schemes/test_scheme.py         |  2 +-
 test/test_xpu/test_autoround.py               |  3 +--
 29 files changed, 74 insertions(+), 85 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 90ebc39c8..890124f39 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -3164,7 +3164,7 @@ def save_quantized(
         output_dir: str = None,
         format: Union[str, list[OutputFormat]] = "auto_round",
         inplace: bool = True,
-        return_folders=False,
+        return_folders=True,
         **kwargs,
     ) -> torch.nn.Module:
         """Save the quantized model to the specified output directory in the specified format.
@@ -3218,7 +3218,7 @@ def save_quantized(
             folders.append(save_folder)
 
         if return_folders:
-            return compressed_model, folders
+            return compressed_model, folders[0] if len(folders) == 1 else folders
         else:
             return compressed_model
 
diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py
index cff49b371..0bc6dbd3a 100644
--- a/auto_round/eval/evaluation.py
+++ b/auto_round/eval/evaluation.py
@@ -380,7 +380,7 @@ def run_model_evaluation(model, tokenizer, autoround, folders, formats, device_s
         return
 
     # Check if evaluation is needed for language models
-    eval_folder = folders[-1] if folders else None
+    eval_folder = folders[-1] if folders and isinstance(folders, list) else folders
     if args.tasks is None or args.tasks == "" or eval_folder is None:
         return
 
diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py
index ed53e2045..dd5119334 100644
--- a/test/test_ark/test_model.py
+++ b/test/test_ark/test_model.py
@@ -43,7 +43,6 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format=format
         )  ##will convert to gptq model
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="ark")
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py
index 6ea89f434..b47dfe48d 100644
--- a/test/test_cpu/backends/test_torch_backend.py
+++ b/test/test_cpu/backends/test_torch_backend.py
@@ -41,14 +41,13 @@ def test_torch_4bits_asym(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round:gptqmodel"
         )
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10)
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -85,7 +84,6 @@ def test_torch_4bits_sym(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round"
         )  ##will convert to gptq model
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py
index cfd1b3bc7..f565dabec 100644
--- a/test/test_cpu/core/test_autoround.py
+++ b/test/test_cpu/core/test_autoround.py
@@ -14,6 +14,7 @@
 
 
 class TestAutoRound:
+
     @classmethod
     def setup_class(self):
         model_name = opt_name_or_path
@@ -383,7 +384,6 @@ def test_rtn(self, tiny_opt_model_path):
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, nsamples=1)
         quantized_model_path = self.save_folder
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-        quantized_model_path = quantized_model_path[0]
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path,
             torch_dtype=torch.float16,
@@ -441,7 +441,7 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader):
         _, quantized_model_path = autoround.save_quantized(
             output_dir=quantized_model_path, format="auto_round", inplace=True
         )
-        quantized_model_path = quantized_model_path[0]
+
         quantization_config = AutoRoundConfig(backend="ipex")
 
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cpu/core/test_init.py b/test/test_cpu/core/test_init.py
index 01785d679..46a460dd3 100644
--- a/test/test_cpu/core/test_init.py
+++ b/test/test_cpu/core/test_init.py
@@ -3,6 +3,8 @@
 
 def test_torch_compile(tiny_opt_model_path):
     ar = AutoRound(model=tiny_opt_model_path, scheme="NVFP4", enable_torch_compile=True)
+    ar._post_init()
     assert not ar.enable_torch_compile, "NVFP4 cannot work with torch.compile."
     ar = AutoRound(model=tiny_opt_model_path, scheme="FP8_STATIC", enable_torch_compile=True)
+    ar._post_init()
     assert not ar.enable_torch_compile, "FP8_STATIC cannot work with torch.compile."
diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py
index 3b9df68eb..2dc4c0c59 100644
--- a/test/test_cpu/export/test_export.py
+++ b/test/test_cpu/export/test_export.py
@@ -23,6 +23,7 @@ def _get_folder_size(path: str) -> float:
 
 
 class TestAutoRound:
+
     @classmethod
     def setup_class(self):
         self.model_name = opt_name_or_path
@@ -51,7 +52,6 @@ def test_autogptq_format(self, dataloader):
 
             quantized_model_path = "./saved"
             _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
-            quantized_model_path = quantized_model_path[0]
 
             if group_size == -1:
                 shutil.rmtree("./saved", ignore_errors=True)
@@ -81,7 +81,6 @@ def test_autoround_format(self, dataloader):
             )
             quantized_model_path = "./saved"
             _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-            quantized_model_path = quantized_model_path[0]
 
             if group_size == -1:
                 shutil.rmtree("./saved", ignore_errors=True)
@@ -111,7 +110,6 @@ def test_autoround_awq_format(self, dataloader):
             _, quantized_model_path = autoround.quantize_and_save(
                 output_dir=quantized_model_path, format="auto_round:auto_awq"
             )
-            quantized_model_path = quantized_model_path[0]
 
             # quantization_config = AutoRoundConfig(
             #     backend="cpu"
@@ -230,7 +228,7 @@ def test_static_afp8_export(self, static_kv_dtype):
         )
         quantized_model_path = "./saved"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-        quantized_model_path = quantized_model_path[0]
+
         f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt")
         assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys()
         assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys()
@@ -292,7 +290,6 @@ def test_static_afp8_export(self, static_kv_dtype):
         )
         quantized_model_path = "./saved"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-        quantized_model_path = quantized_model_path[0]
 
         f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt")
         assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys()
@@ -318,7 +315,8 @@ def test_static_fp8_attn(self):
             static_attention_dtype="fp8",
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+
         f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt")
         assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys()
         assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys()
@@ -359,7 +357,7 @@ def test_awq_lmhead_export(self, dataloader):
         compressed_model, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_awq"
         )
-        quantized_model_path = quantized_model_path[0]
+
         lm_head = compressed_model.lm_head
         from auto_round.export.export_to_awq.utils import WQLinear_GEMM
 
@@ -393,7 +391,7 @@ def test_gptq_lmhead_export(self, dataloader):
         compressed_model, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_gptq"
         )
-        quantized_model_path = quantized_model_path[0]
+
         lm_head = compressed_model.lm_head
         assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer"
         quantization_config = AutoRoundConfig()
@@ -414,6 +412,7 @@ def test_export_format(self):
             self.model_name,
             scheme="FP8_STATIC",
         )
+        autoround._post_init()
         format_list = get_formats("auto_round, llm_compressor, auto_round:llm_compressor", autoround)
         assert len(format_list) == 3
         assert format_list[0].output_format == "auto_round"
@@ -427,6 +426,7 @@ def test_export_format(self):
             self.model_name,
             scheme="W4A16",
         )
+        autoround._post_init()
         format_list = get_formats("auto_round:auto_awq, auto_gptq", autoround)
         assert format_list[0].output_format == "auto_round"
         assert format_list[0].get_backend_name() == "auto_round:auto_awq"
@@ -443,6 +443,7 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path):
             group_size=32,
             sym=True,
         )
+        ar._post_init()
         with pytest.raises(ValueError, match="auto_awq format support quantization scheme with W4A16 but got bits=2"):
             get_formats("auto_round:auto_awq", ar)
 
@@ -456,6 +457,7 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path):
             group_size=32,
             sym=True,
         )
+        ar._post_init()
         with pytest.raises(ValueError, match="but got data_type=fp, bits=4"):
             get_formats("auto_round:llm_compressor", ar)
 
@@ -466,4 +468,5 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path):
             group_size=256,
             sym=True,
         )
+        ar._post_init()
         get_formats("auto_round:auto_awq", ar)
diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py
index cc2c8bf3d..636b180e7 100644
--- a/test/test_cpu/export/test_gguf_format.py
+++ b/test/test_cpu/export/test_gguf_format.py
@@ -67,7 +67,7 @@ def test_q4_0(self):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, inplace=False, format="gguf:q4_0"
         )
-        quantized_model_path = quantized_model_path[0]
+
         gguf_file = os.listdir(quantized_model_path)[0]
 
         # TODO: fix the issue of gguf loading error in transformers v5
@@ -95,7 +95,7 @@ def test_func(self):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, inplace=False, format="gguf:q*_1"
         )
-        quantized_model_path = quantized_model_path[0]
+
         assert autoround.group_size == 32
         assert not autoround.sym
         gguf_file = os.listdir(quantized_model_path)[0]
@@ -126,7 +126,7 @@ def test_gguf_baseline(self):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, inplace=False, format="fake"
         )
-        quantized_model_path = quantized_model_path[0]
+
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
         text = "There is a girl who likes adventure,"
         inputs = self.tokenizer(text, return_tensors="pt").to(model.device)
@@ -164,7 +164,7 @@ def test_q4_k_m(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="gguf:q4_k_m,fake"
         )
-        quantized_model_path = quantized_model_path[0]
+
         assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16
         assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq"
         assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq"
@@ -183,7 +183,7 @@ def test_q4_k_m(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="gguf:q4_k_m,fake"
         )
-        quantized_model_path = quantized_model_path[0]
+
         shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     def test_all_format(self, tiny_qwen_model_path):
@@ -232,7 +232,7 @@ def test_vlm_gguf(self):
         )
         quantized_model_path = "./saved"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
-        quantized_model_path = quantized_model_path[0]
+
         assert "mmproj-model.gguf" in os.listdir(quantized_model_path)
         for file_name in os.listdir(quantized_model_path):
             file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2
@@ -259,7 +259,7 @@ def test_vlm_gguf_wo_quant_nontext_module(self):
         )
         quantized_model_path = "./saved"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
-        quantized_model_path = quantized_model_path[0]
+
         assert "mmproj-model.gguf" in os.listdir(quantized_model_path)
         for file_name in os.listdir(quantized_model_path):
             file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2
@@ -358,7 +358,7 @@ def test_q2k_mixed(self):
         )
         quantized_model_path = "./saved"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed")
-        quantized_model_path = quantized_model_path[0]
+
         gguf_file = os.listdir(quantized_model_path)[0]
         file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2
         assert abs(file_size - 1362) < 5.0
diff --git a/test/test_cpu/integrations/test_llmcompressor.py b/test/test_cpu/integrations/test_llmcompressor.py
index 2c41406fe..95afce423 100644
--- a/test/test_cpu/integrations/test_llmcompressor.py
+++ b/test/test_cpu/integrations/test_llmcompressor.py
@@ -11,6 +11,7 @@
 
 
 class TestLLMC:
+
     @classmethod
     def setup_class(self):
         self.model_name = get_model_path("stas/tiny-random-llama-2")
@@ -51,7 +52,7 @@ def test_llmcompressor_fp8(self):
             iters=0,
         )
         _, quantized_model_path = autoround.quantize_and_save("./saved", format="llm_compressor")
-        quantized_model_path = quantized_model_path[0]
+
         # from vllm import LLM
         # model = LLM("./saved")
         # result = model.generate("Hello my name is")
@@ -77,7 +78,6 @@ def test_autoround_llmcompressor_fp8(self):
             iters=0,
         )
         _, quantized_model_path = autoround.quantize_and_save("./saved", format="auto_round:llm_compressor")
-        quantized_model_path = quantized_model_path[0]
 
         import json
 
diff --git a/test/test_cpu/models/test_mllm.py b/test/test_cpu/models/test_mllm.py
index 9b2796b92..748a6ede0 100644
--- a/test/test_cpu/models/test_mllm.py
+++ b/test/test_cpu/models/test_mllm.py
@@ -9,6 +9,7 @@
 
 
 class FakeDataLoader:
+
     def __init__(self):
         self.batch_size = 1
 
@@ -26,6 +27,7 @@ def __iter__(self):
 
 
 class TestAutoRoundMLLM:
+
     @classmethod
     def setup_class(self):
         self.model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct")
@@ -221,7 +223,6 @@ def test_qwen2_5(self, tiny_qwen_2_5_vl_model_path):
             image_processor=image_processor,
         )
         _, quantized_model_path = autoround.quantize_and_save("./saved/", format="auto_round")
-        quantized_model_path = quantized_model_path[0]
 
         import requests
         from PIL import Image
diff --git a/test/test_cpu/models/test_moe_model.py b/test/test_cpu/models/test_moe_model.py
index 234c9ae55..01bedb40d 100644
--- a/test/test_cpu/models/test_moe_model.py
+++ b/test/test_cpu/models/test_moe_model.py
@@ -70,7 +70,7 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0):
         ignore_layers="self_attn,router,lm_head,mlp.gate",
     )
     quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
-    return quantized_model, save_folder[0]
+    return quantized_model, save_folder
 
 
 def count_modules_by_type(model, target_module_name_or_class):
diff --git a/test/test_cpu/quantization/test_act_quantization.py b/test/test_cpu/quantization/test_act_quantization.py
index 6e2d91f29..647a3fcb9 100644
--- a/test/test_cpu/quantization/test_act_quantization.py
+++ b/test/test_cpu/quantization/test_act_quantization.py
@@ -114,7 +114,7 @@ def test_act_config_MXFP4_saving(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-        quantized_model_path = quantized_model_path[0]
+
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         assert "lm_head" not in model.config.quantization_config.extra_config
 
@@ -143,7 +143,7 @@ def test_act_config_NVFP4_saving(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-        quantized_model_path = quantized_model_path[0]
+
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         kproj_config = model.config.quantization_config.extra_config["model.decoder.layers.1.self_attn.k_proj"]
         assert "act_data_type" in kproj_config.keys() and kproj_config["act_data_type"] == "nv_fp4_with_static_gs"
@@ -170,7 +170,7 @@ def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-        quantized_model_path = quantized_model_path[0]
+
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         extra_config = model.config.quantization_config.extra_config
 
@@ -202,7 +202,7 @@ def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-        quantized_model_path = quantized_model_path[0]
+
         from transformers import AutoConfig
 
         extra_config = AutoConfig.from_pretrained(quantized_model_path).quantization_config["extra_config"]
diff --git a/test/test_cpu/quantization/test_mix_bits.py b/test/test_cpu/quantization/test_mix_bits.py
index 99437436e..749219271 100644
--- a/test/test_cpu/quantization/test_mix_bits.py
+++ b/test/test_cpu/quantization/test_mix_bits.py
@@ -25,6 +25,7 @@ def _get_folder_size(path: str) -> float:
 
 
 class TestAutoRound:
+
     @classmethod
     def setup_class(self):
         self.model_name = opt_name_or_path
@@ -56,7 +57,7 @@ def test_mixed_gptqmodel(self, dataloader):
         )
         quantized_model_path = self.save_dir
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
-        quantized_model_path = quantized_model_path[0]
+
         # test original GPTQModel inference
         from gptqmodel import GPTQModel
 
@@ -86,7 +87,7 @@ def test_mixed_gptqmodel_convert_to_ar(self, dataloader):
         )
         quantized_model_path = self.save_dir
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
-        quantized_model_path = quantized_model_path[0]
+
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="cpu", quantization_config=quantization_config
@@ -117,7 +118,7 @@ def test_mixed_autoround_format(self, dataloader):
         compressed_model, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round"
         )
-        quantized_model_path = quantized_model_path[0]
+
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8
         assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3
@@ -142,7 +143,7 @@ def test_fallback_regex_for_awq_format(self, dataloader):
         )
         quantized_model_path = "./saved"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
-        quantized_model_path = quantized_model_path[0]
+
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="cpu", quantization_config=quantization_config
@@ -232,7 +233,7 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, inplace=False, format="auto_round"
         )
-        quantized_model_path = quantized_model_path[0]
+
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path,
             torch_dtype="auto",
diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py
index 84f8a82ff..e061d5b3b 100644
--- a/test/test_cpu/quantization/test_mxfp_nvfp.py
+++ b/test/test_cpu/quantization/test_mxfp_nvfp.py
@@ -23,6 +23,7 @@ def _get_folder_size(path: str) -> float:
 
 
 class TestAutoRoundFP:
+
     @classmethod
     def setup_class(self):
         self.save_dir = "./saved"
@@ -87,7 +88,7 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader):
         compressed_model, quantized_model_path = autoround.quantize_and_save(
             output_dir=self.save_dir, inplace=True, format="auto_round"
         )
-        quantized_model_path = quantized_model_path[0]
+
         lm_head = compressed_model.lm_head
         assert (
             hasattr(lm_head, "weight_scale")
@@ -146,7 +147,7 @@ def test_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
         autoround.quantize()
-        compressed_model = autoround.save_quantized(
+        compressed_model, _ = autoround.save_quantized(
             output_dir=quantized_model_path, inplace=True, format="llm_compressor"
         )
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
@@ -186,7 +187,7 @@ def test_rtn_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
         autoround.quantize()
-        compressed_model = autoround.save_quantized(
+        compressed_model, _ = autoround.save_quantized(
             output_dir=quantized_model_path, inplace=True, format="llm_compressor"
         )
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
@@ -226,7 +227,7 @@ def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, dataloader):
         compressed_model, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="llm_compressor"
         )
-        quantized_model_path = quantized_model_path[0]
+
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (
             hasattr(tmp_layer, "weight_scale")
@@ -266,7 +267,7 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader):
         compressed_model, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="llm_compressor"
         )
-        quantized_model_path = quantized_model_path[0]
+
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (
             hasattr(tmp_layer, "weight_scale")
@@ -306,7 +307,7 @@ def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader):
         compressed_model, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round"
         )
-        quantized_model_path = quantized_model_path[0]
+
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (
             hasattr(tmp_layer, "weight_scale")
diff --git a/test/test_cpu/quantization/test_mxfp_save_load.py b/test/test_cpu/quantization/test_mxfp_save_load.py
index 5d86a99ee..4fdcaeffc 100644
--- a/test/test_cpu/quantization/test_mxfp_save_load.py
+++ b/test/test_cpu/quantization/test_mxfp_save_load.py
@@ -61,7 +61,6 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type):
         # Quantize and save the model to the temporary directory
         quantized_model_path = f"{temp_dir}/tmp_autoround"
         _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path)
-        quantized_model_path = quantized_model_path[0]
 
         # Perform inference with the quantized model
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cpu/schemes/test_scheme.py b/test/test_cpu/schemes/test_scheme.py
index f4f0e716b..f5a1a6ace 100644
--- a/test/test_cpu/schemes/test_scheme.py
+++ b/test/test_cpu/schemes/test_scheme.py
@@ -33,36 +33,37 @@ def test_gguf(self, tiny_qwen_model_path, dataloader):
 
     def test_w4a16(self, tiny_opt_model_path, dataloader):
         ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader)
-        assert ar.bits == 4
         ar.quantize()
+        assert ar.bits == 4
 
     def test_w2a16_rtn(self, tiny_opt_model_path, dataloader):
         ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader)
-        assert ar.bits == 2
         ar.quantize()
+        assert ar.bits == 2
 
     def test_mxfp4(self, tiny_opt_model_path, dataloader):
         ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader)
+        ar.quantize()
         assert ar.bits == 4
         assert ar.act_bits == 4
         assert ar.data_type == "mx_fp"
         assert ar.act_data_type == "mx_fp"
-        ar.quantize()
 
     def test_vllm(self, tiny_qwen_vl_model_path):
         from auto_round import AutoRoundMLLM
 
         ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2)
+        ar._post_init()
         assert ar.bits == 2
         assert ar.act_bits == 16
 
     def test_nvfp4(self, tiny_opt_model_path, dataloader):
         ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader)
+        ar.quantize()
         assert ar.bits == 4
         assert ar.act_bits == 4
         assert ar.data_type == "nv_fp"
         assert ar.act_data_type == "nv_fp4_with_static_gs"
-        ar.quantize()
 
     def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader):
         import copy
diff --git a/test/test_cpu/utils/test_generation.py b/test/test_cpu/utils/test_generation.py
index fed11dbbe..245ffcc72 100644
--- a/test/test_cpu/utils/test_generation.py
+++ b/test/test_cpu/utils/test_generation.py
@@ -11,6 +11,7 @@
 
 
 class TestAutoRoundFormatGeneration:
+
     @classmethod
     def setup_class(self):
         self.model_name = opt_name_or_path
@@ -42,7 +43,6 @@ def test_4bits_sym(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round", inplace=False
         )
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="ipex")
         model = AutoModelForCausalLM.from_pretrained(
@@ -83,7 +83,6 @@ def test_autoround_sym(self, dataloader):
             quantized_model_path = "./saved"
 
             _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-            quantized_model_path = quantized_model_path[0]
 
             model = AutoModelForCausalLM.from_pretrained(
                 quantized_model_path, device_map="auto", trust_remote_code=True
diff --git a/test/test_cuda/backends/test_torch_backend.py b/test/test_cuda/backends/test_torch_backend.py
index 28e2166e6..16f7dfcae 100644
--- a/test/test_cuda/backends/test_torch_backend.py
+++ b/test/test_cuda/backends/test_torch_backend.py
@@ -47,7 +47,6 @@ def test_torch_4bits_asym(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round:gptqmodel"
         )
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
@@ -92,7 +91,6 @@ def test_torch_4bits_sym(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round"
         )  ##will convert to gptq model
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cuda/backends/test_triton_backend.py b/test/test_cuda/backends/test_triton_backend.py
index c4eb3317b..b99e68ef8 100644
--- a/test/test_cuda/backends/test_triton_backend.py
+++ b/test/test_cuda/backends/test_triton_backend.py
@@ -12,6 +12,7 @@
 
 
 class TestAutoRoundTritonBackend:
+
     @classmethod
     def setup_class(self):
         self.model_name = "/models/opt-125m"
@@ -41,7 +42,6 @@ def test_tritonv2_4bits_asym(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round:gptqmodel"
         )
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
@@ -75,7 +75,6 @@ def test_tritonv2_2bits_asym(self):
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
         quantized_model_path = self.save_folder
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
@@ -118,7 +117,6 @@ def test_tritonv2_4bits_sym(self, dataloader):
         )
         quantized_model_path = self.save_folder
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
@@ -153,7 +151,6 @@ def test_tritonv2_8bits_sym(self):
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, nsamples=1, iters=1)
         quantized_model_path = self.save_folder
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
@@ -193,7 +190,6 @@ def test_tritonv2_2bits_sym(self):
         )
         quantized_model_path = self.save_folder
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cuda/export/test_auto_round_format.py b/test/test_cuda/export/test_auto_round_format.py
index f936474f5..ba94ebda9 100644
--- a/test/test_cuda/export/test_auto_round_format.py
+++ b/test/test_cuda/export/test_auto_round_format.py
@@ -53,7 +53,6 @@ def test_autoround_asym(self, tiny_opt_model_path, dataloader):
             quantized_model_path = self.save_dir
 
             _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-            quantized_model_path = quantized_model_path[0]
 
             model = AutoModelForCausalLM.from_pretrained(
                 quantized_model_path, device_map="cuda:0", trust_remote_code=True
@@ -81,7 +80,7 @@ def test_mixed_precision(self):
         autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config)
         quantized_model_path = self.save_dir
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-        quantized_model_path = quantized_model_path[0]
+
         quantization_config = AutoRoundConfig(backend="auto")
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
@@ -110,7 +109,6 @@ def test_awq_backend(self):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round:auto_awq"
         )
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="auto")
         model = AutoModelForCausalLM.from_pretrained(
@@ -160,7 +158,6 @@ def test_autoround_gptq_sym_format(self, tiny_opt_model_path, dataloader):
         quantized_model_path = "./saved"
 
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
-        quantized_model_path = quantized_model_path[0]
 
         from transformers import AutoRoundConfig
 
@@ -215,7 +212,6 @@ def test_autoround_awq_sym_format(self, tiny_opt_model_path, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round:auto_awq"
         )
-        quantized_model_path = quantized_model_path[0]
 
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
@@ -253,7 +249,6 @@ def test_autoround_sym(self, tiny_opt_model_path, dataloader):
             quantized_model_path = "./saved"
 
             _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-            quantized_model_path = quantized_model_path[0]
 
             model = AutoModelForCausalLM.from_pretrained(
                 quantized_model_path, device_map="auto", trust_remote_code=True
diff --git a/test/test_cuda/export/test_export.py b/test/test_cuda/export/test_export.py
index 7e93c1a56..8b15d234d 100644
--- a/test/test_cuda/export/test_export.py
+++ b/test/test_cuda/export/test_export.py
@@ -117,7 +117,6 @@ def test_autogptq_format_qsave_ignore_layers(self, dataloader):
         )
         quantized_model_path = "./saved"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
-        quantized_model_path = quantized_model_path[0]
 
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
@@ -220,7 +219,7 @@ def test_autoawq_format_fp_qsave_layers(self, dataloader):
         )
         quantized_model_path = "./saved/test_export"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
-        quantized_model_path = quantized_model_path[0]
+
         from auto_round import AutoRoundConfig
 
         model = AutoModelForCausalLM.from_pretrained(
@@ -319,7 +318,7 @@ def test_awq_lmhead_export(self, dataloader):
         compressed_model, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_awq"
         )
-        quantized_model_path = quantized_model_path[0]
+
         lm_head = compressed_model.lm_head
         from auto_round.export.export_to_awq.utils import WQLinear_GEMM
 
@@ -349,7 +348,7 @@ def test_gptq_lmhead_export(self, tiny_qwen_model_path, dataloader):
         compressed_model, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_gptq"
         )
-        quantized_model_path = quantized_model_path[0]
+
         lm_head = compressed_model.lm_head
         assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer"
         quantization_config = AutoRoundConfig()
diff --git a/test/test_cuda/export/test_gguf.py b/test/test_cuda/export/test_gguf.py
index d144241a3..5cadcee34 100644
--- a/test/test_cuda/export/test_gguf.py
+++ b/test/test_cuda/export/test_gguf.py
@@ -155,7 +155,7 @@ def test_special_model(self):
         )
         quantized_model_path = "./saved"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
-        quantized_model_path = quantized_model_path[0]
+
         file_name = os.listdir(quantized_model_path)[0]
         file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2
         assert abs(file_size - 307) < 5.0
@@ -180,7 +180,7 @@ def test_vlm_gguf(self):
         )
         quantized_model_path = "./saved"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
-        quantized_model_path = quantized_model_path[0]
+
         assert "mmproj-model.gguf" in os.listdir(quantized_model_path)
         for file in os.listdir(quantized_model_path):
             print(f"{file}: {os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2} MB")
diff --git a/test/test_cuda/integrations/test_transformers.py b/test/test_cuda/integrations/test_transformers.py
index dc6411ab0..9484daaa0 100644
--- a/test/test_cuda/integrations/test_transformers.py
+++ b/test/test_cuda/integrations/test_transformers.py
@@ -201,7 +201,7 @@ def test_mixed_bits(self):
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config)
         with tempfile.TemporaryDirectory() as tmpdirname:
             _, quantized_model_path = autoround.quantize_and_save(output_dir=tmpdirname)
-            quantized_model_path = quantized_model_path[0]
+
             model = AutoModelForCausalLM.from_pretrained(
                 quantized_model_path, torch_dtype=torch.float16, device_map="cuda"
             )
diff --git a/test/test_cuda/quantization/test_2_3bits.py b/test/test_cuda/quantization/test_2_3bits.py
index e0591b464..1f2e5e812 100644
--- a/test/test_cuda/quantization/test_2_3bits.py
+++ b/test/test_cuda/quantization/test_2_3bits.py
@@ -50,7 +50,6 @@ def test_3bits_autoround(self):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round"
         )  ##will convert to gptq model
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
@@ -69,7 +68,7 @@ def test_3bits_asym_autoround(self):
         bits, sym = 3, False
         autoround = AutoRound(model_name, bits=bits, sym=sym)
         _, quantized_model_path = autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False)
-        quantized_model_path = quantized_model_path[0]
+
         model_args = f"pretrained={quantized_model_path}"
         res = simple_evaluate(
             model="hf",
diff --git a/test/test_cuda/quantization/test_mix_bits.py b/test/test_cuda/quantization/test_mix_bits.py
index 15197538b..c6164f4b5 100644
--- a/test/test_cuda/quantization/test_mix_bits.py
+++ b/test/test_cuda/quantization/test_mix_bits.py
@@ -53,7 +53,7 @@ def test_mixed_gptqmodel(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
-        quantized_model_path = quantized_model_path[0]
+
         from gptqmodel import GPTQModel
 
         model = GPTQModel.load(quantized_model_path)
@@ -81,7 +81,7 @@ def test_mixed_gptqmodel_convert_to_ar(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
-        quantized_model_path = quantized_model_path[0]
+
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", quantization_config=quantization_config
@@ -110,7 +110,7 @@ def test_mixed_autoround_format(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = "self.save_dir"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
-        quantized_model_path = quantized_model_path[0]
+
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
         assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8
         assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3
@@ -137,7 +137,7 @@ def test_fallback_regex_for_awq_format(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = "self.save_dir"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
-        quantized_model_path = quantized_model_path[0]
+
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", quantization_config=quantization_config
@@ -228,7 +228,7 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, inplace=False, format="auto_round"
         )
-        quantized_model_path = quantized_model_path[0]
+
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path,
             torch_dtype="auto",
@@ -305,7 +305,7 @@ def test_mixed_llmcompressor_format_vllm(self, tiny_opt_model_path, dataloader):
         compressed, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, inplace=False, format="llm_compressor"
         )
-        quantized_model_path = quantized_model_path[0]
+
         from vllm import LLM, SamplingParams
 
         # Sample prompts.
diff --git a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
index 407cf7e42..0c1cbf007 100644
--- a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
+++ b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
@@ -53,7 +53,6 @@ def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path):
         # Quantize and save the model to the temporary directory
         quantized_model_path = f"{temp_dir}/tmp_autoround_{scheme}"
         _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path)
-        quantized_model_path = quantized_model_path[0]
 
         # Perform inference with the quantized model
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cuda/quantization/test_mxfp_nvfp.py b/test/test_cuda/quantization/test_mxfp_nvfp.py
index da68942b8..c818f6c1b 100644
--- a/test/test_cuda/quantization/test_mxfp_nvfp.py
+++ b/test/test_cuda/quantization/test_mxfp_nvfp.py
@@ -67,7 +67,7 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader):
         compressed_model, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="llm_compressor"
         )
-        quantized_model_path = quantized_model_path[0]
+
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (
             hasattr(tmp_layer, "weight_scale")
@@ -156,7 +156,7 @@ def test_qwen_moe_quant_infer(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, inplace=False, format="auto_round"
         )
-        quantized_model_path = quantized_model_path[0]
+
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto")
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         from auto_round.eval.evaluation import simple_evaluate_user_model
diff --git a/test/test_cuda/schemes/test_scheme.py b/test/test_cuda/schemes/test_scheme.py
index c5ba7d4fc..a635d2e59 100644
--- a/test/test_cuda/schemes/test_scheme.py
+++ b/test/test_cuda/schemes/test_scheme.py
@@ -121,7 +121,7 @@ def test_q2k_mixed(self):
         )
         quantized_model_path = "./saved"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed")
-        quantized_model_path = quantized_model_path[0]
+
         gguf_file = os.listdir(quantized_model_path)[0]
         file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2
         assert abs(file_size - 1236) < 5.0
diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py
index a5959396d..30627f50a 100644
--- a/test/test_xpu/test_autoround.py
+++ b/test/test_xpu/test_autoround.py
@@ -12,6 +12,7 @@
 
 
 class TestAutoRoundXPU:
+
     @classmethod
     def setup_class(self):
         self.device = "xpu"
@@ -43,7 +44,6 @@ def test_gptq_format(self, dataloader):
         )
         quantized_model_path = "./saved"
         _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="auto")
 
@@ -78,7 +78,6 @@ def test_awq_format(self, dataloader):
         _, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, format="auto_round:auto_awq"
         )
-        quantized_model_path = quantized_model_path[0]
 
         quantization_config = AutoRoundConfig(backend="auto")
         # device_map="auto" doesn't work, must use "xpu"

From 2c9244598a7284166cd9651ea1e556f4e9c96b3d Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Thu, 29 Jan 2026 10:56:00 +0800
Subject: [PATCH 11/14] fix tests to load from the path returned by quantize_and_save

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 test/test_cpu/backends/test_torch_backend.py |  4 +--
 test/test_cpu/core/test_autoround.py         | 12 +++----
 test/test_cuda/advanced/test_fp8_input.py    | 32 ++++++++---------
 test/test_cuda/quantization/test_asym.py     | 36 ++++++++++----------
 test/test_cuda/schemes/test_auto_scheme.py   | 10 +++---
 test/test_cuda/utils/test_alg_ext.py         |  6 ++--
 6 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py
index b47dfe48d..6961ddaa3 100644
--- a/test/test_cpu/backends/test_torch_backend.py
+++ b/test/test_cpu/backends/test_torch_backend.py
@@ -55,10 +55,10 @@ def test_torch_4bits_asym(self, dataloader):
         torch.cuda.empty_cache()
 
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config
+            quantized_model_path, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10)
         print(result["results"]["lambada_openai"]["acc,none"])
diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py
index f565dabec..b1e3e2d05 100644
--- a/test/test_cpu/core/test_autoround.py
+++ b/test/test_cpu/core/test_autoround.py
@@ -741,8 +741,8 @@ def test_invalid_layer_config(self, tiny_opt_model_path):
     def test_quant_lm_head(self, tiny_untied_qwen_model_path):
         model_name = tiny_untied_qwen_model_path
         ar = AutoRound(model_name, quant_lm_head=True, iters=0, seqlen=8, nsamples=1, disable_opt_rtn=True)
-        ar.quantize_and_save(output_dir=self.save_folder, format="auto_round")
-        model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu")
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round")
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         assert "lm_head" in model.config.quantization_config.extra_config
         assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4
 
@@ -756,8 +756,8 @@ def test_quant_lm_head(self, tiny_untied_qwen_model_path):
             disable_opt_rtn=True,
             layer_config=layer_config,
         )
-        ar.quantize_and_save(output_dir=self.save_folder, format="auto_round")
-        model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu")
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round")
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         assert "lm_head" in model.config.quantization_config.extra_config
         assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4
 
@@ -773,8 +773,8 @@ def test_quant_lm_head_layer_config(self, tiny_untied_qwen_model_path):
             disable_opt_rtn=True,
             layer_config=layer_config,
         )
-        ar.quantize_and_save(output_dir=self.save_folder, format="auto_round")
-        model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu")
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round")
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu")
         assert "lm_head" in model.config.quantization_config.extra_config
         assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4
 
diff --git a/test/test_cuda/advanced/test_fp8_input.py b/test/test_cuda/advanced/test_fp8_input.py
index ec3dc6bf3..918291ce2 100644
--- a/test/test_cuda/advanced/test_fp8_input.py
+++ b/test/test_cuda/advanced/test_fp8_input.py
@@ -38,8 +38,8 @@ def setup_and_teardown_class(self):
     def test_small_model_rtn_generation(self):
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
-        ar.quantize_and_save(output_dir=self.save_dir)
-        model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True)
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
         text = "There is a girl who likes adventure,"
         inputs = tokenizer(text, return_tensors="pt").to(model.device)
@@ -49,7 +49,7 @@ def test_small_model_rtn_generation(self):
     def test_gguf_imatrix(self):
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
-        ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir)
         # from llama_cpp import Llama
         #
         # gguf_file = os.listdir("saved/Qwen3-0.6B-FP8/-gguf")[0]
@@ -66,8 +66,8 @@ def test_gguf_imatrix(self):
     def test_small_model_rtn(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         ar = AutoRound(model=model_name, iters=0)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.25
@@ -77,8 +77,8 @@ def test_small_model_rtn(self):
     def test_small_model_iters1(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         ar = AutoRound(model=model_name, iters=1)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.25
@@ -88,8 +88,8 @@ def test_small_model_iters1(self):
     def test_medium_model_rtn(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         ar = AutoRound(model=model_name, iters=0)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.33
@@ -100,8 +100,8 @@ def test_medium_model_rtn_with_lm_head(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         layer_config = {"lm_head": {"bits": 4}}
         ar = AutoRound(model=model_name, iters=0, layer_config=layer_config)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.33
@@ -113,8 +113,8 @@ def test_fp8_model_gguf(self):
 
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
-        ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0")
-        for file in os.listdir(self.save_dir):
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0")
+        for file in os.listdir(quantized_model_path):
             if file.endswith(".gguf"):
                 gguf_file = file
         llm = Llama(f"saved/{gguf_file}", n_gpu_layers=-1)
@@ -124,8 +124,8 @@ def test_fp8_model_gguf(self):
 
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=1)
-        ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s")
-        for file in os.listdir(self.save_dir):
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s")
+        for file in os.listdir(quantized_model_path):
             if file.endswith(".gguf"):
                 gguf_file = file
         llm = Llama(f"saved/{gguf_file}", n_gpu_layers=-1)
@@ -139,5 +139,5 @@ def test_diff_datatype(self):
             for iters in [0, 1]:
                 print(f"Testing scheme: {scheme}, iters: {iters}")
                 ar = AutoRound(model_name, iters=iters, scheme=scheme)
-                ar.quantize_and_save(output_dir=self.save_dir)
+                _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
                 shutil.rmtree(self.save_dir, ignore_errors=True)
diff --git a/test/test_cuda/quantization/test_asym.py b/test/test_cuda/quantization/test_asym.py
index d55934898..7ac1487df 100644
--- a/test/test_cuda/quantization/test_asym.py
+++ b/test/test_cuda/quantization/test_asym.py
@@ -45,17 +45,17 @@ def test_asym_group_size(self, tiny_opt_model_path):
             ar = AutoRound(
                 tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1
             )
-            ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+            _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
 
             # TODO when ark is ready, uncomment the following lines to do inference test
 
             # model = AutoModelForCausalLM.from_pretrained(
-            #     self.save_dir,
+            #     quantized_model_path,
             #     torch_dtype="auto",
             #     device_map="auto",
             # )
 
-            # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+            # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
             # model_infer(model, tokenizer)
             shutil.rmtree(self.save_dir, ignore_errors=True)
 
@@ -65,17 +65,17 @@ def test_asym_bits(self, tiny_opt_model_path):
             ar = AutoRound(
                 tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1
             )
-            ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+            _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
 
             # TODO when ark is ready, uncomment the following lines to do inference test
 
             # model = AutoModelForCausalLM.from_pretrained(
-            #     self.save_dir,
+            #     quantized_model_path,
             #     torch_dtype="auto",
             #     device_map="auto",
             # )
 
-            # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+            # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
             # model_infer(model, tokenizer)
             shutil.rmtree(self.save_dir, ignore_errors=True)
 
@@ -87,15 +87,15 @@ def test_asym_format(self, tiny_opt_model_path):
                 tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1
             )
             # TODO when ark is ready, uncomment the following lines to do inference test
-            ar.quantize_and_save(format=format, output_dir=self.save_dir)
+            _, quantized_model_path = ar.quantize_and_save(format=format, output_dir=self.save_dir)
 
             # model = AutoModelForCausalLM.from_pretrained(
-            #     self.save_dir,
+            #     quantized_model_path,
             #     torch_dtype="auto",
             #     device_map="auto",
             # )
 
-            # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+            # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
             # model_infer(model, tokenizer)
             shutil.rmtree(self.save_dir, ignore_errors=True)
 
@@ -105,17 +105,17 @@ def test_asym_group_size_with_tuning(self, tiny_opt_model_path):
             ar = AutoRound(
                 tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1
             )
-            ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+            _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
 
             # TODO when ark is ready, uncomment the following lines to do inference test
 
             # model = AutoModelForCausalLM.from_pretrained(
-            #     self.save_dir,
+            #     quantized_model_path,
             #     torch_dtype="auto",
             #     device_map="auto",
             # )
 
-            # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+            # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
             # model_infer(model, tokenizer)
             shutil.rmtree(self.save_dir, ignore_errors=True)
 
@@ -125,17 +125,17 @@ def test_asym_bits_with_tuning(self, tiny_opt_model_path):
             ar = AutoRound(
                 tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1
             )
-            ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+            _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
 
             # TODO when ark is ready, uncomment the following lines to do inference test
 
             # model = AutoModelForCausalLM.from_pretrained(
-            #     self.save_dir,
+            #     quantized_model_path,
             #     torch_dtype="auto",
             #     device_map="auto",
             # )
 
-            # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+            # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
             # model_infer(model, tokenizer)
             shutil.rmtree(self.save_dir, ignore_errors=True)
 
@@ -147,14 +147,14 @@ def test_asym_format_with_tuning(self, tiny_opt_model_path):
                 tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1
             )
             # TODO when ark is ready, uncomment the following lines to do inference test
-            ar.quantize_and_save(format=format, output_dir=self.save_dir)
+            _, quantized_model_path = ar.quantize_and_save(format=format, output_dir=self.save_dir)
 
             # model = AutoModelForCausalLM.from_pretrained(
-            #     self.save_dir,
+            #     quantized_model_path,
             #     torch_dtype="auto",
             #     device_map="auto",
             # )
 
-            # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+            # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
             # model_infer(model, tokenizer)
             shutil.rmtree(self.save_dir, ignore_errors=True)
diff --git a/test/test_cuda/schemes/test_auto_scheme.py b/test/test_cuda/schemes/test_auto_scheme.py
index b91c98428..e15137533 100644
--- a/test/test_cuda/schemes/test_auto_scheme.py
+++ b/test/test_cuda/schemes/test_auto_scheme.py
@@ -242,8 +242,8 @@ def test_auto_scheme_export(self, tiny_qwen_model_path):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16"))
         ar = AutoRound(model=model_name, scheme=scheme)
-        ar.quantize_and_save(self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.25
@@ -251,15 +251,15 @@ def test_auto_scheme_export(self, tiny_qwen_model_path):
 
         scheme = AutoScheme(avg_bits=3, options=("gguf:q2_k_s,gguf:q4_k_s"), nsamples=1, ignore_scale_zp_bits=True)
         ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1)
-        ar.quantize_and_save(self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(self.save_dir)
         shutil.rmtree(self.save_dir, ignore_errors=True)
 
     def test_enable_torch_compile(self):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True)
         ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True)
-        ar.quantize_and_save(self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.10
diff --git a/test/test_cuda/utils/test_alg_ext.py b/test/test_cuda/utils/test_alg_ext.py
index a29bffdac..2708574b9 100644
--- a/test/test_cuda/utils/test_alg_ext.py
+++ b/test/test_cuda/utils/test_alg_ext.py
@@ -33,13 +33,13 @@ def setup_and_teardown_class(self):
     def test_2bits(self):
         model_name = get_model_path("facebook/opt-125m")
         ar = AutoRound(model=model_name, bits=2, group_size=64, enable_alg_ext=True)
-        ar.quantize_and_save(self.save_folder)
+        _, quantized_model_path = ar.quantize_and_save(self.save_folder)
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder,
+            quantized_model_path,
             device_map="auto",
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
         # wo alg ext 0.2078, with 0.2371

From ad8b046e966601b444081f899efe92e2a2a12b69 Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Thu, 29 Jan 2026 16:34:42 +0800
Subject: [PATCH 12/14] fix: call _post_init in quantize_block; fix test cleanup paths and tuple-return unpacking

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 auto_round/compressors/base.py               | 1 +
 test/test_cpu/export/test_gguf_format.py     | 4 ++--
 test/test_cpu/quantization/test_mxfp_nvfp.py | 6 ++++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 890124f39..d1bbc478b 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -2742,6 +2742,7 @@ def quantize_block(
         """
 
         # TODO: release below assertion after supporting MLLM and diffusion model quantization with quantize_block
+        self._post_init()
         assert self.__class__.__name__ not in [
             "DiffusionCompressor",
             "MLLMCompressor",
diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py
index 636b180e7..f4aea0479 100644
--- a/test/test_cpu/export/test_gguf_format.py
+++ b/test/test_cpu/export/test_gguf_format.py
@@ -175,7 +175,7 @@ def test_q4_k_m(self, dataloader):
         assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3
         assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8
         assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0"
-        shutil.rmtree(quantized_model_path, ignore_errors=True)
+        shutil.rmtree("./saved", ignore_errors=True)
 
         model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
         autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False)
@@ -184,7 +184,7 @@ def test_q4_k_m(self, dataloader):
             output_dir=quantized_model_path, format="gguf:q4_k_m,fake"
         )
 
-        shutil.rmtree(quantized_model_path, ignore_errors=True)
+        shutil.rmtree("./saved", ignore_errors=True)
 
     def test_all_format(self, tiny_qwen_model_path):
         model_name = tiny_qwen_model_path
diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py
index e061d5b3b..08fc6c79a 100644
--- a/test/test_cpu/quantization/test_mxfp_nvfp.py
+++ b/test/test_cpu/quantization/test_mxfp_nvfp.py
@@ -333,7 +333,7 @@ def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
         autoround.quantize()
-        compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round")
+        compressed_model, _ = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round")
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (
             hasattr(tmp_layer, "weight_scale")
@@ -364,7 +364,9 @@ def test_qwen_moe_quant_infer(self, tiny_qwen_moe_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, inplace=True, format="auto_round"
+        )
         assert is_model_outputs_similar(model_name, quantized_model_path)
         shutil.rmtree(self.save_dir, ignore_errors=True)
 

From c67dc41688c433c839c1432f975ccd4618414890 Mon Sep 17 00:00:00 2001
From: n1ck-guo <heng.guo@intel.com>
Date: Thu, 29 Jan 2026 17:00:46 +0800
Subject: [PATCH 13/14] fix: generate auto-scheme layer_config in configure_layer_config; drop unused scheme_keys

Signed-off-by: n1ck-guo <heng.guo@intel.com>
---
 auto_round/compressors/base.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index d1bbc478b..1d2f4a645 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -730,8 +730,6 @@ def _parse_and_set(scheme):
             scheme = _parse_and_set(scheme)
             is_auto_scheme = False
 
-        scheme_keys = [f.name for f in fields(QuantizationScheme)]
-
         return scheme, is_auto_scheme
 
     def _adjust_torch_compile(self, enable_torch_compile: bool) -> None:
@@ -1572,6 +1570,9 @@ def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True)
                 else:
                     self.ignore_layers += "," + tmp_str
 
+        if self.is_auto_scheme:
+            self.layer_config = self._gen_auto_scheme(self.model, self.orig_scheme, self.dataset, self.device_map)
+
         fill_default_value = True
         if self.is_auto_scheme:
             fill_default_value = False

From b54e9342293826aa62fdc001960729e19198ef74 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 30 Jan 2026 02:35:56 +0000
Subject: [PATCH 14/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/compressors/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 539062d2c..97e9b498e 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -302,7 +302,7 @@ def __init__(
         self.shared_cache_keys = get_shared_keys(self.model)
 
         self.layer_config = layer_config
-        
+
         self.supported_types = SUPPORTED_LAYER_TYPES
         self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES
         self.quant_lm_head = kwargs.pop("quant_lm_head", False)
@@ -340,7 +340,7 @@ def __init__(
         if envs.AR_USE_MODELSCOPE:
             platform = "model_scope"
         self.platform = platform
-        
+
         self.ignore_layers = kwargs.pop("ignore_layers", "")
 
         self.low_cpu_mem_usage = low_cpu_mem_usage