diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 5b9616f65..d00a27a22 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -693,33 +693,7 @@ def tune(args): trust_remote_code=not args.disable_trust_remote_code, ) - model_name = args.model.rstrip("/") - - if model_name.split("/")[-1].strip(".") == "" and "gguf" not in args.format: - if autoround.group_size <= 0: - if "fp" in autoround.act_data_type: - suffix = f"afp{autoround.act_bits}" - else: - suffix = f"a{autoround.act_bits}" - else: - suffix = f"g{autoround.group_size}" - export_dir = os.path.join(args.output_dir, f"w{autoround.bits}{suffix}") - elif model_name.split("/")[-1].strip(".") == "" and "gguf" in args.format: - export_dir = args.output_dir - elif model_name.split("./")[-1].strip("./") != "" and "gguf" in args.format: - export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + "-gguf") - else: - if autoround.group_size <= 0: - if "fp" in autoround.act_data_type: - suffix = f"afp{autoround.act_bits}" - else: - suffix = f"a{autoround.act_bits}" - else: - suffix = f"g{autoround.group_size}" - export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + f"-w{autoround.bits}{suffix}") - - # ======================= Quantize and save model ======================= - model, folders = autoround.quantize_and_save(export_dir, format=args.format) # pylint: disable=E1101 + model, folders = autoround.quantize_and_save(args.output_dir, format=args.format) # pylint: disable=E1101 tokenizer = autoround.tokenizer # pylint: disable=E1101 model.eval() diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 57e771478..97e9b498e 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -307,20 +307,18 @@ def __init__( self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES self.quant_lm_head = kwargs.pop("quant_lm_head", False) - # should be set after loading model and set layer_config, cause some special scheme need these. - # Preserve the original, unparsed scheme for later use in auto scheme generation - # within `configure_layer_config` (which may need the raw value instead of `self.scheme`). + self.scheme = scheme self.orig_scheme = copy.deepcopy(scheme) - self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(scheme, kwargs) - - gguf_scheme_name = get_gguf_scheme(self.scheme) - # GGUF uses fp32 scale dtype as default - scale_dtype = kwargs.pop("scale_dtype", None) - if scale_dtype is None: - scale_dtype = "fp32" if gguf_scheme_name else "fp16" + self.is_auto_scheme = True if isinstance(scheme, AutoScheme) else False + self.scale_dtype = kwargs.pop("scale_dtype", None) # Extra/legacy kwargs for backward compatibility # Major version releases may pack them with extra configuration options + scheme_keys = [f.name for f in fields(QuantizationScheme)] + for key in scheme_keys: + if key in kwargs and kwargs[key] is not None: + setattr(self, key, kwargs.pop(key)) + amp = kwargs.pop("amp", True) lr = kwargs.pop("lr", None) enable_minmax_tuning = kwargs.pop("enable_minmax_tuning", True) @@ -344,10 +342,9 @@ def __init__( self.platform = platform self.ignore_layers = kwargs.pop("ignore_layers", "") - self.supported_types = SUPPORTED_LAYER_TYPES - self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES - self.scale_dtype = convert_dtype_str2torch(scale_dtype) + self.low_cpu_mem_usage = low_cpu_mem_usage + self.block_forward = block_forward if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. 
Please check them.") @@ -379,16 +376,10 @@ def __init__( self.device_map = device_map if isinstance(self.device_map, str): self.device_map = self.device_map.replace(" ", "") - - self.device_list = parse_available_devices(device_map) - - # Set device, must place after model loading - self.device = get_major_device(device_map) - set_non_auto_device_map(self.model, self.device_map) + self.device = get_major_device(self.device_map) # Tuning hyperparameters self.seed = seed - set_seed(self.seed) self.amp = amp self.enable_quanted_input = enable_quanted_input self.enable_minmax_tuning = enable_minmax_tuning @@ -423,24 +414,7 @@ def __init__( if enable_opt_rtn: disable_opt_rtn = False self.orig_disable_opt_rtn = disable_opt_rtn - - if self.iters != 0 and self.orig_disable_opt_rtn is not None: - logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.") - disable_opt_rtn = True - if ( - self.bits >= 8 - and self.act_bits >= 16 - and self.iters == 0 - and self.data_type == "int" - and disable_opt_rtn is None - ): - logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.") - disable_opt_rtn = True - if disable_opt_rtn is None and self.iters == 0: - logger.info( - "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy." - ) - disable_opt_rtn = False + self.disable_opt_rtn = disable_opt_rtn # Important Note! This is not very robust, do NOT rely on it to do high risky thing self.is_moe_model = is_moe_model(self.model) @@ -451,7 +425,6 @@ def __init__( self.dynamic_max_gap = dynamic_max_gap self.lr_scheduler = lr_scheduler self.optimizer = self._get_optimizer(None) - self.disable_opt_rtn = disable_opt_rtn # Whether to pack the layer immediately after tuning self.is_immediate_packing = False @@ -467,8 +440,78 @@ def __init__( if self.static_attention_dtype is not None: logger.warning("The static attention dtype is experimental and currently has limited support.") - self._set_amp_dtype() self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device + + self.batch_dim = None + self.infer_bs_coeff = 1 + + # after setting iters + self.enable_torch_compile = enable_torch_compile + + self.attention_mask = [] + self.wrapper_block = wrapper_block + + torch.set_printoptions(precision=3, sci_mode=True) + + self._post_inited = False + + def _post_init(self) -> None: + """Post-initialization for AutoRound.""" + if self._post_inited: + return + + # should be set after loading model and set layer_config, cause some special scheme need these. + # Preserve the original, unparsed scheme for later use in auto scheme generation + # within `configure_layer_config` (which may need the raw value instead of `self.scheme`). 
+ self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(self.scheme) + + # GGUF uses fp32 scale dtype as default + if self.scale_dtype is None: + gguf_scheme_name = get_gguf_scheme(self.scheme) + scale_dtype = "fp32" if gguf_scheme_name else "fp16" + else: + scale_dtype = self.scale_dtype + self.scale_dtype = convert_dtype_str2torch(scale_dtype) + + predefined_ignore_layers = get_predefined_ignore_layers(self.model) + + if predefined_ignore_layers: + logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}") + tmp_str = ",".join(predefined_ignore_layers) + if self.ignore_layers == "": + self.ignore_layers = tmp_str + else: + self.ignore_layers += "," + tmp_str + + # Set device, must place after model loading + self._set_device(self.device_map) + set_non_auto_device_map(self.model, self.device_map) + self.device_list = parse_available_devices(self.device_map) + + if self.iters != 0 and self.orig_disable_opt_rtn is not None: + logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.") + self.disable_opt_rtn = True + if ( + self.bits >= 8 + and self.act_bits >= 16 + and self.iters == 0 + and self.data_type == "int" + and self.disable_opt_rtn is None + ): + logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.") + self.disable_opt_rtn = True + if self.disable_opt_rtn is None and self.iters == 0: + logger.info( + "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy." + ) + self.disable_opt_rtn = False + + set_seed(self.seed) + self._set_amp_dtype() + self._adjust_torch_compile(self.enable_torch_compile) + if self.enable_torch_compile: + self.block_forward = compile_func(self.block_forward, self.device) + if self.act_bits <= 8 and self.amp_dtype == torch.float16: logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization") self.amp_dtype = torch.bfloat16 @@ -480,24 +523,16 @@ def __init__( # Some helpers if "hpu" in str(self.device): self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear") - self.batch_dim = None - self.infer_bs_coeff = 1 - # after setting iters - self.enable_torch_compile = enable_torch_compile - self._adjust_torch_compile(enable_torch_compile) - - self.block_forward = compile_func(block_forward, self.device) if self.enable_torch_compile else block_forward self._check_configs() - torch.set_printoptions(precision=3, sci_mode=True) + + if isinstance(self.scheme, AutoScheme): + self.layer_config = self._gen_auto_scheme(self.model, self.orig_scheme, self.dataset, self.device_map) if is_hpex_available(): logger.info("habana_frameworks is available, import htcore explicitly.") import habana_frameworks.torch.core as htcore # pylint: disable=E0401 - self.attention_mask = [] - - self.wrapper_block = wrapper_block if self.enable_alg_ext: try: logger.warning_once("using algorithm extension for quantization.") @@ -506,6 +541,7 @@ def __init__( wrapper_autoround(self) except (ImportError, ModuleNotFoundError): logger.error("algorithm extension import error, fallback to default mode") + self._post_inited = True def _gen_auto_scheme( self, model: torch.nn.Module, scheme: AutoScheme, dataset: str, device_map: Union[str, int, dict, torch.device] @@ -612,18 +648,18 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") def _parse_and_set_scheme( - self, 
scheme: Union[str, dict, QuantizationScheme], kwargs + self, + scheme: Union[str, dict, QuantizationScheme], ) -> tuple[QuantizationScheme, bool]: """Parse and set the quantization scheme.""" - def _parse_and_set(scheme, kwargs): - if kwargs.get("data_type", None) and kwargs["data_type"].endswith("_dq") and not scheme.startswith("gguf"): - if "bits" not in kwargs: - data_type = kwargs["data_type"] + def _parse_and_set(scheme): + if getattr(self, "data_type", None) and self.data_type.endswith("_dq") and not scheme.startswith("gguf"): + if not hasattr(self, "bits") or self.bits is None: raise KeyError( - f"please set bits when setting data_type={data_type}, or using scheme as an alternative." + f"please set bits when setting data_type={self.data_type}, or using scheme as an alternative." ) - bits = kwargs["bits"] + bits = self.bits scheme = f"gguf:q{bits}_k" if bits == 6 else f"gguf:q{bits}_k_s" res = None if isinstance(scheme, QuantizationScheme): @@ -649,11 +685,10 @@ def _parse_and_set(scheme, kwargs): scheme = asdict(preset_name_to_scheme(scheme)) scheme_keys = [f.name for f in fields(QuantizationScheme)] for key in scheme_keys: - if key in kwargs and kwargs[key] is not None: - setattr(self, key, kwargs[key]) + if hasattr(self, key) and getattr(self, key) is not None: + continue else: setattr(self, key, scheme.get(key, None)) - # kwargs.pop(key, None) if self.act_dynamic is None: self.act_dynamic = True @@ -709,7 +744,7 @@ def _parse_and_set(scheme, kwargs): raise ValueError("options of AutoScheme must not be empty") options = [] for option in scheme.options: - new_option = _parse_and_set(option, kwargs) + new_option = _parse_and_set(option) options.append(new_option) scheme.options = options for opt in options: @@ -721,16 +756,12 @@ def _parse_and_set(scheme, kwargs): self.scheme = opt # Choose the first one that not 16 bits break # apply scheme to set default bits - scheme = _parse_and_set(self.scheme, kwargs) + scheme = _parse_and_set(self.scheme) is_auto_scheme = True else: - scheme = _parse_and_set(scheme, kwargs) + scheme = _parse_and_set(scheme) is_auto_scheme = False - scheme_keys = [f.name for f in fields(QuantizationScheme)] - for key in scheme_keys: - kwargs.pop(key, None) - return scheme, is_auto_scheme def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: @@ -888,6 +919,29 @@ def quantize_and_save( Raises: ValueError: If an unsupported format is specified. """ + # post init + self._post_init() + + name_or_path = self.model.name_or_path.rstrip("/") + model_name = name_or_path.split("/")[-1] + if model_name.strip(".") == "" and "gguf" not in format: + if self.group_size <= 0: + suffix = f"afp{self.act_bits}" if "fp" in self.act_data_type else f"a{self.act_bits}" + else: + suffix = f"g{self.group_size}" + export_dir = os.path.join(output_dir, f"w{self.bits}{suffix}") + elif model_name.strip(".") == "" and "gguf" in format: + export_dir = output_dir + elif model_name.strip(".") != "" and "gguf" in format: + export_dir = os.path.join(output_dir, model_name + "-gguf") + else: + if self.group_size <= 0: + suffix = f"afp{self.act_bits}" if "fp" in self.act_data_type else f"a{self.act_bits}" + else: + suffix = f"g{self.group_size}" + export_dir = os.path.join(output_dir, model_name + f"-w{self.bits}{suffix}") + + output_dir = export_dir # Validate and process the specified formats self.orig_output_dir = output_dir @@ -1637,6 +1691,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: Returns: The quantized model and layer configurations. 
""" + # post init + self._post_init() self._check_compatibility() formats = self.formats if hasattr(self, "formats") else None @@ -2719,6 +2775,7 @@ def quantize_block( """ # TODO: release below assertion after supporting MLLM and diffusion model quantization with quantize_block + self._post_init() assert self.__class__.__name__ not in [ "DiffusionCompressor", "MLLMCompressor", @@ -3141,7 +3198,7 @@ def save_quantized( output_dir: str = None, format: Union[str, list[OutputFormat]] = "auto_round", inplace: bool = True, - return_folders=False, + return_folders=True, **kwargs, ) -> torch.nn.Module: """Save the quantized model to the specified output directory in the specified format. @@ -3155,6 +3212,7 @@ def save_quantized( Returns: object: The compressed model object. """ + self.orig_output_dir = output_dir if isinstance(format, str) and getattr(self, "formats", None) is None: formats = get_formats(format, self) @@ -3194,7 +3252,7 @@ def save_quantized( folders.append(save_folder) if return_folders: - return compressed_model, folders + return compressed_model, folders[0] if len(folders) == 1 else folders else: return compressed_model diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index cff49b371..0bc6dbd3a 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -380,7 +380,7 @@ def run_model_evaluation(model, tokenizer, autoround, folders, formats, device_s return # Check if evaluation is needed for language models - eval_folder = folders[-1] if folders else None + eval_folder = folders[-1] if folders and isinstance(folders, list) else folders if args.tasks is None or args.tasks == "" or eval_folder is None: return diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py index 24836d85b..5c426f7c0 100644 --- a/auto_round/wrapper.py +++ b/auto_round/wrapper.py @@ -117,7 +117,7 @@ def __init__( self.enable_round_tuning = enable_round_tuning self.enable_torch_compile = enable_torch_compile self.enable_norm_bias_tuning = enable_norm_bias_tuning and (orig_layer.bias is not None) - self.enable_act_quant = self.orig_layer.act_bits <= 8 + self.enable_act_quant = self.orig_layer.act_bits <= 8 if self.orig_layer.act_bits is not None else False self.weight_global_scale = getattr(self.orig_layer, "weight_global_scale", None) if is_nv_fp(self.orig_layer.data_type) and self.weight_global_scale is None: from auto_round.data_type.nvfp import calculate_gparam diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index c54a57bd3..dd5119334 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -40,7 +40,9 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t else: autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format=format) ##will convert to gptq model + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format=format + ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py index ce4bc0049..6961ddaa3 100644 --- a/test/test_cpu/backends/test_torch_backend.py +++ b/test/test_cpu/backends/test_torch_backend.py @@ -38,14 +38,16 @@ def test_torch_4bits_asym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder 
- autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:gptqmodel" + ) quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) @@ -53,10 +55,10 @@ def test_torch_4bits_asym(self, dataloader): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config + quantized_model_path, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) @@ -79,17 +81,19 @@ def test_torch_4bits_sym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py index 42a7dc83f..db42f5d00 100644 --- a/test/test_cpu/core/test_autoround.py +++ b/test/test_cpu/core/test_autoround.py @@ -14,6 +14,7 @@ class TestAutoRound: + @classmethod def setup_class(self): model_name = opt_name_or_path @@ -382,16 +383,16 @@ def test_rtn(self, tiny_opt_model_path): bits, group_size, sym = 4, 128, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, nsamples=1) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + quantized_model_path, torch_dtype=torch.float16, device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) 
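Editor's note: as in the surrounding backend tests, an exported checkpoint is reloaded with an explicit kernel backend by passing an `AutoRoundConfig` as `quantization_config`. A short sketch, assuming the path below is a folder previously returned by `quantize_and_save`:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRoundConfig  # import path as used by the auto_round package

save_path = "./saved/opt-125m-w4g128"  # placeholder: folder returned by quantize_and_save

# Select the torch backend explicitly, mirroring the tests above.
quantization_config = AutoRoundConfig(backend="torch")
model = AutoModelForCausalLM.from_pretrained(
    save_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(save_path)
```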
- shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_embed_quant(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True @@ -437,7 +438,10 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader): autoround.quantize() quantized_model_path = self.save_folder - autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) + _, quantized_model_path = autoround.save_quantized( + output_dir=quantized_model_path, format="auto_round", inplace=True + ) + quantization_config = AutoRoundConfig(backend="ipex") model = AutoModelForCausalLM.from_pretrained( @@ -447,7 +451,7 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader): text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0]) - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_not_convert_modules(self): import requests @@ -737,8 +741,8 @@ def test_invalid_layer_config(self, tiny_opt_model_path): def test_quant_lm_head(self, tiny_untied_qwen_model_path): model_name = tiny_untied_qwen_model_path ar = AutoRound(model_name, quant_lm_head=True, iters=0, seqlen=8, nsamples=1, disable_opt_rtn=True) - ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 @@ -752,8 +756,8 @@ def test_quant_lm_head(self, tiny_untied_qwen_model_path): disable_opt_rtn=True, layer_config=layer_config, ) - ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 @@ -769,8 +773,8 @@ def test_quant_lm_head_layer_config(self, tiny_untied_qwen_model_path): disable_opt_rtn=True, layer_config=layer_config, ) - ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 diff --git a/test/test_cpu/core/test_init.py b/test/test_cpu/core/test_init.py index 01785d679..46a460dd3 100644 --- a/test/test_cpu/core/test_init.py +++ b/test/test_cpu/core/test_init.py @@ -3,6 +3,8 @@ def test_torch_compile(tiny_opt_model_path): ar = AutoRound(model=tiny_opt_model_path, scheme="NVFP4", enable_torch_compile=True) + ar._post_init() assert not ar.enable_torch_compile, "NVFP4 cannot work with torch.compile." 
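Editor's note: the explicit `ar._post_init()` calls in this test exist because the refactor above moves the order-sensitive setup (scheme parsing, device placement, torch.compile adjustment, ...) out of `__init__` into a one-shot `_post_init()` that every public entry point invokes. A minimal, generic sketch of that pattern; the class and field names are illustrative, not the library's:

```python
class LazyInitDemo:
    """Illustrative only: defer expensive, order-sensitive setup until first use."""

    def __init__(self, scheme: str, enable_torch_compile: bool = False):
        # __init__ only records the raw arguments.
        self.scheme = scheme
        self.enable_torch_compile = enable_torch_compile
        self._post_inited = False

    def _post_init(self) -> None:
        if self._post_inited:  # idempotent, so every entry point may call it
            return
        # Order-sensitive setup goes here; e.g. some schemes cannot use torch.compile,
        # mirroring the NVFP4/FP8_STATIC behavior asserted in this test.
        if self.scheme in ("NVFP4", "FP8_STATIC"):
            self.enable_torch_compile = False
        self._post_inited = True

    def quantize(self):
        self._post_init()  # public APIs trigger post-init before doing work
        return self


demo = LazyInitDemo("NVFP4", enable_torch_compile=True)
demo.quantize()
assert demo.enable_torch_compile is False
```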
ar = AutoRound(model=tiny_opt_model_path, scheme="FP8_STATIC", enable_torch_compile=True) + ar._post_init() assert not ar.enable_torch_compile, "FP8_STATIC cannot work with torch.compile." diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py index dfe339590..512e8443e 100644 --- a/test/test_cpu/export/test_export.py +++ b/test/test_cpu/export/test_export.py @@ -23,6 +23,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRound: + @classmethod def setup_class(self): self.model_name = opt_name_or_path @@ -50,7 +51,7 @@ def test_autogptq_format(self, dataloader): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") if group_size == -1: shutil.rmtree("./saved", ignore_errors=True) @@ -79,7 +80,7 @@ def test_autoround_format(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") if group_size == -1: shutil.rmtree("./saved", ignore_errors=True) @@ -106,7 +107,9 @@ def test_autoround_awq_format(self, dataloader): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) # quantization_config = AutoRoundConfig( # backend="cpu" @@ -220,7 +223,8 @@ def test_static_afp8_export(self, static_kv_dtype): static_kv_dtype=static_kv_dtype, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() @@ -281,7 +285,7 @@ def test_static_afp8_export(self, static_kv_dtype): act_group_size=0, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() @@ -307,7 +311,8 @@ def test_static_fp8_attn(self): static_attention_dtype="fp8", ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() @@ -345,7 +350,10 @@ def test_awq_lmhead_export(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + compressed_model, quantized_model_path = autoround.quantize_and_save( 
+ output_dir=quantized_model_path, format="auto_awq" + ) + lm_head = compressed_model.lm_head from auto_round.export.export_to_awq.utils import WQLinear_GEMM @@ -376,7 +384,10 @@ def test_gptq_lmhead_export(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_gptq" + ) + lm_head = compressed_model.lm_head assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer" quantization_config = AutoRoundConfig() @@ -397,6 +408,7 @@ def test_export_format(self): self.model_name, scheme="FP8_STATIC", ) + autoround._post_init() format_list = get_formats("auto_round, llm_compressor, auto_round:llm_compressor", autoround) assert len(format_list) == 3 assert format_list[0].output_format == "auto_round" @@ -410,6 +422,7 @@ def test_export_format(self): self.model_name, scheme="W4A16", ) + autoround._post_init() format_list = get_formats("auto_round:auto_awq, auto_gptq", autoround) assert format_list[0].output_format == "auto_round" assert format_list[0].get_backend_name() == "auto_round:auto_awq" @@ -426,6 +439,7 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=32, sym=True, ) + ar._post_init() with pytest.raises(ValueError, match="auto_awq format support quantization scheme with W4A16 but got bits=2"): get_formats("auto_round:auto_awq", ar) @@ -439,6 +453,7 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=32, sym=True, ) + ar._post_init() with pytest.raises(ValueError, match="but got data_type=fp, bits=4"): get_formats("auto_round:llm_compressor", ar) @@ -449,4 +464,5 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=256, sym=True, ) + ar._post_init() get_formats("auto_round:auto_awq", ar) diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py index f1980c455..f4aea0479 100644 --- a/test/test_cpu/export/test_gguf_format.py +++ b/test/test_cpu/export/test_gguf_format.py @@ -64,7 +64,10 @@ def test_q4_0(self): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="gguf:q4_0" + ) + gguf_file = os.listdir(quantized_model_path)[0] # TODO: fix the issue of gguf loading error in transformers v5 @@ -77,7 +80,7 @@ def test_q4_0(self): inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_func(self): bits, group_size, sym = 4, 128, True @@ -89,15 +92,18 @@ def test_func(self): # data_type="int" ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_1") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="gguf:q*_1" + ) + assert autoround.group_size == 32 assert not autoround.sym - gguf_file = os.listdir("saved")[0] + gguf_file = os.listdir(quantized_model_path)[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") text = "There 
is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_gguf_baseline(self): model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") @@ -117,12 +123,15 @@ def test_gguf_baseline(self): disable_opt_rtn=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="fake") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="fake" + ) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_q4_k_m(self, dataloader): model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") @@ -152,7 +161,10 @@ def test_q4_k_m(self, dataloader): disable_opt_rtn=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="gguf:q4_k_m,fake" + ) + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq" assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" @@ -168,7 +180,10 @@ def test_q4_k_m(self, dataloader): model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="gguf:q4_k_m,fake" + ) + shutil.rmtree("./saved", ignore_errors=True) def test_all_format(self, tiny_qwen_model_path): @@ -216,15 +231,16 @@ def test_vlm_gguf(self): quant_nontext_module=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - assert "mmproj-model.gguf" in os.listdir("./saved") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") + + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": assert abs(file_size - 56) < 5.0 else: assert abs(file_size - 264) < 5.0 - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) shutil.rmtree(tiny_model_path, ignore_errors=True) def test_vlm_gguf_wo_quant_nontext_module(self): @@ -242,15 +258,16 @@ def test_vlm_gguf_wo_quant_nontext_module(self): quant_nontext_module=False, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - assert "mmproj-model.gguf" in os.listdir("./saved") + _, quantized_model_path = 
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") + + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": assert abs(file_size - 361) < 5.0 else: assert abs(file_size - 264) < 5.0 - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) shutil.rmtree(tiny_model_path, ignore_errors=True) def test_qtype_setting(self): @@ -340,7 +357,8 @@ def test_q2k_mixed(self): disable_opt_rtn=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") + gguf_file = os.listdir(quantized_model_path)[0] file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2 assert abs(file_size - 1362) < 5.0 @@ -352,5 +370,5 @@ def test_q2k_mixed(self): assert gguf_model.get_tensor(10).name == "blk.0.ffn_up_exps.weight" assert gguf_model.get_tensor(10).tensor_type.name == "Q2_K" - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) shutil.rmtree(saved_tiny_model_path, ignore_errors=True) diff --git a/test/test_cpu/integrations/test_llmcompressor.py b/test/test_cpu/integrations/test_llmcompressor.py index c2966cedd..95afce423 100644 --- a/test/test_cpu/integrations/test_llmcompressor.py +++ b/test/test_cpu/integrations/test_llmcompressor.py @@ -11,6 +11,7 @@ class TestLLMC: + @classmethod def setup_class(self): self.model_name = get_model_path("stas/tiny-random-llama-2") @@ -50,7 +51,8 @@ def test_llmcompressor_fp8(self): nsamples=2, iters=0, ) - autoround.quantize_and_save("./saved", format="llm_compressor") + _, quantized_model_path = autoround.quantize_and_save("./saved", format="llm_compressor") + # from vllm import LLM # model = LLM("./saved") # result = model.generate("Hello my name is") @@ -58,7 +60,7 @@ def test_llmcompressor_fp8(self): import json - config = json.load(open("./saved/config.json")) + config = json.load(open(f"{quantized_model_path}/config.json")) assert "group_0" in config["quantization_config"]["config_groups"] assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel" @@ -75,11 +77,11 @@ def test_autoround_llmcompressor_fp8(self): nsamples=2, iters=0, ) - autoround.quantize_and_save("./saved", format="auto_round:llm_compressor") + _, quantized_model_path = autoround.quantize_and_save("./saved", format="auto_round:llm_compressor") import json - config = json.load(open("./saved/config.json")) + config = json.load(open(f"{quantized_model_path}/config.json")) assert "group_0" in config["quantization_config"]["config_groups"] assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor" diff --git a/test/test_cpu/models/test_mllm.py b/test/test_cpu/models/test_mllm.py index 87e140a26..748a6ede0 100644 --- a/test/test_cpu/models/test_mllm.py +++ b/test/test_cpu/models/test_mllm.py @@ -9,6 +9,7 @@ class FakeDataLoader: + def __init__(self): self.batch_size = 1 @@ -26,6 +27,7 @@ def __iter__(self): class 
TestAutoRoundMLLM: + @classmethod def setup_class(self): self.model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") @@ -220,15 +222,17 @@ def test_qwen2_5(self, tiny_qwen_2_5_vl_model_path): processor=processor, image_processor=image_processor, ) - autoround.quantize_and_save("./saved/", format="auto_round") + _, quantized_model_path = autoround.quantize_and_save("./saved/", format="auto_round") import requests from PIL import Image from transformers import AutoProcessor, AutoTokenizer, Qwen2_5_VLForConditionalGeneration - model = Qwen2_5_VLForConditionalGeneration.from_pretrained("./saved", torch_dtype="auto", device_map="auto") + model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + quantized_model_path, torch_dtype="auto", device_map="auto" + ) image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" - processor = AutoProcessor.from_pretrained("./saved") + processor = AutoProcessor.from_pretrained(quantized_model_path) messages = [ { "role": "user", diff --git a/test/test_cpu/models/test_moe_model.py b/test/test_cpu/models/test_moe_model.py index 1125ab56d..c4c69eb28 100644 --- a/test/test_cpu/models/test_moe_model.py +++ b/test/test_cpu/models/test_moe_model.py @@ -74,7 +74,7 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0): ignore_layers="self_attn,router,lm_head,mlp.gate", ) quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) - return quantized_model + return quantized_model, save_folder def count_modules_by_type(model, target_module_name_or_class): @@ -98,7 +98,7 @@ def test_gptoss(setup_gpt_oss, scheme): # Remove it to avoid mismatch during quantized model loading delattr(model.config, "layer_types") - quantized_model = quantize_model(model, tokenizer, output_dir, scheme) + quantized_model, output_dir = quantize_model(model, tokenizer, output_dir, scheme) # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." @@ -136,7 +136,7 @@ def test_llama4(setup_llama4): delattr(model.config.text_config, "moe_layers") delattr(model.config.text_config, "layer_types") - quantized_model = quantize_model(model, tokenizer, output_dir, "MXFP4") + quantized_model, output_dir = quantize_model(model, tokenizer, output_dir, "MXFP4") # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." @@ -162,7 +162,7 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe): iters=1, ignore_layers="self_attn,lm_head,mlp.gate", ) - quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) + quantized_model, output_dir = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) assert quantized_model is not None, "Quantized model should not be None." 
loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir, device_map="cpu") diff --git a/test/test_cpu/quantization/test_act_quantization.py b/test/test_cpu/quantization/test_act_quantization.py index 72739931e..72cc152e3 100644 --- a/test/test_cpu/quantization/test_act_quantization.py +++ b/test/test_cpu/quantization/test_act_quantization.py @@ -113,7 +113,8 @@ def test_act_config_MXFP4_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" not in model.config.quantization_config.extra_config @@ -141,7 +142,8 @@ def test_act_config_NVFP4_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") kproj_config = model.config.quantization_config.extra_config["model.decoder.layers.1.self_attn.k_proj"] assert "act_data_type" in kproj_config.keys() and kproj_config["act_data_type"] == "nv_fp4_with_static_gs" @@ -167,7 +169,8 @@ def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") extra_config = model.config.quantization_config.extra_config diff --git a/test/test_cpu/quantization/test_mix_bits.py b/test/test_cpu/quantization/test_mix_bits.py index 5db3053cb..749219271 100644 --- a/test/test_cpu/quantization/test_mix_bits.py +++ b/test/test_cpu/quantization/test_mix_bits.py @@ -25,6 +25,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRound: + @classmethod def setup_class(self): self.model_name = opt_name_or_path @@ -55,7 +56,8 @@ def test_mixed_gptqmodel(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + # test original GPTQModel inference from gptqmodel import GPTQModel @@ -84,7 +86,8 @@ def test_mixed_gptqmodel_convert_to_ar(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cpu", quantization_config=quantization_config @@ -112,7 +115,10 @@ def test_mixed_autoround_format(self, dataloader): layer_config=layer_config, ) quantized_model_path = "./saved" - compressed_model = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + compressed_model, 
quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 @@ -136,7 +142,8 @@ def test_fallback_regex_for_awq_format(self, dataloader): layer_config=layer_config, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cpu", quantization_config=quantization_config @@ -223,7 +230,10 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="auto_round" + ) + model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype="auto", diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py index 3c2e9bcce..08fc6c79a 100644 --- a/test/test_cpu/quantization/test_mxfp_nvfp.py +++ b/test/test_cpu/quantization/test_mxfp_nvfp.py @@ -23,6 +23,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRoundFP: + @classmethod def setup_class(self): self.save_dir = "./saved" @@ -84,7 +85,10 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): dataset=dataloader, layer_config=layer_config, ) - compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=self.save_dir, inplace=True, format="auto_round" + ) + lm_head = compressed_model.lm_head assert ( hasattr(lm_head, "weight_scale") @@ -93,7 +97,6 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): and lm_head.weight_packed.dtype is torch.uint8 and lm_head.weight_scale.dtype is torch.float8_e4m3fn ), "Illegal NVFP4 packing for lm_head layer" - quantized_model_path = self.save_dir assert is_model_outputs_similar(model_name, quantized_model_path) shutil.rmtree(self.save_dir, ignore_errors=True) @@ -144,7 +147,7 @@ def test_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir autoround.quantize() - compressed_model = autoround.save_quantized( + compressed_model, _ = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj @@ -184,7 +187,7 @@ def test_rtn_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir autoround.quantize() - compressed_model = autoround.save_quantized( + compressed_model, _ = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj @@ -221,7 +224,10 @@ def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = 
autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -258,7 +264,10 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -295,7 +304,10 @@ def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -321,7 +333,7 @@ def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir autoround.quantize() - compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") + compressed_model, _ = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -352,7 +364,9 @@ def test_qwen_moe_quant_infer(self, tiny_qwen_moe_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=True, format="auto_round" + ) assert is_model_outputs_similar(model_name, quantized_model_path) shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cpu/quantization/test_mxfp_save_load.py b/test/test_cpu/quantization/test_mxfp_save_load.py index 396c47735..4fdcaeffc 100644 --- a/test/test_cpu/quantization/test_mxfp_save_load.py +++ b/test/test_cpu/quantization/test_mxfp_save_load.py @@ -60,7 +60,7 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Quantize and save the model to the temporary directory quantized_model_path = f"{temp_dir}/tmp_autoround" - autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cpu/schemes/test_scheme.py b/test/test_cpu/schemes/test_scheme.py index e2b0c15c3..cb4d841ba 100644 --- a/test/test_cpu/schemes/test_scheme.py +++ b/test/test_cpu/schemes/test_scheme.py @@ -33,13 +33,13 @@ def test_gguf(self, tiny_qwen_model_path, dataloader): def test_w4a16(self, tiny_opt_model_path, dataloader): ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - assert ar.bits == 4 
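Editor's note: the scheme assertions in these tests move after `ar.quantize()` (or an explicit `ar._post_init()`) because scheme-derived fields such as `bits` and `data_type` are now resolved during post-initialization rather than in `__init__`. A hedged sketch of the resulting usage; the model path is a placeholder:

```python
from auto_round import AutoRound

ar = AutoRound("facebook/opt-125m", scheme="MXFP4", iters=0, nsamples=1, seqlen=2)

# Fields coming from the scheme are only reliable once _post_init() has run,
# which quantize()/quantize_and_save() trigger internally (ar._post_init() also works).
ar.quantize()
assert ar.bits == 4 and ar.act_bits == 4
assert ar.data_type == "mx_fp" and ar.act_data_type == "mx_fp"
```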
         ar.quantize()
+        assert ar.bits == 4
     def test_w2a16_rtn(self, tiny_opt_model_path, dataloader):
         ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader)
-        assert ar.bits == 2
         ar.quantize()
+        assert ar.bits == 2
     def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader):
@@ -85,26 +85,27 @@ def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader):
     def test_mxfp4(self, tiny_opt_model_path, dataloader):
         ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader)
+        ar.quantize()
         assert ar.bits == 4
         assert ar.act_bits == 4
         assert ar.data_type == "mx_fp"
         assert ar.act_data_type == "mx_fp"
-        ar.quantize()
     def test_vllm(self, tiny_qwen_vl_model_path):
         from auto_round import AutoRoundMLLM
         ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2)
+        ar._post_init()
         assert ar.bits == 2
         assert ar.act_bits == 16
     def test_nvfp4(self, tiny_opt_model_path, dataloader):
         ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader)
+        ar.quantize()
         assert ar.bits == 4
         assert ar.act_bits == 4
         assert ar.data_type == "nv_fp"
         assert ar.act_data_type == "nv_fp4_with_static_gs"
-        ar.quantize()
     def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader):
         import copy
diff --git a/test/test_cpu/utils/test_generation.py b/test/test_cpu/utils/test_generation.py
index 6bf7e1020..245ffcc72 100644
--- a/test/test_cpu/utils/test_generation.py
+++ b/test/test_cpu/utils/test_generation.py
@@ -11,6 +11,7 @@
 class TestAutoRoundFormatGeneration:
+    @classmethod
     def setup_class(self):
         self.model_name = opt_name_or_path
@@ -39,7 +40,9 @@ def test_4bits_sym(self, dataloader):
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round", inplace=False)
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round", inplace=False
+        )
         quantization_config = AutoRoundConfig(backend="ipex")
         model = AutoModelForCausalLM.from_pretrained(
@@ -79,7 +82,7 @@ def test_autoround_sym(self, dataloader):
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", trust_remote_code=True
diff --git a/test/test_cuda/advanced/test_fp8_input.py b/test/test_cuda/advanced/test_fp8_input.py
index de4dfa453..e0f8188c0 100644
--- a/test/test_cuda/advanced/test_fp8_input.py
+++ b/test/test_cuda/advanced/test_fp8_input.py
@@ -39,8 +39,8 @@ def setup_and_teardown_class(self):
     def test_small_model_rtn_generation(self):
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
-        ar.quantize_and_save(output_dir=self.save_dir)
-        model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True)
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
         text = "There is a girl who likes adventure,"
         inputs = tokenizer(text, return_tensors="pt").to(model.device)
@@ -55,7 +55,7 @@ def test_small_model_rtn_generation(self):
     def test_gguf_imatrix(self):
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
-        ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir)
         # from llama_cpp import Llama
         #
         # gguf_file = os.listdir("saved/Qwen3-0.6B-FP8/-gguf")[0]
@@ -72,8 +72,8 @@ def test_gguf_imatrix(self):
     def test_small_model_rtn(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         ar = AutoRound(model=model_name, iters=0)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.25
@@ -83,8 +83,8 @@ def test_small_model_rtn(self):
     def test_small_model_iters1(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         ar = AutoRound(model=model_name, iters=1)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.25
@@ -94,8 +94,8 @@ def test_small_model_iters1(self):
     def test_medium_model_rtn(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         ar = AutoRound(model=model_name, iters=0)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.33
@@ -106,8 +106,8 @@ def test_medium_model_rtn_with_lm_head(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         layer_config = {"lm_head": {"bits": 4}}
         ar = AutoRound(model=model_name, iters=0, layer_config=layer_config)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.33
@@ -124,8 +124,8 @@ def test_fp8_model_gguf(self):
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
-        ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0")
-        for file in os.listdir(self.save_dir):
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0")
+        for file in os.listdir(quantized_model_path):
             if file.endswith(".gguf"):
                 gguf_file = file
         llm = Llama(f"saved/{gguf_file}", n_gpu_layers=-1)
@@ -135,8 +135,8 @@ def test_fp8_model_gguf(self):
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=1)
-        ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s")
-        for file in os.listdir(self.save_dir):
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s")
+        for file in os.listdir(quantized_model_path):
             if file.endswith(".gguf"):
                 gguf_file = file
         llm = Llama(f"saved/{gguf_file}", n_gpu_layers=-1)
@@ -155,5 +155,5 @@ def test_diff_datatype(self):
             for iters in [0, 1]:
                 print(f"Testing scheme: {scheme}, iters: {iters}")
                 ar = AutoRound(model_name, iters=iters, scheme=scheme)
-                ar.quantize_and_save(output_dir=self.save_dir)
+                _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
                 shutil.rmtree(self.save_dir, ignore_errors=True)
diff --git a/test/test_cuda/backends/test_torch_backend.py b/test/test_cuda/backends/test_torch_backend.py
index 28df641a3..16f7dfcae 100644
--- a/test/test_cuda/backends/test_torch_backend.py
+++ b/test/test_cuda/backends/test_torch_backend.py
@@ -44,7 +44,9 @@ def test_torch_4bits_asym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:gptqmodel"
+        )
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
@@ -86,7 +88,9 @@ def test_torch_4bits_sym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")  ##will convert to gptq model
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round"
+        )  ##will convert to gptq model
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cuda/backends/test_triton_backend.py b/test/test_cuda/backends/test_triton_backend.py
index fb67ad049..b99e68ef8 100644
--- a/test/test_cuda/backends/test_triton_backend.py
+++ b/test/test_cuda/backends/test_triton_backend.py
@@ -12,6 +12,7 @@
 class TestAutoRoundTritonBackend:
+    @classmethod
     def setup_class(self):
         self.model_name = "/models/opt-125m"
@@ -38,14 +39,16 @@ def test_tritonv2_4bits_asym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:gptqmodel"
+        )
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -53,10 +56,10 @@ def test_tritonv2_4bits_asym(self, dataloader):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -71,14 +74,14 @@ def test_tritonv2_2bits_asym(self):
         bits, group_size, sym = 2, 32, False
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -86,10 +89,10 @@ def test_tritonv2_2bits_asym(self):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -113,14 +116,14 @@ def test_tritonv2_4bits_sym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
@@ -128,10 +131,10 @@ def test_tritonv2_4bits_sym(self, dataloader):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
@@ -147,14 +150,14 @@ def test_tritonv2_8bits_sym(self):
         bits, group_size, sym = 4, 256, True
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, nsamples=1, iters=1)
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -162,10 +165,10 @@ def test_tritonv2_8bits_sym(self):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
@@ -186,14 +189,14 @@ def test_tritonv2_2bits_sym(self):
             sym=sym,
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -201,10 +204,10 @@ def test_tritonv2_2bits_sym(self):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
diff --git a/test/test_cuda/export/test_auto_round_format.py b/test/test_cuda/export/test_auto_round_format.py
index 945a3d653..ba94ebda9 100644
--- a/test/test_cuda/export/test_auto_round_format.py
+++ b/test/test_cuda/export/test_auto_round_format.py
@@ -52,7 +52,7 @@ def test_autoround_asym(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="cuda:0", trust_remote_code=True
@@ -79,13 +79,14 @@ def test_mixed_precision(self):
         bits, group_size, sym = 4, 128, True
         autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config)
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+
         quantization_config = AutoRoundConfig(backend="auto")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -105,14 +106,16 @@ def test_awq_backend(self):
             sym=sym,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:auto_awq"
+        )
         quantization_config = AutoRoundConfig(backend="auto")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -120,12 +123,12 @@ def test_awq_backend(self):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
-        shutil.rmtree(self.save_dir, ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
     @require_greater_than_050
     def test_tritonv2_bf16(self):
@@ -154,7 +157,7 @@ def test_autoround_gptq_sym_format(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         from transformers import AutoRoundConfig
@@ -206,7 +209,9 @@ def test_autoround_awq_sym_format(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:auto_awq"
+        )
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
@@ -243,7 +248,7 @@ def test_autoround_sym(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", trust_remote_code=True
diff --git a/test/test_cuda/export/test_export.py b/test/test_cuda/export/test_export.py
index 25f29a4c8..39e0758bb 100644
--- a/test/test_cuda/export/test_export.py
+++ b/test/test_cuda/export/test_export.py
@@ -117,7 +117,7 @@ def test_autogptq_format_qsave_ignore_layers(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
@@ -219,8 +219,9 @@ def test_autoawq_format_fp_qsave_layers(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "./saved/test_export"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
-        from transformers import AutoRoundConfig
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+
+        from auto_round import AutoRoundConfig
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", quantization_config=AutoRoundConfig()
@@ -319,7 +320,10 @@ def test_awq_lmhead_export(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_awq"
+        )
+
         lm_head = compressed_model.lm_head
         from auto_round.export.export_to_awq.utils import WQLinear_GEMM
@@ -350,7 +354,10 @@ def test_gptq_lmhead_export(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_gptq"
+        )
+
         lm_head = compressed_model.lm_head
         assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer"
         quantization_config = AutoRoundConfig()
diff --git a/test/test_cuda/export/test_gguf.py b/test/test_cuda/export/test_gguf.py
index e71af74c4..9bf11c242 100644
--- a/test/test_cuda/export/test_gguf.py
+++ b/test/test_cuda/export/test_gguf.py
@@ -160,7 +160,8 @@ def test_special_model(self):
            disable_opt_rtn=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
+
         file_name = os.listdir(quantized_model_path)[0]
         file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2
         assert abs(file_size - 307) < 5.0
@@ -184,11 +185,12 @@ def test_vlm_gguf(self):
             quant_nontext_module=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
-        assert "mmproj-model.gguf" in os.listdir("./saved")
-        for file in os.listdir("./saved"):
-            print(f"{file}: {os.path.getsize(os.path.join('./saved', file)) / 1024**2} MB")
-            file_size = os.path.getsize(os.path.join("./saved", file)) / 1024**2
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
+
+        assert "mmproj-model.gguf" in os.listdir(quantized_model_path)
+        for file in os.listdir(quantized_model_path):
+            print(f"{file}: {os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2} MB")
+            file_size = os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2
             if "mmproj-model.gguf" in file:
                 assert abs(file_size - 75) < 5.0
             else:
diff --git a/test/test_cuda/integrations/test_transformers.py b/test/test_cuda/integrations/test_transformers.py
index 9544a20a3..449a9245c 100644
--- a/test/test_cuda/integrations/test_transformers.py
+++ b/test/test_cuda/integrations/test_transformers.py
@@ -205,8 +205,11 @@ def test_mixed_bits(self):
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config)
         with tempfile.TemporaryDirectory() as tmpdirname:
-            autoround.quantize_and_save(output_dir=tmpdirname)
-            model = AutoModelForCausalLM.from_pretrained(tmpdirname, torch_dtype=torch.float16, device_map="cuda")
+            _, quantized_model_path = autoround.quantize_and_save(output_dir=tmpdirname)
+
+            model = AutoModelForCausalLM.from_pretrained(
+                quantized_model_path, torch_dtype=torch.float16, device_map="cuda"
+            )
             text = "There is a girl who likes adventure,"
             inputs = tokenizer(text, return_tensors="pt").to(model.device)
             tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0])
diff --git a/test/test_cuda/models/test_moe_model.py b/test/test_cuda/models/test_moe_model.py
index 40c545015..c15acc843 100644
--- a/test/test_cuda/models/test_moe_model.py
+++ b/test/test_cuda/models/test_moe_model.py
@@ -186,7 +186,8 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe):
         iters=1,
         ignore_layers="self_attn,lm_head,mlp.gate",
     )
-    quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
+    quantized_model, output_dir = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
+    output_dir = output_dir[0]
     assert quantized_model is not None, "Quantized model should not be None."
     loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir)
     loaded_model.to("cuda")
diff --git a/test/test_cuda/quantization/test_2_3bits.py b/test/test_cuda/quantization/test_2_3bits.py
index c03d5e593..1f2e5e812 100644
--- a/test/test_cuda/quantization/test_2_3bits.py
+++ b/test/test_cuda/quantization/test_2_3bits.py
@@ -47,7 +47,9 @@ def test_3bits_autoround(self):
         model_name = get_model_path("facebook/opt-125m")
         autoround = AutoRound(model_name, bits=3)
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")  ##will convert to gptq model
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round"
+        )  ##will convert to gptq model
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
@@ -65,8 +67,9 @@ def test_3bits_asym_autoround(self):
         model_name = get_model_path("facebook/opt-125m")
         bits, sym = 3, False
         autoround = AutoRound(model_name, bits=bits, sym=sym)
-        autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False)
+
+        model_args = f"pretrained={quantized_model_path}"
         res = simple_evaluate(
             model="hf",
             model_args=model_args,
diff --git a/test/test_cuda/quantization/test_asym.py b/test/test_cuda/quantization/test_asym.py
index d55934898..7ac1487df 100644
--- a/test/test_cuda/quantization/test_asym.py
+++ b/test/test_cuda/quantization/test_asym.py
@@ -45,17 +45,17 @@ def test_asym_group_size(self, tiny_opt_model_path):
         ar = AutoRound(
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1
         )
-        ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
         # TODO when ark is ready, uncomment the following lines to do inference test
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
         #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
@@ -65,17 +65,17 @@ def test_asym_bits(self, tiny_opt_model_path):
         ar = AutoRound(
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1
         )
-        ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
         # TODO when ark is ready, uncomment the following lines to do inference test
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
         #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
@@ -87,15 +87,15 @@ def test_asym_format(self, tiny_opt_model_path):
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1
         )
         # TODO when ark is ready, uncomment the following lines to do inference test
-        ar.quantize_and_save(format=format, output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format=format, output_dir=self.save_dir)
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
         #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
@@ -105,17 +105,17 @@ def test_asym_group_size_with_tuning(self, tiny_opt_model_path):
         ar = AutoRound(
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1
         )
-        ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
         # TODO when ark is ready, uncomment the following lines to do inference test
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
         #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
@@ -125,17 +125,17 @@ def test_asym_bits_with_tuning(self, tiny_opt_model_path):
         ar = AutoRound(
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1
         )
-        ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
         # TODO when ark is ready, uncomment the following lines to do inference test
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
        #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
@@ -147,14 +147,14 @@ def test_asym_format_with_tuning(self, tiny_opt_model_path):
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1
         )
         # TODO when ark is ready, uncomment the following lines to do inference test
-        ar.quantize_and_save(format=format, output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format=format, output_dir=self.save_dir)
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
         #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
diff --git a/test/test_cuda/quantization/test_mix_bits.py b/test/test_cuda/quantization/test_mix_bits.py
index 9daa9b727..c6164f4b5 100644
--- a/test/test_cuda/quantization/test_mix_bits.py
+++ b/test/test_cuda/quantization/test_mix_bits.py
@@ -52,7 +52,8 @@ def test_mixed_gptqmodel(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+
         from gptqmodel import GPTQModel
         model = GPTQModel.load(quantized_model_path)
@@ -79,7 +80,8 @@ def test_mixed_gptqmodel_convert_to_ar(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", quantization_config=quantization_config
@@ -107,7 +109,8 @@ def test_mixed_autoround_format(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "self.save_dir"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
         assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8
         assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3
@@ -133,7 +136,8 @@ def test_fallback_regex_for_awq_format(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "self.save_dir"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", quantization_config=quantization_config
@@ -221,7 +225,10 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, inplace=False, format="auto_round"
+        )
+
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path,
             torch_dtype="auto",
@@ -295,9 +302,10 @@ def test_mixed_llmcompressor_format_vllm(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        compressed, _ = autoround.quantize_and_save(
+        compressed, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, inplace=False, format="llm_compressor"
         )
+
         from vllm import LLM, SamplingParams
         # Sample prompts.
diff --git a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
index d76b556e3..0c1cbf007 100644
--- a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
+++ b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
@@ -52,7 +52,7 @@ def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path):
         # Quantize and save the model to the temporary directory
         quantized_model_path = f"{temp_dir}/tmp_autoround_{scheme}"
-        autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path)
         # Perform inference with the quantized model
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cuda/quantization/test_mxfp_nvfp.py b/test/test_cuda/quantization/test_mxfp_nvfp.py
index e2ca5b359..58d126929 100644
--- a/test/test_cuda/quantization/test_mxfp_nvfp.py
+++ b/test/test_cuda/quantization/test_mxfp_nvfp.py
@@ -76,7 +76,10 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="llm_compressor"
+        )
+
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (
             hasattr(tmp_layer, "weight_scale")
@@ -162,7 +165,10 @@ def test_qwen_moe_quant_infer(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, inplace=False, format="auto_round"
+        )
+
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto")
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         from auto_round.eval.evaluation import simple_evaluate_user_model
diff --git a/test/test_cuda/schemes/test_auto_scheme.py b/test/test_cuda/schemes/test_auto_scheme.py
index b91c98428..e15137533 100644
--- a/test/test_cuda/schemes/test_auto_scheme.py
+++ b/test/test_cuda/schemes/test_auto_scheme.py
@@ -242,8 +242,8 @@ def test_auto_scheme_export(self, tiny_qwen_model_path):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16"))
         ar = AutoRound(model=model_name, scheme=scheme)
-        ar.quantize_and_save(self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.25
@@ -251,15 +251,15 @@
         scheme = AutoScheme(avg_bits=3, options=("gguf:q2_k_s,gguf:q4_k_s"), nsamples=1, ignore_scale_zp_bits=True)
         ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1)
-        ar.quantize_and_save(self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(self.save_dir)
         shutil.rmtree(self.save_dir, ignore_errors=True)
     def test_enable_torch_compile(self):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True)
         ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True)
-        ar.quantize_and_save(self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.10
diff --git a/test/test_cuda/schemes/test_scheme.py b/test/test_cuda/schemes/test_scheme.py
index 255fdc80e..b6243501e 100644
--- a/test/test_cuda/schemes/test_scheme.py
+++ b/test/test_cuda/schemes/test_scheme.py
@@ -124,7 +124,8 @@ def test_q2k_mixed(self):
             disable_opt_rtn=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed")
+
         gguf_file = os.listdir(quantized_model_path)[0]
         file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2
         assert abs(file_size - 1236) < 5.0
diff --git a/test/test_cuda/utils/test_alg_ext.py b/test/test_cuda/utils/test_alg_ext.py
index a29bffdac..2708574b9 100644
--- a/test/test_cuda/utils/test_alg_ext.py
+++ b/test/test_cuda/utils/test_alg_ext.py
@@ -33,13 +33,13 @@ def setup_and_teardown_class(self):
     def test_2bits(self):
         model_name = get_model_path("facebook/opt-125m")
         ar = AutoRound(model=model_name, bits=2, group_size=64, enable_alg_ext=True)
-        ar.quantize_and_save(self.save_folder)
+        _, quantized_model_path = ar.quantize_and_save(self.save_folder)
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder,
+            quantized_model_path,
             device_map="auto",
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])  # wo alg ext 0.2078, with 0.2371
diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py
index fb0dbe1ae..30627f50a 100644
--- a/test/test_xpu/test_autoround.py
+++ b/test/test_xpu/test_autoround.py
@@ -12,6 +12,7 @@
 class TestAutoRoundXPU:
+    @classmethod
     def setup_class(self):
         self.device = "xpu"
@@ -42,7 +43,7 @@ def test_gptq_format(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         quantization_config = AutoRoundConfig(backend="auto")
@@ -74,7 +75,9 @@ def test_awq_format(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:auto_awq"
+        )
         quantization_config = AutoRoundConfig(backend="auto")
         # device_map="auto" doesn't work, must use "xpu"