From a84a4b67830ba18ccb76cff2c41df71b946f4f81 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 26 Jan 2026 11:04:15 +0800 Subject: [PATCH 01/14] refactor init of compressor Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 90 +++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 3232c4470..345ca0eea 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -280,15 +280,8 @@ def __init__( self.shared_cache_keys = get_shared_keys(self.model) self.layer_config = layer_config - - # should be set after loading model and set layer_config, cause some special scheme need these. - self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(scheme, kwargs) - - gguf_scheme_name = get_gguf_scheme(self.scheme) - # GGUF uses fp32 scale dtype as default - scale_dtype = kwargs.pop("scale_dtype", None) - if scale_dtype is None: - scale_dtype = "fp32" if gguf_scheme_name else "fp16" + self.scheme = scheme + self.scale_dtype = kwargs.pop("scale_dtype", None) # Extra/legacy kwargs for backward compatibility # Major version releases may pack them with extra configuration options @@ -314,21 +307,12 @@ def __init__( platform = "model_scope" self.platform = platform self.quant_lm_head = kwargs.pop("quant_lm_head", False) - self.ignore_layers = kwargs.pop("ignore_layers", "") - predefined_ignore_layers = get_predefined_ignore_layers(self.model) - if predefined_ignore_layers: - logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}") - tmp_str = ",".join(predefined_ignore_layers) - if self.ignore_layers == "": - self.ignore_layers = tmp_str - else: - self.ignore_layers += "," + tmp_str self.supported_types = SUPPORTED_LAYER_TYPES self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES - self.scale_dtype = convert_dtype_str2torch(scale_dtype) self.low_cpu_mem_usage = low_cpu_mem_usage + self.block_forward = block_forward if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.") @@ -360,16 +344,10 @@ def __init__( self.device_map = device_map if isinstance(self.device_map, str): self.device_map = self.device_map.replace(" ", "") - - self.device_list = parse_available_devices(device_map) - - # Set device, must place after model loading - self.device = get_major_device(device_map) - set_non_auto_device_map(self.model, self.device_map) + self.device = get_major_device(self.device_map) # Tuning hyperparameters self.seed = seed - set_seed(self.seed) self.amp = amp self.enable_quanted_input = enable_quanted_input self.enable_minmax_tuning = enable_minmax_tuning @@ -448,7 +426,6 @@ def __init__( if self.static_attention_dtype is not None: logger.warning("The static attention dtype is experimental and currently has limited support.") - self._set_amp_dtype() self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device if self.act_bits <= 8 and self.amp_dtype == torch.float16: logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization") @@ -466,23 +443,18 @@ def __init__( # after setting iters self.enable_torch_compile = enable_torch_compile - self._adjust_torch_compile(enable_torch_compile) - self.block_forward = compile_func(block_forward, self.device) if self.enable_torch_compile else block_forward + self.attention_mask = [] + self.wrapper_block = wrapper_block + self._check_configs() torch.set_printoptions(precision=3, sci_mode=True) - if isinstance(scheme, AutoScheme): - self.layer_config = self._gen_auto_scheme(model, scheme, dataset, self.device_map) - if is_hpex_available(): logger.info("habana_frameworks is available, import htcore explicitly.") import habana_frameworks.torch.core as htcore # pylint: disable=E0401 import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401] - self.attention_mask = [] - - self.wrapper_block = wrapper_block if self.enable_alg_ext: try: logger.warning_once("using algorithm extension for quantization.") @@ -492,6 +464,48 @@ def __init__( except (ImportError, ModuleNotFoundError): logger.error("algorithm extension import error, fallback to default mode") + self._post_inited = False + + def _post_init(self) -> None: + """Post-initialization for AutoRound.""" + if self._post_inited: + return + + # should be set after loading model and set layer_config, cause some special scheme need these. + self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(self.scheme, {}) + + # GGUF uses fp32 scale dtype as default + if self.scale_dtype is None: + gguf_scheme_name = get_gguf_scheme(self.scheme) + scale_dtype = "fp32" if gguf_scheme_name else "fp16" + self.scale_dtype = convert_dtype_str2torch(scale_dtype) + + predefined_ignore_layers = get_predefined_ignore_layers(self.model) + + if predefined_ignore_layers: + logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}") + tmp_str = ",".join(predefined_ignore_layers) + if self.ignore_layers == "": + self.ignore_layers = tmp_str + else: + self.ignore_layers += "," + tmp_str + + # Set device, must place after model loading + self._set_device(self.device_map) + set_non_auto_device_map(self.model, self.device_map) + self.device_list = parse_available_devices(self.device_map) + + set_seed(self.seed) + self._set_amp_dtype() + self._adjust_torch_compile(self.enable_torch_compile) + if self.enable_torch_compile: + self.block_forward = compile_func(self.block_forward, self.device) + + if isinstance(self.scheme, AutoScheme): + self.layer_config = self._gen_auto_scheme(self.model, self.scheme, self.dataset, self.device_map) + + self._post_inited = True + def _gen_auto_scheme( self, model: torch.nn.Module, scheme: AutoScheme, dataset: str, device_map: Union[str, int, dict, torch.device] ) -> dict[str, dict]: @@ -865,6 +879,9 @@ def quantize_and_save( Raises: ValueError: If an unsupported format is specified. """ + # post init + self._post_init() + # Validate and process the specified formats self.orig_output_dir = output_dir @@ -3118,6 +3135,9 @@ def save_quantized( Returns: object: The compressed model object. """ + # post init + self._post_init() + self.orig_output_dir = output_dir if isinstance(format, str) and getattr(self, "formats", None) is None: formats = get_formats(format, self) From 1b5749de7ab5ff2fb0f146c16940dfea5f91c9cd Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 26 Jan 2026 14:38:36 +0800 Subject: [PATCH 02/14] fix Signed-off-by: n1ck-guo --- auto_round/__main__.py | 27 +------- auto_round/compressors/base.py | 115 ++++++++++++++++++++------------- 2 files changed, 72 insertions(+), 70 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 1540d254d..2ffee486d 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -678,32 +678,7 @@ def tune(args): trust_remote_code=not args.disable_trust_remote_code, ) - model_name = args.model.rstrip("/") - - if model_name.split("/")[-1].strip(".") == "" and "gguf" not in args.format: - if autoround.group_size <= 0: - if "fp" in autoround.act_data_type: - suffix = f"afp{autoround.act_bits}" - else: - suffix = f"a{autoround.act_bits}" - else: - suffix = f"g{autoround.group_size}" - export_dir = os.path.join(args.output_dir, f"w{autoround.bits}{suffix}") - elif model_name.split("/")[-1].strip(".") == "" and "gguf" in args.format: - export_dir = args.output_dir - elif model_name.split("./")[-1].strip("./") != "" and "gguf" in args.format: - export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + "-gguf") - else: - if autoround.group_size <= 0: - if "fp" in autoround.act_data_type: - suffix = f"afp{autoround.act_bits}" - else: - suffix = f"a{autoround.act_bits}" - else: - suffix = f"g{autoround.group_size}" - export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + f"-w{autoround.bits}{suffix}") - - model, folders = autoround.quantize_and_save(export_dir, format=args.format) # pylint: disable=E1101 + model, folders = autoround.quantize_and_save(args.output_dir, format=args.format) # pylint: disable=E1101 tokenizer = autoround.tokenizer # pylint: disable=E1101 model.eval() diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 345ca0eea..f4ac257d8 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -382,24 +382,7 @@ def __init__( if enable_opt_rtn: disable_opt_rtn = False self.orig_disable_opt_rtn = disable_opt_rtn - - if self.iters != 0 and self.orig_disable_opt_rtn is not None: - logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.") - disable_opt_rtn = True - if ( - self.bits >= 8 - and self.act_bits >= 16 - and self.iters == 0 - and self.data_type == "int" - and disable_opt_rtn is None - ): - logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.") - disable_opt_rtn = True - if disable_opt_rtn is None and self.iters == 0: - logger.info( - "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy." - ) - disable_opt_rtn = False + self.disable_opt_rtn = disable_opt_rtn # Important Note! This is not very robust, do NOT rely on it to do high risky thing self.is_moe_model = is_moe_model(self.model) @@ -410,7 +393,6 @@ def __init__( self.dynamic_max_gap = dynamic_max_gap self.lr_scheduler = lr_scheduler self.optimizer = self._get_optimizer(None) - self.disable_opt_rtn = disable_opt_rtn # Whether to pack the layer immediately after tuning self.is_immediate_packing = False @@ -427,17 +409,7 @@ def __init__( logger.warning("The static attention dtype is experimental and currently has limited support.") self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device - if self.act_bits <= 8 and self.amp_dtype == torch.float16: - logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization") - self.amp_dtype = torch.bfloat16 - if self.model.dtype != torch.bfloat16: # keep the model's buffer dtype unchanged - self.model = self.model.to(torch.bfloat16) - else: - logger.info(f"using {self.model.dtype} for quantization tuning") - # Some helpers - if "hpu" in str(self.device): - self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear") self.batch_dim = None self.infer_bs_coeff = 1 @@ -447,23 +419,8 @@ def __init__( self.attention_mask = [] self.wrapper_block = wrapper_block - self._check_configs() torch.set_printoptions(precision=3, sci_mode=True) - if is_hpex_available(): - logger.info("habana_frameworks is available, import htcore explicitly.") - import habana_frameworks.torch.core as htcore # pylint: disable=E0401 - import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401] - - if self.enable_alg_ext: - try: - logger.warning_once("using algorithm extension for quantization.") - from auto_round.alg_ext import wrapper_autoround - - wrapper_autoround(self) - except (ImportError, ModuleNotFoundError): - logger.error("algorithm extension import error, fallback to default mode") - self._post_inited = False def _post_init(self) -> None: @@ -495,15 +452,60 @@ def _post_init(self) -> None: set_non_auto_device_map(self.model, self.device_map) self.device_list = parse_available_devices(self.device_map) + if self.iters != 0 and self.orig_disable_opt_rtn is not None: + logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.") + self.disable_opt_rtn = True + if ( + self.bits >= 8 + and self.act_bits >= 16 + and self.iters == 0 + and self.data_type == "int" + and self.disable_opt_rtn is None + ): + logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.") + self.disable_opt_rtn = True + if self.disable_opt_rtn is None and self.iters == 0: + logger.info( + "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy." + ) + self.disable_opt_rtn = False + set_seed(self.seed) self._set_amp_dtype() self._adjust_torch_compile(self.enable_torch_compile) if self.enable_torch_compile: self.block_forward = compile_func(self.block_forward, self.device) + if self.act_bits <= 8 and self.amp_dtype == torch.float16: + logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization") + self.amp_dtype = torch.bfloat16 + if self.model.dtype != torch.bfloat16: # keep the model's buffer dtype unchanged + self.model = self.model.to(torch.bfloat16) + else: + logger.info(f"using {self.model.dtype} for quantization tuning") + + # Some helpers + if "hpu" in str(self.device): + self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear") + + self._check_configs() + if isinstance(self.scheme, AutoScheme): self.layer_config = self._gen_auto_scheme(self.model, self.scheme, self.dataset, self.device_map) + if is_hpex_available(): + logger.info("habana_frameworks is available, import htcore explicitly.") + import habana_frameworks.torch.core as htcore # pylint: disable=E0401 + import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401] + + if self.enable_alg_ext: + try: + logger.warning_once("using algorithm extension for quantization.") + from auto_round.alg_ext import wrapper_autoround + + wrapper_autoround(self) + except (ImportError, ModuleNotFoundError): + logger.error("algorithm extension import error, fallback to default mode") self._post_inited = True def _gen_auto_scheme( @@ -882,6 +884,31 @@ def quantize_and_save( # post init self._post_init() + model_name = self.model.name_or_path.rstrip("/") + if model_name.split("/")[-1].strip(".") == "" and "gguf" not in format: + if self.group_size <= 0: + if "fp" in self.act_data_type: + suffix = f"afp{self.act_bits}" + else: + suffix = f"a{self.act_bits}" + else: + suffix = f"g{self.group_size}" + export_dir = os.path.join(output_dir, f"w{self.bits}{suffix}") + elif model_name.split("/")[-1].strip(".") == "" and "gguf" in format: + export_dir = output_dir + elif model_name.split("/")[-1].strip(".") != "" and "gguf" in format: + export_dir = os.path.join(output_dir, model_name.split("/")[-1] + "-gguf") + else: + if self.group_size <= 0: + if "fp" in self.act_data_type: + suffix = f"afp{self.act_bits}" + else: + suffix = f"a{self.act_bits}" + else: + suffix = f"g{self.group_size}" + export_dir = os.path.join(output_dir, model_name.split("/")[-1] + f"-w{self.bits}{suffix}") + + output_dir = export_dir # Validate and process the specified formats self.orig_output_dir = output_dir From 6f9620f4fc5dfa2de4f84e02afa3395e913d2f55 Mon Sep 17 00:00:00 2001 From: Heng Guo Date: Mon, 26 Jan 2026 14:44:48 +0800 Subject: [PATCH 03/14] Update auto_round/compressors/base.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/compressors/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index f2802fccd..c06b1b71f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -435,6 +435,8 @@ def _post_init(self) -> None: if self.scale_dtype is None: gguf_scheme_name = get_gguf_scheme(self.scheme) scale_dtype = "fp32" if gguf_scheme_name else "fp16" + else: + scale_dtype = self.scale_dtype self.scale_dtype = convert_dtype_str2torch(scale_dtype) predefined_ignore_layers = get_predefined_ignore_layers(self.model) From 0d28e39c786fec81285dabf560510929efab553b Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 26 Jan 2026 14:44:52 +0800 Subject: [PATCH 04/14] update Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index f4ac257d8..f2802fccd 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1644,6 +1644,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: Returns: The quantized model and layer configurations. """ + # post init + self._post_init() self._check_compatibility() formats = self.formats if hasattr(self, "formats") else None @@ -3162,8 +3164,6 @@ def save_quantized( Returns: object: The compressed model object. """ - # post init - self._post_init() self.orig_output_dir = output_dir if isinstance(format, str) and getattr(self, "formats", None) is None: From 3a5995a71965fad17a840447669de0bc9e7e9887 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 26 Jan 2026 14:53:01 +0800 Subject: [PATCH 05/14] refactor Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index f2802fccd..58aa9bdda 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -884,29 +884,24 @@ def quantize_and_save( # post init self._post_init() - model_name = self.model.name_or_path.rstrip("/") - if model_name.split("/")[-1].strip(".") == "" and "gguf" not in format: + name_or_path = self.model.name_or_path.rstrip("/") + model_name = name_or_path.split("/")[-1] + if model_name.strip(".") == "" and "gguf" not in format: if self.group_size <= 0: - if "fp" in self.act_data_type: - suffix = f"afp{self.act_bits}" - else: - suffix = f"a{self.act_bits}" + suffix = f"afp{self.act_bits}" if "fp" in self.act_data_type else f"a{self.act_bits}" else: suffix = f"g{self.group_size}" export_dir = os.path.join(output_dir, f"w{self.bits}{suffix}") - elif model_name.split("/")[-1].strip(".") == "" and "gguf" in format: + elif model_name.strip(".") == "" and "gguf" in format: export_dir = output_dir - elif model_name.split("/")[-1].strip(".") != "" and "gguf" in format: - export_dir = os.path.join(output_dir, model_name.split("/")[-1] + "-gguf") + elif model_name.strip(".") != "" and "gguf" in format: + export_dir = os.path.join(output_dir, model_name + "-gguf") else: if self.group_size <= 0: - if "fp" in self.act_data_type: - suffix = f"afp{self.act_bits}" - else: - suffix = f"a{self.act_bits}" + suffix = f"afp{self.act_bits}" if "fp" in self.act_data_type else f"a{self.act_bits}" else: suffix = f"g{self.group_size}" - export_dir = os.path.join(output_dir, model_name.split("/")[-1] + f"-w{self.bits}{suffix}") + export_dir = os.path.join(output_dir, model_name + f"-w{self.bits}{suffix}") output_dir = export_dir # Validate and process the specified formats From 13c418e263c0b64b094df40c14a5330cd3cd33ce Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 27 Jan 2026 10:00:27 +0800 Subject: [PATCH 06/14] fix Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 1 + auto_round/wrapper.py | 2 +- test/test_ark/test_model.py | 5 +- test/test_cpu/backends/test_torch_backend.py | 14 +++-- test/test_cpu/core/test_autoround.py | 16 ++++-- test/test_cpu/export/test_export.py | 27 ++++++--- test/test_cpu/export/test_gguf_format.py | 56 +++++++++++------- .../integrations/test_llmcompressor.py | 10 ++-- test/test_cpu/models/test_mllm.py | 9 ++- test/test_cpu/models/test_moe_model.py | 8 +-- .../quantization/test_act_quantization.py | 12 ++-- test/test_cpu/quantization/test_mix_bits.py | 19 +++++-- test/test_cpu/quantization/test_mxfp_nvfp.py | 21 +++++-- .../quantization/test_mxfp_save_load.py | 3 +- test/test_cpu/utils/test_generation.py | 8 ++- test/test_cuda/backends/test_torch_backend.py | 10 +++- .../test_cuda/backends/test_triton_backend.py | 57 +++++++++++-------- .../export/test_auto_round_format.py | 36 +++++++----- test/test_cuda/export/test_export.py | 16 ++++-- test/test_cuda/export/test_gguf.py | 14 +++-- .../integrations/test_transformers.py | 7 ++- test/test_cuda/models/test_moe_model.py | 3 +- test/test_cuda/quantization/test_2_3bits.py | 10 +++- test/test_cuda/quantization/test_mix_bits.py | 20 +++++-- .../quantization/test_mxfp_and_nvfp_quant.py | 3 +- test/test_cuda/quantization/test_mxfp_nvfp.py | 10 +++- test/test_cuda/schemes/test_scheme.py | 3 +- test/test_xpu/test_autoround.py | 8 ++- 28 files changed, 274 insertions(+), 134 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index e23f8917e..aae8e6305 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -281,6 +281,7 @@ def __init__( self.layer_config = layer_config self.scheme = scheme + self.is_auto_scheme = True if isinstance(scheme, AutoScheme) else False self.scale_dtype = kwargs.pop("scale_dtype", None) # Extra/legacy kwargs for backward compatibility diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py index 24836d85b..5c426f7c0 100644 --- a/auto_round/wrapper.py +++ b/auto_round/wrapper.py @@ -117,7 +117,7 @@ def __init__( self.enable_round_tuning = enable_round_tuning self.enable_torch_compile = enable_torch_compile self.enable_norm_bias_tuning = enable_norm_bias_tuning and (orig_layer.bias is not None) - self.enable_act_quant = self.orig_layer.act_bits <= 8 + self.enable_act_quant = self.orig_layer.act_bits <= 8 if self.orig_layer.act_bits is not None else False self.weight_global_scale = getattr(self.orig_layer, "weight_global_scale", None) if is_nv_fp(self.orig_layer.data_type) and self.weight_global_scale is None: from auto_round.data_type.nvfp import calculate_gparam diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index 361f1bdf9..5d347e90d 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -40,7 +40,10 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t else: autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format=format) ##will convert to gptq model + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format=format + ) ##will convert to gptq model + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py index 5c70f7e99..78f5efb11 100644 --- a/test/test_cpu/backends/test_torch_backend.py +++ b/test/test_cpu/backends/test_torch_backend.py @@ -38,7 +38,10 @@ def test_torch_4bits_asym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:gptqmodel" + ) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( @@ -79,17 +82,20 @@ def test_torch_4bits_sym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) ##will convert to gptq model + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py index 79458e237..c3cea9d77 100644 --- a/test/test_cpu/core/test_autoround.py +++ b/test/test_cpu/core/test_autoround.py @@ -377,16 +377,17 @@ def test_rtn(self, tiny_opt_model_path): bits, group_size, sym = 4, 128, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, nsamples=1) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + quantized_model_path, torch_dtype=torch.float16, device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_embed_quant(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True @@ -432,7 +433,10 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader): autoround.quantize() quantized_model_path = self.save_folder - autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) + _, quantized_model_path = autoround.save_quantized( + output_dir=quantized_model_path, format="auto_round", inplace=True + ) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="ipex") model = AutoModelForCausalLM.from_pretrained( @@ -442,7 +446,7 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader): text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0]) - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_not_convert_modules(self): import requests diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py index cb6033de9..fce28e19b 100644 --- a/test/test_cpu/export/test_export.py +++ b/test/test_cpu/export/test_export.py @@ -49,7 +49,8 @@ def test_autogptq_format(self, dataloader): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + quantized_model_path = quantized_model_path[0] if group_size == -1: shutil.rmtree("./saved", ignore_errors=True) @@ -78,7 +79,8 @@ def test_autoround_format(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] if group_size == -1: shutil.rmtree("./saved", ignore_errors=True) @@ -105,7 +107,10 @@ def test_autoround_awq_format(self, dataloader): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) + quantized_model_path = quantized_model_path[0] # quantization_config = AutoRoundConfig( # backend="cpu" @@ -223,7 +228,8 @@ def test_static_afp8_export(self, static_kv_dtype): static_kv_dtype=static_kv_dtype, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() @@ -281,7 +287,8 @@ def test_static_afp8_export(self, static_kv_dtype): act_group_size=0, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() @@ -341,7 +348,10 @@ def test_awq_lmhead_export(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_awq" + ) + quantized_model_path = quantized_model_path[0] lm_head = compressed_model.lm_head from auto_round.export.export_to_awq.utils import WQLinear_GEMM @@ -368,7 +378,10 @@ def test_gptq_lmhead_export(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_gptq" + ) + quantized_model_path = quantized_model_path[0] lm_head = compressed_model.lm_head assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer" quantization_config = AutoRoundConfig() diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py index 2e3cde6f0..4672db4fb 100644 --- a/test/test_cpu/export/test_gguf_format.py +++ b/test/test_cpu/export/test_gguf_format.py @@ -58,14 +58,17 @@ def test_q4_0(self): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="gguf:q4_0" + ) + quantized_model_path = quantized_model_path[0] gguf_file = os.listdir(quantized_model_path)[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_func(self): bits, group_size, sym = 4, 128, True @@ -77,15 +80,18 @@ def test_func(self): # data_type="int" ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_1") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="gguf:q*_1" + ) + quantized_model_path = quantized_model_path[0] assert autoround.group_size == 32 assert not autoround.sym - gguf_file = os.listdir("saved")[0] + gguf_file = os.listdir(quantized_model_path)[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_gguf_baseline(self): model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") @@ -105,12 +111,15 @@ def test_gguf_baseline(self): disable_opt_rtn=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="fake") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="fake" + ) + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_q4_k_m(self, dataloader): model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") @@ -140,7 +149,10 @@ def test_q4_k_m(self, dataloader): disable_opt_rtn=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="gguf:q4_k_m,fake" + ) + quantized_model_path = quantized_model_path[0] assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq" assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" @@ -151,13 +163,16 @@ def test_q4_k_m(self, dataloader): assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3 assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8 assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0" - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") - shutil.rmtree("./saved", ignore_errors=True) + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="gguf:q4_k_m,fake" + ) + quantized_model_path = quantized_model_path[0] + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_all_format(self, tiny_qwen_model_path): model_name = tiny_qwen_model_path @@ -204,15 +219,16 @@ def test_vlm_gguf(self): quant_nontext_module=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - assert "mmproj-model.gguf" in os.listdir("./saved") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") + quantized_model_path = quantized_model_path[0] + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": assert abs(file_size - 56) < 5.0 else: assert abs(file_size - 264) < 5.0 - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) shutil.rmtree(tiny_model_path, ignore_errors=True) def test_vlm_gguf_wo_quant_nontext_module(self): @@ -230,15 +246,16 @@ def test_vlm_gguf_wo_quant_nontext_module(self): quant_nontext_module=False, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - assert "mmproj-model.gguf" in os.listdir("./saved") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") + quantized_model_path = quantized_model_path[0] + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": assert abs(file_size - 361) < 5.0 else: assert abs(file_size - 264) < 5.0 - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) shutil.rmtree(tiny_model_path, ignore_errors=True) def test_qtype_setting(self): @@ -328,7 +345,8 @@ def test_q2k_mixed(self): disable_opt_rtn=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") + quantized_model_path = quantized_model_path[0] gguf_file = os.listdir(quantized_model_path)[0] file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2 assert abs(file_size - 1362) < 5.0 @@ -340,5 +358,5 @@ def test_q2k_mixed(self): assert gguf_model.get_tensor(10).name == "blk.0.ffn_up_exps.weight" assert gguf_model.get_tensor(10).tensor_type.name == "Q2_K" - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) shutil.rmtree(saved_tiny_model_path, ignore_errors=True) diff --git a/test/test_cpu/integrations/test_llmcompressor.py b/test/test_cpu/integrations/test_llmcompressor.py index c2966cedd..2c41406fe 100644 --- a/test/test_cpu/integrations/test_llmcompressor.py +++ b/test/test_cpu/integrations/test_llmcompressor.py @@ -50,7 +50,8 @@ def test_llmcompressor_fp8(self): nsamples=2, iters=0, ) - autoround.quantize_and_save("./saved", format="llm_compressor") + _, quantized_model_path = autoround.quantize_and_save("./saved", format="llm_compressor") + quantized_model_path = quantized_model_path[0] # from vllm import LLM # model = LLM("./saved") # result = model.generate("Hello my name is") @@ -58,7 +59,7 @@ def test_llmcompressor_fp8(self): import json - config = json.load(open("./saved/config.json")) + config = json.load(open(f"{quantized_model_path}/config.json")) assert "group_0" in config["quantization_config"]["config_groups"] assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel" @@ -75,11 +76,12 @@ def test_autoround_llmcompressor_fp8(self): nsamples=2, iters=0, ) - autoround.quantize_and_save("./saved", format="auto_round:llm_compressor") + _, quantized_model_path = autoround.quantize_and_save("./saved", format="auto_round:llm_compressor") + quantized_model_path = quantized_model_path[0] import json - config = json.load(open("./saved/config.json")) + config = json.load(open(f"{quantized_model_path}/config.json")) assert "group_0" in config["quantization_config"]["config_groups"] assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor" diff --git a/test/test_cpu/models/test_mllm.py b/test/test_cpu/models/test_mllm.py index 87e140a26..9b2796b92 100644 --- a/test/test_cpu/models/test_mllm.py +++ b/test/test_cpu/models/test_mllm.py @@ -220,15 +220,18 @@ def test_qwen2_5(self, tiny_qwen_2_5_vl_model_path): processor=processor, image_processor=image_processor, ) - autoround.quantize_and_save("./saved/", format="auto_round") + _, quantized_model_path = autoround.quantize_and_save("./saved/", format="auto_round") + quantized_model_path = quantized_model_path[0] import requests from PIL import Image from transformers import AutoProcessor, AutoTokenizer, Qwen2_5_VLForConditionalGeneration - model = Qwen2_5_VLForConditionalGeneration.from_pretrained("./saved", torch_dtype="auto", device_map="auto") + model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + quantized_model_path, torch_dtype="auto", device_map="auto" + ) image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" - processor = AutoProcessor.from_pretrained("./saved") + processor = AutoProcessor.from_pretrained(quantized_model_path) messages = [ { "role": "user", diff --git a/test/test_cpu/models/test_moe_model.py b/test/test_cpu/models/test_moe_model.py index 4934d619e..167f1409e 100644 --- a/test/test_cpu/models/test_moe_model.py +++ b/test/test_cpu/models/test_moe_model.py @@ -68,7 +68,7 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0): ignore_layers="self_attn,router,lm_head,mlp.gate", ) quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) - return quantized_model + return quantized_model, save_folder[0] def count_modules_by_type(model, target_module_name_or_class): @@ -92,7 +92,7 @@ def test_gptoss(setup_gpt_oss, scheme): # Remove it to avoid mismatch during quantized model loading delattr(model.config, "layer_types") - quantized_model = quantize_model(model, tokenizer, output_dir, scheme) + quantized_model, output_dir = quantize_model(model, tokenizer, output_dir, scheme) # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." @@ -130,7 +130,7 @@ def test_llama4(setup_llama4): delattr(model.config.text_config, "moe_layers") delattr(model.config.text_config, "layer_types") - quantized_model = quantize_model(model, tokenizer, output_dir, "MXFP4") + quantized_model, output_dir = quantize_model(model, tokenizer, output_dir, "MXFP4") # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." @@ -156,7 +156,7 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe): iters=1, ignore_layers="self_attn,lm_head,mlp.gate", ) - quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) + quantized_model, output_dir = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) assert quantized_model is not None, "Quantized model should not be None." loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir, device_map="cpu") diff --git a/test/test_cpu/quantization/test_act_quantization.py b/test/test_cpu/quantization/test_act_quantization.py index 47cda3599..6e2d91f29 100644 --- a/test/test_cpu/quantization/test_act_quantization.py +++ b/test/test_cpu/quantization/test_act_quantization.py @@ -113,7 +113,8 @@ def test_act_config_MXFP4_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" not in model.config.quantization_config.extra_config @@ -141,7 +142,8 @@ def test_act_config_NVFP4_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") kproj_config = model.config.quantization_config.extra_config["model.decoder.layers.1.self_attn.k_proj"] assert "act_data_type" in kproj_config.keys() and kproj_config["act_data_type"] == "nv_fp4_with_static_gs" @@ -167,7 +169,8 @@ def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") extra_config = model.config.quantization_config.extra_config @@ -198,7 +201,8 @@ def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] from transformers import AutoConfig extra_config = AutoConfig.from_pretrained(quantized_model_path).quantization_config["extra_config"] diff --git a/test/test_cpu/quantization/test_mix_bits.py b/test/test_cpu/quantization/test_mix_bits.py index 5db3053cb..99437436e 100644 --- a/test/test_cpu/quantization/test_mix_bits.py +++ b/test/test_cpu/quantization/test_mix_bits.py @@ -55,7 +55,8 @@ def test_mixed_gptqmodel(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + quantized_model_path = quantized_model_path[0] # test original GPTQModel inference from gptqmodel import GPTQModel @@ -84,7 +85,8 @@ def test_mixed_gptqmodel_convert_to_ar(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cpu", quantization_config=quantization_config @@ -112,7 +114,10 @@ def test_mixed_autoround_format(self, dataloader): layer_config=layer_config, ) quantized_model_path = "./saved" - compressed_model = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 @@ -136,7 +141,8 @@ def test_fallback_regex_for_awq_format(self, dataloader): layer_config=layer_config, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cpu", quantization_config=quantization_config @@ -223,7 +229,10 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="auto_round" + ) + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype="auto", diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py index f5044bc73..2200ecd11 100644 --- a/test/test_cpu/quantization/test_mxfp_nvfp.py +++ b/test/test_cpu/quantization/test_mxfp_nvfp.py @@ -77,7 +77,10 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): dataset=dataloader, layer_config=layer_config, ) - compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=self.save_dir, inplace=True, format="auto_round" + ) + quantized_model_path = quantized_model_path[0] lm_head = compressed_model.lm_head assert ( hasattr(lm_head, "weight_scale") @@ -86,7 +89,6 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): and lm_head.weight_packed.dtype is torch.uint8 and lm_head.weight_scale.dtype is torch.float8_e4m3fn ), "Illegal NVFP4 packing for lm_head layer" - quantized_model_path = self.save_dir assert is_model_outputs_similar(model_name, quantized_model_path) shutil.rmtree(self.save_dir, ignore_errors=True) @@ -211,7 +213,10 @@ def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="llm_compressor" + ) + quantized_model_path = quantized_model_path[0] tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -248,7 +253,10 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="llm_compressor" + ) + quantized_model_path = quantized_model_path[0] tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -285,7 +293,10 @@ def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) + quantized_model_path = quantized_model_path[0] tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") diff --git a/test/test_cpu/quantization/test_mxfp_save_load.py b/test/test_cpu/quantization/test_mxfp_save_load.py index 396c47735..5d86a99ee 100644 --- a/test/test_cpu/quantization/test_mxfp_save_load.py +++ b/test/test_cpu/quantization/test_mxfp_save_load.py @@ -60,7 +60,8 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Quantize and save the model to the temporary directory quantized_model_path = f"{temp_dir}/tmp_autoround" - autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) + quantized_model_path = quantized_model_path[0] # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cpu/utils/test_generation.py b/test/test_cpu/utils/test_generation.py index 6bf7e1020..fed11dbbe 100644 --- a/test/test_cpu/utils/test_generation.py +++ b/test/test_cpu/utils/test_generation.py @@ -39,7 +39,10 @@ def test_4bits_sym(self, dataloader): ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round", inplace=False) + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round", inplace=False + ) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="ipex") model = AutoModelForCausalLM.from_pretrained( @@ -79,7 +82,8 @@ def test_autoround_sym(self, dataloader): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", trust_remote_code=True diff --git a/test/test_cuda/backends/test_torch_backend.py b/test/test_cuda/backends/test_torch_backend.py index 4594667d9..27583d04a 100644 --- a/test/test_cuda/backends/test_torch_backend.py +++ b/test/test_cuda/backends/test_torch_backend.py @@ -44,7 +44,10 @@ def test_torch_4bits_asym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:gptqmodel" + ) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( @@ -86,7 +89,10 @@ def test_torch_4bits_sym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) ##will convert to gptq model + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cuda/backends/test_triton_backend.py b/test/test_cuda/backends/test_triton_backend.py index f51e8aeba..4e88e9def 100644 --- a/test/test_cuda/backends/test_triton_backend.py +++ b/test/test_cuda/backends/test_triton_backend.py @@ -38,14 +38,17 @@ def test_tritonv2_4bits_asym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:gptqmodel" + ) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -53,10 +56,10 @@ def test_tritonv2_4bits_asym(self, dataloader): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -71,14 +74,15 @@ def test_tritonv2_2bits_asym(self): bits, group_size, sym = 2, 32, False autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -86,10 +90,10 @@ def test_tritonv2_2bits_asym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -113,14 +117,15 @@ def test_tritonv2_4bits_sym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) @@ -128,10 +133,10 @@ def test_tritonv2_4bits_sym(self, dataloader): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) @@ -147,14 +152,15 @@ def test_tritonv2_8bits_sym(self): bits, group_size, sym = 4, 256, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, nsamples=1, iters=1) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -162,10 +168,10 @@ def test_tritonv2_8bits_sym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) @@ -186,14 +192,15 @@ def test_tritonv2_2bits_sym(self): sym=sym, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -201,10 +208,10 @@ def test_tritonv2_2bits_sym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) diff --git a/test/test_cuda/export/test_auto_round_format.py b/test/test_cuda/export/test_auto_round_format.py index a2753605b..9e10368f2 100644 --- a/test/test_cuda/export/test_auto_round_format.py +++ b/test/test_cuda/export/test_auto_round_format.py @@ -52,7 +52,8 @@ def test_autoround_asym(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cuda:0", trust_remote_code=True @@ -79,13 +80,14 @@ def test_mixed_precision(self): bits, group_size, sym = 4, 128, True autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -105,14 +107,17 @@ def test_awq_backend(self): sym=sym, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -120,12 +125,12 @@ def test_awq_backend(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) - shutil.rmtree(self.save_dir, ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) @require_greater_than_050 def test_tritonv2_bf16(self): @@ -154,7 +159,8 @@ def test_autoround_gptq_sym_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) + quantized_model_path = quantized_model_path[0] from transformers import AutoRoundConfig @@ -206,7 +212,10 @@ def test_autoround_awq_sym_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) @@ -243,7 +252,8 @@ def test_autoround_sym(self, tiny_opt_model_path, dataloader): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", trust_remote_code=True diff --git a/test/test_cuda/export/test_export.py b/test/test_cuda/export/test_export.py index efd519a51..8a6c71e86 100644 --- a/test/test_cuda/export/test_export.py +++ b/test/test_cuda/export/test_export.py @@ -116,7 +116,8 @@ def test_autogptq_format_qsave_ignore_layers(self, dataloader): layer_config=layer_config, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) @@ -218,7 +219,8 @@ def test_autoawq_format_fp_qsave_layers(self, dataloader): layer_config=layer_config, ) quantized_model_path = "./saved/test_export" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + quantized_model_path = quantized_model_path[0] from auto_round import AutoRoundConfig model = AutoModelForCausalLM.from_pretrained( @@ -314,7 +316,10 @@ def test_awq_lmhead_export(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_awq" + ) + quantized_model_path = quantized_model_path[0] lm_head = compressed_model.lm_head from auto_round.export.export_to_awq.utils import WQLinear_GEMM @@ -341,7 +346,10 @@ def test_gptq_lmhead_export(self, tiny_qwen_model_path, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_gptq" + ) + quantized_model_path = quantized_model_path[0] lm_head = compressed_model.lm_head assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer" quantization_config = AutoRoundConfig() diff --git a/test/test_cuda/export/test_gguf.py b/test/test_cuda/export/test_gguf.py index 09e7451c3..d144241a3 100644 --- a/test/test_cuda/export/test_gguf.py +++ b/test/test_cuda/export/test_gguf.py @@ -154,7 +154,8 @@ def test_special_model(self): disable_opt_rtn=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") + quantized_model_path = quantized_model_path[0] file_name = os.listdir(quantized_model_path)[0] file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 assert abs(file_size - 307) < 5.0 @@ -178,11 +179,12 @@ def test_vlm_gguf(self): quant_nontext_module=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m") - assert "mmproj-model.gguf" in os.listdir("./saved") - for file in os.listdir("./saved"): - print(f"{file}: {os.path.getsize(os.path.join('./saved', file)) / 1024**2} MB") - file_size = os.path.getsize(os.path.join("./saved", file)) / 1024**2 + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m") + quantized_model_path = quantized_model_path[0] + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) + for file in os.listdir(quantized_model_path): + print(f"{file}: {os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2} MB") + file_size = os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2 if "mmproj-model.gguf" in file: assert abs(file_size - 75) < 5.0 else: diff --git a/test/test_cuda/integrations/test_transformers.py b/test/test_cuda/integrations/test_transformers.py index 638cb7396..dc6411ab0 100644 --- a/test/test_cuda/integrations/test_transformers.py +++ b/test/test_cuda/integrations/test_transformers.py @@ -200,8 +200,11 @@ def test_mixed_bits(self): autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) with tempfile.TemporaryDirectory() as tmpdirname: - autoround.quantize_and_save(output_dir=tmpdirname) - model = AutoModelForCausalLM.from_pretrained(tmpdirname, torch_dtype=torch.float16, device_map="cuda") + _, quantized_model_path = autoround.quantize_and_save(output_dir=tmpdirname) + quantized_model_path = quantized_model_path[0] + model = AutoModelForCausalLM.from_pretrained( + quantized_model_path, torch_dtype=torch.float16, device_map="cuda" + ) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) diff --git a/test/test_cuda/models/test_moe_model.py b/test/test_cuda/models/test_moe_model.py index 66b0dc827..478ae49f0 100644 --- a/test/test_cuda/models/test_moe_model.py +++ b/test/test_cuda/models/test_moe_model.py @@ -179,7 +179,8 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe): iters=1, ignore_layers="self_attn,lm_head,mlp.gate", ) - quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) + quantized_model, output_dir = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) + output_dir = output_dir[0] assert quantized_model is not None, "Quantized model should not be None." loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir) loaded_model.to("cuda") diff --git a/test/test_cuda/quantization/test_2_3bits.py b/test/test_cuda/quantization/test_2_3bits.py index 12ed75faa..e3aa891ba 100644 --- a/test/test_cuda/quantization/test_2_3bits.py +++ b/test/test_cuda/quantization/test_2_3bits.py @@ -47,7 +47,10 @@ def test_3bits_autoround(self): model_name = get_model_path("facebook/opt-125m") autoround = AutoRound(model_name, bits=3) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) ##will convert to gptq model + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( @@ -65,8 +68,9 @@ def test_3bits_asym_autoround(self): model_name = get_model_path("facebook/opt-125m") bits, sym = 3, False autoround = AutoRound(model_name, bits=bits, sym=sym) - autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False) - model_args = f"pretrained={self.save_dir}" + _, quantized_model_path = autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False) + quantized_model_path = quantized_model_path[0] + model_args = f"pretrained={quantized_model_path}" res = simple_evaluate( model="hf", model_args=model_args, diff --git a/test/test_cuda/quantization/test_mix_bits.py b/test/test_cuda/quantization/test_mix_bits.py index 614a5d2b1..b53a33d39 100644 --- a/test/test_cuda/quantization/test_mix_bits.py +++ b/test/test_cuda/quantization/test_mix_bits.py @@ -52,7 +52,8 @@ def test_mixed_gptqmodel(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + quantized_model_path = quantized_model_path[0] from gptqmodel import GPTQModel model = GPTQModel.load(quantized_model_path) @@ -79,7 +80,8 @@ def test_mixed_gptqmodel_convert_to_ar(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", quantization_config=quantization_config @@ -107,7 +109,8 @@ def test_mixed_autoround_format(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = "self.save_dir" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 @@ -133,7 +136,8 @@ def test_fallback_regex_for_awq_format(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = "self.save_dir" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", quantization_config=quantization_config @@ -221,7 +225,10 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="auto_round" + ) + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype="auto", @@ -289,9 +296,10 @@ def test_mixed_llmcompressor_format_vllm(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - compressed, _ = autoround.quantize_and_save( + compressed, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, inplace=False, format="llm_compressor" ) + quantized_model_path = quantized_model_path[0] from vllm import LLM, SamplingParams # Sample prompts. diff --git a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py index d76b556e3..407cf7e42 100644 --- a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py +++ b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py @@ -52,7 +52,8 @@ def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path): # Quantize and save the model to the temporary directory quantized_model_path = f"{temp_dir}/tmp_autoround_{scheme}" - autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) + quantized_model_path = quantized_model_path[0] # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cuda/quantization/test_mxfp_nvfp.py b/test/test_cuda/quantization/test_mxfp_nvfp.py index 922cb9f9b..5e20c66a1 100644 --- a/test/test_cuda/quantization/test_mxfp_nvfp.py +++ b/test/test_cuda/quantization/test_mxfp_nvfp.py @@ -64,7 +64,10 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="llm_compressor" + ) + quantized_model_path = quantized_model_path[0] tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -150,7 +153,10 @@ def test_qwen_moe_quant_infer(self, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=True, format="auto_round" + ) + quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model diff --git a/test/test_cuda/schemes/test_scheme.py b/test/test_cuda/schemes/test_scheme.py index 315c5984b..c5ba7d4fc 100644 --- a/test/test_cuda/schemes/test_scheme.py +++ b/test/test_cuda/schemes/test_scheme.py @@ -120,7 +120,8 @@ def test_q2k_mixed(self): disable_opt_rtn=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") + quantized_model_path = quantized_model_path[0] gguf_file = os.listdir(quantized_model_path)[0] file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2 assert abs(file_size - 1236) < 5.0 diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index 6c9d65c83..77089103e 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -42,7 +42,8 @@ def test_gptq_format(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="auto") @@ -74,7 +75,10 @@ def test_awq_format(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) + quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="auto") # device_map="auto" doesn't work, must use "xpu" From 0ff6480ef89bd1b04fffb4c667e2bb153f7a6b4d Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 27 Jan 2026 14:53:16 +0800 Subject: [PATCH 07/14] fix merge Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 6cca95c34..39f10fd78 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -497,7 +497,7 @@ def _post_init(self) -> None: self._check_configs() if isinstance(self.scheme, AutoScheme): - self.layer_config = self._gen_auto_scheme(self.model, self.scheme, self.dataset, self.device_map) + self.layer_config = self._gen_auto_scheme(self.model, self.orig_scheme, self.dataset, self.device_map) if is_hpex_available(): logger.info("habana_frameworks is available, import htcore explicitly.") @@ -1570,9 +1570,6 @@ def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True) else: self.ignore_layers += "," + tmp_str - if self.is_auto_scheme: - self.layer_config = self._gen_auto_scheme(self.model, self.orig_scheme, self.dataset, self.device_map) - fill_default_value = True if self.is_auto_scheme: fill_default_value = False From 151710a1a8ac8e3d0ad18b28e4a9ecfebcb644ae Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 28 Jan 2026 14:58:52 +0800 Subject: [PATCH 08/14] fix Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 39f10fd78..04a6cd738 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -287,6 +287,11 @@ def __init__( # Extra/legacy kwargs for backward compatibility # Major version releases may pack them with extra configuration options + scheme_keys = [f.name for f in fields(QuantizationScheme)] + for key in scheme_keys: + if key in kwargs and kwargs[key] is not None: + setattr(self, key, kwargs.pop(key)) + amp = kwargs.pop("amp", True) lr = kwargs.pop("lr", None) enable_minmax_tuning = kwargs.pop("enable_minmax_tuning", True) @@ -433,7 +438,7 @@ def _post_init(self) -> None: # should be set after loading model and set layer_config, cause some special scheme need these. # Preserve the original, unparsed scheme for later use in auto scheme generation # within `configure_layer_config` (which may need the raw value instead of `self.scheme`). - self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(self.scheme, {}) + self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(self.scheme) # GGUF uses fp32 scale dtype as default if self.scale_dtype is None: @@ -619,18 +624,18 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") def _parse_and_set_scheme( - self, scheme: Union[str, dict, QuantizationScheme], kwargs + self, + scheme: Union[str, dict, QuantizationScheme], ) -> tuple[QuantizationScheme, bool]: """Parse and set the quantization scheme.""" - def _parse_and_set(scheme, kwargs): - if kwargs.get("data_type", None) and kwargs["data_type"].endswith("_dq") and not scheme.startswith("gguf"): - if "bits" not in kwargs: - data_type = kwargs["data_type"] + def _parse_and_set(scheme): + if self.data_type and self.data_type.endswith("_dq") and not scheme.startswith("gguf"): + if not hasattr(self, "bits") or self.bits is None: raise KeyError( - f"please set bits when setting data_type={data_type}, or using scheme as an alternative." + f"please set bits when setting data_type={self.data_type}, or using scheme as an alternative." ) - bits = kwargs["bits"] + bits = self.bits scheme = f"gguf:q{bits}_k" if bits == 6 else f"gguf:q{bits}_k_s" res = None if isinstance(scheme, QuantizationScheme): @@ -648,11 +653,10 @@ def _parse_and_set(scheme, kwargs): scheme = asdict(preset_name_to_scheme(scheme)) scheme_keys = [f.name for f in fields(QuantizationScheme)] for key in scheme_keys: - if key in kwargs and kwargs[key] is not None: - setattr(self, key, kwargs[key]) + if hasattr(self, key) and getattr(self, key) is not None: + continue else: setattr(self, key, scheme.get(key, None)) - # kwargs.pop(key, None) if self.act_dynamic is None: self.act_dynamic = True @@ -708,7 +712,7 @@ def _parse_and_set(scheme, kwargs): raise ValueError("options of AutoScheme must not be empty") options = [] for option in scheme.options: - new_option = _parse_and_set(option, kwargs) + new_option = _parse_and_set(option) options.append(new_option) scheme.options = options for opt in options: @@ -720,15 +724,13 @@ def _parse_and_set(scheme, kwargs): self.scheme = opt # Choose the first one that not 16 bits break # apply scheme to set default bits - scheme = _parse_and_set(self.scheme, kwargs) + scheme = _parse_and_set(self.scheme) is_auto_scheme = True else: - scheme = _parse_and_set(scheme, kwargs) + scheme = _parse_and_set(scheme) is_auto_scheme = False scheme_keys = [f.name for f in fields(QuantizationScheme)] - for key in scheme_keys: - kwargs.pop(key, None) return scheme, is_auto_scheme From 84f9db8aee2f94f046cb0d1e0499672272c2aae9 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 28 Jan 2026 15:33:00 +0800 Subject: [PATCH 09/14] fix Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 04a6cd738..90ebc39c8 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -630,7 +630,7 @@ def _parse_and_set_scheme( """Parse and set the quantization scheme.""" def _parse_and_set(scheme): - if self.data_type and self.data_type.endswith("_dq") and not scheme.startswith("gguf"): + if getattr(self, "data_type", None) and self.data_type.endswith("_dq") and not scheme.startswith("gguf"): if not hasattr(self, "bits") or self.bits is None: raise KeyError( f"please set bits when setting data_type={self.data_type}, or using scheme as an alternative." From 464a5896cbddb6451c042e2d6022953bf33e7e2b Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 28 Jan 2026 16:30:57 +0800 Subject: [PATCH 10/14] fix Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 4 ++-- auto_round/eval/evaluation.py | 2 +- test/test_ark/test_model.py | 1 - test/test_cpu/backends/test_torch_backend.py | 4 +--- test/test_cpu/core/test_autoround.py | 4 ++-- test/test_cpu/core/test_init.py | 2 ++ test/test_cpu/export/test_export.py | 19 +++++++++++-------- test/test_cpu/export/test_gguf_format.py | 16 ++++++++-------- .../integrations/test_llmcompressor.py | 4 ++-- test/test_cpu/models/test_mllm.py | 3 ++- test/test_cpu/models/test_moe_model.py | 2 +- .../quantization/test_act_quantization.py | 8 ++++---- test/test_cpu/quantization/test_mix_bits.py | 11 ++++++----- test/test_cpu/quantization/test_mxfp_nvfp.py | 13 +++++++------ .../quantization/test_mxfp_save_load.py | 1 - test/test_cpu/schemes/test_scheme.py | 9 +++++---- test/test_cpu/utils/test_generation.py | 3 +-- test/test_cuda/backends/test_torch_backend.py | 2 -- .../test_cuda/backends/test_triton_backend.py | 6 +----- .../export/test_auto_round_format.py | 7 +------ test/test_cuda/export/test_export.py | 7 +++---- test/test_cuda/export/test_gguf.py | 4 ++-- .../integrations/test_transformers.py | 2 +- test/test_cuda/quantization/test_2_3bits.py | 3 +-- test/test_cuda/quantization/test_mix_bits.py | 12 ++++++------ .../quantization/test_mxfp_and_nvfp_quant.py | 1 - test/test_cuda/quantization/test_mxfp_nvfp.py | 4 ++-- test/test_cuda/schemes/test_scheme.py | 2 +- test/test_xpu/test_autoround.py | 3 +-- 29 files changed, 74 insertions(+), 85 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 90ebc39c8..890124f39 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -3164,7 +3164,7 @@ def save_quantized( output_dir: str = None, format: Union[str, list[OutputFormat]] = "auto_round", inplace: bool = True, - return_folders=False, + return_folders=True, **kwargs, ) -> torch.nn.Module: """Save the quantized model to the specified output directory in the specified format. @@ -3218,7 +3218,7 @@ def save_quantized( folders.append(save_folder) if return_folders: - return compressed_model, folders + return compressed_model, folders[0] if len(folders) == 1 else folders else: return compressed_model diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index cff49b371..0bc6dbd3a 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -380,7 +380,7 @@ def run_model_evaluation(model, tokenizer, autoround, folders, formats, device_s return # Check if evaluation is needed for language models - eval_folder = folders[-1] if folders else None + eval_folder = folders[-1] if folders and isinstance(folders, list) else folders if args.tasks is None or args.tasks == "" or eval_folder is None: return diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index ed53e2045..dd5119334 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -43,7 +43,6 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format=format ) ##will convert to gptq model - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py index 6ea89f434..b47dfe48d 100644 --- a/test/test_cpu/backends/test_torch_backend.py +++ b/test/test_cpu/backends/test_torch_backend.py @@ -41,14 +41,13 @@ def test_torch_4bits_asym(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round:gptqmodel" ) - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) @@ -85,7 +84,6 @@ def test_torch_4bits_sym(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round" ) ##will convert to gptq model - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py index cfd1b3bc7..f565dabec 100644 --- a/test/test_cpu/core/test_autoround.py +++ b/test/test_cpu/core/test_autoround.py @@ -14,6 +14,7 @@ class TestAutoRound: + @classmethod def setup_class(self): model_name = opt_name_or_path @@ -383,7 +384,6 @@ def test_rtn(self, tiny_opt_model_path): autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, nsamples=1) quantized_model_path = self.save_folder _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype=torch.float16, @@ -441,7 +441,7 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader): _, quantized_model_path = autoround.save_quantized( output_dir=quantized_model_path, format="auto_round", inplace=True ) - quantized_model_path = quantized_model_path[0] + quantization_config = AutoRoundConfig(backend="ipex") model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cpu/core/test_init.py b/test/test_cpu/core/test_init.py index 01785d679..46a460dd3 100644 --- a/test/test_cpu/core/test_init.py +++ b/test/test_cpu/core/test_init.py @@ -3,6 +3,8 @@ def test_torch_compile(tiny_opt_model_path): ar = AutoRound(model=tiny_opt_model_path, scheme="NVFP4", enable_torch_compile=True) + ar._post_init() assert not ar.enable_torch_compile, "NVFP4 cannot work with torch.compile." ar = AutoRound(model=tiny_opt_model_path, scheme="FP8_STATIC", enable_torch_compile=True) + ar._post_init() assert not ar.enable_torch_compile, "FP8_STATIC cannot work with torch.compile." diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py index 3b9df68eb..2dc4c0c59 100644 --- a/test/test_cpu/export/test_export.py +++ b/test/test_cpu/export/test_export.py @@ -23,6 +23,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRound: + @classmethod def setup_class(self): self.model_name = opt_name_or_path @@ -51,7 +52,6 @@ def test_autogptq_format(self, dataloader): quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") - quantized_model_path = quantized_model_path[0] if group_size == -1: shutil.rmtree("./saved", ignore_errors=True) @@ -81,7 +81,6 @@ def test_autoround_format(self, dataloader): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] if group_size == -1: shutil.rmtree("./saved", ignore_errors=True) @@ -111,7 +110,6 @@ def test_autoround_awq_format(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round:auto_awq" ) - quantized_model_path = quantized_model_path[0] # quantization_config = AutoRoundConfig( # backend="cpu" @@ -230,7 +228,7 @@ def test_static_afp8_export(self, static_kv_dtype): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] + f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() @@ -292,7 +290,6 @@ def test_static_afp8_export(self, static_kv_dtype): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() @@ -318,7 +315,8 @@ def test_static_fp8_attn(self): static_attention_dtype="fp8", ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() @@ -359,7 +357,7 @@ def test_awq_lmhead_export(self, dataloader): compressed_model, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_awq" ) - quantized_model_path = quantized_model_path[0] + lm_head = compressed_model.lm_head from auto_round.export.export_to_awq.utils import WQLinear_GEMM @@ -393,7 +391,7 @@ def test_gptq_lmhead_export(self, dataloader): compressed_model, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_gptq" ) - quantized_model_path = quantized_model_path[0] + lm_head = compressed_model.lm_head assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer" quantization_config = AutoRoundConfig() @@ -414,6 +412,7 @@ def test_export_format(self): self.model_name, scheme="FP8_STATIC", ) + autoround._post_init() format_list = get_formats("auto_round, llm_compressor, auto_round:llm_compressor", autoround) assert len(format_list) == 3 assert format_list[0].output_format == "auto_round" @@ -427,6 +426,7 @@ def test_export_format(self): self.model_name, scheme="W4A16", ) + autoround._post_init() format_list = get_formats("auto_round:auto_awq, auto_gptq", autoround) assert format_list[0].output_format == "auto_round" assert format_list[0].get_backend_name() == "auto_round:auto_awq" @@ -443,6 +443,7 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=32, sym=True, ) + ar._post_init() with pytest.raises(ValueError, match="auto_awq format support quantization scheme with W4A16 but got bits=2"): get_formats("auto_round:auto_awq", ar) @@ -456,6 +457,7 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=32, sym=True, ) + ar._post_init() with pytest.raises(ValueError, match="but got data_type=fp, bits=4"): get_formats("auto_round:llm_compressor", ar) @@ -466,4 +468,5 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=256, sym=True, ) + ar._post_init() get_formats("auto_round:auto_awq", ar) diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py index cc2c8bf3d..636b180e7 100644 --- a/test/test_cpu/export/test_gguf_format.py +++ b/test/test_cpu/export/test_gguf_format.py @@ -67,7 +67,7 @@ def test_q4_0(self): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, inplace=False, format="gguf:q4_0" ) - quantized_model_path = quantized_model_path[0] + gguf_file = os.listdir(quantized_model_path)[0] # TODO: fix the issue of gguf loading error in transformers v5 @@ -95,7 +95,7 @@ def test_func(self): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, inplace=False, format="gguf:q*_1" ) - quantized_model_path = quantized_model_path[0] + assert autoround.group_size == 32 assert not autoround.sym gguf_file = os.listdir(quantized_model_path)[0] @@ -126,7 +126,7 @@ def test_gguf_baseline(self): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, inplace=False, format="fake" ) - quantized_model_path = quantized_model_path[0] + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) @@ -164,7 +164,7 @@ def test_q4_k_m(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="gguf:q4_k_m,fake" ) - quantized_model_path = quantized_model_path[0] + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq" assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" @@ -183,7 +183,7 @@ def test_q4_k_m(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="gguf:q4_k_m,fake" ) - quantized_model_path = quantized_model_path[0] + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_all_format(self, tiny_qwen_model_path): @@ -232,7 +232,7 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - quantized_model_path = quantized_model_path[0] + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 @@ -259,7 +259,7 @@ def test_vlm_gguf_wo_quant_nontext_module(self): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - quantized_model_path = quantized_model_path[0] + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 @@ -358,7 +358,7 @@ def test_q2k_mixed(self): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") - quantized_model_path = quantized_model_path[0] + gguf_file = os.listdir(quantized_model_path)[0] file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2 assert abs(file_size - 1362) < 5.0 diff --git a/test/test_cpu/integrations/test_llmcompressor.py b/test/test_cpu/integrations/test_llmcompressor.py index 2c41406fe..95afce423 100644 --- a/test/test_cpu/integrations/test_llmcompressor.py +++ b/test/test_cpu/integrations/test_llmcompressor.py @@ -11,6 +11,7 @@ class TestLLMC: + @classmethod def setup_class(self): self.model_name = get_model_path("stas/tiny-random-llama-2") @@ -51,7 +52,7 @@ def test_llmcompressor_fp8(self): iters=0, ) _, quantized_model_path = autoround.quantize_and_save("./saved", format="llm_compressor") - quantized_model_path = quantized_model_path[0] + # from vllm import LLM # model = LLM("./saved") # result = model.generate("Hello my name is") @@ -77,7 +78,6 @@ def test_autoround_llmcompressor_fp8(self): iters=0, ) _, quantized_model_path = autoround.quantize_and_save("./saved", format="auto_round:llm_compressor") - quantized_model_path = quantized_model_path[0] import json diff --git a/test/test_cpu/models/test_mllm.py b/test/test_cpu/models/test_mllm.py index 9b2796b92..748a6ede0 100644 --- a/test/test_cpu/models/test_mllm.py +++ b/test/test_cpu/models/test_mllm.py @@ -9,6 +9,7 @@ class FakeDataLoader: + def __init__(self): self.batch_size = 1 @@ -26,6 +27,7 @@ def __iter__(self): class TestAutoRoundMLLM: + @classmethod def setup_class(self): self.model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") @@ -221,7 +223,6 @@ def test_qwen2_5(self, tiny_qwen_2_5_vl_model_path): image_processor=image_processor, ) _, quantized_model_path = autoround.quantize_and_save("./saved/", format="auto_round") - quantized_model_path = quantized_model_path[0] import requests from PIL import Image diff --git a/test/test_cpu/models/test_moe_model.py b/test/test_cpu/models/test_moe_model.py index 234c9ae55..01bedb40d 100644 --- a/test/test_cpu/models/test_moe_model.py +++ b/test/test_cpu/models/test_moe_model.py @@ -70,7 +70,7 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0): ignore_layers="self_attn,router,lm_head,mlp.gate", ) quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) - return quantized_model, save_folder[0] + return quantized_model, save_folder def count_modules_by_type(model, target_module_name_or_class): diff --git a/test/test_cpu/quantization/test_act_quantization.py b/test/test_cpu/quantization/test_act_quantization.py index 6e2d91f29..647a3fcb9 100644 --- a/test/test_cpu/quantization/test_act_quantization.py +++ b/test/test_cpu/quantization/test_act_quantization.py @@ -114,7 +114,7 @@ def test_act_config_MXFP4_saving(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" not in model.config.quantization_config.extra_config @@ -143,7 +143,7 @@ def test_act_config_NVFP4_saving(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") kproj_config = model.config.quantization_config.extra_config["model.decoder.layers.1.self_attn.k_proj"] assert "act_data_type" in kproj_config.keys() and kproj_config["act_data_type"] == "nv_fp4_with_static_gs" @@ -170,7 +170,7 @@ def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") extra_config = model.config.quantization_config.extra_config @@ -202,7 +202,7 @@ def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] + from transformers import AutoConfig extra_config = AutoConfig.from_pretrained(quantized_model_path).quantization_config["extra_config"] diff --git a/test/test_cpu/quantization/test_mix_bits.py b/test/test_cpu/quantization/test_mix_bits.py index 99437436e..749219271 100644 --- a/test/test_cpu/quantization/test_mix_bits.py +++ b/test/test_cpu/quantization/test_mix_bits.py @@ -25,6 +25,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRound: + @classmethod def setup_class(self): self.model_name = opt_name_or_path @@ -56,7 +57,7 @@ def test_mixed_gptqmodel(self, dataloader): ) quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") - quantized_model_path = quantized_model_path[0] + # test original GPTQModel inference from gptqmodel import GPTQModel @@ -86,7 +87,7 @@ def test_mixed_gptqmodel_convert_to_ar(self, dataloader): ) quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") - quantized_model_path = quantized_model_path[0] + quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cpu", quantization_config=quantization_config @@ -117,7 +118,7 @@ def test_mixed_autoround_format(self, dataloader): compressed_model, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round" ) - quantized_model_path = quantized_model_path[0] + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 @@ -142,7 +143,7 @@ def test_fallback_regex_for_awq_format(self, dataloader): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") - quantized_model_path = quantized_model_path[0] + quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cpu", quantization_config=quantization_config @@ -232,7 +233,7 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, inplace=False, format="auto_round" ) - quantized_model_path = quantized_model_path[0] + model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype="auto", diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py index 84f8a82ff..e061d5b3b 100644 --- a/test/test_cpu/quantization/test_mxfp_nvfp.py +++ b/test/test_cpu/quantization/test_mxfp_nvfp.py @@ -23,6 +23,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRoundFP: + @classmethod def setup_class(self): self.save_dir = "./saved" @@ -87,7 +88,7 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): compressed_model, quantized_model_path = autoround.quantize_and_save( output_dir=self.save_dir, inplace=True, format="auto_round" ) - quantized_model_path = quantized_model_path[0] + lm_head = compressed_model.lm_head assert ( hasattr(lm_head, "weight_scale") @@ -146,7 +147,7 @@ def test_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir autoround.quantize() - compressed_model = autoround.save_quantized( + compressed_model, _ = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj @@ -186,7 +187,7 @@ def test_rtn_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir autoround.quantize() - compressed_model = autoround.save_quantized( + compressed_model, _ = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj @@ -226,7 +227,7 @@ def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, dataloader): compressed_model, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="llm_compressor" ) - quantized_model_path = quantized_model_path[0] + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -266,7 +267,7 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): compressed_model, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="llm_compressor" ) - quantized_model_path = quantized_model_path[0] + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -306,7 +307,7 @@ def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader): compressed_model, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round" ) - quantized_model_path = quantized_model_path[0] + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") diff --git a/test/test_cpu/quantization/test_mxfp_save_load.py b/test/test_cpu/quantization/test_mxfp_save_load.py index 5d86a99ee..4fdcaeffc 100644 --- a/test/test_cpu/quantization/test_mxfp_save_load.py +++ b/test/test_cpu/quantization/test_mxfp_save_load.py @@ -61,7 +61,6 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Quantize and save the model to the temporary directory quantized_model_path = f"{temp_dir}/tmp_autoround" _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) - quantized_model_path = quantized_model_path[0] # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cpu/schemes/test_scheme.py b/test/test_cpu/schemes/test_scheme.py index f4f0e716b..f5a1a6ace 100644 --- a/test/test_cpu/schemes/test_scheme.py +++ b/test/test_cpu/schemes/test_scheme.py @@ -33,36 +33,37 @@ def test_gguf(self, tiny_qwen_model_path, dataloader): def test_w4a16(self, tiny_opt_model_path, dataloader): ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - assert ar.bits == 4 ar.quantize() + assert ar.bits == 4 def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) - assert ar.bits == 2 ar.quantize() + assert ar.bits == 2 def test_mxfp4(self, tiny_opt_model_path, dataloader): ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + ar.quantize() assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "mx_fp" assert ar.act_data_type == "mx_fp" - ar.quantize() def test_vllm(self, tiny_qwen_vl_model_path): from auto_round import AutoRoundMLLM ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) + ar._post_init() assert ar.bits == 2 assert ar.act_bits == 16 def test_nvfp4(self, tiny_opt_model_path, dataloader): ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + ar.quantize() assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "nv_fp" assert ar.act_data_type == "nv_fp4_with_static_gs" - ar.quantize() def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): import copy diff --git a/test/test_cpu/utils/test_generation.py b/test/test_cpu/utils/test_generation.py index fed11dbbe..245ffcc72 100644 --- a/test/test_cpu/utils/test_generation.py +++ b/test/test_cpu/utils/test_generation.py @@ -11,6 +11,7 @@ class TestAutoRoundFormatGeneration: + @classmethod def setup_class(self): self.model_name = opt_name_or_path @@ -42,7 +43,6 @@ def test_4bits_sym(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round", inplace=False ) - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="ipex") model = AutoModelForCausalLM.from_pretrained( @@ -83,7 +83,6 @@ def test_autoround_sym(self, dataloader): quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", trust_remote_code=True diff --git a/test/test_cuda/backends/test_torch_backend.py b/test/test_cuda/backends/test_torch_backend.py index 28e2166e6..16f7dfcae 100644 --- a/test/test_cuda/backends/test_torch_backend.py +++ b/test/test_cuda/backends/test_torch_backend.py @@ -47,7 +47,6 @@ def test_torch_4bits_asym(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round:gptqmodel" ) - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( @@ -92,7 +91,6 @@ def test_torch_4bits_sym(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round" ) ##will convert to gptq model - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cuda/backends/test_triton_backend.py b/test/test_cuda/backends/test_triton_backend.py index c4eb3317b..b99e68ef8 100644 --- a/test/test_cuda/backends/test_triton_backend.py +++ b/test/test_cuda/backends/test_triton_backend.py @@ -12,6 +12,7 @@ class TestAutoRoundTritonBackend: + @classmethod def setup_class(self): self.model_name = "/models/opt-125m" @@ -41,7 +42,6 @@ def test_tritonv2_4bits_asym(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round:gptqmodel" ) - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( @@ -75,7 +75,6 @@ def test_tritonv2_2bits_asym(self): autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) quantized_model_path = self.save_folder _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( @@ -118,7 +117,6 @@ def test_tritonv2_4bits_sym(self, dataloader): ) quantized_model_path = self.save_folder _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( @@ -153,7 +151,6 @@ def test_tritonv2_8bits_sym(self): autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, nsamples=1, iters=1) quantized_model_path = self.save_folder _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( @@ -193,7 +190,6 @@ def test_tritonv2_2bits_sym(self): ) quantized_model_path = self.save_folder _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cuda/export/test_auto_round_format.py b/test/test_cuda/export/test_auto_round_format.py index f936474f5..ba94ebda9 100644 --- a/test/test_cuda/export/test_auto_round_format.py +++ b/test/test_cuda/export/test_auto_round_format.py @@ -53,7 +53,6 @@ def test_autoround_asym(self, tiny_opt_model_path, dataloader): quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cuda:0", trust_remote_code=True @@ -81,7 +80,7 @@ def test_mixed_precision(self): autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] + quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config @@ -110,7 +109,6 @@ def test_awq_backend(self): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round:auto_awq" ) - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( @@ -160,7 +158,6 @@ def test_autoround_gptq_sym_format(self, tiny_opt_model_path, dataloader): quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) - quantized_model_path = quantized_model_path[0] from transformers import AutoRoundConfig @@ -215,7 +212,6 @@ def test_autoround_awq_sym_format(self, tiny_opt_model_path, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round:auto_awq" ) - quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) @@ -253,7 +249,6 @@ def test_autoround_sym(self, tiny_opt_model_path, dataloader): quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", trust_remote_code=True diff --git a/test/test_cuda/export/test_export.py b/test/test_cuda/export/test_export.py index 7e93c1a56..8b15d234d 100644 --- a/test/test_cuda/export/test_export.py +++ b/test/test_cuda/export/test_export.py @@ -117,7 +117,6 @@ def test_autogptq_format_qsave_ignore_layers(self, dataloader): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") - quantized_model_path = quantized_model_path[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) @@ -220,7 +219,7 @@ def test_autoawq_format_fp_qsave_layers(self, dataloader): ) quantized_model_path = "./saved/test_export" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") - quantized_model_path = quantized_model_path[0] + from auto_round import AutoRoundConfig model = AutoModelForCausalLM.from_pretrained( @@ -319,7 +318,7 @@ def test_awq_lmhead_export(self, dataloader): compressed_model, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_awq" ) - quantized_model_path = quantized_model_path[0] + lm_head = compressed_model.lm_head from auto_round.export.export_to_awq.utils import WQLinear_GEMM @@ -349,7 +348,7 @@ def test_gptq_lmhead_export(self, tiny_qwen_model_path, dataloader): compressed_model, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_gptq" ) - quantized_model_path = quantized_model_path[0] + lm_head = compressed_model.lm_head assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer" quantization_config = AutoRoundConfig() diff --git a/test/test_cuda/export/test_gguf.py b/test/test_cuda/export/test_gguf.py index d144241a3..5cadcee34 100644 --- a/test/test_cuda/export/test_gguf.py +++ b/test/test_cuda/export/test_gguf.py @@ -155,7 +155,7 @@ def test_special_model(self): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - quantized_model_path = quantized_model_path[0] + file_name = os.listdir(quantized_model_path)[0] file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 assert abs(file_size - 307) < 5.0 @@ -180,7 +180,7 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m") - quantized_model_path = quantized_model_path[0] + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) for file in os.listdir(quantized_model_path): print(f"{file}: {os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2} MB") diff --git a/test/test_cuda/integrations/test_transformers.py b/test/test_cuda/integrations/test_transformers.py index dc6411ab0..9484daaa0 100644 --- a/test/test_cuda/integrations/test_transformers.py +++ b/test/test_cuda/integrations/test_transformers.py @@ -201,7 +201,7 @@ def test_mixed_bits(self): autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) with tempfile.TemporaryDirectory() as tmpdirname: _, quantized_model_path = autoround.quantize_and_save(output_dir=tmpdirname) - quantized_model_path = quantized_model_path[0] + model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype=torch.float16, device_map="cuda" ) diff --git a/test/test_cuda/quantization/test_2_3bits.py b/test/test_cuda/quantization/test_2_3bits.py index e0591b464..1f2e5e812 100644 --- a/test/test_cuda/quantization/test_2_3bits.py +++ b/test/test_cuda/quantization/test_2_3bits.py @@ -50,7 +50,6 @@ def test_3bits_autoround(self): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round" ) ##will convert to gptq model - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( @@ -69,7 +68,7 @@ def test_3bits_asym_autoround(self): bits, sym = 3, False autoround = AutoRound(model_name, bits=bits, sym=sym) _, quantized_model_path = autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False) - quantized_model_path = quantized_model_path[0] + model_args = f"pretrained={quantized_model_path}" res = simple_evaluate( model="hf", diff --git a/test/test_cuda/quantization/test_mix_bits.py b/test/test_cuda/quantization/test_mix_bits.py index 15197538b..c6164f4b5 100644 --- a/test/test_cuda/quantization/test_mix_bits.py +++ b/test/test_cuda/quantization/test_mix_bits.py @@ -53,7 +53,7 @@ def test_mixed_gptqmodel(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") - quantized_model_path = quantized_model_path[0] + from gptqmodel import GPTQModel model = GPTQModel.load(quantized_model_path) @@ -81,7 +81,7 @@ def test_mixed_gptqmodel_convert_to_ar(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") - quantized_model_path = quantized_model_path[0] + quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", quantization_config=quantization_config @@ -110,7 +110,7 @@ def test_mixed_autoround_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = "self.save_dir" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - quantized_model_path = quantized_model_path[0] + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 @@ -137,7 +137,7 @@ def test_fallback_regex_for_awq_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = "self.save_dir" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") - quantized_model_path = quantized_model_path[0] + quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", quantization_config=quantization_config @@ -228,7 +228,7 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, inplace=False, format="auto_round" ) - quantized_model_path = quantized_model_path[0] + model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype="auto", @@ -305,7 +305,7 @@ def test_mixed_llmcompressor_format_vllm(self, tiny_opt_model_path, dataloader): compressed, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, inplace=False, format="llm_compressor" ) - quantized_model_path = quantized_model_path[0] + from vllm import LLM, SamplingParams # Sample prompts. diff --git a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py index 407cf7e42..0c1cbf007 100644 --- a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py +++ b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py @@ -53,7 +53,6 @@ def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path): # Quantize and save the model to the temporary directory quantized_model_path = f"{temp_dir}/tmp_autoround_{scheme}" _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) - quantized_model_path = quantized_model_path[0] # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cuda/quantization/test_mxfp_nvfp.py b/test/test_cuda/quantization/test_mxfp_nvfp.py index da68942b8..c818f6c1b 100644 --- a/test/test_cuda/quantization/test_mxfp_nvfp.py +++ b/test/test_cuda/quantization/test_mxfp_nvfp.py @@ -67,7 +67,7 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): compressed_model, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="llm_compressor" ) - quantized_model_path = quantized_model_path[0] + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -156,7 +156,7 @@ def test_qwen_moe_quant_infer(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, inplace=False, format="auto_round" ) - quantized_model_path = quantized_model_path[0] + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model diff --git a/test/test_cuda/schemes/test_scheme.py b/test/test_cuda/schemes/test_scheme.py index c5ba7d4fc..a635d2e59 100644 --- a/test/test_cuda/schemes/test_scheme.py +++ b/test/test_cuda/schemes/test_scheme.py @@ -121,7 +121,7 @@ def test_q2k_mixed(self): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") - quantized_model_path = quantized_model_path[0] + gguf_file = os.listdir(quantized_model_path)[0] file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2 assert abs(file_size - 1236) < 5.0 diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index a5959396d..30627f50a 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -12,6 +12,7 @@ class TestAutoRoundXPU: + @classmethod def setup_class(self): self.device = "xpu" @@ -43,7 +44,6 @@ def test_gptq_format(self, dataloader): ) quantized_model_path = "./saved" _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path) - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="auto") @@ -78,7 +78,6 @@ def test_awq_format(self, dataloader): _, quantized_model_path = autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round:auto_awq" ) - quantized_model_path = quantized_model_path[0] quantization_config = AutoRoundConfig(backend="auto") # device_map="auto" doesn't work, must use "xpu" From 2c9244598a7284166cd9651ea1e556f4e9c96b3d Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 29 Jan 2026 10:56:00 +0800 Subject: [PATCH 11/14] fix Signed-off-by: n1ck-guo --- test/test_cpu/backends/test_torch_backend.py | 4 +-- test/test_cpu/core/test_autoround.py | 12 +++---- test/test_cuda/advanced/test_fp8_input.py | 32 ++++++++--------- test/test_cuda/quantization/test_asym.py | 36 ++++++++++---------- test/test_cuda/schemes/test_auto_scheme.py | 10 +++--- test/test_cuda/utils/test_alg_ext.py | 6 ++-- 6 files changed, 50 insertions(+), 50 deletions(-) diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py index b47dfe48d..6961ddaa3 100644 --- a/test/test_cpu/backends/test_torch_backend.py +++ b/test/test_cpu/backends/test_torch_backend.py @@ -55,10 +55,10 @@ def test_torch_4bits_asym(self, dataloader): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config + quantized_model_path, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py index f565dabec..b1e3e2d05 100644 --- a/test/test_cpu/core/test_autoround.py +++ b/test/test_cpu/core/test_autoround.py @@ -741,8 +741,8 @@ def test_invalid_layer_config(self, tiny_opt_model_path): def test_quant_lm_head(self, tiny_untied_qwen_model_path): model_name = tiny_untied_qwen_model_path ar = AutoRound(model_name, quant_lm_head=True, iters=0, seqlen=8, nsamples=1, disable_opt_rtn=True) - ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 @@ -756,8 +756,8 @@ def test_quant_lm_head(self, tiny_untied_qwen_model_path): disable_opt_rtn=True, layer_config=layer_config, ) - ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 @@ -773,8 +773,8 @@ def test_quant_lm_head_layer_config(self, tiny_untied_qwen_model_path): disable_opt_rtn=True, layer_config=layer_config, ) - ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 diff --git a/test/test_cuda/advanced/test_fp8_input.py b/test/test_cuda/advanced/test_fp8_input.py index ec3dc6bf3..918291ce2 100644 --- a/test/test_cuda/advanced/test_fp8_input.py +++ b/test/test_cuda/advanced/test_fp8_input.py @@ -38,8 +38,8 @@ def setup_and_teardown_class(self): def test_small_model_rtn_generation(self): model, tokenizer = self.tiny_fp8_model() ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) - ar.quantize_and_save(output_dir=self.save_dir) - model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -49,7 +49,7 @@ def test_small_model_rtn_generation(self): def test_gguf_imatrix(self): model, tokenizer = self.tiny_fp8_model() ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) - ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir) # from llama_cpp import Llama # # gguf_file = os.listdir("saved/Qwen3-0.6B-FP8/-gguf")[0] @@ -66,8 +66,8 @@ def test_gguf_imatrix(self): def test_small_model_rtn(self): model_name = get_model_path("qwen/Qwen3-0.6B-FP8") ar = AutoRound(model=model_name, iters=0) - _, folder = ar.quantize_and_save(output_dir=self.save_dir) - model_args = f"pretrained={self.save_dir}" + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir) + model_args = f"pretrained={quantized_model_path}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.25 @@ -77,8 +77,8 @@ def test_small_model_rtn(self): def test_small_model_iters1(self): model_name = get_model_path("qwen/Qwen3-0.6B-FP8") ar = AutoRound(model=model_name, iters=1) - _, folder = ar.quantize_and_save(output_dir=self.save_dir) - model_args = f"pretrained={self.save_dir}" + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir) + model_args = f"pretrained={quantized_model_path}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.25 @@ -88,8 +88,8 @@ def test_small_model_iters1(self): def test_medium_model_rtn(self): model_name = get_model_path("qwen/Qwen3-0.6B-FP8") ar = AutoRound(model=model_name, iters=0) - _, folder = ar.quantize_and_save(output_dir=self.save_dir) - model_args = f"pretrained={self.save_dir}" + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir) + model_args = f"pretrained={quantized_model_path}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.33 @@ -100,8 +100,8 @@ def test_medium_model_rtn_with_lm_head(self): model_name = get_model_path("qwen/Qwen3-0.6B-FP8") layer_config = {"lm_head": {"bits": 4}} ar = AutoRound(model=model_name, iters=0, layer_config=layer_config) - _, folder = ar.quantize_and_save(output_dir=self.save_dir) - model_args = f"pretrained={self.save_dir}" + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir) + model_args = f"pretrained={quantized_model_path}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.33 @@ -113,8 +113,8 @@ def test_fp8_model_gguf(self): model, tokenizer = self.tiny_fp8_model() ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) - ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0") - for file in os.listdir(self.save_dir): + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0") + for file in os.listdir(quantized_model_path): if file.endswith(".gguf"): gguf_file = file llm = Llama(f"saved/{gguf_file}", n_gpu_layers=-1) @@ -124,8 +124,8 @@ def test_fp8_model_gguf(self): model, tokenizer = self.tiny_fp8_model() ar = AutoRound(model=model, tokenizer=tokenizer, iters=1) - ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s") - for file in os.listdir(self.save_dir): + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s") + for file in os.listdir(quantized_model_path): if file.endswith(".gguf"): gguf_file = file llm = Llama(f"saved/{gguf_file}", n_gpu_layers=-1) @@ -139,5 +139,5 @@ def test_diff_datatype(self): for iters in [0, 1]: print(f"Testing scheme: {scheme}, iters: {iters}") ar = AutoRound(model_name, iters=iters, scheme=scheme) - ar.quantize_and_save(output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/quantization/test_asym.py b/test/test_cuda/quantization/test_asym.py index d55934898..7ac1487df 100644 --- a/test/test_cuda/quantization/test_asym.py +++ b/test/test_cuda/quantization/test_asym.py @@ -45,17 +45,17 @@ def test_asym_group_size(self, tiny_opt_model_path): ar = AutoRound( tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 ) - ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_dir, + # quantized_model_path, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) # model_infer(model, tokenizer) shutil.rmtree(self.save_dir, ignore_errors=True) @@ -65,17 +65,17 @@ def test_asym_bits(self, tiny_opt_model_path): ar = AutoRound( tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 ) - ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_dir, + # quantized_model_path, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) # model_infer(model, tokenizer) shutil.rmtree(self.save_dir, ignore_errors=True) @@ -87,15 +87,15 @@ def test_asym_format(self, tiny_opt_model_path): tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 ) # TODO when ark is ready, uncomment the following lines to do inference test - ar.quantize_and_save(format=format, output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(format=format, output_dir=self.save_dir) # model = AutoModelForCausalLM.from_pretrained( - # self.save_dir, + # quantized_model_path, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) # model_infer(model, tokenizer) shutil.rmtree(self.save_dir, ignore_errors=True) @@ -105,17 +105,17 @@ def test_asym_group_size_with_tuning(self, tiny_opt_model_path): ar = AutoRound( tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 ) - ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_dir, + # quantized_model_path, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) # model_infer(model, tokenizer) shutil.rmtree(self.save_dir, ignore_errors=True) @@ -125,17 +125,17 @@ def test_asym_bits_with_tuning(self, tiny_opt_model_path): ar = AutoRound( tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 ) - ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_dir, + # quantized_model_path, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) # model_infer(model, tokenizer) shutil.rmtree(self.save_dir, ignore_errors=True) @@ -147,14 +147,14 @@ def test_asym_format_with_tuning(self, tiny_opt_model_path): tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 ) # TODO when ark is ready, uncomment the following lines to do inference test - ar.quantize_and_save(format=format, output_dir=self.save_dir) + _, quantized_model_path = ar.quantize_and_save(format=format, output_dir=self.save_dir) # model = AutoModelForCausalLM.from_pretrained( - # self.save_dir, + # quantized_model_path, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) # model_infer(model, tokenizer) shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/schemes/test_auto_scheme.py b/test/test_cuda/schemes/test_auto_scheme.py index b91c98428..e15137533 100644 --- a/test/test_cuda/schemes/test_auto_scheme.py +++ b/test/test_cuda/schemes/test_auto_scheme.py @@ -242,8 +242,8 @@ def test_auto_scheme_export(self, tiny_qwen_model_path): model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16")) ar = AutoRound(model=model_name, scheme=scheme) - ar.quantize_and_save(self.save_dir) - model_args = f"pretrained={self.save_dir}" + _, quantized_model_path = ar.quantize_and_save(self.save_dir) + model_args = f"pretrained={quantized_model_path}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.25 @@ -251,15 +251,15 @@ def test_auto_scheme_export(self, tiny_qwen_model_path): scheme = AutoScheme(avg_bits=3, options=("gguf:q2_k_s,gguf:q4_k_s"), nsamples=1, ignore_scale_zp_bits=True) ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1) - ar.quantize_and_save(self.save_dir) + _, quantized_model_path = ar.quantize_and_save(self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) def test_enable_torch_compile(self): model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True) ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True) - ar.quantize_and_save(self.save_dir) - model_args = f"pretrained={self.save_dir}" + _, quantized_model_path = ar.quantize_and_save(self.save_dir) + model_args = f"pretrained={quantized_model_path}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.10 diff --git a/test/test_cuda/utils/test_alg_ext.py b/test/test_cuda/utils/test_alg_ext.py index a29bffdac..2708574b9 100644 --- a/test/test_cuda/utils/test_alg_ext.py +++ b/test/test_cuda/utils/test_alg_ext.py @@ -33,13 +33,13 @@ def setup_and_teardown_class(self): def test_2bits(self): model_name = get_model_path("facebook/opt-125m") ar = AutoRound(model=model_name, bits=2, group_size=64, enable_alg_ext=True) - ar.quantize_and_save(self.save_folder) + _, quantized_model_path = ar.quantize_and_save(self.save_folder) model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + quantized_model_path, device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) # wo alg ext 0.2078, with 0.2371 From ad8b046e966601b444081f899efe92e2a2a12b69 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 29 Jan 2026 16:34:42 +0800 Subject: [PATCH 12/14] fix Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 1 + test/test_cpu/export/test_gguf_format.py | 4 ++-- test/test_cpu/quantization/test_mxfp_nvfp.py | 6 ++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 890124f39..d1bbc478b 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2742,6 +2742,7 @@ def quantize_block( """ # TODO: release below assertion after supporting MLLM and diffusion model quantization with quantize_block + self._post_init() assert self.__class__.__name__ not in [ "DiffusionCompressor", "MLLMCompressor", diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py index 636b180e7..f4aea0479 100644 --- a/test/test_cpu/export/test_gguf_format.py +++ b/test/test_cpu/export/test_gguf_format.py @@ -175,7 +175,7 @@ def test_q4_k_m(self, dataloader): assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3 assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8 assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0" - shutil.rmtree(quantized_model_path, ignore_errors=True) + shutil.rmtree("./saved", ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False) @@ -184,7 +184,7 @@ def test_q4_k_m(self, dataloader): output_dir=quantized_model_path, format="gguf:q4_k_m,fake" ) - shutil.rmtree(quantized_model_path, ignore_errors=True) + shutil.rmtree("./saved", ignore_errors=True) def test_all_format(self, tiny_qwen_model_path): model_name = tiny_qwen_model_path diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py index e061d5b3b..08fc6c79a 100644 --- a/test/test_cpu/quantization/test_mxfp_nvfp.py +++ b/test/test_cpu/quantization/test_mxfp_nvfp.py @@ -333,7 +333,7 @@ def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir autoround.quantize() - compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") + compressed_model, _ = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -364,7 +364,9 @@ def test_qwen_moe_quant_infer(self, tiny_qwen_moe_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=True, format="auto_round" + ) assert is_model_outputs_similar(model_name, quantized_model_path) shutil.rmtree(self.save_dir, ignore_errors=True) From c67dc41688c433c839c1432f975ccd4618414890 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 29 Jan 2026 17:00:46 +0800 Subject: [PATCH 13/14] fix Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index d1bbc478b..1d2f4a645 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -730,8 +730,6 @@ def _parse_and_set(scheme): scheme = _parse_and_set(scheme) is_auto_scheme = False - scheme_keys = [f.name for f in fields(QuantizationScheme)] - return scheme, is_auto_scheme def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: @@ -1572,6 +1570,9 @@ def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True) else: self.ignore_layers += "," + tmp_str + if self.is_auto_scheme: + self.layer_config = self._gen_auto_scheme(self.model, self.orig_scheme, self.dataset, self.device_map) + fill_default_value = True if self.is_auto_scheme: fill_default_value = False From b54e9342293826aa62fdc001960729e19198ef74 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 30 Jan 2026 02:35:56 +0000 Subject: [PATCH 14/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 539062d2c..97e9b498e 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -302,7 +302,7 @@ def __init__( self.shared_cache_keys = get_shared_keys(self.model) self.layer_config = layer_config - + self.supported_types = SUPPORTED_LAYER_TYPES self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES self.quant_lm_head = kwargs.pop("quant_lm_head", False) @@ -340,7 +340,7 @@ def __init__( if envs.AR_USE_MODELSCOPE: platform = "model_scope" self.platform = platform - + self.ignore_layers = kwargs.pop("ignore_layers", "") self.low_cpu_mem_usage = low_cpu_mem_usage