diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 5b9616f65..d00a27a22 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -693,33 +693,7 @@ def tune(args): trust_remote_code=not args.disable_trust_remote_code, ) - model_name = args.model.rstrip("/") - - if model_name.split("/")[-1].strip(".") == "" and "gguf" not in args.format: - if autoround.group_size <= 0: - if "fp" in autoround.act_data_type: - suffix = f"afp{autoround.act_bits}" - else: - suffix = f"a{autoround.act_bits}" - else: - suffix = f"g{autoround.group_size}" - export_dir = os.path.join(args.output_dir, f"w{autoround.bits}{suffix}") - elif model_name.split("/")[-1].strip(".") == "" and "gguf" in args.format: - export_dir = args.output_dir - elif model_name.split("./")[-1].strip("./") != "" and "gguf" in args.format: - export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + "-gguf") - else: - if autoround.group_size <= 0: - if "fp" in autoround.act_data_type: - suffix = f"afp{autoround.act_bits}" - else: - suffix = f"a{autoround.act_bits}" - else: - suffix = f"g{autoround.group_size}" - export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + f"-w{autoround.bits}{suffix}") - - # ======================= Quantize and save model ======================= - model, folders = autoround.quantize_and_save(export_dir, format=args.format) # pylint: disable=E1101 + model, folders = autoround.quantize_and_save(args.output_dir, format=args.format) # pylint: disable=E1101 tokenizer = autoround.tokenizer # pylint: disable=E1101 model.eval() diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 57e771478..97e9b498e 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -307,20 +307,18 @@ def __init__( self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES self.quant_lm_head = kwargs.pop("quant_lm_head", False) - # should be set after loading model and set layer_config, cause some special scheme need these. - # Preserve the original, unparsed scheme for later use in auto scheme generation - # within `configure_layer_config` (which may need the raw value instead of `self.scheme`). + self.scheme = scheme self.orig_scheme = copy.deepcopy(scheme) - self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(scheme, kwargs) - - gguf_scheme_name = get_gguf_scheme(self.scheme) - # GGUF uses fp32 scale dtype as default - scale_dtype = kwargs.pop("scale_dtype", None) - if scale_dtype is None: - scale_dtype = "fp32" if gguf_scheme_name else "fp16" + self.is_auto_scheme = True if isinstance(scheme, AutoScheme) else False + self.scale_dtype = kwargs.pop("scale_dtype", None) # Extra/legacy kwargs for backward compatibility # Major version releases may pack them with extra configuration options + scheme_keys = [f.name for f in fields(QuantizationScheme)] + for key in scheme_keys: + if key in kwargs and kwargs[key] is not None: + setattr(self, key, kwargs.pop(key)) + amp = kwargs.pop("amp", True) lr = kwargs.pop("lr", None) enable_minmax_tuning = kwargs.pop("enable_minmax_tuning", True) @@ -344,10 +342,9 @@ def __init__( self.platform = platform self.ignore_layers = kwargs.pop("ignore_layers", "") - self.supported_types = SUPPORTED_LAYER_TYPES - self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES - self.scale_dtype = convert_dtype_str2torch(scale_dtype) + self.low_cpu_mem_usage = low_cpu_mem_usage + self.block_forward = block_forward if kwargs: logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. 
Please check them.") @@ -379,16 +376,10 @@ def __init__( self.device_map = device_map if isinstance(self.device_map, str): self.device_map = self.device_map.replace(" ", "") - - self.device_list = parse_available_devices(device_map) - - # Set device, must place after model loading - self.device = get_major_device(device_map) - set_non_auto_device_map(self.model, self.device_map) + self.device = get_major_device(self.device_map) # Tuning hyperparameters self.seed = seed - set_seed(self.seed) self.amp = amp self.enable_quanted_input = enable_quanted_input self.enable_minmax_tuning = enable_minmax_tuning @@ -423,24 +414,7 @@ def __init__( if enable_opt_rtn: disable_opt_rtn = False self.orig_disable_opt_rtn = disable_opt_rtn - - if self.iters != 0 and self.orig_disable_opt_rtn is not None: - logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.") - disable_opt_rtn = True - if ( - self.bits >= 8 - and self.act_bits >= 16 - and self.iters == 0 - and self.data_type == "int" - and disable_opt_rtn is None - ): - logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.") - disable_opt_rtn = True - if disable_opt_rtn is None and self.iters == 0: - logger.info( - "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy." - ) - disable_opt_rtn = False + self.disable_opt_rtn = disable_opt_rtn # Important Note! This is not very robust, do NOT rely on it to do high risky thing self.is_moe_model = is_moe_model(self.model) @@ -451,7 +425,6 @@ def __init__( self.dynamic_max_gap = dynamic_max_gap self.lr_scheduler = lr_scheduler self.optimizer = self._get_optimizer(None) - self.disable_opt_rtn = disable_opt_rtn # Whether to pack the layer immediately after tuning self.is_immediate_packing = False @@ -467,8 +440,78 @@ def __init__( if self.static_attention_dtype is not None: logger.warning("The static attention dtype is experimental and currently has limited support.") - self._set_amp_dtype() self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device + + self.batch_dim = None + self.infer_bs_coeff = 1 + + # after setting iters + self.enable_torch_compile = enable_torch_compile + + self.attention_mask = [] + self.wrapper_block = wrapper_block + + torch.set_printoptions(precision=3, sci_mode=True) + + self._post_inited = False + + def _post_init(self) -> None: + """Post-initialization for AutoRound.""" + if self._post_inited: + return + + # should be set after loading model and set layer_config, cause some special scheme need these. + # Preserve the original, unparsed scheme for later use in auto scheme generation + # within `configure_layer_config` (which may need the raw value instead of `self.scheme`). 
+ self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(self.scheme) + + # GGUF uses fp32 scale dtype as default + if self.scale_dtype is None: + gguf_scheme_name = get_gguf_scheme(self.scheme) + scale_dtype = "fp32" if gguf_scheme_name else "fp16" + else: + scale_dtype = self.scale_dtype + self.scale_dtype = convert_dtype_str2torch(scale_dtype) + + predefined_ignore_layers = get_predefined_ignore_layers(self.model) + + if predefined_ignore_layers: + logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}") + tmp_str = ",".join(predefined_ignore_layers) + if self.ignore_layers == "": + self.ignore_layers = tmp_str + else: + self.ignore_layers += "," + tmp_str + + # Set device, must place after model loading + self._set_device(self.device_map) + set_non_auto_device_map(self.model, self.device_map) + self.device_list = parse_available_devices(self.device_map) + + if self.iters != 0 and self.orig_disable_opt_rtn is not None: + logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.") + self.disable_opt_rtn = True + if ( + self.bits >= 8 + and self.act_bits >= 16 + and self.iters == 0 + and self.data_type == "int" + and self.disable_opt_rtn is None + ): + logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.") + self.disable_opt_rtn = True + if self.disable_opt_rtn is None and self.iters == 0: + logger.info( + "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy." + ) + self.disable_opt_rtn = False + + set_seed(self.seed) + self._set_amp_dtype() + self._adjust_torch_compile(self.enable_torch_compile) + if self.enable_torch_compile: + self.block_forward = compile_func(self.block_forward, self.device) + if self.act_bits <= 8 and self.amp_dtype == torch.float16: logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization") self.amp_dtype = torch.bfloat16 @@ -480,24 +523,16 @@ def __init__( # Some helpers if "hpu" in str(self.device): self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear") - self.batch_dim = None - self.infer_bs_coeff = 1 - # after setting iters - self.enable_torch_compile = enable_torch_compile - self._adjust_torch_compile(enable_torch_compile) - - self.block_forward = compile_func(block_forward, self.device) if self.enable_torch_compile else block_forward self._check_configs() - torch.set_printoptions(precision=3, sci_mode=True) + + if isinstance(self.scheme, AutoScheme): + self.layer_config = self._gen_auto_scheme(self.model, self.orig_scheme, self.dataset, self.device_map) if is_hpex_available(): logger.info("habana_frameworks is available, import htcore explicitly.") import habana_frameworks.torch.core as htcore # pylint: disable=E0401 - self.attention_mask = [] - - self.wrapper_block = wrapper_block if self.enable_alg_ext: try: logger.warning_once("using algorithm extension for quantization.") @@ -506,6 +541,7 @@ def __init__( wrapper_autoround(self) except (ImportError, ModuleNotFoundError): logger.error("algorithm extension import error, fallback to default mode") + self._post_inited = True def _gen_auto_scheme( self, model: torch.nn.Module, scheme: AutoScheme, dataset: str, device_map: Union[str, int, dict, torch.device] @@ -612,18 +648,18 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None: raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}") def _parse_and_set_scheme( - self, 
scheme: Union[str, dict, QuantizationScheme], kwargs + self, + scheme: Union[str, dict, QuantizationScheme], ) -> tuple[QuantizationScheme, bool]: """Parse and set the quantization scheme.""" - def _parse_and_set(scheme, kwargs): - if kwargs.get("data_type", None) and kwargs["data_type"].endswith("_dq") and not scheme.startswith("gguf"): - if "bits" not in kwargs: - data_type = kwargs["data_type"] + def _parse_and_set(scheme): + if getattr(self, "data_type", None) and self.data_type.endswith("_dq") and not scheme.startswith("gguf"): + if not hasattr(self, "bits") or self.bits is None: raise KeyError( - f"please set bits when setting data_type={data_type}, or using scheme as an alternative." + f"please set bits when setting data_type={self.data_type}, or using scheme as an alternative." ) - bits = kwargs["bits"] + bits = self.bits scheme = f"gguf:q{bits}_k" if bits == 6 else f"gguf:q{bits}_k_s" res = None if isinstance(scheme, QuantizationScheme): @@ -649,11 +685,10 @@ def _parse_and_set(scheme, kwargs): scheme = asdict(preset_name_to_scheme(scheme)) scheme_keys = [f.name for f in fields(QuantizationScheme)] for key in scheme_keys: - if key in kwargs and kwargs[key] is not None: - setattr(self, key, kwargs[key]) + if hasattr(self, key) and getattr(self, key) is not None: + continue else: setattr(self, key, scheme.get(key, None)) - # kwargs.pop(key, None) if self.act_dynamic is None: self.act_dynamic = True @@ -709,7 +744,7 @@ def _parse_and_set(scheme, kwargs): raise ValueError("options of AutoScheme must not be empty") options = [] for option in scheme.options: - new_option = _parse_and_set(option, kwargs) + new_option = _parse_and_set(option) options.append(new_option) scheme.options = options for opt in options: @@ -721,16 +756,12 @@ def _parse_and_set(scheme, kwargs): self.scheme = opt # Choose the first one that not 16 bits break # apply scheme to set default bits - scheme = _parse_and_set(self.scheme, kwargs) + scheme = _parse_and_set(self.scheme) is_auto_scheme = True else: - scheme = _parse_and_set(scheme, kwargs) + scheme = _parse_and_set(scheme) is_auto_scheme = False - scheme_keys = [f.name for f in fields(QuantizationScheme)] - for key in scheme_keys: - kwargs.pop(key, None) - return scheme, is_auto_scheme def _adjust_torch_compile(self, enable_torch_compile: bool) -> None: @@ -888,6 +919,29 @@ def quantize_and_save( Raises: ValueError: If an unsupported format is specified. """ + # post init + self._post_init() + + name_or_path = self.model.name_or_path.rstrip("/") + model_name = name_or_path.split("/")[-1] + if model_name.strip(".") == "" and "gguf" not in format: + if self.group_size <= 0: + suffix = f"afp{self.act_bits}" if "fp" in self.act_data_type else f"a{self.act_bits}" + else: + suffix = f"g{self.group_size}" + export_dir = os.path.join(output_dir, f"w{self.bits}{suffix}") + elif model_name.strip(".") == "" and "gguf" in format: + export_dir = output_dir + elif model_name.strip(".") != "" and "gguf" in format: + export_dir = os.path.join(output_dir, model_name + "-gguf") + else: + if self.group_size <= 0: + suffix = f"afp{self.act_bits}" if "fp" in self.act_data_type else f"a{self.act_bits}" + else: + suffix = f"g{self.group_size}" + export_dir = os.path.join(output_dir, model_name + f"-w{self.bits}{suffix}") + + output_dir = export_dir # Validate and process the specified formats self.orig_output_dir = output_dir @@ -1637,6 +1691,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: Returns: The quantized model and layer configurations. 
""" + # post init + self._post_init() self._check_compatibility() formats = self.formats if hasattr(self, "formats") else None @@ -2719,6 +2775,7 @@ def quantize_block( """ # TODO: release below assertion after supporting MLLM and diffusion model quantization with quantize_block + self._post_init() assert self.__class__.__name__ not in [ "DiffusionCompressor", "MLLMCompressor", @@ -3141,7 +3198,7 @@ def save_quantized( output_dir: str = None, format: Union[str, list[OutputFormat]] = "auto_round", inplace: bool = True, - return_folders=False, + return_folders=True, **kwargs, ) -> torch.nn.Module: """Save the quantized model to the specified output directory in the specified format. @@ -3155,6 +3212,7 @@ def save_quantized( Returns: object: The compressed model object. """ + self.orig_output_dir = output_dir if isinstance(format, str) and getattr(self, "formats", None) is None: formats = get_formats(format, self) @@ -3194,7 +3252,7 @@ def save_quantized( folders.append(save_folder) if return_folders: - return compressed_model, folders + return compressed_model, folders[0] if len(folders) == 1 else folders else: return compressed_model diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index cff49b371..0bc6dbd3a 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -380,7 +380,7 @@ def run_model_evaluation(model, tokenizer, autoround, folders, formats, device_s return # Check if evaluation is needed for language models - eval_folder = folders[-1] if folders else None + eval_folder = folders[-1] if folders and isinstance(folders, list) else folders if args.tasks is None or args.tasks == "" or eval_folder is None: return diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py index 24836d85b..5c426f7c0 100644 --- a/auto_round/wrapper.py +++ b/auto_round/wrapper.py @@ -117,7 +117,7 @@ def __init__( self.enable_round_tuning = enable_round_tuning self.enable_torch_compile = enable_torch_compile self.enable_norm_bias_tuning = enable_norm_bias_tuning and (orig_layer.bias is not None) - self.enable_act_quant = self.orig_layer.act_bits <= 8 + self.enable_act_quant = self.orig_layer.act_bits <= 8 if self.orig_layer.act_bits is not None else False self.weight_global_scale = getattr(self.orig_layer, "weight_global_scale", None) if is_nv_fp(self.orig_layer.data_type) and self.weight_global_scale is None: from auto_round.data_type.nvfp import calculate_gparam diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index c54a57bd3..dd5119334 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -40,7 +40,9 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t else: autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format=format) ##will convert to gptq model + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format=format + ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py index ce4bc0049..6961ddaa3 100644 --- a/test/test_cpu/backends/test_torch_backend.py +++ b/test/test_cpu/backends/test_torch_backend.py @@ -38,14 +38,16 @@ def test_torch_4bits_asym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder 
- autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:gptqmodel" + ) quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) @@ -53,10 +55,10 @@ def test_torch_4bits_asym(self, dataloader): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config + quantized_model_path, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) @@ -79,17 +81,19 @@ def test_torch_4bits_sym(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py index 42a7dc83f..db42f5d00 100644 --- a/test/test_cpu/core/test_autoround.py +++ b/test/test_cpu/core/test_autoround.py @@ -14,6 +14,7 @@ class TestAutoRound: + @classmethod def setup_class(self): model_name = opt_name_or_path @@ -382,16 +383,16 @@ def test_rtn(self, tiny_opt_model_path): bits, group_size, sym = 4, 128, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, nsamples=1) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + quantized_model_path, torch_dtype=torch.float16, device_map="auto", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) model_infer(model, tokenizer) 
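Editor's note: as in the surrounding backend tests, an exported checkpoint is reloaded with an explicit kernel backend by passing an `AutoRoundConfig` as `quantization_config`. A short sketch, assuming the path below is a folder previously returned by `quantize_and_save`:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRoundConfig  # import path as used by the auto_round package

save_path = "./saved/opt-125m-w4g128"  # placeholder: folder returned by quantize_and_save

# Select the torch backend explicitly, mirroring the tests above.
quantization_config = AutoRoundConfig(backend="torch")
model = AutoModelForCausalLM.from_pretrained(
    save_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(save_path)
```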
- shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_embed_quant(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True @@ -437,7 +438,10 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader): autoround.quantize() quantized_model_path = self.save_folder - autoround.save_quantized(output_dir=quantized_model_path, format="auto_round", inplace=True) + _, quantized_model_path = autoround.save_quantized( + output_dir=quantized_model_path, format="auto_round", inplace=True + ) + quantization_config = AutoRoundConfig(backend="ipex") model = AutoModelForCausalLM.from_pretrained( @@ -447,7 +451,7 @@ def test_fallback_layers(self, tiny_opt_model_path, dataloader): text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) res = tokenizer.decode(model.generate(**inputs, max_new_tokens=1)[0]) - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_not_convert_modules(self): import requests @@ -737,8 +741,8 @@ def test_invalid_layer_config(self, tiny_opt_model_path): def test_quant_lm_head(self, tiny_untied_qwen_model_path): model_name = tiny_untied_qwen_model_path ar = AutoRound(model_name, quant_lm_head=True, iters=0, seqlen=8, nsamples=1, disable_opt_rtn=True) - ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 @@ -752,8 +756,8 @@ def test_quant_lm_head(self, tiny_untied_qwen_model_path): disable_opt_rtn=True, layer_config=layer_config, ) - ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 @@ -769,8 +773,8 @@ def test_quant_lm_head_layer_config(self, tiny_untied_qwen_model_path): disable_opt_rtn=True, layer_config=layer_config, ) - ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") + _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 diff --git a/test/test_cpu/core/test_init.py b/test/test_cpu/core/test_init.py index 01785d679..46a460dd3 100644 --- a/test/test_cpu/core/test_init.py +++ b/test/test_cpu/core/test_init.py @@ -3,6 +3,8 @@ def test_torch_compile(tiny_opt_model_path): ar = AutoRound(model=tiny_opt_model_path, scheme="NVFP4", enable_torch_compile=True) + ar._post_init() assert not ar.enable_torch_compile, "NVFP4 cannot work with torch.compile." 
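Editor's note: the explicit `ar._post_init()` calls in this test exist because the refactor above moves the order-sensitive setup (scheme parsing, device placement, torch.compile adjustment, ...) out of `__init__` into a one-shot `_post_init()` that every public entry point invokes. A minimal, generic sketch of that pattern; the class and field names are illustrative, not the library's:

```python
class LazyInitDemo:
    """Illustrative only: defer expensive, order-sensitive setup until first use."""

    def __init__(self, scheme: str, enable_torch_compile: bool = False):
        # __init__ only records the raw arguments.
        self.scheme = scheme
        self.enable_torch_compile = enable_torch_compile
        self._post_inited = False

    def _post_init(self) -> None:
        if self._post_inited:  # idempotent, so every entry point may call it
            return
        # Order-sensitive setup goes here; e.g. some schemes cannot use torch.compile,
        # mirroring the NVFP4/FP8_STATIC behavior asserted in this test.
        if self.scheme in ("NVFP4", "FP8_STATIC"):
            self.enable_torch_compile = False
        self._post_inited = True

    def quantize(self):
        self._post_init()  # public APIs trigger post-init before doing work
        return self


demo = LazyInitDemo("NVFP4", enable_torch_compile=True)
demo.quantize()
assert demo.enable_torch_compile is False
```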
ar = AutoRound(model=tiny_opt_model_path, scheme="FP8_STATIC", enable_torch_compile=True) + ar._post_init() assert not ar.enable_torch_compile, "FP8_STATIC cannot work with torch.compile." diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py index dfe339590..512e8443e 100644 --- a/test/test_cpu/export/test_export.py +++ b/test/test_cpu/export/test_export.py @@ -23,6 +23,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRound: + @classmethod def setup_class(self): self.model_name = opt_name_or_path @@ -50,7 +51,7 @@ def test_autogptq_format(self, dataloader): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") if group_size == -1: shutil.rmtree("./saved", ignore_errors=True) @@ -79,7 +80,7 @@ def test_autoround_format(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") if group_size == -1: shutil.rmtree("./saved", ignore_errors=True) @@ -106,7 +107,9 @@ def test_autoround_awq_format(self, dataloader): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round:auto_awq" + ) # quantization_config = AutoRoundConfig( # backend="cpu" @@ -220,7 +223,8 @@ def test_static_afp8_export(self, static_kv_dtype): static_kv_dtype=static_kv_dtype, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() @@ -281,7 +285,7 @@ def test_static_afp8_export(self, static_kv_dtype): act_group_size=0, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() @@ -307,7 +311,8 @@ def test_static_fp8_attn(self): static_attention_dtype="fp8", ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() @@ -345,7 +350,10 @@ def test_awq_lmhead_export(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + compressed_model, quantized_model_path = autoround.quantize_and_save( 
+ output_dir=quantized_model_path, format="auto_awq" + ) + lm_head = compressed_model.lm_head from auto_round.export.export_to_awq.utils import WQLinear_GEMM @@ -376,7 +384,10 @@ def test_gptq_lmhead_export(self, dataloader): dataset=dataloader, ) quantized_model_path = "./saved" - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_gptq" + ) + lm_head = compressed_model.lm_head assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer" quantization_config = AutoRoundConfig() @@ -397,6 +408,7 @@ def test_export_format(self): self.model_name, scheme="FP8_STATIC", ) + autoround._post_init() format_list = get_formats("auto_round, llm_compressor, auto_round:llm_compressor", autoround) assert len(format_list) == 3 assert format_list[0].output_format == "auto_round" @@ -410,6 +422,7 @@ def test_export_format(self): self.model_name, scheme="W4A16", ) + autoround._post_init() format_list = get_formats("auto_round:auto_awq, auto_gptq", autoround) assert format_list[0].output_format == "auto_round" assert format_list[0].get_backend_name() == "auto_round:auto_awq" @@ -426,6 +439,7 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=32, sym=True, ) + ar._post_init() with pytest.raises(ValueError, match="auto_awq format support quantization scheme with W4A16 but got bits=2"): get_formats("auto_round:auto_awq", ar) @@ -439,6 +453,7 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=32, sym=True, ) + ar._post_init() with pytest.raises(ValueError, match="but got data_type=fp, bits=4"): get_formats("auto_round:llm_compressor", ar) @@ -449,4 +464,5 @@ def test_export_format_with_scheme(self, tiny_qwen_model_path): group_size=256, sym=True, ) + ar._post_init() get_formats("auto_round:auto_awq", ar) diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py index f1980c455..f4aea0479 100644 --- a/test/test_cpu/export/test_gguf_format.py +++ b/test/test_cpu/export/test_gguf_format.py @@ -64,7 +64,10 @@ def test_q4_0(self): ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="gguf:q4_0" + ) + gguf_file = os.listdir(quantized_model_path)[0] # TODO: fix the issue of gguf loading error in transformers v5 @@ -77,7 +80,7 @@ def test_q4_0(self): inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_func(self): bits, group_size, sym = 4, 128, True @@ -89,15 +92,18 @@ def test_func(self): # data_type="int" ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_1") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="gguf:q*_1" + ) + assert autoround.group_size == 32 assert not autoround.sym - gguf_file = os.listdir("saved")[0] + gguf_file = os.listdir(quantized_model_path)[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") text = "There 
is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_gguf_baseline(self): model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") @@ -117,12 +123,15 @@ def test_gguf_baseline(self): disable_opt_rtn=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="fake") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="fake" + ) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) def test_q4_k_m(self, dataloader): model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") @@ -152,7 +161,10 @@ def test_q4_k_m(self, dataloader): disable_opt_rtn=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="gguf:q4_k_m,fake" + ) + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq" assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" @@ -168,7 +180,10 @@ def test_q4_k_m(self, dataloader): model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="gguf:q4_k_m,fake" + ) + shutil.rmtree("./saved", ignore_errors=True) def test_all_format(self, tiny_qwen_model_path): @@ -216,15 +231,16 @@ def test_vlm_gguf(self): quant_nontext_module=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - assert "mmproj-model.gguf" in os.listdir("./saved") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") + + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": assert abs(file_size - 56) < 5.0 else: assert abs(file_size - 264) < 5.0 - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) shutil.rmtree(tiny_model_path, ignore_errors=True) def test_vlm_gguf_wo_quant_nontext_module(self): @@ -242,15 +258,16 @@ def test_vlm_gguf_wo_quant_nontext_module(self): quant_nontext_module=False, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - assert "mmproj-model.gguf" in os.listdir("./saved") + _, quantized_model_path = 
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") + + assert "mmproj-model.gguf" in os.listdir(quantized_model_path) for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": assert abs(file_size - 361) < 5.0 else: assert abs(file_size - 264) < 5.0 - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) shutil.rmtree(tiny_model_path, ignore_errors=True) def test_qtype_setting(self): @@ -340,7 +357,8 @@ def test_q2k_mixed(self): disable_opt_rtn=True, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed") + gguf_file = os.listdir(quantized_model_path)[0] file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2 assert abs(file_size - 1362) < 5.0 @@ -352,5 +370,5 @@ def test_q2k_mixed(self): assert gguf_model.get_tensor(10).name == "blk.0.ffn_up_exps.weight" assert gguf_model.get_tensor(10).tensor_type.name == "Q2_K" - shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree(quantized_model_path, ignore_errors=True) shutil.rmtree(saved_tiny_model_path, ignore_errors=True) diff --git a/test/test_cpu/integrations/test_llmcompressor.py b/test/test_cpu/integrations/test_llmcompressor.py index c2966cedd..95afce423 100644 --- a/test/test_cpu/integrations/test_llmcompressor.py +++ b/test/test_cpu/integrations/test_llmcompressor.py @@ -11,6 +11,7 @@ class TestLLMC: + @classmethod def setup_class(self): self.model_name = get_model_path("stas/tiny-random-llama-2") @@ -50,7 +51,8 @@ def test_llmcompressor_fp8(self): nsamples=2, iters=0, ) - autoround.quantize_and_save("./saved", format="llm_compressor") + _, quantized_model_path = autoround.quantize_and_save("./saved", format="llm_compressor") + # from vllm import LLM # model = LLM("./saved") # result = model.generate("Hello my name is") @@ -58,7 +60,7 @@ def test_llmcompressor_fp8(self): import json - config = json.load(open("./saved/config.json")) + config = json.load(open(f"{quantized_model_path}/config.json")) assert "group_0" in config["quantization_config"]["config_groups"] assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel" @@ -75,11 +77,11 @@ def test_autoround_llmcompressor_fp8(self): nsamples=2, iters=0, ) - autoround.quantize_and_save("./saved", format="auto_round:llm_compressor") + _, quantized_model_path = autoround.quantize_and_save("./saved", format="auto_round:llm_compressor") import json - config = json.load(open("./saved/config.json")) + config = json.load(open(f"{quantized_model_path}/config.json")) assert "group_0" in config["quantization_config"]["config_groups"] assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor" diff --git a/test/test_cpu/models/test_mllm.py b/test/test_cpu/models/test_mllm.py index 87e140a26..748a6ede0 100644 --- a/test/test_cpu/models/test_mllm.py +++ b/test/test_cpu/models/test_mllm.py @@ -9,6 +9,7 @@ class FakeDataLoader: + def __init__(self): self.batch_size = 1 @@ -26,6 +27,7 @@ def __iter__(self): class 
TestAutoRoundMLLM: + @classmethod def setup_class(self): self.model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") @@ -220,15 +222,17 @@ def test_qwen2_5(self, tiny_qwen_2_5_vl_model_path): processor=processor, image_processor=image_processor, ) - autoround.quantize_and_save("./saved/", format="auto_round") + _, quantized_model_path = autoround.quantize_and_save("./saved/", format="auto_round") import requests from PIL import Image from transformers import AutoProcessor, AutoTokenizer, Qwen2_5_VLForConditionalGeneration - model = Qwen2_5_VLForConditionalGeneration.from_pretrained("./saved", torch_dtype="auto", device_map="auto") + model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + quantized_model_path, torch_dtype="auto", device_map="auto" + ) image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" - processor = AutoProcessor.from_pretrained("./saved") + processor = AutoProcessor.from_pretrained(quantized_model_path) messages = [ { "role": "user", diff --git a/test/test_cpu/models/test_moe_model.py b/test/test_cpu/models/test_moe_model.py index 1125ab56d..c4c69eb28 100644 --- a/test/test_cpu/models/test_moe_model.py +++ b/test/test_cpu/models/test_moe_model.py @@ -74,7 +74,7 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0): ignore_layers="self_attn,router,lm_head,mlp.gate", ) quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) - return quantized_model + return quantized_model, save_folder def count_modules_by_type(model, target_module_name_or_class): @@ -98,7 +98,7 @@ def test_gptoss(setup_gpt_oss, scheme): # Remove it to avoid mismatch during quantized model loading delattr(model.config, "layer_types") - quantized_model = quantize_model(model, tokenizer, output_dir, scheme) + quantized_model, output_dir = quantize_model(model, tokenizer, output_dir, scheme) # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." @@ -136,7 +136,7 @@ def test_llama4(setup_llama4): delattr(model.config.text_config, "moe_layers") delattr(model.config.text_config, "layer_types") - quantized_model = quantize_model(model, tokenizer, output_dir, "MXFP4") + quantized_model, output_dir = quantize_model(model, tokenizer, output_dir, "MXFP4") # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." @@ -162,7 +162,7 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe): iters=1, ignore_layers="self_attn,lm_head,mlp.gate", ) - quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) + quantized_model, output_dir = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) assert quantized_model is not None, "Quantized model should not be None." 
loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir, device_map="cpu") diff --git a/test/test_cpu/quantization/test_act_quantization.py b/test/test_cpu/quantization/test_act_quantization.py index 72739931e..72cc152e3 100644 --- a/test/test_cpu/quantization/test_act_quantization.py +++ b/test/test_cpu/quantization/test_act_quantization.py @@ -113,7 +113,8 @@ def test_act_config_MXFP4_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert "lm_head" not in model.config.quantization_config.extra_config @@ -141,7 +142,8 @@ def test_act_config_NVFP4_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") kproj_config = model.config.quantization_config.extra_config["model.decoder.layers.1.self_attn.k_proj"] assert "act_data_type" in kproj_config.keys() and kproj_config["act_data_type"] == "nv_fp4_with_static_gs" @@ -167,7 +169,8 @@ def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") extra_config = model.config.quantization_config.extra_config diff --git a/test/test_cpu/quantization/test_mix_bits.py b/test/test_cpu/quantization/test_mix_bits.py index 5db3053cb..749219271 100644 --- a/test/test_cpu/quantization/test_mix_bits.py +++ b/test/test_cpu/quantization/test_mix_bits.py @@ -25,6 +25,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRound: + @classmethod def setup_class(self): self.model_name = opt_name_or_path @@ -55,7 +56,8 @@ def test_mixed_gptqmodel(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + # test original GPTQModel inference from gptqmodel import GPTQModel @@ -84,7 +86,8 @@ def test_mixed_gptqmodel_convert_to_ar(self, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cpu", quantization_config=quantization_config @@ -112,7 +115,10 @@ def test_mixed_autoround_format(self, dataloader): layer_config=layer_config, ) quantized_model_path = "./saved" - compressed_model = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + compressed_model, 
quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 @@ -136,7 +142,8 @@ def test_fallback_regex_for_awq_format(self, dataloader): layer_config=layer_config, ) quantized_model_path = "./saved" - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") + quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="cpu", quantization_config=quantization_config @@ -223,7 +230,10 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="auto_round" + ) + model = AutoModelForCausalLM.from_pretrained( quantized_model_path, torch_dtype="auto", diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py index 3c2e9bcce..08fc6c79a 100644 --- a/test/test_cpu/quantization/test_mxfp_nvfp.py +++ b/test/test_cpu/quantization/test_mxfp_nvfp.py @@ -23,6 +23,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRoundFP: + @classmethod def setup_class(self): self.save_dir = "./saved" @@ -84,7 +85,10 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): dataset=dataloader, layer_config=layer_config, ) - compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=self.save_dir, inplace=True, format="auto_round" + ) + lm_head = compressed_model.lm_head assert ( hasattr(lm_head, "weight_scale") @@ -93,7 +97,6 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): and lm_head.weight_packed.dtype is torch.uint8 and lm_head.weight_scale.dtype is torch.float8_e4m3fn ), "Illegal NVFP4 packing for lm_head layer" - quantized_model_path = self.save_dir assert is_model_outputs_similar(model_name, quantized_model_path) shutil.rmtree(self.save_dir, ignore_errors=True) @@ -144,7 +147,7 @@ def test_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir autoround.quantize() - compressed_model = autoround.save_quantized( + compressed_model, _ = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj @@ -184,7 +187,7 @@ def test_rtn_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir autoround.quantize() - compressed_model = autoround.save_quantized( + compressed_model, _ = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj @@ -221,7 +224,10 @@ def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = 
autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -258,7 +264,10 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="llm_compressor" + ) + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -295,7 +304,10 @@ def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader): dataset=dataloader, ) quantized_model_path = self.save_dir - compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + compressed_model, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, format="auto_round" + ) + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -321,7 +333,7 @@ def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader): ) quantized_model_path = self.save_dir autoround.quantize() - compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") + compressed_model, _ = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") @@ -352,7 +364,9 @@ def test_qwen_moe_quant_infer(self, tiny_qwen_moe_model_path, dataloader): layer_config=layer_config, ) quantized_model_path = self.save_dir - autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") + _, quantized_model_path = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=True, format="auto_round" + ) assert is_model_outputs_similar(model_name, quantized_model_path) shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cpu/quantization/test_mxfp_save_load.py b/test/test_cpu/quantization/test_mxfp_save_load.py index 396c47735..4fdcaeffc 100644 --- a/test/test_cpu/quantization/test_mxfp_save_load.py +++ b/test/test_cpu/quantization/test_mxfp_save_load.py @@ -60,7 +60,7 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Quantize and save the model to the temporary directory quantized_model_path = f"{temp_dir}/tmp_autoround" - autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) + _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path) # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cpu/schemes/test_scheme.py b/test/test_cpu/schemes/test_scheme.py index e2b0c15c3..cb4d841ba 100644 --- a/test/test_cpu/schemes/test_scheme.py +++ b/test/test_cpu/schemes/test_scheme.py @@ -33,13 +33,13 @@ def test_gguf(self, tiny_qwen_model_path, dataloader): def test_w4a16(self, tiny_opt_model_path, dataloader): ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - assert ar.bits == 4 
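Editor's note: the scheme assertions in these tests move after `ar.quantize()` (or an explicit `ar._post_init()`) because scheme-derived fields such as `bits` and `data_type` are now resolved during post-initialization rather than in `__init__`. A hedged sketch of the resulting usage; the model path is a placeholder:

```python
from auto_round import AutoRound

ar = AutoRound("facebook/opt-125m", scheme="MXFP4", iters=0, nsamples=1, seqlen=2)

# Fields coming from the scheme are only reliable once _post_init() has run,
# which quantize()/quantize_and_save() trigger internally (ar._post_init() also works).
ar.quantize()
assert ar.bits == 4 and ar.act_bits == 4
assert ar.data_type == "mx_fp" and ar.act_data_type == "mx_fp"
```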
         ar.quantize()
+        assert ar.bits == 4
     def test_w2a16_rtn(self, tiny_opt_model_path, dataloader):
         ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader)
-        assert ar.bits == 2
         ar.quantize()
+        assert ar.bits == 2
     def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader):
@@ -85,26 +85,27 @@ def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader):
     def test_mxfp4(self, tiny_opt_model_path, dataloader):
         ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader)
+        ar.quantize()
         assert ar.bits == 4
         assert ar.act_bits == 4
         assert ar.data_type == "mx_fp"
         assert ar.act_data_type == "mx_fp"
-        ar.quantize()
     def test_vllm(self, tiny_qwen_vl_model_path):
         from auto_round import AutoRoundMLLM
         ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2)
+        ar._post_init()
         assert ar.bits == 2
         assert ar.act_bits == 16
     def test_nvfp4(self, tiny_opt_model_path, dataloader):
         ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader)
+        ar.quantize()
         assert ar.bits == 4
         assert ar.act_bits == 4
         assert ar.data_type == "nv_fp"
         assert ar.act_data_type == "nv_fp4_with_static_gs"
-        ar.quantize()
     def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader):
         import copy
diff --git a/test/test_cpu/utils/test_generation.py b/test/test_cpu/utils/test_generation.py
index 6bf7e1020..245ffcc72 100644
--- a/test/test_cpu/utils/test_generation.py
+++ b/test/test_cpu/utils/test_generation.py
@@ -11,6 +11,7 @@
 class TestAutoRoundFormatGeneration:
+    @classmethod
     def setup_class(self):
         self.model_name = opt_name_or_path
@@ -39,7 +40,9 @@ def test_4bits_sym(self, dataloader):
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round", inplace=False)
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round", inplace=False
+        )
         quantization_config = AutoRoundConfig(backend="ipex")
         model = AutoModelForCausalLM.from_pretrained(
@@ -79,7 +82,7 @@ def test_autoround_sym(self, dataloader):
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", trust_remote_code=True
diff --git a/test/test_cuda/advanced/test_fp8_input.py b/test/test_cuda/advanced/test_fp8_input.py
index de4dfa453..e0f8188c0 100644
--- a/test/test_cuda/advanced/test_fp8_input.py
+++ b/test/test_cuda/advanced/test_fp8_input.py
@@ -39,8 +39,8 @@ def setup_and_teardown_class(self):
     def test_small_model_rtn_generation(self):
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
-        ar.quantize_and_save(output_dir=self.save_dir)
-        model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True)
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
         text = "There is a girl who likes adventure,"
         inputs = tokenizer(text, return_tensors="pt").to(model.device)
@@ -55,7 +55,7 @@ def test_small_model_rtn_generation(self):
     def test_gguf_imatrix(self):
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
-        ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir)
         # from llama_cpp import Llama
         #
         # gguf_file = os.listdir("saved/Qwen3-0.6B-FP8/-gguf")[0]
@@ -72,8 +72,8 @@ def test_gguf_imatrix(self):
     def test_small_model_rtn(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         ar = AutoRound(model=model_name, iters=0)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.25
@@ -83,8 +83,8 @@ def test_small_model_rtn(self):
     def test_small_model_iters1(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         ar = AutoRound(model=model_name, iters=1)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.25
@@ -94,8 +94,8 @@ def test_small_model_iters1(self):
     def test_medium_model_rtn(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         ar = AutoRound(model=model_name, iters=0)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.33
@@ -106,8 +106,8 @@ def test_medium_model_rtn_with_lm_head(self):
         model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
         layer_config = {"lm_head": {"bits": 4}}
         ar = AutoRound(model=model_name, iters=0, layer_config=layer_config)
-        _, folder = ar.quantize_and_save(output_dir=self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.33
@@ -124,8 +124,8 @@ def test_fp8_model_gguf(self):
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
-        ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0")
-        for file in os.listdir(self.save_dir):
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0")
+        for file in os.listdir(quantized_model_path):
             if file.endswith(".gguf"):
                 gguf_file = file
         llm = Llama(f"saved/{gguf_file}", n_gpu_layers=-1)
@@ -135,8 +135,8 @@ def test_fp8_model_gguf(self):
         model, tokenizer = self.tiny_fp8_model()
         ar = AutoRound(model=model, tokenizer=tokenizer, iters=1)
-        ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s")
-        for file in os.listdir(self.save_dir):
+        _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s")
+        for file in os.listdir(quantized_model_path):
             if file.endswith(".gguf"):
                 gguf_file = file
         llm = Llama(f"saved/{gguf_file}", n_gpu_layers=-1)
@@ -155,5 +155,5 @@ def test_diff_datatype(self):
             for iters in [0, 1]:
                 print(f"Testing scheme: {scheme}, iters: {iters}")
                 ar = AutoRound(model_name, iters=iters, scheme=scheme)
-                ar.quantize_and_save(output_dir=self.save_dir)
+                _, quantized_model_path = ar.quantize_and_save(output_dir=self.save_dir)
                 shutil.rmtree(self.save_dir, ignore_errors=True)
diff --git a/test/test_cuda/backends/test_torch_backend.py b/test/test_cuda/backends/test_torch_backend.py
index 28df641a3..16f7dfcae 100644
--- a/test/test_cuda/backends/test_torch_backend.py
+++ b/test/test_cuda/backends/test_torch_backend.py
@@ -44,7 +44,9 @@ def test_torch_4bits_asym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:gptqmodel"
+        )
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
@@ -86,7 +88,9 @@ def test_torch_4bits_sym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")  ##will convert to gptq model
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round"
+        )  ##will convert to gptq model
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cuda/backends/test_triton_backend.py b/test/test_cuda/backends/test_triton_backend.py
index fb67ad049..b99e68ef8 100644
--- a/test/test_cuda/backends/test_triton_backend.py
+++ b/test/test_cuda/backends/test_triton_backend.py
@@ -12,6 +12,7 @@
 class TestAutoRoundTritonBackend:
+    @classmethod
     def setup_class(self):
         self.model_name = "/models/opt-125m"
@@ -38,14 +39,16 @@ def test_tritonv2_4bits_asym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:gptqmodel"
+        )
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -53,10 +56,10 @@ def test_tritonv2_4bits_asym(self, dataloader):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -71,14 +74,14 @@ def test_tritonv2_2bits_asym(self):
         bits, group_size, sym = 2, 32, False
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -86,10 +89,10 @@ def test_tritonv2_2bits_asym(self):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -113,14 +116,14 @@ def test_tritonv2_4bits_sym(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
@@ -128,10 +131,10 @@ def test_tritonv2_4bits_sym(self, dataloader):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
@@ -147,14 +150,14 @@ def test_tritonv2_8bits_sym(self):
         bits, group_size, sym = 4, 256, True
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, nsamples=1, iters=1)
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -162,10 +165,10 @@ def test_tritonv2_8bits_sym(self):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
@@ -186,14 +189,14 @@ def test_tritonv2_2bits_sym(self):
             sym=sym,
         )
         quantized_model_path = self.save_folder
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         quantization_config = AutoRoundConfig(backend="tritonv2")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -201,10 +204,10 @@ def test_tritonv2_2bits_sym(self):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         # print(result['results']['lambada_openai']['acc,none'])
diff --git a/test/test_cuda/export/test_auto_round_format.py b/test/test_cuda/export/test_auto_round_format.py
index 945a3d653..ba94ebda9 100644
--- a/test/test_cuda/export/test_auto_round_format.py
+++ b/test/test_cuda/export/test_auto_round_format.py
@@ -52,7 +52,7 @@ def test_autoround_asym(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="cuda:0", trust_remote_code=True
@@ -79,13 +79,14 @@ def test_mixed_precision(self):
         bits, group_size, sym = 4, 128, True
         autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config)
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+
         quantization_config = AutoRoundConfig(backend="auto")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -105,14 +106,16 @@ def test_awq_backend(self):
             sym=sym,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:auto_awq"
+        )
         quantization_config = AutoRoundConfig(backend="auto")
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])
@@ -120,12 +123,12 @@ def test_awq_backend(self):
         torch.cuda.empty_cache()
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
+            quantized_model_path, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         model_infer(model, tokenizer)
-        shutil.rmtree(self.save_dir, ignore_errors=True)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
     @require_greater_than_050
     def test_tritonv2_bf16(self):
@@ -154,7 +157,7 @@ def test_autoround_gptq_sym_format(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         from transformers import AutoRoundConfig
@@ -206,7 +209,9 @@ def test_autoround_awq_sym_format(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:auto_awq"
+        )
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
@@ -243,7 +248,7 @@ def test_autoround_sym(self, tiny_opt_model_path, dataloader):
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", trust_remote_code=True
diff --git a/test/test_cuda/export/test_export.py b/test/test_cuda/export/test_export.py
index 25f29a4c8..39e0758bb 100644
--- a/test/test_cuda/export/test_export.py
+++ b/test/test_cuda/export/test_export.py
@@ -117,7 +117,7 @@ def test_autogptq_format_qsave_ignore_layers(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
@@ -219,8 +219,9 @@ def test_autoawq_format_fp_qsave_layers(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "./saved/test_export"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
-        from transformers import AutoRoundConfig
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+
+        from auto_round import AutoRoundConfig
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", quantization_config=AutoRoundConfig()
@@ -319,7 +320,10 @@ def test_awq_lmhead_export(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_awq"
+        )
+
         lm_head = compressed_model.lm_head
         from auto_round.export.export_to_awq.utils import WQLinear_GEMM
@@ -350,7 +354,10 @@ def test_gptq_lmhead_export(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_gptq"
+        )
+
         lm_head = compressed_model.lm_head
         assert hasattr(lm_head, "bits") and lm_head.bits == 4, "Illegal GPTQ quantization for lm_head layer"
         quantization_config = AutoRoundConfig()
diff --git a/test/test_cuda/export/test_gguf.py b/test/test_cuda/export/test_gguf.py
index e71af74c4..9bf11c242 100644
--- a/test/test_cuda/export/test_gguf.py
+++ b/test/test_cuda/export/test_gguf.py
@@ -160,7 +160,8 @@ def test_special_model(self):
            disable_opt_rtn=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
+
         file_name = os.listdir(quantized_model_path)[0]
         file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2
         assert abs(file_size - 307) < 5.0
@@ -184,11 +185,12 @@ def test_vlm_gguf(self):
             quant_nontext_module=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
-        assert "mmproj-model.gguf" in os.listdir("./saved")
-        for file in os.listdir("./saved"):
-            print(f"{file}: {os.path.getsize(os.path.join('./saved', file)) / 1024**2} MB")
-            file_size = os.path.getsize(os.path.join("./saved", file)) / 1024**2
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
+
+        assert "mmproj-model.gguf" in os.listdir(quantized_model_path)
+        for file in os.listdir(quantized_model_path):
+            print(f"{file}: {os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2} MB")
+            file_size = os.path.getsize(os.path.join(quantized_model_path, file)) / 1024**2
             if "mmproj-model.gguf" in file:
                 assert abs(file_size - 75) < 5.0
             else:
diff --git a/test/test_cuda/integrations/test_transformers.py b/test/test_cuda/integrations/test_transformers.py
index 9544a20a3..449a9245c 100644
--- a/test/test_cuda/integrations/test_transformers.py
+++ b/test/test_cuda/integrations/test_transformers.py
@@ -205,8 +205,11 @@ def test_mixed_bits(self):
         autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config)
         with tempfile.TemporaryDirectory() as tmpdirname:
-            autoround.quantize_and_save(output_dir=tmpdirname)
-            model = AutoModelForCausalLM.from_pretrained(tmpdirname, torch_dtype=torch.float16, device_map="cuda")
+            _, quantized_model_path = autoround.quantize_and_save(output_dir=tmpdirname)
+
+            model = AutoModelForCausalLM.from_pretrained(
+                quantized_model_path, torch_dtype=torch.float16, device_map="cuda"
+            )
             text = "There is a girl who likes adventure,"
             inputs = tokenizer(text, return_tensors="pt").to(model.device)
             tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0])
diff --git a/test/test_cuda/models/test_moe_model.py b/test/test_cuda/models/test_moe_model.py
index 40c545015..c15acc843 100644
--- a/test/test_cuda/models/test_moe_model.py
+++ b/test/test_cuda/models/test_moe_model.py
@@ -186,7 +186,8 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe):
         iters=1,
         ignore_layers="self_attn,lm_head,mlp.gate",
     )
-    quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
+    quantized_model, output_dir = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
+    output_dir = output_dir[0]
     assert quantized_model is not None, "Quantized model should not be None."
     loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir)
     loaded_model.to("cuda")
diff --git a/test/test_cuda/quantization/test_2_3bits.py b/test/test_cuda/quantization/test_2_3bits.py
index c03d5e593..1f2e5e812 100644
--- a/test/test_cuda/quantization/test_2_3bits.py
+++ b/test/test_cuda/quantization/test_2_3bits.py
@@ -47,7 +47,9 @@ def test_3bits_autoround(self):
         model_name = get_model_path("facebook/opt-125m")
         autoround = AutoRound(model_name, bits=3)
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")  ##will convert to gptq model
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round"
+        )  ##will convert to gptq model
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
@@ -65,8 +67,9 @@ def test_3bits_asym_autoround(self):
         model_name = get_model_path("facebook/opt-125m")
         bits, sym = 3, False
         autoround = AutoRound(model_name, bits=bits, sym=sym)
-        autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False)
+
+        model_args = f"pretrained={quantized_model_path}"
         res = simple_evaluate(
             model="hf",
             model_args=model_args,
diff --git a/test/test_cuda/quantization/test_asym.py b/test/test_cuda/quantization/test_asym.py
index d55934898..7ac1487df 100644
--- a/test/test_cuda/quantization/test_asym.py
+++ b/test/test_cuda/quantization/test_asym.py
@@ -45,17 +45,17 @@ def test_asym_group_size(self, tiny_opt_model_path):
         ar = AutoRound(
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1
         )
-        ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
         # TODO when ark is ready, uncomment the following lines to do inference test
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
         #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
@@ -65,17 +65,17 @@ def test_asym_bits(self, tiny_opt_model_path):
         ar = AutoRound(
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1
         )
-        ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
         # TODO when ark is ready, uncomment the following lines to do inference test
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
         #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
@@ -87,15 +87,15 @@ def test_asym_format(self, tiny_opt_model_path):
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1
         )
         # TODO when ark is ready, uncomment the following lines to do inference test
-        ar.quantize_and_save(format=format, output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format=format, output_dir=self.save_dir)
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
         #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
@@ -105,17 +105,17 @@ def test_asym_group_size_with_tuning(self, tiny_opt_model_path):
         ar = AutoRound(
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1
         )
-        ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
         # TODO when ark is ready, uncomment the following lines to do inference test
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
         #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
@@ -125,17 +125,17 @@ def test_asym_bits_with_tuning(self, tiny_opt_model_path):
         ar = AutoRound(
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1
         )
-        ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format="auto_round", output_dir=self.save_dir)
         # TODO when ark is ready, uncomment the following lines to do inference test
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
        #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
@@ -147,14 +147,14 @@ def test_asym_format_with_tuning(self, tiny_opt_model_path):
             tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1
         )
         # TODO when ark is ready, uncomment the following lines to do inference test
-        ar.quantize_and_save(format=format, output_dir=self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(format=format, output_dir=self.save_dir)
         # model = AutoModelForCausalLM.from_pretrained(
-        #     self.save_dir,
+        #     quantized_model_path,
         #     torch_dtype="auto",
         #     device_map="auto",
         # )
-        # tokenizer = AutoTokenizer.from_pretrained(self.save_dir)
+        # tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         # model_infer(model, tokenizer)
         shutil.rmtree(self.save_dir, ignore_errors=True)
diff --git a/test/test_cuda/quantization/test_mix_bits.py b/test/test_cuda/quantization/test_mix_bits.py
index 9daa9b727..c6164f4b5 100644
--- a/test/test_cuda/quantization/test_mix_bits.py
+++ b/test/test_cuda/quantization/test_mix_bits.py
@@ -52,7 +52,8 @@ def test_mixed_gptqmodel(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+
         from gptqmodel import GPTQModel
         model = GPTQModel.load(quantized_model_path)
@@ -79,7 +80,8 @@ def test_mixed_gptqmodel_convert_to_ar(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq")
+
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", quantization_config=quantization_config
@@ -107,7 +109,8 @@ def test_mixed_autoround_format(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "self.save_dir"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")
+
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
         assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8
         assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3
@@ -133,7 +136,8 @@ def test_fallback_regex_for_awq_format(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = "self.save_dir"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
+
         quantization_config = AutoRoundConfig()
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", quantization_config=quantization_config
@@ -221,7 +225,10 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, inplace=False, format="auto_round"
+        )
+
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path,
             torch_dtype="auto",
@@ -295,9 +302,10 @@ def test_mixed_llmcompressor_format_vllm(self, tiny_opt_model_path, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        compressed, _ = autoround.quantize_and_save(
+        compressed, quantized_model_path = autoround.quantize_and_save(
             output_dir=quantized_model_path, inplace=False, format="llm_compressor"
         )
+
         from vllm import LLM, SamplingParams
         # Sample prompts.
diff --git a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
index d76b556e3..0c1cbf007 100644
--- a/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
+++ b/test/test_cuda/quantization/test_mxfp_and_nvfp_quant.py
@@ -52,7 +52,7 @@ def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path):
         # Quantize and save the model to the temporary directory
         quantized_model_path = f"{temp_dir}/tmp_autoround_{scheme}"
-        autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(format="auto_round", output_dir=quantized_model_path)
         # Perform inference with the quantized model
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cuda/quantization/test_mxfp_nvfp.py b/test/test_cuda/quantization/test_mxfp_nvfp.py
index e2ca5b359..58d126929 100644
--- a/test/test_cuda/quantization/test_mxfp_nvfp.py
+++ b/test/test_cuda/quantization/test_mxfp_nvfp.py
@@ -76,7 +76,10 @@ def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = self.save_dir
-        compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
+        compressed_model, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="llm_compressor"
+        )
+
         tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj
         assert (
             hasattr(tmp_layer, "weight_scale")
@@ -162,7 +165,10 @@ def test_qwen_moe_quant_infer(self, dataloader):
             layer_config=layer_config,
         )
         quantized_model_path = self.save_dir
-        autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, inplace=False, format="auto_round"
+        )
+
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto")
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         from auto_round.eval.evaluation import simple_evaluate_user_model
diff --git a/test/test_cuda/schemes/test_auto_scheme.py b/test/test_cuda/schemes/test_auto_scheme.py
index b91c98428..e15137533 100644
--- a/test/test_cuda/schemes/test_auto_scheme.py
+++ b/test/test_cuda/schemes/test_auto_scheme.py
@@ -242,8 +242,8 @@ def test_auto_scheme_export(self, tiny_qwen_model_path):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16"))
         ar = AutoRound(model=model_name, scheme=scheme)
-        ar.quantize_and_save(self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.25
@@ -251,15 +251,15 @@
         scheme = AutoScheme(avg_bits=3, options=("gguf:q2_k_s,gguf:q4_k_s"), nsamples=1, ignore_scale_zp_bits=True)
         ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1)
-        ar.quantize_and_save(self.save_dir)
+        _, quantized_model_path = ar.quantize_and_save(self.save_dir)
         shutil.rmtree(self.save_dir, ignore_errors=True)
     def test_enable_torch_compile(self):
         model_name = get_model_path("facebook/opt-125m")
         scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True)
         ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True)
-        ar.quantize_and_save(self.save_dir)
-        model_args = f"pretrained={self.save_dir}"
+        _, quantized_model_path = ar.quantize_and_save(self.save_dir)
+        model_args = f"pretrained={quantized_model_path}"
         result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
         print(result["results"]["lambada_openai"]["acc,none"])
         assert result["results"]["lambada_openai"]["acc,none"] > 0.10
diff --git a/test/test_cuda/schemes/test_scheme.py b/test/test_cuda/schemes/test_scheme.py
index 255fdc80e..b6243501e 100644
--- a/test/test_cuda/schemes/test_scheme.py
+++ b/test/test_cuda/schemes/test_scheme.py
@@ -124,7 +124,8 @@ def test_q2k_mixed(self):
             disable_opt_rtn=True,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed")
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_mixed")
+
         gguf_file = os.listdir(quantized_model_path)[0]
         file_size = os.path.getsize(os.path.join(quantized_model_path, gguf_file)) / 1024**2
         assert abs(file_size - 1236) < 5.0
diff --git a/test/test_cuda/utils/test_alg_ext.py b/test/test_cuda/utils/test_alg_ext.py
index a29bffdac..2708574b9 100644
--- a/test/test_cuda/utils/test_alg_ext.py
+++ b/test/test_cuda/utils/test_alg_ext.py
@@ -33,13 +33,13 @@ def setup_and_teardown_class(self):
     def test_2bits(self):
         model_name = get_model_path("facebook/opt-125m")
         ar = AutoRound(model=model_name, bits=2, group_size=64, enable_alg_ext=True)
-        ar.quantize_and_save(self.save_folder)
+        _, quantized_model_path = ar.quantize_and_save(self.save_folder)
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder,
+            quantized_model_path,
             device_map="auto",
         )
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai")
         print(result["results"]["lambada_openai"]["acc,none"])  # wo alg ext 0.2078, with 0.2371
diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py
index fb0dbe1ae..30627f50a 100644
--- a/test/test_xpu/test_autoround.py
+++ b/test/test_xpu/test_autoround.py
@@ -12,6 +12,7 @@
 class TestAutoRoundXPU:
+    @classmethod
     def setup_class(self):
         self.device = "xpu"
@@ -42,7 +43,7 @@ def test_gptq_format(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path)
+        _, quantized_model_path = autoround.quantize_and_save(output_dir=quantized_model_path)
         quantization_config = AutoRoundConfig(backend="auto")
@@ -74,7 +75,9 @@ def test_awq_format(self, dataloader):
             dataset=dataloader,
         )
         quantized_model_path = "./saved"
-        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
+        _, quantized_model_path = autoround.quantize_and_save(
+            output_dir=quantized_model_path, format="auto_round:auto_awq"
+        )
         quantization_config = AutoRoundConfig(backend="auto")
         # device_map="auto" doesn't work, must use "xpu"