diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 diff --git a/auto_round/__init__.py b/auto_round/__init__.py old mode 100644 new mode 100755 index 509885b33..e075b37e0 --- a/auto_round/__init__.py +++ b/auto_round/__init__.py @@ -14,7 +14,7 @@ from auto_round.autoround import AutoRound # support for old api -from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam, AutoRoundDiffusion +from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundDiffusion from auto_round.schemes import QuantizationScheme from auto_round.auto_scheme import AutoScheme from auto_round.utils import LazyImport diff --git a/auto_round/autoround.py b/auto_round/autoround.py old mode 100644 new mode 100755 index 33ca1ccd7..7be3e3136 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -18,7 +18,6 @@ import torch from auto_round.compressors import ( - AdamCompressor, BaseCompressor, DiffusionCompressor, ExtraConfig, @@ -173,8 +172,6 @@ def __new__( extra_config.diffusion_config = None model_cls.append(LLMCompressor) - if enable_adam: - model_cls.append(AdamCompressor) dynamic_compressor = type("AutoRound", tuple(model_cls), {}) if extra_config: kwargs.update(extra_config.to_dict()) @@ -371,110 +368,6 @@ def __init__( ) -@deprecated("AutoRound") -class AutoRoundAdam(AdamCompressor): - """Class for quantization with optimizers like adamw of a PyTorch model. - - Args: - model: The PyTorch model to be quantized. - tokenizer: An optional tokenizer for processing input data. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] - scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations - bits (int): Number of bits for quantization (default is 4). - group_size (int): Size of the quantization group (default is 128). - sym (bool): Whether sym to be used (default is True). - layer_config (dict): Configuration for weight quantization (default is None). - batch_size (int): Batch size for training (default is 8). - amp (bool): Whether to use automatic mixed precision (default is True). - device: The device to be used for training (default is "auto"). - lr_scheduler: The learning rate scheduler to be used. - dataset: The default dataset name (default is "NeelNanda/pile-10k"). - enable_quanted_input (bool): Whether to use quantized input data (default is True). - enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True). - lr (float): The learning rate (default is 0.005). - minmax_lr (float): The learning rate for min-max tuning (default is None). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). - iters (int): Number of iterations (default is 200). - seqlen (int): Length of the sequence. - nsamples (int): Number of samples (default is 128). - sampler (str): The sampling method (default is "rand"). - seed (int): The random seed (default is 42). - nblocks (int): Number of blocks (default is 1). - gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). - not_use_best_mse (bool): Whether to use mean squared error (default is False). - dynamic_max_gap (int): The dynamic maximum gap (default is -1). - data_type (str): The data type to be used (default is "int"). - scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. - act_bits (int): Number of bits for activation quantization. Default is 16. 
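Note on the hunks above: AutoRoundAdam is dropped from the package re-exports and AdamCompressor is no longer appended to the dynamic class list built in AutoRound.__new__. A later hunk in base.py has BaseCompressor pop an enable_adam flag from **kwargs, so the optimizer choice presumably now travels through plain keyword arguments and is resolved inside the new auto_round.quantizers package rather than by mixing in a dedicated class. A minimal usage sketch under that assumption (the model id and scheme are placeholders, and the return value assumes quantize() still yields the model plus layer config):

from auto_round import AutoRound

ar = AutoRound(
    "facebook/opt-125m",   # placeholder model id, or an already loaded torch.nn.Module
    scheme="W4A16",
    iters=200,
    enable_adam=True,      # consumed via kwargs.pop("enable_adam") in BaseCompressor.__init__
)
quantized_model, layer_config = ar.quantize()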
- act_group_size (int): Group size for activation quantization. Default is None. - act_sym (bool): Whether to use symmetric activation quantization. Default is None. - act_data_type (str): Specifies the data type for activations. - Defaults to None, in which case it inherits the weight data type. - act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. - to_quant_block_names (str|list): A string or list whose elements are list of - block's layer names to be quantized. - enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning - enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer function - **kwargs: Additional keyword arguments. - - Returns: - The quantized model. - """ - - bits: int | None - group_size: int | None - sym: bool | None - data_type: str | None - act_bits: int | None - act_group_size: int | None - act_sym: bool | None - act_data_type: str | None - act_dynamic: bool | None - super_bits: int | None - super_group_size: int | None - - def __init__( - self, - model: Union[torch.nn.Module, str], - tokenizer=None, - platform: str = "hf", - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, - dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", - iters: int = 200, - seqlen: int = 2048, - nsamples: int = 128, - batch_size: int = 8, - gradient_accumulate_steps: int = 1, - low_gpu_mem_usage: bool = False, - device_map: Union[str, int, torch.device, dict] = 0, - enable_torch_compile: bool = False, - seed: int = 42, - optimizer="AdamW", - **kwargs, - ): - super().__init__( - model=model, - tokenizer=tokenizer, - platform=platform, - scheme=scheme, - layer_config=layer_config, - batch_size=batch_size, - dataset=dataset, - low_gpu_mem_usage=low_gpu_mem_usage, - iters=iters, - seqlen=seqlen, - nsamples=nsamples, - seed=seed, - gradient_accumulate_steps=gradient_accumulate_steps, - enable_torch_compile=enable_torch_compile, - device_map=device_map, - optimizer=optimizer, - **kwargs, - ) - - @deprecated("AutoRound") class AutoRoundMLLM(MLLMCompressor): """Class for automatic rounding-based quantization with MLLMs. diff --git a/auto_round/compressors/__init__.py b/auto_round/compressors/__init__.py old mode 100644 new mode 100755 index 6f8ddf681..05623ecb7 --- a/auto_round/compressors/__init__.py +++ b/auto_round/compressors/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from auto_round.compressors.adam import AdamCompressor from auto_round.compressors.base import BaseCompressor from auto_round.compressors.base import LLMCompressor from auto_round.compressors.mllm.compressor import MLLMCompressor diff --git a/auto_round/compressors/adam.py b/auto_round/compressors/adam.py deleted file mode 100644 index fb79cf39a..000000000 --- a/auto_round/compressors/adam.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Union - -import torch - -from auto_round.compressors.base import BaseCompressor -from auto_round.schemes import QuantizationScheme -from auto_round.utils import check_is_cpu, htcore, is_hpex_available - - -class AdamCompressor(BaseCompressor): - """Class for quantization with optimizers like adamw of a PyTorch model. - - Args: - model: The PyTorch model to be quantized. - tokenizer: An optional tokenizer for processing input data. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] - scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations - bits (int): Number of bits for quantization (default is 4). - group_size (int): Size of the quantization group (default is 128). - sym (bool): Whether sym to be used (default is True). - layer_config (dict): Configuration for weight quantization (default is None). - batch_size (int): Batch size for training (default is 8). - amp (bool): Whether to use automatic mixed precision (default is True). - device: The device to be used for training (default is "auto"). - lr_scheduler: The learning rate scheduler to be used. - dataset: The default dataset name (default is "NeelNanda/pile-10k"). - enable_quanted_input (bool): Whether to use quantized input data (default is True). - enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True). - lr (float): The learning rate (default is 0.005). - minmax_lr (float): The learning rate for min-max tuning (default is None). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). - iters (int): Number of iterations (default is 200). - seqlen (int): Length of the sequence. - nsamples (int): Number of samples (default is 128). - sampler (str): The sampling method (default is "rand"). - seed (int): The random seed (default is 42). - nblocks (int): Number of blocks (default is 1). - gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). - not_use_best_mse (bool): Whether to use mean squared error (default is False). - dynamic_max_gap (int): The dynamic maximum gap (default is -1). - data_type (str): The data type to be used (default is "int"). - scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. - act_bits (int): Number of bits for activation quantization. Default is 16. - act_group_size (int): Group size for activation quantization. Default is None. - act_sym (bool): Whether to use symmetric activation quantization. Default is None. - act_data_type (str): Specifies the data type for activations. - Defaults to None, in which case it inherits the weight data type. - act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. - to_quant_block_names (str|list): A string or list whose elements are list of - block's layer names to be quantized. - enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning - enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer function - **kwargs: Additional keyword arguments. - - Returns: - The quantized model. 
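The method bodies deleted just below (_get_optimizer, _get_scaler, _scale_loss_and_backward, _step) are standard AMP plumbing: resolve an optimizer by name from torch.optim, scale the loss through a CUDA GradScaler when AMP is enabled, and keep the step/zero_grad/scheduler/update ordering consistent. A self-contained sketch of that pattern, following the removed code (the HPU htcore.mark_step() calls are omitted):

import torch
from torch.cuda.amp import GradScaler

def resolve_optimizer(optimizer="AdamW"):
    # a string such as "AdamW" is looked up on torch.optim; a class or None passes through
    if optimizer is None:
        return torch.optim.AdamW
    return getattr(torch.optim, optimizer) if isinstance(optimizer, str) else optimizer

def train_step(scaler, optimizer, lr_schedule, loss):
    # same ordering as the deleted _scale_loss_and_backward/_step pair
    if scaler is not None:
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        optimizer.zero_grad()
        lr_schedule.step()
        scaler.update()
    else:
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr_schedule.step()

# scaler is typically GradScaler(init_scale=1024, growth_interval=100000) on CUDA, None on CPU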
- """ - - bits: int | None - group_size: int | None - sym: bool | None - data_type: str | None - act_bits: int | None - act_group_size: int | None - act_sym: bool | None - act_data_type: str | None - act_dynamic: bool | None - super_bits: int | None - super_group_size: int | None - - def __init__( - self, - model: Union[torch.nn.Module, str], - tokenizer=None, - platform="hf", - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, - dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", - iters: int = 200, - seqlen: int = 2048, - nsamples: int = 128, - batch_size: int = 8, - gradient_accumulate_steps: int = 1, - low_gpu_mem_usage: bool = False, - device_map: Union[str, int, torch.device, dict] = 0, - enable_torch_compile: bool = False, - seed: int = 42, - optimizer="AdamW", - **kwargs, - ): - super(AdamCompressor, self).__init__( - model=model, - tokenizer=tokenizer, - platform=platform, - scheme=scheme, - layer_config=layer_config, - batch_size=batch_size, - dataset=dataset, - low_gpu_mem_usage=low_gpu_mem_usage, - iters=iters, - seqlen=seqlen, - nsamples=nsamples, - seed=seed, - gradient_accumulate_steps=gradient_accumulate_steps, - enable_torch_compile=enable_torch_compile, - device_map=device_map, - **kwargs, - ) - - self.optimizer = self._get_optimizer(optimizer) - - def _get_optimizer(self, optimizer): - if optimizer is None: - optimizer = torch.optim.AdamW - elif isinstance(optimizer, str): - optimizer = getattr(torch.optim, optimizer) - else: - optimizer = optimizer - return optimizer - - def _get_scaler(self): - scaler = None - if self.amp and not check_is_cpu(self.device): - from torch.cuda.amp import GradScaler - - scaler = GradScaler(init_scale=1024, growth_interval=100000) - return scaler - - def _scale_loss_and_backward(self, scaler, loss): - if scaler is not None: - loss = scaler.scale(loss) - - loss.backward() - if is_hpex_available(): - htcore.mark_step() - return loss - - def _step(self, scaler, optimizer, lr_schedule): - if scaler is not None: - scaler.step(optimizer) - optimizer.zero_grad() - lr_schedule.step() - scaler.update() - else: - optimizer.step() - optimizer.zero_grad() - lr_schedule.step() - if is_hpex_available(): - htcore.mark_step() diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py old mode 100644 new mode 100755 index b1f46cc1e..3d43ce929 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -50,7 +50,6 @@ reset_params, set_layer_config, ) -from auto_round.data_type import QUANT_FUNC_WITH_DTYPE from auto_round.data_type.utils import reshape_pad_tensor_by_group_size from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG, ModelType from auto_round.formats import OutputFormat, get_formats @@ -258,6 +257,7 @@ def __init__( self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False self.trust_remote_code = kwargs.pop("trust_remote_code") if "trust_remote_code" in kwargs else True self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False + self.enable_adam = kwargs.pop("enable_adam") if "enable_adam" in kwargs else False self.quantized = False if isinstance(model, str): model, tokenizer = llm_load_model( @@ -425,7 +425,6 @@ def __init__( self.not_use_best_mse = not_use_best_mse self.dynamic_max_gap = dynamic_max_gap self.lr_scheduler = lr_scheduler - self.optimizer = self._get_optimizer(None) self.disable_opt_rtn = disable_opt_rtn # Whether to pack the layer immediately 
after tuning @@ -912,277 +911,6 @@ def _get_save_folder_name(self, format: OutputFormat) -> str: return self.orig_output_dir - @torch.inference_mode() - def _quantize_embedding_layer(self): - """Quantizes embedding layers in the model according to the configuration. - - This method iterates through all modules in the model, identifies embedding - layers specified in `self.layer_config`, and applies the appropriate quantization - function based on bit precision, grouping strategy, and dtype. - - Returns: - bool: True if the quantization process completes without critical errors. - """ - is_quantized = False - for name, module in self.model.named_modules(): - # Skip non-Embedding modules or layers not in config - if not isinstance(module, torch.nn.Embedding) or name not in self.layer_config: - continue - - config = self.layer_config[name] - - # Skip layers that are not marked for quantization - if not check_to_quantized(config): - continue - is_quantized = True - config["scale_dtype"] = self.scale_dtype - dtype = config["data_type"] - - # Determine quantization function key with symmetry/asymmetry - if dtype not in QUANT_FUNC_WITH_DTYPE: - dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}" - - # Optionally use optimized rounding (RTN) variant - if not self.disable_opt_rtn and f"rtn_{dtype}" in QUANT_FUNC_WITH_DTYPE: - dtype = f"rtn_{dtype}" - - quant_func = QUANT_FUNC_WITH_DTYPE[dtype] - dtype = module.weight.dtype - # As typically float32 are used in RTN to search scale zp, - # to avoid cache a bf16 copy we'd better use float32 - if config.get("super_group_size", None) is not None: - dtype = torch.float32 - - # Attempt quantization on GPU, fall back to CPU if OOM - try: - weight, scale, zp = quant_func( - module.weight.to(dtype=dtype, device=self.device), - **{ - k: config.get(k, None) - for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] - }, - ) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - try: - logger.error(cuda_error_msg) - logger.warning("falling back to CPU") - weight, scale, zp = quant_func( - module.weight.to("cpu"), - **{ - k: config.get(k, None) - for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] - }, - ) - except Exception as e: - raise - - # Overwrite the module's weights with the quantized version - module.weight.data.copy_(weight.cpu()) - - # Attach scale and zero point (zp) to the module - for param_name, value in zip(["scale", "zp"], [scale, zp]): - if isinstance(value, dict): - for k, v in value.items(): - setattr(module, k if k == "scale" else f"w_{k}", v.cpu()) - elif isinstance(value, torch.Tensor): - setattr(module, param_name, value.cpu()) - else: - setattr(module, param_name, value) - - # Update config - self.layer_config.setdefault(name, {}).update(config) - del weight - del scale - del zp - clear_memory(device_list=self.device_list) - - return is_quantized - - def _quant_rtn_with_imatrix(self, all_to_quantized_module_names: list[str]) -> None: - """Performs RTN quantization using input activation statistics (imatrix). - - This method accumulates per-channel second-moment activation statistics (imatrix) - via forward hooks and uses them to perform RTN quantization. If CUDA memory runs out, - it falls back to CPU-based blockwise quantization. - - Args: - all_to_quantized_module_names (list[str]): - A list of module names (e.g., 'model.layers.0.self_attn.q_proj') to be quantized. 
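A detail worth keeping in mind from the _quantize_embedding_layer deletion above: the quantization kernel is chosen by key lookup in QUANT_FUNC_WITH_DTYPE, appending a _sym/_asym suffix when the plain data type is not registered and preferring an rtn_-prefixed optimized variant unless disable_opt_rtn is set. The same lookup in isolation (the mapping itself lives in auto_round.data_type; the example keys are illustrative):

def resolve_quant_func(quant_funcs: dict, data_type: str, sym: bool, disable_opt_rtn: bool):
    key = data_type
    if key not in quant_funcs:
        key = f"{data_type}_{'sym' if sym else 'asym'}"   # e.g. "int" -> "int_sym"
    if not disable_opt_rtn and f"rtn_{key}" in quant_funcs:
        key = f"rtn_{key}"                                # optimized RTN variant wins when present
    return quant_funcs[key]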
- - Returns: - None - """ - logger.info("start to compute imatrix") - - # Load dataset - from auto_round.calib_dataset import get_dataloader - - if isinstance(self.dataset, str): - if self.tokenizer is None: - raise ValueError("A tokenizer must be set for the model when using a dataset string.") - dataset_name = self.dataset.replace(" ", "") - self.dataloader = get_dataloader( - self.tokenizer, self.seqlen, dataset_name, self.seed, self.batch_size, self.nsamples - ) - else: - self.dataloader = self.dataset - - model = self.model - - # Dispatch multi-GPU model if necessary - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - dispatch_model(model, model.hf_device_map) - - def register_act_hook(model): - """Registers hooks to accumulate activation squared norms into `imatrix`.""" - - def get_imatrix_hook(module, input, output): - input = input[0] if isinstance(input, (tuple, list)) else input - flattened = input.reshape(-1, input.shape[-1]).to(torch.float32) - squared = torch.sum(torch.pow(flattened, 2), dim=0).to(torch.float32) - - if not hasattr(module, "imatrix"): - module.imatrix = squared - module.imatrix_cnt = input.shape[0] - else: - module.imatrix += squared.to(module.imatrix.device) - module.imatrix_cnt += input.shape[0] - - hook_handles = [] - for name, module in model.named_modules(): - if type(module) in self.supported_types and check_to_quantized(module): - hook = module.register_forward_hook(get_imatrix_hook) - hook_handles.append(hook) - return hook_handles - - hooks = register_act_hook(model) - - try: - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(model) - safe_to_cpu_(model) - clear_memory(device_list=self.device_list) - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - try: - logger.error(cuda_error_msg) - # Final fallback: warn and use CPU-only quantization - logger.warning( - "Fallback to CPU. " - "Consider enabling `low_gpu_mem_usage` or using more GPUs via `--device 0,1,2,3`." - ) - safe_to_cpu_(model) - clear_memory(device_list=self.device_list) - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(model) - - orig_device = self.device - self.device = "cpu" - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - self.device = orig_device - except Exception as e: - raise - finally: - # Always remove hooks - for hook in hooks: - hook.remove() - - def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: - """Quantizes a layer using RTN (Round-To-Nearest) if available. - - This function attempts to quantize a layer by switching its data type to a - `rtn_*` version if supported, then wraps and unwraps the module to apply - quantization. If GPU memory is insufficient, it falls back to CPU. - - If packing is enabled (`immediate_packing`), the function will also export - the quantized layer to the appropriate backend format. - - Args: - name (str): Name of the layer to quantize. - - Raises: - RuntimeError: If quantization fails for reasons unrelated to memory. 
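The imatrix collection deleted above reduces to a forward hook that accumulates per-input-channel sums of squared activations together with a sample count; the block-wise path later normalizes with m.imatrix /= m.imatrix_cnt before the RTN kernels consume the statistics. The hook on its own, following the removed code:

import torch

def attach_imatrix_hook(module: torch.nn.Module):
    """Accumulate per-channel sums of squared inputs on module.imatrix."""
    def hook(mod, inputs, output):
        x = inputs[0] if isinstance(inputs, (tuple, list)) else inputs
        flat = x.reshape(-1, x.shape[-1]).to(torch.float32)
        sq = torch.sum(flat.pow(2), dim=0)          # per-channel sum of squares
        if not hasattr(mod, "imatrix"):
            mod.imatrix, mod.imatrix_cnt = sq, x.shape[0]
        else:
            mod.imatrix += sq.to(mod.imatrix.device)
            mod.imatrix_cnt += x.shape[0]
    return module.register_forward_hook(hook)

# after the calibration passes: module.imatrix /= module.imatrix_cnt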
- """ - m = get_module(self.model, name) - if dtype is not None: - m = m.to(dtype) - - if is_fp8_linear(m): - m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) - set_module(self.model, name, m) - tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.device - # Step 1: let gguf merge layers or rename module first and we will handle the RTN is gguf specific logic - if self.is_immediate_packing and self.iters == 0 and self.formats[0].is_gguf() and not self.disable_opt_rtn: - m = m.to(tuning_device) - m.scale = None - m.zp = None - else: - try: - disable_opt_rtn = self.disable_opt_rtn - if ( - not disable_opt_rtn - and self.orig_disable_opt_rtn is None - and self.is_moe_model - and "expert" in m.global_name - and "shared_expert" not in m.global_name - and self.super_bits is None # GGUF still uses the optimized RTN for MoE layers - ): - disable_opt_rtn = True - logger.warning_once( - "MoE layer detected: optimized RTN is disabled for efficiency. " - "Use `--enable_opt_rtn` to force-enable it for MoE layers." - ) - m = m.to(tuning_device) - m = WrapperLinear( - m, - device=tuning_device, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_round_tuning=False, - enable_torch_compile=self.enable_torch_compile, - disable_opt_rtn=disable_opt_rtn, - ) - m = m.unwrapper({}) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - m = m.orig_layer if hasattr(m, "orig_layer") else m - try: - logger.error(cuda_error_msg) - logger.warning("falling back to CPU.") - m.to("cpu") - m = WrapperLinear( - m, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_round_tuning=False, - enable_torch_compile=self.enable_torch_compile, - ) - m = m.unwrapper({}) - except Exception as e: - raise - - # Step 2: Optional immediate packing/export - if self.is_immediate_packing: # For gguf, packing conducts on block level - self._immediate_pack(name) - if to_cpu: - m = m.to("cpu") - packed_m = get_module(self.model, name) - set_module(self.model, name, packed_m.to("cpu")) - else: - if to_cpu: - m = m.to("cpu") - set_module(self.model, name, m) - if self.is_immediate_saving: - m = get_module(self.model, name) - m.to("cpu") - shard_writer(self, m, name, False) - def _immediate_pack(self, name: str): if not self.is_immediate_packing: return @@ -1195,315 +923,6 @@ def _immediate_pack(self, name: str): tokenizer=self.tokenizer, ) - @torch.inference_mode() - def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: - """Quantize all modules in the model using RTN (Round-To-Nearest) strategy. - - If the target format includes GGUF with `k`, and optimized RTN is enabled, - blockwise quantization with input caching and imatrix is used. - - Returns: - tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. 
- """ - if self.amp and self.model.dtype != self.amp_dtype: - self.model.to(self.amp_dtype) - - all_to_quantized_module_names: list[str] = [n for n, m in self.model.named_modules() if check_to_quantized(m)] - self.all_to_quantized_module_names = all_to_quantized_module_names - if is_nv_fp(self.data_type): - # FIXME: (yiliu30) change it to block-wise after we refactor the quantization code and - # https://github.com/intel/auto-round/issues/1331 - materialize_model_(self.model) - self.model.to("cpu") - from auto_round.data_type.nvfp import calculate_gparam - from auto_round.data_type.utils import update_fused_layer_global_scales - - pbar = tqdm(all_to_quantized_module_names) - for name in pbar: - pbar.set_description(f"Calculate weight global scale: {name}") - m = get_module(self.model, name) - if is_fp8_linear(m): - m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) - set_module(self.model, name, m) - weight_global_scale = calculate_gparam(m.weight, self.group_size) - setattr(m, "weight_global_scale", weight_global_scale) - - logger.info("Start to update fused layer global scales, it may take some time.") - for name, module in self.model.named_modules(): - update_fused_layer_global_scales(module) - logger.info("Finished updating fused layer global scales.") - - if not (any(fmt.is_gguf() for fmt in getattr(self, "formats", [])) or self.super_bits is not None): - self._quantize_embedding_layer() # leave to gguf itself to handle - - # Release memory - clear_memory(device_list=self.device_list) - - enable_imatrix = False - if not self.disable_opt_rtn: - has_gguf_k = ( - any(fmt.is_gguf() and "k" in fmt.output_format for fmt in getattr(self, "formats", [])) - or self.super_bits is not None - ) - if has_gguf_k: - enable_imatrix = True - elif self.data_type == "int" and self.sym: - enable_imatrix = True - if enable_imatrix: - self._quant_rtn_with_imatrix(all_to_quantized_module_names) - elif self.act_bits <= 8 and check_need_act_calibration( - self.act_dynamic, - self.act_data_type, - self.act_bits, - self.static_kv_dtype, - self.static_attention_dtype, - ): # TODO, mixed datatype has bug - hook_handles = self._register_act_max_hook(self.model) - try: - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - except torch.OutOfMemoryError: - logger.warning("Fallback to CPU. 
Consider using more GPUs via `--device 0,1,2,3`.") - self.model = self.model.to("cpu") - clear_memory(device_list=self.device_list) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(self.model) - orig_device = self.device - self.device = "cpu" - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - self.device = orig_device - for handle in hook_handles: - handle.remove() - else: - # By default, we go with layer-wise way if no replacement happened - use_blockwise_quantization = global_state.replaced_module_count > 0 - tied_weights_keys = getattr(self.model, "_tied_weights_keys", []) - if tied_weights_keys is None: - tied_weights_keys = [] - if isinstance(tied_weights_keys, dict): - tied_weights_values = list(tied_weights_keys.values()) - else: - tied_weights_values = list(tied_weights_keys) - tied_weights_layers = [".".join(val.split(".")[:-1]) for val in tied_weights_values] # rm weight/bias - # In fact, we should detect whether it is is_separate_lm_head, to simplify, we don't do it - if hasattr(self, "formats") and self.formats[0].is_gguf(): - lm_head_name = get_lm_head_name(self.model) - if lm_head_name is not None: - tied_weights_layers.append(lm_head_name) - - if use_blockwise_quantization: # The ram usage is a little higher - all_to_quantized_module_names = list(set(all_to_quantized_module_names)) - - all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) - pbar = tqdm(range(sum(len(block) for block in all_blocks))) - for block_names in all_blocks: - for block_name in block_names: - pbar.set_description(f"Quantizing {block_name}") - block = get_module(self.model, block_name) - materialize_model_(block) - for name, m in block.named_modules(): - if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.global_name, to_cpu=self.low_gpu_mem_usage) - all_to_quantized_module_names.remove(m.global_name) - elif ( - not any(m.children()) - and len(m.state_dict()) > 0 - and m.global_name not in tied_weights_layers - ): - set_module(self.model, m.global_name, copy.deepcopy(m)) - if self.is_immediate_saving: - shard_writer(self, name=m.global_name) - m.to("meta") - clear_memory(device_list=self.device_list) - memory_monitor.log_summary() - pbar.update(1) - cnt = 1 - for name in all_to_quantized_module_names: - logger.info(f"Quantizing remaining layer {name} on CPU.") - self._quantize_layer_via_rtn(name, to_cpu=True) - cnt += 1 - if cnt % 10 == 0: - clear_memory(device_list=self.device_list) - memory_monitor.log_summary() - else: - materialize_model_(self.model) - self.model.to("cpu") - block_names_cnt = len(flatten_list(get_block_names(self.model, True))) - clear_mem_freq = len(all_to_quantized_module_names) // block_names_cnt - cnt = 0 - pbar = tqdm(all_to_quantized_module_names) - - for n, m in self.model.named_modules(): - if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: - pbar.set_description(f"Quantizing {m.global_name}") - self._quantize_layer_via_rtn(m.global_name) - cnt += 1 - pbar.update() - if cnt % clear_mem_freq == 0: - clear_memory(device_list=self.device_list) - memory_monitor.log_summary() - - elif not any(m.children()) and len(m.state_dict()) > 0 and n not in tied_weights_layers: - set_module(self.model, n, copy.deepcopy(m)) - if self.is_immediate_saving: - shard_writer(self, name=n) - m.to("meta") - - # Convert remaining fp8 - if 
is_fp8_model(self.model): - convert_fp8_module_to_16b(self.model, self.amp_dtype, self.device) - if self.is_immediate_saving: - shard_writer(self, is_finalize=True) - - self.quantized = True - return self.model, self.layer_config - - def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) -> None: - """Quantize model layers block by block using cached inputs and imatrix. - - Args: - all_to_quantized_module_names (list[str]): Names of layers to be quantized. - """ - all_to_quantized_module_names = list(set(all_to_quantized_module_names)) - - all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) - if not all_blocks: - raise ValueError("Could not find any blocks. Check the model or quant_block_list.") - - all_first_block_names = [block[0] for block in all_blocks] - layer_names = self._get_quantized_layer_names_outside_blocks() - if self.act_bits < 16 and (not self.act_dynamic or len(layer_names) > 0): - if len(layer_names) > 0: - logger.warning( - "quantize layers outside blocks for static activation quantizaiton" - " will significantly increase calibration time" - ) - all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names) - else: - all_inputs = self.cache_inter_data(all_first_block_names, self.nsamples) - - # Clear hooks for multi-GPU setups - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules(self.model) - - pbar = tqdm(range(sum(len(block) for block in all_blocks))) - - for block_names in all_blocks: - first_block = block_names[0] - inputs = all_inputs.pop(first_block) - input_keys = [k for k in inputs if k.startswith("hidden_state")] - if len(input_keys) != 1: - raise RuntimeError( - "hidden_states arg mismatch. 
Please file an issue at https://github.com/intel/auto-round/issues" - ) - inputs["input_ids"] = inputs.pop(input_keys[0]) - - clear_memory(self.inputs, device_list=self.device_list) - - total_samples = len(inputs["input_ids"]) - if total_samples < self.batch_size: - self.batch_size = total_samples - logger.warning(f"Forcing batch size to {total_samples}") - - input_ids = to_device(inputs.pop("input_ids"), self.cache_device) - input_others = to_device(inputs, self.cache_device) - - tmp_dtype = self.amp_dtype if self.amp else torch.float32 - input_ids = [id_.to(tmp_dtype) for id_ in input_ids] - - for key, val in input_others.items(): - if isinstance(val, torch.Tensor) and val.dtype in (torch.float16, torch.bfloat16): - input_others[key] = val.to(tmp_dtype) - elif isinstance(val, list): - input_others[key] = [to_dtype(v, tmp_dtype) for v in val] - - for block_name in block_names: - pbar.set_description(f"Quantizing {block_name}") - block = get_module(self.model, block_name) - materialize_model_(block) - block.to("cpu") - if is_fp8_model(self.model): - convert_fp8_module_to_16b(block, dtype=self.amp_dtype, device=self.device) - - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: - set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, self.device - ) - # Dispatch model if needed - if len(self.device_list) > 1: - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - for _, m in block.named_modules(): - if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): - continue - hook = AlignDevicesHook(m.tuning_device, io_same_device=True) - add_hook_to_module(m, hook, True) - else: - block = block.to(self.device) - input_ids = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - self.device, - self.cache_device, - ) - - if len(self.device_list) > 1: - accelerate.hooks.remove_hook_from_submodules(block) - - if is_nv_fp(self.act_data_type) or is_static_wfp8afp8(self): - # enable moe experts act_max automatic generation for Linear - set_amax_for_all_moe_layers(block, attr_name="act_max") - # Normalize imatrix and quantize layers - if self.low_gpu_mem_usage: - block.to("cpu") - clear_memory(device_list=self.device_list) - - for name, m in block.named_modules(): - # fix issue: Ling-flash-2.0-q2_k_s fail infer on cuda but well on cpu - # https://huggingface.co/Intel/Ling-flash-2.0-gguf-q2ks-mixed-AutoRound/discussions/1 - if hasattr(m, "imatrix"): - m.imatrix /= m.imatrix_cnt - if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.global_name, to_cpu=self.low_gpu_mem_usage) - all_to_quantized_module_names.remove(m.global_name) - - if not self.is_immediate_saving: - # some modules may have been flushed and set to meta, so we could not move to gpu - mv_module_from_gpu(block) - if block_name == block_names[-1]: - clear_memory(input_ids, device_list=self.device_list) - else: - clear_memory(device_list=self.device_list) - - memory_monitor.log_summary() - pbar.update(1) - pbar.close() - # Process remaining layers not in blocks - for name in all_to_quantized_module_names: - dtype = None - if self.super_group_size is not None: - dtype = torch.float32 - self._quantize_layer_via_rtn(name, dtype=dtype) - # clear_memory(device_list=self.device_list) - # if self.is_immediate_saving: - # shard_writer(self, is_finalize=True) - - def _update_inputs(self, inputs: dict, q_inputs: dict) -> 
tuple[dict, torch.Tensor]: - keys = inputs.keys() - input_id_str = [key for key in keys if key.startswith("hidden_state")] - if len(input_id_str) != 1: - raise RuntimeError( - "hidden_states arg mismatch error," - "please raise an issue in https://github.com/intel/auto-round/issues" - ) - inputs["input_ids"] = inputs.pop(input_id_str[0], None) - if q_inputs is not None: - q_inputs = q_inputs.pop(input_id_str[0], None) - return inputs, q_inputs - def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True): is_gguf_format = (f := getattr(self, "formats", None)) is not None and len(f) > 0 and f[0].is_gguf() if not is_gguf_format: @@ -1639,214 +1058,14 @@ def _should_disable_inplace_due_to_layers_outside_block() -> bool: # Determine if immediate packing is required self._adjust_immediate_packing_and_saving() - if self.iters == 0: - return self._quantize_rtn() - - if bool(self.quant_block_list): - all_blocks = self.quant_block_list - else: - all_blocks = get_block_names(self.model) - - if len(all_blocks) == 0: - logger.warning("could not find blocks, exit with original model") - return self.model, self.layer_config - - if self.amp and self.model.dtype != self.amp_dtype: - self.model = self.model.to(self.amp_dtype) - - layer_names = self._get_quantized_layer_names_outside_blocks() - start_time = time.time() - all_first_block_names = [block[0] for block in all_blocks] - if len(layer_names) > 0: - logger.info( - "Starting to cache block inputs. This may be slow due to external block layers: %s", layer_names - ) - else: - logger.info("start to cache block inputs") - all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) - is_quantized_embedding = self._quantize_embedding_layer() - clear_memory(device_list=self.device_list) - all_q_inputs = None - if is_quantized_embedding: - all_inputs = copy.deepcopy(self.inputs) - clear_memory(self.inputs, device_list=self.device_list) - all_q_inputs = self.try_cache_inter_data_gpucpu( - all_first_block_names, self.nsamples, layer_names=layer_names - ) - self.model = mv_module_from_gpu(self.model) - clear_memory(device_list=self.device_list) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules(self.model) # self.model.hf_device_map has not been changed - logger.info("caching done") - if len(all_blocks) > 1: - pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks)) - else: - pbar = tqdm(range(0, len(all_blocks[0]), self.nblocks)) # move the alg warning outside pbar - - for block_names in all_blocks: - inputs = all_inputs[block_names[0]] - all_inputs.pop(block_names[0]) - q_inputs = None - if all_q_inputs is not None: - q_inputs = all_q_inputs[block_names[0]] - all_q_inputs.pop(block_names[0]) - - inputs, q_inputs = self._update_inputs(inputs, q_inputs) - - clear_memory(self.inputs, device_list=self.device_list) - - if "input_ids" in inputs.keys(): - total_samples = len(inputs["input_ids"]) - if total_samples < self.batch_size: - self.batch_size = total_samples - logger.warning(f"force the train batch size to {total_samples}") - - self._quantize_blocks( - self.model, - inputs, - block_names, - q_input=q_inputs if q_inputs is not None else None, - nblocks=self.nblocks, - device=self.device, - pbar=pbar, - ) - if self.is_immediate_packing and len(self.formats) != 1: - raise ValueError( - f"Expected exactly one packing format when 'immediate_packing' is True, " - f"but got {len(self.formats)} formats." 
- ) - pbar.set_description("Quantizing done") - pbar.close() - self._quantize_layers(layer_names, all_inputs) - - if is_fp8_model(self.model): - for n, m in self.model.named_modules(): - if is_fp8_linear(m): - new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device).to("cpu") - set_module(self.model, n, new_layer) - if self.is_immediate_saving: - shard_writer(self, is_finalize=True) - - end_time = time.time() - cost_time = end_time - start_time - logger.info(f"quantization tuning time {cost_time}") - - # Dump a summary - quantized_layers = [] - unquantized_layers = [] - for n, m in self.model.named_modules(): - if isinstance(m, tuple(self.supported_types)): - if check_to_quantized(m): - quantized_layers.append(n) - else: - unquantized_layers.append(n) - elif hasattr(m, "scales") or hasattr(m, "scale"): # packing_immediately - quantized_layers.append(n) - summary_info = ( - f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} in the model" - ) - if len(unquantized_layers) > 0: - summary_info += f", {unquantized_layers} have not been quantized" - logger.info(summary_info) - - self.quantized = True - return self.model, self.layer_config + if self.immediate_saving and "int" not in self.data_type: + logger.warning("immediate_saving is only supported for int quantization, set to False") + self.immediate_saving = False - def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: - """Quantizes specified layers based on inputs and configuration. + from auto_round.quantizers import create_quantizers - Args: - layer_names (list): list of layer names to quantize. - layer_inputs (dict): Dictionary mapping layer names to input data. - - Returns: - None - """ - # TODO currently we take all the layers outside blocks as post block layers which is not optimal - # if there is no input for layer, we use rtn - - for layer_name in copy.deepcopy(layer_names): - if layer_name not in layer_inputs: - if self.act_bits < 16 and not self.act_dynamic: - # Activation quantization requires collected inputs - msg_prefix = ( - f"Activation max hook for layer '{layer_name}' is unavailable due to " - f"insufficient collected inputs. " - ) - if "fp8_e5m2" in self.act_data_type: - logger.warning(msg_prefix + "Please notes that unit scale is used for this layer.") - else: - logger.warning( - msg_prefix + "Static activation quantization is not supported or ineffective, " - "Skipping quantization for this layer." 
- ) - layer_names.remove(layer_name) - continue - logger.info(f"using rtn to quantize {layer_name}") - from auto_round.data_type import QUANT_FUNC_WITH_DTYPE - - layer = get_module(self.model, layer_name) - layer = layer.to(self.device) - if is_fp8_linear(layer): - new_layer = convert_fp8_layer_to_linear(layer, self.amp_dtype, self.device).to(self.device) - set_module(self.model, layer_name, new_layer) - layer = new_layer - - wrapper_layer = WrapperLinear( - layer, - enable_round_tuning=False, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_torch_compile=self.enable_torch_compile, - device=self.device, - disable_opt_rtn=self.disable_opt_rtn, - ) - new_layer = wrapper_layer.unwrapper({}) - set_module(self.model, layer_name, new_layer) - layer.cpu() - layer_names.remove(layer_name) - if len(layer_names) == 0: - memory_monitor.update() - memory_monitor.log_summary() - return - q_layer_inputs = None - enable_quanted_input = self.enable_quanted_input - has_gguf = False - - if hasattr(self, "formats"): - has_gguf = any(format_.is_gguf() for format_ in self.formats) - if has_gguf and self.is_immediate_packing: - enable_quanted_input = False - - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1 and enable_quanted_input: - dispatch_model(self.model, self.model.hf_device_map) - - if enable_quanted_input: - logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) - q_layer_inputs = self.try_cache_inter_data_gpucpu([], self.nsamples, layer_names=layer_names) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules( - self.model - ) # self.model.hf_device_map has not been changed - if not self.is_immediate_saving: - self.model = mv_module_from_gpu(self.model) - clear_memory(device_list=self.device_list) - quant_layer = self._quantize_layer - for layer_name in layer_names: - layer_input = layer_inputs[layer_name] - layer_input = to_device(layer_input, self.cache_device) - q_layer_input = q_layer_inputs.get(layer_name, None) if q_layer_inputs is not None else None - q_layer_input = to_device(q_layer_input, self.cache_device) - quant_layer(layer_name, layer_input, q_layer_input, device=self.device) - if self.is_immediate_packing: - self._immediate_pack(layer_name) - - if self.is_immediate_saving: - m = get_module(self.model, layer_name) - shard_writer(self, m, name=layer_name, is_finalize=False) - del layer_input - clear_memory(q_layer_input, device_list=self.device_list) - memory_monitor.log_summary() + quantizers = create_quantizers(self) + return quantizers.quantize() @torch.no_grad() def _get_block_outputs( @@ -2399,244 +1618,6 @@ def _replace_forward(self): hook_handle = m.register_forward_hook(hook_func) self.hook_handles.append(hook_handle) - def _register_act_max_hook(self, model): - def get_act_max_hook(module, input, output): - if isinstance(input, (tuple, list)): - input = input[0] - if input.numel() == 0: - return # as no needs for act_max update - input, _, _ = reshape_pad_tensor_by_group_size(input, self.act_group_size) - act_max = torch.max(torch.abs(input), dim=-1).values - if not hasattr(module, "act_max") or module.act_max.numel() == 0: - module.act_max = act_max - else: - act_max = act_max.to(module.act_max.device) - if is_nv_fp(self.act_data_type): ## for nvfp per-tensor input_global_scale calculation usage - module.act_max = torch.max( - torch.tensor([act_max.max(), module.act_max.max()], device=act_max.device) - ) - else: - 
module.act_max = torch.max(act_max, module.act_max) - - hook_handles = [] - # for single layers out of blocks, like lm_head - if isinstance(model, SUPPORTED_LAYER_TYPES): - m = model - if ( - hasattr(m, "act_dynamic") - and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) - and check_to_quantized(m) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - return hook_handles - - for n, m in model.named_modules(): - if ( - hasattr(m, "act_dynamic") - and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) - and check_to_quantized(m) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - continue - - # for whole model, RTN - if n in self.layer_config: - config = self.layer_config[n] - act_dynamic = config.get("act_dynamic", True) - act_data_type = config.get("act_data_type", None) - act_bits = config.get("act_bits", 16) - if ( - config["bits"] <= 8 - and check_need_act_calibration(act_dynamic, act_data_type, act_bits) - and check_to_quantized(config) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - continue - return hook_handles - - def _quantize_layer( - self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu" - ): - """Quantize a specific layer of the model using the provided inputs. - - Args: - layer_name (str): The name of the layer to quantize. - inputs (torch.Tensor): Input data for quantization. - q_inputs (torch.Tensor, optional): Quantized input data. Defaults to None. - device (torch.device, optional): The device to use for quantization. Defaults to torch.device("cpu"). - - Returns: - None - """ - logger.info(f"quantizing layer {layer_name}") - layer = get_module(self.model, layer_name) - if hasattr(layer, "tuning_device"): - device = layer.tuning_device - - layer = layer.to(device) - for i in range(len(inputs)): - inputs[i] = inputs[i].to(layer.weight.dtype) - if q_inputs is not None: - q_inputs[i] = q_inputs[i].to(layer.weight.dtype) - - if self.act_bits <= 8 and check_need_act_calibration( - self.act_dynamic, - self.act_data_type, - self.act_bits, - self.static_kv_dtype, - self.static_attention_dtype, - ): - tmp_inputs = q_inputs if q_inputs is not None else inputs - hook_handles = self._register_act_max_hook(layer) - with torch.no_grad(): - for input in tmp_inputs: - layer(input) - for handle in hook_handles: - handle.remove() - - wrapper_linear = WrapperLinear( - layer, - enable_minmax_tuning=self.enable_minmax_tuning, - enable_torch_compile=self.enable_torch_compile, - device=device, - ).to(device) - round_params = [] - minmax_params = [] - for key in wrapper_linear.params.keys(): - if "min" in key or "max" in key: - minmax_params.append(wrapper_linear.params[key]) - else: - round_params.append(wrapper_linear.value) - if len(round_params) + len(minmax_params) <= 0: - dump_info = f"quantized {layer_name}" - logger.info(dump_info) - with torch.no_grad(): - unwrapper_layer(self.model, wrapper_linear, layer_name, {}) - mv_module_from_gpu(layer) - - lr = torch.tensor(self.lr) - minmax_lr = torch.tensor(self.minmax_lr) - if self.enable_minmax_tuning: - optimizer = self.optimizer( - [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 - ) - else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0) - - if self.lr_scheduler is None: - lr_schedule = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters - ) - 
else: - lr_schedule = copy.deepcopy(self.lr_scheduler) - nsamples = len(inputs) - last_best_iter = 0 - best_loss = torch.finfo(torch.float).max - scaler = self._get_scaler() # pylint: disable=assignment-from-none - init_loss = None - gradient_accumulate_steps = self.batch_size # Force to low gpu - - total_loss = 0 - num_elm = 1 - mse_reduction = "mean" - if gradient_accumulate_steps != 1: - mse_reduction = "sum" - mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) - batch_size = 1 # Force to low gpu - global_batch_size = self.batch_size * gradient_accumulate_steps - global_batch_size = min(nsamples, global_batch_size) - if gradient_accumulate_steps != 1 and not self.attention_mask: - whole_indices = torch.arange(global_batch_size) - if q_inputs is not None: - num_elm = self._get_current_num_elm(q_inputs, whole_indices) - else: - num_elm = self._get_current_num_elm(inputs, whole_indices) - - index_sampler = IndexSampler(nsamples, global_batch_size) - - for i in range(self.iters): - total_loss = 0 - global_indices = index_sampler.next_batch() - if self.attention_mask: - num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) - - for tmp_step in range(gradient_accumulate_steps): - indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - if q_inputs is not None: - current_input = [q_inputs[i] for i in indices] - current_input = torch.cat(current_input, dim=0).to(device) - org_input = [inputs[i] for i in indices] - org_input = torch.cat(org_input, dim=0).to(device) - else: - current_input = [inputs[i] for i in indices] - current_input = torch.cat(current_input, dim=0).to(device) - org_input = current_input - with torch.no_grad(): - current_output = layer(org_input) - autocast_ctx = ( - nullcontext() - if not self.amp - else autocast(device_type=str(device).split(":")[0], dtype=self.amp_dtype) - ) - if self.attention_mask: - tmp_attention_mask = [self.attention_mask[i] for i in indices] - tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) - tmp_attention_mask.unsqueeze_(-1) - - with autocast_ctx: - output_q = wrapper_linear(current_input) # pylint: disable=not-callable - loss = mse_loss( # pylint: disable=not-callable - (output_q * tmp_attention_mask).to(torch.float32), - (current_output * tmp_attention_mask).to(torch.float32), - ) - - else: - with autocast_ctx: - output_q = wrapper_linear(current_input) # pylint: disable=not-callable - loss = mse_loss( # pylint: disable=not-callable - output_q.to(torch.float32), - current_output.to(torch.float32), # mul 1.0 will copy the output - ) - - num_elm = 1 if num_elm <= 0 else num_elm - total_loss += loss.item() / num_elm - - self._scale_loss_and_backward(scaler, loss) - if i == 0: - init_loss = total_loss - - if total_loss < best_loss: - best_loss = total_loss - if not self.not_use_best_mse: - best_params = collect_best_params(wrapper_linear, self.cache_device) - last_best_iter = i - if self.not_use_best_mse and i == self.iters - 1: - best_params = collect_best_params(wrapper_linear, self.cache_device) - - if not self.not_use_best_mse: - if 0 < self.dynamic_max_gap <= i - last_best_iter: - break - self._step(scaler, optimizer, lr_schedule) - - last_loss = total_loss - best_iter = self.iters - if not self.not_use_best_mse: - last_loss = best_loss - best_iter = last_best_iter - with torch.no_grad(): - unwrapper_layer(self.model, wrapper_linear, layer_name, best_params) - mv_module_from_gpu(layer) - dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: 
{last_loss:.6f}" - logger.info(dump_info) - - def _get_current_output(self, output: list[torch.Tensor], indices: list[int]) -> torch.Tensor: - current_output = [output[x] for x in indices] - current_output = torch.cat(current_output, dim=self.batch_dim) - return current_output - def _get_current_q_output( self, block: torch.nn.Module, @@ -2665,13 +1646,6 @@ def _get_current_num_elm( current_input_ids = [input_ids[i] for i in indices] return sum(id.numel() for id in current_input_ids) - def _get_non_zero_cnt(self, tensor: list[torch.Tensor], indices: list[int]) -> int: - current_tensors = [tensor[i] for i in indices] - non_zero_cnt = 0 - for t in current_tensors: - non_zero_cnt += torch.count_nonzero(t).item() - return non_zero_cnt - def quantize_block( self, block: torch.nn.Module, @@ -2694,8 +1668,20 @@ def quantize_block( self.normalize_decoding_layer_inputs_(inputs) block_inputs = self.inputs[self.quant_block_list[0][0]] decoding_layer_first_input_name = "hidden_states" - input_ids, input_others = self._preprocess_block_inputs(block_inputs, decoding_layer_first_input_name) - return self._quantize_block(block, input_ids, input_others, q_input, device, auto_offload) + from auto_round.quantizers.algs.auto_round import ARQuantizer + from auto_round.quantizers.utils import preprocess_block_inputs + + input_ids, input_others = preprocess_block_inputs( + block_inputs, + device_list=self.device_list, + first_input_name=decoding_layer_first_input_name, + amp=self.amp, + amp_dtype=self.amp_dtype, + cache_device=self.cache_device, + is_diffusion=self.diffusion, + ) + ar_quantizer = ARQuantizer(self) + return ar_quantizer.quantize_block(block, input_ids, input_others, q_input, device, auto_offload) def _get_loss( self, @@ -2726,384 +1712,6 @@ def _get_loss( return loss - def _quantize_block( - self, - block: torch.nn.Module, - input_ids: Union[list[torch.Tensor], dict], - input_others: dict, - q_input: Union[torch.Tensor, dict, None] = None, - device: Union[str, torch.device] = "cpu", - auto_offload=True, - ): - """Quantize the weights of a given block of the model. - - Args: - block: The block of the model to be quantized. - input_ids: The input tensor containing tokenized input ids. - input_others: A dictionary containing additional input data. - q_input: The quantized input tensor. - device: The device for quantization. 
- - Returns: - Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) - """ - materialize_model_(block) - if is_fp8_model(self.model): - for n, m in block.named_modules(): - if is_fp8_linear(m): - new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device).to(device) - set_module(block, n, new_layer) - - if auto_offload: - # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights - # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: - card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device - ) - else: - block = block.to(device) - card_0_in_high_risk, loss_device = False, device - else: - card_0_in_high_risk, loss_device = False, device - - if len(self.device_list) > 1 and auto_offload: - for n, m in block.named_modules(): - if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): - continue - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - hook = AlignDevicesHook(m.tuning_device, io_same_device=True) - add_hook_to_module(m, hook, True) - - if q_input is None: - hook_handles = self._register_act_max_hook(block) - - output = self._get_block_outputs( - block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device - ) - - for handle in hook_handles: - handle.remove() - else: - output = self._get_block_outputs( - block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device - ) - hook_handles = self._register_act_max_hook(block) - if hook_handles: - self._get_block_outputs( - block, - q_input, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - self.cache_device, - save_output=False, - ) - - for handle in hook_handles: - handle.remove() - - if q_input is not None: - if input_ids is not q_input: - clear_memory(input_ids, device_list=self.device_list) - else: - clear_memory(device_list=self.device_list) - input_ids = q_input - - quantized_layer_names, unquantized_layer_names = self.wrapper_block( - block, - self.enable_minmax_tuning, - self.enable_norm_bias_tuning, - enable_torch_compile=self.enable_torch_compile, - device=device, - ) - if is_nv_fp(self.data_type): # enable qkv and moe structure global_scale fuse - from auto_round.data_type.utils import update_fused_layer_global_scales - - modules = block.modules() - for module in modules: - update_fused_layer_global_scales(module) - round_params = [] - minmax_params = [] - for n, m in block.named_modules(): - if hasattr(m, "orig_layer"): - for key in m.params.keys(): - if "min" in key or "max" in key: - minmax_params.append(m.params[key]) - else: - round_params.append(m.params[key]) - - lr = torch.tensor(self.lr) - minmax_lr = torch.tensor(self.minmax_lr) - is_adam = "adam" in self.__class__.__name__.lower() - - extra_kwargs = {} if is_adam else {"momentum": self.momentum} - - if self.enable_minmax_tuning: - params = [ - {"params": round_params}, - {"params": minmax_params, "lr": minmax_lr}, - ] - else: - params = round_params - - optimizer = self.optimizer( - params, - lr=lr, - weight_decay=0, - **extra_kwargs, - ) - - if len(round_params) + len(minmax_params) <= 0: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - f"layers in the block" - ) - 
logger.info(dump_info) - unwrapper_block(block, {}) - mv_module_from_gpu(block) - return output, output - - if self.lr_scheduler is None: - lr_schedule = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters - ) - else: - lr_schedule = copy.deepcopy(self.lr_scheduler) - - if isinstance(input_ids, dict): # input_ids of Flux is dict - nsamples = len(input_ids["hidden_states"]) - else: - nsamples = len(input_ids) - last_best_iter = 0 - best_loss = torch.finfo(torch.float).max - num_elm = 1 - mse_reduction = "mean" - if self.gradient_accumulate_steps != 1: - mse_reduction = "sum" - mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) - scaler = self._get_scaler() # pylint: disable=assignment-from-none - init_loss = None - best_params = {} - total_loss = 0 - global_batch_size = self.batch_size * self.gradient_accumulate_steps - global_batch_size = min(nsamples, global_batch_size) - # We assume the block input and output shape is same - if self.gradient_accumulate_steps != 1 and not self.attention_mask: - whole_indices = torch.arange(global_batch_size) - num_elm = self._get_current_num_elm(input_ids, whole_indices) - - index_sampler = IndexSampler(nsamples, global_batch_size) - batch_size = self.batch_size - for i in range(self.iters): - if self.enable_alg_ext and self.data_type.endswith("dq"): - for n, m in block.named_modules(): - m.cur_iter = i - total_loss = 0 - global_indices = index_sampler.next_batch() - if self.attention_mask: - num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) - - for tmp_step in range(self.gradient_accumulate_steps): - indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - current_output = self._get_current_output(output, indices) - current_output = to_device(current_output, loss_device) - output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device) - loss = self._get_loss(output_q, current_output, indices, mse_loss, device) - num_elm = 1 if num_elm <= 0 else num_elm - total_loss += loss.item() / num_elm - - if self.low_gpu_mem_usage and card_0_in_high_risk: - # clear memory to avoid OOM due to memory fragmentation - clear_memory_if_reached_threshold(threshold=0.5, device_list=self.device_list) - - self._scale_loss_and_backward(scaler, loss) - - if self.low_gpu_mem_usage and card_0_in_high_risk: - # clear memory to avoid OOM due to memory fragmentation - clear_memory_if_reached_threshold(threshold=0.8, device_list=self.device_list) - - if i == 0: - init_loss = total_loss - - if total_loss < best_loss: - best_loss = total_loss - if not self.not_use_best_mse: - best_params = collect_best_params(block, self.cache_device) - # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) - - last_best_iter = i - if self.not_use_best_mse and i == self.iters - 1: - best_params = collect_best_params(block, self.cache_device) - - if not self.not_use_best_mse: - if 0 < self.dynamic_max_gap <= i - last_best_iter: - break - self._step(scaler, optimizer, lr_schedule) - - last_loss = total_loss - best_iter = self.iters - if not self.not_use_best_mse: - last_loss = best_loss - best_iter = last_best_iter - if self.iters > 0: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" - ) - else: - dump_info = ( - f"quantized 
{len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - "layers in the block" - ) - - if self.low_gpu_mem_usage: - clear_memory(device_list=self.device_list) # clear cached memory during training - if len(unquantized_layer_names) != 0: - logger.info(f"{unquantized_layer_names} have not been quantized") - with torch.no_grad(): - unwrapper_block(block, best_params) - - if is_nv_fp(self.act_data_type): - # enable moe experts act_max automatic generation for WrapperWALayer - set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") - - if self.enable_quanted_input: - q_outputs = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - cache_device=self.cache_device, - ) - - if len(self.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - - clear_memory(input_ids, device_list=self.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return q_outputs, output - else: - if len(self.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - clear_memory(input_ids, device_list=self.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return None, output - - def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[torch.Tensor, dict]: - input_ids = inputs[first_input_name] - inputs.pop(first_input_name, None) - input_others = inputs - return input_ids, input_others - - def _preprocess_block_inputs(self, inputs, first_input_name="input_ids"): - input_ids, input_others = self._split_inputs(inputs, first_input_name) - clear_memory(device_list=self.device_list) - input_ids = to_device(input_ids, self.cache_device) - input_others = to_device(input_others, self.cache_device) - # As in calibration phase, we may use bf16 for calibration due to low_gpu_memory usage - - tmp_dtype = self.amp_dtype if self.amp else torch.float32 - input_ids = to_dtype(input_ids, tmp_dtype) - - for key in input_others.keys(): - if isinstance(input_others[key], torch.Tensor) and ( - input_others[key].dtype == torch.float16 or input_others[key].dtype == torch.bfloat16 - ): - input_others[key] = input_others[key].to(tmp_dtype) - elif isinstance(input_others[key], list): - for i in range(len(input_others[key])): - to_dtype(input_others[key][i], tmp_dtype) - return input_ids, input_others - - def _quantize_blocks( - self, - model: torch.nn.Module, - inputs: dict, - block_names: list, - q_input: torch.Tensor = None, - nblocks: int = 1, - device: str = "cpu", - pbar: tqdm = None, - ): - """Quantize and dequantize the weights of the specified blocks in the model. - - Args: - model: The PyTorch model to be quantized. - inputs: The input data for quantization. - block_names: The names of the blocks to be quantized and dequantized. - nblocks: The number of blocks to quantize and dequantize. - device: The device for quantization and dequantization. 
- - Returns: - None - """ - clear_memory(device_list=self.device_list) - for n, m in model.named_parameters(): - m.requires_grad_(False) - - input_ids, input_others = self._preprocess_block_inputs(inputs) - - if pbar is None: - pbar = tqdm(range(0, len(block_names), nblocks)) - - for i in range(0, len(block_names), nblocks): - if i != 0: - pbar.update(1) - if nblocks == 1: - n = block_names[i] - pbar.set_description(f"Quantizing {n}") - m = get_module(model, n) - else: - names = block_names[i : min(i + nblocks, len(block_names))] - pbar.set_description(f"Quantizing [{i + 1}-{min(i + nblocks, len(block_names))}]/{len(block_names)}") - modules = [get_module(model, n) for n in names] - m = WrapperMultiblock(modules) - - m.config = model.config if hasattr(model, "config") else None - q_input, input_ids = self._quantize_block( - m, - input_ids, - input_others, - q_input=q_input, - device=device, - ) - if hasattr(model, "config"): - del m.config - if self.is_immediate_packing: - for n, tmp_m in m.named_modules(): - if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): - continue - self._immediate_pack(tmp_m.global_name) - - if self.is_immediate_saving: - shard_writer(self, m, is_finalize=False) - if pbar is not None: - pbar.update(1) - - if not self.is_immediate_saving: - self.model = mv_module_from_gpu(self.model) - for n, m in self.model.named_modules(): - if hasattr(m, "name"): - delattr(m, "name") - - del q_input - del input_ids - del input_others - del inputs - - clear_memory(device_list=self.device_list) - def save_quantized( self, output_dir: str = None, @@ -3166,30 +1774,6 @@ def save_quantized( else: return compressed_model - def _get_quantized_layer_names_outside_blocks(self) -> list: - """Gets the names of quantized layers outside blocks in the model. - - Returns: - list: List of layer names outside blocks. - """ - if self.layer_config is None or len(self.layer_config) == 0: - return [] - - layer_names = [] - all_layers_in_block = get_layer_names_in_block(self.model, self.supported_types, self.quant_block_list) - - for key in self.layer_config.keys(): - if key in all_layers_in_block: - continue - layer = get_module(self.model, key) - if layer is None: - logger.error(f"could not find layer {key} in the model, exit...") - exit(-1) - if type(layer) in self.supported_types and check_to_quantized(self.layer_config[key]): - layer_names.append(key) - - return layer_names - def _set_amp_dtype(self) -> None: """Sets the automatic mixed precision (AMP) data type for the model based on the device and configuration.""" self.amp_dtype = torch.bfloat16 @@ -3212,55 +1796,6 @@ def _set_amp_dtype(self) -> None: self.amp_dtype = torch.float32 self.model = self.model.to(torch.float32) - def _get_optimizer(self, optimizer: Any): - """Returns the specified optimizer. In SignRound, we fix the optimizer. - - Args: - optimizer: The optimizer to be used. - - Returns: - The specified optimizer. - """ - return SignSGD - - def _get_scaler(self): - """Returns scaler, in SignRound, no need to use scaler.""" - return None - - def _scale_loss_and_backward(self, scaler: Any, loss: torch.Tensor) -> torch.Tensor: - """Scales the loss and performs backward pass. - - Args: - scaler: The scaler to be used. - loss: The loss to be scaled. - - Returns: - The scaled loss. - """ - scale_loss = loss * 1000 - scale_loss.backward() - if is_hpex_available(): - htcore.mark_step() - return scale_loss - - def _step(self, scaler: Any, optimizer: Any, lr_schedule: Any): - """Performs a step in the optimization process. 
- - Args: - scaler: The scaler to be used. - optimizer: The optimizer for the step. - lr_schedule: The learning rate schedule. - - Returns: - None - """ - optimizer.step() - # for hpu - if is_hpex_available(): - htcore.mark_step() - optimizer.zero_grad() - lr_schedule.step() - @classmethod @torch.no_grad() def _sampling_inputs( diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py old mode 100644 new mode 100755 index 6d9580e4f..f6ffd6741 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -165,25 +165,6 @@ def __init__( **kwargs, ) - def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: - # flux transformer model's blocks will update hidden_states and encoder_hidden_states - input_id_str = [key for key in inputs.keys() if "hidden_state" in key] - if q_inputs is not None: - q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str} - return inputs, q_inputs - - def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[dict, dict]: - input_id_str = [key for key in inputs.keys() if "hidden_state" in key] - input_ids = {k: inputs.pop(k, None) for k in input_id_str} - input_others = inputs - return input_ids, input_others - - def _get_current_output(self, output: dict, indices: list[int]) -> torch.Tensor: - assert "hidden_states" in output - current_output = [output["hidden_states"][x] for x in indices] - current_output = torch.cat(current_output, dim=self.batch_dim) - return current_output - def _get_current_q_output( self, block: torch.nn.Module, diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py old mode 100644 new mode 100755 diff --git a/auto_round/quantizers/__init__.py b/auto_round/quantizers/__init__.py new file mode 100644 index 000000000..87ac77b62 --- /dev/null +++ b/auto_round/quantizers/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_round.quantizers.entrypoint import create_quantizers diff --git a/auto_round/quantizers/algs/__init__.py b/auto_round/quantizers/algs/__init__.py new file mode 100755 index 000000000..14a492441 --- /dev/null +++ b/auto_round/quantizers/algs/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
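The hunk in auto_round/compressors/base.py earlier in this diff replaces the in-class _quantize_block path: quantize_block now builds its inputs with preprocess_block_inputs and hands the tuning loop to ARQuantizer from the new auto_round.quantizers package. The sketch below restates that call path outside the compressor for orientation only; it is not part of the patch. It assumes an already-initialized BaseCompressor with cached block inputs, skips normalize_decoding_layer_inputs_, and the helper name run_block_tuning plus its default arguments are illustrative.

import torch

from auto_round.quantizers.algs.auto_round import ARQuantizer
from auto_round.quantizers.utils import preprocess_block_inputs


def run_block_tuning(compressor, block: torch.nn.Module, q_input=None, device="cpu", auto_offload=True):
    # Hypothetical helper mirroring the refactored BaseCompressor.quantize_block hunk above.
    block_inputs = compressor.inputs[compressor.quant_block_list[0][0]]
    # Split the cached calibration data into the first input ("hidden_states" for decoding
    # layers) and the remaining kwargs, cast to the AMP dtype on the cache device.
    input_ids, input_others = preprocess_block_inputs(
        block_inputs,
        device_list=compressor.device_list,
        first_input_name="hidden_states",
        amp=compressor.amp,
        amp_dtype=compressor.amp_dtype,
        cache_device=compressor.cache_device,
        is_diffusion=compressor.diffusion,
    )
    # ARQuantizer wraps the compressor and runs the SignRound tuning loop for the block.
    # Per its docstring it returns (q_outputs, outputs), or (None, outputs) when
    # enable_quanted_input is off, so the quantized outputs can feed the next block.
    ar_quantizer = ARQuantizer(compressor)
    return ar_quantizer.quantize_block(block, input_ids, input_others, q_input, device, auto_offload)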
diff --git a/auto_round/quantizers/algs/auto_round.py b/auto_round/quantizers/algs/auto_round.py new file mode 100755 index 000000000..5970b467a --- /dev/null +++ b/auto_round/quantizers/algs/auto_round.py @@ -0,0 +1,1021 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import time +import traceback +from contextlib import nullcontext +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import accelerate +import torch +from accelerate.big_modeling import dispatch_model, infer_auto_device_map +from torch import autocast +from tqdm import tqdm + +from auto_round.compressors.shard_writer import shard_writer +from auto_round.compressors.utils import ( + IndexSampler, + check_need_act_calibration, + collect_best_params, + is_nv_fp, +) +from auto_round.logger import logger +from auto_round.modelling.replace_modules import materialize_model_, safe_to_cpu_ +from auto_round.quantizers.algs.base import AlgsBaseQuantizer +from auto_round.quantizers.utils import ( + get_non_zero_cnt, + get_quantized_layer_names_outside_blocks, + preprocess_block_inputs, + quantize_embedding_layer, + register_act_max_hook, + update_inputs, +) +from auto_round.sign_sgd import SignSGD +from auto_round.utils import ( + check_is_cpu, + check_to_quantized, + clear_memory, + convert_fp8_layer_to_linear, + get_block_names, + get_module, + htcore, + is_auto_device_mapping, + is_fp8_linear, + is_fp8_model, + is_hpex_available, + memory_monitor, + mv_module_from_gpu, + set_amax_for_all_moe_layers, + set_module, + to_device, +) +from auto_round.utils.device import ( + clear_memory_if_reached_threshold, + set_auto_device_map_for_block_with_tuning, +) +from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class ARQuantizer(AlgsBaseQuantizer): + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + self.all_blocks = [] + self.layer_names = [] + self.all_q_inputs = None + self.optimizer = self._get_optimizer(None) + self.is_adam = False + + def _pre_quantize_impl(self, *args, **kwargs): + if bool(self.compressor.quant_block_list): + self.all_blocks = self.compressor.quant_block_list + else: + self.all_blocks = get_block_names(self.compressor.model) + if len(self.all_blocks) == 0: + logger.warning("could not find blocks, exit with original model") + return self.compressor.model, self.compressor.layer_config + + if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: + self.compressor.model = self.compressor.model.to(self.compressor.amp_dtype) + + self.layer_names = get_quantized_layer_names_outside_blocks( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + supported_types=self.compressor.supported_types, + quant_block_list=self.compressor.quant_block_list, + ) + self.all_first_block_names = [block[0] for block in self.all_blocks] + if 
len(self.layer_names) > 0: + logger.info( + "Starting to cache block inputs. This may be slow due to external block layers: %s", self.layer_names + ) + else: + logger.info("start to cache block inputs") + + # TODO: refactor this + self.all_inputs = self.compressor.try_cache_inter_data_gpucpu( + self.all_first_block_names, self.compressor.nsamples, layer_names=self.layer_names + ) + is_quantized_embedding = quantize_embedding_layer( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + scale_dtype=self.compressor.scale_dtype, + disable_opt_rtn=self.compressor.disable_opt_rtn, + device=self.compressor.device, + device_list=self.compressor.device_list, + ) + clear_memory(device_list=self.compressor.device_list) + if is_quantized_embedding: + self.all_inputs = copy.deepcopy(self.compressor.inputs) + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + # TODO: refactor this + self.all_q_inputs = self.compressor.try_cache_inter_data_gpucpu( + self.all_first_block_names, self.compressor.nsamples, layer_names=self.layer_names + ) + self.compressor.model = mv_module_from_gpu(self.compressor.model) + clear_memory(device_list=self.compressor.device_list) + + if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules( + self.compressor.model + ) # self.compressor.model.hf_device_map has not been changed + logger.info("caching done") + + def _quantize_impl(self, *args, **kwargs): + start_time = time.time() + + if len(self.all_blocks) > 1: + pbar = tqdm(range(0, sum([len(i) for i in self.all_blocks]), self.compressor.nblocks)) + else: + pbar = tqdm(range(0, len(self.all_blocks[0]), self.compressor.nblocks)) # move the alg warning outside pbar + + for block_names in self.all_blocks: + inputs = self.all_inputs[block_names[0]] + self.all_inputs.pop(block_names[0]) + q_inputs = None + if self.all_q_inputs is not None: + q_inputs = self.all_q_inputs[block_names[0]] + self.all_q_inputs.pop(block_names[0]) + + inputs, q_inputs = update_inputs(inputs, q_inputs, self.compressor.diffusion) + + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + + if "input_ids" in inputs.keys(): + total_samples = len(inputs["input_ids"]) + if total_samples < self.compressor.batch_size: + self.compressor.batch_size = total_samples + logger.warning(f"force the train batch size to {total_samples}") + + self._quantize_blocks( + self.compressor.model, + inputs, + block_names, + q_input=q_inputs if q_inputs is not None else None, + nblocks=self.compressor.nblocks, + device=self.compressor.device, + pbar=pbar, + ) + if self.compressor.is_immediate_packing and len(self.compressor.formats) != 1: + raise ValueError( + f"Expected exactly one packing format when 'immediate_packing' is True, " + f"but got {len(self.compressor.formats)} formats." 
+ ) + pbar.set_description("Quantizing done") + pbar.close() + self._quantize_layers(self.layer_names, self.all_inputs) + + end_time = time.time() + cost_time = end_time - start_time + logger.info(f"quantization tuning time {cost_time}") + + return self.compressor.model, self.compressor.layer_config + + def _post_quantize_impl(self, *args, **kwargs): + if is_fp8_model(self.compressor.model): + for n, m in self.compressor.model.named_modules(): + if is_fp8_linear(m): + new_layer = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device).to( + "cpu" + ) + set_module(self.compressor.model, n, new_layer) + + # Dump a summary + quantized_layers = [] + unquantized_layers = [] + for n, m in self.compressor.model.named_modules(): + if isinstance(m, tuple(self.compressor.supported_types)): + if check_to_quantized(m): + quantized_layers.append(n) + else: + unquantized_layers.append(n) + elif hasattr(m, "scales") or hasattr(m, "scale"): ##packing_immediately + quantized_layers.append(n) + summary_info = ( + f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} layers in the model" + ) + if len(unquantized_layers) > 0: + summary_info += f", {unquantized_layers} have not been quantized" + logger.info(summary_info) + + self.compressor.quantized = True + + def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: + """Quantizes specified layers based on inputs and configuration. + + Args: + layer_names (list): list of layer names to quantize. + layer_inputs (dict): Dictionary mapping layer names to input data. + + Returns: + None + """ + # TODO currently we take all the layers outside blocks as post block layers which is not optimal + # if there is no input for layer, we use rtn + + for layer_name in copy.deepcopy(layer_names): + if layer_name not in layer_inputs: + if self.compressor.act_bits < 16 and not self.compressor.act_dynamic: + # Activation quantization requires collected inputs + msg_prefix = ( + f"Activation max hook for layer '{layer_name}' is unavailable due to " + f"insufficient collected inputs. " + ) + if "fp8_e5m2" in self.compressor.act_data_type: + logger.warning(msg_prefix + "Please note that unit scale is used for this layer.") + else: + logger.warning( + msg_prefix + "Static activation quantization is not supported or ineffective. " + "Skipping quantization for this layer."
+ ) + layer_names.remove(layer_name) + continue + logger.info(f"using rtn to quantize {layer_name}") + from auto_round.data_type import QUANT_FUNC_WITH_DTYPE + + layer = get_module(self.compressor.model, layer_name) + layer = layer.to(self.compressor.device) + if is_fp8_linear(layer): + new_layer = convert_fp8_layer_to_linear( + layer, self.compressor.amp_dtype, self.compressor.device + ).to(self.compressor.device) + set_module(self.compressor.model, layer_name, new_layer) + layer = new_layer + + wrapper_layer = WrapperLinear( + layer, + enable_round_tuning=False, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + device=self.compressor.device, + disable_opt_rtn=self.compressor.disable_opt_rtn, + ) + new_layer = wrapper_layer.unwrapper({}) + set_module(self.compressor.model, layer_name, new_layer) + layer.cpu() + layer_names.remove(layer_name) + if len(layer_names) == 0: + memory_monitor.update() + memory_monitor.log_summary() + return + q_layer_inputs = None + enable_quanted_input = self.compressor.enable_quanted_input + has_gguf = False + + if hasattr(self.compressor, "formats"): + has_gguf = any(format_.is_gguf() for format_ in self.compressor.formats) + if has_gguf and self.compressor.is_immediate_packing: + enable_quanted_input = False + + if ( + hasattr(self.compressor.model, "hf_device_map") + and len(self.compressor.model.hf_device_map) > 1 + and enable_quanted_input + ): + dispatch_model(self.compressor.model, self.compressor.model.hf_device_map) + + if enable_quanted_input: + logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) + # TODO: refactor this + q_layer_inputs = self.compressor.try_cache_inter_data_gpucpu( + [], self.compressor.nsamples, layer_names=layer_names + ) + if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules( + self.compressor.model + ) # self.compressor.model.hf_device_map has not been changed + if not self.compressor.is_immediate_saving: + self.compressor.model = mv_module_from_gpu(self.compressor.model) + clear_memory(device_list=self.compressor.device_list) + for layer_name in layer_names: + layer_input = layer_inputs[layer_name] + layer_input = to_device(layer_input, self.compressor.cache_device) + q_layer_input = q_layer_inputs.get(layer_name, None) if q_layer_inputs is not None else None + q_layer_input = to_device(q_layer_input, self.compressor.cache_device) + self.quantize_layer(layer_name, layer_input, q_layer_input, device=self.compressor.device) + if self.compressor.is_immediate_packing: + self.compressor._immediate_pack(layer_name) + + if self.compressor.is_immediate_saving: + m = get_module(self.compressor.model, layer_name) + shard_writer(self.compressor, m, name=layer_name, last_group=True) + del layer_input + clear_memory(q_layer_input, device_list=self.compressor.device_list) + memory_monitor.log_summary() + + def _quantize_blocks( + self, + model: torch.nn.Module, + inputs: dict, + block_names: list, + q_input: torch.Tensor = None, + nblocks: int = 1, + device: str = "cpu", + pbar: tqdm = None, + ): + """Quantize and dequantize the weights of the specified blocks in the model. + + Args: + model: The PyTorch model to be quantized. + inputs: The input data for quantization. + block_names: The names of the blocks to be quantized and dequantized. + nblocks: The number of blocks to quantize and dequantize. 
+ device: The device for quantization and dequantization. + + Returns: + None + """ + clear_memory(device_list=self.compressor.device_list) + for n, m in model.named_parameters(): + m.requires_grad_(False) + + input_ids, input_others = preprocess_block_inputs( + inputs, + device_list=self.compressor.device_list, + first_input_name="input_ids", + amp=self.compressor.amp, + amp_dtype=self.compressor.amp_dtype, + cache_device=self.compressor.cache_device, + is_diffusion=self.compressor.diffusion, + ) + + if pbar is None: + pbar = tqdm(range(0, len(block_names), nblocks)) + + for i in range(0, len(block_names), nblocks): + if i != 0: + pbar.update(1) + if nblocks == 1: + n = block_names[i] + pbar.set_description(f"Quantizing {n}") + m = get_module(model, n) + else: + names = block_names[i : min(i + nblocks, len(block_names))] + pbar.set_description(f"Quantizing [{i + 1}-{min(i + nblocks, len(block_names))}]/{len(block_names)}") + modules = [get_module(model, n) for n in names] + m = WrapperMultiblock(modules) + + m.config = model.config if hasattr(model, "config") else None + q_input, input_ids = self.quantize_block( + m, input_ids, input_others, q_input=q_input, device=device, last_group=(i + nblocks) >= len(block_names) + ) + if hasattr(model, "config"): + del m.config + if self.compressor.is_immediate_packing: + for n, tmp_m in m.named_modules(): + if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): + continue + self.compressor._immediate_pack(tmp_m.global_name) + + if self.compressor.is_immediate_saving: + shard_writer(self.compressor, m, is_finalize=False) + if pbar is not None: + pbar.update(1) + + if not self.compressor.is_immediate_saving: + self.compressor.model = mv_module_from_gpu(self.compressor.model) + for n, m in self.compressor.model.named_modules(): + if hasattr(m, "name"): + delattr(m, "name") + + del q_input + del input_ids + del input_others + del inputs + + clear_memory(device_list=self.compressor.device_list) + + def _quantize_layer_impl( + self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu" + ): + """Quantize a specific layer of the model using the provided inputs. + + Args: + layer_name (str): The name of the layer to quantize. + inputs (torch.Tensor): Input data for quantization. + q_inputs (torch.Tensor, optional): Quantized input data. Defaults to None. + device (torch.device, optional): The device to use for quantization. Defaults to torch.device("cpu"). 
+ + Returns: + None + """ + logger.info(f"quantizing layer {layer_name}") + layer = get_module(self.compressor.model, layer_name) + if hasattr(layer, "tuning_device"): + device = layer.tuning_device + + layer = layer.to(device) + for i in range(len(inputs)): + inputs[i] = inputs[i].to(layer.weight.dtype) + if q_inputs is not None: + q_inputs[i] = q_inputs[i].to(layer.weight.dtype) + + if self.compressor.act_bits <= 8 and check_need_act_calibration( + self.compressor.act_dynamic, + self.compressor.act_data_type, + self.compressor.act_bits, + self.compressor.static_kv_dtype, + self.compressor.static_attention_dtype, + ): + tmp_inputs = q_inputs if q_inputs is not None else inputs + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + with torch.no_grad(): + for input in tmp_inputs: + layer(input) + for handle in hook_handles: + handle.remove() + + wrapper_linear = WrapperLinear( + layer, + enable_minmax_tuning=self.compressor.enable_minmax_tuning, + enable_torch_compile=self.compressor.enable_torch_compile, + device=device, + ).to(device) + round_params = [] + minmax_params = [] + for key in wrapper_linear.params.keys(): + if "min" in key or "max" in key: + minmax_params.append(wrapper_linear.params[key]) + else: + round_params.append(wrapper_linear.value) + if len(round_params) + len(minmax_params) <= 0: + dump_info = f"quantized {layer_name}" + logger.info(dump_info) + with torch.no_grad(): + unwrapper_layer(self.compressor.model, wrapper_linear, layer_name, {}) + mv_module_from_gpu(layer) + + lr = torch.tensor(self.compressor.lr) + minmax_lr = torch.tensor(self.compressor.minmax_lr) + if self.compressor.enable_minmax_tuning: + optimizer = self.optimizer( + [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 + ) + else: + optimizer = self.optimizer([{"params": round_params}], lr=lr, weight_decay=0) + if self.compressor.lr_scheduler is None: + lr_schedule = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.compressor.iters + ) + else: + lr_schedule = copy.deepcopy(self.compressor.lr_scheduler) + nsamples = len(inputs) + last_best_iter = 0 + best_loss = torch.finfo(torch.float).max + scaler = self._get_scaler() # pylint: disable=assignment-from-none + init_loss = None + gradient_accumulate_steps = self.compressor.batch_size # Force to low gpu + total_loss = 0 + num_elm = 1 + mse_reduction = "mean" + if gradient_accumulate_steps != 1: + mse_reduction = "sum" + mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) + batch_size = 1 # Force to low gpu + global_batch_size = self.compressor.batch_size * gradient_accumulate_steps + global_batch_size = min(nsamples, global_batch_size) + if gradient_accumulate_steps != 1 and not self.compressor.attention_mask: + whole_indices = torch.arange(global_batch_size) + if q_inputs is not None: + # Todo: refactor this + num_elm = self.compressor._get_current_num_elm(q_inputs, whole_indices) + else: + num_elm = self.compressor._get_current_num_elm(inputs, whole_indices) + + index_sampler = IndexSampler(nsamples, global_batch_size) + + for i in range(self.compressor.iters): + total_loss = 0 + global_indices = index_sampler.next_batch() + if self.compressor.attention_mask: + num_elm = get_non_zero_cnt(self.compressor.attention_mask, global_indices) + + for tmp_step in range(gradient_accumulate_steps): + 
indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + if q_inputs is not None: + current_input = [q_inputs[i] for i in indices] + current_input = torch.cat(current_input, dim=0).to(device) + org_input = [inputs[i] for i in indices] + org_input = torch.cat(org_input, dim=0).to(device) + else: + current_input = [inputs[i] for i in indices] + current_input = torch.cat(current_input, dim=0).to(device) + org_input = current_input + with torch.no_grad(): + current_output = layer(org_input) + autocast_ctx = ( + nullcontext() + if not self.compressor.amp + else autocast(device_type=str(device).split(":")[0], dtype=self.compressor.amp_dtype) + ) + if self.compressor.attention_mask: + tmp_attention_mask = [self.compressor.attention_mask[i] for i in indices] + tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) + tmp_attention_mask.unsqueeze_(-1) + + with autocast_ctx: + output_q = wrapper_linear(current_input) # pylint: disable=not-callable + loss = mse_loss( # pylint: disable=not-callable + (output_q * tmp_attention_mask).to(torch.float32), + (current_output * tmp_attention_mask).to(torch.float32), + ) + + else: + with autocast_ctx: + output_q = wrapper_linear(current_input) # pylint: disable=not-callable + loss = mse_loss( # pylint: disable=not-callable + output_q.to(torch.float32), + current_output.to(torch.float32), # mul 1.0 will copy the output + ) + + num_elm = 1 if num_elm <= 0 else num_elm + total_loss += loss.item() / num_elm + + self._scale_loss_and_backward(scaler, loss) + if i == 0: + init_loss = total_loss + + if total_loss < best_loss: + best_loss = total_loss + if not self.compressor.not_use_best_mse: + best_params = collect_best_params(wrapper_linear, self.compressor.cache_device) + last_best_iter = i + if self.compressor.not_use_best_mse and i == self.compressor.iters - 1: + best_params = collect_best_params(wrapper_linear, self.compressor.cache_device) + + if not self.compressor.not_use_best_mse: + if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: + break + self._step(scaler, optimizer, lr_schedule) + + last_loss = total_loss + best_iter = self.compressor.iters + if not self.compressor.not_use_best_mse: + last_loss = best_loss + best_iter = last_best_iter + with torch.no_grad(): + unwrapper_layer(self.compressor.model, wrapper_linear, layer_name, best_params) + mv_module_from_gpu(layer) + dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" + logger.info(dump_info) + + def _quantize_block_impl( + self, + block: torch.nn.Module, + input_ids: Union[list[torch.Tensor], dict], + input_others: dict, + q_input: Union[torch.Tensor, dict, None] = None, + device: Union[str, torch.device] = "cpu", + auto_offload=True, + **kwargs, + ): + """Quantize the weights of a given block of the model. + + Args: + block: The block of the model to be quantized. + input_ids: The input tensor containing tokenized input ids. + input_others: A dictionary containing additional input data. + q_input: The quantized input tensor. + device: The device for quantization. 
+ + Returns: + Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) + """ + materialize_model_(block) + if is_fp8_model(self.compressor.model): + for n, m in block.named_modules(): + if is_fp8_linear(m): + new_layer = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device).to( + device + ) + set_module(block, n, new_layer) + + if auto_offload: + # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights + # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk + if is_auto_device_mapping(self.compressor.device_map) and len(self.compressor.device_list) > 1: + card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( + block, + self.compressor.device_map, + input_ids, + self.compressor.low_gpu_mem_usage, + self.compressor.batch_size, + device, + ) + else: + block = block.to(device) + card_0_in_high_risk, loss_device = False, device + else: + card_0_in_high_risk, loss_device = False, device + + if len(self.compressor.device_list) > 1 and auto_offload: + for n, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + hook = AlignDevicesHook(m.tuning_device, io_same_device=True) + add_hook_to_module(m, hook, True) + + if q_input is None: + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + + # TODO: refactor this part + output = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + ) + + for handle in hook_handles: + handle.remove() + else: + # TODO: refactor this part + output = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + ) + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + if hook_handles: + # TODO: refactor this part + self.compressor._get_block_outputs( + block, + q_input, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + save_output=False, + ) + + for handle in hook_handles: + handle.remove() + + if q_input is not None: + if input_ids is not q_input: + clear_memory(input_ids, device_list=self.compressor.device_list) + else: + clear_memory(device_list=self.compressor.device_list) + input_ids = q_input + + quantized_layer_names, unquantized_layer_names = self.compressor.wrapper_block( + block, + self.compressor.enable_minmax_tuning, + self.compressor.enable_norm_bias_tuning, + enable_torch_compile=self.compressor.enable_torch_compile, + device=device, + ) + if is_nv_fp(self.compressor.data_type): # enable qkv and moe structure global_scale fuse + from auto_round.data_type.utils import update_fused_layer_global_scales + + modules = block.modules() + for module in modules: + update_fused_layer_global_scales(module) + round_params = [] + minmax_params = [] + for n, m in block.named_modules(): + if hasattr(m, "orig_layer"): + for key in m.params.keys(): + if 
"min" in key or "max" in key: + minmax_params.append(m.params[key]) + else: + round_params.append(m.params[key]) + + lr = torch.tensor(self.compressor.lr) + minmax_lr = torch.tensor(self.compressor.minmax_lr) + + extra_kwargs = {} if self.is_adam else {"momentum": self.compressor.momentum} + + if self.compressor.enable_minmax_tuning: + params = [ + {"params": round_params}, + {"params": minmax_params, "lr": minmax_lr}, + ] + else: + params = round_params + + optimizer = self.optimizer( + params, + lr=lr, + weight_decay=0, + **extra_kwargs, + ) + + if len(round_params) + len(minmax_params) <= 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block" + ) + logger.info(dump_info) + unwrapper_block(block, {}) + mv_module_from_gpu(block) + return output, output + + if self.compressor.lr_scheduler is None: + lr_schedule = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.compressor.iters + ) + else: + lr_schedule = copy.deepcopy(self.compressor.lr_scheduler) + + if isinstance(input_ids, dict): # input_ids of Flux is dict + nsamples = len(input_ids["hidden_states"]) + else: + nsamples = len(input_ids) + last_best_iter = 0 + best_loss = torch.finfo(torch.float).max + num_elm = 1 + mse_reduction = "mean" + if self.compressor.gradient_accumulate_steps != 1: + mse_reduction = "sum" + mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) + scaler = self._get_scaler() # pylint: disable=assignment-from-none + init_loss = None + best_params = {} + total_loss = 0 + global_batch_size = self.compressor.batch_size * self.compressor.gradient_accumulate_steps + global_batch_size = min(nsamples, global_batch_size) + # We assume the block input and output shape is same + if self.compressor.gradient_accumulate_steps != 1 and not self.compressor.attention_mask: + whole_indices = torch.arange(global_batch_size) + num_elm = self.compressor._get_current_num_elm(input_ids, whole_indices) + + index_sampler = IndexSampler(nsamples, global_batch_size) + batch_size = self.compressor.batch_size + for i in range(self.compressor.iters): + if self.compressor.enable_alg_ext and self.compressor.data_type.endswith("dq"): + for n, m in block.named_modules(): + m.cur_iter = i + total_loss = 0 + global_indices = index_sampler.next_batch() + if self.compressor.attention_mask: + num_elm = get_non_zero_cnt(self.compressor.attention_mask, global_indices) + + for tmp_step in range(self.compressor.gradient_accumulate_steps): + indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + current_output = self._get_current_output( + output, indices, self.compressor.batch_dim, diffusion=self.compressor.diffusion + ) + current_output = to_device(current_output, loss_device) + # TODO: refactor this + output_q = self.compressor._get_current_q_output( + block, input_ids, input_others, indices, device, loss_device + ) + # TODO: refactor this + loss = self.compressor._get_loss(output_q, current_output, indices, mse_loss, device) + num_elm = 1 if num_elm <= 0 else num_elm + total_loss += loss.item() / num_elm + + if self.compressor.low_gpu_mem_usage and card_0_in_high_risk: + # clear memory to avoid OOM due to memory fragmentation + clear_memory_if_reached_threshold(threshold=0.5, device_list=self.compressor.device_list) + + self._scale_loss_and_backward(scaler, loss) + if self.compressor.low_gpu_mem_usage and card_0_in_high_risk: + # clear memory to avoid OOM due to memory 
fragmentation + clear_memory_if_reached_threshold(threshold=0.8, device_list=self.compressor.device_list) + + if i == 0: + init_loss = total_loss + + if total_loss < best_loss: + best_loss = total_loss + if not self.compressor.not_use_best_mse: + best_params = collect_best_params(block, self.compressor.cache_device) + # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) + + last_best_iter = i + if self.compressor.not_use_best_mse and i == self.compressor.iters - 1: + best_params = collect_best_params(block, self.compressor.cache_device) + + if not self.compressor.not_use_best_mse: + if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: + break + self._step(scaler, optimizer, lr_schedule) + + last_loss = total_loss + best_iter = self.compressor.iters + if not self.compressor.not_use_best_mse: + last_loss = best_loss + best_iter = last_best_iter + if self.compressor.iters > 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" + ) + else: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + "layers in the block" + ) + + if self.compressor.low_gpu_mem_usage: + clear_memory(device_list=self.compressor.device_list) # clear cached memory during training + if len(unquantized_layer_names) != 0: + logger.info(f"{unquantized_layer_names} have not been quantized") + with torch.no_grad(): + unwrapper_block(block, best_params) + + if is_nv_fp(self.compressor.act_data_type): + # enable moe experts act_max automatic generation for WrapperWALayer + set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") + + if self.compressor.enable_quanted_input: + # TODO: refactor this + q_outputs = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + cache_device=self.compressor.cache_device, + ) + + if len(self.compressor.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + + clear_memory(input_ids, device_list=self.compressor.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return q_outputs, output + else: + if len(self.compressor.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + clear_memory(input_ids, device_list=self.compressor.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return None, output + + def _post_quantize_block_impl(self, block: torch.nn.Module, *args, last_group: bool, **kwargs): + """Post-process after quantizing a block. + + Args: + block: The block of the model that was quantized. 
+ + Returns: + None + """ + if hasattr(block, "config"): + del block.config + if self.compressor.is_immediate_packing: + for _, tmp_m in block.named_modules(): + if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): + continue + self.compressor._immediate_pack(tmp_m.global_name) + + if self.compressor.is_immediate_saving: + shard_writer(self.compressor, block, last_group=last_group) + + @staticmethod + def _get_current_output( + output: list[torch.Tensor], indices: list[int], batch_dim: int, diffusion: bool = False + ) -> torch.Tensor: + if diffusion: + assert "hidden_states" in output + current_output = [output["hidden_states"][x] for x in indices] + current_output = torch.cat(current_output, dim=batch_dim) + return current_output + + current_output = [output[x] for x in indices] + current_output = torch.cat(current_output, dim=batch_dim) + return current_output + + def _step(self, scaler: Any, optimizer: Any, lr_schedule: Any): + """Performs a step in the optimization process. + + Args: + scaler: The scaler to be used. + optimizer: The optimizer for the step. + lr_schedule: The learning rate schedule. + + Returns: + None + """ + optimizer.step() + # for hpu + if is_hpex_available(): + htcore.mark_step() + optimizer.zero_grad() + lr_schedule.step() + + def _scale_loss_and_backward(self, scaler: Any, loss: torch.Tensor) -> torch.Tensor: + """Scales the loss and performs backward pass. + + Args: + scaler: The scaler to be used. + loss: The loss to be scaled. + + Returns: + The scaled loss. + """ + scale_loss = loss * 1000 + scale_loss.backward() + if is_hpex_available(): + htcore.mark_step() + return scale_loss + + def _get_scaler(self): + """Returns scaler, in SignRound, no need to use scaler.""" + return None + + def _get_optimizer(self, optimizer: Any): + """Returns the specified optimizer. In SignRound, we fix the optimizer. + + Args: + optimizer: The optimizer to be used. + + Returns: + The specified optimizer. + """ + return SignSGD + + +class ARAdamQuantizer(ARQuantizer): + """AutoRound Quantizer with Adam optimizer.""" + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + self.optimizer = self._get_optimizer("AdamW") + self.is_adam = True + + def _step(self, scaler, optimizer, lr_schedule): + if scaler is not None: + scaler.step(optimizer) + optimizer.zero_grad() + lr_schedule.step() + scaler.update() + else: + optimizer.step() + optimizer.zero_grad() + lr_schedule.step() + if is_hpex_available(): + htcore.mark_step() + + def _scale_loss_and_backward(self, scaler, loss): + if scaler is not None: + loss = scaler.scale(loss) + + loss.backward() + if is_hpex_available(): + htcore.mark_step() + return loss + + def _get_scaler(self): + scaler = None + if self.compressor.amp and not check_is_cpu(self.compressor.device): + from torch.cuda.amp import GradScaler + + scaler = GradScaler(init_scale=1024, growth_interval=100000) + return scaler + + def _get_optimizer(self, optimizer): + if optimizer is None: + optimizer = torch.optim.AdamW + elif isinstance(optimizer, str): + optimizer = getattr(torch.optim, optimizer) + else: + optimizer = optimizer + return optimizer diff --git a/auto_round/quantizers/algs/base.py b/auto_round/quantizers/algs/base.py new file mode 100755 index 000000000..bcb112094 --- /dev/null +++ b/auto_round/quantizers/algs/base.py @@ -0,0 +1,51 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +from auto_round.quantizers.base import BaseQuantizer + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class AlgsBaseQuantizer(BaseQuantizer, ABC): + def _pre_quantize_impl(self, *args, **kwargs): + pass + + @abstractmethod + def _quantize_impl(self, *args, **kwargs): + pass + + def _post_quantize_impl(self, *args, **kwargs): + pass + + def _pre_quantize_layer_impl(self, *args, **kwargs): + pass + + def _quantize_layer_impl(self, *args, **kwargs): + pass + + def _post_quantize_layer_impl(self, *args, **kwargs): + pass + + def _pre_quantize_block_impl(self, *args, **kwargs): + pass + + def _quantize_block_impl(self, *args, **kwargs): + pass + + def _post_quantize_block_impl(self, *args, **kwargs): + pass diff --git a/auto_round/quantizers/algs/rtn.py b/auto_round/quantizers/algs/rtn.py new file mode 100755 index 000000000..585c2631a --- /dev/null +++ b/auto_round/quantizers/algs/rtn.py @@ -0,0 +1,689 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import traceback +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import accelerate +import torch +from accelerate.big_modeling import dispatch_model, infer_auto_device_map +from tqdm import tqdm + +from auto_round.compressors.shard_writer import shard_writer +from auto_round.compressors.utils import ( + check_need_act_calibration, + is_nv_fp, + is_static_wfp8afp8, +) +from auto_round.logger import logger +from auto_round.modelling.replace_modules import materialize_model_, safe_to_cpu_ +from auto_round.quantizers.algs.base import AlgsBaseQuantizer +from auto_round.quantizers.utils import ( + get_quantized_layer_names_outside_blocks, + quantize_embedding_layer, + register_act_max_hook, +) +from auto_round.utils import ( + check_to_quantized, + clear_memory, + convert_fp8_layer_to_linear, + convert_fp8_module_to_16b, + flatten_list, + get_block_names, + get_lm_head_name, + get_module, + global_state, + is_auto_device_mapping, + is_fp8_linear, + is_fp8_model, + memory_monitor, + mv_module_from_gpu, + set_amax_for_all_moe_layers, + set_module, + to_device, + to_dtype, +) +from auto_round.utils.device import ( + clear_memory_if_reached_threshold, + get_major_device, + parse_available_devices, + set_auto_device_map_for_block_with_tuning, + set_non_auto_device_map, +) +from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class RTNQuantizer(AlgsBaseQuantizer): + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + self.all_to_quantized_module_names: list[str] = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + + def _pre_quantize_impl(self, *args, **kwargs): + if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: + self.compressor.model.to(self.compressor.amp_dtype) + + if is_nv_fp(self.compressor.data_type): + # FIXME: (yiliu30) change it to block-wise after we refactor the quantization code and + # https://github.com/intel/auto-round/issues/1331 + materialize_model_(self.model) + self.compressor.model.to("cpu") + from auto_round.data_type.nvfp import calculate_gparam + from auto_round.data_type.utils import update_fused_layer_global_scales + + pbar = tqdm(self.all_to_quantized_module_names) + for name in pbar: + pbar.set_description(f"Calculate weight global scale: {name}") + m = get_module(self.compressor.model, name) + if is_fp8_linear(m): + m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device) + set_module(self.compressor.model, name, m) + weight_global_scale = calculate_gparam(m.weight, self.compressor.group_size) + setattr(m, "weight_global_scale", weight_global_scale) + + logger.info("Start to update fused layer global scales, it may take some time.") + for name, module in self.compressor.model.named_modules(): + update_fused_layer_global_scales(module) + logger.info("Finished updating fused layer global scales.") + + @torch.inference_mode() + def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize all modules in the model using RTN (Round-To-Nearest) strategy. + + If the target format includes GGUF with `k`, and optimized RTN is enabled, + blockwise quantization with input caching and imatrix is used. + + Returns: + tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. 
+ """ + if not ( + any(fmt.is_gguf() for fmt in getattr(self.compressor, "formats", [])) + or self.compressor.super_bits is not None + ): + quantize_embedding_layer( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + scale_dtype=self.compressor.scale_dtype, + disable_opt_rtn=self.compressor.disable_opt_rtn, + device=self.compressor.device, + device_list=self.compressor.device_list, + ) # leave to gguf itself to handle + + # Release memory + clear_memory(device_list=self.compressor.device_list) + + if self.compressor.act_bits <= 8 and check_need_act_calibration( + self.compressor.act_dynamic, + self.compressor.act_data_type, + self.compressor.act_bits, + self.compressor.static_kv_dtype, + self.compressor.static_attention_dtype, + ): # TODO, mixed datatype has bug + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + try: + self._quantize_via_rtn_blockwise() + except torch.OutOfMemoryError: + logger.warning("Fallback to CPU. Consider using more GPUs via `--device 0,1,2,3`.") + self.model = self.model.to("cpu") + clear_memory(device_list=self.compressor.device_list) + if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(self.model) + orig_device = self.compressor.device + self.compressor.device = "cpu" + self._quantize_via_rtn_blockwise() + self.compressor.device = orig_device + for handle in hook_handles: + handle.remove() + else: + # By default, we go with layer-wise way if no replacement happened + use_blockwise_quantization = global_state.replaced_module_count > 0 + tied_weights_keys = getattr(self.compressor.model, "_tied_weights_keys", []) + if tied_weights_keys is None: + tied_weights_keys = [] + if isinstance(tied_weights_keys, dict): + tied_weights_values = list(tied_weights_keys.values()) + else: + tied_weights_values = list(tied_weights_keys) + tied_weights_layers = [".".join(val.split(".")[:-1]) for val in tied_weights_values] # rm weight/bias + # In fact, we should detect whether it is is_separate_lm_head, to simplify, we don't do it + if hasattr(self.compressor, "formats") and self.compressor.formats[0].is_gguf(): + lm_head_name = get_lm_head_name(self.model) + if lm_head_name is not None: + tied_weights_layers.append(lm_head_name) + + if use_blockwise_quantization: # The ram usage is a little higher + all_to_quantized_module_names = list(set(self.all_to_quantized_module_names)) + all_blocks = ( + self.compressor.quant_block_list + if self.compressor.quant_block_list + else get_block_names(self.model) + ) + pbar = tqdm(range(sum(len(block) for block in all_blocks))) + for block_names in all_blocks: + for block_name in block_names: + pbar.set_description(f"Quantizing {block_name}") + block = get_module(self.compressor.model, block_name) + materialize_model_(block) + for name, m in block.named_modules(): + if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: + self._quantize_layer_via_rtn(m.global_name, to_cpu=self.compressor.low_gpu_mem_usage) + all_to_quantized_module_names.remove(m.global_name) + elif ( + not any(m.children()) + and len(m.state_dict()) > 0 + and m.global_name not in tied_weights_layers + ): + set_module(self.compressor.model, m.global_name, copy.deepcopy(m)) + if self.compressor.is_immediate_saving: + shard_writer(self, name=m.global_name) + 
m.to("meta") + clear_memory(device_list=self.compressor.device_list) + memory_monitor.log_summary() + pbar.update(1) + cnt = 1 + for name in all_to_quantized_module_names: + logger.info(f"Quantizing remaining layer {name} on CPU.") + self._quantize_layer_via_rtn(name, to_cpu=True) + cnt += 1 + if cnt % 10 == 0: + clear_memory(device_list=self.compressor.device_list) + memory_monitor.log_summary() + else: + materialize_model_(self.model) + self.compressor.model.to("cpu") + block_names_cnt = len(flatten_list(get_block_names(self.compressor.model, True))) + clear_mem_freq = len(self.all_to_quantized_module_names) // block_names_cnt + if clear_mem_freq == 0: + clear_mem_freq = 1 + pbar = tqdm(self.all_to_quantized_module_names) + cnt = 1 + for name in pbar: + pbar.set_description(f"Quantizing {name}") + self._quantize_layer_via_rtn(name) + if cnt % clear_mem_freq == 0: + clear_memory(device_list=self.compressor.device_list) + memory_monitor.log_summary() + cnt = 1 + cnt += 1 + # Convert remaining fp8 + if is_fp8_model(self.compressor.model): + convert_fp8_module_to_16b(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) + if self.compressor.is_immediate_saving: + shard_writer(self, is_finalize=True) + self.compressor.quantized = True + return self.compressor.model, self.compressor.layer_config + + def _quantize_via_rtn_blockwise(self) -> None: + """Quantize model layers block by block using cached inputs.""" + + all_to_quantized_module_names = list(set(self.all_to_quantized_module_names)) + + all_blocks = ( + self.compressor.quant_block_list + if self.compressor.quant_block_list + else get_block_names(self.compressor.model) + ) + if not all_blocks: + raise ValueError("Could not find any blocks. Check the model or quant_block_list.") + + all_first_block_names = [block[0] for block in all_blocks] + layer_names = get_quantized_layer_names_outside_blocks( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + supported_types=self.compressor.supported_types, + quant_block_list=self.compressor.quant_block_list, + ) + if self.compressor.act_bits < 16 and (not self.compressor.act_dynamic or len(layer_names) > 0): + if len(layer_names) > 0: + logger.warning( + "quantize layers outside blocks for static activation quantizaiton" + " will significantly increase calibration time" + ) + all_inputs = self.compressor.try_cache_inter_data_gpucpu( + all_first_block_names, self.compressor.nsamples, layer_names + ) + else: + all_inputs = self.compressor.cache_inter_data(all_first_block_names, self.compressor.nsamples) + + # Clear hooks for multi-GPU setups + if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules(self.compressor.model) + + pbar = tqdm(range(sum(len(block) for block in all_blocks))) + + for block_names in all_blocks: + first_block = block_names[0] + inputs = all_inputs.pop(first_block) + input_keys = [k for k in inputs if k.startswith("hidden_state")] + if len(input_keys) != 1: + raise RuntimeError( + "hidden_states arg mismatch. 
Please file an issue at https://github.com/intel/auto-round/issues" + ) + inputs["input_ids"] = inputs.pop(input_keys[0]) + + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + + total_samples = len(inputs["input_ids"]) + if total_samples < self.compressor.batch_size: + self.compressor.batch_size = total_samples + logger.warning(f"Forcing batch size to {total_samples}") + + input_ids = to_device(inputs.pop("input_ids"), self.compressor.cache_device) + input_others = to_device(inputs, self.compressor.cache_device) + + tmp_dtype = self.compressor.amp_dtype if self.compressor.amp else torch.float32 + input_ids = [id_.to(tmp_dtype) for id_ in input_ids] + + for key, val in input_others.items(): + if isinstance(val, torch.Tensor) and val.dtype in (torch.float16, torch.bfloat16): + input_others[key] = val.to(tmp_dtype) + elif isinstance(val, list): + input_others[key] = [to_dtype(v, tmp_dtype) for v in val] + + for block_name in block_names: + pbar.set_description(f"Quantizing {block_name}") + block = get_module(self.compressor.model, block_name) + materialize_model_(block) + block.to("cpu") + if is_fp8_model(self.compressor.model): + convert_fp8_module_to_16b(block, dtype=self.compressor.amp_dtype, device=self.compressor.device) + + if is_auto_device_mapping(self.compressor.device_map) and len(self.compressor.device_list) > 1: + set_auto_device_map_for_block_with_tuning( + block, + self.compressor.device_map, + input_ids, + self.compressor.low_gpu_mem_usage, + self.compressor.batch_size, + self.compressor.device, + ) + # Dispatch model if needed + if len(self.compressor.device_list) > 1: + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + for _, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + hook = AlignDevicesHook(m.tuning_device, io_same_device=True) + add_hook_to_module(m, hook, True) + else: + block = block.to(self.compressor.device) + + # TODO: refactor this part + input_ids = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + self.compressor.device, + self.compressor.cache_device, + ) + + if len(self.compressor.device_list) > 1: + accelerate.hooks.remove_hook_from_submodules(block) + + if is_nv_fp(self.compressor.act_data_type) or is_static_wfp8afp8(self.compressor): + # enable moe experts act_max automatic generation for Linear + set_amax_for_all_moe_layers(block, attr_name="act_max") + if self.compressor.low_gpu_mem_usage: + block.to("cpu") + clear_memory(device_list=self.compressor.device_list) + + for _, m in block.named_modules(): + if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: + self._quantize_layer_via_rtn(m.global_name, to_cpu=self.compressor.low_gpu_mem_usage) + all_to_quantized_module_names.remove(m.global_name) + if not self.compressor.is_immediate_saving: + mv_module_from_gpu(block) + if block_name == block_names[-1]: + clear_memory(input_ids, device_list=self.compressor.device_list) + else: + clear_memory(device_list=self.compressor.device_list) + + memory_monitor.log_summary() + pbar.update(1) + pbar.close() + # Process remaining layers not in blocks + for name in all_to_quantized_module_names: + dtype = None + if self.compressor.super_group_size is not None: + dtype = torch.float32 + self._quantize_layer_via_rtn(name, dtype=dtype) + # clear_memory(device_list=self.compressor.device_list) + + def _quantize_layer_via_rtn(self, name: str, dtype: 
torch.dtype = None, to_cpu=True) -> None: + """Quantizes a layer using RTN (Round-To-Nearest) if available. + + This function attempts to quantize a layer by switching its data type to a + `rtn_*` version if supported, then wraps and unwraps the module to apply + quantization. If GPU memory is insufficient, it falls back to CPU. + + If packing is enabled (`immediate_packing`), the function will also export + the quantized layer to the appropriate backend format. + + Args: + name (str): Name of the layer to quantize. + + Raises: + RuntimeError: If quantization fails for reasons unrelated to memory. + """ + m = get_module(self.compressor.model, name) + if dtype is not None: + m = m.to(dtype) + + if is_fp8_linear(m): + m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device) + set_module(self.compressor.model, name, m) + tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.compressor.device + + try: + m = m.to(tuning_device) + m = WrapperLinear( + m, + device=tuning_device, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + m = m.orig_layer if hasattr(m, "orig_layer") else m + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU.") + m.to("cpu") + m = WrapperLinear( + m, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except Exception as e: + raise + + # Step 2: Optional immediate packing/export + if self.compressor.is_immediate_packing: # For gguf, packing conducts on block level + self.compressor._immediate_pack(name) + if to_cpu: + m = m.to("cpu") + packed_m = get_module(self.compressor.model, name) + set_module(self.compressor.model, name, packed_m.to("cpu")) + else: + if to_cpu: + m = m.to("cpu") + set_module(self.compressor.model, name, m) + if self.compressor.is_immediate_saving: + if hasattr(self.compressor, "all_to_quantized_module_names"): + all_to_quantized_module_names = self.compressor.all_to_quantized_module_names + else: + all_to_quantized_module_names = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) + m = get_module(self.compressor.model, name) + shard_writer(self.compressor, m, name, last_module) + + +class OptRTNQuantizer(RTNQuantizer): + + @staticmethod + def register_act_hook(model, supported_types): + """Registers hooks to accumulate activation squared norms into `imatrix`.""" + + def get_imatrix_hook(module, input, output): + input = input[0] if isinstance(input, (tuple, list)) else input + flattened = input.reshape(-1, input.shape[-1]).to(torch.float32) + squared = torch.sum(torch.pow(flattened, 2), dim=0).to(torch.float32) + + if not hasattr(module, "imatrix"): + module.imatrix = squared + module.imatrix_cnt = input.shape[0] + else: + module.imatrix += squared.to(module.imatrix.device) + module.imatrix_cnt += input.shape[0] + + hook_handles = [] + for name, module in model.named_modules(): + if type(module) in supported_types and check_to_quantized(module): + hook = module.register_forward_hook(get_imatrix_hook) + hook_handles.append(hook) + return hook_handles + + @torch.inference_mode() + def _quantize_impl(self, *args, 
**kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + enable_imatrix = False + has_gguf_k = ( + any(fmt.is_gguf() and "k" in fmt.output_format for fmt in getattr(self.compressor, "formats", [])) + or self.compressor.super_bits is not None + ) + if has_gguf_k: + enable_imatrix = True + elif self.compressor.data_type == "int" and self.compressor.sym: + enable_imatrix = True + if enable_imatrix: + self._quant_rtn_with_imatrix(self.all_to_quantized_module_names) + # Convert remaining fp8 + if is_fp8_model(self.compressor.model): + convert_fp8_module_to_16b(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) + self.compressor.quantized = True + return self.compressor.model, self.compressor.layer_config + else: + return super()._quantize_impl(*args, **kwargs) + + def _quant_rtn_with_imatrix(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize all modules in the model using Optimized RTN strategy. + + This method applies optimized RTN quantization to all modules in the model + that are marked for quantization. It leverages input caching and imatrix + techniques for enhanced performance. + + Returns: + tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. + """ + if not ( + any(fmt.is_gguf() for fmt in getattr(self.compressor, "formats", [])) + or self.compressor.super_bits is not None + ): + quantize_embedding_layer( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + scale_dtype=self.compressor.scale_dtype, + disable_opt_rtn=self.compressor.disable_opt_rtn, + device=self.compressor.device, + device_list=self.compressor.device_list, + ) # leave to gguf itself to handle + + self.compressor.model.to("cpu") + # Release memory + clear_memory(device_list=self.compressor.device_list) + + logger.info("start to compute imatrix") + + # Load dataset + from auto_round.calib_dataset import get_dataloader + + if isinstance(self.compressor.dataset, str): + if self.compressor.tokenizer is None: + raise ValueError("A tokenizer must be set for the model when using a dataset string.") + dataset_name = self.compressor.dataset.replace(" ", "") + self.compressor.dataloader = get_dataloader( + self.compressor.tokenizer, + self.compressor.seqlen, + dataset_name, + self.compressor.seed, + self.compressor.batch_size, + self.compressor.nsamples, + ) + else: + self.compressor.dataloader = self.compressor.dataset + + model = self.compressor.model + + # Dispatch multi-GPU model if necessary + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + dispatch_model(model, model.hf_device_map) + + hooks = self.register_act_hook(model, self.compressor.supported_types) + + try: + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + model = model.to("cpu") + clear_memory(device_list=self.compressor.device_list) + self._quantize_via_rtn_blockwise() + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.error(cuda_error_msg) + # Final fallback: warn and use CPU-only quantization + logger.warning( + "Fallback to CPU. " + "Consider enabling `low_gpu_mem_usage` or using more GPUs via `--device 0,1,2,3`." 
+ ) + model = model.to("cpu") + clear_memory(device_list=self.compressor.device_list) + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + + orig_device = self.compressor.device + self.compressor.device = "cpu" + self._quantize_via_rtn_blockwise() + self.compressor.device = orig_device + except Exception as e: + raise + finally: + # Always remove hooks + for hook in hooks: + hook.remove() + + def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: + """Quantizes a layer using RTN (Round-To-Nearest) if available. + + This function attempts to quantize a layer by switching its data type to a + `rtn_*` version if supported, then wraps and unwraps the module to apply + quantization. If GPU memory is insufficient, it falls back to CPU. + + If packing is enabled (`immediate_packing`), the function will also export + the quantized layer to the appropriate backend format. + + Args: + name (str): Name of the layer to quantize. + + Raises: + RuntimeError: If quantization fails for reasons unrelated to memory. + """ + m = get_module(self.compressor.model, name) + if dtype is not None: + m = m.to(dtype) + + if is_fp8_linear(m): + m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device) + set_module(self.compressor.model, name, m) + tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.compressor.device + # Step 1: Try quantization on GPU first, fall back to CPU if OOM + if ( + self.compressor.is_immediate_packing + and self.compressor.iters == 0 + and self.compressor.formats[0].is_gguf() + and not self.compressor.disable_opt_rtn + ): + m = m.to(tuning_device) + m.scale = None + m.zp = None + else: + try: + disable_opt_rtn = False + if ( + self.compressor.orig_disable_opt_rtn is None + and self.compressor.is_moe_model + and "expert" in m.global_name + and "shared_expert" not in m.global_name + and self.compressor.super_bits is None # GGUF still uses the optimized RTN for MoE layers + ): + disable_opt_rtn = True + logger.warning_once( + "MoE layer detected: optimized RTN is disabled for efficiency. " + "Use `--enable_opt_rtn` to force-enable it for MoE layers." 
+ ) + m = m.to(tuning_device) + m = WrapperLinear( + m, + device=tuning_device, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + disable_opt_rtn=disable_opt_rtn, + ) + m = m.unwrapper({}) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + m = m.orig_layer if hasattr(m, "orig_layer") else m + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU.") + m.to("cpu") + m = WrapperLinear( + m, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except Exception as e: + raise + + # Step 2: Optional immediate packing/export + if self.compressor.is_immediate_packing: # For gguf, packing conducts on block level + self.compressor._immediate_pack(name) + if to_cpu: + m = m.to("cpu") + packed_m = get_module(self.compressor.model, name) + set_module(self.compressor.model, name, packed_m.to("cpu")) + else: + if to_cpu: + m = m.to("cpu") + set_module(self.compressor.model, name, m) + if self.compressor.is_immediate_saving: + if hasattr(self.compressor, "all_to_quantized_module_names"): + all_to_quantized_module_names = self.compressor.all_to_quantized_module_names + else: + all_to_quantized_module_names = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) + m = get_module(self.compressor.model, name) + shard_writer(self.compressor, m, name, last_module) diff --git a/auto_round/quantizers/base.py b/auto_round/quantizers/base.py new file mode 100755 index 000000000..d4f70a7ef --- /dev/null +++ b/auto_round/quantizers/base.py @@ -0,0 +1,103 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
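The `imatrix` that `OptRTNQuantizer` feeds into the optimized RTN path above is simply an accumulated per-input-channel sum of squared activations, collected through forward hooks during calibration and removed afterwards. A minimal, self-contained sketch of that accumulation (the toy `nn.Linear` model and the helper name are illustrative only, not part of this patch):

```python
import torch
import torch.nn as nn


def make_imatrix_hook():
    """Accumulate the per-input-channel sum of squared inputs, as get_imatrix_hook does."""

    def hook(module, inputs, output):
        x = inputs[0] if isinstance(inputs, (tuple, list)) else inputs
        flat = x.reshape(-1, x.shape[-1]).to(torch.float32)   # (tokens, in_features)
        squared = flat.pow(2).sum(dim=0)                       # one value per input channel
        if not hasattr(module, "imatrix"):
            module.imatrix = squared
            module.imatrix_cnt = x.shape[0]
        else:
            module.imatrix += squared.to(module.imatrix.device)
            module.imatrix_cnt += x.shape[0]

    return hook


if __name__ == "__main__":
    layer = nn.Linear(8, 4)
    handle = layer.register_forward_hook(make_imatrix_hook())
    for _ in range(3):                  # a tiny stand-in for the calibration loop
        layer(torch.randn(2, 8))
    handle.remove()
    print(layer.imatrix.shape)          # torch.Size([8]): one statistic per input channel
```

In the patch, `register_act_hook` installs an equivalent hook on every supported, to-be-quantized module, and the handles are removed in the `finally` branch once `_quantize_via_rtn_blockwise` has consumed the statistics.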
+ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class BaseQuantizer(ABC): + def __init__(self, compressor: "BaseCompressor"): + self.compressor = compressor + + def __mro_call(self, method_name: str, *args, **kwargs): + for cls in type(self).mro(): + method = cls.__dict__.get(method_name, None) + if method: + method(self, *args, **kwargs) + + def pre_quantize(self, *args, **kwargs): + self.__mro_call("_pre_quantize_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_impl(self, *args, **kwargs): + pass + + def quantize(self, *args, **kwargs): + self.pre_quantize(*args, **kwargs) + self._quantize_impl(*args, **kwargs) + self.post_quantize(*args, **kwargs) + return self.compressor.model, self.compressor.layer_config + + @abstractmethod + def _quantize_impl(self, *args, **kwargs): + pass + + def post_quantize(self, *args, **kwargs): + self.__mro_call("_post_quantize_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_impl(self, *args, **kwargs): + pass + + def pre_quantize_layer(self, *args, **kwargs): + self.__mro_call("_pre_quantize_layer_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_layer_impl(self, *args, **kwargs): + pass + + def quantize_layer(self, *args, **kwargs): + self.pre_quantize_layer(*args, **kwargs) + result = self._quantize_layer_impl(*args, **kwargs) + self.post_quantize_layer(*args, **kwargs) + return result + + @abstractmethod + def _quantize_layer_impl(self, *args, **kwargs): + pass + + def post_quantize_layer(self, *args, **kwargs): + pass + self.__mro_call("_post_quantize_layer_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_layer_impl(self, *args, **kwargs): + pass + + def pre_quantize_block(self, *args, **kwargs): + self.__mro_call("_pre_quantize_block_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_block_impl(self, *args, **kwargs): + pass + + def quantize_block(self, *args, **kwargs): + self.pre_quantize_block(*args, **kwargs) + result = self._quantize_block_impl(*args, **kwargs) + self.post_quantize_block(*args, **kwargs) + return result + + @abstractmethod + def _quantize_block_impl(self, *args, **kwargs): + pass + + def post_quantize_block(self, *args, **kwargs): + self.__mro_call("_post_quantize_block_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_block_impl(self, *args, **kwargs): + pass diff --git a/auto_round/quantizers/entrypoint.py b/auto_round/quantizers/entrypoint.py new file mode 100755 index 000000000..e0eda7258 --- /dev/null +++ b/auto_round/quantizers/entrypoint.py @@ -0,0 +1,53 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
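The `__mro_call` helper in `BaseQuantizer` above is what lets the dynamically composed quantizer classes cooperate: instead of relying on `super()`, it walks the instance's MRO and invokes every class's own `_pre_quantize_impl` / `_post_quantize_impl` (and the layer/block variants), so each mixin contributes its hook exactly once. A stripped-down illustration of that dispatch, using toy classes rather than the project's API:

```python
class ToyBase:
    def _mro_call(self, method_name, *args, **kwargs):
        # Invoke every class's *own* definition of the hook, most-derived class first.
        for cls in type(self).mro():
            method = cls.__dict__.get(method_name)
            if method:
                method(self, *args, **kwargs)

    def pre_quantize(self, *args, **kwargs):
        self._mro_call("_pre_quantize_impl", *args, **kwargs)

    def _pre_quantize_impl(self, *args, **kwargs):
        print("base pre-step")


class ToyAlg(ToyBase):
    def _pre_quantize_impl(self, *args, **kwargs):
        print("algorithm pre-step")  # runs in addition to, not instead of, the base pre-step


class ToyDataType(ToyAlg):
    def _pre_quantize_impl(self, *args, **kwargs):
        print("data-type pre-step")


ToyDataType().pre_quantize()
# data-type pre-step
# algorithm pre-step
# base pre-step
```

Because the walk starts at the most-derived class, finer-granularity mixins run their hooks before the coarser ones.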
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from auto_round.compressors.base import BaseCompressor
+
+from auto_round.quantizers.algs.auto_round import ARAdamQuantizer, ARQuantizer
+from auto_round.quantizers.algs.rtn import OptRTNQuantizer, RTNQuantizer
+
+
+class AutoRoundQuantizer:
+    def __new__(cls, compressor: "BaseCompressor", dynamic_quantizers: dict = None):
+        assert dynamic_quantizers is not None, "Please provide dynamic_quantizers dict."
+        quantizer_cls = type("AutoRoundQuantizer", (dynamic_quantizers["algs"],), {})
+        return quantizer_cls(compressor)
+
+
+class Quantizers:
+    def __init__(self, quantizers: list[AutoRoundQuantizer]):
+        self.quantizers = quantizers
+
+    def quantize(self, *args, **kwargs):
+        for quantizer in self.quantizers:
+            model, layer_config = quantizer.quantize(*args, **kwargs)
+        return model, layer_config
+
+
+def create_quantizers(compressor: "BaseCompressor"):
+
+    alg_cls = None
+    if compressor.iters > 0:
+        alg_cls = ARQuantizer if compressor.enable_adam is False else ARAdamQuantizer
+    else:
+        alg_cls = OptRTNQuantizer if compressor.disable_opt_rtn is False else RTNQuantizer
+
+    dynamic_quantizers = {"algs": alg_cls}
+    return Quantizers(
+        quantizers=[
+            AutoRoundQuantizer(compressor, dynamic_quantizers=dynamic_quantizers),
+        ]
+    )
diff --git a/auto_round/quantizers/readme.md b/auto_round/quantizers/readme.md
new file mode 100644
index 000000000..7a5b7fad6
--- /dev/null
+++ b/auto_round/quantizers/readme.md
@@ -0,0 +1,21 @@
+# AutoRound Quantizer
+The main functional component: it contains the different algorithms and the concrete execution logic of quantization.
+
+## Structure and Call Flow
+AutoRoundQuantizer is split into three (extensible) layers, from coarse to fine granularity: algs, model_type and data_type. A quantizer is constructed dynamically by inheriting methods from each layer; classes within one layer are mutually exclusive, while classes from different layers can be combined freely.
+
+AutoRoundQuantizer
+- algs
+  - RTN
+  - Tuning(auto_round)
+- model_type
+  - llm
+  - mllm
+  - diffusion
+- data_type
+  - gguf
+  - nvfp/mxfp
+### 1. AutoRoundQuantizer
+The main entry point. Based on the configuration, it uses `__new__` to dynamically construct a Quantizer that inherits methods from AlgsQuantizer, ModelTypeQuantizer and DataTypeQuantizer; a finer-granularity layer may override methods of a coarser-granularity layer.
+
+### 2. AlgsQuantizer
\ No newline at end of file
diff --git a/auto_round/quantizers/utils.py b/auto_round/quantizers/utils.py
new file mode 100755
index 000000000..a6f1dd4bc
--- /dev/null
+++ b/auto_round/quantizers/utils.py
@@ -0,0 +1,282 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
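`AutoRoundQuantizer.__new__` in `entrypoint.py` composes the final quantizer class at runtime with `type(...)` and hands back an instance of it, while `Quantizers` simply drives a list of such instances. A toy mirror of that construction, using stand-in mixin classes (none of these names exist in the patch):

```python
# Stand-in mixins; in the real patch the "algs" slot holds e.g. RTNQuantizer,
# and future model_type / data_type mixins would occupy the other layers.
class AlgMixin:
    def quantize(self):
        return f"{type(self).__name__}: algorithm-level quantize()"


class ModelTypeMixin:
    pass


class ToyAutoRoundQuantizer:
    """Mirrors AutoRoundQuantizer.__new__: build a class on the fly and return an instance of it."""

    def __new__(cls, mixins):
        quantizer_cls = type("ToyAutoRoundQuantizer", tuple(mixins), {})
        return quantizer_cls()


class ToyQuantizers:
    """Mirrors Quantizers: run every composed quantizer in order."""

    def __init__(self, quantizers):
        self.quantizers = quantizers

    def quantize(self):
        result = None
        for quantizer in self.quantizers:
            result = quantizer.quantize()
        return result


q = ToyAutoRoundQuantizer([ModelTypeMixin, AlgMixin])
print([c.__name__ for c in type(q).__mro__])   # the dynamically composed bases
print(ToyQuantizers([q]).quantize())
```

With the current `create_quantizers`, only the `algs` layer is populated (RTN, optimized RTN, or the tuning-based quantizers); the same mechanism extends to the model_type and data_type layers sketched in the readme.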
+import traceback +from typing import Any, Callable, Optional, Union + +import torch + +from auto_round.compressors.utils import ( + check_need_act_calibration, + is_nv_fp, +) +from auto_round.data_type import QUANT_FUNC_WITH_DTYPE +from auto_round.data_type.utils import reshape_pad_tensor_by_group_size +from auto_round.logger import logger +from auto_round.utils import ( + SUPPORTED_LAYER_TYPES, + check_to_quantized, + clear_memory, + get_layer_names_in_block, + get_module, + to_device, + to_dtype, +) + + +def register_act_max_hook(model: torch.nn.Module, layer_config: dict, act_group_size: int, act_data_type: str): + def get_act_max_hook(module, input, output): + if isinstance(input, (tuple, list)): + input = input[0] + if input.numel() == 0: + return # as no needs for act_max update + input, _, _ = reshape_pad_tensor_by_group_size(input, act_group_size) + act_max = torch.max(torch.abs(input), dim=-1).values + if not hasattr(module, "act_max") or module.act_max.numel() == 0: + module.act_max = act_max + else: + act_max = act_max.to(module.act_max.device) + if is_nv_fp(act_data_type): ## for nvfp per-tensor input_global_scale calculation usage + module.act_max = torch.max(torch.tensor([act_max.max(), module.act_max.max()], device=act_max.device)) + else: + module.act_max = torch.max(act_max, module.act_max) + + hook_handles = [] + # for single layers out of blocks, like lm_head + if isinstance(model, SUPPORTED_LAYER_TYPES): + m = model + if ( + hasattr(m, "act_dynamic") + and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) + and check_to_quantized(m) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + return hook_handles + + for n, m in model.named_modules(): + if ( + hasattr(m, "act_dynamic") + and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) + and check_to_quantized(m) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + continue + + # for whole model, RTN + if n in layer_config: + config = layer_config[n] + act_dynamic = config.get("act_dynamic", True) + act_data_type = config.get("act_data_type", None) + act_bits = config.get("act_bits", 16) + if ( + config["bits"] <= 8 + and check_need_act_calibration(act_dynamic, act_data_type, act_bits) + and check_to_quantized(config) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + continue + return hook_handles + + +@torch.inference_mode() +def quantize_embedding_layer( + model: torch.nn.Module, + layer_config: dict, + scale_dtype: str, + disable_opt_rtn: bool, + device: Union[str, torch.device], + device_list: list, +) -> bool: + """Quantizes embedding layers in the model according to the configuration. + + This method iterates through all modules in the model, identifies embedding + layers specified in `layer_config`, and applies the appropriate quantization + function based on bit precision, grouping strategy, and dtype. + + Returns: + bool: True if the quantization process completes without critical errors. 
+ """ + is_quantized = False + for name, module in model.named_modules(): + # Skip non-Embedding modules or layers not in config + if not isinstance(module, torch.nn.Embedding) or name not in layer_config: + continue + + config = layer_config[name] + + # Skip layers that are not marked for quantization + if not check_to_quantized(config): + continue + is_quantized = True + config["scale_dtype"] = scale_dtype + dtype = config["data_type"] + + # Determine quantization function key with symmetry/asymmetry + if dtype not in QUANT_FUNC_WITH_DTYPE: + dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}" + + # Optionally use optimized rounding (RTN) variant + if not disable_opt_rtn and f"rtn_{dtype}" in QUANT_FUNC_WITH_DTYPE: + dtype = f"rtn_{dtype}" + + quant_func = QUANT_FUNC_WITH_DTYPE[dtype] + dtype = module.weight.dtype + # As typically float32 are used in RTN to search scale zp, + # to avoid cache a bf16 copy we'd better use float32 + if config.get("super_group_size", None) is not None: + dtype = torch.float32 + + # Attempt quantization on GPU, fall back to CPU if OOM + try: + weight, scale, zp = quant_func( + module.weight.to(dtype=dtype, device=device), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU") + weight, scale, zp = quant_func( + module.weight.to("cpu"), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except Exception as e: + raise + + # Overwrite the module's weights with the quantized version + module.weight.data.copy_(weight.cpu()) + + # Attach scale and zero point (zp) to the module + for param_name, value in zip(["scale", "zp"], [scale, zp]): + if isinstance(value, dict): + for k, v in value.items(): + setattr(module, k if k == "scale" else f"w_{k}", v.cpu()) + elif isinstance(value, torch.Tensor): + setattr(module, param_name, value.cpu()) + else: + setattr(module, param_name, value) + + # Update config + layer_config.setdefault(name, {}).update(config) + del weight + del scale + del zp + clear_memory(device_list=device_list) + return is_quantized + + +def get_quantized_layer_names_outside_blocks( + model: torch.nn.Module, layer_config: dict, supported_types: list, quant_block_list: list +) -> list: + """Gets the names of quantized layers outside blocks in the model. + + Returns: + list: List of layer names outside blocks. 
+    """
+    if layer_config is None or len(layer_config) == 0:
+        return []
+
+    layer_names = []
+    all_layers_in_block = get_layer_names_in_block(model, supported_types, quant_block_list)
+
+    for key in layer_config.keys():
+        if key in all_layers_in_block:
+            continue
+        layer = get_module(model, key)
+        if layer is None:
+            logger.error(f"could not find layer {key} in the model, exit...")
+            exit(-1)
+        if type(layer) in supported_types and check_to_quantized(layer_config[key]):
+            layer_names.append(key)
+
+    return layer_names
+
+
+def get_non_zero_cnt(tensor: list[torch.Tensor], indices: list[int]) -> int:
+    current_tensors = [tensor[i] for i in indices]
+    non_zero_cnt = 0
+    for t in current_tensors:
+        non_zero_cnt += torch.count_nonzero(t).item()
+    return non_zero_cnt
+
+
+def split_inputs(inputs: dict, first_input_name: str, is_diffusion: bool = False) -> tuple[torch.Tensor, dict]:
+    if is_diffusion:
+        input_id_str = [key for key in inputs.keys() if "hidden_state" in key]
+        input_ids = {k: inputs.pop(k, None) for k in input_id_str}
+        input_others = inputs
+        return input_ids, input_others
+    else:
+        input_ids = inputs[first_input_name]
+        inputs.pop(first_input_name, None)
+        input_others = inputs
+        return input_ids, input_others
+
+
+def preprocess_block_inputs(
+    inputs,
+    device_list: list,
+    first_input_name="input_ids",
+    amp: bool = False,
+    amp_dtype: torch.dtype = torch.float32,
+    cache_device: Union[str, torch.device] = "cpu",
+    is_diffusion: bool = False,
+):
+    input_ids, input_others = split_inputs(inputs, first_input_name, is_diffusion=is_diffusion)
+    clear_memory(device_list=device_list)
+    input_ids = to_device(input_ids, cache_device)
+    input_others = to_device(input_others, cache_device)
+    # Calibration may run in bf16 (e.g. with low_gpu_mem_usage), so cast the cached inputs explicitly.
+
+    tmp_dtype = amp_dtype if amp else torch.float32
+    input_ids = to_dtype(input_ids, tmp_dtype)
+
+    for key in input_others.keys():
+        if isinstance(input_others[key], torch.Tensor) and (
+            input_others[key].dtype == torch.float16 or input_others[key].dtype == torch.bfloat16
+        ):
+            input_others[key] = input_others[key].to(tmp_dtype)
+        elif isinstance(input_others[key], list):
+            for i in range(len(input_others[key])):
+                input_others[key][i] = to_dtype(input_others[key][i], tmp_dtype)
+    return input_ids, input_others
+
+
+def update_inputs(inputs: dict, q_inputs: dict, is_diffusion: bool) -> tuple[dict, dict]:
+    if is_diffusion:
+        input_id_str = [key for key in inputs.keys() if "hidden_state" in key]
+        if q_inputs is not None:
+            q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str}
+        return inputs, q_inputs
+    else:
+        keys = inputs.keys()
+        input_id_str = [key for key in keys if key.startswith("hidden_state")]
+        if len(input_id_str) != 1:
+            raise RuntimeError(
+                "hidden_states arg mismatch error, "
+                "please file an issue at https://github.com/intel/auto-round/issues"
+            )
+        inputs["input_ids"] = inputs.pop(input_id_str[0], None)
+        if q_inputs is not None:
+            q_inputs = q_inputs.pop(input_id_str[0], None)
+        return inputs, q_inputs
diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py old mode 100644 new mode 100755 diff --git a/test/test_cpu/utils/test_cli_usage.py b/test/test_cpu/utils/test_cli_usage.py old mode 100644 new mode 100755 diff --git a/test/test_cuda/advanced/test_multiple_card_calib.py b/test/test_cuda/advanced/test_multiple_card_calib.py old mode 100644 new mode 100755 diff --git a/test/test_cuda/export/test_gguf.py b/test/test_cuda/export/test_gguf.py old mode 100644 new mode
100755 diff --git a/test/test_cuda/models/test_support_vlms.py b/test/test_cuda/models/test_support_vlms.py old mode 100644 new mode 100755 diff --git a/test/test_cuda/utils/test_alg_ext.py b/test/test_cuda/utils/test_alg_ext.py old mode 100644 new mode 100755
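For context on what the `rtn_*` code paths above ultimately compute: the rounding itself is delegated to `WrapperLinear.unwrapper` and the `QUANT_FUNC_WITH_DTYPE` registry, neither of which appears in this diff. The following is a minimal, illustrative asymmetric round-to-nearest kernel under assumed per-group min/max scaling; it is not the project's implementation:

```python
import torch


def rtn_quantize(weight: torch.Tensor, bits: int = 4, group_size: int = 128):
    """Asymmetric RTN: one (scale, zero-point) pair per group of weights, round to nearest."""
    orig_shape = weight.shape
    w = weight.reshape(-1, group_size)                      # assumes in_features % group_size == 0
    w_min = w.min(dim=-1, keepdim=True).values
    w_max = w.max(dim=-1, keepdim=True).values
    qmax = 2**bits - 1
    scale = (w_max - w_min).clamp(min=1e-5) / qmax          # step size per group
    zp = torch.round(-w_min / scale)                        # integer zero point per group
    q = torch.clamp(torch.round(w / scale) + zp, 0, qmax)   # round to nearest, then clip
    w_hat = (q - zp) * scale                                # dequantized weights seen at inference
    return q.reshape(orig_shape), scale, zp, w_hat.reshape(orig_shape)


if __name__ == "__main__":
    w = torch.randn(16, 256)
    q, scale, zp, w_hat = rtn_quantize(w)
    print("max abs error:", (w - w_hat).abs().max().item())
```

The optimized RTN variants additionally consume the `imatrix` statistics collected during calibration; how those statistics are used inside the kernels is outside the scope of this diff.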