From 246f817c6be0dbdbb3d2d4e1a7ca6e3e02ee173b Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 14 Jan 2026 03:52:45 -0500 Subject: [PATCH 1/5] refactor rtn and tuning Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 1369 +---------------- .../compressors/diffusion/compressor.py | 12 - auto_round/quantizers/__init__.py | 15 + auto_round/quantizers/algs/auto_round.py | 888 +++++++++++ auto_round/quantizers/algs/base.py | 27 + auto_round/quantizers/algs/rtn.py | 627 ++++++++ auto_round/quantizers/base.py | 50 + auto_round/quantizers/entrypoint.py | 55 + auto_round/quantizers/readme.md | 21 + auto_round/quantizers/utils.py | 262 ++++ 10 files changed, 1961 insertions(+), 1365 deletions(-) create mode 100644 auto_round/quantizers/__init__.py create mode 100644 auto_round/quantizers/algs/auto_round.py create mode 100644 auto_round/quantizers/algs/base.py create mode 100644 auto_round/quantizers/algs/rtn.py create mode 100644 auto_round/quantizers/base.py create mode 100644 auto_round/quantizers/entrypoint.py create mode 100644 auto_round/quantizers/readme.md create mode 100644 auto_round/quantizers/utils.py diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 5d7ba6371..a383659d0 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -52,7 +52,6 @@ reset_params, set_layer_config, ) -from auto_round.data_type import QUANT_FUNC_WITH_DTYPE from auto_round.data_type.utils import reshape_pad_tensor_by_group_size from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG, ModelType from auto_round.formats import OutputFormat, get_formats @@ -902,281 +901,6 @@ def _get_save_folder_name(self, format: OutputFormat) -> str: return self.orig_output_dir - @torch.inference_mode() - def _quantize_embedding_layer(self): - """Quantizes embedding layers in the model according to the configuration. - - This method iterates through all modules in the model, identifies embedding - layers specified in `self.layer_config`, and applies the appropriate quantization - function based on bit precision, grouping strategy, and dtype. - - Returns: - bool: True if the quantization process completes without critical errors. 
- """ - is_quantized = False - for name, module in self.model.named_modules(): - # Skip non-Embedding modules or layers not in config - if not isinstance(module, torch.nn.Embedding) or name not in self.layer_config: - continue - - config = self.layer_config[name] - - # Skip layers that are not marked for quantization - if not check_to_quantized(config): - continue - is_quantized = True - config["scale_dtype"] = self.scale_dtype - dtype = config["data_type"] - - # Determine quantization function key with symmetry/asymmetry - if dtype not in QUANT_FUNC_WITH_DTYPE: - dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}" - - # Optionally use optimized rounding (RTN) variant - if not self.disable_opt_rtn and f"rtn_{dtype}" in QUANT_FUNC_WITH_DTYPE: - dtype = f"rtn_{dtype}" - - quant_func = QUANT_FUNC_WITH_DTYPE[dtype] - dtype = module.weight.dtype - # As typically float32 are used in RTN to search scale zp, - # to avoid cache a bf16 copy we'd better use float32 - if config.get("super_group_size", None) is not None: - dtype = torch.float32 - - # Attempt quantization on GPU, fall back to CPU if OOM - try: - weight, scale, zp = quant_func( - module.weight.to(dtype=dtype, device=self.device), - **{ - k: config.get(k, None) - for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] - }, - ) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - try: - logger.error(cuda_error_msg) - logger.warning("falling back to CPU") - weight, scale, zp = quant_func( - module.weight.to("cpu"), - **{ - k: config.get(k, None) - for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] - }, - ) - except Exception as e: - raise - - # Overwrite the module's weights with the quantized version - module.weight.data.copy_(weight.cpu()) - - # Attach scale and zero point (zp) to the module - for param_name, value in zip(["scale", "zp"], [scale, zp]): - if isinstance(value, dict): - for k, v in value.items(): - setattr(module, k if k == "scale" else f"w_{k}", v.cpu()) - elif isinstance(value, torch.Tensor): - setattr(module, param_name, value.cpu()) - else: - setattr(module, param_name, value) - - # Update config - self.layer_config.setdefault(name, {}).update(config) - del weight - del scale - del zp - clear_memory(device_list=self.device_list) - - return is_quantized - - def _quant_rtn_with_imatrix(self, all_to_quantized_module_names: list[str]) -> None: - """Performs RTN quantization using input activation statistics (imatrix). - - This method accumulates per-channel second-moment activation statistics (imatrix) - via forward hooks and uses them to perform RTN quantization. If CUDA memory runs out, - it falls back to CPU-based blockwise quantization. - - Args: - all_to_quantized_module_names (list[str]): - A list of module names (e.g., 'model.layers.0.self_attn.q_proj') to be quantized. 
- - Returns: - None - """ - logger.info("start to compute imatrix") - - # Load dataset - from auto_round.calib_dataset import get_dataloader - - if isinstance(self.dataset, str): - if self.tokenizer is None: - raise ValueError("A tokenizer must be set for the model when using a dataset string.") - dataset_name = self.dataset.replace(" ", "") - self.dataloader = get_dataloader( - self.tokenizer, self.seqlen, dataset_name, self.seed, self.batch_size, self.nsamples - ) - else: - self.dataloader = self.dataset - - model = self.model - - # Dispatch multi-GPU model if necessary - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - dispatch_model(model, model.hf_device_map) - - def register_act_hook(model): - """Registers hooks to accumulate activation squared norms into `imatrix`.""" - - def get_imatrix_hook(module, input, output): - input = input[0] if isinstance(input, (tuple, list)) else input - flattened = input.reshape(-1, input.shape[-1]).to(torch.float32) - squared = torch.sum(torch.pow(flattened, 2), dim=0).to(torch.float32) - - if not hasattr(module, "imatrix"): - module.imatrix = squared - module.imatrix_cnt = input.shape[0] - else: - module.imatrix += squared.to(module.imatrix.device) - module.imatrix_cnt += input.shape[0] - - hook_handles = [] - for name, module in model.named_modules(): - if type(module) in self.supported_types and check_to_quantized(module): - hook = module.register_forward_hook(get_imatrix_hook) - hook_handles.append(hook) - return hook_handles - - hooks = register_act_hook(model) - - try: - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(model) - model = model.to("cpu") - clear_memory(device_list=self.device_list) - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - try: - logger.error(cuda_error_msg) - # Final fallback: warn and use CPU-only quantization - logger.warning( - "Fallback to CPU. " - "Consider enabling `low_gpu_mem_usage` or using more GPUs via `--device 0,1,2,3`." - ) - model = model.to("cpu") - clear_memory(device_list=self.device_list) - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(model) - - orig_device = self.device - self.device = "cpu" - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - self.device = orig_device - except Exception as e: - raise - finally: - # Always remove hooks - for hook in hooks: - hook.remove() - - def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: - """Quantizes a layer using RTN (Round-To-Nearest) if available. - - This function attempts to quantize a layer by switching its data type to a - `rtn_*` version if supported, then wraps and unwraps the module to apply - quantization. If GPU memory is insufficient, it falls back to CPU. - - If packing is enabled (`immediate_packing`), the function will also export - the quantized layer to the appropriate backend format. - - Args: - name (str): Name of the layer to quantize. - - Raises: - RuntimeError: If quantization fails for reasons unrelated to memory. 
- """ - m = get_module(self.model, name) - if dtype is not None: - m = m.to(dtype) - - if is_fp8_linear(m): - m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) - set_module(self.model, name, m) - tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.device - # Step 1: Try quantization on GPU first, fall back to CPU if OOM - if self.immediate_packing and self.iters == 0 and self.formats[0].is_gguf() and not self.disable_opt_rtn: - m = m.to(tuning_device) - m.scale = None - m.zp = None - else: - try: - disable_opt_rtn = self.disable_opt_rtn - if ( - not disable_opt_rtn - and self.orig_disable_opt_rtn is None - and self.is_moe_model - and "expert" in m.tmp_name - and "shared_expert" not in m.tmp_name - and self.super_bits is None # GGUF still uses the optimized RTN for MoE layers - ): - disable_opt_rtn = True - logger.warning_once( - "MoE layer detected: optimized RTN is disabled for efficiency. " - "Use `--enable_opt_rtn` to force-enable it for MoE layers." - ) - m = m.to(tuning_device) - m = WrapperLinear( - m, - device=tuning_device, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_round_tuning=False, - enable_torch_compile=self.enable_torch_compile, - disable_opt_rtn=disable_opt_rtn, - ) - m = m.unwrapper({}) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - m = m.orig_layer if hasattr(m, "orig_layer") else m - try: - logger.error(cuda_error_msg) - logger.warning("falling back to CPU.") - m.to("cpu") - m = WrapperLinear( - m, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_round_tuning=False, - enable_torch_compile=self.enable_torch_compile, - ) - m = m.unwrapper({}) - except Exception as e: - raise - - # Step 2: Optional immediate packing/export - if self.immediate_packing: # For gguf, packing conducts on block level - self._immediate_pack(name) - if to_cpu: - m = m.to("cpu") - packed_m = get_module(self.model, name) - set_module(self.model, name, packed_m.to("cpu")) - else: - if to_cpu: - m = m.to("cpu") - set_module(self.model, name, m) - if self.immediate_saving: - if hasattr(self, "all_to_quantized_module_names"): - all_to_quantized_module_names = self.all_to_quantized_module_names - else: - all_to_quantized_module_names = [n for n, m in self.model.named_modules() if check_to_quantized(m)] - last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) - m = get_module(self.model, name) - immediate_saving(self, m, name, last_module) - def _immediate_pack(self, name: str): if not self.immediate_packing: return @@ -1192,230 +916,6 @@ def _immediate_pack(self, name: str): image_processor=self.image_processor if hasattr(self, "image_processor") else None, ) - @torch.inference_mode() - def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: - """Quantize all modules in the model using RTN (Round-To-Nearest) strategy. - - If the target format includes GGUF with `k`, and optimized RTN is enabled, - blockwise quantization with input caching and imatrix is used. - - Returns: - tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. 
- """ - if self.amp and self.model.dtype != self.amp_dtype: - self.model.to(self.amp_dtype) - - all_to_quantized_module_names: list[str] = [n for n, m in self.model.named_modules() if check_to_quantized(m)] - self.all_to_quantized_module_names = all_to_quantized_module_names - if is_nv_fp(self.data_type): - from auto_round.data_type.nvfp import calculate_gparam - from auto_round.data_type.utils import update_fused_layer_global_scales - - pbar = tqdm(all_to_quantized_module_names) - for name in pbar: - pbar.set_description(f"Calculate weight global scale: {name}") - m = get_module(self.model, name) - if is_fp8_linear(m): - m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) - set_module(self.model, name, m) - weight_global_scale = calculate_gparam(m.weight, self.group_size) - setattr(m, "weight_global_scale", weight_global_scale) - - logger.info("Start to update fused layer global scales, it may take some time.") - for name, module in self.model.named_modules(): - update_fused_layer_global_scales(module) - logger.info("Finished updating fused layer global scales.") - - if not (any(fmt.is_gguf() for fmt in getattr(self, "formats", [])) or self.super_bits is not None): - self._quantize_embedding_layer() # leave to gguf itself to handle - - self.model.to("cpu") - # Release memory - clear_memory(device_list=self.device_list) - - enable_imatrix = False - if not self.disable_opt_rtn: - has_gguf_k = ( - any(fmt.is_gguf() and "k" in fmt.output_format for fmt in getattr(self, "formats", [])) - or self.super_bits is not None - ) - if has_gguf_k: - enable_imatrix = True - elif self.data_type == "int" and self.sym: - enable_imatrix = True - if enable_imatrix: - self._quant_rtn_with_imatrix(all_to_quantized_module_names) - elif self.act_bits <= 8 and check_need_act_calibration( - self.act_dynamic, - self.act_data_type, - self.act_bits, - self.static_kv_dtype, - self.static_attention_dtype, - ): # TODO, mixed datatype has bug - hook_handles = self._register_act_max_hook(self.model) - try: - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - except torch.OutOfMemoryError: - logger.warning("Fallback to CPU. Consider using more GPUs via `--device 0,1,2,3`.") - self.model = self.model.to("cpu") - clear_memory(device_list=self.device_list) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(self.model) - orig_device = self.device - self.device = "cpu" - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - self.device = orig_device - for handle in hook_handles: - handle.remove() - else: - block_names_cnt = len(flatten_list(get_block_names(self.model, True))) - clear_mem_freq = len(all_to_quantized_module_names) // block_names_cnt - if clear_mem_freq == 0: - clear_mem_freq = 1 - pbar = tqdm(all_to_quantized_module_names) - cnt = 1 - for name in pbar: - pbar.set_description(f"Quantizing {name}") - self._quantize_layer_via_rtn(name) - if cnt % clear_mem_freq == 0: - clear_memory(device_list=self.device_list) - memory_monitor.log_summary() - cnt = 1 - cnt += 1 - # Convert remaining fp8 - if is_fp8_model(self.model): - convert_fp8_model_to_16b_model(self.model, self.amp_dtype, self.device) - self.quantized = True - return self.model, self.layer_config - - def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) -> None: - """Quantize model layers block by block using cached inputs and imatrix. 
- - Args: - all_to_quantized_module_names (list[str]): Names of layers to be quantized. - """ - all_to_quantized_module_names = list(set(all_to_quantized_module_names)) - - all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) - if not all_blocks: - raise ValueError("Could not find any blocks. Check the model or quant_block_list.") - - all_first_block_names = [block[0] for block in all_blocks] - layer_names = self._get_quantized_layer_names_outside_blocks() - if self.act_bits < 16 and (not self.act_dynamic or len(layer_names) > 0): - if len(layer_names) > 0: - logger.warning( - "quantize layers outside blocks for static activation quantizaiton" - " will significantly increase calibration time" - ) - all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names) - else: - all_inputs = self.cache_inter_data(all_first_block_names, self.nsamples) - - # Clear hooks for multi-GPU setups - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules(self.model) - - pbar = tqdm(range(sum(len(block) for block in all_blocks))) - - for block_names in all_blocks: - first_block = block_names[0] - inputs = all_inputs.pop(first_block) - input_keys = [k for k in inputs if k.startswith("hidden_state")] - if len(input_keys) != 1: - raise RuntimeError( - "hidden_states arg mismatch. Please file an issue at https://github.com/intel/auto-round/issues" - ) - inputs["input_ids"] = inputs.pop(input_keys[0]) - - clear_memory(self.inputs, device_list=self.device_list) - - total_samples = len(inputs["input_ids"]) - if total_samples < self.batch_size: - self.batch_size = total_samples - logger.warning(f"Forcing batch size to {total_samples}") - - input_ids = to_device(inputs.pop("input_ids"), self.cache_device) - input_others = to_device(inputs, self.cache_device) - - tmp_dtype = self.amp_dtype if self.amp else torch.float32 - input_ids = [id_.to(tmp_dtype) for id_ in input_ids] - - for key, val in input_others.items(): - if isinstance(val, torch.Tensor) and val.dtype in (torch.float16, torch.bfloat16): - input_others[key] = val.to(tmp_dtype) - elif isinstance(val, list): - input_others[key] = [to_dtype(v, tmp_dtype) for v in val] - - for block_name in block_names: - pbar.set_description(f"Quantizing {block_name}") - block = get_module(self.model, block_name) - if is_fp8_model(self.model): - convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype, device=self.device) - - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: - set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, self.device - ) - # Dispatch model if needed - if len(self.device_list) > 1: - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - for _, m in block.named_modules(): - if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): - continue - hook = AlignDevicesHook(m.tuning_device, io_same_device=True) - add_hook_to_module(m, hook, True) - else: - block = block.to(self.device) - input_ids = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - self.device, - self.cache_device, - ) - - if len(self.device_list) > 1: - accelerate.hooks.remove_hook_from_submodules(block) - - if is_nv_fp(self.act_data_type) or is_static_wfp8afp8(self): - # enable moe experts act_max automatic generation for Linear - set_amax_for_all_moe_layers(block, 
attr_name="act_max") - # Normalize imatrix and quantize layers - if self.low_gpu_mem_usage: - block.to("cpu") - clear_memory(device_list=self.device_list) - - for _, m in block.named_modules(): - # fix issue: Ling-flash-2.0-q2_k_s fail infer on cuda but well on cpu - # https://huggingface.co/Intel/Ling-flash-2.0-gguf-q2ks-mixed-AutoRound/discussions/1 - if hasattr(m, "imatrix"): - m.imatrix /= m.imatrix_cnt - if hasattr(m, "tmp_name") and m.tmp_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.tmp_name, to_cpu=self.low_gpu_mem_usage) - all_to_quantized_module_names.remove(m.tmp_name) - if not self.immediate_saving: - mv_module_from_gpu(block) - if block_name == block_names[-1]: - clear_memory(input_ids, device_list=self.device_list) - else: - clear_memory(device_list=self.device_list) - - memory_monitor.log_summary() - pbar.update(1) - pbar.close() - # Process remaining layers not in blocks - for name in all_to_quantized_module_names: - dtype = None - if self.super_group_size is not None: - dtype = torch.float32 - self._quantize_layer_via_rtn(name, dtype=dtype) - # clear_memory(device_list=self.device_list) - def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: keys = inputs.keys() input_id_str = [key for key in keys if key.startswith("hidden_state")] @@ -1484,212 +984,11 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if self.immediate_saving and "int" not in self.data_type: logger.warning("immediate_saving is only supported for int quantization, set to False") self.immediate_saving = False - if self.iters == 0: - return self._quantize_rtn() - - if bool(self.quant_block_list): - all_blocks = self.quant_block_list - else: - all_blocks = get_block_names(self.model) - if len(all_blocks) == 0: - logger.warning("could not find blocks, exit with original model") - return self.model, self.layer_config + from auto_round.quantizers import create_quantizers - if self.amp and self.model.dtype != self.amp_dtype: - self.model = self.model.to(self.amp_dtype) - - layer_names = self._get_quantized_layer_names_outside_blocks() - self.start_time = time.time() - all_first_block_names = [block[0] for block in all_blocks] - if len(layer_names) > 0: - logger.info( - "Starting to cache block inputs. 
This may be slow due to external block layers: %s", layer_names - ) - else: - logger.info("start to cache block inputs") - all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) - is_quantized_embedding = self._quantize_embedding_layer() - clear_memory(device_list=self.device_list) - all_q_inputs = None - if is_quantized_embedding: - all_inputs = copy.deepcopy(self.inputs) - clear_memory(self.inputs, device_list=self.device_list) - all_q_inputs = self.try_cache_inter_data_gpucpu( - all_first_block_names, self.nsamples, layer_names=layer_names - ) - self.model = mv_module_from_gpu(self.model) - clear_memory(device_list=self.device_list) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules(self.model) # self.model.hf_device_map has not been changed - logger.info("caching done") - if len(all_blocks) > 1: - pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks)) - else: - pbar = tqdm(range(0, len(all_blocks[0]), self.nblocks)) # move the alg warning outside pbar - - for block_names in all_blocks: - inputs = all_inputs[block_names[0]] - all_inputs.pop(block_names[0]) - q_inputs = None - if all_q_inputs is not None: - q_inputs = all_q_inputs[block_names[0]] - all_q_inputs.pop(block_names[0]) - - inputs, q_inputs = self._update_inputs(inputs, q_inputs) - - clear_memory(self.inputs, device_list=self.device_list) - - if "input_ids" in inputs.keys(): - total_samples = len(inputs["input_ids"]) - if total_samples < self.batch_size: - self.batch_size = total_samples - logger.warning(f"force the train batch size to {total_samples}") - - self._quantize_blocks( - self.model, - inputs, - block_names, - q_input=q_inputs if q_inputs is not None else None, - nblocks=self.nblocks, - device=self.device, - pbar=pbar, - ) - if self.immediate_packing and len(self.formats) != 1: - raise ValueError( - f"Expected exactly one packing format when 'immediate_packing' is True, " - f"but got {len(self.formats)} formats." - ) - pbar.set_description("Quantizing done") - pbar.close() - self._quantize_layers(layer_names, all_inputs) - - if is_fp8_model(self.model): - for n, m in self.model.named_modules(): - if is_fp8_linear(m): - new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device).to("cpu") - set_module(self.model, n, new_layer) - - end_time = time.time() - cost_time = end_time - self.start_time - logger.info(f"quantization tuning time {cost_time}") - - # Dump a summary - quantized_layers = [] - unquantized_layers = [] - for n, m in self.model.named_modules(): - if isinstance(m, tuple(self.supported_types)): - if check_to_quantized(m): - quantized_layers.append(n) - else: - unquantized_layers.append(n) - elif hasattr(m, "scales") or hasattr(m, "scale"): ##packing_immediately - quantized_layers.append(n) - summary_info = ( - f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} in the model" - ) - if len(unquantized_layers) > 0: - summary_info += f", {unquantized_layers} have not been quantized" - logger.info(summary_info) - - self.quantized = True - return self.model, self.layer_config - - def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: - """Quantizes specified layers based on inputs and configuration. - - Args: - layer_names (list): list of layer names to quantize. - layer_inputs (dict): Dictionary mapping layer names to input data. 
- - Returns: - None - """ - # TODO currently we take all the layers outside blocks as post block layers which is not optimal - # if there is no input for layer, we use rtn - - for layer_name in copy.deepcopy(layer_names): - if layer_name not in layer_inputs: - if self.act_bits < 16 and not self.act_dynamic: - # Activation quantization requires collected inputs - msg_prefix = ( - f"Activation max hook for layer '{layer_name}' is unavailable due to " - f"insufficient collected inputs. " - ) - if "fp8_e5m2" in self.act_data_type: - logger.warning(msg_prefix + "Please notes that unit scale is used for this layer.") - else: - logger.warning( - msg_prefix + "Static activation quantization is not supported or ineffective, " - "Skipping quantization for this layer." - ) - layer_names.remove(layer_name) - continue - logger.info(f"using rtn to quantize {layer_name}") - from auto_round.data_type import QUANT_FUNC_WITH_DTYPE - - layer = get_module(self.model, layer_name) - layer = layer.to(self.device) - if is_fp8_linear(layer): - new_layer = convert_fp8_layer_to_linear(layer, self.amp_dtype, self.device).to(self.device) - set_module(self.model, layer_name, new_layer) - layer = new_layer - - wrapper_layer = WrapperLinear( - layer, - enable_round_tuning=False, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_torch_compile=self.enable_torch_compile, - device=self.device, - disable_opt_rtn=self.disable_opt_rtn, - ) - new_layer = wrapper_layer.unwrapper({}) - set_module(self.model, layer_name, new_layer) - layer.cpu() - layer_names.remove(layer_name) - if len(layer_names) == 0: - memory_monitor.update() - memory_monitor.log_summary() - return - q_layer_inputs = None - enable_quanted_input = self.enable_quanted_input - has_gguf = False - - if hasattr(self, "formats"): - has_gguf = any(format_.is_gguf() for format_ in self.formats) - if has_gguf and self.immediate_packing: - enable_quanted_input = False - - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1 and enable_quanted_input: - dispatch_model(self.model, self.model.hf_device_map) - - if enable_quanted_input: - logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) - q_layer_inputs = self.try_cache_inter_data_gpucpu([], self.nsamples, layer_names=layer_names) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules( - self.model - ) # self.model.hf_device_map has not been changed - if not self.immediate_saving: - self.model = mv_module_from_gpu(self.model) - clear_memory(device_list=self.device_list) - quant_layer = self._quantize_layer - for layer_name in layer_names: - layer_input = layer_inputs[layer_name] - layer_input = to_device(layer_input, self.cache_device) - q_layer_input = q_layer_inputs.get(layer_name, None) if q_layer_inputs is not None else None - q_layer_input = to_device(q_layer_input, self.cache_device) - quant_layer(layer_name, layer_input, q_layer_input, device=self.device) - if self.immediate_packing: - self._immediate_pack(layer_name) - - if self.immediate_saving: - m = get_module(self.model, layer_name) - immediate_saving(self, m, name=layer_name, last_group=True) - del layer_input - clear_memory(q_layer_input, device_list=self.device_list) - memory_monitor.log_summary() + quantizers = create_quantizers(self) + return quantizers.quantize() @torch.no_grad() def _get_block_outputs( @@ -2241,244 +1540,6 @@ def _replace_forward(self): hook_handle = 
m.register_forward_hook(hook_func) self.hook_handles.append(hook_handle) - def _register_act_max_hook(self, model): - def get_act_max_hook(module, input, output): - if isinstance(input, (tuple, list)): - input = input[0] - if input.numel() == 0: - return # as no needs for act_max update - input, _, _ = reshape_pad_tensor_by_group_size(input, self.act_group_size) - act_max = torch.max(torch.abs(input), dim=-1).values - if not hasattr(module, "act_max") or module.act_max.numel() == 0: - module.act_max = act_max - else: - act_max = act_max.to(module.act_max.device) - if is_nv_fp(self.act_data_type): ## for nvfp per-tensor input_global_scale calculation usage - module.act_max = torch.max( - torch.tensor([act_max.max(), module.act_max.max()], device=act_max.device) - ) - else: - module.act_max = torch.max(act_max, module.act_max) - - hook_handles = [] - # for single layers out of blocks, like lm_head - if isinstance(model, SUPPORTED_LAYER_TYPES): - m = model - if ( - hasattr(m, "act_dynamic") - and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) - and check_to_quantized(m) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - return hook_handles - - for n, m in model.named_modules(): - if ( - hasattr(m, "act_dynamic") - and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) - and check_to_quantized(m) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - continue - - # for whole model, RTN - if n in self.layer_config: - config = self.layer_config[n] - act_dynamic = config.get("act_dynamic", True) - act_data_type = config.get("act_data_type", None) - act_bits = config.get("act_bits", 16) - if ( - config["bits"] <= 8 - and check_need_act_calibration(act_dynamic, act_data_type, act_bits) - and check_to_quantized(config) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - continue - return hook_handles - - def _quantize_layer( - self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu" - ): - """Quantize a specific layer of the model using the provided inputs. - - Args: - layer_name (str): The name of the layer to quantize. - inputs (torch.Tensor): Input data for quantization. - q_inputs (torch.Tensor, optional): Quantized input data. Defaults to None. - device (torch.device, optional): The device to use for quantization. Defaults to torch.device("cpu"). 
- - Returns: - None - """ - logger.info(f"quantizing layer {layer_name}") - layer = get_module(self.model, layer_name) - if hasattr(layer, "tuning_device"): - device = layer.tuning_device - - layer = layer.to(device) - for i in range(len(inputs)): - inputs[i] = inputs[i].to(layer.weight.dtype) - if q_inputs is not None: - q_inputs[i] = q_inputs[i].to(layer.weight.dtype) - - if self.act_bits <= 8 and check_need_act_calibration( - self.act_dynamic, - self.act_data_type, - self.act_bits, - self.static_kv_dtype, - self.static_attention_dtype, - ): - tmp_inputs = q_inputs if q_inputs is not None else inputs - hook_handles = self._register_act_max_hook(layer) - with torch.no_grad(): - for input in tmp_inputs: - layer(input) - for handle in hook_handles: - handle.remove() - - wrapper_linear = WrapperLinear( - layer, - enable_minmax_tuning=self.enable_minmax_tuning, - enable_torch_compile=self.enable_torch_compile, - device=device, - ).to(device) - round_params = [] - minmax_params = [] - for key in wrapper_linear.params.keys(): - if "min" in key or "max" in key: - minmax_params.append(wrapper_linear.params[key]) - else: - round_params.append(wrapper_linear.value) - if len(round_params) + len(minmax_params) <= 0: - dump_info = f"quantized {layer_name}" - logger.info(dump_info) - with torch.no_grad(): - unwrapper_layer(self.model, wrapper_linear, layer_name, {}) - mv_module_from_gpu(layer) - - lr = torch.tensor(self.lr) - minmax_lr = torch.tensor(self.minmax_lr) - if self.enable_minmax_tuning: - optimizer = self.optimizer( - [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 - ) - else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0) - - if self.lr_scheduler is None: - lr_schedule = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters - ) - else: - lr_schedule = copy.deepcopy(self.lr_scheduler) - nsamples = len(inputs) - last_best_iter = 0 - best_loss = torch.finfo(torch.float).max - scaler = self._get_scaler() # pylint: disable=assignment-from-none - init_loss = None - gradient_accumulate_steps = self.batch_size # Force to low gpu - - total_loss = 0 - num_elm = 1 - mse_reduction = "mean" - if gradient_accumulate_steps != 1: - mse_reduction = "sum" - mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) - batch_size = 1 # Force to low gpu - global_batch_size = self.batch_size * gradient_accumulate_steps - global_batch_size = min(nsamples, global_batch_size) - if gradient_accumulate_steps != 1 and not self.attention_mask: - whole_indices = torch.arange(global_batch_size) - if q_inputs is not None: - num_elm = self._get_current_num_elm(q_inputs, whole_indices) - else: - num_elm = self._get_current_num_elm(inputs, whole_indices) - - index_sampler = IndexSampler(nsamples, global_batch_size) - - for i in range(self.iters): - total_loss = 0 - global_indices = index_sampler.next_batch() - if self.attention_mask: - num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) - - for tmp_step in range(gradient_accumulate_steps): - indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - if q_inputs is not None: - current_input = [q_inputs[i] for i in indices] - current_input = torch.cat(current_input, dim=0).to(device) - org_input = [inputs[i] for i in indices] - org_input = torch.cat(org_input, dim=0).to(device) - else: - current_input = [inputs[i] for i in indices] - current_input = torch.cat(current_input, dim=0).to(device) - org_input = current_input 
- with torch.no_grad(): - current_output = layer(org_input) - autocast_ctx = ( - nullcontext() - if not self.amp - else autocast(device_type=str(device).split(":")[0], dtype=self.amp_dtype) - ) - if self.attention_mask: - tmp_attention_mask = [self.attention_mask[i] for i in indices] - tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) - tmp_attention_mask.unsqueeze_(-1) - - with autocast_ctx: - output_q = wrapper_linear(current_input) # pylint: disable=not-callable - loss = mse_loss( # pylint: disable=not-callable - (output_q * tmp_attention_mask).to(torch.float32), - (current_output * tmp_attention_mask).to(torch.float32), - ) - - else: - with autocast_ctx: - output_q = wrapper_linear(current_input) # pylint: disable=not-callable - loss = mse_loss( # pylint: disable=not-callable - output_q.to(torch.float32), - current_output.to(torch.float32), # mul 1.0 will copy the output - ) - - num_elm = 1 if num_elm <= 0 else num_elm - total_loss += loss.item() / num_elm - - self._scale_loss_and_backward(scaler, loss) - if i == 0: - init_loss = total_loss - - if total_loss < best_loss: - best_loss = total_loss - if not self.not_use_best_mse: - best_params = collect_best_params(wrapper_linear, self.cache_device) - last_best_iter = i - if self.not_use_best_mse and i == self.iters - 1: - best_params = collect_best_params(wrapper_linear, self.cache_device) - - if not self.not_use_best_mse: - if 0 < self.dynamic_max_gap <= i - last_best_iter: - break - self._step(scaler, optimizer, lr_schedule) - - last_loss = total_loss - best_iter = self.iters - if not self.not_use_best_mse: - last_loss = best_loss - best_iter = last_best_iter - with torch.no_grad(): - unwrapper_layer(self.model, wrapper_linear, layer_name, best_params) - mv_module_from_gpu(layer) - dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" - logger.info(dump_info) - - def _get_current_output(self, output: list[torch.Tensor], indices: list[int]) -> torch.Tensor: - current_output = [output[x] for x in indices] - current_output = torch.cat(current_output, dim=self.batch_dim) - return current_output - def _get_current_q_output( self, block: torch.nn.Module, @@ -2507,13 +1568,6 @@ def _get_current_num_elm( current_input_ids = [input_ids[i] for i in indices] return sum(id.numel() for id in current_input_ids) - def _get_non_zero_cnt(self, tensor: list[torch.Tensor], indices: list[int]) -> int: - current_tensors = [tensor[i] for i in indices] - non_zero_cnt = 0 - for t in current_tensors: - non_zero_cnt += torch.count_nonzero(t).item() - return non_zero_cnt - def quantize_block( self, block: torch.nn.Module, @@ -2536,8 +1590,19 @@ def quantize_block( self.normalize_decoding_layer_inputs_(inputs) block_inputs = self.inputs[self.quant_block_list[0][0]] decoding_layer_first_input_name = "hidden_states" - input_ids, input_others = self._preprocess_block_inputs(block_inputs, decoding_layer_first_input_name) - return self._quantize_block(block, input_ids, input_others, q_input, device, auto_offload) + from auto_round.quantizers.algs.auto_round import AutoRoundQuantizer + from auto_round.quantizers.utils import preprocess_block_inputs + + input_ids, input_others = preprocess_block_inputs( + block_inputs, + device_list=self.device_list, + first_input_name=decoding_layer_first_input_name, + amp=self.amp, + amp_dtype=self.amp_dtype, + cache_device=self.cache_device, + diffusion=self.diffusion, + ) + return AutoRoundQuantizer(self).quantize_block(block, input_ids, input_others, q_input, 
device, auto_offload) def _get_loss( self, @@ -2568,384 +1633,6 @@ def _get_loss( return loss - def _quantize_block( - self, - block: torch.nn.Module, - input_ids: Union[list[torch.Tensor], dict], - input_others: dict, - q_input: Union[torch.Tensor, dict, None] = None, - device: Union[str, torch.device] = "cpu", - auto_offload=True, - ): - """Quantize the weights of a given block of the model. - - Args: - block: The block of the model to be quantized. - input_ids: The input tensor containing tokenized input ids. - input_others: A dictionary containing additional input data. - q_input: The quantized input tensor. - device: The device for quantization. - - Returns: - Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) - """ - if is_fp8_model(self.model): - for n, m in block.named_modules(): - if is_fp8_linear(m): - new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device).to(device) - set_module(block, n, new_layer) - - if auto_offload: - # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights - # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: - card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device - ) - else: - block = block.to(device) - card_0_in_high_risk, loss_device = False, device - else: - card_0_in_high_risk, loss_device = False, device - - if len(self.device_list) > 1 and auto_offload: - for n, m in block.named_modules(): - if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): - continue - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - hook = AlignDevicesHook(m.tuning_device, io_same_device=True) - add_hook_to_module(m, hook, True) - - if q_input is None: - hook_handles = self._register_act_max_hook(block) - - output = self._get_block_outputs( - block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device - ) - - for handle in hook_handles: - handle.remove() - else: - output = self._get_block_outputs( - block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device - ) - hook_handles = self._register_act_max_hook(block) - if hook_handles: - self._get_block_outputs( - block, - q_input, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - self.cache_device, - save_output=False, - ) - - for handle in hook_handles: - handle.remove() - - if q_input is not None: - if input_ids is not q_input: - clear_memory(input_ids, device_list=self.device_list) - else: - clear_memory(device_list=self.device_list) - input_ids = q_input - - quantized_layer_names, unquantized_layer_names = self.wrapper_block( - block, - self.enable_minmax_tuning, - self.enable_norm_bias_tuning, - enable_torch_compile=self.enable_torch_compile, - device=device, - ) - if is_nv_fp(self.data_type): # enable qkv and moe structure global_scale fuse - from auto_round.data_type.utils import update_fused_layer_global_scales - - modules = block.modules() - for module in modules: - update_fused_layer_global_scales(module) - round_params = [] - minmax_params = [] - for n, m in block.named_modules(): - if hasattr(m, "orig_layer"): - for key in m.params.keys(): - if "min" in key or "max" in key: - minmax_params.append(m.params[key]) - else: - round_params.append(m.params[key]) - - lr = 
torch.tensor(self.lr) - minmax_lr = torch.tensor(self.minmax_lr) - is_adam = "adam" in self.__class__.__name__.lower() - - extra_kwargs = {} if is_adam else {"momentum": self.momentum} - - if self.enable_minmax_tuning: - params = [ - {"params": round_params}, - {"params": minmax_params, "lr": minmax_lr}, - ] - else: - params = round_params - - optimizer = self.optimizer( - params, - lr=lr, - weight_decay=0, - **extra_kwargs, - ) - - if len(round_params) + len(minmax_params) <= 0: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - f"layers in the block" - ) - logger.info(dump_info) - unwrapper_block(block, {}) - mv_module_from_gpu(block) - return output, output - - if self.lr_scheduler is None: - lr_schedule = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters - ) - else: - lr_schedule = copy.deepcopy(self.lr_scheduler) - - if isinstance(input_ids, dict): # input_ids of Flux is dict - nsamples = len(input_ids["hidden_states"]) - else: - nsamples = len(input_ids) - last_best_iter = 0 - best_loss = torch.finfo(torch.float).max - num_elm = 1 - mse_reduction = "mean" - if self.gradient_accumulate_steps != 1: - mse_reduction = "sum" - mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) - scaler = self._get_scaler() # pylint: disable=assignment-from-none - init_loss = None - best_params = {} - total_loss = 0 - global_batch_size = self.batch_size * self.gradient_accumulate_steps - global_batch_size = min(nsamples, global_batch_size) - # We assume the block input and output shape is same - if self.gradient_accumulate_steps != 1 and not self.attention_mask: - whole_indices = torch.arange(global_batch_size) - num_elm = self._get_current_num_elm(input_ids, whole_indices) - - index_sampler = IndexSampler(nsamples, global_batch_size) - batch_size = self.batch_size - for i in range(self.iters): - if self.enable_alg_ext and self.data_type.endswith("dq"): - for n, m in block.named_modules(): - m.cur_iter = i - total_loss = 0 - global_indices = index_sampler.next_batch() - if self.attention_mask: - num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) - - for tmp_step in range(self.gradient_accumulate_steps): - indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - current_output = self._get_current_output(output, indices) - current_output = to_device(current_output, loss_device) - output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device) - loss = self._get_loss(output_q, current_output, indices, mse_loss, device) - num_elm = 1 if num_elm <= 0 else num_elm - total_loss += loss.item() / num_elm - - if self.low_gpu_mem_usage and card_0_in_high_risk: - # clear memory to avoid OOM due to memory fragmentation - clear_memory_if_reached_threshold(threshold=0.5, device_list=self.device_list) - - self._scale_loss_and_backward(scaler, loss) - - if self.low_gpu_mem_usage and card_0_in_high_risk: - # clear memory to avoid OOM due to memory fragmentation - clear_memory_if_reached_threshold(threshold=0.8, device_list=self.device_list) - - if i == 0: - init_loss = total_loss - - if total_loss < best_loss: - best_loss = total_loss - if not self.not_use_best_mse: - best_params = collect_best_params(block, self.cache_device) - # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) - - last_best_iter = i - if self.not_use_best_mse and i == self.iters - 1: - best_params = 
collect_best_params(block, self.cache_device) - - if not self.not_use_best_mse: - if 0 < self.dynamic_max_gap <= i - last_best_iter: - break - self._step(scaler, optimizer, lr_schedule) - - last_loss = total_loss - best_iter = self.iters - if not self.not_use_best_mse: - last_loss = best_loss - best_iter = last_best_iter - if self.iters > 0: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" - ) - else: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - "layers in the block" - ) - - if self.low_gpu_mem_usage: - clear_memory(device_list=self.device_list) # clear cached memory during training - if len(unquantized_layer_names) != 0: - logger.info(f"{unquantized_layer_names} have not been quantized") - with torch.no_grad(): - unwrapper_block(block, best_params) - - if is_nv_fp(self.act_data_type): - # enable moe experts act_max automatic generation for WrapperWALayer - set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") - - if self.enable_quanted_input: - q_outputs = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - cache_device=self.cache_device, - ) - - if len(self.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - - clear_memory(input_ids, device_list=self.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return q_outputs, output - else: - if len(self.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - clear_memory(input_ids, device_list=self.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return None, output - - def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[torch.Tensor, dict]: - input_ids = inputs[first_input_name] - inputs.pop(first_input_name, None) - input_others = inputs - return input_ids, input_others - - def _preprocess_block_inputs(self, inputs, first_input_name="input_ids"): - input_ids, input_others = self._split_inputs(inputs, first_input_name) - clear_memory(device_list=self.device_list) - input_ids = to_device(input_ids, self.cache_device) - input_others = to_device(input_others, self.cache_device) - # As in calibration phase, we may use bf16 for calibration due to low_gpu_memory usage - - tmp_dtype = self.amp_dtype if self.amp else torch.float32 - input_ids = to_dtype(input_ids, tmp_dtype) - - for key in input_others.keys(): - if isinstance(input_others[key], torch.Tensor) and ( - input_others[key].dtype == torch.float16 or input_others[key].dtype == torch.bfloat16 - ): - input_others[key] = input_others[key].to(tmp_dtype) - elif isinstance(input_others[key], list): - for i in range(len(input_others[key])): - to_dtype(input_others[key][i], tmp_dtype) - return input_ids, input_others - - def _quantize_blocks( - self, - model: torch.nn.Module, - inputs: dict, - block_names: list, - q_input: torch.Tensor = None, - nblocks: int = 1, - device: str = "cpu", - pbar: tqdm = None, - ): - """Quantize and dequantize the weights of the specified blocks in the model. - - Args: - model: The PyTorch model to be quantized. 
- inputs: The input data for quantization. - block_names: The names of the blocks to be quantized and dequantized. - nblocks: The number of blocks to quantize and dequantize. - device: The device for quantization and dequantization. - - Returns: - None - """ - clear_memory(device_list=self.device_list) - for n, m in model.named_parameters(): - m.requires_grad_(False) - - input_ids, input_others = self._preprocess_block_inputs(inputs) - - if pbar is None: - pbar = tqdm(range(0, len(block_names), nblocks)) - - for i in range(0, len(block_names), nblocks): - if i != 0: - pbar.update(1) - if nblocks == 1: - n = block_names[i] - pbar.set_description(f"Quantizing {n}") - m = get_module(model, n) - else: - names = block_names[i : min(i + nblocks, len(block_names))] - pbar.set_description(f"Quantizing [{i + 1}-{min(i + nblocks, len(block_names))}]/{len(block_names)}") - modules = [get_module(model, n) for n in names] - m = WrapperMultiblock(modules) - - m.config = model.config if hasattr(model, "config") else None - q_input, input_ids = self._quantize_block( - m, - input_ids, - input_others, - q_input=q_input, - device=device, - ) - if hasattr(model, "config"): - del m.config - if self.immediate_packing: - for _, tmp_m in m.named_modules(): - if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): - continue - self._immediate_pack(tmp_m.tmp_name) - - if self.immediate_saving: - last_group = (i + nblocks) >= len(block_names) - immediate_saving(self, m, last_group=last_group) - if pbar is not None: - pbar.update(1) - - if not self.immediate_saving: - self.model = mv_module_from_gpu(self.model) - for n, m in self.model.named_modules(): - if hasattr(m, "name"): - delattr(m, "name") - - del q_input - del input_ids - del input_others - del inputs - - clear_memory(device_list=self.device_list) - def save_quantized( self, output_dir: str = None, @@ -3008,30 +1695,6 @@ def save_quantized( else: return compressed_model - def _get_quantized_layer_names_outside_blocks(self) -> list: - """Gets the names of quantized layers outside blocks in the model. - - Returns: - list: List of layer names outside blocks. 
- """ - if self.layer_config is None or len(self.layer_config) == 0: - return [] - - layer_names = [] - all_layers_in_block = get_layer_names_in_block(self.model, self.supported_types, self.quant_block_list) - - for key in self.layer_config.keys(): - if key in all_layers_in_block: - continue - layer = get_module(self.model, key) - if layer is None: - logger.error(f"could not find layer {key} in the model, exit...") - exit(-1) - if type(layer) in self.supported_types and check_to_quantized(self.layer_config[key]): - layer_names.append(key) - - return layer_names - def _set_amp_dtype(self) -> None: """Sets the automatic mixed precision (AMP) data type for the model based on the device and configuration.""" self.amp_dtype = torch.bfloat16 diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 50e807320..0bfbf038c 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -172,18 +172,6 @@ def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str} return inputs, q_inputs - def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[dict, dict]: - input_id_str = [key for key in inputs.keys() if "hidden_state" in key] - input_ids = {k: inputs.pop(k, None) for k in input_id_str} - input_others = inputs - return input_ids, input_others - - def _get_current_output(self, output: dict, indices: list[int]) -> torch.Tensor: - assert "hidden_states" in output - current_output = [output["hidden_states"][x] for x in indices] - current_output = torch.cat(current_output, dim=self.batch_dim) - return current_output - def _get_current_q_output( self, block: torch.nn.Module, diff --git a/auto_round/quantizers/__init__.py b/auto_round/quantizers/__init__.py new file mode 100644 index 000000000..87ac77b62 --- /dev/null +++ b/auto_round/quantizers/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_round.quantizers.entrypoint import create_quantizers diff --git a/auto_round/quantizers/algs/auto_round.py b/auto_round/quantizers/algs/auto_round.py new file mode 100644 index 000000000..da02d0397 --- /dev/null +++ b/auto_round/quantizers/algs/auto_round.py @@ -0,0 +1,888 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import time +import traceback +from contextlib import nullcontext +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import accelerate +import torch +from accelerate.big_modeling import dispatch_model, infer_auto_device_map +from torch import autocast +from tqdm import tqdm + +from auto_round.compressors.utils import ( + IndexSampler, + check_need_act_calibration, + collect_best_params, + immediate_saving, + is_nv_fp, +) +from auto_round.logger import logger +from auto_round.quantizers.algs.base import AlgsBaseQuantizer +from auto_round.quantizers.utils import ( + get_non_zero_cnt, + get_quantized_layer_names_outside_blocks, + preprocess_block_inputs, + quantize_embedding_layer, + register_act_max_hook, +) +from auto_round.utils import ( + check_to_quantized, + clear_memory, + convert_fp8_layer_to_linear, + get_block_names, + get_module, + is_auto_device_mapping, + is_fp8_linear, + is_fp8_model, + memory_monitor, + mv_module_from_gpu, + set_amax_for_all_moe_layers, + set_module, + to_device, +) +from auto_round.utils.device import ( + clear_memory_if_reached_threshold, + set_auto_device_map_for_block_with_tuning, +) +from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class ARQuantizer(AlgsBaseQuantizer): + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + + def pre_quantize(self, *args, **kwargs): + return super().pre_quantize(*args, **kwargs) + + def quantize(self, *args, **kwargs): + if bool(self.compressor.quant_block_list): + all_blocks = self.compressor.quant_block_list + else: + all_blocks = get_block_names(self.compressor.model) + + if len(all_blocks) == 0: + logger.warning("could not find blocks, exit with original model") + return self.compressor.model, self.compressor.layer_config + + if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: + self.compressor.model = self.compressor.model.to(self.compressor.amp_dtype) + + layer_names = get_quantized_layer_names_outside_blocks( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + supported_types=self.compressor.supported_types, + quant_block_list=self.compressor.quant_block_list, + ) + start_time = time.time() + all_first_block_names = [block[0] for block in all_blocks] + if len(layer_names) > 0: + logger.info( + "Starting to cache block inputs. 
This may be slow due to external block layers: %s", layer_names + ) + else: + logger.info("start to cache block inputs") + + # TODO: refactor this + all_inputs = self.compressor.try_cache_inter_data_gpucpu( + all_first_block_names, self.compressor.nsamples, layer_names=layer_names + ) + + is_quantized_embedding = quantize_embedding_layer( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + scale_dtype=self.compressor.data_type, + disable_opt_rtn=self.compressor.disable_opt_rtn, + device=self.compressor.device, + device_list=self.compressor.device_list, + ) + clear_memory(device_list=self.compressor.device_list) + all_q_inputs = None + if is_quantized_embedding: + all_inputs = copy.deepcopy(self.compressor.inputs) + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + # TODO: refactor this + all_q_inputs = self.compressor.try_cache_inter_data_gpucpu( + all_first_block_names, self.compressor.nsamples, layer_names=layer_names + ) + self.compressor.model = mv_module_from_gpu(self.compressor.model) + clear_memory(device_list=self.compressor.device_list) + + if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules( + self.compressor.model + ) # self.compressor.model.hf_device_map has not been changed + logger.info("caching done") + if len(all_blocks) > 1: + pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.compressor.nblocks)) + else: + pbar = tqdm(range(0, len(all_blocks[0]), self.compressor.nblocks)) # move the alg warning outside pbar + + for block_names in all_blocks: + inputs = all_inputs[block_names[0]] + all_inputs.pop(block_names[0]) + q_inputs = None + if all_q_inputs is not None: + q_inputs = all_q_inputs[block_names[0]] + all_q_inputs.pop(block_names[0]) + + # TODO: refactor this + inputs, q_inputs = self.compressor._update_inputs(inputs, q_inputs) + + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + + if "input_ids" in inputs.keys(): + total_samples = len(inputs["input_ids"]) + if total_samples < self.compressor.batch_size: + self.compressor.batch_size = total_samples + logger.warning(f"force the train batch size to {total_samples}") + + self._quantize_blocks( + self.compressor.model, + inputs, + block_names, + q_input=q_inputs if q_inputs is not None else None, + nblocks=self.compressor.nblocks, + device=self.compressor.device, + pbar=pbar, + ) + if self.compressor.immediate_packing and len(self.compressor.formats) != 1: + raise ValueError( + f"Expected exactly one packing format when 'immediate_packing' is True, " + f"but got {len(self.compressor.formats)} formats." 
+ ) + pbar.set_description("Quantizing done") + pbar.close() + self._quantize_layers(layer_names, all_inputs) + + if is_fp8_model(self.compressor.model): + for n, m in self.compressor.model.named_modules(): + if is_fp8_linear(m): + new_layer = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device).to( + "cpu" + ) + set_module(self.compressor.model, n, new_layer) + + end_time = time.time() + cost_time = end_time - start_time + logger.info(f"quantization tuning time {cost_time}") + + # Dump a summary + quantized_layers = [] + unquantized_layers = [] + for n, m in self.compressor.model.named_modules(): + if isinstance(m, tuple(self.compressor.supported_types)): + if check_to_quantized(m): + quantized_layers.append(n) + else: + unquantized_layers.append(n) + elif hasattr(m, "scales") or hasattr(m, "scale"): ##packing_immediately + quantized_layers.append(n) + summary_info = ( + f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} in the model" + ) + if len(unquantized_layers) > 0: + summary_info += f", {unquantized_layers} have not been quantized" + logger.info(summary_info) + + self.compressor.quantized = True + return self.compressor.model, self.compressor.layer_config + + def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: + """Quantizes specified layers based on inputs and configuration. + + Args: + layer_names (list): list of layer names to quantize. + layer_inputs (dict): Dictionary mapping layer names to input data. + + Returns: + None + """ + # TODO currently we take all the layers outside blocks as post block layers which is not optimal + # if there is no input for layer, we use rtn + + for layer_name in copy.deepcopy(layer_names): + if layer_name not in layer_inputs: + if self.compressor.act_bits < 16 and not self.compressor.act_dynamic: + # Activation quantization requires collected inputs + msg_prefix = ( + f"Activation max hook for layer '{layer_name}' is unavailable due to " + f"insufficient collected inputs. " + ) + if "fp8_e5m2" in self.compressor.act_data_type: + logger.warning(msg_prefix + "Please notes that unit scale is used for this layer.") + else: + logger.warning( + msg_prefix + "Static activation quantization is not supported or ineffective, " + "Skipping quantization for this layer." 
+ ) + layer_names.remove(layer_name) + continue + logger.info(f"using rtn to quantize {layer_name}") + from auto_round.data_type import QUANT_FUNC_WITH_DTYPE + + layer = get_module(self.compressor.model, layer_name) + layer = layer.to(self.compressor.device) + if is_fp8_linear(layer): + new_layer = convert_fp8_layer_to_linear( + layer, self.compressor.amp_dtype, self.compressor.device + ).to(self.compressor.device) + set_module(self.compressor.model, layer_name, new_layer) + layer = new_layer + + wrapper_layer = WrapperLinear( + layer, + enable_round_tuning=False, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + device=self.compressor.device, + disable_opt_rtn=self.compressor.disable_opt_rtn, + ) + new_layer = wrapper_layer.unwrapper({}) + set_module(self.compressor.model, layer_name, new_layer) + layer.cpu() + layer_names.remove(layer_name) + if len(layer_names) == 0: + memory_monitor.update() + memory_monitor.log_summary() + return + q_layer_inputs = None + enable_quanted_input = self.compressor.enable_quanted_input + has_gguf = False + + if hasattr(self.compressor, "formats"): + has_gguf = any(format_.is_gguf() for format_ in self.compressor.formats) + if has_gguf and self.compressor.immediate_packing: + enable_quanted_input = False + + if ( + hasattr(self.compressor.model, "hf_device_map") + and len(self.compressor.model.hf_device_map) > 1 + and enable_quanted_input + ): + dispatch_model(self.compressor.model, self.compressor.model.hf_device_map) + + if enable_quanted_input: + logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) + # TODO: refactor this + q_layer_inputs = self.compressor.try_cache_inter_data_gpucpu( + [], self.compressor.nsamples, layer_names=layer_names + ) + if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules( + self.compressor.model + ) # self.compressor.model.hf_device_map has not been changed + if not self.compressor.immediate_saving: + self.compressor.model = mv_module_from_gpu(self.compressor.model) + clear_memory(device_list=self.compressor.device_list) + for layer_name in layer_names: + layer_input = layer_inputs[layer_name] + layer_input = to_device(layer_input, self.compressor.cache_device) + q_layer_input = q_layer_inputs.get(layer_name, None) if q_layer_inputs is not None else None + q_layer_input = to_device(q_layer_input, self.compressor.cache_device) + self.quantize_layer(layer_name, layer_input, q_layer_input, device=self.compressor.device) + if self.compressor.immediate_packing: + self.compressor._immediate_pack(layer_name) + + if self.compressor.immediate_saving: + m = get_module(self.compressor.model, layer_name) + immediate_saving(self.compressor, m, name=layer_name, last_group=True) + del layer_input + clear_memory(q_layer_input, device_list=self.compressor.device_list) + memory_monitor.log_summary() + + def _quantize_blocks( + self, + model: torch.nn.Module, + inputs: dict, + block_names: list, + q_input: torch.Tensor = None, + nblocks: int = 1, + device: str = "cpu", + pbar: tqdm = None, + ): + """Quantize and dequantize the weights of the specified blocks in the model. + + Args: + model: The PyTorch model to be quantized. + inputs: The input data for quantization. + block_names: The names of the blocks to be quantized and dequantized. + nblocks: The number of blocks to quantize and dequantize. 
+ device: The device for quantization and dequantization. + + Returns: + None + """ + clear_memory(device_list=self.compressor.device_list) + for n, m in model.named_parameters(): + m.requires_grad_(False) + + input_ids, input_others = preprocess_block_inputs( + inputs, + device_list=self.compressor.device_list, + first_input_name="input_ids", + amp=self.compressor.amp, + amp_dtype=self.compressor.amp_dtype, + cache_device=self.compressor.cache_device, + diffusion=self.compressor.diffusion, + ) + + if pbar is None: + pbar = tqdm(range(0, len(block_names), nblocks)) + + for i in range(0, len(block_names), nblocks): + if i != 0: + pbar.update(1) + if nblocks == 1: + n = block_names[i] + pbar.set_description(f"Quantizing {n}") + m = get_module(model, n) + else: + names = block_names[i : min(i + nblocks, len(block_names))] + pbar.set_description(f"Quantizing [{i + 1}-{min(i + nblocks, len(block_names))}]/{len(block_names)}") + modules = [get_module(model, n) for n in names] + m = WrapperMultiblock(modules) + + m.config = model.config if hasattr(model, "config") else None + q_input, input_ids = self.quantize_block( + m, + input_ids, + input_others, + q_input=q_input, + device=device, + ) + if hasattr(model, "config"): + del m.config + if self.compressor.immediate_packing: + for _, tmp_m in m.named_modules(): + if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): + continue + self.compressor._immediate_pack(tmp_m.tmp_name) + + if self.compressor.immediate_saving: + last_group = (i + nblocks) >= len(block_names) + immediate_saving(self.compressor, m, last_group=last_group) + if pbar is not None: + pbar.update(1) + + if not self.compressor.immediate_saving: + self.compressor.model = mv_module_from_gpu(self.compressor.model) + for n, m in self.compressor.model.named_modules(): + if hasattr(m, "name"): + delattr(m, "name") + + del q_input + del input_ids + del input_others + del inputs + + clear_memory(device_list=self.compressor.device_list) + + def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu"): + """Quantize a specific layer of the model using the provided inputs. + + Args: + layer_name (str): The name of the layer to quantize. + inputs (torch.Tensor): Input data for quantization. + q_inputs (torch.Tensor, optional): Quantized input data. Defaults to None. + device (torch.device, optional): The device to use for quantization. Defaults to torch.device("cpu"). 
+ + Returns: + None + """ + logger.info(f"quantizing layer {layer_name}") + layer = get_module(self.compressor.model, layer_name) + if hasattr(layer, "tuning_device"): + device = layer.tuning_device + + layer = layer.to(device) + for i in range(len(inputs)): + inputs[i] = inputs[i].to(layer.weight.dtype) + if q_inputs is not None: + q_inputs[i] = q_inputs[i].to(layer.weight.dtype) + + if self.compressor.act_bits <= 8 and check_need_act_calibration( + self.compressor.act_dynamic, + self.compressor.act_data_type, + self.compressor.act_bits, + self.compressor.static_kv_dtype, + self.compressor.static_attention_dtype, + ): + tmp_inputs = q_inputs if q_inputs is not None else inputs + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + with torch.no_grad(): + for input in tmp_inputs: + layer(input) + for handle in hook_handles: + handle.remove() + + wrapper_linear = WrapperLinear( + layer, + enable_minmax_tuning=self.compressor.enable_minmax_tuning, + enable_torch_compile=self.compressor.enable_torch_compile, + device=device, + ).to(device) + round_params = [] + minmax_params = [] + for key in wrapper_linear.params.keys(): + if "min" in key or "max" in key: + minmax_params.append(wrapper_linear.params[key]) + else: + round_params.append(wrapper_linear.value) + if len(round_params) + len(minmax_params) <= 0: + dump_info = f"quantized {layer_name}" + logger.info(dump_info) + with torch.no_grad(): + unwrapper_layer(self.compressor.model, wrapper_linear, layer_name, {}) + mv_module_from_gpu(layer) + + lr = torch.tensor(self.compressor.lr) + minmax_lr = torch.tensor(self.compressor.minmax_lr) + if self.compressor.enable_minmax_tuning: + optimizer = self.optimizer( + [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 + ) + else: + optimizer = self.optimizer(round_params, lr=lr, weight_decay=0) + + if self.compressor.lr_scheduler is None: + lr_schedule = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.compressor.iters + ) + else: + lr_schedule = copy.deepcopy(self.compressor.lr_scheduler) + nsamples = len(inputs) + last_best_iter = 0 + best_loss = torch.finfo(torch.float).max + scaler = self.compressor._get_scaler() # pylint: disable=assignment-from-none + init_loss = None + gradient_accumulate_steps = self.compressor.batch_size # Force to low gpu + total_loss = 0 + num_elm = 1 + mse_reduction = "mean" + if gradient_accumulate_steps != 1: + mse_reduction = "sum" + mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) + batch_size = 1 # Force to low gpu + global_batch_size = self.compressor.batch_size * gradient_accumulate_steps + global_batch_size = min(nsamples, global_batch_size) + if gradient_accumulate_steps != 1 and not self.compressor.attention_mask: + whole_indices = torch.arange(global_batch_size) + if q_inputs is not None: + # Todo: refactor this + num_elm = self.compressor._get_current_num_elm(q_inputs, whole_indices) + else: + num_elm = self.compressor._get_current_num_elm(inputs, whole_indices) + + index_sampler = IndexSampler(nsamples, global_batch_size) + + for i in range(self.compressor.iters): + total_loss = 0 + global_indices = index_sampler.next_batch() + if self.compressor.attention_mask: + num_elm = get_non_zero_cnt(self.compressor.attention_mask, global_indices) + + for tmp_step in range(gradient_accumulate_steps): + 
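# Editor's note: illustrative sketch, not part of the patch. The accumulation loop above
# switches MSELoss to reduction="sum" and divides the tracked loss by `num_elm` so that
# summing the per-micro-batch losses reproduces the mean-squared error over the whole
# accumulated batch (the quantity used for best-iteration selection). A minimal,
# self-contained demonstration of that identity:
import torch

torch.manual_seed(0)
pred, target = torch.randn(8, 4), torch.randn(8, 4)

mean_mse = torch.nn.MSELoss(reduction="mean")(pred, target)

sum_mse = torch.nn.MSELoss(reduction="sum")
num_elm = target.numel()
accumulated = sum(sum_mse(pred[i : i + 2], target[i : i + 2]) for i in range(0, 8, 2)) / num_elm

assert torch.allclose(mean_mse, accumulated)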
indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + if q_inputs is not None: + current_input = [q_inputs[i] for i in indices] + current_input = torch.cat(current_input, dim=0).to(device) + org_input = [inputs[i] for i in indices] + org_input = torch.cat(org_input, dim=0).to(device) + else: + current_input = [inputs[i] for i in indices] + current_input = torch.cat(current_input, dim=0).to(device) + org_input = current_input + with torch.no_grad(): + current_output = layer(org_input) + autocast_ctx = ( + nullcontext() + if not self.compressor.amp + else autocast(device_type=str(device).split(":")[0], dtype=self.compressor.amp_dtype) + ) + if self.compressor.attention_mask: + tmp_attention_mask = [self.compressor.attention_mask[i] for i in indices] + tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) + tmp_attention_mask.unsqueeze_(-1) + + with autocast_ctx: + output_q = wrapper_linear(current_input) # pylint: disable=not-callable + loss = mse_loss( # pylint: disable=not-callable + (output_q * tmp_attention_mask).to(torch.float32), + (current_output * tmp_attention_mask).to(torch.float32), + ) + + else: + with autocast_ctx: + output_q = wrapper_linear(current_input) # pylint: disable=not-callable + loss = mse_loss( # pylint: disable=not-callable + output_q.to(torch.float32), + current_output.to(torch.float32), # mul 1.0 will copy the output + ) + + num_elm = 1 if num_elm <= 0 else num_elm + total_loss += loss.item() / num_elm + + self.compressor.scale_loss_and_backward(scaler, loss) + if i == 0: + init_loss = total_loss + + if total_loss < best_loss: + best_loss = total_loss + if not self.compressor.not_use_best_mse: + best_params = collect_best_params(wrapper_linear, self.compressor.cache_device) + last_best_iter = i + if self.compressor.not_use_best_mse and i == self.compressor.iters - 1: + best_params = collect_best_params(wrapper_linear, self.compressor.cache_device) + + if not self.compressor.not_use_best_mse: + if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: + break + self.compressor._step(scaler, optimizer, lr_schedule) + + last_loss = total_loss + best_iter = self.compressor.iters + if not self.compressor.not_use_best_mse: + last_loss = best_loss + best_iter = last_best_iter + with torch.no_grad(): + unwrapper_layer(self.compressor.model, wrapper_linear, layer_name, best_params) + mv_module_from_gpu(layer) + dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" + logger.info(dump_info) + + def quantize_block( + self, + block: torch.nn.Module, + input_ids: Union[list[torch.Tensor], dict], + input_others: dict, + q_input: Union[torch.Tensor, dict, None] = None, + device: Union[str, torch.device] = "cpu", + auto_offload=True, + ): + """Quantize the weights of a given block of the model. + + Args: + block: The block of the model to be quantized. + input_ids: The input tensor containing tokenized input ids. + input_others: A dictionary containing additional input data. + q_input: The quantized input tensor. + device: The device for quantization. 
+ + Returns: + Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) + """ + if is_fp8_model(self.compressor.model): + for n, m in block.named_modules(): + if is_fp8_linear(m): + new_layer = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device).to( + device + ) + set_module(block, n, new_layer) + + if auto_offload: + # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights + # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk + if is_auto_device_mapping(self.compressor.device_map) and len(self.compressor.device_list) > 1: + card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( + block, + self.compressor.device_map, + input_ids, + self.compressor.low_gpu_mem_usage, + self.compressor.batch_size, + device, + ) + else: + block = block.to(device) + card_0_in_high_risk, loss_device = False, device + else: + card_0_in_high_risk, loss_device = False, device + + if len(self.compressor.device_list) > 1 and auto_offload: + for n, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + hook = AlignDevicesHook(m.tuning_device, io_same_device=True) + add_hook_to_module(m, hook, True) + + if q_input is None: + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + + # TODO: refactor this part + output = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + ) + + for handle in hook_handles: + handle.remove() + else: + # TODO: refactor this part + output = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + ) + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + if hook_handles: + # TODO: refactor this part + self.compressor._get_block_outputs( + block, + q_input, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + save_output=False, + ) + + for handle in hook_handles: + handle.remove() + + if q_input is not None: + if input_ids is not q_input: + clear_memory(input_ids, device_list=self.compressor.device_list) + else: + clear_memory(device_list=self.compressor.device_list) + input_ids = q_input + + quantized_layer_names, unquantized_layer_names = self.compressor.wrapper_block( + block, + self.compressor.enable_minmax_tuning, + self.compressor.enable_norm_bias_tuning, + enable_torch_compile=self.compressor.enable_torch_compile, + device=device, + ) + if is_nv_fp(self.compressor.data_type): # enable qkv and moe structure global_scale fuse + from auto_round.data_type.utils import update_fused_layer_global_scales + + modules = block.modules() + for module in modules: + update_fused_layer_global_scales(module) + round_params = [] + minmax_params = [] + for n, m in block.named_modules(): + if hasattr(m, "orig_layer"): + for key in m.params.keys(): + if "min" in key or "max" in key: + 
minmax_params.append(m.params[key]) + else: + round_params.append(m.params[key]) + + lr = torch.tensor(self.compressor.lr) + minmax_lr = torch.tensor(self.compressor.minmax_lr) + is_adam = "adam" in self.compressor.__class__.__name__.lower() + + extra_kwargs = {} if is_adam else {"momentum": self.compressor.momentum} + + if self.compressor.enable_minmax_tuning: + params = [ + {"params": round_params}, + {"params": minmax_params, "lr": minmax_lr}, + ] + else: + params = round_params + + optimizer = self.compressor.optimizer( + params, + lr=lr, + weight_decay=0, + **extra_kwargs, + ) + + if len(round_params) + len(minmax_params) <= 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block" + ) + logger.info(dump_info) + unwrapper_block(block, {}) + mv_module_from_gpu(block) + return output, output + + if self.compressor.lr_scheduler is None: + lr_schedule = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.compressor.iters + ) + else: + lr_schedule = copy.deepcopy(self.compressor.lr_scheduler) + + if isinstance(input_ids, dict): # input_ids of Flux is dict + nsamples = len(input_ids["hidden_states"]) + else: + nsamples = len(input_ids) + last_best_iter = 0 + best_loss = torch.finfo(torch.float).max + num_elm = 1 + mse_reduction = "mean" + if self.compressor.gradient_accumulate_steps != 1: + mse_reduction = "sum" + mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) + scaler = self.compressor._get_scaler() # pylint: disable=assignment-from-none + init_loss = None + best_params = {} + total_loss = 0 + global_batch_size = self.compressor.batch_size * self.compressor.gradient_accumulate_steps + global_batch_size = min(nsamples, global_batch_size) + # We assume the block input and output shape is same + if self.compressor.gradient_accumulate_steps != 1 and not self.compressor.attention_mask: + whole_indices = torch.arange(global_batch_size) + num_elm = self.compressor._get_current_num_elm(input_ids, whole_indices) + + index_sampler = IndexSampler(nsamples, global_batch_size) + batch_size = self.compressor.batch_size + for i in range(self.compressor.iters): + if self.compressor.enable_alg_ext and self.compressor.data_type.endswith("dq"): + for n, m in block.named_modules(): + m.cur_iter = i + total_loss = 0 + global_indices = index_sampler.next_batch() + if self.compressor.attention_mask: + num_elm = get_non_zero_cnt(self.compressor.attention_mask, global_indices) + + for tmp_step in range(self.compressor.gradient_accumulate_steps): + indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + current_output = self._get_current_output(output, indices, self.compressor.batch_dim) + current_output = to_device(current_output, loss_device) + # TODO: refactor this + output_q = self.compressor._get_current_q_output( + block, input_ids, input_others, indices, device, loss_device + ) + # TODO: refactor this + loss = self.compressor._get_loss(output_q, current_output, indices, mse_loss, device) + num_elm = 1 if num_elm <= 0 else num_elm + total_loss += loss.item() / num_elm + + if self.compressor.low_gpu_mem_usage and card_0_in_high_risk: + # clear memory to avoid OOM due to memory fragmentation + clear_memory_if_reached_threshold(threshold=0.5, device_list=self.compressor.device_list) + + self.compressor._scale_loss_and_backward(scaler, loss) + if self.compressor.low_gpu_mem_usage and card_0_in_high_risk: + # clear memory to avoid OOM 
due to memory fragmentation + clear_memory_if_reached_threshold(threshold=0.8, device_list=self.compressor.device_list) + + if i == 0: + init_loss = total_loss + + if total_loss < best_loss: + best_loss = total_loss + if not self.compressor.not_use_best_mse: + best_params = collect_best_params(block, self.compressor.cache_device) + # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) + + last_best_iter = i + if self.compressor.not_use_best_mse and i == self.compressor.iters - 1: + best_params = collect_best_params(block, self.compressor.cache_device) + + if not self.compressor.not_use_best_mse: + if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: + break + self.compressor._step(scaler, optimizer, lr_schedule) + + last_loss = total_loss + best_iter = self.compressor.iters + if not self.compressor.not_use_best_mse: + last_loss = best_loss + best_iter = last_best_iter + if self.compressor.iters > 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" + ) + else: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + "layers in the block" + ) + + if self.compressor.low_gpu_mem_usage: + clear_memory(device_list=self.compressor.device_list) # clear cached memory during training + if len(unquantized_layer_names) != 0: + logger.info(f"{unquantized_layer_names} have not been quantized") + with torch.no_grad(): + unwrapper_block(block, best_params) + + if is_nv_fp(self.compressor.act_data_type): + # enable moe experts act_max automatic generation for WrapperWALayer + set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") + + if self.compressor.enable_quanted_input: + # TODO: refactor this + q_outputs = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + cache_device=self.compressor.cache_device, + ) + + if len(self.compressor.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + + clear_memory(input_ids, device_list=self.compressor.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return q_outputs, output + else: + if len(self.compressor.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + clear_memory(input_ids, device_list=self.compressor.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return None, output + + @staticmethod + def _get_current_output(output: list[torch.Tensor], indices: list[int], batch_dim: int) -> torch.Tensor: + current_output = [output[x] for x in indices] + current_output = torch.cat(current_output, dim=batch_dim) + return current_output diff --git a/auto_round/quantizers/algs/base.py b/auto_round/quantizers/algs/base.py new file mode 100644 index 000000000..62e97c134 --- /dev/null +++ b/auto_round/quantizers/algs/base.py @@ -0,0 +1,27 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +from auto_round.quantizers.base import BaseQuantizer + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class AlgsBaseQuantizer(BaseQuantizer): + @abstractmethod + def quantize(self, *args, **kwargs): + pass diff --git a/auto_round/quantizers/algs/rtn.py b/auto_round/quantizers/algs/rtn.py new file mode 100644 index 000000000..80d37d2fe --- /dev/null +++ b/auto_round/quantizers/algs/rtn.py @@ -0,0 +1,627 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import traceback +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import accelerate +import torch +from accelerate.big_modeling import dispatch_model, infer_auto_device_map +from tqdm import tqdm + +from auto_round.compressors.utils import ( + check_need_act_calibration, + immediate_saving, + is_nv_fp, + is_static_wfp8afp8, +) +from auto_round.logger import logger +from auto_round.quantizers.algs.base import AlgsBaseQuantizer +from auto_round.quantizers.utils import ( + get_quantized_layer_names_outside_blocks, + quantize_embedding_layer, + register_act_max_hook, +) +from auto_round.utils import ( + check_to_quantized, + clear_memory, + convert_fp8_layer_to_linear, + convert_fp8_model_to_16b_model, + flatten_list, + get_block_names, + get_module, + is_auto_device_mapping, + is_fp8_linear, + is_fp8_model, + memory_monitor, + mv_module_from_gpu, + set_amax_for_all_moe_layers, + set_module, + to_device, + to_dtype, +) +from auto_round.utils.device import ( + clear_memory_if_reached_threshold, + get_major_device, + parse_available_devices, + set_auto_device_map_for_block_with_tuning, + set_non_auto_device_map, +) +from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class RTNQuantizer(AlgsBaseQuantizer): + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + self.all_to_quantized_module_names: list[str] = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + + def pre_quantize(self, *args, **kwargs): + if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: + self.compressor.model.to(self.compressor.amp_dtype) + + if is_nv_fp(self.compressor.data_type): + from auto_round.data_type.nvfp import calculate_gparam + from auto_round.data_type.utils import update_fused_layer_global_scales + + pbar = tqdm(self.all_to_quantized_module_names) + for 
name in pbar:
+                pbar.set_description(f"Calculate weight global scale: {name}")
+                m = get_module(self.compressor.model, name)
+                if is_fp8_linear(m):
+                    m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device)
+                    set_module(self.compressor.model, name, m)
+                weight_global_scale = calculate_gparam(m.weight, self.compressor.group_size)
+                setattr(m, "weight_global_scale", weight_global_scale)
+
+            logger.info("Start to update fused layer global scales, it may take some time.")
+            for name, module in self.compressor.model.named_modules():
+                update_fused_layer_global_scales(module)
+            logger.info("Finished updating fused layer global scales.")
+
+    @torch.inference_mode()
+    def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]:
+        """Quantize all modules in the model using RTN (Round-To-Nearest) strategy.
+
+        If the target format includes GGUF with `k`, and optimized RTN is enabled,
+        blockwise quantization with input caching and imatrix is used.
+
+        Returns:
+            tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration.
+        """
+        if not (
+            any(fmt.is_gguf() for fmt in getattr(self.compressor, "formats", []))
+            or self.compressor.super_bits is not None
+        ):
+            quantize_embedding_layer(
+                model=self.compressor.model,
+                layer_config=self.compressor.layer_config,
+                scale_dtype=self.compressor.scale_dtype,
+                disable_opt_rtn=self.compressor.disable_opt_rtn,
+                device=self.compressor.device,
+                device_list=self.compressor.device_list,
+            )  # leave to gguf itself to handle
+
+        self.compressor.model.to("cpu")
+        # Release memory
+        clear_memory(device_list=self.compressor.device_list)
+
+        if self.compressor.act_bits <= 8 and check_need_act_calibration(
+            self.compressor.act_dynamic,
+            self.compressor.act_data_type,
+            self.compressor.act_bits,
+            self.compressor.static_kv_dtype,
+            self.compressor.static_attention_dtype,
+        ):  # TODO, mixed datatype has bug
+            hook_handles = register_act_max_hook(
+                model=self.compressor.model,
+                layer_config=self.compressor.layer_config,
+                act_group_size=self.compressor.act_group_size,
+                act_data_type=self.compressor.act_data_type,
+            )
+            try:
+                self._quantize_via_rtn_blockwise()
+            except torch.OutOfMemoryError:
+                logger.warning("Fallback to CPU. Consider using more GPUs via `--device 0,1,2,3`.")
+                self.compressor.model = self.compressor.model.to("cpu")
+                clear_memory(device_list=self.compressor.device_list)
+                if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1:
+                    import accelerate
+
+                    accelerate.hooks.remove_hook_from_submodules(self.compressor.model)
+                orig_device = self.compressor.device
+                self.compressor.device = "cpu"
+                self._quantize_via_rtn_blockwise()
+                self.compressor.device = orig_device
+            for handle in hook_handles:
+                handle.remove()
+        else:
+            block_names_cnt = len(flatten_list(get_block_names(self.compressor.model, True)))
+            clear_mem_freq = len(self.all_to_quantized_module_names) // block_names_cnt
+            if clear_mem_freq == 0:
+                clear_mem_freq = 1
+            pbar = tqdm(self.all_to_quantized_module_names)
+            cnt = 1
+            for name in pbar:
+                pbar.set_description(f"Quantizing {name}")
+                self._quantize_layer_via_rtn(name)
+                if cnt % clear_mem_freq == 0:
+                    clear_memory(device_list=self.compressor.device_list)
+                    memory_monitor.log_summary()
+                    cnt = 1
+                cnt += 1
+        # Convert remaining fp8
+        if is_fp8_model(self.compressor.model):
+            convert_fp8_model_to_16b_model(self.compressor.model, self.compressor.amp_dtype, self.compressor.device)
+        self.compressor.quantized = True
+        return self.compressor.model, self.compressor.layer_config
+
+    def post_quantize(self, *args, **kwargs):
+        pass
+
+    def _quantize_via_rtn_blockwise(self) -> None:
+        """Quantize model layers block by block using cached inputs."""
+
+        all_to_quantized_module_names = list(set(self.all_to_quantized_module_names))
+
+        all_blocks = (
+            self.compressor.quant_block_list
+            if self.compressor.quant_block_list
+            else get_block_names(self.compressor.model)
+        )
+        if not all_blocks:
+            raise ValueError("Could not find any blocks. Check the model or quant_block_list.")
+
+        all_first_block_names = [block[0] for block in all_blocks]
+        layer_names = get_quantized_layer_names_outside_blocks(
+            model=self.compressor.model,
+            layer_config=self.compressor.layer_config,
+            supported_types=self.compressor.supported_types,
+            quant_block_list=self.compressor.quant_block_list,
+        )
+        if self.compressor.act_bits < 16 and (not self.compressor.act_dynamic or len(layer_names) > 0):
+            if len(layer_names) > 0:
+                logger.warning(
+                    "quantizing layers outside blocks for static activation quantization"
+                    " will significantly increase calibration time"
+                )
+            all_inputs = self.compressor.try_cache_inter_data_gpucpu(
+                all_first_block_names, self.compressor.nsamples, layer_names
+            )
+        else:
+            all_inputs = self.compressor.cache_inter_data(all_first_block_names, self.compressor.nsamples)
+
+        # Clear hooks for multi-GPU setups
+        if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1:
+            accelerate.hooks.remove_hook_from_submodules(self.compressor.model)
+
+        pbar = tqdm(range(sum(len(block) for block in all_blocks)))
+
+        for block_names in all_blocks:
+            first_block = block_names[0]
+            inputs = all_inputs.pop(first_block)
+            input_keys = [k for k in inputs if k.startswith("hidden_state")]
+            if len(input_keys) != 1:
+                raise RuntimeError(
+                    "hidden_states arg mismatch. 
Please file an issue at https://github.com/intel/auto-round/issues" + ) + inputs["input_ids"] = inputs.pop(input_keys[0]) + + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + + total_samples = len(inputs["input_ids"]) + if total_samples < self.compressor.batch_size: + self.compressor.batch_size = total_samples + logger.warning(f"Forcing batch size to {total_samples}") + + input_ids = to_device(inputs.pop("input_ids"), self.compressor.cache_device) + input_others = to_device(inputs, self.compressor.cache_device) + + tmp_dtype = self.compressor.amp_dtype if self.compressor.amp else torch.float32 + input_ids = [id_.to(tmp_dtype) for id_ in input_ids] + + for key, val in input_others.items(): + if isinstance(val, torch.Tensor) and val.dtype in (torch.float16, torch.bfloat16): + input_others[key] = val.to(tmp_dtype) + elif isinstance(val, list): + input_others[key] = [to_dtype(v, tmp_dtype) for v in val] + + for block_name in block_names: + pbar.set_description(f"Quantizing {block_name}") + block = get_module(self.compressor.model, block_name) + if is_fp8_model(self.compressor.model): + convert_fp8_model_to_16b_model( + block, dtype=self.compressor.amp_dtype, device=self.compressor.device + ) + + if is_auto_device_mapping(self.compressor.device_map) and len(self.compressor.device_list) > 1: + set_auto_device_map_for_block_with_tuning( + block, + self.compressor.device_map, + input_ids, + self.compressor.low_gpu_mem_usage, + self.compressor.batch_size, + self.compressor.device, + ) + # Dispatch model if needed + if len(self.compressor.device_list) > 1: + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + for _, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + hook = AlignDevicesHook(m.tuning_device, io_same_device=True) + add_hook_to_module(m, hook, True) + else: + block = block.to(self.compressor.device) + + # TODO: refactor this part + input_ids = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + self.compressor.device, + self.compressor.cache_device, + ) + + if len(self.compressor.device_list) > 1: + accelerate.hooks.remove_hook_from_submodules(block) + + if is_nv_fp(self.compressor.act_data_type) or is_static_wfp8afp8(self.compressor): + # enable moe experts act_max automatic generation for Linear + set_amax_for_all_moe_layers(block, attr_name="act_max") + if self.compressor.low_gpu_mem_usage: + block.to("cpu") + clear_memory(device_list=self.compressor.device_list) + + for _, m in block.named_modules(): + if hasattr(m, "tmp_name") and m.tmp_name in all_to_quantized_module_names: + self._quantize_layer_via_rtn(m.tmp_name, to_cpu=self.compressor.low_gpu_mem_usage) + all_to_quantized_module_names.remove(m.tmp_name) + if not self.compressor.immediate_saving: + mv_module_from_gpu(block) + if block_name == block_names[-1]: + clear_memory(input_ids, device_list=self.compressor.device_list) + else: + clear_memory(device_list=self.compressor.device_list) + + memory_monitor.log_summary() + pbar.update(1) + pbar.close() + # Process remaining layers not in blocks + for name in all_to_quantized_module_names: + dtype = None + if self.compressor.super_group_size is not None: + dtype = torch.float32 + self._quantize_layer_via_rtn(name, dtype=dtype) + # clear_memory(device_list=self.compressor.device_list) + + def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: + 
"""Quantizes a layer using RTN (Round-To-Nearest) if available. + + This function attempts to quantize a layer by switching its data type to a + `rtn_*` version if supported, then wraps and unwraps the module to apply + quantization. If GPU memory is insufficient, it falls back to CPU. + + If packing is enabled (`immediate_packing`), the function will also export + the quantized layer to the appropriate backend format. + + Args: + name (str): Name of the layer to quantize. + + Raises: + RuntimeError: If quantization fails for reasons unrelated to memory. + """ + m = get_module(self.compressor.model, name) + if dtype is not None: + m = m.to(dtype) + + if is_fp8_linear(m): + m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device) + set_module(self.compressor.model, name, m) + tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.compressor.device + + try: + m = m.to(tuning_device) + m = WrapperLinear( + m, + device=tuning_device, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + m = m.orig_layer if hasattr(m, "orig_layer") else m + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU.") + m.to("cpu") + m = WrapperLinear( + m, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except Exception as e: + raise + + # Step 2: Optional immediate packing/export + if self.compressor.immediate_packing: # For gguf, packing conducts on block level + self.compressor._immediate_pack(name) + if to_cpu: + m = m.to("cpu") + packed_m = get_module(self.compressor.model, name) + set_module(self.compressor.model, name, packed_m.to("cpu")) + else: + if to_cpu: + m = m.to("cpu") + set_module(self.compressor.model, name, m) + if self.compressor.immediate_saving: + if hasattr(self.compressor, "all_to_quantized_module_names"): + all_to_quantized_module_names = self.compressor.all_to_quantized_module_names + else: + all_to_quantized_module_names = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) + m = get_module(self.compressor.model, name) + immediate_saving(self.compressor, m, name, last_module) + + +class OptRTNQuantizer(RTNQuantizer): + + @staticmethod + def register_act_hook(model, supported_types): + """Registers hooks to accumulate activation squared norms into `imatrix`.""" + + def get_imatrix_hook(module, input, output): + input = input[0] if isinstance(input, (tuple, list)) else input + flattened = input.reshape(-1, input.shape[-1]).to(torch.float32) + squared = torch.sum(torch.pow(flattened, 2), dim=0).to(torch.float32) + + if not hasattr(module, "imatrix"): + module.imatrix = squared + module.imatrix_cnt = input.shape[0] + else: + module.imatrix += squared.to(module.imatrix.device) + module.imatrix_cnt += input.shape[0] + + hook_handles = [] + for name, module in model.named_modules(): + if type(module) in supported_types and check_to_quantized(module): + hook = module.register_forward_hook(get_imatrix_hook) + hook_handles.append(hook) + return hook_handles + + @torch.inference_mode() + def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + 
enable_imatrix = False + has_gguf_k = ( + any(fmt.is_gguf() and "k" in fmt.output_format for fmt in getattr(self.compressor, "formats", [])) + or self.compressor.super_bits is not None + ) + if has_gguf_k: + enable_imatrix = True + elif self.compressor.data_type == "int" and self.compressor.sym: + enable_imatrix = True + if enable_imatrix: + self._quant_rtn_with_imatrix(self.all_to_quantized_module_names) + # Convert remaining fp8 + if is_fp8_model(self.compressor.model): + convert_fp8_model_to_16b_model(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) + self.compressor.quantized = True + return self.compressor.model, self.compressor.layer_config + else: + return super().quantize(*args, **kwargs) + + def _quant_rtn_with_imatrix(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize all modules in the model using Optimized RTN strategy. + + This method applies optimized RTN quantization to all modules in the model + that are marked for quantization. It leverages input caching and imatrix + techniques for enhanced performance. + + Returns: + tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. + """ + if not ( + any(fmt.is_gguf() for fmt in getattr(self.compressor, "formats", [])) + or self.compressor.super_bits is not None + ): + quantize_embedding_layer( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + scale_dtype=self.compressor.data_type, + disable_opt_rtn=self.compressor.disable_opt_rtn, + device=self.compressor.device, + device_list=self.compressor.device_list, + ) # leave to gguf itself to handle + + self.compressor.model.to("cpu") + # Release memory + clear_memory(device_list=self.compressor.device_list) + + logger.info("start to compute imatrix") + + # Load dataset + from auto_round.calib_dataset import get_dataloader + + if isinstance(self.compressor.dataset, str): + if self.compressor.tokenizer is None: + raise ValueError("A tokenizer must be set for the model when using a dataset string.") + dataset_name = self.compressor.dataset.replace(" ", "") + self.compressor.dataloader = get_dataloader( + self.compressor.tokenizer, + self.compressor.seqlen, + dataset_name, + self.compressor.seed, + self.compressor.batch_size, + self.compressor.nsamples, + ) + else: + self.compressor.dataloader = self.compressor.dataset + + model = self.compressor.model + + # Dispatch multi-GPU model if necessary + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + dispatch_model(model, model.hf_device_map) + + hooks = self.register_act_hook(model, self.compressor.supported_types) + + try: + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + model = model.to("cpu") + clear_memory(device_list=self.compressor.device_list) + self._quantize_via_rtn_blockwise() + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.error(cuda_error_msg) + # Final fallback: warn and use CPU-only quantization + logger.warning( + "Fallback to CPU. " + "Consider enabling `low_gpu_mem_usage` or using more GPUs via `--device 0,1,2,3`." 
+ ) + model = model.to("cpu") + clear_memory(device_list=self.compressor.device_list) + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + + orig_device = self.compressor.device + self.compressor.device = "cpu" + self._quantize_via_rtn_blockwise() + self.compressor.device = orig_device + except Exception as e: + raise + finally: + # Always remove hooks + for hook in hooks: + hook.remove() + + def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: + """Quantizes a layer using RTN (Round-To-Nearest) if available. + + This function attempts to quantize a layer by switching its data type to a + `rtn_*` version if supported, then wraps and unwraps the module to apply + quantization. If GPU memory is insufficient, it falls back to CPU. + + If packing is enabled (`immediate_packing`), the function will also export + the quantized layer to the appropriate backend format. + + Args: + name (str): Name of the layer to quantize. + + Raises: + RuntimeError: If quantization fails for reasons unrelated to memory. + """ + m = get_module(self.compressor.model, name) + if dtype is not None: + m = m.to(dtype) + + if is_fp8_linear(m): + m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device) + set_module(self.compressor.model, name, m) + tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.compressor.device + # Step 1: Try quantization on GPU first, fall back to CPU if OOM + if ( + self.compressor.immediate_packing + and self.compressor.iters == 0 + and self.compressor.formats[0].is_gguf() + and not self.compressor.disable_opt_rtn + ): + m = m.to(tuning_device) + m.scale = None + m.zp = None + else: + try: + disable_opt_rtn = False + if ( + self.compressor.orig_disable_opt_rtn is None + and self.compressor.is_moe_model + and "expert" in m.tmp_name + and "shared_expert" not in m.tmp_name + and self.compressor.super_bits is None # GGUF still uses the optimized RTN for MoE layers + ): + disable_opt_rtn = True + logger.warning_once( + "MoE layer detected: optimized RTN is disabled for efficiency. " + "Use `--enable_opt_rtn` to force-enable it for MoE layers." 
+ ) + m = m.to(tuning_device) + m = WrapperLinear( + m, + device=tuning_device, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + disable_opt_rtn=disable_opt_rtn, + ) + m = m.unwrapper({}) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + m = m.orig_layer if hasattr(m, "orig_layer") else m + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU.") + m.to("cpu") + m = WrapperLinear( + m, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except Exception as e: + raise + + # Step 2: Optional immediate packing/export + if self.compressor.immediate_packing: # For gguf, packing conducts on block level + self.compressor._immediate_pack(name) + if to_cpu: + m = m.to("cpu") + packed_m = get_module(self.compressor.model, name) + set_module(self.compressor.model, name, packed_m.to("cpu")) + else: + if to_cpu: + m = m.to("cpu") + set_module(self.compressor.model, name, m) + if self.compressor.immediate_saving: + if hasattr(self.compressor, "all_to_quantized_module_names"): + all_to_quantized_module_names = self.compressor.all_to_quantized_module_names + else: + all_to_quantized_module_names = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) + m = get_module(self.compressor.model, name) + immediate_saving(self.compressor, m, name, last_module) diff --git a/auto_round/quantizers/base.py b/auto_round/quantizers/base.py new file mode 100644 index 000000000..bd6b07a69 --- /dev/null +++ b/auto_round/quantizers/base.py @@ -0,0 +1,50 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific la + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class BaseQuantizer(ABC): + def __init__(self, compressor: "BaseCompressor"): + self.compressor = compressor + + def pre_quantize(self, *args, **kwargs): + pass + + def quantize(self, *args, **kwargs): + pass + + def post_quantize(self, *args, **kwargs): + pass + + def pre_quantize_layer(self, *args, **kwargs): + pass + + def quantize_layer(self, *args, **kwargs): + pass + + def post_quantize_layer(self, *args, **kwargs): + pass + + def pre_quantize_block(self, *args, **kwargs): + pass + + def quantize_block(self, *args, **kwargs): + pass + + def post_quantize_block(self, *args, **kwargs): + pass diff --git a/auto_round/quantizers/entrypoint.py b/auto_round/quantizers/entrypoint.py new file mode 100644 index 000000000..d22a999f1 --- /dev/null +++ b/auto_round/quantizers/entrypoint.py @@ -0,0 +1,55 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from auto_round.compressors.base import BaseCompressor
+
+from auto_round.quantizers.algs.auto_round import ARQuantizer
+from auto_round.quantizers.algs.rtn import OptRTNQuantizer, RTNQuantizer
+
+
+class AutoRoundQuantizer:
+    def __new__(cls, compressor: "BaseCompressor", dynamic_quantizers: dict = None):
+        assert dynamic_quantizers is not None, "Please provide dynamic_quantizers dict."
+        quantizer_cls = type("AutoRoundQuantizer", tuple(dynamic_quantizers.values()), {})
+        return quantizer_cls(compressor)
+
+
+class Quanterizers:
+    def __init__(self, quantizers: list[AutoRoundQuantizer]):
+        self.quantizers = quantizers
+
+    def quantize(self, *args, **kwargs):
+        for quantizer in self.quantizers:
+            quantizer.pre_quantize(*args, **kwargs)
+            model, layer_config = quantizer.quantize(*args, **kwargs)
+            quantizer.post_quantize(*args, **kwargs)
+        return model, layer_config
+
+
+def create_quantizers(compressor: "BaseCompressor"):
+
+    alg_cls = None
+    if compressor.iters > 0:
+        alg_cls = ARQuantizer
+    else:
+        alg_cls = OptRTNQuantizer if compressor.disable_opt_rtn is False else RTNQuantizer
+
+    dynamic_quantizers = {"algs": alg_cls}
+    return Quanterizers(
+        quantizers=[
+            AutoRoundQuantizer(compressor, dynamic_quantizers=dynamic_quantizers),
+        ]
+    )
diff --git a/auto_round/quantizers/readme.md b/auto_round/quantizers/readme.md
new file mode 100644
index 000000000..7a5b7fad6
--- /dev/null
+++ b/auto_round/quantizers/readme.md
@@ -0,0 +1,21 @@
+# AutoRound Quantizer
+The main functional component: it holds the different quantization algorithms and the concrete execution logic of quantization.
+
+## Structure and call flow
+AutoRoundQuantizer is organized into three layers, from coarse to fine granularity (extensible): algs, model_type and data_type. A Quantizer is constructed dynamically by inheriting methods from each layer; classes within the same layer are mutually exclusive, while classes from different layers can be combined freely.
+
+AutoRoundQuantizer
+- algs
+  - RTN
+  - Tuning(auto_round)
+- model_type
+  - llm
+  - mllm
+  - diffusion
+- data_type
+  - gguf
+  - nvfp/mxfp
+### 1. AutoRoundQuantizer
+The main entry point. Based on the configuration, it uses the __new__ method to dynamically construct a Quantizer that inherits methods from AlgsQuantizer, ModelTypeQuantizer and DataTypeQuantizer; a finer-granularity layer may override the methods of a coarser-granularity layer.
+
+### 2. AlgsQuantizer
\ No newline at end of file
diff --git a/auto_round/quantizers/utils.py b/auto_round/quantizers/utils.py
new file mode 100644
index 000000000..3ea0c009f
--- /dev/null
+++ b/auto_round/quantizers/utils.py
@@ -0,0 +1,262 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
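# Editor's note: standalone sketch, not part of the patch. It illustrates the dynamic
# class composition that AutoRoundQuantizer.__new__ performs in entrypoint.py via
# `type(name, bases, dict)`: one class is picked per granularity layer and the composed
# class resolves methods through the normal MRO, so a finer-grained layer listed first
# can override a coarser one. The layer classes below are hypothetical stand-ins.
class AlgsLayer:
    def quantize(self):
        return "rtn"

    def pre_quantize(self):
        return "algs pre_quantize"


class DataTypeLayer:
    def quantize(self):  # overrides AlgsLayer.quantize when listed before it
        return "gguf"


Composed = type("AutoRoundQuantizer", (DataTypeLayer, AlgsLayer), {})
q = Composed()
print(q.quantize())      # -> "gguf"  (finer-granularity layer wins)
print(q.pre_quantize())  # -> "algs pre_quantize"  (inherited from the algs layer)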
+import traceback +from typing import Any, Callable, Optional, Union + +import torch + +from auto_round.compressors.utils import ( + check_need_act_calibration, + is_nv_fp, +) +from auto_round.data_type import QUANT_FUNC_WITH_DTYPE +from auto_round.data_type.utils import reshape_pad_tensor_by_group_size +from auto_round.logger import logger +from auto_round.utils import ( + SUPPORTED_LAYER_TYPES, + check_to_quantized, + clear_memory, + get_layer_names_in_block, + get_module, + to_device, + to_dtype, +) + + +def register_act_max_hook(model: torch.nn.Module, layer_config: dict, act_group_size: int, act_data_type: str): + def get_act_max_hook(module, input, output): + if isinstance(input, (tuple, list)): + input = input[0] + if input.numel() == 0: + return # as no needs for act_max update + input, _, _ = reshape_pad_tensor_by_group_size(input, act_group_size) + act_max = torch.max(torch.abs(input), dim=-1).values + if not hasattr(module, "act_max") or module.act_max.numel() == 0: + module.act_max = act_max + else: + act_max = act_max.to(module.act_max.device) + if is_nv_fp(act_data_type): ## for nvfp per-tensor input_global_scale calculation usage + module.act_max = torch.max(torch.tensor([act_max.max(), module.act_max.max()], device=act_max.device)) + else: + module.act_max = torch.max(act_max, module.act_max) + + hook_handles = [] + # for single layers out of blocks, like lm_head + if isinstance(model, SUPPORTED_LAYER_TYPES): + m = model + if ( + hasattr(m, "act_dynamic") + and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) + and check_to_quantized(m) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + return hook_handles + + for n, m in model.named_modules(): + if ( + hasattr(m, "act_dynamic") + and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) + and check_to_quantized(m) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + continue + + # for whole model, RTN + if n in layer_config: + config = layer_config[n] + act_dynamic = config.get("act_dynamic", True) + act_data_type = config.get("act_data_type", None) + act_bits = config.get("act_bits", 16) + if ( + config["bits"] <= 8 + and check_need_act_calibration(act_dynamic, act_data_type, act_bits) + and check_to_quantized(config) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + continue + return hook_handles + + +@torch.inference_mode() +def quantize_embedding_layer( + model: torch.nn.Module, + layer_config: dict, + scale_dtype: str, + disable_opt_rtn: bool, + device: Union[str, torch.device], + device_list: list, +) -> bool: + """Quantizes embedding layers in the model according to the configuration. + + This method iterates through all modules in the model, identifies embedding + layers specified in `layer_config`, and applies the appropriate quantization + function based on bit precision, grouping strategy, and dtype. + + Returns: + bool: True if the quantization process completes without critical errors. 
+ """ + is_quantized = False + for name, module in model.named_modules(): + # Skip non-Embedding modules or layers not in config + if not isinstance(module, torch.nn.Embedding) or name not in layer_config: + continue + + config = layer_config[name] + + # Skip layers that are not marked for quantization + if not check_to_quantized(config): + continue + is_quantized = True + config["scale_dtype"] = scale_dtype + dtype = config["data_type"] + + # Determine quantization function key with symmetry/asymmetry + if dtype not in QUANT_FUNC_WITH_DTYPE: + dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}" + + # Optionally use optimized rounding (RTN) variant + if not disable_opt_rtn and f"rtn_{dtype}" in QUANT_FUNC_WITH_DTYPE: + dtype = f"rtn_{dtype}" + + quant_func = QUANT_FUNC_WITH_DTYPE[dtype] + dtype = module.weight.dtype + # As typically float32 are used in RTN to search scale zp, + # to avoid cache a bf16 copy we'd better use float32 + if config.get("super_group_size", None) is not None: + dtype = torch.float32 + + # Attempt quantization on GPU, fall back to CPU if OOM + try: + weight, scale, zp = quant_func( + module.weight.to(dtype=dtype, device=device), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU") + weight, scale, zp = quant_func( + module.weight.to("cpu"), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except Exception as e: + raise + + # Overwrite the module's weights with the quantized version + module.weight.data.copy_(weight.cpu()) + + # Attach scale and zero point (zp) to the module + for param_name, value in zip(["scale", "zp"], [scale, zp]): + if isinstance(value, dict): + for k, v in value.items(): + setattr(module, k if k == "scale" else f"w_{k}", v.cpu()) + elif isinstance(value, torch.Tensor): + setattr(module, param_name, value.cpu()) + else: + setattr(module, param_name, value) + + # Update config + layer_config.setdefault(name, {}).update(config) + del weight + del scale + del zp + clear_memory(device_list=device_list) + return is_quantized + + +def get_quantized_layer_names_outside_blocks( + model: torch.nn.Module, layer_config: dict, supported_types: list, quant_block_list: list +) -> list: + """Gets the names of quantized layers outside blocks in the model. + + Returns: + list: List of layer names outside blocks. 
+ """ + if layer_config is None or len(layer_config) == 0: + return [] + + layer_names = [] + all_layers_in_block = get_layer_names_in_block(model, supported_types, quant_block_list) + + for key in layer_config.keys(): + if key in all_layers_in_block: + continue + layer = get_module(model, key) + if layer is None: + logger.error(f"could not find layer {key} in the model, exit...") + exit(-1) + if type(layer) in supported_types and check_to_quantized(layer_config[key]): + layer_names.append(key) + + return layer_names + + +def get_non_zero_cnt(tensor: list[torch.Tensor], indices: list[int]) -> int: + current_tensors = [tensor[i] for i in indices] + non_zero_cnt = 0 + for t in current_tensors: + non_zero_cnt += torch.count_nonzero(t).item() + return non_zero_cnt + + +def split_inputs(inputs: dict, first_input_name: str, diffusion: bool = False) -> tuple[torch.Tensor, dict]: + if diffusion: + input_id_str = [key for key in inputs.keys() if "hidden_state" in key] + input_ids = {k: inputs.pop(k, None) for k in input_id_str} + input_others = inputs + return input_ids, input_others + else: + input_ids = inputs[first_input_name] + inputs.pop(first_input_name, None) + input_others = inputs + return input_ids, input_others + + +def preprocess_block_inputs( + inputs, + device_list: list, + first_input_name="input_ids", + amp: bool = False, + amp_dtype: torch.dtype = torch.float32, + cache_device: Union[str, torch.device] = "cpu", + diffusion: bool = False, +): + input_ids, input_others = split_inputs(inputs, first_input_name, diffusion=diffusion) + clear_memory(device_list=device_list) + input_ids = to_device(input_ids, cache_device) + input_others = to_device(input_others, cache_device) + # As in calibration phase, we may use bf16 for calibration due to low_gpu_memory usage + + tmp_dtype = amp_dtype if amp else torch.float32 + input_ids = to_dtype(input_ids, tmp_dtype) + + for key in input_others.keys(): + if isinstance(input_others[key], torch.Tensor) and ( + input_others[key].dtype == torch.float16 or input_others[key].dtype == torch.bfloat16 + ): + input_others[key] = input_others[key].to(tmp_dtype) + elif isinstance(input_others[key], list): + for i in range(len(input_others[key])): + to_dtype(input_others[key][i], tmp_dtype) + return input_ids, input_others From cb334610ac80db04017a04152e41f931d0e6e404 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 14 Jan 2026 03:56:13 -0500 Subject: [PATCH 2/5] fix Signed-off-by: n1ck-guo --- auto_round/quantizers/algs/auto_round.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/auto_round/quantizers/algs/auto_round.py b/auto_round/quantizers/algs/auto_round.py index da02d0397..a2159d292 100644 --- a/auto_round/quantizers/algs/auto_round.py +++ b/auto_round/quantizers/algs/auto_round.py @@ -784,7 +784,9 @@ def quantize_block( for tmp_step in range(self.compressor.gradient_accumulate_steps): indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - current_output = self._get_current_output(output, indices, self.compressor.batch_dim) + current_output = self._get_current_output( + output, indices, self.compressor.batch_dim, diffusion=self.compressor.diffusion + ) current_output = to_device(current_output, loss_device) # TODO: refactor this output_q = self.compressor._get_current_q_output( @@ -882,7 +884,15 @@ def quantize_block( return None, output @staticmethod - def _get_current_output(output: list[torch.Tensor], indices: list[int], batch_dim: int) -> torch.Tensor: + def _get_current_output( + output: 
list[torch.Tensor], indices: list[int], batch_dim: int, diffusion: bool = False + ) -> torch.Tensor: + if diffusion: + assert "hidden_states" in output + current_output = [output["hidden_states"][x] for x in indices] + current_output = torch.cat(current_output, dim=batch_dim) + return current_output + current_output = [output[x] for x in indices] current_output = torch.cat(current_output, dim=batch_dim) return current_output From 52afa900cc9160167b0fd74e1dc4ef76a7080d96 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 14 Jan 2026 20:26:55 -0500 Subject: [PATCH 3/5] fix Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 5 +++-- auto_round/quantizers/algs/auto_round.py | 6 +++--- auto_round/quantizers/algs/rtn.py | 4 ++-- auto_round/quantizers/base.py | 3 ++- auto_round/quantizers/entrypoint.py | 4 ++-- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index a383659d0..b59e7354f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1590,7 +1590,7 @@ def quantize_block( self.normalize_decoding_layer_inputs_(inputs) block_inputs = self.inputs[self.quant_block_list[0][0]] decoding_layer_first_input_name = "hidden_states" - from auto_round.quantizers.algs.auto_round import AutoRoundQuantizer + from auto_round.quantizers.algs.auto_round import ARQuantizer from auto_round.quantizers.utils import preprocess_block_inputs input_ids, input_others = preprocess_block_inputs( @@ -1602,7 +1602,8 @@ def quantize_block( cache_device=self.cache_device, diffusion=self.diffusion, ) - return AutoRoundQuantizer(self).quantize_block(block, input_ids, input_others, q_input, device, auto_offload) + ar_quantizer = ARQuantizer(self) + return ar_quantizer.quantize_block(block, input_ids, input_others, q_input, device, auto_offload) def _get_loss( self, diff --git a/auto_round/quantizers/algs/auto_round.py b/auto_round/quantizers/algs/auto_round.py index a2159d292..db3330088 100644 --- a/auto_round/quantizers/algs/auto_round.py +++ b/auto_round/quantizers/algs/auto_round.py @@ -108,7 +108,7 @@ def quantize(self, *args, **kwargs): is_quantized_embedding = quantize_embedding_layer( model=self.compressor.model, layer_config=self.compressor.layer_config, - scale_dtype=self.compressor.data_type, + scale_dtype=self.compressor.scale_dtype, disable_opt_rtn=self.compressor.disable_opt_rtn, device=self.compressor.device, device_list=self.compressor.device_list, @@ -462,11 +462,11 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. 
lr = torch.tensor(self.compressor.lr) minmax_lr = torch.tensor(self.compressor.minmax_lr) if self.compressor.enable_minmax_tuning: - optimizer = self.optimizer( + optimizer = self.compressor.optimizer( [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 ) else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0) + optimizer = self.compressor.optimizer(round_params, lr=lr, weight_decay=0) if self.compressor.lr_scheduler is None: lr_schedule = torch.optim.lr_scheduler.LinearLR( diff --git a/auto_round/quantizers/algs/rtn.py b/auto_round/quantizers/algs/rtn.py index 80d37d2fe..f1b4676f4 100644 --- a/auto_round/quantizers/algs/rtn.py +++ b/auto_round/quantizers/algs/rtn.py @@ -112,7 +112,7 @@ def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: quantize_embedding_layer( model=self.compressor.model, layer_config=self.compressor.layer_config, - scale_dtype=self.compressor.data_type, + scale_dtype=self.compressor.scale_dtype, disable_opt_rtn=self.compressor.disable_opt_rtn, device=self.compressor.device, device_list=self.compressor.device_list, @@ -453,7 +453,7 @@ def _quant_rtn_with_imatrix(self, *args, **kwargs) -> tuple[torch.nn.Module, dic quantize_embedding_layer( model=self.compressor.model, layer_config=self.compressor.layer_config, - scale_dtype=self.compressor.data_type, + scale_dtype=self.compressor.scale_dtype, disable_opt_rtn=self.compressor.disable_opt_rtn, device=self.compressor.device, device_list=self.compressor.device_list, diff --git a/auto_round/quantizers/base.py b/auto_round/quantizers/base.py index bd6b07a69..1e1a73f17 100644 --- a/auto_round/quantizers/base.py +++ b/auto_round/quantizers/base.py @@ -9,7 +9,8 @@ # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific la +# See the License for the specific language governing permissions and +# limitations under the License. 
from abc import ABC, abstractmethod from typing import TYPE_CHECKING diff --git a/auto_round/quantizers/entrypoint.py b/auto_round/quantizers/entrypoint.py index d22a999f1..473dcfca1 100644 --- a/auto_round/quantizers/entrypoint.py +++ b/auto_round/quantizers/entrypoint.py @@ -27,7 +27,7 @@ def __new__(cls, compressor: "BaseCompressor", dynamic_quantizers: dict = None): return quantizer_cls(compressor) -class Quanterizers: +class Quantizers: def __init__(self, quantizers: list[AutoRoundQuantizer]): self.quantizers = quantizers @@ -48,7 +48,7 @@ def create_quantizers(compressor: "BaseCompressor"): alg_cls = OptRTNQuantizer if compressor.disable_opt_rtn is False else RTNQuantizer dynamic_quantizers = {"algs": alg_cls} - return Quanterizers( + return Quantizers( quantizers=[ AutoRoundQuantizer(compressor, dynamic_quantizers=dynamic_quantizers), ] From 22ffe2d8b409befa95387adea6051b220854950c Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 20 Jan 2026 15:51:51 +0800 Subject: [PATCH 4/5] update Signed-off-by: n1ck-guo --- .pre-commit-config.yaml | 0 auto_round/__init__.py | 2 +- auto_round/autoround.py | 108 +------- auto_round/compressors/__init__.py | 1 - auto_round/compressors/adam.py | 164 ------------ auto_round/compressors/base.py | 66 +---- .../compressors/diffusion/compressor.py | 7 - auto_round/compressors/utils.py | 8 +- auto_round/quantizers/algs/auto_round.py | 247 +++++++++++++----- auto_round/quantizers/algs/base.py | 28 +- auto_round/quantizers/algs/rtn.py | 11 +- auto_round/quantizers/base.py | 52 ++++ auto_round/quantizers/entrypoint.py | 8 +- auto_round/quantizers/utils.py | 28 +- test/test_cpu/export/test_gguf_format.py | 10 +- test/test_cpu/utils/test_cli_usage.py | 16 +- .../advanced/test_multiple_card_calib.py | 4 +- test/test_cuda/export/test_gguf.py | 2 +- test/test_cuda/models/test_support_vlms.py | 10 +- test/test_cuda/utils/test_alg_ext.py | 4 +- 20 files changed, 322 insertions(+), 454 deletions(-) mode change 100644 => 100755 .pre-commit-config.yaml mode change 100644 => 100755 auto_round/__init__.py mode change 100644 => 100755 auto_round/autoround.py mode change 100644 => 100755 auto_round/compressors/__init__.py delete mode 100644 auto_round/compressors/adam.py mode change 100644 => 100755 auto_round/compressors/base.py mode change 100644 => 100755 auto_round/compressors/diffusion/compressor.py mode change 100644 => 100755 auto_round/compressors/utils.py mode change 100644 => 100755 auto_round/quantizers/algs/auto_round.py mode change 100644 => 100755 auto_round/quantizers/algs/base.py mode change 100644 => 100755 auto_round/quantizers/algs/rtn.py mode change 100644 => 100755 auto_round/quantizers/base.py mode change 100644 => 100755 auto_round/quantizers/entrypoint.py mode change 100644 => 100755 auto_round/quantizers/utils.py mode change 100644 => 100755 test/test_cpu/export/test_gguf_format.py mode change 100644 => 100755 test/test_cpu/utils/test_cli_usage.py mode change 100644 => 100755 test/test_cuda/advanced/test_multiple_card_calib.py mode change 100644 => 100755 test/test_cuda/export/test_gguf.py mode change 100644 => 100755 test/test_cuda/models/test_support_vlms.py mode change 100644 => 100755 test/test_cuda/utils/test_alg_ext.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 diff --git a/auto_round/__init__.py b/auto_round/__init__.py old mode 100644 new mode 100755 index 87c70e06a..e2f1b6c58 --- a/auto_round/__init__.py +++ b/auto_round/__init__.py @@ -14,7 +14,7 @@ from auto_round.autoround import 
AutoRound # support for old api -from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam, AutoRoundDiffusion +from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundDiffusion from auto_round.schemes import QuantizationScheme from auto_round.auto_scheme import AutoScheme from auto_round.utils import LazyImport diff --git a/auto_round/autoround.py b/auto_round/autoround.py old mode 100644 new mode 100755 index 6d69c01c3..0cff92082 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -18,7 +18,6 @@ import torch from auto_round.compressors import ( - AdamCompressor, BaseCompressor, DiffusionCompressor, ExtraConfig, @@ -173,8 +172,6 @@ def __new__( extra_config.diffusion_config = None model_cls.append(LLMCompressor) - if enable_adam: - model_cls.append(AdamCompressor) dynamic_compressor = type("AutoRound", tuple(model_cls), {}) if extra_config: kwargs.update(extra_config.to_dict()) @@ -187,6 +184,7 @@ def __new__( "'fp_layers' is deprecated, please use 'ignore_layers' to set layers not to be quantized." ) kwargs["ignore"] = kwargs.pop("fp_layers") + kwargs["enable_adam"] = enable_adam ar = dynamic_compressor( model=model, tokenizer=tokenizer, @@ -371,110 +369,6 @@ def __init__( ) -@deprecated("AutoRound") -class AutoRoundAdam(AdamCompressor): - """Class for quantization with optimizers like adamw of a PyTorch model. - - Args: - model: The PyTorch model to be quantized. - tokenizer: An optional tokenizer for processing input data. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] - scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations - bits (int): Number of bits for quantization (default is 4). - group_size (int): Size of the quantization group (default is 128). - sym (bool): Whether sym to be used (default is True). - layer_config (dict): Configuration for weight quantization (default is None). - batch_size (int): Batch size for training (default is 8). - amp (bool): Whether to use automatic mixed precision (default is True). - device: The device to be used for training (default is "auto"). - lr_scheduler: The learning rate scheduler to be used. - dataset: The default dataset name (default is "NeelNanda/pile-10k"). - enable_quanted_input (bool): Whether to use quantized input data (default is True). - enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True). - lr (float): The learning rate (default is 0.005). - minmax_lr (float): The learning rate for min-max tuning (default is None). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). - iters (int): Number of iterations (default is 200). - seqlen (int): Length of the sequence. - nsamples (int): Number of samples (default is 128). - sampler (str): The sampling method (default is "rand"). - seed (int): The random seed (default is 42). - nblocks (int): Number of blocks (default is 1). - gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). - not_use_best_mse (bool): Whether to use mean squared error (default is False). - dynamic_max_gap (int): The dynamic maximum gap (default is -1). - data_type (str): The data type to be used (default is "int"). - scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. - act_bits (int): Number of bits for activation quantization. Default is 16. - act_group_size (int): Group size for activation quantization. Default is None. 
- act_sym (bool): Whether to use symmetric activation quantization. Default is None. - act_data_type (str): Specifies the data type for activations. - Defaults to None, in which case it inherits the weight data type. - act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. - to_quant_block_names (str|list): A string or list whose elements are list of - block's layer names to be quantized. - enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning - enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer function - **kwargs: Additional keyword arguments. - - Returns: - The quantized model. - """ - - bits: int | None - group_size: int | None - sym: bool | None - data_type: str | None - act_bits: int | None - act_group_size: int | None - act_sym: bool | None - act_data_type: str | None - act_dynamic: bool | None - super_bits: int | None - super_group_size: int | None - - def __init__( - self, - model: Union[torch.nn.Module, str], - tokenizer=None, - platform: str = "hf", - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, - dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", - iters: int = 200, - seqlen: int = 2048, - nsamples: int = 128, - batch_size: int = 8, - gradient_accumulate_steps: int = 1, - low_gpu_mem_usage: bool = False, - device_map: Union[str, int, torch.device, dict] = 0, - enable_torch_compile: bool = False, - seed: int = 42, - optimizer="AdamW", - **kwargs, - ): - super().__init__( - model=model, - tokenizer=tokenizer, - platform=platform, - scheme=scheme, - layer_config=layer_config, - batch_size=batch_size, - dataset=dataset, - low_gpu_mem_usage=low_gpu_mem_usage, - iters=iters, - seqlen=seqlen, - nsamples=nsamples, - seed=seed, - gradient_accumulate_steps=gradient_accumulate_steps, - enable_torch_compile=enable_torch_compile, - device_map=device_map, - optimizer=optimizer, - **kwargs, - ) - - @deprecated("AutoRound") class AutoRoundMLLM(MLLMCompressor): """Class for automatic rounding-based quantization with MLLMs. diff --git a/auto_round/compressors/__init__.py b/auto_round/compressors/__init__.py old mode 100644 new mode 100755 index 6f8ddf681..05623ecb7 --- a/auto_round/compressors/__init__.py +++ b/auto_round/compressors/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from auto_round.compressors.adam import AdamCompressor from auto_round.compressors.base import BaseCompressor from auto_round.compressors.base import LLMCompressor from auto_round.compressors.mllm.compressor import MLLMCompressor diff --git a/auto_round/compressors/adam.py b/auto_round/compressors/adam.py deleted file mode 100644 index fb79cf39a..000000000 --- a/auto_round/compressors/adam.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
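[Editor's note] With `AdamCompressor` and `AutoRoundAdam` removed in this patch, Adam-based tuning is requested through the `enable_adam` flag that `AutoRound.__new__` forwards into kwargs (see the autoround.py hunk earlier in this patch) and that `create_quantizers` later maps to `ARAdamQuantizer`. A hedged usage sketch follows, assuming the public `AutoRound` entry point otherwise keeps its existing signature and `quantize()` method; the model path is only an example.

```python
# Usage sketch only: assumes AutoRound still accepts enable_adam as routed in
# this patch series; "facebook/opt-125m" is an arbitrary example model.
from auto_round import AutoRound

ar = AutoRound(
    model="facebook/opt-125m",  # any HF causal LM path would do
    scheme="W4A16",
    iters=200,
    enable_adam=True,           # replaces the removed AutoRoundAdam entry point
)
ar.quantize()
```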
- -from typing import Union - -import torch - -from auto_round.compressors.base import BaseCompressor -from auto_round.schemes import QuantizationScheme -from auto_round.utils import check_is_cpu, htcore, is_hpex_available - - -class AdamCompressor(BaseCompressor): - """Class for quantization with optimizers like adamw of a PyTorch model. - - Args: - model: The PyTorch model to be quantized. - tokenizer: An optional tokenizer for processing input data. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] - scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations - bits (int): Number of bits for quantization (default is 4). - group_size (int): Size of the quantization group (default is 128). - sym (bool): Whether sym to be used (default is True). - layer_config (dict): Configuration for weight quantization (default is None). - batch_size (int): Batch size for training (default is 8). - amp (bool): Whether to use automatic mixed precision (default is True). - device: The device to be used for training (default is "auto"). - lr_scheduler: The learning rate scheduler to be used. - dataset: The default dataset name (default is "NeelNanda/pile-10k"). - enable_quanted_input (bool): Whether to use quantized input data (default is True). - enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True). - lr (float): The learning rate (default is 0.005). - minmax_lr (float): The learning rate for min-max tuning (default is None). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). - iters (int): Number of iterations (default is 200). - seqlen (int): Length of the sequence. - nsamples (int): Number of samples (default is 128). - sampler (str): The sampling method (default is "rand"). - seed (int): The random seed (default is 42). - nblocks (int): Number of blocks (default is 1). - gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). - not_use_best_mse (bool): Whether to use mean squared error (default is False). - dynamic_max_gap (int): The dynamic maximum gap (default is -1). - data_type (str): The data type to be used (default is "int"). - scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. - act_bits (int): Number of bits for activation quantization. Default is 16. - act_group_size (int): Group size for activation quantization. Default is None. - act_sym (bool): Whether to use symmetric activation quantization. Default is None. - act_data_type (str): Specifies the data type for activations. - Defaults to None, in which case it inherits the weight data type. - act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. - to_quant_block_names (str|list): A string or list whose elements are list of - block's layer names to be quantized. - enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning - enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer function - **kwargs: Additional keyword arguments. - - Returns: - The quantized model. 
- """ - - bits: int | None - group_size: int | None - sym: bool | None - data_type: str | None - act_bits: int | None - act_group_size: int | None - act_sym: bool | None - act_data_type: str | None - act_dynamic: bool | None - super_bits: int | None - super_group_size: int | None - - def __init__( - self, - model: Union[torch.nn.Module, str], - tokenizer=None, - platform="hf", - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, - dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", - iters: int = 200, - seqlen: int = 2048, - nsamples: int = 128, - batch_size: int = 8, - gradient_accumulate_steps: int = 1, - low_gpu_mem_usage: bool = False, - device_map: Union[str, int, torch.device, dict] = 0, - enable_torch_compile: bool = False, - seed: int = 42, - optimizer="AdamW", - **kwargs, - ): - super(AdamCompressor, self).__init__( - model=model, - tokenizer=tokenizer, - platform=platform, - scheme=scheme, - layer_config=layer_config, - batch_size=batch_size, - dataset=dataset, - low_gpu_mem_usage=low_gpu_mem_usage, - iters=iters, - seqlen=seqlen, - nsamples=nsamples, - seed=seed, - gradient_accumulate_steps=gradient_accumulate_steps, - enable_torch_compile=enable_torch_compile, - device_map=device_map, - **kwargs, - ) - - self.optimizer = self._get_optimizer(optimizer) - - def _get_optimizer(self, optimizer): - if optimizer is None: - optimizer = torch.optim.AdamW - elif isinstance(optimizer, str): - optimizer = getattr(torch.optim, optimizer) - else: - optimizer = optimizer - return optimizer - - def _get_scaler(self): - scaler = None - if self.amp and not check_is_cpu(self.device): - from torch.cuda.amp import GradScaler - - scaler = GradScaler(init_scale=1024, growth_interval=100000) - return scaler - - def _scale_loss_and_backward(self, scaler, loss): - if scaler is not None: - loss = scaler.scale(loss) - - loss.backward() - if is_hpex_available(): - htcore.mark_step() - return loss - - def _step(self, scaler, optimizer, lr_schedule): - if scaler is not None: - scaler.step(optimizer) - optimizer.zero_grad() - lr_schedule.step() - scaler.update() - else: - optimizer.step() - optimizer.zero_grad() - lr_schedule.step() - if is_hpex_available(): - htcore.mark_step() diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py old mode 100644 new mode 100755 index b59e7354f..67a5092f8 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -256,6 +256,7 @@ def __init__( model_dtype = kwargs.pop("model_dtype", None) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False + self.enable_adam = kwargs.pop("enable_adam") if "enable_adam" in kwargs else False self.quantized = False if isinstance(model, str): model, tokenizer = llm_load_model( @@ -413,7 +414,6 @@ def __init__( self.not_use_best_mse = not_use_best_mse self.dynamic_max_gap = dynamic_max_gap self.lr_scheduler = lr_scheduler - self.optimizer = self._get_optimizer(None) self.disable_opt_rtn = disable_opt_rtn # Whether to pack the layer immediately after tuning @@ -916,19 +916,6 @@ def _immediate_pack(self, name: str): image_processor=self.image_processor if hasattr(self, "image_processor") else None, ) - def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: - keys = inputs.keys() - input_id_str = [key for key in keys if key.startswith("hidden_state")] - if len(input_id_str) != 1: - 
raise RuntimeError( - "hidden_states arg mismatch error," - "please raise an issue in https://github.com/intel/auto-round/issues" - ) - inputs["input_ids"] = inputs.pop(input_id_str[0], None) - if q_inputs is not None: - q_inputs = q_inputs.pop(input_id_str[0], None) - return inputs, q_inputs - def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True): fill_default_value = True if self.is_auto_scheme: @@ -1600,7 +1587,7 @@ def quantize_block( amp=self.amp, amp_dtype=self.amp_dtype, cache_device=self.cache_device, - diffusion=self.diffusion, + is_diffusion=self.diffusion, ) ar_quantizer = ARQuantizer(self) return ar_quantizer.quantize_block(block, input_ids, input_others, q_input, device, auto_offload) @@ -1718,55 +1705,6 @@ def _set_amp_dtype(self) -> None: self.amp_dtype = torch.float32 self.model = self.model.to(torch.float32) - def _get_optimizer(self, optimizer: Any): - """Returns the specified optimizer. In SignRound, we fix the optimizer. - - Args: - optimizer: The optimizer to be used. - - Returns: - The specified optimizer. - """ - return SignSGD - - def _get_scaler(self): - """Returns scaler, in SignRound, no need to use scaler.""" - return None - - def _scale_loss_and_backward(self, scaler: Any, loss: torch.Tensor) -> torch.Tensor: - """Scales the loss and performs backward pass. - - Args: - scaler: The scaler to be used. - loss: The loss to be scaled. - - Returns: - The scaled loss. - """ - scale_loss = loss * 1000 - scale_loss.backward() - if is_hpex_available(): - htcore.mark_step() - return scale_loss - - def _step(self, scaler: Any, optimizer: Any, lr_schedule: Any): - """Performs a step in the optimization process. - - Args: - scaler: The scaler to be used. - optimizer: The optimizer for the step. - lr_schedule: The learning rate schedule. 
- - Returns: - None - """ - optimizer.step() - # for hpu - if is_hpex_available(): - htcore.mark_step() - optimizer.zero_grad() - lr_schedule.step() - @classmethod @torch.no_grad() def _sampling_inputs( diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py old mode 100644 new mode 100755 index 0bfbf038c..ec03c8a84 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -165,13 +165,6 @@ def __init__( **kwargs, ) - def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: - # flux transformer model's blocks will update hidden_states and encoder_hidden_states - input_id_str = [key for key in inputs.keys() if "hidden_state" in key] - if q_inputs is not None: - q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str} - return inputs, q_inputs - def _get_current_q_output( self, block: torch.nn.Module, diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py old mode 100644 new mode 100755 index db95d9547..4d240b3cc --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -957,13 +957,19 @@ def immediate_saving(rounder: object, m: torch.nn.Module, name: str = None, last import json from collections import OrderedDict + from auto_round.quantizers.utils import get_quantized_layer_names_outside_blocks from auto_round.utils import clear_memory, get_module # User configurable (can be preset on rounder) max_shard_size = getattr(rounder, "max_shard_size", "5GB") safe_serialization = getattr(rounder, "safe_serialization", True) if not hasattr(rounder, "quantized_layer_names_outside_blocks"): - rounder.quantized_layer_names_outside_blocks = rounder._get_quantized_layer_names_outside_blocks() + rounder.quantized_layer_names_outside_blocks = get_quantized_layer_names_outside_blocks( + rounder.model, + rounder.layer_config, + rounder.supported_types, + rounder.quant_block_list, + ) layer_names = rounder.quantized_layer_names_outside_blocks if len(layer_names) > 0 and name != layer_names[-1]: last_group = False diff --git a/auto_round/quantizers/algs/auto_round.py b/auto_round/quantizers/algs/auto_round.py old mode 100644 new mode 100755 index db3330088..8d0577b88 --- a/auto_round/quantizers/algs/auto_round.py +++ b/auto_round/quantizers/algs/auto_round.py @@ -38,16 +38,21 @@ preprocess_block_inputs, quantize_embedding_layer, register_act_max_hook, + update_inputs, ) +from auto_round.sign_sgd import SignSGD from auto_round.utils import ( + check_is_cpu, check_to_quantized, clear_memory, convert_fp8_layer_to_linear, get_block_names, get_module, + htcore, is_auto_device_mapping, is_fp8_linear, is_fp8_model, + is_hpex_available, memory_monitor, mv_module_from_gpu, set_amax_for_all_moe_layers, @@ -68,43 +73,42 @@ class ARQuantizer(AlgsBaseQuantizer): def __init__(self, compressor: "BaseCompressor"): super().__init__(compressor) + self.all_blocks = [] + self.layer_names = [] + self.all_q_inputs = None + self.optimizer = self._get_optimizer(None) + self.is_adam = False - def pre_quantize(self, *args, **kwargs): - return super().pre_quantize(*args, **kwargs) - - def quantize(self, *args, **kwargs): + def _pre_quantize_impl(self, *args, **kwargs): if bool(self.compressor.quant_block_list): - all_blocks = self.compressor.quant_block_list + self.all_blocks = self.compressor.quant_block_list else: - all_blocks = get_block_names(self.compressor.model) - - if len(all_blocks) == 0: + self.all_blocks = get_block_names(self.compressor.model) + if 
len(self.all_blocks) == 0: logger.warning("could not find blocks, exit with original model") return self.compressor.model, self.compressor.layer_config if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: self.compressor.model = self.compressor.model.to(self.compressor.amp_dtype) - layer_names = get_quantized_layer_names_outside_blocks( + self.layer_names = get_quantized_layer_names_outside_blocks( model=self.compressor.model, layer_config=self.compressor.layer_config, supported_types=self.compressor.supported_types, quant_block_list=self.compressor.quant_block_list, ) - start_time = time.time() - all_first_block_names = [block[0] for block in all_blocks] - if len(layer_names) > 0: + self.all_first_block_names = [block[0] for block in self.all_blocks] + if len(self.layer_names) > 0: logger.info( - "Starting to cache block inputs. This may be slow due to external block layers: %s", layer_names + "Starting to cache block inputs. This may be slow due to external block layers: %s", self.layer_names ) else: logger.info("start to cache block inputs") # TODO: refactor this - all_inputs = self.compressor.try_cache_inter_data_gpucpu( - all_first_block_names, self.compressor.nsamples, layer_names=layer_names + self.all_inputs = self.compressor.try_cache_inter_data_gpucpu( + self.all_first_block_names, self.compressor.nsamples, layer_names=self.layer_names ) - is_quantized_embedding = quantize_embedding_layer( model=self.compressor.model, layer_config=self.compressor.layer_config, @@ -114,13 +118,12 @@ def quantize(self, *args, **kwargs): device_list=self.compressor.device_list, ) clear_memory(device_list=self.compressor.device_list) - all_q_inputs = None if is_quantized_embedding: - all_inputs = copy.deepcopy(self.compressor.inputs) + self.all_inputs = copy.deepcopy(self.compressor.inputs) clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) # TODO: refactor this - all_q_inputs = self.compressor.try_cache_inter_data_gpucpu( - all_first_block_names, self.compressor.nsamples, layer_names=layer_names + self.all_q_inputs = self.compressor.try_cache_inter_data_gpucpu( + self.all_first_block_names, self.compressor.nsamples, layer_names=self.layer_names ) self.compressor.model = mv_module_from_gpu(self.compressor.model) clear_memory(device_list=self.compressor.device_list) @@ -130,21 +133,24 @@ def quantize(self, *args, **kwargs): self.compressor.model ) # self.compressor.model.hf_device_map has not been changed logger.info("caching done") - if len(all_blocks) > 1: - pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.compressor.nblocks)) + + def _quantize_impl(self, *args, **kwargs): + start_time = time.time() + + if len(self.all_blocks) > 1: + pbar = tqdm(range(0, sum([len(i) for i in self.all_blocks]), self.compressor.nblocks)) else: - pbar = tqdm(range(0, len(all_blocks[0]), self.compressor.nblocks)) # move the alg warning outside pbar + pbar = tqdm(range(0, len(self.all_blocks[0]), self.compressor.nblocks)) # move the alg warning outside pbar - for block_names in all_blocks: - inputs = all_inputs[block_names[0]] - all_inputs.pop(block_names[0]) + for block_names in self.all_blocks: + inputs = self.all_inputs[block_names[0]] + self.all_inputs.pop(block_names[0]) q_inputs = None - if all_q_inputs is not None: - q_inputs = all_q_inputs[block_names[0]] - all_q_inputs.pop(block_names[0]) + if self.all_q_inputs is not None: + q_inputs = self.all_q_inputs[block_names[0]] + self.all_q_inputs.pop(block_names[0]) - # TODO: refactor this - inputs, 
q_inputs = self.compressor._update_inputs(inputs, q_inputs) + inputs, q_inputs = update_inputs(inputs, q_inputs, self.compressor.diffusion) clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) @@ -170,8 +176,15 @@ def quantize(self, *args, **kwargs): ) pbar.set_description("Quantizing done") pbar.close() - self._quantize_layers(layer_names, all_inputs) + self._quantize_layers(self.layer_names, self.all_inputs) + end_time = time.time() + cost_time = end_time - start_time + logger.info(f"quantization tuning time {cost_time}") + + return self.compressor.model, self.compressor.layer_config + + def _post_quantize_impl(self, *args, **kwargs): if is_fp8_model(self.compressor.model): for n, m in self.compressor.model.named_modules(): if is_fp8_linear(m): @@ -180,10 +193,6 @@ def quantize(self, *args, **kwargs): ) set_module(self.compressor.model, n, new_layer) - end_time = time.time() - cost_time = end_time - start_time - logger.info(f"quantization tuning time {cost_time}") - # Dump a summary quantized_layers = [] unquantized_layers = [] @@ -203,7 +212,6 @@ def quantize(self, *args, **kwargs): logger.info(summary_info) self.compressor.quantized = True - return self.compressor.model, self.compressor.layer_config def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: """Quantizes specified layers based on inputs and configuration. @@ -342,7 +350,7 @@ def _quantize_blocks( amp=self.compressor.amp, amp_dtype=self.compressor.amp_dtype, cache_device=self.compressor.cache_device, - diffusion=self.compressor.diffusion, + is_diffusion=self.compressor.diffusion, ) if pbar is None: @@ -363,23 +371,8 @@ def _quantize_blocks( m.config = model.config if hasattr(model, "config") else None q_input, input_ids = self.quantize_block( - m, - input_ids, - input_others, - q_input=q_input, - device=device, + m, input_ids, input_others, q_input=q_input, device=device, last_group=(i + nblocks) >= len(block_names) ) - if hasattr(model, "config"): - del m.config - if self.compressor.immediate_packing: - for _, tmp_m in m.named_modules(): - if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): - continue - self.compressor._immediate_pack(tmp_m.tmp_name) - - if self.compressor.immediate_saving: - last_group = (i + nblocks) >= len(block_names) - immediate_saving(self.compressor, m, last_group=last_group) if pbar is not None: pbar.update(1) @@ -396,7 +389,9 @@ def _quantize_blocks( clear_memory(device_list=self.compressor.device_list) - def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu"): + def _quantize_layer_impl( + self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu" + ): """Quantize a specific layer of the model using the provided inputs. Args: @@ -462,12 +457,11 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. 
lr = torch.tensor(self.compressor.lr) minmax_lr = torch.tensor(self.compressor.minmax_lr) if self.compressor.enable_minmax_tuning: - optimizer = self.compressor.optimizer( + optimizer = self.optimizer( [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 ) else: - optimizer = self.compressor.optimizer(round_params, lr=lr, weight_decay=0) - + optimizer = self.optimizer([{"params": round_params}], lr=lr, weight_decay=0) if self.compressor.lr_scheduler is None: lr_schedule = torch.optim.lr_scheduler.LinearLR( optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.compressor.iters @@ -477,7 +471,7 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. nsamples = len(inputs) last_best_iter = 0 best_loss = torch.finfo(torch.float).max - scaler = self.compressor._get_scaler() # pylint: disable=assignment-from-none + scaler = self._get_scaler() # pylint: disable=assignment-from-none init_loss = None gradient_accumulate_steps = self.compressor.batch_size # Force to low gpu total_loss = 0 @@ -546,7 +540,7 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. num_elm = 1 if num_elm <= 0 else num_elm total_loss += loss.item() / num_elm - self.compressor.scale_loss_and_backward(scaler, loss) + self._scale_loss_and_backward(scaler, loss) if i == 0: init_loss = total_loss @@ -561,7 +555,7 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. if not self.compressor.not_use_best_mse: if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: break - self.compressor._step(scaler, optimizer, lr_schedule) + self._step(scaler, optimizer, lr_schedule) last_loss = total_loss best_iter = self.compressor.iters @@ -574,7 +568,7 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" logger.info(dump_info) - def quantize_block( + def _quantize_block_impl( self, block: torch.nn.Module, input_ids: Union[list[torch.Tensor], dict], @@ -582,6 +576,7 @@ def quantize_block( q_input: Union[torch.Tensor, dict, None] = None, device: Union[str, torch.device] = "cpu", auto_offload=True, + **kwargs, ): """Quantize the weights of a given block of the model. 
@@ -713,9 +708,8 @@ def quantize_block( lr = torch.tensor(self.compressor.lr) minmax_lr = torch.tensor(self.compressor.minmax_lr) - is_adam = "adam" in self.compressor.__class__.__name__.lower() - extra_kwargs = {} if is_adam else {"momentum": self.compressor.momentum} + extra_kwargs = {} if self.is_adam else {"momentum": self.compressor.momentum} if self.compressor.enable_minmax_tuning: params = [ @@ -725,7 +719,7 @@ def quantize_block( else: params = round_params - optimizer = self.compressor.optimizer( + optimizer = self.optimizer( params, lr=lr, weight_decay=0, @@ -760,7 +754,7 @@ def quantize_block( if self.compressor.gradient_accumulate_steps != 1: mse_reduction = "sum" mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) - scaler = self.compressor._get_scaler() # pylint: disable=assignment-from-none + scaler = self._get_scaler() # pylint: disable=assignment-from-none init_loss = None best_params = {} total_loss = 0 @@ -801,7 +795,7 @@ def quantize_block( # clear memory to avoid OOM due to memory fragmentation clear_memory_if_reached_threshold(threshold=0.5, device_list=self.compressor.device_list) - self.compressor._scale_loss_and_backward(scaler, loss) + self._scale_loss_and_backward(scaler, loss) if self.compressor.low_gpu_mem_usage and card_0_in_high_risk: # clear memory to avoid OOM due to memory fragmentation clear_memory_if_reached_threshold(threshold=0.8, device_list=self.compressor.device_list) @@ -822,7 +816,7 @@ def quantize_block( if not self.compressor.not_use_best_mse: if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: break - self.compressor._step(scaler, optimizer, lr_schedule) + self._step(scaler, optimizer, lr_schedule) last_loss = total_loss best_iter = self.compressor.iters @@ -883,6 +877,26 @@ def quantize_block( return None, output + def _post_quantize_block_impl(self, block: torch.nn.Module, *args, last_group: bool, **kwargs): + """Post-process after quantizing a block. + + Args: + block: The block of the model that was quantized. + + Returns: + None + """ + if hasattr(block, "config"): + del block.config + if self.compressor.immediate_packing: + for _, tmp_m in block.named_modules(): + if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): + continue + self.compressor._immediate_pack(tmp_m.tmp_name) + + if self.compressor.immediate_saving: + immediate_saving(self.compressor, block, last_group=last_group) + @staticmethod def _get_current_output( output: list[torch.Tensor], indices: list[int], batch_dim: int, diffusion: bool = False @@ -896,3 +910,100 @@ def _get_current_output( current_output = [output[x] for x in indices] current_output = torch.cat(current_output, dim=batch_dim) return current_output + + def _step(self, scaler: Any, optimizer: Any, lr_schedule: Any): + """Performs a step in the optimization process. + + Args: + scaler: The scaler to be used. + optimizer: The optimizer for the step. + lr_schedule: The learning rate schedule. + + Returns: + None + """ + optimizer.step() + # for hpu + if is_hpex_available(): + htcore.mark_step() + optimizer.zero_grad() + lr_schedule.step() + + def _scale_loss_and_backward(self, scaler: Any, loss: torch.Tensor) -> torch.Tensor: + """Scales the loss and performs backward pass. + + Args: + scaler: The scaler to be used. + loss: The loss to be scaled. + + Returns: + The scaled loss. 
+ """ + scale_loss = loss * 1000 + scale_loss.backward() + if is_hpex_available(): + htcore.mark_step() + return scale_loss + + def _get_scaler(self): + """Returns scaler, in SignRound, no need to use scaler.""" + return None + + def _get_optimizer(self, optimizer: Any): + """Returns the specified optimizer. In SignRound, we fix the optimizer. + + Args: + optimizer: The optimizer to be used. + + Returns: + The specified optimizer. + """ + return SignSGD + + +class ARAdamQuantizer(ARQuantizer): + """AutoRound Quantizer with Adam optimizer.""" + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + self.optimizer = self._get_optimizer("AdamW") + self.is_adam = True + + def _step(self, scaler, optimizer, lr_schedule): + if scaler is not None: + scaler.step(optimizer) + optimizer.zero_grad() + lr_schedule.step() + scaler.update() + else: + optimizer.step() + optimizer.zero_grad() + lr_schedule.step() + if is_hpex_available(): + htcore.mark_step() + + def _scale_loss_and_backward(self, scaler, loss): + if scaler is not None: + loss = scaler.scale(loss) + + loss.backward() + if is_hpex_available(): + htcore.mark_step() + return loss + + def _get_scaler(self): + scaler = None + if self.compressor.amp and not check_is_cpu(self.compressor.device): + from torch.cuda.amp import GradScaler + + scaler = GradScaler(init_scale=1024, growth_interval=100000) + return scaler + + def _get_optimizer(self, optimizer): + if optimizer is None: + optimizer = torch.optim.AdamW + elif isinstance(optimizer, str): + optimizer = getattr(torch.optim, optimizer) + else: + optimizer = optimizer + return optimizer diff --git a/auto_round/quantizers/algs/base.py b/auto_round/quantizers/algs/base.py old mode 100644 new mode 100755 index 62e97c134..bcb112094 --- a/auto_round/quantizers/algs/base.py +++ b/auto_round/quantizers/algs/base.py @@ -21,7 +21,31 @@ from auto_round.compressors.base import BaseCompressor -class AlgsBaseQuantizer(BaseQuantizer): +class AlgsBaseQuantizer(BaseQuantizer, ABC): + def _pre_quantize_impl(self, *args, **kwargs): + pass + @abstractmethod - def quantize(self, *args, **kwargs): + def _quantize_impl(self, *args, **kwargs): + pass + + def _post_quantize_impl(self, *args, **kwargs): + pass + + def _pre_quantize_layer_impl(self, *args, **kwargs): + pass + + def _quantize_layer_impl(self, *args, **kwargs): + pass + + def _post_quantize_layer_impl(self, *args, **kwargs): + pass + + def _pre_quantize_block_impl(self, *args, **kwargs): + pass + + def _quantize_block_impl(self, *args, **kwargs): + pass + + def _post_quantize_block_impl(self, *args, **kwargs): pass diff --git a/auto_round/quantizers/algs/rtn.py b/auto_round/quantizers/algs/rtn.py old mode 100644 new mode 100755 index f1b4676f4..d70ab6a76 --- a/auto_round/quantizers/algs/rtn.py +++ b/auto_round/quantizers/algs/rtn.py @@ -72,7 +72,7 @@ def __init__(self, compressor: "BaseCompressor"): n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) ] - def pre_quantize(self, *args, **kwargs): + def _pre_quantize_impl(self, *args, **kwargs): if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: self.compressor.model.to(self.compressor.amp_dtype) @@ -96,7 +96,7 @@ def pre_quantize(self, *args, **kwargs): logger.info("Finished updating fused layer global scales.") @torch.inference_mode() - def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: """Quantize all 
modules in the model using RTN (Round-To-Nearest) strategy. If the target format includes GGUF with `k`, and optimized RTN is enabled, @@ -172,9 +172,6 @@ def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: self.compressor.quantized = True return self.compressor.model, self.compressor.layer_config - def post_quantize(self, *args, **kwargs): - pass - def _quantize_via_rtn_blockwise(self) -> None: """Quantize model layers block by block using cached inputs.""" @@ -416,7 +413,7 @@ def get_imatrix_hook(module, input, output): return hook_handles @torch.inference_mode() - def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: enable_imatrix = False has_gguf_k = ( any(fmt.is_gguf() and "k" in fmt.output_format for fmt in getattr(self.compressor, "formats", [])) @@ -434,7 +431,7 @@ def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: self.compressor.quantized = True return self.compressor.model, self.compressor.layer_config else: - return super().quantize(*args, **kwargs) + return super()._quantize_impl(*args, **kwargs) def _quant_rtn_with_imatrix(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: """Quantize all modules in the model using Optimized RTN strategy. diff --git a/auto_round/quantizers/base.py b/auto_round/quantizers/base.py old mode 100644 new mode 100755 index 1e1a73f17..d4f70a7ef --- a/auto_round/quantizers/base.py +++ b/auto_round/quantizers/base.py @@ -23,29 +23,81 @@ class BaseQuantizer(ABC): def __init__(self, compressor: "BaseCompressor"): self.compressor = compressor + def __mro_call(self, method_name: str, *args, **kwargs): + for cls in type(self).mro(): + method = cls.__dict__.get(method_name, None) + if method: + method(self, *args, **kwargs) + def pre_quantize(self, *args, **kwargs): + self.__mro_call("_pre_quantize_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_impl(self, *args, **kwargs): pass def quantize(self, *args, **kwargs): + self.pre_quantize(*args, **kwargs) + self._quantize_impl(*args, **kwargs) + self.post_quantize(*args, **kwargs) + return self.compressor.model, self.compressor.layer_config + + @abstractmethod + def _quantize_impl(self, *args, **kwargs): pass def post_quantize(self, *args, **kwargs): + self.__mro_call("_post_quantize_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_impl(self, *args, **kwargs): pass def pre_quantize_layer(self, *args, **kwargs): + self.__mro_call("_pre_quantize_layer_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_layer_impl(self, *args, **kwargs): pass def quantize_layer(self, *args, **kwargs): + self.pre_quantize_layer(*args, **kwargs) + result = self._quantize_layer_impl(*args, **kwargs) + self.post_quantize_layer(*args, **kwargs) + return result + + @abstractmethod + def _quantize_layer_impl(self, *args, **kwargs): pass def post_quantize_layer(self, *args, **kwargs): pass + self.__mro_call("_post_quantize_layer_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_layer_impl(self, *args, **kwargs): + pass def pre_quantize_block(self, *args, **kwargs): + self.__mro_call("_pre_quantize_block_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_block_impl(self, *args, **kwargs): pass def quantize_block(self, *args, **kwargs): + self.pre_quantize_block(*args, **kwargs) + result = self._quantize_block_impl(*args, **kwargs) + self.post_quantize_block(*args, **kwargs) + return result 
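[Editor's note] The `__mro_call` helper added to `BaseQuantizer` above walks `type(self).mro()` and invokes a hook only when a class defines it directly in its own `__dict__`, so every composed layer's pre/post hook runs rather than only the most-derived override. A standalone illustration of that dispatch is given below; all names in it are made up for the example.

```python
# Sketch of MRO-walking hook dispatch, mirroring BaseQuantizer.__mro_call.
class Base:
    def _hook(self):
        print("Base hook")

    def run_hooks(self):
        # Call each class's own _hook found directly in its __dict__, walking
        # the MRO so mixin layers are not skipped by normal overriding.
        for cls in type(self).mro():
            impl = cls.__dict__.get("_hook")
            if impl is not None:
                impl(self)


class MixinA(Base):
    def _hook(self):
        print("MixinA hook")


class MixinB:
    def _hook(self):
        print("MixinB hook")


Composed = type("Composed", (MixinB, MixinA), {})
Composed().run_hooks()  # prints: MixinB hook, MixinA hook, Base hook
```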
+ + @abstractmethod + def _quantize_block_impl(self, *args, **kwargs): pass def post_quantize_block(self, *args, **kwargs): + self.__mro_call("_post_quantize_block_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_block_impl(self, *args, **kwargs): pass diff --git a/auto_round/quantizers/entrypoint.py b/auto_round/quantizers/entrypoint.py old mode 100644 new mode 100755 index 473dcfca1..34745ca46 --- a/auto_round/quantizers/entrypoint.py +++ b/auto_round/quantizers/entrypoint.py @@ -16,14 +16,14 @@ if TYPE_CHECKING: from auto_round.compressors.base import BaseCompressor -from auto_round.quantizers.algs.auto_round import ARQuantizer +from auto_round.quantizers.algs.auto_round import ARAdamQuantizer, ARQuantizer from auto_round.quantizers.algs.rtn import OptRTNQuantizer, RTNQuantizer class AutoRoundQuantizer: def __new__(cls, compressor: "BaseCompressor", dynamic_quantizers: dict = None): assert dynamic_quantizers is not None, "Please provide dynamic_quantizers dict." - quantizer_cls = type("AutoRoundQuantizer", tuple(dynamic_quantizers.values()), {}) + quantizer_cls = type("AutoRoundQuantizer", (dynamic_quantizers["data_type"], dynamic_quantizers["algs"]), {}) return quantizer_cls(compressor) @@ -33,9 +33,7 @@ def __init__(self, quantizers: list[AutoRoundQuantizer]): def quantize(self, *args, **kwargs): for quantizer in self.quantizers: - quantizer.pre_quantize(*args, **kwargs) model, layer_config = quantizer.quantize(*args, **kwargs) - quantizer.post_quantize(*args, **kwargs) return model, layer_config @@ -43,7 +41,7 @@ def create_quantizers(compressor: "BaseCompressor"): alg_cls = None if compressor.iters > 0: - alg_cls = ARQuantizer + alg_cls = ARQuantizer if compressor.enable_adam is False else ARAdamQuantizer else: alg_cls = OptRTNQuantizer if compressor.disable_opt_rtn is False else RTNQuantizer diff --git a/auto_round/quantizers/utils.py b/auto_round/quantizers/utils.py old mode 100644 new mode 100755 index 3ea0c009f..a6f1dd4bc --- a/auto_round/quantizers/utils.py +++ b/auto_round/quantizers/utils.py @@ -220,8 +220,8 @@ def get_non_zero_cnt(tensor: list[torch.Tensor], indices: list[int]) -> int: return non_zero_cnt -def split_inputs(inputs: dict, first_input_name: str, diffusion: bool = False) -> tuple[torch.Tensor, dict]: - if diffusion: +def split_inputs(inputs: dict, first_input_name: str, is_diffusion: bool = False) -> tuple[torch.Tensor, dict]: + if is_diffusion: input_id_str = [key for key in inputs.keys() if "hidden_state" in key] input_ids = {k: inputs.pop(k, None) for k in input_id_str} input_others = inputs @@ -240,9 +240,9 @@ def preprocess_block_inputs( amp: bool = False, amp_dtype: torch.dtype = torch.float32, cache_device: Union[str, torch.device] = "cpu", - diffusion: bool = False, + is_diffusion: bool = False, ): - input_ids, input_others = split_inputs(inputs, first_input_name, diffusion=diffusion) + input_ids, input_others = split_inputs(inputs, first_input_name, is_diffusion=is_diffusion) clear_memory(device_list=device_list) input_ids = to_device(input_ids, cache_device) input_others = to_device(input_others, cache_device) @@ -260,3 +260,23 @@ def preprocess_block_inputs( for i in range(len(input_others[key])): to_dtype(input_others[key][i], tmp_dtype) return input_ids, input_others + + +def update_inputs(inputs: dict, q_inputs: dict, is_diffusion: bool) -> tuple[dict, dict]: + if is_diffusion: + input_id_str = [key for key in inputs.keys() if "hidden_state" in key] + if q_inputs is not None: + q_inputs = {k: q_inputs.pop(k, None) for k in 
input_id_str} + return inputs, q_inputs + else: + keys = inputs.keys() + input_id_str = [key for key in keys if key.startswith("hidden_state")] + if len(input_id_str) != 1: + raise RuntimeError( + "hidden_states arg mismatch error," + "please raise an issue in https://github.com/intel/auto-round/issues" + ) + inputs["input_ids"] = inputs.pop(input_id_str[0], None) + if q_inputs is not None: + q_inputs = q_inputs.pop(input_id_str[0], None) + return inputs, q_inputs diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py old mode 100644 new mode 100755 index 4ba59629e..6f736eb98 --- a/test/test_cpu/export/test_gguf_format.py +++ b/test/test_cpu/export/test_gguf_format.py @@ -29,7 +29,7 @@ def teardown_class(self): def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {tiny_gemma_model_path} " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {tiny_gemma_model_path} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -37,7 +37,7 @@ def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {tiny_qwen_model_path}" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {tiny_qwen_model_path}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: @@ -165,7 +165,7 @@ def test_all_format(self, tiny_qwen_model_path): # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {model_name} " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {model_name} " f" --bs 16 --iters 1 --nsamples 1 --seqlen 16 --format {gguf_format}" ) if res > 0 or res == -1: @@ -173,7 +173,7 @@ def test_all_format(self, tiny_qwen_model_path): shutil.rmtree("../../tmp_autoround", ignore_errors=True) res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {model_name}" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --format fake,{gguf_format}" ) if res > 0 or res == -1: @@ -182,7 +182,7 @@ def test_all_format(self, tiny_qwen_model_path): # test mixed q2_k_s res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {model_name}" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --scheme GGUF:Q2_K_MIXED" ) if res > 0 or res == -1: diff --git a/test/test_cpu/utils/test_cli_usage.py b/test/test_cpu/utils/test_cli_usage.py old mode 100644 new mode 100755 index 6ba676936..f57e9c102 --- a/test/test_cpu/utils/test_cli_usage.py +++ b/test/test_cpu/utils/test_cli_usage.py @@ -25,24 +25,24 @@ def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): python_path = sys.executable # Test llm script - res = os.system(f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round -h") + res = os.system(f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round -h") if res > 0 or res == -1: 
assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -50,23 +50,23 @@ def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): # test mllm script # test auto_round_mllm --eval help - res = os.system(f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --eval -h") + res = os.system(f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --eval -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test auto_round_mllm --lmms help - res = os.system(f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --eval --lmms -h") + res = os.system(f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --eval --lmms -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" " --quant_nontext_module --output_dir ./saved " ) if res > 0 or res == -1: diff --git a/test/test_cuda/advanced/test_multiple_card_calib.py b/test/test_cuda/advanced/test_multiple_card_calib.py old mode 100644 new mode 100755 index 06c869a86..8eef8b981 --- a/test/test_cuda/advanced/test_multiple_card_calib.py +++ b/test/test_cuda/advanced/test_multiple_card_calib.py @@ -44,7 +44,7 @@ def 
test_multiple_card_calib(self): ##test llm script res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -55,7 +55,7 @@ def test_multiple_card_nvfp4(self): ##test llm script res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cuda/export/test_gguf.py b/test/test_cuda/export/test_gguf.py old mode 100644 new mode 100755 index a6089a086..454a2c6ce --- a/test/test_cuda/export/test_gguf.py +++ b/test/test_cuda/export/test_gguf.py @@ -59,7 +59,7 @@ def test_gguf_format(self, tiny_qwen_model_path, dataloader): save_dir = os.path.join(os.path.dirname(__file__), "saved") res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) diff --git a/test/test_cuda/models/test_support_vlms.py b/test/test_cuda/models/test_support_vlms.py old mode 100644 new mode 100755 index 4f9373ca5..a339ba69a --- a/test/test_cuda/models/test_support_vlms.py +++ b/test/test_cuda/models/test_support_vlms.py @@ -30,7 +30,7 @@ def test_qwen2(self): model_path = "/models/Qwen2-VL-2B-Instruct/" # test tune res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {self.python_path} -m auto_round --mllm " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "qwen2 tuning fail" @@ -85,7 +85,7 @@ def test_phi3(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {self.python_path} -m auto_round --mllm " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" @@ -133,7 +133,7 @@ def test_phi3_vision_awq(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {self.python_path} -m auto_round --mllm " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --quant_nontext_module " f"--nsample 64 --seqlen 32 " f"--format auto_awq --output_dir {self.save_dir} --device {self.device}" @@ -181,7 +181,7 @@ def test_glm(self): model_path = "/models/glm-4v-9b/" ## test tune res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' 
{self.python_path} -m auto_round " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "glm-4v-9b tuning fail" @@ -190,7 +190,7 @@ def test_granite_vision(self): model_path = "/models/granite-vision-3.2-2b" ## test tune res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {self.python_path} -m auto_round " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "granite-vision-3.2-2b tuning fail" diff --git a/test/test_cuda/utils/test_alg_ext.py b/test/test_cuda/utils/test_alg_ext.py old mode 100644 new mode 100755 index d2e8e4bfb..16f42bda5 --- a/test/test_cuda/utils/test_alg_ext.py +++ b/test/test_cuda/utils/test_alg_ext.py @@ -52,13 +52,13 @@ def test_cli(self, tiny_opt_model_path): python_path = sys.executable res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" From 85757811b2f496c07c6e7d7324c198165d87f6aa Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 27 Jan 2026 14:54:19 +0800 Subject: [PATCH 5/5] fix Signed-off-by: n1ck-guo --- auto_round/quantizers/algs/rtn.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/auto_round/quantizers/algs/rtn.py b/auto_round/quantizers/algs/rtn.py index 1d6710f85..585c2631a 100755 --- a/auto_round/quantizers/algs/rtn.py +++ b/auto_round/quantizers/algs/rtn.py @@ -39,7 +39,7 @@ check_to_quantized, clear_memory, convert_fp8_layer_to_linear, - convert_fp8_model_to_16b_model, + convert_fp8_module_to_16b, flatten_list, get_block_names, get_lm_head_name, @@ -190,7 +190,7 @@ def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, An materialize_model_(block) for name, m in block.named_modules(): if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.global_name, to_cpu=self.low_gpu_mem_usage) + self._quantize_layer_via_rtn(m.global_name, to_cpu=self.compressor.low_gpu_mem_usage) all_to_quantized_module_names.remove(m.global_name) elif ( not any(m.children()) @@ -198,10 +198,10 @@ def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, 
dict[str, An and m.global_name not in tied_weights_layers ): set_module(self.compressor.model, m.global_name, copy.deepcopy(m)) - if self.is_immediate_saving: + if self.compressor.is_immediate_saving: shard_writer(self, name=m.global_name) m.to("meta") - clear_memory(device_list=self.device_list) + clear_memory(device_list=self.compressor.device_list) memory_monitor.log_summary() pbar.update(1) cnt = 1 @@ -210,7 +210,7 @@ def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, An self._quantize_layer_via_rtn(name, to_cpu=True) cnt += 1 if cnt % 10 == 0: - clear_memory(device_list=self.device_list) + clear_memory(device_list=self.compressor.device_list) memory_monitor.log_summary() else: materialize_model_(self.model) @@ -231,8 +231,8 @@ def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, An cnt += 1 # Convert remaining fp8 if is_fp8_model(self.compressor.model): - convert_fp8_model_to_16b_model(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) - if self.is_immediate_saving: + convert_fp8_module_to_16b(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) + if self.compressor.is_immediate_saving: shard_writer(self, is_finalize=True) self.compressor.quantized = True return self.compressor.model, self.compressor.layer_config @@ -310,9 +310,7 @@ def _quantize_via_rtn_blockwise(self) -> None: materialize_model_(block) block.to("cpu") if is_fp8_model(self.compressor.model): - convert_fp8_model_to_16b_model( - block, dtype=self.compressor.amp_dtype, device=self.compressor.device - ) + convert_fp8_module_to_16b(block, dtype=self.compressor.amp_dtype, device=self.compressor.device) if is_auto_device_mapping(self.compressor.device_map) and len(self.compressor.device_list) > 1: set_auto_device_map_for_block_with_tuning( @@ -494,7 +492,7 @@ def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, An self._quant_rtn_with_imatrix(self.all_to_quantized_module_names) # Convert remaining fp8 if is_fp8_model(self.compressor.model): - convert_fp8_model_to_16b_model(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) + convert_fp8_module_to_16b(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) self.compressor.quantized = True return self.compressor.model, self.compressor.layer_config else:
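
Reviewer note: below is a minimal, self-contained sketch of the dynamic class composition that `AutoRoundQuantizer.__new__` in `auto_round/quantizers/entrypoint.py` now performs (building one class from a data-type mixin plus an algorithm mixin, with `create_quantizers` picking `ARQuantizer`/`ARAdamQuantizer` vs `RTNQuantizer`/`OptRTNQuantizer`). Everything named `Toy*` and `build_quantizer` here is a hypothetical stand-in for illustration only, not code from this patch.

    # Sketch only: ToyDataType / ToyRTNAlg / ToyCompressor are invented stand-ins.
    # The composition pattern mirrors AutoRoundQuantizer.__new__ in entrypoint.py.
    class ToyDataType:
        def quant_weight(self, w):
            # a real data_type mixin would round/scale the tensor here
            return w

    class ToyRTNAlg:
        def __init__(self, compressor):
            self.compressor = compressor

        def quantize(self):
            return f"RTN pass over {self.compressor.model_name}"

    class ToyCompressor:
        model_name = "tiny-model"
        iters = 0
        disable_opt_rtn = True

    def build_quantizer(compressor, dynamic_quantizers):
        # Same shape as AutoRoundQuantizer.__new__: create a fresh class whose bases
        # are (data_type mixin, algorithm mixin) so methods resolve through one MRO.
        quantizer_cls = type(
            "AutoRoundQuantizer",
            (dynamic_quantizers["data_type"], dynamic_quantizers["algs"]),
            {},
        )
        return quantizer_cls(compressor)

    q = build_quantizer(ToyCompressor(), {"data_type": ToyDataType, "algs": ToyRTNAlg})
    print([c.__name__ for c in type(q).__mro__])  # AutoRoundQuantizer, ToyDataType, ToyRTNAlg, object
    print(q.quantize())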