From 246f817c6be0dbdbb3d2d4e1a7ca6e3e02ee173b Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 14 Jan 2026 03:52:45 -0500 Subject: [PATCH 1/5] refactor rtn and tuning Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 1369 +---------------- .../compressors/diffusion/compressor.py | 12 - auto_round/quantizers/__init__.py | 15 + auto_round/quantizers/algs/auto_round.py | 888 +++++++++++ auto_round/quantizers/algs/base.py | 27 + auto_round/quantizers/algs/rtn.py | 627 ++++++++ auto_round/quantizers/base.py | 50 + auto_round/quantizers/entrypoint.py | 55 + auto_round/quantizers/readme.md | 21 + auto_round/quantizers/utils.py | 262 ++++ 10 files changed, 1961 insertions(+), 1365 deletions(-) create mode 100644 auto_round/quantizers/__init__.py create mode 100644 auto_round/quantizers/algs/auto_round.py create mode 100644 auto_round/quantizers/algs/base.py create mode 100644 auto_round/quantizers/algs/rtn.py create mode 100644 auto_round/quantizers/base.py create mode 100644 auto_round/quantizers/entrypoint.py create mode 100644 auto_round/quantizers/readme.md create mode 100644 auto_round/quantizers/utils.py diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 5d7ba6371..a383659d0 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -52,7 +52,6 @@ reset_params, set_layer_config, ) -from auto_round.data_type import QUANT_FUNC_WITH_DTYPE from auto_round.data_type.utils import reshape_pad_tensor_by_group_size from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG, ModelType from auto_round.formats import OutputFormat, get_formats @@ -902,281 +901,6 @@ def _get_save_folder_name(self, format: OutputFormat) -> str: return self.orig_output_dir - @torch.inference_mode() - def _quantize_embedding_layer(self): - """Quantizes embedding layers in the model according to the configuration. - - This method iterates through all modules in the model, identifies embedding - layers specified in `self.layer_config`, and applies the appropriate quantization - function based on bit precision, grouping strategy, and dtype. - - Returns: - bool: True if the quantization process completes without critical errors. 
- """ - is_quantized = False - for name, module in self.model.named_modules(): - # Skip non-Embedding modules or layers not in config - if not isinstance(module, torch.nn.Embedding) or name not in self.layer_config: - continue - - config = self.layer_config[name] - - # Skip layers that are not marked for quantization - if not check_to_quantized(config): - continue - is_quantized = True - config["scale_dtype"] = self.scale_dtype - dtype = config["data_type"] - - # Determine quantization function key with symmetry/asymmetry - if dtype not in QUANT_FUNC_WITH_DTYPE: - dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}" - - # Optionally use optimized rounding (RTN) variant - if not self.disable_opt_rtn and f"rtn_{dtype}" in QUANT_FUNC_WITH_DTYPE: - dtype = f"rtn_{dtype}" - - quant_func = QUANT_FUNC_WITH_DTYPE[dtype] - dtype = module.weight.dtype - # As typically float32 are used in RTN to search scale zp, - # to avoid cache a bf16 copy we'd better use float32 - if config.get("super_group_size", None) is not None: - dtype = torch.float32 - - # Attempt quantization on GPU, fall back to CPU if OOM - try: - weight, scale, zp = quant_func( - module.weight.to(dtype=dtype, device=self.device), - **{ - k: config.get(k, None) - for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] - }, - ) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - try: - logger.error(cuda_error_msg) - logger.warning("falling back to CPU") - weight, scale, zp = quant_func( - module.weight.to("cpu"), - **{ - k: config.get(k, None) - for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] - }, - ) - except Exception as e: - raise - - # Overwrite the module's weights with the quantized version - module.weight.data.copy_(weight.cpu()) - - # Attach scale and zero point (zp) to the module - for param_name, value in zip(["scale", "zp"], [scale, zp]): - if isinstance(value, dict): - for k, v in value.items(): - setattr(module, k if k == "scale" else f"w_{k}", v.cpu()) - elif isinstance(value, torch.Tensor): - setattr(module, param_name, value.cpu()) - else: - setattr(module, param_name, value) - - # Update config - self.layer_config.setdefault(name, {}).update(config) - del weight - del scale - del zp - clear_memory(device_list=self.device_list) - - return is_quantized - - def _quant_rtn_with_imatrix(self, all_to_quantized_module_names: list[str]) -> None: - """Performs RTN quantization using input activation statistics (imatrix). - - This method accumulates per-channel second-moment activation statistics (imatrix) - via forward hooks and uses them to perform RTN quantization. If CUDA memory runs out, - it falls back to CPU-based blockwise quantization. - - Args: - all_to_quantized_module_names (list[str]): - A list of module names (e.g., 'model.layers.0.self_attn.q_proj') to be quantized. 
- - Returns: - None - """ - logger.info("start to compute imatrix") - - # Load dataset - from auto_round.calib_dataset import get_dataloader - - if isinstance(self.dataset, str): - if self.tokenizer is None: - raise ValueError("A tokenizer must be set for the model when using a dataset string.") - dataset_name = self.dataset.replace(" ", "") - self.dataloader = get_dataloader( - self.tokenizer, self.seqlen, dataset_name, self.seed, self.batch_size, self.nsamples - ) - else: - self.dataloader = self.dataset - - model = self.model - - # Dispatch multi-GPU model if necessary - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - dispatch_model(model, model.hf_device_map) - - def register_act_hook(model): - """Registers hooks to accumulate activation squared norms into `imatrix`.""" - - def get_imatrix_hook(module, input, output): - input = input[0] if isinstance(input, (tuple, list)) else input - flattened = input.reshape(-1, input.shape[-1]).to(torch.float32) - squared = torch.sum(torch.pow(flattened, 2), dim=0).to(torch.float32) - - if not hasattr(module, "imatrix"): - module.imatrix = squared - module.imatrix_cnt = input.shape[0] - else: - module.imatrix += squared.to(module.imatrix.device) - module.imatrix_cnt += input.shape[0] - - hook_handles = [] - for name, module in model.named_modules(): - if type(module) in self.supported_types and check_to_quantized(module): - hook = module.register_forward_hook(get_imatrix_hook) - hook_handles.append(hook) - return hook_handles - - hooks = register_act_hook(model) - - try: - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(model) - model = model.to("cpu") - clear_memory(device_list=self.device_list) - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - try: - logger.error(cuda_error_msg) - # Final fallback: warn and use CPU-only quantization - logger.warning( - "Fallback to CPU. " - "Consider enabling `low_gpu_mem_usage` or using more GPUs via `--device 0,1,2,3`." - ) - model = model.to("cpu") - clear_memory(device_list=self.device_list) - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(model) - - orig_device = self.device - self.device = "cpu" - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - self.device = orig_device - except Exception as e: - raise - finally: - # Always remove hooks - for hook in hooks: - hook.remove() - - def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: - """Quantizes a layer using RTN (Round-To-Nearest) if available. - - This function attempts to quantize a layer by switching its data type to a - `rtn_*` version if supported, then wraps and unwraps the module to apply - quantization. If GPU memory is insufficient, it falls back to CPU. - - If packing is enabled (`immediate_packing`), the function will also export - the quantized layer to the appropriate backend format. - - Args: - name (str): Name of the layer to quantize. - - Raises: - RuntimeError: If quantization fails for reasons unrelated to memory. 
- """ - m = get_module(self.model, name) - if dtype is not None: - m = m.to(dtype) - - if is_fp8_linear(m): - m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) - set_module(self.model, name, m) - tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.device - # Step 1: Try quantization on GPU first, fall back to CPU if OOM - if self.immediate_packing and self.iters == 0 and self.formats[0].is_gguf() and not self.disable_opt_rtn: - m = m.to(tuning_device) - m.scale = None - m.zp = None - else: - try: - disable_opt_rtn = self.disable_opt_rtn - if ( - not disable_opt_rtn - and self.orig_disable_opt_rtn is None - and self.is_moe_model - and "expert" in m.tmp_name - and "shared_expert" not in m.tmp_name - and self.super_bits is None # GGUF still uses the optimized RTN for MoE layers - ): - disable_opt_rtn = True - logger.warning_once( - "MoE layer detected: optimized RTN is disabled for efficiency. " - "Use `--enable_opt_rtn` to force-enable it for MoE layers." - ) - m = m.to(tuning_device) - m = WrapperLinear( - m, - device=tuning_device, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_round_tuning=False, - enable_torch_compile=self.enable_torch_compile, - disable_opt_rtn=disable_opt_rtn, - ) - m = m.unwrapper({}) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - m = m.orig_layer if hasattr(m, "orig_layer") else m - try: - logger.error(cuda_error_msg) - logger.warning("falling back to CPU.") - m.to("cpu") - m = WrapperLinear( - m, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_round_tuning=False, - enable_torch_compile=self.enable_torch_compile, - ) - m = m.unwrapper({}) - except Exception as e: - raise - - # Step 2: Optional immediate packing/export - if self.immediate_packing: # For gguf, packing conducts on block level - self._immediate_pack(name) - if to_cpu: - m = m.to("cpu") - packed_m = get_module(self.model, name) - set_module(self.model, name, packed_m.to("cpu")) - else: - if to_cpu: - m = m.to("cpu") - set_module(self.model, name, m) - if self.immediate_saving: - if hasattr(self, "all_to_quantized_module_names"): - all_to_quantized_module_names = self.all_to_quantized_module_names - else: - all_to_quantized_module_names = [n for n, m in self.model.named_modules() if check_to_quantized(m)] - last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) - m = get_module(self.model, name) - immediate_saving(self, m, name, last_module) - def _immediate_pack(self, name: str): if not self.immediate_packing: return @@ -1192,230 +916,6 @@ def _immediate_pack(self, name: str): image_processor=self.image_processor if hasattr(self, "image_processor") else None, ) - @torch.inference_mode() - def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: - """Quantize all modules in the model using RTN (Round-To-Nearest) strategy. - - If the target format includes GGUF with `k`, and optimized RTN is enabled, - blockwise quantization with input caching and imatrix is used. - - Returns: - tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. 
- """ - if self.amp and self.model.dtype != self.amp_dtype: - self.model.to(self.amp_dtype) - - all_to_quantized_module_names: list[str] = [n for n, m in self.model.named_modules() if check_to_quantized(m)] - self.all_to_quantized_module_names = all_to_quantized_module_names - if is_nv_fp(self.data_type): - from auto_round.data_type.nvfp import calculate_gparam - from auto_round.data_type.utils import update_fused_layer_global_scales - - pbar = tqdm(all_to_quantized_module_names) - for name in pbar: - pbar.set_description(f"Calculate weight global scale: {name}") - m = get_module(self.model, name) - if is_fp8_linear(m): - m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) - set_module(self.model, name, m) - weight_global_scale = calculate_gparam(m.weight, self.group_size) - setattr(m, "weight_global_scale", weight_global_scale) - - logger.info("Start to update fused layer global scales, it may take some time.") - for name, module in self.model.named_modules(): - update_fused_layer_global_scales(module) - logger.info("Finished updating fused layer global scales.") - - if not (any(fmt.is_gguf() for fmt in getattr(self, "formats", [])) or self.super_bits is not None): - self._quantize_embedding_layer() # leave to gguf itself to handle - - self.model.to("cpu") - # Release memory - clear_memory(device_list=self.device_list) - - enable_imatrix = False - if not self.disable_opt_rtn: - has_gguf_k = ( - any(fmt.is_gguf() and "k" in fmt.output_format for fmt in getattr(self, "formats", [])) - or self.super_bits is not None - ) - if has_gguf_k: - enable_imatrix = True - elif self.data_type == "int" and self.sym: - enable_imatrix = True - if enable_imatrix: - self._quant_rtn_with_imatrix(all_to_quantized_module_names) - elif self.act_bits <= 8 and check_need_act_calibration( - self.act_dynamic, - self.act_data_type, - self.act_bits, - self.static_kv_dtype, - self.static_attention_dtype, - ): # TODO, mixed datatype has bug - hook_handles = self._register_act_max_hook(self.model) - try: - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - except torch.OutOfMemoryError: - logger.warning("Fallback to CPU. Consider using more GPUs via `--device 0,1,2,3`.") - self.model = self.model.to("cpu") - clear_memory(device_list=self.device_list) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(self.model) - orig_device = self.device - self.device = "cpu" - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - self.device = orig_device - for handle in hook_handles: - handle.remove() - else: - block_names_cnt = len(flatten_list(get_block_names(self.model, True))) - clear_mem_freq = len(all_to_quantized_module_names) // block_names_cnt - if clear_mem_freq == 0: - clear_mem_freq = 1 - pbar = tqdm(all_to_quantized_module_names) - cnt = 1 - for name in pbar: - pbar.set_description(f"Quantizing {name}") - self._quantize_layer_via_rtn(name) - if cnt % clear_mem_freq == 0: - clear_memory(device_list=self.device_list) - memory_monitor.log_summary() - cnt = 1 - cnt += 1 - # Convert remaining fp8 - if is_fp8_model(self.model): - convert_fp8_model_to_16b_model(self.model, self.amp_dtype, self.device) - self.quantized = True - return self.model, self.layer_config - - def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) -> None: - """Quantize model layers block by block using cached inputs and imatrix. 
- - Args: - all_to_quantized_module_names (list[str]): Names of layers to be quantized. - """ - all_to_quantized_module_names = list(set(all_to_quantized_module_names)) - - all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) - if not all_blocks: - raise ValueError("Could not find any blocks. Check the model or quant_block_list.") - - all_first_block_names = [block[0] for block in all_blocks] - layer_names = self._get_quantized_layer_names_outside_blocks() - if self.act_bits < 16 and (not self.act_dynamic or len(layer_names) > 0): - if len(layer_names) > 0: - logger.warning( - "quantize layers outside blocks for static activation quantizaiton" - " will significantly increase calibration time" - ) - all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names) - else: - all_inputs = self.cache_inter_data(all_first_block_names, self.nsamples) - - # Clear hooks for multi-GPU setups - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules(self.model) - - pbar = tqdm(range(sum(len(block) for block in all_blocks))) - - for block_names in all_blocks: - first_block = block_names[0] - inputs = all_inputs.pop(first_block) - input_keys = [k for k in inputs if k.startswith("hidden_state")] - if len(input_keys) != 1: - raise RuntimeError( - "hidden_states arg mismatch. Please file an issue at https://github.com/intel/auto-round/issues" - ) - inputs["input_ids"] = inputs.pop(input_keys[0]) - - clear_memory(self.inputs, device_list=self.device_list) - - total_samples = len(inputs["input_ids"]) - if total_samples < self.batch_size: - self.batch_size = total_samples - logger.warning(f"Forcing batch size to {total_samples}") - - input_ids = to_device(inputs.pop("input_ids"), self.cache_device) - input_others = to_device(inputs, self.cache_device) - - tmp_dtype = self.amp_dtype if self.amp else torch.float32 - input_ids = [id_.to(tmp_dtype) for id_ in input_ids] - - for key, val in input_others.items(): - if isinstance(val, torch.Tensor) and val.dtype in (torch.float16, torch.bfloat16): - input_others[key] = val.to(tmp_dtype) - elif isinstance(val, list): - input_others[key] = [to_dtype(v, tmp_dtype) for v in val] - - for block_name in block_names: - pbar.set_description(f"Quantizing {block_name}") - block = get_module(self.model, block_name) - if is_fp8_model(self.model): - convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype, device=self.device) - - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: - set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, self.device - ) - # Dispatch model if needed - if len(self.device_list) > 1: - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - for _, m in block.named_modules(): - if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): - continue - hook = AlignDevicesHook(m.tuning_device, io_same_device=True) - add_hook_to_module(m, hook, True) - else: - block = block.to(self.device) - input_ids = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - self.device, - self.cache_device, - ) - - if len(self.device_list) > 1: - accelerate.hooks.remove_hook_from_submodules(block) - - if is_nv_fp(self.act_data_type) or is_static_wfp8afp8(self): - # enable moe experts act_max automatic generation for Linear - set_amax_for_all_moe_layers(block, 
attr_name="act_max") - # Normalize imatrix and quantize layers - if self.low_gpu_mem_usage: - block.to("cpu") - clear_memory(device_list=self.device_list) - - for _, m in block.named_modules(): - # fix issue: Ling-flash-2.0-q2_k_s fail infer on cuda but well on cpu - # https://huggingface.co/Intel/Ling-flash-2.0-gguf-q2ks-mixed-AutoRound/discussions/1 - if hasattr(m, "imatrix"): - m.imatrix /= m.imatrix_cnt - if hasattr(m, "tmp_name") and m.tmp_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.tmp_name, to_cpu=self.low_gpu_mem_usage) - all_to_quantized_module_names.remove(m.tmp_name) - if not self.immediate_saving: - mv_module_from_gpu(block) - if block_name == block_names[-1]: - clear_memory(input_ids, device_list=self.device_list) - else: - clear_memory(device_list=self.device_list) - - memory_monitor.log_summary() - pbar.update(1) - pbar.close() - # Process remaining layers not in blocks - for name in all_to_quantized_module_names: - dtype = None - if self.super_group_size is not None: - dtype = torch.float32 - self._quantize_layer_via_rtn(name, dtype=dtype) - # clear_memory(device_list=self.device_list) - def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: keys = inputs.keys() input_id_str = [key for key in keys if key.startswith("hidden_state")] @@ -1484,212 +984,11 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if self.immediate_saving and "int" not in self.data_type: logger.warning("immediate_saving is only supported for int quantization, set to False") self.immediate_saving = False - if self.iters == 0: - return self._quantize_rtn() - - if bool(self.quant_block_list): - all_blocks = self.quant_block_list - else: - all_blocks = get_block_names(self.model) - if len(all_blocks) == 0: - logger.warning("could not find blocks, exit with original model") - return self.model, self.layer_config + from auto_round.quantizers import create_quantizers - if self.amp and self.model.dtype != self.amp_dtype: - self.model = self.model.to(self.amp_dtype) - - layer_names = self._get_quantized_layer_names_outside_blocks() - self.start_time = time.time() - all_first_block_names = [block[0] for block in all_blocks] - if len(layer_names) > 0: - logger.info( - "Starting to cache block inputs. 
This may be slow due to external block layers: %s", layer_names - ) - else: - logger.info("start to cache block inputs") - all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) - is_quantized_embedding = self._quantize_embedding_layer() - clear_memory(device_list=self.device_list) - all_q_inputs = None - if is_quantized_embedding: - all_inputs = copy.deepcopy(self.inputs) - clear_memory(self.inputs, device_list=self.device_list) - all_q_inputs = self.try_cache_inter_data_gpucpu( - all_first_block_names, self.nsamples, layer_names=layer_names - ) - self.model = mv_module_from_gpu(self.model) - clear_memory(device_list=self.device_list) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules(self.model) # self.model.hf_device_map has not been changed - logger.info("caching done") - if len(all_blocks) > 1: - pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks)) - else: - pbar = tqdm(range(0, len(all_blocks[0]), self.nblocks)) # move the alg warning outside pbar - - for block_names in all_blocks: - inputs = all_inputs[block_names[0]] - all_inputs.pop(block_names[0]) - q_inputs = None - if all_q_inputs is not None: - q_inputs = all_q_inputs[block_names[0]] - all_q_inputs.pop(block_names[0]) - - inputs, q_inputs = self._update_inputs(inputs, q_inputs) - - clear_memory(self.inputs, device_list=self.device_list) - - if "input_ids" in inputs.keys(): - total_samples = len(inputs["input_ids"]) - if total_samples < self.batch_size: - self.batch_size = total_samples - logger.warning(f"force the train batch size to {total_samples}") - - self._quantize_blocks( - self.model, - inputs, - block_names, - q_input=q_inputs if q_inputs is not None else None, - nblocks=self.nblocks, - device=self.device, - pbar=pbar, - ) - if self.immediate_packing and len(self.formats) != 1: - raise ValueError( - f"Expected exactly one packing format when 'immediate_packing' is True, " - f"but got {len(self.formats)} formats." - ) - pbar.set_description("Quantizing done") - pbar.close() - self._quantize_layers(layer_names, all_inputs) - - if is_fp8_model(self.model): - for n, m in self.model.named_modules(): - if is_fp8_linear(m): - new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device).to("cpu") - set_module(self.model, n, new_layer) - - end_time = time.time() - cost_time = end_time - self.start_time - logger.info(f"quantization tuning time {cost_time}") - - # Dump a summary - quantized_layers = [] - unquantized_layers = [] - for n, m in self.model.named_modules(): - if isinstance(m, tuple(self.supported_types)): - if check_to_quantized(m): - quantized_layers.append(n) - else: - unquantized_layers.append(n) - elif hasattr(m, "scales") or hasattr(m, "scale"): ##packing_immediately - quantized_layers.append(n) - summary_info = ( - f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} in the model" - ) - if len(unquantized_layers) > 0: - summary_info += f", {unquantized_layers} have not been quantized" - logger.info(summary_info) - - self.quantized = True - return self.model, self.layer_config - - def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: - """Quantizes specified layers based on inputs and configuration. - - Args: - layer_names (list): list of layer names to quantize. - layer_inputs (dict): Dictionary mapping layer names to input data. 
- - Returns: - None - """ - # TODO currently we take all the layers outside blocks as post block layers which is not optimal - # if there is no input for layer, we use rtn - - for layer_name in copy.deepcopy(layer_names): - if layer_name not in layer_inputs: - if self.act_bits < 16 and not self.act_dynamic: - # Activation quantization requires collected inputs - msg_prefix = ( - f"Activation max hook for layer '{layer_name}' is unavailable due to " - f"insufficient collected inputs. " - ) - if "fp8_e5m2" in self.act_data_type: - logger.warning(msg_prefix + "Please notes that unit scale is used for this layer.") - else: - logger.warning( - msg_prefix + "Static activation quantization is not supported or ineffective, " - "Skipping quantization for this layer." - ) - layer_names.remove(layer_name) - continue - logger.info(f"using rtn to quantize {layer_name}") - from auto_round.data_type import QUANT_FUNC_WITH_DTYPE - - layer = get_module(self.model, layer_name) - layer = layer.to(self.device) - if is_fp8_linear(layer): - new_layer = convert_fp8_layer_to_linear(layer, self.amp_dtype, self.device).to(self.device) - set_module(self.model, layer_name, new_layer) - layer = new_layer - - wrapper_layer = WrapperLinear( - layer, - enable_round_tuning=False, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_torch_compile=self.enable_torch_compile, - device=self.device, - disable_opt_rtn=self.disable_opt_rtn, - ) - new_layer = wrapper_layer.unwrapper({}) - set_module(self.model, layer_name, new_layer) - layer.cpu() - layer_names.remove(layer_name) - if len(layer_names) == 0: - memory_monitor.update() - memory_monitor.log_summary() - return - q_layer_inputs = None - enable_quanted_input = self.enable_quanted_input - has_gguf = False - - if hasattr(self, "formats"): - has_gguf = any(format_.is_gguf() for format_ in self.formats) - if has_gguf and self.immediate_packing: - enable_quanted_input = False - - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1 and enable_quanted_input: - dispatch_model(self.model, self.model.hf_device_map) - - if enable_quanted_input: - logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) - q_layer_inputs = self.try_cache_inter_data_gpucpu([], self.nsamples, layer_names=layer_names) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules( - self.model - ) # self.model.hf_device_map has not been changed - if not self.immediate_saving: - self.model = mv_module_from_gpu(self.model) - clear_memory(device_list=self.device_list) - quant_layer = self._quantize_layer - for layer_name in layer_names: - layer_input = layer_inputs[layer_name] - layer_input = to_device(layer_input, self.cache_device) - q_layer_input = q_layer_inputs.get(layer_name, None) if q_layer_inputs is not None else None - q_layer_input = to_device(q_layer_input, self.cache_device) - quant_layer(layer_name, layer_input, q_layer_input, device=self.device) - if self.immediate_packing: - self._immediate_pack(layer_name) - - if self.immediate_saving: - m = get_module(self.model, layer_name) - immediate_saving(self, m, name=layer_name, last_group=True) - del layer_input - clear_memory(q_layer_input, device_list=self.device_list) - memory_monitor.log_summary() + quantizers = create_quantizers(self) + return quantizers.quantize() @torch.no_grad() def _get_block_outputs( @@ -2241,244 +1540,6 @@ def _replace_forward(self): hook_handle = 
m.register_forward_hook(hook_func) self.hook_handles.append(hook_handle) - def _register_act_max_hook(self, model): - def get_act_max_hook(module, input, output): - if isinstance(input, (tuple, list)): - input = input[0] - if input.numel() == 0: - return # as no needs for act_max update - input, _, _ = reshape_pad_tensor_by_group_size(input, self.act_group_size) - act_max = torch.max(torch.abs(input), dim=-1).values - if not hasattr(module, "act_max") or module.act_max.numel() == 0: - module.act_max = act_max - else: - act_max = act_max.to(module.act_max.device) - if is_nv_fp(self.act_data_type): ## for nvfp per-tensor input_global_scale calculation usage - module.act_max = torch.max( - torch.tensor([act_max.max(), module.act_max.max()], device=act_max.device) - ) - else: - module.act_max = torch.max(act_max, module.act_max) - - hook_handles = [] - # for single layers out of blocks, like lm_head - if isinstance(model, SUPPORTED_LAYER_TYPES): - m = model - if ( - hasattr(m, "act_dynamic") - and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) - and check_to_quantized(m) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - return hook_handles - - for n, m in model.named_modules(): - if ( - hasattr(m, "act_dynamic") - and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) - and check_to_quantized(m) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - continue - - # for whole model, RTN - if n in self.layer_config: - config = self.layer_config[n] - act_dynamic = config.get("act_dynamic", True) - act_data_type = config.get("act_data_type", None) - act_bits = config.get("act_bits", 16) - if ( - config["bits"] <= 8 - and check_need_act_calibration(act_dynamic, act_data_type, act_bits) - and check_to_quantized(config) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - continue - return hook_handles - - def _quantize_layer( - self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu" - ): - """Quantize a specific layer of the model using the provided inputs. - - Args: - layer_name (str): The name of the layer to quantize. - inputs (torch.Tensor): Input data for quantization. - q_inputs (torch.Tensor, optional): Quantized input data. Defaults to None. - device (torch.device, optional): The device to use for quantization. Defaults to torch.device("cpu"). 
- - Returns: - None - """ - logger.info(f"quantizing layer {layer_name}") - layer = get_module(self.model, layer_name) - if hasattr(layer, "tuning_device"): - device = layer.tuning_device - - layer = layer.to(device) - for i in range(len(inputs)): - inputs[i] = inputs[i].to(layer.weight.dtype) - if q_inputs is not None: - q_inputs[i] = q_inputs[i].to(layer.weight.dtype) - - if self.act_bits <= 8 and check_need_act_calibration( - self.act_dynamic, - self.act_data_type, - self.act_bits, - self.static_kv_dtype, - self.static_attention_dtype, - ): - tmp_inputs = q_inputs if q_inputs is not None else inputs - hook_handles = self._register_act_max_hook(layer) - with torch.no_grad(): - for input in tmp_inputs: - layer(input) - for handle in hook_handles: - handle.remove() - - wrapper_linear = WrapperLinear( - layer, - enable_minmax_tuning=self.enable_minmax_tuning, - enable_torch_compile=self.enable_torch_compile, - device=device, - ).to(device) - round_params = [] - minmax_params = [] - for key in wrapper_linear.params.keys(): - if "min" in key or "max" in key: - minmax_params.append(wrapper_linear.params[key]) - else: - round_params.append(wrapper_linear.value) - if len(round_params) + len(minmax_params) <= 0: - dump_info = f"quantized {layer_name}" - logger.info(dump_info) - with torch.no_grad(): - unwrapper_layer(self.model, wrapper_linear, layer_name, {}) - mv_module_from_gpu(layer) - - lr = torch.tensor(self.lr) - minmax_lr = torch.tensor(self.minmax_lr) - if self.enable_minmax_tuning: - optimizer = self.optimizer( - [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 - ) - else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0) - - if self.lr_scheduler is None: - lr_schedule = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters - ) - else: - lr_schedule = copy.deepcopy(self.lr_scheduler) - nsamples = len(inputs) - last_best_iter = 0 - best_loss = torch.finfo(torch.float).max - scaler = self._get_scaler() # pylint: disable=assignment-from-none - init_loss = None - gradient_accumulate_steps = self.batch_size # Force to low gpu - - total_loss = 0 - num_elm = 1 - mse_reduction = "mean" - if gradient_accumulate_steps != 1: - mse_reduction = "sum" - mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) - batch_size = 1 # Force to low gpu - global_batch_size = self.batch_size * gradient_accumulate_steps - global_batch_size = min(nsamples, global_batch_size) - if gradient_accumulate_steps != 1 and not self.attention_mask: - whole_indices = torch.arange(global_batch_size) - if q_inputs is not None: - num_elm = self._get_current_num_elm(q_inputs, whole_indices) - else: - num_elm = self._get_current_num_elm(inputs, whole_indices) - - index_sampler = IndexSampler(nsamples, global_batch_size) - - for i in range(self.iters): - total_loss = 0 - global_indices = index_sampler.next_batch() - if self.attention_mask: - num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) - - for tmp_step in range(gradient_accumulate_steps): - indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - if q_inputs is not None: - current_input = [q_inputs[i] for i in indices] - current_input = torch.cat(current_input, dim=0).to(device) - org_input = [inputs[i] for i in indices] - org_input = torch.cat(org_input, dim=0).to(device) - else: - current_input = [inputs[i] for i in indices] - current_input = torch.cat(current_input, dim=0).to(device) - org_input = current_input 
- with torch.no_grad(): - current_output = layer(org_input) - autocast_ctx = ( - nullcontext() - if not self.amp - else autocast(device_type=str(device).split(":")[0], dtype=self.amp_dtype) - ) - if self.attention_mask: - tmp_attention_mask = [self.attention_mask[i] for i in indices] - tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) - tmp_attention_mask.unsqueeze_(-1) - - with autocast_ctx: - output_q = wrapper_linear(current_input) # pylint: disable=not-callable - loss = mse_loss( # pylint: disable=not-callable - (output_q * tmp_attention_mask).to(torch.float32), - (current_output * tmp_attention_mask).to(torch.float32), - ) - - else: - with autocast_ctx: - output_q = wrapper_linear(current_input) # pylint: disable=not-callable - loss = mse_loss( # pylint: disable=not-callable - output_q.to(torch.float32), - current_output.to(torch.float32), # mul 1.0 will copy the output - ) - - num_elm = 1 if num_elm <= 0 else num_elm - total_loss += loss.item() / num_elm - - self._scale_loss_and_backward(scaler, loss) - if i == 0: - init_loss = total_loss - - if total_loss < best_loss: - best_loss = total_loss - if not self.not_use_best_mse: - best_params = collect_best_params(wrapper_linear, self.cache_device) - last_best_iter = i - if self.not_use_best_mse and i == self.iters - 1: - best_params = collect_best_params(wrapper_linear, self.cache_device) - - if not self.not_use_best_mse: - if 0 < self.dynamic_max_gap <= i - last_best_iter: - break - self._step(scaler, optimizer, lr_schedule) - - last_loss = total_loss - best_iter = self.iters - if not self.not_use_best_mse: - last_loss = best_loss - best_iter = last_best_iter - with torch.no_grad(): - unwrapper_layer(self.model, wrapper_linear, layer_name, best_params) - mv_module_from_gpu(layer) - dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" - logger.info(dump_info) - - def _get_current_output(self, output: list[torch.Tensor], indices: list[int]) -> torch.Tensor: - current_output = [output[x] for x in indices] - current_output = torch.cat(current_output, dim=self.batch_dim) - return current_output - def _get_current_q_output( self, block: torch.nn.Module, @@ -2507,13 +1568,6 @@ def _get_current_num_elm( current_input_ids = [input_ids[i] for i in indices] return sum(id.numel() for id in current_input_ids) - def _get_non_zero_cnt(self, tensor: list[torch.Tensor], indices: list[int]) -> int: - current_tensors = [tensor[i] for i in indices] - non_zero_cnt = 0 - for t in current_tensors: - non_zero_cnt += torch.count_nonzero(t).item() - return non_zero_cnt - def quantize_block( self, block: torch.nn.Module, @@ -2536,8 +1590,19 @@ def quantize_block( self.normalize_decoding_layer_inputs_(inputs) block_inputs = self.inputs[self.quant_block_list[0][0]] decoding_layer_first_input_name = "hidden_states" - input_ids, input_others = self._preprocess_block_inputs(block_inputs, decoding_layer_first_input_name) - return self._quantize_block(block, input_ids, input_others, q_input, device, auto_offload) + from auto_round.quantizers.algs.auto_round import AutoRoundQuantizer + from auto_round.quantizers.utils import preprocess_block_inputs + + input_ids, input_others = preprocess_block_inputs( + block_inputs, + device_list=self.device_list, + first_input_name=decoding_layer_first_input_name, + amp=self.amp, + amp_dtype=self.amp_dtype, + cache_device=self.cache_device, + diffusion=self.diffusion, + ) + return AutoRoundQuantizer(self).quantize_block(block, input_ids, input_others, q_input, 
device, auto_offload) def _get_loss( self, @@ -2568,384 +1633,6 @@ def _get_loss( return loss - def _quantize_block( - self, - block: torch.nn.Module, - input_ids: Union[list[torch.Tensor], dict], - input_others: dict, - q_input: Union[torch.Tensor, dict, None] = None, - device: Union[str, torch.device] = "cpu", - auto_offload=True, - ): - """Quantize the weights of a given block of the model. - - Args: - block: The block of the model to be quantized. - input_ids: The input tensor containing tokenized input ids. - input_others: A dictionary containing additional input data. - q_input: The quantized input tensor. - device: The device for quantization. - - Returns: - Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) - """ - if is_fp8_model(self.model): - for n, m in block.named_modules(): - if is_fp8_linear(m): - new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device).to(device) - set_module(block, n, new_layer) - - if auto_offload: - # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights - # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: - card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device - ) - else: - block = block.to(device) - card_0_in_high_risk, loss_device = False, device - else: - card_0_in_high_risk, loss_device = False, device - - if len(self.device_list) > 1 and auto_offload: - for n, m in block.named_modules(): - if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): - continue - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - hook = AlignDevicesHook(m.tuning_device, io_same_device=True) - add_hook_to_module(m, hook, True) - - if q_input is None: - hook_handles = self._register_act_max_hook(block) - - output = self._get_block_outputs( - block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device - ) - - for handle in hook_handles: - handle.remove() - else: - output = self._get_block_outputs( - block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device - ) - hook_handles = self._register_act_max_hook(block) - if hook_handles: - self._get_block_outputs( - block, - q_input, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - self.cache_device, - save_output=False, - ) - - for handle in hook_handles: - handle.remove() - - if q_input is not None: - if input_ids is not q_input: - clear_memory(input_ids, device_list=self.device_list) - else: - clear_memory(device_list=self.device_list) - input_ids = q_input - - quantized_layer_names, unquantized_layer_names = self.wrapper_block( - block, - self.enable_minmax_tuning, - self.enable_norm_bias_tuning, - enable_torch_compile=self.enable_torch_compile, - device=device, - ) - if is_nv_fp(self.data_type): # enable qkv and moe structure global_scale fuse - from auto_round.data_type.utils import update_fused_layer_global_scales - - modules = block.modules() - for module in modules: - update_fused_layer_global_scales(module) - round_params = [] - minmax_params = [] - for n, m in block.named_modules(): - if hasattr(m, "orig_layer"): - for key in m.params.keys(): - if "min" in key or "max" in key: - minmax_params.append(m.params[key]) - else: - round_params.append(m.params[key]) - - lr = 
torch.tensor(self.lr) - minmax_lr = torch.tensor(self.minmax_lr) - is_adam = "adam" in self.__class__.__name__.lower() - - extra_kwargs = {} if is_adam else {"momentum": self.momentum} - - if self.enable_minmax_tuning: - params = [ - {"params": round_params}, - {"params": minmax_params, "lr": minmax_lr}, - ] - else: - params = round_params - - optimizer = self.optimizer( - params, - lr=lr, - weight_decay=0, - **extra_kwargs, - ) - - if len(round_params) + len(minmax_params) <= 0: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - f"layers in the block" - ) - logger.info(dump_info) - unwrapper_block(block, {}) - mv_module_from_gpu(block) - return output, output - - if self.lr_scheduler is None: - lr_schedule = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters - ) - else: - lr_schedule = copy.deepcopy(self.lr_scheduler) - - if isinstance(input_ids, dict): # input_ids of Flux is dict - nsamples = len(input_ids["hidden_states"]) - else: - nsamples = len(input_ids) - last_best_iter = 0 - best_loss = torch.finfo(torch.float).max - num_elm = 1 - mse_reduction = "mean" - if self.gradient_accumulate_steps != 1: - mse_reduction = "sum" - mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) - scaler = self._get_scaler() # pylint: disable=assignment-from-none - init_loss = None - best_params = {} - total_loss = 0 - global_batch_size = self.batch_size * self.gradient_accumulate_steps - global_batch_size = min(nsamples, global_batch_size) - # We assume the block input and output shape is same - if self.gradient_accumulate_steps != 1 and not self.attention_mask: - whole_indices = torch.arange(global_batch_size) - num_elm = self._get_current_num_elm(input_ids, whole_indices) - - index_sampler = IndexSampler(nsamples, global_batch_size) - batch_size = self.batch_size - for i in range(self.iters): - if self.enable_alg_ext and self.data_type.endswith("dq"): - for n, m in block.named_modules(): - m.cur_iter = i - total_loss = 0 - global_indices = index_sampler.next_batch() - if self.attention_mask: - num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) - - for tmp_step in range(self.gradient_accumulate_steps): - indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - current_output = self._get_current_output(output, indices) - current_output = to_device(current_output, loss_device) - output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device) - loss = self._get_loss(output_q, current_output, indices, mse_loss, device) - num_elm = 1 if num_elm <= 0 else num_elm - total_loss += loss.item() / num_elm - - if self.low_gpu_mem_usage and card_0_in_high_risk: - # clear memory to avoid OOM due to memory fragmentation - clear_memory_if_reached_threshold(threshold=0.5, device_list=self.device_list) - - self._scale_loss_and_backward(scaler, loss) - - if self.low_gpu_mem_usage and card_0_in_high_risk: - # clear memory to avoid OOM due to memory fragmentation - clear_memory_if_reached_threshold(threshold=0.8, device_list=self.device_list) - - if i == 0: - init_loss = total_loss - - if total_loss < best_loss: - best_loss = total_loss - if not self.not_use_best_mse: - best_params = collect_best_params(block, self.cache_device) - # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) - - last_best_iter = i - if self.not_use_best_mse and i == self.iters - 1: - best_params = 
collect_best_params(block, self.cache_device) - - if not self.not_use_best_mse: - if 0 < self.dynamic_max_gap <= i - last_best_iter: - break - self._step(scaler, optimizer, lr_schedule) - - last_loss = total_loss - best_iter = self.iters - if not self.not_use_best_mse: - last_loss = best_loss - best_iter = last_best_iter - if self.iters > 0: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" - ) - else: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - "layers in the block" - ) - - if self.low_gpu_mem_usage: - clear_memory(device_list=self.device_list) # clear cached memory during training - if len(unquantized_layer_names) != 0: - logger.info(f"{unquantized_layer_names} have not been quantized") - with torch.no_grad(): - unwrapper_block(block, best_params) - - if is_nv_fp(self.act_data_type): - # enable moe experts act_max automatic generation for WrapperWALayer - set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") - - if self.enable_quanted_input: - q_outputs = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - cache_device=self.cache_device, - ) - - if len(self.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - - clear_memory(input_ids, device_list=self.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return q_outputs, output - else: - if len(self.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - clear_memory(input_ids, device_list=self.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return None, output - - def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[torch.Tensor, dict]: - input_ids = inputs[first_input_name] - inputs.pop(first_input_name, None) - input_others = inputs - return input_ids, input_others - - def _preprocess_block_inputs(self, inputs, first_input_name="input_ids"): - input_ids, input_others = self._split_inputs(inputs, first_input_name) - clear_memory(device_list=self.device_list) - input_ids = to_device(input_ids, self.cache_device) - input_others = to_device(input_others, self.cache_device) - # As in calibration phase, we may use bf16 for calibration due to low_gpu_memory usage - - tmp_dtype = self.amp_dtype if self.amp else torch.float32 - input_ids = to_dtype(input_ids, tmp_dtype) - - for key in input_others.keys(): - if isinstance(input_others[key], torch.Tensor) and ( - input_others[key].dtype == torch.float16 or input_others[key].dtype == torch.bfloat16 - ): - input_others[key] = input_others[key].to(tmp_dtype) - elif isinstance(input_others[key], list): - for i in range(len(input_others[key])): - to_dtype(input_others[key][i], tmp_dtype) - return input_ids, input_others - - def _quantize_blocks( - self, - model: torch.nn.Module, - inputs: dict, - block_names: list, - q_input: torch.Tensor = None, - nblocks: int = 1, - device: str = "cpu", - pbar: tqdm = None, - ): - """Quantize and dequantize the weights of the specified blocks in the model. - - Args: - model: The PyTorch model to be quantized. 
- inputs: The input data for quantization. - block_names: The names of the blocks to be quantized and dequantized. - nblocks: The number of blocks to quantize and dequantize. - device: The device for quantization and dequantization. - - Returns: - None - """ - clear_memory(device_list=self.device_list) - for n, m in model.named_parameters(): - m.requires_grad_(False) - - input_ids, input_others = self._preprocess_block_inputs(inputs) - - if pbar is None: - pbar = tqdm(range(0, len(block_names), nblocks)) - - for i in range(0, len(block_names), nblocks): - if i != 0: - pbar.update(1) - if nblocks == 1: - n = block_names[i] - pbar.set_description(f"Quantizing {n}") - m = get_module(model, n) - else: - names = block_names[i : min(i + nblocks, len(block_names))] - pbar.set_description(f"Quantizing [{i + 1}-{min(i + nblocks, len(block_names))}]/{len(block_names)}") - modules = [get_module(model, n) for n in names] - m = WrapperMultiblock(modules) - - m.config = model.config if hasattr(model, "config") else None - q_input, input_ids = self._quantize_block( - m, - input_ids, - input_others, - q_input=q_input, - device=device, - ) - if hasattr(model, "config"): - del m.config - if self.immediate_packing: - for _, tmp_m in m.named_modules(): - if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): - continue - self._immediate_pack(tmp_m.tmp_name) - - if self.immediate_saving: - last_group = (i + nblocks) >= len(block_names) - immediate_saving(self, m, last_group=last_group) - if pbar is not None: - pbar.update(1) - - if not self.immediate_saving: - self.model = mv_module_from_gpu(self.model) - for n, m in self.model.named_modules(): - if hasattr(m, "name"): - delattr(m, "name") - - del q_input - del input_ids - del input_others - del inputs - - clear_memory(device_list=self.device_list) - def save_quantized( self, output_dir: str = None, @@ -3008,30 +1695,6 @@ def save_quantized( else: return compressed_model - def _get_quantized_layer_names_outside_blocks(self) -> list: - """Gets the names of quantized layers outside blocks in the model. - - Returns: - list: List of layer names outside blocks. 
- """ - if self.layer_config is None or len(self.layer_config) == 0: - return [] - - layer_names = [] - all_layers_in_block = get_layer_names_in_block(self.model, self.supported_types, self.quant_block_list) - - for key in self.layer_config.keys(): - if key in all_layers_in_block: - continue - layer = get_module(self.model, key) - if layer is None: - logger.error(f"could not find layer {key} in the model, exit...") - exit(-1) - if type(layer) in self.supported_types and check_to_quantized(self.layer_config[key]): - layer_names.append(key) - - return layer_names - def _set_amp_dtype(self) -> None: """Sets the automatic mixed precision (AMP) data type for the model based on the device and configuration.""" self.amp_dtype = torch.bfloat16 diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 50e807320..0bfbf038c 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -172,18 +172,6 @@ def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str} return inputs, q_inputs - def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[dict, dict]: - input_id_str = [key for key in inputs.keys() if "hidden_state" in key] - input_ids = {k: inputs.pop(k, None) for k in input_id_str} - input_others = inputs - return input_ids, input_others - - def _get_current_output(self, output: dict, indices: list[int]) -> torch.Tensor: - assert "hidden_states" in output - current_output = [output["hidden_states"][x] for x in indices] - current_output = torch.cat(current_output, dim=self.batch_dim) - return current_output - def _get_current_q_output( self, block: torch.nn.Module, diff --git a/auto_round/quantizers/__init__.py b/auto_round/quantizers/__init__.py new file mode 100644 index 000000000..87ac77b62 --- /dev/null +++ b/auto_round/quantizers/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_round.quantizers.entrypoint import create_quantizers diff --git a/auto_round/quantizers/algs/auto_round.py b/auto_round/quantizers/algs/auto_round.py new file mode 100644 index 000000000..da02d0397 --- /dev/null +++ b/auto_round/quantizers/algs/auto_round.py @@ -0,0 +1,888 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import time +import traceback +from contextlib import nullcontext +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import accelerate +import torch +from accelerate.big_modeling import dispatch_model, infer_auto_device_map +from torch import autocast +from tqdm import tqdm + +from auto_round.compressors.utils import ( + IndexSampler, + check_need_act_calibration, + collect_best_params, + immediate_saving, + is_nv_fp, +) +from auto_round.logger import logger +from auto_round.quantizers.algs.base import AlgsBaseQuantizer +from auto_round.quantizers.utils import ( + get_non_zero_cnt, + get_quantized_layer_names_outside_blocks, + preprocess_block_inputs, + quantize_embedding_layer, + register_act_max_hook, +) +from auto_round.utils import ( + check_to_quantized, + clear_memory, + convert_fp8_layer_to_linear, + get_block_names, + get_module, + is_auto_device_mapping, + is_fp8_linear, + is_fp8_model, + memory_monitor, + mv_module_from_gpu, + set_amax_for_all_moe_layers, + set_module, + to_device, +) +from auto_round.utils.device import ( + clear_memory_if_reached_threshold, + set_auto_device_map_for_block_with_tuning, +) +from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class ARQuantizer(AlgsBaseQuantizer): + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + + def pre_quantize(self, *args, **kwargs): + return super().pre_quantize(*args, **kwargs) + + def quantize(self, *args, **kwargs): + if bool(self.compressor.quant_block_list): + all_blocks = self.compressor.quant_block_list + else: + all_blocks = get_block_names(self.compressor.model) + + if len(all_blocks) == 0: + logger.warning("could not find blocks, exit with original model") + return self.compressor.model, self.compressor.layer_config + + if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: + self.compressor.model = self.compressor.model.to(self.compressor.amp_dtype) + + layer_names = get_quantized_layer_names_outside_blocks( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + supported_types=self.compressor.supported_types, + quant_block_list=self.compressor.quant_block_list, + ) + start_time = time.time() + all_first_block_names = [block[0] for block in all_blocks] + if len(layer_names) > 0: + logger.info( + "Starting to cache block inputs. 
This may be slow due to external block layers: %s", layer_names + ) + else: + logger.info("start to cache block inputs") + + # TODO: refactor this + all_inputs = self.compressor.try_cache_inter_data_gpucpu( + all_first_block_names, self.compressor.nsamples, layer_names=layer_names + ) + + is_quantized_embedding = quantize_embedding_layer( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + scale_dtype=self.compressor.data_type, + disable_opt_rtn=self.compressor.disable_opt_rtn, + device=self.compressor.device, + device_list=self.compressor.device_list, + ) + clear_memory(device_list=self.compressor.device_list) + all_q_inputs = None + if is_quantized_embedding: + all_inputs = copy.deepcopy(self.compressor.inputs) + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + # TODO: refactor this + all_q_inputs = self.compressor.try_cache_inter_data_gpucpu( + all_first_block_names, self.compressor.nsamples, layer_names=layer_names + ) + self.compressor.model = mv_module_from_gpu(self.compressor.model) + clear_memory(device_list=self.compressor.device_list) + + if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules( + self.compressor.model + ) # self.compressor.model.hf_device_map has not been changed + logger.info("caching done") + if len(all_blocks) > 1: + pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.compressor.nblocks)) + else: + pbar = tqdm(range(0, len(all_blocks[0]), self.compressor.nblocks)) # move the alg warning outside pbar + + for block_names in all_blocks: + inputs = all_inputs[block_names[0]] + all_inputs.pop(block_names[0]) + q_inputs = None + if all_q_inputs is not None: + q_inputs = all_q_inputs[block_names[0]] + all_q_inputs.pop(block_names[0]) + + # TODO: refactor this + inputs, q_inputs = self.compressor._update_inputs(inputs, q_inputs) + + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + + if "input_ids" in inputs.keys(): + total_samples = len(inputs["input_ids"]) + if total_samples < self.compressor.batch_size: + self.compressor.batch_size = total_samples + logger.warning(f"force the train batch size to {total_samples}") + + self._quantize_blocks( + self.compressor.model, + inputs, + block_names, + q_input=q_inputs if q_inputs is not None else None, + nblocks=self.compressor.nblocks, + device=self.compressor.device, + pbar=pbar, + ) + if self.compressor.immediate_packing and len(self.compressor.formats) != 1: + raise ValueError( + f"Expected exactly one packing format when 'immediate_packing' is True, " + f"but got {len(self.compressor.formats)} formats." 
+ ) + pbar.set_description("Quantizing done") + pbar.close() + self._quantize_layers(layer_names, all_inputs) + + if is_fp8_model(self.compressor.model): + for n, m in self.compressor.model.named_modules(): + if is_fp8_linear(m): + new_layer = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device).to( + "cpu" + ) + set_module(self.compressor.model, n, new_layer) + + end_time = time.time() + cost_time = end_time - start_time + logger.info(f"quantization tuning time {cost_time}") + + # Dump a summary + quantized_layers = [] + unquantized_layers = [] + for n, m in self.compressor.model.named_modules(): + if isinstance(m, tuple(self.compressor.supported_types)): + if check_to_quantized(m): + quantized_layers.append(n) + else: + unquantized_layers.append(n) + elif hasattr(m, "scales") or hasattr(m, "scale"): ##packing_immediately + quantized_layers.append(n) + summary_info = ( + f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} in the model" + ) + if len(unquantized_layers) > 0: + summary_info += f", {unquantized_layers} have not been quantized" + logger.info(summary_info) + + self.compressor.quantized = True + return self.compressor.model, self.compressor.layer_config + + def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: + """Quantizes specified layers based on inputs and configuration. + + Args: + layer_names (list): list of layer names to quantize. + layer_inputs (dict): Dictionary mapping layer names to input data. + + Returns: + None + """ + # TODO currently we take all the layers outside blocks as post block layers which is not optimal + # if there is no input for layer, we use rtn + + for layer_name in copy.deepcopy(layer_names): + if layer_name not in layer_inputs: + if self.compressor.act_bits < 16 and not self.compressor.act_dynamic: + # Activation quantization requires collected inputs + msg_prefix = ( + f"Activation max hook for layer '{layer_name}' is unavailable due to " + f"insufficient collected inputs. " + ) + if "fp8_e5m2" in self.compressor.act_data_type: + logger.warning(msg_prefix + "Please notes that unit scale is used for this layer.") + else: + logger.warning( + msg_prefix + "Static activation quantization is not supported or ineffective, " + "Skipping quantization for this layer." 
+ ) + layer_names.remove(layer_name) + continue + logger.info(f"using rtn to quantize {layer_name}") + from auto_round.data_type import QUANT_FUNC_WITH_DTYPE + + layer = get_module(self.compressor.model, layer_name) + layer = layer.to(self.compressor.device) + if is_fp8_linear(layer): + new_layer = convert_fp8_layer_to_linear( + layer, self.compressor.amp_dtype, self.compressor.device + ).to(self.compressor.device) + set_module(self.compressor.model, layer_name, new_layer) + layer = new_layer + + wrapper_layer = WrapperLinear( + layer, + enable_round_tuning=False, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + device=self.compressor.device, + disable_opt_rtn=self.compressor.disable_opt_rtn, + ) + new_layer = wrapper_layer.unwrapper({}) + set_module(self.compressor.model, layer_name, new_layer) + layer.cpu() + layer_names.remove(layer_name) + if len(layer_names) == 0: + memory_monitor.update() + memory_monitor.log_summary() + return + q_layer_inputs = None + enable_quanted_input = self.compressor.enable_quanted_input + has_gguf = False + + if hasattr(self.compressor, "formats"): + has_gguf = any(format_.is_gguf() for format_ in self.compressor.formats) + if has_gguf and self.compressor.immediate_packing: + enable_quanted_input = False + + if ( + hasattr(self.compressor.model, "hf_device_map") + and len(self.compressor.model.hf_device_map) > 1 + and enable_quanted_input + ): + dispatch_model(self.compressor.model, self.compressor.model.hf_device_map) + + if enable_quanted_input: + logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) + # TODO: refactor this + q_layer_inputs = self.compressor.try_cache_inter_data_gpucpu( + [], self.compressor.nsamples, layer_names=layer_names + ) + if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules( + self.compressor.model + ) # self.compressor.model.hf_device_map has not been changed + if not self.compressor.immediate_saving: + self.compressor.model = mv_module_from_gpu(self.compressor.model) + clear_memory(device_list=self.compressor.device_list) + for layer_name in layer_names: + layer_input = layer_inputs[layer_name] + layer_input = to_device(layer_input, self.compressor.cache_device) + q_layer_input = q_layer_inputs.get(layer_name, None) if q_layer_inputs is not None else None + q_layer_input = to_device(q_layer_input, self.compressor.cache_device) + self.quantize_layer(layer_name, layer_input, q_layer_input, device=self.compressor.device) + if self.compressor.immediate_packing: + self.compressor._immediate_pack(layer_name) + + if self.compressor.immediate_saving: + m = get_module(self.compressor.model, layer_name) + immediate_saving(self.compressor, m, name=layer_name, last_group=True) + del layer_input + clear_memory(q_layer_input, device_list=self.compressor.device_list) + memory_monitor.log_summary() + + def _quantize_blocks( + self, + model: torch.nn.Module, + inputs: dict, + block_names: list, + q_input: torch.Tensor = None, + nblocks: int = 1, + device: str = "cpu", + pbar: tqdm = None, + ): + """Quantize and dequantize the weights of the specified blocks in the model. + + Args: + model: The PyTorch model to be quantized. + inputs: The input data for quantization. + block_names: The names of the blocks to be quantized and dequantized. + nblocks: The number of blocks to quantize and dequantize. 
+ device: The device for quantization and dequantization. + + Returns: + None + """ + clear_memory(device_list=self.compressor.device_list) + for n, m in model.named_parameters(): + m.requires_grad_(False) + + input_ids, input_others = preprocess_block_inputs( + inputs, + device_list=self.compressor.device_list, + first_input_name="input_ids", + amp=self.compressor.amp, + amp_dtype=self.compressor.amp_dtype, + cache_device=self.compressor.cache_device, + diffusion=self.compressor.diffusion, + ) + + if pbar is None: + pbar = tqdm(range(0, len(block_names), nblocks)) + + for i in range(0, len(block_names), nblocks): + if i != 0: + pbar.update(1) + if nblocks == 1: + n = block_names[i] + pbar.set_description(f"Quantizing {n}") + m = get_module(model, n) + else: + names = block_names[i : min(i + nblocks, len(block_names))] + pbar.set_description(f"Quantizing [{i + 1}-{min(i + nblocks, len(block_names))}]/{len(block_names)}") + modules = [get_module(model, n) for n in names] + m = WrapperMultiblock(modules) + + m.config = model.config if hasattr(model, "config") else None + q_input, input_ids = self.quantize_block( + m, + input_ids, + input_others, + q_input=q_input, + device=device, + ) + if hasattr(model, "config"): + del m.config + if self.compressor.immediate_packing: + for _, tmp_m in m.named_modules(): + if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): + continue + self.compressor._immediate_pack(tmp_m.tmp_name) + + if self.compressor.immediate_saving: + last_group = (i + nblocks) >= len(block_names) + immediate_saving(self.compressor, m, last_group=last_group) + if pbar is not None: + pbar.update(1) + + if not self.compressor.immediate_saving: + self.compressor.model = mv_module_from_gpu(self.compressor.model) + for n, m in self.compressor.model.named_modules(): + if hasattr(m, "name"): + delattr(m, "name") + + del q_input + del input_ids + del input_others + del inputs + + clear_memory(device_list=self.compressor.device_list) + + def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu"): + """Quantize a specific layer of the model using the provided inputs. + + Args: + layer_name (str): The name of the layer to quantize. + inputs (torch.Tensor): Input data for quantization. + q_inputs (torch.Tensor, optional): Quantized input data. Defaults to None. + device (torch.device, optional): The device to use for quantization. Defaults to torch.device("cpu"). 
+ + Returns: + None + """ + logger.info(f"quantizing layer {layer_name}") + layer = get_module(self.compressor.model, layer_name) + if hasattr(layer, "tuning_device"): + device = layer.tuning_device + + layer = layer.to(device) + for i in range(len(inputs)): + inputs[i] = inputs[i].to(layer.weight.dtype) + if q_inputs is not None: + q_inputs[i] = q_inputs[i].to(layer.weight.dtype) + + if self.compressor.act_bits <= 8 and check_need_act_calibration( + self.compressor.act_dynamic, + self.compressor.act_data_type, + self.compressor.act_bits, + self.compressor.static_kv_dtype, + self.compressor.static_attention_dtype, + ): + tmp_inputs = q_inputs if q_inputs is not None else inputs + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + with torch.no_grad(): + for input in tmp_inputs: + layer(input) + for handle in hook_handles: + handle.remove() + + wrapper_linear = WrapperLinear( + layer, + enable_minmax_tuning=self.compressor.enable_minmax_tuning, + enable_torch_compile=self.compressor.enable_torch_compile, + device=device, + ).to(device) + round_params = [] + minmax_params = [] + for key in wrapper_linear.params.keys(): + if "min" in key or "max" in key: + minmax_params.append(wrapper_linear.params[key]) + else: + round_params.append(wrapper_linear.value) + if len(round_params) + len(minmax_params) <= 0: + dump_info = f"quantized {layer_name}" + logger.info(dump_info) + with torch.no_grad(): + unwrapper_layer(self.compressor.model, wrapper_linear, layer_name, {}) + mv_module_from_gpu(layer) + + lr = torch.tensor(self.compressor.lr) + minmax_lr = torch.tensor(self.compressor.minmax_lr) + if self.compressor.enable_minmax_tuning: + optimizer = self.optimizer( + [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 + ) + else: + optimizer = self.optimizer(round_params, lr=lr, weight_decay=0) + + if self.compressor.lr_scheduler is None: + lr_schedule = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.compressor.iters + ) + else: + lr_schedule = copy.deepcopy(self.compressor.lr_scheduler) + nsamples = len(inputs) + last_best_iter = 0 + best_loss = torch.finfo(torch.float).max + scaler = self.compressor._get_scaler() # pylint: disable=assignment-from-none + init_loss = None + gradient_accumulate_steps = self.compressor.batch_size # Force to low gpu + total_loss = 0 + num_elm = 1 + mse_reduction = "mean" + if gradient_accumulate_steps != 1: + mse_reduction = "sum" + mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) + batch_size = 1 # Force to low gpu + global_batch_size = self.compressor.batch_size * gradient_accumulate_steps + global_batch_size = min(nsamples, global_batch_size) + if gradient_accumulate_steps != 1 and not self.compressor.attention_mask: + whole_indices = torch.arange(global_batch_size) + if q_inputs is not None: + # Todo: refactor this + num_elm = self.compressor._get_current_num_elm(q_inputs, whole_indices) + else: + num_elm = self.compressor._get_current_num_elm(inputs, whole_indices) + + index_sampler = IndexSampler(nsamples, global_batch_size) + + for i in range(self.compressor.iters): + total_loss = 0 + global_indices = index_sampler.next_batch() + if self.compressor.attention_mask: + num_elm = get_non_zero_cnt(self.compressor.attention_mask, global_indices) + + for tmp_step in range(gradient_accumulate_steps): + 
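# Editor's note: illustrative sketch, not part of the patch. The accumulation loop above
# switches MSELoss to reduction="sum" and divides the tracked loss by `num_elm` so that
# summing the per-micro-batch losses reproduces the mean-squared error over the whole
# accumulated batch (the quantity used for best-iteration selection). A minimal,
# self-contained demonstration of that identity:
import torch

torch.manual_seed(0)
pred, target = torch.randn(8, 4), torch.randn(8, 4)

mean_mse = torch.nn.MSELoss(reduction="mean")(pred, target)

sum_mse = torch.nn.MSELoss(reduction="sum")
num_elm = target.numel()
accumulated = sum(sum_mse(pred[i : i + 2], target[i : i + 2]) for i in range(0, 8, 2)) / num_elm

assert torch.allclose(mean_mse, accumulated)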
indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + if q_inputs is not None: + current_input = [q_inputs[i] for i in indices] + current_input = torch.cat(current_input, dim=0).to(device) + org_input = [inputs[i] for i in indices] + org_input = torch.cat(org_input, dim=0).to(device) + else: + current_input = [inputs[i] for i in indices] + current_input = torch.cat(current_input, dim=0).to(device) + org_input = current_input + with torch.no_grad(): + current_output = layer(org_input) + autocast_ctx = ( + nullcontext() + if not self.compressor.amp + else autocast(device_type=str(device).split(":")[0], dtype=self.compressor.amp_dtype) + ) + if self.compressor.attention_mask: + tmp_attention_mask = [self.compressor.attention_mask[i] for i in indices] + tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) + tmp_attention_mask.unsqueeze_(-1) + + with autocast_ctx: + output_q = wrapper_linear(current_input) # pylint: disable=not-callable + loss = mse_loss( # pylint: disable=not-callable + (output_q * tmp_attention_mask).to(torch.float32), + (current_output * tmp_attention_mask).to(torch.float32), + ) + + else: + with autocast_ctx: + output_q = wrapper_linear(current_input) # pylint: disable=not-callable + loss = mse_loss( # pylint: disable=not-callable + output_q.to(torch.float32), + current_output.to(torch.float32), # mul 1.0 will copy the output + ) + + num_elm = 1 if num_elm <= 0 else num_elm + total_loss += loss.item() / num_elm + + self.compressor.scale_loss_and_backward(scaler, loss) + if i == 0: + init_loss = total_loss + + if total_loss < best_loss: + best_loss = total_loss + if not self.compressor.not_use_best_mse: + best_params = collect_best_params(wrapper_linear, self.compressor.cache_device) + last_best_iter = i + if self.compressor.not_use_best_mse and i == self.compressor.iters - 1: + best_params = collect_best_params(wrapper_linear, self.compressor.cache_device) + + if not self.compressor.not_use_best_mse: + if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: + break + self.compressor._step(scaler, optimizer, lr_schedule) + + last_loss = total_loss + best_iter = self.compressor.iters + if not self.compressor.not_use_best_mse: + last_loss = best_loss + best_iter = last_best_iter + with torch.no_grad(): + unwrapper_layer(self.compressor.model, wrapper_linear, layer_name, best_params) + mv_module_from_gpu(layer) + dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" + logger.info(dump_info) + + def quantize_block( + self, + block: torch.nn.Module, + input_ids: Union[list[torch.Tensor], dict], + input_others: dict, + q_input: Union[torch.Tensor, dict, None] = None, + device: Union[str, torch.device] = "cpu", + auto_offload=True, + ): + """Quantize the weights of a given block of the model. + + Args: + block: The block of the model to be quantized. + input_ids: The input tensor containing tokenized input ids. + input_others: A dictionary containing additional input data. + q_input: The quantized input tensor. + device: The device for quantization. 
+ + Returns: + Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) + """ + if is_fp8_model(self.compressor.model): + for n, m in block.named_modules(): + if is_fp8_linear(m): + new_layer = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device).to( + device + ) + set_module(block, n, new_layer) + + if auto_offload: + # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights + # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk + if is_auto_device_mapping(self.compressor.device_map) and len(self.compressor.device_list) > 1: + card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( + block, + self.compressor.device_map, + input_ids, + self.compressor.low_gpu_mem_usage, + self.compressor.batch_size, + device, + ) + else: + block = block.to(device) + card_0_in_high_risk, loss_device = False, device + else: + card_0_in_high_risk, loss_device = False, device + + if len(self.compressor.device_list) > 1 and auto_offload: + for n, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + hook = AlignDevicesHook(m.tuning_device, io_same_device=True) + add_hook_to_module(m, hook, True) + + if q_input is None: + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + + # TODO: refactor this part + output = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + ) + + for handle in hook_handles: + handle.remove() + else: + # TODO: refactor this part + output = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + ) + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + if hook_handles: + # TODO: refactor this part + self.compressor._get_block_outputs( + block, + q_input, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + save_output=False, + ) + + for handle in hook_handles: + handle.remove() + + if q_input is not None: + if input_ids is not q_input: + clear_memory(input_ids, device_list=self.compressor.device_list) + else: + clear_memory(device_list=self.compressor.device_list) + input_ids = q_input + + quantized_layer_names, unquantized_layer_names = self.compressor.wrapper_block( + block, + self.compressor.enable_minmax_tuning, + self.compressor.enable_norm_bias_tuning, + enable_torch_compile=self.compressor.enable_torch_compile, + device=device, + ) + if is_nv_fp(self.compressor.data_type): # enable qkv and moe structure global_scale fuse + from auto_round.data_type.utils import update_fused_layer_global_scales + + modules = block.modules() + for module in modules: + update_fused_layer_global_scales(module) + round_params = [] + minmax_params = [] + for n, m in block.named_modules(): + if hasattr(m, "orig_layer"): + for key in m.params.keys(): + if "min" in key or "max" in key: + 
minmax_params.append(m.params[key]) + else: + round_params.append(m.params[key]) + + lr = torch.tensor(self.compressor.lr) + minmax_lr = torch.tensor(self.compressor.minmax_lr) + is_adam = "adam" in self.compressor.__class__.__name__.lower() + + extra_kwargs = {} if is_adam else {"momentum": self.compressor.momentum} + + if self.compressor.enable_minmax_tuning: + params = [ + {"params": round_params}, + {"params": minmax_params, "lr": minmax_lr}, + ] + else: + params = round_params + + optimizer = self.compressor.optimizer( + params, + lr=lr, + weight_decay=0, + **extra_kwargs, + ) + + if len(round_params) + len(minmax_params) <= 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block" + ) + logger.info(dump_info) + unwrapper_block(block, {}) + mv_module_from_gpu(block) + return output, output + + if self.compressor.lr_scheduler is None: + lr_schedule = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.compressor.iters + ) + else: + lr_schedule = copy.deepcopy(self.compressor.lr_scheduler) + + if isinstance(input_ids, dict): # input_ids of Flux is dict + nsamples = len(input_ids["hidden_states"]) + else: + nsamples = len(input_ids) + last_best_iter = 0 + best_loss = torch.finfo(torch.float).max + num_elm = 1 + mse_reduction = "mean" + if self.compressor.gradient_accumulate_steps != 1: + mse_reduction = "sum" + mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) + scaler = self.compressor._get_scaler() # pylint: disable=assignment-from-none + init_loss = None + best_params = {} + total_loss = 0 + global_batch_size = self.compressor.batch_size * self.compressor.gradient_accumulate_steps + global_batch_size = min(nsamples, global_batch_size) + # We assume the block input and output shape is same + if self.compressor.gradient_accumulate_steps != 1 and not self.compressor.attention_mask: + whole_indices = torch.arange(global_batch_size) + num_elm = self.compressor._get_current_num_elm(input_ids, whole_indices) + + index_sampler = IndexSampler(nsamples, global_batch_size) + batch_size = self.compressor.batch_size + for i in range(self.compressor.iters): + if self.compressor.enable_alg_ext and self.compressor.data_type.endswith("dq"): + for n, m in block.named_modules(): + m.cur_iter = i + total_loss = 0 + global_indices = index_sampler.next_batch() + if self.compressor.attention_mask: + num_elm = get_non_zero_cnt(self.compressor.attention_mask, global_indices) + + for tmp_step in range(self.compressor.gradient_accumulate_steps): + indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + current_output = self._get_current_output(output, indices, self.compressor.batch_dim) + current_output = to_device(current_output, loss_device) + # TODO: refactor this + output_q = self.compressor._get_current_q_output( + block, input_ids, input_others, indices, device, loss_device + ) + # TODO: refactor this + loss = self.compressor._get_loss(output_q, current_output, indices, mse_loss, device) + num_elm = 1 if num_elm <= 0 else num_elm + total_loss += loss.item() / num_elm + + if self.compressor.low_gpu_mem_usage and card_0_in_high_risk: + # clear memory to avoid OOM due to memory fragmentation + clear_memory_if_reached_threshold(threshold=0.5, device_list=self.compressor.device_list) + + self.compressor._scale_loss_and_backward(scaler, loss) + if self.compressor.low_gpu_mem_usage and card_0_in_high_risk: + # clear memory to avoid OOM 
due to memory fragmentation + clear_memory_if_reached_threshold(threshold=0.8, device_list=self.compressor.device_list) + + if i == 0: + init_loss = total_loss + + if total_loss < best_loss: + best_loss = total_loss + if not self.compressor.not_use_best_mse: + best_params = collect_best_params(block, self.compressor.cache_device) + # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) + + last_best_iter = i + if self.compressor.not_use_best_mse and i == self.compressor.iters - 1: + best_params = collect_best_params(block, self.compressor.cache_device) + + if not self.compressor.not_use_best_mse: + if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: + break + self.compressor._step(scaler, optimizer, lr_schedule) + + last_loss = total_loss + best_iter = self.compressor.iters + if not self.compressor.not_use_best_mse: + last_loss = best_loss + best_iter = last_best_iter + if self.compressor.iters > 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" + ) + else: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + "layers in the block" + ) + + if self.compressor.low_gpu_mem_usage: + clear_memory(device_list=self.compressor.device_list) # clear cached memory during training + if len(unquantized_layer_names) != 0: + logger.info(f"{unquantized_layer_names} have not been quantized") + with torch.no_grad(): + unwrapper_block(block, best_params) + + if is_nv_fp(self.compressor.act_data_type): + # enable moe experts act_max automatic generation for WrapperWALayer + set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") + + if self.compressor.enable_quanted_input: + # TODO: refactor this + q_outputs = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + cache_device=self.compressor.cache_device, + ) + + if len(self.compressor.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + + clear_memory(input_ids, device_list=self.compressor.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return q_outputs, output + else: + if len(self.compressor.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + clear_memory(input_ids, device_list=self.compressor.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return None, output + + @staticmethod + def _get_current_output(output: list[torch.Tensor], indices: list[int], batch_dim: int) -> torch.Tensor: + current_output = [output[x] for x in indices] + current_output = torch.cat(current_output, dim=batch_dim) + return current_output diff --git a/auto_round/quantizers/algs/base.py b/auto_round/quantizers/algs/base.py new file mode 100644 index 000000000..62e97c134 --- /dev/null +++ b/auto_round/quantizers/algs/base.py @@ -0,0 +1,27 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +from auto_round.quantizers.base import BaseQuantizer + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class AlgsBaseQuantizer(BaseQuantizer): + @abstractmethod + def quantize(self, *args, **kwargs): + pass diff --git a/auto_round/quantizers/algs/rtn.py b/auto_round/quantizers/algs/rtn.py new file mode 100644 index 000000000..80d37d2fe --- /dev/null +++ b/auto_round/quantizers/algs/rtn.py @@ -0,0 +1,627 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import traceback +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import accelerate +import torch +from accelerate.big_modeling import dispatch_model, infer_auto_device_map +from tqdm import tqdm + +from auto_round.compressors.utils import ( + check_need_act_calibration, + immediate_saving, + is_nv_fp, + is_static_wfp8afp8, +) +from auto_round.logger import logger +from auto_round.quantizers.algs.base import AlgsBaseQuantizer +from auto_round.quantizers.utils import ( + get_quantized_layer_names_outside_blocks, + quantize_embedding_layer, + register_act_max_hook, +) +from auto_round.utils import ( + check_to_quantized, + clear_memory, + convert_fp8_layer_to_linear, + convert_fp8_model_to_16b_model, + flatten_list, + get_block_names, + get_module, + is_auto_device_mapping, + is_fp8_linear, + is_fp8_model, + memory_monitor, + mv_module_from_gpu, + set_amax_for_all_moe_layers, + set_module, + to_device, + to_dtype, +) +from auto_round.utils.device import ( + clear_memory_if_reached_threshold, + get_major_device, + parse_available_devices, + set_auto_device_map_for_block_with_tuning, + set_non_auto_device_map, +) +from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class RTNQuantizer(AlgsBaseQuantizer): + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + self.all_to_quantized_module_names: list[str] = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + + def pre_quantize(self, *args, **kwargs): + if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: + self.compressor.model.to(self.compressor.amp_dtype) + + if is_nv_fp(self.compressor.data_type): + from auto_round.data_type.nvfp import calculate_gparam + from auto_round.data_type.utils import update_fused_layer_global_scales + + pbar = tqdm(self.all_to_quantized_module_names) + for 
name in pbar:
+                pbar.set_description(f"Calculate weight global scale: {name}")
+                m = get_module(self.compressor.model, name)
+                if is_fp8_linear(m):
+                    m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device)
+                    set_module(self.compressor.model, name, m)
+                weight_global_scale = calculate_gparam(m.weight, self.compressor.group_size)
+                setattr(m, "weight_global_scale", weight_global_scale)
+
+            logger.info("Start to update fused layer global scales, it may take some time.")
+            for name, module in self.compressor.model.named_modules():
+                update_fused_layer_global_scales(module)
+            logger.info("Finished updating fused layer global scales.")
+
+    @torch.inference_mode()
+    def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]:
+        """Quantize all modules in the model using RTN (Round-To-Nearest) strategy.
+
+        If the target format includes GGUF with `k`, and optimized RTN is enabled,
+        blockwise quantization with input caching and imatrix is used.
+
+        Returns:
+            tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration.
+        """
+        if not (
+            any(fmt.is_gguf() for fmt in getattr(self.compressor, "formats", []))
+            or self.compressor.super_bits is not None
+        ):
+            quantize_embedding_layer(
+                model=self.compressor.model,
+                layer_config=self.compressor.layer_config,
+                scale_dtype=self.compressor.scale_dtype,
+                disable_opt_rtn=self.compressor.disable_opt_rtn,
+                device=self.compressor.device,
+                device_list=self.compressor.device_list,
+            )  # leave to gguf itself to handle
+
+        self.compressor.model.to("cpu")
+        # Release memory
+        clear_memory(device_list=self.compressor.device_list)
+
+        if self.compressor.act_bits <= 8 and check_need_act_calibration(
+            self.compressor.act_dynamic,
+            self.compressor.act_data_type,
+            self.compressor.act_bits,
+            self.compressor.static_kv_dtype,
+            self.compressor.static_attention_dtype,
+        ):  # TODO, mixed datatype has bug
+            hook_handles = register_act_max_hook(
+                model=self.compressor.model,
+                layer_config=self.compressor.layer_config,
+                act_group_size=self.compressor.act_group_size,
+                act_data_type=self.compressor.act_data_type,
+            )
+            try:
+                self._quantize_via_rtn_blockwise()
+            except torch.OutOfMemoryError:
+                logger.warning("Fallback to CPU. Consider using more GPUs via `--device 0,1,2,3`.")
+                self.compressor.model = self.compressor.model.to("cpu")
+                clear_memory(device_list=self.compressor.device_list)
+                if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1:
+                    import accelerate
+
+                    accelerate.hooks.remove_hook_from_submodules(self.compressor.model)
+                orig_device = self.compressor.device
+                self.compressor.device = "cpu"
+                self._quantize_via_rtn_blockwise()
+                self.compressor.device = orig_device
+            for handle in hook_handles:
+                handle.remove()
+        else:
+            block_names_cnt = len(flatten_list(get_block_names(self.compressor.model, True)))
+            clear_mem_freq = len(self.all_to_quantized_module_names) // block_names_cnt
+            if clear_mem_freq == 0:
+                clear_mem_freq = 1
+            pbar = tqdm(self.all_to_quantized_module_names)
+            cnt = 1
+            for name in pbar:
+                pbar.set_description(f"Quantizing {name}")
+                self._quantize_layer_via_rtn(name)
+                if cnt % clear_mem_freq == 0:
+                    clear_memory(device_list=self.compressor.device_list)
+                    memory_monitor.log_summary()
+                    cnt = 1
+                cnt += 1
+        # Convert remaining fp8
+        if is_fp8_model(self.compressor.model):
+            convert_fp8_model_to_16b_model(self.compressor.model, self.compressor.amp_dtype, self.compressor.device)
+        self.compressor.quantized = True
+        return self.compressor.model, self.compressor.layer_config
+
+    def post_quantize(self, *args, **kwargs):
+        pass
+
+    def _quantize_via_rtn_blockwise(self) -> None:
+        """Quantize model layers block by block using cached inputs."""
+
+        all_to_quantized_module_names = list(set(self.all_to_quantized_module_names))
+
+        all_blocks = (
+            self.compressor.quant_block_list
+            if self.compressor.quant_block_list
+            else get_block_names(self.compressor.model)
+        )
+        if not all_blocks:
+            raise ValueError("Could not find any blocks. Check the model or quant_block_list.")
+
+        all_first_block_names = [block[0] for block in all_blocks]
+        layer_names = get_quantized_layer_names_outside_blocks(
+            model=self.compressor.model,
+            layer_config=self.compressor.layer_config,
+            supported_types=self.compressor.supported_types,
+            quant_block_list=self.compressor.quant_block_list,
+        )
+        if self.compressor.act_bits < 16 and (not self.compressor.act_dynamic or len(layer_names) > 0):
+            if len(layer_names) > 0:
+                logger.warning(
+                    "quantizing layers outside blocks for static activation quantization"
+                    " will significantly increase calibration time"
+                )
+            all_inputs = self.compressor.try_cache_inter_data_gpucpu(
+                all_first_block_names, self.compressor.nsamples, layer_names
+            )
+        else:
+            all_inputs = self.compressor.cache_inter_data(all_first_block_names, self.compressor.nsamples)
+
+        # Clear hooks for multi-GPU setups
+        if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1:
+            accelerate.hooks.remove_hook_from_submodules(self.compressor.model)
+
+        pbar = tqdm(range(sum(len(block) for block in all_blocks)))
+
+        for block_names in all_blocks:
+            first_block = block_names[0]
+            inputs = all_inputs.pop(first_block)
+            input_keys = [k for k in inputs if k.startswith("hidden_state")]
+            if len(input_keys) != 1:
+                raise RuntimeError(
+                    "hidden_states arg mismatch. 
Please file an issue at https://github.com/intel/auto-round/issues" + ) + inputs["input_ids"] = inputs.pop(input_keys[0]) + + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + + total_samples = len(inputs["input_ids"]) + if total_samples < self.compressor.batch_size: + self.compressor.batch_size = total_samples + logger.warning(f"Forcing batch size to {total_samples}") + + input_ids = to_device(inputs.pop("input_ids"), self.compressor.cache_device) + input_others = to_device(inputs, self.compressor.cache_device) + + tmp_dtype = self.compressor.amp_dtype if self.compressor.amp else torch.float32 + input_ids = [id_.to(tmp_dtype) for id_ in input_ids] + + for key, val in input_others.items(): + if isinstance(val, torch.Tensor) and val.dtype in (torch.float16, torch.bfloat16): + input_others[key] = val.to(tmp_dtype) + elif isinstance(val, list): + input_others[key] = [to_dtype(v, tmp_dtype) for v in val] + + for block_name in block_names: + pbar.set_description(f"Quantizing {block_name}") + block = get_module(self.compressor.model, block_name) + if is_fp8_model(self.compressor.model): + convert_fp8_model_to_16b_model( + block, dtype=self.compressor.amp_dtype, device=self.compressor.device + ) + + if is_auto_device_mapping(self.compressor.device_map) and len(self.compressor.device_list) > 1: + set_auto_device_map_for_block_with_tuning( + block, + self.compressor.device_map, + input_ids, + self.compressor.low_gpu_mem_usage, + self.compressor.batch_size, + self.compressor.device, + ) + # Dispatch model if needed + if len(self.compressor.device_list) > 1: + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + for _, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + hook = AlignDevicesHook(m.tuning_device, io_same_device=True) + add_hook_to_module(m, hook, True) + else: + block = block.to(self.compressor.device) + + # TODO: refactor this part + input_ids = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + self.compressor.device, + self.compressor.cache_device, + ) + + if len(self.compressor.device_list) > 1: + accelerate.hooks.remove_hook_from_submodules(block) + + if is_nv_fp(self.compressor.act_data_type) or is_static_wfp8afp8(self.compressor): + # enable moe experts act_max automatic generation for Linear + set_amax_for_all_moe_layers(block, attr_name="act_max") + if self.compressor.low_gpu_mem_usage: + block.to("cpu") + clear_memory(device_list=self.compressor.device_list) + + for _, m in block.named_modules(): + if hasattr(m, "tmp_name") and m.tmp_name in all_to_quantized_module_names: + self._quantize_layer_via_rtn(m.tmp_name, to_cpu=self.compressor.low_gpu_mem_usage) + all_to_quantized_module_names.remove(m.tmp_name) + if not self.compressor.immediate_saving: + mv_module_from_gpu(block) + if block_name == block_names[-1]: + clear_memory(input_ids, device_list=self.compressor.device_list) + else: + clear_memory(device_list=self.compressor.device_list) + + memory_monitor.log_summary() + pbar.update(1) + pbar.close() + # Process remaining layers not in blocks + for name in all_to_quantized_module_names: + dtype = None + if self.compressor.super_group_size is not None: + dtype = torch.float32 + self._quantize_layer_via_rtn(name, dtype=dtype) + # clear_memory(device_list=self.compressor.device_list) + + def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: + 
"""Quantizes a layer using RTN (Round-To-Nearest) if available. + + This function attempts to quantize a layer by switching its data type to a + `rtn_*` version if supported, then wraps and unwraps the module to apply + quantization. If GPU memory is insufficient, it falls back to CPU. + + If packing is enabled (`immediate_packing`), the function will also export + the quantized layer to the appropriate backend format. + + Args: + name (str): Name of the layer to quantize. + + Raises: + RuntimeError: If quantization fails for reasons unrelated to memory. + """ + m = get_module(self.compressor.model, name) + if dtype is not None: + m = m.to(dtype) + + if is_fp8_linear(m): + m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device) + set_module(self.compressor.model, name, m) + tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.compressor.device + + try: + m = m.to(tuning_device) + m = WrapperLinear( + m, + device=tuning_device, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + m = m.orig_layer if hasattr(m, "orig_layer") else m + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU.") + m.to("cpu") + m = WrapperLinear( + m, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except Exception as e: + raise + + # Step 2: Optional immediate packing/export + if self.compressor.immediate_packing: # For gguf, packing conducts on block level + self.compressor._immediate_pack(name) + if to_cpu: + m = m.to("cpu") + packed_m = get_module(self.compressor.model, name) + set_module(self.compressor.model, name, packed_m.to("cpu")) + else: + if to_cpu: + m = m.to("cpu") + set_module(self.compressor.model, name, m) + if self.compressor.immediate_saving: + if hasattr(self.compressor, "all_to_quantized_module_names"): + all_to_quantized_module_names = self.compressor.all_to_quantized_module_names + else: + all_to_quantized_module_names = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) + m = get_module(self.compressor.model, name) + immediate_saving(self.compressor, m, name, last_module) + + +class OptRTNQuantizer(RTNQuantizer): + + @staticmethod + def register_act_hook(model, supported_types): + """Registers hooks to accumulate activation squared norms into `imatrix`.""" + + def get_imatrix_hook(module, input, output): + input = input[0] if isinstance(input, (tuple, list)) else input + flattened = input.reshape(-1, input.shape[-1]).to(torch.float32) + squared = torch.sum(torch.pow(flattened, 2), dim=0).to(torch.float32) + + if not hasattr(module, "imatrix"): + module.imatrix = squared + module.imatrix_cnt = input.shape[0] + else: + module.imatrix += squared.to(module.imatrix.device) + module.imatrix_cnt += input.shape[0] + + hook_handles = [] + for name, module in model.named_modules(): + if type(module) in supported_types and check_to_quantized(module): + hook = module.register_forward_hook(get_imatrix_hook) + hook_handles.append(hook) + return hook_handles + + @torch.inference_mode() + def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + 
enable_imatrix = False + has_gguf_k = ( + any(fmt.is_gguf() and "k" in fmt.output_format for fmt in getattr(self.compressor, "formats", [])) + or self.compressor.super_bits is not None + ) + if has_gguf_k: + enable_imatrix = True + elif self.compressor.data_type == "int" and self.compressor.sym: + enable_imatrix = True + if enable_imatrix: + self._quant_rtn_with_imatrix(self.all_to_quantized_module_names) + # Convert remaining fp8 + if is_fp8_model(self.compressor.model): + convert_fp8_model_to_16b_model(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) + self.compressor.quantized = True + return self.compressor.model, self.compressor.layer_config + else: + return super().quantize(*args, **kwargs) + + def _quant_rtn_with_imatrix(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize all modules in the model using Optimized RTN strategy. + + This method applies optimized RTN quantization to all modules in the model + that are marked for quantization. It leverages input caching and imatrix + techniques for enhanced performance. + + Returns: + tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. + """ + if not ( + any(fmt.is_gguf() for fmt in getattr(self.compressor, "formats", [])) + or self.compressor.super_bits is not None + ): + quantize_embedding_layer( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + scale_dtype=self.compressor.data_type, + disable_opt_rtn=self.compressor.disable_opt_rtn, + device=self.compressor.device, + device_list=self.compressor.device_list, + ) # leave to gguf itself to handle + + self.compressor.model.to("cpu") + # Release memory + clear_memory(device_list=self.compressor.device_list) + + logger.info("start to compute imatrix") + + # Load dataset + from auto_round.calib_dataset import get_dataloader + + if isinstance(self.compressor.dataset, str): + if self.compressor.tokenizer is None: + raise ValueError("A tokenizer must be set for the model when using a dataset string.") + dataset_name = self.compressor.dataset.replace(" ", "") + self.compressor.dataloader = get_dataloader( + self.compressor.tokenizer, + self.compressor.seqlen, + dataset_name, + self.compressor.seed, + self.compressor.batch_size, + self.compressor.nsamples, + ) + else: + self.compressor.dataloader = self.compressor.dataset + + model = self.compressor.model + + # Dispatch multi-GPU model if necessary + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + dispatch_model(model, model.hf_device_map) + + hooks = self.register_act_hook(model, self.compressor.supported_types) + + try: + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + model = model.to("cpu") + clear_memory(device_list=self.compressor.device_list) + self._quantize_via_rtn_blockwise() + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.error(cuda_error_msg) + # Final fallback: warn and use CPU-only quantization + logger.warning( + "Fallback to CPU. " + "Consider enabling `low_gpu_mem_usage` or using more GPUs via `--device 0,1,2,3`." 
+ ) + model = model.to("cpu") + clear_memory(device_list=self.compressor.device_list) + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + + orig_device = self.compressor.device + self.compressor.device = "cpu" + self._quantize_via_rtn_blockwise() + self.compressor.device = orig_device + except Exception as e: + raise + finally: + # Always remove hooks + for hook in hooks: + hook.remove() + + def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: + """Quantizes a layer using RTN (Round-To-Nearest) if available. + + This function attempts to quantize a layer by switching its data type to a + `rtn_*` version if supported, then wraps and unwraps the module to apply + quantization. If GPU memory is insufficient, it falls back to CPU. + + If packing is enabled (`immediate_packing`), the function will also export + the quantized layer to the appropriate backend format. + + Args: + name (str): Name of the layer to quantize. + + Raises: + RuntimeError: If quantization fails for reasons unrelated to memory. + """ + m = get_module(self.compressor.model, name) + if dtype is not None: + m = m.to(dtype) + + if is_fp8_linear(m): + m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device) + set_module(self.compressor.model, name, m) + tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.compressor.device + # Step 1: Try quantization on GPU first, fall back to CPU if OOM + if ( + self.compressor.immediate_packing + and self.compressor.iters == 0 + and self.compressor.formats[0].is_gguf() + and not self.compressor.disable_opt_rtn + ): + m = m.to(tuning_device) + m.scale = None + m.zp = None + else: + try: + disable_opt_rtn = False + if ( + self.compressor.orig_disable_opt_rtn is None + and self.compressor.is_moe_model + and "expert" in m.tmp_name + and "shared_expert" not in m.tmp_name + and self.compressor.super_bits is None # GGUF still uses the optimized RTN for MoE layers + ): + disable_opt_rtn = True + logger.warning_once( + "MoE layer detected: optimized RTN is disabled for efficiency. " + "Use `--enable_opt_rtn` to force-enable it for MoE layers." 
+ ) + m = m.to(tuning_device) + m = WrapperLinear( + m, + device=tuning_device, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + disable_opt_rtn=disable_opt_rtn, + ) + m = m.unwrapper({}) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + m = m.orig_layer if hasattr(m, "orig_layer") else m + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU.") + m.to("cpu") + m = WrapperLinear( + m, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except Exception as e: + raise + + # Step 2: Optional immediate packing/export + if self.compressor.immediate_packing: # For gguf, packing conducts on block level + self.compressor._immediate_pack(name) + if to_cpu: + m = m.to("cpu") + packed_m = get_module(self.compressor.model, name) + set_module(self.compressor.model, name, packed_m.to("cpu")) + else: + if to_cpu: + m = m.to("cpu") + set_module(self.compressor.model, name, m) + if self.compressor.immediate_saving: + if hasattr(self.compressor, "all_to_quantized_module_names"): + all_to_quantized_module_names = self.compressor.all_to_quantized_module_names + else: + all_to_quantized_module_names = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) + m = get_module(self.compressor.model, name) + immediate_saving(self.compressor, m, name, last_module) diff --git a/auto_round/quantizers/base.py b/auto_round/quantizers/base.py new file mode 100644 index 000000000..bd6b07a69 --- /dev/null +++ b/auto_round/quantizers/base.py @@ -0,0 +1,50 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific la + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class BaseQuantizer(ABC): + def __init__(self, compressor: "BaseCompressor"): + self.compressor = compressor + + def pre_quantize(self, *args, **kwargs): + pass + + def quantize(self, *args, **kwargs): + pass + + def post_quantize(self, *args, **kwargs): + pass + + def pre_quantize_layer(self, *args, **kwargs): + pass + + def quantize_layer(self, *args, **kwargs): + pass + + def post_quantize_layer(self, *args, **kwargs): + pass + + def pre_quantize_block(self, *args, **kwargs): + pass + + def quantize_block(self, *args, **kwargs): + pass + + def post_quantize_block(self, *args, **kwargs): + pass diff --git a/auto_round/quantizers/entrypoint.py b/auto_round/quantizers/entrypoint.py new file mode 100644 index 000000000..d22a999f1 --- /dev/null +++ b/auto_round/quantizers/entrypoint.py @@ -0,0 +1,55 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from auto_round.compressors.base import BaseCompressor
+
+from auto_round.quantizers.algs.auto_round import ARQuantizer
+from auto_round.quantizers.algs.rtn import OptRTNQuantizer, RTNQuantizer
+
+
+class AutoRoundQuantizer:
+    def __new__(cls, compressor: "BaseCompressor", dynamic_quantizers: dict = None):
+        assert dynamic_quantizers is not None, "Please provide dynamic_quantizers dict."
+        quantizer_cls = type("AutoRoundQuantizer", tuple(dynamic_quantizers.values()), {})
+        return quantizer_cls(compressor)
+
+
+class Quanterizers:
+    def __init__(self, quantizers: list[AutoRoundQuantizer]):
+        self.quantizers = quantizers
+
+    def quantize(self, *args, **kwargs):
+        for quantizer in self.quantizers:
+            quantizer.pre_quantize(*args, **kwargs)
+            model, layer_config = quantizer.quantize(*args, **kwargs)
+            quantizer.post_quantize(*args, **kwargs)
+        return model, layer_config
+
+
+def create_quantizers(compressor: "BaseCompressor"):
+
+    alg_cls = None
+    if compressor.iters > 0:
+        alg_cls = ARQuantizer
+    else:
+        alg_cls = OptRTNQuantizer if compressor.disable_opt_rtn is False else RTNQuantizer
+
+    dynamic_quantizers = {"algs": alg_cls}
+    return Quanterizers(
+        quantizers=[
+            AutoRoundQuantizer(compressor, dynamic_quantizers=dynamic_quantizers),
+        ]
+    )
diff --git a/auto_round/quantizers/readme.md b/auto_round/quantizers/readme.md
new file mode 100644
index 000000000..7a5b7fad6
--- /dev/null
+++ b/auto_round/quantizers/readme.md
@@ -0,0 +1,21 @@
+# AutoRound Quantizer
+The main functional component: it holds the different quantization algorithms and the concrete execution logic of quantization.
+
+## Structure and call flow
+AutoRoundQuantizer is organized into three layers, from coarse to fine granularity (extensible): algs, model_type and data_type. A Quantizer is constructed dynamically by inheriting methods from each layer; classes within the same layer are mutually exclusive, while classes from different layers can be combined freely.
+
+AutoRoundQuantizer
+- algs
+  - RTN
+  - Tuning(auto_round)
+- model_type
+  - llm
+  - mllm
+  - diffusion
+- data_type
+  - gguf
+  - nvfp/mxfp
+### 1. AutoRoundQuantizer
+The main entry point. Based on the configuration, it uses the __new__ method to dynamically construct a Quantizer that inherits methods from AlgsQuantizer, ModelTypeQuantizer and DataTypeQuantizer; a finer-granularity layer may override the methods of a coarser-granularity layer.
+
+### 2. AlgsQuantizer
\ No newline at end of file
diff --git a/auto_round/quantizers/utils.py b/auto_round/quantizers/utils.py
new file mode 100644
index 000000000..3ea0c009f
--- /dev/null
+++ b/auto_round/quantizers/utils.py
@@ -0,0 +1,262 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
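# Editor's note: standalone sketch, not part of the patch. It illustrates the dynamic
# class composition that AutoRoundQuantizer.__new__ performs in entrypoint.py via
# `type(name, bases, dict)`: one class is picked per granularity layer and the composed
# class resolves methods through the normal MRO, so a finer-grained layer listed first
# can override a coarser one. The layer classes below are hypothetical stand-ins.
class AlgsLayer:
    def quantize(self):
        return "rtn"

    def pre_quantize(self):
        return "algs pre_quantize"


class DataTypeLayer:
    def quantize(self):  # overrides AlgsLayer.quantize when listed before it
        return "gguf"


Composed = type("AutoRoundQuantizer", (DataTypeLayer, AlgsLayer), {})
q = Composed()
print(q.quantize())      # -> "gguf"  (finer-granularity layer wins)
print(q.pre_quantize())  # -> "algs pre_quantize"  (inherited from the algs layer)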
+import traceback +from typing import Any, Callable, Optional, Union + +import torch + +from auto_round.compressors.utils import ( + check_need_act_calibration, + is_nv_fp, +) +from auto_round.data_type import QUANT_FUNC_WITH_DTYPE +from auto_round.data_type.utils import reshape_pad_tensor_by_group_size +from auto_round.logger import logger +from auto_round.utils import ( + SUPPORTED_LAYER_TYPES, + check_to_quantized, + clear_memory, + get_layer_names_in_block, + get_module, + to_device, + to_dtype, +) + + +def register_act_max_hook(model: torch.nn.Module, layer_config: dict, act_group_size: int, act_data_type: str): + def get_act_max_hook(module, input, output): + if isinstance(input, (tuple, list)): + input = input[0] + if input.numel() == 0: + return # as no needs for act_max update + input, _, _ = reshape_pad_tensor_by_group_size(input, act_group_size) + act_max = torch.max(torch.abs(input), dim=-1).values + if not hasattr(module, "act_max") or module.act_max.numel() == 0: + module.act_max = act_max + else: + act_max = act_max.to(module.act_max.device) + if is_nv_fp(act_data_type): ## for nvfp per-tensor input_global_scale calculation usage + module.act_max = torch.max(torch.tensor([act_max.max(), module.act_max.max()], device=act_max.device)) + else: + module.act_max = torch.max(act_max, module.act_max) + + hook_handles = [] + # for single layers out of blocks, like lm_head + if isinstance(model, SUPPORTED_LAYER_TYPES): + m = model + if ( + hasattr(m, "act_dynamic") + and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) + and check_to_quantized(m) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + return hook_handles + + for n, m in model.named_modules(): + if ( + hasattr(m, "act_dynamic") + and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) + and check_to_quantized(m) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + continue + + # for whole model, RTN + if n in layer_config: + config = layer_config[n] + act_dynamic = config.get("act_dynamic", True) + act_data_type = config.get("act_data_type", None) + act_bits = config.get("act_bits", 16) + if ( + config["bits"] <= 8 + and check_need_act_calibration(act_dynamic, act_data_type, act_bits) + and check_to_quantized(config) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + continue + return hook_handles + + +@torch.inference_mode() +def quantize_embedding_layer( + model: torch.nn.Module, + layer_config: dict, + scale_dtype: str, + disable_opt_rtn: bool, + device: Union[str, torch.device], + device_list: list, +) -> bool: + """Quantizes embedding layers in the model according to the configuration. + + This method iterates through all modules in the model, identifies embedding + layers specified in `layer_config`, and applies the appropriate quantization + function based on bit precision, grouping strategy, and dtype. + + Returns: + bool: True if the quantization process completes without critical errors. 
+ """ + is_quantized = False + for name, module in model.named_modules(): + # Skip non-Embedding modules or layers not in config + if not isinstance(module, torch.nn.Embedding) or name not in layer_config: + continue + + config = layer_config[name] + + # Skip layers that are not marked for quantization + if not check_to_quantized(config): + continue + is_quantized = True + config["scale_dtype"] = scale_dtype + dtype = config["data_type"] + + # Determine quantization function key with symmetry/asymmetry + if dtype not in QUANT_FUNC_WITH_DTYPE: + dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}" + + # Optionally use optimized rounding (RTN) variant + if not disable_opt_rtn and f"rtn_{dtype}" in QUANT_FUNC_WITH_DTYPE: + dtype = f"rtn_{dtype}" + + quant_func = QUANT_FUNC_WITH_DTYPE[dtype] + dtype = module.weight.dtype + # As typically float32 are used in RTN to search scale zp, + # to avoid cache a bf16 copy we'd better use float32 + if config.get("super_group_size", None) is not None: + dtype = torch.float32 + + # Attempt quantization on GPU, fall back to CPU if OOM + try: + weight, scale, zp = quant_func( + module.weight.to(dtype=dtype, device=device), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU") + weight, scale, zp = quant_func( + module.weight.to("cpu"), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except Exception as e: + raise + + # Overwrite the module's weights with the quantized version + module.weight.data.copy_(weight.cpu()) + + # Attach scale and zero point (zp) to the module + for param_name, value in zip(["scale", "zp"], [scale, zp]): + if isinstance(value, dict): + for k, v in value.items(): + setattr(module, k if k == "scale" else f"w_{k}", v.cpu()) + elif isinstance(value, torch.Tensor): + setattr(module, param_name, value.cpu()) + else: + setattr(module, param_name, value) + + # Update config + layer_config.setdefault(name, {}).update(config) + del weight + del scale + del zp + clear_memory(device_list=device_list) + return is_quantized + + +def get_quantized_layer_names_outside_blocks( + model: torch.nn.Module, layer_config: dict, supported_types: list, quant_block_list: list +) -> list: + """Gets the names of quantized layers outside blocks in the model. + + Returns: + list: List of layer names outside blocks. 
+ """ + if layer_config is None or len(layer_config) == 0: + return [] + + layer_names = [] + all_layers_in_block = get_layer_names_in_block(model, supported_types, quant_block_list) + + for key in layer_config.keys(): + if key in all_layers_in_block: + continue + layer = get_module(model, key) + if layer is None: + logger.error(f"could not find layer {key} in the model, exit...") + exit(-1) + if type(layer) in supported_types and check_to_quantized(layer_config[key]): + layer_names.append(key) + + return layer_names + + +def get_non_zero_cnt(tensor: list[torch.Tensor], indices: list[int]) -> int: + current_tensors = [tensor[i] for i in indices] + non_zero_cnt = 0 + for t in current_tensors: + non_zero_cnt += torch.count_nonzero(t).item() + return non_zero_cnt + + +def split_inputs(inputs: dict, first_input_name: str, diffusion: bool = False) -> tuple[torch.Tensor, dict]: + if diffusion: + input_id_str = [key for key in inputs.keys() if "hidden_state" in key] + input_ids = {k: inputs.pop(k, None) for k in input_id_str} + input_others = inputs + return input_ids, input_others + else: + input_ids = inputs[first_input_name] + inputs.pop(first_input_name, None) + input_others = inputs + return input_ids, input_others + + +def preprocess_block_inputs( + inputs, + device_list: list, + first_input_name="input_ids", + amp: bool = False, + amp_dtype: torch.dtype = torch.float32, + cache_device: Union[str, torch.device] = "cpu", + diffusion: bool = False, +): + input_ids, input_others = split_inputs(inputs, first_input_name, diffusion=diffusion) + clear_memory(device_list=device_list) + input_ids = to_device(input_ids, cache_device) + input_others = to_device(input_others, cache_device) + # As in calibration phase, we may use bf16 for calibration due to low_gpu_memory usage + + tmp_dtype = amp_dtype if amp else torch.float32 + input_ids = to_dtype(input_ids, tmp_dtype) + + for key in input_others.keys(): + if isinstance(input_others[key], torch.Tensor) and ( + input_others[key].dtype == torch.float16 or input_others[key].dtype == torch.bfloat16 + ): + input_others[key] = input_others[key].to(tmp_dtype) + elif isinstance(input_others[key], list): + for i in range(len(input_others[key])): + to_dtype(input_others[key][i], tmp_dtype) + return input_ids, input_others From cb334610ac80db04017a04152e41f931d0e6e404 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 14 Jan 2026 03:56:13 -0500 Subject: [PATCH 2/5] fix Signed-off-by: n1ck-guo --- auto_round/quantizers/algs/auto_round.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/auto_round/quantizers/algs/auto_round.py b/auto_round/quantizers/algs/auto_round.py index da02d0397..a2159d292 100644 --- a/auto_round/quantizers/algs/auto_round.py +++ b/auto_round/quantizers/algs/auto_round.py @@ -784,7 +784,9 @@ def quantize_block( for tmp_step in range(self.compressor.gradient_accumulate_steps): indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - current_output = self._get_current_output(output, indices, self.compressor.batch_dim) + current_output = self._get_current_output( + output, indices, self.compressor.batch_dim, diffusion=self.compressor.diffusion + ) current_output = to_device(current_output, loss_device) # TODO: refactor this output_q = self.compressor._get_current_q_output( @@ -882,7 +884,15 @@ def quantize_block( return None, output @staticmethod - def _get_current_output(output: list[torch.Tensor], indices: list[int], batch_dim: int) -> torch.Tensor: + def _get_current_output( + output: 
list[torch.Tensor], indices: list[int], batch_dim: int, diffusion: bool = False + ) -> torch.Tensor: + if diffusion: + assert "hidden_states" in output + current_output = [output["hidden_states"][x] for x in indices] + current_output = torch.cat(current_output, dim=batch_dim) + return current_output + current_output = [output[x] for x in indices] current_output = torch.cat(current_output, dim=batch_dim) return current_output From 52afa900cc9160167b0fd74e1dc4ef76a7080d96 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 14 Jan 2026 20:26:55 -0500 Subject: [PATCH 3/5] fix Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 5 +++-- auto_round/quantizers/algs/auto_round.py | 6 +++--- auto_round/quantizers/algs/rtn.py | 4 ++-- auto_round/quantizers/base.py | 3 ++- auto_round/quantizers/entrypoint.py | 4 ++-- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index a383659d0..b59e7354f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1590,7 +1590,7 @@ def quantize_block( self.normalize_decoding_layer_inputs_(inputs) block_inputs = self.inputs[self.quant_block_list[0][0]] decoding_layer_first_input_name = "hidden_states" - from auto_round.quantizers.algs.auto_round import AutoRoundQuantizer + from auto_round.quantizers.algs.auto_round import ARQuantizer from auto_round.quantizers.utils import preprocess_block_inputs input_ids, input_others = preprocess_block_inputs( @@ -1602,7 +1602,8 @@ def quantize_block( cache_device=self.cache_device, diffusion=self.diffusion, ) - return AutoRoundQuantizer(self).quantize_block(block, input_ids, input_others, q_input, device, auto_offload) + ar_quantizer = ARQuantizer(self) + return ar_quantizer.quantize_block(block, input_ids, input_others, q_input, device, auto_offload) def _get_loss( self, diff --git a/auto_round/quantizers/algs/auto_round.py b/auto_round/quantizers/algs/auto_round.py index a2159d292..db3330088 100644 --- a/auto_round/quantizers/algs/auto_round.py +++ b/auto_round/quantizers/algs/auto_round.py @@ -108,7 +108,7 @@ def quantize(self, *args, **kwargs): is_quantized_embedding = quantize_embedding_layer( model=self.compressor.model, layer_config=self.compressor.layer_config, - scale_dtype=self.compressor.data_type, + scale_dtype=self.compressor.scale_dtype, disable_opt_rtn=self.compressor.disable_opt_rtn, device=self.compressor.device, device_list=self.compressor.device_list, @@ -462,11 +462,11 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. 
lr = torch.tensor(self.compressor.lr) minmax_lr = torch.tensor(self.compressor.minmax_lr) if self.compressor.enable_minmax_tuning: - optimizer = self.optimizer( + optimizer = self.compressor.optimizer( [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 ) else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0) + optimizer = self.compressor.optimizer(round_params, lr=lr, weight_decay=0) if self.compressor.lr_scheduler is None: lr_schedule = torch.optim.lr_scheduler.LinearLR( diff --git a/auto_round/quantizers/algs/rtn.py b/auto_round/quantizers/algs/rtn.py index 80d37d2fe..f1b4676f4 100644 --- a/auto_round/quantizers/algs/rtn.py +++ b/auto_round/quantizers/algs/rtn.py @@ -112,7 +112,7 @@ def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: quantize_embedding_layer( model=self.compressor.model, layer_config=self.compressor.layer_config, - scale_dtype=self.compressor.data_type, + scale_dtype=self.compressor.scale_dtype, disable_opt_rtn=self.compressor.disable_opt_rtn, device=self.compressor.device, device_list=self.compressor.device_list, @@ -453,7 +453,7 @@ def _quant_rtn_with_imatrix(self, *args, **kwargs) -> tuple[torch.nn.Module, dic quantize_embedding_layer( model=self.compressor.model, layer_config=self.compressor.layer_config, - scale_dtype=self.compressor.data_type, + scale_dtype=self.compressor.scale_dtype, disable_opt_rtn=self.compressor.disable_opt_rtn, device=self.compressor.device, device_list=self.compressor.device_list, diff --git a/auto_round/quantizers/base.py b/auto_round/quantizers/base.py index bd6b07a69..1e1a73f17 100644 --- a/auto_round/quantizers/base.py +++ b/auto_round/quantizers/base.py @@ -9,7 +9,8 @@ # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific la +# See the License for the specific language governing permissions and +# limitations under the License. 
from abc import ABC, abstractmethod from typing import TYPE_CHECKING diff --git a/auto_round/quantizers/entrypoint.py b/auto_round/quantizers/entrypoint.py index d22a999f1..473dcfca1 100644 --- a/auto_round/quantizers/entrypoint.py +++ b/auto_round/quantizers/entrypoint.py @@ -27,7 +27,7 @@ def __new__(cls, compressor: "BaseCompressor", dynamic_quantizers: dict = None): return quantizer_cls(compressor) -class Quanterizers: +class Quantizers: def __init__(self, quantizers: list[AutoRoundQuantizer]): self.quantizers = quantizers @@ -48,7 +48,7 @@ def create_quantizers(compressor: "BaseCompressor"): alg_cls = OptRTNQuantizer if compressor.disable_opt_rtn is False else RTNQuantizer dynamic_quantizers = {"algs": alg_cls} - return Quanterizers( + return Quantizers( quantizers=[ AutoRoundQuantizer(compressor, dynamic_quantizers=dynamic_quantizers), ] From 22ffe2d8b409befa95387adea6051b220854950c Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 20 Jan 2026 15:51:51 +0800 Subject: [PATCH 4/5] update Signed-off-by: n1ck-guo --- .pre-commit-config.yaml | 0 auto_round/__init__.py | 2 +- auto_round/autoround.py | 108 +------- auto_round/compressors/__init__.py | 1 - auto_round/compressors/adam.py | 164 ------------ auto_round/compressors/base.py | 66 +---- .../compressors/diffusion/compressor.py | 7 - auto_round/compressors/utils.py | 8 +- auto_round/quantizers/algs/auto_round.py | 247 +++++++++++++----- auto_round/quantizers/algs/base.py | 28 +- auto_round/quantizers/algs/rtn.py | 11 +- auto_round/quantizers/base.py | 52 ++++ auto_round/quantizers/entrypoint.py | 8 +- auto_round/quantizers/utils.py | 28 +- test/test_cpu/export/test_gguf_format.py | 10 +- test/test_cpu/utils/test_cli_usage.py | 16 +- .../advanced/test_multiple_card_calib.py | 4 +- test/test_cuda/export/test_gguf.py | 2 +- test/test_cuda/models/test_support_vlms.py | 10 +- test/test_cuda/utils/test_alg_ext.py | 4 +- 20 files changed, 322 insertions(+), 454 deletions(-) mode change 100644 => 100755 .pre-commit-config.yaml mode change 100644 => 100755 auto_round/__init__.py mode change 100644 => 100755 auto_round/autoround.py mode change 100644 => 100755 auto_round/compressors/__init__.py delete mode 100644 auto_round/compressors/adam.py mode change 100644 => 100755 auto_round/compressors/base.py mode change 100644 => 100755 auto_round/compressors/diffusion/compressor.py mode change 100644 => 100755 auto_round/compressors/utils.py mode change 100644 => 100755 auto_round/quantizers/algs/auto_round.py mode change 100644 => 100755 auto_round/quantizers/algs/base.py mode change 100644 => 100755 auto_round/quantizers/algs/rtn.py mode change 100644 => 100755 auto_round/quantizers/base.py mode change 100644 => 100755 auto_round/quantizers/entrypoint.py mode change 100644 => 100755 auto_round/quantizers/utils.py mode change 100644 => 100755 test/test_cpu/export/test_gguf_format.py mode change 100644 => 100755 test/test_cpu/utils/test_cli_usage.py mode change 100644 => 100755 test/test_cuda/advanced/test_multiple_card_calib.py mode change 100644 => 100755 test/test_cuda/export/test_gguf.py mode change 100644 => 100755 test/test_cuda/models/test_support_vlms.py mode change 100644 => 100755 test/test_cuda/utils/test_alg_ext.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 diff --git a/auto_round/__init__.py b/auto_round/__init__.py old mode 100644 new mode 100755 index 87c70e06a..e2f1b6c58 --- a/auto_round/__init__.py +++ b/auto_round/__init__.py @@ -14,7 +14,7 @@ from auto_round.autoround import 
AutoRound # support for old api -from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam, AutoRoundDiffusion +from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundDiffusion from auto_round.schemes import QuantizationScheme from auto_round.auto_scheme import AutoScheme from auto_round.utils import LazyImport diff --git a/auto_round/autoround.py b/auto_round/autoround.py old mode 100644 new mode 100755 index 6d69c01c3..0cff92082 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -18,7 +18,6 @@ import torch from auto_round.compressors import ( - AdamCompressor, BaseCompressor, DiffusionCompressor, ExtraConfig, @@ -173,8 +172,6 @@ def __new__( extra_config.diffusion_config = None model_cls.append(LLMCompressor) - if enable_adam: - model_cls.append(AdamCompressor) dynamic_compressor = type("AutoRound", tuple(model_cls), {}) if extra_config: kwargs.update(extra_config.to_dict()) @@ -187,6 +184,7 @@ def __new__( "'fp_layers' is deprecated, please use 'ignore_layers' to set layers not to be quantized." ) kwargs["ignore"] = kwargs.pop("fp_layers") + kwargs["enable_adam"] = enable_adam ar = dynamic_compressor( model=model, tokenizer=tokenizer, @@ -371,110 +369,6 @@ def __init__( ) -@deprecated("AutoRound") -class AutoRoundAdam(AdamCompressor): - """Class for quantization with optimizers like adamw of a PyTorch model. - - Args: - model: The PyTorch model to be quantized. - tokenizer: An optional tokenizer for processing input data. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] - scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations - bits (int): Number of bits for quantization (default is 4). - group_size (int): Size of the quantization group (default is 128). - sym (bool): Whether sym to be used (default is True). - layer_config (dict): Configuration for weight quantization (default is None). - batch_size (int): Batch size for training (default is 8). - amp (bool): Whether to use automatic mixed precision (default is True). - device: The device to be used for training (default is "auto"). - lr_scheduler: The learning rate scheduler to be used. - dataset: The default dataset name (default is "NeelNanda/pile-10k"). - enable_quanted_input (bool): Whether to use quantized input data (default is True). - enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True). - lr (float): The learning rate (default is 0.005). - minmax_lr (float): The learning rate for min-max tuning (default is None). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). - iters (int): Number of iterations (default is 200). - seqlen (int): Length of the sequence. - nsamples (int): Number of samples (default is 128). - sampler (str): The sampling method (default is "rand"). - seed (int): The random seed (default is 42). - nblocks (int): Number of blocks (default is 1). - gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). - not_use_best_mse (bool): Whether to use mean squared error (default is False). - dynamic_max_gap (int): The dynamic maximum gap (default is -1). - data_type (str): The data type to be used (default is "int"). - scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. - act_bits (int): Number of bits for activation quantization. Default is 16. - act_group_size (int): Group size for activation quantization. Default is None. 
- act_sym (bool): Whether to use symmetric activation quantization. Default is None. - act_data_type (str): Specifies the data type for activations. - Defaults to None, in which case it inherits the weight data type. - act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. - to_quant_block_names (str|list): A string or list whose elements are list of - block's layer names to be quantized. - enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning - enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer function - **kwargs: Additional keyword arguments. - - Returns: - The quantized model. - """ - - bits: int | None - group_size: int | None - sym: bool | None - data_type: str | None - act_bits: int | None - act_group_size: int | None - act_sym: bool | None - act_data_type: str | None - act_dynamic: bool | None - super_bits: int | None - super_group_size: int | None - - def __init__( - self, - model: Union[torch.nn.Module, str], - tokenizer=None, - platform: str = "hf", - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, - dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", - iters: int = 200, - seqlen: int = 2048, - nsamples: int = 128, - batch_size: int = 8, - gradient_accumulate_steps: int = 1, - low_gpu_mem_usage: bool = False, - device_map: Union[str, int, torch.device, dict] = 0, - enable_torch_compile: bool = False, - seed: int = 42, - optimizer="AdamW", - **kwargs, - ): - super().__init__( - model=model, - tokenizer=tokenizer, - platform=platform, - scheme=scheme, - layer_config=layer_config, - batch_size=batch_size, - dataset=dataset, - low_gpu_mem_usage=low_gpu_mem_usage, - iters=iters, - seqlen=seqlen, - nsamples=nsamples, - seed=seed, - gradient_accumulate_steps=gradient_accumulate_steps, - enable_torch_compile=enable_torch_compile, - device_map=device_map, - optimizer=optimizer, - **kwargs, - ) - - @deprecated("AutoRound") class AutoRoundMLLM(MLLMCompressor): """Class for automatic rounding-based quantization with MLLMs. diff --git a/auto_round/compressors/__init__.py b/auto_round/compressors/__init__.py old mode 100644 new mode 100755 index 6f8ddf681..05623ecb7 --- a/auto_round/compressors/__init__.py +++ b/auto_round/compressors/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from auto_round.compressors.adam import AdamCompressor from auto_round.compressors.base import BaseCompressor from auto_round.compressors.base import LLMCompressor from auto_round.compressors.mllm.compressor import MLLMCompressor diff --git a/auto_round/compressors/adam.py b/auto_round/compressors/adam.py deleted file mode 100644 index fb79cf39a..000000000 --- a/auto_round/compressors/adam.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
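[Editor's note] With `AdamCompressor` and `AutoRoundAdam` removed in this patch, Adam-based tuning is requested through the `enable_adam` flag that `AutoRound.__new__` forwards into kwargs (see the autoround.py hunk earlier in this patch) and that `create_quantizers` later maps to `ARAdamQuantizer`. A hedged usage sketch follows, assuming the public `AutoRound` entry point otherwise keeps its existing signature and `quantize()` method; the model path is only an example.

```python
# Usage sketch only: assumes AutoRound still accepts enable_adam as routed in
# this patch series; "facebook/opt-125m" is an arbitrary example model.
from auto_round import AutoRound

ar = AutoRound(
    model="facebook/opt-125m",  # any HF causal LM path would do
    scheme="W4A16",
    iters=200,
    enable_adam=True,           # replaces the removed AutoRoundAdam entry point
)
ar.quantize()
```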
- -from typing import Union - -import torch - -from auto_round.compressors.base import BaseCompressor -from auto_round.schemes import QuantizationScheme -from auto_round.utils import check_is_cpu, htcore, is_hpex_available - - -class AdamCompressor(BaseCompressor): - """Class for quantization with optimizers like adamw of a PyTorch model. - - Args: - model: The PyTorch model to be quantized. - tokenizer: An optional tokenizer for processing input data. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] - scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations - bits (int): Number of bits for quantization (default is 4). - group_size (int): Size of the quantization group (default is 128). - sym (bool): Whether sym to be used (default is True). - layer_config (dict): Configuration for weight quantization (default is None). - batch_size (int): Batch size for training (default is 8). - amp (bool): Whether to use automatic mixed precision (default is True). - device: The device to be used for training (default is "auto"). - lr_scheduler: The learning rate scheduler to be used. - dataset: The default dataset name (default is "NeelNanda/pile-10k"). - enable_quanted_input (bool): Whether to use quantized input data (default is True). - enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True). - lr (float): The learning rate (default is 0.005). - minmax_lr (float): The learning rate for min-max tuning (default is None). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). - iters (int): Number of iterations (default is 200). - seqlen (int): Length of the sequence. - nsamples (int): Number of samples (default is 128). - sampler (str): The sampling method (default is "rand"). - seed (int): The random seed (default is 42). - nblocks (int): Number of blocks (default is 1). - gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). - not_use_best_mse (bool): Whether to use mean squared error (default is False). - dynamic_max_gap (int): The dynamic maximum gap (default is -1). - data_type (str): The data type to be used (default is "int"). - scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. - act_bits (int): Number of bits for activation quantization. Default is 16. - act_group_size (int): Group size for activation quantization. Default is None. - act_sym (bool): Whether to use symmetric activation quantization. Default is None. - act_data_type (str): Specifies the data type for activations. - Defaults to None, in which case it inherits the weight data type. - act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. - to_quant_block_names (str|list): A string or list whose elements are list of - block's layer names to be quantized. - enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning - enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer function - **kwargs: Additional keyword arguments. - - Returns: - The quantized model. 
- """ - - bits: int | None - group_size: int | None - sym: bool | None - data_type: str | None - act_bits: int | None - act_group_size: int | None - act_sym: bool | None - act_data_type: str | None - act_dynamic: bool | None - super_bits: int | None - super_group_size: int | None - - def __init__( - self, - model: Union[torch.nn.Module, str], - tokenizer=None, - platform="hf", - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, - dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", - iters: int = 200, - seqlen: int = 2048, - nsamples: int = 128, - batch_size: int = 8, - gradient_accumulate_steps: int = 1, - low_gpu_mem_usage: bool = False, - device_map: Union[str, int, torch.device, dict] = 0, - enable_torch_compile: bool = False, - seed: int = 42, - optimizer="AdamW", - **kwargs, - ): - super(AdamCompressor, self).__init__( - model=model, - tokenizer=tokenizer, - platform=platform, - scheme=scheme, - layer_config=layer_config, - batch_size=batch_size, - dataset=dataset, - low_gpu_mem_usage=low_gpu_mem_usage, - iters=iters, - seqlen=seqlen, - nsamples=nsamples, - seed=seed, - gradient_accumulate_steps=gradient_accumulate_steps, - enable_torch_compile=enable_torch_compile, - device_map=device_map, - **kwargs, - ) - - self.optimizer = self._get_optimizer(optimizer) - - def _get_optimizer(self, optimizer): - if optimizer is None: - optimizer = torch.optim.AdamW - elif isinstance(optimizer, str): - optimizer = getattr(torch.optim, optimizer) - else: - optimizer = optimizer - return optimizer - - def _get_scaler(self): - scaler = None - if self.amp and not check_is_cpu(self.device): - from torch.cuda.amp import GradScaler - - scaler = GradScaler(init_scale=1024, growth_interval=100000) - return scaler - - def _scale_loss_and_backward(self, scaler, loss): - if scaler is not None: - loss = scaler.scale(loss) - - loss.backward() - if is_hpex_available(): - htcore.mark_step() - return loss - - def _step(self, scaler, optimizer, lr_schedule): - if scaler is not None: - scaler.step(optimizer) - optimizer.zero_grad() - lr_schedule.step() - scaler.update() - else: - optimizer.step() - optimizer.zero_grad() - lr_schedule.step() - if is_hpex_available(): - htcore.mark_step() diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py old mode 100644 new mode 100755 index b59e7354f..67a5092f8 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -256,6 +256,7 @@ def __init__( model_dtype = kwargs.pop("model_dtype", None) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False + self.enable_adam = kwargs.pop("enable_adam") if "enable_adam" in kwargs else False self.quantized = False if isinstance(model, str): model, tokenizer = llm_load_model( @@ -413,7 +414,6 @@ def __init__( self.not_use_best_mse = not_use_best_mse self.dynamic_max_gap = dynamic_max_gap self.lr_scheduler = lr_scheduler - self.optimizer = self._get_optimizer(None) self.disable_opt_rtn = disable_opt_rtn # Whether to pack the layer immediately after tuning @@ -916,19 +916,6 @@ def _immediate_pack(self, name: str): image_processor=self.image_processor if hasattr(self, "image_processor") else None, ) - def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: - keys = inputs.keys() - input_id_str = [key for key in keys if key.startswith("hidden_state")] - if len(input_id_str) != 1: - 
raise RuntimeError( - "hidden_states arg mismatch error," - "please raise an issue in https://github.com/intel/auto-round/issues" - ) - inputs["input_ids"] = inputs.pop(input_id_str[0], None) - if q_inputs is not None: - q_inputs = q_inputs.pop(input_id_str[0], None) - return inputs, q_inputs - def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True): fill_default_value = True if self.is_auto_scheme: @@ -1600,7 +1587,7 @@ def quantize_block( amp=self.amp, amp_dtype=self.amp_dtype, cache_device=self.cache_device, - diffusion=self.diffusion, + is_diffusion=self.diffusion, ) ar_quantizer = ARQuantizer(self) return ar_quantizer.quantize_block(block, input_ids, input_others, q_input, device, auto_offload) @@ -1718,55 +1705,6 @@ def _set_amp_dtype(self) -> None: self.amp_dtype = torch.float32 self.model = self.model.to(torch.float32) - def _get_optimizer(self, optimizer: Any): - """Returns the specified optimizer. In SignRound, we fix the optimizer. - - Args: - optimizer: The optimizer to be used. - - Returns: - The specified optimizer. - """ - return SignSGD - - def _get_scaler(self): - """Returns scaler, in SignRound, no need to use scaler.""" - return None - - def _scale_loss_and_backward(self, scaler: Any, loss: torch.Tensor) -> torch.Tensor: - """Scales the loss and performs backward pass. - - Args: - scaler: The scaler to be used. - loss: The loss to be scaled. - - Returns: - The scaled loss. - """ - scale_loss = loss * 1000 - scale_loss.backward() - if is_hpex_available(): - htcore.mark_step() - return scale_loss - - def _step(self, scaler: Any, optimizer: Any, lr_schedule: Any): - """Performs a step in the optimization process. - - Args: - scaler: The scaler to be used. - optimizer: The optimizer for the step. - lr_schedule: The learning rate schedule. 
- - Returns: - None - """ - optimizer.step() - # for hpu - if is_hpex_available(): - htcore.mark_step() - optimizer.zero_grad() - lr_schedule.step() - @classmethod @torch.no_grad() def _sampling_inputs( diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py old mode 100644 new mode 100755 index 0bfbf038c..ec03c8a84 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -165,13 +165,6 @@ def __init__( **kwargs, ) - def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: - # flux transformer model's blocks will update hidden_states and encoder_hidden_states - input_id_str = [key for key in inputs.keys() if "hidden_state" in key] - if q_inputs is not None: - q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str} - return inputs, q_inputs - def _get_current_q_output( self, block: torch.nn.Module, diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py old mode 100644 new mode 100755 index db95d9547..4d240b3cc --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -957,13 +957,19 @@ def immediate_saving(rounder: object, m: torch.nn.Module, name: str = None, last import json from collections import OrderedDict + from auto_round.quantizers.utils import get_quantized_layer_names_outside_blocks from auto_round.utils import clear_memory, get_module # User configurable (can be preset on rounder) max_shard_size = getattr(rounder, "max_shard_size", "5GB") safe_serialization = getattr(rounder, "safe_serialization", True) if not hasattr(rounder, "quantized_layer_names_outside_blocks"): - rounder.quantized_layer_names_outside_blocks = rounder._get_quantized_layer_names_outside_blocks() + rounder.quantized_layer_names_outside_blocks = get_quantized_layer_names_outside_blocks( + rounder.model, + rounder.layer_config, + rounder.supported_types, + rounder.quant_block_list, + ) layer_names = rounder.quantized_layer_names_outside_blocks if len(layer_names) > 0 and name != layer_names[-1]: last_group = False diff --git a/auto_round/quantizers/algs/auto_round.py b/auto_round/quantizers/algs/auto_round.py old mode 100644 new mode 100755 index db3330088..8d0577b88 --- a/auto_round/quantizers/algs/auto_round.py +++ b/auto_round/quantizers/algs/auto_round.py @@ -38,16 +38,21 @@ preprocess_block_inputs, quantize_embedding_layer, register_act_max_hook, + update_inputs, ) +from auto_round.sign_sgd import SignSGD from auto_round.utils import ( + check_is_cpu, check_to_quantized, clear_memory, convert_fp8_layer_to_linear, get_block_names, get_module, + htcore, is_auto_device_mapping, is_fp8_linear, is_fp8_model, + is_hpex_available, memory_monitor, mv_module_from_gpu, set_amax_for_all_moe_layers, @@ -68,43 +73,42 @@ class ARQuantizer(AlgsBaseQuantizer): def __init__(self, compressor: "BaseCompressor"): super().__init__(compressor) + self.all_blocks = [] + self.layer_names = [] + self.all_q_inputs = None + self.optimizer = self._get_optimizer(None) + self.is_adam = False - def pre_quantize(self, *args, **kwargs): - return super().pre_quantize(*args, **kwargs) - - def quantize(self, *args, **kwargs): + def _pre_quantize_impl(self, *args, **kwargs): if bool(self.compressor.quant_block_list): - all_blocks = self.compressor.quant_block_list + self.all_blocks = self.compressor.quant_block_list else: - all_blocks = get_block_names(self.compressor.model) - - if len(all_blocks) == 0: + self.all_blocks = get_block_names(self.compressor.model) + if 
len(self.all_blocks) == 0: logger.warning("could not find blocks, exit with original model") return self.compressor.model, self.compressor.layer_config if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: self.compressor.model = self.compressor.model.to(self.compressor.amp_dtype) - layer_names = get_quantized_layer_names_outside_blocks( + self.layer_names = get_quantized_layer_names_outside_blocks( model=self.compressor.model, layer_config=self.compressor.layer_config, supported_types=self.compressor.supported_types, quant_block_list=self.compressor.quant_block_list, ) - start_time = time.time() - all_first_block_names = [block[0] for block in all_blocks] - if len(layer_names) > 0: + self.all_first_block_names = [block[0] for block in self.all_blocks] + if len(self.layer_names) > 0: logger.info( - "Starting to cache block inputs. This may be slow due to external block layers: %s", layer_names + "Starting to cache block inputs. This may be slow due to external block layers: %s", self.layer_names ) else: logger.info("start to cache block inputs") # TODO: refactor this - all_inputs = self.compressor.try_cache_inter_data_gpucpu( - all_first_block_names, self.compressor.nsamples, layer_names=layer_names + self.all_inputs = self.compressor.try_cache_inter_data_gpucpu( + self.all_first_block_names, self.compressor.nsamples, layer_names=self.layer_names ) - is_quantized_embedding = quantize_embedding_layer( model=self.compressor.model, layer_config=self.compressor.layer_config, @@ -114,13 +118,12 @@ def quantize(self, *args, **kwargs): device_list=self.compressor.device_list, ) clear_memory(device_list=self.compressor.device_list) - all_q_inputs = None if is_quantized_embedding: - all_inputs = copy.deepcopy(self.compressor.inputs) + self.all_inputs = copy.deepcopy(self.compressor.inputs) clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) # TODO: refactor this - all_q_inputs = self.compressor.try_cache_inter_data_gpucpu( - all_first_block_names, self.compressor.nsamples, layer_names=layer_names + self.all_q_inputs = self.compressor.try_cache_inter_data_gpucpu( + self.all_first_block_names, self.compressor.nsamples, layer_names=self.layer_names ) self.compressor.model = mv_module_from_gpu(self.compressor.model) clear_memory(device_list=self.compressor.device_list) @@ -130,21 +133,24 @@ def quantize(self, *args, **kwargs): self.compressor.model ) # self.compressor.model.hf_device_map has not been changed logger.info("caching done") - if len(all_blocks) > 1: - pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.compressor.nblocks)) + + def _quantize_impl(self, *args, **kwargs): + start_time = time.time() + + if len(self.all_blocks) > 1: + pbar = tqdm(range(0, sum([len(i) for i in self.all_blocks]), self.compressor.nblocks)) else: - pbar = tqdm(range(0, len(all_blocks[0]), self.compressor.nblocks)) # move the alg warning outside pbar + pbar = tqdm(range(0, len(self.all_blocks[0]), self.compressor.nblocks)) # move the alg warning outside pbar - for block_names in all_blocks: - inputs = all_inputs[block_names[0]] - all_inputs.pop(block_names[0]) + for block_names in self.all_blocks: + inputs = self.all_inputs[block_names[0]] + self.all_inputs.pop(block_names[0]) q_inputs = None - if all_q_inputs is not None: - q_inputs = all_q_inputs[block_names[0]] - all_q_inputs.pop(block_names[0]) + if self.all_q_inputs is not None: + q_inputs = self.all_q_inputs[block_names[0]] + self.all_q_inputs.pop(block_names[0]) - # TODO: refactor this - inputs, 
q_inputs = self.compressor._update_inputs(inputs, q_inputs) + inputs, q_inputs = update_inputs(inputs, q_inputs, self.compressor.diffusion) clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) @@ -170,8 +176,15 @@ def quantize(self, *args, **kwargs): ) pbar.set_description("Quantizing done") pbar.close() - self._quantize_layers(layer_names, all_inputs) + self._quantize_layers(self.layer_names, self.all_inputs) + end_time = time.time() + cost_time = end_time - start_time + logger.info(f"quantization tuning time {cost_time}") + + return self.compressor.model, self.compressor.layer_config + + def _post_quantize_impl(self, *args, **kwargs): if is_fp8_model(self.compressor.model): for n, m in self.compressor.model.named_modules(): if is_fp8_linear(m): @@ -180,10 +193,6 @@ def quantize(self, *args, **kwargs): ) set_module(self.compressor.model, n, new_layer) - end_time = time.time() - cost_time = end_time - start_time - logger.info(f"quantization tuning time {cost_time}") - # Dump a summary quantized_layers = [] unquantized_layers = [] @@ -203,7 +212,6 @@ def quantize(self, *args, **kwargs): logger.info(summary_info) self.compressor.quantized = True - return self.compressor.model, self.compressor.layer_config def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: """Quantizes specified layers based on inputs and configuration. @@ -342,7 +350,7 @@ def _quantize_blocks( amp=self.compressor.amp, amp_dtype=self.compressor.amp_dtype, cache_device=self.compressor.cache_device, - diffusion=self.compressor.diffusion, + is_diffusion=self.compressor.diffusion, ) if pbar is None: @@ -363,23 +371,8 @@ def _quantize_blocks( m.config = model.config if hasattr(model, "config") else None q_input, input_ids = self.quantize_block( - m, - input_ids, - input_others, - q_input=q_input, - device=device, + m, input_ids, input_others, q_input=q_input, device=device, last_group=(i + nblocks) >= len(block_names) ) - if hasattr(model, "config"): - del m.config - if self.compressor.immediate_packing: - for _, tmp_m in m.named_modules(): - if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): - continue - self.compressor._immediate_pack(tmp_m.tmp_name) - - if self.compressor.immediate_saving: - last_group = (i + nblocks) >= len(block_names) - immediate_saving(self.compressor, m, last_group=last_group) if pbar is not None: pbar.update(1) @@ -396,7 +389,9 @@ def _quantize_blocks( clear_memory(device_list=self.compressor.device_list) - def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu"): + def _quantize_layer_impl( + self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu" + ): """Quantize a specific layer of the model using the provided inputs. Args: @@ -462,12 +457,11 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. 
lr = torch.tensor(self.compressor.lr) minmax_lr = torch.tensor(self.compressor.minmax_lr) if self.compressor.enable_minmax_tuning: - optimizer = self.compressor.optimizer( + optimizer = self.optimizer( [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 ) else: - optimizer = self.compressor.optimizer(round_params, lr=lr, weight_decay=0) - + optimizer = self.optimizer([{"params": round_params}], lr=lr, weight_decay=0) if self.compressor.lr_scheduler is None: lr_schedule = torch.optim.lr_scheduler.LinearLR( optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.compressor.iters @@ -477,7 +471,7 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. nsamples = len(inputs) last_best_iter = 0 best_loss = torch.finfo(torch.float).max - scaler = self.compressor._get_scaler() # pylint: disable=assignment-from-none + scaler = self._get_scaler() # pylint: disable=assignment-from-none init_loss = None gradient_accumulate_steps = self.compressor.batch_size # Force to low gpu total_loss = 0 @@ -546,7 +540,7 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. num_elm = 1 if num_elm <= 0 else num_elm total_loss += loss.item() / num_elm - self.compressor.scale_loss_and_backward(scaler, loss) + self._scale_loss_and_backward(scaler, loss) if i == 0: init_loss = total_loss @@ -561,7 +555,7 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. if not self.compressor.not_use_best_mse: if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: break - self.compressor._step(scaler, optimizer, lr_schedule) + self._step(scaler, optimizer, lr_schedule) last_loss = total_loss best_iter = self.compressor.iters @@ -574,7 +568,7 @@ def quantize_layer(self, layer_name: str, inputs: torch.Tensor, q_inputs: torch. dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" logger.info(dump_info) - def quantize_block( + def _quantize_block_impl( self, block: torch.nn.Module, input_ids: Union[list[torch.Tensor], dict], @@ -582,6 +576,7 @@ def quantize_block( q_input: Union[torch.Tensor, dict, None] = None, device: Union[str, torch.device] = "cpu", auto_offload=True, + **kwargs, ): """Quantize the weights of a given block of the model. 
@@ -713,9 +708,8 @@ def quantize_block( lr = torch.tensor(self.compressor.lr) minmax_lr = torch.tensor(self.compressor.minmax_lr) - is_adam = "adam" in self.compressor.__class__.__name__.lower() - extra_kwargs = {} if is_adam else {"momentum": self.compressor.momentum} + extra_kwargs = {} if self.is_adam else {"momentum": self.compressor.momentum} if self.compressor.enable_minmax_tuning: params = [ @@ -725,7 +719,7 @@ def quantize_block( else: params = round_params - optimizer = self.compressor.optimizer( + optimizer = self.optimizer( params, lr=lr, weight_decay=0, @@ -760,7 +754,7 @@ def quantize_block( if self.compressor.gradient_accumulate_steps != 1: mse_reduction = "sum" mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) - scaler = self.compressor._get_scaler() # pylint: disable=assignment-from-none + scaler = self._get_scaler() # pylint: disable=assignment-from-none init_loss = None best_params = {} total_loss = 0 @@ -801,7 +795,7 @@ def quantize_block( # clear memory to avoid OOM due to memory fragmentation clear_memory_if_reached_threshold(threshold=0.5, device_list=self.compressor.device_list) - self.compressor._scale_loss_and_backward(scaler, loss) + self._scale_loss_and_backward(scaler, loss) if self.compressor.low_gpu_mem_usage and card_0_in_high_risk: # clear memory to avoid OOM due to memory fragmentation clear_memory_if_reached_threshold(threshold=0.8, device_list=self.compressor.device_list) @@ -822,7 +816,7 @@ def quantize_block( if not self.compressor.not_use_best_mse: if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: break - self.compressor._step(scaler, optimizer, lr_schedule) + self._step(scaler, optimizer, lr_schedule) last_loss = total_loss best_iter = self.compressor.iters @@ -883,6 +877,26 @@ def quantize_block( return None, output + def _post_quantize_block_impl(self, block: torch.nn.Module, *args, last_group: bool, **kwargs): + """Post-process after quantizing a block. + + Args: + block: The block of the model that was quantized. + + Returns: + None + """ + if hasattr(block, "config"): + del block.config + if self.compressor.immediate_packing: + for _, tmp_m in block.named_modules(): + if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): + continue + self.compressor._immediate_pack(tmp_m.tmp_name) + + if self.compressor.immediate_saving: + immediate_saving(self.compressor, block, last_group=last_group) + @staticmethod def _get_current_output( output: list[torch.Tensor], indices: list[int], batch_dim: int, diffusion: bool = False @@ -896,3 +910,100 @@ def _get_current_output( current_output = [output[x] for x in indices] current_output = torch.cat(current_output, dim=batch_dim) return current_output + + def _step(self, scaler: Any, optimizer: Any, lr_schedule: Any): + """Performs a step in the optimization process. + + Args: + scaler: The scaler to be used. + optimizer: The optimizer for the step. + lr_schedule: The learning rate schedule. + + Returns: + None + """ + optimizer.step() + # for hpu + if is_hpex_available(): + htcore.mark_step() + optimizer.zero_grad() + lr_schedule.step() + + def _scale_loss_and_backward(self, scaler: Any, loss: torch.Tensor) -> torch.Tensor: + """Scales the loss and performs backward pass. + + Args: + scaler: The scaler to be used. + loss: The loss to be scaled. + + Returns: + The scaled loss. 
+ """ + scale_loss = loss * 1000 + scale_loss.backward() + if is_hpex_available(): + htcore.mark_step() + return scale_loss + + def _get_scaler(self): + """Returns scaler, in SignRound, no need to use scaler.""" + return None + + def _get_optimizer(self, optimizer: Any): + """Returns the specified optimizer. In SignRound, we fix the optimizer. + + Args: + optimizer: The optimizer to be used. + + Returns: + The specified optimizer. + """ + return SignSGD + + +class ARAdamQuantizer(ARQuantizer): + """AutoRound Quantizer with Adam optimizer.""" + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + self.optimizer = self._get_optimizer("AdamW") + self.is_adam = True + + def _step(self, scaler, optimizer, lr_schedule): + if scaler is not None: + scaler.step(optimizer) + optimizer.zero_grad() + lr_schedule.step() + scaler.update() + else: + optimizer.step() + optimizer.zero_grad() + lr_schedule.step() + if is_hpex_available(): + htcore.mark_step() + + def _scale_loss_and_backward(self, scaler, loss): + if scaler is not None: + loss = scaler.scale(loss) + + loss.backward() + if is_hpex_available(): + htcore.mark_step() + return loss + + def _get_scaler(self): + scaler = None + if self.compressor.amp and not check_is_cpu(self.compressor.device): + from torch.cuda.amp import GradScaler + + scaler = GradScaler(init_scale=1024, growth_interval=100000) + return scaler + + def _get_optimizer(self, optimizer): + if optimizer is None: + optimizer = torch.optim.AdamW + elif isinstance(optimizer, str): + optimizer = getattr(torch.optim, optimizer) + else: + optimizer = optimizer + return optimizer diff --git a/auto_round/quantizers/algs/base.py b/auto_round/quantizers/algs/base.py old mode 100644 new mode 100755 index 62e97c134..bcb112094 --- a/auto_round/quantizers/algs/base.py +++ b/auto_round/quantizers/algs/base.py @@ -21,7 +21,31 @@ from auto_round.compressors.base import BaseCompressor -class AlgsBaseQuantizer(BaseQuantizer): +class AlgsBaseQuantizer(BaseQuantizer, ABC): + def _pre_quantize_impl(self, *args, **kwargs): + pass + @abstractmethod - def quantize(self, *args, **kwargs): + def _quantize_impl(self, *args, **kwargs): + pass + + def _post_quantize_impl(self, *args, **kwargs): + pass + + def _pre_quantize_layer_impl(self, *args, **kwargs): + pass + + def _quantize_layer_impl(self, *args, **kwargs): + pass + + def _post_quantize_layer_impl(self, *args, **kwargs): + pass + + def _pre_quantize_block_impl(self, *args, **kwargs): + pass + + def _quantize_block_impl(self, *args, **kwargs): + pass + + def _post_quantize_block_impl(self, *args, **kwargs): pass diff --git a/auto_round/quantizers/algs/rtn.py b/auto_round/quantizers/algs/rtn.py old mode 100644 new mode 100755 index f1b4676f4..d70ab6a76 --- a/auto_round/quantizers/algs/rtn.py +++ b/auto_round/quantizers/algs/rtn.py @@ -72,7 +72,7 @@ def __init__(self, compressor: "BaseCompressor"): n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) ] - def pre_quantize(self, *args, **kwargs): + def _pre_quantize_impl(self, *args, **kwargs): if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: self.compressor.model.to(self.compressor.amp_dtype) @@ -96,7 +96,7 @@ def pre_quantize(self, *args, **kwargs): logger.info("Finished updating fused layer global scales.") @torch.inference_mode() - def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: """Quantize all 
modules in the model using RTN (Round-To-Nearest) strategy. If the target format includes GGUF with `k`, and optimized RTN is enabled, @@ -172,9 +172,6 @@ def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: self.compressor.quantized = True return self.compressor.model, self.compressor.layer_config - def post_quantize(self, *args, **kwargs): - pass - def _quantize_via_rtn_blockwise(self) -> None: """Quantize model layers block by block using cached inputs.""" @@ -416,7 +413,7 @@ def get_imatrix_hook(module, input, output): return hook_handles @torch.inference_mode() - def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: enable_imatrix = False has_gguf_k = ( any(fmt.is_gguf() and "k" in fmt.output_format for fmt in getattr(self.compressor, "formats", [])) @@ -434,7 +431,7 @@ def quantize(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: self.compressor.quantized = True return self.compressor.model, self.compressor.layer_config else: - return super().quantize(*args, **kwargs) + return super()._quantize_impl(*args, **kwargs) def _quant_rtn_with_imatrix(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: """Quantize all modules in the model using Optimized RTN strategy. diff --git a/auto_round/quantizers/base.py b/auto_round/quantizers/base.py old mode 100644 new mode 100755 index 1e1a73f17..d4f70a7ef --- a/auto_round/quantizers/base.py +++ b/auto_round/quantizers/base.py @@ -23,29 +23,81 @@ class BaseQuantizer(ABC): def __init__(self, compressor: "BaseCompressor"): self.compressor = compressor + def __mro_call(self, method_name: str, *args, **kwargs): + for cls in type(self).mro(): + method = cls.__dict__.get(method_name, None) + if method: + method(self, *args, **kwargs) + def pre_quantize(self, *args, **kwargs): + self.__mro_call("_pre_quantize_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_impl(self, *args, **kwargs): pass def quantize(self, *args, **kwargs): + self.pre_quantize(*args, **kwargs) + self._quantize_impl(*args, **kwargs) + self.post_quantize(*args, **kwargs) + return self.compressor.model, self.compressor.layer_config + + @abstractmethod + def _quantize_impl(self, *args, **kwargs): pass def post_quantize(self, *args, **kwargs): + self.__mro_call("_post_quantize_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_impl(self, *args, **kwargs): pass def pre_quantize_layer(self, *args, **kwargs): + self.__mro_call("_pre_quantize_layer_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_layer_impl(self, *args, **kwargs): pass def quantize_layer(self, *args, **kwargs): + self.pre_quantize_layer(*args, **kwargs) + result = self._quantize_layer_impl(*args, **kwargs) + self.post_quantize_layer(*args, **kwargs) + return result + + @abstractmethod + def _quantize_layer_impl(self, *args, **kwargs): pass def post_quantize_layer(self, *args, **kwargs): pass + self.__mro_call("_post_quantize_layer_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_layer_impl(self, *args, **kwargs): + pass def pre_quantize_block(self, *args, **kwargs): + self.__mro_call("_pre_quantize_block_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_block_impl(self, *args, **kwargs): pass def quantize_block(self, *args, **kwargs): + self.pre_quantize_block(*args, **kwargs) + result = self._quantize_block_impl(*args, **kwargs) + self.post_quantize_block(*args, **kwargs) + return result 
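[Editor's note] The `__mro_call` helper added to `BaseQuantizer` above walks `type(self).mro()` and invokes a hook only when a class defines it directly in its own `__dict__`, so every composed layer's pre/post hook runs rather than only the most-derived override. A standalone illustration of that dispatch is given below; all names in it are made up for the example.

```python
# Sketch of MRO-walking hook dispatch, mirroring BaseQuantizer.__mro_call.
class Base:
    def _hook(self):
        print("Base hook")

    def run_hooks(self):
        # Call each class's own _hook found directly in its __dict__, walking
        # the MRO so mixin layers are not skipped by normal overriding.
        for cls in type(self).mro():
            impl = cls.__dict__.get("_hook")
            if impl is not None:
                impl(self)


class MixinA(Base):
    def _hook(self):
        print("MixinA hook")


class MixinB:
    def _hook(self):
        print("MixinB hook")


Composed = type("Composed", (MixinB, MixinA), {})
Composed().run_hooks()  # prints: MixinB hook, MixinA hook, Base hook
```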
+ + @abstractmethod + def _quantize_block_impl(self, *args, **kwargs): pass def post_quantize_block(self, *args, **kwargs): + self.__mro_call("_post_quantize_block_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_block_impl(self, *args, **kwargs): pass diff --git a/auto_round/quantizers/entrypoint.py b/auto_round/quantizers/entrypoint.py old mode 100644 new mode 100755 index 473dcfca1..34745ca46 --- a/auto_round/quantizers/entrypoint.py +++ b/auto_round/quantizers/entrypoint.py @@ -16,14 +16,14 @@ if TYPE_CHECKING: from auto_round.compressors.base import BaseCompressor -from auto_round.quantizers.algs.auto_round import ARQuantizer +from auto_round.quantizers.algs.auto_round import ARAdamQuantizer, ARQuantizer from auto_round.quantizers.algs.rtn import OptRTNQuantizer, RTNQuantizer class AutoRoundQuantizer: def __new__(cls, compressor: "BaseCompressor", dynamic_quantizers: dict = None): assert dynamic_quantizers is not None, "Please provide dynamic_quantizers dict." - quantizer_cls = type("AutoRoundQuantizer", tuple(dynamic_quantizers.values()), {}) + quantizer_cls = type("AutoRoundQuantizer", (dynamic_quantizers["data_type"], dynamic_quantizers["algs"]), {}) return quantizer_cls(compressor) @@ -33,9 +33,7 @@ def __init__(self, quantizers: list[AutoRoundQuantizer]): def quantize(self, *args, **kwargs): for quantizer in self.quantizers: - quantizer.pre_quantize(*args, **kwargs) model, layer_config = quantizer.quantize(*args, **kwargs) - quantizer.post_quantize(*args, **kwargs) return model, layer_config @@ -43,7 +41,7 @@ def create_quantizers(compressor: "BaseCompressor"): alg_cls = None if compressor.iters > 0: - alg_cls = ARQuantizer + alg_cls = ARQuantizer if compressor.enable_adam is False else ARAdamQuantizer else: alg_cls = OptRTNQuantizer if compressor.disable_opt_rtn is False else RTNQuantizer diff --git a/auto_round/quantizers/utils.py b/auto_round/quantizers/utils.py old mode 100644 new mode 100755 index 3ea0c009f..a6f1dd4bc --- a/auto_round/quantizers/utils.py +++ b/auto_round/quantizers/utils.py @@ -220,8 +220,8 @@ def get_non_zero_cnt(tensor: list[torch.Tensor], indices: list[int]) -> int: return non_zero_cnt -def split_inputs(inputs: dict, first_input_name: str, diffusion: bool = False) -> tuple[torch.Tensor, dict]: - if diffusion: +def split_inputs(inputs: dict, first_input_name: str, is_diffusion: bool = False) -> tuple[torch.Tensor, dict]: + if is_diffusion: input_id_str = [key for key in inputs.keys() if "hidden_state" in key] input_ids = {k: inputs.pop(k, None) for k in input_id_str} input_others = inputs @@ -240,9 +240,9 @@ def preprocess_block_inputs( amp: bool = False, amp_dtype: torch.dtype = torch.float32, cache_device: Union[str, torch.device] = "cpu", - diffusion: bool = False, + is_diffusion: bool = False, ): - input_ids, input_others = split_inputs(inputs, first_input_name, diffusion=diffusion) + input_ids, input_others = split_inputs(inputs, first_input_name, is_diffusion=is_diffusion) clear_memory(device_list=device_list) input_ids = to_device(input_ids, cache_device) input_others = to_device(input_others, cache_device) @@ -260,3 +260,23 @@ def preprocess_block_inputs( for i in range(len(input_others[key])): to_dtype(input_others[key][i], tmp_dtype) return input_ids, input_others + + +def update_inputs(inputs: dict, q_inputs: dict, is_diffusion: bool) -> tuple[dict, dict]: + if is_diffusion: + input_id_str = [key for key in inputs.keys() if "hidden_state" in key] + if q_inputs is not None: + q_inputs = {k: q_inputs.pop(k, None) for k in 
input_id_str} + return inputs, q_inputs + else: + keys = inputs.keys() + input_id_str = [key for key in keys if key.startswith("hidden_state")] + if len(input_id_str) != 1: + raise RuntimeError( + "hidden_states arg mismatch error," + "please raise an issue in https://github.com/intel/auto-round/issues" + ) + inputs["input_ids"] = inputs.pop(input_id_str[0], None) + if q_inputs is not None: + q_inputs = q_inputs.pop(input_id_str[0], None) + return inputs, q_inputs diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py old mode 100644 new mode 100755 index 4ba59629e..6f736eb98 --- a/test/test_cpu/export/test_gguf_format.py +++ b/test/test_cpu/export/test_gguf_format.py @@ -29,7 +29,7 @@ def teardown_class(self): def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {tiny_gemma_model_path} " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {tiny_gemma_model_path} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -37,7 +37,7 @@ def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {tiny_qwen_model_path}" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {tiny_qwen_model_path}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: @@ -165,7 +165,7 @@ def test_all_format(self, tiny_qwen_model_path): # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {model_name} " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {model_name} " f" --bs 16 --iters 1 --nsamples 1 --seqlen 16 --format {gguf_format}" ) if res > 0 or res == -1: @@ -173,7 +173,7 @@ def test_all_format(self, tiny_qwen_model_path): shutil.rmtree("../../tmp_autoround", ignore_errors=True) res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {model_name}" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --format fake,{gguf_format}" ) if res > 0 or res == -1: @@ -182,7 +182,7 @@ def test_all_format(self, tiny_qwen_model_path): # test mixed q2_k_s res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {model_name}" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --scheme GGUF:Q2_K_MIXED" ) if res > 0 or res == -1: diff --git a/test/test_cpu/utils/test_cli_usage.py b/test/test_cpu/utils/test_cli_usage.py old mode 100644 new mode 100755 index 6ba676936..f57e9c102 --- a/test/test_cpu/utils/test_cli_usage.py +++ b/test/test_cpu/utils/test_cli_usage.py @@ -25,24 +25,24 @@ def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): python_path = sys.executable # Test llm script - res = os.system(f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round -h") + res = os.system(f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round -h") if res > 0 or res == -1: 
assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -50,23 +50,23 @@ def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): # test mllm script # test auto_round_mllm --eval help - res = os.system(f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --eval -h") + res = os.system(f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --eval -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test auto_round_mllm --lmms help - res = os.system(f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --eval --lmms -h") + res = os.system(f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --eval --lmms -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" " --quant_nontext_module --output_dir ./saved " ) if res > 0 or res == -1: diff --git a/test/test_cuda/advanced/test_multiple_card_calib.py b/test/test_cuda/advanced/test_multiple_card_calib.py old mode 100644 new mode 100755 index 06c869a86..8eef8b981 --- a/test/test_cuda/advanced/test_multiple_card_calib.py +++ b/test/test_cuda/advanced/test_multiple_card_calib.py @@ -44,7 +44,7 @@ def 
test_multiple_card_calib(self): ##test llm script res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -55,7 +55,7 @@ def test_multiple_card_nvfp4(self): ##test llm script res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cuda/export/test_gguf.py b/test/test_cuda/export/test_gguf.py old mode 100644 new mode 100755 index a6089a086..454a2c6ce --- a/test/test_cuda/export/test_gguf.py +++ b/test/test_cuda/export/test_gguf.py @@ -59,7 +59,7 @@ def test_gguf_format(self, tiny_qwen_model_path, dataloader): save_dir = os.path.join(os.path.dirname(__file__), "saved") res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) diff --git a/test/test_cuda/models/test_support_vlms.py b/test/test_cuda/models/test_support_vlms.py old mode 100644 new mode 100755 index 4f9373ca5..a339ba69a --- a/test/test_cuda/models/test_support_vlms.py +++ b/test/test_cuda/models/test_support_vlms.py @@ -30,7 +30,7 @@ def test_qwen2(self): model_path = "/models/Qwen2-VL-2B-Instruct/" # test tune res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {self.python_path} -m auto_round --mllm " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "qwen2 tuning fail" @@ -85,7 +85,7 @@ def test_phi3(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {self.python_path} -m auto_round --mllm " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" @@ -133,7 +133,7 @@ def test_phi3_vision_awq(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {self.python_path} -m auto_round --mllm " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --quant_nontext_module " f"--nsample 64 --seqlen 32 " f"--format auto_awq --output_dir {self.save_dir} --device {self.device}" @@ -181,7 +181,7 @@ def test_glm(self): model_path = "/models/glm-4v-9b/" ## test tune res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' 
{self.python_path} -m auto_round " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "glm-4v-9b tuning fail" @@ -190,7 +190,7 @@ def test_granite_vision(self): model_path = "/models/granite-vision-3.2-2b" ## test tune res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' {self.python_path} -m auto_round " + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "granite-vision-3.2-2b tuning fail" diff --git a/test/test_cuda/utils/test_alg_ext.py b/test/test_cuda/utils/test_alg_ext.py old mode 100644 new mode 100755 index d2e8e4bfb..16f42bda5 --- a/test/test_cuda/utils/test_alg_ext.py +++ b/test/test_cuda/utils/test_alg_ext.py @@ -52,13 +52,13 @@ def test_cli(self, tiny_opt_model_path): python_path = sys.executable res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"PYTHONPATH='AUTO_ROUND_PATH:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" + f"PYTHONPATH='{AUTO_ROUND_PATH}:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" From 85757811b2f496c07c6e7d7324c198165d87f6aa Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 27 Jan 2026 14:54:19 +0800 Subject: [PATCH 5/5] fix Signed-off-by: n1ck-guo --- auto_round/quantizers/algs/rtn.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/auto_round/quantizers/algs/rtn.py b/auto_round/quantizers/algs/rtn.py index 1d6710f85..585c2631a 100755 --- a/auto_round/quantizers/algs/rtn.py +++ b/auto_round/quantizers/algs/rtn.py @@ -39,7 +39,7 @@ check_to_quantized, clear_memory, convert_fp8_layer_to_linear, - convert_fp8_model_to_16b_model, + convert_fp8_module_to_16b, flatten_list, get_block_names, get_lm_head_name, @@ -190,7 +190,7 @@ def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, An materialize_model_(block) for name, m in block.named_modules(): if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.global_name, to_cpu=self.low_gpu_mem_usage) + self._quantize_layer_via_rtn(m.global_name, to_cpu=self.compressor.low_gpu_mem_usage) all_to_quantized_module_names.remove(m.global_name) elif ( not any(m.children()) @@ -198,10 +198,10 @@ def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, 
dict[str, An and m.global_name not in tied_weights_layers ): set_module(self.compressor.model, m.global_name, copy.deepcopy(m)) - if self.is_immediate_saving: + if self.compressor.is_immediate_saving: shard_writer(self, name=m.global_name) m.to("meta") - clear_memory(device_list=self.device_list) + clear_memory(device_list=self.compressor.device_list) memory_monitor.log_summary() pbar.update(1) cnt = 1 @@ -210,7 +210,7 @@ def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, An self._quantize_layer_via_rtn(name, to_cpu=True) cnt += 1 if cnt % 10 == 0: - clear_memory(device_list=self.device_list) + clear_memory(device_list=self.compressor.device_list) memory_monitor.log_summary() else: materialize_model_(self.model) @@ -231,8 +231,8 @@ def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, An cnt += 1 # Convert remaining fp8 if is_fp8_model(self.compressor.model): - convert_fp8_model_to_16b_model(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) - if self.is_immediate_saving: + convert_fp8_module_to_16b(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) + if self.compressor.is_immediate_saving: shard_writer(self, is_finalize=True) self.compressor.quantized = True return self.compressor.model, self.compressor.layer_config @@ -310,9 +310,7 @@ def _quantize_via_rtn_blockwise(self) -> None: materialize_model_(block) block.to("cpu") if is_fp8_model(self.compressor.model): - convert_fp8_model_to_16b_model( - block, dtype=self.compressor.amp_dtype, device=self.compressor.device - ) + convert_fp8_module_to_16b(block, dtype=self.compressor.amp_dtype, device=self.compressor.device) if is_auto_device_mapping(self.compressor.device_map) and len(self.compressor.device_list) > 1: set_auto_device_map_for_block_with_tuning( @@ -494,7 +492,7 @@ def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, An self._quant_rtn_with_imatrix(self.all_to_quantized_module_names) # Convert remaining fp8 if is_fp8_model(self.compressor.model): - convert_fp8_model_to_16b_model(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) + convert_fp8_module_to_16b(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) self.compressor.quantized = True return self.compressor.model, self.compressor.layer_config else:
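
Reviewer note: below is a minimal, self-contained sketch of the dynamic class composition that `AutoRoundQuantizer.__new__` in `auto_round/quantizers/entrypoint.py` now performs (building one class from a data-type mixin plus an algorithm mixin, with `create_quantizers` picking `ARQuantizer`/`ARAdamQuantizer` vs `RTNQuantizer`/`OptRTNQuantizer`). Everything named `Toy*` and `build_quantizer` here is a hypothetical stand-in for illustration only, not code from this patch.

    # Sketch only: ToyDataType / ToyRTNAlg / ToyCompressor are invented stand-ins.
    # The composition pattern mirrors AutoRoundQuantizer.__new__ in entrypoint.py.
    class ToyDataType:
        def quant_weight(self, w):
            # a real data_type mixin would round/scale the tensor here
            return w

    class ToyRTNAlg:
        def __init__(self, compressor):
            self.compressor = compressor

        def quantize(self):
            return f"RTN pass over {self.compressor.model_name}"

    class ToyCompressor:
        model_name = "tiny-model"
        iters = 0
        disable_opt_rtn = True

    def build_quantizer(compressor, dynamic_quantizers):
        # Same shape as AutoRoundQuantizer.__new__: create a fresh class whose bases
        # are (data_type mixin, algorithm mixin) so methods resolve through one MRO.
        quantizer_cls = type(
            "AutoRoundQuantizer",
            (dynamic_quantizers["data_type"], dynamic_quantizers["algs"]),
            {},
        )
        return quantizer_cls(compressor)

    q = build_quantizer(ToyCompressor(), {"data_type": ToyDataType, "algs": ToyRTNAlg})
    print([c.__name__ for c in type(q).__mro__])  # AutoRoundQuantizer, ToyDataType, ToyRTNAlg, object
    print(q.quantize())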