diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 diff --git a/auto_round/__init__.py b/auto_round/__init__.py old mode 100644 new mode 100755 index 509885b33..e075b37e0 --- a/auto_round/__init__.py +++ b/auto_round/__init__.py @@ -14,7 +14,7 @@ from auto_round.autoround import AutoRound # support for old api -from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam, AutoRoundDiffusion +from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundDiffusion from auto_round.schemes import QuantizationScheme from auto_round.auto_scheme import AutoScheme from auto_round.utils import LazyImport diff --git a/auto_round/autoround.py b/auto_round/autoround.py old mode 100644 new mode 100755 index 33ca1ccd7..7be3e3136 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -18,7 +18,6 @@ import torch from auto_round.compressors import ( - AdamCompressor, BaseCompressor, DiffusionCompressor, ExtraConfig, @@ -173,8 +172,6 @@ def __new__( extra_config.diffusion_config = None model_cls.append(LLMCompressor) - if enable_adam: - model_cls.append(AdamCompressor) dynamic_compressor = type("AutoRound", tuple(model_cls), {}) if extra_config: kwargs.update(extra_config.to_dict()) @@ -371,110 +368,6 @@ def __init__( ) -@deprecated("AutoRound") -class AutoRoundAdam(AdamCompressor): - """Class for quantization with optimizers like adamw of a PyTorch model. - - Args: - model: The PyTorch model to be quantized. - tokenizer: An optional tokenizer for processing input data. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] - scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations - bits (int): Number of bits for quantization (default is 4). - group_size (int): Size of the quantization group (default is 128). - sym (bool): Whether sym to be used (default is True). - layer_config (dict): Configuration for weight quantization (default is None). - batch_size (int): Batch size for training (default is 8). - amp (bool): Whether to use automatic mixed precision (default is True). - device: The device to be used for training (default is "auto"). - lr_scheduler: The learning rate scheduler to be used. - dataset: The default dataset name (default is "NeelNanda/pile-10k"). - enable_quanted_input (bool): Whether to use quantized input data (default is True). - enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True). - lr (float): The learning rate (default is 0.005). - minmax_lr (float): The learning rate for min-max tuning (default is None). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). - iters (int): Number of iterations (default is 200). - seqlen (int): Length of the sequence. - nsamples (int): Number of samples (default is 128). - sampler (str): The sampling method (default is "rand"). - seed (int): The random seed (default is 42). - nblocks (int): Number of blocks (default is 1). - gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). - not_use_best_mse (bool): Whether to use mean squared error (default is False). - dynamic_max_gap (int): The dynamic maximum gap (default is -1). - data_type (str): The data type to be used (default is "int"). - scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. - act_bits (int): Number of bits for activation quantization. Default is 16. 
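Note on the hunks above: AutoRoundAdam is dropped from the package re-exports and AdamCompressor is no longer appended to the dynamic class list built in AutoRound.__new__. A later hunk in base.py has BaseCompressor pop an enable_adam flag from **kwargs, so the optimizer choice presumably now travels through plain keyword arguments and is resolved inside the new auto_round.quantizers package rather than by mixing in a dedicated class. A minimal usage sketch under that assumption (the model id and scheme are placeholders, and the return value assumes quantize() still yields the model plus layer config):

from auto_round import AutoRound

ar = AutoRound(
    "facebook/opt-125m",   # placeholder model id, or an already loaded torch.nn.Module
    scheme="W4A16",
    iters=200,
    enable_adam=True,      # consumed via kwargs.pop("enable_adam") in BaseCompressor.__init__
)
quantized_model, layer_config = ar.quantize()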
- act_group_size (int): Group size for activation quantization. Default is None. - act_sym (bool): Whether to use symmetric activation quantization. Default is None. - act_data_type (str): Specifies the data type for activations. - Defaults to None, in which case it inherits the weight data type. - act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. - to_quant_block_names (str|list): A string or list whose elements are list of - block's layer names to be quantized. - enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning - enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer function - **kwargs: Additional keyword arguments. - - Returns: - The quantized model. - """ - - bits: int | None - group_size: int | None - sym: bool | None - data_type: str | None - act_bits: int | None - act_group_size: int | None - act_sym: bool | None - act_data_type: str | None - act_dynamic: bool | None - super_bits: int | None - super_group_size: int | None - - def __init__( - self, - model: Union[torch.nn.Module, str], - tokenizer=None, - platform: str = "hf", - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, - dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", - iters: int = 200, - seqlen: int = 2048, - nsamples: int = 128, - batch_size: int = 8, - gradient_accumulate_steps: int = 1, - low_gpu_mem_usage: bool = False, - device_map: Union[str, int, torch.device, dict] = 0, - enable_torch_compile: bool = False, - seed: int = 42, - optimizer="AdamW", - **kwargs, - ): - super().__init__( - model=model, - tokenizer=tokenizer, - platform=platform, - scheme=scheme, - layer_config=layer_config, - batch_size=batch_size, - dataset=dataset, - low_gpu_mem_usage=low_gpu_mem_usage, - iters=iters, - seqlen=seqlen, - nsamples=nsamples, - seed=seed, - gradient_accumulate_steps=gradient_accumulate_steps, - enable_torch_compile=enable_torch_compile, - device_map=device_map, - optimizer=optimizer, - **kwargs, - ) - - @deprecated("AutoRound") class AutoRoundMLLM(MLLMCompressor): """Class for automatic rounding-based quantization with MLLMs. diff --git a/auto_round/compressors/__init__.py b/auto_round/compressors/__init__.py old mode 100644 new mode 100755 index 6f8ddf681..05623ecb7 --- a/auto_round/compressors/__init__.py +++ b/auto_round/compressors/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from auto_round.compressors.adam import AdamCompressor from auto_round.compressors.base import BaseCompressor from auto_round.compressors.base import LLMCompressor from auto_round.compressors.mllm.compressor import MLLMCompressor diff --git a/auto_round/compressors/adam.py b/auto_round/compressors/adam.py deleted file mode 100644 index fb79cf39a..000000000 --- a/auto_round/compressors/adam.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Union - -import torch - -from auto_round.compressors.base import BaseCompressor -from auto_round.schemes import QuantizationScheme -from auto_round.utils import check_is_cpu, htcore, is_hpex_available - - -class AdamCompressor(BaseCompressor): - """Class for quantization with optimizers like adamw of a PyTorch model. - - Args: - model: The PyTorch model to be quantized. - tokenizer: An optional tokenizer for processing input data. - platform (str): The platform to load pretrained moded, options: ["hf", "model_scope"] - scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations - bits (int): Number of bits for quantization (default is 4). - group_size (int): Size of the quantization group (default is 128). - sym (bool): Whether sym to be used (default is True). - layer_config (dict): Configuration for weight quantization (default is None). - batch_size (int): Batch size for training (default is 8). - amp (bool): Whether to use automatic mixed precision (default is True). - device: The device to be used for training (default is "auto"). - lr_scheduler: The learning rate scheduler to be used. - dataset: The default dataset name (default is "NeelNanda/pile-10k"). - enable_quanted_input (bool): Whether to use quantized input data (default is True). - enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True). - lr (float): The learning rate (default is 0.005). - minmax_lr (float): The learning rate for min-max tuning (default is None). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). - iters (int): Number of iterations (default is 200). - seqlen (int): Length of the sequence. - nsamples (int): Number of samples (default is 128). - sampler (str): The sampling method (default is "rand"). - seed (int): The random seed (default is 42). - nblocks (int): Number of blocks (default is 1). - gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). - not_use_best_mse (bool): Whether to use mean squared error (default is False). - dynamic_max_gap (int): The dynamic maximum gap (default is -1). - data_type (str): The data type to be used (default is "int"). - scale_dtype (str): The data type of quantization scale to be used (default is "float16"), different kernels - have different choices. - act_bits (int): Number of bits for activation quantization. Default is 16. - act_group_size (int): Group size for activation quantization. Default is None. - act_sym (bool): Whether to use symmetric activation quantization. Default is None. - act_data_type (str): Specifies the data type for activations. - Defaults to None, in which case it inherits the weight data type. - act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. - to_quant_block_names (str|list): A string or list whose elements are list of - block's layer names to be quantized. - enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning - enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer function - **kwargs: Additional keyword arguments. - - Returns: - The quantized model. 
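The method bodies deleted just below (_get_optimizer, _get_scaler, _scale_loss_and_backward, _step) are standard AMP plumbing: resolve an optimizer by name from torch.optim, scale the loss through a CUDA GradScaler when AMP is enabled, and keep the step/zero_grad/scheduler/update ordering consistent. A self-contained sketch of that pattern, following the removed code (the HPU htcore.mark_step() calls are omitted):

import torch
from torch.cuda.amp import GradScaler

def resolve_optimizer(optimizer="AdamW"):
    # a string such as "AdamW" is looked up on torch.optim; a class or None passes through
    if optimizer is None:
        return torch.optim.AdamW
    return getattr(torch.optim, optimizer) if isinstance(optimizer, str) else optimizer

def train_step(scaler, optimizer, lr_schedule, loss):
    # same ordering as the deleted _scale_loss_and_backward/_step pair
    if scaler is not None:
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        optimizer.zero_grad()
        lr_schedule.step()
        scaler.update()
    else:
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr_schedule.step()

# scaler is typically GradScaler(init_scale=1024, growth_interval=100000) on CUDA, None on CPU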
- """ - - bits: int | None - group_size: int | None - sym: bool | None - data_type: str | None - act_bits: int | None - act_group_size: int | None - act_sym: bool | None - act_data_type: str | None - act_dynamic: bool | None - super_bits: int | None - super_group_size: int | None - - def __init__( - self, - model: Union[torch.nn.Module, str], - tokenizer=None, - platform="hf", - scheme: Union[str, dict, QuantizationScheme] = "W4A16", - layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, - dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", - iters: int = 200, - seqlen: int = 2048, - nsamples: int = 128, - batch_size: int = 8, - gradient_accumulate_steps: int = 1, - low_gpu_mem_usage: bool = False, - device_map: Union[str, int, torch.device, dict] = 0, - enable_torch_compile: bool = False, - seed: int = 42, - optimizer="AdamW", - **kwargs, - ): - super(AdamCompressor, self).__init__( - model=model, - tokenizer=tokenizer, - platform=platform, - scheme=scheme, - layer_config=layer_config, - batch_size=batch_size, - dataset=dataset, - low_gpu_mem_usage=low_gpu_mem_usage, - iters=iters, - seqlen=seqlen, - nsamples=nsamples, - seed=seed, - gradient_accumulate_steps=gradient_accumulate_steps, - enable_torch_compile=enable_torch_compile, - device_map=device_map, - **kwargs, - ) - - self.optimizer = self._get_optimizer(optimizer) - - def _get_optimizer(self, optimizer): - if optimizer is None: - optimizer = torch.optim.AdamW - elif isinstance(optimizer, str): - optimizer = getattr(torch.optim, optimizer) - else: - optimizer = optimizer - return optimizer - - def _get_scaler(self): - scaler = None - if self.amp and not check_is_cpu(self.device): - from torch.cuda.amp import GradScaler - - scaler = GradScaler(init_scale=1024, growth_interval=100000) - return scaler - - def _scale_loss_and_backward(self, scaler, loss): - if scaler is not None: - loss = scaler.scale(loss) - - loss.backward() - if is_hpex_available(): - htcore.mark_step() - return loss - - def _step(self, scaler, optimizer, lr_schedule): - if scaler is not None: - scaler.step(optimizer) - optimizer.zero_grad() - lr_schedule.step() - scaler.update() - else: - optimizer.step() - optimizer.zero_grad() - lr_schedule.step() - if is_hpex_available(): - htcore.mark_step() diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py old mode 100644 new mode 100755 index b1f46cc1e..3d43ce929 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -50,7 +50,6 @@ reset_params, set_layer_config, ) -from auto_round.data_type import QUANT_FUNC_WITH_DTYPE from auto_round.data_type.utils import reshape_pad_tensor_by_group_size from auto_round.export.export_to_gguf.config import GGUF_INNER_CONFIG, ModelType from auto_round.formats import OutputFormat, get_formats @@ -258,6 +257,7 @@ def __init__( self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False self.trust_remote_code = kwargs.pop("trust_remote_code") if "trust_remote_code" in kwargs else True self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False + self.enable_adam = kwargs.pop("enable_adam") if "enable_adam" in kwargs else False self.quantized = False if isinstance(model, str): model, tokenizer = llm_load_model( @@ -425,7 +425,6 @@ def __init__( self.not_use_best_mse = not_use_best_mse self.dynamic_max_gap = dynamic_max_gap self.lr_scheduler = lr_scheduler - self.optimizer = self._get_optimizer(None) self.disable_opt_rtn = disable_opt_rtn # Whether to pack the layer immediately 
after tuning @@ -912,277 +911,6 @@ def _get_save_folder_name(self, format: OutputFormat) -> str: return self.orig_output_dir - @torch.inference_mode() - def _quantize_embedding_layer(self): - """Quantizes embedding layers in the model according to the configuration. - - This method iterates through all modules in the model, identifies embedding - layers specified in `self.layer_config`, and applies the appropriate quantization - function based on bit precision, grouping strategy, and dtype. - - Returns: - bool: True if the quantization process completes without critical errors. - """ - is_quantized = False - for name, module in self.model.named_modules(): - # Skip non-Embedding modules or layers not in config - if not isinstance(module, torch.nn.Embedding) or name not in self.layer_config: - continue - - config = self.layer_config[name] - - # Skip layers that are not marked for quantization - if not check_to_quantized(config): - continue - is_quantized = True - config["scale_dtype"] = self.scale_dtype - dtype = config["data_type"] - - # Determine quantization function key with symmetry/asymmetry - if dtype not in QUANT_FUNC_WITH_DTYPE: - dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}" - - # Optionally use optimized rounding (RTN) variant - if not self.disable_opt_rtn and f"rtn_{dtype}" in QUANT_FUNC_WITH_DTYPE: - dtype = f"rtn_{dtype}" - - quant_func = QUANT_FUNC_WITH_DTYPE[dtype] - dtype = module.weight.dtype - # As typically float32 are used in RTN to search scale zp, - # to avoid cache a bf16 copy we'd better use float32 - if config.get("super_group_size", None) is not None: - dtype = torch.float32 - - # Attempt quantization on GPU, fall back to CPU if OOM - try: - weight, scale, zp = quant_func( - module.weight.to(dtype=dtype, device=self.device), - **{ - k: config.get(k, None) - for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] - }, - ) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - try: - logger.error(cuda_error_msg) - logger.warning("falling back to CPU") - weight, scale, zp = quant_func( - module.weight.to("cpu"), - **{ - k: config.get(k, None) - for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] - }, - ) - except Exception as e: - raise - - # Overwrite the module's weights with the quantized version - module.weight.data.copy_(weight.cpu()) - - # Attach scale and zero point (zp) to the module - for param_name, value in zip(["scale", "zp"], [scale, zp]): - if isinstance(value, dict): - for k, v in value.items(): - setattr(module, k if k == "scale" else f"w_{k}", v.cpu()) - elif isinstance(value, torch.Tensor): - setattr(module, param_name, value.cpu()) - else: - setattr(module, param_name, value) - - # Update config - self.layer_config.setdefault(name, {}).update(config) - del weight - del scale - del zp - clear_memory(device_list=self.device_list) - - return is_quantized - - def _quant_rtn_with_imatrix(self, all_to_quantized_module_names: list[str]) -> None: - """Performs RTN quantization using input activation statistics (imatrix). - - This method accumulates per-channel second-moment activation statistics (imatrix) - via forward hooks and uses them to perform RTN quantization. If CUDA memory runs out, - it falls back to CPU-based blockwise quantization. - - Args: - all_to_quantized_module_names (list[str]): - A list of module names (e.g., 'model.layers.0.self_attn.q_proj') to be quantized. 
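A detail worth keeping in mind from the _quantize_embedding_layer deletion above: the quantization kernel is chosen by key lookup in QUANT_FUNC_WITH_DTYPE, appending a _sym/_asym suffix when the plain data type is not registered and preferring an rtn_-prefixed optimized variant unless disable_opt_rtn is set. The same lookup in isolation (the mapping itself lives in auto_round.data_type; the example keys are illustrative):

def resolve_quant_func(quant_funcs: dict, data_type: str, sym: bool, disable_opt_rtn: bool):
    key = data_type
    if key not in quant_funcs:
        key = f"{data_type}_{'sym' if sym else 'asym'}"   # e.g. "int" -> "int_sym"
    if not disable_opt_rtn and f"rtn_{key}" in quant_funcs:
        key = f"rtn_{key}"                                # optimized RTN variant wins when present
    return quant_funcs[key]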
- - Returns: - None - """ - logger.info("start to compute imatrix") - - # Load dataset - from auto_round.calib_dataset import get_dataloader - - if isinstance(self.dataset, str): - if self.tokenizer is None: - raise ValueError("A tokenizer must be set for the model when using a dataset string.") - dataset_name = self.dataset.replace(" ", "") - self.dataloader = get_dataloader( - self.tokenizer, self.seqlen, dataset_name, self.seed, self.batch_size, self.nsamples - ) - else: - self.dataloader = self.dataset - - model = self.model - - # Dispatch multi-GPU model if necessary - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - dispatch_model(model, model.hf_device_map) - - def register_act_hook(model): - """Registers hooks to accumulate activation squared norms into `imatrix`.""" - - def get_imatrix_hook(module, input, output): - input = input[0] if isinstance(input, (tuple, list)) else input - flattened = input.reshape(-1, input.shape[-1]).to(torch.float32) - squared = torch.sum(torch.pow(flattened, 2), dim=0).to(torch.float32) - - if not hasattr(module, "imatrix"): - module.imatrix = squared - module.imatrix_cnt = input.shape[0] - else: - module.imatrix += squared.to(module.imatrix.device) - module.imatrix_cnt += input.shape[0] - - hook_handles = [] - for name, module in model.named_modules(): - if type(module) in self.supported_types and check_to_quantized(module): - hook = module.register_forward_hook(get_imatrix_hook) - hook_handles.append(hook) - return hook_handles - - hooks = register_act_hook(model) - - try: - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(model) - safe_to_cpu_(model) - clear_memory(device_list=self.device_list) - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - try: - logger.error(cuda_error_msg) - # Final fallback: warn and use CPU-only quantization - logger.warning( - "Fallback to CPU. " - "Consider enabling `low_gpu_mem_usage` or using more GPUs via `--device 0,1,2,3`." - ) - safe_to_cpu_(model) - clear_memory(device_list=self.device_list) - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(model) - - orig_device = self.device - self.device = "cpu" - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - self.device = orig_device - except Exception as e: - raise - finally: - # Always remove hooks - for hook in hooks: - hook.remove() - - def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: - """Quantizes a layer using RTN (Round-To-Nearest) if available. - - This function attempts to quantize a layer by switching its data type to a - `rtn_*` version if supported, then wraps and unwraps the module to apply - quantization. If GPU memory is insufficient, it falls back to CPU. - - If packing is enabled (`immediate_packing`), the function will also export - the quantized layer to the appropriate backend format. - - Args: - name (str): Name of the layer to quantize. - - Raises: - RuntimeError: If quantization fails for reasons unrelated to memory. 
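The imatrix collection deleted above reduces to a forward hook that accumulates per-input-channel sums of squared activations together with a sample count; the block-wise path later normalizes with m.imatrix /= m.imatrix_cnt before the RTN kernels consume the statistics. The hook on its own, following the removed code:

import torch

def attach_imatrix_hook(module: torch.nn.Module):
    """Accumulate per-channel sums of squared inputs on module.imatrix."""
    def hook(mod, inputs, output):
        x = inputs[0] if isinstance(inputs, (tuple, list)) else inputs
        flat = x.reshape(-1, x.shape[-1]).to(torch.float32)
        sq = torch.sum(flat.pow(2), dim=0)          # per-channel sum of squares
        if not hasattr(mod, "imatrix"):
            mod.imatrix, mod.imatrix_cnt = sq, x.shape[0]
        else:
            mod.imatrix += sq.to(mod.imatrix.device)
            mod.imatrix_cnt += x.shape[0]
    return module.register_forward_hook(hook)

# after the calibration passes: module.imatrix /= module.imatrix_cnt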
- """ - m = get_module(self.model, name) - if dtype is not None: - m = m.to(dtype) - - if is_fp8_linear(m): - m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) - set_module(self.model, name, m) - tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.device - # Step 1: let gguf merge layers or rename module first and we will handle the RTN is gguf specific logic - if self.is_immediate_packing and self.iters == 0 and self.formats[0].is_gguf() and not self.disable_opt_rtn: - m = m.to(tuning_device) - m.scale = None - m.zp = None - else: - try: - disable_opt_rtn = self.disable_opt_rtn - if ( - not disable_opt_rtn - and self.orig_disable_opt_rtn is None - and self.is_moe_model - and "expert" in m.global_name - and "shared_expert" not in m.global_name - and self.super_bits is None # GGUF still uses the optimized RTN for MoE layers - ): - disable_opt_rtn = True - logger.warning_once( - "MoE layer detected: optimized RTN is disabled for efficiency. " - "Use `--enable_opt_rtn` to force-enable it for MoE layers." - ) - m = m.to(tuning_device) - m = WrapperLinear( - m, - device=tuning_device, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_round_tuning=False, - enable_torch_compile=self.enable_torch_compile, - disable_opt_rtn=disable_opt_rtn, - ) - m = m.unwrapper({}) - except torch.OutOfMemoryError: - cuda_error_msg = traceback.format_exc() - m = m.orig_layer if hasattr(m, "orig_layer") else m - try: - logger.error(cuda_error_msg) - logger.warning("falling back to CPU.") - m.to("cpu") - m = WrapperLinear( - m, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_round_tuning=False, - enable_torch_compile=self.enable_torch_compile, - ) - m = m.unwrapper({}) - except Exception as e: - raise - - # Step 2: Optional immediate packing/export - if self.is_immediate_packing: # For gguf, packing conducts on block level - self._immediate_pack(name) - if to_cpu: - m = m.to("cpu") - packed_m = get_module(self.model, name) - set_module(self.model, name, packed_m.to("cpu")) - else: - if to_cpu: - m = m.to("cpu") - set_module(self.model, name, m) - if self.is_immediate_saving: - m = get_module(self.model, name) - m.to("cpu") - shard_writer(self, m, name, False) - def _immediate_pack(self, name: str): if not self.is_immediate_packing: return @@ -1195,315 +923,6 @@ def _immediate_pack(self, name: str): tokenizer=self.tokenizer, ) - @torch.inference_mode() - def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: - """Quantize all modules in the model using RTN (Round-To-Nearest) strategy. - - If the target format includes GGUF with `k`, and optimized RTN is enabled, - blockwise quantization with input caching and imatrix is used. - - Returns: - tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. 
- """ - if self.amp and self.model.dtype != self.amp_dtype: - self.model.to(self.amp_dtype) - - all_to_quantized_module_names: list[str] = [n for n, m in self.model.named_modules() if check_to_quantized(m)] - self.all_to_quantized_module_names = all_to_quantized_module_names - if is_nv_fp(self.data_type): - # FIXME: (yiliu30) change it to block-wise after we refactor the quantization code and - # https://github.com/intel/auto-round/issues/1331 - materialize_model_(self.model) - self.model.to("cpu") - from auto_round.data_type.nvfp import calculate_gparam - from auto_round.data_type.utils import update_fused_layer_global_scales - - pbar = tqdm(all_to_quantized_module_names) - for name in pbar: - pbar.set_description(f"Calculate weight global scale: {name}") - m = get_module(self.model, name) - if is_fp8_linear(m): - m = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device) - set_module(self.model, name, m) - weight_global_scale = calculate_gparam(m.weight, self.group_size) - setattr(m, "weight_global_scale", weight_global_scale) - - logger.info("Start to update fused layer global scales, it may take some time.") - for name, module in self.model.named_modules(): - update_fused_layer_global_scales(module) - logger.info("Finished updating fused layer global scales.") - - if not (any(fmt.is_gguf() for fmt in getattr(self, "formats", [])) or self.super_bits is not None): - self._quantize_embedding_layer() # leave to gguf itself to handle - - # Release memory - clear_memory(device_list=self.device_list) - - enable_imatrix = False - if not self.disable_opt_rtn: - has_gguf_k = ( - any(fmt.is_gguf() and "k" in fmt.output_format for fmt in getattr(self, "formats", [])) - or self.super_bits is not None - ) - if has_gguf_k: - enable_imatrix = True - elif self.data_type == "int" and self.sym: - enable_imatrix = True - if enable_imatrix: - self._quant_rtn_with_imatrix(all_to_quantized_module_names) - elif self.act_bits <= 8 and check_need_act_calibration( - self.act_dynamic, - self.act_data_type, - self.act_bits, - self.static_kv_dtype, - self.static_attention_dtype, - ): # TODO, mixed datatype has bug - hook_handles = self._register_act_max_hook(self.model) - try: - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - except torch.OutOfMemoryError: - logger.warning("Fallback to CPU. 
Consider using more GPUs via `--device 0,1,2,3`.") - self.model = self.model.to("cpu") - clear_memory(device_list=self.device_list) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - import accelerate - - accelerate.hooks.remove_hook_from_submodules(self.model) - orig_device = self.device - self.device = "cpu" - self._quantize_via_rtn_blockwise(all_to_quantized_module_names) - self.device = orig_device - for handle in hook_handles: - handle.remove() - else: - # By default, we go with layer-wise way if no replacement happened - use_blockwise_quantization = global_state.replaced_module_count > 0 - tied_weights_keys = getattr(self.model, "_tied_weights_keys", []) - if tied_weights_keys is None: - tied_weights_keys = [] - if isinstance(tied_weights_keys, dict): - tied_weights_values = list(tied_weights_keys.values()) - else: - tied_weights_values = list(tied_weights_keys) - tied_weights_layers = [".".join(val.split(".")[:-1]) for val in tied_weights_values] # rm weight/bias - # In fact, we should detect whether it is is_separate_lm_head, to simplify, we don't do it - if hasattr(self, "formats") and self.formats[0].is_gguf(): - lm_head_name = get_lm_head_name(self.model) - if lm_head_name is not None: - tied_weights_layers.append(lm_head_name) - - if use_blockwise_quantization: # The ram usage is a little higher - all_to_quantized_module_names = list(set(all_to_quantized_module_names)) - - all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) - pbar = tqdm(range(sum(len(block) for block in all_blocks))) - for block_names in all_blocks: - for block_name in block_names: - pbar.set_description(f"Quantizing {block_name}") - block = get_module(self.model, block_name) - materialize_model_(block) - for name, m in block.named_modules(): - if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.global_name, to_cpu=self.low_gpu_mem_usage) - all_to_quantized_module_names.remove(m.global_name) - elif ( - not any(m.children()) - and len(m.state_dict()) > 0 - and m.global_name not in tied_weights_layers - ): - set_module(self.model, m.global_name, copy.deepcopy(m)) - if self.is_immediate_saving: - shard_writer(self, name=m.global_name) - m.to("meta") - clear_memory(device_list=self.device_list) - memory_monitor.log_summary() - pbar.update(1) - cnt = 1 - for name in all_to_quantized_module_names: - logger.info(f"Quantizing remaining layer {name} on CPU.") - self._quantize_layer_via_rtn(name, to_cpu=True) - cnt += 1 - if cnt % 10 == 0: - clear_memory(device_list=self.device_list) - memory_monitor.log_summary() - else: - materialize_model_(self.model) - self.model.to("cpu") - block_names_cnt = len(flatten_list(get_block_names(self.model, True))) - clear_mem_freq = len(all_to_quantized_module_names) // block_names_cnt - cnt = 0 - pbar = tqdm(all_to_quantized_module_names) - - for n, m in self.model.named_modules(): - if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: - pbar.set_description(f"Quantizing {m.global_name}") - self._quantize_layer_via_rtn(m.global_name) - cnt += 1 - pbar.update() - if cnt % clear_mem_freq == 0: - clear_memory(device_list=self.device_list) - memory_monitor.log_summary() - - elif not any(m.children()) and len(m.state_dict()) > 0 and n not in tied_weights_layers: - set_module(self.model, n, copy.deepcopy(m)) - if self.is_immediate_saving: - shard_writer(self, name=n) - m.to("meta") - - # Convert remaining fp8 - if 
is_fp8_model(self.model): - convert_fp8_module_to_16b(self.model, self.amp_dtype, self.device) - if self.is_immediate_saving: - shard_writer(self, is_finalize=True) - - self.quantized = True - return self.model, self.layer_config - - def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) -> None: - """Quantize model layers block by block using cached inputs and imatrix. - - Args: - all_to_quantized_module_names (list[str]): Names of layers to be quantized. - """ - all_to_quantized_module_names = list(set(all_to_quantized_module_names)) - - all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) - if not all_blocks: - raise ValueError("Could not find any blocks. Check the model or quant_block_list.") - - all_first_block_names = [block[0] for block in all_blocks] - layer_names = self._get_quantized_layer_names_outside_blocks() - if self.act_bits < 16 and (not self.act_dynamic or len(layer_names) > 0): - if len(layer_names) > 0: - logger.warning( - "quantize layers outside blocks for static activation quantizaiton" - " will significantly increase calibration time" - ) - all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names) - else: - all_inputs = self.cache_inter_data(all_first_block_names, self.nsamples) - - # Clear hooks for multi-GPU setups - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules(self.model) - - pbar = tqdm(range(sum(len(block) for block in all_blocks))) - - for block_names in all_blocks: - first_block = block_names[0] - inputs = all_inputs.pop(first_block) - input_keys = [k for k in inputs if k.startswith("hidden_state")] - if len(input_keys) != 1: - raise RuntimeError( - "hidden_states arg mismatch. 
Please file an issue at https://github.com/intel/auto-round/issues" - ) - inputs["input_ids"] = inputs.pop(input_keys[0]) - - clear_memory(self.inputs, device_list=self.device_list) - - total_samples = len(inputs["input_ids"]) - if total_samples < self.batch_size: - self.batch_size = total_samples - logger.warning(f"Forcing batch size to {total_samples}") - - input_ids = to_device(inputs.pop("input_ids"), self.cache_device) - input_others = to_device(inputs, self.cache_device) - - tmp_dtype = self.amp_dtype if self.amp else torch.float32 - input_ids = [id_.to(tmp_dtype) for id_ in input_ids] - - for key, val in input_others.items(): - if isinstance(val, torch.Tensor) and val.dtype in (torch.float16, torch.bfloat16): - input_others[key] = val.to(tmp_dtype) - elif isinstance(val, list): - input_others[key] = [to_dtype(v, tmp_dtype) for v in val] - - for block_name in block_names: - pbar.set_description(f"Quantizing {block_name}") - block = get_module(self.model, block_name) - materialize_model_(block) - block.to("cpu") - if is_fp8_model(self.model): - convert_fp8_module_to_16b(block, dtype=self.amp_dtype, device=self.device) - - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: - set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, self.device - ) - # Dispatch model if needed - if len(self.device_list) > 1: - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - for _, m in block.named_modules(): - if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): - continue - hook = AlignDevicesHook(m.tuning_device, io_same_device=True) - add_hook_to_module(m, hook, True) - else: - block = block.to(self.device) - input_ids = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - self.device, - self.cache_device, - ) - - if len(self.device_list) > 1: - accelerate.hooks.remove_hook_from_submodules(block) - - if is_nv_fp(self.act_data_type) or is_static_wfp8afp8(self): - # enable moe experts act_max automatic generation for Linear - set_amax_for_all_moe_layers(block, attr_name="act_max") - # Normalize imatrix and quantize layers - if self.low_gpu_mem_usage: - block.to("cpu") - clear_memory(device_list=self.device_list) - - for name, m in block.named_modules(): - # fix issue: Ling-flash-2.0-q2_k_s fail infer on cuda but well on cpu - # https://huggingface.co/Intel/Ling-flash-2.0-gguf-q2ks-mixed-AutoRound/discussions/1 - if hasattr(m, "imatrix"): - m.imatrix /= m.imatrix_cnt - if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: - self._quantize_layer_via_rtn(m.global_name, to_cpu=self.low_gpu_mem_usage) - all_to_quantized_module_names.remove(m.global_name) - - if not self.is_immediate_saving: - # some modules may have been flushed and set to meta, so we could not move to gpu - mv_module_from_gpu(block) - if block_name == block_names[-1]: - clear_memory(input_ids, device_list=self.device_list) - else: - clear_memory(device_list=self.device_list) - - memory_monitor.log_summary() - pbar.update(1) - pbar.close() - # Process remaining layers not in blocks - for name in all_to_quantized_module_names: - dtype = None - if self.super_group_size is not None: - dtype = torch.float32 - self._quantize_layer_via_rtn(name, dtype=dtype) - # clear_memory(device_list=self.device_list) - # if self.is_immediate_saving: - # shard_writer(self, is_finalize=True) - - def _update_inputs(self, inputs: dict, q_inputs: dict) -> 
tuple[dict, torch.Tensor]: - keys = inputs.keys() - input_id_str = [key for key in keys if key.startswith("hidden_state")] - if len(input_id_str) != 1: - raise RuntimeError( - "hidden_states arg mismatch error," - "please raise an issue in https://github.com/intel/auto-round/issues" - ) - inputs["input_ids"] = inputs.pop(input_id_str[0], None) - if q_inputs is not None: - q_inputs = q_inputs.pop(input_id_str[0], None) - return inputs, q_inputs - def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True): is_gguf_format = (f := getattr(self, "formats", None)) is not None and len(f) > 0 and f[0].is_gguf() if not is_gguf_format: @@ -1639,214 +1058,14 @@ def _should_disable_inplace_due_to_layers_outside_block() -> bool: # Determine if immediate packing is required self._adjust_immediate_packing_and_saving() - if self.iters == 0: - return self._quantize_rtn() - - if bool(self.quant_block_list): - all_blocks = self.quant_block_list - else: - all_blocks = get_block_names(self.model) - - if len(all_blocks) == 0: - logger.warning("could not find blocks, exit with original model") - return self.model, self.layer_config - - if self.amp and self.model.dtype != self.amp_dtype: - self.model = self.model.to(self.amp_dtype) - - layer_names = self._get_quantized_layer_names_outside_blocks() - start_time = time.time() - all_first_block_names = [block[0] for block in all_blocks] - if len(layer_names) > 0: - logger.info( - "Starting to cache block inputs. This may be slow due to external block layers: %s", layer_names - ) - else: - logger.info("start to cache block inputs") - all_inputs = self.try_cache_inter_data_gpucpu(all_first_block_names, self.nsamples, layer_names=layer_names) - is_quantized_embedding = self._quantize_embedding_layer() - clear_memory(device_list=self.device_list) - all_q_inputs = None - if is_quantized_embedding: - all_inputs = copy.deepcopy(self.inputs) - clear_memory(self.inputs, device_list=self.device_list) - all_q_inputs = self.try_cache_inter_data_gpucpu( - all_first_block_names, self.nsamples, layer_names=layer_names - ) - self.model = mv_module_from_gpu(self.model) - clear_memory(device_list=self.device_list) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules(self.model) # self.model.hf_device_map has not been changed - logger.info("caching done") - if len(all_blocks) > 1: - pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks)) - else: - pbar = tqdm(range(0, len(all_blocks[0]), self.nblocks)) # move the alg warning outside pbar - - for block_names in all_blocks: - inputs = all_inputs[block_names[0]] - all_inputs.pop(block_names[0]) - q_inputs = None - if all_q_inputs is not None: - q_inputs = all_q_inputs[block_names[0]] - all_q_inputs.pop(block_names[0]) - - inputs, q_inputs = self._update_inputs(inputs, q_inputs) - - clear_memory(self.inputs, device_list=self.device_list) - - if "input_ids" in inputs.keys(): - total_samples = len(inputs["input_ids"]) - if total_samples < self.batch_size: - self.batch_size = total_samples - logger.warning(f"force the train batch size to {total_samples}") - - self._quantize_blocks( - self.model, - inputs, - block_names, - q_input=q_inputs if q_inputs is not None else None, - nblocks=self.nblocks, - device=self.device, - pbar=pbar, - ) - if self.is_immediate_packing and len(self.formats) != 1: - raise ValueError( - f"Expected exactly one packing format when 'immediate_packing' is True, " - f"but got {len(self.formats)} formats." 
- ) - pbar.set_description("Quantizing done") - pbar.close() - self._quantize_layers(layer_names, all_inputs) - - if is_fp8_model(self.model): - for n, m in self.model.named_modules(): - if is_fp8_linear(m): - new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device).to("cpu") - set_module(self.model, n, new_layer) - if self.is_immediate_saving: - shard_writer(self, is_finalize=True) - - end_time = time.time() - cost_time = end_time - start_time - logger.info(f"quantization tuning time {cost_time}") - - # Dump a summary - quantized_layers = [] - unquantized_layers = [] - for n, m in self.model.named_modules(): - if isinstance(m, tuple(self.supported_types)): - if check_to_quantized(m): - quantized_layers.append(n) - else: - unquantized_layers.append(n) - elif hasattr(m, "scales") or hasattr(m, "scale"): # packing_immediately - quantized_layers.append(n) - summary_info = ( - f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} in the model" - ) - if len(unquantized_layers) > 0: - summary_info += f", {unquantized_layers} have not been quantized" - logger.info(summary_info) - - self.quantized = True - return self.model, self.layer_config + if self.immediate_saving and "int" not in self.data_type: + logger.warning("immediate_saving is only supported for int quantization, set to False") + self.immediate_saving = False - def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: - """Quantizes specified layers based on inputs and configuration. + from auto_round.quantizers import create_quantizers - Args: - layer_names (list): list of layer names to quantize. - layer_inputs (dict): Dictionary mapping layer names to input data. - - Returns: - None - """ - # TODO currently we take all the layers outside blocks as post block layers which is not optimal - # if there is no input for layer, we use rtn - - for layer_name in copy.deepcopy(layer_names): - if layer_name not in layer_inputs: - if self.act_bits < 16 and not self.act_dynamic: - # Activation quantization requires collected inputs - msg_prefix = ( - f"Activation max hook for layer '{layer_name}' is unavailable due to " - f"insufficient collected inputs. " - ) - if "fp8_e5m2" in self.act_data_type: - logger.warning(msg_prefix + "Please notes that unit scale is used for this layer.") - else: - logger.warning( - msg_prefix + "Static activation quantization is not supported or ineffective, " - "Skipping quantization for this layer." 
- ) - layer_names.remove(layer_name) - continue - logger.info(f"using rtn to quantize {layer_name}") - from auto_round.data_type import QUANT_FUNC_WITH_DTYPE - - layer = get_module(self.model, layer_name) - layer = layer.to(self.device) - if is_fp8_linear(layer): - new_layer = convert_fp8_layer_to_linear(layer, self.amp_dtype, self.device).to(self.device) - set_module(self.model, layer_name, new_layer) - layer = new_layer - - wrapper_layer = WrapperLinear( - layer, - enable_round_tuning=False, - enable_minmax_tuning=False, - enable_norm_bias_tuning=False, - enable_torch_compile=self.enable_torch_compile, - device=self.device, - disable_opt_rtn=self.disable_opt_rtn, - ) - new_layer = wrapper_layer.unwrapper({}) - set_module(self.model, layer_name, new_layer) - layer.cpu() - layer_names.remove(layer_name) - if len(layer_names) == 0: - memory_monitor.update() - memory_monitor.log_summary() - return - q_layer_inputs = None - enable_quanted_input = self.enable_quanted_input - has_gguf = False - - if hasattr(self, "formats"): - has_gguf = any(format_.is_gguf() for format_ in self.formats) - if has_gguf and self.is_immediate_packing: - enable_quanted_input = False - - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1 and enable_quanted_input: - dispatch_model(self.model, self.model.hf_device_map) - - if enable_quanted_input: - logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) - q_layer_inputs = self.try_cache_inter_data_gpucpu([], self.nsamples, layer_names=layer_names) - if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: - accelerate.hooks.remove_hook_from_submodules( - self.model - ) # self.model.hf_device_map has not been changed - if not self.is_immediate_saving: - self.model = mv_module_from_gpu(self.model) - clear_memory(device_list=self.device_list) - quant_layer = self._quantize_layer - for layer_name in layer_names: - layer_input = layer_inputs[layer_name] - layer_input = to_device(layer_input, self.cache_device) - q_layer_input = q_layer_inputs.get(layer_name, None) if q_layer_inputs is not None else None - q_layer_input = to_device(q_layer_input, self.cache_device) - quant_layer(layer_name, layer_input, q_layer_input, device=self.device) - if self.is_immediate_packing: - self._immediate_pack(layer_name) - - if self.is_immediate_saving: - m = get_module(self.model, layer_name) - shard_writer(self, m, name=layer_name, is_finalize=False) - del layer_input - clear_memory(q_layer_input, device_list=self.device_list) - memory_monitor.log_summary() + quantizers = create_quantizers(self) + return quantizers.quantize() @torch.no_grad() def _get_block_outputs( @@ -2399,244 +1618,6 @@ def _replace_forward(self): hook_handle = m.register_forward_hook(hook_func) self.hook_handles.append(hook_handle) - def _register_act_max_hook(self, model): - def get_act_max_hook(module, input, output): - if isinstance(input, (tuple, list)): - input = input[0] - if input.numel() == 0: - return # as no needs for act_max update - input, _, _ = reshape_pad_tensor_by_group_size(input, self.act_group_size) - act_max = torch.max(torch.abs(input), dim=-1).values - if not hasattr(module, "act_max") or module.act_max.numel() == 0: - module.act_max = act_max - else: - act_max = act_max.to(module.act_max.device) - if is_nv_fp(self.act_data_type): ## for nvfp per-tensor input_global_scale calculation usage - module.act_max = torch.max( - torch.tensor([act_max.max(), module.act_max.max()], device=act_max.device) - ) - else: - 
module.act_max = torch.max(act_max, module.act_max) - - hook_handles = [] - # for single layers out of blocks, like lm_head - if isinstance(model, SUPPORTED_LAYER_TYPES): - m = model - if ( - hasattr(m, "act_dynamic") - and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) - and check_to_quantized(m) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - return hook_handles - - for n, m in model.named_modules(): - if ( - hasattr(m, "act_dynamic") - and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) - and check_to_quantized(m) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - continue - - # for whole model, RTN - if n in self.layer_config: - config = self.layer_config[n] - act_dynamic = config.get("act_dynamic", True) - act_data_type = config.get("act_data_type", None) - act_bits = config.get("act_bits", 16) - if ( - config["bits"] <= 8 - and check_need_act_calibration(act_dynamic, act_data_type, act_bits) - and check_to_quantized(config) - ): - hook = m.register_forward_hook(get_act_max_hook) - hook_handles.append(hook) - continue - return hook_handles - - def _quantize_layer( - self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu" - ): - """Quantize a specific layer of the model using the provided inputs. - - Args: - layer_name (str): The name of the layer to quantize. - inputs (torch.Tensor): Input data for quantization. - q_inputs (torch.Tensor, optional): Quantized input data. Defaults to None. - device (torch.device, optional): The device to use for quantization. Defaults to torch.device("cpu"). - - Returns: - None - """ - logger.info(f"quantizing layer {layer_name}") - layer = get_module(self.model, layer_name) - if hasattr(layer, "tuning_device"): - device = layer.tuning_device - - layer = layer.to(device) - for i in range(len(inputs)): - inputs[i] = inputs[i].to(layer.weight.dtype) - if q_inputs is not None: - q_inputs[i] = q_inputs[i].to(layer.weight.dtype) - - if self.act_bits <= 8 and check_need_act_calibration( - self.act_dynamic, - self.act_data_type, - self.act_bits, - self.static_kv_dtype, - self.static_attention_dtype, - ): - tmp_inputs = q_inputs if q_inputs is not None else inputs - hook_handles = self._register_act_max_hook(layer) - with torch.no_grad(): - for input in tmp_inputs: - layer(input) - for handle in hook_handles: - handle.remove() - - wrapper_linear = WrapperLinear( - layer, - enable_minmax_tuning=self.enable_minmax_tuning, - enable_torch_compile=self.enable_torch_compile, - device=device, - ).to(device) - round_params = [] - minmax_params = [] - for key in wrapper_linear.params.keys(): - if "min" in key or "max" in key: - minmax_params.append(wrapper_linear.params[key]) - else: - round_params.append(wrapper_linear.value) - if len(round_params) + len(minmax_params) <= 0: - dump_info = f"quantized {layer_name}" - logger.info(dump_info) - with torch.no_grad(): - unwrapper_layer(self.model, wrapper_linear, layer_name, {}) - mv_module_from_gpu(layer) - - lr = torch.tensor(self.lr) - minmax_lr = torch.tensor(self.minmax_lr) - if self.enable_minmax_tuning: - optimizer = self.optimizer( - [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 - ) - else: - optimizer = self.optimizer(round_params, lr=lr, weight_decay=0) - - if self.lr_scheduler is None: - lr_schedule = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters - ) - 
else: - lr_schedule = copy.deepcopy(self.lr_scheduler) - nsamples = len(inputs) - last_best_iter = 0 - best_loss = torch.finfo(torch.float).max - scaler = self._get_scaler() # pylint: disable=assignment-from-none - init_loss = None - gradient_accumulate_steps = self.batch_size # Force to low gpu - - total_loss = 0 - num_elm = 1 - mse_reduction = "mean" - if gradient_accumulate_steps != 1: - mse_reduction = "sum" - mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) - batch_size = 1 # Force to low gpu - global_batch_size = self.batch_size * gradient_accumulate_steps - global_batch_size = min(nsamples, global_batch_size) - if gradient_accumulate_steps != 1 and not self.attention_mask: - whole_indices = torch.arange(global_batch_size) - if q_inputs is not None: - num_elm = self._get_current_num_elm(q_inputs, whole_indices) - else: - num_elm = self._get_current_num_elm(inputs, whole_indices) - - index_sampler = IndexSampler(nsamples, global_batch_size) - - for i in range(self.iters): - total_loss = 0 - global_indices = index_sampler.next_batch() - if self.attention_mask: - num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) - - for tmp_step in range(gradient_accumulate_steps): - indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - if q_inputs is not None: - current_input = [q_inputs[i] for i in indices] - current_input = torch.cat(current_input, dim=0).to(device) - org_input = [inputs[i] for i in indices] - org_input = torch.cat(org_input, dim=0).to(device) - else: - current_input = [inputs[i] for i in indices] - current_input = torch.cat(current_input, dim=0).to(device) - org_input = current_input - with torch.no_grad(): - current_output = layer(org_input) - autocast_ctx = ( - nullcontext() - if not self.amp - else autocast(device_type=str(device).split(":")[0], dtype=self.amp_dtype) - ) - if self.attention_mask: - tmp_attention_mask = [self.attention_mask[i] for i in indices] - tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) - tmp_attention_mask.unsqueeze_(-1) - - with autocast_ctx: - output_q = wrapper_linear(current_input) # pylint: disable=not-callable - loss = mse_loss( # pylint: disable=not-callable - (output_q * tmp_attention_mask).to(torch.float32), - (current_output * tmp_attention_mask).to(torch.float32), - ) - - else: - with autocast_ctx: - output_q = wrapper_linear(current_input) # pylint: disable=not-callable - loss = mse_loss( # pylint: disable=not-callable - output_q.to(torch.float32), - current_output.to(torch.float32), # mul 1.0 will copy the output - ) - - num_elm = 1 if num_elm <= 0 else num_elm - total_loss += loss.item() / num_elm - - self._scale_loss_and_backward(scaler, loss) - if i == 0: - init_loss = total_loss - - if total_loss < best_loss: - best_loss = total_loss - if not self.not_use_best_mse: - best_params = collect_best_params(wrapper_linear, self.cache_device) - last_best_iter = i - if self.not_use_best_mse and i == self.iters - 1: - best_params = collect_best_params(wrapper_linear, self.cache_device) - - if not self.not_use_best_mse: - if 0 < self.dynamic_max_gap <= i - last_best_iter: - break - self._step(scaler, optimizer, lr_schedule) - - last_loss = total_loss - best_iter = self.iters - if not self.not_use_best_mse: - last_loss = best_loss - best_iter = last_best_iter - with torch.no_grad(): - unwrapper_layer(self.model, wrapper_linear, layer_name, best_params) - mv_module_from_gpu(layer) - dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: 
{last_loss:.6f}" - logger.info(dump_info) - - def _get_current_output(self, output: list[torch.Tensor], indices: list[int]) -> torch.Tensor: - current_output = [output[x] for x in indices] - current_output = torch.cat(current_output, dim=self.batch_dim) - return current_output - def _get_current_q_output( self, block: torch.nn.Module, @@ -2665,13 +1646,6 @@ def _get_current_num_elm( current_input_ids = [input_ids[i] for i in indices] return sum(id.numel() for id in current_input_ids) - def _get_non_zero_cnt(self, tensor: list[torch.Tensor], indices: list[int]) -> int: - current_tensors = [tensor[i] for i in indices] - non_zero_cnt = 0 - for t in current_tensors: - non_zero_cnt += torch.count_nonzero(t).item() - return non_zero_cnt - def quantize_block( self, block: torch.nn.Module, @@ -2694,8 +1668,20 @@ def quantize_block( self.normalize_decoding_layer_inputs_(inputs) block_inputs = self.inputs[self.quant_block_list[0][0]] decoding_layer_first_input_name = "hidden_states" - input_ids, input_others = self._preprocess_block_inputs(block_inputs, decoding_layer_first_input_name) - return self._quantize_block(block, input_ids, input_others, q_input, device, auto_offload) + from auto_round.quantizers.algs.auto_round import ARQuantizer + from auto_round.quantizers.utils import preprocess_block_inputs + + input_ids, input_others = preprocess_block_inputs( + block_inputs, + device_list=self.device_list, + first_input_name=decoding_layer_first_input_name, + amp=self.amp, + amp_dtype=self.amp_dtype, + cache_device=self.cache_device, + is_diffusion=self.diffusion, + ) + ar_quantizer = ARQuantizer(self) + return ar_quantizer.quantize_block(block, input_ids, input_others, q_input, device, auto_offload) def _get_loss( self, @@ -2726,384 +1712,6 @@ def _get_loss( return loss - def _quantize_block( - self, - block: torch.nn.Module, - input_ids: Union[list[torch.Tensor], dict], - input_others: dict, - q_input: Union[torch.Tensor, dict, None] = None, - device: Union[str, torch.device] = "cpu", - auto_offload=True, - ): - """Quantize the weights of a given block of the model. - - Args: - block: The block of the model to be quantized. - input_ids: The input tensor containing tokenized input ids. - input_others: A dictionary containing additional input data. - q_input: The quantized input tensor. - device: The device for quantization. 
- - Returns: - Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) - """ - materialize_model_(block) - if is_fp8_model(self.model): - for n, m in block.named_modules(): - if is_fp8_linear(m): - new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype, self.device).to(device) - set_module(block, n, new_layer) - - if auto_offload: - # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights - # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: - card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device - ) - else: - block = block.to(device) - card_0_in_high_risk, loss_device = False, device - else: - card_0_in_high_risk, loss_device = False, device - - if len(self.device_list) > 1 and auto_offload: - for n, m in block.named_modules(): - if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): - continue - from accelerate.hooks import AlignDevicesHook, add_hook_to_module - - hook = AlignDevicesHook(m.tuning_device, io_same_device=True) - add_hook_to_module(m, hook, True) - - if q_input is None: - hook_handles = self._register_act_max_hook(block) - - output = self._get_block_outputs( - block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device - ) - - for handle in hook_handles: - handle.remove() - else: - output = self._get_block_outputs( - block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device - ) - hook_handles = self._register_act_max_hook(block) - if hook_handles: - self._get_block_outputs( - block, - q_input, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - self.cache_device, - save_output=False, - ) - - for handle in hook_handles: - handle.remove() - - if q_input is not None: - if input_ids is not q_input: - clear_memory(input_ids, device_list=self.device_list) - else: - clear_memory(device_list=self.device_list) - input_ids = q_input - - quantized_layer_names, unquantized_layer_names = self.wrapper_block( - block, - self.enable_minmax_tuning, - self.enable_norm_bias_tuning, - enable_torch_compile=self.enable_torch_compile, - device=device, - ) - if is_nv_fp(self.data_type): # enable qkv and moe structure global_scale fuse - from auto_round.data_type.utils import update_fused_layer_global_scales - - modules = block.modules() - for module in modules: - update_fused_layer_global_scales(module) - round_params = [] - minmax_params = [] - for n, m in block.named_modules(): - if hasattr(m, "orig_layer"): - for key in m.params.keys(): - if "min" in key or "max" in key: - minmax_params.append(m.params[key]) - else: - round_params.append(m.params[key]) - - lr = torch.tensor(self.lr) - minmax_lr = torch.tensor(self.minmax_lr) - is_adam = "adam" in self.__class__.__name__.lower() - - extra_kwargs = {} if is_adam else {"momentum": self.momentum} - - if self.enable_minmax_tuning: - params = [ - {"params": round_params}, - {"params": minmax_params, "lr": minmax_lr}, - ] - else: - params = round_params - - optimizer = self.optimizer( - params, - lr=lr, - weight_decay=0, - **extra_kwargs, - ) - - if len(round_params) + len(minmax_params) <= 0: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - f"layers in the block" - ) - 
logger.info(dump_info) - unwrapper_block(block, {}) - mv_module_from_gpu(block) - return output, output - - if self.lr_scheduler is None: - lr_schedule = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.iters - ) - else: - lr_schedule = copy.deepcopy(self.lr_scheduler) - - if isinstance(input_ids, dict): # input_ids of Flux is dict - nsamples = len(input_ids["hidden_states"]) - else: - nsamples = len(input_ids) - last_best_iter = 0 - best_loss = torch.finfo(torch.float).max - num_elm = 1 - mse_reduction = "mean" - if self.gradient_accumulate_steps != 1: - mse_reduction = "sum" - mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) - scaler = self._get_scaler() # pylint: disable=assignment-from-none - init_loss = None - best_params = {} - total_loss = 0 - global_batch_size = self.batch_size * self.gradient_accumulate_steps - global_batch_size = min(nsamples, global_batch_size) - # We assume the block input and output shape is same - if self.gradient_accumulate_steps != 1 and not self.attention_mask: - whole_indices = torch.arange(global_batch_size) - num_elm = self._get_current_num_elm(input_ids, whole_indices) - - index_sampler = IndexSampler(nsamples, global_batch_size) - batch_size = self.batch_size - for i in range(self.iters): - if self.enable_alg_ext and self.data_type.endswith("dq"): - for n, m in block.named_modules(): - m.cur_iter = i - total_loss = 0 - global_indices = index_sampler.next_batch() - if self.attention_mask: - num_elm = self._get_non_zero_cnt(self.attention_mask, global_indices) - - for tmp_step in range(self.gradient_accumulate_steps): - indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] - current_output = self._get_current_output(output, indices) - current_output = to_device(current_output, loss_device) - output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device) - loss = self._get_loss(output_q, current_output, indices, mse_loss, device) - num_elm = 1 if num_elm <= 0 else num_elm - total_loss += loss.item() / num_elm - - if self.low_gpu_mem_usage and card_0_in_high_risk: - # clear memory to avoid OOM due to memory fragmentation - clear_memory_if_reached_threshold(threshold=0.5, device_list=self.device_list) - - self._scale_loss_and_backward(scaler, loss) - - if self.low_gpu_mem_usage and card_0_in_high_risk: - # clear memory to avoid OOM due to memory fragmentation - clear_memory_if_reached_threshold(threshold=0.8, device_list=self.device_list) - - if i == 0: - init_loss = total_loss - - if total_loss < best_loss: - best_loss = total_loss - if not self.not_use_best_mse: - best_params = collect_best_params(block, self.cache_device) - # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) - - last_best_iter = i - if self.not_use_best_mse and i == self.iters - 1: - best_params = collect_best_params(block, self.cache_device) - - if not self.not_use_best_mse: - if 0 < self.dynamic_max_gap <= i - last_best_iter: - break - self._step(scaler, optimizer, lr_schedule) - - last_loss = total_loss - best_iter = self.iters - if not self.not_use_best_mse: - last_loss = best_loss - best_iter = last_best_iter - if self.iters > 0: - dump_info = ( - f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" - ) - else: - dump_info = ( - f"quantized 
{len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " - "layers in the block" - ) - - if self.low_gpu_mem_usage: - clear_memory(device_list=self.device_list) # clear cached memory during training - if len(unquantized_layer_names) != 0: - logger.info(f"{unquantized_layer_names} have not been quantized") - with torch.no_grad(): - unwrapper_block(block, best_params) - - if is_nv_fp(self.act_data_type): - # enable moe experts act_max automatic generation for WrapperWALayer - set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") - - if self.enable_quanted_input: - q_outputs = self._get_block_outputs( - block, - input_ids, - input_others, - self.batch_size * self.infer_bs_coeff, - device, - cache_device=self.cache_device, - ) - - if len(self.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - - clear_memory(input_ids, device_list=self.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return q_outputs, output - else: - if len(self.device_list) > 1 and auto_offload: - accelerate.hooks.remove_hook_from_submodules(block) - if auto_offload: - mv_module_from_gpu(block) - clear_memory(input_ids, device_list=self.device_list) - memory_info_summary = memory_monitor.get_summary() - logger.infoclean(dump_info + "," + memory_info_summary) - - return None, output - - def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[torch.Tensor, dict]: - input_ids = inputs[first_input_name] - inputs.pop(first_input_name, None) - input_others = inputs - return input_ids, input_others - - def _preprocess_block_inputs(self, inputs, first_input_name="input_ids"): - input_ids, input_others = self._split_inputs(inputs, first_input_name) - clear_memory(device_list=self.device_list) - input_ids = to_device(input_ids, self.cache_device) - input_others = to_device(input_others, self.cache_device) - # As in calibration phase, we may use bf16 for calibration due to low_gpu_memory usage - - tmp_dtype = self.amp_dtype if self.amp else torch.float32 - input_ids = to_dtype(input_ids, tmp_dtype) - - for key in input_others.keys(): - if isinstance(input_others[key], torch.Tensor) and ( - input_others[key].dtype == torch.float16 or input_others[key].dtype == torch.bfloat16 - ): - input_others[key] = input_others[key].to(tmp_dtype) - elif isinstance(input_others[key], list): - for i in range(len(input_others[key])): - to_dtype(input_others[key][i], tmp_dtype) - return input_ids, input_others - - def _quantize_blocks( - self, - model: torch.nn.Module, - inputs: dict, - block_names: list, - q_input: torch.Tensor = None, - nblocks: int = 1, - device: str = "cpu", - pbar: tqdm = None, - ): - """Quantize and dequantize the weights of the specified blocks in the model. - - Args: - model: The PyTorch model to be quantized. - inputs: The input data for quantization. - block_names: The names of the blocks to be quantized and dequantized. - nblocks: The number of blocks to quantize and dequantize. - device: The device for quantization and dequantization. 
- - Returns: - None - """ - clear_memory(device_list=self.device_list) - for n, m in model.named_parameters(): - m.requires_grad_(False) - - input_ids, input_others = self._preprocess_block_inputs(inputs) - - if pbar is None: - pbar = tqdm(range(0, len(block_names), nblocks)) - - for i in range(0, len(block_names), nblocks): - if i != 0: - pbar.update(1) - if nblocks == 1: - n = block_names[i] - pbar.set_description(f"Quantizing {n}") - m = get_module(model, n) - else: - names = block_names[i : min(i + nblocks, len(block_names))] - pbar.set_description(f"Quantizing [{i + 1}-{min(i + nblocks, len(block_names))}]/{len(block_names)}") - modules = [get_module(model, n) for n in names] - m = WrapperMultiblock(modules) - - m.config = model.config if hasattr(model, "config") else None - q_input, input_ids = self._quantize_block( - m, - input_ids, - input_others, - q_input=q_input, - device=device, - ) - if hasattr(model, "config"): - del m.config - if self.is_immediate_packing: - for n, tmp_m in m.named_modules(): - if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): - continue - self._immediate_pack(tmp_m.global_name) - - if self.is_immediate_saving: - shard_writer(self, m, is_finalize=False) - if pbar is not None: - pbar.update(1) - - if not self.is_immediate_saving: - self.model = mv_module_from_gpu(self.model) - for n, m in self.model.named_modules(): - if hasattr(m, "name"): - delattr(m, "name") - - del q_input - del input_ids - del input_others - del inputs - - clear_memory(device_list=self.device_list) - def save_quantized( self, output_dir: str = None, @@ -3166,30 +1774,6 @@ def save_quantized( else: return compressed_model - def _get_quantized_layer_names_outside_blocks(self) -> list: - """Gets the names of quantized layers outside blocks in the model. - - Returns: - list: List of layer names outside blocks. - """ - if self.layer_config is None or len(self.layer_config) == 0: - return [] - - layer_names = [] - all_layers_in_block = get_layer_names_in_block(self.model, self.supported_types, self.quant_block_list) - - for key in self.layer_config.keys(): - if key in all_layers_in_block: - continue - layer = get_module(self.model, key) - if layer is None: - logger.error(f"could not find layer {key} in the model, exit...") - exit(-1) - if type(layer) in self.supported_types and check_to_quantized(self.layer_config[key]): - layer_names.append(key) - - return layer_names - def _set_amp_dtype(self) -> None: """Sets the automatic mixed precision (AMP) data type for the model based on the device and configuration.""" self.amp_dtype = torch.bfloat16 @@ -3212,55 +1796,6 @@ def _set_amp_dtype(self) -> None: self.amp_dtype = torch.float32 self.model = self.model.to(torch.float32) - def _get_optimizer(self, optimizer: Any): - """Returns the specified optimizer. In SignRound, we fix the optimizer. - - Args: - optimizer: The optimizer to be used. - - Returns: - The specified optimizer. - """ - return SignSGD - - def _get_scaler(self): - """Returns scaler, in SignRound, no need to use scaler.""" - return None - - def _scale_loss_and_backward(self, scaler: Any, loss: torch.Tensor) -> torch.Tensor: - """Scales the loss and performs backward pass. - - Args: - scaler: The scaler to be used. - loss: The loss to be scaled. - - Returns: - The scaled loss. - """ - scale_loss = loss * 1000 - scale_loss.backward() - if is_hpex_available(): - htcore.mark_step() - return scale_loss - - def _step(self, scaler: Any, optimizer: Any, lr_schedule: Any): - """Performs a step in the optimization process. 
- - Args: - scaler: The scaler to be used. - optimizer: The optimizer for the step. - lr_schedule: The learning rate schedule. - - Returns: - None - """ - optimizer.step() - # for hpu - if is_hpex_available(): - htcore.mark_step() - optimizer.zero_grad() - lr_schedule.step() - @classmethod @torch.no_grad() def _sampling_inputs( diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py old mode 100644 new mode 100755 index 6d9580e4f..f6ffd6741 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -165,25 +165,6 @@ def __init__( **kwargs, ) - def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: - # flux transformer model's blocks will update hidden_states and encoder_hidden_states - input_id_str = [key for key in inputs.keys() if "hidden_state" in key] - if q_inputs is not None: - q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str} - return inputs, q_inputs - - def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[dict, dict]: - input_id_str = [key for key in inputs.keys() if "hidden_state" in key] - input_ids = {k: inputs.pop(k, None) for k in input_id_str} - input_others = inputs - return input_ids, input_others - - def _get_current_output(self, output: dict, indices: list[int]) -> torch.Tensor: - assert "hidden_states" in output - current_output = [output["hidden_states"][x] for x in indices] - current_output = torch.cat(current_output, dim=self.batch_dim) - return current_output - def _get_current_q_output( self, block: torch.nn.Module, diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py old mode 100644 new mode 100755 diff --git a/auto_round/quantizers/__init__.py b/auto_round/quantizers/__init__.py new file mode 100644 index 000000000..87ac77b62 --- /dev/null +++ b/auto_round/quantizers/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_round.quantizers.entrypoint import create_quantizers diff --git a/auto_round/quantizers/algs/__init__.py b/auto_round/quantizers/algs/__init__.py new file mode 100755 index 000000000..14a492441 --- /dev/null +++ b/auto_round/quantizers/algs/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
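The hunk in auto_round/compressors/base.py earlier in this diff replaces the in-class _quantize_block path: quantize_block now builds its inputs with preprocess_block_inputs and hands the tuning loop to ARQuantizer from the new auto_round.quantizers package. The sketch below restates that call path outside the compressor for orientation only; it is not part of the patch. It assumes an already-initialized BaseCompressor with cached block inputs, skips normalize_decoding_layer_inputs_, and the helper name run_block_tuning plus its default arguments are illustrative.

import torch

from auto_round.quantizers.algs.auto_round import ARQuantizer
from auto_round.quantizers.utils import preprocess_block_inputs


def run_block_tuning(compressor, block: torch.nn.Module, q_input=None, device="cpu", auto_offload=True):
    # Hypothetical helper mirroring the refactored BaseCompressor.quantize_block hunk above.
    block_inputs = compressor.inputs[compressor.quant_block_list[0][0]]
    # Split the cached calibration data into the first input ("hidden_states" for decoding
    # layers) and the remaining kwargs, cast to the AMP dtype on the cache device.
    input_ids, input_others = preprocess_block_inputs(
        block_inputs,
        device_list=compressor.device_list,
        first_input_name="hidden_states",
        amp=compressor.amp,
        amp_dtype=compressor.amp_dtype,
        cache_device=compressor.cache_device,
        is_diffusion=compressor.diffusion,
    )
    # ARQuantizer wraps the compressor and runs the SignRound tuning loop for the block.
    # Per its docstring it returns (q_outputs, outputs), or (None, outputs) when
    # enable_quanted_input is off, so the quantized outputs can feed the next block.
    ar_quantizer = ARQuantizer(compressor)
    return ar_quantizer.quantize_block(block, input_ids, input_others, q_input, device, auto_offload)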
diff --git a/auto_round/quantizers/algs/auto_round.py b/auto_round/quantizers/algs/auto_round.py new file mode 100755 index 000000000..5970b467a --- /dev/null +++ b/auto_round/quantizers/algs/auto_round.py @@ -0,0 +1,1021 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import time +import traceback +from contextlib import nullcontext +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import accelerate +import torch +from accelerate.big_modeling import dispatch_model, infer_auto_device_map +from torch import autocast +from tqdm import tqdm + +from auto_round.compressors.shard_writer import shard_writer +from auto_round.compressors.utils import ( + IndexSampler, + check_need_act_calibration, + collect_best_params, + is_nv_fp, +) +from auto_round.logger import logger +from auto_round.modelling.replace_modules import materialize_model_, safe_to_cpu_ +from auto_round.quantizers.algs.base import AlgsBaseQuantizer +from auto_round.quantizers.utils import ( + get_non_zero_cnt, + get_quantized_layer_names_outside_blocks, + preprocess_block_inputs, + quantize_embedding_layer, + register_act_max_hook, + update_inputs, +) +from auto_round.sign_sgd import SignSGD +from auto_round.utils import ( + check_is_cpu, + check_to_quantized, + clear_memory, + convert_fp8_layer_to_linear, + get_block_names, + get_module, + htcore, + is_auto_device_mapping, + is_fp8_linear, + is_fp8_model, + is_hpex_available, + memory_monitor, + mv_module_from_gpu, + set_amax_for_all_moe_layers, + set_module, + to_device, +) +from auto_round.utils.device import ( + clear_memory_if_reached_threshold, + set_auto_device_map_for_block_with_tuning, +) +from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class ARQuantizer(AlgsBaseQuantizer): + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + self.all_blocks = [] + self.layer_names = [] + self.all_q_inputs = None + self.optimizer = self._get_optimizer(None) + self.is_adam = False + + def _pre_quantize_impl(self, *args, **kwargs): + if bool(self.compressor.quant_block_list): + self.all_blocks = self.compressor.quant_block_list + else: + self.all_blocks = get_block_names(self.compressor.model) + if len(self.all_blocks) == 0: + logger.warning("could not find blocks, exit with original model") + return self.compressor.model, self.compressor.layer_config + + if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: + self.compressor.model = self.compressor.model.to(self.compressor.amp_dtype) + + self.layer_names = get_quantized_layer_names_outside_blocks( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + supported_types=self.compressor.supported_types, + quant_block_list=self.compressor.quant_block_list, + ) + self.all_first_block_names = [block[0] for block in self.all_blocks] + if 
len(self.layer_names) > 0: + logger.info( + "Starting to cache block inputs. This may be slow due to external block layers: %s", self.layer_names + ) + else: + logger.info("start to cache block inputs") + + # TODO: refactor this + self.all_inputs = self.compressor.try_cache_inter_data_gpucpu( + self.all_first_block_names, self.compressor.nsamples, layer_names=self.layer_names + ) + is_quantized_embedding = quantize_embedding_layer( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + scale_dtype=self.compressor.scale_dtype, + disable_opt_rtn=self.compressor.disable_opt_rtn, + device=self.compressor.device, + device_list=self.compressor.device_list, + ) + clear_memory(device_list=self.compressor.device_list) + if is_quantized_embedding: + self.all_inputs = copy.deepcopy(self.compressor.inputs) + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + # TODO: refactor this + self.all_q_inputs = self.compressor.try_cache_inter_data_gpucpu( + self.all_first_block_names, self.compressor.nsamples, layer_names=self.layer_names + ) + self.compressor.model = mv_module_from_gpu(self.compressor.model) + clear_memory(device_list=self.compressor.device_list) + + if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules( + self.compressor.model + ) # self.compressor.model.hf_device_map has not been changed + logger.info("caching done") + + def _quantize_impl(self, *args, **kwargs): + start_time = time.time() + + if len(self.all_blocks) > 1: + pbar = tqdm(range(0, sum([len(i) for i in self.all_blocks]), self.compressor.nblocks)) + else: + pbar = tqdm(range(0, len(self.all_blocks[0]), self.compressor.nblocks)) # move the alg warning outside pbar + + for block_names in self.all_blocks: + inputs = self.all_inputs[block_names[0]] + self.all_inputs.pop(block_names[0]) + q_inputs = None + if self.all_q_inputs is not None: + q_inputs = self.all_q_inputs[block_names[0]] + self.all_q_inputs.pop(block_names[0]) + + inputs, q_inputs = update_inputs(inputs, q_inputs, self.compressor.diffusion) + + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + + if "input_ids" in inputs.keys(): + total_samples = len(inputs["input_ids"]) + if total_samples < self.compressor.batch_size: + self.compressor.batch_size = total_samples + logger.warning(f"force the train batch size to {total_samples}") + + self._quantize_blocks( + self.compressor.model, + inputs, + block_names, + q_input=q_inputs if q_inputs is not None else None, + nblocks=self.compressor.nblocks, + device=self.compressor.device, + pbar=pbar, + ) + if self.compressor.is_immediate_packing and len(self.compressor.formats) != 1: + raise ValueError( + f"Expected exactly one packing format when 'immediate_packing' is True, " + f"but got {len(self.compressor.formats)} formats." 
+ ) + pbar.set_description("Quantizing done") + pbar.close() + self._quantize_layers(self.layer_names, self.all_inputs) + + end_time = time.time() + cost_time = end_time - start_time + logger.info(f"quantization tuning time {cost_time}") + + return self.compressor.model, self.compressor.layer_config + + def _post_quantize_impl(self, *args, **kwargs): + if is_fp8_model(self.compressor.model): + for n, m in self.compressor.model.named_modules(): + if is_fp8_linear(m): + new_layer = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device).to( + "cpu" + ) + set_module(self.compressor.model, n, new_layer) + + # Dump a summary + quantized_layers = [] + unquantized_layers = [] + for n, m in self.compressor.model.named_modules(): + if isinstance(m, tuple(self.compressor.supported_types)): + if check_to_quantized(m): + quantized_layers.append(n) + else: + unquantized_layers.append(n) + elif hasattr(m, "scales") or hasattr(m, "scale"): ##packing_immediately + quantized_layers.append(n) + summary_info = ( + f"Summary: quantized {len(quantized_layers)}/{len(quantized_layers) + len(unquantized_layers)} layers in the model" + ) + if len(unquantized_layers) > 0: + summary_info += f", {unquantized_layers} have not been quantized" + logger.info(summary_info) + + self.compressor.quantized = True + + def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None: + """Quantizes specified layers based on inputs and configuration. + + Args: + layer_names (list): list of layer names to quantize. + layer_inputs (dict): Dictionary mapping layer names to input data. + + Returns: + None + """ + # TODO currently we take all the layers outside blocks as post block layers which is not optimal + # if there is no input for layer, we use rtn + + for layer_name in copy.deepcopy(layer_names): + if layer_name not in layer_inputs: + if self.compressor.act_bits < 16 and not self.compressor.act_dynamic: + # Activation quantization requires collected inputs + msg_prefix = ( + f"Activation max hook for layer '{layer_name}' is unavailable due to " + f"insufficient collected inputs. " + ) + if "fp8_e5m2" in self.compressor.act_data_type: + logger.warning(msg_prefix + "Please note that unit scale is used for this layer.") + else: + logger.warning( + msg_prefix + "Static activation quantization is not supported or ineffective. " + "Skipping quantization for this layer."
+ ) + layer_names.remove(layer_name) + continue + logger.info(f"using rtn to quantize {layer_name}") + from auto_round.data_type import QUANT_FUNC_WITH_DTYPE + + layer = get_module(self.compressor.model, layer_name) + layer = layer.to(self.compressor.device) + if is_fp8_linear(layer): + new_layer = convert_fp8_layer_to_linear( + layer, self.compressor.amp_dtype, self.compressor.device + ).to(self.compressor.device) + set_module(self.compressor.model, layer_name, new_layer) + layer = new_layer + + wrapper_layer = WrapperLinear( + layer, + enable_round_tuning=False, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + device=self.compressor.device, + disable_opt_rtn=self.compressor.disable_opt_rtn, + ) + new_layer = wrapper_layer.unwrapper({}) + set_module(self.compressor.model, layer_name, new_layer) + layer.cpu() + layer_names.remove(layer_name) + if len(layer_names) == 0: + memory_monitor.update() + memory_monitor.log_summary() + return + q_layer_inputs = None + enable_quanted_input = self.compressor.enable_quanted_input + has_gguf = False + + if hasattr(self.compressor, "formats"): + has_gguf = any(format_.is_gguf() for format_ in self.compressor.formats) + if has_gguf and self.compressor.is_immediate_packing: + enable_quanted_input = False + + if ( + hasattr(self.compressor.model, "hf_device_map") + and len(self.compressor.model.hf_device_map) > 1 + and enable_quanted_input + ): + dispatch_model(self.compressor.model, self.compressor.model.hf_device_map) + + if enable_quanted_input: + logger.info("starting to cache layer inputs for %s, this may be quite slow ", layer_names) + # TODO: refactor this + q_layer_inputs = self.compressor.try_cache_inter_data_gpucpu( + [], self.compressor.nsamples, layer_names=layer_names + ) + if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules( + self.compressor.model + ) # self.compressor.model.hf_device_map has not been changed + if not self.compressor.is_immediate_saving: + self.compressor.model = mv_module_from_gpu(self.compressor.model) + clear_memory(device_list=self.compressor.device_list) + for layer_name in layer_names: + layer_input = layer_inputs[layer_name] + layer_input = to_device(layer_input, self.compressor.cache_device) + q_layer_input = q_layer_inputs.get(layer_name, None) if q_layer_inputs is not None else None + q_layer_input = to_device(q_layer_input, self.compressor.cache_device) + self.quantize_layer(layer_name, layer_input, q_layer_input, device=self.compressor.device) + if self.compressor.is_immediate_packing: + self.compressor._immediate_pack(layer_name) + + if self.compressor.is_immediate_saving: + m = get_module(self.compressor.model, layer_name) + shard_writer(self.compressor, m, name=layer_name, last_group=True) + del layer_input + clear_memory(q_layer_input, device_list=self.compressor.device_list) + memory_monitor.log_summary() + + def _quantize_blocks( + self, + model: torch.nn.Module, + inputs: dict, + block_names: list, + q_input: torch.Tensor = None, + nblocks: int = 1, + device: str = "cpu", + pbar: tqdm = None, + ): + """Quantize and dequantize the weights of the specified blocks in the model. + + Args: + model: The PyTorch model to be quantized. + inputs: The input data for quantization. + block_names: The names of the blocks to be quantized and dequantized. + nblocks: The number of blocks to quantize and dequantize. 
+ device: The device for quantization and dequantization. + + Returns: + None + """ + clear_memory(device_list=self.compressor.device_list) + for n, m in model.named_parameters(): + m.requires_grad_(False) + + input_ids, input_others = preprocess_block_inputs( + inputs, + device_list=self.compressor.device_list, + first_input_name="input_ids", + amp=self.compressor.amp, + amp_dtype=self.compressor.amp_dtype, + cache_device=self.compressor.cache_device, + is_diffusion=self.compressor.diffusion, + ) + + if pbar is None: + pbar = tqdm(range(0, len(block_names), nblocks)) + + for i in range(0, len(block_names), nblocks): + if i != 0: + pbar.update(1) + if nblocks == 1: + n = block_names[i] + pbar.set_description(f"Quantizing {n}") + m = get_module(model, n) + else: + names = block_names[i : min(i + nblocks, len(block_names))] + pbar.set_description(f"Quantizing [{i + 1}-{min(i + nblocks, len(block_names))}]/{len(block_names)}") + modules = [get_module(model, n) for n in names] + m = WrapperMultiblock(modules) + + m.config = model.config if hasattr(model, "config") else None + q_input, input_ids = self.quantize_block( + m, input_ids, input_others, q_input=q_input, device=device, last_group=(i + nblocks) >= len(block_names) + ) + if hasattr(model, "config"): + del m.config + if self.compressor.is_immediate_packing: + for n, tmp_m in m.named_modules(): + if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): + continue + self.compressor._immediate_pack(tmp_m.global_name) + + if self.compressor.is_immediate_saving: + shard_writer(self.compressor, m, is_finalize=False) + if pbar is not None: + pbar.update(1) + + if not self.compressor.is_immediate_saving: + self.compressor.model = mv_module_from_gpu(self.compressor.model) + for n, m in self.compressor.model.named_modules(): + if hasattr(m, "name"): + delattr(m, "name") + + del q_input + del input_ids + del input_others + del inputs + + clear_memory(device_list=self.compressor.device_list) + + def _quantize_layer_impl( + self, layer_name: str, inputs: torch.Tensor, q_inputs: torch.Tensor = None, device: str = "cpu" + ): + """Quantize a specific layer of the model using the provided inputs. + + Args: + layer_name (str): The name of the layer to quantize. + inputs (torch.Tensor): Input data for quantization. + q_inputs (torch.Tensor, optional): Quantized input data. Defaults to None. + device (torch.device, optional): The device to use for quantization. Defaults to torch.device("cpu"). 
+ + Returns: + None + """ + logger.info(f"quantizing layer {layer_name}") + layer = get_module(self.compressor.model, layer_name) + if hasattr(layer, "tuning_device"): + device = layer.tuning_device + + layer = layer.to(device) + for i in range(len(inputs)): + inputs[i] = inputs[i].to(layer.weight.dtype) + if q_inputs is not None: + q_inputs[i] = q_inputs[i].to(layer.weight.dtype) + + if self.compressor.act_bits <= 8 and check_need_act_calibration( + self.compressor.act_dynamic, + self.compressor.act_data_type, + self.compressor.act_bits, + self.compressor.static_kv_dtype, + self.compressor.static_attention_dtype, + ): + tmp_inputs = q_inputs if q_inputs is not None else inputs + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + with torch.no_grad(): + for input in tmp_inputs: + layer(input) + for handle in hook_handles: + handle.remove() + + wrapper_linear = WrapperLinear( + layer, + enable_minmax_tuning=self.compressor.enable_minmax_tuning, + enable_torch_compile=self.compressor.enable_torch_compile, + device=device, + ).to(device) + round_params = [] + minmax_params = [] + for key in wrapper_linear.params.keys(): + if "min" in key or "max" in key: + minmax_params.append(wrapper_linear.params[key]) + else: + round_params.append(wrapper_linear.value) + if len(round_params) + len(minmax_params) <= 0: + dump_info = f"quantized {layer_name}" + logger.info(dump_info) + with torch.no_grad(): + unwrapper_layer(self.compressor.model, wrapper_linear, layer_name, {}) + mv_module_from_gpu(layer) + + lr = torch.tensor(self.compressor.lr) + minmax_lr = torch.tensor(self.compressor.minmax_lr) + if self.compressor.enable_minmax_tuning: + optimizer = self.optimizer( + [{"params": round_params}, {"params": minmax_params, "lr": minmax_lr}], lr=lr, weight_decay=0 + ) + else: + optimizer = self.optimizer([{"params": round_params}], lr=lr, weight_decay=0) + if self.compressor.lr_scheduler is None: + lr_schedule = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.compressor.iters + ) + else: + lr_schedule = copy.deepcopy(self.compressor.lr_scheduler) + nsamples = len(inputs) + last_best_iter = 0 + best_loss = torch.finfo(torch.float).max + scaler = self._get_scaler() # pylint: disable=assignment-from-none + init_loss = None + gradient_accumulate_steps = self.compressor.batch_size # Force to low gpu + total_loss = 0 + num_elm = 1 + mse_reduction = "mean" + if gradient_accumulate_steps != 1: + mse_reduction = "sum" + mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) + batch_size = 1 # Force to low gpu + global_batch_size = self.compressor.batch_size * gradient_accumulate_steps + global_batch_size = min(nsamples, global_batch_size) + if gradient_accumulate_steps != 1 and not self.compressor.attention_mask: + whole_indices = torch.arange(global_batch_size) + if q_inputs is not None: + # Todo: refactor this + num_elm = self.compressor._get_current_num_elm(q_inputs, whole_indices) + else: + num_elm = self.compressor._get_current_num_elm(inputs, whole_indices) + + index_sampler = IndexSampler(nsamples, global_batch_size) + + for i in range(self.compressor.iters): + total_loss = 0 + global_indices = index_sampler.next_batch() + if self.compressor.attention_mask: + num_elm = get_non_zero_cnt(self.compressor.attention_mask, global_indices) + + for tmp_step in range(gradient_accumulate_steps): + 
indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + if q_inputs is not None: + current_input = [q_inputs[i] for i in indices] + current_input = torch.cat(current_input, dim=0).to(device) + org_input = [inputs[i] for i in indices] + org_input = torch.cat(org_input, dim=0).to(device) + else: + current_input = [inputs[i] for i in indices] + current_input = torch.cat(current_input, dim=0).to(device) + org_input = current_input + with torch.no_grad(): + current_output = layer(org_input) + autocast_ctx = ( + nullcontext() + if not self.compressor.amp + else autocast(device_type=str(device).split(":")[0], dtype=self.compressor.amp_dtype) + ) + if self.compressor.attention_mask: + tmp_attention_mask = [self.compressor.attention_mask[i] for i in indices] + tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) + tmp_attention_mask.unsqueeze_(-1) + + with autocast_ctx: + output_q = wrapper_linear(current_input) # pylint: disable=not-callable + loss = mse_loss( # pylint: disable=not-callable + (output_q * tmp_attention_mask).to(torch.float32), + (current_output * tmp_attention_mask).to(torch.float32), + ) + + else: + with autocast_ctx: + output_q = wrapper_linear(current_input) # pylint: disable=not-callable + loss = mse_loss( # pylint: disable=not-callable + output_q.to(torch.float32), + current_output.to(torch.float32), # mul 1.0 will copy the output + ) + + num_elm = 1 if num_elm <= 0 else num_elm + total_loss += loss.item() / num_elm + + self._scale_loss_and_backward(scaler, loss) + if i == 0: + init_loss = total_loss + + if total_loss < best_loss: + best_loss = total_loss + if not self.compressor.not_use_best_mse: + best_params = collect_best_params(wrapper_linear, self.compressor.cache_device) + last_best_iter = i + if self.compressor.not_use_best_mse and i == self.compressor.iters - 1: + best_params = collect_best_params(wrapper_linear, self.compressor.cache_device) + + if not self.compressor.not_use_best_mse: + if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: + break + self._step(scaler, optimizer, lr_schedule) + + last_loss = total_loss + best_iter = self.compressor.iters + if not self.compressor.not_use_best_mse: + last_loss = best_loss + best_iter = last_best_iter + with torch.no_grad(): + unwrapper_layer(self.compressor.model, wrapper_linear, layer_name, best_params) + mv_module_from_gpu(layer) + dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" + logger.info(dump_info) + + def _quantize_block_impl( + self, + block: torch.nn.Module, + input_ids: Union[list[torch.Tensor], dict], + input_others: dict, + q_input: Union[torch.Tensor, dict, None] = None, + device: Union[str, torch.device] = "cpu", + auto_offload=True, + **kwargs, + ): + """Quantize the weights of a given block of the model. + + Args: + block: The block of the model to be quantized. + input_ids: The input tensor containing tokenized input ids. + input_others: A dictionary containing additional input data. + q_input: The quantized input tensor. + device: The device for quantization. 
+ + Returns: + Tuple: (q_outputs, output) if self.enable_quanted_input is True, else (None, output) + """ + materialize_model_(block) + if is_fp8_model(self.compressor.model): + for n, m in block.named_modules(): + if is_fp8_linear(m): + new_layer = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device).to( + device + ) + set_module(block, n, new_layer) + + if auto_offload: + # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights + # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk + if is_auto_device_mapping(self.compressor.device_map) and len(self.compressor.device_list) > 1: + card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( + block, + self.compressor.device_map, + input_ids, + self.compressor.low_gpu_mem_usage, + self.compressor.batch_size, + device, + ) + else: + block = block.to(device) + card_0_in_high_risk, loss_device = False, device + else: + card_0_in_high_risk, loss_device = False, device + + if len(self.compressor.device_list) > 1 and auto_offload: + for n, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + hook = AlignDevicesHook(m.tuning_device, io_same_device=True) + add_hook_to_module(m, hook, True) + + if q_input is None: + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + + # TODO: refactor this part + output = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + ) + + for handle in hook_handles: + handle.remove() + else: + # TODO: refactor this part + output = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + ) + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + if hook_handles: + # TODO: refactor this part + self.compressor._get_block_outputs( + block, + q_input, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + self.compressor.cache_device, + save_output=False, + ) + + for handle in hook_handles: + handle.remove() + + if q_input is not None: + if input_ids is not q_input: + clear_memory(input_ids, device_list=self.compressor.device_list) + else: + clear_memory(device_list=self.compressor.device_list) + input_ids = q_input + + quantized_layer_names, unquantized_layer_names = self.compressor.wrapper_block( + block, + self.compressor.enable_minmax_tuning, + self.compressor.enable_norm_bias_tuning, + enable_torch_compile=self.compressor.enable_torch_compile, + device=device, + ) + if is_nv_fp(self.compressor.data_type): # enable qkv and moe structure global_scale fuse + from auto_round.data_type.utils import update_fused_layer_global_scales + + modules = block.modules() + for module in modules: + update_fused_layer_global_scales(module) + round_params = [] + minmax_params = [] + for n, m in block.named_modules(): + if hasattr(m, "orig_layer"): + for key in m.params.keys(): + if 
"min" in key or "max" in key: + minmax_params.append(m.params[key]) + else: + round_params.append(m.params[key]) + + lr = torch.tensor(self.compressor.lr) + minmax_lr = torch.tensor(self.compressor.minmax_lr) + + extra_kwargs = {} if self.is_adam else {"momentum": self.compressor.momentum} + + if self.compressor.enable_minmax_tuning: + params = [ + {"params": round_params}, + {"params": minmax_params, "lr": minmax_lr}, + ] + else: + params = round_params + + optimizer = self.optimizer( + params, + lr=lr, + weight_decay=0, + **extra_kwargs, + ) + + if len(round_params) + len(minmax_params) <= 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block" + ) + logger.info(dump_info) + unwrapper_block(block, {}) + mv_module_from_gpu(block) + return output, output + + if self.compressor.lr_scheduler is None: + lr_schedule = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=1.0, end_factor=0.0, total_iters=self.compressor.iters + ) + else: + lr_schedule = copy.deepcopy(self.compressor.lr_scheduler) + + if isinstance(input_ids, dict): # input_ids of Flux is dict + nsamples = len(input_ids["hidden_states"]) + else: + nsamples = len(input_ids) + last_best_iter = 0 + best_loss = torch.finfo(torch.float).max + num_elm = 1 + mse_reduction = "mean" + if self.compressor.gradient_accumulate_steps != 1: + mse_reduction = "sum" + mse_loss = torch.nn.MSELoss(reduction=mse_reduction).to(device) + scaler = self._get_scaler() # pylint: disable=assignment-from-none + init_loss = None + best_params = {} + total_loss = 0 + global_batch_size = self.compressor.batch_size * self.compressor.gradient_accumulate_steps + global_batch_size = min(nsamples, global_batch_size) + # We assume the block input and output shape is same + if self.compressor.gradient_accumulate_steps != 1 and not self.compressor.attention_mask: + whole_indices = torch.arange(global_batch_size) + num_elm = self.compressor._get_current_num_elm(input_ids, whole_indices) + + index_sampler = IndexSampler(nsamples, global_batch_size) + batch_size = self.compressor.batch_size + for i in range(self.compressor.iters): + if self.compressor.enable_alg_ext and self.compressor.data_type.endswith("dq"): + for n, m in block.named_modules(): + m.cur_iter = i + total_loss = 0 + global_indices = index_sampler.next_batch() + if self.compressor.attention_mask: + num_elm = get_non_zero_cnt(self.compressor.attention_mask, global_indices) + + for tmp_step in range(self.compressor.gradient_accumulate_steps): + indices = global_indices[tmp_step * batch_size : (tmp_step + 1) * batch_size] + current_output = self._get_current_output( + output, indices, self.compressor.batch_dim, diffusion=self.compressor.diffusion + ) + current_output = to_device(current_output, loss_device) + # TODO: refactor this + output_q = self.compressor._get_current_q_output( + block, input_ids, input_others, indices, device, loss_device + ) + # TODO: refactor this + loss = self.compressor._get_loss(output_q, current_output, indices, mse_loss, device) + num_elm = 1 if num_elm <= 0 else num_elm + total_loss += loss.item() / num_elm + + if self.compressor.low_gpu_mem_usage and card_0_in_high_risk: + # clear memory to avoid OOM due to memory fragmentation + clear_memory_if_reached_threshold(threshold=0.5, device_list=self.compressor.device_list) + + self._scale_loss_and_backward(scaler, loss) + if self.compressor.low_gpu_mem_usage and card_0_in_high_risk: + # clear memory to avoid OOM due to memory 
fragmentation + clear_memory_if_reached_threshold(threshold=0.8, device_list=self.compressor.device_list) + + if i == 0: + init_loss = total_loss + + if total_loss < best_loss: + best_loss = total_loss + if not self.compressor.not_use_best_mse: + best_params = collect_best_params(block, self.compressor.cache_device) + # print(f"get better result at iter {i}, the loss is {total_loss}", flush=True) + + last_best_iter = i + if self.compressor.not_use_best_mse and i == self.compressor.iters - 1: + best_params = collect_best_params(block, self.compressor.cache_device) + + if not self.compressor.not_use_best_mse: + if 0 < self.compressor.dynamic_max_gap <= i - last_best_iter: + break + self._step(scaler, optimizer, lr_schedule) + + last_loss = total_loss + best_iter = self.compressor.iters + if not self.compressor.not_use_best_mse: + last_loss = best_loss + best_iter = last_best_iter + if self.compressor.iters > 0: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}" + ) + else: + dump_info = ( + f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} " + "layers in the block" + ) + + if self.compressor.low_gpu_mem_usage: + clear_memory(device_list=self.compressor.device_list) # clear cached memory during training + if len(unquantized_layer_names) != 0: + logger.info(f"{unquantized_layer_names} have not been quantized") + with torch.no_grad(): + unwrapper_block(block, best_params) + + if is_nv_fp(self.compressor.act_data_type): + # enable moe experts act_max automatic generation for WrapperWALayer + set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max") + + if self.compressor.enable_quanted_input: + # TODO: refactor this + q_outputs = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + device, + cache_device=self.compressor.cache_device, + ) + + if len(self.compressor.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + + clear_memory(input_ids, device_list=self.compressor.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return q_outputs, output + else: + if len(self.compressor.device_list) > 1 and auto_offload: + accelerate.hooks.remove_hook_from_submodules(block) + if auto_offload: + mv_module_from_gpu(block) + clear_memory(input_ids, device_list=self.compressor.device_list) + memory_info_summary = memory_monitor.get_summary() + logger.infoclean(dump_info + "," + memory_info_summary) + + return None, output + + def _post_quantize_block_impl(self, block: torch.nn.Module, *args, last_group: bool, **kwargs): + """Post-process after quantizing a block. + + Args: + block: The block of the model that was quantized. 
+ + Returns: + None + """ + if hasattr(block, "config"): + del block.config + if self.compressor.is_immediate_packing: + for _, tmp_m in block.named_modules(): + if not (hasattr(tmp_m, "bits") and check_to_quantized(tmp_m)): + continue + self.compressor._immediate_pack(tmp_m.global_name) + + if self.compressor.is_immediate_saving: + shard_writer(self.compressor, block, last_group=last_group) + + @staticmethod + def _get_current_output( + output: list[torch.Tensor], indices: list[int], batch_dim: int, diffusion: bool = False + ) -> torch.Tensor: + if diffusion: + assert "hidden_states" in output + current_output = [output["hidden_states"][x] for x in indices] + current_output = torch.cat(current_output, dim=batch_dim) + return current_output + + current_output = [output[x] for x in indices] + current_output = torch.cat(current_output, dim=batch_dim) + return current_output + + def _step(self, scaler: Any, optimizer: Any, lr_schedule: Any): + """Performs a step in the optimization process. + + Args: + scaler: The scaler to be used. + optimizer: The optimizer for the step. + lr_schedule: The learning rate schedule. + + Returns: + None + """ + optimizer.step() + # for hpu + if is_hpex_available(): + htcore.mark_step() + optimizer.zero_grad() + lr_schedule.step() + + def _scale_loss_and_backward(self, scaler: Any, loss: torch.Tensor) -> torch.Tensor: + """Scales the loss and performs backward pass. + + Args: + scaler: The scaler to be used. + loss: The loss to be scaled. + + Returns: + The scaled loss. + """ + scale_loss = loss * 1000 + scale_loss.backward() + if is_hpex_available(): + htcore.mark_step() + return scale_loss + + def _get_scaler(self): + """Returns scaler, in SignRound, no need to use scaler.""" + return None + + def _get_optimizer(self, optimizer: Any): + """Returns the specified optimizer. In SignRound, we fix the optimizer. + + Args: + optimizer: The optimizer to be used. + + Returns: + The specified optimizer. + """ + return SignSGD + + +class ARAdamQuantizer(ARQuantizer): + """AutoRound Quantizer with Adam optimizer.""" + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + self.optimizer = self._get_optimizer("AdamW") + self.is_adam = True + + def _step(self, scaler, optimizer, lr_schedule): + if scaler is not None: + scaler.step(optimizer) + optimizer.zero_grad() + lr_schedule.step() + scaler.update() + else: + optimizer.step() + optimizer.zero_grad() + lr_schedule.step() + if is_hpex_available(): + htcore.mark_step() + + def _scale_loss_and_backward(self, scaler, loss): + if scaler is not None: + loss = scaler.scale(loss) + + loss.backward() + if is_hpex_available(): + htcore.mark_step() + return loss + + def _get_scaler(self): + scaler = None + if self.compressor.amp and not check_is_cpu(self.compressor.device): + from torch.cuda.amp import GradScaler + + scaler = GradScaler(init_scale=1024, growth_interval=100000) + return scaler + + def _get_optimizer(self, optimizer): + if optimizer is None: + optimizer = torch.optim.AdamW + elif isinstance(optimizer, str): + optimizer = getattr(torch.optim, optimizer) + else: + optimizer = optimizer + return optimizer diff --git a/auto_round/quantizers/algs/base.py b/auto_round/quantizers/algs/base.py new file mode 100755 index 000000000..bcb112094 --- /dev/null +++ b/auto_round/quantizers/algs/base.py @@ -0,0 +1,51 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +from auto_round.quantizers.base import BaseQuantizer + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class AlgsBaseQuantizer(BaseQuantizer, ABC): + def _pre_quantize_impl(self, *args, **kwargs): + pass + + @abstractmethod + def _quantize_impl(self, *args, **kwargs): + pass + + def _post_quantize_impl(self, *args, **kwargs): + pass + + def _pre_quantize_layer_impl(self, *args, **kwargs): + pass + + def _quantize_layer_impl(self, *args, **kwargs): + pass + + def _post_quantize_layer_impl(self, *args, **kwargs): + pass + + def _pre_quantize_block_impl(self, *args, **kwargs): + pass + + def _quantize_block_impl(self, *args, **kwargs): + pass + + def _post_quantize_block_impl(self, *args, **kwargs): + pass diff --git a/auto_round/quantizers/algs/rtn.py b/auto_round/quantizers/algs/rtn.py new file mode 100755 index 000000000..585c2631a --- /dev/null +++ b/auto_round/quantizers/algs/rtn.py @@ -0,0 +1,689 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import traceback +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import accelerate +import torch +from accelerate.big_modeling import dispatch_model, infer_auto_device_map +from tqdm import tqdm + +from auto_round.compressors.shard_writer import shard_writer +from auto_round.compressors.utils import ( + check_need_act_calibration, + is_nv_fp, + is_static_wfp8afp8, +) +from auto_round.logger import logger +from auto_round.modelling.replace_modules import materialize_model_, safe_to_cpu_ +from auto_round.quantizers.algs.base import AlgsBaseQuantizer +from auto_round.quantizers.utils import ( + get_quantized_layer_names_outside_blocks, + quantize_embedding_layer, + register_act_max_hook, +) +from auto_round.utils import ( + check_to_quantized, + clear_memory, + convert_fp8_layer_to_linear, + convert_fp8_module_to_16b, + flatten_list, + get_block_names, + get_lm_head_name, + get_module, + global_state, + is_auto_device_mapping, + is_fp8_linear, + is_fp8_model, + memory_monitor, + mv_module_from_gpu, + set_amax_for_all_moe_layers, + set_module, + to_device, + to_dtype, +) +from auto_round.utils.device import ( + clear_memory_if_reached_threshold, + get_major_device, + parse_available_devices, + set_auto_device_map_for_block_with_tuning, + set_non_auto_device_map, +) +from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class RTNQuantizer(AlgsBaseQuantizer): + + def __init__(self, compressor: "BaseCompressor"): + super().__init__(compressor) + self.all_to_quantized_module_names: list[str] = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + + def _pre_quantize_impl(self, *args, **kwargs): + if self.compressor.amp and self.compressor.model.dtype != self.compressor.amp_dtype: + self.compressor.model.to(self.compressor.amp_dtype) + + if is_nv_fp(self.compressor.data_type): + # FIXME: (yiliu30) change it to block-wise after we refactor the quantization code and + # https://github.com/intel/auto-round/issues/1331 + materialize_model_(self.model) + self.compressor.model.to("cpu") + from auto_round.data_type.nvfp import calculate_gparam + from auto_round.data_type.utils import update_fused_layer_global_scales + + pbar = tqdm(self.all_to_quantized_module_names) + for name in pbar: + pbar.set_description(f"Calculate weight global scale: {name}") + m = get_module(self.compressor.model, name) + if is_fp8_linear(m): + m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device) + set_module(self.compressor.model, name, m) + weight_global_scale = calculate_gparam(m.weight, self.compressor.group_size) + setattr(m, "weight_global_scale", weight_global_scale) + + logger.info("Start to update fused layer global scales, it may take some time.") + for name, module in self.compressor.model.named_modules(): + update_fused_layer_global_scales(module) + logger.info("Finished updating fused layer global scales.") + + @torch.inference_mode() + def _quantize_impl(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize all modules in the model using RTN (Round-To-Nearest) strategy. + + If the target format includes GGUF with `k`, and optimized RTN is enabled, + blockwise quantization with input caching and imatrix is used. + + Returns: + tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. 
+ """ + if not ( + any(fmt.is_gguf() for fmt in getattr(self.compressor, "formats", [])) + or self.compressor.super_bits is not None + ): + quantize_embedding_layer( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + scale_dtype=self.compressor.scale_dtype, + disable_opt_rtn=self.compressor.disable_opt_rtn, + device=self.compressor.device, + device_list=self.compressor.device_list, + ) # leave to gguf itself to handle + + # Release memory + clear_memory(device_list=self.compressor.device_list) + + if self.compressor.act_bits <= 8 and check_need_act_calibration( + self.compressor.act_dynamic, + self.compressor.act_data_type, + self.compressor.act_bits, + self.compressor.static_kv_dtype, + self.compressor.static_attention_dtype, + ): # TODO, mixed datatype has bug + hook_handles = register_act_max_hook( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + act_group_size=self.compressor.act_group_size, + act_data_type=self.compressor.act_data_type, + ) + try: + self._quantize_via_rtn_blockwise() + except torch.OutOfMemoryError: + logger.warning("Fallback to CPU. Consider using more GPUs via `--device 0,1,2,3`.") + self.model = self.model.to("cpu") + clear_memory(device_list=self.compressor.device_list) + if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(self.model) + orig_device = self.compressor.device + self.compressor.device = "cpu" + self._quantize_via_rtn_blockwise() + self.compressor.device = orig_device + for handle in hook_handles: + handle.remove() + else: + # By default, we go with layer-wise way if no replacement happened + use_blockwise_quantization = global_state.replaced_module_count > 0 + tied_weights_keys = getattr(self.compressor.model, "_tied_weights_keys", []) + if tied_weights_keys is None: + tied_weights_keys = [] + if isinstance(tied_weights_keys, dict): + tied_weights_values = list(tied_weights_keys.values()) + else: + tied_weights_values = list(tied_weights_keys) + tied_weights_layers = [".".join(val.split(".")[:-1]) for val in tied_weights_values] # rm weight/bias + # In fact, we should detect whether it is is_separate_lm_head, to simplify, we don't do it + if hasattr(self.compressor, "formats") and self.compressor.formats[0].is_gguf(): + lm_head_name = get_lm_head_name(self.model) + if lm_head_name is not None: + tied_weights_layers.append(lm_head_name) + + if use_blockwise_quantization: # The ram usage is a little higher + all_to_quantized_module_names = list(set(self.all_to_quantized_module_names)) + all_blocks = ( + self.compressor.quant_block_list + if self.compressor.quant_block_list + else get_block_names(self.model) + ) + pbar = tqdm(range(sum(len(block) for block in all_blocks))) + for block_names in all_blocks: + for block_name in block_names: + pbar.set_description(f"Quantizing {block_name}") + block = get_module(self.compressor.model, block_name) + materialize_model_(block) + for name, m in block.named_modules(): + if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: + self._quantize_layer_via_rtn(m.global_name, to_cpu=self.compressor.low_gpu_mem_usage) + all_to_quantized_module_names.remove(m.global_name) + elif ( + not any(m.children()) + and len(m.state_dict()) > 0 + and m.global_name not in tied_weights_layers + ): + set_module(self.compressor.model, m.global_name, copy.deepcopy(m)) + if self.compressor.is_immediate_saving: + shard_writer(self, name=m.global_name) + 
m.to("meta") + clear_memory(device_list=self.compressor.device_list) + memory_monitor.log_summary() + pbar.update(1) + cnt = 1 + for name in all_to_quantized_module_names: + logger.info(f"Quantizing remaining layer {name} on CPU.") + self._quantize_layer_via_rtn(name, to_cpu=True) + cnt += 1 + if cnt % 10 == 0: + clear_memory(device_list=self.compressor.device_list) + memory_monitor.log_summary() + else: + materialize_model_(self.model) + self.compressor.model.to("cpu") + block_names_cnt = len(flatten_list(get_block_names(self.compressor.model, True))) + clear_mem_freq = len(self.all_to_quantized_module_names) // block_names_cnt + if clear_mem_freq == 0: + clear_mem_freq = 1 + pbar = tqdm(self.all_to_quantized_module_names) + cnt = 1 + for name in pbar: + pbar.set_description(f"Quantizing {name}") + self._quantize_layer_via_rtn(name) + if cnt % clear_mem_freq == 0: + clear_memory(device_list=self.compressor.device_list) + memory_monitor.log_summary() + cnt = 1 + cnt += 1 + # Convert remaining fp8 + if is_fp8_model(self.compressor.model): + convert_fp8_module_to_16b(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) + if self.compressor.is_immediate_saving: + shard_writer(self, is_finalize=True) + self.compressor.quantized = True + return self.compressor.model, self.compressor.layer_config + + def _quantize_via_rtn_blockwise(self) -> None: + """Quantize model layers block by block using cached inputs.""" + + all_to_quantized_module_names = list(set(self.all_to_quantized_module_names)) + + all_blocks = ( + self.compressor.quant_block_list + if self.compressor.quant_block_list + else get_block_names(self.compressor.model) + ) + if not all_blocks: + raise ValueError("Could not find any blocks. Check the model or quant_block_list.") + + all_first_block_names = [block[0] for block in all_blocks] + layer_names = get_quantized_layer_names_outside_blocks( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + supported_types=self.compressor.supported_types, + quant_block_list=self.compressor.quant_block_list, + ) + if self.compressor.act_bits < 16 and (not self.compressor.act_dynamic or len(layer_names) > 0): + if len(layer_names) > 0: + logger.warning( + "quantize layers outside blocks for static activation quantizaiton" + " will significantly increase calibration time" + ) + all_inputs = self.compressor.try_cache_inter_data_gpucpu( + all_first_block_names, self.compressor.nsamples, layer_names + ) + else: + all_inputs = self.compressor.cache_inter_data(all_first_block_names, self.compressor.nsamples) + + # Clear hooks for multi-GPU setups + if hasattr(self.compressor.model, "hf_device_map") and len(self.compressor.model.hf_device_map) > 1: + accelerate.hooks.remove_hook_from_submodules(self.compressor.model) + + pbar = tqdm(range(sum(len(block) for block in all_blocks))) + + for block_names in all_blocks: + first_block = block_names[0] + inputs = all_inputs.pop(first_block) + input_keys = [k for k in inputs if k.startswith("hidden_state")] + if len(input_keys) != 1: + raise RuntimeError( + "hidden_states arg mismatch. 
Please file an issue at https://github.com/intel/auto-round/issues" + ) + inputs["input_ids"] = inputs.pop(input_keys[0]) + + clear_memory(self.compressor.inputs, device_list=self.compressor.device_list) + + total_samples = len(inputs["input_ids"]) + if total_samples < self.compressor.batch_size: + self.compressor.batch_size = total_samples + logger.warning(f"Forcing batch size to {total_samples}") + + input_ids = to_device(inputs.pop("input_ids"), self.compressor.cache_device) + input_others = to_device(inputs, self.compressor.cache_device) + + tmp_dtype = self.compressor.amp_dtype if self.compressor.amp else torch.float32 + input_ids = [id_.to(tmp_dtype) for id_ in input_ids] + + for key, val in input_others.items(): + if isinstance(val, torch.Tensor) and val.dtype in (torch.float16, torch.bfloat16): + input_others[key] = val.to(tmp_dtype) + elif isinstance(val, list): + input_others[key] = [to_dtype(v, tmp_dtype) for v in val] + + for block_name in block_names: + pbar.set_description(f"Quantizing {block_name}") + block = get_module(self.compressor.model, block_name) + materialize_model_(block) + block.to("cpu") + if is_fp8_model(self.compressor.model): + convert_fp8_module_to_16b(block, dtype=self.compressor.amp_dtype, device=self.compressor.device) + + if is_auto_device_mapping(self.compressor.device_map) and len(self.compressor.device_list) > 1: + set_auto_device_map_for_block_with_tuning( + block, + self.compressor.device_map, + input_ids, + self.compressor.low_gpu_mem_usage, + self.compressor.batch_size, + self.compressor.device, + ) + # Dispatch model if needed + if len(self.compressor.device_list) > 1: + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + for _, m in block.named_modules(): + if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"): + continue + hook = AlignDevicesHook(m.tuning_device, io_same_device=True) + add_hook_to_module(m, hook, True) + else: + block = block.to(self.compressor.device) + + # TODO: refactor this part + input_ids = self.compressor._get_block_outputs( + block, + input_ids, + input_others, + self.compressor.batch_size * self.compressor.infer_bs_coeff, + self.compressor.device, + self.compressor.cache_device, + ) + + if len(self.compressor.device_list) > 1: + accelerate.hooks.remove_hook_from_submodules(block) + + if is_nv_fp(self.compressor.act_data_type) or is_static_wfp8afp8(self.compressor): + # enable moe experts act_max automatic generation for Linear + set_amax_for_all_moe_layers(block, attr_name="act_max") + if self.compressor.low_gpu_mem_usage: + block.to("cpu") + clear_memory(device_list=self.compressor.device_list) + + for _, m in block.named_modules(): + if hasattr(m, "global_name") and m.global_name in all_to_quantized_module_names: + self._quantize_layer_via_rtn(m.global_name, to_cpu=self.compressor.low_gpu_mem_usage) + all_to_quantized_module_names.remove(m.global_name) + if not self.compressor.is_immediate_saving: + mv_module_from_gpu(block) + if block_name == block_names[-1]: + clear_memory(input_ids, device_list=self.compressor.device_list) + else: + clear_memory(device_list=self.compressor.device_list) + + memory_monitor.log_summary() + pbar.update(1) + pbar.close() + # Process remaining layers not in blocks + for name in all_to_quantized_module_names: + dtype = None + if self.compressor.super_group_size is not None: + dtype = torch.float32 + self._quantize_layer_via_rtn(name, dtype=dtype) + # clear_memory(device_list=self.compressor.device_list) + + def _quantize_layer_via_rtn(self, name: str, dtype: 
torch.dtype = None, to_cpu=True) -> None: + """Quantizes a layer using RTN (Round-To-Nearest) if available. + + This function attempts to quantize a layer by switching its data type to a + `rtn_*` version if supported, then wraps and unwraps the module to apply + quantization. If GPU memory is insufficient, it falls back to CPU. + + If packing is enabled (`immediate_packing`), the function will also export + the quantized layer to the appropriate backend format. + + Args: + name (str): Name of the layer to quantize. + + Raises: + RuntimeError: If quantization fails for reasons unrelated to memory. + """ + m = get_module(self.compressor.model, name) + if dtype is not None: + m = m.to(dtype) + + if is_fp8_linear(m): + m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device) + set_module(self.compressor.model, name, m) + tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.compressor.device + + try: + m = m.to(tuning_device) + m = WrapperLinear( + m, + device=tuning_device, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + m = m.orig_layer if hasattr(m, "orig_layer") else m + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU.") + m.to("cpu") + m = WrapperLinear( + m, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except Exception as e: + raise + + # Step 2: Optional immediate packing/export + if self.compressor.is_immediate_packing: # For gguf, packing conducts on block level + self.compressor._immediate_pack(name) + if to_cpu: + m = m.to("cpu") + packed_m = get_module(self.compressor.model, name) + set_module(self.compressor.model, name, packed_m.to("cpu")) + else: + if to_cpu: + m = m.to("cpu") + set_module(self.compressor.model, name, m) + if self.compressor.is_immediate_saving: + if hasattr(self.compressor, "all_to_quantized_module_names"): + all_to_quantized_module_names = self.compressor.all_to_quantized_module_names + else: + all_to_quantized_module_names = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) + m = get_module(self.compressor.model, name) + shard_writer(self.compressor, m, name, last_module) + + +class OptRTNQuantizer(RTNQuantizer): + + @staticmethod + def register_act_hook(model, supported_types): + """Registers hooks to accumulate activation squared norms into `imatrix`.""" + + def get_imatrix_hook(module, input, output): + input = input[0] if isinstance(input, (tuple, list)) else input + flattened = input.reshape(-1, input.shape[-1]).to(torch.float32) + squared = torch.sum(torch.pow(flattened, 2), dim=0).to(torch.float32) + + if not hasattr(module, "imatrix"): + module.imatrix = squared + module.imatrix_cnt = input.shape[0] + else: + module.imatrix += squared.to(module.imatrix.device) + module.imatrix_cnt += input.shape[0] + + hook_handles = [] + for name, module in model.named_modules(): + if type(module) in supported_types and check_to_quantized(module): + hook = module.register_forward_hook(get_imatrix_hook) + hook_handles.append(hook) + return hook_handles + + @torch.inference_mode() + def _quantize_impl(self, *args, 
**kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + enable_imatrix = False + has_gguf_k = ( + any(fmt.is_gguf() and "k" in fmt.output_format for fmt in getattr(self.compressor, "formats", [])) + or self.compressor.super_bits is not None + ) + if has_gguf_k: + enable_imatrix = True + elif self.compressor.data_type == "int" and self.compressor.sym: + enable_imatrix = True + if enable_imatrix: + self._quant_rtn_with_imatrix(self.all_to_quantized_module_names) + # Convert remaining fp8 + if is_fp8_model(self.compressor.model): + convert_fp8_module_to_16b(self.compressor.model, self.compressor.amp_dtype, self.compressor.device) + self.compressor.quantized = True + return self.compressor.model, self.compressor.layer_config + else: + return super()._quantize_impl(*args, **kwargs) + + def _quant_rtn_with_imatrix(self, *args, **kwargs) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize all modules in the model using Optimized RTN strategy. + + This method applies optimized RTN quantization to all modules in the model + that are marked for quantization. It leverages input caching and imatrix + techniques for enhanced performance. + + Returns: + tuple[nn.Module, Dict[str, Any]]: The quantized model and the layer configuration. + """ + if not ( + any(fmt.is_gguf() for fmt in getattr(self.compressor, "formats", [])) + or self.compressor.super_bits is not None + ): + quantize_embedding_layer( + model=self.compressor.model, + layer_config=self.compressor.layer_config, + scale_dtype=self.compressor.scale_dtype, + disable_opt_rtn=self.compressor.disable_opt_rtn, + device=self.compressor.device, + device_list=self.compressor.device_list, + ) # leave to gguf itself to handle + + self.compressor.model.to("cpu") + # Release memory + clear_memory(device_list=self.compressor.device_list) + + logger.info("start to compute imatrix") + + # Load dataset + from auto_round.calib_dataset import get_dataloader + + if isinstance(self.compressor.dataset, str): + if self.compressor.tokenizer is None: + raise ValueError("A tokenizer must be set for the model when using a dataset string.") + dataset_name = self.compressor.dataset.replace(" ", "") + self.compressor.dataloader = get_dataloader( + self.compressor.tokenizer, + self.compressor.seqlen, + dataset_name, + self.compressor.seed, + self.compressor.batch_size, + self.compressor.nsamples, + ) + else: + self.compressor.dataloader = self.compressor.dataset + + model = self.compressor.model + + # Dispatch multi-GPU model if necessary + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + dispatch_model(model, model.hf_device_map) + + hooks = self.register_act_hook(model, self.compressor.supported_types) + + try: + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + model = model.to("cpu") + clear_memory(device_list=self.compressor.device_list) + self._quantize_via_rtn_blockwise() + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.error(cuda_error_msg) + # Final fallback: warn and use CPU-only quantization + logger.warning( + "Fallback to CPU. " + "Consider enabling `low_gpu_mem_usage` or using more GPUs via `--device 0,1,2,3`." 
+ ) + model = model.to("cpu") + clear_memory(device_list=self.compressor.device_list) + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + + orig_device = self.compressor.device + self.compressor.device = "cpu" + self._quantize_via_rtn_blockwise() + self.compressor.device = orig_device + except Exception as e: + raise + finally: + # Always remove hooks + for hook in hooks: + hook.remove() + + def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=True) -> None: + """Quantizes a layer using RTN (Round-To-Nearest) if available. + + This function attempts to quantize a layer by switching its data type to a + `rtn_*` version if supported, then wraps and unwraps the module to apply + quantization. If GPU memory is insufficient, it falls back to CPU. + + If packing is enabled (`immediate_packing`), the function will also export + the quantized layer to the appropriate backend format. + + Args: + name (str): Name of the layer to quantize. + + Raises: + RuntimeError: If quantization fails for reasons unrelated to memory. + """ + m = get_module(self.compressor.model, name) + if dtype is not None: + m = m.to(dtype) + + if is_fp8_linear(m): + m = convert_fp8_layer_to_linear(m, self.compressor.amp_dtype, self.compressor.device) + set_module(self.compressor.model, name, m) + tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.compressor.device + # Step 1: Try quantization on GPU first, fall back to CPU if OOM + if ( + self.compressor.is_immediate_packing + and self.compressor.iters == 0 + and self.compressor.formats[0].is_gguf() + and not self.compressor.disable_opt_rtn + ): + m = m.to(tuning_device) + m.scale = None + m.zp = None + else: + try: + disable_opt_rtn = False + if ( + self.compressor.orig_disable_opt_rtn is None + and self.compressor.is_moe_model + and "expert" in m.global_name + and "shared_expert" not in m.global_name + and self.compressor.super_bits is None # GGUF still uses the optimized RTN for MoE layers + ): + disable_opt_rtn = True + logger.warning_once( + "MoE layer detected: optimized RTN is disabled for efficiency. " + "Use `--enable_opt_rtn` to force-enable it for MoE layers." 
+ ) + m = m.to(tuning_device) + m = WrapperLinear( + m, + device=tuning_device, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + disable_opt_rtn=disable_opt_rtn, + ) + m = m.unwrapper({}) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + m = m.orig_layer if hasattr(m, "orig_layer") else m + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU.") + m.to("cpu") + m = WrapperLinear( + m, + enable_minmax_tuning=False, + enable_norm_bias_tuning=False, + enable_round_tuning=False, + enable_torch_compile=self.compressor.enable_torch_compile, + ) + m = m.unwrapper({}) + except Exception as e: + raise + + # Step 2: Optional immediate packing/export + if self.compressor.is_immediate_packing: # For gguf, packing conducts on block level + self.compressor._immediate_pack(name) + if to_cpu: + m = m.to("cpu") + packed_m = get_module(self.compressor.model, name) + set_module(self.compressor.model, name, packed_m.to("cpu")) + else: + if to_cpu: + m = m.to("cpu") + set_module(self.compressor.model, name, m) + if self.compressor.is_immediate_saving: + if hasattr(self.compressor, "all_to_quantized_module_names"): + all_to_quantized_module_names = self.compressor.all_to_quantized_module_names + else: + all_to_quantized_module_names = [ + n for n, m in self.compressor.model.named_modules() if check_to_quantized(m) + ] + last_module = (len(all_to_quantized_module_names) == 0) or (name == all_to_quantized_module_names[-1]) + m = get_module(self.compressor.model, name) + shard_writer(self.compressor, m, name, last_module) diff --git a/auto_round/quantizers/base.py b/auto_round/quantizers/base.py new file mode 100755 index 000000000..d4f70a7ef --- /dev/null +++ b/auto_round/quantizers/base.py @@ -0,0 +1,103 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
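The `imatrix` that `OptRTNQuantizer` feeds into the optimized RTN path above is simply an accumulated per-input-channel sum of squared activations, collected through forward hooks during calibration and removed afterwards. A minimal, self-contained sketch of that accumulation (the toy `nn.Linear` model and the helper name are illustrative only, not part of this patch):

```python
import torch
import torch.nn as nn


def make_imatrix_hook():
    """Accumulate the per-input-channel sum of squared inputs, as get_imatrix_hook does."""

    def hook(module, inputs, output):
        x = inputs[0] if isinstance(inputs, (tuple, list)) else inputs
        flat = x.reshape(-1, x.shape[-1]).to(torch.float32)   # (tokens, in_features)
        squared = flat.pow(2).sum(dim=0)                       # one value per input channel
        if not hasattr(module, "imatrix"):
            module.imatrix = squared
            module.imatrix_cnt = x.shape[0]
        else:
            module.imatrix += squared.to(module.imatrix.device)
            module.imatrix_cnt += x.shape[0]

    return hook


if __name__ == "__main__":
    layer = nn.Linear(8, 4)
    handle = layer.register_forward_hook(make_imatrix_hook())
    for _ in range(3):                  # a tiny stand-in for the calibration loop
        layer(torch.randn(2, 8))
    handle.remove()
    print(layer.imatrix.shape)          # torch.Size([8]): one statistic per input channel
```

In the patch, `register_act_hook` installs an equivalent hook on every supported, to-be-quantized module, and the handles are removed in the `finally` branch once `_quantize_via_rtn_blockwise` has consumed the statistics.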
+ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from auto_round.compressors.base import BaseCompressor + + +class BaseQuantizer(ABC): + def __init__(self, compressor: "BaseCompressor"): + self.compressor = compressor + + def __mro_call(self, method_name: str, *args, **kwargs): + for cls in type(self).mro(): + method = cls.__dict__.get(method_name, None) + if method: + method(self, *args, **kwargs) + + def pre_quantize(self, *args, **kwargs): + self.__mro_call("_pre_quantize_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_impl(self, *args, **kwargs): + pass + + def quantize(self, *args, **kwargs): + self.pre_quantize(*args, **kwargs) + self._quantize_impl(*args, **kwargs) + self.post_quantize(*args, **kwargs) + return self.compressor.model, self.compressor.layer_config + + @abstractmethod + def _quantize_impl(self, *args, **kwargs): + pass + + def post_quantize(self, *args, **kwargs): + self.__mro_call("_post_quantize_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_impl(self, *args, **kwargs): + pass + + def pre_quantize_layer(self, *args, **kwargs): + self.__mro_call("_pre_quantize_layer_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_layer_impl(self, *args, **kwargs): + pass + + def quantize_layer(self, *args, **kwargs): + self.pre_quantize_layer(*args, **kwargs) + result = self._quantize_layer_impl(*args, **kwargs) + self.post_quantize_layer(*args, **kwargs) + return result + + @abstractmethod + def _quantize_layer_impl(self, *args, **kwargs): + pass + + def post_quantize_layer(self, *args, **kwargs): + pass + self.__mro_call("_post_quantize_layer_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_layer_impl(self, *args, **kwargs): + pass + + def pre_quantize_block(self, *args, **kwargs): + self.__mro_call("_pre_quantize_block_impl", *args, **kwargs) + + @abstractmethod + def _pre_quantize_block_impl(self, *args, **kwargs): + pass + + def quantize_block(self, *args, **kwargs): + self.pre_quantize_block(*args, **kwargs) + result = self._quantize_block_impl(*args, **kwargs) + self.post_quantize_block(*args, **kwargs) + return result + + @abstractmethod + def _quantize_block_impl(self, *args, **kwargs): + pass + + def post_quantize_block(self, *args, **kwargs): + self.__mro_call("_post_quantize_block_impl", *args, **kwargs) + + @abstractmethod + def _post_quantize_block_impl(self, *args, **kwargs): + pass diff --git a/auto_round/quantizers/entrypoint.py b/auto_round/quantizers/entrypoint.py new file mode 100755 index 000000000..e0eda7258 --- /dev/null +++ b/auto_round/quantizers/entrypoint.py @@ -0,0 +1,53 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
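The `__mro_call` helper in `BaseQuantizer` above is what lets the dynamically composed quantizer classes cooperate: instead of relying on `super()`, it walks the instance's MRO and invokes every class's own `_pre_quantize_impl` / `_post_quantize_impl` (and the layer/block variants), so each mixin contributes its hook exactly once. A stripped-down illustration of that dispatch, using toy classes rather than the project's API:

```python
class ToyBase:
    def _mro_call(self, method_name, *args, **kwargs):
        # Invoke every class's *own* definition of the hook, most-derived class first.
        for cls in type(self).mro():
            method = cls.__dict__.get(method_name)
            if method:
                method(self, *args, **kwargs)

    def pre_quantize(self, *args, **kwargs):
        self._mro_call("_pre_quantize_impl", *args, **kwargs)

    def _pre_quantize_impl(self, *args, **kwargs):
        print("base pre-step")


class ToyAlg(ToyBase):
    def _pre_quantize_impl(self, *args, **kwargs):
        print("algorithm pre-step")  # runs in addition to, not instead of, the base pre-step


class ToyDataType(ToyAlg):
    def _pre_quantize_impl(self, *args, **kwargs):
        print("data-type pre-step")


ToyDataType().pre_quantize()
# data-type pre-step
# algorithm pre-step
# base pre-step
```

Because the walk starts at the most-derived class, finer-granularity mixins run their hooks before the coarser ones.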
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from auto_round.compressors.base import BaseCompressor
+
+from auto_round.quantizers.algs.auto_round import ARAdamQuantizer, ARQuantizer
+from auto_round.quantizers.algs.rtn import OptRTNQuantizer, RTNQuantizer
+
+
+class AutoRoundQuantizer:
+    def __new__(cls, compressor: "BaseCompressor", dynamic_quantizers: dict = None):
+        assert dynamic_quantizers is not None, "Please provide dynamic_quantizers dict."
+        quantizer_cls = type("AutoRoundQuantizer", (dynamic_quantizers["algs"],), {})
+        return quantizer_cls(compressor)
+
+
+class Quantizers:
+    def __init__(self, quantizers: list[AutoRoundQuantizer]):
+        self.quantizers = quantizers
+
+    def quantize(self, *args, **kwargs):
+        for quantizer in self.quantizers:
+            model, layer_config = quantizer.quantize(*args, **kwargs)
+        return model, layer_config
+
+
+def create_quantizers(compressor: "BaseCompressor"):
+
+    alg_cls = None
+    if compressor.iters > 0:
+        alg_cls = ARQuantizer if compressor.enable_adam is False else ARAdamQuantizer
+    else:
+        alg_cls = OptRTNQuantizer if compressor.disable_opt_rtn is False else RTNQuantizer
+
+    dynamic_quantizers = {"algs": alg_cls}
+    return Quantizers(
+        quantizers=[
+            AutoRoundQuantizer(compressor, dynamic_quantizers=dynamic_quantizers),
+        ]
+    )
diff --git a/auto_round/quantizers/readme.md b/auto_round/quantizers/readme.md
new file mode 100644
index 000000000..7a5b7fad6
--- /dev/null
+++ b/auto_round/quantizers/readme.md
@@ -0,0 +1,21 @@
+# AutoRound Quantizer
+The main functional component: it contains the different algorithms and the concrete execution logic of quantization.
+
+## Structure and Call Flow
+AutoRoundQuantizer is split into three (extensible) layers, from coarse to fine granularity: algs, model_type and data_type. A quantizer is constructed dynamically by inheriting methods from each layer; classes within one layer are mutually exclusive, while classes from different layers can be combined freely.
+
+AutoRoundQuantizer
+- algs
+  - RTN
+  - Tuning(auto_round)
+- model_type
+  - llm
+  - mllm
+  - diffusion
+- data_type
+  - gguf
+  - nvfp/mxfp
+### 1. AutoRoundQuantizer
+The main entry point. Based on the configuration, it uses `__new__` to dynamically construct a Quantizer that inherits methods from AlgsQuantizer, ModelTypeQuantizer and DataTypeQuantizer; a finer-granularity layer may override methods of a coarser-granularity layer.
+
+### 2. AlgsQuantizer
\ No newline at end of file
diff --git a/auto_round/quantizers/utils.py b/auto_round/quantizers/utils.py
new file mode 100755
index 000000000..a6f1dd4bc
--- /dev/null
+++ b/auto_round/quantizers/utils.py
@@ -0,0 +1,282 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
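`AutoRoundQuantizer.__new__` in `entrypoint.py` composes the final quantizer class at runtime with `type(...)` and hands back an instance of it, while `Quantizers` simply drives a list of such instances. A toy mirror of that construction, using stand-in mixin classes (none of these names exist in the patch):

```python
# Stand-in mixins; in the real patch the "algs" slot holds e.g. RTNQuantizer,
# and future model_type / data_type mixins would occupy the other layers.
class AlgMixin:
    def quantize(self):
        return f"{type(self).__name__}: algorithm-level quantize()"


class ModelTypeMixin:
    pass


class ToyAutoRoundQuantizer:
    """Mirrors AutoRoundQuantizer.__new__: build a class on the fly and return an instance of it."""

    def __new__(cls, mixins):
        quantizer_cls = type("ToyAutoRoundQuantizer", tuple(mixins), {})
        return quantizer_cls()


class ToyQuantizers:
    """Mirrors Quantizers: run every composed quantizer in order."""

    def __init__(self, quantizers):
        self.quantizers = quantizers

    def quantize(self):
        result = None
        for quantizer in self.quantizers:
            result = quantizer.quantize()
        return result


q = ToyAutoRoundQuantizer([ModelTypeMixin, AlgMixin])
print([c.__name__ for c in type(q).__mro__])   # the dynamically composed bases
print(ToyQuantizers([q]).quantize())
```

With the current `create_quantizers`, only the `algs` layer is populated (RTN, optimized RTN, or the tuning-based quantizers); the same mechanism extends to the model_type and data_type layers sketched in the readme.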
+import traceback +from typing import Any, Callable, Optional, Union + +import torch + +from auto_round.compressors.utils import ( + check_need_act_calibration, + is_nv_fp, +) +from auto_round.data_type import QUANT_FUNC_WITH_DTYPE +from auto_round.data_type.utils import reshape_pad_tensor_by_group_size +from auto_round.logger import logger +from auto_round.utils import ( + SUPPORTED_LAYER_TYPES, + check_to_quantized, + clear_memory, + get_layer_names_in_block, + get_module, + to_device, + to_dtype, +) + + +def register_act_max_hook(model: torch.nn.Module, layer_config: dict, act_group_size: int, act_data_type: str): + def get_act_max_hook(module, input, output): + if isinstance(input, (tuple, list)): + input = input[0] + if input.numel() == 0: + return # as no needs for act_max update + input, _, _ = reshape_pad_tensor_by_group_size(input, act_group_size) + act_max = torch.max(torch.abs(input), dim=-1).values + if not hasattr(module, "act_max") or module.act_max.numel() == 0: + module.act_max = act_max + else: + act_max = act_max.to(module.act_max.device) + if is_nv_fp(act_data_type): ## for nvfp per-tensor input_global_scale calculation usage + module.act_max = torch.max(torch.tensor([act_max.max(), module.act_max.max()], device=act_max.device)) + else: + module.act_max = torch.max(act_max, module.act_max) + + hook_handles = [] + # for single layers out of blocks, like lm_head + if isinstance(model, SUPPORTED_LAYER_TYPES): + m = model + if ( + hasattr(m, "act_dynamic") + and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) + and check_to_quantized(m) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + return hook_handles + + for n, m in model.named_modules(): + if ( + hasattr(m, "act_dynamic") + and check_need_act_calibration(m.act_dynamic, m.act_data_type, m.act_bits) + and check_to_quantized(m) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + continue + + # for whole model, RTN + if n in layer_config: + config = layer_config[n] + act_dynamic = config.get("act_dynamic", True) + act_data_type = config.get("act_data_type", None) + act_bits = config.get("act_bits", 16) + if ( + config["bits"] <= 8 + and check_need_act_calibration(act_dynamic, act_data_type, act_bits) + and check_to_quantized(config) + ): + hook = m.register_forward_hook(get_act_max_hook) + hook_handles.append(hook) + continue + return hook_handles + + +@torch.inference_mode() +def quantize_embedding_layer( + model: torch.nn.Module, + layer_config: dict, + scale_dtype: str, + disable_opt_rtn: bool, + device: Union[str, torch.device], + device_list: list, +) -> bool: + """Quantizes embedding layers in the model according to the configuration. + + This method iterates through all modules in the model, identifies embedding + layers specified in `layer_config`, and applies the appropriate quantization + function based on bit precision, grouping strategy, and dtype. + + Returns: + bool: True if the quantization process completes without critical errors. 
+ """ + is_quantized = False + for name, module in model.named_modules(): + # Skip non-Embedding modules or layers not in config + if not isinstance(module, torch.nn.Embedding) or name not in layer_config: + continue + + config = layer_config[name] + + # Skip layers that are not marked for quantization + if not check_to_quantized(config): + continue + is_quantized = True + config["scale_dtype"] = scale_dtype + dtype = config["data_type"] + + # Determine quantization function key with symmetry/asymmetry + if dtype not in QUANT_FUNC_WITH_DTYPE: + dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}" + + # Optionally use optimized rounding (RTN) variant + if not disable_opt_rtn and f"rtn_{dtype}" in QUANT_FUNC_WITH_DTYPE: + dtype = f"rtn_{dtype}" + + quant_func = QUANT_FUNC_WITH_DTYPE[dtype] + dtype = module.weight.dtype + # As typically float32 are used in RTN to search scale zp, + # to avoid cache a bf16 copy we'd better use float32 + if config.get("super_group_size", None) is not None: + dtype = torch.float32 + + # Attempt quantization on GPU, fall back to CPU if OOM + try: + weight, scale, zp = quant_func( + module.weight.to(dtype=dtype, device=device), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except torch.OutOfMemoryError: + cuda_error_msg = traceback.format_exc() + try: + logger.error(cuda_error_msg) + logger.warning("falling back to CPU") + weight, scale, zp = quant_func( + module.weight.to("cpu"), + **{ + k: config.get(k, None) + for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"] + }, + ) + except Exception as e: + raise + + # Overwrite the module's weights with the quantized version + module.weight.data.copy_(weight.cpu()) + + # Attach scale and zero point (zp) to the module + for param_name, value in zip(["scale", "zp"], [scale, zp]): + if isinstance(value, dict): + for k, v in value.items(): + setattr(module, k if k == "scale" else f"w_{k}", v.cpu()) + elif isinstance(value, torch.Tensor): + setattr(module, param_name, value.cpu()) + else: + setattr(module, param_name, value) + + # Update config + layer_config.setdefault(name, {}).update(config) + del weight + del scale + del zp + clear_memory(device_list=device_list) + return is_quantized + + +def get_quantized_layer_names_outside_blocks( + model: torch.nn.Module, layer_config: dict, supported_types: list, quant_block_list: list +) -> list: + """Gets the names of quantized layers outside blocks in the model. + + Returns: + list: List of layer names outside blocks. 
+    """
+    if layer_config is None or len(layer_config) == 0:
+        return []
+
+    layer_names = []
+    all_layers_in_block = get_layer_names_in_block(model, supported_types, quant_block_list)
+
+    for key in layer_config.keys():
+        if key in all_layers_in_block:
+            continue
+        layer = get_module(model, key)
+        if layer is None:
+            logger.error(f"could not find layer {key} in the model, exit...")
+            exit(-1)
+        if type(layer) in supported_types and check_to_quantized(layer_config[key]):
+            layer_names.append(key)
+
+    return layer_names
+
+
+def get_non_zero_cnt(tensor: list[torch.Tensor], indices: list[int]) -> int:
+    current_tensors = [tensor[i] for i in indices]
+    non_zero_cnt = 0
+    for t in current_tensors:
+        non_zero_cnt += torch.count_nonzero(t).item()
+    return non_zero_cnt
+
+
+def split_inputs(inputs: dict, first_input_name: str, is_diffusion: bool = False) -> tuple[torch.Tensor, dict]:
+    if is_diffusion:
+        input_id_str = [key for key in inputs.keys() if "hidden_state" in key]
+        input_ids = {k: inputs.pop(k, None) for k in input_id_str}
+        input_others = inputs
+        return input_ids, input_others
+    else:
+        input_ids = inputs[first_input_name]
+        inputs.pop(first_input_name, None)
+        input_others = inputs
+        return input_ids, input_others
+
+
+def preprocess_block_inputs(
+    inputs,
+    device_list: list,
+    first_input_name="input_ids",
+    amp: bool = False,
+    amp_dtype: torch.dtype = torch.float32,
+    cache_device: Union[str, torch.device] = "cpu",
+    is_diffusion: bool = False,
+):
+    input_ids, input_others = split_inputs(inputs, first_input_name, is_diffusion=is_diffusion)
+    clear_memory(device_list=device_list)
+    input_ids = to_device(input_ids, cache_device)
+    input_others = to_device(input_others, cache_device)
+    # Calibration may run in bf16 (e.g. with low_gpu_mem_usage), so cast the cached inputs explicitly.
+
+    tmp_dtype = amp_dtype if amp else torch.float32
+    input_ids = to_dtype(input_ids, tmp_dtype)
+
+    for key in input_others.keys():
+        if isinstance(input_others[key], torch.Tensor) and (
+            input_others[key].dtype == torch.float16 or input_others[key].dtype == torch.bfloat16
+        ):
+            input_others[key] = input_others[key].to(tmp_dtype)
+        elif isinstance(input_others[key], list):
+            for i in range(len(input_others[key])):
+                input_others[key][i] = to_dtype(input_others[key][i], tmp_dtype)
+    return input_ids, input_others
+
+
+def update_inputs(inputs: dict, q_inputs: dict, is_diffusion: bool) -> tuple[dict, dict]:
+    if is_diffusion:
+        input_id_str = [key for key in inputs.keys() if "hidden_state" in key]
+        if q_inputs is not None:
+            q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str}
+        return inputs, q_inputs
+    else:
+        keys = inputs.keys()
+        input_id_str = [key for key in keys if key.startswith("hidden_state")]
+        if len(input_id_str) != 1:
+            raise RuntimeError(
+                "hidden_states arg mismatch error, "
+                "please file an issue at https://github.com/intel/auto-round/issues"
+            )
+        inputs["input_ids"] = inputs.pop(input_id_str[0], None)
+        if q_inputs is not None:
+            q_inputs = q_inputs.pop(input_id_str[0], None)
+        return inputs, q_inputs
diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py old mode 100644 new mode 100755 diff --git a/test/test_cpu/utils/test_cli_usage.py b/test/test_cpu/utils/test_cli_usage.py old mode 100644 new mode 100755 diff --git a/test/test_cuda/advanced/test_multiple_card_calib.py b/test/test_cuda/advanced/test_multiple_card_calib.py old mode 100644 new mode 100755 diff --git a/test/test_cuda/export/test_gguf.py b/test/test_cuda/export/test_gguf.py old mode 100644 new mode
100755 diff --git a/test/test_cuda/models/test_support_vlms.py b/test/test_cuda/models/test_support_vlms.py old mode 100644 new mode 100755 diff --git a/test/test_cuda/utils/test_alg_ext.py b/test/test_cuda/utils/test_alg_ext.py old mode 100644 new mode 100755
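For context on what the `rtn_*` code paths above ultimately compute: the rounding itself is delegated to `WrapperLinear.unwrapper` and the `QUANT_FUNC_WITH_DTYPE` registry, neither of which appears in this diff. The following is a minimal, illustrative asymmetric round-to-nearest kernel under assumed per-group min/max scaling; it is not the project's implementation:

```python
import torch


def rtn_quantize(weight: torch.Tensor, bits: int = 4, group_size: int = 128):
    """Asymmetric RTN: one (scale, zero-point) pair per group of weights, round to nearest."""
    orig_shape = weight.shape
    w = weight.reshape(-1, group_size)                      # assumes in_features % group_size == 0
    w_min = w.min(dim=-1, keepdim=True).values
    w_max = w.max(dim=-1, keepdim=True).values
    qmax = 2**bits - 1
    scale = (w_max - w_min).clamp(min=1e-5) / qmax          # step size per group
    zp = torch.round(-w_min / scale)                        # integer zero point per group
    q = torch.clamp(torch.round(w / scale) + zp, 0, qmax)   # round to nearest, then clip
    w_hat = (q - zp) * scale                                # dequantized weights seen at inference
    return q.reshape(orig_shape), scale, zp, w_hat.reshape(orig_shape)


if __name__ == "__main__":
    w = torch.randn(16, 256)
    q, scale, zp, w_hat = rtn_quantize(w)
    print("max abs error:", (w - w_hat).abs().max().item())
```

The optimized RTN variants additionally consume the `imatrix` statistics collected during calibration; how those statistics are used inside the kernels is outside the scope of this diff.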