From b5f7489594c5c45a57e803760fc4a918ebe6f17d Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 12 Feb 2026 07:40:16 +0000 Subject: [PATCH 01/11] fake cuda on hpu Signed-off-by: yiliu30 --- auto_round/utils/device.py | 18 ++++++++++++++++++ auto_round/utils/model.py | 18 +++++++++++------- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index a16f441bf..b2a91f510 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -337,6 +337,24 @@ def __exit__(self, exc_type, exc, exc_tb): torch.cuda.get_device_capability = self._orig_func self._orig_func = None return False + +class fake_cuda_for_hpu(ContextDecorator): + """Context manager/decorator to fake CUDA availability for HPU devices.""" + + def __init__(self): + self._orig_is_available = None + + def __enter__(self): + if is_hpex_available(): + self._orig_is_available = torch.cuda.is_available + torch.cuda.is_available = lambda: True + return self + + def __exit__(self, exc_type, exc, exc_tb): + if is_hpex_available() and hasattr(self, "_orig_is_available"): + torch.cuda.is_available = self._orig_is_available + del self._orig_is_available + return False def get_packing_device(device: str | torch.device | None = "auto") -> torch.device: diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 2536c0a93..126ebf4fb 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -274,6 +274,8 @@ def llm_load_model( _use_hpu_compile_mode, get_device_and_parallelism, override_cuda_device_capability, + fake_cuda_for_hpu, + is_hpex_available, ) device_str, use_auto_mapping = get_device_and_parallelism(device) @@ -289,13 +291,15 @@ def llm_load_model( if "deepseek" in pretrained_model_name_or_path.lower() and trust_remote_code: logger.warning("trust_remote_code is enabled by default, please ensure its correctness.") - if _use_hpu_compile_mode(): - model = model_cls.from_pretrained( - pretrained_model_name_or_path, - torch_dtype=torch_dtype, - trust_remote_code=trust_remote_code, - device_map="auto" if use_auto_mapping else None, - ) + if is_hpex_available(): + # For loading FP8 model on HPU + with fake_cuda_for_hpu(), override_cuda_device_capability(): + model = model_cls.from_pretrained( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + device_map="auto" if use_auto_mapping else None, + ) else: try: model = model_cls.from_pretrained( From d8a98495bf417932bd8d2580445007d9f98e0cfd Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 12 Feb 2026 07:41:04 +0000 Subject: [PATCH 02/11] fix Signed-off-by: yiliu30 --- auto_round/compressors/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index e855fd0ce..d6f8af87e 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -486,8 +486,6 @@ def __init__( logger.info(f"using {self.model.dtype} for quantization tuning") # Some helpers - if "hpu" in str(self.device): - self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear") self.batch_dim = None self.infer_bs_coeff = 1 From 6332241eea2e4d41fa51420f7669e4d9499d0921 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 12 Feb 2026 08:08:17 +0000 Subject: [PATCH 03/11] load fp8 model Signed-off-by: yiliu30 --- test/test_hpu/test_load_fp8_model.py | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 test/test_hpu/test_load_fp8_model.py diff 
--git a/test/test_hpu/test_load_fp8_model.py b/test/test_hpu/test_load_fp8_model.py new file mode 100644 index 000000000..5fb584499 --- /dev/null +++ b/test/test_hpu/test_load_fp8_model.py @@ -0,0 +1,30 @@ +import os +import shutil + +import pytest +import torch + +from auto_round import AutoRound + + +class TestAutoRound: + save_dir = "./saved" + + def check_nan_inf_in_tensor(self, tensor, name=""): + return torch.isnan(tensor).any() or torch.isinf(tensor).any() + + def test_small_model_rtn_generation(self): + model_name = "Qwen/Qwen3-0.6B-FP8" + ar = AutoRound(model_name, iters=0, scheme="FP8_STATIC", nsamples=16) + model, folder = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") + # all linears except lm_head should be quantized to FP8 + fp8_linear_count = 0 + for name, module in model.named_modules(): + if "FP8QLinear" in type(module).__name__: + assert module.weight.dtype == torch.float8_e4m3fn, f"{name} is not in FP8" + assert not self.check_nan_inf_in_tensor(module.weight.to(torch.float32)), ( + f"{name} has NaN or Inf in weights" + ) + fp8_linear_count += 1 + assert fp8_linear_count > 0, "No FP8 linear layer found in the quantized model" + shutil.rmtree(self.save_dir, ignore_errors=True) From b6b6222ca57202770b33574e4df67474c5d76434 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 12 Feb 2026 08:11:12 +0000 Subject: [PATCH 04/11] add more ut Signed-off-by: yiliu30 --- test/test_hpu/test_load_fp8_model.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/test_hpu/test_load_fp8_model.py b/test/test_hpu/test_load_fp8_model.py index 5fb584499..57ac975ea 100644 --- a/test/test_hpu/test_load_fp8_model.py +++ b/test/test_hpu/test_load_fp8_model.py @@ -7,14 +7,20 @@ from auto_round import AutoRound +MODEL_LIST = ( + "Qwen/Qwen3-0.6B-FP8", + "Qwen/Qwen3-0.6B", +) + class TestAutoRound: save_dir = "./saved" def check_nan_inf_in_tensor(self, tensor, name=""): return torch.isnan(tensor).any() or torch.isinf(tensor).any() - def test_small_model_rtn_generation(self): - model_name = "Qwen/Qwen3-0.6B-FP8" + + @pytest.mark.parametrize("model_name", MODEL_LIST) + def test_small_model_rtn_generation(self, model_name): ar = AutoRound(model_name, iters=0, scheme="FP8_STATIC", nsamples=16) model, folder = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") # all linears except lm_head should be quantized to FP8 From 0918cb48f6ea96c9bd6c3a14defd7091aa870380 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 12 Feb 2026 08:11:33 +0000 Subject: [PATCH 05/11] add more ut Signed-off-by: yiliu30 --- test/test_hpu/{test_load_fp8_model.py => test_quant_fp8.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/test_hpu/{test_load_fp8_model.py => test_quant_fp8.py} (100%) diff --git a/test/test_hpu/test_load_fp8_model.py b/test/test_hpu/test_quant_fp8.py similarity index 100% rename from test/test_hpu/test_load_fp8_model.py rename to test/test_hpu/test_quant_fp8.py From 251be9471f5153d250201d83918b8f44b13f378e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 08:57:25 +0000 Subject: [PATCH 06/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils/device.py | 5 +++-- auto_round/utils/model.py | 4 ++-- test/test_hpu/test_quant_fp8.py | 9 ++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 
b2a91f510..0f19454b9 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -337,10 +337,11 @@ def __exit__(self, exc_type, exc, exc_tb): torch.cuda.get_device_capability = self._orig_func self._orig_func = None return False - + + class fake_cuda_for_hpu(ContextDecorator): """Context manager/decorator to fake CUDA availability for HPU devices.""" - + def __init__(self): self._orig_is_available = None diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 126ebf4fb..d1f515b04 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -272,10 +272,10 @@ def llm_load_model( from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer from auto_round.utils.device import ( _use_hpu_compile_mode, - get_device_and_parallelism, - override_cuda_device_capability, fake_cuda_for_hpu, + get_device_and_parallelism, is_hpex_available, + override_cuda_device_capability, ) device_str, use_auto_mapping = get_device_and_parallelism(device) diff --git a/test/test_hpu/test_quant_fp8.py b/test/test_hpu/test_quant_fp8.py index 57ac975ea..eaaa61741 100644 --- a/test/test_hpu/test_quant_fp8.py +++ b/test/test_hpu/test_quant_fp8.py @@ -6,19 +6,18 @@ from auto_round import AutoRound - MODEL_LIST = ( "Qwen/Qwen3-0.6B-FP8", "Qwen/Qwen3-0.6B", ) + class TestAutoRound: save_dir = "./saved" def check_nan_inf_in_tensor(self, tensor, name=""): return torch.isnan(tensor).any() or torch.isinf(tensor).any() - @pytest.mark.parametrize("model_name", MODEL_LIST) def test_small_model_rtn_generation(self, model_name): ar = AutoRound(model_name, iters=0, scheme="FP8_STATIC", nsamples=16) @@ -28,9 +27,9 @@ def test_small_model_rtn_generation(self, model_name): for name, module in model.named_modules(): if "FP8QLinear" in type(module).__name__: assert module.weight.dtype == torch.float8_e4m3fn, f"{name} is not in FP8" - assert not self.check_nan_inf_in_tensor(module.weight.to(torch.float32)), ( - f"{name} has NaN or Inf in weights" - ) + assert not self.check_nan_inf_in_tensor( + module.weight.to(torch.float32) + ), f"{name} has NaN or Inf in weights" fp8_linear_count += 1 assert fp8_linear_count > 0, "No FP8 linear layer found in the quantized model" shutil.rmtree(self.save_dir, ignore_errors=True) From 73f93b2c05256f450e71799a55fe90fee6498be9 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 13 Feb 2026 02:24:12 +0000 Subject: [PATCH 07/11] patch fp8 quantized for hpu Signed-off-by: yiliu30 --- auto_round/modeling/__init__.py | 1 + auto_round/modeling/finegrained_fp8_patch.py | 282 +++++++++++++++++++ auto_round/modeling/hpu_patch.py | 29 ++ auto_round/utils/device.py | 50 +++- auto_round/utils/model.py | 3 +- 5 files changed, 363 insertions(+), 2 deletions(-) create mode 100644 auto_round/modeling/finegrained_fp8_patch.py create mode 100644 auto_round/modeling/hpu_patch.py diff --git a/auto_round/modeling/__init__.py b/auto_round/modeling/__init__.py index d1bc25269..ece2d5cd7 100644 --- a/auto_round/modeling/__init__.py +++ b/auto_round/modeling/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. 
from .fp8_quant import * +from .hpu_patch import * \ No newline at end of file diff --git a/auto_round/modeling/finegrained_fp8_patch.py b/auto_round/modeling/finegrained_fp8_patch.py new file mode 100644 index 000000000..7c2a16644 --- /dev/null +++ b/auto_round/modeling/finegrained_fp8_patch.py @@ -0,0 +1,282 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Copied from https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/finegrained_fp8.py +from transformers.core_model_loading import ConversionOps +from transformers.quantizers.quantizers_utils import should_convert_module +from transformers.utils import is_kernels_available, is_torch_accelerator_available, is_torch_available, logging + + +if is_torch_available(): + import torch + import torch.nn as nn + # import triton + # import triton.language as tl + from torch.nn import functional as F + + +logger = logging.get_logger(__name__) + + + +_FP8_DTYPE = torch.float8_e4m3fn +_FP8_MIN = torch.finfo(_FP8_DTYPE).min +_FP8_MAX = torch.finfo(_FP8_DTYPE).max + + + +class FP8Linear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = False, + dtype=torch.float8_e4m3fn, + block_size: tuple[int, int] | None = None, + activation_scheme="dynamic", + ): + super().__init__(in_features, out_features) + + # If block size is None, it means that we are doing per-tensor quantization + self.block_size = block_size + self.activation_scheme = activation_scheme + + self.weight = torch.nn.Parameter(torch.empty(out_features, in_features, dtype=dtype)) + + if self.block_size is None: + self.weight_scale_inv = nn.Parameter(torch.tensor(1.0, dtype=torch.float32)) + else: + scale_out_features = (out_features + self.block_size[0] - 1) // self.block_size[0] + scale_in_features = (in_features + self.block_size[1] - 1) // self.block_size[1] + self.weight_scale_inv = nn.Parameter( + torch.empty(scale_out_features, scale_in_features, dtype=torch.float32) + ) + + if self.activation_scheme == "static": + self.activation_scale = nn.Parameter(torch.tensor(1.0, dtype=torch.float32)) + + if bias: + self.bias = nn.Parameter(torch.empty(self.out_features)) + else: + self.register_parameter("bias", None) + + # def forward(self, input: torch.Tensor) -> torch.Tensor: + # if self.weight.element_size() > 1: + # return F.linear(input, self.weight, self.bias) + # else: + # if isinstance(self.weight, torch.distributed.tensor.DTensor): + # weight = self.weight._local_tensor.contiguous() + # scale_inv = self.weight_scale_inv._local_tensor.contiguous() + # else: + # weight = self.weight.contiguous() + # scale_inv = self.weight_scale_inv.contiguous() + # # Context manager used to switch among the available accelerators + # device_type = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda" + # torch_accelerator_module = getattr(torch, device_type, torch.cuda) + # with torch_accelerator_module.device(input.device): + # if self.activation_scheme == 
"dynamic": + # qinput, scale = act_quant(input, self.block_size[1]) + # elif self.activation_scheme == "static": + # scale = self.activation_scale.to(torch.float32) + # qinput = (input / scale).clamp(min=_FP8_MIN, max=_FP8_MAX).to(torch.float8_e4m3fn) + + # else: + # raise NotImplementedError("Not supported") + + # output = w8a8_block_fp8_matmul( + # qinput, + # weight, + # scale, + # scale_inv, + # self.block_size, + # output_dtype=input.dtype, + # ) + + # # Blocks the CPU until all accelerator operations on the specified device are complete. It is used to ensure that the results of the + # # preceding operations are ready before proceeding + # torch_accelerator_module.synchronize() + # if self.bias is not None: + # output = output + self.bias + + # return output.to(dtype=input.dtype) + + +def _ceil_div(a, b): + return (a + b - 1) // b + + + +def replace_with_fp8_linear( + model, modules_to_not_convert: list[str] | None = None, quantization_config=None, pre_quantized=False +): + """ + A helper function to replace all `torch.nn.Linear` modules by `FP8Linear` modules. + + Parameters: + model (`torch.nn.Module`): + Input model or `torch.nn.Module` as the function is run recursively. + modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`): + Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons. + quantization_config (`FbgemmFp8Config`): + The quantization config object that contains the quantization parameters. + pre_quantized (`book`, defaults to `False`): + Whether the model is pre-quantized or not + """ + + if quantization_config.dequantize: + return model + + has_been_replaced = False + for module_name, module in model.named_modules(): + if not should_convert_module(module_name, modules_to_not_convert): + continue + # we need this to correctly materialize the weights during quantization + module_kwargs = {} if pre_quantized else {"dtype": None} + new_module = None + with torch.device("meta"): + if isinstance(module, nn.Linear): + new_module = FP8Linear( + in_features=module.in_features, + out_features=module.out_features, + bias=module.bias is not None, + activation_scheme=quantization_config.activation_scheme, + block_size=quantization_config.weight_block_size, + **module_kwargs, + ) + if new_module is not None: + model.set_submodule(module_name, new_module) + has_been_replaced = True + + if not has_been_replaced: + logger.warning( + "You are loading your model using fp8 but no linear modules were found in your model." + " Please double check your model architecture." + ) + return model + + +class Fp8Quantize(ConversionOps): + """ + A quantization operation that creates two tensors, weight and scale out of a weight. 
+ """ + + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert(self, input_dict: torch.Tensor, **kwargs) -> dict[str, torch.Tensor]: + # Unpack single key/value (value may be wrapped in a list) + target_keys, value = tuple(input_dict.items())[0] + value = value[0] + + # Resolve block size (support dict-like or attr-like quant_config) + block_size = None + if self.hf_quantizer.quantization_config is not None: + if isinstance(self.hf_quantizer.quantization_config, dict): + block_size = self.hf_quantizer.quantization_config.get("weight_block_size") + else: + block_size = getattr(self.hf_quantizer.quantization_config, "weight_block_size", None) + if block_size is None: + block_size = (value.shape[-2], value.shape[-1]) + + block_m, block_n = block_size + rows, cols = value.shape[-2], value.shape[-1] + + # Enforce exact tiling like your original + if rows % block_m != 0 or cols % block_n != 0: + raise ValueError( + f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_m}, {block_n}). for {target_keys}" + ) + + # Leading dims can be empty (2D) or include num_experts/... (3D+) + leading_shape = value.shape[:-2] + rows_tiles = rows // block_m + cols_tiles = cols // block_n + + original_shape = value.shape + value_fp32 = value.to(torch.float32) + + # Reshape to (..., rows_tiles, block_m, cols_tiles, block_n) + reshaped = value_fp32.reshape(*leading_shape, rows_tiles, block_m, cols_tiles, block_n) + + # Per-tile max-abs over the block dims + # dims: block_m is at -3, block_n is at -1 after the reshape + max_abs = reshaped.abs().amax(dim=(-3, -1)) + safe_max_abs = torch.where(max_abs > 0, max_abs, torch.ones_like(max_abs)) + + # Tile scale (we store inverse scale like your Linear: weight_scale_inv) + scales = _FP8_MAX / safe_max_abs + scales = torch.where(max_abs > 0, scales, torch.ones_like(scales)) # keep zeros stable + + # Broadcast scales back over the block dims and quantize + # max_abs/scales shape: (..., rows_tiles, cols_tiles) + scales_broadcast = scales.unsqueeze(-1).unsqueeze(-3) # -> (..., rows_tiles, 1, cols_tiles, 1) + scaled = reshaped * scales_broadcast + + quantized = torch.clamp(scaled, min=_FP8_MIN, max=_FP8_MAX).to(_FP8_DTYPE) + + quantized = quantized.reshape(original_shape) + + inv_scales = (1.0 / scales).to(torch.float32) # shape: (*leading, rows_tiles, cols_tiles) + if target_keys.endswith("weight"): + scale_key = target_keys.rsplit(".", 1)[0] + ".weight_scale_inv" + else: + scale_key = target_keys + "_scale_inv" + + # Return both quantized weights and per-tile inverse scales (keeps leading dims, e.g., num_experts) + return { + target_keys: quantized, + scale_key: inv_scales, + } + + +class Fp8Dequantize(ConversionOps): + """Inverse operation of :class:`Fp8Quantize`. 
Takes a pair (weight, scale) and reconstructs the fp32 tensor.""" + + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert( + self, + input_dict: dict[str, torch.Tensor], + full_layer_name: str | None = None, + **kwargs, + ) -> dict[str, torch.Tensor]: + if len(input_dict) < 2: + # case where we only got weights, need to check for "weight$" + return {full_layer_name: input_dict["weight$"]} + + quantized = input_dict["weight$"][0] + scales = input_dict["weight_scale_inv"][0] + + rows, cols = quantized.shape[-2:] + block_size = self.hf_quantizer.quantization_config.weight_block_size + if block_size is None: + block_size = (quantized.shape[-2], quantized.shape[-1]) + + block_m, block_n = block_size + + if rows % block_m != 0 or cols % block_n != 0: + raise ValueError( + f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_m}, {block_n})." + ) + quantized = quantized.to(scales.dtype) + reshaped = quantized.reshape(-1, rows // block_m, block_m, cols // block_n, block_n) + expanded_scales = scales.reshape(-1, rows // block_m, cols // block_n) + expanded_scales = expanded_scales.unsqueeze(-1).unsqueeze(2) + dequantized = reshaped * expanded_scales + + return { + full_layer_name: dequantized.reshape(quantized.shape), + } + + diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py new file mode 100644 index 000000000..5639c0b6b --- /dev/null +++ b/auto_round/modeling/hpu_patch.py @@ -0,0 +1,29 @@ + +from auto_round.logger import logger + +def patch_finegrained_fp8(): + """Use importlib to replace transformers.integrations.finegrained_fp8 with auto-round's HPU-compatible version.""" + try: + from auto_round.utils.hpu_utils import is_hpu_available + + if not is_hpu_available(): + return # No patching needed on non-HPU devices + + import importlib + import sys + + # Import auto-round's HPU-compatible finegrained_fp8_patch module + finegrained_fp8_patch = importlib.import_module('auto_round.modeling.finegrained_fp8_patch') + + # Replace transformers.integrations.finegrained_fp8 in sys.modules + sys.modules['transformers.integrations.finegrained_fp8'] = finegrained_fp8_patch + + logger.info("✓ Replaced transformers.integrations.finegrained_fp8 with auto_round.modeling.finegrained_fp8_patch") + + except Exception as e: + import warnings + logger.warning(f"Failed to patch finegrained_fp8: {e}") + + +# Apply patch on import if HPU is available +patch_finegrained_fp8() \ No newline at end of file diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 0f19454b9..7d1d09177 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -24,7 +24,7 @@ import cpuinfo import psutil import torch - +import sys from auto_round.logger import logger from auto_round.utils.model import check_to_quantized, get_block_names, get_layer_features, get_module @@ -358,6 +358,54 @@ def __exit__(self, exc_type, exc, exc_tb): return False + +class fake_triton_for_hpu(ContextDecorator): + """Context manager/decorator to fake triton availability for HPU devices.""" + + def __init__(self): + self._orig_triton = None + self._orig_triton_language = None + self._had_triton = False + self._had_triton_language = False + + def __enter__(self): + if is_hpex_available(): + # Save original state + self._had_triton = 'triton' in sys.modules + self._had_triton_language = 'triton.language' in sys.modules + + if self._had_triton: + self._orig_triton = sys.modules['triton'] + if self._had_triton_language: + self._orig_triton_language 
= sys.modules['triton.language'] + + # Create and inject fake triton module + class FakeTriton: + def __getattr__(self, name): + return None + + fake_triton = FakeTriton() + fake_triton.jit = lambda func: func # Make triton.jit a no-op decorator + sys.modules['triton'] = fake_triton + sys.modules['triton.language'] = FakeTriton() + return self + + def __exit__(self, exc_type, exc, exc_tb): + if is_hpex_available(): + # Restore original state + if self._had_triton and self._orig_triton is not None: + sys.modules['triton'] = self._orig_triton + elif not self._had_triton and 'triton' in sys.modules: + del sys.modules['triton'] + + if self._had_triton_language and self._orig_triton_language is not None: + sys.modules['triton.language'] = self._orig_triton_language + elif not self._had_triton_language and 'triton.language' in sys.modules: + del sys.modules['triton.language'] + return False + + + def get_packing_device(device: str | torch.device | None = "auto") -> torch.device: """ Selects the packing device. diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index d1f515b04..f6efaa364 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -274,6 +274,7 @@ def llm_load_model( _use_hpu_compile_mode, fake_cuda_for_hpu, get_device_and_parallelism, + fake_triton_for_hpu, is_hpex_available, override_cuda_device_capability, ) @@ -293,7 +294,7 @@ def llm_load_model( if is_hpex_available(): # For loading FP8 model on HPU - with fake_cuda_for_hpu(), override_cuda_device_capability(): + with fake_cuda_for_hpu(), fake_triton_for_hpu(), override_cuda_device_capability(): model = model_cls.from_pretrained( pretrained_model_name_or_path, torch_dtype=torch_dtype, From 581c993b6000d3bf59467a595abf298688a3762f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Feb 2026 02:45:53 +0000 Subject: [PATCH 08/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/modeling/__init__.py | 2 +- auto_round/modeling/finegrained_fp8_patch.py | 7 +--- auto_round/modeling/hpu_patch.py | 26 ++++++++----- auto_round/utils/device.py | 39 ++++++++++---------- auto_round/utils/model.py | 2 +- 5 files changed, 38 insertions(+), 38 deletions(-) diff --git a/auto_round/modeling/__init__.py b/auto_round/modeling/__init__.py index ece2d5cd7..144e78ca3 100644 --- a/auto_round/modeling/__init__.py +++ b/auto_round/modeling/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. 
from .fp8_quant import * -from .hpu_patch import * \ No newline at end of file +from .hpu_patch import * diff --git a/auto_round/modeling/finegrained_fp8_patch.py b/auto_round/modeling/finegrained_fp8_patch.py index 7c2a16644..536a5dd7f 100644 --- a/auto_round/modeling/finegrained_fp8_patch.py +++ b/auto_round/modeling/finegrained_fp8_patch.py @@ -16,10 +16,10 @@ from transformers.quantizers.quantizers_utils import should_convert_module from transformers.utils import is_kernels_available, is_torch_accelerator_available, is_torch_available, logging - if is_torch_available(): import torch import torch.nn as nn + # import triton # import triton.language as tl from torch.nn import functional as F @@ -28,13 +28,11 @@ logger = logging.get_logger(__name__) - _FP8_DTYPE = torch.float8_e4m3fn _FP8_MIN = torch.finfo(_FP8_DTYPE).min _FP8_MAX = torch.finfo(_FP8_DTYPE).max - class FP8Linear(nn.Linear): def __init__( self, @@ -115,7 +113,6 @@ def _ceil_div(a, b): return (a + b - 1) // b - def replace_with_fp8_linear( model, modules_to_not_convert: list[str] | None = None, quantization_config=None, pre_quantized=False ): @@ -278,5 +275,3 @@ def convert( return { full_layer_name: dequantized.reshape(quantized.shape), } - - diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py index 5639c0b6b..e69ba268d 100644 --- a/auto_round/modeling/hpu_patch.py +++ b/auto_round/modeling/hpu_patch.py @@ -1,29 +1,35 @@ +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 from auto_round.logger import logger + def patch_finegrained_fp8(): """Use importlib to replace transformers.integrations.finegrained_fp8 with auto-round's HPU-compatible version.""" try: from auto_round.utils.hpu_utils import is_hpu_available - + if not is_hpu_available(): return # No patching needed on non-HPU devices - + import importlib import sys - + # Import auto-round's HPU-compatible finegrained_fp8_patch module - finegrained_fp8_patch = importlib.import_module('auto_round.modeling.finegrained_fp8_patch') - + finegrained_fp8_patch = importlib.import_module("auto_round.modeling.finegrained_fp8_patch") + # Replace transformers.integrations.finegrained_fp8 in sys.modules - sys.modules['transformers.integrations.finegrained_fp8'] = finegrained_fp8_patch - - logger.info("✓ Replaced transformers.integrations.finegrained_fp8 with auto_round.modeling.finegrained_fp8_patch") - + sys.modules["transformers.integrations.finegrained_fp8"] = finegrained_fp8_patch + + logger.info( + "✓ Replaced transformers.integrations.finegrained_fp8 with auto_round.modeling.finegrained_fp8_patch" + ) + except Exception as e: import warnings + logger.warning(f"Failed to patch finegrained_fp8: {e}") # Apply patch on import if HPU is available -patch_finegrained_fp8() \ No newline at end of file +patch_finegrained_fp8() diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 7d1d09177..e9a11923d 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -15,6 +15,7 @@ import gc import os import re +import sys from contextlib import ContextDecorator, contextmanager from functools import lru_cache from itertools import combinations @@ -24,7 +25,7 @@ import cpuinfo import psutil import torch -import sys + from auto_round.logger import logger from auto_round.utils.model import check_to_quantized, get_block_names, get_layer_features, get_module @@ -358,10 +359,9 @@ def __exit__(self, exc_type, exc, exc_tb): return False - class fake_triton_for_hpu(ContextDecorator): """Context 
manager/decorator to fake triton availability for HPU devices.""" - + def __init__(self): self._orig_triton = None self._orig_triton_language = None @@ -371,41 +371,40 @@ def __init__(self): def __enter__(self): if is_hpex_available(): # Save original state - self._had_triton = 'triton' in sys.modules - self._had_triton_language = 'triton.language' in sys.modules - + self._had_triton = "triton" in sys.modules + self._had_triton_language = "triton.language" in sys.modules + if self._had_triton: - self._orig_triton = sys.modules['triton'] + self._orig_triton = sys.modules["triton"] if self._had_triton_language: - self._orig_triton_language = sys.modules['triton.language'] - + self._orig_triton_language = sys.modules["triton.language"] + # Create and inject fake triton module class FakeTriton: def __getattr__(self, name): return None - + fake_triton = FakeTriton() fake_triton.jit = lambda func: func # Make triton.jit a no-op decorator - sys.modules['triton'] = fake_triton - sys.modules['triton.language'] = FakeTriton() + sys.modules["triton"] = fake_triton + sys.modules["triton.language"] = FakeTriton() return self def __exit__(self, exc_type, exc, exc_tb): if is_hpex_available(): # Restore original state if self._had_triton and self._orig_triton is not None: - sys.modules['triton'] = self._orig_triton - elif not self._had_triton and 'triton' in sys.modules: - del sys.modules['triton'] - + sys.modules["triton"] = self._orig_triton + elif not self._had_triton and "triton" in sys.modules: + del sys.modules["triton"] + if self._had_triton_language and self._orig_triton_language is not None: - sys.modules['triton.language'] = self._orig_triton_language - elif not self._had_triton_language and 'triton.language' in sys.modules: - del sys.modules['triton.language'] + sys.modules["triton.language"] = self._orig_triton_language + elif not self._had_triton_language and "triton.language" in sys.modules: + del sys.modules["triton.language"] return False - def get_packing_device(device: str | torch.device | None = "auto") -> torch.device: """ Selects the packing device. 
diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f6efaa364..1442512c9 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -273,8 +273,8 @@ def llm_load_model( from auto_round.utils.device import ( _use_hpu_compile_mode, fake_cuda_for_hpu, - get_device_and_parallelism, fake_triton_for_hpu, + get_device_and_parallelism, is_hpex_available, override_cuda_device_capability, ) From 84c1f1ecd0b66de87595168d178b9e49eb348941 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 13 Feb 2026 03:08:51 +0000 Subject: [PATCH 09/11] fix format Signed-off-by: yiliu30 --- auto_round/modeling/finegrained_fp8_patch.py | 49 +++----------------- 1 file changed, 6 insertions(+), 43 deletions(-) diff --git a/auto_round/modeling/finegrained_fp8_patch.py b/auto_round/modeling/finegrained_fp8_patch.py index 536a5dd7f..5c7cdee96 100644 --- a/auto_round/modeling/finegrained_fp8_patch.py +++ b/auto_round/modeling/finegrained_fp8_patch.py @@ -68,47 +68,6 @@ def __init__( else: self.register_parameter("bias", None) - # def forward(self, input: torch.Tensor) -> torch.Tensor: - # if self.weight.element_size() > 1: - # return F.linear(input, self.weight, self.bias) - # else: - # if isinstance(self.weight, torch.distributed.tensor.DTensor): - # weight = self.weight._local_tensor.contiguous() - # scale_inv = self.weight_scale_inv._local_tensor.contiguous() - # else: - # weight = self.weight.contiguous() - # scale_inv = self.weight_scale_inv.contiguous() - # # Context manager used to switch among the available accelerators - # device_type = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda" - # torch_accelerator_module = getattr(torch, device_type, torch.cuda) - # with torch_accelerator_module.device(input.device): - # if self.activation_scheme == "dynamic": - # qinput, scale = act_quant(input, self.block_size[1]) - # elif self.activation_scheme == "static": - # scale = self.activation_scale.to(torch.float32) - # qinput = (input / scale).clamp(min=_FP8_MIN, max=_FP8_MAX).to(torch.float8_e4m3fn) - - # else: - # raise NotImplementedError("Not supported") - - # output = w8a8_block_fp8_matmul( - # qinput, - # weight, - # scale, - # scale_inv, - # self.block_size, - # output_dtype=input.dtype, - # ) - - # # Blocks the CPU until all accelerator operations on the specified device are complete. It is used to ensure that the results of the - # # preceding operations are ready before proceeding - # torch_accelerator_module.synchronize() - # if self.bias is not None: - # output = output + self.bias - - # return output.to(dtype=input.dtype) - - def _ceil_div(a, b): return (a + b - 1) // b @@ -123,7 +82,8 @@ def replace_with_fp8_linear( model (`torch.nn.Module`): Input model or `torch.nn.Module` as the function is run recursively. modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`): - Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons. + Names of the modules to not convert. In practice we keep the `lm_head` + in full precision for numerical stability reasons. quantization_config (`FbgemmFp8Config`): The quantization config object that contains the quantization parameters. 
pre_quantized (`book`, defaults to `False`): @@ -191,7 +151,10 @@ def convert(self, input_dict: torch.Tensor, **kwargs) -> dict[str, torch.Tensor] # Enforce exact tiling like your original if rows % block_m != 0 or cols % block_n != 0: raise ValueError( - f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_m}, {block_n}). for {target_keys}" + ( + f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes" + f" ({block_m}, {block_n}). for {target_keys}" + ) ) # Leading dims can be empty (2D) or include num_experts/... (3D+) From 906353d51c47f4760de1e9ebd64f719b19f6667e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Feb 2026 03:12:37 +0000 Subject: [PATCH 10/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/modeling/finegrained_fp8_patch.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/auto_round/modeling/finegrained_fp8_patch.py b/auto_round/modeling/finegrained_fp8_patch.py index 5c7cdee96..06d85835b 100644 --- a/auto_round/modeling/finegrained_fp8_patch.py +++ b/auto_round/modeling/finegrained_fp8_patch.py @@ -68,6 +68,7 @@ def __init__( else: self.register_parameter("bias", None) + def _ceil_div(a, b): return (a + b - 1) // b @@ -152,8 +153,8 @@ def convert(self, input_dict: torch.Tensor, **kwargs) -> dict[str, torch.Tensor] if rows % block_m != 0 or cols % block_n != 0: raise ValueError( ( - f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes" - f" ({block_m}, {block_n}). for {target_keys}" + f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes" + f" ({block_m}, {block_n}). for {target_keys}" ) ) From 5650385dd9f9ed9f5df5572d4b36c5fc67a6a6a2 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 13 Feb 2026 04:52:19 +0000 Subject: [PATCH 11/11] fix Signed-off-by: yiliu30 --- auto_round/modeling/hpu_patch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py index e69ba268d..521caec4b 100644 --- a/auto_round/modeling/hpu_patch.py +++ b/auto_round/modeling/hpu_patch.py @@ -7,9 +7,9 @@ def patch_finegrained_fp8(): """Use importlib to replace transformers.integrations.finegrained_fp8 with auto-round's HPU-compatible version.""" try: - from auto_round.utils.hpu_utils import is_hpu_available + from auto_round.utils import is_hpex_available - if not is_hpu_available(): + if not is_hpex_available(): return # No patching needed on non-HPU devices import importlib
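
Taken together, the series makes the transformers FP8 loading path usable on Gaudi by temporarily pretending CUDA is available (fake_cuda_for_hpu), stubbing out triton in sys.modules (fake_triton_for_hpu), and swapping transformers.integrations.finegrained_fp8 for an HPU-compatible copy (patch_finegrained_fp8). The standalone sketch below illustrates the two underlying mechanisms, monkeypatching a torch query inside a ContextDecorator and installing a stub module in sys.modules, using only torch and the standard library. The names fake_cuda, install_stub_module, fake_triton_stub and add are illustrative and not part of the patch, and unlike the real helpers this sketch does not gate on is_hpex_available().

# Illustrative sketch only; mirrors the pattern introduced by this series.
import sys
import types
from contextlib import ContextDecorator

import torch


class fake_cuda(ContextDecorator):
    """Temporarily report CUDA as available; restore the real check on exit."""

    def __enter__(self):
        self._orig_is_available = torch.cuda.is_available
        torch.cuda.is_available = lambda: True
        return self

    def __exit__(self, exc_type, exc, exc_tb):
        torch.cuda.is_available = self._orig_is_available
        return False


def install_stub_module(name):
    """Register a stub under `name` in sys.modules so later imports resolve to it."""
    stub = types.ModuleType(name)
    stub.jit = lambda func: func  # stand-in decorator that is a no-op
    sys.modules[name] = stub
    return stub


if __name__ == "__main__":
    with fake_cuda():
        print(torch.cuda.is_available())  # prints True inside the context
    print(torch.cuda.is_available())  # the original answer is restored here

    install_stub_module("fake_triton_stub")  # hypothetical module name
    import fake_triton_stub

    @fake_triton_stub.jit
    def add(a, b):
        return a + b

    print(add(1, 2))  # 3: the stubbed decorator left the function untouched

As in the diffs above, all restoration happens in __exit__, which runs even when the body of the with block raises, so the real torch.cuda.is_available (and, for the triton helper in the patch, any previously imported triton module) comes back after model loading regardless of success or failure.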