From b5f7489594c5c45a57e803760fc4a918ebe6f17d Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 12 Feb 2026 07:40:16 +0000 Subject: [PATCH 01/11] fake cuda on hpu Signed-off-by: yiliu30 --- auto_round/utils/device.py | 18 ++++++++++++++++++ auto_round/utils/model.py | 18 +++++++++++------- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index a16f441bf..b2a91f510 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -337,6 +337,24 @@ def __exit__(self, exc_type, exc, exc_tb): torch.cuda.get_device_capability = self._orig_func self._orig_func = None return False + +class fake_cuda_for_hpu(ContextDecorator): + """Context manager/decorator to fake CUDA availability for HPU devices.""" + + def __init__(self): + self._orig_is_available = None + + def __enter__(self): + if is_hpex_available(): + self._orig_is_available = torch.cuda.is_available + torch.cuda.is_available = lambda: True + return self + + def __exit__(self, exc_type, exc, exc_tb): + if is_hpex_available() and hasattr(self, "_orig_is_available"): + torch.cuda.is_available = self._orig_is_available + del self._orig_is_available + return False def get_packing_device(device: str | torch.device | None = "auto") -> torch.device: diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 2536c0a93..126ebf4fb 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -274,6 +274,8 @@ def llm_load_model( _use_hpu_compile_mode, get_device_and_parallelism, override_cuda_device_capability, + fake_cuda_for_hpu, + is_hpex_available, ) device_str, use_auto_mapping = get_device_and_parallelism(device) @@ -289,13 +291,15 @@ def llm_load_model( if "deepseek" in pretrained_model_name_or_path.lower() and trust_remote_code: logger.warning("trust_remote_code is enabled by default, please ensure its correctness.") - if _use_hpu_compile_mode(): - model = model_cls.from_pretrained( - pretrained_model_name_or_path, - torch_dtype=torch_dtype, - trust_remote_code=trust_remote_code, - device_map="auto" if use_auto_mapping else None, - ) + if is_hpex_available(): + # For loading FP8 model on HPU + with fake_cuda_for_hpu(), override_cuda_device_capability(): + model = model_cls.from_pretrained( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + device_map="auto" if use_auto_mapping else None, + ) else: try: model = model_cls.from_pretrained( From d8a98495bf417932bd8d2580445007d9f98e0cfd Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 12 Feb 2026 07:41:04 +0000 Subject: [PATCH 02/11] fix Signed-off-by: yiliu30 --- auto_round/compressors/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index e855fd0ce..d6f8af87e 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -486,8 +486,6 @@ def __init__( logger.info(f"using {self.model.dtype} for quantization tuning") # Some helpers - if "hpu" in str(self.device): - self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear") self.batch_dim = None self.infer_bs_coeff = 1 From 6332241eea2e4d41fa51420f7669e4d9499d0921 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 12 Feb 2026 08:08:17 +0000 Subject: [PATCH 03/11] load fp8 model Signed-off-by: yiliu30 --- test/test_hpu/test_load_fp8_model.py | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 test/test_hpu/test_load_fp8_model.py diff 
--git a/test/test_hpu/test_load_fp8_model.py b/test/test_hpu/test_load_fp8_model.py new file mode 100644 index 000000000..5fb584499 --- /dev/null +++ b/test/test_hpu/test_load_fp8_model.py @@ -0,0 +1,30 @@ +import os +import shutil + +import pytest +import torch + +from auto_round import AutoRound + + +class TestAutoRound: + save_dir = "./saved" + + def check_nan_inf_in_tensor(self, tensor, name=""): + return torch.isnan(tensor).any() or torch.isinf(tensor).any() + + def test_small_model_rtn_generation(self): + model_name = "Qwen/Qwen3-0.6B-FP8" + ar = AutoRound(model_name, iters=0, scheme="FP8_STATIC", nsamples=16) + model, folder = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") + # all linears except lm_head should be quantized to FP8 + fp8_linear_count = 0 + for name, module in model.named_modules(): + if "FP8QLinear" in type(module).__name__: + assert module.weight.dtype == torch.float8_e4m3fn, f"{name} is not in FP8" + assert not self.check_nan_inf_in_tensor(module.weight.to(torch.float32)), ( + f"{name} has NaN or Inf in weights" + ) + fp8_linear_count += 1 + assert fp8_linear_count > 0, "No FP8 linear layer found in the quantized model" + shutil.rmtree(self.save_dir, ignore_errors=True) From b6b6222ca57202770b33574e4df67474c5d76434 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 12 Feb 2026 08:11:12 +0000 Subject: [PATCH 04/11] add more ut Signed-off-by: yiliu30 --- test/test_hpu/test_load_fp8_model.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/test_hpu/test_load_fp8_model.py b/test/test_hpu/test_load_fp8_model.py index 5fb584499..57ac975ea 100644 --- a/test/test_hpu/test_load_fp8_model.py +++ b/test/test_hpu/test_load_fp8_model.py @@ -7,14 +7,20 @@ from auto_round import AutoRound +MODEL_LIST = ( + "Qwen/Qwen3-0.6B-FP8", + "Qwen/Qwen3-0.6B", +) + class TestAutoRound: save_dir = "./saved" def check_nan_inf_in_tensor(self, tensor, name=""): return torch.isnan(tensor).any() or torch.isinf(tensor).any() - def test_small_model_rtn_generation(self): - model_name = "Qwen/Qwen3-0.6B-FP8" + + @pytest.mark.parametrize("model_name", MODEL_LIST) + def test_small_model_rtn_generation(self, model_name): ar = AutoRound(model_name, iters=0, scheme="FP8_STATIC", nsamples=16) model, folder = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") # all linears except lm_head should be quantized to FP8 From 0918cb48f6ea96c9bd6c3a14defd7091aa870380 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 12 Feb 2026 08:11:33 +0000 Subject: [PATCH 05/11] add more ut Signed-off-by: yiliu30 --- test/test_hpu/{test_load_fp8_model.py => test_quant_fp8.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/test_hpu/{test_load_fp8_model.py => test_quant_fp8.py} (100%) diff --git a/test/test_hpu/test_load_fp8_model.py b/test/test_hpu/test_quant_fp8.py similarity index 100% rename from test/test_hpu/test_load_fp8_model.py rename to test/test_hpu/test_quant_fp8.py From 251be9471f5153d250201d83918b8f44b13f378e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 08:57:25 +0000 Subject: [PATCH 06/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils/device.py | 5 +++-- auto_round/utils/model.py | 4 ++-- test/test_hpu/test_quant_fp8.py | 9 ++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 
b2a91f510..0f19454b9 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -337,10 +337,11 @@ def __exit__(self, exc_type, exc, exc_tb): torch.cuda.get_device_capability = self._orig_func self._orig_func = None return False - + + class fake_cuda_for_hpu(ContextDecorator): """Context manager/decorator to fake CUDA availability for HPU devices.""" - + def __init__(self): self._orig_is_available = None diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 126ebf4fb..d1f515b04 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -272,10 +272,10 @@ def llm_load_model( from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer from auto_round.utils.device import ( _use_hpu_compile_mode, - get_device_and_parallelism, - override_cuda_device_capability, fake_cuda_for_hpu, + get_device_and_parallelism, is_hpex_available, + override_cuda_device_capability, ) device_str, use_auto_mapping = get_device_and_parallelism(device) diff --git a/test/test_hpu/test_quant_fp8.py b/test/test_hpu/test_quant_fp8.py index 57ac975ea..eaaa61741 100644 --- a/test/test_hpu/test_quant_fp8.py +++ b/test/test_hpu/test_quant_fp8.py @@ -6,19 +6,18 @@ from auto_round import AutoRound - MODEL_LIST = ( "Qwen/Qwen3-0.6B-FP8", "Qwen/Qwen3-0.6B", ) + class TestAutoRound: save_dir = "./saved" def check_nan_inf_in_tensor(self, tensor, name=""): return torch.isnan(tensor).any() or torch.isinf(tensor).any() - @pytest.mark.parametrize("model_name", MODEL_LIST) def test_small_model_rtn_generation(self, model_name): ar = AutoRound(model_name, iters=0, scheme="FP8_STATIC", nsamples=16) @@ -28,9 +27,9 @@ def test_small_model_rtn_generation(self, model_name): for name, module in model.named_modules(): if "FP8QLinear" in type(module).__name__: assert module.weight.dtype == torch.float8_e4m3fn, f"{name} is not in FP8" - assert not self.check_nan_inf_in_tensor(module.weight.to(torch.float32)), ( - f"{name} has NaN or Inf in weights" - ) + assert not self.check_nan_inf_in_tensor( + module.weight.to(torch.float32) + ), f"{name} has NaN or Inf in weights" fp8_linear_count += 1 assert fp8_linear_count > 0, "No FP8 linear layer found in the quantized model" shutil.rmtree(self.save_dir, ignore_errors=True) From 73f93b2c05256f450e71799a55fe90fee6498be9 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 13 Feb 2026 02:24:12 +0000 Subject: [PATCH 07/11] patch fp8 quantized for hpu Signed-off-by: yiliu30 --- auto_round/modeling/__init__.py | 1 + auto_round/modeling/finegrained_fp8_patch.py | 282 +++++++++++++++++++ auto_round/modeling/hpu_patch.py | 29 ++ auto_round/utils/device.py | 50 +++- auto_round/utils/model.py | 3 +- 5 files changed, 363 insertions(+), 2 deletions(-) create mode 100644 auto_round/modeling/finegrained_fp8_patch.py create mode 100644 auto_round/modeling/hpu_patch.py diff --git a/auto_round/modeling/__init__.py b/auto_round/modeling/__init__.py index d1bc25269..ece2d5cd7 100644 --- a/auto_round/modeling/__init__.py +++ b/auto_round/modeling/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. 
from .fp8_quant import * +from .hpu_patch import * \ No newline at end of file diff --git a/auto_round/modeling/finegrained_fp8_patch.py b/auto_round/modeling/finegrained_fp8_patch.py new file mode 100644 index 000000000..7c2a16644 --- /dev/null +++ b/auto_round/modeling/finegrained_fp8_patch.py @@ -0,0 +1,282 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Copied from https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/finegrained_fp8.py +from transformers.core_model_loading import ConversionOps +from transformers.quantizers.quantizers_utils import should_convert_module +from transformers.utils import is_kernels_available, is_torch_accelerator_available, is_torch_available, logging + + +if is_torch_available(): + import torch + import torch.nn as nn + # import triton + # import triton.language as tl + from torch.nn import functional as F + + +logger = logging.get_logger(__name__) + + + +_FP8_DTYPE = torch.float8_e4m3fn +_FP8_MIN = torch.finfo(_FP8_DTYPE).min +_FP8_MAX = torch.finfo(_FP8_DTYPE).max + + + +class FP8Linear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = False, + dtype=torch.float8_e4m3fn, + block_size: tuple[int, int] | None = None, + activation_scheme="dynamic", + ): + super().__init__(in_features, out_features) + + # If block size is None, it means that we are doing per-tensor quantization + self.block_size = block_size + self.activation_scheme = activation_scheme + + self.weight = torch.nn.Parameter(torch.empty(out_features, in_features, dtype=dtype)) + + if self.block_size is None: + self.weight_scale_inv = nn.Parameter(torch.tensor(1.0, dtype=torch.float32)) + else: + scale_out_features = (out_features + self.block_size[0] - 1) // self.block_size[0] + scale_in_features = (in_features + self.block_size[1] - 1) // self.block_size[1] + self.weight_scale_inv = nn.Parameter( + torch.empty(scale_out_features, scale_in_features, dtype=torch.float32) + ) + + if self.activation_scheme == "static": + self.activation_scale = nn.Parameter(torch.tensor(1.0, dtype=torch.float32)) + + if bias: + self.bias = nn.Parameter(torch.empty(self.out_features)) + else: + self.register_parameter("bias", None) + + # def forward(self, input: torch.Tensor) -> torch.Tensor: + # if self.weight.element_size() > 1: + # return F.linear(input, self.weight, self.bias) + # else: + # if isinstance(self.weight, torch.distributed.tensor.DTensor): + # weight = self.weight._local_tensor.contiguous() + # scale_inv = self.weight_scale_inv._local_tensor.contiguous() + # else: + # weight = self.weight.contiguous() + # scale_inv = self.weight_scale_inv.contiguous() + # # Context manager used to switch among the available accelerators + # device_type = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda" + # torch_accelerator_module = getattr(torch, device_type, torch.cuda) + # with torch_accelerator_module.device(input.device): + # if self.activation_scheme == 
"dynamic": + # qinput, scale = act_quant(input, self.block_size[1]) + # elif self.activation_scheme == "static": + # scale = self.activation_scale.to(torch.float32) + # qinput = (input / scale).clamp(min=_FP8_MIN, max=_FP8_MAX).to(torch.float8_e4m3fn) + + # else: + # raise NotImplementedError("Not supported") + + # output = w8a8_block_fp8_matmul( + # qinput, + # weight, + # scale, + # scale_inv, + # self.block_size, + # output_dtype=input.dtype, + # ) + + # # Blocks the CPU until all accelerator operations on the specified device are complete. It is used to ensure that the results of the + # # preceding operations are ready before proceeding + # torch_accelerator_module.synchronize() + # if self.bias is not None: + # output = output + self.bias + + # return output.to(dtype=input.dtype) + + +def _ceil_div(a, b): + return (a + b - 1) // b + + + +def replace_with_fp8_linear( + model, modules_to_not_convert: list[str] | None = None, quantization_config=None, pre_quantized=False +): + """ + A helper function to replace all `torch.nn.Linear` modules by `FP8Linear` modules. + + Parameters: + model (`torch.nn.Module`): + Input model or `torch.nn.Module` as the function is run recursively. + modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`): + Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons. + quantization_config (`FbgemmFp8Config`): + The quantization config object that contains the quantization parameters. + pre_quantized (`book`, defaults to `False`): + Whether the model is pre-quantized or not + """ + + if quantization_config.dequantize: + return model + + has_been_replaced = False + for module_name, module in model.named_modules(): + if not should_convert_module(module_name, modules_to_not_convert): + continue + # we need this to correctly materialize the weights during quantization + module_kwargs = {} if pre_quantized else {"dtype": None} + new_module = None + with torch.device("meta"): + if isinstance(module, nn.Linear): + new_module = FP8Linear( + in_features=module.in_features, + out_features=module.out_features, + bias=module.bias is not None, + activation_scheme=quantization_config.activation_scheme, + block_size=quantization_config.weight_block_size, + **module_kwargs, + ) + if new_module is not None: + model.set_submodule(module_name, new_module) + has_been_replaced = True + + if not has_been_replaced: + logger.warning( + "You are loading your model using fp8 but no linear modules were found in your model." + " Please double check your model architecture." + ) + return model + + +class Fp8Quantize(ConversionOps): + """ + A quantization operation that creates two tensors, weight and scale out of a weight. 
+ """ + + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert(self, input_dict: torch.Tensor, **kwargs) -> dict[str, torch.Tensor]: + # Unpack single key/value (value may be wrapped in a list) + target_keys, value = tuple(input_dict.items())[0] + value = value[0] + + # Resolve block size (support dict-like or attr-like quant_config) + block_size = None + if self.hf_quantizer.quantization_config is not None: + if isinstance(self.hf_quantizer.quantization_config, dict): + block_size = self.hf_quantizer.quantization_config.get("weight_block_size") + else: + block_size = getattr(self.hf_quantizer.quantization_config, "weight_block_size", None) + if block_size is None: + block_size = (value.shape[-2], value.shape[-1]) + + block_m, block_n = block_size + rows, cols = value.shape[-2], value.shape[-1] + + # Enforce exact tiling like your original + if rows % block_m != 0 or cols % block_n != 0: + raise ValueError( + f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_m}, {block_n}). for {target_keys}" + ) + + # Leading dims can be empty (2D) or include num_experts/... (3D+) + leading_shape = value.shape[:-2] + rows_tiles = rows // block_m + cols_tiles = cols // block_n + + original_shape = value.shape + value_fp32 = value.to(torch.float32) + + # Reshape to (..., rows_tiles, block_m, cols_tiles, block_n) + reshaped = value_fp32.reshape(*leading_shape, rows_tiles, block_m, cols_tiles, block_n) + + # Per-tile max-abs over the block dims + # dims: block_m is at -3, block_n is at -1 after the reshape + max_abs = reshaped.abs().amax(dim=(-3, -1)) + safe_max_abs = torch.where(max_abs > 0, max_abs, torch.ones_like(max_abs)) + + # Tile scale (we store inverse scale like your Linear: weight_scale_inv) + scales = _FP8_MAX / safe_max_abs + scales = torch.where(max_abs > 0, scales, torch.ones_like(scales)) # keep zeros stable + + # Broadcast scales back over the block dims and quantize + # max_abs/scales shape: (..., rows_tiles, cols_tiles) + scales_broadcast = scales.unsqueeze(-1).unsqueeze(-3) # -> (..., rows_tiles, 1, cols_tiles, 1) + scaled = reshaped * scales_broadcast + + quantized = torch.clamp(scaled, min=_FP8_MIN, max=_FP8_MAX).to(_FP8_DTYPE) + + quantized = quantized.reshape(original_shape) + + inv_scales = (1.0 / scales).to(torch.float32) # shape: (*leading, rows_tiles, cols_tiles) + if target_keys.endswith("weight"): + scale_key = target_keys.rsplit(".", 1)[0] + ".weight_scale_inv" + else: + scale_key = target_keys + "_scale_inv" + + # Return both quantized weights and per-tile inverse scales (keeps leading dims, e.g., num_experts) + return { + target_keys: quantized, + scale_key: inv_scales, + } + + +class Fp8Dequantize(ConversionOps): + """Inverse operation of :class:`Fp8Quantize`. 
Takes a pair (weight, scale) and reconstructs the fp32 tensor.""" + + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert( + self, + input_dict: dict[str, torch.Tensor], + full_layer_name: str | None = None, + **kwargs, + ) -> dict[str, torch.Tensor]: + if len(input_dict) < 2: + # case where we only got weights, need to check for "weight$" + return {full_layer_name: input_dict["weight$"]} + + quantized = input_dict["weight$"][0] + scales = input_dict["weight_scale_inv"][0] + + rows, cols = quantized.shape[-2:] + block_size = self.hf_quantizer.quantization_config.weight_block_size + if block_size is None: + block_size = (quantized.shape[-2], quantized.shape[-1]) + + block_m, block_n = block_size + + if rows % block_m != 0 or cols % block_n != 0: + raise ValueError( + f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_m}, {block_n})." + ) + quantized = quantized.to(scales.dtype) + reshaped = quantized.reshape(-1, rows // block_m, block_m, cols // block_n, block_n) + expanded_scales = scales.reshape(-1, rows // block_m, cols // block_n) + expanded_scales = expanded_scales.unsqueeze(-1).unsqueeze(2) + dequantized = reshaped * expanded_scales + + return { + full_layer_name: dequantized.reshape(quantized.shape), + } + + diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py new file mode 100644 index 000000000..5639c0b6b --- /dev/null +++ b/auto_round/modeling/hpu_patch.py @@ -0,0 +1,29 @@ + +from auto_round.logger import logger + +def patch_finegrained_fp8(): + """Use importlib to replace transformers.integrations.finegrained_fp8 with auto-round's HPU-compatible version.""" + try: + from auto_round.utils.hpu_utils import is_hpu_available + + if not is_hpu_available(): + return # No patching needed on non-HPU devices + + import importlib + import sys + + # Import auto-round's HPU-compatible finegrained_fp8_patch module + finegrained_fp8_patch = importlib.import_module('auto_round.modeling.finegrained_fp8_patch') + + # Replace transformers.integrations.finegrained_fp8 in sys.modules + sys.modules['transformers.integrations.finegrained_fp8'] = finegrained_fp8_patch + + logger.info("✓ Replaced transformers.integrations.finegrained_fp8 with auto_round.modeling.finegrained_fp8_patch") + + except Exception as e: + import warnings + logger.warning(f"Failed to patch finegrained_fp8: {e}") + + +# Apply patch on import if HPU is available +patch_finegrained_fp8() \ No newline at end of file diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 0f19454b9..7d1d09177 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -24,7 +24,7 @@ import cpuinfo import psutil import torch - +import sys from auto_round.logger import logger from auto_round.utils.model import check_to_quantized, get_block_names, get_layer_features, get_module @@ -358,6 +358,54 @@ def __exit__(self, exc_type, exc, exc_tb): return False + +class fake_triton_for_hpu(ContextDecorator): + """Context manager/decorator to fake triton availability for HPU devices.""" + + def __init__(self): + self._orig_triton = None + self._orig_triton_language = None + self._had_triton = False + self._had_triton_language = False + + def __enter__(self): + if is_hpex_available(): + # Save original state + self._had_triton = 'triton' in sys.modules + self._had_triton_language = 'triton.language' in sys.modules + + if self._had_triton: + self._orig_triton = sys.modules['triton'] + if self._had_triton_language: + self._orig_triton_language 
= sys.modules['triton.language'] + + # Create and inject fake triton module + class FakeTriton: + def __getattr__(self, name): + return None + + fake_triton = FakeTriton() + fake_triton.jit = lambda func: func # Make triton.jit a no-op decorator + sys.modules['triton'] = fake_triton + sys.modules['triton.language'] = FakeTriton() + return self + + def __exit__(self, exc_type, exc, exc_tb): + if is_hpex_available(): + # Restore original state + if self._had_triton and self._orig_triton is not None: + sys.modules['triton'] = self._orig_triton + elif not self._had_triton and 'triton' in sys.modules: + del sys.modules['triton'] + + if self._had_triton_language and self._orig_triton_language is not None: + sys.modules['triton.language'] = self._orig_triton_language + elif not self._had_triton_language and 'triton.language' in sys.modules: + del sys.modules['triton.language'] + return False + + + def get_packing_device(device: str | torch.device | None = "auto") -> torch.device: """ Selects the packing device. diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index d1f515b04..f6efaa364 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -274,6 +274,7 @@ def llm_load_model( _use_hpu_compile_mode, fake_cuda_for_hpu, get_device_and_parallelism, + fake_triton_for_hpu, is_hpex_available, override_cuda_device_capability, ) @@ -293,7 +294,7 @@ def llm_load_model( if is_hpex_available(): # For loading FP8 model on HPU - with fake_cuda_for_hpu(), override_cuda_device_capability(): + with fake_cuda_for_hpu(), fake_triton_for_hpu(), override_cuda_device_capability(): model = model_cls.from_pretrained( pretrained_model_name_or_path, torch_dtype=torch_dtype, From 581c993b6000d3bf59467a595abf298688a3762f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Feb 2026 02:45:53 +0000 Subject: [PATCH 08/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/modeling/__init__.py | 2 +- auto_round/modeling/finegrained_fp8_patch.py | 7 +--- auto_round/modeling/hpu_patch.py | 26 ++++++++----- auto_round/utils/device.py | 39 ++++++++++---------- auto_round/utils/model.py | 2 +- 5 files changed, 38 insertions(+), 38 deletions(-) diff --git a/auto_round/modeling/__init__.py b/auto_round/modeling/__init__.py index ece2d5cd7..144e78ca3 100644 --- a/auto_round/modeling/__init__.py +++ b/auto_round/modeling/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. 
from .fp8_quant import * -from .hpu_patch import * \ No newline at end of file +from .hpu_patch import * diff --git a/auto_round/modeling/finegrained_fp8_patch.py b/auto_round/modeling/finegrained_fp8_patch.py index 7c2a16644..536a5dd7f 100644 --- a/auto_round/modeling/finegrained_fp8_patch.py +++ b/auto_round/modeling/finegrained_fp8_patch.py @@ -16,10 +16,10 @@ from transformers.quantizers.quantizers_utils import should_convert_module from transformers.utils import is_kernels_available, is_torch_accelerator_available, is_torch_available, logging - if is_torch_available(): import torch import torch.nn as nn + # import triton # import triton.language as tl from torch.nn import functional as F @@ -28,13 +28,11 @@ logger = logging.get_logger(__name__) - _FP8_DTYPE = torch.float8_e4m3fn _FP8_MIN = torch.finfo(_FP8_DTYPE).min _FP8_MAX = torch.finfo(_FP8_DTYPE).max - class FP8Linear(nn.Linear): def __init__( self, @@ -115,7 +113,6 @@ def _ceil_div(a, b): return (a + b - 1) // b - def replace_with_fp8_linear( model, modules_to_not_convert: list[str] | None = None, quantization_config=None, pre_quantized=False ): @@ -278,5 +275,3 @@ def convert( return { full_layer_name: dequantized.reshape(quantized.shape), } - - diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py index 5639c0b6b..e69ba268d 100644 --- a/auto_round/modeling/hpu_patch.py +++ b/auto_round/modeling/hpu_patch.py @@ -1,29 +1,35 @@ +# # Copyright (C) 2026 Intel Corporation +# # SPDX-License-Identifier: Apache-2.0 from auto_round.logger import logger + def patch_finegrained_fp8(): """Use importlib to replace transformers.integrations.finegrained_fp8 with auto-round's HPU-compatible version.""" try: from auto_round.utils.hpu_utils import is_hpu_available - + if not is_hpu_available(): return # No patching needed on non-HPU devices - + import importlib import sys - + # Import auto-round's HPU-compatible finegrained_fp8_patch module - finegrained_fp8_patch = importlib.import_module('auto_round.modeling.finegrained_fp8_patch') - + finegrained_fp8_patch = importlib.import_module("auto_round.modeling.finegrained_fp8_patch") + # Replace transformers.integrations.finegrained_fp8 in sys.modules - sys.modules['transformers.integrations.finegrained_fp8'] = finegrained_fp8_patch - - logger.info("✓ Replaced transformers.integrations.finegrained_fp8 with auto_round.modeling.finegrained_fp8_patch") - + sys.modules["transformers.integrations.finegrained_fp8"] = finegrained_fp8_patch + + logger.info( + "✓ Replaced transformers.integrations.finegrained_fp8 with auto_round.modeling.finegrained_fp8_patch" + ) + except Exception as e: import warnings + logger.warning(f"Failed to patch finegrained_fp8: {e}") # Apply patch on import if HPU is available -patch_finegrained_fp8() \ No newline at end of file +patch_finegrained_fp8() diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 7d1d09177..e9a11923d 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -15,6 +15,7 @@ import gc import os import re +import sys from contextlib import ContextDecorator, contextmanager from functools import lru_cache from itertools import combinations @@ -24,7 +25,7 @@ import cpuinfo import psutil import torch -import sys + from auto_round.logger import logger from auto_round.utils.model import check_to_quantized, get_block_names, get_layer_features, get_module @@ -358,10 +359,9 @@ def __exit__(self, exc_type, exc, exc_tb): return False - class fake_triton_for_hpu(ContextDecorator): """Context 
manager/decorator to fake triton availability for HPU devices.""" - + def __init__(self): self._orig_triton = None self._orig_triton_language = None @@ -371,41 +371,40 @@ def __init__(self): def __enter__(self): if is_hpex_available(): # Save original state - self._had_triton = 'triton' in sys.modules - self._had_triton_language = 'triton.language' in sys.modules - + self._had_triton = "triton" in sys.modules + self._had_triton_language = "triton.language" in sys.modules + if self._had_triton: - self._orig_triton = sys.modules['triton'] + self._orig_triton = sys.modules["triton"] if self._had_triton_language: - self._orig_triton_language = sys.modules['triton.language'] - + self._orig_triton_language = sys.modules["triton.language"] + # Create and inject fake triton module class FakeTriton: def __getattr__(self, name): return None - + fake_triton = FakeTriton() fake_triton.jit = lambda func: func # Make triton.jit a no-op decorator - sys.modules['triton'] = fake_triton - sys.modules['triton.language'] = FakeTriton() + sys.modules["triton"] = fake_triton + sys.modules["triton.language"] = FakeTriton() return self def __exit__(self, exc_type, exc, exc_tb): if is_hpex_available(): # Restore original state if self._had_triton and self._orig_triton is not None: - sys.modules['triton'] = self._orig_triton - elif not self._had_triton and 'triton' in sys.modules: - del sys.modules['triton'] - + sys.modules["triton"] = self._orig_triton + elif not self._had_triton and "triton" in sys.modules: + del sys.modules["triton"] + if self._had_triton_language and self._orig_triton_language is not None: - sys.modules['triton.language'] = self._orig_triton_language - elif not self._had_triton_language and 'triton.language' in sys.modules: - del sys.modules['triton.language'] + sys.modules["triton.language"] = self._orig_triton_language + elif not self._had_triton_language and "triton.language" in sys.modules: + del sys.modules["triton.language"] return False - def get_packing_device(device: str | torch.device | None = "auto") -> torch.device: """ Selects the packing device. 
diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f6efaa364..1442512c9 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -273,8 +273,8 @@ def llm_load_model( from auto_round.utils.device import ( _use_hpu_compile_mode, fake_cuda_for_hpu, - get_device_and_parallelism, fake_triton_for_hpu, + get_device_and_parallelism, is_hpex_available, override_cuda_device_capability, ) From 84c1f1ecd0b66de87595168d178b9e49eb348941 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 13 Feb 2026 03:08:51 +0000 Subject: [PATCH 09/11] fix format Signed-off-by: yiliu30 --- auto_round/modeling/finegrained_fp8_patch.py | 49 +++----------------- 1 file changed, 6 insertions(+), 43 deletions(-) diff --git a/auto_round/modeling/finegrained_fp8_patch.py b/auto_round/modeling/finegrained_fp8_patch.py index 536a5dd7f..5c7cdee96 100644 --- a/auto_round/modeling/finegrained_fp8_patch.py +++ b/auto_round/modeling/finegrained_fp8_patch.py @@ -68,47 +68,6 @@ def __init__( else: self.register_parameter("bias", None) - # def forward(self, input: torch.Tensor) -> torch.Tensor: - # if self.weight.element_size() > 1: - # return F.linear(input, self.weight, self.bias) - # else: - # if isinstance(self.weight, torch.distributed.tensor.DTensor): - # weight = self.weight._local_tensor.contiguous() - # scale_inv = self.weight_scale_inv._local_tensor.contiguous() - # else: - # weight = self.weight.contiguous() - # scale_inv = self.weight_scale_inv.contiguous() - # # Context manager used to switch among the available accelerators - # device_type = torch.accelerator.current_accelerator().type if is_torch_accelerator_available() else "cuda" - # torch_accelerator_module = getattr(torch, device_type, torch.cuda) - # with torch_accelerator_module.device(input.device): - # if self.activation_scheme == "dynamic": - # qinput, scale = act_quant(input, self.block_size[1]) - # elif self.activation_scheme == "static": - # scale = self.activation_scale.to(torch.float32) - # qinput = (input / scale).clamp(min=_FP8_MIN, max=_FP8_MAX).to(torch.float8_e4m3fn) - - # else: - # raise NotImplementedError("Not supported") - - # output = w8a8_block_fp8_matmul( - # qinput, - # weight, - # scale, - # scale_inv, - # self.block_size, - # output_dtype=input.dtype, - # ) - - # # Blocks the CPU until all accelerator operations on the specified device are complete. It is used to ensure that the results of the - # # preceding operations are ready before proceeding - # torch_accelerator_module.synchronize() - # if self.bias is not None: - # output = output + self.bias - - # return output.to(dtype=input.dtype) - - def _ceil_div(a, b): return (a + b - 1) // b @@ -123,7 +82,8 @@ def replace_with_fp8_linear( model (`torch.nn.Module`): Input model or `torch.nn.Module` as the function is run recursively. modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`): - Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons. + Names of the modules to not convert. In practice we keep the `lm_head` + in full precision for numerical stability reasons. quantization_config (`FbgemmFp8Config`): The quantization config object that contains the quantization parameters. 
pre_quantized (`book`, defaults to `False`): @@ -191,7 +151,10 @@ def convert(self, input_dict: torch.Tensor, **kwargs) -> dict[str, torch.Tensor] # Enforce exact tiling like your original if rows % block_m != 0 or cols % block_n != 0: raise ValueError( - f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_m}, {block_n}). for {target_keys}" + ( + f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes" + f" ({block_m}, {block_n}). for {target_keys}" + ) ) # Leading dims can be empty (2D) or include num_experts/... (3D+) From 906353d51c47f4760de1e9ebd64f719b19f6667e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Feb 2026 03:12:37 +0000 Subject: [PATCH 10/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/modeling/finegrained_fp8_patch.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/auto_round/modeling/finegrained_fp8_patch.py b/auto_round/modeling/finegrained_fp8_patch.py index 5c7cdee96..06d85835b 100644 --- a/auto_round/modeling/finegrained_fp8_patch.py +++ b/auto_round/modeling/finegrained_fp8_patch.py @@ -68,6 +68,7 @@ def __init__( else: self.register_parameter("bias", None) + def _ceil_div(a, b): return (a + b - 1) // b @@ -152,8 +153,8 @@ def convert(self, input_dict: torch.Tensor, **kwargs) -> dict[str, torch.Tensor] if rows % block_m != 0 or cols % block_n != 0: raise ValueError( ( - f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes" - f" ({block_m}, {block_n}). for {target_keys}" + f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes" + f" ({block_m}, {block_n}). for {target_keys}" ) ) From 5650385dd9f9ed9f5df5572d4b36c5fc67a6a6a2 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 13 Feb 2026 04:52:19 +0000 Subject: [PATCH 11/11] fix Signed-off-by: yiliu30 --- auto_round/modeling/hpu_patch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py index e69ba268d..521caec4b 100644 --- a/auto_round/modeling/hpu_patch.py +++ b/auto_round/modeling/hpu_patch.py @@ -7,9 +7,9 @@ def patch_finegrained_fp8(): """Use importlib to replace transformers.integrations.finegrained_fp8 with auto-round's HPU-compatible version.""" try: - from auto_round.utils.hpu_utils import is_hpu_available + from auto_round.utils import is_hpex_available - if not is_hpu_available(): + if not is_hpex_available(): return # No patching needed on non-HPU devices import importlib
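
Taken together, the series makes the transformers FP8 loading path usable on Gaudi by temporarily pretending CUDA is available (fake_cuda_for_hpu), stubbing out triton in sys.modules (fake_triton_for_hpu), and swapping transformers.integrations.finegrained_fp8 for an HPU-compatible copy (patch_finegrained_fp8). The standalone sketch below illustrates the two underlying mechanisms, monkeypatching a torch query inside a ContextDecorator and installing a stub module in sys.modules, using only torch and the standard library. The names fake_cuda, install_stub_module, fake_triton_stub and add are illustrative and not part of the patch, and unlike the real helpers this sketch does not gate on is_hpex_available().

# Illustrative sketch only; mirrors the pattern introduced by this series.
import sys
import types
from contextlib import ContextDecorator

import torch


class fake_cuda(ContextDecorator):
    """Temporarily report CUDA as available; restore the real check on exit."""

    def __enter__(self):
        self._orig_is_available = torch.cuda.is_available
        torch.cuda.is_available = lambda: True
        return self

    def __exit__(self, exc_type, exc, exc_tb):
        torch.cuda.is_available = self._orig_is_available
        return False


def install_stub_module(name):
    """Register a stub under `name` in sys.modules so later imports resolve to it."""
    stub = types.ModuleType(name)
    stub.jit = lambda func: func  # stand-in decorator that is a no-op
    sys.modules[name] = stub
    return stub


if __name__ == "__main__":
    with fake_cuda():
        print(torch.cuda.is_available())  # prints True inside the context
    print(torch.cuda.is_available())  # the original answer is restored here

    install_stub_module("fake_triton_stub")  # hypothetical module name
    import fake_triton_stub

    @fake_triton_stub.jit
    def add(a, b):
        return a + b

    print(add(1, 2))  # 3: the stubbed decorator left the function untouched

As in the diffs above, all restoration happens in __exit__, which runs even when the body of the with block raises, so the real torch.cuda.is_available (and, for the triton helper in the patch, any previously imported triton module) comes back after model loading regardless of success or failure.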