diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index de9ea903f..0e824e609 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -456,7 +456,7 @@ def fp8_static_scheme_checker(
 
 BackendInfos["auto_round_kernel_xpu"] = BackendInfo(
     device=["xpu"],
-    sym=[True],
+    sym=[True, False],
     packing_format=GPTQ_FORMAT_NO_ZP,
     bits=[4, 8],
     group_size=None,
@@ -486,7 +486,7 @@ def fp8_static_scheme_checker(
 
 BackendInfos["auto_round_kernel_zp_xpu"] = BackendInfo(
     device=["xpu"],
-    sym=[True],
+    sym=[True, False],
     packing_format=GPTQ_FORMAT,
     bits=[4, 8],
     group_size=None,
@@ -500,7 +500,7 @@ def fp8_static_scheme_checker(
 )
 
 BackendInfos["auto_round_kernel_awq"] = BackendInfo(
-    device=["cpu"],
+    device=["cpu", "xpu"],
     sym=[True, False],
     packing_format=AWQ_FORMAT,
     bits=[4],
@@ -514,21 +514,6 @@ def fp8_static_scheme_checker(
     requirements=["torch>=2.8.0", "auto_round_kernel"],
 )
 
-BackendInfos["auto_round_kernel_awq_xpu"] = BackendInfo(
-    device=["xpu"],
-    sym=[True],
-    packing_format=AWQ_FORMAT,
-    bits=[4],
-    group_size=None,
-    priority=6,
-    checkers=[ark_feature_checker],
-    alias=["ark"],
-    compute_dtype=["float32", "float16"],
-    data_type=["int"],
-    act_bits=WOQ_DEFAULT_ACT_BITS,
-    requirements=["torch>=2.8.0", "auto_round_kernel"],
-)
-
 BackendInfos["ipex_gptq_cpu"] = BackendInfo(
     device=["cpu"],
     sym=[True, False],
diff --git a/auto_round_extension/ark/qlinear.py b/auto_round_extension/ark/qlinear.py
index 1e34ad4f6..a02f1643c 100644
--- a/auto_round_extension/ark/qlinear.py
+++ b/auto_round_extension/ark/qlinear.py
@@ -116,8 +116,7 @@ def post_init(self):
             raise NotImplementedError(
                 f"Device type {self.qweight.device.type} is not supported. Only CPU and XPU devices are supported."
             )
-        if self.qweight.device.type != "cpu" and self.asym:
-            raise NotImplementedError("Asymmetric quantization is only supported on CPU devices")
+
         if "awq" in self.QUANT_TYPE:
             intweight, zeros = unpack_awq(
                 self.qweight, self.qzeros, self.bits
diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py
index 361f1bdf9..5fe1d09e3 100644
--- a/test/test_ark/test_model.py
+++ b/test/test_ark/test_model.py
@@ -29,8 +29,6 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t
         limit = 1000
         if not torch.xpu.is_available():
             pytest.skip("No XPU device")
-        if sym is False:
-            pytest.skip("No asym support for XPU")
         model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto")
         tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         if fast_cfg:
@@ -56,9 +54,9 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t
         shutil.rmtree(self.save_folder, ignore_errors=True)
 
     @pytest.mark.parametrize("format", ["auto_round", "auto_round:gptqmodel"])
-    @pytest.mark.parametrize("bits, group_size, sym", [(4, 128, True), (8, 128, True)])
+    @pytest.mark.parametrize("bits, group_size, sym", [(4, 128, False), (8, 128, True)])
     @pytest.mark.parametrize("dtype", [torch.bfloat16])
-    @pytest.mark.parametrize("device", ["cpu", "xpu"])
+    @pytest.mark.parametrize("device", ["xpu", "cpu"])
     def test_formats(self, format, bits, group_size, sym, dtype, device):
         self.main_op(format, bits, group_size, sym, dtype, device)
 
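
How to verify (a minimal sketch, not part of the patch): the snippet below quantizes a small model with sym=False and reloads it on XPU, roughly what the updated test_formats parametrization now exercises. The model name, output directory, and the tiny iters/nsamples settings are placeholders, and the AutoRound(...) / quantize_and_save(...) calls follow typical auto_round examples, so they may need adjusting to the installed version.

```python
# Minimal sketch: asymmetric 4-bit quantization, then XPU inference.
# Model name, output dir, and the tiny iters/nsamples are placeholders.
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # any small causal LM works for a smoke test
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# sym=False is the path this patch enables on XPU; iters/nsamples kept tiny for speed.
autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=False, iters=2, nsamples=2)
autoround.quantize_and_save("./tmp_asym_autoround", format="auto_round")

# Reload the quantized checkpoint on XPU; the auto_round_kernel backend should now accept
# the asymmetric scheme instead of raising NotImplementedError in post_init.
qmodel = AutoModelForCausalLM.from_pretrained("./tmp_asym_autoround", dtype="auto", device_map="xpu")
inputs = tokenizer("There is a girl who likes adventure,", return_tensors="pt").to("xpu")
print(tokenizer.decode(qmodel.generate(**inputs, max_new_tokens=32)[0]))
```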