From ac9dda2aa0399b94826ee953bbde62bf68c0f0fd Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 2 Feb 2026 18:21:52 +0800 Subject: [PATCH 01/15] fix --device_map cuda xpu issue --- auto_round/compressors/base.py | 4 +- test/test_cpu/schemes/test_scheme.py | 294 +++++++++++++++------------ 2 files changed, 164 insertions(+), 134 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 97702db3e..07300f905 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2149,8 +2149,10 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l device = int(device.split(":")[-1]) elif device == "cpu": device = "cpu" + elif isinstance(device, str): + device = 0 else: - raise ValueError(f"Unsupported device {device} in device_map: {self.device_map}") + raise f"Unsupported device {device} in device_map: {self.device_map}" # Use 90% of the reported max memory to leave headroom for activations, # temporary tensors, other processes, and allocator fragmentation, reducing # the chance of runtime OOM while still utilizing most available memory. diff --git a/test/test_cpu/schemes/test_scheme.py b/test/test_cpu/schemes/test_scheme.py index e2b0c15c3..d7ff777c6 100644 --- a/test/test_cpu/schemes/test_scheme.py +++ b/test/test_cpu/schemes/test_scheme.py @@ -18,144 +18,172 @@ def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf(self, tiny_qwen_model_path, dataloader): + # def test_gguf(self, tiny_qwen_model_path, dataloader): + # ar = AutoRound( + # tiny_qwen_model_path, + # scheme="W2A16", + # nsamples=1, + # iters=1, + # seqlen=2, + # dataset=dataloader, + # ) + # ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") + # assert ar.bits == 4 + # shutil.rmtree(self.save_folder, ignore_errors=True) + # + # def test_w4a16(self, tiny_opt_model_path, dataloader): + # ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + # assert ar.bits == 4 + # ar.quantize() + # + # def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): + # ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) + # assert ar.bits == 2 + # ar.quantize() + # + # def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader): + # + # layer_config = { + # "model.layers.0.self_attn.k_proj": {"bits": 16}, + # } + # ar = AutoRound( + # tiny_qwen_moe_model_path, + # scheme="W4A16_MIXED", + # nsamples=1, + # iters=0, + # seqlen=2, + # dataset=dataloader, + # low_cpu_mem_usage=False, + # layer_config=layer_config, + # ) + # ar.quantize() + # assert ar.bits == 4 + # assert ar.model.model.layers[0].self_attn.q_proj.bits == 8 + # assert ar.model.model.layers[0].self_attn.k_proj.bits == 16 + # assert ar.model.model.layers[0].mlp.experts[0].up_proj.bits == 4 + # assert ar.model.model.layers[0].mlp.shared_expert.gate_proj.bits == 8 + # + # shutil.rmtree(self.save_folder, ignore_errors=True) + # + # def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader): + # + # ar = AutoRound( + # tiny_qwen_2_5_vl_model_path, + # scheme="W4A16_MIXED", + # nsamples=1, + # batch_size=1, + # iters=0, + # seqlen=2, + # dataset=dataloader, + # low_cpu_mem_usage=False, + # ) + # ar.quantize() + # assert ar.bits == 4 + # assert ar.model.language_model.layers[0].self_attn.q_proj.bits == 16 + # assert ar.model.visual.blocks[0].attn.qkv.bits == 16 + # shutil.rmtree(self.save_folder, 
ignore_errors=True) + # + # def test_mxfp4(self, tiny_opt_model_path, dataloader): + # ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + # assert ar.bits == 4 + # assert ar.act_bits == 4 + # assert ar.data_type == "mx_fp" + # assert ar.act_data_type == "mx_fp" + # ar.quantize() + # + # def test_vllm(self, tiny_qwen_vl_model_path): + # from auto_round import AutoRoundMLLM + # + # ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) + # assert ar.bits == 2 + # assert ar.act_bits == 16 + # + # def test_nvfp4(self, tiny_opt_model_path, dataloader): + # ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + # assert ar.bits == 4 + # assert ar.act_bits == 4 + # assert ar.data_type == "nv_fp" + # assert ar.act_data_type == "nv_fp4_with_static_gs" + # ar.quantize() + # + # def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): + # import copy + # + # preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] + # for scheme in preset_schemes: + # model_name = tiny_opt_model_path + # if "gguf" in scheme.lower(): + # model_name = tiny_qwen_model_path + # print(f"scheme={scheme}") + # ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) + # ar.quantize_and_save(self.save_folder) + # shutil.rmtree(self.save_folder, ignore_errors=True) + # + # def test_scheme_in_layer_config(self, dataloader): + # model = get_tiny_model(opt_name_or_path, num_layers=5) + # tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) + # layer_config = { + # "model.decoder.layers.2.self_attn": {"bits": 2}, + # "model.decoder.layers.3.self_attn.v_proj": "W8A16", + # "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), + # } + # ar = AutoRound( + # model, + # tokenizer, + # scheme="W3A16", + # nsamples=1, + # iters=1, + # layer_config=layer_config, + # seqlen=2, + # dataset=dataloader, + # ) + # + # ar.quantize() + # for n, m in ar.model.named_modules(): + # if n == "model.decoder.layers.2.self_attn.q_proj": + # assert m.bits == 2 + # if n == "model.decoder.layers.2.self_attn.k_proj": + # assert m.bits == 2 + # if n == "model.decoder.layers.3.self_attn.v_proj": + # assert m.bits == 8 + # if n == "model.decoder.layers.4.self_attn.k_proj": + # assert m.group_size == 64 + # + # def test_parse_available_devices(self): + # from auto_round.utils.device import parse_available_devices + # + # device_list = parse_available_devices("auto") + # assert len(device_list) == 1 and "cpu" in device_list + # device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") + # assert len(device_list) == 3 + # assert device_list == ["cuda:0", "cuda:1", "cpu"] + # device_list = parse_available_devices("0,1") + # assert len(device_list) == 1 and "cpu" in device_list + # + + def test_set_scheme(self, tiny_qwen_model_path): ar = AutoRound( tiny_qwen_model_path, - scheme="W2A16", + scheme="gguf:q2_k_s", + data_type="fp", nsamples=1, - iters=1, - seqlen=2, - dataset=dataloader, - ) - ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - assert ar.bits == 4 - shutil.rmtree(self.save_folder, ignore_errors=True) - - def test_w4a16(self, tiny_opt_model_path, dataloader): - ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - assert ar.bits == 4 - ar.quantize() - - def 
test_w2a16_rtn(self, tiny_opt_model_path, dataloader): - ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) - assert ar.bits == 2 - ar.quantize() - - def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader): - - layer_config = { - "model.layers.0.self_attn.k_proj": {"bits": 16}, - } - ar = AutoRound( - tiny_qwen_moe_model_path, - scheme="W4A16_MIXED", - nsamples=1, - iters=0, - seqlen=2, - dataset=dataloader, - low_cpu_mem_usage=False, - layer_config=layer_config, - ) - ar.quantize() - assert ar.bits == 4 - assert ar.model.model.layers[0].self_attn.q_proj.bits == 8 - assert ar.model.model.layers[0].self_attn.k_proj.bits == 16 - assert ar.model.model.layers[0].mlp.experts[0].up_proj.bits == 4 - assert ar.model.model.layers[0].mlp.shared_expert.gate_proj.bits == 8 - - shutil.rmtree(self.save_folder, ignore_errors=True) - - def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader): - - ar = AutoRound( - tiny_qwen_2_5_vl_model_path, - scheme="W4A16_MIXED", - nsamples=1, - batch_size=1, + disable_opt_rtn=True, iters=0, seqlen=2, - dataset=dataloader, - low_cpu_mem_usage=False, ) ar.quantize() - assert ar.bits == 4 - assert ar.model.language_model.layers[0].self_attn.q_proj.bits == 16 - assert ar.model.visual.blocks[0].attn.qkv.bits == 16 - shutil.rmtree(self.save_folder, ignore_errors=True) - - def test_mxfp4(self, tiny_opt_model_path, dataloader): - ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - assert ar.bits == 4 - assert ar.act_bits == 4 - assert ar.data_type == "mx_fp" - assert ar.act_data_type == "mx_fp" - ar.quantize() - - def test_vllm(self, tiny_qwen_vl_model_path): - from auto_round import AutoRoundMLLM - - ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) - assert ar.bits == 2 - assert ar.act_bits == 16 - - def test_nvfp4(self, tiny_opt_model_path, dataloader): - ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - assert ar.bits == 4 - assert ar.act_bits == 4 - assert ar.data_type == "nv_fp" - assert ar.act_data_type == "nv_fp4_with_static_gs" - ar.quantize() - - def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): - import copy - - preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] - for scheme in preset_schemes: - model_name = tiny_opt_model_path - if "gguf" in scheme.lower(): - model_name = tiny_qwen_model_path - print(f"scheme={scheme}") - ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) - ar.quantize_and_save(self.save_folder) - shutil.rmtree(self.save_folder, ignore_errors=True) - - def test_scheme_in_layer_config(self, dataloader): - model = get_tiny_model(opt_name_or_path, num_layers=5) - tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) - layer_config = { - "model.decoder.layers.2.self_attn": {"bits": 2}, - "model.decoder.layers.3.self_attn.v_proj": "W8A16", - "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), - } - ar = AutoRound( - model, - tokenizer, - scheme="W3A16", - nsamples=1, - iters=1, - layer_config=layer_config, - seqlen=2, - dataset=dataloader, - ) - - ar.quantize() - for n, m in ar.model.named_modules(): - if n == "model.decoder.layers.2.self_attn.q_proj": - assert m.bits == 2 - if n == "model.decoder.layers.2.self_attn.k_proj": - assert 
m.bits == 2 - if n == "model.decoder.layers.3.self_attn.v_proj": - assert m.bits == 8 - if n == "model.decoder.layers.4.self_attn.k_proj": - assert m.group_size == 64 - - def test_parse_available_devices(self): - from auto_round.utils.device import parse_available_devices - device_list = parse_available_devices("auto") - assert len(device_list) == 1 and "cpu" in device_list - device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") - assert len(device_list) == 3 - assert device_list == ["cuda:0", "cuda:1", "cpu"] - device_list = parse_available_devices("0,1") - assert len(device_list) == 1 and "cpu" in device_list + # from auto_round.schemes import QuantizationScheme + # + # qs = QuantizationScheme.from_dict({"bits": 4, "group_size": 64}) + # ar = AutoRound( + # tiny_qwen_model_path, + # scheme=qs, + # bits=2, + # data_type="int_asym_dq", + # nsamples=1, + # iters=0, + # disable_opt_rtn=True, + # seqlen=2, + # ) + # ar.quantize() From 62ef8c5dcc03f4801f1b175380e69919c0e22a31 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 14:56:21 +0800 Subject: [PATCH 02/15] support user model evaluation with multi devices --- auto_round/auto_scheme/utils.py | 2 +- auto_round/compressors/base.py | 3 +- auto_round/eval/evaluation.py | 11 ++++---- auto_round/utils/device.py | 49 +++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/auto_round/auto_scheme/utils.py b/auto_round/auto_scheme/utils.py index 3c19acc42..aa1e185a5 100644 --- a/auto_round/auto_scheme/utils.py +++ b/auto_round/auto_scheme/utils.py @@ -246,7 +246,7 @@ def dispatch_model_by_all_available_devices( else: raise ValueError(f"Unsupported device {device} in device_map: {device_map}") new_max_memory[device] = max_memory[device] - + model.tie_weights() device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules) model = dispatch_model(model, device_map=device_map) return model diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 609fa8a6e..da47f4613 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1591,7 +1591,7 @@ def _adjust_immediate_packing_and_saving(self): self.is_immediate_saving = True if self.low_cpu_mem_usage and not self.is_immediate_packing: - logger.warning( + logger.info( "`low_cpu_mem_usage` is only supported when `immediate_packing` is True. " "Setting `low_cpu_mem_usage` to False." ) @@ -2163,6 +2163,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l max_memory=new_max_memory, no_split_module_classes=no_split_modules, ) + self.model.tie_weights() device_map = infer_auto_device_map( self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules ) diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index cff49b371..27a1fe746 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -16,6 +16,7 @@ from typing import Optional, Union from auto_round.logger import logger +from auto_round.utils import dispatch_model_block_wise os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -199,13 +200,13 @@ def load_gguf_model_for_eval(eval_folder, formats, args): return model, tokenizer -def prepare_model_for_eval(model, device_str, eval_model_dtype): +def prepare_model_for_eval(model, device_map, eval_model_dtype): """ Prepare model for evaluation. 
Args: model: Quantized model - device_str: Device string + device_map: Device string eval_model_dtype: Evaluation data type Returns: @@ -221,9 +222,7 @@ def prepare_model_for_eval(model, device_str, eval_model_dtype): dispatch_model(model, model.hf_device_map) else: - # Single device model - device_str = detect_device(device_str) - model = model.to(device_str) + dispatch_model_block_wise(model, device_map) # Convert dtype if model.dtype != eval_model_dtype and eval_model_dtype != "auto": @@ -427,7 +426,7 @@ def run_model_evaluation(model, tokenizer, autoround, folders, formats, device_s model, tokenizer = load_gguf_model_for_eval(eval_folder, formats, args) else: eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") - model = prepare_model_for_eval(model, device_str, eval_model_dtype) + model = prepare_model_for_eval(model, args.device_map, eval_model_dtype) # Evaluate with model instance evaluate_with_model_instance(model, tokenizer, device_str, args) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index a16f441bf..fa53769c0 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -24,6 +24,8 @@ import cpuinfo import psutil import torch +from accelerate import infer_auto_device_map, dispatch_model +from accelerate.utils import get_balanced_memory, get_max_memory from auto_round.logger import logger from auto_round.utils.model import check_to_quantized, get_block_names, get_layer_features, get_module @@ -1194,6 +1196,53 @@ def find_optimal_subset(arr, target): return result +def dispatch_model_block_wise(model: torch.nn.Module, device_map:str, max_mem_ratio=0.9): + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + no_split_modules = getattr(model, "_no_split_modules", []) + devices = parse_available_devices(device_map) + if len(devices)==1: + model.to(devices[0]) + return model + + max_memory = get_max_memory() + new_max_memory = {} + if "cpu" not in devices: + devices.append("cpu") + for device in devices: + if ":" in device: + device = int(device.split(":")[-1]) + elif device == "cpu": + device = "cpu" + elif isinstance(device, str): + device = 0 + else: + raise f"Unsupported device {device} in device_map: {device_map}" + # Use 90% of the reported max memory to leave headroom for activations, + # temporary tensors, other processes, and allocator fragmentation, reducing + # the chance of runtime OOM while still utilizing most available memory. + new_max_memory[device] = max_memory[device] * max_mem_ratio + new_max_memory = get_balanced_memory( + model, + max_memory=new_max_memory, + no_split_module_classes=no_split_modules, + ) + model.tie_weights() + device_map = infer_auto_device_map( + model, max_memory=new_max_memory, no_split_module_classes=no_split_modules + ) + if len(devices) > 1 and "cpu" in device_map.values(): + logger.warning( + "Some layers are offloaded to cpu, which may severely impact calibration speed." + " Please consider using more cards." 
+ ) + + + model = dispatch_model(model, device_map=device_map) + + return model def set_avg_auto_device_map(model: torch.nn.Module, device_map): block_name_list = get_block_names(model) From 56f95a3fee70b431b662d9a69bd5d0038f0360ad Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 15:02:14 +0800 Subject: [PATCH 03/15] revert changes --- README.md | 2 +- auto_round/utils/device.py | 2 +- test/test_cpu/schemes/test_scheme.py | 311 +++++++++++++-------------- 3 files changed, 157 insertions(+), 158 deletions(-) diff --git a/README.md b/README.md index 82f712023..8ae1c220e 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ pip install auto-round-hpu ## Model Quantization (CPU/Intel GPU/Gaudi/CUDA) ->If you encounter issues during quantization, try using pure RTN mode with iters=0, disable_opt_rtn=True. Additionally, using group_size=32 or mixed bits is recommended for better results.. +>If you encounter issues during quantization, try using pure RTN mode with iters=0, disable_opt_rtn=True. Additionally, using group_size=32 or mixed bits is recommended for better results. ### CLI Usage The full list of supported arguments is provided by calling `auto-round -h` on the terminal. diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index fa53769c0..9d546070c 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1219,7 +1219,7 @@ def dispatch_model_block_wise(model: torch.nn.Module, device_map:str, max_mem_ra elif isinstance(device, str): device = 0 else: - raise f"Unsupported device {device} in device_map: {device_map}" + raise ValueError(f"Unsupported device {device} in device_map: {device_map}") # Use 90% of the reported max memory to leave headroom for activations, # temporary tensors, other processes, and allocator fragmentation, reducing # the chance of runtime OOM while still utilizing most available memory. 
diff --git a/test/test_cpu/schemes/test_scheme.py b/test/test_cpu/schemes/test_scheme.py index d7ff777c6..46deab36e 100644 --- a/test/test_cpu/schemes/test_scheme.py +++ b/test/test_cpu/schemes/test_scheme.py @@ -18,148 +18,147 @@ def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - # def test_gguf(self, tiny_qwen_model_path, dataloader): - # ar = AutoRound( - # tiny_qwen_model_path, - # scheme="W2A16", - # nsamples=1, - # iters=1, - # seqlen=2, - # dataset=dataloader, - # ) - # ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - # assert ar.bits == 4 - # shutil.rmtree(self.save_folder, ignore_errors=True) - # - # def test_w4a16(self, tiny_opt_model_path, dataloader): - # ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - # assert ar.bits == 4 - # ar.quantize() - # - # def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): - # ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) - # assert ar.bits == 2 - # ar.quantize() - # - # def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader): - # - # layer_config = { - # "model.layers.0.self_attn.k_proj": {"bits": 16}, - # } - # ar = AutoRound( - # tiny_qwen_moe_model_path, - # scheme="W4A16_MIXED", - # nsamples=1, - # iters=0, - # seqlen=2, - # dataset=dataloader, - # low_cpu_mem_usage=False, - # layer_config=layer_config, - # ) - # ar.quantize() - # assert ar.bits == 4 - # assert ar.model.model.layers[0].self_attn.q_proj.bits == 8 - # assert ar.model.model.layers[0].self_attn.k_proj.bits == 16 - # assert ar.model.model.layers[0].mlp.experts[0].up_proj.bits == 4 - # assert ar.model.model.layers[0].mlp.shared_expert.gate_proj.bits == 8 - # - # shutil.rmtree(self.save_folder, ignore_errors=True) - # - # def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader): - # - # ar = AutoRound( - # tiny_qwen_2_5_vl_model_path, - # scheme="W4A16_MIXED", - # nsamples=1, - # batch_size=1, - # iters=0, - # seqlen=2, - # dataset=dataloader, - # low_cpu_mem_usage=False, - # ) - # ar.quantize() - # assert ar.bits == 4 - # assert ar.model.language_model.layers[0].self_attn.q_proj.bits == 16 - # assert ar.model.visual.blocks[0].attn.qkv.bits == 16 - # shutil.rmtree(self.save_folder, ignore_errors=True) - # - # def test_mxfp4(self, tiny_opt_model_path, dataloader): - # ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - # assert ar.bits == 4 - # assert ar.act_bits == 4 - # assert ar.data_type == "mx_fp" - # assert ar.act_data_type == "mx_fp" - # ar.quantize() - # - # def test_vllm(self, tiny_qwen_vl_model_path): - # from auto_round import AutoRoundMLLM - # - # ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) - # assert ar.bits == 2 - # assert ar.act_bits == 16 - # - # def test_nvfp4(self, tiny_opt_model_path, dataloader): - # ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - # assert ar.bits == 4 - # assert ar.act_bits == 4 - # assert ar.data_type == "nv_fp" - # assert ar.act_data_type == "nv_fp4_with_static_gs" - # ar.quantize() - # - # def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): - # import copy - # - # preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] - # for scheme in preset_schemes: - # model_name = tiny_opt_model_path - # if "gguf" 
in scheme.lower(): - # model_name = tiny_qwen_model_path - # print(f"scheme={scheme}") - # ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) - # ar.quantize_and_save(self.save_folder) - # shutil.rmtree(self.save_folder, ignore_errors=True) - # - # def test_scheme_in_layer_config(self, dataloader): - # model = get_tiny_model(opt_name_or_path, num_layers=5) - # tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) - # layer_config = { - # "model.decoder.layers.2.self_attn": {"bits": 2}, - # "model.decoder.layers.3.self_attn.v_proj": "W8A16", - # "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), - # } - # ar = AutoRound( - # model, - # tokenizer, - # scheme="W3A16", - # nsamples=1, - # iters=1, - # layer_config=layer_config, - # seqlen=2, - # dataset=dataloader, - # ) - # - # ar.quantize() - # for n, m in ar.model.named_modules(): - # if n == "model.decoder.layers.2.self_attn.q_proj": - # assert m.bits == 2 - # if n == "model.decoder.layers.2.self_attn.k_proj": - # assert m.bits == 2 - # if n == "model.decoder.layers.3.self_attn.v_proj": - # assert m.bits == 8 - # if n == "model.decoder.layers.4.self_attn.k_proj": - # assert m.group_size == 64 - # - # def test_parse_available_devices(self): - # from auto_round.utils.device import parse_available_devices - # - # device_list = parse_available_devices("auto") - # assert len(device_list) == 1 and "cpu" in device_list - # device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") - # assert len(device_list) == 3 - # assert device_list == ["cuda:0", "cuda:1", "cpu"] - # device_list = parse_available_devices("0,1") - # assert len(device_list) == 1 and "cpu" in device_list - # + def test_gguf(self, tiny_qwen_model_path, dataloader): + ar = AutoRound( + tiny_qwen_model_path, + scheme="W2A16", + nsamples=1, + iters=1, + seqlen=2, + dataset=dataloader, + ) + ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") + assert ar.bits == 4 + shutil.rmtree(self.save_folder, ignore_errors=True) + + def test_w4a16(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + ar.quantize() + + def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) + assert ar.bits == 2 + ar.quantize() + + def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader): + + layer_config = { + "model.layers.0.self_attn.k_proj": {"bits": 16}, + } + ar = AutoRound( + tiny_qwen_moe_model_path, + scheme="W4A16_MIXED", + nsamples=1, + iters=0, + seqlen=2, + dataset=dataloader, + low_cpu_mem_usage=False, + layer_config=layer_config, + ) + ar.quantize() + assert ar.bits == 4 + assert ar.model.model.layers[0].self_attn.q_proj.bits == 8 + assert ar.model.model.layers[0].self_attn.k_proj.bits == 16 + assert ar.model.model.layers[0].mlp.experts[0].up_proj.bits == 4 + assert ar.model.model.layers[0].mlp.shared_expert.gate_proj.bits == 8 + + shutil.rmtree(self.save_folder, ignore_errors=True) + + def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader): + + ar = AutoRound( + tiny_qwen_2_5_vl_model_path, + scheme="W4A16_MIXED", + nsamples=1, + batch_size=1, + iters=0, + seqlen=2, + dataset=dataloader, + low_cpu_mem_usage=False, + ) + ar.quantize() + assert ar.bits == 4 + assert 
ar.model.language_model.layers[0].self_attn.q_proj.bits == 16 + assert ar.model.visual.blocks[0].attn.qkv.bits == 16 + shutil.rmtree(self.save_folder, ignore_errors=True) + + def test_mxfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp" + ar.quantize() + + def test_vllm(self, tiny_qwen_vl_model_path): + from auto_round import AutoRoundMLLM + + ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) + assert ar.bits == 2 + assert ar.act_bits == 16 + + def test_nvfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "nv_fp" + assert ar.act_data_type == "nv_fp4_with_static_gs" + ar.quantize() + + def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): + import copy + + preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] + for scheme in preset_schemes: + model_name = tiny_opt_model_path + if "gguf" in scheme.lower(): + model_name = tiny_qwen_model_path + print(f"scheme={scheme}") + ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) + ar.quantize_and_save(self.save_folder) + shutil.rmtree(self.save_folder, ignore_errors=True) + + def test_scheme_in_layer_config(self, dataloader): + model = get_tiny_model(opt_name_or_path, num_layers=5) + tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) + layer_config = { + "model.decoder.layers.2.self_attn": {"bits": 2}, + "model.decoder.layers.3.self_attn.v_proj": "W8A16", + "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), + } + ar = AutoRound( + model, + tokenizer, + scheme="W3A16", + nsamples=1, + iters=1, + layer_config=layer_config, + seqlen=2, + dataset=dataloader, + ) + + ar.quantize() + for n, m in ar.model.named_modules(): + if n == "model.decoder.layers.2.self_attn.q_proj": + assert m.bits == 2 + if n == "model.decoder.layers.2.self_attn.k_proj": + assert m.bits == 2 + if n == "model.decoder.layers.3.self_attn.v_proj": + assert m.bits == 8 + if n == "model.decoder.layers.4.self_attn.k_proj": + assert m.group_size == 64 + + def test_parse_available_devices(self): + from auto_round.utils.device import parse_available_devices + + device_list = parse_available_devices("auto") + assert len(device_list) == 1 and "cpu" in device_list + device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") + assert len(device_list) == 3 + assert device_list == ["cuda:0", "cuda:1", "cpu"] + device_list = parse_available_devices("0,1") + assert len(device_list) == 1 and "cpu" in device_list def test_set_scheme(self, tiny_qwen_model_path): ar = AutoRound( @@ -173,17 +172,17 @@ def test_set_scheme(self, tiny_qwen_model_path): ) ar.quantize() - # from auto_round.schemes import QuantizationScheme - # - # qs = QuantizationScheme.from_dict({"bits": 4, "group_size": 64}) - # ar = AutoRound( - # tiny_qwen_model_path, - # scheme=qs, - # bits=2, - # data_type="int_asym_dq", - # nsamples=1, - # iters=0, - # disable_opt_rtn=True, - # seqlen=2, - # ) - # ar.quantize() + from auto_round.schemes import QuantizationScheme + + qs = QuantizationScheme.from_dict({"bits": 4, 
"group_size": 64}) + ar = AutoRound( + tiny_qwen_model_path, + scheme=qs, + bits=2, + data_type="int_asym_dq", + nsamples=1, + iters=0, + disable_opt_rtn=True, + seqlen=2, + ) + ar.quantize() \ No newline at end of file From cc9b82804a2ae01ece32064eb0e93bfd7852aa9d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 07:04:17 +0000 Subject: [PATCH 04/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils/device.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 9d546070c..e1ca78bcf 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -24,7 +24,7 @@ import cpuinfo import psutil import torch -from accelerate import infer_auto_device_map, dispatch_model +from accelerate import dispatch_model, infer_auto_device_map from accelerate.utils import get_balanced_memory, get_max_memory from auto_round.logger import logger @@ -1196,14 +1196,15 @@ def find_optimal_subset(arr, target): return result -def dispatch_model_block_wise(model: torch.nn.Module, device_map:str, max_mem_ratio=0.9): + +def dispatch_model_block_wise(model: torch.nn.Module, device_map: str, max_mem_ratio=0.9): if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: import accelerate accelerate.hooks.remove_hook_from_submodules(model) no_split_modules = getattr(model, "_no_split_modules", []) devices = parse_available_devices(device_map) - if len(devices)==1: + if len(devices) == 1: model.to(devices[0]) return model @@ -1230,20 +1231,18 @@ def dispatch_model_block_wise(model: torch.nn.Module, device_map:str, max_mem_ra no_split_module_classes=no_split_modules, ) model.tie_weights() - device_map = infer_auto_device_map( - model, max_memory=new_max_memory, no_split_module_classes=no_split_modules - ) + device_map = infer_auto_device_map(model, max_memory=new_max_memory, no_split_module_classes=no_split_modules) if len(devices) > 1 and "cpu" in device_map.values(): logger.warning( "Some layers are offloaded to cpu, which may severely impact calibration speed." " Please consider using more cards." ) - model = dispatch_model(model, device_map=device_map) return model + def set_avg_auto_device_map(model: torch.nn.Module, device_map): block_name_list = get_block_names(model) device_list = parse_available_devices(device_map) From 9f99e0429f5d356afcbd084e25f68d1018e12164 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 15:06:22 +0800 Subject: [PATCH 05/15] fix --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index da47f4613..22919a5d7 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2153,7 +2153,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l elif isinstance(device, str): device = 0 else: - raise f"Unsupported device {device} in device_map: {self.device_map}" + raise ValueError(f"Unsupported device {device} in device_map: {self.device_map}") # Use 90% of the reported max memory to leave headroom for activations, # temporary tensors, other processes, and allocator fragmentation, reducing # the chance of runtime OOM while still utilizing most available memory. 
From 23001c6576901abb35f4a4bd3aa0def2a67a085c Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 15:14:09 +0800 Subject: [PATCH 06/15] fix --- auto_round/eval/eval_cli.py | 6 +++++- auto_round/eval/evaluation.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 938be51c6..d56e66ccc 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -16,13 +16,14 @@ import os import time +import torch.nn from transformers.utils.versions import require_version from auto_round.utils import ( get_device_and_parallelism, get_device_str, get_model_dtype, - set_cuda_visible_devices, + set_cuda_visible_devices, dispatch_model_block_wise, ) @@ -270,6 +271,7 @@ def eval_task_by_task( retry_times=3, mllm=False, add_bos_token=False, + device_map=None, ): require_version( "lm_eval>=0.4.2", "lm-eval is required for evaluation, please install it with `pip install 'lm-eval>=0.4.2'`" @@ -294,6 +296,8 @@ def eval_task_by_task( model, tokenizer, is_gguf_file, gguf_file = _load_gguf_model_if_needed(model, eval_model_dtype) if is_gguf_file: parallelism = False + if isinstance(model, torch.nn.Module): + dispatch_model_block_wise(model,args.device_map) eval_model_dtype = get_model_dtype(eval_model_dtype) if mllm: diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index 27a1fe746..742bd4f0c 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -264,6 +264,7 @@ def evaluate_with_model_instance(model, tokenizer, device_str, args): batch_size=args.eval_bs, eval_model_dtype=get_model_dtype(args.eval_model_dtype, "auto"), add_bos_token=args.add_bos_token, + device_map=args.device_map ) else: # Batch evaluation From 15669d93ca431ac2ea0eafd7a0af59a524877be3 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 15:23:08 +0800 Subject: [PATCH 07/15] fix --- auto_round/eval/eval_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index d56e66ccc..9f35f25e7 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -277,7 +277,7 @@ def eval_task_by_task( "lm_eval>=0.4.2", "lm-eval is required for evaluation, please install it with `pip install 'lm-eval>=0.4.2'`" ) - set_cuda_visible_devices(device) + set_cuda_visible_devices(device_map) device_str, parallelism = get_device_and_parallelism(device) # load after _eval_int in order to make sure import torch after set CUDA_VISIBLE_DEVICES From b8ff386b10807ed488990fbd1bdf30f5823efe93 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 07:23:44 +0000 Subject: [PATCH 08/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/eval/eval_cli.py | 5 +++-- auto_round/eval/evaluation.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 9f35f25e7..b2b302c11 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -20,10 +20,11 @@ from transformers.utils.versions import require_version from auto_round.utils import ( + dispatch_model_block_wise, get_device_and_parallelism, get_device_str, get_model_dtype, - set_cuda_visible_devices, dispatch_model_block_wise, + set_cuda_visible_devices, ) @@ -297,7 +298,7 @@ def eval_task_by_task( if is_gguf_file: parallelism = False if 
isinstance(model, torch.nn.Module): - dispatch_model_block_wise(model,args.device_map) + dispatch_model_block_wise(model, args.device_map) eval_model_dtype = get_model_dtype(eval_model_dtype) if mllm: diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index 742bd4f0c..c12e076e5 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -264,7 +264,7 @@ def evaluate_with_model_instance(model, tokenizer, device_str, args): batch_size=args.eval_bs, eval_model_dtype=get_model_dtype(args.eval_model_dtype, "auto"), add_bos_token=args.add_bos_token, - device_map=args.device_map + device_map=args.device_map, ) else: # Batch evaluation From 439e92bc049963bf9d03a4aa2584fe21e73a44ae Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 15:37:02 +0800 Subject: [PATCH 09/15] fix --- auto_round/eval/eval_cli.py | 2 +- auto_round/utils/device.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 9f35f25e7..27e9b7609 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -297,7 +297,7 @@ def eval_task_by_task( if is_gguf_file: parallelism = False if isinstance(model, torch.nn.Module): - dispatch_model_block_wise(model,args.device_map) + dispatch_model_block_wise(model,device_map="auto") # As we set visible device before, so explcits eval_model_dtype = get_model_dtype(eval_model_dtype) if mllm: diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index e1ca78bcf..7b2ace3f6 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -296,8 +296,12 @@ def get_device_and_parallelism(device: Union[str, torch.device, int]) -> tuple[s return device, parallelism -def set_cuda_visible_devices(device): - devices = device.replace(" ", "").split(",") +def set_cuda_visible_devices(device:str): + if device=="cuda": + devices = ["0"] + else: + devices = device.replace(" ", "").split(",") + devices = [ device.split(":")[-1] for device in devices] if all(s.isdigit() for s in devices): if "CUDA_VISIBLE_DEVICES" in os.environ: current_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] From 68e31b1e07f97137430318b94ef53a63b0144667 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 07:42:24 +0000 Subject: [PATCH 10/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/eval/eval_cli.py | 5 +++-- auto_round/utils/device.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 27e9b7609..6f71d4d97 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -20,10 +20,11 @@ from transformers.utils.versions import require_version from auto_round.utils import ( + dispatch_model_block_wise, get_device_and_parallelism, get_device_str, get_model_dtype, - set_cuda_visible_devices, dispatch_model_block_wise, + set_cuda_visible_devices, ) @@ -297,7 +298,7 @@ def eval_task_by_task( if is_gguf_file: parallelism = False if isinstance(model, torch.nn.Module): - dispatch_model_block_wise(model,device_map="auto") # As we set visible device before, so explcits + dispatch_model_block_wise(model, device_map="auto") # As we set visible device before, so explcits eval_model_dtype = get_model_dtype(eval_model_dtype) if mllm: diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 
7b2ace3f6..5280ca6fd 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -296,12 +296,12 @@ def get_device_and_parallelism(device: Union[str, torch.device, int]) -> tuple[s return device, parallelism -def set_cuda_visible_devices(device:str): - if device=="cuda": +def set_cuda_visible_devices(device: str): + if device == "cuda": devices = ["0"] else: devices = device.replace(" ", "").split(",") - devices = [ device.split(":")[-1] for device in devices] + devices = [device.split(":")[-1] for device in devices] if all(s.isdigit() for s in devices): if "CUDA_VISIBLE_DEVICES" in os.environ: current_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] From 62673d479ac36e325f0495a22ca288f208a51c2e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 16:22:07 +0800 Subject: [PATCH 11/15] fix --- auto_round/utils/device.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 5280ca6fd..ba9733243 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -299,6 +299,8 @@ def get_device_and_parallelism(device: Union[str, torch.device, int]) -> tuple[s def set_cuda_visible_devices(device: str): if device == "cuda": devices = ["0"] + elif device=="auto": + return else: devices = device.replace(" ", "").split(",") devices = [device.split(":")[-1] for device in devices] From 548dd511f316ee77d1c6bf04b76efb1a55c81772 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 08:23:16 +0000 Subject: [PATCH 12/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils/device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index ba9733243..81c02fd4b 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -299,7 +299,7 @@ def get_device_and_parallelism(device: Union[str, torch.device, int]) -> tuple[s def set_cuda_visible_devices(device: str): if device == "cuda": devices = ["0"] - elif device=="auto": + elif device == "auto": return else: devices = device.replace(" ", "").split(",") From 37b1b2e9af1173897d484aa51e111fd0f131386e Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 5 Feb 2026 10:11:26 +0800 Subject: [PATCH 13/15] fix eval_task_by_task Signed-off-by: n1ck-guo --- auto_round/eval/eval_cli.py | 6 +++++- auto_round/utils/device.py | 8 +++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 6f71d4d97..fd4732860 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -289,7 +289,11 @@ def eval_task_by_task( if batch_size is None: batch_size = "auto:8" - if not isinstance(model, str): + if not isinstance(model, str) and parallelism: + from accelerate import dispatch_model, infer_auto_device_map + + device_map = infer_auto_device_map(model) + model = dispatch_model(model, device_map=device_map) parallelism = False is_gguf_file = False gguf_file = None diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 81c02fd4b..abbd332e7 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -275,7 +275,13 @@ def is_valid_digit(s): def get_device_and_parallelism(device: Union[str, torch.device, int]) -> tuple[str, bool]: if isinstance(device, str): - devices = device.replace(" ", "").split(",") + if device in ["cuda", "xpu", "hpu"]: 
+ device = detect_device(device) + parallelism = False + return device, parallelism + else: + device = re.sub("xpu:|hpu:|cuda:", "", device) + devices = device.replace(" ", "").split(",") elif isinstance(device, int): devices = [str(device)] else: From 54cd195e03ac699831fee3303e52939ab277450c Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 5 Feb 2026 13:27:26 +0800 Subject: [PATCH 14/15] fix bug Signed-off-by: n1ck-guo --- auto_round/eval/eval_cli.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index fd4732860..c1ccf2b40 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -272,13 +272,12 @@ def eval_task_by_task( retry_times=3, mllm=False, add_bos_token=False, - device_map=None, ): require_version( "lm_eval>=0.4.2", "lm-eval is required for evaluation, please install it with `pip install 'lm-eval>=0.4.2'`" ) - set_cuda_visible_devices(device_map) + set_cuda_visible_devices(device) device_str, parallelism = get_device_and_parallelism(device) # load after _eval_int in order to make sure import torch after set CUDA_VISIBLE_DEVICES From 4e78b49c126580d748672925dc8b0f27f564aa66 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 5 Feb 2026 14:25:01 +0800 Subject: [PATCH 15/15] fix Signed-off-by: n1ck-guo --- auto_round/eval/evaluation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index c12e076e5..27a1fe746 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -264,7 +264,6 @@ def evaluate_with_model_instance(model, tokenizer, device_str, args): batch_size=args.eval_bs, eval_model_dtype=get_model_dtype(args.eval_model_dtype, "auto"), add_bos_token=args.add_bos_token, - device_map=args.device_map, ) else: # Batch evaluation
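
For reference, a minimal usage sketch of the block-wise dispatch helper added in this series, assuming two visible CUDA devices and a placeholder OPT checkpoint; only dispatch_model_block_wise and its signature come from the patches above, the rest is illustrative.

# Sketch: spread an in-memory model over the devices named in a device_map
# string before evaluation. Assumes cuda:0 and cuda:1 are available; the
# checkpoint name is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round.utils import dispatch_model_block_wise

model_name = "facebook/opt-125m"  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# With a single parsed device the helper simply calls model.to(device);
# otherwise it balances blocks across the listed devices, capping each at
# max_mem_ratio of its reported memory, and warns if any layer spills to CPU.
model = dispatch_model_block_wise(model, device_map="cuda:0,cuda:1", max_mem_ratio=0.9)

inputs = tokenizer("Hello, world", return_tensors="pt").to("cuda:0")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)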