From ac9dda2aa0399b94826ee953bbde62bf68c0f0fd Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Mon, 2 Feb 2026 18:21:52 +0800 Subject: [PATCH 01/15] fix --device_map cuda xpu issue --- auto_round/compressors/base.py | 4 +- test/test_cpu/schemes/test_scheme.py | 294 +++++++++++++++------------ 2 files changed, 164 insertions(+), 134 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 97702db3e..07300f905 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2149,8 +2149,10 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l device = int(device.split(":")[-1]) elif device == "cpu": device = "cpu" + elif isinstance(device, str): + device = 0 else: - raise ValueError(f"Unsupported device {device} in device_map: {self.device_map}") + raise f"Unsupported device {device} in device_map: {self.device_map}" # Use 90% of the reported max memory to leave headroom for activations, # temporary tensors, other processes, and allocator fragmentation, reducing # the chance of runtime OOM while still utilizing most available memory. diff --git a/test/test_cpu/schemes/test_scheme.py b/test/test_cpu/schemes/test_scheme.py index e2b0c15c3..d7ff777c6 100644 --- a/test/test_cpu/schemes/test_scheme.py +++ b/test/test_cpu/schemes/test_scheme.py @@ -18,144 +18,172 @@ def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf(self, tiny_qwen_model_path, dataloader): + # def test_gguf(self, tiny_qwen_model_path, dataloader): + # ar = AutoRound( + # tiny_qwen_model_path, + # scheme="W2A16", + # nsamples=1, + # iters=1, + # seqlen=2, + # dataset=dataloader, + # ) + # ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") + # assert ar.bits == 4 + # shutil.rmtree(self.save_folder, ignore_errors=True) + # + # def test_w4a16(self, tiny_opt_model_path, dataloader): + # ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + # assert ar.bits == 4 + # ar.quantize() + # + # def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): + # ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) + # assert ar.bits == 2 + # ar.quantize() + # + # def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader): + # + # layer_config = { + # "model.layers.0.self_attn.k_proj": {"bits": 16}, + # } + # ar = AutoRound( + # tiny_qwen_moe_model_path, + # scheme="W4A16_MIXED", + # nsamples=1, + # iters=0, + # seqlen=2, + # dataset=dataloader, + # low_cpu_mem_usage=False, + # layer_config=layer_config, + # ) + # ar.quantize() + # assert ar.bits == 4 + # assert ar.model.model.layers[0].self_attn.q_proj.bits == 8 + # assert ar.model.model.layers[0].self_attn.k_proj.bits == 16 + # assert ar.model.model.layers[0].mlp.experts[0].up_proj.bits == 4 + # assert ar.model.model.layers[0].mlp.shared_expert.gate_proj.bits == 8 + # + # shutil.rmtree(self.save_folder, ignore_errors=True) + # + # def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader): + # + # ar = AutoRound( + # tiny_qwen_2_5_vl_model_path, + # scheme="W4A16_MIXED", + # nsamples=1, + # batch_size=1, + # iters=0, + # seqlen=2, + # dataset=dataloader, + # low_cpu_mem_usage=False, + # ) + # ar.quantize() + # assert ar.bits == 4 + # assert ar.model.language_model.layers[0].self_attn.q_proj.bits == 16 + # assert ar.model.visual.blocks[0].attn.qkv.bits == 16 + # shutil.rmtree(self.save_folder, 
ignore_errors=True) + # + # def test_mxfp4(self, tiny_opt_model_path, dataloader): + # ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + # assert ar.bits == 4 + # assert ar.act_bits == 4 + # assert ar.data_type == "mx_fp" + # assert ar.act_data_type == "mx_fp" + # ar.quantize() + # + # def test_vllm(self, tiny_qwen_vl_model_path): + # from auto_round import AutoRoundMLLM + # + # ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) + # assert ar.bits == 2 + # assert ar.act_bits == 16 + # + # def test_nvfp4(self, tiny_opt_model_path, dataloader): + # ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + # assert ar.bits == 4 + # assert ar.act_bits == 4 + # assert ar.data_type == "nv_fp" + # assert ar.act_data_type == "nv_fp4_with_static_gs" + # ar.quantize() + # + # def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): + # import copy + # + # preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] + # for scheme in preset_schemes: + # model_name = tiny_opt_model_path + # if "gguf" in scheme.lower(): + # model_name = tiny_qwen_model_path + # print(f"scheme={scheme}") + # ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) + # ar.quantize_and_save(self.save_folder) + # shutil.rmtree(self.save_folder, ignore_errors=True) + # + # def test_scheme_in_layer_config(self, dataloader): + # model = get_tiny_model(opt_name_or_path, num_layers=5) + # tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) + # layer_config = { + # "model.decoder.layers.2.self_attn": {"bits": 2}, + # "model.decoder.layers.3.self_attn.v_proj": "W8A16", + # "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), + # } + # ar = AutoRound( + # model, + # tokenizer, + # scheme="W3A16", + # nsamples=1, + # iters=1, + # layer_config=layer_config, + # seqlen=2, + # dataset=dataloader, + # ) + # + # ar.quantize() + # for n, m in ar.model.named_modules(): + # if n == "model.decoder.layers.2.self_attn.q_proj": + # assert m.bits == 2 + # if n == "model.decoder.layers.2.self_attn.k_proj": + # assert m.bits == 2 + # if n == "model.decoder.layers.3.self_attn.v_proj": + # assert m.bits == 8 + # if n == "model.decoder.layers.4.self_attn.k_proj": + # assert m.group_size == 64 + # + # def test_parse_available_devices(self): + # from auto_round.utils.device import parse_available_devices + # + # device_list = parse_available_devices("auto") + # assert len(device_list) == 1 and "cpu" in device_list + # device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") + # assert len(device_list) == 3 + # assert device_list == ["cuda:0", "cuda:1", "cpu"] + # device_list = parse_available_devices("0,1") + # assert len(device_list) == 1 and "cpu" in device_list + # + + def test_set_scheme(self, tiny_qwen_model_path): ar = AutoRound( tiny_qwen_model_path, - scheme="W2A16", + scheme="gguf:q2_k_s", + data_type="fp", nsamples=1, - iters=1, - seqlen=2, - dataset=dataloader, - ) - ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - assert ar.bits == 4 - shutil.rmtree(self.save_folder, ignore_errors=True) - - def test_w4a16(self, tiny_opt_model_path, dataloader): - ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - assert ar.bits == 4 - ar.quantize() - - def 
test_w2a16_rtn(self, tiny_opt_model_path, dataloader): - ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) - assert ar.bits == 2 - ar.quantize() - - def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader): - - layer_config = { - "model.layers.0.self_attn.k_proj": {"bits": 16}, - } - ar = AutoRound( - tiny_qwen_moe_model_path, - scheme="W4A16_MIXED", - nsamples=1, - iters=0, - seqlen=2, - dataset=dataloader, - low_cpu_mem_usage=False, - layer_config=layer_config, - ) - ar.quantize() - assert ar.bits == 4 - assert ar.model.model.layers[0].self_attn.q_proj.bits == 8 - assert ar.model.model.layers[0].self_attn.k_proj.bits == 16 - assert ar.model.model.layers[0].mlp.experts[0].up_proj.bits == 4 - assert ar.model.model.layers[0].mlp.shared_expert.gate_proj.bits == 8 - - shutil.rmtree(self.save_folder, ignore_errors=True) - - def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader): - - ar = AutoRound( - tiny_qwen_2_5_vl_model_path, - scheme="W4A16_MIXED", - nsamples=1, - batch_size=1, + disable_opt_rtn=True, iters=0, seqlen=2, - dataset=dataloader, - low_cpu_mem_usage=False, ) ar.quantize() - assert ar.bits == 4 - assert ar.model.language_model.layers[0].self_attn.q_proj.bits == 16 - assert ar.model.visual.blocks[0].attn.qkv.bits == 16 - shutil.rmtree(self.save_folder, ignore_errors=True) - - def test_mxfp4(self, tiny_opt_model_path, dataloader): - ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - assert ar.bits == 4 - assert ar.act_bits == 4 - assert ar.data_type == "mx_fp" - assert ar.act_data_type == "mx_fp" - ar.quantize() - - def test_vllm(self, tiny_qwen_vl_model_path): - from auto_round import AutoRoundMLLM - - ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) - assert ar.bits == 2 - assert ar.act_bits == 16 - - def test_nvfp4(self, tiny_opt_model_path, dataloader): - ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - assert ar.bits == 4 - assert ar.act_bits == 4 - assert ar.data_type == "nv_fp" - assert ar.act_data_type == "nv_fp4_with_static_gs" - ar.quantize() - - def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): - import copy - - preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] - for scheme in preset_schemes: - model_name = tiny_opt_model_path - if "gguf" in scheme.lower(): - model_name = tiny_qwen_model_path - print(f"scheme={scheme}") - ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) - ar.quantize_and_save(self.save_folder) - shutil.rmtree(self.save_folder, ignore_errors=True) - - def test_scheme_in_layer_config(self, dataloader): - model = get_tiny_model(opt_name_or_path, num_layers=5) - tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) - layer_config = { - "model.decoder.layers.2.self_attn": {"bits": 2}, - "model.decoder.layers.3.self_attn.v_proj": "W8A16", - "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), - } - ar = AutoRound( - model, - tokenizer, - scheme="W3A16", - nsamples=1, - iters=1, - layer_config=layer_config, - seqlen=2, - dataset=dataloader, - ) - - ar.quantize() - for n, m in ar.model.named_modules(): - if n == "model.decoder.layers.2.self_attn.q_proj": - assert m.bits == 2 - if n == "model.decoder.layers.2.self_attn.k_proj": - assert 
m.bits == 2 - if n == "model.decoder.layers.3.self_attn.v_proj": - assert m.bits == 8 - if n == "model.decoder.layers.4.self_attn.k_proj": - assert m.group_size == 64 - - def test_parse_available_devices(self): - from auto_round.utils.device import parse_available_devices - device_list = parse_available_devices("auto") - assert len(device_list) == 1 and "cpu" in device_list - device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") - assert len(device_list) == 3 - assert device_list == ["cuda:0", "cuda:1", "cpu"] - device_list = parse_available_devices("0,1") - assert len(device_list) == 1 and "cpu" in device_list + # from auto_round.schemes import QuantizationScheme + # + # qs = QuantizationScheme.from_dict({"bits": 4, "group_size": 64}) + # ar = AutoRound( + # tiny_qwen_model_path, + # scheme=qs, + # bits=2, + # data_type="int_asym_dq", + # nsamples=1, + # iters=0, + # disable_opt_rtn=True, + # seqlen=2, + # ) + # ar.quantize() From 62ef8c5dcc03f4801f1b175380e69919c0e22a31 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 14:56:21 +0800 Subject: [PATCH 02/15] support user model evaluation with multi devices --- auto_round/auto_scheme/utils.py | 2 +- auto_round/compressors/base.py | 3 +- auto_round/eval/evaluation.py | 11 ++++---- auto_round/utils/device.py | 49 +++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/auto_round/auto_scheme/utils.py b/auto_round/auto_scheme/utils.py index 3c19acc42..aa1e185a5 100644 --- a/auto_round/auto_scheme/utils.py +++ b/auto_round/auto_scheme/utils.py @@ -246,7 +246,7 @@ def dispatch_model_by_all_available_devices( else: raise ValueError(f"Unsupported device {device} in device_map: {device_map}") new_max_memory[device] = max_memory[device] - + model.tie_weights() device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules) model = dispatch_model(model, device_map=device_map) return model diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 609fa8a6e..da47f4613 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1591,7 +1591,7 @@ def _adjust_immediate_packing_and_saving(self): self.is_immediate_saving = True if self.low_cpu_mem_usage and not self.is_immediate_packing: - logger.warning( + logger.info( "`low_cpu_mem_usage` is only supported when `immediate_packing` is True. " "Setting `low_cpu_mem_usage` to False." ) @@ -2163,6 +2163,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l max_memory=new_max_memory, no_split_module_classes=no_split_modules, ) + self.model.tie_weights() device_map = infer_auto_device_map( self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules ) diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index cff49b371..27a1fe746 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -16,6 +16,7 @@ from typing import Optional, Union from auto_round.logger import logger +from auto_round.utils import dispatch_model_block_wise os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -199,13 +200,13 @@ def load_gguf_model_for_eval(eval_folder, formats, args): return model, tokenizer -def prepare_model_for_eval(model, device_str, eval_model_dtype): +def prepare_model_for_eval(model, device_map, eval_model_dtype): """ Prepare model for evaluation. 
Args: model: Quantized model - device_str: Device string + device_map: Device string eval_model_dtype: Evaluation data type Returns: @@ -221,9 +222,7 @@ def prepare_model_for_eval(model, device_str, eval_model_dtype): dispatch_model(model, model.hf_device_map) else: - # Single device model - device_str = detect_device(device_str) - model = model.to(device_str) + dispatch_model_block_wise(model, device_map) # Convert dtype if model.dtype != eval_model_dtype and eval_model_dtype != "auto": @@ -427,7 +426,7 @@ def run_model_evaluation(model, tokenizer, autoround, folders, formats, device_s model, tokenizer = load_gguf_model_for_eval(eval_folder, formats, args) else: eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") - model = prepare_model_for_eval(model, device_str, eval_model_dtype) + model = prepare_model_for_eval(model, args.device_map, eval_model_dtype) # Evaluate with model instance evaluate_with_model_instance(model, tokenizer, device_str, args) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index a16f441bf..fa53769c0 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -24,6 +24,8 @@ import cpuinfo import psutil import torch +from accelerate import infer_auto_device_map, dispatch_model +from accelerate.utils import get_balanced_memory, get_max_memory from auto_round.logger import logger from auto_round.utils.model import check_to_quantized, get_block_names, get_layer_features, get_module @@ -1194,6 +1196,53 @@ def find_optimal_subset(arr, target): return result +def dispatch_model_block_wise(model: torch.nn.Module, device_map:str, max_mem_ratio=0.9): + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + import accelerate + + accelerate.hooks.remove_hook_from_submodules(model) + no_split_modules = getattr(model, "_no_split_modules", []) + devices = parse_available_devices(device_map) + if len(devices)==1: + model.to(devices[0]) + return model + + max_memory = get_max_memory() + new_max_memory = {} + if "cpu" not in devices: + devices.append("cpu") + for device in devices: + if ":" in device: + device = int(device.split(":")[-1]) + elif device == "cpu": + device = "cpu" + elif isinstance(device, str): + device = 0 + else: + raise f"Unsupported device {device} in device_map: {device_map}" + # Use 90% of the reported max memory to leave headroom for activations, + # temporary tensors, other processes, and allocator fragmentation, reducing + # the chance of runtime OOM while still utilizing most available memory. + new_max_memory[device] = max_memory[device] * max_mem_ratio + new_max_memory = get_balanced_memory( + model, + max_memory=new_max_memory, + no_split_module_classes=no_split_modules, + ) + model.tie_weights() + device_map = infer_auto_device_map( + model, max_memory=new_max_memory, no_split_module_classes=no_split_modules + ) + if len(devices) > 1 and "cpu" in device_map.values(): + logger.warning( + "Some layers are offloaded to cpu, which may severely impact calibration speed." + " Please consider using more cards." 
+ ) + + + model = dispatch_model(model, device_map=device_map) + + return model def set_avg_auto_device_map(model: torch.nn.Module, device_map): block_name_list = get_block_names(model) From 56f95a3fee70b431b662d9a69bd5d0038f0360ad Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 15:02:14 +0800 Subject: [PATCH 03/15] revert changes --- README.md | 2 +- auto_round/utils/device.py | 2 +- test/test_cpu/schemes/test_scheme.py | 311 +++++++++++++-------------- 3 files changed, 157 insertions(+), 158 deletions(-) diff --git a/README.md b/README.md index 82f712023..8ae1c220e 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ pip install auto-round-hpu ## Model Quantization (CPU/Intel GPU/Gaudi/CUDA) ->If you encounter issues during quantization, try using pure RTN mode with iters=0, disable_opt_rtn=True. Additionally, using group_size=32 or mixed bits is recommended for better results.. +>If you encounter issues during quantization, try using pure RTN mode with iters=0, disable_opt_rtn=True. Additionally, using group_size=32 or mixed bits is recommended for better results. ### CLI Usage The full list of supported arguments is provided by calling `auto-round -h` on the terminal. diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index fa53769c0..9d546070c 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1219,7 +1219,7 @@ def dispatch_model_block_wise(model: torch.nn.Module, device_map:str, max_mem_ra elif isinstance(device, str): device = 0 else: - raise f"Unsupported device {device} in device_map: {device_map}" + raise ValueError(f"Unsupported device {device} in device_map: {device_map}") # Use 90% of the reported max memory to leave headroom for activations, # temporary tensors, other processes, and allocator fragmentation, reducing # the chance of runtime OOM while still utilizing most available memory. 
diff --git a/test/test_cpu/schemes/test_scheme.py b/test/test_cpu/schemes/test_scheme.py index d7ff777c6..46deab36e 100644 --- a/test/test_cpu/schemes/test_scheme.py +++ b/test/test_cpu/schemes/test_scheme.py @@ -18,148 +18,147 @@ def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - # def test_gguf(self, tiny_qwen_model_path, dataloader): - # ar = AutoRound( - # tiny_qwen_model_path, - # scheme="W2A16", - # nsamples=1, - # iters=1, - # seqlen=2, - # dataset=dataloader, - # ) - # ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - # assert ar.bits == 4 - # shutil.rmtree(self.save_folder, ignore_errors=True) - # - # def test_w4a16(self, tiny_opt_model_path, dataloader): - # ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - # assert ar.bits == 4 - # ar.quantize() - # - # def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): - # ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) - # assert ar.bits == 2 - # ar.quantize() - # - # def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader): - # - # layer_config = { - # "model.layers.0.self_attn.k_proj": {"bits": 16}, - # } - # ar = AutoRound( - # tiny_qwen_moe_model_path, - # scheme="W4A16_MIXED", - # nsamples=1, - # iters=0, - # seqlen=2, - # dataset=dataloader, - # low_cpu_mem_usage=False, - # layer_config=layer_config, - # ) - # ar.quantize() - # assert ar.bits == 4 - # assert ar.model.model.layers[0].self_attn.q_proj.bits == 8 - # assert ar.model.model.layers[0].self_attn.k_proj.bits == 16 - # assert ar.model.model.layers[0].mlp.experts[0].up_proj.bits == 4 - # assert ar.model.model.layers[0].mlp.shared_expert.gate_proj.bits == 8 - # - # shutil.rmtree(self.save_folder, ignore_errors=True) - # - # def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader): - # - # ar = AutoRound( - # tiny_qwen_2_5_vl_model_path, - # scheme="W4A16_MIXED", - # nsamples=1, - # batch_size=1, - # iters=0, - # seqlen=2, - # dataset=dataloader, - # low_cpu_mem_usage=False, - # ) - # ar.quantize() - # assert ar.bits == 4 - # assert ar.model.language_model.layers[0].self_attn.q_proj.bits == 16 - # assert ar.model.visual.blocks[0].attn.qkv.bits == 16 - # shutil.rmtree(self.save_folder, ignore_errors=True) - # - # def test_mxfp4(self, tiny_opt_model_path, dataloader): - # ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - # assert ar.bits == 4 - # assert ar.act_bits == 4 - # assert ar.data_type == "mx_fp" - # assert ar.act_data_type == "mx_fp" - # ar.quantize() - # - # def test_vllm(self, tiny_qwen_vl_model_path): - # from auto_round import AutoRoundMLLM - # - # ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) - # assert ar.bits == 2 - # assert ar.act_bits == 16 - # - # def test_nvfp4(self, tiny_opt_model_path, dataloader): - # ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) - # assert ar.bits == 4 - # assert ar.act_bits == 4 - # assert ar.data_type == "nv_fp" - # assert ar.act_data_type == "nv_fp4_with_static_gs" - # ar.quantize() - # - # def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): - # import copy - # - # preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] - # for scheme in preset_schemes: - # model_name = tiny_opt_model_path - # if "gguf" 
in scheme.lower(): - # model_name = tiny_qwen_model_path - # print(f"scheme={scheme}") - # ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) - # ar.quantize_and_save(self.save_folder) - # shutil.rmtree(self.save_folder, ignore_errors=True) - # - # def test_scheme_in_layer_config(self, dataloader): - # model = get_tiny_model(opt_name_or_path, num_layers=5) - # tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) - # layer_config = { - # "model.decoder.layers.2.self_attn": {"bits": 2}, - # "model.decoder.layers.3.self_attn.v_proj": "W8A16", - # "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), - # } - # ar = AutoRound( - # model, - # tokenizer, - # scheme="W3A16", - # nsamples=1, - # iters=1, - # layer_config=layer_config, - # seqlen=2, - # dataset=dataloader, - # ) - # - # ar.quantize() - # for n, m in ar.model.named_modules(): - # if n == "model.decoder.layers.2.self_attn.q_proj": - # assert m.bits == 2 - # if n == "model.decoder.layers.2.self_attn.k_proj": - # assert m.bits == 2 - # if n == "model.decoder.layers.3.self_attn.v_proj": - # assert m.bits == 8 - # if n == "model.decoder.layers.4.self_attn.k_proj": - # assert m.group_size == 64 - # - # def test_parse_available_devices(self): - # from auto_round.utils.device import parse_available_devices - # - # device_list = parse_available_devices("auto") - # assert len(device_list) == 1 and "cpu" in device_list - # device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") - # assert len(device_list) == 3 - # assert device_list == ["cuda:0", "cuda:1", "cpu"] - # device_list = parse_available_devices("0,1") - # assert len(device_list) == 1 and "cpu" in device_list - # + def test_gguf(self, tiny_qwen_model_path, dataloader): + ar = AutoRound( + tiny_qwen_model_path, + scheme="W2A16", + nsamples=1, + iters=1, + seqlen=2, + dataset=dataloader, + ) + ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") + assert ar.bits == 4 + shutil.rmtree(self.save_folder, ignore_errors=True) + + def test_w4a16(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + ar.quantize() + + def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) + assert ar.bits == 2 + ar.quantize() + + def test_w4a16_mixed(self, tiny_qwen_moe_model_path, dataloader): + + layer_config = { + "model.layers.0.self_attn.k_proj": {"bits": 16}, + } + ar = AutoRound( + tiny_qwen_moe_model_path, + scheme="W4A16_MIXED", + nsamples=1, + iters=0, + seqlen=2, + dataset=dataloader, + low_cpu_mem_usage=False, + layer_config=layer_config, + ) + ar.quantize() + assert ar.bits == 4 + assert ar.model.model.layers[0].self_attn.q_proj.bits == 8 + assert ar.model.model.layers[0].self_attn.k_proj.bits == 16 + assert ar.model.model.layers[0].mlp.experts[0].up_proj.bits == 4 + assert ar.model.model.layers[0].mlp.shared_expert.gate_proj.bits == 8 + + shutil.rmtree(self.save_folder, ignore_errors=True) + + def test_w4a16_mixed_mllm(self, tiny_qwen_2_5_vl_model_path, dataloader): + + ar = AutoRound( + tiny_qwen_2_5_vl_model_path, + scheme="W4A16_MIXED", + nsamples=1, + batch_size=1, + iters=0, + seqlen=2, + dataset=dataloader, + low_cpu_mem_usage=False, + ) + ar.quantize() + assert ar.bits == 4 + assert 
ar.model.language_model.layers[0].self_attn.q_proj.bits == 16 + assert ar.model.visual.blocks[0].attn.qkv.bits == 16 + shutil.rmtree(self.save_folder, ignore_errors=True) + + def test_mxfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp" + ar.quantize() + + def test_vllm(self, tiny_qwen_vl_model_path): + from auto_round import AutoRoundMLLM + + ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) + assert ar.bits == 2 + assert ar.act_bits == 16 + + def test_nvfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "nv_fp" + assert ar.act_data_type == "nv_fp4_with_static_gs" + ar.quantize() + + def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): + import copy + + preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] + for scheme in preset_schemes: + model_name = tiny_opt_model_path + if "gguf" in scheme.lower(): + model_name = tiny_qwen_model_path + print(f"scheme={scheme}") + ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) + ar.quantize_and_save(self.save_folder) + shutil.rmtree(self.save_folder, ignore_errors=True) + + def test_scheme_in_layer_config(self, dataloader): + model = get_tiny_model(opt_name_or_path, num_layers=5) + tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) + layer_config = { + "model.decoder.layers.2.self_attn": {"bits": 2}, + "model.decoder.layers.3.self_attn.v_proj": "W8A16", + "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), + } + ar = AutoRound( + model, + tokenizer, + scheme="W3A16", + nsamples=1, + iters=1, + layer_config=layer_config, + seqlen=2, + dataset=dataloader, + ) + + ar.quantize() + for n, m in ar.model.named_modules(): + if n == "model.decoder.layers.2.self_attn.q_proj": + assert m.bits == 2 + if n == "model.decoder.layers.2.self_attn.k_proj": + assert m.bits == 2 + if n == "model.decoder.layers.3.self_attn.v_proj": + assert m.bits == 8 + if n == "model.decoder.layers.4.self_attn.k_proj": + assert m.group_size == 64 + + def test_parse_available_devices(self): + from auto_round.utils.device import parse_available_devices + + device_list = parse_available_devices("auto") + assert len(device_list) == 1 and "cpu" in device_list + device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") + assert len(device_list) == 3 + assert device_list == ["cuda:0", "cuda:1", "cpu"] + device_list = parse_available_devices("0,1") + assert len(device_list) == 1 and "cpu" in device_list def test_set_scheme(self, tiny_qwen_model_path): ar = AutoRound( @@ -173,17 +172,17 @@ def test_set_scheme(self, tiny_qwen_model_path): ) ar.quantize() - # from auto_round.schemes import QuantizationScheme - # - # qs = QuantizationScheme.from_dict({"bits": 4, "group_size": 64}) - # ar = AutoRound( - # tiny_qwen_model_path, - # scheme=qs, - # bits=2, - # data_type="int_asym_dq", - # nsamples=1, - # iters=0, - # disable_opt_rtn=True, - # seqlen=2, - # ) - # ar.quantize() + from auto_round.schemes import QuantizationScheme + + qs = QuantizationScheme.from_dict({"bits": 4, 
"group_size": 64}) + ar = AutoRound( + tiny_qwen_model_path, + scheme=qs, + bits=2, + data_type="int_asym_dq", + nsamples=1, + iters=0, + disable_opt_rtn=True, + seqlen=2, + ) + ar.quantize() \ No newline at end of file From cc9b82804a2ae01ece32064eb0e93bfd7852aa9d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 07:04:17 +0000 Subject: [PATCH 04/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils/device.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 9d546070c..e1ca78bcf 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -24,7 +24,7 @@ import cpuinfo import psutil import torch -from accelerate import infer_auto_device_map, dispatch_model +from accelerate import dispatch_model, infer_auto_device_map from accelerate.utils import get_balanced_memory, get_max_memory from auto_round.logger import logger @@ -1196,14 +1196,15 @@ def find_optimal_subset(arr, target): return result -def dispatch_model_block_wise(model: torch.nn.Module, device_map:str, max_mem_ratio=0.9): + +def dispatch_model_block_wise(model: torch.nn.Module, device_map: str, max_mem_ratio=0.9): if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: import accelerate accelerate.hooks.remove_hook_from_submodules(model) no_split_modules = getattr(model, "_no_split_modules", []) devices = parse_available_devices(device_map) - if len(devices)==1: + if len(devices) == 1: model.to(devices[0]) return model @@ -1230,20 +1231,18 @@ def dispatch_model_block_wise(model: torch.nn.Module, device_map:str, max_mem_ra no_split_module_classes=no_split_modules, ) model.tie_weights() - device_map = infer_auto_device_map( - model, max_memory=new_max_memory, no_split_module_classes=no_split_modules - ) + device_map = infer_auto_device_map(model, max_memory=new_max_memory, no_split_module_classes=no_split_modules) if len(devices) > 1 and "cpu" in device_map.values(): logger.warning( "Some layers are offloaded to cpu, which may severely impact calibration speed." " Please consider using more cards." ) - model = dispatch_model(model, device_map=device_map) return model + def set_avg_auto_device_map(model: torch.nn.Module, device_map): block_name_list = get_block_names(model) device_list = parse_available_devices(device_map) From 9f99e0429f5d356afcbd084e25f68d1018e12164 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 15:06:22 +0800 Subject: [PATCH 05/15] fix --- auto_round/compressors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index da47f4613..22919a5d7 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2153,7 +2153,7 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l elif isinstance(device, str): device = 0 else: - raise f"Unsupported device {device} in device_map: {self.device_map}" + raise ValueError(f"Unsupported device {device} in device_map: {self.device_map}") # Use 90% of the reported max memory to leave headroom for activations, # temporary tensors, other processes, and allocator fragmentation, reducing # the chance of runtime OOM while still utilizing most available memory. 
From 23001c6576901abb35f4a4bd3aa0def2a67a085c Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 15:14:09 +0800 Subject: [PATCH 06/15] fix --- auto_round/eval/eval_cli.py | 6 +++++- auto_round/eval/evaluation.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 938be51c6..d56e66ccc 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -16,13 +16,14 @@ import os import time +import torch.nn from transformers.utils.versions import require_version from auto_round.utils import ( get_device_and_parallelism, get_device_str, get_model_dtype, - set_cuda_visible_devices, + set_cuda_visible_devices, dispatch_model_block_wise, ) @@ -270,6 +271,7 @@ def eval_task_by_task( retry_times=3, mllm=False, add_bos_token=False, + device_map=None, ): require_version( "lm_eval>=0.4.2", "lm-eval is required for evaluation, please install it with `pip install 'lm-eval>=0.4.2'`" @@ -294,6 +296,8 @@ def eval_task_by_task( model, tokenizer, is_gguf_file, gguf_file = _load_gguf_model_if_needed(model, eval_model_dtype) if is_gguf_file: parallelism = False + if isinstance(model, torch.nn.Module): + dispatch_model_block_wise(model,args.device_map) eval_model_dtype = get_model_dtype(eval_model_dtype) if mllm: diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index 27a1fe746..742bd4f0c 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -264,6 +264,7 @@ def evaluate_with_model_instance(model, tokenizer, device_str, args): batch_size=args.eval_bs, eval_model_dtype=get_model_dtype(args.eval_model_dtype, "auto"), add_bos_token=args.add_bos_token, + device_map=args.device_map ) else: # Batch evaluation From 15669d93ca431ac2ea0eafd7a0af59a524877be3 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 15:23:08 +0800 Subject: [PATCH 07/15] fix --- auto_round/eval/eval_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index d56e66ccc..9f35f25e7 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -277,7 +277,7 @@ def eval_task_by_task( "lm_eval>=0.4.2", "lm-eval is required for evaluation, please install it with `pip install 'lm-eval>=0.4.2'`" ) - set_cuda_visible_devices(device) + set_cuda_visible_devices(device_map) device_str, parallelism = get_device_and_parallelism(device) # load after _eval_int in order to make sure import torch after set CUDA_VISIBLE_DEVICES From b8ff386b10807ed488990fbd1bdf30f5823efe93 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 07:23:44 +0000 Subject: [PATCH 08/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/eval/eval_cli.py | 5 +++-- auto_round/eval/evaluation.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 9f35f25e7..b2b302c11 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -20,10 +20,11 @@ from transformers.utils.versions import require_version from auto_round.utils import ( + dispatch_model_block_wise, get_device_and_parallelism, get_device_str, get_model_dtype, - set_cuda_visible_devices, dispatch_model_block_wise, + set_cuda_visible_devices, ) @@ -297,7 +298,7 @@ def eval_task_by_task( if is_gguf_file: parallelism = False if 
isinstance(model, torch.nn.Module): - dispatch_model_block_wise(model,args.device_map) + dispatch_model_block_wise(model, args.device_map) eval_model_dtype = get_model_dtype(eval_model_dtype) if mllm: diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index 742bd4f0c..c12e076e5 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -264,7 +264,7 @@ def evaluate_with_model_instance(model, tokenizer, device_str, args): batch_size=args.eval_bs, eval_model_dtype=get_model_dtype(args.eval_model_dtype, "auto"), add_bos_token=args.add_bos_token, - device_map=args.device_map + device_map=args.device_map, ) else: # Batch evaluation From 439e92bc049963bf9d03a4aa2584fe21e73a44ae Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 15:37:02 +0800 Subject: [PATCH 09/15] fix --- auto_round/eval/eval_cli.py | 2 +- auto_round/utils/device.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 9f35f25e7..27e9b7609 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -297,7 +297,7 @@ def eval_task_by_task( if is_gguf_file: parallelism = False if isinstance(model, torch.nn.Module): - dispatch_model_block_wise(model,args.device_map) + dispatch_model_block_wise(model,device_map="auto") # As we set visible device before, so explcits eval_model_dtype = get_model_dtype(eval_model_dtype) if mllm: diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index e1ca78bcf..7b2ace3f6 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -296,8 +296,12 @@ def get_device_and_parallelism(device: Union[str, torch.device, int]) -> tuple[s return device, parallelism -def set_cuda_visible_devices(device): - devices = device.replace(" ", "").split(",") +def set_cuda_visible_devices(device:str): + if device=="cuda": + devices = ["0"] + else: + devices = device.replace(" ", "").split(",") + devices = [ device.split(":")[-1] for device in devices] if all(s.isdigit() for s in devices): if "CUDA_VISIBLE_DEVICES" in os.environ: current_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] From 68e31b1e07f97137430318b94ef53a63b0144667 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 07:42:24 +0000 Subject: [PATCH 10/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/eval/eval_cli.py | 5 +++-- auto_round/utils/device.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 27e9b7609..6f71d4d97 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -20,10 +20,11 @@ from transformers.utils.versions import require_version from auto_round.utils import ( + dispatch_model_block_wise, get_device_and_parallelism, get_device_str, get_model_dtype, - set_cuda_visible_devices, dispatch_model_block_wise, + set_cuda_visible_devices, ) @@ -297,7 +298,7 @@ def eval_task_by_task( if is_gguf_file: parallelism = False if isinstance(model, torch.nn.Module): - dispatch_model_block_wise(model,device_map="auto") # As we set visible device before, so explcits + dispatch_model_block_wise(model, device_map="auto") # As we set visible device before, so explcits eval_model_dtype = get_model_dtype(eval_model_dtype) if mllm: diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 
7b2ace3f6..5280ca6fd 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -296,12 +296,12 @@ def get_device_and_parallelism(device: Union[str, torch.device, int]) -> tuple[s return device, parallelism -def set_cuda_visible_devices(device:str): - if device=="cuda": +def set_cuda_visible_devices(device: str): + if device == "cuda": devices = ["0"] else: devices = device.replace(" ", "").split(",") - devices = [ device.split(":")[-1] for device in devices] + devices = [device.split(":")[-1] for device in devices] if all(s.isdigit() for s in devices): if "CUDA_VISIBLE_DEVICES" in os.environ: current_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] From 62673d479ac36e325f0495a22ca288f208a51c2e Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Wed, 4 Feb 2026 16:22:07 +0800 Subject: [PATCH 11/15] fix --- auto_round/utils/device.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 5280ca6fd..ba9733243 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -299,6 +299,8 @@ def get_device_and_parallelism(device: Union[str, torch.device, int]) -> tuple[s def set_cuda_visible_devices(device: str): if device == "cuda": devices = ["0"] + elif device=="auto": + return else: devices = device.replace(" ", "").split(",") devices = [device.split(":")[-1] for device in devices] From 548dd511f316ee77d1c6bf04b76efb1a55c81772 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 08:23:16 +0000 Subject: [PATCH 12/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils/device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index ba9733243..81c02fd4b 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -299,7 +299,7 @@ def get_device_and_parallelism(device: Union[str, torch.device, int]) -> tuple[s def set_cuda_visible_devices(device: str): if device == "cuda": devices = ["0"] - elif device=="auto": + elif device == "auto": return else: devices = device.replace(" ", "").split(",") From 37b1b2e9af1173897d484aa51e111fd0f131386e Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 5 Feb 2026 10:11:26 +0800 Subject: [PATCH 13/15] fix eval_task_by_task Signed-off-by: n1ck-guo --- auto_round/eval/eval_cli.py | 6 +++++- auto_round/utils/device.py | 8 +++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 6f71d4d97..fd4732860 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -289,7 +289,11 @@ def eval_task_by_task( if batch_size is None: batch_size = "auto:8" - if not isinstance(model, str): + if not isinstance(model, str) and parallelism: + from accelerate import dispatch_model, infer_auto_device_map + + device_map = infer_auto_device_map(model) + model = dispatch_model(model, device_map=device_map) parallelism = False is_gguf_file = False gguf_file = None diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 81c02fd4b..abbd332e7 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -275,7 +275,13 @@ def is_valid_digit(s): def get_device_and_parallelism(device: Union[str, torch.device, int]) -> tuple[str, bool]: if isinstance(device, str): - devices = device.replace(" ", "").split(",") + if device in ["cuda", "xpu", "hpu"]: 
+ device = detect_device(device) + parallelism = False + return device, parallelism + else: + device = re.sub("xpu:|hpu:|cuda:", "", device) + devices = device.replace(" ", "").split(",") elif isinstance(device, int): devices = [str(device)] else: From 54cd195e03ac699831fee3303e52939ab277450c Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 5 Feb 2026 13:27:26 +0800 Subject: [PATCH 14/15] fix bug Signed-off-by: n1ck-guo --- auto_round/eval/eval_cli.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index fd4732860..c1ccf2b40 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -272,13 +272,12 @@ def eval_task_by_task( retry_times=3, mllm=False, add_bos_token=False, - device_map=None, ): require_version( "lm_eval>=0.4.2", "lm-eval is required for evaluation, please install it with `pip install 'lm-eval>=0.4.2'`" ) - set_cuda_visible_devices(device_map) + set_cuda_visible_devices(device) device_str, parallelism = get_device_and_parallelism(device) # load after _eval_int in order to make sure import torch after set CUDA_VISIBLE_DEVICES From 4e78b49c126580d748672925dc8b0f27f564aa66 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 5 Feb 2026 14:25:01 +0800 Subject: [PATCH 15/15] fix Signed-off-by: n1ck-guo --- auto_round/eval/evaluation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index c12e076e5..27a1fe746 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -264,7 +264,6 @@ def evaluate_with_model_instance(model, tokenizer, device_str, args): batch_size=args.eval_bs, eval_model_dtype=get_model_dtype(args.eval_model_dtype, "auto"), add_bos_token=args.add_bos_token, - device_map=args.device_map, ) else: # Batch evaluation
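
For reference, a minimal usage sketch of the block-wise dispatch helper added in this series, assuming two visible CUDA devices and a placeholder OPT checkpoint; only dispatch_model_block_wise and its signature come from the patches above, the rest is illustrative.

# Sketch: spread an in-memory model over the devices named in a device_map
# string before evaluation. Assumes cuda:0 and cuda:1 are available; the
# checkpoint name is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round.utils import dispatch_model_block_wise

model_name = "facebook/opt-125m"  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# With a single parsed device the helper simply calls model.to(device);
# otherwise it balances blocks across the listed devices, capping each at
# max_mem_ratio of its reported memory, and warns if any layer spills to CPU.
model = dispatch_model_block_wise(model, device_map="cuda:0,cuda:1", max_mem_ratio=0.9)

inputs = tokenizer("Hello, world", return_tensors="pt").to("cuda:0")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)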