From 293d165ac0e105572cee30385a233befe2abf677 Mon Sep 17 00:00:00 2001 From: Charles2530 <2569337619@qq.com> Date: Sun, 15 Mar 2026 09:37:15 +0800 Subject: [PATCH 1/8] support model Multilingual-Multimodal-NLP/IndustrialCoder-32B with vllm --- llmc/compression/quantization/module_utils.py | 8 ++ llmc/models/__init__.py | 1 + llmc/models/industrialcoder.py | 122 ++++++++++++++++++ 3 files changed, 131 insertions(+) create mode 100644 llmc/models/industrialcoder.py diff --git a/llmc/compression/quantization/module_utils.py b/llmc/compression/quantization/module_utils.py index 1c0e6e45..991e6d55 100755 --- a/llmc/compression/quantization/module_utils.py +++ b/llmc/compression/quantization/module_utils.py @@ -445,6 +445,13 @@ def __init__(self, weight, eps=1e-6): def __repr__(self): return 'LlmcQwen2RMSNorm()' +class LlmcIndustrialCoderRMSNorm(LlmcLlamaRMSNorm): + def __init__(self, weight, eps=1e-6): + super().__init__(weight, eps) + + def __repr__(self): + return 'LlmcIndustrialCoderRMSNorm()' + class LlmcMixtralRMSNorm(LlmcLlamaRMSNorm): def __init__(self, weight, eps=1e-6): @@ -1187,6 +1194,7 @@ def __repr__(self): 'Mixtral': LlmcMixtralRMSNorm, 'Interlm2': LlmcInternLM2RMSNorm, 'Qwen2': LlmcQwen2RMSNorm, + 'IndustrialCoder': LlmcIndustrialCoderRMSNorm, 'Gemma2': LlmcGemma2RMSNorm, 'MiniCPM': LlmcMiniCPMRMSNorm, 'Starcoder': LlmcLayerNorm, diff --git a/llmc/models/__init__.py b/llmc/models/__init__.py index 7351995d..d100c975 100755 --- a/llmc/models/__init__.py +++ b/llmc/models/__init__.py @@ -9,6 +9,7 @@ from .internomni import InternOmni from .internvl2 import InternVL2 from .internvl3_5 import InternVL3_5 +from .industrialcoder import IndustrialCoder from .llama import Llama from .llava import Llava from .llava_hf import LlavaHf diff --git a/llmc/models/industrialcoder.py b/llmc/models/industrialcoder.py new file mode 100644 index 00000000..b4a77ee3 --- /dev/null +++ b/llmc/models/industrialcoder.py @@ -0,0 +1,122 @@ +""" +IndustrialCoder (IQuestCoder) model adapter for LLMC quantization. + +Model structure follows IQuestCoderForCausalLM / IQuestCoderModel: + - model.model.embed_tokens, model.model.layers, model.model.norm, model.model.rotary_emb + - model.lm_head + - Each layer: input_layernorm, self_attn (q_proj, k_proj, v_proj, o_proj), + post_attention_layernorm, mlp (gate_proj, up_proj, down_proj) + +Layout is the same as Qwen2-style decoders; this module provides a dedicated +adapter so IndustrialCoder is supported as its own model type, not as Qwen2. +""" + +from importlib.metadata import version + +import packaging + +from llmc.utils.registry_factory import MODEL_REGISTRY + +from .base_model import BaseModel + + +@MODEL_REGISTRY +class IndustrialCoder(BaseModel): + """IndustrialCoder (IQuestCoder) – standalone adapter for blockwise quantization.""" + + def __init__(self, config, device_map=None, use_cache=False): + super().__init__(config, device_map, use_cache) + + def find_blocks(self): + # IQuestCoderForCausalLM.model -> IQuestCoderModel with .layers + self.blocks = self.model.model.layers + + def find_embed_layers(self): + base = self.model.model + self.embed_tokens = base.embed_tokens + if hasattr(base, 'rotary_emb') and ( + packaging.version.parse(version('transformers')) >= packaging.version.parse('4.45.0') + ): + self.rotary_emb = base.rotary_emb + + def find_block_name(self): + self.block_name_prefix = 'model.layers' + + def get_embed_layers(self): + return [self.embed_tokens] + + def get_attn_in_block(self, block): + return {'self_attn': block.self_attn} + + def get_attention_rotary_layers(self): + if packaging.version.parse(version('transformers')) >= packaging.version.parse('4.45.0'): + return [self.rotary_emb] if hasattr(self, 'rotary_emb') and self.rotary_emb is not None else [] + return [] + + def get_head_layers(self): + return [self.model.lm_head] + + def get_pre_head_layernorm_layers(self): + return [self.model.model.norm] + + def get_layers_except_blocks(self): + if packaging.version.parse(version('transformers')) >= packaging.version.parse('4.45.0'): + rotary = [self.rotary_emb] if hasattr(self, 'rotary_emb') and self.rotary_emb is not None else [] + return [self.embed_tokens] + rotary + [self.model.model.norm, self.model.lm_head] + return [self.embed_tokens, self.model.model.norm, self.model.lm_head] + + def skip_layer_name(self): + return ['lm_head'] + + def has_bias(self): + # IQuestCoder config: attention_bias, mlp_bias (often False) + cfg = self.model_config + return getattr(cfg, 'attention_bias', False) or getattr(cfg, 'mlp_bias', False) + + def get_layernorms_in_block(self, block): + return { + 'input_layernorm': block.input_layernorm, + 'post_attention_layernorm': block.post_attention_layernorm, + } + + def get_subsets_in_block(self, block): + # Same layout as Qwen2 / IQuestCoderDecoderLayer + return [ + { + 'layers': { + 'self_attn.q_proj': block.self_attn.q_proj, + 'self_attn.k_proj': block.self_attn.k_proj, + 'self_attn.v_proj': block.self_attn.v_proj, + }, + 'prev_op': [block.input_layernorm], + 'input': ['self_attn.q_proj'], + 'inspect': block.self_attn, + 'has_kwargs': True, + }, + { + 'layers': {'self_attn.o_proj': block.self_attn.o_proj}, + 'prev_op': [block.self_attn.v_proj], + 'input': ['self_attn.o_proj'], + 'inspect': block.self_attn.o_proj, + 'has_kwargs': False, + }, + { + 'layers': { + 'mlp.gate_proj': block.mlp.gate_proj, + 'mlp.up_proj': block.mlp.up_proj, + }, + 'prev_op': [block.post_attention_layernorm], + 'input': ['mlp.gate_proj'], + 'inspect': block.mlp, + 'has_kwargs': False, + 'is_mlp': True, + }, + { + 'layers': {'mlp.down_proj': block.mlp.down_proj}, + 'prev_op': [block.mlp.up_proj], + 'input': ['mlp.down_proj'], + 'inspect': block.mlp.down_proj, + 'has_kwargs': False, + 'is_mlp': True, + }, + ] From 96e4a712ea881a325ca78539f2f52bd38bd5d492 Mon Sep 17 00:00:00 2001 From: Charles2530 <2569337619@qq.com> Date: Sun, 15 Mar 2026 09:54:42 +0800 Subject: [PATCH 2/8] Add IndustrialCoder configs and run script --- .gitignore | 2 +- .../fp8/industrialcoder_rtn_fp8_wikitext.yml | 35 +++++++++++++++ .../industrialcoder_rtn_int_awq_wikitext.yml | 41 ++++++++++++++++++ .../industrialcoder_rtn_int_gptq_wikitext.yml | 43 +++++++++++++++++++ llmc/data/dataset/base_dataset.py | 7 +-- scripts/run_llmc_industrialcoder_fp8.sh | 35 +++++++++++++++ 6 files changed, 159 insertions(+), 4 deletions(-) create mode 100644 configs/quantization/backend/vllm/fp8/industrialcoder_rtn_fp8_wikitext.yml create mode 100644 configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_awq_wikitext.yml create mode 100644 configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_gptq_wikitext.yml create mode 100755 scripts/run_llmc_industrialcoder_fp8.sh diff --git a/.gitignore b/.gitignore index 06eb95ea..05685337 100644 --- a/.gitignore +++ b/.gitignore @@ -22,7 +22,7 @@ save* .log *.pid *.ipynb* -models/ +model/ output_* HiFloat4/ datasets/ \ No newline at end of file diff --git a/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_fp8_wikitext.yml b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_fp8_wikitext.yml new file mode 100644 index 00000000..8ee12934 --- /dev/null +++ b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_fp8_wikitext.yml @@ -0,0 +1,35 @@ +base: + seed: &seed 42 +model: + type: IndustrialCoder + path: models/IndustrialCoder-32B + tokenizer_mode: slow + torch_dtype: auto + # Reduce peak memory in catcher stage for large models. + use_cpu_to_save_cuda_mem_for_catcher: False +eval: + eval_pos: [fake_quant] + name: wikitext2 + download: True + seq_len: 2048 + bs: 1 + inference_per_block: False +quant: + method: RTN + weight: + quant_type: float-quant + bit: e4m3 + symmetric: True + granularity: per_channel + block_size: 128 + use_qtorch: True + act: + quant_type: float-quant + bit: e4m3 + symmetric: True + granularity: per_token + block_size: 128 + use_qtorch: True +save: + save_vllm: True + save_path: ./save_for_vllm/industrialcoder_rtn_fp8_wikitext/ diff --git a/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_awq_wikitext.yml b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_awq_wikitext.yml new file mode 100644 index 00000000..3196a464 --- /dev/null +++ b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_awq_wikitext.yml @@ -0,0 +1,41 @@ +base: + seed: &seed 42 +model: + type: IndustrialCoder + path: models/IndustrialCoder-32B + tokenizer_mode: slow + torch_dtype: auto + # Reduce peak memory in catcher stage for large models. + use_cpu_to_save_cuda_mem_for_catcher: False +calib: + name: pileval + download: True + # path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: txt_general_preproc + seed: *seed +eval: + eval_pos: [fake_quant] + name: wikitext2 + download: True + seq_len: 2048 + bs: 20 + inference_per_block: True +quant: + method: Awq + weight: + bit: 4 + symmetric: True + granularity: per_group + group_size: 128 + need_pack: True + special: + trans: True + trans_version: v2 + weight_clip: True + quant_out: True +save: + save_vllm: True + save_path: ./save_for_vllm/industrialcoder_rtn_int_awq_wikitext/ diff --git a/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_gptq_wikitext.yml b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_gptq_wikitext.yml new file mode 100644 index 00000000..5dbaf29f --- /dev/null +++ b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_gptq_wikitext.yml @@ -0,0 +1,43 @@ +base: + seed: &seed 42 +model: + type: IndustrialCoder + path: models/IndustrialCoder-32B + tokenizer_mode: slow + torch_dtype: auto + # Reduce peak memory in catcher stage for large models. + use_cpu_to_save_cuda_mem_for_catcher: False +calib: + name: wikitext2 + download: True + n_samples: 128 + # path: calib data path + bs: 1 + seq_len: 2048 + preproc: wikitext2_gptq + seed: *seed +eval: + eval_pos: [fake_quant] + name: wikitext2 + download: True + seq_len: 2048 + bs: 20 + inference_per_block: True +quant: + method: GPTQ + weight: + bit: 4 + symmetric: True + granularity: per_group + group_size: 128 + need_pack: True + special: + actorder: True + static_groups: True + percdamp: 0.01 + blocksize: 128 + true_sequential: True + quant_out: True +save: + save_vllm: True + save_path: ./save_for_vllm/industrialcoder_rtn_int_gptq_wikitext/ diff --git a/llmc/data/dataset/base_dataset.py b/llmc/data/dataset/base_dataset.py index 7af3de73..c2295e6b 100755 --- a/llmc/data/dataset/base_dataset.py +++ b/llmc/data/dataset/base_dataset.py @@ -167,9 +167,10 @@ def get_batch_process(self, samples): return calib_model_inputs def get_calib_dataset(self): - samples = self.calib_dataset[ - int(os.environ['RANK'])::int(os.environ['WORLD_SIZE']) - ] + samples = self.calib_dataset.shard( + num_shards=int(os.environ['WORLD_SIZE']), + index=int(os.environ['RANK']) + ) logger.info(f'len(samples) rank : {len(samples)}') calib_model_inputs = self.get_calib_model_inputs(samples) diff --git a/scripts/run_llmc_industrialcoder_fp8.sh b/scripts/run_llmc_industrialcoder_fp8.sh new file mode 100755 index 00000000..436a2e28 --- /dev/null +++ b/scripts/run_llmc_industrialcoder_fp8.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +model_name=industrialcoder +method_name=rtn_int_gptq +dataset_name=wikitext + +log_name=${model_name}_${method_name}_${dataset_name} +rm -rf ./save_for_vllm/${log_name}/ +llmc=. +export PYTHONPATH=$llmc:$PYTHONPATH +config=${llmc}/configs/quantization/backend/vllm/fp8/${log_name}.yml +nnodes=1 +nproc_per_node=8 + +find_unused_port() { + while true; do + port=$(shuf -i 10000-60000 -n 1) + if ! ss -tuln | grep -q ":$port "; then + echo "$port" + return 0 + fi + done +} +UNUSED_PORT=$(find_unused_port) +MASTER_ADDR=127.0.0.1 +MASTER_PORT=$UNUSED_PORT +task_id=$UNUSED_PORT + + +torchrun \ +--nnodes $nnodes \ +--nproc_per_node $nproc_per_node \ +--rdzv_id $task_id \ +--rdzv_backend c10d \ +--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ +${llmc}/llmc/__main__.py --config $config --task_id $task_id |tee ${log_name}.log From 91f72456ec0281ae5235f1aa747134d0a994ee80 Mon Sep 17 00:00:00 2001 From: Charles2530 <2569337619@qq.com> Date: Mon, 16 Mar 2026 11:53:00 +0800 Subject: [PATCH 3/8] all --- .../fp8/industrialcoder_rtn_fp8_wikitext.yml | 4 +- .../industrialcoder_rtn_int_awq_wikitext.yml | 2 +- .../industrialcoder_rtn_int_gptq_wikitext.yml | 2 +- .../video_gen/wan2_2_t2v/awq_w_a.yaml | 15 +- llmc/compression/quantization/module_utils.py | 1 - llmc/utils/export_vllm.py | 3 +- scripts/run_llmc.sh | 2 +- scripts/run_llmc_industrialcoder_fp8.sh | 2 +- scripts/test_load_vllm_quant_state_dict.py | 128 ++++++++++++++++++ 9 files changed, 142 insertions(+), 17 deletions(-) create mode 100644 scripts/test_load_vllm_quant_state_dict.py diff --git a/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_fp8_wikitext.yml b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_fp8_wikitext.yml index 8ee12934..d7a02d3c 100644 --- a/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_fp8_wikitext.yml +++ b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_fp8_wikitext.yml @@ -2,7 +2,7 @@ base: seed: &seed 42 model: type: IndustrialCoder - path: models/IndustrialCoder-32B + path: model/IndustrialCoder-32B tokenizer_mode: slow torch_dtype: auto # Reduce peak memory in catcher stage for large models. @@ -21,14 +21,12 @@ quant: bit: e4m3 symmetric: True granularity: per_channel - block_size: 128 use_qtorch: True act: quant_type: float-quant bit: e4m3 symmetric: True granularity: per_token - block_size: 128 use_qtorch: True save: save_vllm: True diff --git a/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_awq_wikitext.yml b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_awq_wikitext.yml index 3196a464..7e19446a 100644 --- a/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_awq_wikitext.yml +++ b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_awq_wikitext.yml @@ -2,7 +2,7 @@ base: seed: &seed 42 model: type: IndustrialCoder - path: models/IndustrialCoder-32B + path: model/IndustrialCoder-32B tokenizer_mode: slow torch_dtype: auto # Reduce peak memory in catcher stage for large models. diff --git a/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_gptq_wikitext.yml b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_gptq_wikitext.yml index 5dbaf29f..fec3c68f 100644 --- a/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_gptq_wikitext.yml +++ b/configs/quantization/backend/vllm/fp8/industrialcoder_rtn_int_gptq_wikitext.yml @@ -2,7 +2,7 @@ base: seed: &seed 42 model: type: IndustrialCoder - path: models/IndustrialCoder-32B + path: model/IndustrialCoder-32B tokenizer_mode: slow torch_dtype: auto # Reduce peak memory in catcher stage for large models. diff --git a/configs/quantization/video_gen/wan2_2_t2v/awq_w_a.yaml b/configs/quantization/video_gen/wan2_2_t2v/awq_w_a.yaml index 75c8c61b..bf66b40b 100644 --- a/configs/quantization/video_gen/wan2_2_t2v/awq_w_a.yaml +++ b/configs/quantization/video_gen/wan2_2_t2v/awq_w_a.yaml @@ -2,19 +2,17 @@ base: seed: &seed 42 model: type: Wan2T2V - path: /mnt/lm_data_afs/wangzining/charles/lab/llmc/models/Wan2.2-T2V-A14B-Diffusers + path: /mnt/lm_data_afs/wangzining/charles/lab/llmc/model/Wan2.2-T2V-A14B-Diffusers torch_dtype: auto - # 显存不足时开启:校准阶段捕获的激活存到 CPU,量化时再按 block 搬到 GPU - use_cpu_to_save_cuda_mem_for_catcher: True calib: name: t2v download: False path: ./assets/wan_t2v/calib/ - sample_steps: 20 # OOM 时可减小,如 8 或 10 + sample_steps: 20 bs: 1 - target_height: 480 # OOM 时可减小,如 320 - target_width: 832 # OOM 时可减小,如 576 - num_frames: 81 # OOM 时可减小,如 49 或 33 + target_height: 480 + target_width: 832 + num_frames: 81 guidance_scale: 5.0 seed: *seed eval: @@ -29,6 +27,7 @@ eval: num_frames: 81 guidance_scale: 5.0 output_video_path: ./output_videos_awq/ + inference_per_block: True quant: video_gen: method: Awq @@ -52,4 +51,4 @@ quant: clip_sym: True save: save_lightx2v: True - save_path: ../lightx2v/wan2_2_t2v_awq_w_a/x2v/ + save_path: ./save_for_lightx2v/wan2_2_t2v/awq_w_a/original/ diff --git a/llmc/compression/quantization/module_utils.py b/llmc/compression/quantization/module_utils.py index 991e6d55..fdfe47bc 100755 --- a/llmc/compression/quantization/module_utils.py +++ b/llmc/compression/quantization/module_utils.py @@ -899,7 +899,6 @@ def new(cls, module, w_q, quant_config): bias = None need_pack = quant_config['weight'].get('need_pack', False) - if quant_config['weight']['granularity'] == 'per_block': scales_name = 'weight_scale_inv' else: diff --git a/llmc/utils/export_vllm.py b/llmc/utils/export_vllm.py index 1128c3df..87271daf 100755 --- a/llmc/utils/export_vllm.py +++ b/llmc/utils/export_vllm.py @@ -31,7 +31,8 @@ def update_vllm_quant_config( with open(config_file, 'w') as file: json.dump(config_vllm, file, indent=4) return - elif config.quant.weight.get('granularity', 'per_block'): + # elif config.quant.weight.get('granularity', 'per_block'): + elif config.quant.weight.get('granularity') == 'per_block': quant_config = { 'activation_scheme': 'dynamic', 'fmt': 'e4m3', diff --git a/scripts/run_llmc.sh b/scripts/run_llmc.sh index efc4141a..0bd994f9 100755 --- a/scripts/run_llmc.sh +++ b/scripts/run_llmc.sh @@ -8,7 +8,7 @@ model_name=wan2_2_t2v task_name=awq_w_a # task_name=awq_w_a_s log_name=${model_name}_${task_name} -rm -rf ../lightx2v/${log_name}/x2v/lightx2v_quant_model +rm -rf ./save_for_lightx2v/${model_name}/${task_name}/original llmc=. export PYTHONPATH=$llmc:$PYTHONPATH config=${llmc}/configs/quantization/video_gen/${model_name}/${task_name}.yaml diff --git a/scripts/run_llmc_industrialcoder_fp8.sh b/scripts/run_llmc_industrialcoder_fp8.sh index 436a2e28..6dad8c00 100755 --- a/scripts/run_llmc_industrialcoder_fp8.sh +++ b/scripts/run_llmc_industrialcoder_fp8.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash model_name=industrialcoder -method_name=rtn_int_gptq +method_name=rtn_fp8 dataset_name=wikitext log_name=${model_name}_${method_name}_${dataset_name} diff --git a/scripts/test_load_vllm_quant_state_dict.py b/scripts/test_load_vllm_quant_state_dict.py new file mode 100644 index 00000000..9cd40bb6 --- /dev/null +++ b/scripts/test_load_vllm_quant_state_dict.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +""" +Load the vLLM quant model from save_for_vllm/industrialcoder_rtn_fp8_wikitext/vllm_quant_model +and print state_dict keys (and optionally full state_dict). +""" +import argparse +import os +import sys + +# allow running from repo root +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import torch +from transformers import AutoConfig, AutoModelForCausalLM + + +def main(): + parser = argparse.ArgumentParser(description="Load vLLM quant model and print state_dict") + parser.add_argument( + "model_dir", + nargs="?", + default="save_for_vllm/industrialcoder_rtn_int_awq_wikitext/", + help="Path to vllm_quant_model directory", + ) + parser.add_argument( + "--list-keys", + action="store_true", + help="Print all state_dict keys (default: only summary and weight_scale keys)", + ) + parser.add_argument( + "--no-load-weights", + action="store_true", + help="Only load config and print expected keys from index (no full model load)", + ) + parser.add_argument( + "--cpu", + action="store_true", + help="Load model on CPU (default: load on GPU)", + ) + args = parser.parse_args() + + model_dir = os.path.abspath(args.model_dir) + if not os.path.isdir(model_dir): + print(f"Error: not a directory: {model_dir}") + sys.exit(1) + + config_path = os.path.join(model_dir, "config.json") + if not os.path.isfile(config_path): + print(f"Error: config.json not found in {model_dir}") + sys.exit(1) + + print(f"Loading from: {model_dir}\n") + + if args.no_load_weights: + # Only inspect index / config without loading full model + config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) + print("Config model_type:", getattr(config, "model_type", "?")) + index_path = os.path.join(model_dir, "model.safetensors.index.json") + if os.path.isfile(index_path): + import json + with open(index_path) as f: + index = json.load(f) + meta = index.get("metadata", {}) + weight_map = index.get("weight_map", {}) + print(f"Total tensors in index: {len(weight_map)}") + print("\nFirst 20 keys in weight_map:") + for i, k in enumerate(sorted(weight_map.keys())): + if i >= 20: + print(" ...") + break + print(f" {k}") + weight_scale_keys = [k for k in weight_map if "weight_scale" in k] + print(f"\nKeys containing 'weight_scale': {len(weight_scale_keys)}") + for k in sorted(weight_scale_keys)[:30]: + print(f" {k}") + if len(weight_scale_keys) > 30: + print(f" ... and {len(weight_scale_keys) - 30} more") + return + + device_map = "cpu" if args.cpu else "cuda:0" + print(f"Loading full model on {device_map} (may take a while and use significant memory)...") + model = AutoModelForCausalLM.from_pretrained( + model_dir, + trust_remote_code=True, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + device_map=device_map, + ) + + state_dict = model.state_dict() + keys = list(state_dict.keys()) + print(f"Total keys in state_dict: {len(keys)}\n") + + if args.list_keys: + print("All state_dict keys:") + for k in sorted(keys): + t = state_dict[k] + print(f" {k} shape={tuple(t.shape)} dtype={t.dtype}") + else: + print("Sample keys (first 30):") + for k in sorted(keys)[:30]: + t = state_dict[k] + print(f" {k} shape={tuple(t.shape)} dtype={t.dtype}") + if len(keys) > 30: + print(" ...") + + weight_scale_keys = [k for k in keys if "weight_scale" in k] + print(f"\nKeys containing 'weight_scale': {len(weight_scale_keys)}") + for k in sorted(weight_scale_keys): + t = state_dict[k] + print(f" {k} shape={tuple(t.shape)} dtype={t.dtype}") + + # Check for the key that was missing in the error + target = "layers.0.mlp.down_proj.weight_scale" + if target in keys: + print(f"\nKey '{target}' present in state_dict.") + else: + print(f"\nKey '{target}' NOT in state_dict.") + # Show similar keys + similar = [k for k in keys if "down_proj" in k and "weight_scale" in k] + if similar: + print("Similar keys (down_proj + weight_scale):") + for k in sorted(similar)[:10]: + print(f" {k}") + + +if __name__ == "__main__": + main() From b2b4e831f5b722ce9f35ded018ffc21cbbf30464 Mon Sep 17 00:00:00 2001 From: Charles2530 <2569337619@qq.com> Date: Mon, 16 Mar 2026 18:15:49 +0800 Subject: [PATCH 4/8] all --- scripts/test_load_vllm_quant_state_dict.py | 130 ++++++++++++++++++--- 1 file changed, 114 insertions(+), 16 deletions(-) diff --git a/scripts/test_load_vllm_quant_state_dict.py b/scripts/test_load_vllm_quant_state_dict.py index 9cd40bb6..3e24cf90 100644 --- a/scripts/test_load_vllm_quant_state_dict.py +++ b/scripts/test_load_vllm_quant_state_dict.py @@ -1,9 +1,13 @@ #!/usr/bin/env python3 """ -Load the vLLM quant model from save_for_vllm/industrialcoder_rtn_fp8_wikitext/vllm_quant_model -and print state_dict keys (and optionally full state_dict). +Load the vLLM quant model from save_for_vllm/.../vllm_quant_model and print state_dict keys. +Supports: + - Full model via from_pretrained (HF) + - State_dict only via torch.load / safetensors + - Mimic vLLM: load with vLLM's LLM() like vLLM does for FP8 (--vllm) """ import argparse +import json import os import sys @@ -14,12 +18,69 @@ from transformers import AutoConfig, AutoModelForCausalLM +def load_state_dict_only(model_dir, device="cpu"): + """Load state_dict from disk using safetensors or torch.load (no model instantiation).""" + model_dir = os.path.abspath(model_dir) + state_dict = {} + + # 1) Safetensors: sharded (model.safetensors.index.json) or single (model.safetensors) + index_path = os.path.join(model_dir, "model.safetensors.index.json") + if os.path.isfile(index_path): + with open(index_path) as f: + index = json.load(f) + weight_map = index.get("weight_map", {}) + shard_paths = sorted(set(weight_map.values())) + try: + from safetensors.torch import load_file + except ImportError: + raise RuntimeError("Safetensors format detected but 'safetensors' not installed. pip install safetensors") + for shard_name in shard_paths: + shard_path = os.path.join(model_dir, shard_name) + if not os.path.isfile(shard_path): + raise FileNotFoundError(f"Shard not found: {shard_path}") + tensors = load_file(shard_path, device=device) + state_dict.update(tensors) + return state_dict + + single_safetensors = os.path.join(model_dir, "model.safetensors") + if os.path.isfile(single_safetensors): + try: + from safetensors.torch import load_file + except ImportError: + raise RuntimeError("safetensors not installed. pip install safetensors") + return dict(load_file(single_safetensors, device=device)) + + # 2) PyTorch .bin / .pt: torch.load + def _torch_load(path): + try: + return torch.load(path, map_location=device, weights_only=True) + except TypeError: + return torch.load(path, map_location=device) + + for name in ("pytorch_model.bin", "model.pt", "pytorch_model.pt"): + path = os.path.join(model_dir, name) + if os.path.isfile(path): + return _torch_load(path) + # Sometimes sharded as model-00001-of-00003.bin + import glob + bin_files = sorted(glob.glob(os.path.join(model_dir, "pytorch_model*.bin"))) + if bin_files: + for path in bin_files: + state_dict.update(_torch_load(path)) + return state_dict + + raise FileNotFoundError( + f"No state dict found in {model_dir}. " + "Expected: model.safetensors.index.json + .safetensors, model.safetensors, or pytorch_model.bin / .pt" + ) + + def main(): parser = argparse.ArgumentParser(description="Load vLLM quant model and print state_dict") parser.add_argument( "model_dir", nargs="?", - default="save_for_vllm/industrialcoder_rtn_int_awq_wikitext/", + default="save_for_vllm/industrialcoder_rtn_fp8_wikitext/", help="Path to vllm_quant_model directory", ) parser.add_argument( @@ -37,6 +98,16 @@ def main(): action="store_true", help="Load model on CPU (default: load on GPU)", ) + parser.add_argument( + "--state-dict-only", + action="store_true", + help="Load only state_dict via torch.load / safetensors (no full model). Lighter and faster for key inspection.", + ) + parser.add_argument( + "--vllm", + action="store_true", + help="Load model with vLLM's LLM() (same as vLLM does for FP8). Requires vLLM installed. Use to verify vLLM compatibility.", + ) args = parser.parse_args() model_dir = os.path.abspath(args.model_dir) @@ -51,13 +122,36 @@ def main(): print(f"Loading from: {model_dir}\n") + if args.vllm: + # Mimic vLLM loading FP8 model (same code path vLLM uses) + try: + from vllm import LLM + except ImportError as e: + print("Error: vLLM is not installed. Install with: pip install vllm") + sys.exit(1) + print("Loading with vLLM LLM() (same as vLLM for FP8 / compressed-tensors)...") + try: + llm = LLM( + model=model_dir, + trust_remote_code=True, + tensor_parallel_size=1, + ) + print("OK: vLLM loaded the model successfully.") + # Optional: print one sample to confirm inference + out = llm.generate(["Hello"], max_tokens=4) + print("Sample generate:", out) + except Exception as e: + print(f"vLLM load failed: {type(e).__name__}: {e}") + import traceback + traceback.print_exc() + return + if args.no_load_weights: # Only inspect index / config without loading full model config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) print("Config model_type:", getattr(config, "model_type", "?")) index_path = os.path.join(model_dir, "model.safetensors.index.json") if os.path.isfile(index_path): - import json with open(index_path) as f: index = json.load(f) meta = index.get("metadata", {}) @@ -77,17 +171,21 @@ def main(): print(f" ... and {len(weight_scale_keys) - 30} more") return - device_map = "cpu" if args.cpu else "cuda:0" - print(f"Loading full model on {device_map} (may take a while and use significant memory)...") - model = AutoModelForCausalLM.from_pretrained( - model_dir, - trust_remote_code=True, - torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, - device_map=device_map, - ) - - state_dict = model.state_dict() + if args.state_dict_only: + device = "cpu" if args.cpu else "cuda:0" + print(f"Loading state_dict only (torch.load / safetensors) on {device}...") + state_dict = load_state_dict_only(model_dir, device=device) + else: + device_map = "cpu" if args.cpu else "cuda:0" + print(f"Loading full model on {device_map} (may take a while and use significant memory)...") + model = AutoModelForCausalLM.from_pretrained( + model_dir, + trust_remote_code=True, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + device_map=device_map, + ) + state_dict = model.state_dict() keys = list(state_dict.keys()) print(f"Total keys in state_dict: {len(keys)}\n") @@ -111,7 +209,7 @@ def main(): print(f" {k} shape={tuple(t.shape)} dtype={t.dtype}") # Check for the key that was missing in the error - target = "layers.0.mlp.down_proj.weight_scale" + target = "model.layers.0.mlp.down_proj.weight_scale" if target in keys: print(f"\nKey '{target}' present in state_dict.") else: From b9b342b2420223d3b17abc60c3ca1b4b56e8a8e5 Mon Sep 17 00:00:00 2001 From: Charles2530 <2569337619@qq.com> Date: Mon, 30 Mar 2026 16:08:00 +0800 Subject: [PATCH 5/8] quantize Industral-Coder-Thinging --- .../backend/vllm/fp8/thinking_model_fp8.yml | 33 ++++++++++++ .../backend/vllm/thinkingmodel/gptq_w4a16.yml | 41 +++++++++++++++ .../thinkingmodel/thinkingmodel_awq_w4a16.yml | 43 ++++++++++++++++ .../thinkingmodel_awq_w4a16_2.yml | 42 +++++++++++++++ run_awq_w4a16.sh | 51 +++++++++++++++++++ run_fp8.sh | 50 ++++++++++++++++++ run_gptq_w4.sh | 50 ++++++++++++++++++ thinkingmodel | 1 + 8 files changed, 311 insertions(+) create mode 100644 configs/quantization/backend/vllm/fp8/thinking_model_fp8.yml create mode 100644 configs/quantization/backend/vllm/thinkingmodel/gptq_w4a16.yml create mode 100644 configs/quantization/backend/vllm/thinkingmodel/thinkingmodel_awq_w4a16.yml create mode 100644 configs/quantization/backend/vllm/thinkingmodel/thinkingmodel_awq_w4a16_2.yml create mode 100755 run_awq_w4a16.sh create mode 100755 run_fp8.sh create mode 100755 run_gptq_w4.sh create mode 120000 thinkingmodel diff --git a/configs/quantization/backend/vllm/fp8/thinking_model_fp8.yml b/configs/quantization/backend/vllm/fp8/thinking_model_fp8.yml new file mode 100644 index 00000000..8dd8ae0e --- /dev/null +++ b/configs/quantization/backend/vllm/fp8/thinking_model_fp8.yml @@ -0,0 +1,33 @@ +base: + seed: &seed 42 +model: + type: IndustrialCoder + path: /mnt/lm_data_afs/wangzining/charles/industrial_thinking/IndustrialCoder-Thinking + tokenizer_mode: slow + torch_dtype: auto + # Reduce peak memory in catcher stage for large models. + use_cpu_to_save_cuda_mem_for_catcher: False +eval: + eval_pos: [fake_quant] + name: wikitext2 + download: True + seq_len: 2048 + bs: 1 + inference_per_block: False +quant: + method: RTN + weight: + quant_type: float-quant + bit: e4m3 + symmetric: True + granularity: per_channel + use_qtorch: True + act: + quant_type: float-quant + bit: e4m3 + symmetric: True + granularity: per_token + use_qtorch: True +save: + save_vllm: True + save_path: ./save_for_vllm/thinking_rtn_fp8_wikitext/ diff --git a/configs/quantization/backend/vllm/thinkingmodel/gptq_w4a16.yml b/configs/quantization/backend/vllm/thinkingmodel/gptq_w4a16.yml new file mode 100644 index 00000000..48c2c14a --- /dev/null +++ b/configs/quantization/backend/vllm/thinkingmodel/gptq_w4a16.yml @@ -0,0 +1,41 @@ +base: + seed: &seed 0 +model: + type: IndustrialCoder + path: /mnt/lm_data_afs/wangzining/charles/lab/llmc/thinkingmodel/IndustrialCoder-Thinking + torch_dtype: auto +calib: + name: wikitext2 + download: True + n_samples: 128 + # path: calib data path + bs: 1 + seq_len: 2048 + preproc: wikitext2_gptq + seed: *seed +eval: + eval_pos: [fake_quant] + name: wikitext2 + download: True + # path: eval data path + bs: 1 + seq_len: 2048 + inference_per_block: False +quant: + method: GPTQ + weight: + bit: 4 + symmetric: True + granularity: per_group + group_size: 128 + need_pack: True + special: + actorder: True + static_groups: True + percdamp: 0.01 + blocksize: 128 + true_sequential: True + quant_out: True +save: + save_vllm: True + save_path: /mnt/lm_data_afs/wangzining/charles/lab/llmc/save_for_vllm/thinking_gptq_w4/ diff --git a/configs/quantization/backend/vllm/thinkingmodel/thinkingmodel_awq_w4a16.yml b/configs/quantization/backend/vllm/thinkingmodel/thinkingmodel_awq_w4a16.yml new file mode 100644 index 00000000..63ff8278 --- /dev/null +++ b/configs/quantization/backend/vllm/thinkingmodel/thinkingmodel_awq_w4a16.yml @@ -0,0 +1,43 @@ +base: + seed: &seed 42 +model: + type: IndustrialCoder + path: /mnt/lm_data_afs/wangzining/charles/industrial_thinking/IndustrialCoder-Thinking + tokenizer_mode: slow + torch_dtype: auto + + use_cpu_to_save_cuda_mem_for_catcher: False +calib: + name: wikitext2 + download: True + # path: /mnt/lm_data_afs/wangzining/charles/datasets/wikitext2 + n_samples: 128 + bs: 1 + seq_len: 512 + preproc: txt_general_preproc + seed: *seed +eval: + eval_pos: [fake_quant] + name: wikitext2 + download: True + # path: /mnt/lm_data_afs/wangzining/charles/datasets/wikitext2 + seq_len: 2048 + bs: 1 + inference_per_block: False +quant: + method: Awq + weight: + bit: 4 + symmetric: True + granularity: per_group + group_size: 128 + need_pack: True + special: + trans: True + trans_version: v2 + weight_clip: True + quant_out: True +save: + save_vllm: True + + save_path: /mnt/lm_data_afs/wangzining/charles/lab/llmc/save_for_vllm/thinking_awq_w4/ diff --git a/configs/quantization/backend/vllm/thinkingmodel/thinkingmodel_awq_w4a16_2.yml b/configs/quantization/backend/vllm/thinkingmodel/thinkingmodel_awq_w4a16_2.yml new file mode 100644 index 00000000..b99cdc35 --- /dev/null +++ b/configs/quantization/backend/vllm/thinkingmodel/thinkingmodel_awq_w4a16_2.yml @@ -0,0 +1,42 @@ +base: + seed: &seed 42 +model: + type: IndustrialCoder + path: /mnt/lm_data_afs/wangzining/charles/industrial_thinking/IndustrialCoder-Thinking + tokenizer_mode: slow + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: -1 + seq_len: 512 + preproc: txt_general_preproc + seed: *seed +eval: + eval_pos: [fake_quant] + name: wikitext2 + download: False + path: eval data path + seq_len: 2048 + # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False". + # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True". + bs: 1 + inference_per_block: False +quant: + method: Awq + weight: + bit: 4 + symmetric: True + granularity: per_group + group_size: 128 + need_pack: True + special: + trans: True + trans_version: v2 + weight_clip: True + quant_out: True +save: + save_vllm: True + save_path: /path/to/save_for_vllm_awq_w4/ diff --git a/run_awq_w4a16.sh b/run_awq_w4a16.sh new file mode 100755 index 00000000..a86bb5d3 --- /dev/null +++ b/run_awq_w4a16.sh @@ -0,0 +1,51 @@ +#!/bin/bash +export PATH=/mnt/lm_data_afs/wangzining/charles/miniconda3/envs/llmc/bin:$PATH +export PYTHON=/mnt/lm_data_afs/wangzining/charles/miniconda3/envs/llmc/bin/python +export PIP=/mnt/lm_data_afs/wangzining/charles/miniconda3/envs/llmc/bin/pip +export HF_ENDPOINT=https://hf-mirror.com + +cd /mnt/lm_data_afs/wangzining/charles/lab/llmc + + +model_name=thinking_model +method_name=awq +dataset_name=wikitext +# ============================== + +log_name=${model_name}_${method_name}_${dataset_name} +rm -rf ./save_for_vllm/${log_name}/ + +llmc=. +export PYTHONPATH=$llmc:$PYTHONPATH + + +config=${llmc}/configs/quantization/backend/vllm/thinkingmodel/thinkingmodel_awq_w4a16.yml + + +nnodes=1 +nproc_per_node=4 +# ========================== + +find_unused_port() { + while true; do + port=$(shuf -i 10000-60000 -n 1) + if ! ss -tuln | grep -q ":$port "; then + echo "$port" + return 0 + fi + done +} +UNUSED_PORT=$(find_unused_port) +MASTER_ADDR=127.0.0.1 +MASTER_PORT=$UNUSED_PORT +task_id=$UNUSED_PORT + +echo "开始执行任务,日志将保存在 ${log_name}.log" + +torchrun \ +--nnodes $nnodes \ +--nproc_per_node $nproc_per_node \ +--rdzv_id $task_id \ +--rdzv_backend c10d \ +--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ +${llmc}/llmc/__main__.py --config $config --task_id $task_id | tee ${log_name}.log diff --git a/run_fp8.sh b/run_fp8.sh new file mode 100755 index 00000000..f68a8f89 --- /dev/null +++ b/run_fp8.sh @@ -0,0 +1,50 @@ +#!/bin/bash +export PATH=/mnt/lm_data_afs/wangzining/charles/miniconda3/envs/llmc/bin:$PATH +export PYTHON=/mnt/lm_data_afs/wangzining/charles/miniconda3/envs/llmc/bin/python +export PIP=/mnt/lm_data_afs/wangzining/charles/miniconda3/envs/llmc/bin/pip +export HF_ENDPOINT=https://hf-mirror.com + +cd /mnt/lm_data_afs/wangzining/charles/lab/llmc + + +model_name=thinking_model +method_name=fp8 +dataset_name=wikitext +# ============================== + +log_name=${model_name}_${method_name}_${dataset_name} +rm -rf ./save_for_vllm/${log_name}/ + +llmc=. +export PYTHONPATH=$llmc:$PYTHONPATH + + +config=${llmc}/configs/quantization/backend/vllm/fp8/thinking_model_fp8.yml + +nnodes=1 +nproc_per_node=4 + + +find_unused_port() { + while true; do + port=$(shuf -i 10000-60000 -n 1) + if ! ss -tuln | grep -q ":$port "; then + echo "$port" + return 0 + fi + done +} +UNUSED_PORT=$(find_unused_port) +MASTER_ADDR=127.0.0.1 +MASTER_PORT=$UNUSED_PORT +task_id=$UNUSED_PORT + +echo "开始执行任务,日志将保存在 ${log_name}.log" + +torchrun \ +--nnodes $nnodes \ +--nproc_per_node $nproc_per_node \ +--rdzv_id $task_id \ +--rdzv_backend c10d \ +--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ +${llmc}/llmc/__main__.py --config $config --task_id $task_id | tee ${log_name}.log diff --git a/run_gptq_w4.sh b/run_gptq_w4.sh new file mode 100755 index 00000000..1c208a70 --- /dev/null +++ b/run_gptq_w4.sh @@ -0,0 +1,50 @@ +#!/bin/bash +export PATH=/mnt/lm_data_afs/wangzining/charles/miniconda3/envs/llmc/bin:$PATH +export PYTHON=/mnt/lm_data_afs/wangzining/charles/miniconda3/envs/llmc/bin/python +export PIP=/mnt/lm_data_afs/wangzining/charles/miniconda3/envs/llmc/bin/pip +export HF_ENDPOINT=https://hf-mirror.com + +cd /mnt/lm_data_afs/wangzining/charles/lab/llmc + + +model_name=thinking_model +method_name=gptq +dataset_name=wikitext +# ============================== + +log_name=${model_name}_${method_name}_${dataset_name} +rm -rf ./save_for_vllm/${log_name}/ + +llmc=. +export PYTHONPATH=$llmc:$PYTHONPATH + + +config=/mnt/lm_data_afs/wangzining/charles/lab/llmc/configs/quantization/backend/vllm/thinkingmodel/gptq_w4a16.yml + +nnodes=1 +nproc_per_node=4 + + +find_unused_port() { + while true; do + port=$(shuf -i 10000-60000 -n 1) + if ! ss -tuln | grep -q ":$port "; then + echo "$port" + return 0 + fi + done +} +UNUSED_PORT=$(find_unused_port) +MASTER_ADDR=127.0.0.1 +MASTER_PORT=$UNUSED_PORT +task_id=$UNUSED_PORT + +echo "开始执行任务,日志将保存在 ${log_name}.log" + +torchrun \ +--nnodes $nnodes \ +--nproc_per_node $nproc_per_node \ +--rdzv_id $task_id \ +--rdzv_backend c10d \ +--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ +${llmc}/llmc/__main__.py --config $config --task_id $task_id | tee ${log_name}.log diff --git a/thinkingmodel b/thinkingmodel new file mode 120000 index 00000000..8da8c246 --- /dev/null +++ b/thinkingmodel @@ -0,0 +1 @@ +model \ No newline at end of file From 981dd994d92b037fbc0edf3040d251ee9dbbb6ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yang=20Yong=20=28=E9=9B=8D=E6=B4=8B=29?= Date: Wed, 13 May 2026 20:29:20 +0800 Subject: [PATCH 6/8] Delete thinkingmodel --- thinkingmodel | 1 - 1 file changed, 1 deletion(-) delete mode 120000 thinkingmodel diff --git a/thinkingmodel b/thinkingmodel deleted file mode 120000 index 8da8c246..00000000 --- a/thinkingmodel +++ /dev/null @@ -1 +0,0 @@ -model \ No newline at end of file From b2b07238f9f06045d55df53138e070e45e068d36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yang=20Yong=20=28=E9=9B=8D=E6=B4=8B=29?= Date: Wed, 13 May 2026 20:30:02 +0800 Subject: [PATCH 7/8] Update .gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 40e0026e..97b241fc 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,3 @@ save* *.pid *.ipynb* .venv/ -*.sh From 7d531e299a04382f276d055fbb6fb2e05ea0c14d Mon Sep 17 00:00:00 2001 From: Charles2530 <2569337619@qq.com> Date: Wed, 13 May 2026 21:23:07 +0800 Subject: [PATCH 8/8] style: satisfy pre-commit after main merge --- .../video_gen/wan_i2v/awq_w_a.yaml | 2 +- .../video_gen/wan_t2v/awq_w_a.yaml | 2 +- .../video_gen/wan_t2v/rtn_w_a.yaml | 2 +- .../video_gen/wan_t2v/smoothquant_w_a.yaml | 2 +- .../base_blockwise_quantization.py | 6 +-- llmc/eval/eval_base.py | 3 +- llmc/eval/eval_ppl.py | 3 +- llmc/models/base_model.py | 2 +- llmc/models/wan2_2_t2v.py | 47 +++++++++++++------ llmc/models/wan_t2v.py | 2 +- tools/download_calib_dataset.py | 3 +- tools/download_eval_dataset.py | 3 +- 12 files changed, 47 insertions(+), 30 deletions(-) diff --git a/configs/quantization/video_gen/wan_i2v/awq_w_a.yaml b/configs/quantization/video_gen/wan_i2v/awq_w_a.yaml index 262d6852..680fab43 100755 --- a/configs/quantization/video_gen/wan_i2v/awq_w_a.yaml +++ b/configs/quantization/video_gen/wan_i2v/awq_w_a.yaml @@ -46,4 +46,4 @@ quant: clip_sym: True save: save_lightx2v: True - save_path: /path/to/x2v/ \ No newline at end of file + save_path: /path/to/x2v/ diff --git a/configs/quantization/video_gen/wan_t2v/awq_w_a.yaml b/configs/quantization/video_gen/wan_t2v/awq_w_a.yaml index 59e35dd4..14d05479 100755 --- a/configs/quantization/video_gen/wan_t2v/awq_w_a.yaml +++ b/configs/quantization/video_gen/wan_t2v/awq_w_a.yaml @@ -46,4 +46,4 @@ quant: clip_sym: True save: save_lightx2v: True - save_path: /path/to/x2v/ \ No newline at end of file + save_path: /path/to/x2v/ diff --git a/configs/quantization/video_gen/wan_t2v/rtn_w_a.yaml b/configs/quantization/video_gen/wan_t2v/rtn_w_a.yaml index 844b6221..b6a53b0e 100644 --- a/configs/quantization/video_gen/wan_t2v/rtn_w_a.yaml +++ b/configs/quantization/video_gen/wan_t2v/rtn_w_a.yaml @@ -29,4 +29,4 @@ quant: granularity: per_token save: save_lightx2v: True - save_path: /path/to/x2v/ \ No newline at end of file + save_path: /path/to/x2v/ diff --git a/configs/quantization/video_gen/wan_t2v/smoothquant_w_a.yaml b/configs/quantization/video_gen/wan_t2v/smoothquant_w_a.yaml index 122d31f7..7d65f31f 100755 --- a/configs/quantization/video_gen/wan_t2v/smoothquant_w_a.yaml +++ b/configs/quantization/video_gen/wan_t2v/smoothquant_w_a.yaml @@ -42,4 +42,4 @@ quant: alpha: 0.7 save: save_lightx2v: True - save_path: /path/to/x2v/ \ No newline at end of file + save_path: /path/to/x2v/ diff --git a/llmc/compression/quantization/base_blockwise_quantization.py b/llmc/compression/quantization/base_blockwise_quantization.py index 2df4f8c9..a09fc5f8 100755 --- a/llmc/compression/quantization/base_blockwise_quantization.py +++ b/llmc/compression/quantization/base_blockwise_quantization.py @@ -35,11 +35,7 @@ _TRANSFORMERS_LN_TYPES_, EffcientFakeQuantLinear, FakeQuantLinear, LlmcActFn, OriginFloatLinear, RotateLinear) -from .quant import ( - FloatQuantizer, - IntegerQuantizer, - Weight48IntegerQuantizer, -) +from .quant import FloatQuantizer, IntegerQuantizer, Weight48IntegerQuantizer class BaseBlockwiseQuantization(BlockwiseOpt): diff --git a/llmc/eval/eval_base.py b/llmc/eval/eval_base.py index 60a60589..098c9bb8 100755 --- a/llmc/eval/eval_base.py +++ b/llmc/eval/eval_base.py @@ -5,10 +5,11 @@ import torch import torch.nn as nn -from datasets import load_dataset, load_from_disk from human_eval.data import read_problems from loguru import logger +from datasets import load_dataset, load_from_disk + class BaseEval: def __init__(self, model, config): diff --git a/llmc/eval/eval_ppl.py b/llmc/eval/eval_ppl.py index d598218c..bb41329f 100644 --- a/llmc/eval/eval_ppl.py +++ b/llmc/eval/eval_ppl.py @@ -3,10 +3,11 @@ import torch import torch.nn as nn -from datasets import load_dataset, load_from_disk from loguru import logger from tqdm import tqdm +from datasets import load_dataset, load_from_disk + from .eval_base import BaseEval diff --git a/llmc/models/base_model.py b/llmc/models/base_model.py index 25393a87..315a749b 100755 --- a/llmc/models/base_model.py +++ b/llmc/models/base_model.py @@ -129,7 +129,7 @@ def build_tokenizer(self): if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token else: - self.tokenizer = None + self.tokenizer = None def get_tokenizer(self): return self.tokenizer diff --git a/llmc/models/wan2_2_t2v.py b/llmc/models/wan2_2_t2v.py index c3db088d..a19ff0b0 100755 --- a/llmc/models/wan2_2_t2v.py +++ b/llmc/models/wan2_2_t2v.py @@ -1,5 +1,5 @@ -import gc import copy +import gc import inspect import os import shutil @@ -19,7 +19,8 @@ class WanOfficialPipelineAdapter: - """Adapter that exposes Wan-Video/Wan2.2 official t2v runtime as a Pipeline-like interface.""" + """Adapter that exposes Wan-Video/Wan2.2 official t2v runtime as a + Pipeline-like interface.""" def __init__( self, @@ -116,7 +117,8 @@ def __call__( @MODEL_REGISTRY class Wan2T2V(BaseModel): - """Wan2.2-T2V with MoE: two experts (high-noise + low-noise), same block structure as Wan2.1.""" + """Wan2.2-T2V with MoE: two experts (high-noise + low-noise), same block + structure as Wan2.1.""" def __init__(self, config, device_map=None, use_cache=False): super().__init__(config, device_map, use_cache) @@ -200,11 +202,13 @@ def _import_impl(): return _import_impl() except Exception as e2: logger.warning( - f'Failed to import official Wan2.2 from wan2_repo_path={repo_path}: {e2}' + 'Failed to import official Wan2.2 from ' + f'wan2_repo_path={repo_path}: {e2}' ) logger.warning( 'Failed to import official Wan2.2 runtime (wan package). ' - 'Diffusers fallback depends on model.allow_diffusers_fallback/model.force_diffusers. ' + 'Diffusers fallback depends on model.allow_diffusers_fallback/' + 'model.force_diffusers. ' f'import_error={e}' ) return None, None @@ -257,7 +261,8 @@ def _try_build_official_wan_pipeline(self): self.pipeline_source = 'wan_official' self.use_official_wan = True logger.info( - f'Loaded Wan2.2 via official Wan runtime from native checkpoint: {normalized_model_path}' + 'Loaded Wan2.2 via official Wan runtime from native checkpoint: ' + f'{normalized_model_path}' ) return True @@ -360,7 +365,10 @@ def build_model(self): new_block = LlmcWanTransformerBlock.new(block) self.Pipeline.transformer_2.blocks[block_idx] = new_block self.num_transformer_blocks = len(self.Pipeline.transformer.blocks) - self.blocks = list(self.Pipeline.transformer.blocks) + list(self.Pipeline.transformer_2.blocks) + self.blocks = ( + list(self.Pipeline.transformer.blocks) + + list(self.Pipeline.transformer_2.blocks) + ) logger.info( 'Wan2.2 MoE: both experts wrapped (high-noise + low-noise, 80 blocks total).' ) @@ -456,7 +464,10 @@ def forward(self, *args, **kwargs): first_block_input[self.expert_name]['kwargs'].append( {k: self._to_cpu(v) for k, v in capture_kwargs.items()} ) - if all(len(first_block_input[name]['data']) >= sample_steps for name in first_block_input): + if all( + len(first_block_input[name]['data']) >= sample_steps + for name in first_block_input + ): raise ValueError return self.module(*args, **kwargs) @@ -488,10 +499,13 @@ def forward(self, *args, **kwargs): self.Pipeline.transformer.blocks[0] = self.Pipeline.transformer.blocks[0].module if first_block_2 is not None: - self.Pipeline.transformer_2.blocks[0] = self.Pipeline.transformer_2.blocks[0].module + transformer_2 = self.Pipeline.transformer_2 + transformer_2.blocks[0] = transformer_2.blocks[0].module self.Pipeline.to('cpu') - assert len(first_block_input['transformer']['data']) > 0, 'Catch transformer input data failed.' + assert len(first_block_input['transformer']['data']) > 0, ( + 'Catch transformer input data failed.' + ) if hasattr(self.Pipeline, 'transformer_2') and self.Pipeline.transformer_2 is not None: assert len(first_block_input['transformer_2']['data']) > 0, \ 'Catch transformer_2 input data failed.' @@ -623,7 +637,8 @@ def get_layers_except_blocks(self): @staticmethod def copy_native_checkpoint(src, dst): - """Copy full Wan2.2 native checkpoint tree before overwriting expert safetensors.""" + """Copy full Wan2.2 native checkpoint tree before overwriting expert + safetensors.""" if not isinstance(src, str) or not os.path.isdir(src): raise RuntimeError( 'Wan2.2 official save expects a local native checkpoint directory, ' @@ -641,7 +656,8 @@ def copy_native_checkpoint(src, dst): @staticmethod def validate_native_save_structure(save_path, source_path=None): - """Verify saved directory has Wan2.2 native layout (experts + copied non-expert assets).""" + """Verify saved directory has Wan2.2 native layout (experts + copied + non-expert assets).""" if not os.path.isdir(save_path): raise RuntimeError(f'Wan2.2 saved path is not a directory: {save_path}') @@ -705,11 +721,12 @@ def save_wan2_2_pretrained(self, path): self.validate_native_save_structure(path, source_path=src) return - # Copy the full original pipeline (VAE, text encoder, tokenizer, scheduler, etc.) - # so that non-quantized components are preserved. + # Copy the full original pipeline (VAE, text encoder, tokenizer, + # scheduler, etc.) so that non-quantized components are preserved. src = getattr(self, 'pipeline_model_path', self.model_path) copied_from_source = False - if isinstance(src, str) and os.path.isdir(src) and os.path.abspath(src) != os.path.abspath(path): + same_path = os.path.abspath(src) == os.path.abspath(path) + if isinstance(src, str) and os.path.isdir(src) and not same_path: if os.path.exists(path): shutil.rmtree(path) shutil.copytree(src, path) diff --git a/llmc/models/wan_t2v.py b/llmc/models/wan_t2v.py index 59696686..885bccda 100755 --- a/llmc/models/wan_t2v.py +++ b/llmc/models/wan_t2v.py @@ -162,4 +162,4 @@ def get_layers_except_blocks(self): pass def skip_layer_name(self): - pass \ No newline at end of file + pass diff --git a/tools/download_calib_dataset.py b/tools/download_calib_dataset.py index 37ce76ba..31fe1477 100644 --- a/tools/download_calib_dataset.py +++ b/tools/download_calib_dataset.py @@ -6,9 +6,10 @@ import argparse import os -from datasets import load_dataset from loguru import logger +from datasets import load_dataset + def download(calib_dataset_name, path): if 'pileval' in calib_dataset_name: diff --git a/tools/download_eval_dataset.py b/tools/download_eval_dataset.py index 7eddd8bd..12f1f2a6 100644 --- a/tools/download_eval_dataset.py +++ b/tools/download_eval_dataset.py @@ -6,9 +6,10 @@ import argparse import os -from datasets import load_dataset from loguru import logger +from datasets import load_dataset + def download(calib_dataset_name, path): if 'c4' in calib_dataset_name: