From abd9648b68d6c4ed6c84e558e7d5599c563c0681 Mon Sep 17 00:00:00 2001 From: abhishek-singh591 Date: Fri, 5 Dec 2025 07:41:40 +0000 Subject: [PATCH 1/4] Added support of subfunction to Qwen2.5VL Signed-off-by: abhishek-singh591 --- QEfficient/base/modeling_qeff.py | 9 +- .../transformers/models/modeling_auto.py | 1 - .../transformers/models/pytorch_transforms.py | 21 ++- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 11 +- QEfficient/utils/torch_patches.py | 11 +- test.py | 169 ++++++++++++++++++ 6 files changed, 201 insertions(+), 21 deletions(-) create mode 100644 test.py diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index ef7e83adf..2fc77a458 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -260,8 +260,13 @@ def _export( "The subfunction feature is experimental. Please note that using compile consecutively with and without subfunction may produce inconsistent results." ) apply_torch_patches() - InvalidIndexProvider.SUBFUNC_ENABLED = True - output_names = [re.sub("_RetainedState", "_InternalRetainedState", s) for s in output_names] + InvalidIndexProvider.SUBFUNC_ENABLED = True + output_names = [ + re.sub("_RetainedState", "_InternalRetainedState", name) + if name.endswith("_RetainedState") and ("key" in name or "value" in name) + else name + for name in output_names + ] export_kwargs["export_modules_as_functions"] = get_decoder_layer_classes_for_export(self.model) self._onnx_transforms.append(RenameFunctionOutputsTransform) self._onnx_transforms.append(CustomOpTransform) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index f3618cb1e..71e0caa8f 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1067,7 +1067,6 @@ def export( kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode ) output_names = self.model.get_output_names(kv_offload=True) - 
self.vision_model.export( inputs["vision"], output_names["vision"], diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 21a867eb5..1e4358579 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -832,23 +832,28 @@ def get_decoder_layer_classes_for_export(model: nn.Module) -> set: Dynamically determine which DecoderLayer classes should be exported as functions based on the model's architecture using the existing KVCacheTransform mapping. """ - # Define patterns that identify decoder layer classes - DECODER_LAYER_PATTERNS = ["DecoderLayer", "Block", "Layer"] - # Get all QEff classes that are decoder layers from the existing mapping + DECODER_LAYER_PATTERNS = ["DecoderLayer", "Block", "Layer"] decoder_layer_classes = set() for original_class, qeff_class in KVCacheTransform._module_mapping.items(): - # Check if the QEff class name contains decoder layer patterns qeff_class_name = qeff_class.__name__ if any(pattern in qeff_class_name for pattern in DECODER_LAYER_PATTERNS): decoder_layer_classes.add(qeff_class) - # Filter to only include classes that are actually used in the current model model_decoder_classes = set() - for module in model.modules(): - if module.__class__ in decoder_layer_classes: - model_decoder_classes.add(module.__class__) + model_class_name = model.__class__.__name__ + if "EncoderWrapper" in model_class_name: + model_decoder_classes.update( + module.__class__ for module in model.modules() + if "Qwen2_5_VLVisionBlock" in module.__class__.__name__ + ) + return model_decoder_classes + + model_decoder_classes.update( + module.__class__ for module in model.modules() + if module.__class__ in decoder_layer_classes + ) return model_decoder_classes diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 
33a434db1..b88fd4925 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -73,20 +73,15 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, mrope_section, unsqu Returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. """ - - mrope_section = mrope_section * 2 cos = cos[position_ids] sin = sin[position_ids] - - cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim) - sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim) - + cos = torch.cat([cos[0,..., 0:32],cos[0,..., 32:80], cos[0,..., 80:128]], dim=-1).unsqueeze(0) + sin = torch.cat([sin[0,..., 0:32],sin[0,..., 32:80], sin[0,..., 80:128]], dim=-1).unsqueeze(0) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) - + return q_embed.to(q.dtype), k_embed.to(k.dtype) - class QEffQwen2_5_VLVisionAttention(Qwen2_5_VLVisionAttention): def __init__(self, dim: int, num_heads: int = 16) -> None: super().__init__() diff --git a/QEfficient/utils/torch_patches.py b/QEfficient/utils/torch_patches.py index 0b9b37afa..e5c8aa675 100644 --- a/QEfficient/utils/torch_patches.py +++ b/QEfficient/utils/torch_patches.py @@ -10,6 +10,7 @@ import torch import torch.onnx.utils as onnx_utils from torch import _C +import warnings # Store original references before patching _original_setup_trace_module_map = onnx_utils._setup_trace_module_map @@ -37,9 +38,15 @@ def _track_module_attributes_forward_hook(module, input, output): if hasattr(module, attr_name): onnx_attrs = getattr(module, attr_name) delattr(module, attr_name) + # FIX: use empty dict to avoid type mismatch - onnx_attrs = {} - _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs) + # onnx_attrs = {} + try: + 
_C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs) + except Exception as e: + warnings.warn( + f"Failed to track ONNX scope attributes: {e}. Skipping this step." + ) for m in model.modules(): m.register_forward_hook(_track_module_attributes_forward_hook) diff --git a/test.py b/test.py new file mode 100644 index 000000000..46d8f7ff8 --- /dev/null +++ b/test.py @@ -0,0 +1,169 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +''' +For running qwen 2.5 32B VL model with subfunction using Qefficient one need to first +export encoder and decoder separately and then use them in the pipeline. +for this please refer to Qefficient.transformers.models.modeling_auto.py + 1. for exporting only encoder comment from line 1028-1035 and run this script also in this script skip_vision=False and skip_lang=True. + 2. for exporting only decoder comment from line 1017-1023 and uncomment the above and then run this script skip_vision=True and skip_lang=False. 
+''' + +# If we want to enable QBlocking Run below command:, default is without blocking +# ATTENTION_BLOCKING_MODE=q num_q_blocks=2 python -W ignore qwen2_5_vl_example.py + +import requests +import transformers +from PIL import Image +from qwen_vl_utils import process_vision_info +from transformers import AutoConfig, AutoProcessor, TextStreamer + +from QEfficient import QEFFAutoModelForImageTextToText + +## For AWQ model update pytorch version to 2.8.* +model_id = "Qwen/Qwen2.5-VL-3B-Instruct" +config = AutoConfig.from_pretrained(model_id) +config.text_config.num_hidden_layers = 2 + +qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_id, attn_implementation="eager", kv_offload=True, config=config +) +tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +### use skip_vision=Ture, if want to run only text, ow false ### +skip_vision = True +skip_lang = False +if skip_vision: + ## Only Text ## + + ## Set Batch_Size ## + batch_size = 1 + qeff_model.compile( + batch_size=batch_size, + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=4, + height=354, + width=536, + mxfp6_matmul=False, + aic_enable_depth_first=True, + skip_vision=True, + mos=1, + use_onnx_subfunctions=True, + ) + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Tell me about yourself."}, + ], + }, + ] + + messages = [messages] * batch_size + + inputs = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + + inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) + + streamer = TextStreamer(tokenizer) + output = qeff_model.generate(inputs=inputs, generation_len=100) + print(output.generated_ids) + print(tokenizer.batch_decode(output.generated_ids)) + print(output) + +else: + batch_size = 1 + ## Vision + Text ## + qeff_model.compile( 
+ batch_size=batch_size, + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=4, + height=354, + width=536, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + skip_lang=True, + mos=1, + use_onnx_subfunctions=True, + ) + + ### IMAGE + TEXT ### + image_url = "https://picsum.photos/id/237/536/354" + + image = Image.open(requests.get(image_url, stream=True).raw) + + messages_1 = [ + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": "Describe this image."}, + ], + }, + ] + + messages_2 = [ + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": "Describe about the color of the dog."}, + ], + }, + ] + + messages = [messages_2] * batch_size + + texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages] + + image_inputs, video_inputs = process_vision_info(messages) + inputs = processor( + text=texts, + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pt", + ) + + inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) + + streamer = TextStreamer(tokenizer) + output = qeff_model.generate(inputs=inputs, generation_len=100) + print(output.generated_ids) + print(tokenizer.batch_decode(output.generated_ids)) + print(output) + + + +# import os +# from QEfficient import QEFFAutoModelForCausalLM +# from transformers import AutoTokenizer, AutoModelForCausalLM + +# os.environ["QEFF_USE_ONNX_FUNCTIONS"] = "True" +# os.environ["QAIC_COMPILER_OPTS_UNSUPPORTED"] = "-loader-inline-all=0" + + +# model = QEFFAutoModelForCausalLM.from_pretrained("gpt2", num_hidden_layers=2) +# model.compile(num_devices=2) +# tokenizer = AutoTokenizer.from_pretrained("gpt2") +# model.generate(prompts=["Hi there!!"], tokenizer=tokenizer) + + +# export QAIC_COMPILER_OPTS_UNSUPPORTED="-loader-inline-all=0" \ No newline at end of file From 
8f78722899ebdeaa41fe491805f9dd84f4ccf1d1 Mon Sep 17 00:00:00 2001 From: abhishek-singh591 Date: Fri, 5 Dec 2025 07:42:16 +0000 Subject: [PATCH 2/4] Added support of subfunction to Qwen2.5VL Signed-off-by: abhishek-singh591 --- test.py | 169 -------------------------------------------------------- 1 file changed, 169 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 46d8f7ff8..000000000 --- a/test.py +++ /dev/null @@ -1,169 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -''' -For running qwen 2.5 32B VL model with subfunction using Qefficient one need to first -export encoder and decoder separately and then use them in the pipeline. -for this please refer to Qefficient.transformers.models.modeling_auto.py - 1. for exporting only encoder comment from line 1028-1035 and run this script also in this script skip_vision=False and skip_lang=True. - 2. for exporting only decoder comment from line 1017-1023 and uncomment the above and then run this script skip_vision=True and skip_lang=False. 
-''' - -# If we want to enable QBlocking Run below command:, default is without blocking -# ATTENTION_BLOCKING_MODE=q num_q_blocks=2 python -W ignore qwen2_5_vl_example.py - -import requests -import transformers -from PIL import Image -from qwen_vl_utils import process_vision_info -from transformers import AutoConfig, AutoProcessor, TextStreamer - -from QEfficient import QEFFAutoModelForImageTextToText - -## For AWQ model update pytorch version to 2.8.* -model_id = "Qwen/Qwen2.5-VL-3B-Instruct" -config = AutoConfig.from_pretrained(model_id) -config.text_config.num_hidden_layers = 2 - -qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( - model_id, attn_implementation="eager", kv_offload=True, config=config -) -tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) -processor = AutoProcessor.from_pretrained(model_id) - -### use skip_vision=Ture, if want to run only text, ow false ### -skip_vision = True -skip_lang = False -if skip_vision: - ## Only Text ## - - ## Set Batch_Size ## - batch_size = 1 - qeff_model.compile( - batch_size=batch_size, - prefill_seq_len=128, - ctx_len=4096, - num_cores=16, - num_devices=4, - height=354, - width=536, - mxfp6_matmul=False, - aic_enable_depth_first=True, - skip_vision=True, - mos=1, - use_onnx_subfunctions=True, - ) - - messages = [ - { - "role": "user", - "content": [ - {"type": "text", "text": "Tell me about yourself."}, - ], - }, - ] - - messages = [messages] * batch_size - - inputs = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt", - ) - - inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) - - streamer = TextStreamer(tokenizer) - output = qeff_model.generate(inputs=inputs, generation_len=100) - print(output.generated_ids) - print(tokenizer.batch_decode(output.generated_ids)) - print(output) - -else: - batch_size = 1 - ## Vision + Text ## - qeff_model.compile( 
- batch_size=batch_size, - prefill_seq_len=128, - ctx_len=4096, - num_cores=16, - num_devices=4, - height=354, - width=536, - mxfp6_matmul=True, - mxint8_kv_cache=True, - aic_enable_depth_first=True, - skip_lang=True, - mos=1, - use_onnx_subfunctions=True, - ) - - ### IMAGE + TEXT ### - image_url = "https://picsum.photos/id/237/536/354" - - image = Image.open(requests.get(image_url, stream=True).raw) - - messages_1 = [ - { - "role": "user", - "content": [ - {"type": "image", "image": image}, - {"type": "text", "text": "Describe this image."}, - ], - }, - ] - - messages_2 = [ - { - "role": "user", - "content": [ - {"type": "image", "image": image}, - {"type": "text", "text": "Describe about the color of the dog."}, - ], - }, - ] - - messages = [messages_2] * batch_size - - texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages] - - image_inputs, video_inputs = process_vision_info(messages) - inputs = processor( - text=texts, - images=image_inputs, - videos=video_inputs, - padding=True, - return_tensors="pt", - ) - - inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) - - streamer = TextStreamer(tokenizer) - output = qeff_model.generate(inputs=inputs, generation_len=100) - print(output.generated_ids) - print(tokenizer.batch_decode(output.generated_ids)) - print(output) - - - -# import os -# from QEfficient import QEFFAutoModelForCausalLM -# from transformers import AutoTokenizer, AutoModelForCausalLM - -# os.environ["QEFF_USE_ONNX_FUNCTIONS"] = "True" -# os.environ["QAIC_COMPILER_OPTS_UNSUPPORTED"] = "-loader-inline-all=0" - - -# model = QEFFAutoModelForCausalLM.from_pretrained("gpt2", num_hidden_layers=2) -# model.compile(num_devices=2) -# tokenizer = AutoTokenizer.from_pretrained("gpt2") -# model.generate(prompts=["Hi there!!"], tokenizer=tokenizer) - - -# export QAIC_COMPILER_OPTS_UNSUPPORTED="-loader-inline-all=0" \ No newline at end of file From 
2871558b1eff816c21cbd4b1be16a898a2966eb7 Mon Sep 17 00:00:00 2001 From: abhishek-singh591 Date: Fri, 5 Dec 2025 07:54:14 +0000 Subject: [PATCH 3/4] Resolved lint and format error Signed-off-by: abhishek-singh591 --- QEfficient/base/modeling_qeff.py | 2 +- QEfficient/transformers/models/modeling_auto.py | 1 + QEfficient/transformers/models/pytorch_transforms.py | 6 ++---- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 7 ++++--- QEfficient/utils/torch_patches.py | 9 ++++----- .../causallm/example_pytorch_transforms.py | 12 ++++++------ 6 files changed, 18 insertions(+), 19 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 2fc77a458..0378a800f 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -260,7 +260,7 @@ def _export( "The subfunction feature is experimental. Please note that using compile consecutively with and without subfunction may produce inconsistent results." ) apply_torch_patches() - InvalidIndexProvider.SUBFUNC_ENABLED = True + InvalidIndexProvider.SUBFUNC_ENABLED = True output_names = [ re.sub("_RetainedState", "_InternalRetainedState", name) if name.endswith("_RetainedState") and ("key" in name or "value" in name) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 71e0caa8f..f3618cb1e 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1067,6 +1067,7 @@ def export( kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode ) output_names = self.model.get_output_names(kv_offload=True) + self.vision_model.export( inputs["vision"], output_names["vision"], diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 1e4358579..e64634b62 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -845,14 
+845,12 @@ def get_decoder_layer_classes_for_export(model: nn.Module) -> set: model_class_name = model.__class__.__name__ if "EncoderWrapper" in model_class_name: model_decoder_classes.update( - module.__class__ for module in model.modules() - if "Qwen2_5_VLVisionBlock" in module.__class__.__name__ + module.__class__ for module in model.modules() if "Qwen2_5_VLVisionBlock" in module.__class__.__name__ ) return model_decoder_classes model_decoder_classes.update( - module.__class__ for module in model.modules() - if module.__class__ in decoder_layer_classes + module.__class__ for module in model.modules() if module.__class__ in decoder_layer_classes ) return model_decoder_classes diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index b88fd4925..018ce0851 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -75,13 +75,14 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, mrope_section, unsqu """ cos = cos[position_ids] sin = sin[position_ids] - cos = torch.cat([cos[0,..., 0:32],cos[0,..., 32:80], cos[0,..., 80:128]], dim=-1).unsqueeze(0) - sin = torch.cat([sin[0,..., 0:32],sin[0,..., 32:80], sin[0,..., 80:128]], dim=-1).unsqueeze(0) + cos = torch.cat([cos[0, ..., 0:32], cos[0, ..., 32:80], cos[0, ..., 80:128]], dim=-1).unsqueeze(0) + sin = torch.cat([sin[0, ..., 0:32], sin[0, ..., 32:80], sin[0, ..., 80:128]], dim=-1).unsqueeze(0) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) - + return q_embed.to(q.dtype), k_embed.to(k.dtype) + class QEffQwen2_5_VLVisionAttention(Qwen2_5_VLVisionAttention): def __init__(self, dim: int, num_heads: int = 16) -> None: super().__init__() diff --git a/QEfficient/utils/torch_patches.py b/QEfficient/utils/torch_patches.py index e5c8aa675..241b32fbf 100644 --- a/QEfficient/utils/torch_patches.py 
+++ b/QEfficient/utils/torch_patches.py @@ -7,10 +7,11 @@ """Monkey patches for torch.onnx.utils to fix ONNX export issues.""" +import warnings + import torch import torch.onnx.utils as onnx_utils from torch import _C -import warnings # Store original references before patching _original_setup_trace_module_map = onnx_utils._setup_trace_module_map @@ -38,15 +39,13 @@ def _track_module_attributes_forward_hook(module, input, output): if hasattr(module, attr_name): onnx_attrs = getattr(module, attr_name) delattr(module, attr_name) - + # FIX: use empty dict to avoid type mismatch # onnx_attrs = {} try: _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs) except Exception as e: - warnings.warn( - f"Failed to track ONNX scope attributes: {e}. Skipping this step." - ) + warnings.warn(f"Failed to track ONNX scope attributes: {e}. Skipping this step.") for m in model.modules(): m.register_forward_hook(_track_module_attributes_forward_hook) diff --git a/examples/onboarding_guide/causallm/example_pytorch_transforms.py b/examples/onboarding_guide/causallm/example_pytorch_transforms.py index ff62588f9..503efc12d 100644 --- a/examples/onboarding_guide/causallm/example_pytorch_transforms.py +++ b/examples/onboarding_guide/causallm/example_pytorch_transforms.py @@ -27,12 +27,6 @@ from types import MethodType from typing import Callable, Optional, Tuple, Union -from QEfficient.transformers.models.blueprint.modeling_blueprint import ( - QEffBlueprintAttention, - QEffBlueprintDecoderLayer, - QEffBlueprintForCausalLM, - QEffBlueprintModel, -) from torch import nn # Example imports for three representative models @@ -62,6 +56,12 @@ from QEfficient.base.pytorch_transforms import ExternalModuleMapperTransform, ModuleMappingTransform from QEfficient.customop import CustomRMSNormAIC from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, validate_user_pooling_function +from QEfficient.transformers.models.blueprint.modeling_blueprint import ( + 
QEffBlueprintAttention, + QEffBlueprintDecoderLayer, + QEffBlueprintForCausalLM, + QEffBlueprintModel, +) from QEfficient.transformers.models.llama.modeling_llama import ( QEffLlamaAttention, QEffLlamaDecoderLayer, From 7e1327cea70f480591371cf9a565315dd9026890 Mon Sep 17 00:00:00 2001 From: abhishek-singh591 Date: Fri, 5 Dec 2025 07:58:54 +0000 Subject: [PATCH 4/4] Made minor fixes Signed-off-by: abhishek-singh591 --- .../causallm/example_pytorch_transforms.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/onboarding_guide/causallm/example_pytorch_transforms.py b/examples/onboarding_guide/causallm/example_pytorch_transforms.py index 503efc12d..ff62588f9 100644 --- a/examples/onboarding_guide/causallm/example_pytorch_transforms.py +++ b/examples/onboarding_guide/causallm/example_pytorch_transforms.py @@ -27,6 +27,12 @@ from types import MethodType from typing import Callable, Optional, Tuple, Union +from QEfficient.transformers.models.blueprint.modeling_blueprint import ( + QEffBlueprintAttention, + QEffBlueprintDecoderLayer, + QEffBlueprintForCausalLM, + QEffBlueprintModel, +) from torch import nn # Example imports for three representative models @@ -56,12 +62,6 @@ from QEfficient.base.pytorch_transforms import ExternalModuleMapperTransform, ModuleMappingTransform from QEfficient.customop import CustomRMSNormAIC from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, validate_user_pooling_function -from QEfficient.transformers.models.blueprint.modeling_blueprint import ( - QEffBlueprintAttention, - QEffBlueprintDecoderLayer, - QEffBlueprintForCausalLM, - QEffBlueprintModel, -) from QEfficient.transformers.models.llama.modeling_llama import ( QEffLlamaAttention, QEffLlamaDecoderLayer,