From abd9648b68d6c4ed6c84e558e7d5599c563c0681 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Fri, 5 Dec 2025 07:41:40 +0000
Subject: [PATCH 1/4] Added support of subfunction to Qwen2.5VL

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 QEfficient/base/modeling_qeff.py              |   9 +-
 .../transformers/models/modeling_auto.py      |   1 -
 .../transformers/models/pytorch_transforms.py |  21 ++-
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py  |  11 +-
 QEfficient/utils/torch_patches.py             |  11 +-
 test.py                                       | 169 ++++++++++++++++++
 6 files changed, 201 insertions(+), 21 deletions(-)
 create mode 100644 test.py

diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index ef7e83adf..2fc77a458 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -260,8 +260,13 @@ def _export(
                     "The subfunction feature is experimental. Please note that using compile consecutively with and without subfunction may produce inconsistent results."
                 )
                 apply_torch_patches()
-                InvalidIndexProvider.SUBFUNC_ENABLED = True
-                output_names = [re.sub("_RetainedState", "_InternalRetainedState", s) for s in output_names]
+                InvalidIndexProvider.SUBFUNC_ENABLED = True       
+                output_names = [
+                    re.sub("_RetainedState", "_InternalRetainedState", name)
+                    if name.endswith("_RetainedState") and ("key" in name or "value" in name)
+                    else name
+                    for name in output_names
+                ]
                 export_kwargs["export_modules_as_functions"] = get_decoder_layer_classes_for_export(self.model)
                 self._onnx_transforms.append(RenameFunctionOutputsTransform)
                 self._onnx_transforms.append(CustomOpTransform)
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index f3618cb1e..71e0caa8f 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1067,7 +1067,6 @@ def export(
                 kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode
             )
         output_names = self.model.get_output_names(kv_offload=True)
-
         self.vision_model.export(
             inputs["vision"],
             output_names["vision"],
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 21a867eb5..1e4358579 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -832,23 +832,28 @@ def get_decoder_layer_classes_for_export(model: nn.Module) -> set:
     Dynamically determine which DecoderLayer classes should be exported as functions
     based on the model's architecture using the existing KVCacheTransform mapping.
     """
-    # Define patterns that identify decoder layer classes
-    DECODER_LAYER_PATTERNS = ["DecoderLayer", "Block", "Layer"]
 
-    # Get all QEff classes that are decoder layers from the existing mapping
+    DECODER_LAYER_PATTERNS = ["DecoderLayer", "Block", "Layer"]
     decoder_layer_classes = set()
 
     for original_class, qeff_class in KVCacheTransform._module_mapping.items():
-        # Check if the QEff class name contains decoder layer patterns
         qeff_class_name = qeff_class.__name__
         if any(pattern in qeff_class_name for pattern in DECODER_LAYER_PATTERNS):
             decoder_layer_classes.add(qeff_class)
 
-    # Filter to only include classes that are actually used in the current model
     model_decoder_classes = set()
-    for module in model.modules():
-        if module.__class__ in decoder_layer_classes:
-            model_decoder_classes.add(module.__class__)
+    model_class_name = model.__class__.__name__
+    if "EncoderWrapper" in model_class_name:
+        model_decoder_classes.update(
+            module.__class__ for module in model.modules()
+            if "Qwen2_5_VLVisionBlock" in module.__class__.__name__
+        )
+        return model_decoder_classes
+
+    model_decoder_classes.update(
+        module.__class__ for module in model.modules()
+        if module.__class__ in decoder_layer_classes
+    )
 
     return model_decoder_classes
 
diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 33a434db1..b88fd4925 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -73,20 +73,15 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, mrope_section, unsqu
     Returns:
         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
     """
-
-    mrope_section = mrope_section * 2
     cos = cos[position_ids]
     sin = sin[position_ids]
-
-    cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim)
-    sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim)
-
+    cos = torch.cat([cos[0,..., 0:32],cos[0,..., 32:80], cos[0,..., 80:128]], dim=-1).unsqueeze(0)
+    sin = torch.cat([sin[0,..., 0:32],sin[0,..., 32:80], sin[0,..., 80:128]], dim=-1).unsqueeze(0)    
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
-
+    
     return q_embed.to(q.dtype), k_embed.to(k.dtype)
 
-
 class QEffQwen2_5_VLVisionAttention(Qwen2_5_VLVisionAttention):
     def __init__(self, dim: int, num_heads: int = 16) -> None:
         super().__init__()
diff --git a/QEfficient/utils/torch_patches.py b/QEfficient/utils/torch_patches.py
index 0b9b37afa..e5c8aa675 100644
--- a/QEfficient/utils/torch_patches.py
+++ b/QEfficient/utils/torch_patches.py
@@ -10,6 +10,7 @@
 import torch
 import torch.onnx.utils as onnx_utils
 from torch import _C
+import warnings
 
 # Store original references before patching
 _original_setup_trace_module_map = onnx_utils._setup_trace_module_map
@@ -37,9 +38,15 @@ def _track_module_attributes_forward_hook(module, input, output):
             if hasattr(module, attr_name):
                 onnx_attrs = getattr(module, attr_name)
                 delattr(module, attr_name)
+            
             # FIX: use empty dict to avoid type mismatch
-            onnx_attrs = {}
-            _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs)
+            # onnx_attrs = {}
+            try:
+                _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs)
+            except Exception as e:
+                warnings.warn(
+                    f"Failed to track ONNX scope attributes: {e}. Skipping this step."
+                )
 
         for m in model.modules():
             m.register_forward_hook(_track_module_attributes_forward_hook)
diff --git a/test.py b/test.py
new file mode 100644
index 000000000..46d8f7ff8
--- /dev/null
+++ b/test.py
@@ -0,0 +1,169 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+'''
+For running qwen 2.5 32B VL model with subfunction using Qefficient one need to first
+export encoder and decoder separately and then use them in the pipeline.
+for this please refer to Qefficient.transformers.models.modeling_auto.py
+  1. for exporting only encoder comment from line 1028-1035 and run this script also in this script skip_vision=False and skip_lang=True.
+  2. for exporting only decoder comment from line 1017-1023 and uncomment the above and then run this script skip_vision=True and skip_lang=False.
+'''
+
+# If we want to enable QBlocking Run below command:, default is without blocking
+# ATTENTION_BLOCKING_MODE=q num_q_blocks=2 python -W ignore qwen2_5_vl_example.py
+
+import requests
+import transformers
+from PIL import Image
+from qwen_vl_utils import process_vision_info
+from transformers import AutoConfig, AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+## For AWQ model update pytorch version to 2.8.*
+model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
+config = AutoConfig.from_pretrained(model_id)
+config.text_config.num_hidden_layers = 2
+
+qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+    model_id, attn_implementation="eager", kv_offload=True, config=config
+)
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained(model_id)
+
+### use skip_vision=Ture, if want to run only text, ow false ###
+skip_vision = True
+skip_lang = False
+if skip_vision:
+    ## Only Text ##
+
+    ## Set Batch_Size ##
+    batch_size = 1
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=4096,
+        num_cores=16,
+        num_devices=4,
+        height=354,
+        width=536,
+        mxfp6_matmul=False,
+        aic_enable_depth_first=True,
+        skip_vision=True,
+        mos=1,
+        use_onnx_subfunctions=True,
+    )
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Tell me about yourself."},
+            ],
+        },
+    ]
+
+    messages = [messages] * batch_size
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=100)
+    print(output.generated_ids)
+    print(tokenizer.batch_decode(output.generated_ids))
+    print(output)
+
+else:
+    batch_size = 1
+    ## Vision + Text ##
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=4096,
+        num_cores=16,
+        num_devices=4,
+        height=354,
+        width=536,
+        mxfp6_matmul=True,
+        mxint8_kv_cache=True,
+        aic_enable_depth_first=True,
+        skip_lang=True,
+        mos=1,
+        use_onnx_subfunctions=True,
+    )
+
+    ### IMAGE + TEXT ###
+    image_url = "https://picsum.photos/id/237/536/354"
+
+    image = Image.open(requests.get(image_url, stream=True).raw)
+
+    messages_1 = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Describe this image."},
+            ],
+        },
+    ]
+
+    messages_2 = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Describe about the color of the dog."},
+            ],
+        },
+    ]
+
+    messages = [messages_2] * batch_size
+
+    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
+
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=texts,
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=100)
+    print(output.generated_ids)
+    print(tokenizer.batch_decode(output.generated_ids))
+    print(output)
+
+
+
+# import os
+# from QEfficient import QEFFAutoModelForCausalLM
+# from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# os.environ["QEFF_USE_ONNX_FUNCTIONS"] = "True"
+# os.environ["QAIC_COMPILER_OPTS_UNSUPPORTED"] = "-loader-inline-all=0"
+
+
+# model = QEFFAutoModelForCausalLM.from_pretrained("gpt2", num_hidden_layers=2)
+# model.compile(num_devices=2)
+# tokenizer = AutoTokenizer.from_pretrained("gpt2")
+# model.generate(prompts=["Hi there!!"], tokenizer=tokenizer)
+
+
+# export QAIC_COMPILER_OPTS_UNSUPPORTED="-loader-inline-all=0"
\ No newline at end of file

From 8f78722899ebdeaa41fe491805f9dd84f4ccf1d1 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Fri, 5 Dec 2025 07:42:16 +0000
Subject: [PATCH 2/4] Removed temporary test.py script

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 test.py | 169 --------------------------------------------------------
 1 file changed, 169 deletions(-)
 delete mode 100644 test.py

diff --git a/test.py b/test.py
deleted file mode 100644
index 46d8f7ff8..000000000
--- a/test.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# -----------------------------------------------------------------------------
-#
-# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# -----------------------------------------------------------------------------
-
-'''
-For running qwen 2.5 32B VL model with subfunction using Qefficient one need to first
-export encoder and decoder separately and then use them in the pipeline.
-for this please refer to Qefficient.transformers.models.modeling_auto.py
-  1. for exporting only encoder comment from line 1028-1035 and run this script also in this script skip_vision=False and skip_lang=True.
-  2. for exporting only decoder comment from line 1017-1023 and uncomment the above and then run this script skip_vision=True and skip_lang=False.
-'''
-
-# If we want to enable QBlocking Run below command:, default is without blocking
-# ATTENTION_BLOCKING_MODE=q num_q_blocks=2 python -W ignore qwen2_5_vl_example.py
-
-import requests
-import transformers
-from PIL import Image
-from qwen_vl_utils import process_vision_info
-from transformers import AutoConfig, AutoProcessor, TextStreamer
-
-from QEfficient import QEFFAutoModelForImageTextToText
-
-## For AWQ model update pytorch version to 2.8.*
-model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
-config = AutoConfig.from_pretrained(model_id)
-config.text_config.num_hidden_layers = 2
-
-qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
-    model_id, attn_implementation="eager", kv_offload=True, config=config
-)
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
-processor = AutoProcessor.from_pretrained(model_id)
-
-### use skip_vision=Ture, if want to run only text, ow false ###
-skip_vision = True
-skip_lang = False
-if skip_vision:
-    ## Only Text ##
-
-    ## Set Batch_Size ##
-    batch_size = 1
-    qeff_model.compile(
-        batch_size=batch_size,
-        prefill_seq_len=128,
-        ctx_len=4096,
-        num_cores=16,
-        num_devices=4,
-        height=354,
-        width=536,
-        mxfp6_matmul=False,
-        aic_enable_depth_first=True,
-        skip_vision=True,
-        mos=1,
-        use_onnx_subfunctions=True,
-    )
-
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "Tell me about yourself."},
-            ],
-        },
-    ]
-
-    messages = [messages] * batch_size
-
-    inputs = processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt",
-    )
-
-    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
-
-    streamer = TextStreamer(tokenizer)
-    output = qeff_model.generate(inputs=inputs, generation_len=100)
-    print(output.generated_ids)
-    print(tokenizer.batch_decode(output.generated_ids))
-    print(output)
-
-else:
-    batch_size = 1
-    ## Vision + Text ##
-    qeff_model.compile(
-        batch_size=batch_size,
-        prefill_seq_len=128,
-        ctx_len=4096,
-        num_cores=16,
-        num_devices=4,
-        height=354,
-        width=536,
-        mxfp6_matmul=True,
-        mxint8_kv_cache=True,
-        aic_enable_depth_first=True,
-        skip_lang=True,
-        mos=1,
-        use_onnx_subfunctions=True,
-    )
-
-    ### IMAGE + TEXT ###
-    image_url = "https://picsum.photos/id/237/536/354"
-
-    image = Image.open(requests.get(image_url, stream=True).raw)
-
-    messages_1 = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": "Describe this image."},
-            ],
-        },
-    ]
-
-    messages_2 = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": "Describe about the color of the dog."},
-            ],
-        },
-    ]
-
-    messages = [messages_2] * batch_size
-
-    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
-
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=texts,
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    )
-
-    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
-
-    streamer = TextStreamer(tokenizer)
-    output = qeff_model.generate(inputs=inputs, generation_len=100)
-    print(output.generated_ids)
-    print(tokenizer.batch_decode(output.generated_ids))
-    print(output)
-
-
-
-# import os
-# from QEfficient import QEFFAutoModelForCausalLM
-# from transformers import AutoTokenizer, AutoModelForCausalLM
-
-# os.environ["QEFF_USE_ONNX_FUNCTIONS"] = "True"
-# os.environ["QAIC_COMPILER_OPTS_UNSUPPORTED"] = "-loader-inline-all=0"
-
-
-# model = QEFFAutoModelForCausalLM.from_pretrained("gpt2", num_hidden_layers=2)
-# model.compile(num_devices=2)
-# tokenizer = AutoTokenizer.from_pretrained("gpt2")
-# model.generate(prompts=["Hi there!!"], tokenizer=tokenizer)
-
-
-# export QAIC_COMPILER_OPTS_UNSUPPORTED="-loader-inline-all=0"
\ No newline at end of file

From 2871558b1eff816c21cbd4b1be16a898a2966eb7 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Fri, 5 Dec 2025 07:54:14 +0000
Subject: [PATCH 3/4] Resolved lint and format errors

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 QEfficient/base/modeling_qeff.py                     |  2 +-
 QEfficient/transformers/models/modeling_auto.py      |  1 +
 QEfficient/transformers/models/pytorch_transforms.py |  6 ++----
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py         |  7 ++++---
 QEfficient/utils/torch_patches.py                    |  9 ++++-----
 .../causallm/example_pytorch_transforms.py           | 12 ++++++------
 6 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index 2fc77a458..0378a800f 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -260,7 +260,7 @@ def _export(
                     "The subfunction feature is experimental. Please note that using compile consecutively with and without subfunction may produce inconsistent results."
                 )
                 apply_torch_patches()
-                InvalidIndexProvider.SUBFUNC_ENABLED = True       
+                InvalidIndexProvider.SUBFUNC_ENABLED = True
                 output_names = [
                     re.sub("_RetainedState", "_InternalRetainedState", name)
                     if name.endswith("_RetainedState") and ("key" in name or "value" in name)
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 71e0caa8f..f3618cb1e 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1067,6 +1067,7 @@ def export(
                 kv_offload=True, comp_ctx_lengths=self.comp_ctx_lengths_decode
             )
         output_names = self.model.get_output_names(kv_offload=True)
+
         self.vision_model.export(
             inputs["vision"],
             output_names["vision"],
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 1e4358579..e64634b62 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -845,14 +845,12 @@ def get_decoder_layer_classes_for_export(model: nn.Module) -> set:
     model_class_name = model.__class__.__name__
     if "EncoderWrapper" in model_class_name:
         model_decoder_classes.update(
-            module.__class__ for module in model.modules()
-            if "Qwen2_5_VLVisionBlock" in module.__class__.__name__
+            module.__class__ for module in model.modules() if "Qwen2_5_VLVisionBlock" in module.__class__.__name__
         )
         return model_decoder_classes
 
     model_decoder_classes.update(
-        module.__class__ for module in model.modules()
-        if module.__class__ in decoder_layer_classes
+        module.__class__ for module in model.modules() if module.__class__ in decoder_layer_classes
     )
 
     return model_decoder_classes
diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index b88fd4925..018ce0851 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -75,13 +75,14 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, mrope_section, unsqu
     """
     cos = cos[position_ids]
     sin = sin[position_ids]
-    cos = torch.cat([cos[0,..., 0:32],cos[0,..., 32:80], cos[0,..., 80:128]], dim=-1).unsqueeze(0)
-    sin = torch.cat([sin[0,..., 0:32],sin[0,..., 32:80], sin[0,..., 80:128]], dim=-1).unsqueeze(0)    
+    cos = torch.cat([cos[0, ..., 0:32], cos[0, ..., 32:80], cos[0, ..., 80:128]], dim=-1).unsqueeze(0)
+    sin = torch.cat([sin[0, ..., 0:32], sin[0, ..., 32:80], sin[0, ..., 80:128]], dim=-1).unsqueeze(0)
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
-    
+
     return q_embed.to(q.dtype), k_embed.to(k.dtype)
 
+
 class QEffQwen2_5_VLVisionAttention(Qwen2_5_VLVisionAttention):
     def __init__(self, dim: int, num_heads: int = 16) -> None:
         super().__init__()
diff --git a/QEfficient/utils/torch_patches.py b/QEfficient/utils/torch_patches.py
index e5c8aa675..241b32fbf 100644
--- a/QEfficient/utils/torch_patches.py
+++ b/QEfficient/utils/torch_patches.py
@@ -7,10 +7,11 @@
 
 """Monkey patches for torch.onnx.utils to fix ONNX export issues."""
 
+import warnings
+
 import torch
 import torch.onnx.utils as onnx_utils
 from torch import _C
-import warnings
 
 # Store original references before patching
 _original_setup_trace_module_map = onnx_utils._setup_trace_module_map
@@ -38,15 +39,13 @@ def _track_module_attributes_forward_hook(module, input, output):
             if hasattr(module, attr_name):
                 onnx_attrs = getattr(module, attr_name)
                 delattr(module, attr_name)
-            
+
             # FIX: use empty dict to avoid type mismatch
             # onnx_attrs = {}
             try:
                 _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs)
             except Exception as e:
-                warnings.warn(
-                    f"Failed to track ONNX scope attributes: {e}. Skipping this step."
-                )
+                warnings.warn(f"Failed to track ONNX scope attributes: {e}. Skipping this step.")
 
         for m in model.modules():
             m.register_forward_hook(_track_module_attributes_forward_hook)
diff --git a/examples/onboarding_guide/causallm/example_pytorch_transforms.py b/examples/onboarding_guide/causallm/example_pytorch_transforms.py
index ff62588f9..503efc12d 100644
--- a/examples/onboarding_guide/causallm/example_pytorch_transforms.py
+++ b/examples/onboarding_guide/causallm/example_pytorch_transforms.py
@@ -27,12 +27,6 @@
 from types import MethodType
 from typing import Callable, Optional, Tuple, Union
 
-from QEfficient.transformers.models.blueprint.modeling_blueprint import (
-    QEffBlueprintAttention,
-    QEffBlueprintDecoderLayer,
-    QEffBlueprintForCausalLM,
-    QEffBlueprintModel,
-)
 from torch import nn
 
 # Example imports for three representative models
@@ -62,6 +56,12 @@
 from QEfficient.base.pytorch_transforms import ExternalModuleMapperTransform, ModuleMappingTransform
 from QEfficient.customop import CustomRMSNormAIC
 from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, validate_user_pooling_function
+from QEfficient.transformers.models.blueprint.modeling_blueprint import (
+    QEffBlueprintAttention,
+    QEffBlueprintDecoderLayer,
+    QEffBlueprintForCausalLM,
+    QEffBlueprintModel,
+)
 from QEfficient.transformers.models.llama.modeling_llama import (
     QEffLlamaAttention,
     QEffLlamaDecoderLayer,

From 7e1327cea70f480591371cf9a565315dd9026890 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Fri, 5 Dec 2025 07:58:54 +0000
Subject: [PATCH 4/4] Made minor fixes

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 .../causallm/example_pytorch_transforms.py           | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/onboarding_guide/causallm/example_pytorch_transforms.py b/examples/onboarding_guide/causallm/example_pytorch_transforms.py
index 503efc12d..ff62588f9 100644
--- a/examples/onboarding_guide/causallm/example_pytorch_transforms.py
+++ b/examples/onboarding_guide/causallm/example_pytorch_transforms.py
@@ -27,6 +27,12 @@
 from types import MethodType
 from typing import Callable, Optional, Tuple, Union
 
+from QEfficient.transformers.models.blueprint.modeling_blueprint import (
+    QEffBlueprintAttention,
+    QEffBlueprintDecoderLayer,
+    QEffBlueprintForCausalLM,
+    QEffBlueprintModel,
+)
 from torch import nn
 
 # Example imports for three representative models
@@ -56,12 +62,6 @@
 from QEfficient.base.pytorch_transforms import ExternalModuleMapperTransform, ModuleMappingTransform
 from QEfficient.customop import CustomRMSNormAIC
 from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, validate_user_pooling_function
-from QEfficient.transformers.models.blueprint.modeling_blueprint import (
-    QEffBlueprintAttention,
-    QEffBlueprintDecoderLayer,
-    QEffBlueprintForCausalLM,
-    QEffBlueprintModel,
-)
 from QEfficient.transformers.models.llama.modeling_llama import (
     QEffLlamaAttention,
     QEffLlamaDecoderLayer,