diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py index 2b0d50849..3bf747058 100644 --- a/QEfficient/transformers/cache_utils.py +++ b/QEfficient/transformers/cache_utils.py @@ -10,6 +10,8 @@ from typing import Any, Dict, List, Optional, Tuple import torch +import transformers +from packaging import version from transformers.cache_utils import DynamicCache, DynamicLayer, EncoderDecoderCache, HybridCache, HybridChunkedCache from QEfficient.customop import ( @@ -330,12 +332,15 @@ def __init__( layers = [] # If a config is passed, use it to infer the layer types and initialize accordingly if len(layers) == 0: - Cache.__init__( - self, - layer_class_to_replicate=QEffDynamicLayer, - offloading=offloading, - offload_only_non_sliding=offload_only_non_sliding, - ) + if version.parse(transformers.__version__) < version.parse("4.57.0"): + Cache.__init__(self, layer_classes=QEffDynamicLayer, *args, **kwargs) + else: + Cache.__init__( + self, + layer_class_to_replicate=QEffDynamicLayer, + offloading=offloading, + offload_only_non_sliding=offload_only_non_sliding, + ) else: Cache.__init__( self, diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 00e7f2d23..0564dd834 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -10,6 +10,8 @@ from types import MethodType from typing import Callable, Optional, Tuple, Union +import transformers +from packaging import version from torch import nn from transformers.models.codegen.modeling_codegen import ( CodeGenAttention, @@ -194,27 +196,29 @@ Qwen3MoeRotaryEmbedding, Qwen3MoeSparseMoeBlock, ) -from transformers.models.qwen3_vl.modeling_qwen3_vl import ( - Qwen3VLForConditionalGeneration, - Qwen3VLModel, - Qwen3VLTextAttention, - Qwen3VLTextDecoderLayer, - Qwen3VLTextModel, - Qwen3VLTextRMSNorm, - Qwen3VLVisionAttention, - Qwen3VLVisionModel, -) 
-from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( - Qwen3VLMoeForConditionalGeneration, - Qwen3VLMoeModel, - Qwen3VLMoeTextAttention, - Qwen3VLMoeTextDecoderLayer, - Qwen3VLMoeTextModel, - Qwen3VLMoeTextRMSNorm, - Qwen3VLMoeTextSparseMoeBlock, - Qwen3VLMoeVisionAttention, - Qwen3VLMoeVisionModel, -) + +if version.parse(transformers.__version__) >= version.parse("4.57.0"): + from transformers.models.qwen3_vl.modeling_qwen3_vl import ( + Qwen3VLForConditionalGeneration, + Qwen3VLModel, + Qwen3VLTextAttention, + Qwen3VLTextDecoderLayer, + Qwen3VLTextModel, + Qwen3VLTextRMSNorm, + Qwen3VLVisionAttention, + Qwen3VLVisionModel, + ) + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( + Qwen3VLMoeForConditionalGeneration, + Qwen3VLMoeModel, + Qwen3VLMoeTextAttention, + Qwen3VLMoeTextDecoderLayer, + Qwen3VLMoeTextModel, + Qwen3VLMoeTextRMSNorm, + Qwen3VLMoeTextSparseMoeBlock, + Qwen3VLMoeVisionAttention, + Qwen3VLMoeVisionModel, + ) from transformers.models.starcoder2.modeling_starcoder2 import ( Starcoder2Attention, Starcoder2DecoderLayer, @@ -454,26 +458,28 @@ QEffQwen3MoeRotaryEmbedding, QEffQwen3MoeSparseMoeBlock, ) -from QEfficient.transformers.models.qwen3_vl.modeling_qwen3_vl import ( - QEffQwen3VLForConditionalGeneration, - QEffQwen3VLModel, - QEffQwen3VLTextAttention, - QEffQwen3VLTextDecoderLayer, - QEffQwen3VLTextModel, - QEffQwen3VLVisionAttention, - QEffQwen3VLVisionModel, -) -from QEfficient.transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( - QEffPrefillChunkedQwen3VLMoeTextSparseMoeBlock, - QEffQwen3VLMoeForConditionalGeneration, - QEffQwen3VLMoeModel, - QEffQwen3VLMoeTextAttention, - QEffQwen3VLMoeTextDecoderLayer, - QEffQwen3VLMoeTextModel, - QEffQwen3VLMoeTextSparseMoeBlock, - QEffQwen3VLMoeVisionAttention, - QEffQwen3VLMoeVisionModel, -) + +if version.parse(transformers.__version__) >= version.parse("4.57.0"): + from QEfficient.transformers.models.qwen3_vl.modeling_qwen3_vl import ( + 
QEffQwen3VLForConditionalGeneration, + QEffQwen3VLModel, + QEffQwen3VLTextAttention, + QEffQwen3VLTextDecoderLayer, + QEffQwen3VLTextModel, + QEffQwen3VLVisionAttention, + QEffQwen3VLVisionModel, + ) + from QEfficient.transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( + QEffPrefillChunkedQwen3VLMoeTextSparseMoeBlock, + QEffQwen3VLMoeForConditionalGeneration, + QEffQwen3VLMoeModel, + QEffQwen3VLMoeTextAttention, + QEffQwen3VLMoeTextDecoderLayer, + QEffQwen3VLMoeTextModel, + QEffQwen3VLMoeTextSparseMoeBlock, + QEffQwen3VLMoeVisionAttention, + QEffQwen3VLMoeVisionModel, + ) from QEfficient.transformers.models.starcoder2.modeling_starcoder2 import ( QEffStarcoder2Attention, QEFFStarcoder2DecoderLayer, @@ -501,217 +507,226 @@ class CustomOpsTransform(ModuleMappingTransform): - _module_mapping = { - GemmaRMSNorm: GemmaCustomRMSNormAIC, - Gemma2RMSNorm: GemmaCustomRMSNormAIC, - GptOssRMSNorm: CustomRMSNormAIC, - LlamaRMSNorm: CustomRMSNormAIC, - Llama4TextRMSNorm: CustomRMSNormAIC, - MistralRMSNorm: CustomRMSNormAIC, - Mistral3RMSNorm: CustomRMSNormAIC, - MixtralRMSNorm: CustomRMSNormAIC, - Phi3RMSNorm: CustomRMSNormAIC, - Qwen2RMSNorm: CustomRMSNormAIC, - Qwen3RMSNorm: CustomRMSNormAIC, - Qwen2_5RMSNorm: CustomRMSNormAIC, - MllamaTextRMSNorm: CustomRMSNormAIC, - GraniteRMSNorm: CustomRMSNormAIC, - PixtralRMSNorm: CustomRMSNormAIC, - GraniteMoeRMSNorm: CustomRMSNormAIC, - Qwen3MoeRMSNorm: CustomRMSNormAIC, - Gemma3RMSNorm: QEffGemma3CustomRMSNormAIC, - Olmo2RMSNorm: CustomRMSNormAIC, - Qwen3VLMoeTextRMSNorm: CustomRMSNormAIC, - Qwen3VLTextRMSNorm: CustomRMSNormAIC, - } + if version.parse(transformers.__version__) >= version.parse("4.57.0"): + _module_mapping = { + Qwen3VLMoeTextRMSNorm: CustomRMSNormAIC, + Qwen3VLTextRMSNorm: CustomRMSNormAIC, + } + else: + _module_mapping = { + GemmaRMSNorm: GemmaCustomRMSNormAIC, + Gemma2RMSNorm: GemmaCustomRMSNormAIC, + GptOssRMSNorm: CustomRMSNormAIC, + LlamaRMSNorm: CustomRMSNormAIC, + Llama4TextRMSNorm: 
CustomRMSNormAIC, + MistralRMSNorm: CustomRMSNormAIC, + Mistral3RMSNorm: CustomRMSNormAIC, + MixtralRMSNorm: CustomRMSNormAIC, + Phi3RMSNorm: CustomRMSNormAIC, + Qwen2RMSNorm: CustomRMSNormAIC, + Qwen3RMSNorm: CustomRMSNormAIC, + Qwen2_5RMSNorm: CustomRMSNormAIC, + MllamaTextRMSNorm: CustomRMSNormAIC, + GraniteRMSNorm: CustomRMSNormAIC, + PixtralRMSNorm: CustomRMSNormAIC, + GraniteMoeRMSNorm: CustomRMSNormAIC, + Qwen3MoeRMSNorm: CustomRMSNormAIC, + Gemma3RMSNorm: QEffGemma3CustomRMSNormAIC, + Olmo2RMSNorm: CustomRMSNormAIC, + } class KVCacheTransform(ModuleMappingTransform): - _module_mapping = { - # CodeGen - CodeGenAttention: QEffCodeGenAttention, - CodeGenBlock: QEffCodeGenBlock, - CodeGenModel: QEffCodeGenModel, - CodeGenForCausalLM: QEffCodeGenForCausalLM, - # Falcon - FalconAttention: QEffFalconAttention, - FalconDecoderLayer: QEffFalconDecoderLayer, - FalconModel: QEffFalconModel, - FalconForCausalLM: QEffFalconForCausalLM, - # GPT2 - GPT2Attention: QEffGPT2Attention, - GPT2Block: QEffGPT2Block, - GPT2Model: QEffGPT2Model, - GPT2LMHeadModel: QEffGPT2LMHeadModel, - # GPTJ - GPTJAttention: QEffGPTJAttention, - GPTJBlock: QEffGPTJBlock, - GPTJModel: QEffGPTJModel, - GPTJForCausalLM: QEffGPTJForCausalLM, - # Llama - LlamaAttention: QEffLlamaAttention, - LlamaDecoderLayer: QEffLlamaDecoderLayer, - LlamaModel: QEffLlamaModel, - LlamaForCausalLM: QEffLlamaForCausalLM, - LlamaRotaryEmbedding: QEffLlamaRotaryEmbedding, - # Llama4 - Llama4TextAttention: QEffLlama4TextAttention, - Llama4ForCausalLM: QEffLlama4ForCausalLM, - Llama4TextDecoderLayer: QEffLlama4TextDecoderLayer, - Llama4TextModel: QEffLlama4TextModel, - Llama4TextMoe: QEffLlama4TextMoe, - Llama4ForConditionalGeneration: QEffLlama4ForConditionalGeneration, - Llama4VisionAttention: QEffLlama4VisionAttention, - Llama4VisionModel: QEffLlama4VisionModel, - Llama4TextExperts: QEffLlama4TextExperts, - Llama4Router: QEffLlama4Router, - # Llava - LlavaForConditionalGeneration: QEffLlavaForConditionalGeneration, - # 
Llava Next - LlavaNextForConditionalGeneration: QEffLlavaNextForConditionalGeneration, - # Gemma - GemmaAttention: QEffGemmaAttention, - GemmaDecoderLayer: QEffGemmaDecoderLayer, - GemmaModel: QEffGemmaModel, - GemmaForCausalLM: QEffGemmaForCausalLM, - # Qwen3Moe - Qwen3MoeForCausalLM: QEffQwen3MoeForCausalLM, - Qwen3MoeModel: QEffQwen3MoeModel, - Qwen3MoeDecoderLayer: QEffQwen3MoeDecoderLayer, - Qwen3MoeAttention: QEffQwen3MoeAttention, - Qwen3MoeRotaryEmbedding: QEffQwen3MoeRotaryEmbedding, - Qwen3MoeSparseMoeBlock: QEffQwen3MoeSparseMoeBlock, - Qwen3VLMoeForConditionalGeneration: QEffQwen3VLMoeForConditionalGeneration, - Qwen3VLMoeModel: QEffQwen3VLMoeModel, - Qwen3VLMoeTextAttention: QEffQwen3VLMoeTextAttention, - Qwen3VLMoeTextDecoderLayer: QEffQwen3VLMoeTextDecoderLayer, - Qwen3VLMoeVisionAttention: QEffQwen3VLMoeVisionAttention, - Qwen3VLMoeVisionModel: QEffQwen3VLMoeVisionModel, - Qwen3VLMoeTextModel: QEffQwen3VLMoeTextModel, - Qwen3VLMoeTextSparseMoeBlock: QEffQwen3VLMoeTextSparseMoeBlock, - # Qwen3vl - Qwen3VLForConditionalGeneration: QEffQwen3VLForConditionalGeneration, - Qwen3VLModel: QEffQwen3VLModel, - Qwen3VLTextAttention: QEffQwen3VLTextAttention, - Qwen3VLTextDecoderLayer: QEffQwen3VLTextDecoderLayer, - Qwen3VLVisionAttention: QEffQwen3VLVisionAttention, - Qwen3VLVisionModel: QEffQwen3VLVisionModel, - Qwen3VLTextModel: QEffQwen3VLTextModel, - # Gemma2 - Gemma2Attention: QEffGemma2Attention, - Gemma2DecoderLayer: QEffGemma2DecoderLayer, - Gemma2Model: QEffGemma2Model, - Gemma2ForCausalLM: QEffGemma2ForCausalLM, - # Gemma3 - Gemma3Attention: QEffGemma3Attention, - Gemma3DecoderLayer: QEffGemma3DecoderLayer, - Gemma3TextModel: QEffGemma3TextModel, - Gemma3ForCausalLM: QEffGemma3ForCausalLMModel, - Gemma3ForConditionalGeneration: QEffGemma3ForConditionalGeneration, - # GPT_OSS - GptOssAttention: QEffGptOssAttention, - GptOssDecoderLayer: QEffGptOssDecoderLayer, - GptOssModel: QEffGptOssModel, - GptOssForCausalLM: QEffGptOssForCausalLM, - GptOssMLP: 
QEffGptOssMLP, - GptOssExperts: QEffGptOssExperts, - # Granite - GraniteModel: QEffGraniteModel, - GraniteForCausalLM: QEffGraniteForCausalLM, - GraniteAttention: QEffGraniteAttention, - GraniteDecoderLayer: QEffGraniteDecoderLayer, - # GraniteMoe - GraniteMoeModel: QEffGraniteMoeModel, - GraniteMoeForCausalLM: QEffGraniteMoeForCausalLM, - GraniteMoeAttention: QEffGraniteMoeAttention, - GraniteMoeRotaryEmbedding: QEffGraniteMoeRotaryEmbedding, - GraniteMoeParallelExperts: QEffGraniteMoeParallelExperts, - GraniteMoeTopKGating: QEffGraniteMoeTopKGating, - GraniteMoeMoE: QEffGraniteMoeMoE, - # mllama - MllamaTextRMSNorm: CustomRMSNormAIC, - MllamaTextSelfAttention: QEffMllamaTextSelfAttention, - MllamaSelfAttentionDecoderLayer: QEffMllamaSelfAttentionDecoderLayer, - MllamaModel: QEffMllamaModel, - MllamaCrossAttentionDecoderLayer: QEffMllamaCrossAttentionDecoderLayer, - MllamaRotaryEmbedding: QEffMllamaRotaryEmbedding, - MllamaVisionModel: QEffMllamaVisionModel, - MllamaTextModel: QEffMllamaTextModel, - MllamaForCausalLM: QEffMllamaForCausalLM, - MllamaForConditionalGeneration: QEffMllamaForConditionalGeneration, - # Mistral - MistralAttention: QEffMistralAttention, - MistralDecoderLayer: QEffMistralDecoderLayer, - MistralModel: QEffMistralModel, - MistralForCausalLM: QEffMistralForCausalLM, - # Mistral3 - Mistral3ForConditionalGeneration: QEffMistral3ForConditionalGeneration, - Mistral3Model: QEffMistral3Model, - # Mixtral - MixtralAttention: QEffMixtralAttention, - MixtralSparseMoeBlock: QEffMixtralSparseMoeBlock, - MixtralDecoderLayer: QeffMixtralDecoderLayer, - MixtralModel: QEffMixtralModel, - MixtralForCausalLM: QEffMixtralForCausalLM, - # Mpt - MptAttention: QEffMptAttention, - MptBlock: QEffMptBlock, - MptModel: QEFfMptModel, - MptForCausalLM: QEffMptForCausalLM, - # Phi3 - Phi3Attention: QEffPhi3Attention, - Phi3DecoderLayer: QEffPhi3DecoderLayer, - Phi3Model: QEffPhi3Model, - Phi3ForCausalLM: QEffPhi3ForCausalLM, - # Phi - PhiAttention: QEffPhiAttention, - 
PhiDecoderLayer: QEffPhiDecoderLayer, - PhiModel: QEffPhiModel, - PhiForCausalLM: QEffPhiForCausalLM, - # Pixtral - PixtralVisionModel: QEffPixtralVisionModel, - # Qwen2 - Qwen2Attention: QEffQwen2Attention, - Qwen2DecoderLayer: QEffQwen2DecoderLayer, - Qwen2Model: QEffQwen2Model, - Qwen2ForCausalLM: QEffQwen2ForCausalLM, - # Qwen3 - Qwen3Attention: QEffQwen3Attention, - Qwen3DecoderLayer: QEffQwen3DecoderLayer, - Qwen3Model: QEffQwen3Model, - Qwen3ForCausalLM: QEffQwen3ForCausalLM, - # Qwen2.5 VL - Qwen2_5_VLForConditionalGeneration: QEffQwen_2_5_vl_ForConditionalGeneration, - Qwen2_5_VLModel: QEffQwen2_5_VLModel, - Qwen2_5_VLAttention: QEffQwen2_5_VLAttention, - Qwen2_5_VLDecoderLayer: QEffQwen2_5_VLDecoderLayer, - Qwen2_5_VisionTransformerPretrainedModel: QEffQwen2_5_VisionTransformerPretrainedModel, - Qwen2_5_VLVisionAttention: QEffQwen2_5_VLVisionAttention, - Qwen2_5_VLTextModel: QEffQwen2_5_VLTextModel, - # Starcoder2 - Starcoder2Attention: QEffStarcoder2Attention, - Starcoder2DecoderLayer: QEFFStarcoder2DecoderLayer, - Starcoder2Model: QEffStarcoder2Model, - Starcoder2ForCausalLM: QEffStarcoder2ForCausalLM, - # GptBigcode - GPTBigCodeAttention: QEffGPTBigCodeAttention, - GPTBigCodeBlock: QEffGPTBigCodeBlock, - GPTBigCodeModel: QEffGPTBigCodeModel, - GPTBigCodeForCausalLM: QEffGPTBigCodeForCausalLM, - # Olmo2 - Olmo2Attention: QEffOlmo2Attention, - Olmo2DecoderLayer: QEffOlmo2DecoderLayer, - Olmo2Model: QEffOlmo2Model, - Olmo2ForCausalLM: QEffOlmo2ForCausalLM, - # Whisper encoder and decoder layers - WhisperPositionalEmbedding: QEffWhisperPositionalEmbedding, - WhisperAttention: QEffWhisperAttention, - WhisperDecoderLayer: QEffWhisperDecoderLayer, - WhisperEncoder: QEffWhisperEncoder, - WhisperDecoder: QEffWhisperDecoder, - WhisperModel: QEffWhisperModel, - WhisperForConditionalGeneration: QEffWhisperForConditionalGeneration, - } + if version.parse(transformers.__version__) >= version.parse("4.57.0"): + _module_mapping = { + # Qwen3VlMoe + 
Qwen3VLMoeForConditionalGeneration: QEffQwen3VLMoeForConditionalGeneration, + Qwen3VLMoeModel: QEffQwen3VLMoeModel, + Qwen3VLMoeTextAttention: QEffQwen3VLMoeTextAttention, + Qwen3VLMoeTextDecoderLayer: QEffQwen3VLMoeTextDecoderLayer, + Qwen3VLMoeVisionAttention: QEffQwen3VLMoeVisionAttention, + Qwen3VLMoeVisionModel: QEffQwen3VLMoeVisionModel, + Qwen3VLMoeTextModel: QEffQwen3VLMoeTextModel, + Qwen3VLMoeTextSparseMoeBlock: QEffQwen3VLMoeTextSparseMoeBlock, + # Qwen3vl + Qwen3VLForConditionalGeneration: QEffQwen3VLForConditionalGeneration, + Qwen3VLModel: QEffQwen3VLModel, + Qwen3VLTextAttention: QEffQwen3VLTextAttention, + Qwen3VLTextDecoderLayer: QEffQwen3VLTextDecoderLayer, + Qwen3VLVisionAttention: QEffQwen3VLVisionAttention, + Qwen3VLVisionModel: QEffQwen3VLVisionModel, + Qwen3VLTextModel: QEffQwen3VLTextModel, + } + else: + _module_mapping = { + # CodeGen + CodeGenAttention: QEffCodeGenAttention, + CodeGenBlock: QEffCodeGenBlock, + CodeGenModel: QEffCodeGenModel, + CodeGenForCausalLM: QEffCodeGenForCausalLM, + # Falcon + FalconAttention: QEffFalconAttention, + FalconDecoderLayer: QEffFalconDecoderLayer, + FalconModel: QEffFalconModel, + FalconForCausalLM: QEffFalconForCausalLM, + # GPT2 + GPT2Attention: QEffGPT2Attention, + GPT2Block: QEffGPT2Block, + GPT2Model: QEffGPT2Model, + GPT2LMHeadModel: QEffGPT2LMHeadModel, + # GPTJ + GPTJAttention: QEffGPTJAttention, + GPTJBlock: QEffGPTJBlock, + GPTJModel: QEffGPTJModel, + GPTJForCausalLM: QEffGPTJForCausalLM, + # Llama + LlamaAttention: QEffLlamaAttention, + LlamaDecoderLayer: QEffLlamaDecoderLayer, + LlamaModel: QEffLlamaModel, + LlamaForCausalLM: QEffLlamaForCausalLM, + LlamaRotaryEmbedding: QEffLlamaRotaryEmbedding, + # Llama4 + Llama4TextAttention: QEffLlama4TextAttention, + Llama4ForCausalLM: QEffLlama4ForCausalLM, + Llama4TextDecoderLayer: QEffLlama4TextDecoderLayer, + Llama4TextModel: QEffLlama4TextModel, + Llama4TextMoe: QEffLlama4TextMoe, + Llama4ForConditionalGeneration: QEffLlama4ForConditionalGeneration, 
+ Llama4VisionAttention: QEffLlama4VisionAttention, + Llama4VisionModel: QEffLlama4VisionModel, + Llama4TextExperts: QEffLlama4TextExperts, + Llama4Router: QEffLlama4Router, + # Llava + LlavaForConditionalGeneration: QEffLlavaForConditionalGeneration, + # Llava Next + LlavaNextForConditionalGeneration: QEffLlavaNextForConditionalGeneration, + # Gemma + GemmaAttention: QEffGemmaAttention, + GemmaDecoderLayer: QEffGemmaDecoderLayer, + GemmaModel: QEffGemmaModel, + GemmaForCausalLM: QEffGemmaForCausalLM, + # Qwen3Moe + Qwen3MoeForCausalLM: QEffQwen3MoeForCausalLM, + Qwen3MoeModel: QEffQwen3MoeModel, + Qwen3MoeDecoderLayer: QEffQwen3MoeDecoderLayer, + Qwen3MoeAttention: QEffQwen3MoeAttention, + Qwen3MoeRotaryEmbedding: QEffQwen3MoeRotaryEmbedding, + Qwen3MoeSparseMoeBlock: QEffQwen3MoeSparseMoeBlock, + # Gemma2 + Gemma2Attention: QEffGemma2Attention, + Gemma2DecoderLayer: QEffGemma2DecoderLayer, + Gemma2Model: QEffGemma2Model, + Gemma2ForCausalLM: QEffGemma2ForCausalLM, + # Gemma3 + Gemma3Attention: QEffGemma3Attention, + Gemma3DecoderLayer: QEffGemma3DecoderLayer, + Gemma3TextModel: QEffGemma3TextModel, + Gemma3ForCausalLM: QEffGemma3ForCausalLMModel, + Gemma3ForConditionalGeneration: QEffGemma3ForConditionalGeneration, + # GPT_OSS + GptOssAttention: QEffGptOssAttention, + GptOssDecoderLayer: QEffGptOssDecoderLayer, + GptOssModel: QEffGptOssModel, + GptOssForCausalLM: QEffGptOssForCausalLM, + GptOssMLP: QEffGptOssMLP, + GptOssExperts: QEffGptOssExperts, + # Granite + GraniteModel: QEffGraniteModel, + GraniteForCausalLM: QEffGraniteForCausalLM, + GraniteAttention: QEffGraniteAttention, + GraniteDecoderLayer: QEffGraniteDecoderLayer, + # GraniteMoe + GraniteMoeModel: QEffGraniteMoeModel, + GraniteMoeForCausalLM: QEffGraniteMoeForCausalLM, + GraniteMoeAttention: QEffGraniteMoeAttention, + GraniteMoeRotaryEmbedding: QEffGraniteMoeRotaryEmbedding, + GraniteMoeParallelExperts: QEffGraniteMoeParallelExperts, + GraniteMoeTopKGating: QEffGraniteMoeTopKGating, + GraniteMoeMoE: 
QEffGraniteMoeMoE, + # mllama + MllamaTextRMSNorm: CustomRMSNormAIC, + MllamaTextSelfAttention: QEffMllamaTextSelfAttention, + MllamaSelfAttentionDecoderLayer: QEffMllamaSelfAttentionDecoderLayer, + MllamaModel: QEffMllamaModel, + MllamaCrossAttentionDecoderLayer: QEffMllamaCrossAttentionDecoderLayer, + MllamaRotaryEmbedding: QEffMllamaRotaryEmbedding, + MllamaVisionModel: QEffMllamaVisionModel, + MllamaTextModel: QEffMllamaTextModel, + MllamaForCausalLM: QEffMllamaForCausalLM, + MllamaForConditionalGeneration: QEffMllamaForConditionalGeneration, + # Mistral + MistralAttention: QEffMistralAttention, + MistralDecoderLayer: QEffMistralDecoderLayer, + MistralModel: QEffMistralModel, + MistralForCausalLM: QEffMistralForCausalLM, + # Mistral3 + Mistral3ForConditionalGeneration: QEffMistral3ForConditionalGeneration, + Mistral3Model: QEffMistral3Model, + # Mixtral + MixtralAttention: QEffMixtralAttention, + MixtralSparseMoeBlock: QEffMixtralSparseMoeBlock, + MixtralDecoderLayer: QeffMixtralDecoderLayer, + MixtralModel: QEffMixtralModel, + MixtralForCausalLM: QEffMixtralForCausalLM, + # Mpt + MptAttention: QEffMptAttention, + MptBlock: QEffMptBlock, + MptModel: QEFfMptModel, + MptForCausalLM: QEffMptForCausalLM, + # Phi3 + Phi3Attention: QEffPhi3Attention, + Phi3DecoderLayer: QEffPhi3DecoderLayer, + Phi3Model: QEffPhi3Model, + Phi3ForCausalLM: QEffPhi3ForCausalLM, + # Phi + PhiAttention: QEffPhiAttention, + PhiDecoderLayer: QEffPhiDecoderLayer, + PhiModel: QEffPhiModel, + PhiForCausalLM: QEffPhiForCausalLM, + # Pixtral + PixtralVisionModel: QEffPixtralVisionModel, + # Qwen2 + Qwen2Attention: QEffQwen2Attention, + Qwen2DecoderLayer: QEffQwen2DecoderLayer, + Qwen2Model: QEffQwen2Model, + Qwen2ForCausalLM: QEffQwen2ForCausalLM, + # Qwen3 + Qwen3Attention: QEffQwen3Attention, + Qwen3DecoderLayer: QEffQwen3DecoderLayer, + Qwen3Model: QEffQwen3Model, + Qwen3ForCausalLM: QEffQwen3ForCausalLM, + # Qwen2.5 VL + Qwen2_5_VLForConditionalGeneration: 
QEffQwen_2_5_vl_ForConditionalGeneration, + Qwen2_5_VLModel: QEffQwen2_5_VLModel, + Qwen2_5_VLAttention: QEffQwen2_5_VLAttention, + Qwen2_5_VLDecoderLayer: QEffQwen2_5_VLDecoderLayer, + Qwen2_5_VisionTransformerPretrainedModel: QEffQwen2_5_VisionTransformerPretrainedModel, + Qwen2_5_VLVisionAttention: QEffQwen2_5_VLVisionAttention, + Qwen2_5_VLTextModel: QEffQwen2_5_VLTextModel, + # Starcoder2 + Starcoder2Attention: QEffStarcoder2Attention, + Starcoder2DecoderLayer: QEFFStarcoder2DecoderLayer, + Starcoder2Model: QEffStarcoder2Model, + Starcoder2ForCausalLM: QEffStarcoder2ForCausalLM, + # GptBigcode + GPTBigCodeAttention: QEffGPTBigCodeAttention, + GPTBigCodeBlock: QEffGPTBigCodeBlock, + GPTBigCodeModel: QEffGPTBigCodeModel, + GPTBigCodeForCausalLM: QEffGPTBigCodeForCausalLM, + # Olmo2 + Olmo2Attention: QEffOlmo2Attention, + Olmo2DecoderLayer: QEffOlmo2DecoderLayer, + Olmo2Model: QEffOlmo2Model, + Olmo2ForCausalLM: QEffOlmo2ForCausalLM, + # Whisper encoder and decoder layers + WhisperPositionalEmbedding: QEffWhisperPositionalEmbedding, + WhisperAttention: QEffWhisperAttention, + WhisperDecoderLayer: QEffWhisperDecoderLayer, + WhisperEncoder: QEffWhisperEncoder, + WhisperDecoder: QEffWhisperDecoder, + WhisperModel: QEffWhisperModel, + WhisperForConditionalGeneration: QEffWhisperForConditionalGeneration, + } @classmethod def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: @@ -728,16 +743,20 @@ class PrefillOnlyTransform(ModuleMappingTransform): class PrefillOnlyChunkedTransform(ModuleMappingTransform): - _module_mapping = { - # GPT_OSS - QEffGptOssModel: QEffPrefillOnlyGptOssModel, - QEffGptOssAttention: QEffPrefillOnlyChunkedGptOssAttention, - QEffGptOssMLP: QEffPrefillOnlyChunkedGptOssMLP, - # Qwen3Moe - QEffQwen3MoeSparseMoeBlock: QEffPrefillChunkedQwen3MoeSparseMoeBlock, - # Qwen3 VL Moe - QEffQwen3VLMoeTextSparseMoeBlock: QEffPrefillChunkedQwen3VLMoeTextSparseMoeBlock, - } + if version.parse(transformers.__version__) >= version.parse("4.57.0"): + 
_module_mapping = { + # Qwen3 VL Moe + QEffQwen3VLMoeTextSparseMoeBlock: QEffPrefillChunkedQwen3VLMoeTextSparseMoeBlock, + } + else: + _module_mapping = { + # GPT_OSS + QEffGptOssModel: QEffPrefillOnlyGptOssModel, + QEffGptOssAttention: QEffPrefillOnlyChunkedGptOssAttention, + QEffGptOssMLP: QEffPrefillOnlyChunkedGptOssMLP, + # Qwen3Moe + QEffQwen3MoeSparseMoeBlock: QEffPrefillChunkedQwen3MoeSparseMoeBlock, + } class RevertPrefillKeepAttentionTransform(ModuleMappingTransform): diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 070856c6e..2dbd42b7b 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -10,23 +10,27 @@ import torch import torch.nn as nn import torch.nn.functional as F +import transformers +from packaging import version from transformers.cache_utils import Cache from transformers.modeling_outputs import ( BaseModelOutputWithPast, ) -from transformers.models.qwen3_vl.modeling_qwen3_vl import ( - Qwen3VLForConditionalGeneration, - Qwen3VLModel, - Qwen3VLModelOutputWithPast, - Qwen3VLTextAttention, - Qwen3VLTextDecoderLayer, - Qwen3VLTextModel, - Qwen3VLVisionAttention, - Qwen3VLVisionModel, - apply_rotary_pos_emb_vision, - repeat_kv, - rotate_half, -) + +if version.parse(transformers.__version__) >= version.parse("4.57.0"): + from transformers.models.qwen3_vl.modeling_qwen3_vl import ( + Qwen3VLForConditionalGeneration, + Qwen3VLModel, + Qwen3VLModelOutputWithPast, + Qwen3VLTextAttention, + Qwen3VLTextDecoderLayer, + Qwen3VLTextModel, + Qwen3VLVisionAttention, + Qwen3VLVisionModel, + apply_rotary_pos_emb_vision, + repeat_kv, + rotate_half, + ) from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask @@ -844,10 +848,10 @@ def get_specializations( comp_ctx_lengths_prefill = 
compiler_options.pop("comp_ctx_lengths_prefill", None) comp_ctx_lengths_decode = compiler_options.pop("comp_ctx_lengths_decode", None) if height is None or width is None: - height = 1365 - width = 2048 + height = constants.QWEN3_VL_HEIGHT + width = constants.QWEN3_VL_WIDTH logger.warning( - "Setting height and width to be 1365 and 2048 respectively, as it was neither passed nor found in vision_config" + f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config" ) prefill_seq_len = prefill_seq_len if prefill_seq_len else 128 ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN diff --git a/QEfficient/transformers/quantizers/quant_transforms.py b/QEfficient/transformers/quantizers/quant_transforms.py index f97bfe998..9870ced5e 100644 --- a/QEfficient/transformers/quantizers/quant_transforms.py +++ b/QEfficient/transformers/quantizers/quant_transforms.py @@ -6,10 +6,14 @@ # ----------------------------------------------------------------------------- import torch +import transformers +from packaging import version from torch import nn from transformers import AutoConfig from transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts -from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts + +if version.parse(transformers.__version__) >= version.parse("4.57.0"): + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts from QEfficient.base.pytorch_transforms import ModuleMutatorTransform from QEfficient.customop.matmulnbits import QuantLinearORT diff --git a/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py b/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py index 382677bcf..94cf6ba98 100644 --- a/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py +++ b/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py @@ -10,7 +10,11 @@ from typing import List import torch -from 
transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts +import transformers +from packaging import version + +if version.parse(transformers.__version__) >= version.parse("4.57.0"): + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts from transformers.quantizers.quantizer_compressed_tensors import CompressedTensorsHfQuantizer from transformers.utils.quantization_config import CompressedTensorsConfig, QuantizationConfigMixin, QuantizationMethod diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 7e6dd1cbb..af1fe5c03 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -143,6 +143,10 @@ def get_models_dir(): QWEN2_5_VL_HEIGHT = 354 QWEN2_5_VL_WIDTH = 536 +# Qwen3_vl Constants +QWEN3_VL_HEIGHT = 354 +QWEN3_VL_WIDTH = 536 + # Modules to cache while clearing the pytorch weights CACHE_MODULES = ["get_output_names", "get_dummy_inputs", "get_onnx_dynamic_axes", "get_specializations"] diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json index e5a3f9503..ef5b775f2 100644 --- a/tests/configs/image_text_model_configs.json +++ b/tests/configs/image_text_model_configs.json @@ -105,6 +105,48 @@ "full_batch_size": 2, "additional_params": {} }, + { + "model_name": "Qwen/Qwen3-VL-2B-Instruct", + "model_type": "qwen3_vl", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 4096, + "img_size": 1540, + "img_url": "https://picsum.photos/id/237/536/354", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "img_url_list":[ + "https://picsum.photos/id/237/536/354", + "https://picsum.photos/id/237/536/354" + ], + "text_prompt_list": [ + "Can you describe the image in detail?", + "What are the objects in the image?" 
+ ], + "full_batch_size": 2, + "additional_params": {} + }, + { + "model_name": "Qwen/Qwen3-VL-30B-A3B-Instruct", + "model_type": "qwen3_vl_moe", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 4096, + "img_size": 1540, + "img_url": "https://picsum.photos/id/237/536/354", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "img_url_list":[ + "https://picsum.photos/id/237/536/354", + "https://picsum.photos/id/237/536/354" + ], + "text_prompt_list": [ + "Can you describe the image in detail?", + "What are the objects in the image?" + ], + "full_batch_size": 2, + "additional_params": {} + }, { "model_name": "allenai/Molmo-7B-D-0924", "model_type": "molmo", diff --git a/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py b/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py index a2c72ba7a..18dc92548 100644 --- a/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py +++ b/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py @@ -13,6 +13,8 @@ import pytest import requests import torch +import transformers +from packaging import version from PIL import Image from transformers import ( AutoConfig, @@ -321,10 +323,25 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( if not is_intern_model and not is_molmo_model: inputs = processor(images=image, text=prompt, return_tensors="pt") - if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": + if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type in ["qwen2_5_vl"]: inputs = qeff_model.model.prepare_inputs_for_generation( inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size ) + if ( + hasattr(qeff_model.model.config, "model_type") + and version.parse(transformers.__version__) >= version.parse("4.57.0") + and qeff_model.model.config.model_type in ["qwen3_vl", "qwen3_vl_moe"] + ): + 
inputs = qeff_model.model.prepare_inputs_for_generation( + inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size + ) + if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type in [ + "qwen3_vl", + "qwen3_vl_moe", + ]: + config.vision_config.depth = 9 + config.text_config.num_hidden_layers = 1 + config.vision_config.deepstack_visual_indexes = [8] if "pixel_values" in inputs: inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) @@ -352,7 +369,14 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload ]: pytest.skip("Test skipped for this model due to some issues.") if ( - model_name in ["OpenGVLab/InternVL2_5-1B", "OpenGVLab/InternVL3_5-1B", "Qwen/Qwen2.5-VL-3B-Instruct"] + model_name + in [ + "OpenGVLab/InternVL2_5-1B", + "OpenGVLab/InternVL3_5-1B", + "Qwen/Qwen2.5-VL-3B-Instruct", + "Qwen/Qwen3-VL-2B-Instruct", + "Qwen/Qwen3-VL-30B-A3B-Instruct", + ] and not kv_offload ): pytest.skip("These models require kv_offload=True for testing.")