diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py index 2b0d50849..3bf747058 100644 --- a/QEfficient/transformers/cache_utils.py +++ b/QEfficient/transformers/cache_utils.py @@ -10,6 +10,8 @@ from typing import Any, Dict, List, Optional, Tuple import torch +import transformers +from packaging import version from transformers.cache_utils import DynamicCache, DynamicLayer, EncoderDecoderCache, HybridCache, HybridChunkedCache from QEfficient.customop import ( @@ -330,12 +332,15 @@ def __init__( layers = [] # If a config is passed, use it to infer the layer types and initialize accordingly if len(layers) == 0: - Cache.__init__( - self, - layer_class_to_replicate=QEffDynamicLayer, - offloading=offloading, - offload_only_non_sliding=offload_only_non_sliding, - ) + if version.parse(transformers.__version__) < version.parse("4.57.0"): + Cache.__init__(self, layer_classes=QEffDynamicLayer, *args, **kwargs) + else: + Cache.__init__( + self, + layer_class_to_replicate=QEffDynamicLayer, + offloading=offloading, + offload_only_non_sliding=offload_only_non_sliding, + ) else: Cache.__init__( self, diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 00e7f2d23..0564dd834 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -10,6 +10,8 @@ from types import MethodType from typing import Callable, Optional, Tuple, Union +import transformers +from packaging import version from torch import nn from transformers.models.codegen.modeling_codegen import ( CodeGenAttention, @@ -194,27 +196,29 @@ Qwen3MoeRotaryEmbedding, Qwen3MoeSparseMoeBlock, ) -from transformers.models.qwen3_vl.modeling_qwen3_vl import ( - Qwen3VLForConditionalGeneration, - Qwen3VLModel, - Qwen3VLTextAttention, - Qwen3VLTextDecoderLayer, - Qwen3VLTextModel, - Qwen3VLTextRMSNorm, - Qwen3VLVisionAttention, - Qwen3VLVisionModel, -) 
-from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( - Qwen3VLMoeForConditionalGeneration, - Qwen3VLMoeModel, - Qwen3VLMoeTextAttention, - Qwen3VLMoeTextDecoderLayer, - Qwen3VLMoeTextModel, - Qwen3VLMoeTextRMSNorm, - Qwen3VLMoeTextSparseMoeBlock, - Qwen3VLMoeVisionAttention, - Qwen3VLMoeVisionModel, -) + +if version.parse(transformers.__version__) >= version.parse("4.57.0"): + from transformers.models.qwen3_vl.modeling_qwen3_vl import ( + Qwen3VLForConditionalGeneration, + Qwen3VLModel, + Qwen3VLTextAttention, + Qwen3VLTextDecoderLayer, + Qwen3VLTextModel, + Qwen3VLTextRMSNorm, + Qwen3VLVisionAttention, + Qwen3VLVisionModel, + ) + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( + Qwen3VLMoeForConditionalGeneration, + Qwen3VLMoeModel, + Qwen3VLMoeTextAttention, + Qwen3VLMoeTextDecoderLayer, + Qwen3VLMoeTextModel, + Qwen3VLMoeTextRMSNorm, + Qwen3VLMoeTextSparseMoeBlock, + Qwen3VLMoeVisionAttention, + Qwen3VLMoeVisionModel, + ) from transformers.models.starcoder2.modeling_starcoder2 import ( Starcoder2Attention, Starcoder2DecoderLayer, @@ -454,26 +458,28 @@ QEffQwen3MoeRotaryEmbedding, QEffQwen3MoeSparseMoeBlock, ) -from QEfficient.transformers.models.qwen3_vl.modeling_qwen3_vl import ( - QEffQwen3VLForConditionalGeneration, - QEffQwen3VLModel, - QEffQwen3VLTextAttention, - QEffQwen3VLTextDecoderLayer, - QEffQwen3VLTextModel, - QEffQwen3VLVisionAttention, - QEffQwen3VLVisionModel, -) -from QEfficient.transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( - QEffPrefillChunkedQwen3VLMoeTextSparseMoeBlock, - QEffQwen3VLMoeForConditionalGeneration, - QEffQwen3VLMoeModel, - QEffQwen3VLMoeTextAttention, - QEffQwen3VLMoeTextDecoderLayer, - QEffQwen3VLMoeTextModel, - QEffQwen3VLMoeTextSparseMoeBlock, - QEffQwen3VLMoeVisionAttention, - QEffQwen3VLMoeVisionModel, -) + +if version.parse(transformers.__version__) >= version.parse("4.57.0"): + from QEfficient.transformers.models.qwen3_vl.modeling_qwen3_vl import ( + 
QEffQwen3VLForConditionalGeneration, + QEffQwen3VLModel, + QEffQwen3VLTextAttention, + QEffQwen3VLTextDecoderLayer, + QEffQwen3VLTextModel, + QEffQwen3VLVisionAttention, + QEffQwen3VLVisionModel, + ) + from QEfficient.transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( + QEffPrefillChunkedQwen3VLMoeTextSparseMoeBlock, + QEffQwen3VLMoeForConditionalGeneration, + QEffQwen3VLMoeModel, + QEffQwen3VLMoeTextAttention, + QEffQwen3VLMoeTextDecoderLayer, + QEffQwen3VLMoeTextModel, + QEffQwen3VLMoeTextSparseMoeBlock, + QEffQwen3VLMoeVisionAttention, + QEffQwen3VLMoeVisionModel, + ) from QEfficient.transformers.models.starcoder2.modeling_starcoder2 import ( QEffStarcoder2Attention, QEFFStarcoder2DecoderLayer, @@ -501,217 +507,226 @@ class CustomOpsTransform(ModuleMappingTransform): - _module_mapping = { - GemmaRMSNorm: GemmaCustomRMSNormAIC, - Gemma2RMSNorm: GemmaCustomRMSNormAIC, - GptOssRMSNorm: CustomRMSNormAIC, - LlamaRMSNorm: CustomRMSNormAIC, - Llama4TextRMSNorm: CustomRMSNormAIC, - MistralRMSNorm: CustomRMSNormAIC, - Mistral3RMSNorm: CustomRMSNormAIC, - MixtralRMSNorm: CustomRMSNormAIC, - Phi3RMSNorm: CustomRMSNormAIC, - Qwen2RMSNorm: CustomRMSNormAIC, - Qwen3RMSNorm: CustomRMSNormAIC, - Qwen2_5RMSNorm: CustomRMSNormAIC, - MllamaTextRMSNorm: CustomRMSNormAIC, - GraniteRMSNorm: CustomRMSNormAIC, - PixtralRMSNorm: CustomRMSNormAIC, - GraniteMoeRMSNorm: CustomRMSNormAIC, - Qwen3MoeRMSNorm: CustomRMSNormAIC, - Gemma3RMSNorm: QEffGemma3CustomRMSNormAIC, - Olmo2RMSNorm: CustomRMSNormAIC, - Qwen3VLMoeTextRMSNorm: CustomRMSNormAIC, - Qwen3VLTextRMSNorm: CustomRMSNormAIC, - } + if version.parse(transformers.__version__) >= version.parse("4.57.0"): + _module_mapping = { + Qwen3VLMoeTextRMSNorm: CustomRMSNormAIC, + Qwen3VLTextRMSNorm: CustomRMSNormAIC, + } + else: + _module_mapping = { + GemmaRMSNorm: GemmaCustomRMSNormAIC, + Gemma2RMSNorm: GemmaCustomRMSNormAIC, + GptOssRMSNorm: CustomRMSNormAIC, + LlamaRMSNorm: CustomRMSNormAIC, + Llama4TextRMSNorm: 
CustomRMSNormAIC, + MistralRMSNorm: CustomRMSNormAIC, + Mistral3RMSNorm: CustomRMSNormAIC, + MixtralRMSNorm: CustomRMSNormAIC, + Phi3RMSNorm: CustomRMSNormAIC, + Qwen2RMSNorm: CustomRMSNormAIC, + Qwen3RMSNorm: CustomRMSNormAIC, + Qwen2_5RMSNorm: CustomRMSNormAIC, + MllamaTextRMSNorm: CustomRMSNormAIC, + GraniteRMSNorm: CustomRMSNormAIC, + PixtralRMSNorm: CustomRMSNormAIC, + GraniteMoeRMSNorm: CustomRMSNormAIC, + Qwen3MoeRMSNorm: CustomRMSNormAIC, + Gemma3RMSNorm: QEffGemma3CustomRMSNormAIC, + Olmo2RMSNorm: CustomRMSNormAIC, + } class KVCacheTransform(ModuleMappingTransform): - _module_mapping = { - # CodeGen - CodeGenAttention: QEffCodeGenAttention, - CodeGenBlock: QEffCodeGenBlock, - CodeGenModel: QEffCodeGenModel, - CodeGenForCausalLM: QEffCodeGenForCausalLM, - # Falcon - FalconAttention: QEffFalconAttention, - FalconDecoderLayer: QEffFalconDecoderLayer, - FalconModel: QEffFalconModel, - FalconForCausalLM: QEffFalconForCausalLM, - # GPT2 - GPT2Attention: QEffGPT2Attention, - GPT2Block: QEffGPT2Block, - GPT2Model: QEffGPT2Model, - GPT2LMHeadModel: QEffGPT2LMHeadModel, - # GPTJ - GPTJAttention: QEffGPTJAttention, - GPTJBlock: QEffGPTJBlock, - GPTJModel: QEffGPTJModel, - GPTJForCausalLM: QEffGPTJForCausalLM, - # Llama - LlamaAttention: QEffLlamaAttention, - LlamaDecoderLayer: QEffLlamaDecoderLayer, - LlamaModel: QEffLlamaModel, - LlamaForCausalLM: QEffLlamaForCausalLM, - LlamaRotaryEmbedding: QEffLlamaRotaryEmbedding, - # Llama4 - Llama4TextAttention: QEffLlama4TextAttention, - Llama4ForCausalLM: QEffLlama4ForCausalLM, - Llama4TextDecoderLayer: QEffLlama4TextDecoderLayer, - Llama4TextModel: QEffLlama4TextModel, - Llama4TextMoe: QEffLlama4TextMoe, - Llama4ForConditionalGeneration: QEffLlama4ForConditionalGeneration, - Llama4VisionAttention: QEffLlama4VisionAttention, - Llama4VisionModel: QEffLlama4VisionModel, - Llama4TextExperts: QEffLlama4TextExperts, - Llama4Router: QEffLlama4Router, - # Llava - LlavaForConditionalGeneration: QEffLlavaForConditionalGeneration, - # 
Llava Next - LlavaNextForConditionalGeneration: QEffLlavaNextForConditionalGeneration, - # Gemma - GemmaAttention: QEffGemmaAttention, - GemmaDecoderLayer: QEffGemmaDecoderLayer, - GemmaModel: QEffGemmaModel, - GemmaForCausalLM: QEffGemmaForCausalLM, - # Qwen3Moe - Qwen3MoeForCausalLM: QEffQwen3MoeForCausalLM, - Qwen3MoeModel: QEffQwen3MoeModel, - Qwen3MoeDecoderLayer: QEffQwen3MoeDecoderLayer, - Qwen3MoeAttention: QEffQwen3MoeAttention, - Qwen3MoeRotaryEmbedding: QEffQwen3MoeRotaryEmbedding, - Qwen3MoeSparseMoeBlock: QEffQwen3MoeSparseMoeBlock, - Qwen3VLMoeForConditionalGeneration: QEffQwen3VLMoeForConditionalGeneration, - Qwen3VLMoeModel: QEffQwen3VLMoeModel, - Qwen3VLMoeTextAttention: QEffQwen3VLMoeTextAttention, - Qwen3VLMoeTextDecoderLayer: QEffQwen3VLMoeTextDecoderLayer, - Qwen3VLMoeVisionAttention: QEffQwen3VLMoeVisionAttention, - Qwen3VLMoeVisionModel: QEffQwen3VLMoeVisionModel, - Qwen3VLMoeTextModel: QEffQwen3VLMoeTextModel, - Qwen3VLMoeTextSparseMoeBlock: QEffQwen3VLMoeTextSparseMoeBlock, - # Qwen3vl - Qwen3VLForConditionalGeneration: QEffQwen3VLForConditionalGeneration, - Qwen3VLModel: QEffQwen3VLModel, - Qwen3VLTextAttention: QEffQwen3VLTextAttention, - Qwen3VLTextDecoderLayer: QEffQwen3VLTextDecoderLayer, - Qwen3VLVisionAttention: QEffQwen3VLVisionAttention, - Qwen3VLVisionModel: QEffQwen3VLVisionModel, - Qwen3VLTextModel: QEffQwen3VLTextModel, - # Gemma2 - Gemma2Attention: QEffGemma2Attention, - Gemma2DecoderLayer: QEffGemma2DecoderLayer, - Gemma2Model: QEffGemma2Model, - Gemma2ForCausalLM: QEffGemma2ForCausalLM, - # Gemma3 - Gemma3Attention: QEffGemma3Attention, - Gemma3DecoderLayer: QEffGemma3DecoderLayer, - Gemma3TextModel: QEffGemma3TextModel, - Gemma3ForCausalLM: QEffGemma3ForCausalLMModel, - Gemma3ForConditionalGeneration: QEffGemma3ForConditionalGeneration, - # GPT_OSS - GptOssAttention: QEffGptOssAttention, - GptOssDecoderLayer: QEffGptOssDecoderLayer, - GptOssModel: QEffGptOssModel, - GptOssForCausalLM: QEffGptOssForCausalLM, - GptOssMLP: 
QEffGptOssMLP, - GptOssExperts: QEffGptOssExperts, - # Granite - GraniteModel: QEffGraniteModel, - GraniteForCausalLM: QEffGraniteForCausalLM, - GraniteAttention: QEffGraniteAttention, - GraniteDecoderLayer: QEffGraniteDecoderLayer, - # GraniteMoe - GraniteMoeModel: QEffGraniteMoeModel, - GraniteMoeForCausalLM: QEffGraniteMoeForCausalLM, - GraniteMoeAttention: QEffGraniteMoeAttention, - GraniteMoeRotaryEmbedding: QEffGraniteMoeRotaryEmbedding, - GraniteMoeParallelExperts: QEffGraniteMoeParallelExperts, - GraniteMoeTopKGating: QEffGraniteMoeTopKGating, - GraniteMoeMoE: QEffGraniteMoeMoE, - # mllama - MllamaTextRMSNorm: CustomRMSNormAIC, - MllamaTextSelfAttention: QEffMllamaTextSelfAttention, - MllamaSelfAttentionDecoderLayer: QEffMllamaSelfAttentionDecoderLayer, - MllamaModel: QEffMllamaModel, - MllamaCrossAttentionDecoderLayer: QEffMllamaCrossAttentionDecoderLayer, - MllamaRotaryEmbedding: QEffMllamaRotaryEmbedding, - MllamaVisionModel: QEffMllamaVisionModel, - MllamaTextModel: QEffMllamaTextModel, - MllamaForCausalLM: QEffMllamaForCausalLM, - MllamaForConditionalGeneration: QEffMllamaForConditionalGeneration, - # Mistral - MistralAttention: QEffMistralAttention, - MistralDecoderLayer: QEffMistralDecoderLayer, - MistralModel: QEffMistralModel, - MistralForCausalLM: QEffMistralForCausalLM, - # Mistral3 - Mistral3ForConditionalGeneration: QEffMistral3ForConditionalGeneration, - Mistral3Model: QEffMistral3Model, - # Mixtral - MixtralAttention: QEffMixtralAttention, - MixtralSparseMoeBlock: QEffMixtralSparseMoeBlock, - MixtralDecoderLayer: QeffMixtralDecoderLayer, - MixtralModel: QEffMixtralModel, - MixtralForCausalLM: QEffMixtralForCausalLM, - # Mpt - MptAttention: QEffMptAttention, - MptBlock: QEffMptBlock, - MptModel: QEFfMptModel, - MptForCausalLM: QEffMptForCausalLM, - # Phi3 - Phi3Attention: QEffPhi3Attention, - Phi3DecoderLayer: QEffPhi3DecoderLayer, - Phi3Model: QEffPhi3Model, - Phi3ForCausalLM: QEffPhi3ForCausalLM, - # Phi - PhiAttention: QEffPhiAttention, - 
PhiDecoderLayer: QEffPhiDecoderLayer, - PhiModel: QEffPhiModel, - PhiForCausalLM: QEffPhiForCausalLM, - # Pixtral - PixtralVisionModel: QEffPixtralVisionModel, - # Qwen2 - Qwen2Attention: QEffQwen2Attention, - Qwen2DecoderLayer: QEffQwen2DecoderLayer, - Qwen2Model: QEffQwen2Model, - Qwen2ForCausalLM: QEffQwen2ForCausalLM, - # Qwen3 - Qwen3Attention: QEffQwen3Attention, - Qwen3DecoderLayer: QEffQwen3DecoderLayer, - Qwen3Model: QEffQwen3Model, - Qwen3ForCausalLM: QEffQwen3ForCausalLM, - # Qwen2.5 VL - Qwen2_5_VLForConditionalGeneration: QEffQwen_2_5_vl_ForConditionalGeneration, - Qwen2_5_VLModel: QEffQwen2_5_VLModel, - Qwen2_5_VLAttention: QEffQwen2_5_VLAttention, - Qwen2_5_VLDecoderLayer: QEffQwen2_5_VLDecoderLayer, - Qwen2_5_VisionTransformerPretrainedModel: QEffQwen2_5_VisionTransformerPretrainedModel, - Qwen2_5_VLVisionAttention: QEffQwen2_5_VLVisionAttention, - Qwen2_5_VLTextModel: QEffQwen2_5_VLTextModel, - # Starcoder2 - Starcoder2Attention: QEffStarcoder2Attention, - Starcoder2DecoderLayer: QEFFStarcoder2DecoderLayer, - Starcoder2Model: QEffStarcoder2Model, - Starcoder2ForCausalLM: QEffStarcoder2ForCausalLM, - # GptBigcode - GPTBigCodeAttention: QEffGPTBigCodeAttention, - GPTBigCodeBlock: QEffGPTBigCodeBlock, - GPTBigCodeModel: QEffGPTBigCodeModel, - GPTBigCodeForCausalLM: QEffGPTBigCodeForCausalLM, - # Olmo2 - Olmo2Attention: QEffOlmo2Attention, - Olmo2DecoderLayer: QEffOlmo2DecoderLayer, - Olmo2Model: QEffOlmo2Model, - Olmo2ForCausalLM: QEffOlmo2ForCausalLM, - # Whisper encoder and decoder layers - WhisperPositionalEmbedding: QEffWhisperPositionalEmbedding, - WhisperAttention: QEffWhisperAttention, - WhisperDecoderLayer: QEffWhisperDecoderLayer, - WhisperEncoder: QEffWhisperEncoder, - WhisperDecoder: QEffWhisperDecoder, - WhisperModel: QEffWhisperModel, - WhisperForConditionalGeneration: QEffWhisperForConditionalGeneration, - } + if version.parse(transformers.__version__) >= version.parse("4.57.0"): + _module_mapping = { + # Qwen3VlMoe + 
Qwen3VLMoeForConditionalGeneration: QEffQwen3VLMoeForConditionalGeneration, + Qwen3VLMoeModel: QEffQwen3VLMoeModel, + Qwen3VLMoeTextAttention: QEffQwen3VLMoeTextAttention, + Qwen3VLMoeTextDecoderLayer: QEffQwen3VLMoeTextDecoderLayer, + Qwen3VLMoeVisionAttention: QEffQwen3VLMoeVisionAttention, + Qwen3VLMoeVisionModel: QEffQwen3VLMoeVisionModel, + Qwen3VLMoeTextModel: QEffQwen3VLMoeTextModel, + Qwen3VLMoeTextSparseMoeBlock: QEffQwen3VLMoeTextSparseMoeBlock, + # Qwen3vl + Qwen3VLForConditionalGeneration: QEffQwen3VLForConditionalGeneration, + Qwen3VLModel: QEffQwen3VLModel, + Qwen3VLTextAttention: QEffQwen3VLTextAttention, + Qwen3VLTextDecoderLayer: QEffQwen3VLTextDecoderLayer, + Qwen3VLVisionAttention: QEffQwen3VLVisionAttention, + Qwen3VLVisionModel: QEffQwen3VLVisionModel, + Qwen3VLTextModel: QEffQwen3VLTextModel, + } + else: + _module_mapping = { + # CodeGen + CodeGenAttention: QEffCodeGenAttention, + CodeGenBlock: QEffCodeGenBlock, + CodeGenModel: QEffCodeGenModel, + CodeGenForCausalLM: QEffCodeGenForCausalLM, + # Falcon + FalconAttention: QEffFalconAttention, + FalconDecoderLayer: QEffFalconDecoderLayer, + FalconModel: QEffFalconModel, + FalconForCausalLM: QEffFalconForCausalLM, + # GPT2 + GPT2Attention: QEffGPT2Attention, + GPT2Block: QEffGPT2Block, + GPT2Model: QEffGPT2Model, + GPT2LMHeadModel: QEffGPT2LMHeadModel, + # GPTJ + GPTJAttention: QEffGPTJAttention, + GPTJBlock: QEffGPTJBlock, + GPTJModel: QEffGPTJModel, + GPTJForCausalLM: QEffGPTJForCausalLM, + # Llama + LlamaAttention: QEffLlamaAttention, + LlamaDecoderLayer: QEffLlamaDecoderLayer, + LlamaModel: QEffLlamaModel, + LlamaForCausalLM: QEffLlamaForCausalLM, + LlamaRotaryEmbedding: QEffLlamaRotaryEmbedding, + # Llama4 + Llama4TextAttention: QEffLlama4TextAttention, + Llama4ForCausalLM: QEffLlama4ForCausalLM, + Llama4TextDecoderLayer: QEffLlama4TextDecoderLayer, + Llama4TextModel: QEffLlama4TextModel, + Llama4TextMoe: QEffLlama4TextMoe, + Llama4ForConditionalGeneration: QEffLlama4ForConditionalGeneration, 
+ Llama4VisionAttention: QEffLlama4VisionAttention, + Llama4VisionModel: QEffLlama4VisionModel, + Llama4TextExperts: QEffLlama4TextExperts, + Llama4Router: QEffLlama4Router, + # Llava + LlavaForConditionalGeneration: QEffLlavaForConditionalGeneration, + # Llava Next + LlavaNextForConditionalGeneration: QEffLlavaNextForConditionalGeneration, + # Gemma + GemmaAttention: QEffGemmaAttention, + GemmaDecoderLayer: QEffGemmaDecoderLayer, + GemmaModel: QEffGemmaModel, + GemmaForCausalLM: QEffGemmaForCausalLM, + # Qwen3Moe + Qwen3MoeForCausalLM: QEffQwen3MoeForCausalLM, + Qwen3MoeModel: QEffQwen3MoeModel, + Qwen3MoeDecoderLayer: QEffQwen3MoeDecoderLayer, + Qwen3MoeAttention: QEffQwen3MoeAttention, + Qwen3MoeRotaryEmbedding: QEffQwen3MoeRotaryEmbedding, + Qwen3MoeSparseMoeBlock: QEffQwen3MoeSparseMoeBlock, + # Gemma2 + Gemma2Attention: QEffGemma2Attention, + Gemma2DecoderLayer: QEffGemma2DecoderLayer, + Gemma2Model: QEffGemma2Model, + Gemma2ForCausalLM: QEffGemma2ForCausalLM, + # Gemma3 + Gemma3Attention: QEffGemma3Attention, + Gemma3DecoderLayer: QEffGemma3DecoderLayer, + Gemma3TextModel: QEffGemma3TextModel, + Gemma3ForCausalLM: QEffGemma3ForCausalLMModel, + Gemma3ForConditionalGeneration: QEffGemma3ForConditionalGeneration, + # GPT_OSS + GptOssAttention: QEffGptOssAttention, + GptOssDecoderLayer: QEffGptOssDecoderLayer, + GptOssModel: QEffGptOssModel, + GptOssForCausalLM: QEffGptOssForCausalLM, + GptOssMLP: QEffGptOssMLP, + GptOssExperts: QEffGptOssExperts, + # Granite + GraniteModel: QEffGraniteModel, + GraniteForCausalLM: QEffGraniteForCausalLM, + GraniteAttention: QEffGraniteAttention, + GraniteDecoderLayer: QEffGraniteDecoderLayer, + # GraniteMoe + GraniteMoeModel: QEffGraniteMoeModel, + GraniteMoeForCausalLM: QEffGraniteMoeForCausalLM, + GraniteMoeAttention: QEffGraniteMoeAttention, + GraniteMoeRotaryEmbedding: QEffGraniteMoeRotaryEmbedding, + GraniteMoeParallelExperts: QEffGraniteMoeParallelExperts, + GraniteMoeTopKGating: QEffGraniteMoeTopKGating, + GraniteMoeMoE: 
QEffGraniteMoeMoE, + # mllama + MllamaTextRMSNorm: CustomRMSNormAIC, + MllamaTextSelfAttention: QEffMllamaTextSelfAttention, + MllamaSelfAttentionDecoderLayer: QEffMllamaSelfAttentionDecoderLayer, + MllamaModel: QEffMllamaModel, + MllamaCrossAttentionDecoderLayer: QEffMllamaCrossAttentionDecoderLayer, + MllamaRotaryEmbedding: QEffMllamaRotaryEmbedding, + MllamaVisionModel: QEffMllamaVisionModel, + MllamaTextModel: QEffMllamaTextModel, + MllamaForCausalLM: QEffMllamaForCausalLM, + MllamaForConditionalGeneration: QEffMllamaForConditionalGeneration, + # Mistral + MistralAttention: QEffMistralAttention, + MistralDecoderLayer: QEffMistralDecoderLayer, + MistralModel: QEffMistralModel, + MistralForCausalLM: QEffMistralForCausalLM, + # Mistral3 + Mistral3ForConditionalGeneration: QEffMistral3ForConditionalGeneration, + Mistral3Model: QEffMistral3Model, + # Mixtral + MixtralAttention: QEffMixtralAttention, + MixtralSparseMoeBlock: QEffMixtralSparseMoeBlock, + MixtralDecoderLayer: QeffMixtralDecoderLayer, + MixtralModel: QEffMixtralModel, + MixtralForCausalLM: QEffMixtralForCausalLM, + # Mpt + MptAttention: QEffMptAttention, + MptBlock: QEffMptBlock, + MptModel: QEFfMptModel, + MptForCausalLM: QEffMptForCausalLM, + # Phi3 + Phi3Attention: QEffPhi3Attention, + Phi3DecoderLayer: QEffPhi3DecoderLayer, + Phi3Model: QEffPhi3Model, + Phi3ForCausalLM: QEffPhi3ForCausalLM, + # Phi + PhiAttention: QEffPhiAttention, + PhiDecoderLayer: QEffPhiDecoderLayer, + PhiModel: QEffPhiModel, + PhiForCausalLM: QEffPhiForCausalLM, + # Pixtral + PixtralVisionModel: QEffPixtralVisionModel, + # Qwen2 + Qwen2Attention: QEffQwen2Attention, + Qwen2DecoderLayer: QEffQwen2DecoderLayer, + Qwen2Model: QEffQwen2Model, + Qwen2ForCausalLM: QEffQwen2ForCausalLM, + # Qwen3 + Qwen3Attention: QEffQwen3Attention, + Qwen3DecoderLayer: QEffQwen3DecoderLayer, + Qwen3Model: QEffQwen3Model, + Qwen3ForCausalLM: QEffQwen3ForCausalLM, + # Qwen2.5 VL + Qwen2_5_VLForConditionalGeneration: 
QEffQwen_2_5_vl_ForConditionalGeneration, + Qwen2_5_VLModel: QEffQwen2_5_VLModel, + Qwen2_5_VLAttention: QEffQwen2_5_VLAttention, + Qwen2_5_VLDecoderLayer: QEffQwen2_5_VLDecoderLayer, + Qwen2_5_VisionTransformerPretrainedModel: QEffQwen2_5_VisionTransformerPretrainedModel, + Qwen2_5_VLVisionAttention: QEffQwen2_5_VLVisionAttention, + Qwen2_5_VLTextModel: QEffQwen2_5_VLTextModel, + # Starcoder2 + Starcoder2Attention: QEffStarcoder2Attention, + Starcoder2DecoderLayer: QEFFStarcoder2DecoderLayer, + Starcoder2Model: QEffStarcoder2Model, + Starcoder2ForCausalLM: QEffStarcoder2ForCausalLM, + # GptBigcode + GPTBigCodeAttention: QEffGPTBigCodeAttention, + GPTBigCodeBlock: QEffGPTBigCodeBlock, + GPTBigCodeModel: QEffGPTBigCodeModel, + GPTBigCodeForCausalLM: QEffGPTBigCodeForCausalLM, + # Olmo2 + Olmo2Attention: QEffOlmo2Attention, + Olmo2DecoderLayer: QEffOlmo2DecoderLayer, + Olmo2Model: QEffOlmo2Model, + Olmo2ForCausalLM: QEffOlmo2ForCausalLM, + # Whisper encoder and decoder layers + WhisperPositionalEmbedding: QEffWhisperPositionalEmbedding, + WhisperAttention: QEffWhisperAttention, + WhisperDecoderLayer: QEffWhisperDecoderLayer, + WhisperEncoder: QEffWhisperEncoder, + WhisperDecoder: QEffWhisperDecoder, + WhisperModel: QEffWhisperModel, + WhisperForConditionalGeneration: QEffWhisperForConditionalGeneration, + } @classmethod def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: @@ -728,16 +743,20 @@ class PrefillOnlyTransform(ModuleMappingTransform): class PrefillOnlyChunkedTransform(ModuleMappingTransform): - _module_mapping = { - # GPT_OSS - QEffGptOssModel: QEffPrefillOnlyGptOssModel, - QEffGptOssAttention: QEffPrefillOnlyChunkedGptOssAttention, - QEffGptOssMLP: QEffPrefillOnlyChunkedGptOssMLP, - # Qwen3Moe - QEffQwen3MoeSparseMoeBlock: QEffPrefillChunkedQwen3MoeSparseMoeBlock, - # Qwen3 VL Moe - QEffQwen3VLMoeTextSparseMoeBlock: QEffPrefillChunkedQwen3VLMoeTextSparseMoeBlock, - } + if version.parse(transformers.__version__) >= version.parse("4.57.0"): + 
_module_mapping = { + # Qwen3 VL Moe + QEffQwen3VLMoeTextSparseMoeBlock: QEffPrefillChunkedQwen3VLMoeTextSparseMoeBlock, + } + else: + _module_mapping = { + # GPT_OSS + QEffGptOssModel: QEffPrefillOnlyGptOssModel, + QEffGptOssAttention: QEffPrefillOnlyChunkedGptOssAttention, + QEffGptOssMLP: QEffPrefillOnlyChunkedGptOssMLP, + # Qwen3Moe + QEffQwen3MoeSparseMoeBlock: QEffPrefillChunkedQwen3MoeSparseMoeBlock, + } class RevertPrefillKeepAttentionTransform(ModuleMappingTransform): diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 070856c6e..2dbd42b7b 100644 --- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -10,23 +10,27 @@ import torch import torch.nn as nn import torch.nn.functional as F +import transformers +from packaging import version from transformers.cache_utils import Cache from transformers.modeling_outputs import ( BaseModelOutputWithPast, ) -from transformers.models.qwen3_vl.modeling_qwen3_vl import ( - Qwen3VLForConditionalGeneration, - Qwen3VLModel, - Qwen3VLModelOutputWithPast, - Qwen3VLTextAttention, - Qwen3VLTextDecoderLayer, - Qwen3VLTextModel, - Qwen3VLVisionAttention, - Qwen3VLVisionModel, - apply_rotary_pos_emb_vision, - repeat_kv, - rotate_half, -) + +if version.parse(transformers.__version__) >= version.parse("4.57.0"): + from transformers.models.qwen3_vl.modeling_qwen3_vl import ( + Qwen3VLForConditionalGeneration, + Qwen3VLModel, + Qwen3VLModelOutputWithPast, + Qwen3VLTextAttention, + Qwen3VLTextDecoderLayer, + Qwen3VLTextModel, + Qwen3VLVisionAttention, + Qwen3VLVisionModel, + apply_rotary_pos_emb_vision, + repeat_kv, + rotate_half, + ) from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask @@ -844,10 +848,10 @@ def get_specializations( comp_ctx_lengths_prefill = 
compiler_options.pop("comp_ctx_lengths_prefill", None) comp_ctx_lengths_decode = compiler_options.pop("comp_ctx_lengths_decode", None) if height is None or width is None: - height = 1365 - width = 2048 + height = constants.QWEN3_VL_HEIGHT + width = constants.QWEN3_VL_WIDTH logger.warning( - "Setting height and width to be 1365 and 2048 respectively, as it was neither passed nor found in vision_config" + f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config" ) prefill_seq_len = prefill_seq_len if prefill_seq_len else 128 ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN diff --git a/QEfficient/transformers/quantizers/quant_transforms.py b/QEfficient/transformers/quantizers/quant_transforms.py index f97bfe998..9870ced5e 100644 --- a/QEfficient/transformers/quantizers/quant_transforms.py +++ b/QEfficient/transformers/quantizers/quant_transforms.py @@ -6,10 +6,14 @@ # ----------------------------------------------------------------------------- import torch +import transformers +from packaging import version from torch import nn from transformers import AutoConfig from transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts -from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts + +if version.parse(transformers.__version__) >= version.parse("4.57.0"): + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts from QEfficient.base.pytorch_transforms import ModuleMutatorTransform from QEfficient.customop.matmulnbits import QuantLinearORT diff --git a/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py b/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py index 382677bcf..94cf6ba98 100644 --- a/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py +++ b/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py @@ -10,7 +10,11 @@ from typing import List import torch -from 
transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts +import transformers +from packaging import version + +if version.parse(transformers.__version__) >= version.parse("4.57.0"): + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts from transformers.quantizers.quantizer_compressed_tensors import CompressedTensorsHfQuantizer from transformers.utils.quantization_config import CompressedTensorsConfig, QuantizationConfigMixin, QuantizationMethod diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 7e6dd1cbb..af1fe5c03 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -143,6 +143,10 @@ def get_models_dir(): QWEN2_5_VL_HEIGHT = 354 QWEN2_5_VL_WIDTH = 536 +# Qwen3_vl Constants +QWEN3_VL_HEIGHT = 354 +QWEN3_VL_WIDTH = 536 + # Modules to cache while clearing the pytorch weights CACHE_MODULES = ["get_output_names", "get_dummy_inputs", "get_onnx_dynamic_axes", "get_specializations"] diff --git a/tests/configs/image_text_model_configs.json b/tests/configs/image_text_model_configs.json index e5a3f9503..ef5b775f2 100644 --- a/tests/configs/image_text_model_configs.json +++ b/tests/configs/image_text_model_configs.json @@ -105,6 +105,48 @@ "full_batch_size": 2, "additional_params": {} }, + { + "model_name": "Qwen/Qwen3-VL-2B-Instruct", + "model_type": "qwen3_vl", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 4096, + "img_size": 1540, + "img_url": "https://picsum.photos/id/237/536/354", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "img_url_list":[ + "https://picsum.photos/id/237/536/354", + "https://picsum.photos/id/237/536/354" + ], + "text_prompt_list": [ + "Can you describe the image in detail?", + "What are the objects in the image?" 
+ ], + "full_batch_size": 2, + "additional_params": {} + }, + { + "model_name": "Qwen/Qwen3-VL-30B-A3B-Instruct", + "model_type": "qwen3_vl_moe", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 4096, + "img_size": 1540, + "img_url": "https://picsum.photos/id/237/536/354", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "img_url_list":[ + "https://picsum.photos/id/237/536/354", + "https://picsum.photos/id/237/536/354" + ], + "text_prompt_list": [ + "Can you describe the image in detail?", + "What are the objects in the image?" + ], + "full_batch_size": 2, + "additional_params": {} + }, { "model_name": "allenai/Molmo-7B-D-0924", "model_type": "molmo", diff --git a/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py b/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py index a2c72ba7a..18dc92548 100644 --- a/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py +++ b/tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py @@ -13,6 +13,8 @@ import pytest import requests import torch +import transformers +from packaging import version from PIL import Image from transformers import ( AutoConfig, @@ -321,10 +323,25 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( if not is_intern_model and not is_molmo_model: inputs = processor(images=image, text=prompt, return_tensors="pt") - if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": + if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type in ["qwen2_5_vl"]: inputs = qeff_model.model.prepare_inputs_for_generation( inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size ) + if ( + hasattr(qeff_model.model.config, "model_type") + and version.parse(transformers.__version__) >= version.parse("4.57.0") + and qeff_model.model.config.model_type in ["qwen3_vl", "qwen3_vl_moe"] + ): + 
inputs = qeff_model.model.prepare_inputs_for_generation( + inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size + ) + if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type in [ + "qwen3_vl", + "qwen3_vl_moe", + ]: + config.vision_config.depth = 9 + config.text_config.num_hidden_layers = 1 + config.vision_config.deepstack_visual_indexes = [8] if "pixel_values" in inputs: inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) @@ -352,7 +369,14 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload ]: pytest.skip("Test skipped for this model due to some issues.") if ( - model_name in ["OpenGVLab/InternVL2_5-1B", "OpenGVLab/InternVL3_5-1B", "Qwen/Qwen2.5-VL-3B-Instruct"] + model_name + in [ + "OpenGVLab/InternVL2_5-1B", + "OpenGVLab/InternVL3_5-1B", + "Qwen/Qwen2.5-VL-3B-Instruct", + "Qwen/Qwen3-VL-2B-Instruct", + "Qwen/Qwen3-VL-30B-A3B-Instruct", + ] and not kv_offload ): pytest.skip("These models require kv_offload=True for testing.")