Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
fd2f295
Onboarding Qwen3VL Dense
qcdipankar Feb 6, 2026
dc61d12
Minor Fix of Output Putting Rotary back to hf rotary
qcdipankar Feb 6, 2026
1ccf45b
Fixed ros_embed and added multi vision config
qcdipankar Feb 7, 2026
240a0b3
Updating the Min Pixel Calculations
qcdipankar Feb 7, 2026
b65399f
Cleaning Code 3
qcdipankar Feb 8, 2026
330444a
Removed breakpoints and commented code only.
quic-dhirajku Feb 11, 2026
b94ae73
Code Cleaning Done 1
qcdipankar Feb 12, 2026
db4d3b3
Modified the qwen3vl multi config example script
qcdipankar Feb 18, 2026
bddbf40
Added Continuous batch script for qwen3vl
qcdipankar Feb 19, 2026
4727b63
Added CB support for Qwen3_VL
qcdipankar Feb 19, 2026
a398371
Code Cleaning Done 1
qcdipankar Feb 19, 2026
7d68604
Qwen3Vl
qcdipankar Feb 20, 2026
953ab34
Add fp8 support (#802)
ochougul Feb 23, 2026
5afc2cf
Merge branch 'qwen3_vl_mainline' into qwen3_vl
quic-dhirajku Feb 25, 2026
2c19750
Fixing the issue of CCL support during the decoding phase of Disaggre…
vjanfaza Feb 25, 2026
037124e
Fixed Granite_moe and added to CI (#771)
quic-akuruvil Feb 26, 2026
a8fec95
removed duplication of `mdp_json_path` in compilation command (#706) …
ochougul Feb 27, 2026
41311ba
[Proxy]: Adding support for exporting proxy Model (#620)
abukhoy Mar 2, 2026
633cb60
Gemma3 NPI File Update (#810)
quic-hemagnih Mar 3, 2026
d267fd7
Updated FT docs (#822)
quic-akuruvil Mar 4, 2026
97b60ff
Daily PR report workflow and email notification system (#824)
quic-rishinr Mar 5, 2026
926da4c
Updated SMPT server (#830)
quic-rishinr Mar 5, 2026
1d2707f
Removed git workflow and email test changes (#836)
quic-rishinr Mar 9, 2026
29b09e6
Upgrade python version from 3.10 to 3.12 (#782)
quic-rishinr Mar 9, 2026
ff8a305
Adding disagg mode support to Qwen3Moe (#682)
qcdipankar Mar 10, 2026
7f8dab4
fix(cloud.infer): reduce Qwen3-MoE export OOM risk (#821)
jd316 Mar 11, 2026
dd494c9
Removed urllib and multidict (#846)
quic-rishinr Mar 13, 2026
71d6245
CPU pytest unit test suite (#852)
quic-rishinr Mar 17, 2026
1fd7a3b
Onboarding Qwen3VL Dense
qcdipankar Feb 6, 2026
cbcae2a
Add fp8 support (#802)
ochougul Feb 23, 2026
c6ef663
Merge branch 'main' into qwen3_vl
qcdipankar Mar 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions QEfficient/generation/embedding_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,21 @@ def prepare_vlm_inputs(self, image_url: str, query: str, prefill_seq_len: int) -
inputs = self._qeff_model.model.prepare_inputs_for_generation(
inputs=inputs, prefill_seq_len=prefill_seq_len, batch_size=inputs["input_ids"].shape[0]
)
if (
hasattr(self._qeff_model.model.config, "model_type")
and self._qeff_model.model.config.model_type == "qwen3_vl"
):
inputs = self._qeff_model.model.prepare_inputs_for_generation(
inputs=inputs, prefill_seq_len=prefill_seq_len, batch_size=inputs["input_ids"].shape[0]
)

if (
hasattr(self._qeff_model.model.config, "model_type")
and self._qeff_model.model.config.model_type == "qwen3_vl_moe"
):
inputs = self._qeff_model.model.prepare_inputs_for_generation(
inputs=inputs, prefill_seq_len=prefill_seq_len, batch_size=inputs["input_ids"].shape[0]
)

# Convert to float32 if needed
if "pixel_values" in inputs:
Expand Down
33 changes: 32 additions & 1 deletion QEfficient/generation/vlm_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ def __init__(
self.is_qwen2_5_vl = (
hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl"
)
self.is_qwen3_vl = (
hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen3_vl"
)
self.qeff_model = qeff_model
self.processor = processor
self.tokenizer = tokenizer
Expand Down Expand Up @@ -259,6 +262,8 @@ def run_prefill_for_all_inputs(self, prompt_queue, generation_len):

if self.is_qwen2_5_vl:
_ = self.update_decode_inputs_qwen2_5_vl(outputs, position_ids, generation_len, decode_batch_id)
if self.is_qwen3_vl:
_ = self.update_decode_inputs_qwen3_vl(outputs, position_ids, generation_len, decode_batch_id)
else:
_ = self.update_decode_input(outputs, position_ids, generation_len, decode_batch_id)

Expand All @@ -283,6 +288,27 @@ def update_decode_inputs_qwen2_5_vl(self, outputs, position_ids, generation_len,
self.generation_len[decode_batch_id or slice(None)] = generation_len
return next_token_id

def update_decode_inputs_qwen3_vl(self, outputs, position_ids, generation_len, decode_batch_id=None):
    """
    Updates the decode inputs with the values generated during Qwen3-VL prefill.

    Args:
        outputs (dict): The outputs of the model.
        position_ids (array): The position IDs for the next decode step
            (mrope-style, leading axis of size 4 — assumed; confirm against caller).
        generation_len (int): The generation length.
        decode_batch_id (optional): The decode batch slot to update. If None,
            all slots are updated. Defaults to None.

    Returns:
        next_token_id (array): The next token ID.
    """
    next_token_id = self._fetch_next_token_id(outputs)

    # BUGFIX: the previous `decode_batch_id or slice(None)` idiom selected ALL
    # batch slots whenever decode_batch_id was falsy (e.g. batch id 0). Test
    # identity against None instead so slot 0 is addressed correctly, and use
    # the same index for decode_pos_ids (indexing with a literal None would
    # insert a new axis and corrupt the position buffer).
    batch_slot = slice(None) if decode_batch_id is None else decode_batch_id

    # Store the generated values.
    self.decode_input_ids[batch_slot] = next_token_id
    self.decode_pos_ids[:, batch_slot] = position_ids.squeeze(1)
    self.generated_ids[batch_slot, 0] = next_token_id.squeeze(1)
    self.generation_len[batch_slot] = generation_len
    return next_token_id

def _execute_chunked_prefill(
self,
lang_inputs: Dict[str, np.ndarray],
Expand Down Expand Up @@ -583,7 +609,8 @@ def _generate_continuous_batching(self, vision_prompts, generation_len, stream,
self.initialize_decode_inputs(num_prompts, execution_batch_size, max_gen_length)
if self.is_qwen2_5_vl:
self.decode_pos_ids = np.zeros((4, execution_batch_size, 1), np.int64)

if self.is_qwen3_vl:
self.decode_pos_ids = np.zeros((4, execution_batch_size, 1), np.int64)
# Create prompt queue
prompt_queue = deque(vision_prompts)

Expand Down Expand Up @@ -696,6 +723,10 @@ def run_prefill_for_all_inputs_with_cached_vision(self, prompt_queue, generation
self.update_decode_inputs_qwen2_5_vl(
outputs, position_ids_decode, generation_len_final, decode_batch_id
)
elif self.is_qwen3_vl:
self.update_decode_inputs_qwen3_vl(
outputs, position_ids_decode, generation_len_final, decode_batch_id
)
else:
self.update_decode_input(outputs, position_ids_decode, generation_len_final, decode_batch_id)
else:
Expand Down
69 changes: 63 additions & 6 deletions QEfficient/transformers/cache_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ def _get_invalid_idx_value(cls):


class QEffDynamicLayer(DynamicLayer):
def lazy_initialization(self, key_states: torch.Tensor):
    """Lazily allocate empty key/value caches matching the dtype and device of the first key tensor seen."""
    self.dtype = key_states.dtype
    self.device = key_states.device
    self.keys = torch.empty(0, dtype=self.dtype, device=self.device)
    self.values = torch.empty(0, dtype=self.dtype, device=self.device)
    self.is_initialized = True

def read_only(self, cache_kwargs):
"""
Reads the `key_states` and `value_states` for the layer.
Expand Down Expand Up @@ -151,6 +157,7 @@ def write_only(self, key_states, value_states, cache_kwargs):
self.keys = key_states
self.values = value_states
else:
# breakpoint()
position_ids = cache_kwargs.get("position_ids")
batch_index = cache_kwargs.get("batch_index", None) # Check and fetch batch index value form the kwargs

Expand Down Expand Up @@ -185,11 +192,15 @@ def update(
Return:
A tuple containing the updated key and value states.
"""
# breakpoint()
# Update the cache
# if not self.is_initialized:

if self.keys is None:
self.keys = key_states
self.values = value_states
k_out, v_out = self.keys, self.values
self.is_initialized = True
else:
position_ids = cache_kwargs.get("position_ids")
batch_index = cache_kwargs.get("batch_index", None) # Check and fetch batch index value form the kwargs
Expand Down Expand Up @@ -306,15 +317,48 @@ class QEffDynamicCache(DynamicCache):

"""

def __init__(self, ddp_cache_data: Optional[Iterable[tuple[torch.Tensor, torch.Tensor]]] = None, *args, **kwargs):
def __init__(
    self,
    ddp_cache_data: Optional[Iterable[tuple[torch.Tensor, torch.Tensor]]] = None,
    config=None,
    offloading: bool = False,
    offload_only_non_sliding: bool = False,
    *args,
    **kwargs,
):
    """
    Initializes a dynamic cache whose layers are QEffDynamicLayer instances.

    Args:
        ddp_cache_data: Optional iterable of (key_states, value_states) pairs;
            one cache layer is created and pre-populated per entry.
        config: Reserved for config-driven layer construction; currently unused.
        offloading: Forwarded to ``Cache.__init__``.
        offload_only_non_sliding: Forwarded to ``Cache.__init__``.
        *args, **kwargs: Accepted for interface compatibility; ``layer_classes``
            and ``layers`` are stripped so they cannot conflict with the
            explicit QEffDynamicLayer configuration below.
    """
    kwargs.pop("layer_classes", None)
    kwargs.pop("layers", None)
    from transformers.cache_utils import Cache  # Import here to avoid circular import

    # Replicate QEffDynamicLayer for every layer. The config-driven layer
    # inference is not implemented yet, so the previous dead `else` branch
    # (guarded by an always-true `len(layers) == 0`) has been removed.
    Cache.__init__(
        self,
        layer_class_to_replicate=QEffDynamicLayer,
        offloading=offloading,
        offload_only_non_sliding=offload_only_non_sliding,
    )

    if ddp_cache_data is not None:
        for layer_idx, (key_states, value_states) in enumerate(ddp_cache_data):
            # BUGFIX: entries were previously appended to a discarded local
            # `layers` list, so ddp_cache_data never populated the cache.
            # Append to self.layers (as upstream DynamicCache does) and seed
            # each layer with its key/value tensors.
            self.layers.append(QEffDynamicLayer())
            _, _ = self.layers[layer_idx].update(key_states, value_states)

def read_only(self, layer_idx, cache_kwargs):
"""
Expand All @@ -329,6 +373,7 @@ def read_only(self, layer_idx, cache_kwargs):
Return:
A tuple containing the updated key and value states.
"""
# breakpoint()
return self.layers[layer_idx].read_only(cache_kwargs)

def read_only_blockedKV(self, start_index, end_index, layer_idx, cache_kwargs):
Expand Down Expand Up @@ -394,6 +439,18 @@ def update3D(
self.append_new_layers(layer_idx)
return self.layers[layer_idx].update3D(key_states, value_states, cache_kwargs)

# def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
# """Returns the sequence length of the cached states. A layer index can be optionally passed."""
# # TODO: deprecate this function in favor of `cache_position`
# breakpoint()
# is_empty_layer = (
# len(self.key_cache) == 0 # no cache in any layer
# or len(self.key_cache) <= layer_idx # skipped `layer_idx` and hasn't run a layer with cache after it
# or len(self.key_cache[layer_idx]) == 0 # the layer has no cache
# )
# layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
# return layer_seq_length


class QEffEncoderDecoderCache(EncoderDecoderCache):
"""
Expand Down Expand Up @@ -987,4 +1044,4 @@ def sliding_window_update_chunked(
v_out = CtxGatherFunc.apply(v_out, ctx_indices, ctx_len)
v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)

return k_out, v_out
return k_out, v_out
47 changes: 41 additions & 6 deletions QEfficient/transformers/models/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
write_io_files,
)
from QEfficient.generation.vlm_generation import VisionLanguageGeneration
from QEfficient.proxy.pytorch_transform import QeffProxyModuleTransform
from QEfficient.transformers.modeling_utils import (
DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH,
SPECIALIZED_DISAGG_SERVING_MODEL_ARCH,
Expand All @@ -64,6 +65,8 @@
from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers
from QEfficient.transformers.quantizers.quant_transforms import (
AwqToMatmulNbitsTransform,
FP8BlockWiseDequantLinearToLinearTransform,
FP8BlockWiseDequantQwen3VLMoeTextExpertsToQwen3VLMoeTextExpertsTransform,
FP8DeQuantLinearToLinearTransform,
GPTQToMatmulNbitsTransform,
Mxfp4GptOssExpertDequantizeTransform,
Expand Down Expand Up @@ -246,6 +249,10 @@ def __init__(self, model: nn.Module, pooling=None, **kwargs):
**kwargs :
Additional keyword arguments passed to the base class constructor.
"""
if kwargs.pop("enable_proxy", False):
self._pytorch_transforms.append(QeffProxyModuleTransform)
logger.info("Proxy Model Enabled for QEfficient Model")

super().__init__(model, **kwargs)

# Make Embedding specific transforms like appending pooling
Expand Down Expand Up @@ -993,6 +1000,8 @@ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
_pytorch_transforms = [
AwqToMatmulNbitsTransform,
GPTQToMatmulNbitsTransform,
FP8BlockWiseDequantQwen3VLMoeTextExpertsToQwen3VLMoeTextExpertsTransform,
FP8BlockWiseDequantLinearToLinearTransform,
CustomOpsTransform,
KVCacheTransform,
VlmKVOffloadTransform,
Expand Down Expand Up @@ -1493,7 +1502,6 @@ def compile(
use_onnx_subfunctions=use_onnx_subfunctions,
**compiler_options,
)

# Custom NPI file options
if hasattr(self.model, "get_npi_file") and "node_precision_info" not in compiler_options:
compiler_options["node_precision_info"] = self.model.get_npi_file(self.model.name_or_path)
Expand Down Expand Up @@ -1657,6 +1665,7 @@ def kv_offload_generate(
AssertionError
If `generation_len` is not greater than zero.
"""
# breakpoint()
if not self.lang_model.qpc_path:
raise TypeError("Please run compile API for language model first!")

Expand Down Expand Up @@ -1688,7 +1697,7 @@ def kv_offload_generate(
[x[lang_session.binding_index_map["input_ids"]][1][1] for x in lang_session.allowed_shapes]
+ [lang_session.bindings[lang_session.binding_index_map["input_ids"]].dims[1]]
)

# breakpoint()
input_len = inputs["attention_mask"].sum(1, keepdims=True)
input_ids_length = inputs["input_ids"].shape[1]
num_chunks = -(input_ids_length // -prefill_seq_len) # ceil divide without float
Expand Down Expand Up @@ -1725,6 +1734,14 @@ def kv_offload_generate(

vision_inputs_fp16 = {"pixel_values", "image_masks"}
vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs})
pixel_values_shape = list(vision_inputs["pixel_values"].shape)
idx = next(i for i, inner in enumerate(vision_session.allowed_shapes) if (2, pixel_values_shape) in inner)

biffer_set = {
"vision_embeds": np.zeros(vision_session.allowed_shapes[idx][2][1], dtype=np.float16),
"image_grid_thw": np.zeros(vision_session.allowed_shapes[idx][0][1], dtype=np.int64),
}
vision_session.set_buffers(biffer_set)

vision_start = perf_counter()

Expand All @@ -1734,7 +1751,7 @@ def kv_offload_generate(
vision_end = perf_counter()

lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs}

# breakpoint()
if "position_ids" in inputs:
lang_inputs["position_ids"] = inputs["position_ids"]
lang_inputs.pop("attention_mask")
Expand All @@ -1746,11 +1763,21 @@ def kv_offload_generate(
not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama"
if not_mllama:
lang_inputs["image_idx"] = np.array([[0]])

# breakpoint()
if self.vision_model.qpc_path:
vision_session.deactivate()
lang_session.activate()

vision_outputs["vision_embeds"] = np.pad(
vision_outputs["vision_embeds"],
pad_width=(
(0, 0),
(0, lang_session.allowed_shapes[0][1][1][1] - vision_session.allowed_shapes[idx][2][1][1]),
(0, 0),
), # pad axis=1 only
mode="constant",
constant_values=0,
)
lang_session.set_buffers(vision_outputs)

if self.comp_ctx_lengths_prefill is not None:
Expand All @@ -1761,7 +1788,7 @@ def kv_offload_generate(
lang_inputs["comp_ctx_lengths"] = list_of_comp_ctx_lengths_prefill[prefill_ccl_id]

lang_start = perf_counter()

# breakpoint()
# Run prefill
chunk_inputs = lang_inputs.copy()
for i in range(num_chunks):
Expand Down Expand Up @@ -1793,7 +1820,7 @@ def kv_offload_generate(
)
if not_mllama:
lang_session.skip_buffers(vision_outputs.keys())

# breakpoint()
# Get first token
lang_inputs["input_ids"] = outputs["logits"].argmax(2)
lang_inputs["position_ids"] = np.max(lang_inputs["position_ids"], axis=-1, keepdims=True) + 1
Expand Down Expand Up @@ -2687,6 +2714,10 @@ def __init__(
raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}")
_configure_proxy_for_model(self, kwargs.pop("enable_proxy", False))

if kwargs.pop("enable_proxy", False):
self._pytorch_transforms.append(QeffProxyModuleTransform)
logger.info("Proxy Model Enabled for QEfficient Model")

# TODO: remove from version 1.20
if kwargs.pop("full_batch_size", None):
continuous_batching = True
Expand Down Expand Up @@ -3937,6 +3968,10 @@ class QEFFAutoModelForCTC(QEFFTransformersBase):
_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]

def __init__(self, model: nn.Module, **kwargs):
if kwargs.pop("enable_proxy", False):
self._pytorch_transforms.append(QeffProxyModuleTransform)
logger.info("Proxy Model Enabled for QEfficient Model")

super().__init__(model, **kwargs)
self.model.base_model.config.use_cache = True

Expand Down
Loading
Loading