From 7524b0f94ee8460e445d20f056955b189d631f58 Mon Sep 17 00:00:00 2001
From: Tsukasa OI
Date: Tue, 9 Dec 2025 04:58:53 +0000
Subject: [PATCH] Clarify the intent of GGUF FusedMoE weight materialization

In the process of FusedMoE weight data materialization from GGUF files,
there is a magic number and some of the intent is not clear enough.

This commit clarifies some of it:

1. GGUF (currently) requires 3D tensor(s) for FusedMoE layer weights,
   as we have to know the full tensor shape (including the number of
   experts) to materialize the parameter.
2. w1 and w3 are merged per expert, i.e. the dimension right after the
   expert dimension is doubled to store both w1 and w3.

... and makes some minor adjustments.

Signed-off-by: Tsukasa OI
---
 vllm/model_executor/layers/fused_moe/layer.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 7f803720d477..eba6ab4cc35f 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1200,10 +1200,14 @@ def weight_loader(
         if full_load:
             shard_dim += 1
 
-        # Materialize GGUF UninitializedParameter
+        # Materialize GGUF UninitializedParameter accounting for merged weights
         if is_gguf_weight and isinstance(param, UninitializedParameter):
+            # To materialize a tensor, we need its full shape, including the
+            # number of experts; hence this path requires `full_load`.
+            assert full_load
             final_shape = list(loaded_weight.shape)
-            if shard_id in ["w1", "w3"]:
+            # w1 and w3 are merged per expert.
+            if shard_id in {"w1", "w3"}:
                 final_shape[1] *= 2
             final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
             param.materialize(final_shape, dtype=loaded_weight.dtype)
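
For reference, the shape logic documented by the hunk above can be exercised in
isolation. The sketch below is illustrative only; the helper name and the
example sizes are hypothetical and not part of vLLM. It mirrors the
materialization math: double dim 1 for the merged w1/w3 weights, then keep only
this rank's shard along `shard_dim`.

# Illustrative sketch (not part of the patch): mirrors the shape math in
# FusedMoE.weight_loader for GGUF weight materialization.
def compute_materialized_shape(loaded_shape, shard_id, shard_dim, tp_size):
    final_shape = list(loaded_shape)
    # w1 and w3 are merged per expert, so the dimension right after the
    # expert dimension is doubled to hold both of them.
    if shard_id in {"w1", "w3"}:
        final_shape[1] *= 2
    # Each tensor-parallel rank only materializes its own shard.
    final_shape[shard_dim] //= tp_size
    return final_shape

# Example: a 3D GGUF w1 tensor (num_experts, intermediate_size, hidden_size)
# with 8 experts, sharded along dim 1 across 4 tensor-parallel ranks
# (shard_dim already shifted by +1 for the leading expert dimension).
print(compute_materialized_shape((8, 2048, 4096), "w1", shard_dim=1, tp_size=4))
# -> [8, 1024, 4096]  (2048 doubled to 4096, then split 4 ways)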