From 7524b0f94ee8460e445d20f056955b189d631f58 Mon Sep 17 00:00:00 2001
From: Tsukasa OI
Date: Tue, 9 Dec 2025 04:58:53 +0000
Subject: [PATCH] Clarify the intent of GGUF FusedMoE weight materialization

In the process of FusedMoE weight data materialization from GGUF files,
there is a magic number and some of the intent is not clear enough.

This commit clarifies some of it:

1. GGUF (currently) requires 3D tensor(s) for FusedMoE layer weights,
   as we have to know the full tensor shape (including the number of
   experts) to materialize the parameter.
2. w1 and w3 are merged per expert, i.e. the dimension right after the
   expert dimension is doubled to store both w1 and w3.

... and makes some minor adjustments.

Signed-off-by: Tsukasa OI
---
 vllm/model_executor/layers/fused_moe/layer.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 7f803720d477..eba6ab4cc35f 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1200,10 +1200,14 @@ def weight_loader(
         if full_load:
             shard_dim += 1
 
-        # Materialize GGUF UninitializedParameter
+        # Materialize GGUF UninitializedParameter accounting for merged weights
         if is_gguf_weight and isinstance(param, UninitializedParameter):
+            # To materialize a tensor, we need its full shape, including the
+            # number of experts; hence this path requires `full_load`.
+            assert full_load
             final_shape = list(loaded_weight.shape)
-            if shard_id in ["w1", "w3"]:
+            # w1 and w3 are merged per expert.
+            if shard_id in {"w1", "w3"}:
                 final_shape[1] *= 2
             final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
             param.materialize(final_shape, dtype=loaded_weight.dtype)
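
For reference, the shape logic documented by the hunk above can be exercised in
isolation. The sketch below is illustrative only; the helper name and the
example sizes are hypothetical and not part of vLLM. It mirrors the
materialization math: double dim 1 for the merged w1/w3 weights, then keep only
this rank's shard along `shard_dim`.

# Illustrative sketch (not part of the patch): mirrors the shape math in
# FusedMoE.weight_loader for GGUF weight materialization.
def compute_materialized_shape(loaded_shape, shard_id, shard_dim, tp_size):
    final_shape = list(loaded_shape)
    # w1 and w3 are merged per expert, so the dimension right after the
    # expert dimension is doubled to hold both of them.
    if shard_id in {"w1", "w3"}:
        final_shape[1] *= 2
    # Each tensor-parallel rank only materializes its own shard.
    final_shape[shard_dim] //= tp_size
    return final_shape

# Example: a 3D GGUF w1 tensor (num_experts, intermediate_size, hidden_size)
# with 8 experts, sharded along dim 1 across 4 tensor-parallel ranks
# (shard_dim already shifted by +1 for the leading expert dimension).
print(compute_materialized_shape((8, 2048, 4096), "w1", shard_dim=1, tp_size=4))
# -> [8, 1024, 4096]  (2048 doubled to 4096, then split 4 ways)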