Skip to content

Commit 35bee03

Browse files
authored
graph : remove redundant scale_w parameter (ggml-org#20235)
1 parent 451ef08 commit 35bee03

41 files changed

Lines changed: 85 additions & 86 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

src/llama-graph.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,7 +1151,6 @@ ggml_tensor * llm_graph_context::build_ffn(
11511151
return cur;
11521152
}
11531153

1154-
// TODO remove redundant scale_w argument
11551154
ggml_tensor * llm_graph_context::build_moe_ffn(
11561155
ggml_tensor * cur,
11571156
ggml_tensor * gate_inp,
@@ -1163,7 +1162,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
11631162
int64_t n_expert_used,
11641163
llm_ffn_op_type type_op,
11651164
bool norm_w,
1166-
bool scale_w,
11671165
float w_scale,
11681166
llama_expert_gating_func_type gating_op,
11691167
int il,
@@ -1180,7 +1178,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
11801178
n_expert_used,
11811179
type_op,
11821180
norm_w,
1183-
scale_w,
11841181
w_scale,
11851182
gating_op,
11861183
il,
@@ -1204,7 +1201,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
12041201
int64_t n_expert_used,
12051202
llm_ffn_op_type type_op,
12061203
bool norm_w,
1207-
bool scale_w,
12081204
float w_scale,
12091205
llama_expert_gating_func_type gating_op,
12101206
int il,
@@ -1332,7 +1328,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
13321328

13331329
weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
13341330
}
1335-
if (scale_w) {
1331+
if (w_scale != 0.0f && w_scale != 1.0f) {
13361332
weights = ggml_scale(ctx0, weights, w_scale);
13371333
cb(weights, "ffn_moe_weights_scaled", il);
13381334
}

src/llama-graph.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -810,7 +810,6 @@ struct llm_graph_context {
810810
int64_t n_expert_used,
811811
llm_ffn_op_type type_op,
812812
bool norm_w,
813-
bool scale_w,
814813
float w_scale,
815814
llama_expert_gating_func_type gating_op,
816815
int il,
@@ -832,7 +831,6 @@ struct llm_graph_context {
832831
int64_t n_expert_used,
833832
llm_ffn_op_type type_op,
834833
bool norm_w,
835-
bool scale_w,
836834
float w_scale,
837835
llama_expert_gating_func_type gating_op,
838836
int il,

src/llama-model.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1570,6 +1570,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
15701570
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
15711571
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
15721572
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1573+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
15731574

15741575
switch (hparams.n_ff_exp) {
15751576
case 1408: type = LLM_TYPE_16B; break;
@@ -2076,6 +2077,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
20762077
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
20772078
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
20782079
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
2080+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
20792081
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
20802082

20812083
switch (hparams.n_layer) {

src/models/afmoe.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
127127
n_expert, n_expert_used,
128128
LLM_FFN_SILU,
129129
hparams.expert_weights_norm, // norm_w (route_norm=True)
130-
hparams.expert_weights_scale, // scale_w
131130
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
132131
(llama_expert_gating_func_type) hparams.expert_gating_func,
133132
il);

src/models/arctic.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "models.h"
22

3-
43
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
54
const int64_t n_embd_head = hparams.n_embd_head_v;
65

@@ -104,7 +103,7 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
104103
nullptr,
105104
n_expert, n_expert_used,
106105
LLM_FFN_SILU, true,
107-
false, 0.0,
106+
hparams.expert_weights_scale,
108107
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
109108
il);
110109
cb(cur, "ffn_moe_out", il);

src/models/bailingmoe.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "models.h"
22

3-
43
llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
54
ggml_tensor * cur;
65
ggml_tensor * inpL;
@@ -97,7 +96,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
9796
nullptr,
9897
n_expert, n_expert_used,
9998
LLM_FFN_SILU, hparams.expert_weights_norm,
100-
false, hparams.expert_weights_scale,
99+
hparams.expert_weights_scale,
101100
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
102101
il);
103102
cb(moe_out, "ffn_moe_out", il);

src/models/bailingmoe2.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#include "models.h"
22

3-
4-
53
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
64
llm_graph_context(params) {
75
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -90,7 +88,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
9088
model.layers[il].ffn_exp_probs_b,
9189
n_expert, n_expert_used,
9290
LLM_FFN_SILU, hparams.expert_weights_norm,
93-
hparams.expert_weights_scale, hparams.expert_weights_scale,
91+
hparams.expert_weights_scale,
9492
(llama_expert_gating_func_type) hparams.expert_gating_func,
9593
il);
9694
cb(moe_out, "ffn_moe_out", il);

src/models/bert.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#include "models.h"
22

3-
4-
53
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
64
const int64_t n_embd_head = hparams.n_embd_head_v;
75
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -129,9 +127,17 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
129127
// feed-forward network
130128
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
131129
// MoE branch
132-
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr,
133-
model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used,
134-
LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
130+
cur = build_moe_ffn(cur,
131+
model.layers[il].ffn_gate_inp,
132+
model.layers[il].ffn_up_exps,
133+
nullptr,
134+
model.layers[il].ffn_down_exps,
135+
nullptr,
136+
hparams.n_expert, hparams.n_expert_used,
137+
LLM_FFN_GELU, false,
138+
hparams.expert_weights_scale,
139+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
140+
il);
135141
cb(cur, "ffn_moe_out", il);
136142
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
137143
model.arch == LLM_ARCH_JINA_BERT_V3) {

src/models/dbrx.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "models.h"
22

3-
43
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
54
const int64_t n_embd_head = hparams.n_embd_head_v;
65
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -89,7 +88,7 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params
8988
nullptr,
9089
n_expert, n_expert_used,
9190
LLM_FFN_SILU, true,
92-
false, 0.0,
91+
hparams.expert_weights_scale,
9392
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
9493
il);
9594
cb(cur, "ffn_moe_out", il);

src/models/deepseek.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#include "models.h"
22

3-
4-
53
llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
64
llm_graph_context(params) {
75
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -100,7 +98,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
10098
nullptr,
10199
n_expert, n_expert_used,
102100
LLM_FFN_SILU, false,
103-
false, hparams.expert_weights_scale,
101+
hparams.expert_weights_scale,
104102
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
105103
il);
106104
cb(moe_out, "ffn_moe_out", il);

0 commit comments

Comments (0)