Skip to content

Commit 35bee03

Browse files
authored
graph : remove redundant scale_w parameter (ggml-org#20235)
1 parent 451ef08 commit 35bee03

41 files changed

Lines changed: 85 additions & 86 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

src/llama-graph.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,7 +1151,6 @@ ggml_tensor * llm_graph_context::build_ffn(
11511151
return cur;
11521152
}
11531153

1154-
// TODO remove redundant scale_w argument
11551154
ggml_tensor * llm_graph_context::build_moe_ffn(
11561155
ggml_tensor * cur,
11571156
ggml_tensor * gate_inp,
@@ -1163,7 +1162,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
11631162
int64_t n_expert_used,
11641163
llm_ffn_op_type type_op,
11651164
bool norm_w,
1166-
bool scale_w,
11671165
float w_scale,
11681166
llama_expert_gating_func_type gating_op,
11691167
int il,
@@ -1180,7 +1178,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
11801178
n_expert_used,
11811179
type_op,
11821180
norm_w,
1183-
scale_w,
11841181
w_scale,
11851182
gating_op,
11861183
il,
@@ -1204,7 +1201,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
12041201
int64_t n_expert_used,
12051202
llm_ffn_op_type type_op,
12061203
bool norm_w,
1207-
bool scale_w,
12081204
float w_scale,
12091205
llama_expert_gating_func_type gating_op,
12101206
int il,
@@ -1332,7 +1328,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
13321328

13331329
weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
13341330
}
1335-
if (scale_w) {
1331+
if (w_scale != 0.0f && w_scale != 1.0f) {
13361332
weights = ggml_scale(ctx0, weights, w_scale);
13371333
cb(weights, "ffn_moe_weights_scaled", il);
13381334
}

src/llama-graph.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -810,7 +810,6 @@ struct llm_graph_context {
810810
int64_t n_expert_used,
811811
llm_ffn_op_type type_op,
812812
bool norm_w,
813-
bool scale_w,
814813
float w_scale,
815814
llama_expert_gating_func_type gating_op,
816815
int il,
@@ -832,7 +831,6 @@ struct llm_graph_context {
832831
int64_t n_expert_used,
833832
llm_ffn_op_type type_op,
834833
bool norm_w,
835-
bool scale_w,
836834
float w_scale,
837835
llama_expert_gating_func_type gating_op,
838836
int il,

src/llama-model.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1570,6 +1570,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
15701570
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
15711571
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
15721572
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1573+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
15731574

15741575
switch (hparams.n_ff_exp) {
15751576
case 1408: type = LLM_TYPE_16B; break;
@@ -2076,6 +2077,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
20762077
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
20772078
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
20782079
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
2080+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
20792081
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
20802082

20812083
switch (hparams.n_layer) {

src/models/afmoe.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
127127
n_expert, n_expert_used,
128128
LLM_FFN_SILU,
129129
hparams.expert_weights_norm, // norm_w (route_norm=True)
130-
hparams.expert_weights_scale, // scale_w
131130
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
132131
(llama_expert_gating_func_type) hparams.expert_gating_func,
133132
il);

src/models/arctic.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "models.h"
22

3-
43
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
54
const int64_t n_embd_head = hparams.n_embd_head_v;
65

@@ -104,7 +103,7 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
104103
nullptr,
105104
n_expert, n_expert_used,
106105
LLM_FFN_SILU, true,
107-
false, 0.0,
106+
hparams.expert_weights_scale,
108107
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
109108
il);
110109
cb(cur, "ffn_moe_out", il);

src/models/bailingmoe.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "models.h"
22

3-
43
llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
54
ggml_tensor * cur;
65
ggml_tensor * inpL;
@@ -97,7 +96,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
9796
nullptr,
9897
n_expert, n_expert_used,
9998
LLM_FFN_SILU, hparams.expert_weights_norm,
100-
false, hparams.expert_weights_scale,
99+
hparams.expert_weights_scale,
101100
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
102101
il);
103102
cb(moe_out, "ffn_moe_out", il);

src/models/bailingmoe2.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#include "models.h"
22

3-
4-
53
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
64
llm_graph_context(params) {
75
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -90,7 +88,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
9088
model.layers[il].ffn_exp_probs_b,
9189
n_expert, n_expert_used,
9290
LLM_FFN_SILU, hparams.expert_weights_norm,
93-
hparams.expert_weights_scale, hparams.expert_weights_scale,
91+
hparams.expert_weights_scale,
9492
(llama_expert_gating_func_type) hparams.expert_gating_func,
9593
il);
9694
cb(moe_out, "ffn_moe_out", il);

src/models/bert.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#include "models.h"
22

3-
4-
53
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
64
const int64_t n_embd_head = hparams.n_embd_head_v;
75
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -129,9 +127,17 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
129127
// feed-forward network
130128
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
131129
// MoE branch
132-
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr,
133-
model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used,
134-
LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
130+
cur = build_moe_ffn(cur,
131+
model.layers[il].ffn_gate_inp,
132+
model.layers[il].ffn_up_exps,
133+
nullptr,
134+
model.layers[il].ffn_down_exps,
135+
nullptr,
136+
hparams.n_expert, hparams.n_expert_used,
137+
LLM_FFN_GELU, false,
138+
hparams.expert_weights_scale,
139+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
140+
il);
135141
cb(cur, "ffn_moe_out", il);
136142
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
137143
model.arch == LLM_ARCH_JINA_BERT_V3) {

src/models/dbrx.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "models.h"
22

3-
43
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
54
const int64_t n_embd_head = hparams.n_embd_head_v;
65
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -89,7 +88,7 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params
8988
nullptr,
9089
n_expert, n_expert_used,
9190
LLM_FFN_SILU, true,
92-
false, 0.0,
91+
hparams.expert_weights_scale,
9392
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
9493
il);
9594
cb(cur, "ffn_moe_out", il);

src/models/deepseek.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#include "models.h"
22

3-
4-
53
llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
64
llm_graph_context(params) {
75
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -100,7 +98,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
10098
nullptr,
10199
n_expert, n_expert_used,
102100
LLM_FFN_SILU, false,
103-
false, hparams.expert_weights_scale,
101+
hparams.expert_weights_scale,
104102
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
105103
il);
106104
cb(moe_out, "ffn_moe_out", il);

0 commit comments

Comments (0)