Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 37 additions & 111 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,12 +618,15 @@
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
res = "chatglm-bpe"
if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
res = "chatglm-bpe"
if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
# ref: https://huggingface.co/THUDM/glm-4-9b-hf
res = "glm4"
if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
# ref: https://huggingface.co/zai-org/GLM-4.5-Air, https://huggingface.co/zai-org/GLM-4.5
res = "gpt-2"
res = "glm4"
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
# ref: https://huggingface.co/LumiOpen/Viking-7B
res = "viking"
Expand Down Expand Up @@ -1875,20 +1878,20 @@
tensors: list[tuple[str, Tensor]] = []

if name.endswith("q_proj.weight"):
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), weight_torch))

Check failure on line 1881 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"weight_torch" is possibly unbound (reportPossiblyUnboundVariable)

Check failure on line 1881 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"weight_torch" is possibly unbound (reportPossiblyUnboundVariable)
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid, suffix=".scale"), scale_torch))

Check failure on line 1882 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"scale_torch" is possibly unbound (reportPossiblyUnboundVariable)

Check failure on line 1882 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"scale_torch" is possibly unbound (reportPossiblyUnboundVariable)
elif name.endswith("k_proj.weight"):
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), weight_torch))

Check failure on line 1884 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"weight_torch" is possibly unbound (reportPossiblyUnboundVariable)

Check failure on line 1884 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"weight_torch" is possibly unbound (reportPossiblyUnboundVariable)
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid, suffix=".scale"), scale_torch))

Check failure on line 1885 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"scale_torch" is possibly unbound (reportPossiblyUnboundVariable)

Check failure on line 1885 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"scale_torch" is possibly unbound (reportPossiblyUnboundVariable)
elif name.endswith("v_proj.weight"):
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), weight_torch))

Check failure on line 1887 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"weight_torch" is possibly unbound (reportPossiblyUnboundVariable)

Check failure on line 1887 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"weight_torch" is possibly unbound (reportPossiblyUnboundVariable)
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid, suffix=".scale"), scale_torch))

Check failure on line 1888 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"scale_torch" is possibly unbound (reportPossiblyUnboundVariable)

Check failure on line 1888 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"scale_torch" is possibly unbound (reportPossiblyUnboundVariable)
elif name.endswith("o_proj.weight"):
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), weight_torch))

Check failure on line 1890 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"weight_torch" is possibly unbound (reportPossiblyUnboundVariable)

Check failure on line 1890 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"weight_torch" is possibly unbound (reportPossiblyUnboundVariable)
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, suffix=".scale"), scale_torch))

Check failure on line 1891 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"scale_torch" is possibly unbound (reportPossiblyUnboundVariable)

Check failure on line 1891 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"scale_torch" is possibly unbound (reportPossiblyUnboundVariable)
elif name.endswith("up_proj.weight"):
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), weight_torch))

Check failure on line 1893 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"weight_torch" is possibly unbound (reportPossiblyUnboundVariable)

Check failure on line 1893 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"weight_torch" is possibly unbound (reportPossiblyUnboundVariable)
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid, suffix=".scale"), scale_torch))

Check failure on line 1894 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"scale_torch" is possibly unbound (reportPossiblyUnboundVariable)

Check failure on line 1894 in convert_hf_to_gguf.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"scale_torch" is possibly unbound (reportPossiblyUnboundVariable)
elif name.endswith("down_proj.weight"):
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), weight_torch))
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, suffix=".scale"), scale_torch))
Expand Down Expand Up @@ -3961,33 +3964,32 @@
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer)
self.block_count = self.hparams["num_hidden_layers"] + 1
self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

def set_vocab(self):
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
self.dir_model, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)

# Set special tokens
special_vocab._set_special_token(
"eos", tokenizer.get_added_vocab()["<|endoftext|>"]
)
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
special_vocab._set_special_token(
"unk", tokenizer.get_added_vocab()["<|endoftext|>"]
)
special_vocab._set_special_token(
"bos", tokenizer.get_added_vocab()["<|endoftext|>"]
)
# Special tokens
# Note: Using <|endoftext|> (151329) for eot causes endless generation
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338

# Patch broken chat template
if isinstance(special_vocab.chat_template, str) and "visible_text(m.content).endswith" in special_vocab.chat_template:
special_vocab.chat_template = special_vocab.chat_template.replace(
"""{{ visible_text(m.content) }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}""",
"""{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothink")) else '' -}}""")

special_vocab.add_to_gguf(self.gguf_writer)

Expand All @@ -4001,10 +4003,9 @@
int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
)

# MoE parameters
if (n_experts := self.hparams.get("n_routed_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
# Note: expert_used_count is already set by parent class using num_experts_per_tok
# MoE parameters - Use only routed expert count (shared experts handled separately)
if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None:
self.gguf_writer.add_expert_count(n_routed_experts)
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
Expand All @@ -4023,8 +4024,11 @@
if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
self.gguf_writer.add_expert_weights_norm(norm_topk_prob)

# NextN/MTP prediction layers
if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)

_experts: list[dict[str, Tensor]] | None = None
_shared_experts: list[dict[str, Tensor]] | None = None

def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
Expand All @@ -4035,21 +4039,17 @@
name = name.replace("language_model.", "") # for multimodal variants

# Handle main token embedding (but not layer-specific NextN embeddings)
if name == "model.embed_tokens.weight":
if name == "model.embed_tokens.weight" and ".layers." not in name:
return [(self.map_tensor_name("token_embd.weight"), data_torch)]

# Handle routed experts
if name.find("mlp.experts") != -1 and "shared_experts" not in name:
if name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
assert bid is not None

if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]

# Extend experts array if needed (for models where actual layers > num_hidden_layers)
while len(self._experts) <= bid:
self._experts.append({})

self._experts[bid][name] = data_torch

if len(self._experts[bid]) >= n_experts * 3:
Expand All @@ -4065,95 +4065,21 @@
del self._experts[bid][ename]

data_torch = torch.stack(datas, dim=0)
# Generate GGUF tensor names for merged experts
if w_name == "down_proj":
new_name = f"blk.{bid}.ffn_down_exps.weight"
elif w_name == "gate_proj":
new_name = f"blk.{bid}.ffn_gate_exps.weight"
elif w_name == "up_proj":
new_name = f"blk.{bid}.ffn_up_exps.weight"
else:
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)

merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"

new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []

# Handle expert gating input (routing gate)
if ".mlp.gate.e_score_correction_bias" in name:
new_name = name.replace("model.layers.", "blk.").replace(
".mlp.gate.e_score_correction_bias", ".ffn_gate_inp.bias" # *NOTE* this is ".exp_probs_b" in mainline PR
)
return [(new_name, data_torch)]
elif ".mlp.gate.weight" in name:
new_name = name.replace("model.layers.", "blk.").replace(
".mlp.gate.weight", ".ffn_gate_inp.weight"
)
return [(new_name, data_torch)]

# Handle shared expert tensors
if ".mlp.shared_experts." in name:
new_name = name.replace("model.layers.", "blk.").replace(".mlp.shared_experts.", ".ffn_")
if "gate_proj" in new_name:
new_name = new_name.replace("gate_proj", "gate_shexp")
elif "down_proj" in new_name:
new_name = new_name.replace("down_proj", "down_shexp")
elif "up_proj" in new_name:
new_name = new_name.replace("up_proj", "up_shexp")
return [(new_name, data_torch)]

# Handle regular dense FFN layers (for hybrid dense/MoE architecture)
if ".mlp." in name and "experts" not in name and "_shexp" not in name:
if "gate_proj" in name:
new_name = name.replace("model.layers.", "blk.").replace(
".mlp.gate_proj.weight", ".ffn_gate.weight"
)
elif "up_proj" in name:
new_name = name.replace("model.layers.", "blk.").replace(
".mlp.up_proj.weight", ".ffn_up.weight"
)
elif "down_proj" in name:
new_name = name.replace("model.layers.", "blk.").replace(
".mlp.down_proj.weight", ".ffn_down.weight"
)
else:
new_name = name
return [(self.map_tensor_name(new_name), data_torch)]

# Handle special NextN tensors - preserve for future MTP support - See https://github.com/ggml-org/llama.cpp/pull/13236
if (
".embed_tokens." in name
or ".shared_head." in name
or ".eh_proj." in name
or ".enorm." in name
or ".hnorm." in name
):
new_name = name.replace("model.layers.", "blk.").replace("model.", "").replace(".weight", "")
# logger.debug(f"Skipping MTP tensor: {new_name}")
return [(new_name, data_torch)]

# GLM tensor mapping - handle directly without map_tensor_name
if ".input_layernorm." in name:
new_name = name.replace("model.layers.", "blk.").replace(".input_layernorm.", ".attn_norm.")
return [(new_name, data_torch)]
elif ".post_attention_layernorm." in name:
new_name = name.replace("model.layers.", "blk.").replace(".post_attention_layernorm.", ".ffn_norm.")
return [(new_name, data_torch)]
elif ".self_attn." in name:
# Map GLM self_attn to standard attention naming
new_name = name.replace("model.layers.", "blk.").replace(".self_attn.", ".attn_")
if "q_proj" in new_name:
new_name = new_name.replace("q_proj", "q")
elif "k_proj" in new_name:
new_name = new_name.replace("k_proj", "k")
elif "v_proj" in new_name:
new_name = new_name.replace("v_proj", "v")
elif "o_proj" in new_name:
new_name = new_name.replace("o_proj", "output")
return [(new_name, data_torch)]
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")

return super().modify_tensors(data_torch, name, bid)
new_name = self.map_tensor_name(name)

return [(new_name, data_torch)]

def prepare_tensors(self):
super().prepare_tensors()
Expand Down
2 changes: 2 additions & 0 deletions convert_hf_to_gguf_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2", },
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902", },
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890", },
]

Expand Down
51 changes: 31 additions & 20 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ class LLM:
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
POOLING_TYPE = "{arch}.pooling_type"
LOGIT_SCALE = "{arch}.logit_scale"
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
Expand Down Expand Up @@ -159,6 +160,13 @@ class Tokenizer:
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
CHAT_TEMPLATES = "tokenizer.chat_templates"
# FIM/Infill special tokens constants
FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id"
FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
# FIM/Infill special tokens constants
PREFIX_ID = "tokenizer.ggml.prefix_token_id"
SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
MIDDLE_ID = "tokenizer.ggml.middle_token_id"
Expand Down Expand Up @@ -263,9 +271,6 @@ class MODEL_TENSOR(IntEnum):
FFN_GATE_EXP = auto()
FFN_DOWN_EXP = auto()
FFN_UP_EXP = auto()
FFN_GATE_EXPS = auto() # merged experts
FFN_DOWN_EXPS = auto() # merged experts
FFN_UP_EXPS = auto() # merged experts
FFN_GATE_SHEXP = auto()
FFN_DOWN_SHEXP = auto()
FFN_UP_SHEXP = auto()
Expand Down Expand Up @@ -415,9 +420,6 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
MODEL_TENSOR.FFN_GATE_EXPS: "blk.{bid}.ffn_gate_exps", # merged experts
MODEL_TENSOR.FFN_DOWN_EXPS: "blk.{bid}.ffn_down_exps", # merged experts
MODEL_TENSOR.FFN_UP_EXPS: "blk.{bid}.ffn_up_exps", # merged experts
MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
Expand Down Expand Up @@ -465,13 +467,13 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
# NextN/MTP tensors (GLM4_MOE)
MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.eh_proj",
MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.embed_tokens",
MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.enorm",
MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.hnorm",
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.shared_head.head",
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.shared_head.norm",
# NextN/MTP
MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm",
MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm",
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
Expand Down Expand Up @@ -1096,23 +1098,24 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE, # dense layers
MODEL_TENSOR.FFN_DOWN, # dense layers
MODEL_TENSOR.FFN_UP, # dense layers
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXPS,
MODEL_TENSOR.FFN_DOWN_EXPS,
MODEL_TENSOR.FFN_UP_EXPS,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
# NextN/MTP tensors - preserved but unused
MODEL_TENSOR.NEXTN_EH_PROJ,
MODEL_TENSOR.NEXTN_EMBED_TOKENS,
Expand Down Expand Up @@ -1684,6 +1687,14 @@ def get_type(val: Any) -> GGUFValueType:
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV

KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID

KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID
KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
Expand Down
3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,9 @@ def add_expert_weights_norm(self, value: bool) -> None:
def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)

def add_nextn_predict_layers(self, count: int) -> None:
self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)

def add_layer_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

Expand Down
25 changes: 25 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,31 @@ class TensorNameMap:
MODEL_TENSOR.ENC_OUTPUT_NORM: (
"encoder.final_layer_norm", # t5
),

# NextN/MTP tensors for GLM4_MOE
MODEL_TENSOR.NEXTN_EH_PROJ: (
"model.layers.{bid}.eh_proj",
),

MODEL_TENSOR.NEXTN_EMBED_TOKENS: (
"model.layers.{bid}.embed_tokens",
),

MODEL_TENSOR.NEXTN_ENORM: (
"model.layers.{bid}.enorm",
),

MODEL_TENSOR.NEXTN_HNORM: (
"model.layers.{bid}.hnorm",
),

MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: (
"model.layers.{bid}.shared_head.head",
),

MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: (
"model.layers.{bid}.shared_head.norm",
),
}

# architecture-specific block mappings
Expand Down
Loading
Loading