Thireus · Thireus · Aug 5, 2025 · Aug 4, 2025
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -618,12 +618,15 @@
         if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
             # ref: https://huggingface.co/THUDM/glm-4-9b-chat
             res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
         if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
             # ref: https://huggingface.co/THUDM/glm-4-9b-hf
             res = "glm4"
         if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
             # ref: https://huggingface.co/zai-org/GLM-4.5-Air, https://huggingface.co/zai-org/GLM-4.5
-            res = "gpt-2"
+            res = "glm4"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -1875,20 +1878,20 @@
        tensors: list[tuple[str, Tensor]] = []

        if name.endswith("q_proj.weight"):
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), weight_torch))
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid, suffix=".scale"), scale_torch))
        elif name.endswith("k_proj.weight"):
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), weight_torch))
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid, suffix=".scale"), scale_torch))
        elif name.endswith("v_proj.weight"):
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), weight_torch))
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid, suffix=".scale"), scale_torch))
        elif name.endswith("o_proj.weight"):
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), weight_torch))
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, suffix=".scale"), scale_torch))
        elif name.endswith("up_proj.weight"):
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), weight_torch))
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid, suffix=".scale"), scale_torch))
        elif name.endswith("down_proj.weight"):
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), weight_torch))
            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, suffix=".scale"), scale_torch))
@@ -3961,33 +3964,32 @@
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer)
-        self.block_count = self.hparams["num_hidden_layers"] + 1
+        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-    
+
     def set_vocab(self):
         from transformers import AutoTokenizer
 
-        tokenizer = AutoTokenizer.from_pretrained(
-            self.dir_model, trust_remote_code=True
-        )
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        # Set special tokens
-        special_vocab._set_special_token(
-            "eos", tokenizer.get_added_vocab()["<|endoftext|>"]
-        )
-        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
-        special_vocab._set_special_token(
-            "unk", tokenizer.get_added_vocab()["<|endoftext|>"]
-        )
-        special_vocab._set_special_token(
-            "bos", tokenizer.get_added_vocab()["<|endoftext|>"]
-        )
+        # Special tokens
+        # Note: Using <|endoftext|> (151329) for eot causes endless generation
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])  # 151331
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # 151336
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
+        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
+
+        # Patch broken chat template
+        if isinstance(special_vocab.chat_template, str) and "visible_text(m.content).endswith" in special_vocab.chat_template:
+            special_vocab.chat_template = special_vocab.chat_template.replace(
+                """{{ visible_text(m.content) }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}""",
+                """{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothink")) else '' -}}""")
 
         special_vocab.add_to_gguf(self.gguf_writer)
 
@@ -4001,10 +4003,9 @@
             int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
         )
 
-        # MoE parameters
-        if (n_experts := self.hparams.get("n_routed_experts")) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-        # Note: expert_used_count is already set by parent class using num_experts_per_tok
+        # MoE parameters - Use only routed expert count (shared experts handled separately)
+        if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_routed_experts)
         if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
             self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
         if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
@@ -4023,8 +4024,11 @@
         if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
             self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
 
+        # NextN/MTP prediction layers
+        if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+
     _experts: list[dict[str, Tensor]] | None = None
-    _shared_experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(
         self, data_torch: Tensor, name: str, bid: int | None
@@ -4035,21 +4039,17 @@
             name = name.replace("language_model.", "")  # for multimodal variants
 
         # Handle main token embedding (but not layer-specific NextN embeddings)
-        if name == "model.embed_tokens.weight":
+        if name == "model.embed_tokens.weight" and ".layers." not in name:
             return [(self.map_tensor_name("token_embd.weight"), data_torch)]
 
         # Handle routed experts
-        if name.find("mlp.experts") != -1 and "shared_experts" not in name:
+        if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
             assert bid is not None
 
             if self._experts is None:
                 self._experts = [{} for _ in range(self.block_count)]
 
-            # Extend experts array if needed (for models where actual layers > num_hidden_layers)
-            while len(self._experts) <= bid:
-                self._experts.append({})
-
             self._experts[bid][name] = data_torch
 
             if len(self._experts[bid]) >= n_experts * 3:
@@ -4065,95 +4065,21 @@
                         del self._experts[bid][ename]
 
                     data_torch = torch.stack(datas, dim=0)
-                    # Generate GGUF tensor names for merged experts
-                    if w_name == "down_proj":
-                        new_name = f"blk.{bid}.ffn_down_exps.weight"
-                    elif w_name == "gate_proj":
-                        new_name = f"blk.{bid}.ffn_gate_exps.weight"
-                    elif w_name == "up_proj":
-                        new_name = f"blk.{bid}.ffn_up_exps.weight"
-                    else:
-                        merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-                        new_name = self.map_tensor_name(merged_name)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
                     tensors.append((new_name, data_torch))
                 return tensors
             else:
                 return []
 
-        # Handle expert gating input (routing gate)
-        if ".mlp.gate.e_score_correction_bias" in name:
-            new_name = name.replace("model.layers.", "blk.").replace(
-                ".mlp.gate.e_score_correction_bias", ".ffn_gate_inp.bias" # *NOTE* this is ".exp_probs_b" in mainline PR
-            )
-            return [(new_name, data_torch)]
-        elif ".mlp.gate.weight" in name:
-            new_name = name.replace("model.layers.", "blk.").replace(
-                ".mlp.gate.weight", ".ffn_gate_inp.weight"
-            )
-            return [(new_name, data_torch)]
-
-        # Handle shared expert tensors
-        if ".mlp.shared_experts." in name:
-            new_name = name.replace("model.layers.", "blk.").replace(".mlp.shared_experts.", ".ffn_")
-            if "gate_proj" in new_name:
-                new_name = new_name.replace("gate_proj", "gate_shexp")
-            elif "down_proj" in new_name:
-                new_name = new_name.replace("down_proj", "down_shexp")
-            elif "up_proj" in new_name:
-                new_name = new_name.replace("up_proj", "up_shexp")
-            return [(new_name, data_torch)]
-
-        # Handle regular dense FFN layers (for hybrid dense/MoE architecture)
-        if ".mlp." in name and "experts" not in name and "_shexp" not in name:
-            if "gate_proj" in name:
-                new_name = name.replace("model.layers.", "blk.").replace(
-                    ".mlp.gate_proj.weight", ".ffn_gate.weight"
-                )
-            elif "up_proj" in name:
-                new_name = name.replace("model.layers.", "blk.").replace(
-                    ".mlp.up_proj.weight", ".ffn_up.weight"
-                )
-            elif "down_proj" in name:
-                new_name = name.replace("model.layers.", "blk.").replace(
-                    ".mlp.down_proj.weight", ".ffn_down.weight"
-                )
-            else:
-                new_name = name
-            return [(self.map_tensor_name(new_name), data_torch)]
-
-        # Handle special NextN tensors - preserve for future MTP support - See https://github.com/ggml-org/llama.cpp/pull/13236
-        if (
-            ".embed_tokens." in name
-            or ".shared_head." in name
-            or ".eh_proj." in name
-            or ".enorm." in name
-            or ".hnorm." in name
-        ):
-            new_name = name.replace("model.layers.", "blk.").replace("model.", "").replace(".weight", "")
-            # logger.debug(f"Skipping MTP tensor: {new_name}")
-            return [(new_name, data_torch)]
-
-        # GLM tensor mapping - handle directly without map_tensor_name
-        if ".input_layernorm." in name:
-            new_name = name.replace("model.layers.", "blk.").replace(".input_layernorm.", ".attn_norm.")
-            return [(new_name, data_torch)]
-        elif ".post_attention_layernorm." in name:
-            new_name = name.replace("model.layers.", "blk.").replace(".post_attention_layernorm.", ".ffn_norm.")
-            return [(new_name, data_torch)]
-        elif ".self_attn." in name:
-            # Map GLM self_attn to standard attention naming
-            new_name = name.replace("model.layers.", "blk.").replace(".self_attn.", ".attn_")
-            if "q_proj" in new_name:
-                new_name = new_name.replace("q_proj", "q")
-            elif "k_proj" in new_name:
-                new_name = new_name.replace("k_proj", "k")
-            elif "v_proj" in new_name:
-                new_name = new_name.replace("v_proj", "v")
-            elif "o_proj" in new_name:
-                new_name = new_name.replace("o_proj", "output")
-            return [(new_name, data_torch)]
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
 
-        return super().modify_tensors(data_torch, name, bid)
+        new_name = self.map_tensor_name(name)
+
+        return [(new_name, data_torch)]
 
     def prepare_tensors(self):
         super().prepare_tensors()

diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
@@ -96,6 +96,8 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "smollm",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
     {"name": "deepseek-v3",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
     {"name": "seed-coder",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "glm4",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf",       "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2", },
+    {"name": "glm4",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air",     "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902", },
     {"name": "kimi-k2",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890", },
 ]
 

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
@@ -91,6 +91,7 @@ class LLM:
         EXPERT_WEIGHTS_SCALE              = "{arch}.expert_weights_scale"
         EXPERT_WEIGHTS_NORM               = "{arch}.expert_weights_norm"
         EXPERT_GATING_FUNC                = "{arch}.expert_gating_func"
+        NEXTN_PREDICT_LAYERS              = "{arch}.nextn_predict_layers"
         POOLING_TYPE                      = "{arch}.pooling_type"
         LOGIT_SCALE                       = "{arch}.logit_scale"
         DECODER_START_TOKEN_ID            = "{arch}.decoder_start_token_id"
@@ -159,6 +160,13 @@ class Tokenizer:
         CHAT_TEMPLATE_N      = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES       = "tokenizer.chat_templates"
         # FIM/Infill special tokens constants
+        FIM_PRE_ID           = "tokenizer.ggml.fim_pre_token_id"
+        FIM_SUF_ID           = "tokenizer.ggml.fim_suf_token_id"
+        FIM_MID_ID           = "tokenizer.ggml.fim_mid_token_id"
+        FIM_PAD_ID           = "tokenizer.ggml.fim_pad_token_id"
+        FIM_REP_ID           = "tokenizer.ggml.fim_rep_token_id"
+        FIM_SEP_ID           = "tokenizer.ggml.fim_sep_token_id"
+        # FIM/Infill special tokens constants
         PREFIX_ID            = "tokenizer.ggml.prefix_token_id"
         SUFFIX_ID            = "tokenizer.ggml.suffix_token_id"
         MIDDLE_ID            = "tokenizer.ggml.middle_token_id"
@@ -263,9 +271,6 @@ class MODEL_TENSOR(IntEnum):
     FFN_GATE_EXP         = auto()
     FFN_DOWN_EXP         = auto()
     FFN_UP_EXP           = auto()
-    FFN_GATE_EXPS        = auto()  # merged experts
-    FFN_DOWN_EXPS        = auto()  # merged experts
-    FFN_UP_EXPS          = auto()  # merged experts
     FFN_GATE_SHEXP       = auto()
     FFN_DOWN_SHEXP       = auto()
     FFN_UP_SHEXP         = auto()
@@ -415,9 +420,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.FFN_GATE_EXP:         "blk.{bid}.ffn_gate_exps",
     MODEL_TENSOR.FFN_DOWN_EXP:         "blk.{bid}.ffn_down_exps",
     MODEL_TENSOR.FFN_UP_EXP:           "blk.{bid}.ffn_up_exps",
-    MODEL_TENSOR.FFN_GATE_EXPS:        "blk.{bid}.ffn_gate_exps",  # merged experts
-    MODEL_TENSOR.FFN_DOWN_EXPS:        "blk.{bid}.ffn_down_exps",  # merged experts
-    MODEL_TENSOR.FFN_UP_EXPS:          "blk.{bid}.ffn_up_exps",    # merged experts
     MODEL_TENSOR.FFN_EXP_PROBS_B:      "blk.{bid}.exp_probs_b",
     MODEL_TENSOR.LAYER_OUT_NORM:       "blk.{bid}.layer_output_norm",
     MODEL_TENSOR.SSM_IN:               "blk.{bid}.ssm_in",
@@ -465,13 +467,13 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.ENC_FFN_DOWN:         "enc.blk.{bid}.ffn_down",
     MODEL_TENSOR.ENC_FFN_UP:           "enc.blk.{bid}.ffn_up",
     MODEL_TENSOR.ENC_OUTPUT_NORM:      "enc.output_norm",
-    # NextN/MTP tensors (GLM4_MOE)
-    MODEL_TENSOR.NEXTN_EH_PROJ:             "blk.{bid}.eh_proj",
-    MODEL_TENSOR.NEXTN_EMBED_TOKENS:        "blk.{bid}.embed_tokens",
-    MODEL_TENSOR.NEXTN_ENORM:               "blk.{bid}.enorm",
-    MODEL_TENSOR.NEXTN_HNORM:               "blk.{bid}.hnorm",
-    MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD:    "blk.{bid}.shared_head.head",
-    MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM:    "blk.{bid}.shared_head.norm",
+    # NextN/MTP
+    MODEL_TENSOR.NEXTN_EH_PROJ:             "blk.{bid}.nextn.eh_proj",
+    MODEL_TENSOR.NEXTN_EMBED_TOKENS:        "blk.{bid}.nextn.embed_tokens",
+    MODEL_TENSOR.NEXTN_ENORM:               "blk.{bid}.nextn.enorm",
+    MODEL_TENSOR.NEXTN_HNORM:               "blk.{bid}.nextn.hnorm",
+    MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD:    "blk.{bid}.nextn.shared_head_head",
+    MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM:    "blk.{bid}.nextn.shared_head_norm",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1096,23 +1098,24 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
         MODEL_TENSOR.ATTN_V,
         MODEL_TENSOR.ATTN_OUT,
         MODEL_TENSOR.ATTN_Q_NORM,
         MODEL_TENSOR.ATTN_K_NORM,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_GATE,          # dense layers
-        MODEL_TENSOR.FFN_DOWN,          # dense layers
-        MODEL_TENSOR.FFN_UP,            # dense layers
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.FFN_GATE_INP,
-        MODEL_TENSOR.FFN_GATE_EXPS,
-        MODEL_TENSOR.FFN_DOWN_EXPS,
-        MODEL_TENSOR.FFN_UP_EXPS,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
         MODEL_TENSOR.FFN_GATE_SHEXP,
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
         # NextN/MTP tensors - preserved but unused
         MODEL_TENSOR.NEXTN_EH_PROJ,
         MODEL_TENSOR.NEXTN_EMBED_TOKENS,
@@ -1684,6 +1687,14 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_TOKENIZER_MASK_ID    = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON    = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV       = Keys.Tokenizer.RWKV
+
+KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
+KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
+KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
+KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
+KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
+KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
+
 KEY_TOKENIZER_PREFIX_ID  = Keys.Tokenizer.PREFIX_ID
 KEY_TOKENIZER_SUFFIX_ID  = Keys.Tokenizer.SUFFIX_ID
 KEY_TOKENIZER_MIDDLE_ID  = Keys.Tokenizer.MIDDLE_ID

diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
@@ -677,6 +677,9 @@ def add_expert_weights_norm(self, value: bool) -> None:
     def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
         self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
 
+    def add_nextn_predict_layers(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
+
     def add_layer_norm_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
 

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
@@ -592,6 +592,31 @@ class TensorNameMap:
         MODEL_TENSOR.ENC_OUTPUT_NORM: (
             "encoder.final_layer_norm", # t5
         ),
+
+        # NextN/MTP tensors for GLM4_MOE
+        MODEL_TENSOR.NEXTN_EH_PROJ: (
+            "model.layers.{bid}.eh_proj",
+        ),
+
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS: (
+            "model.layers.{bid}.embed_tokens",
+        ),
+
+        MODEL_TENSOR.NEXTN_ENORM: (
+            "model.layers.{bid}.enorm",
+        ),
+
+        MODEL_TENSOR.NEXTN_HNORM: (
+            "model.layers.{bid}.hnorm",
+        ),
+
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: (
+            "model.layers.{bid}.shared_head.head",
+        ),
+
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: (
+            "model.layers.{bid}.shared_head.norm",
+        ),
     }
 
     # architecture-specific block mappings