
Commit 69311be

adding code to create deepseekocr gguf

1 parent 2e1c881 commit 69311be

3 files changed: 334 additions & 12 deletions
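This commit registers DeepSeek-OCR with the HF-to-GGUF converter and teaches gguf-py the tensor names of its SAM and CLIP vision encoders. With it applied, conversion would presumably go through the converter's existing entry point, e.g. `python convert_hf_to_gguf.py /path/to/DeepSeek-OCR --mmproj --outfile mmproj-deepseek-ocr.gguf`; the paths and the exact flag set here are assumptions, not part of the commit.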

File tree

convert_hf_to_gguf.py

Lines changed: 199 additions & 12 deletions
@@ -1485,6 +1485,9 @@ def __init__(self, *args, **kwargs):
         # TODO @ngxson : this is a hack to support both vision and audio encoders
         have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
         self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
+
+        if self.block_count is None:
+            self.block_count = max(self.hparams["width"]["clip-l-14-224"]["layers"], self.hparams["width"]["sam_vit_b"]["layers"])
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 
         # load preprocessor config
@@ -1565,6 +1568,15 @@ def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool =
         key = next((k for k in keys if k in obj), None)
         if key is not None:
             return obj[key]
+        # fall back to the nested DeepSeek-OCR vision config
+        for k in keys:
+            if (val := obj.get("width", {}).get("clip-l-14-224", {}).get(k)) is not None:
+                return val
+
+        key = next((k for k in keys if k in self.global_config), None)
+        if key is not None:
+            return self.global_config[key]
+
         if optional:
             return None
         raise KeyError(f"could not find any of: {keys}")
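The `obj.get("width", {}).get("clip-l-14-224", {})` fallback above implies that DeepSeek-OCR nests its per-encoder vision hyperparameters under a top-level "width" mapping. A minimal sketch of that assumed layout, with standard CLIP-L/14-224 and SAM ViT-B sizes filled in purely for illustration (the real config.json may differ):

    # Assumed hparams layout (illustrative values, not copied from the model)
    hparams = {
        "width": {
            "clip-l-14-224": {"layers": 24, "width": 1024, "heads": 16},
            "sam_vit_b": {
                "layers": 12, "width": 768, "heads": 12,
                "global_attn_indexes": [2, 5, 8, 11],
                "downsample_channels": [512, 1024],
            },
        },
    }

    # the block_count fallback from the first hunk picks the deeper encoder
    block_count = max(hparams["width"]["clip-l-14-224"]["layers"],
                      hparams["width"]["sam_vit_b"]["layers"])
    print(block_count)  # 24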
@@ -6723,6 +6735,7 @@ def prepare_tensors(self):
 @ModelBase.register(
     "DeepseekV2ForCausalLM",
     "DeepseekV3ForCausalLM",
+    "DeepseekOCRForCausalLM",
     "KimiVLForConditionalGeneration",
 )
 class DeepseekV2Model(TextModel):
@@ -6794,26 +6807,42 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
             self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
-        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        else:
+            self.gguf_writer.add_q_lora_rank(1536)
+        # self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
 
         # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
-        self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
+        if hparams.get("kv_lora_rank") is not None:
+            self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
+            self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        else:
+            self.gguf_writer.add_key_length(hparams["qk_rope_head_dim"])
+            self.gguf_writer.add_kv_lora_rank(512)
+
+        # self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
         self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
         self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
 
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
-        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
-        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
-
-        if hparams["scoring_func"] == "sigmoid":
-            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-        elif hparams["scoring_func"] == "softmax":
-            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        if "routed_scaling_factor" not in hparams:
+            self.gguf_writer.add_expert_weights_scale(1.0)
         else:
-            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+            self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+
+        if "norm_topk_prob" in hparams:
+            self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+        else:
+            self.gguf_writer.add_expert_weights_norm(False)
+
+        # scoring_func is absent from the DeepSeek-OCR config, so softmax gating is assumed
+        # if hparams["scoring_func"] == "sigmoid":
+        #     self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        # elif hparams["scoring_func"] == "softmax":
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+        # else:
+        #     raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
 
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
@@ -6823,12 +6852,16 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
             self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
+        if "rms_norm_eps" in hparams:
+            self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        else:
+            self.gguf_writer.add_layer_norm_rms_eps(1e-06)
 
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # skip vision tensors and remove "language_model." for Kimi-VL
-        if "vision_tower" in name or "multi_modal_projector" in name:
+        if "vision_tower" in name or "multi_modal_projector" in name or "sam_model" in name or "vision_model" in name:
             return []
 
         if name.startswith("language_model."):
@@ -6897,6 +6930,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 (self.map_tensor_name(name_vb), v_b)
             ]
 
+        if name.startswith("model.projector") or name.startswith("model.view_seperator") or name.startswith("model.image_newline"):
+            return []
+
         return [(self.map_tensor_name(name), data_torch)]
 
     def prepare_tensors(self):
@@ -6909,6 +6945,157 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register(
+    "DeepseekOCRForCausalLM"
+)
+class DeepseekOCR(MmprojModel):
+    model_arch = gguf.MODEL_ARCH.MMPROJ
+    has_vision_encoder = True
+
+    def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
+        key = next((k for k in keys if k in obj), None)
+        if key is not None:
+            return obj[key]
+
+        key = next((k for k in keys if k in self.global_config), None)
+        if key is not None:
+            return self.global_config[key]
+        if optional:
+            return None
+        # last resort: the nested DeepSeek-OCR vision config
+        for k in keys:
+            if (val := obj.get("width", {}).get("clip-l-14-224", {}).get(k)) is not None:
+                return val
+        raise KeyError(f"could not find any of: {keys}")
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if self.has_vision_encoder:
+            self.gguf_writer.add_clip_has_vision_encoder(True)
+            self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
+
+            # vision config
+            self.image_size = self.find_vparam(["image_size"])
+            self.gguf_writer.add_vision_image_size(self.image_size)
+            self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
+            # self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
+            self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
+            # self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
+            self.gguf_writer.add_vision_head_count(self.find_vparam(["heads"]))
+
+            # preprocessor config
+            image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
+            image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
+
+            self.gguf_writer.add_vision_image_mean(image_mean)
+            self.gguf_writer.add_vision_image_std(image_std)
+
+            # custom keys for the two DeepSeek-OCR encoders (SAM ViT-B + CLIP-L)
+            logger.debug(f"{self.hparams=}")
+            hparams = self.hparams["width"]
+            self.gguf_writer.add_key_value("clip.width", hparams["clip-l-14-224"].get("width"), gguf.GGUFValueType.INT32)
+            self.gguf_writer.add_key_value("sam.downsample_channels", hparams["sam_vit_b"].get("downsample_channels"), gguf.GGUFValueType.ARRAY)
+            self.gguf_writer.add_key_value("sam.global_attn_indexes", hparams["sam_vit_b"].get("global_attn_indexes"), gguf.GGUFValueType.ARRAY)
+            self.gguf_writer.add_key_value("sam.heads", hparams["sam_vit_b"].get("heads"), gguf.GGUFValueType.INT32)
+            self.gguf_writer.add_key_value("sam.layers", hparams["sam_vit_b"].get("layers"), gguf.GGUFValueType.INT32)
+            self.gguf_writer.add_key_value("sam.width", hparams["sam_vit_b"].get("width"), gguf.GGUFValueType.INT32)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # skip vision tensors and remove "language_model." for Kimi-VL
+        if "vision_tower" in name or "multi_modal_projector" in name:
+            return []
+
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+
+        # rename e_score_correction_bias tensors
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers
+        if "num_hidden_layers" in self.hparams:
+            block_count = self.hparams["num_hidden_layers"]
+            match = re.match(r"model.layers.(\d+)", name)
+            if match and int(match.group(1)) >= block_count:
+                return []
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        # if name.startswith("lm_head.weight"):
+        #     return []
+        if name.startswith("model.projector") or name.startswith("model.view_seperator") or name.startswith("model.image_newline"):
+            return [(self.map_tensor_name(name), data_torch)]
+        if not (name.startswith("model.sam_model") or name.startswith("model.vision_model")):
+            return []
+
+        if name.endswith("attn.rel_pos_h") or name.endswith("attn.rel_pos_w"):
+            return []
+
+        # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
+        if name.endswith("kv_b_proj.weight"):
+            name_kb = name.replace("kv_b_proj", "k_b_proj")
+            name_vb = name.replace("kv_b_proj", "v_b_proj")
+
+            n_head_kv = self.hparams["num_key_value_heads"]
+            v_head_dim = self.hparams["v_head_dim"]
+            qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
+
+            assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
+
+            kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
+            k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
+            k_b = k_b.transpose(1, 2)
+
+            return [
+                (self.map_tensor_name(name_kb), k_b),
+                (self.map_tensor_name(name_vb), v_b)
+            ]
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
 @ModelBase.register("Dots1ForCausalLM")
 class Dots1Model(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.DOTS1
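The MLA absorption note in modify_tensors above is easier to follow with concrete shapes. Below is a minimal standalone sketch of the kv_b_proj split and k_b transpose, using illustrative DeepSeek-V2-Lite-style dimensions (the numbers are assumptions, not taken from this commit):

    import torch

    n_head_kv, qk_nope_head_dim, v_head_dim, kv_lora_rank = 16, 128, 128, 512

    # kv_b_proj.weight has shape [n_head_kv * (qk_nope + v), kv_lora_rank]
    data_torch = torch.randn(n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank)

    kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
    k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
    k_b = k_b.transpose(1, 2)  # stored transposed for the absorbed attention matmul

    print(k_b.shape)  # torch.Size([16, 512, 128])
    print(v_b.shape)  # torch.Size([16, 128, 512])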

gguf-py/gguf/constants.py

Lines changed: 61 additions & 0 deletions
@@ -384,6 +384,7 @@ class MODEL_ARCH(IntEnum):
     ARCTIC = auto()
     DEEPSEEK = auto()
     DEEPSEEK2 = auto()
+    # DEEPSEEKOCR = auto()
     CHATGLM = auto()
     GLM4 = auto()
     GLM4_MOE = auto()
@@ -430,6 +431,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
     GLM_EDGE = auto()
     MERGER = auto()
     GEMMA3 = auto()
+    # DEEPSEEKOCR = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -640,6 +642,25 @@ class MODEL_TENSOR(IntEnum):
     V_RESMPL_QUERY = auto() # minicpmv
     V_TOK_EMBD_IMG_BREAK = auto() # pixtral
     V_MM_PATCH_MERGER = auto() # mistral small 3.1
+    # ocr
+    V_IMAGE_NEWLINE = auto()
+    V_VIEW_SEPERATOR = auto()
+    V_ATTN_QKV = auto()
+    V_FFN_UP = auto()
+    V_FFN_DOWN = auto()
+    V_NECK_CONV2D_0 = auto()
+    V_NECK_CONV2D_1 = auto()
+    V_NECK_LAYERNORM_2D_0 = auto()
+    V_NECK_LAYERNORM_2D_1 = auto()
+    V_NET_CONV2D_2 = auto()
+    V_NET_CONV2D_3 = auto()
+    V_EMBD_PATCH_PROJ = auto()
+    V_VIT_LAYERORM_0 = auto()
+    V_VIT_LAYERORM_1 = auto()
+    V_VIT_ATTN_PROJ = auto()
+    V_ATTN_QKV_PROJ = auto()
+
+
     # audio (mtmd)
     A_ENC_EMBD_POS = auto()
     A_ENC_CONV1D = auto()
@@ -986,6 +1007,25 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
     MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral
     MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1
+
+    # ocr
+    MODEL_TENSOR.V_IMAGE_NEWLINE: "model.image_newline",
+    MODEL_TENSOR.V_VIEW_SEPERATOR: "model.view_seperator",
+    MODEL_TENSOR.V_ATTN_QKV: "model.sam_model.blocks.{bid}.attn.qkv",
+    MODEL_TENSOR.V_FFN_UP: "model.sam_model.blocks.{bid}.mlp.lin1",
+    MODEL_TENSOR.V_FFN_DOWN: "model.sam_model.blocks.{bid}.mlp.lin2",
+    MODEL_TENSOR.V_NECK_CONV2D_0: "model.sam_model.neck.0",
+    MODEL_TENSOR.V_NECK_CONV2D_1: "model.sam_model.neck.2",
+    MODEL_TENSOR.V_NECK_LAYERNORM_2D_0: "model.sam_model.neck.1",
+    MODEL_TENSOR.V_NECK_LAYERNORM_2D_1: "model.sam_model.neck.3",
+    MODEL_TENSOR.V_NET_CONV2D_2: "model.sam_model.net_2",
+    MODEL_TENSOR.V_NET_CONV2D_3: "model.sam_model.net_3",
+    MODEL_TENSOR.V_EMBD_PATCH_PROJ: "model.sam_model.patch_embed.proj",
+    MODEL_TENSOR.V_VIT_LAYERORM_0: "model.sam_model.blocks.{bid}.norm1",
+    MODEL_TENSOR.V_VIT_LAYERORM_1: "model.sam_model.blocks.{bid}.norm2",
+    MODEL_TENSOR.V_VIT_ATTN_PROJ: "model.sam_model.blocks.{bid}.attn.proj",
+    MODEL_TENSOR.V_ATTN_QKV_PROJ: "model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj",
+
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
     MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
@@ -1054,6 +1094,23 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_RESMPL_QUERY,
     MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
     MODEL_TENSOR.V_MM_PATCH_MERGER,
+    # ocr
+    MODEL_TENSOR.V_IMAGE_NEWLINE,
+    MODEL_TENSOR.V_VIEW_SEPERATOR,
+    MODEL_TENSOR.V_ATTN_QKV,
+    MODEL_TENSOR.V_FFN_UP,
+    MODEL_TENSOR.V_FFN_DOWN,
+    MODEL_TENSOR.V_NECK_CONV2D_0,
+    MODEL_TENSOR.V_NECK_CONV2D_1,
+    MODEL_TENSOR.V_NECK_LAYERNORM_2D_0,
+    MODEL_TENSOR.V_NECK_LAYERNORM_2D_1,
+    MODEL_TENSOR.V_NET_CONV2D_2,
+    MODEL_TENSOR.V_NET_CONV2D_3,
+    MODEL_TENSOR.V_VIT_LAYERORM_0,
+    MODEL_TENSOR.V_VIT_LAYERORM_1,
+    MODEL_TENSOR.V_VIT_ATTN_PROJ,
+    MODEL_TENSOR.V_ATTN_QKV_PROJ,
+    MODEL_TENSOR.V_EMBD_PATCH_PROJ,
     # audio
     MODEL_TENSOR.A_ENC_EMBD_POS,
     MODEL_TENSOR.A_ENC_CONV1D,
@@ -2177,6 +2234,10 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
         MODEL_TENSOR.FFN_EXP_PROBS_B,
+
+        # MODEL_TENSOR.V_IMAGE_NEWLINE,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
     ],
     MODEL_ARCH.ERNIE4_5_MOE: [
         MODEL_TENSOR.TOKEN_EMBD,
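As a quick post-conversion sanity check, the custom clip.*/sam.* keys written by DeepseekOCR.set_gguf_parameters() can be looked up with gguf-py's GGUFReader. A minimal sketch (the output filename is hypothetical):

    from gguf import GGUFReader

    reader = GGUFReader("mmproj-deepseek-ocr.gguf")  # hypothetical output path
    for key in ("clip.width", "sam.width", "sam.layers",
                "sam.heads", "sam.global_attn_indexes", "sam.downsample_channels"):
        print(key, "present:", key in reader.fields)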
