@@ -1485,6 +1485,9 @@ def __init__(self, *args, **kwargs):
14851485 # TODO @ngxson : this is a hack to support both vision and audio encoders
14861486 have_multiple_encoders = self .has_audio_encoder and self .has_vision_encoder
14871487 self .block_count = 128 if have_multiple_encoders else self .find_hparam (self .n_block_keys , True )
1488+
1489+ if (self .block_count is None ):
1490+ self .block_count = max (self .hparams ["width" ].get ("clip-l-14-224" ).get ("layers" ), self .hparams ["width" ].get ("sam_vit_b" ).get ("layers" ))
14881491 self .tensor_map = gguf .get_tensor_name_map (gguf .MODEL_ARCH .MMPROJ , self .block_count )
14891492
14901493 # load preprocessor config
def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
    """Return the value for the first of ``keys`` found in ``obj``, in the
    nested DeepSeek-OCR CLIP sub-config, or in the global config.

    Fix: the previous fallback loop raised ``KeyError`` from its ``else``
    branch on the first key missing from the CLIP sub-config, so later
    candidate keys, the ``global_config`` fallback, and the ``optional``
    handling were unreachable.

    :param obj: hyperparameter dict to search first
    :param keys: candidate key names, tried in order
    :param optional: when True, return None instead of raising on a miss
    :raises KeyError: when no key is found and ``optional`` is False
    """
    key = next((k for k in keys if k in obj), None)
    if key is not None:
        return obj[key]

    # DeepSeek-OCR keeps the CLIP encoder params nested under
    # hparams["width"]["clip-l-14-224"] rather than in a flat vision_config.
    clip_cfg = obj.get("width", {}).get("clip-l-14-224", {})
    for k in keys:
        val = clip_cfg.get(k)
        if val is not None:
            return val

    key = next((k for k in keys if k in self.global_config), None)
    if key is not None:
        return self.global_config[key]

    if optional:
        return None
    raise KeyError(f"could not find any of: {keys}")
@@ -6723,6 +6735,7 @@ def prepare_tensors(self):
67236735@ModelBase .register (
67246736 "DeepseekV2ForCausalLM" ,
67256737 "DeepseekV3ForCausalLM" ,
6738+ "DeepseekOCRForCausalLM" ,
67266739 "KimiVLForConditionalGeneration" ,
67276740)
67286741class DeepseekV2Model (TextModel ):
@@ -6794,26 +6807,42 @@ def set_gguf_parameters(self):
67946807 self .gguf_writer .add_vocab_size (hparams ["vocab_size" ])
67956808 if "q_lora_rank" in hparams and hparams ["q_lora_rank" ] is not None :
67966809 self .gguf_writer .add_q_lora_rank (hparams ["q_lora_rank" ])
6797- self .gguf_writer .add_kv_lora_rank (hparams ["kv_lora_rank" ])
6810+ else :
6811+ self .gguf_writer .add_q_lora_rank (1536 )
6812+ # self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
67986813
67996814 # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
6800- self .gguf_writer .add_key_length (hparams ["kv_lora_rank" ] + hparams ["qk_rope_head_dim" ])
6801- self .gguf_writer .add_value_length (hparams ["kv_lora_rank" ])
6815+ if (hparams ["kv_lora_rank" ] is not None ):
6816+ self .gguf_writer .add_key_length (hparams ["kv_lora_rank" ] + hparams ["qk_rope_head_dim" ])
6817+ self .gguf_writer .add_kv_lora_rank (hparams ["kv_lora_rank" ])
6818+ else :
6819+ self .gguf_writer .add_key_length (hparams ["qk_rope_head_dim" ])
6820+ self .gguf_writer .add_kv_lora_rank (512 )
6821+
6822+ # self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
68026823 self .gguf_writer .add_key_length_mla (hparams ["qk_nope_head_dim" ] + hparams ["qk_rope_head_dim" ])
68036824 self .gguf_writer .add_value_length_mla (hparams ["v_head_dim" ])
68046825
68056826 self .gguf_writer .add_expert_feed_forward_length (hparams ["moe_intermediate_size" ])
68066827 self .gguf_writer .add_expert_count (hparams ["n_routed_experts" ])
68076828 self .gguf_writer .add_expert_shared_count (hparams ["n_shared_experts" ])
6808- self .gguf_writer .add_expert_weights_scale (hparams ["routed_scaling_factor" ])
6809- self .gguf_writer .add_expert_weights_norm (hparams ["norm_topk_prob" ])
6810-
6811- if hparams ["scoring_func" ] == "sigmoid" :
6812- self .gguf_writer .add_expert_gating_func (gguf .ExpertGatingFuncType .SIGMOID )
6813- elif hparams ["scoring_func" ] == "softmax" :
6814- self .gguf_writer .add_expert_gating_func (gguf .ExpertGatingFuncType .SOFTMAX )
6829+ if ("routed_scaling_factor" not in hparams ):
6830+ self .gguf_writer .add_expert_weights_scale (1.0 )
68156831 else :
6816- raise ValueError (f"Unsupported scoring_func value: { hparams ['scoring_func' ]} " )
6832+ self .gguf_writer .add_expert_weights_scale (hparams ["routed_scaling_factor" ])
6833+
6834+ if ("norm_topk_prob" in hparams ):
6835+ self .gguf_writer .add_expert_weights_norm (hparams ["norm_topk_prob" ])
6836+ else :
6837+ self .gguf_writer .add_expert_weights_norm (False )
6838+
6839+
6840+ # if hparams["scoring_func"] == "sigmoid":
6841+ # self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
6842+ # elif hparams["scoring_func"] == "softmax":
6843+ self .gguf_writer .add_expert_gating_func (gguf .ExpertGatingFuncType .SOFTMAX )
6844+ # else:
6845+ # raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
68176846
68186847 self .gguf_writer .add_rope_dimension_count (hparams ["qk_rope_head_dim" ])
68196848
@@ -6823,12 +6852,16 @@ def set_gguf_parameters(self):
68236852 self .gguf_writer .add_rope_scaling_factor (rope_scaling ["factor" ])
68246853 self .gguf_writer .add_rope_scaling_orig_ctx_len (rope_scaling ["original_max_position_embeddings" ])
68256854 self .gguf_writer .add_rope_scaling_yarn_log_mul (0.1 * rope_scaling ["mscale_all_dim" ])
6855+ if ("rms_norm_eps" in hparams ):
6856+ self .gguf_writer .add_layer_norm_rms_eps (hparams ["rms_norm_eps" ])
6857+ else :
6858+ self .gguf_writer .add_layer_norm_rms_eps (1e-06 )
68266859
68276860 _experts : list [dict [str , Tensor ]] | None = None
68286861
68296862 def modify_tensors (self , data_torch : Tensor , name : str , bid : int | None ) -> Iterable [tuple [str , Tensor ]]:
68306863 # skip vision tensors and remove "language_model." for Kimi-VL
6831- if "vision_tower" in name or "multi_modal_projector" in name :
6864+ if "vision_tower" in name or "multi_modal_projector" in name or "sam_model" in name or "vision_model" in name :
68326865 return []
68336866
68346867 if name .startswith ("language_model." ):
@@ -6897,6 +6930,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
68976930 (self .map_tensor_name (name_vb ), v_b )
68986931 ]
68996932
6933+ if (name .startswith ("model.projector" ) or name .startswith ("model.view_seperator" ) or name .startswith ("model.image_newline" )):
6934+ return []
6935+
69006936 return [(self .map_tensor_name (name ), data_torch )]
69016937
69026938 def prepare_tensors (self ):
@@ -6909,6 +6945,157 @@ def prepare_tensors(self):
69096945 raise ValueError (f"Unprocessed experts: { experts } " )
69106946
69116947
@ModelBase.register(
    "DeepseekOCRForCausalLM"
)
class DeepseekOCR(MmprojModel):
    """mmproj converter for DeepSeek-OCR's vision stack.

    DeepSeek-OCR pairs a SAM ViT-B encoder with a CLIP-L/14-224 encoder; the
    checkpoint nests their hyperparameters under ``hparams["width"]`` instead
    of a flat vision_config — hence the custom ``_find_param`` fallback and
    the extra ``clip.*`` / ``sam.*`` GGUF keys written below.
    """
    model_arch = gguf.MODEL_ARCH.MMPROJ
    has_vision_encoder = True

    def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
        """Look up the first of ``keys`` in ``obj``, the global config, or the
        nested CLIP sub-config; return None when optional, else raise.

        Fix: the previous fallback loop raised ``KeyError`` from its ``else``
        branch on the first key missing from the CLIP sub-config, so later
        keys were never tried; the loop also sat after the ``optional`` check,
        making the fallback unreachable for optional parameters.
        """
        key = next((k for k in keys if k in obj), None)
        if key is not None:
            return obj[key]

        key = next((k for k in keys if k in self.global_config), None)
        if key is not None:
            return self.global_config[key]

        # DeepSeek-OCR keeps CLIP params under hparams["width"]["clip-l-14-224"]
        clip_cfg = obj.get("width", {}).get("clip-l-14-224", {})
        for k in keys:
            val = clip_cfg.get(k)
            if val is not None:
                return val

        if optional:
            return None
        raise KeyError(f"could not find any of: {keys}")

    def set_gguf_parameters(self):
        """Write vision-encoder metadata.

        The SAM/CLIP tower shapes go into custom ``sam.*`` / ``clip.*`` keys
        since GGUF has no standard slot for a dual-tower encoder.
        (Fix: removed a leftover ``print(f"{self.hparams = }")`` debug line.)
        """
        self.gguf_writer.add_file_type(self.ftype)

        if self.has_vision_encoder:
            self.gguf_writer.add_clip_has_vision_encoder(True)
            self.gguf_writer.add_vision_projection_dim(self.n_embd_text)

            # vision config (lookups fall back to the nested CLIP sub-config)
            self.image_size = self.find_vparam(["image_size"])
            self.gguf_writer.add_vision_image_size(self.image_size)
            self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
            self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
            self.gguf_writer.add_vision_head_count(self.find_vparam(["heads"]))

            # preprocessor config
            image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
            image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]

            self.gguf_writer.add_vision_image_mean(image_mean)
            self.gguf_writer.add_vision_image_std(image_std)

            # dual-tower shapes, stored under custom keys
            hparams = self.hparams["width"]
            self.gguf_writer.add_key_value("clip.width", hparams["clip-l-14-224"].get("width"), gguf.GGUFValueType.INT32)
            self.gguf_writer.add_key_value("sam.downsample_channels", hparams["sam_vit_b"].get("downsample_channels"), gguf.GGUFValueType.ARRAY)
            self.gguf_writer.add_key_value("sam.global_attn_indexes", hparams["sam_vit_b"].get("global_attn_indexes"), gguf.GGUFValueType.ARRAY)
            self.gguf_writer.add_key_value("sam.heads", hparams["sam_vit_b"].get("heads"), gguf.GGUFValueType.INT32)
            self.gguf_writer.add_key_value("sam.layers", hparams["sam_vit_b"].get("layers"), gguf.GGUFValueType.INT32)
            self.gguf_writer.add_key_value("sam.width", hparams["sam_vit_b"].get("width"), gguf.GGUFValueType.INT32)

    # per-layer holding buffers used to merge routed experts into 3D tensors
    _experts: list[dict[str, Tensor]] | None = None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map a checkpoint tensor to zero or more GGUF tensors.

        Keeps only the SAM/CLIP towers plus the projector tensors; text-model
        tensors are dropped (they are handled by the text-model converter).
        """
        # skip HF-style vision tower / projector tensor names
        if "vision_tower" in name or "multi_modal_projector" in name:
            return []

        if name.startswith("language_model."):
            name = name.replace("language_model.", "")

        # rename e_score_correction_bias tensors
        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")

        # skip Multi-Token Prediction (MTP) layers
        if "num_hidden_layers" in self.hparams:
            block_count = self.hparams["num_hidden_layers"]
            match = re.match(r"model.layers.(\d+)", name)
            if match and int(match.group(1)) >= block_count:
                return []

        # process the experts separately
        if name.find("mlp.experts") != -1:
            n_experts = self.hparams["n_routed_experts"]
            assert bid is not None

            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

            self._experts[bid][name] = data_torch

            # hold tensors until all 3 projections of every expert have arrived
            if len(self._experts[bid]) >= n_experts * 3:
                tensors: list[tuple[str, Tensor]] = []

                # merge the experts into a single 3d tensor
                for w_name in ["down_proj", "gate_proj", "up_proj"]:
                    datas: list[Tensor] = []

                    for xid in range(n_experts):
                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
                        datas.append(self._experts[bid][ename])
                        del self._experts[bid][ename]

                    data_torch = torch.stack(datas, dim=0)

                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"

                    new_name = self.map_tensor_name(merged_name)

                    tensors.append((new_name, data_torch))
                return tensors
            else:
                return []

        # NOTE: "seperator" is the (misspelled) tensor name used by the
        # checkpoint itself — do not "fix" the spelling here
        if name.startswith("model.projector") or name.startswith("model.view_seperator") or name.startswith("model.image_newline"):
            return [(self.map_tensor_name(name), data_torch)]
        # anything not in the SAM / CLIP towers belongs to the text model
        if not (name.startswith("model.sam_model") or name.startswith("model.vision_model")):
            return []

        # relative position tables are not converted
        if name.endswith("attn.rel_pos_h") or name.endswith("attn.rel_pos_w"):
            return []
        # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
        if name.endswith("kv_b_proj.weight"):
            name_kb = name.replace("kv_b_proj", "k_b_proj")
            name_vb = name.replace("kv_b_proj", "v_b_proj")

            n_head_kv = self.hparams["num_key_value_heads"]
            v_head_dim = self.hparams["v_head_dim"]
            qk_nope_head_dim = self.hparams["qk_nope_head_dim"]

            assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)

            kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
            k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
            k_b = k_b.transpose(1, 2)

            return [
                (self.map_tensor_name(name_kb), k_b),
                (self.map_tensor_name(name_vb), v_b)
            ]

        return [(self.map_tensor_name(name), data_torch)]

    def prepare_tensors(self):
        """Finalize conversion; fail loudly if any expert tensors were
        buffered but never merged (incomplete checkpoint)."""
        super().prepare_tensors()

        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            experts = [k for d in self._experts for k in d.keys()]
            if len(experts) > 0:
                raise ValueError(f"Unprocessed experts: {experts}")
69127099@ModelBase .register ("Dots1ForCausalLM" )
69137100class Dots1Model (Qwen2MoeModel ):
69147101 model_arch = gguf .MODEL_ARCH .DOTS1
0 commit comments