Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions models/convert-h5-to-ggml.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ def bytes_to_unicode():
fname_out = dir_out / "ggml-model.bin"

tokens = json.load(open(dir_tokenizer / "vocab.json", "r", encoding="utf8"))
if "<|endoftext|>" in tokens:
del tokens["<|endoftext|>"]
Comment on lines +110 to +111
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I use ggml-tiny.en.bin for recognition, the result is also empty — this is the same reason the tests fail.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the last commit was "Migrate from HG dataset into HG model", these models need to be reconverted if they were last generated with this script; otherwise, <|endoftext|> will be written into the common tokens.

Copy link
Author

@Jaffe2718 Jaffe2718 Dec 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just like the test models, GGML models converted from OpenAI's official models should not record special tokens in the vocabulary; otherwise, the IDs of the subsequent special tokens will be positioned incorrectly.
Image

also see: #725


# use 16-bit or 32-bit floats
use_f16 = True
Expand Down
Binary file modified models/for-tests-ggml-base.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-base.en.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-large.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-medium.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-medium.en.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-small.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-small.en.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-tiny.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-tiny.en.bin
Binary file not shown.
136 changes: 136 additions & 0 deletions models/gen-test-models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import base64
import os
import shutil
import struct
import numpy as np

# ggml magic number
GGML_FILE_MAGIC = 0x67676d6c # "ggml"


# Hyperparameter settings (defaults correspond to the tiny.en configuration)
class HyperParams:
    """Whisper model hyperparameters written into the ggml file header.

    Holds the vocabulary size, audio/text transformer dimensions, and the
    Mel bin count; ``ftype`` selects fp16 (True) vs fp32 (False) weights.
    """

    def __init__(self,
                 n_vocab=51865,
                 n_audio_ctx=1500,
                 n_audio_state=384,
                 n_audio_head=6,
                 n_audio_layer=4,
                 n_text_ctx=448,
                 n_text_state=384,
                 n_text_head=6,
                 n_text_layer=4,
                 n_mels=80):
        # Store every hyperparameter under its parameter name.
        for attr_name, attr_value in (
                ("n_vocab", n_vocab),
                ("n_audio_ctx", n_audio_ctx),
                ("n_audio_state", n_audio_state),
                ("n_audio_head", n_audio_head),
                ("n_audio_layer", n_audio_layer),
                ("n_text_ctx", n_text_ctx),
                ("n_text_state", n_text_state),
                ("n_text_head", n_text_head),
                ("n_text_layer", n_text_layer),
                ("n_mels", n_mels)):
            setattr(self, attr_name, attr_value)
        self.ftype = True  # True: fp16, False: fp32

def write_ggml_metadata(fout, hparams):
    """Write the ggml magic number and all hyperparameters to *fout*.

    Each value is serialized as a 4-byte native-order int32 via struct;
    ``ftype`` is a bool and packs as 0/1.
    """
    header_fields = (
        GGML_FILE_MAGIC,
        hparams.n_vocab,
        hparams.n_audio_ctx,
        hparams.n_audio_state,
        hparams.n_audio_head,
        hparams.n_audio_layer,
        hparams.n_text_ctx,
        hparams.n_text_state,
        hparams.n_text_head,
        hparams.n_text_layer,
        hparams.n_mels,
        hparams.ftype,
    )
    for field in header_fields:
        fout.write(struct.pack("i", field))

def write_mel_filters(fout, hparams, mel_filters_path):
    """Write the Mel filterbank to *fout*: two int32 dims, then float32 data.

    Loads the ``mel_{n_mels}`` matrix from the whisper assets .npz file and
    writes it in row-major order, matching what the ggml loader expects.
    """
    print("loading real Mel filter data...")
    # load the Mel filter from the npz file
    with np.load(mel_filters_path) as f:
        filters = f[f"mel_{hparams.n_mels}"]
    fout.write(struct.pack("i", filters.shape[0]))
    fout.write(struct.pack("i", filters.shape[1]))
    # One bulk write instead of an O(rows*cols) per-element struct.pack loop.
    # A C-contiguous float32 copy reproduces the row-major element order and
    # native byte order of the original nested loop byte-for-byte.
    fout.write(np.ascontiguousarray(filters, dtype=np.float32).tobytes())

def write_tokenizer(fout, tokenizer_path):
    """Write the tiktoken vocabulary to *fout*.

    The tiktoken file has one ``<base64-token> <rank>`` pair per line.
    Output format: int32 token count, then for each token an int32 byte
    length followed by the raw token bytes (insertion order preserved).
    """
    tokens = {}
    with open(tokenizer_path, "r") as f:
        for line in f:
            parts = line.split()
            # BUGFIX: a line holding only "\n" is truthy, so the original
            # `if line` filter let it through and the 2-tuple unpacking of
            # `line.split()` (== []) raised ValueError. Skip blank lines.
            if not parts:
                continue
            token_b64, rank = parts
            tokens[base64.b64decode(token_b64)] = int(rank)
    # write size of tokenizer
    fout.write(struct.pack("i", len(tokens)))
    # write vocabulary
    for t in tokens:
        fout.write(struct.pack("i", len(t)))
        fout.write(t)

def generate_empty_model(filename, hparams):
    """Write a header-only ggml model file (metadata, Mel filters, vocab).

    The weight tensors are intentionally omitted — enough for loader tests.
    English-only models (n_vocab < 51865) use the gpt2 tiktoken vocabulary,
    multilingual models use the multilingual one.
    """
    # BUGFIX: the original f-string had no placeholder and printed a
    # constant instead of the file actually being generated.
    print(f"generate empty model file: {filename}")
    with open(filename, "wb") as f:
        write_ggml_metadata(f, hparams)
        write_mel_filters(f, hparams, "whisper/whisper/assets/mel_filters.npz")
        write_tokenizer(f, f"whisper/whisper/assets/{'gpt2' if hparams.n_vocab < 51865 else 'multilingual'}.tiktoken")
    # ignore the rest of the model

if __name__ == "__main__":
    # Clone OpenAI's whisper repo for its Mel-filter and tokenizer assets.
    os.system("git clone https://github.com/openai/whisper.git")

    # (suffix, n_vocab, state width, heads, layers); the *.en models have
    # one fewer vocabulary entry than their multilingual counterparts.
    model_specs = [
        ("base",      51865, 512,  8,  6),
        ("base.en",   51864, 512,  8,  6),
        ("small",     51865, 768,  12, 12),
        ("small.en",  51864, 768,  12, 12),
        ("medium",    51865, 1024, 16, 24),
        ("medium.en", 51864, 1024, 16, 24),
        ("large",     51865, 1280, 20, 32),
        # large-v3 would need n_vocab=51866 (adds <|yue|>):
        # ("large-v3", 51866, 1280, 20, 32),
        ("tiny",      51865, 384,  6,  4),
        ("tiny.en",   51864, 384,  6,  4),
    ]
    for suffix, n_vocab, width, heads, layers in model_specs:
        generate_empty_model(f"for-tests-ggml-{suffix}.bin", HyperParams(
            n_vocab=n_vocab,
            n_audio_state=width, n_audio_head=heads, n_audio_layer=layers,
            n_text_state=width, n_text_head=heads, n_text_layer=layers,
        ))

    # Turbo (based on large-v3 with optimizations) is also left out for now;
    # like large-v3 it would need n_vocab=51866 (adds <|yue|>):
    # ("turbo", 51866, 1280, 20, 32)

    # Clean up the cloned repo.
    shutil.rmtree("whisper", ignore_errors=True)
48 changes: 20 additions & 28 deletions src/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ struct whisper_vocab {
}

int num_languages() const {
return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
return token_translate - token_sot - 1;
}
};

Expand Down Expand Up @@ -1587,21 +1587,16 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con

// load vocab
{
int32_t n_vocab = 0;
read_safe(loader, n_vocab);

//if (n_vocab != model.hparams.n_vocab) {
// WHISPER_LOG_ERROR("%s: invalid model file '%s' (bad vocab size %d != %d)\n",
// __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
// return false;
//}
int32_t n_common_vocab = 0;
read_safe(loader, n_common_vocab);
WHISPER_LOG_INFO("%s: n_common_vocab = %d\n", __func__, n_common_vocab);

std::string word;
std::vector<char> tmp;

tmp.reserve(128);

for (int i = 0; i < n_vocab; i++) {
for (int i = 0; i < n_common_vocab; i++) {
uint32_t len;
read_safe(loader, len);

Expand All @@ -1621,26 +1616,23 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
//printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
}

vocab.n_vocab = model.hparams.n_vocab;
if (vocab.is_multilingual()) {
vocab.token_eot++;
vocab.token_sot++;

// account for variable number of language tokens
const int dt = vocab.num_languages() - 98;
vocab.n_vocab = model.hparams.n_vocab; // all tokens, including special tokens

vocab.token_translate += dt;
vocab.token_transcribe += dt;
vocab.token_solm += dt;
vocab.token_prev += dt;
vocab.token_nosp += dt;
vocab.token_not += dt;
vocab.token_beg += dt;
}
vocab.token_eot = n_common_vocab; // <|endoftext|> 50256 for en, 50257 for multilingual, others for custom model
vocab.token_sot = n_common_vocab + 1; // <|startoftranscribe|>
// [n_common_vocab + 2, vocab.n_vocab - 1507) are language tokens
// num_language = vocab.token_translate - vocab.token_sot - 1 = vocab.n_vocab - n_common_vocab - 1509
vocab.token_translate = vocab.n_vocab - 1507; // <|translate|>
vocab.token_transcribe = vocab.n_vocab - 1506; // <|transcribe|>
vocab.token_solm = vocab.n_vocab - 1505; // <|startoflm|>
vocab.token_prev = vocab.n_vocab - 1504; // <|startofprev|>
vocab.token_nosp = vocab.n_vocab - 1503; // <|nospeech|>
vocab.token_not = vocab.n_vocab - 1502; // <|notimestamps|>
vocab.token_beg = vocab.n_vocab - 1501; // timestamps from <|0.00|> to <|30.00|>, 1501 tokens

if (n_vocab < model.hparams.n_vocab) {
WHISPER_LOG_INFO("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
if (n_common_vocab < model.hparams.n_vocab) {
WHISPER_LOG_INFO("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_common_vocab);
for (int i = n_common_vocab; i < model.hparams.n_vocab; i++) {
if (i > vocab.token_beg) {
word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
} else if (i == vocab.token_eot) {
Expand Down