Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions models/convert-h5-to-ggml.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ def bytes_to_unicode():
fname_out = dir_out / "ggml-model.bin"

tokens = json.load(open(dir_tokenizer / "vocab.json", "r", encoding="utf8"))
if "<|endoftext|>" in tokens:
del tokens["<|endoftext|>"]
Comment on lines +110 to +111
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I use ggml-tiny.en.bin for recognition, the result is also empty — this is the same reason the tests fail.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the last commit was "Migrate from HG dataset into HG model", these models need to be reconverted if they were last generated with this script; otherwise, <|endoftext|> will be written into the common tokens.

Copy link
Author

@Jaffe2718 Jaffe2718 Dec 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just like the test models, GGML models converted from OpenAI's official models should not record special tokens in the vocabulary; otherwise, the IDs of the subsequent special tokens will be positioned incorrectly.
Image

also see: #725


# use 16-bit or 32-bit floats
use_f16 = True
Expand Down
Binary file modified models/for-tests-ggml-base.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-base.en.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-large.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-medium.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-medium.en.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-small.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-small.en.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-tiny.bin
Binary file not shown.
Binary file modified models/for-tests-ggml-tiny.en.bin
Binary file not shown.
136 changes: 136 additions & 0 deletions models/gen-test-models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import base64
import os
import shutil
import struct
import numpy as np

# ggml magic number
GGML_FILE_MAGIC = 0x67676d6c # "ggml"


# Hyperparameter settings (defaults correspond to the tiny.en configuration)
class HyperParams:
    """Whisper model hyperparameters written into the ggml file header.

    Holds the vocabulary size, audio/text transformer dimensions, and the
    Mel bin count; ``ftype`` selects fp16 (True) vs fp32 (False) weights.
    """

    def __init__(self,
                 n_vocab=51865,
                 n_audio_ctx=1500,
                 n_audio_state=384,
                 n_audio_head=6,
                 n_audio_layer=4,
                 n_text_ctx=448,
                 n_text_state=384,
                 n_text_head=6,
                 n_text_layer=4,
                 n_mels=80):
        # Store every hyperparameter under its parameter name.
        for attr_name, attr_value in (
                ("n_vocab", n_vocab),
                ("n_audio_ctx", n_audio_ctx),
                ("n_audio_state", n_audio_state),
                ("n_audio_head", n_audio_head),
                ("n_audio_layer", n_audio_layer),
                ("n_text_ctx", n_text_ctx),
                ("n_text_state", n_text_state),
                ("n_text_head", n_text_head),
                ("n_text_layer", n_text_layer),
                ("n_mels", n_mels)):
            setattr(self, attr_name, attr_value)
        self.ftype = True  # True: fp16, False: fp32

def write_ggml_metadata(fout, hparams):
    """Write the ggml magic number and all hyperparameters to *fout*.

    Each value is serialized as a 4-byte native-order int32 via struct;
    ``ftype`` is a bool and packs as 0/1.
    """
    header_fields = (
        GGML_FILE_MAGIC,
        hparams.n_vocab,
        hparams.n_audio_ctx,
        hparams.n_audio_state,
        hparams.n_audio_head,
        hparams.n_audio_layer,
        hparams.n_text_ctx,
        hparams.n_text_state,
        hparams.n_text_head,
        hparams.n_text_layer,
        hparams.n_mels,
        hparams.ftype,
    )
    for field in header_fields:
        fout.write(struct.pack("i", field))

def write_mel_filters(fout, hparams, mel_filters_path):
    """Write the Mel filterbank to *fout*: two int32 dims, then float32 data.

    Loads the ``mel_{n_mels}`` matrix from the whisper assets .npz file and
    writes it in row-major order, matching what the ggml loader expects.
    """
    print("loading real Mel filter data...")
    # load the Mel filter from the npz file
    with np.load(mel_filters_path) as f:
        filters = f[f"mel_{hparams.n_mels}"]
    fout.write(struct.pack("i", filters.shape[0]))
    fout.write(struct.pack("i", filters.shape[1]))
    # One bulk write instead of an O(rows*cols) per-element struct.pack loop.
    # A C-contiguous float32 copy reproduces the row-major element order and
    # native byte order of the original nested loop byte-for-byte.
    fout.write(np.ascontiguousarray(filters, dtype=np.float32).tobytes())

def write_tokenizer(fout, tokenizer_path):
    """Write the tiktoken vocabulary to *fout*.

    The tiktoken file has one ``<base64-token> <rank>`` pair per line.
    Output format: int32 token count, then for each token an int32 byte
    length followed by the raw token bytes (insertion order preserved).
    """
    tokens = {}
    with open(tokenizer_path, "r") as f:
        for line in f:
            parts = line.split()
            # BUGFIX: a line holding only "\n" is truthy, so the original
            # `if line` filter let it through and the 2-tuple unpacking of
            # `line.split()` (== []) raised ValueError. Skip blank lines.
            if not parts:
                continue
            token_b64, rank = parts
            tokens[base64.b64decode(token_b64)] = int(rank)
    # write size of tokenizer
    fout.write(struct.pack("i", len(tokens)))
    # write vocabulary
    for t in tokens:
        fout.write(struct.pack("i", len(t)))
        fout.write(t)

def generate_empty_model(filename, hparams):
    """Write a header-only ggml model file (metadata, Mel filters, vocab).

    The weight tensors are intentionally omitted — enough for loader tests.
    English-only models (n_vocab < 51865) use the gpt2 tiktoken vocabulary,
    multilingual models use the multilingual one.
    """
    # BUGFIX: the original f-string had no placeholder and printed a
    # constant instead of the file actually being generated.
    print(f"generate empty model file: {filename}")
    with open(filename, "wb") as f:
        write_ggml_metadata(f, hparams)
        write_mel_filters(f, hparams, "whisper/whisper/assets/mel_filters.npz")
        write_tokenizer(f, f"whisper/whisper/assets/{'gpt2' if hparams.n_vocab < 51865 else 'multilingual'}.tiktoken")
    # ignore the rest of the model

if __name__ == "__main__":
    # Clone OpenAI's whisper repo for its Mel-filter and tokenizer assets.
    os.system("git clone https://github.com/openai/whisper.git")

    # (suffix, n_vocab, state width, heads, layers); the *.en models have
    # one fewer vocabulary entry than their multilingual counterparts.
    model_specs = [
        ("base",      51865, 512,  8,  6),
        ("base.en",   51864, 512,  8,  6),
        ("small",     51865, 768,  12, 12),
        ("small.en",  51864, 768,  12, 12),
        ("medium",    51865, 1024, 16, 24),
        ("medium.en", 51864, 1024, 16, 24),
        ("large",     51865, 1280, 20, 32),
        # large-v3 would need n_vocab=51866 (adds <|yue|>):
        # ("large-v3", 51866, 1280, 20, 32),
        ("tiny",      51865, 384,  6,  4),
        ("tiny.en",   51864, 384,  6,  4),
    ]
    for suffix, n_vocab, width, heads, layers in model_specs:
        generate_empty_model(f"for-tests-ggml-{suffix}.bin", HyperParams(
            n_vocab=n_vocab,
            n_audio_state=width, n_audio_head=heads, n_audio_layer=layers,
            n_text_state=width, n_text_head=heads, n_text_layer=layers,
        ))

    # Turbo (based on large-v3 with optimizations) is also left out for now;
    # like large-v3 it would need n_vocab=51866 (adds <|yue|>):
    # ("turbo", 51866, 1280, 20, 32)

    # Clean up the cloned repo.
    shutil.rmtree("whisper", ignore_errors=True)
48 changes: 20 additions & 28 deletions src/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ struct whisper_vocab {
}

int num_languages() const {
return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
return token_translate - token_sot - 1;
}
};

Expand Down Expand Up @@ -1587,21 +1587,16 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con

// load vocab
{
int32_t n_vocab = 0;
read_safe(loader, n_vocab);

//if (n_vocab != model.hparams.n_vocab) {
// WHISPER_LOG_ERROR("%s: invalid model file '%s' (bad vocab size %d != %d)\n",
// __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
// return false;
//}
int32_t n_common_vocab = 0;
read_safe(loader, n_common_vocab);
WHISPER_LOG_INFO("%s: n_common_vocab = %d\n", __func__, n_common_vocab);

std::string word;
std::vector<char> tmp;

tmp.reserve(128);

for (int i = 0; i < n_vocab; i++) {
for (int i = 0; i < n_common_vocab; i++) {
uint32_t len;
read_safe(loader, len);

Expand All @@ -1621,26 +1616,23 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
//printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
}

vocab.n_vocab = model.hparams.n_vocab;
if (vocab.is_multilingual()) {
vocab.token_eot++;
vocab.token_sot++;

// account for variable number of language tokens
const int dt = vocab.num_languages() - 98;
vocab.n_vocab = model.hparams.n_vocab; // all tokens, including special tokens

vocab.token_translate += dt;
vocab.token_transcribe += dt;
vocab.token_solm += dt;
vocab.token_prev += dt;
vocab.token_nosp += dt;
vocab.token_not += dt;
vocab.token_beg += dt;
}
vocab.token_eot = n_common_vocab; // <|endoftext|> 50256 for en, 50257 for multilingual, others for custom model
vocab.token_sot = n_common_vocab + 1; // <|startoftranscribe|>
// [n_common_vocab + 2, vocab.n_vocab - 1507) are language tokens
// num_language = vocab.token_translate - vocab.token_sot - 1 = vocab.n_vocab - n_common_vocab - 1509
vocab.token_translate = vocab.n_vocab - 1507; // <|translate|>
vocab.token_transcribe = vocab.n_vocab - 1506; // <|transcribe|>
vocab.token_solm = vocab.n_vocab - 1505; // <|startoflm|>
vocab.token_prev = vocab.n_vocab - 1504; // <|startofprev|>
vocab.token_nosp = vocab.n_vocab - 1503; // <|nospeech|>
vocab.token_not = vocab.n_vocab - 1502; // <|notimestamps|>
vocab.token_beg = vocab.n_vocab - 1501; // timestamps from <|0.00|> to <|30.00|>, 1501 tokens

if (n_vocab < model.hparams.n_vocab) {
WHISPER_LOG_INFO("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
if (n_common_vocab < model.hparams.n_vocab) {
WHISPER_LOG_INFO("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_common_vocab);
for (int i = n_common_vocab; i < model.hparams.n_vocab; i++) {
if (i > vocab.token_beg) {
word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
} else if (i == vocab.token_eot) {
Expand Down