From 1ab4db358431066d0df3704785518f30324a6635 Mon Sep 17 00:00:00 2001
From: Achyut Krishna Byanjankar <achyut.benz@gmail.com>
Date: Sat, 18 Apr 2026 18:29:05 -0700
Subject: [PATCH] whisper : validate vocab size and per-token length when
 loading model

whisper_model_load reads n_vocab (int32) and per-token length (uint32)
directly from the model file with no bounds check. A malformed or fuzzed
model (e.g. an 8-byte AFL++ finding) can set these to values that cause
std::vector::resize to throw bad_alloc, which is uncaught and terminates
the process with SIGABRT (signal 6) before any error is reported.

Cap n_vocab at 2^20 tokens (real models top out around 52k) and each
per-token length at 2^16 bytes. On violation, log a clear error message
and return false so whisper_init_from_file_with_params_no_state can fail
gracefully.

Fixes #3674
---
 src/whisper.cpp | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 2f356da0f06..f61846951df 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1587,14 +1587,22 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     // load vocab
     {
+        // Upper bounds for values read from untrusted model files. Without them,
+        // a malformed or fuzzed file can supply sizes that make
+        // std::vector::resize throw std::bad_alloc; the exception is uncaught,
+        // so the process aborts with SIGABRT during vocab construction.
+        // ref: https://github.com/ggml-org/whisper.cpp/issues/3674
+        constexpr int32_t  max_n_vocab  = 1 << 20; // ~1M tokens (largest real models are ~52k)
+        constexpr uint32_t max_word_len = 1 << 16; // 64 KiB per vocab token
+
         int32_t n_vocab = 0;
         read_safe(loader, n_vocab);
 
-        //if (n_vocab != model.hparams.n_vocab) {
-        //    WHISPER_LOG_ERROR("%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-        //            __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-        //    return false;
-        //}
+        if (n_vocab < 0 || n_vocab > max_n_vocab) {
+            WHISPER_LOG_ERROR("%s: invalid vocab size %d (expected 0..%d); malformed model file\n",
+                    __func__, n_vocab, max_n_vocab);
+            return false;
+        }
 
         std::string word;
         std::vector<char> tmp;
@@ -1605,6 +1613,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             uint32_t len;
             read_safe(loader, len);
 
+            if (len > max_word_len) {
+                WHISPER_LOG_ERROR("%s: invalid vocab entry %d length %u (max %u); malformed model file\n",
+                        __func__, i, len, max_word_len);
+                return false;
+            }
+
             if (len > 0) {
                 tmp.resize(len);
                 loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer