From 9fad5d5bb93d4a344692ed13e5b08e1dff1fb471 Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Thu, 28 Aug 2025 21:54:27 +0200 Subject: [PATCH 01/14] Add support for --carry-initial-prompt --- bindings/go/params.go | 4 + .../whispercpp/params/WhisperFullParams.java | 8 +- bindings/ruby/ext/ruby_whisper_params.c | 25 +- bindings/ruby/sig/whisper.rbs | 3 + bindings/ruby/test/test_params.rb | 8 + examples/cli/cli.cpp | 220 +++++++++--------- include/whisper.h | 1 + src/whisper.cpp | 61 ++++- 8 files changed, 210 insertions(+), 120 deletions(-) diff --git a/bindings/go/params.go b/bindings/go/params.go index 95c5bfaf934..e1d54b266aa 100644 --- a/bindings/go/params.go +++ b/bindings/go/params.go @@ -47,6 +47,10 @@ func (p *Params) SetPrintTimestamps(v bool) { p.print_timestamps = toBool(v) } +func (p *Params) SetCarryInitialPrompt(v bool) { + p.carry_initial_prompt = toBool(v) +} + // Set language id func (p *Params) SetLanguage(lang int) error { if lang == -1 { diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java index 498ff126037..2decd5a2ef2 100644 --- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java +++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java @@ -157,6 +157,8 @@ public void tdrzEnable(boolean enable) { /** Tokens to provide to the whisper decoder as an initial prompt. * These are prepended to any existing text context from a previous call. */ public String initial_prompt; + /** Always prepend initial_prompt for every decode chunk. */ + public CBool carry_initial_prompt; /** Prompt tokens. (int*) */ public Pointer prompt_tokens; @@ -331,13 +333,13 @@ public void setLogitsFilterCallback(WhisperLogitsFilterCallback callback) { @Override protected List getFieldOrder() { - return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", + return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate", "no_context", "no_timestamps", "single_segment", "print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps", "thold_pt", "thold_ptsum", "max_len", - "split_on_word", "max_tokens", "debug_mode", "audio_ctx", - "tdrz_enable", "suppress_regex", "initial_prompt", + "split_on_word", "max_tokens", "debug_mode", "audio_ctx", + "tdrz_enable", "suppress_regex", "initial_prompt", "carry_initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language", "suppress_blank", "suppress_nst", "temperature", "max_initial_ts", "length_penalty", "temperature_inc", diff --git a/bindings/ruby/ext/ruby_whisper_params.c b/bindings/ruby/ext/ruby_whisper_params.c index 882c68d042f..670f9351375 100644 --- a/bindings/ruby/ext/ruby_whisper_params.c +++ b/bindings/ruby/ext/ruby_whisper_params.c @@ -26,7 +26,7 @@ rb_define_method(cParams, #param_name, ruby_whisper_params_get_ ## param_name, 0); \ rb_define_method(cParams, #param_name "=", ruby_whisper_params_set_ ## param_name, 1); -#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 36 +#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 37 extern VALUE cParams; extern VALUE cVADParams; @@ -46,6 +46,7 @@ static ID id_print_special; static ID id_print_progress; static ID id_print_realtime; static ID id_print_timestamps; +static ID id_carry_initial_prompt; static ID id_suppress_blank; static ID id_suppress_nst; static ID id_token_timestamps; @@ -455,6 +456,26 @@ ruby_whisper_params_get_print_timestamps(VALUE self) { BOOL_PARAMS_GETTER(self, print_timestamps) } + +/* + * call-seq: + * carry_initial_prompt -> true or false + */ +static VALUE +ruby_whisper_params_get_carry_initial_prompt(VALUE self) +{ + BOOL_PARAMS_GETTER(self, carry_initial_prompt) +} + +/* + * call-seq: + * carry_initial_prompt = bool -> bool + */ +static VALUE +ruby_whisper_params_set_carry_initial_prompt(VALUE self, VALUE value) +{ + BOOL_PARAMS_SETTER(self, carry_initial_prompt, value) +} /* * call-seq: * suppress_blank = force_suppress -> force_suppress @@ -1168,6 +1189,7 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self) SET_PARAM_IF_SAME(max_len) SET_PARAM_IF_SAME(split_on_word) SET_PARAM_IF_SAME(initial_prompt) + SET_PARAM_IF_SAME(carry_initial_prompt) SET_PARAM_IF_SAME(offset) SET_PARAM_IF_SAME(duration) SET_PARAM_IF_SAME(max_text_tokens) @@ -1303,6 +1325,7 @@ init_ruby_whisper_params(VALUE *mWhisper) DEFINE_PARAM(max_len, 11) DEFINE_PARAM(split_on_word, 12) DEFINE_PARAM(initial_prompt, 13) + DEFINE_PARAM(carry_initial_prompt, 36) DEFINE_PARAM(diarize, 14) DEFINE_PARAM(offset, 15) DEFINE_PARAM(duration, 16) diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs index 0489432a249..d5905dd7037 100644 --- a/bindings/ruby/sig/whisper.rbs +++ b/bindings/ruby/sig/whisper.rbs @@ -138,6 +138,7 @@ module Whisper ?max_len: Integer, ?split_on_word: boolish, ?initial_prompt: string | nil, + ?carry_initial_prompt: boolish, ?diarize: boolish, ?offset: Integer, ?duration: Integer, @@ -236,6 +237,7 @@ module Whisper def split_on_word: () -> (true | false) def initial_prompt=: (_ToS) -> _ToS + def carry_initial_prompt=: (boolish) -> boolish # Tokens to provide to the whisper decoder as initial prompt # these are prepended to any existing text context from a previous call @@ -243,6 +245,7 @@ module Whisper # Maximum of whisper_n_text_ctx()/2 tokens are used (typically 224). # def initial_prompt: () -> (String | nil) + def carry_initial_prompt: () -> (true | false) def diarize=: (boolish) -> boolish diff --git a/bindings/ruby/test/test_params.rb b/bindings/ruby/test/test_params.rb index d5c5d140e8c..4dd9780de7d 100644 --- a/bindings/ruby/test/test_params.rb +++ b/bindings/ruby/test/test_params.rb @@ -16,6 +16,7 @@ class TestParams < TestBase :max_len, :split_on_word, :initial_prompt, + :carry_initial_prompt, :diarize, :offset, :duration, @@ -119,6 +120,13 @@ def test_print_timestamps assert !@params.print_timestamps end + def test_carry_initial_prompt + @params.carry_initial_prompt = true + assert @params.carry_initial_prompt + @params.carry_initial_prompt = false + assert !@params.carry_initial_prompt + end + def test_suppress_blank @params.suppress_blank = true assert @params.suppress_blank diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp index f73ed9ae078..45516db614f 100644 --- a/examples/cli/cli.cpp +++ b/examples/cli/cli.cpp @@ -77,6 +77,7 @@ struct whisper_params { bool use_gpu = true; bool flash_attn = false; bool suppress_nst = false; + bool carry_initial_prompt = false; std::string language = "en"; std::string prompt; @@ -145,59 +146,60 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params exit(0); } #define ARGV_NEXT (((i + 1) < argc) ? argv[++i] : requires_value_error(arg)) - else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(ARGV_NEXT); } - else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(ARGV_NEXT); } - else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(ARGV_NEXT); } - else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(ARGV_NEXT); } - else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(ARGV_NEXT); } - else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(ARGV_NEXT); } - else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(ARGV_NEXT); } - else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(ARGV_NEXT); } - else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(ARGV_NEXT); } - else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(ARGV_NEXT); } - else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(ARGV_NEXT); } - else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(ARGV_NEXT); } - else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(ARGV_NEXT); } - else if (arg == "-nth" || arg == "--no-speech-thold") { params.no_speech_thold = std::stof(ARGV_NEXT); } - else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(ARGV_NEXT); } - else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(ARGV_NEXT); } - else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; } - else if (arg == "-tr" || arg == "--translate") { params.translate = true; } - else if (arg == "-di" || arg == "--diarize") { params.diarize = true; } - else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; } - else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; } - else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; } - else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; } - else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; } - else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; } - else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; } - else if (arg == "-olrc" || arg == "--output-lrc") { params.output_lrc = true; } - else if (arg == "-fp" || arg == "--font-path") { params.font_path = ARGV_NEXT; } - else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; } - else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; } - else if (arg == "-ojf" || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; } - else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(ARGV_NEXT); } - else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; } - else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } - else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; } - else if ( arg == "--print-confidence"){ params.print_confidence= true; } - else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; } - else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; } - else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(ARGV_NEXT); } - else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; } - else if ( arg == "--prompt") { params.prompt = ARGV_NEXT; } - else if (arg == "-m" || arg == "--model") { params.model = ARGV_NEXT; } - else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(ARGV_NEXT); } - else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = ARGV_NEXT; } - else if (arg == "-dtw" || arg == "--dtw") { params.dtw = ARGV_NEXT; } - else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; } - else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } - else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; } - else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; } - else if ( arg == "--suppress-regex") { params.suppress_regex = ARGV_NEXT; } - else if ( arg == "--grammar") { params.grammar = ARGV_NEXT; } - else if ( arg == "--grammar-rule") { params.grammar_rule = ARGV_NEXT; } - else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(ARGV_NEXT); } + else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(ARGV_NEXT); } + else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(ARGV_NEXT); } + else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(ARGV_NEXT); } + else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(ARGV_NEXT); } + else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(ARGV_NEXT); } + else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(ARGV_NEXT); } + else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(ARGV_NEXT); } + else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(ARGV_NEXT); } + else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(ARGV_NEXT); } + else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(ARGV_NEXT); } + else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(ARGV_NEXT); } + else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(ARGV_NEXT); } + else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(ARGV_NEXT); } + else if (arg == "-nth" || arg == "--no-speech-thold") { params.no_speech_thold = std::stof(ARGV_NEXT); } + else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(ARGV_NEXT); } + else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(ARGV_NEXT); } + else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; } + else if (arg == "-tr" || arg == "--translate") { params.translate = true; } + else if (arg == "-di" || arg == "--diarize") { params.diarize = true; } + else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; } + else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; } + else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; } + else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; } + else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; } + else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; } + else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; } + else if (arg == "-olrc" || arg == "--output-lrc") { params.output_lrc = true; } + else if (arg == "-fp" || arg == "--font-path") { params.font_path = ARGV_NEXT; } + else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; } + else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; } + else if (arg == "-ojf" || arg == "--output-json-full") { params.output_jsn_full = params.output_jsn = true; } + else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(ARGV_NEXT); } + else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; } + else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } + else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; } + else if ( arg == "--print-confidence") { params.print_confidence= true; } + else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; } + else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; } + else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(ARGV_NEXT); } + else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; } + else if ( arg == "--prompt") { params.prompt = ARGV_NEXT; } + else if ( arg == "--carry-initial-prompt") { params.carry_initial_prompt = true; } + else if (arg == "-m" || arg == "--model") { params.model = ARGV_NEXT; } + else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(ARGV_NEXT); } + else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = ARGV_NEXT; } + else if (arg == "-dtw" || arg == "--dtw") { params.dtw = ARGV_NEXT; } + else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } + else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; } + else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; } + else if ( arg == "--suppress-regex") { params.suppress_regex = ARGV_NEXT; } + else if ( arg == "--grammar") { params.grammar = ARGV_NEXT; } + else if ( arg == "--grammar-rule") { params.grammar_rule = ARGV_NEXT; } + else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(ARGV_NEXT); } // Voice Activity Detection (VAD) else if ( arg == "--vad") { params.vad = true; } else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; } @@ -223,60 +225,61 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n"); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help [default] show this help message and exit\n"); - fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads); - fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors); - fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms); - fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n); - fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms); - fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context); - fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len); - fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false"); - fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of); - fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size); - fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); - fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold); - fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold); - fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold); - fprintf(stderr, " -nth N, --no-speech-thold N [%-7.2f] no speech threshold\n", params.no_speech_thold); - fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature); - fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc); - fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false"); - fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); - fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); - fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false"); - fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false"); - fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false"); - fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false"); - fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false"); - fprintf(stderr, " -olrc, --output-lrc [%-7s] output result in a lrc file\n", params.output_lrc ? "true" : "false"); - fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false"); - fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str()); - fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false"); - fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false"); - fprintf(stderr, " -ojf, --output-json-full [%-7s] include more information in the JSON file\n", params.output_jsn_full ? "true" : "false"); - fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", ""); - fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false"); - fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); - fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false"); - fprintf(stderr, " --print-confidence [%-7s] print confidence\n", params.print_confidence ? "true" : "false"); - fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false"); - fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false"); - fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str()); - fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false"); - fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt (max n_text_ctx/2 tokens)\n", params.prompt.c_str()); - fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); - fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input audio file path\n", ""); - fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str()); - fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str()); - fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false"); - fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); - fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false"); - fprintf(stderr, " -sns, --suppress-nst [%-7s] suppress non-speech tokens\n", params.suppress_nst ? "true" : "false"); - fprintf(stderr, " --suppress-regex REGEX [%-7s] regular expression matching tokens to suppress\n", params.suppress_regex.c_str()); - fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str()); - fprintf(stderr, " --grammar-rule RULE [%-7s] top-level GBNF grammar rule name\n", params.grammar_rule.c_str()); - fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty); + fprintf(stderr, " -h, --help [default] show this help message and exit\n"); + fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads); + fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors); + fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms); + fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n); + fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms); + fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context); + fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len); + fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false"); + fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of); + fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size); + fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); + fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold); + fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold); + fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold); + fprintf(stderr, " -nth N, --no-speech-thold N [%-7.2f] no speech threshold\n", params.no_speech_thold); + fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature); + fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc); + fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false"); + fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); + fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); + fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false"); + fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false"); + fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false"); + fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false"); + fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false"); + fprintf(stderr, " -olrc, --output-lrc [%-7s] output result in a lrc file\n", params.output_lrc ? "true" : "false"); + fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false"); + fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str()); + fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false"); + fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false"); + fprintf(stderr, " -ojf, --output-json-full [%-7s] include more information in the JSON file\n", params.output_jsn_full ? "true" : "false"); + fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", ""); + fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false"); + fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); + fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false"); + fprintf(stderr, " --print-confidence [%-7s] print confidence\n", params.print_confidence ? "true" : "false"); + fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false"); + fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false"); + fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str()); + fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false"); + fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt (max n_text_ctx/2 tokens)\n", params.prompt.c_str()); + fprintf(stderr, " --carry-initial-prompt [%-7s] always prepend initial prompt\n", params.carry_initial_prompt ? "true" : "false"); + fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); + fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input audio file path\n", ""); + fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str()); + fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str()); + fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false"); + fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); + fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false"); + fprintf(stderr, " -sns, --suppress-nst [%-7s] suppress non-speech tokens\n", params.suppress_nst ? "true" : "false"); + fprintf(stderr, " --suppress-regex REGEX [%-7s] regular expression matching tokens to suppress\n", params.suppress_regex.c_str()); + fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str()); + fprintf(stderr, " --grammar-rule RULE [%-7s] top-level GBNF grammar rule name\n", params.grammar_rule.c_str()); + fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty); // Voice Activity Detection (VAD) parameters fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n"); fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false"); @@ -1176,7 +1179,8 @@ int main(int argc, char ** argv) { wparams.suppress_regex = params.suppress_regex.empty() ? nullptr : params.suppress_regex.c_str(); - wparams.initial_prompt = params.prompt.c_str(); + wparams.initial_prompt = params.prompt.c_str(); + wparams.carry_initial_prompt = params.carry_initial_prompt; wparams.greedy.best_of = params.best_of; wparams.beam_search.beam_size = params.beam_size; diff --git a/include/whisper.h b/include/whisper.h index fcd756a9fe2..f4cc6bf7abd 100644 --- a/include/whisper.h +++ b/include/whisper.h @@ -525,6 +525,7 @@ extern "C" { // use whisper_tokenize() to convert text to tokens // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224) const char * initial_prompt; + bool carry_initial_prompt; // if true, always prepend initial_prompt to every decode window (may reduce conditioning on previous text) const whisper_token * prompt_tokens; int prompt_n_tokens; diff --git a/src/whisper.cpp b/src/whisper.cpp index 52de68c2b12..6e402bb052d 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -5952,9 +5952,10 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str /* suppress_regex =*/ nullptr, - /*.initial_prompt =*/ nullptr, - /*.prompt_tokens =*/ nullptr, - /*.prompt_n_tokens =*/ 0, + /*.initial_prompt =*/ nullptr, + /*.carry_initial_prompt =*/ false, + /*.prompt_tokens =*/ nullptr, + /*.prompt_n_tokens =*/ 0, /*.language =*/ "en", /*.detect_language =*/ false, @@ -6913,6 +6914,7 @@ int whisper_full_with_state( } // prepare prompt + std::vector initial_prompt_tokens; // persistent for carry_initial_prompt { std::vector prompt_tokens; @@ -6927,6 +6929,9 @@ int whisper_full_with_state( prompt_tokens.resize(n_needed); params.prompt_tokens = prompt_tokens.data(); params.prompt_n_tokens = prompt_tokens.size(); + if (params.carry_initial_prompt) { + initial_prompt_tokens = prompt_tokens; // copy for reuse + } } // prepend the prompt tokens to the prompt_past @@ -6937,6 +6942,10 @@ int whisper_full_with_state( } std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end()); } + + if (initial_prompt_tokens.empty() && params.carry_initial_prompt && params.prompt_tokens && params.prompt_n_tokens > 0) { + initial_prompt_tokens.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens); + } } // overwrite audio_ctx, max allowed is hparams.n_audio_ctx @@ -6992,6 +7001,7 @@ int whisper_full_with_state( std::vector beam_candidates; // main loop + bool first_iter_with_prompt = true; // track first decode iteration for carry_initial_prompt logic while (true) { if (params.progress_callback) { const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start); @@ -7083,16 +7093,43 @@ int whisper_full_with_state( prompt.clear(); // if we have already generated some text, use it as a prompt to condition the next generation - if (!prompt_past.empty() && t_cur < 0.5f && params.n_max_text_ctx > 0) { - int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size())); - + if (( (!prompt_past.empty()) || (params.carry_initial_prompt && !initial_prompt_tokens.empty() && !first_iter_with_prompt) ) + && t_cur < 0.5f && params.n_max_text_ctx > 0) { + int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2); prompt = { whisper_token_prev(ctx) }; - prompt.insert(prompt.begin() + 1, prompt_past.end() - n_take, prompt_past.end()); + if (params.carry_initial_prompt) { + if (first_iter_with_prompt) { + // behave like non-carry on first chunk to avoid duplication + int n_take = std::min(max_ctx_half, (int)prompt_past.size()); + prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end()); + } else { + std::vector ipt = initial_prompt_tokens; + if (!ipt.empty()) { + if ((int)ipt.size() > max_ctx_half - 1) { + ipt.erase(ipt.begin(), ipt.begin() + (ipt.size() - (max_ctx_half - 1))); + } + prompt.insert(prompt.end(), ipt.begin(), ipt.end()); + } + int remaining_budget = max_ctx_half - (int)ipt.size(); + if (remaining_budget > 0) { + int n_take = std::min(remaining_budget, (int)prompt_past.size()); + if (n_take > 0) { + prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end()); + } + } + } + } else { + int n_take = std::min(max_ctx_half, (int)prompt_past.size()); + prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end()); + } } // init new transcription with sot, language (opt) and task tokens prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end()); + // mark first iteration done + first_iter_with_prompt = false; + // print the prompt WHISPER_LOG_DEBUG("\n\n"); for (int i = 0; i < (int) prompt.size(); i++) { @@ -7572,7 +7609,15 @@ int whisper_full_with_state( // update prompt_past prompt_past.clear(); if (prompt.front() == whisper_token_prev(ctx)) { - prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end() - prompt_init.size()); + auto start_it = prompt.begin() + 1; + if (params.carry_initial_prompt && params.prompt_n_tokens > 0) { + // skip initial prompt tokens to avoid accumulating duplicates + int n_ip = params.prompt_n_tokens; + if (prompt.end() - start_it > n_ip) { + start_it += n_ip; + } + } + prompt_past.insert(prompt_past.end(), start_it, prompt.end() - prompt_init.size()); } for (int i = 0; i < result_len && !is_no_speech; ++i) { From 02714dddde092735de2e6a77ea97b18e8d612b08 Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Mon, 8 Sep 2025 13:01:58 +0200 Subject: [PATCH 02/14] PR fixes for ruby and go --- bindings/go/params.go | 10 ++++-- bindings/ruby/ext/ruby_whisper_params.c | 46 ++++++++++++------------- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/bindings/go/params.go b/bindings/go/params.go index e1d54b266aa..d8dee57e331 100644 --- a/bindings/go/params.go +++ b/bindings/go/params.go @@ -47,9 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) { p.print_timestamps = toBool(v) } -func (p *Params) SetCarryInitialPrompt(v bool) { - p.carry_initial_prompt = toBool(v) -} // Set language id func (p *Params) SetLanguage(lang int) error { @@ -150,6 +147,10 @@ func (p *Params) SetInitialPrompt(prompt string) { p.initial_prompt = C.CString(prompt) } +func (p *Params) SetCarryInitialPrompt(v bool) { + p.carry_initial_prompt = toBool(v) +} + /////////////////////////////////////////////////////////////////////////////// // PRIVATE METHODS @@ -203,6 +204,9 @@ func (p *Params) String() string { if p.token_timestamps { str += " token_timestamps" } + if p.carry_initial_prompt { + str += " carry_initial_prompt" + } return str + ">" } diff --git a/bindings/ruby/ext/ruby_whisper_params.c b/bindings/ruby/ext/ruby_whisper_params.c index 670f9351375..70417cb1664 100644 --- a/bindings/ruby/ext/ruby_whisper_params.c +++ b/bindings/ruby/ext/ruby_whisper_params.c @@ -1325,29 +1325,29 @@ init_ruby_whisper_params(VALUE *mWhisper) DEFINE_PARAM(max_len, 11) DEFINE_PARAM(split_on_word, 12) DEFINE_PARAM(initial_prompt, 13) - DEFINE_PARAM(carry_initial_prompt, 36) - DEFINE_PARAM(diarize, 14) - DEFINE_PARAM(offset, 15) - DEFINE_PARAM(duration, 16) - DEFINE_PARAM(max_text_tokens, 17) - DEFINE_PARAM(temperature, 18) - DEFINE_PARAM(max_initial_ts, 19) - DEFINE_PARAM(length_penalty, 20) - DEFINE_PARAM(temperature_inc, 21) - DEFINE_PARAM(entropy_thold, 22) - DEFINE_PARAM(logprob_thold, 23) - DEFINE_PARAM(no_speech_thold, 24) - DEFINE_PARAM(new_segment_callback, 25) - DEFINE_PARAM(new_segment_callback_user_data, 26) - DEFINE_PARAM(progress_callback, 27) - DEFINE_PARAM(progress_callback_user_data, 28) - DEFINE_PARAM(encoder_begin_callback, 29) - DEFINE_PARAM(encoder_begin_callback_user_data, 30) - DEFINE_PARAM(abort_callback, 31) - DEFINE_PARAM(abort_callback_user_data, 32) - DEFINE_PARAM(vad, 33) - DEFINE_PARAM(vad_model_path, 34) - DEFINE_PARAM(vad_params, 35) + DEFINE_PARAM(carry_initial_prompt, 14) + DEFINE_PARAM(diarize, 15) + DEFINE_PARAM(offset, 16) + DEFINE_PARAM(duration, 17) + DEFINE_PARAM(max_text_tokens, 18) + DEFINE_PARAM(temperature, 19) + DEFINE_PARAM(max_initial_ts, 20) + DEFINE_PARAM(length_penalty, 21) + DEFINE_PARAM(temperature_inc, 22) + DEFINE_PARAM(entropy_thold, 23) + DEFINE_PARAM(logprob_thold, 24) + DEFINE_PARAM(no_speech_thold, 25) + DEFINE_PARAM(new_segment_callback, 26) + DEFINE_PARAM(new_segment_callback_user_data, 27) + DEFINE_PARAM(progress_callback, 28) + DEFINE_PARAM(progress_callback_user_data, 29) + DEFINE_PARAM(encoder_begin_callback, 30) + DEFINE_PARAM(encoder_begin_callback_user_data, 31) + DEFINE_PARAM(abort_callback, 32) + DEFINE_PARAM(abort_callback_user_data, 33) + DEFINE_PARAM(vad, 34) + DEFINE_PARAM(vad_model_path, 35) + DEFINE_PARAM(vad_params, 36) rb_define_method(cParams, "on_new_segment", ruby_whisper_params_on_new_segment, 0); rb_define_method(cParams, "on_progress", ruby_whisper_params_on_progress, 0); From e7468c2949107407fe8541a9356cd3b0716c8d14 Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Tue, 9 Sep 2025 08:59:26 +0200 Subject: [PATCH 03/14] Refactoring for readability --- .../ggerganov/whispercpp/params/WhisperFullParams.java | 2 +- src/whisper.cpp | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java index 2decd5a2ef2..76ce80fb4cc 100644 --- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java +++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java @@ -333,7 +333,7 @@ public void setLogitsFilterCallback(WhisperLogitsFilterCallback callback) { @Override protected List getFieldOrder() { - return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", + return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate", "no_context", "no_timestamps", "single_segment", "print_special", "print_progress", "print_realtime", "print_timestamps", diff --git a/src/whisper.cpp b/src/whisper.cpp index 6e402bb052d..179df2a4e77 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -7093,8 +7093,12 @@ int whisper_full_with_state( prompt.clear(); // if we have already generated some text, use it as a prompt to condition the next generation - if (( (!prompt_past.empty()) || (params.carry_initial_prompt && !initial_prompt_tokens.empty() && !first_iter_with_prompt) ) - && t_cur < 0.5f && params.n_max_text_ctx > 0) { + const bool has_past_text = !prompt_past.empty(); + const bool carrying_initial_prompt_now = params.carry_initial_prompt && !initial_prompt_tokens.empty() && !first_iter_with_prompt; + // We only condition on previous text at lower temperatures and when a context limit is set + const bool allow_conditioning = (t_cur < 0.5f) && (params.n_max_text_ctx > 0); + + if ((has_past_text || carrying_initial_prompt_now) && allow_conditioning) { int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2); prompt = { whisper_token_prev(ctx) }; if (params.carry_initial_prompt) { From 8be27dc4458b0459f6db551e32aa5f8c29b960b6 Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Wed, 8 Oct 2025 12:05:21 +0200 Subject: [PATCH 04/14] WIP 1 --- src/whisper.cpp | 94 ++++++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 48 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 046fab9dba7..04f1466405e 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -880,7 +880,10 @@ struct whisper_state { std::vector logits; std::vector result_all; - std::vector prompt_past; + + // prompt history split into static prefix (prompt_past0) and dynamic rolling context (prompt_past1) + std::vector prompt_past0; // static carried initial prompt (if enabled) + std::vector prompt_past1; // dynamic context from decoded output int lang_id = 0; // english by default @@ -6875,10 +6878,12 @@ int whisper_full_with_state( decoder.rng = std::mt19937(j); } - // the accumulated text context so far - auto & prompt_past = state->prompt_past; + // the accumulated text context split into static (prompt_past0) and dynamic (prompt_past1) + auto & prompt_past0 = state->prompt_past0; + auto & prompt_past1 = state->prompt_past1; if (params.no_context) { - prompt_past.clear(); + prompt_past0.clear(); + prompt_past1.clear(); } // prepare prompt @@ -6902,15 +6907,16 @@ int whisper_full_with_state( } } - // prepend the prompt tokens to the prompt_past + // store initial prompt in prompt_past0 if carrying, else treat as part of dynamic context if (params.prompt_tokens && params.prompt_n_tokens > 0) { - // parse tokens from the pointer - for (int i = 0; i < params.prompt_n_tokens; i++) { - prompt_past.push_back(params.prompt_tokens[i]); + if (params.carry_initial_prompt) { + if (prompt_past0.empty()) { + prompt_past0.insert(prompt_past0.end(), params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens); + } + } else { + prompt_past1.insert(prompt_past1.end(), params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens); } - std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end()); } - if (initial_prompt_tokens.empty() && params.carry_initial_prompt && params.prompt_tokens && params.prompt_n_tokens > 0) { initial_prompt_tokens.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens); } @@ -6999,7 +7005,7 @@ int whisper_full_with_state( // if there is a very short audio segment left to process, we remove any past prompt since it tends // to confuse the decoder and often make it repeat or hallucinate stuff if (seek > seek_start && seek + 500 >= seek_end) { - prompt_past.clear(); + prompt_past1.clear(); } int best_decoder_id = 0; @@ -7061,38 +7067,35 @@ int whisper_full_with_state( prompt.clear(); // if we have already generated some text, use it as a prompt to condition the next generation - const bool has_past_text = !prompt_past.empty(); - const bool carrying_initial_prompt_now = params.carry_initial_prompt && !initial_prompt_tokens.empty() && !first_iter_with_prompt; + const bool has_past_text = !prompt_past1.empty(); + const bool carrying_initial_prompt_now = params.carry_initial_prompt && !prompt_past0.empty() && !first_iter_with_prompt; // We only condition on previous text at lower temperatures and when a context limit is set const bool allow_conditioning = (t_cur < 0.5f) && (params.n_max_text_ctx > 0); if ((has_past_text || carrying_initial_prompt_now) && allow_conditioning) { int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2); prompt = { whisper_token_prev(ctx) }; - if (params.carry_initial_prompt) { - if (first_iter_with_prompt) { - // behave like non-carry on first chunk to avoid duplication - int n_take = std::min(max_ctx_half, (int)prompt_past.size()); - prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end()); - } else { - std::vector ipt = initial_prompt_tokens; - if (!ipt.empty()) { - if ((int)ipt.size() > max_ctx_half - 1) { - ipt.erase(ipt.begin(), ipt.begin() + (ipt.size() - (max_ctx_half - 1))); - } - prompt.insert(prompt.end(), ipt.begin(), ipt.end()); - } - int remaining_budget = max_ctx_half - (int)ipt.size(); - if (remaining_budget > 0) { - int n_take = std::min(remaining_budget, (int)prompt_past.size()); - if (n_take > 0) { - prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end()); - } + if (params.carry_initial_prompt && !prompt_past0.empty() && !first_iter_with_prompt) { + int budget = max_ctx_half; + prompt.push_back(whisper_token_prev(ctx)); + int take0 = std::min(budget - 1, (int)prompt_past0.size()); + if (take0 > 0) { + auto start0 = take0 < (int)prompt_past0.size() ? prompt_past0.end() - take0 : prompt_past0.begin(); + prompt.insert(prompt.end(), start0, prompt_past0.end()); + } + int remaining = budget - take0; + if (remaining > 0) { + int take1 = std::min(remaining, (int)prompt_past1.size()); + if (take1 > 0) { + prompt.insert(prompt.end(), prompt_past1.end() - take1, prompt_past1.end()); } } } else { - int n_take = std::min(max_ctx_half, (int)prompt_past.size()); - prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end()); + int n_take = std::min(max_ctx_half, (int)prompt_past1.size()); + if (n_take > 0) { + prompt = { whisper_token_prev(ctx) }; + prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end()); + } } } @@ -7578,22 +7581,17 @@ int whisper_full_with_state( //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta); - // update prompt_past - prompt_past.clear(); - if (prompt.front() == whisper_token_prev(ctx)) { - auto start_it = prompt.begin() + 1; - if (params.carry_initial_prompt && params.prompt_n_tokens > 0) { - // skip initial prompt tokens to avoid accumulating duplicates - int n_ip = params.prompt_n_tokens; - if (prompt.end() - start_it > n_ip) { - start_it += n_ip; - } + if (!params.carry_initial_prompt) { + prompt_past1.clear(); + if (!prompt.empty() && prompt.front() == whisper_token_prev(ctx)) { + auto start_it = prompt.begin() + 1; + prompt_past1.insert(prompt_past1.end(), start_it, prompt.end() - prompt_init.size()); } - prompt_past.insert(prompt_past.end(), start_it, prompt.end() - prompt_init.size()); } - - for (int i = 0; i < result_len && !is_no_speech; ++i) { - prompt_past.push_back(tokens_cur[i].id); + if (!is_no_speech) { + for (int i = 0; i < result_len; ++i) { + prompt_past1.push_back(tokens_cur[i].id); + } } if (!tokens_cur.empty() && ctx->model.n_loaded > 0 && !is_no_speech) { From 8abf7d96a87f5e918e1822487d8e81f7fe6fd200 Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Wed, 8 Oct 2025 12:05:30 +0200 Subject: [PATCH 05/14] WIP 2 --- src/whisper.cpp | 84 +++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 04f1466405e..89efd9af049 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -138,6 +138,10 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text } while (0) #define WHISPER_MAX_DECODERS 8 + +// temperature below which we condition on past text history +static constexpr float WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF = 0.5f; + #define WHISPER_MAX_NODES 4096 static std::string format(const char * fmt, ...) { @@ -6887,11 +6891,8 @@ int whisper_full_with_state( } // prepare prompt - std::vector initial_prompt_tokens; // persistent for carry_initial_prompt { std::vector prompt_tokens; - - // initial prompt if (!params.prompt_tokens && params.initial_prompt) { prompt_tokens.resize(1024); int n_needed = whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size()); @@ -6902,12 +6903,7 @@ int whisper_full_with_state( prompt_tokens.resize(n_needed); params.prompt_tokens = prompt_tokens.data(); params.prompt_n_tokens = prompt_tokens.size(); - if (params.carry_initial_prompt) { - initial_prompt_tokens = prompt_tokens; // copy for reuse - } } - - // store initial prompt in prompt_past0 if carrying, else treat as part of dynamic context if (params.prompt_tokens && params.prompt_n_tokens > 0) { if (params.carry_initial_prompt) { if (prompt_past0.empty()) { @@ -6917,9 +6913,6 @@ int whisper_full_with_state( prompt_past1.insert(prompt_past1.end(), params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens); } } - if (initial_prompt_tokens.empty() && params.carry_initial_prompt && params.prompt_tokens && params.prompt_n_tokens > 0) { - initial_prompt_tokens.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens); - } } // overwrite audio_ctx, max allowed is hparams.n_audio_ctx @@ -6975,7 +6968,7 @@ int whisper_full_with_state( std::vector beam_candidates; // main loop - bool first_iter_with_prompt = true; // track first decode iteration for carry_initial_prompt logic + bool first_history_iter = true; // track first decode iteration for carry_initial_prompt logic while (true) { if (params.progress_callback) { const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start); @@ -7066,36 +7059,45 @@ int whisper_full_with_state( { prompt.clear(); - // if we have already generated some text, use it as a prompt to condition the next generation - const bool has_past_text = !prompt_past1.empty(); - const bool carrying_initial_prompt_now = params.carry_initial_prompt && !prompt_past0.empty() && !first_iter_with_prompt; - // We only condition on previous text at lower temperatures and when a context limit is set - const bool allow_conditioning = (t_cur < 0.5f) && (params.n_max_text_ctx > 0); - - if ((has_past_text || carrying_initial_prompt_now) && allow_conditioning) { - int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2); - prompt = { whisper_token_prev(ctx) }; - if (params.carry_initial_prompt && !prompt_past0.empty() && !first_iter_with_prompt) { - int budget = max_ctx_half; - prompt.push_back(whisper_token_prev(ctx)); - int take0 = std::min(budget - 1, (int)prompt_past0.size()); - if (take0 > 0) { - auto start0 = take0 < (int)prompt_past0.size() ? prompt_past0.end() - take0 : prompt_past0.begin(); - prompt.insert(prompt.end(), start0, prompt_past0.end()); - } - int remaining = budget - take0; - if (remaining > 0) { - int take1 = std::min(remaining, (int)prompt_past1.size()); - if (take1 > 0) { - prompt.insert(prompt.end(), prompt_past1.end() - take1, prompt_past1.end()); + if (params.n_max_text_ctx > 0 && + t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) { + + const bool have_dynamic = !prompt_past1.empty(); + const bool can_carry_static = params.carry_initial_prompt && !prompt_past0.empty() && !first_history_iter; + + if (have_dynamic || can_carry_static) { + int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2); + if (max_ctx_half > 0) { + // Always start with previous token marker to connect continuity + prompt.push_back(whisper_token_prev(ctx)); + + if (can_carry_static) { + // Budget includes the prev token; we already consumed 1 slot. + int budget = max_ctx_half; // total allowed (including prev) + + // Take as many static tokens as fit (reserving at least the prev token already placed) + int take_static = std::min(budget - 1, (int) prompt_past0.size()); + if (take_static > 0) { + auto start0 = take_static < (int) prompt_past0.size() ? prompt_past0.end() - take_static : prompt_past0.begin(); + prompt.insert(prompt.end(), start0, prompt_past0.end()); + } + + // Remaining budget for dynamic tail + int remaining = budget - take_static; + if (remaining > 0) { + int take_dynamic = std::min(remaining, (int) prompt_past1.size()); + if (take_dynamic > 0) { + prompt.insert(prompt.end(), prompt_past1.end() - take_dynamic, prompt_past1.end()); + } + } + } else { + // Dynamic only path + int n_take = std::min(max_ctx_half, (int) prompt_past1.size()); + if (n_take > 0) { + prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end()); + } } } - } else { - int n_take = std::min(max_ctx_half, (int)prompt_past1.size()); - if (n_take > 0) { - prompt = { whisper_token_prev(ctx) }; - prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end()); - } } } @@ -7103,7 +7105,7 @@ int whisper_full_with_state( prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end()); // mark first iteration done - first_iter_with_prompt = false; + first_history_iter = false; // print the prompt WHISPER_LOG_DEBUG("\n\n"); From 44880cbdecf8bccbf45ce1924db1ec01db7754e0 Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Wed, 8 Oct 2025 15:31:38 +0200 Subject: [PATCH 06/14] PR fixes --- src/whisper.cpp | 76 ++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 89efd9af049..c94ee02feda 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -6893,6 +6893,8 @@ int whisper_full_with_state( // prepare prompt { std::vector prompt_tokens; + + // tokenize the initial prompt if (!params.prompt_tokens && params.initial_prompt) { prompt_tokens.resize(1024); int n_needed = whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size()); @@ -6907,10 +6909,13 @@ int whisper_full_with_state( if (params.prompt_tokens && params.prompt_n_tokens > 0) { if (params.carry_initial_prompt) { if (prompt_past0.empty()) { - prompt_past0.insert(prompt_past0.end(), params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens); + prompt_past0.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens); } } else { - prompt_past1.insert(prompt_past1.end(), params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens); + for (int i = 0; i < params.prompt_n_tokens; ++i) { + prompt_past1.push_back(params.prompt_tokens[i]); + } + std::rotate(prompt_past1.begin(), prompt_past1.end() - params.prompt_n_tokens, prompt_past1.end()); } } } @@ -6968,7 +6973,6 @@ int whisper_full_with_state( std::vector beam_candidates; // main loop - bool first_history_iter = true; // track first decode iteration for carry_initial_prompt logic while (true) { if (params.progress_callback) { const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start); @@ -7063,40 +7067,38 @@ int whisper_full_with_state( t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) { const bool have_dynamic = !prompt_past1.empty(); - const bool can_carry_static = params.carry_initial_prompt && !prompt_past0.empty() && !first_history_iter; - - if (have_dynamic || can_carry_static) { - int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2); - if (max_ctx_half > 0) { - // Always start with previous token marker to connect continuity - prompt.push_back(whisper_token_prev(ctx)); - - if (can_carry_static) { - // Budget includes the prev token; we already consumed 1 slot. - int budget = max_ctx_half; // total allowed (including prev) - - // Take as many static tokens as fit (reserving at least the prev token already placed) - int take_static = std::min(budget - 1, (int) prompt_past0.size()); - if (take_static > 0) { - auto start0 = take_static < (int) prompt_past0.size() ? prompt_past0.end() - take_static : prompt_past0.begin(); - prompt.insert(prompt.end(), start0, prompt_past0.end()); - } + const bool can_carry_static = params.carry_initial_prompt && !prompt_past0.empty() && seek != seek_start; + + int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2); + if (max_ctx_half > 0 && (have_dynamic || can_carry_static)) { + // Always start with previous token marker to connect continuity + prompt.push_back(whisper_token_prev(ctx)); + + if (can_carry_static) { + // Budget includes the prev token; we already consumed 1 slot. + int budget = max_ctx_half; // total allowed (including prev) + + // Take as many static tokens as fit (reserving at least the prev token already placed) + int take_static = std::min(budget - 1, (int) prompt_past0.size()); + if (take_static > 0) { + auto start0 = take_static < (int) prompt_past0.size() ? prompt_past0.end() - take_static : prompt_past0.begin(); + prompt.insert(prompt.end(), start0, prompt_past0.end()); + } - // Remaining budget for dynamic tail - int remaining = budget - take_static; - if (remaining > 0) { - int take_dynamic = std::min(remaining, (int) prompt_past1.size()); - if (take_dynamic > 0) { - prompt.insert(prompt.end(), prompt_past1.end() - take_dynamic, prompt_past1.end()); - } - } - } else { - // Dynamic only path - int n_take = std::min(max_ctx_half, (int) prompt_past1.size()); - if (n_take > 0) { - prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end()); + // Remaining budget for dynamic tail + int remaining = budget - take_static; + if (remaining > 0) { + int take_dynamic = std::min(remaining, (int) prompt_past1.size()); + if (take_dynamic > 0) { + prompt.insert(prompt.end(), prompt_past1.end() - take_dynamic, prompt_past1.end()); } } + } else { + // Dynamic only path + int n_take = std::min(max_ctx_half, (int) prompt_past1.size()); + if (n_take > 0) { + prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end()); + } } } } @@ -7104,9 +7106,6 @@ int whisper_full_with_state( // init new transcription with sot, language (opt) and task tokens prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end()); - // mark first iteration done - first_history_iter = false; - // print the prompt WHISPER_LOG_DEBUG("\n\n"); for (int i = 0; i < (int) prompt.size(); i++) { @@ -7586,8 +7585,7 @@ int whisper_full_with_state( if (!params.carry_initial_prompt) { prompt_past1.clear(); if (!prompt.empty() && prompt.front() == whisper_token_prev(ctx)) { - auto start_it = prompt.begin() + 1; - prompt_past1.insert(prompt_past1.end(), start_it, prompt.end() - prompt_init.size()); + prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size()); } } if (!is_no_speech) { From e42cbedb4626a63fc07fd32bb3494f48f86f3d9d Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Wed, 8 Oct 2025 18:03:26 +0200 Subject: [PATCH 07/14] More PR fixes --- src/whisper.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index c94ee02feda..c83e995be7b 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -7582,12 +7582,12 @@ int whisper_full_with_state( //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta); - if (!params.carry_initial_prompt) { - prompt_past1.clear(); - if (!prompt.empty() && prompt.front() == whisper_token_prev(ctx)) { - prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size()); - } + // update prompt_past1 + prompt_past1.clear(); + if (!prompt.empty() && prompt.front() == whisper_token_prev(ctx)) { + prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size()); } + if (!is_no_speech) { for (int i = 0; i < result_len; ++i) { prompt_past1.push_back(tokens_cur[i].id); From ee5adba8de62067eb3bf817c7080a2dd608c8ff6 Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Fri, 10 Oct 2025 09:33:08 +0200 Subject: [PATCH 08/14] PR fix --- src/whisper.cpp | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index c83e995be7b..f851a4d155d 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -7063,42 +7063,27 @@ int whisper_full_with_state( { prompt.clear(); - if (params.n_max_text_ctx > 0 && - t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) { + if (params.n_max_text_ctx > 0 && t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) { + const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty() && seek != seek_start; + const bool can_take1 = !prompt_past1.empty(); - const bool have_dynamic = !prompt_past1.empty(); - const bool can_carry_static = params.carry_initial_prompt && !prompt_past0.empty() && seek != seek_start; - - int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2); - if (max_ctx_half > 0 && (have_dynamic || can_carry_static)) { + int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx) / 2); + if (max_ctx_half > 0 && (can_take0 || can_take1)) { // Always start with previous token marker to connect continuity prompt.push_back(whisper_token_prev(ctx)); - if (can_carry_static) { - // Budget includes the prev token; we already consumed 1 slot. - int budget = max_ctx_half; // total allowed (including prev) - - // Take as many static tokens as fit (reserving at least the prev token already placed) - int take_static = std::min(budget - 1, (int) prompt_past0.size()); - if (take_static > 0) { - auto start0 = take_static < (int) prompt_past0.size() ? prompt_past0.end() - take_static : prompt_past0.begin(); + int n_take0 = 0; + if (can_take0) { + n_take0 = std::min(max_ctx_half - 1, prompt_past0.size()); + if (n_take0 > 0) { + auto start0 = n_take0 < (int)prompt_past0.size() ? prompt_past0.end() - n_take0 : prompt_past0.begin(); prompt.insert(prompt.end(), start0, prompt_past0.end()); } + } - // Remaining budget for dynamic tail - int remaining = budget - take_static; - if (remaining > 0) { - int take_dynamic = std::min(remaining, (int) prompt_past1.size()); - if (take_dynamic > 0) { - prompt.insert(prompt.end(), prompt_past1.end() - take_dynamic, prompt_past1.end()); - } - } - } else { - // Dynamic only path - int n_take = std::min(max_ctx_half, (int) prompt_past1.size()); - if (n_take > 0) { - prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end()); - } + int n_take1 = std::min(max_ctx_half - n_take0 - 1, prompt_past1.size()); + if (n_take1 > 0) { + prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end()); } } } From 6417091b52e02fee0a87b5730168d9f0bd056144 Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Fri, 10 Oct 2025 09:35:07 +0200 Subject: [PATCH 09/14] Further simplification --- src/whisper.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index f851a4d155d..065d606fe02 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -7067,24 +7067,21 @@ int whisper_full_with_state( const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty() && seek != seek_start; const bool can_take1 = !prompt_past1.empty(); - int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx) / 2); + const int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx) / 2); if (max_ctx_half > 0 && (can_take0 || can_take1)) { // Always start with previous token marker to connect continuity prompt.push_back(whisper_token_prev(ctx)); + // Take static tokens (initial prompt) first, up to budget minus the prev token int n_take0 = 0; if (can_take0) { n_take0 = std::min(max_ctx_half - 1, prompt_past0.size()); - if (n_take0 > 0) { - auto start0 = n_take0 < (int)prompt_past0.size() ? prompt_past0.end() - n_take0 : prompt_past0.begin(); - prompt.insert(prompt.end(), start0, prompt_past0.end()); - } + prompt.insert(prompt.end(), prompt_past0.end() - n_take0, prompt_past0.end()); } - int n_take1 = std::min(max_ctx_half - n_take0 - 1, prompt_past1.size()); - if (n_take1 > 0) { - prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end()); - } + // Fill remaining budget with dynamic tokens (rolling context) + const int n_take1 = std::min(max_ctx_half - n_take0 - 1, prompt_past1.size()); + prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end()); } } From bd4856160345db1d4c462abccd0ca7dfd11fea3b Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Fri, 10 Oct 2025 10:50:00 +0200 Subject: [PATCH 10/14] d'oh --- src/whisper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 065d606fe02..d8887aab648 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -7064,7 +7064,7 @@ int whisper_full_with_state( prompt.clear(); if (params.n_max_text_ctx > 0 && t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) { - const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty() && seek != seek_start; + const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty(); const bool can_take1 = !prompt_past1.empty(); const int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx) / 2); From 037b419a537f491f21a2e8ae930252c37c8b165c Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Fri, 10 Oct 2025 11:07:08 +0200 Subject: [PATCH 11/14] One more logic fix --- src/whisper.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index d8887aab648..be4371fda63 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -7566,10 +7566,11 @@ int whisper_full_with_state( // update prompt_past1 prompt_past1.clear(); - if (!prompt.empty() && prompt.front() == whisper_token_prev(ctx)) { + if (!params.carry_initial_prompt && !prompt.empty() && prompt.front() == whisper_token_prev(ctx)) { prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size()); } + // Add newly decoded tokens to the rolling context if (!is_no_speech) { for (int i = 0; i < result_len; ++i) { prompt_past1.push_back(tokens_cur[i].id); From f6139dfacc12ff65e51e804608b0d7cba463916c Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Fri, 10 Oct 2025 11:35:15 +0200 Subject: [PATCH 12/14] Update src/whisper.cpp Co-authored-by: Georgi Gerganov --- src/whisper.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/whisper.cpp b/src/whisper.cpp index be4371fda63..cca68194469 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -7002,6 +7002,7 @@ int whisper_full_with_state( // if there is a very short audio segment left to process, we remove any past prompt since it tends // to confuse the decoder and often make it repeat or hallucinate stuff if (seek > seek_start && seek + 500 >= seek_end) { + prompt_past0.clear(); prompt_past1.clear(); } From c86f3c879adc3558d94ce8f5e1de1576718300e9 Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Fri, 10 Oct 2025 12:54:26 +0200 Subject: [PATCH 13/14] Truncate prompt_past0 upon initialization --- src/whisper.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index cca68194469..24d7a260342 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -6890,6 +6890,9 @@ int whisper_full_with_state( prompt_past1.clear(); } + // calculate the maximum context budget for prompt history + const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2); + // prepare prompt { std::vector prompt_tokens; @@ -6909,7 +6912,15 @@ int whisper_full_with_state( if (params.prompt_tokens && params.prompt_n_tokens > 0) { if (params.carry_initial_prompt) { if (prompt_past0.empty()) { - prompt_past0.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens); + const int max_tokens = std::max(1, max_prompt_ctx - 1); + + if (params.prompt_n_tokens > max_tokens) { + WHISPER_LOG_WARN("%s: initial prompt is too long (%d tokens), will use only the last %d tokens\n", + __func__, params.prompt_n_tokens, max_tokens); + } + + const int n_tokens = std::min(params.prompt_n_tokens, max_tokens); + prompt_past0.assign(params.prompt_tokens + (params.prompt_n_tokens - n_tokens), params.prompt_tokens + params.prompt_n_tokens); } } else { for (int i = 0; i < params.prompt_n_tokens; ++i) { @@ -7068,20 +7079,19 @@ int whisper_full_with_state( const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty(); const bool can_take1 = !prompt_past1.empty(); - const int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx) / 2); - if (max_ctx_half > 0 && (can_take0 || can_take1)) { + if (max_prompt_ctx > 0 && (can_take0 || can_take1)) { // Always start with previous token marker to connect continuity prompt.push_back(whisper_token_prev(ctx)); // Take static tokens (initial prompt) first, up to budget minus the prev token int n_take0 = 0; if (can_take0) { - n_take0 = std::min(max_ctx_half - 1, prompt_past0.size()); + n_take0 = std::min(max_prompt_ctx - 1, prompt_past0.size()); prompt.insert(prompt.end(), prompt_past0.end() - n_take0, prompt_past0.end()); } // Fill remaining budget with dynamic tokens (rolling context) - const int n_take1 = std::min(max_ctx_half - n_take0 - 1, prompt_past1.size()); + const int n_take1 = std::min(max_prompt_ctx - n_take0 - 1, prompt_past1.size()); prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end()); } } From 41df41b948c5047b655fb62fb44537f13f7a9638 Mon Sep 17 00:00:00 2001 From: Andreas Lubbe Date: Fri, 10 Oct 2025 13:00:58 +0200 Subject: [PATCH 14/14] Slight simplification --- src/whisper.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 24d7a260342..6f261f800af 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -7083,10 +7083,10 @@ int whisper_full_with_state( // Always start with previous token marker to connect continuity prompt.push_back(whisper_token_prev(ctx)); - // Take static tokens (initial prompt) first, up to budget minus the prev token + // Take static tokens (initial prompt) first int n_take0 = 0; if (can_take0) { - n_take0 = std::min(max_prompt_ctx - 1, prompt_past0.size()); + n_take0 = prompt_past0.size(); prompt.insert(prompt.end(), prompt_past0.end() - n_take0, prompt_past0.end()); }