From 9fad5d5bb93d4a344692ed13e5b08e1dff1fb471 Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Thu, 28 Aug 2025 21:54:27 +0200
Subject: [PATCH 01/14] Add support for --carry-initial-prompt

---
 bindings/go/params.go                         |   4 +
 .../whispercpp/params/WhisperFullParams.java  |   8 +-
 bindings/ruby/ext/ruby_whisper_params.c       |  25 +-
 bindings/ruby/sig/whisper.rbs                 |   3 +
 bindings/ruby/test/test_params.rb             |   8 +
 examples/cli/cli.cpp                          | 220 +++++++++---------
 include/whisper.h                             |   1 +
 src/whisper.cpp                               |  61 ++++-
 8 files changed, 210 insertions(+), 120 deletions(-)
diff --git a/bindings/go/params.go b/bindings/go/params.go
index 95c5bfaf934..e1d54b266aa 100644
--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@@ -47,6 +47,10 @@ func (p *Params) SetPrintTimestamps(v bool) {
 	p.print_timestamps = toBool(v)
 }
 
+func (p *Params) SetCarryInitialPrompt(v bool) {
+	p.carry_initial_prompt = toBool(v)
+}
+
 // Set language id
 func (p *Params) SetLanguage(lang int) error {
 	if lang == -1 {
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
index 498ff126037..2decd5a2ef2 100644
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
@@ -157,6 +157,8 @@ public void tdrzEnable(boolean enable) {
     /** Tokens to provide to the whisper decoder as an initial prompt.
      * These are prepended to any existing text context from a previous call. */
     public String initial_prompt;
+    /** If true, always prepend initial_prompt to every decode window. */
+    public CBool carry_initial_prompt;
 
     /** Prompt tokens. (int*) */
     public Pointer prompt_tokens;
@@ -331,13 +333,13 @@ public void setLogitsFilterCallback(WhisperLogitsFilterCallback callback) {
 
     @Override
     protected List<String> getFieldOrder() {
-        return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
+    return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
                 "offset_ms", "duration_ms", "translate", "no_context",
                 "no_timestamps", "single_segment", "print_special",
                 "print_progress", "print_realtime", "print_timestamps",
                 "token_timestamps", "thold_pt", "thold_ptsum", "max_len",
-                "split_on_word", "max_tokens", "debug_mode", "audio_ctx", 
-                "tdrz_enable", "suppress_regex", "initial_prompt",
+                "split_on_word", "max_tokens", "debug_mode", "audio_ctx",
+                "tdrz_enable", "suppress_regex", "initial_prompt", "carry_initial_prompt",
                 "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
                 "suppress_blank", "suppress_nst", "temperature",
                 "max_initial_ts", "length_penalty", "temperature_inc",
diff --git a/bindings/ruby/ext/ruby_whisper_params.c b/bindings/ruby/ext/ruby_whisper_params.c
index 882c68d042f..670f9351375 100644
--- a/bindings/ruby/ext/ruby_whisper_params.c
+++ b/bindings/ruby/ext/ruby_whisper_params.c
@@ -26,7 +26,7 @@
   rb_define_method(cParams, #param_name, ruby_whisper_params_get_ ## param_name, 0); \
   rb_define_method(cParams, #param_name "=", ruby_whisper_params_set_ ## param_name, 1);
 
-#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 36
+#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 37
 
 extern VALUE cParams;
 extern VALUE cVADParams;
@@ -46,6 +46,7 @@ static ID id_print_special;
 static ID id_print_progress;
 static ID id_print_realtime;
 static ID id_print_timestamps;
+static ID id_carry_initial_prompt;
 static ID id_suppress_blank;
 static ID id_suppress_nst;
 static ID id_token_timestamps;
@@ -455,6 +456,26 @@ ruby_whisper_params_get_print_timestamps(VALUE self)
 {
   BOOL_PARAMS_GETTER(self, print_timestamps)
 }
+
+/*
+ *  call-seq:
+ *    carry_initial_prompt -> true or false
+ */
+static VALUE
+ruby_whisper_params_get_carry_initial_prompt(VALUE self)
+{
+  BOOL_PARAMS_GETTER(self, carry_initial_prompt)
+}
+
+/*
+ *  call-seq:
+ *    carry_initial_prompt = bool -> bool
+ */
+static VALUE
+ruby_whisper_params_set_carry_initial_prompt(VALUE self, VALUE value)
+{
+  BOOL_PARAMS_SETTER(self, carry_initial_prompt, value)
+}
 /*
  * call-seq:
  *   suppress_blank = force_suppress -> force_suppress
@@ -1168,6 +1189,7 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
       SET_PARAM_IF_SAME(max_len)
       SET_PARAM_IF_SAME(split_on_word)
       SET_PARAM_IF_SAME(initial_prompt)
+      SET_PARAM_IF_SAME(carry_initial_prompt)
       SET_PARAM_IF_SAME(offset)
       SET_PARAM_IF_SAME(duration)
       SET_PARAM_IF_SAME(max_text_tokens)
@@ -1303,6 +1325,7 @@ init_ruby_whisper_params(VALUE *mWhisper)
   DEFINE_PARAM(max_len, 11)
   DEFINE_PARAM(split_on_word, 12)
   DEFINE_PARAM(initial_prompt, 13)
+  DEFINE_PARAM(carry_initial_prompt, 36)
   DEFINE_PARAM(diarize, 14)
   DEFINE_PARAM(offset, 15)
   DEFINE_PARAM(duration, 16)
diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs
index 0489432a249..d5905dd7037 100644
--- a/bindings/ruby/sig/whisper.rbs
+++ b/bindings/ruby/sig/whisper.rbs
@@ -138,6 +138,7 @@ module Whisper
       ?max_len: Integer,
       ?split_on_word: boolish,
       ?initial_prompt: string | nil,
+      ?carry_initial_prompt: boolish,
       ?diarize: boolish,
       ?offset: Integer,
       ?duration: Integer,
@@ -236,6 +237,7 @@ module Whisper
     def split_on_word: () -> (true | false)
 
     def initial_prompt=: (_ToS) -> _ToS
+    def carry_initial_prompt=: (boolish) -> boolish
 
     # Tokens to provide to the whisper decoder as initial prompt
     # these are prepended to any existing text context from a previous call
@@ -243,6 +245,7 @@ module Whisper
     # Maximum of whisper_n_text_ctx()/2 tokens are used (typically 224).
     #
     def initial_prompt: () -> (String | nil)
+    def carry_initial_prompt: () -> (true | false)
 
     def diarize=: (boolish) -> boolish
 
diff --git a/bindings/ruby/test/test_params.rb b/bindings/ruby/test/test_params.rb
index d5c5d140e8c..4dd9780de7d 100644
--- a/bindings/ruby/test/test_params.rb
+++ b/bindings/ruby/test/test_params.rb
@@ -16,6 +16,7 @@ class TestParams < TestBase
     :max_len,
     :split_on_word,
     :initial_prompt,
+    :carry_initial_prompt,
     :diarize,
     :offset,
     :duration,
@@ -119,6 +120,13 @@ def test_print_timestamps
     assert !@params.print_timestamps
   end
 
+  def test_carry_initial_prompt
+    @params.carry_initial_prompt = true
+    assert @params.carry_initial_prompt
+    @params.carry_initial_prompt = false
+    assert !@params.carry_initial_prompt
+  end
+
   def test_suppress_blank
     @params.suppress_blank = true
     assert @params.suppress_blank
diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp
index f73ed9ae078..45516db614f 100644
--- a/examples/cli/cli.cpp
+++ b/examples/cli/cli.cpp
@@ -77,6 +77,7 @@ struct whisper_params {
     bool use_gpu         = true;
     bool flash_attn      = false;
     bool suppress_nst    = false;
+    bool carry_initial_prompt = false;
 
     std::string language  = "en";
     std::string prompt;
@@ -145,59 +146,60 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
             exit(0);
         }
         #define ARGV_NEXT (((i + 1) < argc) ? argv[++i] : requires_value_error(arg))
-        else if (arg == "-t"    || arg == "--threads")         { params.n_threads       = std::stoi(ARGV_NEXT); }
-        else if (arg == "-p"    || arg == "--processors")      { params.n_processors    = std::stoi(ARGV_NEXT); }
-        else if (arg == "-ot"   || arg == "--offset-t")        { params.offset_t_ms     = std::stoi(ARGV_NEXT); }
-        else if (arg == "-on"   || arg == "--offset-n")        { params.offset_n        = std::stoi(ARGV_NEXT); }
-        else if (arg == "-d"    || arg == "--duration")        { params.duration_ms     = std::stoi(ARGV_NEXT); }
-        else if (arg == "-mc"   || arg == "--max-context")     { params.max_context     = std::stoi(ARGV_NEXT); }
-        else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(ARGV_NEXT); }
-        else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(ARGV_NEXT); }
-        else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(ARGV_NEXT); }
-        else if (arg == "-ac"   || arg == "--audio-ctx")       { params.audio_ctx       = std::stoi(ARGV_NEXT); }
-        else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(ARGV_NEXT); }
-        else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(ARGV_NEXT); }
-        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(ARGV_NEXT); }
-        else if (arg == "-nth"  || arg == "--no-speech-thold") { params.no_speech_thold = std::stof(ARGV_NEXT); }
-        else if (arg == "-tp"   || arg == "--temperature")     { params.temperature     = std::stof(ARGV_NEXT); }
-        else if (arg == "-tpi"  || arg == "--temperature-inc") { params.temperature_inc = std::stof(ARGV_NEXT); }
-        else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
-        else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
-        else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
-        else if (arg == "-tdrz" || arg == "--tinydiarize")     { params.tinydiarize     = true; }
-        else if (arg == "-sow"  || arg == "--split-on-word")   { params.split_on_word   = true; }
-        else if (arg == "-nf"   || arg == "--no-fallback")     { params.no_fallback     = true; }
-        else if (arg == "-otxt" || arg == "--output-txt")      { params.output_txt      = true; }
-        else if (arg == "-ovtt" || arg == "--output-vtt")      { params.output_vtt      = true; }
-        else if (arg == "-osrt" || arg == "--output-srt")      { params.output_srt      = true; }
-        else if (arg == "-owts" || arg == "--output-words")    { params.output_wts      = true; }
-        else if (arg == "-olrc" || arg == "--output-lrc")      { params.output_lrc      = true; }
-        else if (arg == "-fp"   || arg == "--font-path")       { params.font_path       = ARGV_NEXT; }
-        else if (arg == "-ocsv" || arg == "--output-csv")      { params.output_csv      = true; }
-        else if (arg == "-oj"   || arg == "--output-json")     { params.output_jsn      = true; }
-        else if (arg == "-ojf"  || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; }
-        else if (arg == "-of"   || arg == "--output-file")     { params.fname_out.emplace_back(ARGV_NEXT); }
-        else if (arg == "-np"   || arg == "--no-prints")       { params.no_prints       = true; }
-        else if (arg == "-ps"   || arg == "--print-special")   { params.print_special   = true; }
-        else if (arg == "-pc"   || arg == "--print-colors")    { params.print_colors    = true; }
-        else if (                  arg == "--print-confidence"){ params.print_confidence= true; }
-        else if (arg == "-pp"   || arg == "--print-progress")  { params.print_progress  = true; }
-        else if (arg == "-nt"   || arg == "--no-timestamps")   { params.no_timestamps   = true; }
-        else if (arg == "-l"    || arg == "--language")        { params.language        = whisper_param_turn_lowercase(ARGV_NEXT); }
-        else if (arg == "-dl"   || arg == "--detect-language") { params.detect_language = true; }
-        else if (                  arg == "--prompt")          { params.prompt          = ARGV_NEXT; }
-        else if (arg == "-m"    || arg == "--model")           { params.model           = ARGV_NEXT; }
-        else if (arg == "-f"    || arg == "--file")            { params.fname_inp.emplace_back(ARGV_NEXT); }
-        else if (arg == "-oved" || arg == "--ov-e-device")     { params.openvino_encode_device = ARGV_NEXT; }
-        else if (arg == "-dtw"  || arg == "--dtw")             { params.dtw             = ARGV_NEXT; }
-        else if (arg == "-ls"   || arg == "--log-score")       { params.log_score       = true; }
-        else if (arg == "-ng"   || arg == "--no-gpu")          { params.use_gpu         = false; }
-        else if (arg == "-fa"   || arg == "--flash-attn")      { params.flash_attn      = true; }
-        else if (arg == "-sns"  || arg == "--suppress-nst")    { params.suppress_nst    = true; }
-        else if (                  arg == "--suppress-regex")  { params.suppress_regex  = ARGV_NEXT; }
-        else if (                  arg == "--grammar")         { params.grammar         = ARGV_NEXT; }
-        else if (                  arg == "--grammar-rule")    { params.grammar_rule    = ARGV_NEXT; }
-        else if (                  arg == "--grammar-penalty") { params.grammar_penalty = std::stof(ARGV_NEXT); }
+        else if (arg == "-t"    || arg == "--threads")              { params.n_threads       = std::stoi(ARGV_NEXT); }
+        else if (arg == "-p"    || arg == "--processors")           { params.n_processors    = std::stoi(ARGV_NEXT); }
+        else if (arg == "-ot"   || arg == "--offset-t")             { params.offset_t_ms     = std::stoi(ARGV_NEXT); }
+        else if (arg == "-on"   || arg == "--offset-n")             { params.offset_n        = std::stoi(ARGV_NEXT); }
+        else if (arg == "-d"    || arg == "--duration")             { params.duration_ms     = std::stoi(ARGV_NEXT); }
+        else if (arg == "-mc"   || arg == "--max-context")          { params.max_context     = std::stoi(ARGV_NEXT); }
+        else if (arg == "-ml"   || arg == "--max-len")              { params.max_len         = std::stoi(ARGV_NEXT); }
+        else if (arg == "-bo"   || arg == "--best-of")              { params.best_of         = std::stoi(ARGV_NEXT); }
+        else if (arg == "-bs"   || arg == "--beam-size")            { params.beam_size       = std::stoi(ARGV_NEXT); }
+        else if (arg == "-ac"   || arg == "--audio-ctx")            { params.audio_ctx       = std::stoi(ARGV_NEXT); }
+        else if (arg == "-wt"   || arg == "--word-thold")           { params.word_thold      = std::stof(ARGV_NEXT); }
+        else if (arg == "-et"   || arg == "--entropy-thold")        { params.entropy_thold   = std::stof(ARGV_NEXT); }
+        else if (arg == "-lpt"  || arg == "--logprob-thold")        { params.logprob_thold   = std::stof(ARGV_NEXT); }
+        else if (arg == "-nth"  || arg == "--no-speech-thold")      { params.no_speech_thold = std::stof(ARGV_NEXT); }
+        else if (arg == "-tp"   || arg == "--temperature")          { params.temperature     = std::stof(ARGV_NEXT); }
+        else if (arg == "-tpi"  || arg == "--temperature-inc")      { params.temperature_inc = std::stof(ARGV_NEXT); }
+        else if (arg == "-debug"|| arg == "--debug-mode")           { params.debug_mode      = true; }
+        else if (arg == "-tr"   || arg == "--translate")            { params.translate       = true; }
+        else if (arg == "-di"   || arg == "--diarize")              { params.diarize         = true; }
+        else if (arg == "-tdrz" || arg == "--tinydiarize")          { params.tinydiarize     = true; }
+        else if (arg == "-sow"  || arg == "--split-on-word")        { params.split_on_word   = true; }
+        else if (arg == "-nf"   || arg == "--no-fallback")          { params.no_fallback     = true; }
+        else if (arg == "-otxt" || arg == "--output-txt")           { params.output_txt      = true; }
+        else if (arg == "-ovtt" || arg == "--output-vtt")           { params.output_vtt      = true; }
+        else if (arg == "-osrt" || arg == "--output-srt")           { params.output_srt      = true; }
+        else if (arg == "-owts" || arg == "--output-words")         { params.output_wts      = true; }
+        else if (arg == "-olrc" || arg == "--output-lrc")           { params.output_lrc      = true; }
+        else if (arg == "-fp"   || arg == "--font-path")            { params.font_path       = ARGV_NEXT; }
+        else if (arg == "-ocsv" || arg == "--output-csv")           { params.output_csv      = true; }
+        else if (arg == "-oj"   || arg == "--output-json")          { params.output_jsn      = true; }
+        else if (arg == "-ojf"  || arg == "--output-json-full")     { params.output_jsn_full = params.output_jsn = true; }
+        else if (arg == "-of"   || arg == "--output-file")          { params.fname_out.emplace_back(ARGV_NEXT); }
+        else if (arg == "-np"   || arg == "--no-prints")            { params.no_prints       = true; }
+        else if (arg == "-ps"   || arg == "--print-special")        { params.print_special   = true; }
+        else if (arg == "-pc"   || arg == "--print-colors")         { params.print_colors    = true; }
+        else if (                  arg == "--print-confidence")     { params.print_confidence= true; }
+        else if (arg == "-pp"   || arg == "--print-progress")       { params.print_progress  = true; }
+        else if (arg == "-nt"   || arg == "--no-timestamps")        { params.no_timestamps   = true; }
+        else if (arg == "-l"    || arg == "--language")             { params.language        = whisper_param_turn_lowercase(ARGV_NEXT); }
+        else if (arg == "-dl"   || arg == "--detect-language")      { params.detect_language = true; }
+        else if (                  arg == "--prompt")               { params.prompt          = ARGV_NEXT; }
+        else if (                  arg == "--carry-initial-prompt") { params.carry_initial_prompt = true; }
+        else if (arg == "-m"    || arg == "--model")                { params.model           = ARGV_NEXT; }
+        else if (arg == "-f"    || arg == "--file")                 { params.fname_inp.emplace_back(ARGV_NEXT); }
+        else if (arg == "-oved" || arg == "--ov-e-device")          { params.openvino_encode_device = ARGV_NEXT; }
+        else if (arg == "-dtw"  || arg == "--dtw")                  { params.dtw             = ARGV_NEXT; }
+        else if (arg == "-ls"   || arg == "--log-score")            { params.log_score       = true; }
+        else if (arg == "-ng"   || arg == "--no-gpu")               { params.use_gpu         = false; }
+        else if (arg == "-fa"   || arg == "--flash-attn")           { params.flash_attn      = true; }
+        else if (arg == "-sns"  || arg == "--suppress-nst")         { params.suppress_nst    = true; }
+        else if (                  arg == "--suppress-regex")       { params.suppress_regex  = ARGV_NEXT; }
+        else if (                  arg == "--grammar")              { params.grammar         = ARGV_NEXT; }
+        else if (                  arg == "--grammar-rule")         { params.grammar_rule    = ARGV_NEXT; }
+        else if (                  arg == "--grammar-penalty")      { params.grammar_penalty = std::stof(ARGV_NEXT); }
         // Voice Activity Detection (VAD)
         else if (                  arg == "--vad")                         { params.vad                         = true; }
         else if (arg == "-vm"   || arg == "--vad-model")                   { params.vad_model                   = ARGV_NEXT; }
@@ -223,60 +225,61 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
     fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,        --help              [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,      --threads N         [%-7d] number of threads to use during computation\n",    params.n_threads);
-    fprintf(stderr, "  -p N,      --processors N      [%-7d] number of processors to use during computation\n", params.n_processors);
-    fprintf(stderr, "  -ot N,     --offset-t N        [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
-    fprintf(stderr, "  -on N,     --offset-n N        [%-7d] segment index offset\n",                           params.offset_n);
-    fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
-    fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
-    fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
-    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
-    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
-    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -ac N,     --audio-ctx N       [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
-    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
-    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
-    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
-    fprintf(stderr, "  -nth N,    --no-speech-thold N [%-7.2f] no speech threshold\n",                          params.no_speech_thold);
-    fprintf(stderr, "  -tp,       --temperature N     [%-7.2f] The sampling temperature, between 0 and 1\n",    params.temperature);
-    fprintf(stderr, "  -tpi,      --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
-    fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
-    fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
-    fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
-    fprintf(stderr, "  -tdrz,     --tinydiarize       [%-7s] enable tinydiarize (requires a tdrz model)\n",     params.tinydiarize ? "true" : "false");
-    fprintf(stderr, "  -nf,       --no-fallback       [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
-    fprintf(stderr, "  -otxt,     --output-txt        [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
-    fprintf(stderr, "  -ovtt,     --output-vtt        [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
-    fprintf(stderr, "  -osrt,     --output-srt        [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
-    fprintf(stderr, "  -olrc,     --output-lrc        [%-7s] output result in a lrc file\n",                    params.output_lrc ? "true" : "false");
-    fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
-    fprintf(stderr, "  -fp,       --font-path         [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
-    fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
-    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
-    fprintf(stderr, "  -ojf,      --output-json-full  [%-7s] include more information in the JSON file\n",      params.output_jsn_full ? "true" : "false");
-    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
-    fprintf(stderr, "  -np,       --no-prints         [%-7s] do not print anything other than the results\n",   params.no_prints ? "true" : "false");
-    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
-    fprintf(stderr, "             --print-confidence  [%-7s] print confidence\n",                               params.print_confidence ? "true" : "false");
-    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
-    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
-    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
-    fprintf(stderr, "  -dl,       --detect-language   [%-7s] exit after automatically detecting language\n",    params.detect_language ? "true" : "false");
-    fprintf(stderr, "             --prompt PROMPT     [%-7s] initial prompt (max n_text_ctx/2 tokens)\n",       params.prompt.c_str());
-    fprintf(stderr, "  -m FNAME,  --model FNAME       [%-7s] model path\n",                                     params.model.c_str());
-    fprintf(stderr, "  -f FNAME,  --file FNAME        [%-7s] input audio file path\n",                            "");
-    fprintf(stderr, "  -oved D,   --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n",  params.openvino_encode_device.c_str());
-    fprintf(stderr, "  -dtw MODEL --dtw MODEL         [%-7s] compute token-level timestamps\n",                 params.dtw.c_str());
-    fprintf(stderr, "  -ls,       --log-score         [%-7s] log best decoder scores of tokens\n",              params.log_score?"true":"false");
-    fprintf(stderr, "  -ng,       --no-gpu            [%-7s] disable GPU\n",                                    params.use_gpu ? "false" : "true");
-    fprintf(stderr, "  -fa,       --flash-attn        [%-7s] flash attention\n",                                params.flash_attn ? "true" : "false");
-    fprintf(stderr, "  -sns,      --suppress-nst      [%-7s] suppress non-speech tokens\n",                     params.suppress_nst ? "true" : "false");
-    fprintf(stderr, "  --suppress-regex REGEX         [%-7s] regular expression matching tokens to suppress\n", params.suppress_regex.c_str());
-    fprintf(stderr, "  --grammar GRAMMAR              [%-7s] GBNF grammar to guide decoding\n",                 params.grammar.c_str());
-    fprintf(stderr, "  --grammar-rule RULE            [%-7s] top-level GBNF grammar rule name\n",               params.grammar_rule.c_str());
-    fprintf(stderr, "  --grammar-penalty N            [%-7.1f] scales down logits of nongrammar tokens\n",      params.grammar_penalty);
+    fprintf(stderr, "  -h,        --help                 [default] show this help message and exit\n");
+    fprintf(stderr, "  -t N,      --threads N            [%-7d] number of threads to use during computation\n",    params.n_threads);
+    fprintf(stderr, "  -p N,      --processors N         [%-7d] number of processors to use during computation\n", params.n_processors);
+    fprintf(stderr, "  -ot N,     --offset-t N           [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
+    fprintf(stderr, "  -on N,     --offset-n N           [%-7d] segment index offset\n",                           params.offset_n);
+    fprintf(stderr, "  -d  N,     --duration N           [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
+    fprintf(stderr, "  -mc N,     --max-context N        [%-7d] maximum number of text context tokens to store\n", params.max_context);
+    fprintf(stderr, "  -ml N,     --max-len N            [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -sow,      --split-on-word        [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
+    fprintf(stderr, "  -bo N,     --best-of N            [%-7d] number of best candidates to keep\n",              params.best_of);
+    fprintf(stderr, "  -bs N,     --beam-size N          [%-7d] beam size for beam search\n",                      params.beam_size);
+    fprintf(stderr, "  -ac N,     --audio-ctx N          [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
+    fprintf(stderr, "  -wt N,     --word-thold N         [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
+    fprintf(stderr, "  -et N,     --entropy-thold N      [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
+    fprintf(stderr, "  -lpt N,    --logprob-thold N      [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
+    fprintf(stderr, "  -nth N,    --no-speech-thold N    [%-7.2f] no speech threshold\n",                          params.no_speech_thold);
+    fprintf(stderr, "  -tp,       --temperature N        [%-7.2f] The sampling temperature, between 0 and 1\n",    params.temperature);
+    fprintf(stderr, "  -tpi,      --temperature-inc N    [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
+    fprintf(stderr, "  -debug,    --debug-mode           [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
+    fprintf(stderr, "  -tr,       --translate            [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
+    fprintf(stderr, "  -di,       --diarize              [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
+    fprintf(stderr, "  -tdrz,     --tinydiarize          [%-7s] enable tinydiarize (requires a tdrz model)\n",     params.tinydiarize ? "true" : "false");
+    fprintf(stderr, "  -nf,       --no-fallback          [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
+    fprintf(stderr, "  -otxt,     --output-txt           [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
+    fprintf(stderr, "  -ovtt,     --output-vtt           [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
+    fprintf(stderr, "  -osrt,     --output-srt           [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
+    fprintf(stderr, "  -olrc,     --output-lrc           [%-7s] output result in a lrc file\n",                    params.output_lrc ? "true" : "false");
+    fprintf(stderr, "  -owts,     --output-words         [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
+    fprintf(stderr, "  -fp,       --font-path            [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
+    fprintf(stderr, "  -ocsv,     --output-csv           [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
+    fprintf(stderr, "  -oj,       --output-json          [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
+    fprintf(stderr, "  -ojf,      --output-json-full     [%-7s] include more information in the JSON file\n",      params.output_jsn_full ? "true" : "false");
+    fprintf(stderr, "  -of FNAME, --output-file FNAME    [%-7s] output file path (without file extension)\n",      "");
+    fprintf(stderr, "  -np,       --no-prints            [%-7s] do not print anything other than the results\n",   params.no_prints ? "true" : "false");
+    fprintf(stderr, "  -ps,       --print-special        [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
+    fprintf(stderr, "  -pc,       --print-colors         [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
+    fprintf(stderr, "             --print-confidence     [%-7s] print confidence\n",                               params.print_confidence ? "true" : "false");
+    fprintf(stderr, "  -pp,       --print-progress       [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
+    fprintf(stderr, "  -nt,       --no-timestamps        [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
+    fprintf(stderr, "  -l LANG,   --language LANG        [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
+    fprintf(stderr, "  -dl,       --detect-language      [%-7s] exit after automatically detecting language\n",    params.detect_language ? "true" : "false");
+    fprintf(stderr, "             --prompt PROMPT        [%-7s] initial prompt (max n_text_ctx/2 tokens)\n",       params.prompt.c_str());
+    fprintf(stderr, "             --carry-initial-prompt [%-7s] always prepend initial prompt\n",                  params.carry_initial_prompt ? "true" : "false");
+    fprintf(stderr, "  -m FNAME,  --model FNAME          [%-7s] model path\n",                                     params.model.c_str());
+    fprintf(stderr, "  -f FNAME,  --file FNAME           [%-7s] input audio file path\n",                          "");
+    fprintf(stderr, "  -oved D,   --ov-e-device DNAME    [%-7s] the OpenVINO device used for encode inference\n",  params.openvino_encode_device.c_str());
+    fprintf(stderr, "  -dtw MODEL --dtw MODEL            [%-7s] compute token-level timestamps\n",                 params.dtw.c_str());
+    fprintf(stderr, "  -ls,       --log-score            [%-7s] log best decoder scores of tokens\n",              params.log_score?"true":"false");
+    fprintf(stderr, "  -ng,       --no-gpu               [%-7s] disable GPU\n",                                    params.use_gpu ? "false" : "true");
+    fprintf(stderr, "  -fa,       --flash-attn           [%-7s] flash attention\n",                                params.flash_attn ? "true" : "false");
+    fprintf(stderr, "  -sns,      --suppress-nst         [%-7s] suppress non-speech tokens\n",                     params.suppress_nst ? "true" : "false");
+    fprintf(stderr, "  --suppress-regex REGEX            [%-7s] regular expression matching tokens to suppress\n", params.suppress_regex.c_str());
+    fprintf(stderr, "  --grammar GRAMMAR                 [%-7s] GBNF grammar to guide decoding\n",                 params.grammar.c_str());
+    fprintf(stderr, "  --grammar-rule RULE               [%-7s] top-level GBNF grammar rule name\n",               params.grammar_rule.c_str());
+    fprintf(stderr, "  --grammar-penalty N               [%-7.1f] scales down logits of nongrammar tokens\n",      params.grammar_penalty);
     // Voice Activity Detection (VAD) parameters
     fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
     fprintf(stderr, "             --vad                           [%-7s] enable Voice Activity Detection (VAD)\n",            params.vad ? "true" : "false");
@@ -1176,7 +1179,8 @@ int main(int argc, char ** argv) {
 
             wparams.suppress_regex   = params.suppress_regex.empty() ? nullptr : params.suppress_regex.c_str();
 
-            wparams.initial_prompt   = params.prompt.c_str();
+            wparams.initial_prompt       = params.prompt.c_str();
+            wparams.carry_initial_prompt = params.carry_initial_prompt;
 
             wparams.greedy.best_of        = params.best_of;
             wparams.beam_search.beam_size = params.beam_size;
diff --git a/include/whisper.h b/include/whisper.h
index fcd756a9fe2..f4cc6bf7abd 100644
--- a/include/whisper.h
+++ b/include/whisper.h
@@ -525,6 +525,7 @@ extern "C" {
         // use whisper_tokenize() to convert text to tokens
         // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
         const char * initial_prompt;
+        bool carry_initial_prompt; // if true, always prepend initial_prompt to every decode window (may reduce conditioning on previous text)
         const whisper_token * prompt_tokens;
         int prompt_n_tokens;
 
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 52de68c2b12..6e402bb052d 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -5952,9 +5952,10 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
 
         /* suppress_regex    =*/ nullptr,
 
-        /*.initial_prompt    =*/ nullptr,
-        /*.prompt_tokens     =*/ nullptr,
-        /*.prompt_n_tokens   =*/ 0,
+        /*.initial_prompt       =*/ nullptr,
+        /*.carry_initial_prompt =*/ false,
+        /*.prompt_tokens        =*/ nullptr,
+        /*.prompt_n_tokens      =*/ 0,
 
         /*.language          =*/ "en",
         /*.detect_language   =*/ false,
@@ -6913,6 +6914,7 @@ int whisper_full_with_state(
     }
 
     // prepare prompt
+    std::vector<whisper_token> initial_prompt_tokens; // persistent for carry_initial_prompt
     {
         std::vector<whisper_token> prompt_tokens;
 
@@ -6927,6 +6929,9 @@ int whisper_full_with_state(
             prompt_tokens.resize(n_needed);
             params.prompt_tokens   = prompt_tokens.data();
             params.prompt_n_tokens = prompt_tokens.size();
+            if (params.carry_initial_prompt) {
+                initial_prompt_tokens = prompt_tokens; // copy for reuse
+            }
         }
 
         // prepend the prompt tokens to the prompt_past
@@ -6937,6 +6942,10 @@ int whisper_full_with_state(
             }
             std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
         }
+
+        if (initial_prompt_tokens.empty() && params.carry_initial_prompt && params.prompt_tokens && params.prompt_n_tokens > 0) {
+            initial_prompt_tokens.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens);
+        }
     }
 
     // overwrite audio_ctx, max allowed is hparams.n_audio_ctx
@@ -6992,6 +7001,7 @@ int whisper_full_with_state(
     std::vector<beam_candidate> beam_candidates;
 
     // main loop
+    bool first_iter_with_prompt = true; // track first decode iteration for carry_initial_prompt logic
     while (true) {
         if (params.progress_callback) {
             const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
@@ -7083,16 +7093,43 @@ int whisper_full_with_state(
                 prompt.clear();
 
                 // if we have already generated some text, use it as a prompt to condition the next generation
-                if (!prompt_past.empty() && t_cur < 0.5f && params.n_max_text_ctx > 0) {
-                    int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size()));
-
+        if (( (!prompt_past.empty()) || (params.carry_initial_prompt && !initial_prompt_tokens.empty() && !first_iter_with_prompt) )
+            && t_cur < 0.5f && params.n_max_text_ctx > 0) {
+                    int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
                     prompt = { whisper_token_prev(ctx) };
-                    prompt.insert(prompt.begin() + 1, prompt_past.end() - n_take, prompt_past.end());
+                    if (params.carry_initial_prompt) {
+                        if (first_iter_with_prompt) {
+                            // behave like non-carry on first chunk to avoid duplication
+                            int n_take = std::min(max_ctx_half, (int)prompt_past.size());
+                            prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end());
+                        } else {
+                            std::vector<whisper_token> ipt = initial_prompt_tokens;
+                            if (!ipt.empty()) {
+                                if ((int)ipt.size() > max_ctx_half - 1) {
+                                    ipt.erase(ipt.begin(), ipt.begin() + (ipt.size() - (max_ctx_half - 1)));
+                                }
+                                prompt.insert(prompt.end(), ipt.begin(), ipt.end());
+                            }
+                            int remaining_budget = max_ctx_half - (int)ipt.size();
+                            if (remaining_budget > 0) {
+                                int n_take = std::min(remaining_budget, (int)prompt_past.size());
+                                if (n_take > 0) {
+                                    prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end());
+                                }
+                            }
+                        }
+                    } else {
+                        int n_take = std::min(max_ctx_half, (int)prompt_past.size());
+                        prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end());
+                    }
                 }
 
                 // init new transcription with sot, language (opt) and task tokens
                 prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
 
+                // mark first iteration done
+                first_iter_with_prompt = false;
+
                 // print the prompt
                 WHISPER_LOG_DEBUG("\n\n");
                 for (int i = 0; i < (int) prompt.size(); i++) {
@@ -7572,7 +7609,15 @@ int whisper_full_with_state(
             // update prompt_past
             prompt_past.clear();
             if (prompt.front() == whisper_token_prev(ctx)) {
-                prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
+                auto start_it = prompt.begin() + 1;
+                if (params.carry_initial_prompt && params.prompt_n_tokens > 0) {
+                    // skip initial prompt tokens to avoid accumulating duplicates
+                    int n_ip = params.prompt_n_tokens;
+                    if (prompt.end() - start_it > n_ip) {
+                        start_it += n_ip;
+                    }
+                }
+                prompt_past.insert(prompt_past.end(), start_it, prompt.end() - prompt_init.size());
             }
 
             for (int i = 0; i < result_len && !is_no_speech; ++i) {

From 02714dddde092735de2e6a77ea97b18e8d612b08 Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Mon, 8 Sep 2025 13:01:58 +0200
Subject: [PATCH 02/14] PR fixes for ruby and go

---
 bindings/go/params.go                   | 10 ++++--
 bindings/ruby/ext/ruby_whisper_params.c | 46 ++++++++++++-------------
 2 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/bindings/go/params.go b/bindings/go/params.go
index e1d54b266aa..d8dee57e331 100644
--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@@ -47,9 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
 	p.print_timestamps = toBool(v)
 }
 
-func (p *Params) SetCarryInitialPrompt(v bool) {
-	p.carry_initial_prompt = toBool(v)
-}
 
 // Set language id
 func (p *Params) SetLanguage(lang int) error {
@@ -150,6 +147,10 @@ func (p *Params) SetInitialPrompt(prompt string) {
 	p.initial_prompt = C.CString(prompt)
 }
 
+func (p *Params) SetCarryInitialPrompt(v bool) {
+	p.carry_initial_prompt = toBool(v)
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // PRIVATE METHODS
 
@@ -203,6 +204,9 @@ func (p *Params) String() string {
 	if p.token_timestamps {
 		str += " token_timestamps"
 	}
+	if p.carry_initial_prompt {
+		str += " carry_initial_prompt"
+	}
 
 	return str + ">"
 }
diff --git a/bindings/ruby/ext/ruby_whisper_params.c b/bindings/ruby/ext/ruby_whisper_params.c
index 670f9351375..70417cb1664 100644
--- a/bindings/ruby/ext/ruby_whisper_params.c
+++ b/bindings/ruby/ext/ruby_whisper_params.c
@@ -1325,29 +1325,29 @@ init_ruby_whisper_params(VALUE *mWhisper)
   DEFINE_PARAM(max_len, 11)
   DEFINE_PARAM(split_on_word, 12)
   DEFINE_PARAM(initial_prompt, 13)
-  DEFINE_PARAM(carry_initial_prompt, 36)
-  DEFINE_PARAM(diarize, 14)
-  DEFINE_PARAM(offset, 15)
-  DEFINE_PARAM(duration, 16)
-  DEFINE_PARAM(max_text_tokens, 17)
-  DEFINE_PARAM(temperature, 18)
-  DEFINE_PARAM(max_initial_ts, 19)
-  DEFINE_PARAM(length_penalty, 20)
-  DEFINE_PARAM(temperature_inc, 21)
-  DEFINE_PARAM(entropy_thold, 22)
-  DEFINE_PARAM(logprob_thold, 23)
-  DEFINE_PARAM(no_speech_thold, 24)
-  DEFINE_PARAM(new_segment_callback, 25)
-  DEFINE_PARAM(new_segment_callback_user_data, 26)
-  DEFINE_PARAM(progress_callback, 27)
-  DEFINE_PARAM(progress_callback_user_data, 28)
-  DEFINE_PARAM(encoder_begin_callback, 29)
-  DEFINE_PARAM(encoder_begin_callback_user_data, 30)
-  DEFINE_PARAM(abort_callback, 31)
-  DEFINE_PARAM(abort_callback_user_data, 32)
-  DEFINE_PARAM(vad, 33)
-  DEFINE_PARAM(vad_model_path, 34)
-  DEFINE_PARAM(vad_params, 35)
+  DEFINE_PARAM(carry_initial_prompt, 14)
+  DEFINE_PARAM(diarize, 15)
+  DEFINE_PARAM(offset, 16)
+  DEFINE_PARAM(duration, 17)
+  DEFINE_PARAM(max_text_tokens, 18)
+  DEFINE_PARAM(temperature, 19)
+  DEFINE_PARAM(max_initial_ts, 20)
+  DEFINE_PARAM(length_penalty, 21)
+  DEFINE_PARAM(temperature_inc, 22)
+  DEFINE_PARAM(entropy_thold, 23)
+  DEFINE_PARAM(logprob_thold, 24)
+  DEFINE_PARAM(no_speech_thold, 25)
+  DEFINE_PARAM(new_segment_callback, 26)
+  DEFINE_PARAM(new_segment_callback_user_data, 27)
+  DEFINE_PARAM(progress_callback, 28)
+  DEFINE_PARAM(progress_callback_user_data, 29)
+  DEFINE_PARAM(encoder_begin_callback, 30)
+  DEFINE_PARAM(encoder_begin_callback_user_data, 31)
+  DEFINE_PARAM(abort_callback, 32)
+  DEFINE_PARAM(abort_callback_user_data, 33)
+  DEFINE_PARAM(vad, 34)
+  DEFINE_PARAM(vad_model_path, 35)
+  DEFINE_PARAM(vad_params, 36)
 
   rb_define_method(cParams, "on_new_segment", ruby_whisper_params_on_new_segment, 0);
   rb_define_method(cParams, "on_progress", ruby_whisper_params_on_progress, 0);

From e7468c2949107407fe8541a9356cd3b0716c8d14 Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Tue, 9 Sep 2025 08:59:26 +0200
Subject: [PATCH 03/14] Refactoring for readability

---
 .../ggerganov/whispercpp/params/WhisperFullParams.java    | 2 +-
 src/whisper.cpp                                           | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
index 2decd5a2ef2..76ce80fb4cc 100644
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
@@ -333,7 +333,7 @@ public void setLogitsFilterCallback(WhisperLogitsFilterCallback callback) {
 
     @Override
     protected List<String> getFieldOrder() {
-    return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
+        return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
                 "offset_ms", "duration_ms", "translate", "no_context",
                 "no_timestamps", "single_segment", "print_special",
                 "print_progress", "print_realtime", "print_timestamps",
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 6e402bb052d..179df2a4e77 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -7093,8 +7093,12 @@ int whisper_full_with_state(
                 prompt.clear();
 
                 // if we have already generated some text, use it as a prompt to condition the next generation
-        if (( (!prompt_past.empty()) || (params.carry_initial_prompt && !initial_prompt_tokens.empty() && !first_iter_with_prompt) )
-            && t_cur < 0.5f && params.n_max_text_ctx > 0) {
+                const bool has_past_text = !prompt_past.empty();
+                const bool carrying_initial_prompt_now = params.carry_initial_prompt && !initial_prompt_tokens.empty() && !first_iter_with_prompt;
+                // We only condition on previous text at lower temperatures and when a context limit is set
+                const bool allow_conditioning = (t_cur < 0.5f) && (params.n_max_text_ctx > 0);
+
+                if ((has_past_text || carrying_initial_prompt_now) && allow_conditioning) {
                     int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
                     prompt = { whisper_token_prev(ctx) };
                     if (params.carry_initial_prompt) {

From 8be27dc4458b0459f6db551e32aa5f8c29b960b6 Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Wed, 8 Oct 2025 12:05:21 +0200
Subject: [PATCH 04/14] WIP 1: split prompt_past into static and dynamic prompt history

---
 src/whisper.cpp | 94 ++++++++++++++++++++++++-------------------------
 1 file changed, 46 insertions(+), 48 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 046fab9dba7..04f1466405e 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -880,7 +880,10 @@ struct whisper_state {
     std::vector<float> logits;
 
     std::vector<whisper_segment> result_all;
-    std::vector<whisper_token>   prompt_past;
+
+    // prompt history split into static prefix (prompt_past0) and dynamic rolling context (prompt_past1)
+    std::vector<whisper_token>   prompt_past0; // static carried initial prompt (if enabled)
+    std::vector<whisper_token>   prompt_past1; // dynamic context from decoded output
 
     int lang_id = 0; // english by default
 
@@ -6875,10 +6878,12 @@ int whisper_full_with_state(
         decoder.rng = std::mt19937(j);
     }
 
-    // the accumulated text context so far
-    auto & prompt_past = state->prompt_past;
+    // the accumulated text context split into static (prompt_past0) and dynamic (prompt_past1)
+    auto & prompt_past0 = state->prompt_past0;
+    auto & prompt_past1 = state->prompt_past1;
     if (params.no_context) {
-        prompt_past.clear();
+        prompt_past0.clear();
+        prompt_past1.clear();
     }
 
     // prepare prompt
@@ -6902,15 +6907,16 @@ int whisper_full_with_state(
             }
         }
 
-        // prepend the prompt tokens to the prompt_past
+        // store initial prompt in prompt_past0 if carrying, else treat as part of dynamic context
         if (params.prompt_tokens && params.prompt_n_tokens > 0) {
-            // parse tokens from the pointer
-            for (int i = 0; i < params.prompt_n_tokens; i++) {
-                prompt_past.push_back(params.prompt_tokens[i]);
+            if (params.carry_initial_prompt) {
+                if (prompt_past0.empty()) {
+                    prompt_past0.insert(prompt_past0.end(), params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens);
+                }
+            } else {
+                prompt_past1.insert(prompt_past1.end(), params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens);
             }
-            std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
         }
-
         if (initial_prompt_tokens.empty() && params.carry_initial_prompt && params.prompt_tokens && params.prompt_n_tokens > 0) {
             initial_prompt_tokens.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens);
         }
@@ -6999,7 +7005,7 @@ int whisper_full_with_state(
         // if there is a very short audio segment left to process, we remove any past prompt since it tends
         // to confuse the decoder and often make it repeat or hallucinate stuff
         if (seek > seek_start && seek + 500 >= seek_end) {
-            prompt_past.clear();
+            prompt_past1.clear();
         }
 
         int best_decoder_id = 0;
@@ -7061,38 +7067,35 @@ int whisper_full_with_state(
                 prompt.clear();
 
                 // if we have already generated some text, use it as a prompt to condition the next generation
-                const bool has_past_text = !prompt_past.empty();
-                const bool carrying_initial_prompt_now = params.carry_initial_prompt && !initial_prompt_tokens.empty() && !first_iter_with_prompt;
+                const bool has_past_text = !prompt_past1.empty();
+                const bool carrying_initial_prompt_now = params.carry_initial_prompt && !prompt_past0.empty() && !first_iter_with_prompt;
                 // We only condition on previous text at lower temperatures and when a context limit is set
                 const bool allow_conditioning = (t_cur < 0.5f) && (params.n_max_text_ctx > 0);
 
                 if ((has_past_text || carrying_initial_prompt_now) && allow_conditioning) {
                     int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
                     prompt = { whisper_token_prev(ctx) };
-                    if (params.carry_initial_prompt) {
-                        if (first_iter_with_prompt) {
-                            // behave like non-carry on first chunk to avoid duplication
-                            int n_take = std::min(max_ctx_half, (int)prompt_past.size());
-                            prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end());
-                        } else {
-                            std::vector<whisper_token> ipt = initial_prompt_tokens;
-                            if (!ipt.empty()) {
-                                if ((int)ipt.size() > max_ctx_half - 1) {
-                                    ipt.erase(ipt.begin(), ipt.begin() + (ipt.size() - (max_ctx_half - 1)));
-                                }
-                                prompt.insert(prompt.end(), ipt.begin(), ipt.end());
-                            }
-                            int remaining_budget = max_ctx_half - (int)ipt.size();
-                            if (remaining_budget > 0) {
-                                int n_take = std::min(remaining_budget, (int)prompt_past.size());
-                                if (n_take > 0) {
-                                    prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end());
-                                }
+                    if (params.carry_initial_prompt && !prompt_past0.empty() && !first_iter_with_prompt) {
+                        int budget = max_ctx_half;
+                        prompt.push_back(whisper_token_prev(ctx));
+                        int take0 = std::min(budget - 1, (int)prompt_past0.size());
+                        if (take0 > 0) {
+                            auto start0 = take0 < (int)prompt_past0.size() ? prompt_past0.end() - take0 : prompt_past0.begin();
+                            prompt.insert(prompt.end(), start0, prompt_past0.end());
+                        }
+                        int remaining = budget - take0;
+                        if (remaining > 0) {
+                            int take1 = std::min(remaining, (int)prompt_past1.size());
+                            if (take1 > 0) {
+                                prompt.insert(prompt.end(), prompt_past1.end() - take1, prompt_past1.end());
                             }
                         }
                     } else {
-                        int n_take = std::min(max_ctx_half, (int)prompt_past.size());
-                        prompt.insert(prompt.end(), prompt_past.end() - n_take, prompt_past.end());
+                        int n_take = std::min(max_ctx_half, (int)prompt_past1.size());
+                        if (n_take > 0) {
+                            prompt = { whisper_token_prev(ctx) };
+                            prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end());
+                        }
                     }
                 }
 
@@ -7578,22 +7581,17 @@ int whisper_full_with_state(
 
             //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
 
-            // update prompt_past
-            prompt_past.clear();
-            if (prompt.front() == whisper_token_prev(ctx)) {
-                auto start_it = prompt.begin() + 1;
-                if (params.carry_initial_prompt && params.prompt_n_tokens > 0) {
-                    // skip initial prompt tokens to avoid accumulating duplicates
-                    int n_ip = params.prompt_n_tokens;
-                    if (prompt.end() - start_it > n_ip) {
-                        start_it += n_ip;
-                    }
+            if (!params.carry_initial_prompt) {
+                prompt_past1.clear();
+                if (!prompt.empty() && prompt.front() == whisper_token_prev(ctx)) {
+                    auto start_it = prompt.begin() + 1;
+                    prompt_past1.insert(prompt_past1.end(), start_it, prompt.end() - prompt_init.size());
                 }
-                prompt_past.insert(prompt_past.end(), start_it, prompt.end() - prompt_init.size());
             }
-
-            for (int i = 0; i < result_len && !is_no_speech; ++i) {
-                prompt_past.push_back(tokens_cur[i].id);
+            if (!is_no_speech) {
+                for (int i = 0; i < result_len; ++i) {
+                    prompt_past1.push_back(tokens_cur[i].id);
+                }
             }
 
             if (!tokens_cur.empty() && ctx->model.n_loaded > 0 && !is_no_speech) {

From 8abf7d96a87f5e918e1822487d8e81f7fe6fd200 Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Wed, 8 Oct 2025 12:05:30 +0200
Subject: [PATCH 05/14] WIP 2: drop initial_prompt_tokens, name the history temperature cutoff

---
 src/whisper.cpp | 84 +++++++++++++++++++++++++------------------------
 1 file changed, 43 insertions(+), 41 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 04f1466405e..89efd9af049 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -138,6 +138,10 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
     } while (0)
 
 #define WHISPER_MAX_DECODERS 8
+
+// temperature below which we condition on past text history
+static constexpr float WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF = 0.5f;
+
 #define WHISPER_MAX_NODES 4096
 
 static std::string format(const char * fmt, ...) {
@@ -6887,11 +6891,8 @@ int whisper_full_with_state(
     }
 
     // prepare prompt
-    std::vector<whisper_token> initial_prompt_tokens; // persistent for carry_initial_prompt
     {
         std::vector<whisper_token> prompt_tokens;
-
-        // initial prompt
         if (!params.prompt_tokens && params.initial_prompt) {
             prompt_tokens.resize(1024);
             int n_needed = whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size());
@@ -6902,12 +6903,7 @@ int whisper_full_with_state(
             prompt_tokens.resize(n_needed);
             params.prompt_tokens   = prompt_tokens.data();
             params.prompt_n_tokens = prompt_tokens.size();
-            if (params.carry_initial_prompt) {
-                initial_prompt_tokens = prompt_tokens; // copy for reuse
-            }
         }
-
-        // store initial prompt in prompt_past0 if carrying, else treat as part of dynamic context
         if (params.prompt_tokens && params.prompt_n_tokens > 0) {
             if (params.carry_initial_prompt) {
                 if (prompt_past0.empty()) {
@@ -6917,9 +6913,6 @@ int whisper_full_with_state(
                 prompt_past1.insert(prompt_past1.end(), params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens);
             }
         }
-        if (initial_prompt_tokens.empty() && params.carry_initial_prompt && params.prompt_tokens && params.prompt_n_tokens > 0) {
-            initial_prompt_tokens.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens);
-        }
     }
 
     // overwrite audio_ctx, max allowed is hparams.n_audio_ctx
@@ -6975,7 +6968,7 @@ int whisper_full_with_state(
     std::vector<beam_candidate> beam_candidates;
 
     // main loop
-    bool first_iter_with_prompt = true; // track first decode iteration for carry_initial_prompt logic
+    bool first_history_iter = true; // track first decode iteration for carry_initial_prompt logic
     while (true) {
         if (params.progress_callback) {
             const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
@@ -7066,36 +7059,45 @@ int whisper_full_with_state(
             {
                 prompt.clear();
 
-                // if we have already generated some text, use it as a prompt to condition the next generation
-                const bool has_past_text = !prompt_past1.empty();
-                const bool carrying_initial_prompt_now = params.carry_initial_prompt && !prompt_past0.empty() && !first_iter_with_prompt;
-                // We only condition on previous text at lower temperatures and when a context limit is set
-                const bool allow_conditioning = (t_cur < 0.5f) && (params.n_max_text_ctx > 0);
-
-                if ((has_past_text || carrying_initial_prompt_now) && allow_conditioning) {
-                    int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
-                    prompt = { whisper_token_prev(ctx) };
-                    if (params.carry_initial_prompt && !prompt_past0.empty() && !first_iter_with_prompt) {
-                        int budget = max_ctx_half;
-                        prompt.push_back(whisper_token_prev(ctx));
-                        int take0 = std::min(budget - 1, (int)prompt_past0.size());
-                        if (take0 > 0) {
-                            auto start0 = take0 < (int)prompt_past0.size() ? prompt_past0.end() - take0 : prompt_past0.begin();
-                            prompt.insert(prompt.end(), start0, prompt_past0.end());
-                        }
-                        int remaining = budget - take0;
-                        if (remaining > 0) {
-                            int take1 = std::min(remaining, (int)prompt_past1.size());
-                            if (take1 > 0) {
-                                prompt.insert(prompt.end(), prompt_past1.end() - take1, prompt_past1.end());
+                if (params.n_max_text_ctx > 0 &&
+                    t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) {
+
+                    const bool have_dynamic = !prompt_past1.empty();
+                    const bool can_carry_static = params.carry_initial_prompt && !prompt_past0.empty() && !first_history_iter;
+
+                    if (have_dynamic || can_carry_static) {
+                        int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
+                        if (max_ctx_half > 0) {
+                            // Always start with previous token marker to connect continuity
+                            prompt.push_back(whisper_token_prev(ctx));
+
+                            if (can_carry_static) {
+                                // Budget includes the prev token; we already consumed 1 slot.
+                                int budget = max_ctx_half; // total allowed (including prev)
+
+                                // Take as many static tokens as fit (reserving at least the prev token already placed)
+                                int take_static = std::min(budget - 1, (int) prompt_past0.size());
+                                if (take_static > 0) {
+                                    auto start0 = take_static < (int) prompt_past0.size() ? prompt_past0.end() - take_static : prompt_past0.begin();
+                                    prompt.insert(prompt.end(), start0, prompt_past0.end());
+                                }
+
+                                // Remaining budget for dynamic tail
+                                int remaining = budget - take_static;
+                                if (remaining > 0) {
+                                    int take_dynamic = std::min(remaining, (int) prompt_past1.size());
+                                    if (take_dynamic > 0) {
+                                        prompt.insert(prompt.end(), prompt_past1.end() - take_dynamic, prompt_past1.end());
+                                    }
+                                }
+                            } else {
+                                // Dynamic only path
+                                int n_take = std::min(max_ctx_half, (int) prompt_past1.size());
+                                if (n_take > 0) {
+                                    prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end());
+                                }
                             }
                         }
-                    } else {
-                        int n_take = std::min(max_ctx_half, (int)prompt_past1.size());
-                        if (n_take > 0) {
-                            prompt = { whisper_token_prev(ctx) };
-                            prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end());
-                        }
                     }
                 }
 
@@ -7103,7 +7105,7 @@ int whisper_full_with_state(
                 prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
 
                 // mark first iteration done
-                first_iter_with_prompt = false;
+                first_history_iter = false;
 
                 // print the prompt
                 WHISPER_LOG_DEBUG("\n\n");

From 44880cbdecf8bccbf45ce1924db1ec01db7754e0 Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Wed, 8 Oct 2025 15:31:38 +0200
Subject: [PATCH 06/14] PR fixes

---
 src/whisper.cpp | 76 ++++++++++++++++++++++++-------------------------
 1 file changed, 37 insertions(+), 39 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 89efd9af049..c94ee02feda 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -6893,6 +6893,8 @@ int whisper_full_with_state(
     // prepare prompt
     {
         std::vector<whisper_token> prompt_tokens;
+
+        // tokenize the initial prompt
         if (!params.prompt_tokens && params.initial_prompt) {
             prompt_tokens.resize(1024);
             int n_needed = whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size());
@@ -6907,10 +6909,13 @@ int whisper_full_with_state(
         if (params.prompt_tokens && params.prompt_n_tokens > 0) {
             if (params.carry_initial_prompt) {
                 if (prompt_past0.empty()) {
-                    prompt_past0.insert(prompt_past0.end(), params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens);
+                    prompt_past0.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens);
                 }
             } else {
-                prompt_past1.insert(prompt_past1.end(), params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens);
+                for (int i = 0; i < params.prompt_n_tokens; ++i) {
+                    prompt_past1.push_back(params.prompt_tokens[i]);
+                }
+                std::rotate(prompt_past1.begin(), prompt_past1.end() - params.prompt_n_tokens, prompt_past1.end());
             }
         }
     }
@@ -6968,7 +6973,6 @@ int whisper_full_with_state(
     std::vector<beam_candidate> beam_candidates;
 
     // main loop
-    bool first_history_iter = true; // track first decode iteration for carry_initial_prompt logic
     while (true) {
         if (params.progress_callback) {
             const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
@@ -7063,40 +7067,38 @@ int whisper_full_with_state(
                     t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) {
 
                     const bool have_dynamic = !prompt_past1.empty();
-                    const bool can_carry_static = params.carry_initial_prompt && !prompt_past0.empty() && !first_history_iter;
-
-                    if (have_dynamic || can_carry_static) {
-                        int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
-                        if (max_ctx_half > 0) {
-                            // Always start with previous token marker to connect continuity
-                            prompt.push_back(whisper_token_prev(ctx));
-
-                            if (can_carry_static) {
-                                // Budget includes the prev token; we already consumed 1 slot.
-                                int budget = max_ctx_half; // total allowed (including prev)
-
-                                // Take as many static tokens as fit (reserving at least the prev token already placed)
-                                int take_static = std::min(budget - 1, (int) prompt_past0.size());
-                                if (take_static > 0) {
-                                    auto start0 = take_static < (int) prompt_past0.size() ? prompt_past0.end() - take_static : prompt_past0.begin();
-                                    prompt.insert(prompt.end(), start0, prompt_past0.end());
-                                }
+                    const bool can_carry_static = params.carry_initial_prompt && !prompt_past0.empty() && seek != seek_start;
+
+                    int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
+                    if (max_ctx_half > 0 && (have_dynamic || can_carry_static)) {
+                        // Always start with previous token marker to connect continuity
+                        prompt.push_back(whisper_token_prev(ctx));
+
+                        if (can_carry_static) {
+                            // Budget includes the prev token; we already consumed 1 slot.
+                            int budget = max_ctx_half; // total allowed (including prev)
+
+                            // Take as many static tokens as fit (reserving at least the prev token already placed)
+                            int take_static = std::min(budget - 1, (int) prompt_past0.size());
+                            if (take_static > 0) {
+                                auto start0 = take_static < (int) prompt_past0.size() ? prompt_past0.end() - take_static : prompt_past0.begin();
+                                prompt.insert(prompt.end(), start0, prompt_past0.end());
+                            }
 
-                                // Remaining budget for dynamic tail
-                                int remaining = budget - take_static;
-                                if (remaining > 0) {
-                                    int take_dynamic = std::min(remaining, (int) prompt_past1.size());
-                                    if (take_dynamic > 0) {
-                                        prompt.insert(prompt.end(), prompt_past1.end() - take_dynamic, prompt_past1.end());
-                                    }
-                                }
-                            } else {
-                                // Dynamic only path
-                                int n_take = std::min(max_ctx_half, (int) prompt_past1.size());
-                                if (n_take > 0) {
-                                    prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end());
+                            // Remaining budget for dynamic tail
+                            int remaining = budget - take_static;
+                            if (remaining > 0) {
+                                int take_dynamic = std::min(remaining, (int) prompt_past1.size());
+                                if (take_dynamic > 0) {
+                                    prompt.insert(prompt.end(), prompt_past1.end() - take_dynamic, prompt_past1.end());
                                 }
                             }
+                        } else {
+                            // Dynamic only path
+                            int n_take = std::min(max_ctx_half, (int) prompt_past1.size());
+                            if (n_take > 0) {
+                                prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end());
+                            }
                         }
                     }
                 }
@@ -7104,9 +7106,6 @@ int whisper_full_with_state(
                 // init new transcription with sot, language (opt) and task tokens
                 prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
 
-                // mark first iteration done
-                first_history_iter = false;
-
                 // print the prompt
                 WHISPER_LOG_DEBUG("\n\n");
                 for (int i = 0; i < (int) prompt.size(); i++) {
@@ -7586,8 +7585,7 @@ int whisper_full_with_state(
             if (!params.carry_initial_prompt) {
                 prompt_past1.clear();
                 if (!prompt.empty() && prompt.front() == whisper_token_prev(ctx)) {
-                    auto start_it = prompt.begin() + 1;
-                    prompt_past1.insert(prompt_past1.end(), start_it, prompt.end() - prompt_init.size());
+                    prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
                 }
             }
             if (!is_no_speech) {

From e42cbedb4626a63fc07fd32bb3494f48f86f3d9d Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Wed, 8 Oct 2025 18:03:26 +0200
Subject: [PATCH 07/14] More PR fixes

---
 src/whisper.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index c94ee02feda..c83e995be7b 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -7582,12 +7582,12 @@ int whisper_full_with_state(
 
             //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
 
-            if (!params.carry_initial_prompt) {
-                prompt_past1.clear();
-                if (!prompt.empty() && prompt.front() == whisper_token_prev(ctx)) {
-                    prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
-                }
+            // update prompt_past1
+            prompt_past1.clear();
+            if (!prompt.empty() && prompt.front() == whisper_token_prev(ctx)) {
+                prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
             }
+
             if (!is_no_speech) {
                 for (int i = 0; i < result_len; ++i) {
                     prompt_past1.push_back(tokens_cur[i].id);

From ee5adba8de62067eb3bf817c7080a2dd608c8ff6 Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Fri, 10 Oct 2025 09:33:08 +0200
Subject: [PATCH 08/14] PR fix

---
 src/whisper.cpp | 43 ++++++++++++++-----------------------------
 1 file changed, 14 insertions(+), 29 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index c83e995be7b..f851a4d155d 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -7063,42 +7063,27 @@ int whisper_full_with_state(
             {
                 prompt.clear();
 
-                if (params.n_max_text_ctx > 0 &&
-                    t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) {
+                if (params.n_max_text_ctx > 0 && t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) {
+                    const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty() && seek != seek_start;
+                    const bool can_take1 = !prompt_past1.empty();
 
-                    const bool have_dynamic = !prompt_past1.empty();
-                    const bool can_carry_static = params.carry_initial_prompt && !prompt_past0.empty() && seek != seek_start;
-
-                    int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
-                    if (max_ctx_half > 0 && (have_dynamic || can_carry_static)) {
+                    int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx) / 2);
+                    if (max_ctx_half > 0 && (can_take0 || can_take1)) {
                         // Always start with previous token marker to connect continuity
                         prompt.push_back(whisper_token_prev(ctx));
 
-                        if (can_carry_static) {
-                            // Budget includes the prev token; we already consumed 1 slot.
-                            int budget = max_ctx_half; // total allowed (including prev)
-
-                            // Take as many static tokens as fit (reserving at least the prev token already placed)
-                            int take_static = std::min(budget - 1, (int) prompt_past0.size());
-                            if (take_static > 0) {
-                                auto start0 = take_static < (int) prompt_past0.size() ? prompt_past0.end() - take_static : prompt_past0.begin();
+                        int n_take0 = 0;
+                        if (can_take0) {
+                            n_take0 = std::min<int>(max_ctx_half - 1, prompt_past0.size());
+                            if (n_take0 > 0) {
+                                auto start0 = n_take0 < (int)prompt_past0.size() ? prompt_past0.end() - n_take0 : prompt_past0.begin();
                                 prompt.insert(prompt.end(), start0, prompt_past0.end());
                             }
+                        }
 
-                            // Remaining budget for dynamic tail
-                            int remaining = budget - take_static;
-                            if (remaining > 0) {
-                                int take_dynamic = std::min(remaining, (int) prompt_past1.size());
-                                if (take_dynamic > 0) {
-                                    prompt.insert(prompt.end(), prompt_past1.end() - take_dynamic, prompt_past1.end());
-                                }
-                            }
-                        } else {
-                            // Dynamic only path
-                            int n_take = std::min(max_ctx_half, (int) prompt_past1.size());
-                            if (n_take > 0) {
-                                prompt.insert(prompt.end(), prompt_past1.end() - n_take, prompt_past1.end());
-                            }
+                        int n_take1 = std::min<int>(max_ctx_half - n_take0 - 1, prompt_past1.size());
+                        if (n_take1 > 0) {
+                            prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end());
                         }
                     }
                 }

From 6417091b52e02fee0a87b5730168d9f0bd056144 Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Fri, 10 Oct 2025 09:35:07 +0200
Subject: [PATCH 09/14] Further simplification

---
 src/whisper.cpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index f851a4d155d..065d606fe02 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -7067,24 +7067,21 @@ int whisper_full_with_state(
                     const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty() && seek != seek_start;
                     const bool can_take1 = !prompt_past1.empty();
 
-                    int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx) / 2);
+                    const int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx) / 2);
                     if (max_ctx_half > 0 && (can_take0 || can_take1)) {
                         // Always start with previous token marker to connect continuity
                         prompt.push_back(whisper_token_prev(ctx));
 
+                        // Take static tokens (initial prompt) first, up to budget minus the prev token
                         int n_take0 = 0;
                         if (can_take0) {
                             n_take0 = std::min<int>(max_ctx_half - 1, prompt_past0.size());
-                            if (n_take0 > 0) {
-                                auto start0 = n_take0 < (int)prompt_past0.size() ? prompt_past0.end() - n_take0 : prompt_past0.begin();
-                                prompt.insert(prompt.end(), start0, prompt_past0.end());
-                            }
+                            prompt.insert(prompt.end(), prompt_past0.end() - n_take0, prompt_past0.end());
                         }
 
-                        int n_take1 = std::min<int>(max_ctx_half - n_take0 - 1, prompt_past1.size());
-                        if (n_take1 > 0) {
-                            prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end());
-                        }
+                        // Fill remaining budget with dynamic tokens (rolling context)
+                        const int n_take1 = std::min<int>(max_ctx_half - n_take0 - 1, prompt_past1.size());
+                        prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end());
                     }
                 }
 

From bd4856160345db1d4c462abccd0ca7dfd11fea3b Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Fri, 10 Oct 2025 10:50:00 +0200
Subject: [PATCH 10/14] d'oh

---
 src/whisper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 065d606fe02..d8887aab648 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -7064,7 +7064,7 @@ int whisper_full_with_state(
                 prompt.clear();
 
                 if (params.n_max_text_ctx > 0 && t_cur < WHISPER_HISTORY_CONDITIONING_TEMP_CUTOFF) {
-                    const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty() && seek != seek_start;
+                    const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty();
                     const bool can_take1 = !prompt_past1.empty();
 
                     const int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx) / 2);

From 037b419a537f491f21a2e8ae930252c37c8b165c Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Fri, 10 Oct 2025 11:07:08 +0200
Subject: [PATCH 11/14] One more logic fix

---
 src/whisper.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index d8887aab648..be4371fda63 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -7566,10 +7566,11 @@ int whisper_full_with_state(
 
             // update prompt_past1
             prompt_past1.clear();
-            if (!prompt.empty() && prompt.front() == whisper_token_prev(ctx)) {
+            if (!params.carry_initial_prompt && !prompt.empty() && prompt.front() == whisper_token_prev(ctx)) {
                 prompt_past1.insert(prompt_past1.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
             }
 
+            // Add newly decoded tokens to the rolling context
             if (!is_no_speech) {
                 for (int i = 0; i < result_len; ++i) {
                     prompt_past1.push_back(tokens_cur[i].id);

From f6139dfacc12ff65e51e804608b0d7cba463916c Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Fri, 10 Oct 2025 11:35:15 +0200
Subject: [PATCH 12/14] Update src/whisper.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 src/whisper.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index be4371fda63..cca68194469 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -7002,6 +7002,7 @@ int whisper_full_with_state(
         // if there is a very short audio segment left to process, we remove any past prompt since it tends
         // to confuse the decoder and often make it repeat or hallucinate stuff
         if (seek > seek_start && seek + 500 >= seek_end) {
+            prompt_past0.clear();
             prompt_past1.clear();
         }
 

From c86f3c879adc3558d94ce8f5e1de1576718300e9 Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Fri, 10 Oct 2025 12:54:26 +0200
Subject: [PATCH 13/14] Truncate prompt_past0 upon initialization

---
 src/whisper.cpp | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index cca68194469..24d7a260342 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -6890,6 +6890,9 @@ int whisper_full_with_state(
         prompt_past1.clear();
     }
 
+    // calculate the maximum context budget for prompt history
+    const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
+
     // prepare prompt
     {
         std::vector<whisper_token> prompt_tokens;
@@ -6909,7 +6912,15 @@ int whisper_full_with_state(
         if (params.prompt_tokens && params.prompt_n_tokens > 0) {
             if (params.carry_initial_prompt) {
                 if (prompt_past0.empty()) {
-                    prompt_past0.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens);
+                    const int max_tokens = std::max(1, max_prompt_ctx - 1);
+
+                    if (params.prompt_n_tokens > max_tokens) {
+                        WHISPER_LOG_WARN("%s: initial prompt is too long (%d tokens), will use only the last %d tokens\n",
+                                        __func__, params.prompt_n_tokens, max_tokens);
+                    }
+
+                    const int n_tokens = std::min(params.prompt_n_tokens, max_tokens);
+                    prompt_past0.assign(params.prompt_tokens + (params.prompt_n_tokens - n_tokens), params.prompt_tokens + params.prompt_n_tokens);
                 }
             } else {
                 for (int i = 0; i < params.prompt_n_tokens; ++i) {
@@ -7068,20 +7079,19 @@ int whisper_full_with_state(
                     const bool can_take0 = params.carry_initial_prompt && !prompt_past0.empty();
                     const bool can_take1 = !prompt_past1.empty();
 
-                    const int max_ctx_half = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx) / 2);
-                    if (max_ctx_half > 0 && (can_take0 || can_take1)) {
+                    if (max_prompt_ctx > 0 && (can_take0 || can_take1)) {
                         // Always start with previous token marker to connect continuity
                         prompt.push_back(whisper_token_prev(ctx));
 
                         // Take static tokens (initial prompt) first, up to budget minus the prev token
                         int n_take0 = 0;
                         if (can_take0) {
-                            n_take0 = std::min<int>(max_ctx_half - 1, prompt_past0.size());
+                            n_take0 = std::min<int>(max_prompt_ctx - 1, prompt_past0.size());
                             prompt.insert(prompt.end(), prompt_past0.end() - n_take0, prompt_past0.end());
                         }
 
                         // Fill remaining budget with dynamic tokens (rolling context)
-                        const int n_take1 = std::min<int>(max_ctx_half - n_take0 - 1, prompt_past1.size());
+                        const int n_take1 = std::min<int>(max_prompt_ctx - n_take0 - 1, prompt_past1.size());
                         prompt.insert(prompt.end(), prompt_past1.end() - n_take1, prompt_past1.end());
                     }
                 }

From 41df41b948c5047b655fb62fb44537f13f7a9638 Mon Sep 17 00:00:00 2001
From: Andreas Lubbe <git@lubbe.org>
Date: Fri, 10 Oct 2025 13:00:58 +0200
Subject: [PATCH 14/14] Slight simplification

---
 src/whisper.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 24d7a260342..6f261f800af 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -7083,10 +7083,10 @@ int whisper_full_with_state(
                         // Always start with previous token marker to connect continuity
                         prompt.push_back(whisper_token_prev(ctx));
 
-                        // Take static tokens (initial prompt) first, up to budget minus the prev token
+                        // Take static tokens (initial prompt) first
                         int n_take0 = 0;
                         if (can_take0) {
-                            n_take0 = std::min<int>(max_prompt_ctx - 1, prompt_past0.size());
+                            n_take0 = prompt_past0.size();
                             prompt.insert(prompt.end(), prompt_past0.end() - n_take0, prompt_past0.end());
                         }