From eca791ffc3453d3bbd9331e36f0dae5c6339c6f6 Mon Sep 17 00:00:00 2001
From: Ivy233 <wangjinrun@uniontech.com>
Date: Mon, 24 Mar 2025 10:27:37 +0800
Subject: [PATCH] =?UTF-8?q?[Develop]=20=E8=B0=83=E6=95=B4=E6=B5=81?=
 =?UTF-8?q?=E8=BE=93=E5=85=A5=E5=9C=A8use-vad=E6=83=85=E5=86=B5=E4=B8=8B?=
 =?UTF-8?q?=E7=9A=84=E9=80=BB=E8=BE=91=EF=BC=9A=201.=20=E5=9C=A8=E8=AF=A5?=
 =?UTF-8?q?=E6=83=85=E5=86=B5=E4=B8=8B=EF=BC=8C=E5=8F=AF=E4=BB=A5=E7=9C=8B?=
 =?UTF-8?q?=E5=88=B0=E5=AE=9E=E6=97=B6=E7=9A=84=E8=AF=AD=E9=9F=B3=E6=9B=B4?=
 =?UTF-8?q?=E6=96=B0=E3=80=82=E5=8E=9F=E5=85=88=E7=9A=84=E2=80=9C=E6=9C=80?=
 =?UTF-8?q?=E9=95=BF=E8=AF=AD=E9=9F=B3=E2=80=9D=E4=B8=8E=E2=80=9C=E5=88=87?=
 =?UTF-8?q?=E5=89=B2=E6=89=80=E9=9C=80=E6=9C=80=E7=9F=AD=E9=97=B4=E9=9A=94?=
 =?UTF-8?q?=E2=80=9D=E8=A7=84=E5=88=99=E4=BE=9D=E6=97=A7=E7=94=9F=E6=95=88?=
 =?UTF-8?q?=E3=80=82=202.=20=E5=9C=A8=E8=AF=A5=E6=83=85=E5=86=B5=E4=B8=8B?=
 =?UTF-8?q?=EF=BC=8C=E5=A6=82=E6=9E=9C=E9=9C=80=E8=A6=81=E5=88=87=E5=89=B2?=
 =?UTF-8?q?=E9=83=A8=E5=88=86=E8=AF=AD=E9=9F=B3=EF=BC=8C=E5=89=A9=E4=BD=99?=
 =?UTF-8?q?=E9=83=A8=E5=88=86=E4=BC=9A=E6=8D=A2=E8=A1=8C=E7=BB=A7=E7=BB=AD?=
 =?UTF-8?q?=E8=BE=93=E5=87=BA=E3=80=82=203.=20=E5=9C=A8=E8=AF=A5=E6=83=85?=
 =?UTF-8?q?=E5=86=B5=E4=B8=8B=EF=BC=8C=E5=A6=82=E6=9E=9C=E7=BC=93=E5=86=B2?=
 =?UTF-8?q?=E5=8C=BA=E6=B2=A1=E6=9C=89=E9=9C=80=E8=A6=81=E8=AF=86=E5=88=AB?=
 =?UTF-8?q?=E7=9A=84=E8=AF=AD=E9=9F=B3=EF=BC=8C=E5=B9=B6=E4=B8=94=E6=8E=A5?=
 =?UTF-8?q?=E5=8F=97=E5=88=B0=E4=B8=80=E6=AE=B5=E6=B2=A1=E6=9C=89=E8=AF=AD?=
 =?UTF-8?q?=E9=9F=B3=E6=B4=BB=E5=8A=A8=E7=9A=84=E9=9F=B3=E9=A2=91=EF=BC=8C?=
 =?UTF-8?q?=E4=BC=9A=E8=88=8D=E5=BC=83=E4=B9=8B=E3=80=82=204.=20=E5=86=85?=
 =?UTF-8?q?=E9=83=A8=E8=AE=B0=E5=BD=95=E7=9A=84=E6=97=B6=E9=97=B4=E6=88=B3?=
 =?UTF-8?q?=E5=90=AB=E4=B9=89=E5=8F=91=E7=94=9F=E6=94=B9=E5=8F=98=EF=BC=8C?=
 =?UTF-8?q?=E4=BB=8E=E7=9B=B8=E5=AF=B9=E5=80=BC=E8=B0=83=E6=95=B4=E4=B8=BA?=
 =?UTF-8?q?=E7=BB=9D=E5=AF=B9=E5=80=BC=EF=BC=8C=E8=BF=99=E5=8F=AF=E4=BB=A5?=
 =?UTF-8?q?=E5=A2=9E=E5=BC=BA=E5=AF=B9pcmf32=E7=BC=93=E5=86=B2=E5=8C=BA?=
 =?UTF-8?q?=E5=A4=84=E7=90=86=E7=9A=84=E4=BB=A3=E7=A0=81=E5=8F=AF=E8=AF=BB?=
 =?UTF-8?q?=E6=80=A7=E3=80=82=205.=20=E4=BF=AE=E5=A4=8D=E4=B9=8B=E5=89=8D?=
 =?UTF-8?q?=E5=85=B6=E4=BB=96=E4=BA=BA=E6=8F=90=E4=BA=A4=E4=B8=AD=EF=BC=8C?=
 =?UTF-8?q?=E6=97=B6=E9=97=B4=E5=8C=BA=E9=97=B4=E4=BC=9A=E5=8F=8D=E5=A4=8D?=
 =?UTF-8?q?=E5=87=BA=E7=8E=B0=E7=9A=84bug=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/stream/stream.cc | 297 +++++++++++++++++++-------------------
 1 file changed, 151 insertions(+), 146 deletions(-)
diff --git a/examples/stream/stream.cc b/examples/stream/stream.cc
index ec5be77..68e4780 100644
--- a/examples/stream/stream.cc
+++ b/examples/stream/stream.cc
@@ -1,79 +1,94 @@
+#include "common-sdl.h"
+#include "common.h"
+#include "sense-voice.h"
+#include <algorithm>
 #include <cassert>
 #include <cstdio>
+#include <fstream>
+#include <queue>
 #include <string>
 #include <thread>
 #include <vector>
-#include <fstream>
-#include <algorithm>
-#include "sense-voice.h"
-#include "common-sdl.h"
-#include "common.h"
 
 struct sense_voice_stream_params {
-    int32_t n_threads         = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors      = 1;
-    int32_t capture_id        = -1;
-    int32_t chunk_size        = 100;                       // ms
-    int32_t max_nomute_chunks = 8000 / chunk_size;  // chunks
-    int32_t min_mute_chunks   = 1000 / chunk_size;    // chunks
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());// compute threads (-t / --threads); the default is capped at 4
+    int32_t n_processors = 1;                     // forwarded to sense_voice_full_parallel
+    int32_t capture_id = -1;                      // capture device ID (-c / --capture)
+    int32_t chunk_size = 100;                     // ms; VAD chunk length (--chunk-size)
+    int32_t max_nomute_chunks = 8000 / chunk_size;// chunks; evaluated once at construction from chunk_size's initializer, not recomputed when --chunk-size is parsed
+    int32_t min_mute_chunks = 1000 / chunk_size;  // chunks; same caveat as max_nomute_chunks
 
-    bool use_gpu       = true;
-    bool flash_attn    = false;
-    bool debug_mode    = false;
-    bool use_vad       = false;
-    bool use_itn       = false;
-    bool use_prefix   = false;
-    std::string language  = "auto";
-    std::string model     = "models/ggml-base.en.bin";
+    bool use_gpu = true;// cleared by -ng / --no-gpu
+    bool flash_attn = false;// set by -fa / --flash-attn
+    bool debug_mode = false;// set by -debug / --debug-mode
+    bool use_vad = false;// set by --use-vad
+    bool use_itn = false;// set by --use-itn
+    bool use_prefix = false;// set by --use-prefix
+    std::string language = "auto";// set by -l / --language
+    std::string model = "models/ggml-base.en.bin";// set by -m / --model
     std::string fname_out;
 };
 
 
-void sense_voice_stream_usage(int /*argc*/, char ** argv, const sense_voice_stream_params & params) {
+void sense_voice_stream_usage(int /*argc*/, char **argv, const sense_voice_stream_params &params) {// Prints the option summary to stderr; bracketed values are read from params.
     fprintf(stderr, "\n");
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h,       --help              [default] show this help message and exit\n");
     fprintf(stderr, "  -t N,     --threads N         [%-7d] [SenseVoice] number of threads to use during computation\n", params.n_threads);
-    fprintf(stderr, "            --chunk_size        [%-7d] vad chunk size(ms)\n",                                       params.chunk_size);
-    fprintf(stderr, "  -mmc      --min-mute-chunks   [%-7d] When consecutive chunks are identified as silence\n",        params.min_mute_chunks);
-    fprintf(stderr, "  -mnc      --max-nomute-chunks [%-7d] when the first non-silent chunk is too far away\n",          params.max_nomute_chunks);
-    fprintf(stderr, "            --use-vad           [%-7s] when the first non-silent chunk is too far away\n",          params.use_vad ? "true" : "false");
-    fprintf(stderr, "            --use-prefix        [%-7s] use sense voice prefix\n",          params.use_prefix ? "true" : "false");
-    fprintf(stderr, "  -c ID,    --capture ID        [%-7d] [Device] capture device ID\n",                               params.capture_id);
-    fprintf(stderr, "  -l LANG,  --language LANG     [%-7s] [SenseVoice] spoken language\n",                             params.language.c_str());
-    fprintf(stderr, "  -m FNAME, --model FNAME       [%-7s] [SenseVoice] model path\n",                                  params.model.c_str());
-    fprintf(stderr, "  -f FNAME, --file FNAME        [%-7s] [IO] text output file name\n",                               params.fname_out.c_str());
-    fprintf(stderr, "  -ng,      --no-gpu            [%-7s] [SenseVoice] disable GPU inference\n",                       params.use_gpu ? "false" : "true");
-    fprintf(stderr, "  -fa,      --flash-attn        [%-7s] [SenseVoice] flash attention during inference\n",            params.flash_attn ? "true" : "false");
-    fprintf(stderr, "            --use-itn           [%-7s] [SenseVoice] Filter duplicate tokens when outputting\n",     params.use_itn ? "true" : "false");
+    fprintf(stderr, "            --chunk-size        [%-7d] vad chunk size(ms)\n", params.chunk_size);// spelled the way the argument parser accepts it
+    fprintf(stderr, "  -mmc      --min-mute-chunks   [%-7d] When consecutive chunks are identified as silence\n", params.min_mute_chunks);
+    fprintf(stderr, "  -mnc      --max-nomute-chunks [%-7d] when the first non-silent chunk is too far away\n", params.max_nomute_chunks);
+    fprintf(stderr, "            --use-vad           [%-7s] enable voice activity detection to segment the input stream\n", params.use_vad ? "true" : "false");
+    fprintf(stderr, "            --use-prefix        [%-7s] use sense voice prefix\n", params.use_prefix ? "true" : "false");
+    fprintf(stderr, "  -c ID,    --capture ID        [%-7d] [Device] capture device ID\n", params.capture_id);
+    fprintf(stderr, "  -l LANG,  --language LANG     [%-7s] [SenseVoice] spoken language\n", params.language.c_str());
+    fprintf(stderr, "  -m FNAME, --model FNAME       [%-7s] [SenseVoice] model path\n", params.model.c_str());
+    fprintf(stderr, "  -f FNAME, --file FNAME        [%-7s] [IO] text output file name\n", params.fname_out.c_str());
+    fprintf(stderr, "  -ng,      --no-gpu            [%-7s] [SenseVoice] disable GPU inference\n", params.use_gpu ? "false" : "true");// prints "true" when GPU inference is disabled
+    fprintf(stderr, "  -fa,      --flash-attn        [%-7s] [SenseVoice] flash attention during inference\n", params.flash_attn ? "true" : "false");
+    fprintf(stderr, "            --use-itn           [%-7s] [SenseVoice] Filter duplicate tokens when outputting\n", params.use_itn ? "true" : "false");
     fprintf(stderr, "\n");
 }
 
 
-static bool get_stream_params(int argc, char ** argv, sense_voice_stream_params & params) {
+static bool get_stream_params(int argc, char **argv, sense_voice_stream_params &params) {
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
 
         if (arg == "-h" || arg == "--help") {
             sense_voice_stream_usage(argc, argv, params);
             exit(0);
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(argv[++i]);
+        } else if (arg == "-c" || arg == "--capture") {
+            params.capture_id = std::stoi(argv[++i]);
+        } else if (arg == "-l" || arg == "--language") {
+            params.language = argv[++i];
+        } else if (arg == "-m" || arg == "--model") {
+            params.model = argv[++i];
+        } else if (arg == "-f" || arg == "--file") {
+            params.fname_out = argv[++i];
+        } else if (arg == "-ng" || arg == "--no-gpu") {
+            params.use_gpu = false;
+        } else if (arg == "-fa" || arg == "--flash-attn") {
+            params.flash_attn = true;
+        } else if (arg == "-debug" || arg == "--debug-mode") {
+            params.debug_mode = true;
+        } else if (arg == "-mmc" || arg == "--min-mute-chunks") {
+            params.min_mute_chunks = std::stoi(argv[++i]);
+        } else if (arg == "-mnc" || arg == "--max-nomute-chunks") {
+            params.max_nomute_chunks = std::stoi(argv[++i]);
+        } else if (arg == "--use-vad") {
+            params.use_vad = true;
+        } else if (arg == "--use-prefix") {
+            params.use_prefix = true;
+        } else if (arg == "--chunk-size") {
+            params.chunk_size = std::stoi(argv[++i]);
+        } else if (arg == "--use-itn") {
+            params.use_itn = true;
         }
-        else if (arg == "-t"    || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
-        else if (arg == "-c"    || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
-        else if (arg == "-l"    || arg == "--language")      { params.language      = argv[++i]; }
-        else if (arg == "-m"    || arg == "--model")         { params.model         = argv[++i]; }
-        else if (arg == "-f"    || arg == "--file")          { params.fname_out     = argv[++i]; }
-        else if (arg == "-ng"   || arg == "--no-gpu")        { params.use_gpu       = false; }
-        else if (arg == "-fa"   || arg == "--flash-attn")    { params.flash_attn    = true; }
-        else if (arg == "-debug"|| arg == "--debug-mode")    { params.debug_mode    = true; }
-        else if (arg == "-mmc"  || arg == "--min-mute-chunks")   { params.min_mute_chunks   = std::stoi(argv[++i]); }
-        else if (arg == "-mnc"  || arg == "--max-nomute-chunks") { params.max_nomute_chunks = std::stoi(argv[++i]); }
-        else if (                  arg == "--use-vad")           { params.use_vad           = true; }
-        else if (                  arg == "--use-prefix")        { params.use_prefix        = true; }
-        else if (                  arg == "--chunk-size")        { params.chunk_size        = std::stoi(argv[++i]); }
-        else if (                  arg == "--use-itn")           { params.use_itn           = true; }
 
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -85,7 +100,7 @@ static bool get_stream_params(int argc, char ** argv, sense_voice_stream_params
     return true;
 }
 
-void sense_voice_free(struct sense_voice_context * ctx) {
+void sense_voice_free(struct sense_voice_context *ctx) {
     if (ctx) {
         ggml_free(ctx->model.ctx);
 
@@ -100,8 +115,7 @@ void sense_voice_free(struct sense_voice_context * ctx) {
 }
 
 
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
     sense_voice_stream_params params;
     if (get_stream_params(argc, argv, params) == false) return 1;
     const int n_sample_step = params.chunk_size * 1e-3 * SENSE_VOICE_SAMPLE_RATE;
@@ -124,8 +138,7 @@ int main(int argc, char** argv)
     struct sense_voice_context *ctx = sense_voice_small_init_from_file_with_params(params.model.c_str(), cparams);
     std::vector<float> pcmf32_audio;
     std::vector<double> pcmf32;
-    std::vector<double> pcmf32_tmp;  // 传递给模型用
-    std::vector<int> prompt_tokens;
+    std::vector<double> pcmf32_tmp;// 传递给模型用
     std::ofstream fout;
     if (params.fname_out.length() > 0) {
         fout.open(params.fname_out);
@@ -156,14 +169,15 @@ int main(int argc, char** argv)
 
     sense_voice_full_params wparams = sense_voice_full_default_params(SENSE_VOICE_SAMPLING_GREEDY);
     {
-        wparams.language         = params.language.c_str();
-        wparams.n_threads        = params.n_threads;
-        wparams.debug_mode       = params.debug_mode;
+        wparams.language = params.language.c_str();
+        wparams.n_threads = params.n_threads;
+        wparams.debug_mode = params.debug_mode;
     }
 
     int idenitified_floats = 0, R_new_chunk = 0, L_new_chunk = 0;
     // 只有vad会使用，但是会贯穿全程，只能定义在外面
-    int L_nomute = -1, L_mute = -1, R_mute = -1;
+    std::pair<int, int> nomute = std::pair<int, int>(-1, 0);// 标记最后一段非静音的区间
+    std::pair<int, int> mute = std::pair<int, int>(-1, -1); // 标记最后一段静音的区间
     while (is_running) {
         // handle Ctrl + C
         is_running = sdl_poll_events();
@@ -177,122 +191,113 @@ int main(int argc, char** argv)
         pcmf32_audio.clear();
         // 新的识别区间：[L_new_chunk, R_new_chunk)，是固定n_sample_step的整数倍
         // fprintf(stderr, "%s || new_audio_size: %d, cache_size: %d, L_new_chunk: %d, R_new_chunk: %d\n", __func__, pcmf32_audio.size(), pcmf32.size(), L_new_chunk, R_new_chunk);
-        if(R_new_chunk + n_sample_step <= pcmf32.size())
-        {
+        if (R_new_chunk + n_sample_step <= pcmf32.size() + idenitified_floats) {
             L_new_chunk = R_new_chunk;
-            R_new_chunk = int(pcmf32.size() / n_sample_step) * n_sample_step;
-        }
-        else continue;
+            R_new_chunk = int((pcmf32.size() + idenitified_floats) / n_sample_step) * n_sample_step;
+        } else
+            continue;
 
         if (!params.use_vad) {
-            // 识别当前已经识别到的文本恩
+            // 识别当前已经识别到的文本
             printf("\33[2K\r");
             printf("%s", std::string(50, ' ').c_str());
             printf("\33[2K\r");
-            printf("[%.2f-%.2f]", idenitified_floats / (SENSE_VOICE_SAMPLE_RATE * 1.0), (R_new_chunk + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0));
-            if (sense_voice_full_parallel(ctx, wparams, pcmf32, R_new_chunk, params.n_processors) != 0) {
+            printf("[%.2f-%.2f]", idenitified_floats / (SENSE_VOICE_SAMPLE_RATE * 1.0), R_new_chunk / (SENSE_VOICE_SAMPLE_RATE * 1.0));
+            if (sense_voice_full_parallel(ctx, wparams, pcmf32, R_new_chunk - idenitified_floats, params.n_processors) != 0) {
                 fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                 return 10;
             }
 
             sense_voice_print_output(ctx, params.use_prefix, params.use_itn, true);
             // 时间长度太长了直接换行重新开始
-            if (R_new_chunk >= max_nomute_step)
-            {
+            if (R_new_chunk >= max_nomute_step + idenitified_floats) {
                 printf("\n");
-                pcmf32_tmp = std::vector<double>(pcmf32.begin() + R_new_chunk, pcmf32.end());
+                pcmf32_tmp = std::vector<double>(pcmf32.begin() + R_new_chunk - idenitified_floats, pcmf32.end());
                 pcmf32 = pcmf32_tmp;
-                idenitified_floats += R_new_chunk;
-                L_new_chunk = R_new_chunk = 0;
+                idenitified_floats = R_new_chunk;
             }
-        }
-        else
-        {
-            // 基于pcmf32.begin()是0来计算的，转化成全局坐标需要加上idenitified_floats
-            for(int i = L_new_chunk; i < R_new_chunk; i += n_sample_step)
-            {
-                int R_this_chunk = i + n_sample_step;
-                bool isnomute = vad_energy_zcr<double>(pcmf32.begin() + i, n_sample_step, SENSE_VOICE_SAMPLE_RATE, 1e-5, 0.2);
-                // fprintf(stderr, "Mute || isnomute = %d, L_mute = %d, R_Mute = %d, L_nomute = %d, R_this_chunk = %d, idenitified = %d\n", isnomute, L_mute, R_mute, L_nomute, R_this_chunk, idenitified_floats);
-                if (L_nomute >= 0 && R_this_chunk - L_nomute >= max_nomute_step)
-                {
-                    int R_nomute = L_mute >= 0 ? L_mute : R_this_chunk;
-                    // printf("3333: %d %d %d %d %d\n", L_nomute, R_nomute, L_mute, i, R_this_chunk);
-                    pcmf32_tmp.resize(R_nomute - L_nomute);
-                    std::copy(pcmf32.begin() + L_nomute, pcmf32.begin() + R_nomute, pcmf32_tmp.begin());
-                    printf("type2 :[%.2f-%.2f - %d ]", idenitified_floats / (SENSE_VOICE_SAMPLE_RATE * 1.0), (R_new_chunk + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0), pcmf32_tmp.size());
-                    if (sense_voice_full_parallel(ctx, wparams, pcmf32_tmp, pcmf32_tmp.size(), params.n_processors) != 0) {
-                        fprintf(stderr, "%s: failed to process audio\n", argv[0]);
-                        return 10;
-                    }
-                    fprintf(stderr, "print 2\n");
-                    sense_voice_print_output(ctx, params.use_prefix, params.use_itn);
-//                    fflush(stdout);
-                    if (!isnomute) L_nomute = -1;
-                    else if (R_mute >= 0) L_nomute = R_mute;
-                    else L_nomute = i;
-                    L_mute = R_mute = -1;
+        } else {
+            // 刷新整行
+            printf("\33[2K\r");
+            printf("%s", std::string(50, ' ').c_str());
+            printf("\33[2K\r");
+            // 新进来的所有chunk有可能导致序列分拆，需要注意
+            for (int i = L_new_chunk; i < R_new_chunk; i += n_sample_step) {
+                // int R_this_chunk = i + n_sample_step;
+                bool isnomute = vad_energy_zcr<double>(pcmf32.begin() + i - idenitified_floats, n_sample_step, SENSE_VOICE_SAMPLE_RATE, 1e-5, 0.2);
+                // fprintf(stderr, "Mute || isnomute = %d, ML = %d, MR = %d, NML = %d, NMR = %d, R_new_chunk = %d, i = %d, size = %d, idenitified = %d\n", isnomute, mute.first, mute.second, nomute.first, nomute.second, R_new_chunk, i, pcmf32.size(), idenitified_floats);
+                if (nomute.first == -1) {
+                    if (isnomute) nomute.first = i;
                     continue;
                 }
-                if (isnomute)
-                {
-                    if (L_nomute < 0) L_nomute = i;
-
-                    printf("\33[2K\r");
-                    printf("%s", std::string(50, ' ').c_str());
-                    printf("\33[2K\r");
-                    pcmf32_tmp.resize(R_new_chunk - L_nomute);
-                    std::copy(pcmf32.begin() + L_nomute, pcmf32.begin() + R_new_chunk, pcmf32_tmp.begin());
-                    printf("[%.2f-%.2f]", (L_nomute + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0), (R_new_chunk + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0));
-
-                    if (sense_voice_full_parallel(ctx, wparams, pcmf32_tmp, pcmf32_tmp.size(), params.n_processors) != 0) {
-                        fprintf(stderr, "%s: failed to process audio\n", argv[0]);
-                        return 10;
-                    }
-
-                    sense_voice_print_output(ctx, params.use_prefix, params.use_itn, true);
-                }
-                else
-                {
-                    if (R_mute != i) L_mute = i;
-                    R_mute = R_this_chunk;
-                    if (L_mute >= L_nomute && L_nomute >= 0 && R_this_chunk - L_mute >= keep_nomute_step)
-                    {
-                        // printf("2222: %d %d %d %d %d\n", L_nomute, R_nomute, L_mute, i, R_this_chunk);
-                        pcmf32_tmp.resize(L_mute - L_nomute);
-                        std::copy(pcmf32.begin() + L_nomute, pcmf32.begin() + L_mute, pcmf32_tmp.begin());
-                        printf("[%.2f-%.2f]", (L_nomute + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0), (L_mute + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0));
+                if (mute.first == -1) {
+                    // type1 [NML, ...)
+                    if (!isnomute) mute.first = i;
+                } else if (mute.second == -1) {
+                    // type2 [NML, ML), [ML, ...)
+                    if (isnomute) mute.second = i;
+                    else if (i - mute.first >= keep_nomute_step) {
+                        // 需要识别[NML, ML)，并退化到nomute, mute全部为-1的情况
+                        pcmf32_tmp.resize(mute.first - nomute.first);
+                        std::copy(pcmf32.begin() + nomute.first - idenitified_floats, pcmf32.begin() + mute.first - idenitified_floats, pcmf32_tmp.begin());
+                        printf("[%.2f-%.2f]", nomute.first / (SENSE_VOICE_SAMPLE_RATE * 1.0), mute.first / (SENSE_VOICE_SAMPLE_RATE * 1.0));
                         if (sense_voice_full_parallel(ctx, wparams, pcmf32_tmp, pcmf32_tmp.size(), params.n_processors) != 0) {
                             fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                             return 10;
                         }
-                        sense_voice_print_output(ctx, params.use_prefix, params.use_itn);
-
-                        if (!isnomute) L_nomute = -1;
-                        else if (R_mute >= 0) L_nomute = R_mute;
-                        else L_nomute = i;
-                        L_mute = R_mute = -1;
+                        sense_voice_print_output(ctx, params.use_prefix, params.use_itn);// 这里整行输出即可
+                        nomute.second = i;
+                        nomute.first = mute.first = mute.second = -1;
+                    }
+                } else {
+                    // type3 [NML, ML), [ML, MR), [MR, ...)
+                    if (!isnomute) mute.first = i, mute.second = -1;
+                }
+                // 可能需要裂解
+                if (nomute.first >= 0 && i - nomute.first >= max_nomute_step) {
+                    int R_nomute = mute.first == -1 ? nomute.first + max_nomute_step : mute.first;
+                    pcmf32_tmp.resize(R_nomute - nomute.first);
+                    std::copy(pcmf32.begin() + (nomute.first - idenitified_floats), pcmf32.begin() + (R_nomute - idenitified_floats), pcmf32_tmp.begin());
+                    printf("[%.2f-%.2f]", nomute.first / (SENSE_VOICE_SAMPLE_RATE * 1.0), R_nomute / (SENSE_VOICE_SAMPLE_RATE * 1.0));
+                    if (sense_voice_full_parallel(ctx, wparams, pcmf32_tmp, pcmf32_tmp.size(), params.n_processors) != 0) {
+                        fprintf(stderr, "%s: failed to process audio\n", argv[0]);
+                        return 10;
+                    }
+                    sense_voice_print_output(ctx, params.use_prefix, params.use_itn);// 这里整行输出即可
+                    if (mute.first == -1) {
+                        nomute.second = nomute.first + max_nomute_step;
+                        nomute.first += max_nomute_step;
+                    } else if (mute.second == -1) {
+                        nomute.second = mute.first;
+                        nomute.first = mute.first = mute.second = -1;
+                    } else {
+                        nomute.second = mute.first;
+                        nomute.first = mute.second;
+                        mute.first = mute.second = -1;
                     }
                 }
             }
-            if (L_nomute < 0)
-            {
-                idenitified_floats += R_new_chunk;
-                pcmf32_tmp = std::vector<double>(pcmf32.begin() + R_new_chunk, pcmf32.end());
-                pcmf32 = pcmf32_tmp;
-                L_new_chunk = R_new_chunk = 0;
-                L_mute = R_mute = -1;
+            // 输出最后一段
+            if (nomute.first >= 0) {
+                pcmf32_tmp.resize(R_new_chunk - nomute.first);
+                std::copy(pcmf32.begin() + (nomute.first - idenitified_floats), pcmf32.begin() + (R_new_chunk - idenitified_floats), pcmf32_tmp.begin());
+                printf("[%.2f-%.2f]", nomute.first / (SENSE_VOICE_SAMPLE_RATE * 1.0), R_new_chunk / (SENSE_VOICE_SAMPLE_RATE * 1.0));
+                if (sense_voice_full_parallel(ctx, wparams, pcmf32_tmp, pcmf32_tmp.size(), params.n_processors) != 0) {
+                    fprintf(stderr, "%s: failed to process audio\n", argv[0]);
+                    return 10;
+                }
+                sense_voice_print_output(ctx, params.use_prefix, params.use_itn, true);// 这里整行输出即可
             }
-            else
-            {
-                idenitified_floats += L_nomute;
-                pcmf32_tmp = std::vector<double>(pcmf32.begin() + L_nomute, pcmf32.end());
+            // 调整idenitified_floats并且减少pcmf32的长度
+            if (nomute.second > 0) {
+                pcmf32_tmp = std::vector<double>(pcmf32.begin() + (nomute.second - idenitified_floats), pcmf32.end());
+                pcmf32 = pcmf32_tmp;
+                idenitified_floats = nomute.second;
+                nomute.second = 0;
+            } else if (nomute.first == -1) {
+                pcmf32_tmp = std::vector<double>(pcmf32.begin() + (R_new_chunk - idenitified_floats), pcmf32.end());
                 pcmf32 = pcmf32_tmp;
-                L_new_chunk -= L_nomute;
-                R_new_chunk -= L_nomute;
-                L_mute = std::max(-1, L_mute - L_nomute);
-                R_mute = std::max(-1, R_mute - L_nomute);
-                L_nomute = 0;
+                idenitified_floats = R_new_chunk;
             }
         }
         fflush(stdout);