From eca791ffc3453d3bbd9331e36f0dae5c6339c6f6 Mon Sep 17 00:00:00 2001 From: Ivy233 Date: Mon, 24 Mar 2025 10:27:37 +0800 Subject: [PATCH] =?UTF-8?q?[Develop]=20=E8=B0=83=E6=95=B4=E6=B5=81?= =?UTF-8?q?=E8=BE=93=E5=85=A5=E5=9C=A8use-vad=E6=83=85=E5=86=B5=E4=B8=8B?= =?UTF-8?q?=E7=9A=84=E9=80=BB=E8=BE=91=EF=BC=9A=201.=20=E5=9C=A8=E8=AF=A5?= =?UTF-8?q?=E6=83=85=E5=86=B5=E4=B8=8B=EF=BC=8C=E5=8F=AF=E4=BB=A5=E7=9C=8B?= =?UTF-8?q?=E5=88=B0=E5=AE=9E=E6=97=B6=E7=9A=84=E8=AF=AD=E9=9F=B3=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E3=80=82=E5=8E=9F=E5=85=88=E7=9A=84=E2=80=9C=E6=9C=80?= =?UTF-8?q?=E9=95=BF=E8=AF=AD=E9=9F=B3=E2=80=9D=E4=B8=8E=E2=80=9C=E5=88=87?= =?UTF-8?q?=E5=89=B2=E6=89=80=E9=9C=80=E6=9C=80=E7=9F=AD=E9=97=B4=E9=9A=94?= =?UTF-8?q?=E2=80=9D=E8=A7=84=E5=88=99=E4=BE=9D=E6=97=A7=E7=94=9F=E6=95=88?= =?UTF-8?q?=E3=80=82=202.=20=E5=9C=A8=E8=AF=A5=E6=83=85=E5=86=B5=E4=B8=8B?= =?UTF-8?q?=EF=BC=8C=E5=A6=82=E6=9E=9C=E9=9C=80=E8=A6=81=E5=88=87=E5=89=B2?= =?UTF-8?q?=E9=83=A8=E5=88=86=E8=AF=AD=E9=9F=B3=EF=BC=8C=E5=89=A9=E4=BD=99?= =?UTF-8?q?=E9=83=A8=E5=88=86=E4=BC=9A=E6=8D=A2=E8=A1=8C=E7=BB=A7=E7=BB=AD?= =?UTF-8?q?=E8=BE=93=E5=87=BA=E3=80=82=203.=20=E5=9C=A8=E8=AF=A5=E6=83=85?= =?UTF-8?q?=E5=86=B5=E4=B8=8B=EF=BC=8C=E5=A6=82=E6=9E=9C=E7=BC=93=E5=86=B2?= =?UTF-8?q?=E5=8C=BA=E6=B2=A1=E6=9C=89=E9=9C=80=E8=A6=81=E8=AF=86=E5=88=AB?= =?UTF-8?q?=E7=9A=84=E8=AF=AD=E9=9F=B3=EF=BC=8C=E5=B9=B6=E4=B8=94=E6=8E=A5?= =?UTF-8?q?=E5=8F=97=E5=88=B0=E4=B8=80=E6=AE=B5=E6=B2=A1=E6=9C=89=E8=AF=AD?= =?UTF-8?q?=E9=9F=B3=E6=B4=BB=E5=8A=A8=E7=9A=84=E9=9F=B3=E9=A2=91=EF=BC=8C?= =?UTF-8?q?=E4=BC=9A=E8=88=8D=E5=BC=83=E4=B9=8B=E3=80=82=204.=20=E5=86=85?= =?UTF-8?q?=E9=83=A8=E8=AE=B0=E5=BD=95=E7=9A=84=E6=97=B6=E9=97=B4=E6=88=B3?= =?UTF-8?q?=E5=90=AB=E4=B9=89=E5=8F=91=E7=94=9F=E6=94=B9=E5=8F=98=EF=BC=8C?= =?UTF-8?q?=E4=BB=8E=E7=9B=B8=E5=AF=B9=E5=80=BC=E8=B0=83=E6=95=B4=E4=B8=BA?= =?UTF-8?q?=E7=BB=9D=E5=AF=B9=E5=80=BC=EF=BC=8C=E8=BF=99=E5=8F=AF=E4=BB=A5?= =?UTF-8?q?=E5=A2=9E=E5=BC=BA=E5=AF=B9pcmf32=E7=BC=93=E5=86=B2=E5=8C=BA?= =?UTF-8?q?=E5=A4=84=E7=90=86=E7=9A=84=E4=BB=A3=E7=A0=81=E5=8F=AF=E8=AF=BB?= =?UTF-8?q?=E6=80=A7=E3=80=82=205.=20=E4=BF=AE=E5=A4=8D=E4=B9=8B=E5=89=8D?= =?UTF-8?q?=E5=85=B6=E4=BB=96=E4=BA=BA=E6=8F=90=E4=BA=A4=E4=B8=AD=EF=BC=8C?= =?UTF-8?q?=E6=97=B6=E9=97=B4=E5=8C=BA=E9=97=B4=E4=BC=9A=E5=8F=8D=E5=A4=8D?= =?UTF-8?q?=E5=87=BA=E7=8E=B0=E7=9A=84bug=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/stream/stream.cc | 297 +++++++++++++++++++------------------- 1 file changed, 151 insertions(+), 146 deletions(-) diff --git a/examples/stream/stream.cc b/examples/stream/stream.cc index ec5be77..68e4780 100644 --- a/examples/stream/stream.cc +++ b/examples/stream/stream.cc @@ -1,79 +1,94 @@ +#include "common-sdl.h" +#include "common.h" +#include "sense-voice.h" +#include #include #include +#include +#include #include #include #include -#include -#include -#include "sense-voice.h" -#include "common-sdl.h" -#include "common.h" struct sense_voice_stream_params { - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_processors = 1; - int32_t capture_id = -1; - int32_t chunk_size = 100; // ms - int32_t max_nomute_chunks = 8000 / chunk_size; // chunks - int32_t min_mute_chunks = 1000 / chunk_size; // chunks + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + int32_t n_processors = 1; + int32_t capture_id = -1; + int32_t chunk_size = 100; // ms + int32_t max_nomute_chunks = 8000 / chunk_size;// chunks + int32_t min_mute_chunks = 1000 / chunk_size; // chunks - bool use_gpu = true; - bool flash_attn = false; - bool debug_mode = false; - bool use_vad = false; - bool use_itn = false; - bool use_prefix = false; - std::string language = "auto"; - std::string model = "models/ggml-base.en.bin"; + bool use_gpu = true; + bool flash_attn = false; + bool debug_mode = false; + bool use_vad = false; + bool use_itn = false; + bool use_prefix = false; + std::string language = "auto"; + std::string model = "models/ggml-base.en.bin"; std::string fname_out; }; -void sense_voice_stream_usage(int /*argc*/, char ** argv, const sense_voice_stream_params & params) { +void sense_voice_stream_usage(int /*argc*/, char **argv, const sense_voice_stream_params ¶ms) { fprintf(stderr, "\n"); fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help [default] show this help message and exit\n"); fprintf(stderr, " -t N, --threads N [%-7d] [SenseVoice] number of threads to use during computation\n", params.n_threads); - fprintf(stderr, " --chunk_size [%-7d] vad chunk size(ms)\n", params.chunk_size); - fprintf(stderr, " -mmc --min-mute-chunks [%-7d] When consecutive chunks are identified as silence\n", params.min_mute_chunks); - fprintf(stderr, " -mnc --max-nomute-chunks [%-7d] when the first non-silent chunk is too far away\n", params.max_nomute_chunks); - fprintf(stderr, " --use-vad [%-7s] when the first non-silent chunk is too far away\n", params.use_vad ? "true" : "false"); - fprintf(stderr, " --use-prefix [%-7s] use sense voice prefix\n", params.use_prefix ? "true" : "false"); - fprintf(stderr, " -c ID, --capture ID [%-7d] [Device] capture device ID\n", params.capture_id); - fprintf(stderr, " -l LANG, --language LANG [%-7s] [SenseVoice] spoken language\n", params.language.c_str()); - fprintf(stderr, " -m FNAME, --model FNAME [%-7s] [SenseVoice] model path\n", params.model.c_str()); - fprintf(stderr, " -f FNAME, --file FNAME [%-7s] [IO] text output file name\n", params.fname_out.c_str()); - fprintf(stderr, " -ng, --no-gpu [%-7s] [SenseVoice] disable GPU inference\n", params.use_gpu ? "false" : "true"); - fprintf(stderr, " -fa, --flash-attn [%-7s] [SenseVoice] flash attention during inference\n", params.flash_attn ? "true" : "false"); - fprintf(stderr, " --use-itn [%-7s] [SenseVoice] Filter duplicate tokens when outputting\n", params.use_itn ? "true" : "false"); + fprintf(stderr, " --chunk_size [%-7d] vad chunk size(ms)\n", params.chunk_size); + fprintf(stderr, " -mmc --min-mute-chunks [%-7d] When consecutive chunks are identified as silence\n", params.min_mute_chunks); + fprintf(stderr, " -mnc --max-nomute-chunks [%-7d] when the first non-silent chunk is too far away\n", params.max_nomute_chunks); + fprintf(stderr, " --use-vad [%-7s] when the first non-silent chunk is too far away\n", params.use_vad ? "true" : "false"); + fprintf(stderr, " --use-prefix [%-7s] use sense voice prefix\n", params.use_prefix ? "true" : "false"); + fprintf(stderr, " -c ID, --capture ID [%-7d] [Device] capture device ID\n", params.capture_id); + fprintf(stderr, " -l LANG, --language LANG [%-7s] [SenseVoice] spoken language\n", params.language.c_str()); + fprintf(stderr, " -m FNAME, --model FNAME [%-7s] [SenseVoice] model path\n", params.model.c_str()); + fprintf(stderr, " -f FNAME, --file FNAME [%-7s] [IO] text output file name\n", params.fname_out.c_str()); + fprintf(stderr, " -ng, --no-gpu [%-7s] [SenseVoice] disable GPU inference\n", params.use_gpu ? "false" : "true"); + fprintf(stderr, " -fa, --flash-attn [%-7s] [SenseVoice] flash attention during inference\n", params.flash_attn ? "true" : "false"); + fprintf(stderr, " --use-itn [%-7s] [SenseVoice] Filter duplicate tokens when outputting\n", params.use_itn ? "true" : "false"); fprintf(stderr, "\n"); } -static bool get_stream_params(int argc, char ** argv, sense_voice_stream_params & params) { +static bool get_stream_params(int argc, char **argv, sense_voice_stream_params ¶ms) { for (int i = 1; i < argc; i++) { std::string arg = argv[i]; if (arg == "-h" || arg == "--help") { sense_voice_stream_usage(argc, argv, params); exit(0); + } else if (arg == "-t" || arg == "--threads") { + params.n_threads = std::stoi(argv[++i]); + } else if (arg == "-c" || arg == "--capture") { + params.capture_id = std::stoi(argv[++i]); + } else if (arg == "-l" || arg == "--language") { + params.language = argv[++i]; + } else if (arg == "-m" || arg == "--model") { + params.model = argv[++i]; + } else if (arg == "-f" || arg == "--file") { + params.fname_out = argv[++i]; + } else if (arg == "-ng" || arg == "--no-gpu") { + params.use_gpu = false; + } else if (arg == "-fa" || arg == "--flash-attn") { + params.flash_attn = true; + } else if (arg == "-debug" || arg == "--debug-mode") { + params.debug_mode = true; + } else if (arg == "-mmc" || arg == "--min-mute-chunks") { + params.min_mute_chunks = std::stoi(argv[++i]); + } else if (arg == "-mnc" || arg == "--max-nomute-chunks") { + params.max_nomute_chunks = std::stoi(argv[++i]); + } else if (arg == "--use-vad") { + params.use_vad = true; + } else if (arg == "--use-prefix") { + params.use_prefix = true; + } else if (arg == "--chunk-size") { + params.chunk_size = std::stoi(argv[++i]); + } else if (arg == "--use-itn") { + params.use_itn = true; } - else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } - else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); } - else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } - else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } - else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } - else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } - else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; } - else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; } - else if (arg == "-mmc" || arg == "--min-mute-chunks") { params.min_mute_chunks = std::stoi(argv[++i]); } - else if (arg == "-mnc" || arg == "--max-nomute-chunks") { params.max_nomute_chunks = std::stoi(argv[++i]); } - else if ( arg == "--use-vad") { params.use_vad = true; } - else if ( arg == "--use-prefix") { params.use_prefix = true; } - else if ( arg == "--chunk-size") { params.chunk_size = std::stoi(argv[++i]); } - else if ( arg == "--use-itn") { params.use_itn = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -85,7 +100,7 @@ static bool get_stream_params(int argc, char ** argv, sense_voice_stream_params return true; } -void sense_voice_free(struct sense_voice_context * ctx) { +void sense_voice_free(struct sense_voice_context *ctx) { if (ctx) { ggml_free(ctx->model.ctx); @@ -100,8 +115,7 @@ void sense_voice_free(struct sense_voice_context * ctx) { } -int main(int argc, char** argv) -{ +int main(int argc, char **argv) { sense_voice_stream_params params; if (get_stream_params(argc, argv, params) == false) return 1; const int n_sample_step = params.chunk_size * 1e-3 * SENSE_VOICE_SAMPLE_RATE; @@ -124,8 +138,7 @@ int main(int argc, char** argv) struct sense_voice_context *ctx = sense_voice_small_init_from_file_with_params(params.model.c_str(), cparams); std::vector pcmf32_audio; std::vector pcmf32; - std::vector pcmf32_tmp; // 传递给模型用 - std::vector prompt_tokens; + std::vector pcmf32_tmp;// 传递给模型用 std::ofstream fout; if (params.fname_out.length() > 0) { fout.open(params.fname_out); @@ -156,14 +169,15 @@ int main(int argc, char** argv) sense_voice_full_params wparams = sense_voice_full_default_params(SENSE_VOICE_SAMPLING_GREEDY); { - wparams.language = params.language.c_str(); - wparams.n_threads = params.n_threads; - wparams.debug_mode = params.debug_mode; + wparams.language = params.language.c_str(); + wparams.n_threads = params.n_threads; + wparams.debug_mode = params.debug_mode; } int idenitified_floats = 0, R_new_chunk = 0, L_new_chunk = 0; // 只有vad会使用,但是会贯穿全程,只能定义在外面 - int L_nomute = -1, L_mute = -1, R_mute = -1; + std::pair nomute = std::pair(-1, 0);// 标记最后一段非静音的区间 + std::pair mute = std::pair(-1, -1); // 标记最后一段静音的区间 while (is_running) { // handle Ctrl + C is_running = sdl_poll_events(); @@ -177,122 +191,113 @@ int main(int argc, char** argv) pcmf32_audio.clear(); // 新的识别区间:[L_new_chunk, R_new_chunk),是固定n_sample_step的整数倍 // fprintf(stderr, "%s || new_audio_size: %d, cache_size: %d, L_new_chunk: %d, R_new_chunk: %d\n", __func__, pcmf32_audio.size(), pcmf32.size(), L_new_chunk, R_new_chunk); - if(R_new_chunk + n_sample_step <= pcmf32.size()) - { + if (R_new_chunk + n_sample_step <= pcmf32.size() + idenitified_floats) { L_new_chunk = R_new_chunk; - R_new_chunk = int(pcmf32.size() / n_sample_step) * n_sample_step; - } - else continue; + R_new_chunk = int((pcmf32.size() + idenitified_floats) / n_sample_step) * n_sample_step; + } else + continue; if (!params.use_vad) { - // 识别当前已经识别到的文本恩 + // 识别当前已经识别到的文本 printf("\33[2K\r"); printf("%s", std::string(50, ' ').c_str()); printf("\33[2K\r"); - printf("[%.2f-%.2f]", idenitified_floats / (SENSE_VOICE_SAMPLE_RATE * 1.0), (R_new_chunk + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0)); - if (sense_voice_full_parallel(ctx, wparams, pcmf32, R_new_chunk, params.n_processors) != 0) { + printf("[%.2f-%.2f]", idenitified_floats / (SENSE_VOICE_SAMPLE_RATE * 1.0), R_new_chunk / (SENSE_VOICE_SAMPLE_RATE * 1.0)); + if (sense_voice_full_parallel(ctx, wparams, pcmf32, R_new_chunk - idenitified_floats, params.n_processors) != 0) { fprintf(stderr, "%s: failed to process audio\n", argv[0]); return 10; } sense_voice_print_output(ctx, params.use_prefix, params.use_itn, true); // 时间长度太长了直接换行重新开始 - if (R_new_chunk >= max_nomute_step) - { + if (R_new_chunk >= max_nomute_step + idenitified_floats) { printf("\n"); - pcmf32_tmp = std::vector(pcmf32.begin() + R_new_chunk, pcmf32.end()); + pcmf32_tmp = std::vector(pcmf32.begin() + R_new_chunk - idenitified_floats, pcmf32.end()); pcmf32 = pcmf32_tmp; - idenitified_floats += R_new_chunk; - L_new_chunk = R_new_chunk = 0; + idenitified_floats = R_new_chunk; } - } - else - { - // 基于pcmf32.begin()是0来计算的,转化成全局坐标需要加上idenitified_floats - for(int i = L_new_chunk; i < R_new_chunk; i += n_sample_step) - { - int R_this_chunk = i + n_sample_step; - bool isnomute = vad_energy_zcr(pcmf32.begin() + i, n_sample_step, SENSE_VOICE_SAMPLE_RATE, 1e-5, 0.2); - // fprintf(stderr, "Mute || isnomute = %d, L_mute = %d, R_Mute = %d, L_nomute = %d, R_this_chunk = %d, idenitified = %d\n", isnomute, L_mute, R_mute, L_nomute, R_this_chunk, idenitified_floats); - if (L_nomute >= 0 && R_this_chunk - L_nomute >= max_nomute_step) - { - int R_nomute = L_mute >= 0 ? L_mute : R_this_chunk; - // printf("3333: %d %d %d %d %d\n", L_nomute, R_nomute, L_mute, i, R_this_chunk); - pcmf32_tmp.resize(R_nomute - L_nomute); - std::copy(pcmf32.begin() + L_nomute, pcmf32.begin() + R_nomute, pcmf32_tmp.begin()); - printf("type2 :[%.2f-%.2f - %d ]", idenitified_floats / (SENSE_VOICE_SAMPLE_RATE * 1.0), (R_new_chunk + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0), pcmf32_tmp.size()); - if (sense_voice_full_parallel(ctx, wparams, pcmf32_tmp, pcmf32_tmp.size(), params.n_processors) != 0) { - fprintf(stderr, "%s: failed to process audio\n", argv[0]); - return 10; - } - fprintf(stderr, "print 2\n"); - sense_voice_print_output(ctx, params.use_prefix, params.use_itn); -// fflush(stdout); - if (!isnomute) L_nomute = -1; - else if (R_mute >= 0) L_nomute = R_mute; - else L_nomute = i; - L_mute = R_mute = -1; + } else { + // 刷新整行 + printf("\33[2K\r"); + printf("%s", std::string(50, ' ').c_str()); + printf("\33[2K\r"); + // 新进来的所有chunk有可能导致序列分拆,需要注意 + for (int i = L_new_chunk; i < R_new_chunk; i += n_sample_step) { + // int R_this_chunk = i + n_sample_step; + bool isnomute = vad_energy_zcr(pcmf32.begin() + i - idenitified_floats, n_sample_step, SENSE_VOICE_SAMPLE_RATE, 1e-5, 0.2); + // fprintf(stderr, "Mute || isnomute = %d, ML = %d, MR = %d, NML = %d, NMR = %d, R_new_chunk = %d, i = %d, size = %d, idenitified = %d\n", isnomute, mute.first, mute.second, nomute.first, nomute.second, R_new_chunk, i, pcmf32.size(), idenitified_floats); + if (nomute.first == -1) { + if (isnomute) nomute.first = i; continue; } - if (isnomute) - { - if (L_nomute < 0) L_nomute = i; - - printf("\33[2K\r"); - printf("%s", std::string(50, ' ').c_str()); - printf("\33[2K\r"); - pcmf32_tmp.resize(R_new_chunk - L_nomute); - std::copy(pcmf32.begin() + L_nomute, pcmf32.begin() + R_new_chunk, pcmf32_tmp.begin()); - printf("[%.2f-%.2f]", (L_nomute + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0), (R_new_chunk + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0)); - - if (sense_voice_full_parallel(ctx, wparams, pcmf32_tmp, pcmf32_tmp.size(), params.n_processors) != 0) { - fprintf(stderr, "%s: failed to process audio\n", argv[0]); - return 10; - } - - sense_voice_print_output(ctx, params.use_prefix, params.use_itn, true); - } - else - { - if (R_mute != i) L_mute = i; - R_mute = R_this_chunk; - if (L_mute >= L_nomute && L_nomute >= 0 && R_this_chunk - L_mute >= keep_nomute_step) - { - // printf("2222: %d %d %d %d %d\n", L_nomute, R_nomute, L_mute, i, R_this_chunk); - pcmf32_tmp.resize(L_mute - L_nomute); - std::copy(pcmf32.begin() + L_nomute, pcmf32.begin() + L_mute, pcmf32_tmp.begin()); - printf("[%.2f-%.2f]", (L_nomute + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0), (L_mute + idenitified_floats) / (SENSE_VOICE_SAMPLE_RATE * 1.0)); + if (mute.first == -1) { + // type1 [NML, ...) + if (!isnomute) mute.first = i; + } else if (mute.second == -1) { + // type2 [NML, ML), [ML, ...) + if (isnomute) mute.second = i; + else if (i - mute.first >= keep_nomute_step) { + // 需要识别[NML, ML),并退化到nomute, mute全部为-1的情况 + pcmf32_tmp.resize(mute.first - nomute.first); + std::copy(pcmf32.begin() + nomute.first - idenitified_floats, pcmf32.begin() + mute.first - idenitified_floats, pcmf32_tmp.begin()); + printf("[%.2f-%.2f]", nomute.first / (SENSE_VOICE_SAMPLE_RATE * 1.0), mute.first / (SENSE_VOICE_SAMPLE_RATE * 1.0)); if (sense_voice_full_parallel(ctx, wparams, pcmf32_tmp, pcmf32_tmp.size(), params.n_processors) != 0) { fprintf(stderr, "%s: failed to process audio\n", argv[0]); return 10; } - sense_voice_print_output(ctx, params.use_prefix, params.use_itn); - - if (!isnomute) L_nomute = -1; - else if (R_mute >= 0) L_nomute = R_mute; - else L_nomute = i; - L_mute = R_mute = -1; + sense_voice_print_output(ctx, params.use_prefix, params.use_itn);// 这里整行输出即可 + nomute.second = i; + nomute.first = mute.first = mute.second = -1; + } + } else { + // type3 [NML, ML), [ML, MR), [MR, ...) + if (!isnomute) mute.first = i, mute.second = -1; + } + // 可能需要裂解 + if (nomute.first >= 0 && i - nomute.first >= max_nomute_step) { + int R_nomute = mute.first == -1 ? nomute.first + max_nomute_step : mute.first; + pcmf32_tmp.resize(R_nomute - nomute.first); + std::copy(pcmf32.begin() + (nomute.first - idenitified_floats), pcmf32.begin() + (R_nomute - idenitified_floats), pcmf32_tmp.begin()); + printf("[%.2f-%.2f]", nomute.first / (SENSE_VOICE_SAMPLE_RATE * 1.0), R_nomute / (SENSE_VOICE_SAMPLE_RATE * 1.0)); + if (sense_voice_full_parallel(ctx, wparams, pcmf32_tmp, pcmf32_tmp.size(), params.n_processors) != 0) { + fprintf(stderr, "%s: failed to process audio\n", argv[0]); + return 10; + } + sense_voice_print_output(ctx, params.use_prefix, params.use_itn);// 这里整行输出即可 + if (mute.first == -1) { + nomute.second = nomute.first + max_nomute_step; + nomute.first += max_nomute_step; + } else if (mute.second == -1) { + nomute.second = mute.first; + nomute.first = mute.first = mute.second = -1; + } else { + nomute.second = mute.first; + nomute.first = mute.second; + mute.first = mute.second = -1; } } } - if (L_nomute < 0) - { - idenitified_floats += R_new_chunk; - pcmf32_tmp = std::vector(pcmf32.begin() + R_new_chunk, pcmf32.end()); - pcmf32 = pcmf32_tmp; - L_new_chunk = R_new_chunk = 0; - L_mute = R_mute = -1; + // 输出最后一段 + if (nomute.first >= 0) { + pcmf32_tmp.resize(R_new_chunk - nomute.first); + std::copy(pcmf32.begin() + (nomute.first - idenitified_floats), pcmf32.begin() + (R_new_chunk - idenitified_floats), pcmf32_tmp.begin()); + printf("[%.2f-%.2f]", nomute.first / (SENSE_VOICE_SAMPLE_RATE * 1.0), R_new_chunk / (SENSE_VOICE_SAMPLE_RATE * 1.0)); + if (sense_voice_full_parallel(ctx, wparams, pcmf32_tmp, pcmf32_tmp.size(), params.n_processors) != 0) { + fprintf(stderr, "%s: failed to process audio\n", argv[0]); + return 10; + } + sense_voice_print_output(ctx, params.use_prefix, params.use_itn, true);// 这里整行输出即可 } - else - { - idenitified_floats += L_nomute; - pcmf32_tmp = std::vector(pcmf32.begin() + L_nomute, pcmf32.end()); + // 调整idenitified_floats并且减少pcmf32的长度 + if (nomute.second > 0) { + pcmf32_tmp = std::vector(pcmf32.begin() + (nomute.second - idenitified_floats), pcmf32.end()); + pcmf32 = pcmf32_tmp; + idenitified_floats = nomute.second; + nomute.second = 0; + } else if (nomute.first == -1) { + pcmf32_tmp = std::vector(pcmf32.begin() + (R_new_chunk - idenitified_floats), pcmf32.end()); pcmf32 = pcmf32_tmp; - L_new_chunk -= L_nomute; - R_new_chunk -= L_nomute; - L_mute = std::max(-1, L_mute - L_nomute); - R_mute = std::max(-1, R_mute - L_nomute); - L_nomute = 0; + idenitified_floats = R_new_chunk; } } fflush(stdout);