From ab86e25f2329f445fa825e35aeaf9b59ebb7feac Mon Sep 17 00:00:00 2001 From: Ivy233 Date: Fri, 15 Aug 2025 10:38:15 +0800 Subject: [PATCH 1/4] =?UTF-8?q?zcr-main=E6=8E=A5=E5=85=A5silero-vad?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/zcr_main/main.cc | 63 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/examples/zcr_main/main.cc b/examples/zcr_main/main.cc index 52c24d3..fdc8c14 100644 --- a/examples/zcr_main/main.cc +++ b/examples/zcr_main/main.cc @@ -3,6 +3,7 @@ // #include "common.h" #include "sense-voice.h" +#include "silero-vad.h" #include #include #include @@ -286,6 +287,12 @@ void sense_voice_free(struct sense_voice_context *ctx) { if (ctx) { ggml_free(ctx->model.ctx); ggml_backend_buffer_free(ctx->model.buffer); + + // 释放VAD相关资源 + ggml_free(ctx->state->vad_ctx); + ggml_backend_buffer_free(ctx->state->vad_lstm_hidden_state_buffer); + ggml_backend_buffer_free(ctx->state->vad_lstm_context_buffer); + sense_voice_free_state(ctx->state); delete ctx->model.model->encoder; delete ctx->model.model; @@ -301,7 +308,38 @@ void sense_voice_split_segments(struct sense_voice_context *ctx, const sense_voi // const bool use_vad = (n_samples_step <= 0); for (int i = 0; i < int(pcmf32.size()); i += n_sample_step) { int R_this_chunk = std::min(i + n_sample_step, int(pcmf32.size())); - bool isnomute = vad_energy_zcr(pcmf32.begin() + i, R_this_chunk - i, SENSE_VOICE_SAMPLE_RATE); + bool isnomute = false; + + // 使用silero-vad进行语音检测 + const int vad_chunk_size = 640; // VAD_CHUNK_SIZE + const int context_size = 576; + const int pad_size = 64; + + // 准备640大小的chunk数据 + std::vector chunk(vad_chunk_size, 0); + int start_idx = i; + + // 填充chunk数据,处理边界情况 + for (int j = 0; j < vad_chunk_size; j++) { + int actual_idx = start_idx + j - pad_size; + if (actual_idx >= 0 && actual_idx < pcmf32.size()) { + chunk[j] = static_cast(pcmf32[actual_idx]); + } else if (actual_idx < 0) { + // 反射填充 + int reflect_idx = -actual_idx - 1; + if (reflect_idx < pcmf32.size()) { + chunk[j] = static_cast(pcmf32[reflect_idx]); + } + } + } + + float speech_prob = 0; + if (silero_vad_encode_internal(*ctx, *ctx->state, chunk, params.n_threads, speech_prob)) { + isnomute = (speech_prob >= 0.5); // 默认阈值0.5 + } else { + // 如果VAD失败,回退到能量检测 + isnomute = vad_energy_zcr(pcmf32.begin() + i, R_this_chunk - i, SENSE_VOICE_SAMPLE_RATE); + } // fprintf(stderr, "Mute || L_mute = %d, R_Mute = %d, L_nomute = %d, R_this_chunk = %d, keep_nomute_step = %d\n", L_mute, R_mute, L_nomute, R_this_chunk, keep_nomute_step); if (L_nomute >= 0 && R_this_chunk - L_nomute >= max_nomute_step) { @@ -416,6 +454,29 @@ int main(int argc, char **argv) { ctx->language_id = sense_voice_lang_id(params.language.c_str()); + // 初始化silero-vad状态 + const int VAD_LSTM_STATE_MEMORY_SIZE = 256*1024; + const int VAD_LSTM_STATE_DIM = 128; + + ctx->state->vad_ctx = ggml_init({VAD_LSTM_STATE_MEMORY_SIZE, nullptr, true}); + ctx->state->vad_lstm_context = ggml_new_tensor_1d(ctx->state->vad_ctx, GGML_TYPE_F32, VAD_LSTM_STATE_DIM); + ctx->state->vad_lstm_hidden_state = ggml_new_tensor_1d(ctx->state->vad_ctx, GGML_TYPE_F32, VAD_LSTM_STATE_DIM); + + ctx->state->vad_lstm_context_buffer = ggml_backend_alloc_buffer(ctx->state->backends[0], + ggml_nbytes(ctx->state->vad_lstm_context) + + ggml_backend_get_alignment(ctx->state->backends[0])); + ctx->state->vad_lstm_hidden_state_buffer = ggml_backend_alloc_buffer(ctx->state->backends[0], + ggml_nbytes(ctx->state->vad_lstm_hidden_state) + + ggml_backend_get_alignment(ctx->state->backends[0])); + auto context_alloc = ggml_tallocr_new(ctx->state->vad_lstm_context_buffer); + ggml_tallocr_alloc(&context_alloc, ctx->state->vad_lstm_context); + + auto state_alloc = ggml_tallocr_new(ctx->state->vad_lstm_hidden_state_buffer); + ggml_tallocr_alloc(&state_alloc, ctx->state->vad_lstm_hidden_state); + + ggml_set_zero(ctx->state->vad_lstm_context); + ggml_set_zero(ctx->state->vad_lstm_hidden_state); + for (int f = 0; f < (int) params.fname_inp.size(); ++f) { const auto fname_inp = params.fname_inp[f]; const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f]; From d64dd488cad769284864e71c4c31fe0bfd10c5de Mon Sep 17 00:00:00 2001 From: Ivy233 Date: Mon, 8 Sep 2025 14:43:07 +0800 Subject: [PATCH 2/4] =?UTF-8?q?vad=E4=BF=AE=E5=A4=8D=EF=BC=9A=E5=81=9A3276?= =?UTF-8?q?8=E7=9A=84=E5=BD=92=E4=B8=80=E5=8C=96=E4=BD=BFspeech=5Fprob?= =?UTF-8?q?=E4=B8=8D=E5=86=8D=E5=BF=AB=E9=80=9F=E6=94=B6=E6=95=9B=E5=88=B0?= =?UTF-8?q?0.45=E5=B7=A6=E5=8F=B3=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/zcr_main/main.cc | 62 +++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/examples/zcr_main/main.cc b/examples/zcr_main/main.cc index fdc8c14..6a22d45 100644 --- a/examples/zcr_main/main.cc +++ b/examples/zcr_main/main.cc @@ -24,12 +24,13 @@ struct sense_voice_params { int32_t max_context = -1; int32_t n_mel = 80; int32_t audio_ctx = 0; - size_t chunk_size = 100; // ms + size_t chunk_size = 50; // ms size_t max_nomute_chunks = 30000 / chunk_size; // chunks size_t min_mute_chunks = 1000 / chunk_size; // chunks size_t max_chunks_in_batch = 90000 / chunk_size;// chunks size_t max_batch = 4; + float speech_prob_threshold = 0.1f; // speech probability threshold bool debug_mode = false; bool no_prints = false; bool use_gpu = true; @@ -115,6 +116,7 @@ static void sense_voice_print_usage(int /*argc*/, char **argv, const sense_voice fprintf(stderr, " -mnc --max-nomute-chunks [%-7lu] when the first non-silent chunk is too far away\n", params.max_nomute_chunks); fprintf(stderr, " --maxchunk-in-batch [%-7lu] the number of cutted audio can be processed at one time\n", params.max_chunks_in_batch); fprintf(stderr, " -b --batch [%-7lu] the number of cutted audio can be processed at one time\n", params.max_batch); + fprintf(stderr, " -spt --speech-prob-threshold [%-7.3f] speech probability threshold for VAD\n", params.speech_prob_threshold); fprintf(stderr, "\n"); } @@ -200,6 +202,8 @@ static bool sense_voice_params_parse(int argc, char **argv, sense_voice_params & params.chunk_size = std::stoi(argv[++i]); } else if (arg == "--outfile" || arg == "-fout") { params.outfile = argv[++i]; + } else if (arg == "-spt" || arg == "--speech-prob-threshold") { + params.speech_prob_threshold = std::stof(argv[++i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); sense_voice_print_usage(argc, argv, params); @@ -288,10 +292,21 @@ void sense_voice_free(struct sense_voice_context *ctx) { ggml_free(ctx->model.ctx); ggml_backend_buffer_free(ctx->model.buffer); - // 释放VAD相关资源 - ggml_free(ctx->state->vad_ctx); - ggml_backend_buffer_free(ctx->state->vad_lstm_hidden_state_buffer); - ggml_backend_buffer_free(ctx->state->vad_lstm_context_buffer); + // 释放VAD相关资源 - 添加空指针检查 + if (ctx->state) { + if (ctx->state->vad_ctx) { + ggml_free(ctx->state->vad_ctx); + ctx->state->vad_ctx = nullptr; + } + if (ctx->state->vad_lstm_hidden_state_buffer) { + ggml_backend_buffer_free(ctx->state->vad_lstm_hidden_state_buffer); + ctx->state->vad_lstm_hidden_state_buffer = nullptr; + } + if (ctx->state->vad_lstm_context_buffer) { + ggml_backend_buffer_free(ctx->state->vad_lstm_context_buffer); + ctx->state->vad_lstm_context_buffer = nullptr; + } + } sense_voice_free_state(ctx->state); delete ctx->model.model->encoder; @@ -310,32 +325,29 @@ void sense_voice_split_segments(struct sense_voice_context *ctx, const sense_voi int R_this_chunk = std::min(i + n_sample_step, int(pcmf32.size())); bool isnomute = false; - // 使用silero-vad进行语音检测 - const int vad_chunk_size = 640; // VAD_CHUNK_SIZE - const int context_size = 576; - const int pad_size = 64; + // 修复silero-vad的chunk数据准备,确保与原始方法处理相同的数据范围 + int actual_chunk_size = R_this_chunk - i; // 使用与能量检测相同的chunk大小 - // 准备640大小的chunk数据 + // 准备与原始方法相同大小的chunk数据,但最少640样本以满足VAD要求 + int vad_chunk_size = std::max(640, actual_chunk_size); std::vector chunk(vad_chunk_size, 0); - int start_idx = i; - - // 填充chunk数据,处理边界情况 - for (int j = 0; j < vad_chunk_size; j++) { - int actual_idx = start_idx + j - pad_size; - if (actual_idx >= 0 && actual_idx < pcmf32.size()) { - chunk[j] = static_cast(pcmf32[actual_idx]); - } else if (actual_idx < 0) { - // 反射填充 - int reflect_idx = -actual_idx - 1; - if (reflect_idx < pcmf32.size()) { - chunk[j] = static_cast(pcmf32[reflect_idx]); - } + + // 填充实际的音频数据(确保索引正确) + for (int j = 0; j < actual_chunk_size && i + j < int(pcmf32.size()); j++) { + chunk[j] = static_cast(pcmf32[i + j]) / 32768.0f; + } + + // 如果chunk不够640样本,用最后一个样本填充(而不是反射填充) + if (actual_chunk_size < 640) { + float last_sample = (actual_chunk_size > 0) ? chunk[actual_chunk_size - 1] : 0.0f; + for (int j = actual_chunk_size; j < 640; j++) { + chunk[j] = last_sample; } } float speech_prob = 0; if (silero_vad_encode_internal(*ctx, *ctx->state, chunk, params.n_threads, speech_prob)) { - isnomute = (speech_prob >= 0.5); // 默认阈值0.5 + isnomute = (speech_prob >= params.speech_prob_threshold); } else { // 如果VAD失败,回退到能量检测 isnomute = vad_energy_zcr(pcmf32.begin() + i, R_this_chunk - i, SENSE_VOICE_SAMPLE_RATE); @@ -455,7 +467,7 @@ int main(int argc, char **argv) { ctx->language_id = sense_voice_lang_id(params.language.c_str()); // 初始化silero-vad状态 - const int VAD_LSTM_STATE_MEMORY_SIZE = 256*1024; + const int VAD_LSTM_STATE_MEMORY_SIZE = 2048; const int VAD_LSTM_STATE_DIM = 128; ctx->state->vad_ctx = ggml_init({VAD_LSTM_STATE_MEMORY_SIZE, nullptr, true}); From 0bc948e9458248c388796c26242dda56a3f92e0a Mon Sep 17 00:00:00 2001 From: Ivy233 Date: Mon, 8 Sep 2025 17:56:20 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=86=85=E5=AD=98?= =?UTF-8?q?=EF=BC=9A=E8=AE=A1=E7=AE=97=E5=9B=BE=E6=97=A0=E7=94=A8=E9=83=A8?= =?UTF-8?q?=E5=88=86=E5=8E=BB=E9=99=A4=EF=BC=9B=E4=BC=98=E5=8C=96=E5=A4=96?= =?UTF-8?q?=E9=83=A8=E8=AF=BB=E5=8F=96=E9=80=BB=E8=BE=91=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/zcr_main/main.cc | 208 ++++++++++++++--------- sense-voice/csrc/sense-voice-encoder.cc | 19 +-- sense-voice/csrc/sense-voice-frontend.cc | 49 +++++- 3 files changed, 173 insertions(+), 103 deletions(-) diff --git a/examples/zcr_main/main.cc b/examples/zcr_main/main.cc index 6a22d45..558f617 100644 --- a/examples/zcr_main/main.cc +++ b/examples/zcr_main/main.cc @@ -219,6 +219,14 @@ static bool is_file_exist(const char *fileName) { return infile.good(); } +// 函数声明 +void sense_voice_process_stream(struct sense_voice_context *ctx, const sense_voice_params ¶ms, std::vector &pcmf32); +void sense_voice_process_batch(struct sense_voice_context *ctx, const sense_voice_params ¶ms, + std::vector &batch); +bool check_and_process_batch_if_full(struct sense_voice_context *ctx, const sense_voice_params ¶ms, + std::vector ¤t_batch, size_t ¤t_batch_size, + size_t new_segment_size, size_t batch_samples); + /** * This the arbitrary data which will be passed to each callback. * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor. @@ -315,29 +323,31 @@ void sense_voice_free(struct sense_voice_context *ctx) { } } -void sense_voice_split_segments(struct sense_voice_context *ctx, const sense_voice_params ¶ms, std::vector &pcmf32) { - int L_nomute = -1, L_mute = -1, R_mute = -1;// [L_nomute, R_nomute)永远为需要解析的段落,[L_mute, R_mute)永远为最近一段静音空挡 +// 流式音频处理:从输入音频分批读取并处理 +void sense_voice_process_stream(struct sense_voice_context *ctx, const sense_voice_params ¶ms, std::vector &pcmf32) { const int n_sample_step = params.chunk_size * 1e-3 * SENSE_VOICE_SAMPLE_RATE; const int keep_nomute_step = params.chunk_size * params.min_mute_chunks * 1e-3 * SENSE_VOICE_SAMPLE_RATE; const int max_nomute_step = params.chunk_size * params.max_nomute_chunks * 1e-3 * SENSE_VOICE_SAMPLE_RATE; - // const bool use_vad = (n_samples_step <= 0); + const size_t batch_samples = params.max_chunks_in_batch * params.chunk_size * 1e-3 * SENSE_VOICE_SAMPLE_RATE; + + std::vector current_batch; + size_t current_batch_size = 0; + + int L_nomute = -1, L_mute = -1, R_mute = -1; + for (int i = 0; i < int(pcmf32.size()); i += n_sample_step) { int R_this_chunk = std::min(i + n_sample_step, int(pcmf32.size())); bool isnomute = false; - // 修复silero-vad的chunk数据准备,确保与原始方法处理相同的数据范围 - int actual_chunk_size = R_this_chunk - i; // 使用与能量检测相同的chunk大小 - - // 准备与原始方法相同大小的chunk数据,但最少640样本以满足VAD要求 + // VAD检测 + int actual_chunk_size = R_this_chunk - i; int vad_chunk_size = std::max(640, actual_chunk_size); std::vector chunk(vad_chunk_size, 0); - // 填充实际的音频数据(确保索引正确) for (int j = 0; j < actual_chunk_size && i + j < int(pcmf32.size()); j++) { - chunk[j] = static_cast(pcmf32[i + j]) / 32768.0f; + chunk[j] = pcmf32[i + j] / 32768.0f; } - // 如果chunk不够640样本,用最后一个样本填充(而不是反射填充) if (actual_chunk_size < 640) { float last_sample = (actual_chunk_size > 0) ? chunk[actual_chunk_size - 1] : 0.0f; for (int j = actual_chunk_size; j < 640; j++) { @@ -349,63 +359,130 @@ void sense_voice_split_segments(struct sense_voice_context *ctx, const sense_voi if (silero_vad_encode_internal(*ctx, *ctx->state, chunk, params.n_threads, speech_prob)) { isnomute = (speech_prob >= params.speech_prob_threshold); } else { - // 如果VAD失败,回退到能量检测 - isnomute = vad_energy_zcr(pcmf32.begin() + i, R_this_chunk - i, SENSE_VOICE_SAMPLE_RATE); + // 转换为double用于兼容原有的vad_energy_zcr函数 + std::vector pcm_double(pcmf32.begin() + i, pcmf32.begin() + R_this_chunk); + isnomute = vad_energy_zcr(pcm_double.begin(), R_this_chunk - i, SENSE_VOICE_SAMPLE_RATE); } - // fprintf(stderr, "Mute || L_mute = %d, R_Mute = %d, L_nomute = %d, R_this_chunk = %d, keep_nomute_step = %d\n", L_mute, R_mute, L_nomute, R_this_chunk, keep_nomute_step); + // 音频分段逻辑 if (L_nomute >= 0 && R_this_chunk - L_nomute >= max_nomute_step) { int R_nomute = L_mute >= 0 && L_mute >= L_nomute ? L_mute : R_this_chunk; - sense_voice_segment pcmf_tmp; - pcmf_tmp.t0 = L_nomute; - pcmf_tmp.t1 = R_nomute; - // std::transform(pcmf32.begin() + L_nomute, pcmf32.end() + R_nomute, pcmf_tmp.samples.begin(), [](double val){return static_cast(val);}); - pcmf_tmp.samples = std::vector(pcmf32.begin() + L_nomute, pcmf32.begin() + R_nomute); - ctx->state->result_all.push_back(pcmf_tmp); + sense_voice_segment segment; + segment.t0 = L_nomute; + segment.t1 = R_nomute; + // 转换为double存储(保持与现有代码的兼容性) + segment.samples = std::vector(pcmf32.begin() + L_nomute, pcmf32.begin() + R_nomute); + + // 使用优化的批处理函数检查并处理满载的batch + size_t segment_size = segment.samples.size(); + check_and_process_batch_if_full(ctx, params, current_batch, current_batch_size, + segment_size, batch_samples); + + current_batch.push_back(segment); + current_batch_size += segment_size; if (!isnomute) L_nomute = -1; - else if (R_mute >= 0 && L_mute >= L_nomute) - L_nomute = R_mute; - else - L_nomute = i; + else if (R_mute >= 0 && L_mute >= L_nomute) L_nomute = R_mute; + else L_nomute = i; L_mute = R_mute = -1; continue; } + if (isnomute) { if (L_nomute < 0) L_nomute = i; } else { if (R_mute != i) L_mute = i; R_mute = R_this_chunk; if (L_mute >= L_nomute && L_nomute >= 0 && R_this_chunk - L_mute >= keep_nomute_step) { - // printf("2222: %d %d %d %d %d\n", L_nomute, R_nomute, L_mute, i, R_this_chunk); - sense_voice_segment pcmf_tmp; - pcmf_tmp.t0 = L_nomute; - pcmf_tmp.t1 = L_mute; - // std::transform(pcmf32.begin() + L_nomute, pcmf32.end() + L_mute, pcmf_tmp.samples.begin(), [](double val){return static_cast(val);}); - pcmf_tmp.samples = std::vector(pcmf32.begin() + L_nomute, pcmf32.begin() + L_mute); - ctx->state->result_all.push_back(pcmf_tmp); + sense_voice_segment segment; + segment.t0 = L_nomute; + segment.t1 = L_mute; + segment.samples = std::vector(pcmf32.begin() + L_nomute, pcmf32.begin() + L_mute); + + // 使用优化的批处理函数检查并处理满载的batch + size_t segment_size = segment.samples.size(); + check_and_process_batch_if_full(ctx, params, current_batch, current_batch_size, + segment_size, batch_samples); + + current_batch.push_back(segment); + current_batch_size += segment_size; + if (!isnomute) L_nomute = -1; - else if (R_mute >= 0) - L_nomute = R_mute; - else - L_nomute = i; + else if (R_mute >= 0) L_nomute = R_mute; + else L_nomute = i; L_mute = R_mute = -1; } } } - // 最后一段 + + // 处理最后一段 if (L_nomute >= 0) { - int R_nomute = pcmf32.size(); - sense_voice_segment pcmf_tmp; - pcmf_tmp.t0 = L_nomute; - pcmf_tmp.t1 = R_nomute; - // std::transform(pcmf32.begin() + L_nomute, pcmf32.end() + R_nomute, pcmf_tmp.samples.begin(), [](double val){return static_cast(val);}); - pcmf_tmp.samples = std::vector(pcmf32.begin() + L_nomute, pcmf32.begin() + R_nomute); - ctx->state->result_all.push_back(pcmf_tmp); - L_nomute = L_mute = R_mute = -1; + sense_voice_segment segment; + segment.t0 = L_nomute; + segment.t1 = pcmf32.size(); + segment.samples = std::vector(pcmf32.begin() + L_nomute, pcmf32.end()); + + // 使用优化的批处理函数检查并处理满载的batch + size_t segment_size = segment.samples.size(); + check_and_process_batch_if_full(ctx, params, current_batch, current_batch_size, + segment_size, batch_samples); + + current_batch.push_back(segment); + } + + // 处理最后的batch + if (!current_batch.empty()) { + sense_voice_process_batch(ctx, params, current_batch); } } +// 处理一个batch并清理计算图缓冲区 +void sense_voice_process_batch(struct sense_voice_context *ctx, const sense_voice_params ¶ms, + std::vector &batch) { + // 清理之前的结果 + ctx->state->result_all.clear(); + ctx->state->segmentIDs.clear(); + + // 将batch中的segment添加到result_all中 + for (size_t i = 0; i < batch.size(); i++) { + ctx->state->result_all.push_back(batch[i]); + ctx->state->segmentIDs.push_back(i); + } + + // 处理batch + sense_voice_full_params wparams = sense_voice_full_default_params(SENSE_VOICE_SAMPLING_GREEDY); + wparams.language = params.language.c_str(); + wparams.n_threads = params.n_threads; + wparams.debug_mode = params.debug_mode; + + sense_voice_batch_full(ctx, wparams); + sense_voice_batch_print_output(ctx, params.use_prefix, params.use_itn); + + // 清理处理后的结果以释放内存 + ctx->state->result_all.clear(); + ctx->state->segmentIDs.clear(); +} + +// 检查batch是否满载,如果满载则处理并清空 +bool check_and_process_batch_if_full(struct sense_voice_context *ctx, const sense_voice_params ¶ms, + std::vector ¤t_batch, size_t ¤t_batch_size, + size_t new_segment_size, size_t batch_samples) { + if (!current_batch.empty() && + (current_batch_size + new_segment_size > batch_samples || + current_batch.size() >= params.max_batch)) { + + // 处理当前batch + sense_voice_process_batch(ctx, params, current_batch); + + // 清空batch准备下一轮 + current_batch.clear(); + current_batch_size = 0; + + return true; // 表示已处理了一个batch + } + return false; // 表示未处理batch +} + int main(int argc, char **argv) { sense_voice_params params; @@ -493,7 +570,7 @@ int main(int argc, char **argv) { const auto fname_inp = params.fname_inp[f]; const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f]; - std::vector pcmf32;// mono-channel F32 PCM + std::vector pcmf32;// mono-channel F32 PCM int sample_rate; if (!::load_wav_file(fname_inp.c_str(), &sample_rate, pcmf32)) { @@ -518,47 +595,8 @@ int main(int argc, char **argv) { } { - sense_voice_full_params wparams = sense_voice_full_default_params(SENSE_VOICE_SAMPLING_GREEDY); - wparams.language = params.language.c_str(); - wparams.n_threads = params.n_threads; - wparams.offset_ms = params.offset_t_ms; - wparams.duration_ms = params.duration_ms; - wparams.debug_mode = params.debug_mode; - sense_voice_split_segments(ctx, params, pcmf32); - // ctx->state->result_all需要分块识别 - { - const size_t batch_samples = params.max_chunks_in_batch * params.chunk_size * 1e-3 * SENSE_VOICE_SAMPLE_RATE; - size_t max_len = 0, batch_L = ctx->state->result_all.size(); - for (size_t i = 0; i < ctx->state->result_all.size(); i++) { - if (batch_L >= ctx->state->result_all.size()) { - batch_L = i; - max_len = ctx->state->result_all[i].samples.size(); - ctx->state->segmentIDs.push_back(i); - continue;// 这里可以直接推进,收拢到下一个循环处理[batch_L, i]之间的关系 - } - max_len = std::max(max_len, ctx->state->result_all[i].samples.size()); - // 这里确保了i>batch_L - if (max_len * (i - batch_L + 1) > batch_samples || i - batch_L >= params.max_batch) { - // if (i - batch_L > 0) { - // if (i - batch_L > 1) { - sense_voice_batch_full(ctx, wparams); - sense_voice_batch_print_output(ctx, params.use_prefix, params.use_itn); - batch_L = i; - max_len = ctx->state->result_all[i].samples.size(); - ctx->state->segmentIDs.clear(); - } - ctx->state->segmentIDs.push_back(i); - } - // 最后一组 - if (batch_L < ctx->state->result_all.size()) { - // 识别全部即可 - sense_voice_batch_full(ctx, wparams); - sense_voice_batch_print_output(ctx, params.use_prefix, params.use_itn); - ctx->state->segmentIDs.clear(); - batch_L = ctx->state->result_all.size(); - max_len = 0; - } - } + // 使用新的流式处理函数 + sense_voice_process_stream(ctx, params, pcmf32); } } sense_voice_free(ctx); diff --git a/sense-voice/csrc/sense-voice-encoder.cc b/sense-voice/csrc/sense-voice-encoder.cc index 19629e6..c9e40a0 100644 --- a/sense-voice/csrc/sense-voice-encoder.cc +++ b/sense-voice/csrc/sense-voice-encoder.cc @@ -192,22 +192,12 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams a = ggml_repeat(ctx0, ggml_cast(ctx0, a, GGML_TYPE_F32), ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, a->ne[0], a->ne[1], a->ne[2], n_batch)); struct ggml_tensor * result = ggml_mul_mat(ctx0, a, im2col); fsmn_memory = ggml_reshape_3d(ctx0, result, im2col->ne[1], im2col->ne[2], im2col->ne[3]); - // if(n_batch > 1){ - // printf("n_batch: %d\n", n_batch); - // printf("a: %ld %ld %ld %ld\n", a->ne[0], a->ne[1], a->ne[2], a->ne[3]); - // printf("b: %ld %ld %ld %ld\n", b->ne[0], b->ne[1], b->ne[2], b->ne[3]); - // printf("im2col: %ld %ld %ld %ld\n", im2col->ne[0], im2col->ne[1], im2col->ne[2], im2col->ne[3]); - // printf("result: %ld %ld %ld %ld\n", result->ne[0], result->ne[1], result->ne[2], result->ne[3]); - // printf("fsmn_memory: %ld %ld %ld %ld\n", fsmn_memory->ne[0], fsmn_memory->ne[1], fsmn_memory->ne[2], fsmn_memory->ne[3]); - // printf("V: %ld %ld %ld %ld\n", V->ne[0], V->ne[1], V->ne[2], V->ne[3]); - // } } fsmn_memory = ggml_cont(ctx0, ggml_transpose(ctx0, fsmn_memory)); fsmn_memory = ggml_add(ctx0, fsmn_memory, V); ggml_set_name(fsmn_memory, "fsmn_memory"); } - struct ggml_tensor *KQV; float KQscale = 1.0f / sqrtf(float(n_state) / n_head); if(user_flash_attn){ @@ -232,7 +222,7 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams ggml_element_size(state->kv_pad.v)*n_state_head, ggml_element_size(state->kv_pad.v)*n_state*n_ctx_pad, 0); - KQV = ggml_flash_attn_ext(ctx0, Q_h, K, V, nullptr, KQscale, 0.0f, 0.0f); + ggml_tensor *KQV = ggml_flash_attn_ext(ctx0, Q_h, K, V, nullptr, KQscale, 0.0f, 0.0f); cur = ggml_reshape_3d(ctx0, KQV, n_state, n_ctx, n_batch); } else{ // K * Q @@ -241,7 +231,7 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams struct ggml_tensor *KQ_soft_max = ggml_soft_max_ext(ctx0, KQ, nullptr, KQscale, 0.0f); - KQV = ggml_mul_mat( + ggml_tensor *KQV = ggml_mul_mat( ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, V_h)), KQ_soft_max); struct ggml_tensor *KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); cur = ggml_cpy(ctx0, @@ -249,11 +239,6 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state, n_ctx, n_batch)); } - - - cur = ggml_cpy(ctx0, cur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state, n_ctx, n_batch)); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.e_attn_ln_out_w, cur), layer.e_attn_ln_out_b); ggml_set_name(cur, "attention_out"); diff --git a/sense-voice/csrc/sense-voice-frontend.cc b/sense-voice/csrc/sense-voice-frontend.cc index a58b901..c56fd84 100644 --- a/sense-voice/csrc/sense-voice-frontend.cc +++ b/sense-voice/csrc/sense-voice-frontend.cc @@ -284,6 +284,53 @@ bool load_wav_file(const char *filename, int32_t *sampling_rate, free(speech_buff); return false; } - } +bool load_wav_file(const char *filename, int32_t *sampling_rate, + std::vector &data) { + struct WaveHeader header {}; + + std::ifstream is(filename, std::ifstream::binary); + is.read(reinterpret_cast(&header), sizeof(header)); + if (!is) { + std::cout << "Failed to read " << filename; + return false; + } + + if (!header.Validate()) { + return false; + } + + header.SeekToDataChunk(is); + if (!is) { + return false; + } + + *sampling_rate = header.sample_rate; + // header.subchunk2_size contains the number of bytes in the data. + // As we assume each sample contains two bytes, so it is divided by 2 here + auto speech_len = header.subchunk2_size / 2; + data.resize(speech_len); + + auto speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_len); + + if (speech_buff) { + memset(speech_buff, 0, sizeof(int16_t) * speech_len); + is.read(reinterpret_cast(speech_buff), header.subchunk2_size); + if (!is) { + std::cout << "Failed to read " << filename; + return false; + } + +// float scale = 32768; + float scale = 1.0; + for (int32_t i = 0; i != speech_len; ++i) { + data[i] = (float)speech_buff[i] / scale; + } + free(speech_buff); + return true; + } else { + free(speech_buff); + return false; + } +} From 66fe04d751345948eff952337b74a93b25fd8d3e Mon Sep 17 00:00:00 2001 From: Ivy233 Date: Mon, 8 Sep 2025 19:01:49 +0800 Subject: [PATCH 4/4] =?UTF-8?q?=E6=96=87=E4=BB=B6=E8=AF=BB=E5=8F=96?= =?UTF-8?q?=E9=87=87=E7=94=A8=E6=B5=81=E8=BE=93=E5=85=A5=E3=80=82=E5=8F=A6?= =?UTF-8?q?=E5=A4=96=EF=BC=8Csegment=E7=9A=84double=E7=B1=BB=E5=9E=8B?= =?UTF-8?q?=E6=8D=A2=E6=88=90float=E7=B1=BB=E5=9E=8B=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/zcr_main/main.cc | 224 +++++++++++++++-------- sense-voice/csrc/common.h | 2 +- sense-voice/csrc/sense-voice-frontend.cc | 212 +++++++++++++++++---- sense-voice/csrc/sense-voice-frontend.h | 5 + sense-voice/csrc/sense-voice.cc | 4 +- sense-voice/csrc/sense-voice.h | 2 +- 6 files changed, 331 insertions(+), 118 deletions(-) diff --git a/examples/zcr_main/main.cc b/examples/zcr_main/main.cc index 558f617..8ff7fe7 100644 --- a/examples/zcr_main/main.cc +++ b/examples/zcr_main/main.cc @@ -7,11 +7,31 @@ #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable : 4244 4267)// possible loss of data #endif +// 从ifstream读取音频chunk +bool read_audio_chunk(std::ifstream &file, std::vector &chunk, size_t samples_to_read) { + std::vector temp_buffer(samples_to_read); + file.read(reinterpret_cast(temp_buffer.data()), samples_to_read * sizeof(int16_t)); + + if (file.gcount() == 0) { + return false; // 文件结束 + } + + size_t actual_samples = file.gcount() / sizeof(int16_t); + chunk.resize(actual_samples); + + float scale = 1.0f; // 保持与原有代码一致的缩放 + for (size_t i = 0; i < actual_samples; i++) { + chunk[i] = static_cast(temp_buffer[i]) / scale; + } + + return true; +} // command-line parameters struct sense_voice_params { @@ -220,7 +240,8 @@ static bool is_file_exist(const char *fileName) { } // 函数声明 -void sense_voice_process_stream(struct sense_voice_context *ctx, const sense_voice_params ¶ms, std::vector &pcmf32); +void sense_voice_process_stream_from_file(struct sense_voice_context *ctx, const sense_voice_params ¶ms, + std::ifstream &file, const WaveHeader &header); void sense_voice_process_batch(struct sense_voice_context *ctx, const sense_voice_params ¶ms, std::vector &batch); bool check_and_process_batch_if_full(struct sense_voice_context *ctx, const sense_voice_params ¶ms, @@ -323,8 +344,9 @@ void sense_voice_free(struct sense_voice_context *ctx) { } } -// 流式音频处理:从输入音频分批读取并处理 -void sense_voice_process_stream(struct sense_voice_context *ctx, const sense_voice_params ¶ms, std::vector &pcmf32) { +// 流式音频处理:从ifstream逐块读取并处理 +void sense_voice_process_stream_from_file(struct sense_voice_context *ctx, const sense_voice_params ¶ms, + std::ifstream &file, const WaveHeader &header) { const int n_sample_step = params.chunk_size * 1e-3 * SENSE_VOICE_SAMPLE_RATE; const int keep_nomute_step = params.chunk_size * params.min_mute_chunks * 1e-3 * SENSE_VOICE_SAMPLE_RATE; const int max_nomute_step = params.chunk_size * params.max_nomute_chunks * 1e-3 * SENSE_VOICE_SAMPLE_RATE; @@ -334,72 +356,55 @@ void sense_voice_process_stream(struct sense_voice_context *ctx, const sense_voi size_t current_batch_size = 0; int L_nomute = -1, L_mute = -1, R_mute = -1; - - for (int i = 0; i < int(pcmf32.size()); i += n_sample_step) { - int R_this_chunk = std::min(i + n_sample_step, int(pcmf32.size())); - bool isnomute = false; - - // VAD检测 - int actual_chunk_size = R_this_chunk - i; - int vad_chunk_size = std::max(640, actual_chunk_size); - std::vector chunk(vad_chunk_size, 0); - - for (int j = 0; j < actual_chunk_size && i + j < int(pcmf32.size()); j++) { - chunk[j] = pcmf32[i + j] / 32768.0f; - } - - if (actual_chunk_size < 640) { - float last_sample = (actual_chunk_size > 0) ? chunk[actual_chunk_size - 1] : 0.0f; - for (int j = actual_chunk_size; j < 640; j++) { - chunk[j] = last_sample; + + // 流式读取缓冲区 + std::vector audio_buffer; + std::vector chunk_data; + const size_t chunk_samples = n_sample_step; + int processed_samples = 0; + + // 逐块读取音频数据 + while (read_audio_chunk(file, chunk_data, chunk_samples)) { + // 将新数据追加到缓冲区 + audio_buffer.insert(audio_buffer.end(), chunk_data.begin(), chunk_data.end()); + + // 处理缓冲区中的完整chunks + while (audio_buffer.size() >= processed_samples + n_sample_step) { + int i = processed_samples; + int R_this_chunk = std::min(i + n_sample_step, (int)audio_buffer.size()); + bool isnomute = false; + + // VAD检测 + int actual_chunk_size = R_this_chunk - i; + int vad_chunk_size = std::max(640, actual_chunk_size); + std::vector vad_chunk(vad_chunk_size, 0); + + for (int j = 0; j < actual_chunk_size && i + j < (int)audio_buffer.size(); j++) { + vad_chunk[j] = audio_buffer[i + j] / 32768.0f; } - } - - float speech_prob = 0; - if (silero_vad_encode_internal(*ctx, *ctx->state, chunk, params.n_threads, speech_prob)) { - isnomute = (speech_prob >= params.speech_prob_threshold); - } else { - // 转换为double用于兼容原有的vad_energy_zcr函数 - std::vector pcm_double(pcmf32.begin() + i, pcmf32.begin() + R_this_chunk); - isnomute = vad_energy_zcr(pcm_double.begin(), R_this_chunk - i, SENSE_VOICE_SAMPLE_RATE); - } - // 音频分段逻辑 - if (L_nomute >= 0 && R_this_chunk - L_nomute >= max_nomute_step) { - int R_nomute = L_mute >= 0 && L_mute >= L_nomute ? L_mute : R_this_chunk; - sense_voice_segment segment; - segment.t0 = L_nomute; - segment.t1 = R_nomute; - // 转换为double存储(保持与现有代码的兼容性) - segment.samples = std::vector(pcmf32.begin() + L_nomute, pcmf32.begin() + R_nomute); - - // 使用优化的批处理函数检查并处理满载的batch - size_t segment_size = segment.samples.size(); - check_and_process_batch_if_full(ctx, params, current_batch, current_batch_size, - segment_size, batch_samples); - - current_batch.push_back(segment); - current_batch_size += segment_size; + if (actual_chunk_size < 640) { + float last_sample = (actual_chunk_size > 0) ? vad_chunk[actual_chunk_size - 1] : 0.0f; + for (int j = actual_chunk_size; j < 640; j++) { + vad_chunk[j] = last_sample; + } + } - if (!isnomute) L_nomute = -1; - else if (R_mute >= 0 && L_mute >= L_nomute) L_nomute = R_mute; - else L_nomute = i; - L_mute = R_mute = -1; - continue; - } + float speech_prob = 0; + if (silero_vad_encode_internal(*ctx, *ctx->state, vad_chunk, params.n_threads, speech_prob)) { + isnomute = (speech_prob >= params.speech_prob_threshold); + } else { + isnomute = vad_energy_zcr(audio_buffer.begin() + i, R_this_chunk - i, SENSE_VOICE_SAMPLE_RATE); + } - if (isnomute) { - if (L_nomute < 0) L_nomute = i; - } else { - if (R_mute != i) L_mute = i; - R_mute = R_this_chunk; - if (L_mute >= L_nomute && L_nomute >= 0 && R_this_chunk - L_mute >= keep_nomute_step) { + // 音频分段逻辑(与原来的逻辑相同) + if (L_nomute >= 0 && R_this_chunk - L_nomute >= max_nomute_step) { + int R_nomute = L_mute >= 0 && L_mute >= L_nomute ? L_mute : R_this_chunk; sense_voice_segment segment; segment.t0 = L_nomute; - segment.t1 = L_mute; - segment.samples = std::vector(pcmf32.begin() + L_nomute, pcmf32.begin() + L_mute); + segment.t1 = R_nomute; + segment.samples = std::vector(audio_buffer.begin() + L_nomute, audio_buffer.begin() + R_nomute); - // 使用优化的批处理函数检查并处理满载的batch size_t segment_size = segment.samples.size(); check_and_process_batch_if_full(ctx, params, current_batch, current_batch_size, segment_size, batch_samples); @@ -408,21 +413,66 @@ void sense_voice_process_stream(struct sense_voice_context *ctx, const sense_voi current_batch_size += segment_size; if (!isnomute) L_nomute = -1; - else if (R_mute >= 0) L_nomute = R_mute; + else if (R_mute >= 0 && L_mute >= L_nomute) L_nomute = R_mute; else L_nomute = i; L_mute = R_mute = -1; + + processed_samples = R_this_chunk; + continue; + } + + if (isnomute) { + if (L_nomute < 0) L_nomute = i; + } else { + if (R_mute != i) L_mute = i; + R_mute = R_this_chunk; + if (L_mute >= L_nomute && L_nomute >= 0 && R_this_chunk - L_mute >= keep_nomute_step) { + sense_voice_segment segment; + segment.t0 = L_nomute; + segment.t1 = L_mute; + segment.samples = std::vector(audio_buffer.begin() + L_nomute, audio_buffer.begin() + L_mute); + + size_t segment_size = segment.samples.size(); + check_and_process_batch_if_full(ctx, params, current_batch, current_batch_size, + segment_size, batch_samples); + + current_batch.push_back(segment); + current_batch_size += segment_size; + + if (!isnomute) L_nomute = -1; + else if (R_mute >= 0) L_nomute = R_mute; + else L_nomute = i; + L_mute = R_mute = -1; + } } + + processed_samples = R_this_chunk; + } + + // 定期清理已处理的缓冲区数据,防止内存无限增长 + if (processed_samples > 0 && audio_buffer.size() > 2 * max_nomute_step) { + std::vector temp_buffer(audio_buffer.begin() + processed_samples, audio_buffer.end()); + audio_buffer = std::move(temp_buffer); + + // 调整位置索引 + L_nomute -= processed_samples; + L_mute -= processed_samples; + R_mute -= processed_samples; + if (L_nomute < 0 && L_nomute != -1) L_nomute = -1; + if (L_mute < 0 && L_mute != -1) L_mute = -1; + if (R_mute < 0 && R_mute != -1) R_mute = -1; + + processed_samples = 0; } } // 处理最后一段 - if (L_nomute >= 0) { + if (L_nomute >= 0 && L_nomute < (int)audio_buffer.size()) { sense_voice_segment segment; segment.t0 = L_nomute; - segment.t1 = pcmf32.size(); - segment.samples = std::vector(pcmf32.begin() + L_nomute, pcmf32.end()); + segment.t1 = audio_buffer.size(); + segment.samples = std::vector(audio_buffer.begin() + L_nomute, audio_buffer.end()); - // 使用优化的批处理函数检查并处理满载的batch size_t segment_size = segment.samples.size(); check_and_process_batch_if_full(ctx, params, current_batch, current_batch_size, segment_size, batch_samples); @@ -543,7 +593,7 @@ int main(int argc, char **argv) { ctx->language_id = sense_voice_lang_id(params.language.c_str()); - // 初始化silero-vad状态 + // 初始化silero-vad状态 const int VAD_LSTM_STATE_MEMORY_SIZE = 2048; const int VAD_LSTM_STATE_DIM = 128; @@ -570,14 +620,30 @@ int main(int argc, char **argv) { const auto fname_inp = params.fname_inp[f]; const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f]; - std::vector pcmf32;// mono-channel F32 PCM + // 使用ifstream进行流式音频读取 + std::ifstream file(fname_inp.c_str(), std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open audio file '%s'\n", fname_inp.c_str()); + continue; + } - int sample_rate; - if (!::load_wav_file(fname_inp.c_str(), &sample_rate, pcmf32)) { - fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str()); + // 读取WAV头 + WaveHeader header; + file.read(reinterpret_cast(&header), sizeof(header)); + if (!file || !header.Validate()) { + fprintf(stderr, "error: invalid WAV file format '%s'\n", fname_inp.c_str()); continue; } + header.SeekToDataChunk(file); + if (!file) { + fprintf(stderr, "error: failed to find data chunk in '%s'\n", fname_inp.c_str()); + continue; + } + + int sample_rate = header.sample_rate; + size_t total_samples = header.subchunk2_size / 2; // 16-bit samples + if (!params.no_prints) { // print system information fprintf(stderr, "\n"); @@ -586,19 +652,21 @@ int main(int argc, char **argv) { // print some info about the processing fprintf(stderr, "\n"); - fprintf(stderr, "%s: processing audio (%d samples, %.5f sec) , %d threads, %d processors, lang = %s...\n", - __func__, int(pcmf32.size()), float(pcmf32.size()) / sample_rate, + fprintf(stderr, "%s: processing audio stream (%zu samples, %.5f sec) , %d threads, %d processors, lang = %s...\n", + __func__, total_samples, float(total_samples) / sample_rate, params.n_threads, params.n_processors, params.language.c_str()); - ctx->state->duration = float(pcmf32.size()) / sample_rate; + ctx->state->duration = float(total_samples) / sample_rate; fprintf(stderr, "\n"); } { - // 使用新的流式处理函数 - sense_voice_process_stream(ctx, params, pcmf32); + // 使用流式处理音频 + sense_voice_process_stream_from_file(ctx, params, file, header); } + + file.close(); } sense_voice_free(ctx); return 0; -} +} \ No newline at end of file diff --git a/sense-voice/csrc/common.h b/sense-voice/csrc/common.h index fab2650..4def435 100644 --- a/sense-voice/csrc/common.h +++ b/sense-voice/csrc/common.h @@ -300,7 +300,7 @@ struct sense_voice_segment { size_t t1; // 时间区间右端点 // std::string text; // tokens对应的文本 std::vector tokens; // 识别后的tokens - std::vector samples; // 具体音频 + std::vector samples; // 具体音频 // std::vector // bool speaker_turn_next; }; diff --git a/sense-voice/csrc/sense-voice-frontend.cc b/sense-voice/csrc/sense-voice-frontend.cc index c56fd84..b842dfe 100644 --- a/sense-voice/csrc/sense-voice-frontend.cc +++ b/sense-voice/csrc/sense-voice-frontend.cc @@ -284,53 +284,193 @@ bool load_wav_file(const char *filename, int32_t *sampling_rate, free(speech_buff); return false; } + } -bool load_wav_file(const char *filename, int32_t *sampling_rate, - std::vector &data) { - struct WaveHeader header {}; - std::ifstream is(filename, std::ifstream::binary); - is.read(reinterpret_cast(&header), sizeof(header)); - if (!is) { - std::cout << "Failed to read " << filename; - return false; - } +// Float version of fbank_feature_worker_thread +static void fbank_feature_worker_thread_float(int ith, + const std::vector &hamming, + const std::vector &samples, + int n_samples, int frame_size, + int frame_step, int n_threads, + sense_voice_feature &mel) { + // make sure n_fft == 1 + (sense_voice_N_FFT / 2), bin_0 to bin_nyquist + int i = ith; - if (!header.Validate()) { - return false; - } + std::vector window; + const int padded_window_size = round_to_nearest_power_two(frame_size); + window.resize(padded_window_size); - header.SeekToDataChunk(is); - if (!is) { - return false; + // calculate FFT only when fft_in are not all zero + int n_fft = std::min(n_samples / frame_step + 1, mel.n_len); + for (; i < n_fft; i += n_threads) { + const int offset = i * frame_step; + + // Convert float to double for processing + for (int j = 0; j < frame_size; j++) { + window[j] = static_cast(samples[offset + j]); + } + + { + // init window default 0, initialization values may result in NaN on arm cpu. + for (int k = frame_size; k < window.size(); k++) { + window[k] = 0; + } + } + // remove dc offset + { + double sum = 0; + for (int32_t k = 0; k < frame_size; ++k) { + sum += window[k]; + } + double mean = sum / frame_size; + for (int32_t k = 0; k < frame_size; ++k) { + window[k] -= mean; + } + } + // pre-emphasis + { + for (int32_t k = frame_size - 1; k > 0; --k) { + window[k] -= PREEMPH_COEFF * window[k - 1]; + } + window[0] -= PREEMPH_COEFF * window[0]; + } + + // apply Hamming window + { + for (int j = 0; j < frame_size; j++) { + window[j] *= hamming[j]; + } + } + + // FFT + // window is input and output + rfft(window); + + // Calculate modulus^2 of complex numbers,Power Spectrum + // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes + // inference quality problem? Interesting. + for (int j = 0; j < padded_window_size / 2; j++) { + window[j] = (window[2 * j + 0] * window[2 * j + 0] + + window[2 * j + 1] * window[2 * j + 1]); + } + + // log-Mel filter bank energies aka: "fbank" + { + auto num_fft_bins = padded_window_size / 2; + int n_mel = mel.n_mel; + for (int j = 0; j < n_mel; j++) { + double sum = 0.0; + for (int k = 0; k < num_fft_bins; k++) { + sum += window[k] * LogMelFilterMelArray[j * num_fft_bins + k]; + } + + sum = log(sum > 1.19e-7 ? sum : 1.19e-7); + + mel.data[i * n_mel + j] = static_cast(sum); + } + } } +} - *sampling_rate = header.sample_rate; - // header.subchunk2_size contains the number of bytes in the data. - // As we assume each sample contains two bytes, so it is divided by 2 here - auto speech_len = header.subchunk2_size / 2; - data.resize(speech_len); +// Float version of fbank_lfr_cmvn_feature +bool fbank_lfr_cmvn_feature(const std::vector &samples, + const int n_samples, const int frame_size, + const int frame_step, const int n_feats, + const int n_threads, const bool debug, + sense_voice_cmvn &cmvn, sense_voice_feature &feats) { + // const int64_t t_start_us = ggml_time_us(); - auto speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_len); + const int32_t n_frames_per_ms = SENSE_VOICE_SAMPLE_RATE * 0.001f; + feats.n_mel = n_feats; + feats.n_len = 1 + ((n_samples - frame_size * n_frames_per_ms) / + (frame_step * n_frames_per_ms)); + feats.data.resize(feats.n_mel * feats.n_len); - if (speech_buff) { - memset(speech_buff, 0, sizeof(int16_t) * speech_len); - is.read(reinterpret_cast(speech_buff), header.subchunk2_size); - if (!is) { - std::cout << "Failed to read " << filename; - return false; + std::vector hamming; + hamming_window(frame_size * n_frames_per_ms, true, hamming); + + { + if (n_threads > 1) { + ThreadPool pool(n_threads); + for (int iw = 0; iw < n_threads - 1; ++iw) { + pool.enqueue(fbank_feature_worker_thread_float, iw + 1, std::cref(hamming), + samples, n_samples, frame_size * n_frames_per_ms, + frame_step * n_frames_per_ms, n_threads, std::ref(feats)); + } } -// float scale = 32768; - float scale = 1.0; - for (int32_t i = 0; i != speech_len; ++i) { - data[i] = (float)speech_buff[i] / scale; + // main thread + fbank_feature_worker_thread_float(0, hamming, samples, n_samples, + frame_size * n_frames_per_ms, + frame_step * n_frames_per_ms, n_threads, feats); + } + + if (debug) { + auto &mel = feats.data; + std::ofstream outFile("fbank_lfr_cmvn_feature_float.json"); + outFile << "["; + for (uint64_t i = 0; i < mel.size() - 1; i++) { + outFile << mel[i] << ", "; + } + outFile << mel[mel.size() - 1] << "]"; + outFile.close(); + } + + std::vector> out_feats; + + // tapply lrf, merge lfr_m frames as one,lfr_n frames per window + // ref: + // https://github.com/alibaba-damo-academy/FunASR/blob/main/runtime/onnxruntime/src/paraformer.cpp#L409-L440 + int T = feats.n_len; + int lfr_m = feats.lfr_m; // 7 + int lfr_n = feats.lfr_n; // 6 + int T_lrf = ceil(1.0 * T / feats.lfr_n); + int left_pad = (feats.lfr_m - 1) / 2; + int left_pad_offset = (lfr_m - left_pad) * feats.n_mel; + // Merge lfr_m frames as one,lfr_n frames per window + T = T + (lfr_m - 1) / 2; + std::vector p; + for (int i = 0; i < T_lrf; i++) { + // the first frames need left padding + if (i == 0) { + // left padding + for (int j = 0; j < left_pad; j++) { + p.insert(p.end(), feats.data.begin(), feats.data.begin() + feats.n_mel); + } + p.insert(p.end(), feats.data.begin(), feats.data.begin() + left_pad_offset); + out_feats.push_back(p); + p.clear(); + } else { + if (lfr_m <= T - i * lfr_n) { + p.insert(p.end(), feats.data.begin() + (i * lfr_n - left_pad) * feats.n_mel, + feats.data.begin() + (i * lfr_n - left_pad + lfr_m) * feats.n_mel); + out_feats.push_back(p); + p.clear(); + } else { + // Fill to lfr_m frames at last window if less than lfr_m frames (copy + // last frame) + int num_padding = lfr_m - (T - i * lfr_n); + for (int j = 0; j < (feats.n_len - i * lfr_n); j++) { + p.insert(p.end(), + feats.data.begin() + (i * lfr_n - left_pad) * feats.n_mel, + feats.data.end()); + } + for (int j = 0; j < num_padding; j++) { + p.insert(p.end(), feats.data.end() - feats.n_mel, feats.data.end()); + } + out_feats.push_back(p); + p.clear(); + } + } + } + feats.data.resize(T_lrf * feats.lfr_m * feats.n_mel); + // apply cvmn + for (int i = 0; i < T_lrf; i++) { + for (int j = 0; j < feats.lfr_m * feats.n_mel; j++) { + feats.data[i * feats.lfr_m * feats.n_mel + j] = (out_feats[i][j] + cmvn.cmvn_means[j]) * cmvn.cmvn_vars[j]; } - free(speech_buff); - return true; - } else { - free(speech_buff); - return false; } + return true; } diff --git a/sense-voice/csrc/sense-voice-frontend.h b/sense-voice/csrc/sense-voice-frontend.h index 4e75d69..379b5d6 100644 --- a/sense-voice/csrc/sense-voice-frontend.h +++ b/sense-voice/csrc/sense-voice-frontend.h @@ -122,6 +122,11 @@ bool fbank_lfr_cmvn_feature(const std::vector &samples, const int frame_step, const int n_feats, const int n_threads, const bool debug, sense_voice_cmvn &cmvn, sense_voice_feature &feats); +bool fbank_lfr_cmvn_feature(const std::vector &samples, + const int n_samples, const int frame_size, + const int frame_step, const int n_feats, + const int n_threads, const bool debug, + sense_voice_cmvn &cmvn, sense_voice_feature &feats); bool load_wav_file(const char *filename, int32_t *sampling_rate, std::vector &data); diff --git a/sense-voice/csrc/sense-voice.cc b/sense-voice/csrc/sense-voice.cc index 1545441..de0a08f 100644 --- a/sense-voice/csrc/sense-voice.cc +++ b/sense-voice/csrc/sense-voice.cc @@ -799,7 +799,7 @@ int sense_voice_batch_pcm_to_feature_with_state(struct sense_voice_context *ctx, max_len = std::max(max_len, state->result_all[segmentID].samples.size()); for (size_t segmentID: state->segmentIDs) { - std::vector& pcmf32 = state->result_all[segmentID].samples; + std::vector& pcmf32 = state->result_all[segmentID].samples; if(pcmf32.size() < max_len) { pcmf32.insert(pcmf32.end(), max_len - pcmf32.size(), 0); } @@ -892,7 +892,7 @@ int sense_voice_batch_full(struct sense_voice_context *ctx, const sense_voice_fu return 0; } -int sense_voice_batch_pcmf(struct sense_voice_context *ctx, const sense_voice_full_params ¶ms, std::vector> &pcmf32, +int sense_voice_batch_pcmf(struct sense_voice_context *ctx, const sense_voice_full_params ¶ms, std::vector> &pcmf32, size_t max_batch_len, size_t max_batch_cnt, bool use_prefix, bool use_itn) { diff --git a/sense-voice/csrc/sense-voice.h b/sense-voice/csrc/sense-voice.h index a5ce3d1..5f45ef1 100644 --- a/sense-voice/csrc/sense-voice.h +++ b/sense-voice/csrc/sense-voice.h @@ -25,7 +25,7 @@ int sense_voice_full_parallel(struct sense_voice_context * ctx, void sense_voice_print_output(struct sense_voice_context * ctx, bool need_prefix, bool use_itn, bool refresh_self=false); void sense_voice_free_state(struct sense_voice_state * state); int sense_voice_batch_full(struct sense_voice_context * ctx, const sense_voice_full_params ¶ms); -int sense_voice_batch_pcmf(struct sense_voice_context *ctx, const sense_voice_full_params ¶ms, std::vector> &pcmf32, +int sense_voice_batch_pcmf(struct sense_voice_context *ctx, const sense_voice_full_params ¶ms, std::vector> &pcmf32, size_t max_batch_len=90000, size_t max_batch_cnt=1, bool use_prefix=true, bool use_itn=true); void sense_voice_batch_print_output(struct sense_voice_context * ctx, bool need_prefix, bool use_itn, bool refresh_self=false);