diff --git a/examples/zcr_main/main.cc b/examples/zcr_main/main.cc index 17065ae..52c24d3 100644 --- a/examples/zcr_main/main.cc +++ b/examples/zcr_main/main.cc @@ -42,6 +42,7 @@ struct sense_voice_params { std::string openvino_encode_device = "CPU"; std::vector fname_inp = {}; std::vector fname_out = {}; + std::string outfile = ""; }; static int sense_voice_has_coreml(void) { @@ -107,6 +108,7 @@ static void sense_voice_print_usage(int /*argc*/, char **argv, const sense_voice fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false"); fprintf(stderr, " -itn, --use-itn [%-7s] use itn\n", params.use_itn ? "true" : "false"); + fprintf(stderr, " -fout --outfile [%s] output file path\n", params.outfile.c_str()); fprintf(stderr, " --chunk_size [%-7lu] vad chunk size(ms)\n", params.chunk_size); fprintf(stderr, " -mmc --min-mute-chunks [%-7lu] When consecutive chunks are identified as silence\n", params.min_mute_chunks); fprintf(stderr, " -mnc --max-nomute-chunks [%-7lu] when the first non-silent chunk is too far away\n", params.max_nomute_chunks); @@ -166,7 +168,7 @@ static bool sense_voice_params_parse(int argc, char **argv, sense_voice_params & } else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); } else if (arg == "-np" || arg == "--no-prints") { - params.no_prints = false; + params.no_prints = true; } else if (arg == "-l" || arg == "--language") { params.language = sense_voice_param_turn_lowercase(argv[++i]); } else if (arg == "--prompt") { @@ -195,6 +197,8 @@ static bool sense_voice_params_parse(int argc, char **argv, sense_voice_params & params.max_batch = std::stoi(argv[++i]); } else if (arg == "--chunk_size") { params.chunk_size = std::stoi(argv[++i]); + } else if (arg == "--outfile" || arg == "-fout") { + params.outfile = argv[++i]; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); sense_voice_print_usage(argc, argv, params); @@ -385,6 +389,11 @@ int main(int argc, char **argv) { exit(0); } + if (!params.outfile.empty()) { + freopen(params.outfile.c_str(), "w", stdout); + params.use_prefix = false; + } + // sense-voice init struct sense_voice_context_params cparams = sense_voice_context_default_params(); diff --git a/sense-voice/csrc/sense-voice.cc b/sense-voice/csrc/sense-voice.cc index a958927..ada7e56 100644 --- a/sense-voice/csrc/sense-voice.cc +++ b/sense-voice/csrc/sense-voice.cc @@ -808,11 +808,11 @@ int sense_voice_batch_pcm_to_feature_with_state(struct sense_voice_context *ctx, state->feature.frame_size, state->feature.frame_step, state->feature.n_mel, - n_threads, true, cmvn, state->feature); + n_threads, false, cmvn, state->feature); state->feature.input_data.insert(state->feature.input_data.end(), state->feature.data.begin(), state->feature.data.end()); } - state->t_feature_us = ggml_time_us() - t_start_us; + state->t_feature_us += ggml_time_us() - t_start_us; // set input {