From 8146d4f030f6610c6693fa83263929029268c7a3 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 15:07:43 +0800 Subject: [PATCH 01/29] compile and download run with microphone, not solve usb apply headset crash issue yet --- compile.txt | 5 + doubao.cpp | 157 +++++++++++++++++++++++++ doubao_gpu.cpp | 195 +++++++++++++++++++++++++++++++ doubao_mic.cpp | 299 ++++++++++++++++++++++++++++++++++++++++++++++++ download.txt | 2 + minimal_mic.cpp | 82 +++++++++++++ run.txt | 2 + 7 files changed, 742 insertions(+) create mode 100644 compile.txt create mode 100644 doubao.cpp create mode 100644 doubao_gpu.cpp create mode 100644 doubao_mic.cpp create mode 100644 download.txt create mode 100644 minimal_mic.cpp create mode 100644 run.txt diff --git a/compile.txt b/compile.txt new file mode 100644 index 00000000000..007e59390b2 --- /dev/null +++ b/compile.txt @@ -0,0 +1,5 @@ +g++ -O3 minimal_mic.cpp \ + -I. -I./include -I./ggml/include -I./examples \ + ./build/src/libwhisper.so \ + -L/usr/local/cuda/lib64 -lcudart -lcublas \ + -lpthread -ldl -lm -lrt -o minimal_mic diff --git a/doubao.cpp b/doubao.cpp new file mode 100644 index 00000000000..49217508a05 --- /dev/null +++ b/doubao.cpp @@ -0,0 +1,157 @@ +#include "whisper.h" +#include "common.h" + +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" + +#include +#include +#include +#include +#include +#include + +// 全局原子变量控制录制状态(线程安全) +std::atomic is_recording(false); +// 音频缓冲区 +std::vector audio_buffer; + +// 音频回调:仅在录制状态时才采集数据 +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + if (!is_recording.load()) return; // 非录制状态直接返回,不采集数据 + const float* pInputFloat = (const float*)pInput; + if (pInputFloat == NULL) return; + + // 采集数据到缓冲区(限制最大录制时长为30秒,防止溢出) + const size_t max_frames = 16000 * 30; // 30秒 @ 16kHz + const size_t available = max_frames - audio_buffer.size(); + if (available == 0) return; // 缓冲区已满,停止采集 + + const size_t copy_frames = (frameCount > available) ? available : frameCount; + audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + copy_frames); +} + +// 提示信息函数 +void print_usage() { + printf("=============================================\n"); + printf("🎤 语音识别程序(精准录制版)\n"); + printf("操作说明:\n"); + printf(" 1. 按下【回车键】开始录制\n"); + printf(" 2. 说话完成后,再次按下【回车键】停止录制并识别\n"); + printf(" 3. 录制超过30秒会自动停止\n"); + printf(" 4. Ctrl+C 退出程序\n"); + printf("=============================================\n"); +} + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const char* model_path = argv[1]; + + // 1. 初始化 Whisper + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = true; // 4050 显卡 + struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); + if (!ctx) { + fprintf(stderr, "❌ 初始化Whisper模型失败\n"); + return 1; + } + + // 2. 初始化 Miniaudio(仅初始化设备,不立即采集) + ma_device_config deviceConfig = ma_device_config_init(ma_device_type_capture); + deviceConfig.capture.format = ma_format_f32; // Whisper 需要 float32 + deviceConfig.capture.channels = 1; // 单声道 + deviceConfig.sampleRate = 16000; // Whisper 硬指标 16kHz + deviceConfig.dataCallback = data_callback; + deviceConfig.pUserData = nullptr; // 不再传buffer,用全局变量 + + ma_device device; + if (ma_device_init(NULL, &deviceConfig, &device) != MA_SUCCESS) { + fprintf(stderr, "❌ 打开录音设备失败\n"); + whisper_free(ctx); + return -2; + } + + // 启动设备(但此时is_recording=false,不会采集数据) + if (ma_device_start(&device) != MA_SUCCESS) { + fprintf(stderr, "❌ 启动录音设备失败\n"); + ma_device_uninit(&device); + whisper_free(ctx); + return -3; + } + + print_usage(); + + while (true) { + // 第一步:等待用户按回车开始录制 + printf("\n👉 按下回车键开始录制...\n"); + getchar(); // 等待回车 + + // 开始录制 + is_recording.store(true); + audio_buffer.clear(); // 清空旧数据 + printf("🎙️ 正在录制(说话完成后按回车键停止,最长录制30秒)...\n"); + + // 等待用户停止录制(按回车)或超时30秒 + std::thread wait_thread([&]() { + getchar(); // 等待用户按回车停止 + is_recording.store(false); + }); + + // 超时控制(30秒) + auto start_time = std::chrono::steady_clock::now(); + while (is_recording.load()) { + auto now = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast(now - start_time).count(); + if (duration >= 30) { + printf("⏱️ 录制超时(30秒),自动停止\n"); + is_recording.store(false); + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // 避免CPU空转 + } + + wait_thread.join(); // 等待停止线程结束 + is_recording.store(false); // 确保录制停止 + + // 检查录制的数据量 + if (audio_buffer.empty()) { + printf("⚠️ 未采集到任何音频数据,请重新录制\n"); + continue; + } + + // 第二步:开始识别 + printf("🔍 正在识别...\n"); + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + wparams.n_threads = 12; + wparams.print_progress = false; + wparams.print_realtime = false; + + if (whisper_full(ctx, wparams, audio_buffer.data(), audio_buffer.size()) != 0) { + fprintf(stderr, "❌ 识别失败\n"); + continue; + } + + // 输出识别结果 + const int n_segments = whisper_full_n_segments(ctx); + if (n_segments == 0) { + printf("📝: 未识别到有效内容\n"); + } else { + printf("📝 识别结果:\n"); + for (int i = 0; i < n_segments; ++i) { + const char* text = whisper_full_get_segment_text(ctx, i); + printf(" %s\n", text); + } + } + } + + // 清理资源(实际中Ctrl+C会中断,这里是兜底) + ma_device_uninit(&device); + whisper_free(ctx); + return 0; +} + diff --git a/doubao_gpu.cpp b/doubao_gpu.cpp new file mode 100644 index 00000000000..d045ad4993c --- /dev/null +++ b/doubao_gpu.cpp @@ -0,0 +1,195 @@ +#include "whisper.h" +#include "common.h" + +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" + +#include +#include +#include +#include +#include +#include + +// 全局原子变量控制录制状态(线程安全) +std::atomic is_recording(false); +// 音频缓冲区 +std::vector audio_buffer; + +// 音频回调:仅在录制状态时才采集数据 +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + if (!is_recording.load()) return; // 非录制状态直接返回,不采集数据 + const float* pInputFloat = (const float*)pInput; + if (pInputFloat == NULL) return; + + // 采集数据到缓冲区(限制最大录制时长为30秒,防止溢出) + const size_t max_frames = 16000 * 30; // 30秒 @ 16kHz + const size_t available = max_frames - audio_buffer.size(); + if (available == 0) return; // 缓冲区已满,停止采集 + + const size_t copy_frames = (frameCount > available) ? available : frameCount; + audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + copy_frames); +} + +// 提示信息函数 +void print_usage() { + printf("=============================================\n"); + printf("🎤 语音识别程序(精准录制版)\n"); + printf("操作说明:\n"); + printf(" 1. 按下【回车键】开始录制\n"); + printf(" 2. 说话完成后,再次按下【回车键】停止录制并识别\n"); + printf(" 3. 录制超过30秒会自动停止\n"); + printf(" 4. Ctrl+C 退出程序\n"); + printf("=============================================\n"); +} + +// 适配旧版本的GPU状态提示(不依赖新函数) +void check_gpu_status() { + printf("🔍 GPU加速配置说明...\n"); + printf(" 当前已启用GPU加速(use_gpu = true)\n"); + printf(" ✅ 如果编译时链接了CUDA库,模型会自动使用GPU\n"); + printf(" ❌ 如果识别速度很慢,说明实际使用CPU运行\n"); + printf(" 验证方法:观察识别耗时,GPU版本比CPU快5-10倍\n"); +} + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const char* model_path = argv[1]; + + // GPU状态提示(适配旧版本) + check_gpu_status(); + + // 1. 初始化 Whisper(仅保留旧版本支持的参数) + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = true; // 启用GPU(旧版本核心参数) + // 移除use_gpu_fp16和gpu_device(旧版本没有这些字段) + + printf("\n🚀 正在加载模型:%s\n", model_path); + struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); + if (!ctx) { + fprintf(stderr, "❌ 初始化Whisper模型失败\n"); + return 1; + } + + // 旧版本没有whisper_is_using_gpu,改用间接提示 + printf("✅ 模型加载成功!\n"); + printf(" 📌 若识别速度快(几秒内完成)= GPU运行\n"); + printf(" 📌 若识别速度慢(十几秒/分钟)= CPU运行\n"); + + // 2. 初始化 Miniaudio(仅初始化设备,不立即采集) + ma_device_config deviceConfig = ma_device_config_init(ma_device_type_capture); + deviceConfig.capture.format = ma_format_f32; // Whisper 需要 float32 + deviceConfig.capture.channels = 1; // 单声道 + deviceConfig.sampleRate = 16000; // Whisper 硬指标 16kHz + deviceConfig.dataCallback = data_callback; + deviceConfig.pUserData = nullptr; + + ma_device device; + if (ma_device_init(NULL, &deviceConfig, &device) != MA_SUCCESS) { + fprintf(stderr, "❌ 打开录音设备失败\n"); + whisper_free(ctx); + return -2; + } + + // 启动设备(但此时is_recording=false,不会采集数据) + if (ma_device_start(&device) != MA_SUCCESS) { + fprintf(stderr, "❌ 启动录音设备失败\n"); + ma_device_uninit(&device); + whisper_free(ctx); + return -3; + } + + print_usage(); + + while (true) { + // 第一步:等待用户按回车开始录制 + printf("\n👉 按下回车键开始录制...\n"); + getchar(); // 等待回车 + + // 开始录制 + is_recording.store(true); + audio_buffer.clear(); // 清空旧数据 + printf("🎙️ 正在录制(说话完成后按回车键停止,最长录制30秒)...\n"); + + // 等待用户停止录制(按回车)或超时30秒 + std::thread wait_thread([&]() { + getchar(); // 等待用户按回车停止 + is_recording.store(false); + }); + + // 超时控制(30秒) + auto start_time = std::chrono::steady_clock::now(); + while (is_recording.load()) { + auto now = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast(now - start_time).count(); + if (duration >= 30) { + printf("⏱️ 录制超时(30秒),自动停止\n"); + is_recording.store(false); + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // 避免CPU空转 + } + + wait_thread.join(); // 等待停止线程结束 + is_recording.store(false); // 确保录制停止 + + // 检查录制的数据量 + if (audio_buffer.empty()) { + printf("⚠️ 未采集到任何音频数据,请重新录制\n"); + continue; + } + + // 第二步:开始识别(优化识别参数提升精度) + printf("🔍 正在识别...\n"); + // 记录识别开始时间(用于判断GPU/CPU) + auto recognize_start = std::chrono::steady_clock::now(); + + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + wparams.n_threads = 12; // 根据CPU核心数调整 + wparams.print_progress = false; + wparams.print_realtime = false; + + // 精度优化参数(旧版本也支持) + wparams.temperature = 0.0; // 降低随机性,提升稳定性 + wparams.max_len = 0; // 不限制输出长度 + wparams.translate = false; // 不翻译,直接识别 + wparams.no_context = true; // 不使用上下文,避免干扰 + + if (whisper_full(ctx, wparams, audio_buffer.data(), audio_buffer.size()) != 0) { + fprintf(stderr, "❌ 识别失败\n"); + continue; + } + + // 计算识别耗时(判断GPU/CPU) + auto recognize_end = std::chrono::steady_clock::now(); + auto recognize_duration = std::chrono::duration_cast(recognize_end - recognize_start).count(); + printf("⏱️ 识别耗时:%.2f 秒\n", recognize_duration / 1000.0); + if (recognize_duration < 5000) { + printf(" 🎯 识别速度快,应该是GPU在运行!\n"); + } else { + printf(" ⚠️ 识别速度慢,可能是CPU在运行!\n"); + } + + // 输出识别结果 + const int n_segments = whisper_full_n_segments(ctx); + if (n_segments == 0) { + printf("📝: 未识别到有效内容\n"); + } else { + printf("📝 识别结果:\n"); + for (int i = 0; i < n_segments; ++i) { + const char* text = whisper_full_get_segment_text(ctx, i); + printf(" %s\n", text); + } + } + } + + // 清理资源 + ma_device_uninit(&device); + whisper_free(ctx); + return 0; +} diff --git a/doubao_mic.cpp b/doubao_mic.cpp new file mode 100644 index 00000000000..1c1946d9431 --- /dev/null +++ b/doubao_mic.cpp @@ -0,0 +1,299 @@ +#include "whisper.h" +#include +#include +#include +#include +#include +#include +#include +#include + +// ====================== 1. 枚举并选择麦克风设备(纯PortAudio原生实现) ====================== +void enumerate_audio_devices() { + PaError err = Pa_Initialize(); + if (err != paNoError) { + fprintf(stderr, "❌ PortAudio初始化失败: %s\n", Pa_GetErrorText(err)); + return; + } + + int numDevices = Pa_GetDeviceCount(); + printf("\n📜 系统可用麦克风设备列表:\n"); + printf("=============================================\n"); + for (int i = 0; i < numDevices; i++) { + const PaDeviceInfo* pInfo = Pa_GetDeviceInfo(i); + // 只显示输入设备(麦克风,至少1个输入声道) + if (pInfo->maxInputChannels > 0) { + printf("🔧 设备ID: %d | 名称: %s\n", i, pInfo->name); + printf(" 最大输入声道: %d | 默认采样率: %.1f Hz\n", + pInfo->maxInputChannels, pInfo->defaultSampleRate); + printf("---------------------------------------------\n"); + } + } + printf("=============================================\n\n"); + + Pa_Terminate(); +} + +int select_mic_device() { + int selected_id = -1; + printf("👉 请输入你要使用的麦克风设备ID(比如苹果耳机对应的ID):"); + std::cin >> selected_id; + + // 验证设备ID有效性 + PaError err = Pa_Initialize(); + if (err != paNoError) { + fprintf(stderr, "❌ PortAudio初始化失败: %s\n", Pa_GetErrorText(err)); + return -1; + } + + int numDevices = Pa_GetDeviceCount(); + if (selected_id < 0 || selected_id >= numDevices) { + fprintf(stderr, "❌ 设备ID无效!请输入列表中的有效ID\n"); + Pa_Terminate(); + return -1; + } + + const PaDeviceInfo* pInfo = Pa_GetDeviceInfo(selected_id); + if (pInfo->maxInputChannels == 0) { + fprintf(stderr, "❌ 选择的设备不是麦克风(无输入声道)!\n"); + Pa_Terminate(); + return -1; + } + + printf("\n✅ 已选择麦克风:\n"); + printf(" ID: %d | 名称: %s\n", selected_id, pInfo->name); + printf(" 采样率: %.1f Hz | 声道数: %d\n\n", + pInfo->defaultSampleRate, pInfo->maxInputChannels); + + Pa_Terminate(); + return selected_id; +} + +// ====================== 2. 音频采集函数(纯PortAudio原生实现) ====================== +int audio_record(short* buffer, int buffer_size, int sample_rate, int channels, int max_seconds, int device_id) { + PaError err; + PaStream* stream; + PaStreamParameters input_params; + + // 初始化PortAudio + err = Pa_Initialize(); + if (err != paNoError) { + fprintf(stderr, "❌ PortAudio初始化失败: %s\n", Pa_GetErrorText(err)); + return -1; + } + + // 配置输入参数(指定麦克风设备ID) + input_params.device = device_id; + input_params.channelCount = channels; + input_params.sampleFormat = paInt16; // 16位深(Whisper要求) + input_params.suggestedLatency = Pa_GetDeviceInfo(device_id)->defaultLowInputLatency; + input_params.hostApiSpecificStreamInfo = NULL; + + // 打开音频流 + err = Pa_OpenStream( + &stream, + &input_params, + NULL, // 无输出 + sample_rate, + 1024, // 缓冲区大小 + paClipOff, // 关闭裁剪 + NULL, // 无回调 + NULL + ); + + if (err != paNoError) { + fprintf(stderr, "❌ 打开音频流失败: %s\n", Pa_GetErrorText(err)); + Pa_Terminate(); + return -1; + } + + // 开始录制 + err = Pa_StartStream(stream); + if (err != paNoError) { + fprintf(stderr, "❌ 开始录制失败: %s\n", Pa_GetErrorText(err)); + Pa_CloseStream(stream); + Pa_Terminate(); + return -1; + } + + printf("🎙️ 录制中(按回车键停止,最长%d秒)...\n", max_seconds); + int total_samples = 0; + time_t start_time = time(NULL); + + // 录制逻辑:要么按回车停止,要么超时停止 + while (1) { + // 读取音频数据 + int samples_to_read = buffer_size - total_samples; + if (samples_to_read <= 0) break; + + err = Pa_ReadStream(stream, buffer + total_samples, 1024); + if (err != paNoError) { + fprintf(stderr, "❌ 读取音频失败: %s\n", Pa_GetErrorText(err)); + break; + } + + total_samples += 1024; + + // 超时检查(max_seconds秒) + if (difftime(time(NULL), start_time) >= max_seconds) { + printf("\n⏰ 录制超时(%d秒),自动停止\n", max_seconds); + break; + } + + // 检查是否按了回车 + if (std::cin.rdbuf()->in_avail() > 0) { + getchar(); + printf("\n🛑 用户停止录制\n"); + break; + } + } + + // 停止录制 + Pa_StopStream(stream); + Pa_CloseStream(stream); + Pa_Terminate(); + + return total_samples; +} + +// ====================== 3. 新增:short转float(Whisper要求) ====================== +void convert_short_to_float(const short* src, float* dst, int count) { + // 16位short的范围是[-32768, 32767],归一化到float的[-1.0, 1.0] + for (int i = 0; i < count; i++) { + dst[i] = static_cast(src[i]) / 32768.0f; + } +} + +// ====================== 4. 主函数(修正数据类型转换) ====================== +int main(int argc, char **argv) { + // 检查参数 + if (argc < 2) { + fprintf(stderr, "用法: %s 模型文件路径(如 ./models/ggml-medium.bin)\n", argv[0]); + return 1; + } + const char* model_path = argv[1]; + + // 步骤1:枚举并选择麦克风 + enumerate_audio_devices(); + int mic_device_id = select_mic_device(); + if (mic_device_id < 0) { + fprintf(stderr, "❌ 麦克风选择失败,程序退出\n"); + return 1; + } + + // 步骤2:GPU加速配置说明 + printf("\n🔍 GPU加速配置说明...\n"); + printf(" 当前已启用GPU加速(use_gpu = true)\n"); + printf(" ✅ 如果编译时链接了CUDA库,模型会自动使用GPU\n"); + printf(" ❌ 如果识别速度很慢,说明实际使用CPU运行\n"); + printf(" 验证方法:观察识别耗时,GPU版本比CPU快5-10倍\n\n"); + + // 步骤3:加载Whisper模型(启用GPU) + printf("🚀 正在加载模型:%s\n", model_path); + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = true; + cparams.gpu_device = 0; + + struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); + if (!ctx) { + fprintf(stderr, "❌ 加载模型失败: %s\n", model_path); + return 1; + } + + // 打印模型信息 + whisper_print_system_info(); + printf("✅ 模型加载成功!\n"); + printf(" 📌 若识别速度快(几秒内完成)= GPU运行\n"); + printf(" 📌 若识别速度慢(十几秒/分钟)= CPU运行\n"); + printf("=============================================\n"); + printf("🎤 语音识别程序(指定麦克风版)\n"); + printf("操作说明:\n"); + printf(" 1. 按下【回车键】开始录制\n"); + printf(" 2. 说话完成后,再次按下【回车键】停止录制并识别\n"); + printf(" 3. 录制超过30秒会自动停止\n"); + printf(" 4. Ctrl+C 退出程序\n"); + printf("=============================================\n\n"); + + // 步骤4:准备音频缓冲区 + const int sample_rate = 16000; // Whisper标准采样率 + const int channels = 1; // 单声道 + const int max_seconds = 30; // 最长录制30秒 + const int buffer_size = sample_rate * channels * max_seconds; + + // 原始音频缓冲区(short类型) + short* buffer_short = (short*)malloc(buffer_size * sizeof(short)); + // Whisper输入缓冲区(float类型) + float* buffer_float = (float*)malloc(buffer_size * sizeof(float)); + + if (!buffer_short || !buffer_float) { + fprintf(stderr, "❌ 分配音频缓冲区失败\n"); + free(buffer_short); + free(buffer_float); + whisper_free(ctx); + return 1; + } + + // 步骤5:等待用户开始录制 + printf("👉 按下回车键开始录制...\n"); + getchar(); + + // 步骤6:录制音频(指定选择的麦克风) + int samples_read = audio_record(buffer_short, buffer_size, sample_rate, channels, max_seconds, mic_device_id); + if (samples_read <= 0) { + fprintf(stderr, "❌ 录制音频失败\n"); + free(buffer_short); + free(buffer_float); + whisper_free(ctx); + return 1; + } + + // 步骤7:关键修正:short转float(Whisper要求) + convert_short_to_float(buffer_short, buffer_float, samples_read); + + // 步骤8:语音识别(传入float缓冲区) + printf("\n🔍 正在识别...\n"); + clock_t start = clock(); + + struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; // 中文识别 + wparams.translate = false; + wparams.print_special = false; + wparams.print_progress = false; + wparams.print_realtime = false; + wparams.print_timestamps = false; + + // 传入float类型的buffer_float,而非short类型的buffer_short + if (whisper_full(ctx, wparams, buffer_float, samples_read) != 0) { + fprintf(stderr, "❌ 识别音频失败\n"); + free(buffer_short); + free(buffer_float); + whisper_free(ctx); + return 1; + } + + // 步骤9:输出结果 + clock_t end = clock(); + double elapsed = (double)(end - start) / CLOCKS_PER_SEC; + printf("⏱️ 识别耗时:%.2f 秒\n", elapsed); + + if (elapsed < 5.0) { + printf(" 🎯 识别速度快,应该是GPU在运行!\n"); + } else { + printf(" ⚠️ 识别速度慢,当前使用CPU运行(需编译CUDA版本)\n"); + } + + printf("📝 识别结果:\n "); + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; i++) { + const char* text = whisper_full_get_segment_text(ctx, i); + printf("%s\n ", text); + } + printf("\n"); + + // 步骤10:清理资源 + free(buffer_short); + free(buffer_float); + whisper_free(ctx); + + return 0; +} diff --git a/download.txt b/download.txt new file mode 100644 index 00000000000..38cd46030e2 --- /dev/null +++ b/download.txt @@ -0,0 +1,2 @@ +export HF_ENDPOINT=https://hf-mirror.com +hf download ggerganov/whisper.cpp ggml-medium.bin --local-dir ./models diff --git a/minimal_mic.cpp b/minimal_mic.cpp new file mode 100644 index 00000000000..de03ee67fb0 --- /dev/null +++ b/minimal_mic.cpp @@ -0,0 +1,82 @@ +#include "whisper.h" +#include "common.h" + +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" + +#include +#include +#include + +// 音频回调:将采集到的数据存入 buffer +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + std::vector* pBuffer = (std::vector*)pDevice->pUserData; + const float* pInputFloat = (const float*)pInput; + if (pInputFloat == NULL) return; + + pBuffer->insert(pBuffer->end(), pInputFloat, pInputFloat + frameCount); + // 保持 buffer 在最近 10 秒以内,防止内存溢出 + if (pBuffer->size() > 16000 * 10) { + pBuffer->erase(pBuffer->begin(), pBuffer->begin() + (pBuffer->size() - 16000 * 10)); + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const char* model_path = argv[1]; + + // 1. 初始化 Whisper + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = true; // 你的 4050 显卡 + struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); + if (!ctx) return 1; + + // 2. 初始化 Miniaudio + std::vector audio_buffer; + ma_device_config deviceConfig = ma_device_config_init(ma_device_type_capture); + deviceConfig.capture.format = ma_format_f32; // Whisper 需要 float32 + deviceConfig.capture.channels = 1; // 单声道 + deviceConfig.sampleRate = 16000; // Whisper 硬指标 16kHz + deviceConfig.dataCallback = data_callback; + deviceConfig.pUserData = &audio_buffer; + + ma_device device; + if (ma_device_init(NULL, &deviceConfig, &device) != MA_SUCCESS) { + fprintf(stderr, "Failed to open capture device.\n"); + return -2; + } + + ma_device_start(&device); + printf("🎤 录音中... 请说话 (按回车键进行单次识别,Ctrl+C 退出)\n"); + + while (true) { + getchar(); // 等待用户敲回车触发识别 + + printf("正在识别...\n"); + + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + wparams.n_threads = 12; + wparams.print_progress = false; + + if (whisper_full(ctx, wparams, audio_buffer.data(), audio_buffer.size()) != 0) { + fprintf(stderr, "识别失败\n"); + continue; + } + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char* text = whisper_full_get_segment_text(ctx, i); + printf("📝: %s\n", text); + } + audio_buffer.clear(); // 清空,准备下一轮 + } + + ma_device_uninit(&device); + whisper_free(ctx); + return 0; +} diff --git a/run.txt b/run.txt new file mode 100644 index 00000000000..43c761ff1ee --- /dev/null +++ b/run.txt @@ -0,0 +1,2 @@ +export LD_LIBRARY_PATH=./build/src +./minimal_mic ./models/ggml-small.bin From 91958f1b0a6a45a63d9f83d5cff5d56a94e4ab6d Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 15:23:42 +0800 Subject: [PATCH 02/29] very good result with one issue of 30seconds not reached and trunk --- compile.txt | 2 + doubao_mic.cpp | 463 +++++++++++++++++++++++-------------------------- 2 files changed, 223 insertions(+), 242 deletions(-) diff --git a/compile.txt b/compile.txt index 007e59390b2..cd623631446 100644 --- a/compile.txt +++ b/compile.txt @@ -3,3 +3,5 @@ g++ -O3 minimal_mic.cpp \ ./build/src/libwhisper.so \ -L/usr/local/cuda/lib64 -lcudart -lcublas \ -lpthread -ldl -lm -lrt -o minimal_mic + +g++ -O3 doubao_mic.cpp -I. -I./include -I./ggml/include -I./examples ./build_gpu/src/libwhisper.so -L/usr/local/cuda/lib64 -lcudart -lcublas -lportaudio -lpthread -ldl -lm -lrt -o doubao_mic.exe diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 1c1946d9431..332bb20e9e4 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -1,299 +1,278 @@ #include "whisper.h" -#include +#include "common.h" + +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" + +#include #include +#include +#include +#include +#include +#include #include +#include #include -#include -#include -#include -#include - -// ====================== 1. 枚举并选择麦克风设备(纯PortAudio原生实现) ====================== -void enumerate_audio_devices() { - PaError err = Pa_Initialize(); - if (err != paNoError) { - fprintf(stderr, "❌ PortAudio初始化失败: %s\n", Pa_GetErrorText(err)); - return; - } - - int numDevices = Pa_GetDeviceCount(); - printf("\n📜 系统可用麦克风设备列表:\n"); - printf("=============================================\n"); - for (int i = 0; i < numDevices; i++) { - const PaDeviceInfo* pInfo = Pa_GetDeviceInfo(i); - // 只显示输入设备(麦克风,至少1个输入声道) - if (pInfo->maxInputChannels > 0) { - printf("🔧 设备ID: %d | 名称: %s\n", i, pInfo->name); - printf(" 最大输入声道: %d | 默认采样率: %.1f Hz\n", - pInfo->maxInputChannels, pInfo->defaultSampleRate); - printf("---------------------------------------------\n"); - } +#include // 关键:补充缺失的mutex头文件 + +// 全局原子变量(线程安全) +std::atomic is_recording(false); +std::atomic exit_program(false); +// 音频缓冲区(加锁保护,避免多线程冲突) +std::vector audio_buffer; +std::mutex buffer_mutex; // 现在有头文件支持,不会报错 + +// 信号处理:Ctrl+C 优雅退出 +void signal_handler(int sig) { + if (sig == SIGINT) { + printf("\n\n🛑 收到退出信号,正在清理资源...\n"); + exit_program.store(true); + is_recording.store(false); + exit(0); } - printf("=============================================\n\n"); - - Pa_Terminate(); } -int select_mic_device() { - int selected_id = -1; - printf("👉 请输入你要使用的麦克风设备ID(比如苹果耳机对应的ID):"); - std::cin >> selected_id; - - // 验证设备ID有效性 - PaError err = Pa_Initialize(); - if (err != paNoError) { - fprintf(stderr, "❌ PortAudio初始化失败: %s\n", Pa_GetErrorText(err)); - return -1; - } +// 音频回调(旧版 miniaudio 兼容) +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + if (!is_recording.load() || pInput == NULL) return; - int numDevices = Pa_GetDeviceCount(); - if (selected_id < 0 || selected_id >= numDevices) { - fprintf(stderr, "❌ 设备ID无效!请输入列表中的有效ID\n"); - Pa_Terminate(); - return -1; - } + const float* pInputFloat = (const float*)pInput; + if (pInputFloat == NULL) return; - const PaDeviceInfo* pInfo = Pa_GetDeviceInfo(selected_id); - if (pInfo->maxInputChannels == 0) { - fprintf(stderr, "❌ 选择的设备不是麦克风(无输入声道)!\n"); - Pa_Terminate(); - return -1; + // 加锁操作缓冲区(避免主线程/回调线程冲突) + std::lock_guard lock(buffer_mutex); + + // 限制最大录制时长 30 秒(16000Hz) + const size_t max_frames = 16000 * 30; + const size_t available = max_frames - audio_buffer.size(); + if (available == 0) { + is_recording.store(false); + return; } - printf("\n✅ 已选择麦克风:\n"); - printf(" ID: %d | 名称: %s\n", selected_id, pInfo->name); - printf(" 采样率: %.1f Hz | 声道数: %d\n\n", - pInfo->defaultSampleRate, pInfo->maxInputChannels); - - Pa_Terminate(); - return selected_id; + const size_t copy_frames = (frameCount > available) ? available : frameCount; + audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + copy_frames); } -// ====================== 2. 音频采集函数(纯PortAudio原生实现) ====================== -int audio_record(short* buffer, int buffer_size, int sample_rate, int channels, int max_seconds, int device_id) { - PaError err; - PaStream* stream; - PaStreamParameters input_params; - - // 初始化PortAudio - err = Pa_Initialize(); - if (err != paNoError) { - fprintf(stderr, "❌ PortAudio初始化失败: %s\n", Pa_GetErrorText(err)); - return -1; - } +// 列出系统音频设备(兼容旧版 API) +void list_audio_devices(ma_context& context, ma_device_info** pCaptureInfos, ma_uint32& captureCount) { + printf("\n📜 系统可用麦克风设备列表:\n"); + printf("=============================================\n"); - // 配置输入参数(指定麦克风设备ID) - input_params.device = device_id; - input_params.channelCount = channels; - input_params.sampleFormat = paInt16; // 16位深(Whisper要求) - input_params.suggestedLatency = Pa_GetDeviceInfo(device_id)->defaultLowInputLatency; - input_params.hostApiSpecificStreamInfo = NULL; - - // 打开音频流 - err = Pa_OpenStream( - &stream, - &input_params, - NULL, // 无输出 - sample_rate, - 1024, // 缓冲区大小 - paClipOff, // 关闭裁剪 - NULL, // 无回调 - NULL - ); - - if (err != paNoError) { - fprintf(stderr, "❌ 打开音频流失败: %s\n", Pa_GetErrorText(err)); - Pa_Terminate(); - return -1; + ma_result result = ma_context_get_devices(&context, NULL, NULL, pCaptureInfos, &captureCount); + if (result != MA_SUCCESS) { + fprintf(stderr, "❌ 获取设备列表失败,使用默认设备\n"); + *pCaptureInfos = NULL; + captureCount = 0; + return; } - // 开始录制 - err = Pa_StartStream(stream); - if (err != paNoError) { - fprintf(stderr, "❌ 开始录制失败: %s\n", Pa_GetErrorText(err)); - Pa_CloseStream(stream); - Pa_Terminate(); - return -1; + for (ma_uint32 i = 0; i < captureCount; ++i) { + printf("🔧 设备ID: %u | 名称: %s\n", i, (*pCaptureInfos)[i].name); + printf(" 声道数: 1 | 采样率: 16000 Hz\n"); // 固定 16000Hz 避免采样率冲突 + printf("---------------------------------------------\n"); } + printf("=============================================\n"); +} - printf("🎙️ 录制中(按回车键停止,最长%d秒)...\n", max_seconds); - int total_samples = 0; - time_t start_time = time(NULL); - - // 录制逻辑:要么按回车停止,要么超时停止 - while (1) { - // 读取音频数据 - int samples_to_read = buffer_size - total_samples; - if (samples_to_read <= 0) break; - - err = Pa_ReadStream(stream, buffer + total_samples, 1024); - if (err != paNoError) { - fprintf(stderr, "❌ 读取音频失败: %s\n", Pa_GetErrorText(err)); - break; - } - - total_samples += 1024; - - // 超时检查(max_seconds秒) - if (difftime(time(NULL), start_time) >= max_seconds) { - printf("\n⏰ 录制超时(%d秒),自动停止\n", max_seconds); - break; - } - - // 检查是否按了回车 - if (std::cin.rdbuf()->in_avail() > 0) { - getchar(); - printf("\n🛑 用户停止录制\n"); - break; - } - } - - // 停止录制 - Pa_StopStream(stream); - Pa_CloseStream(stream); - Pa_Terminate(); - - return total_samples; +// 提示信息 +void print_usage() { + printf("=============================================\n"); + printf("🎤 语音识别程序(旧版兼容)\n"); + printf("操作说明:\n"); + printf(" 1. 按下【回车键】开始录制\n"); + printf(" 2. 说话完成后按回车停止录制并识别\n"); + printf(" 3. 录制超过30秒自动停止\n"); + printf(" 4. Ctrl+C 退出程序\n"); + printf("=============================================\n"); } -// ====================== 3. 新增:short转float(Whisper要求) ====================== -void convert_short_to_float(const short* src, float* dst, int count) { - // 16位short的范围是[-32768, 32767],归一化到float的[-1.0, 1.0] - for (int i = 0; i < count; i++) { - dst[i] = static_cast(src[i]) / 32768.0f; - } +// GPU 状态提示(兼容旧版) +void check_gpu_status() { + printf("🔍 GPU加速配置说明:\n"); + printf(" ❌ 若识别速度慢,说明使用CPU运行\n"); + printf(" ✅ 启用GPU:重新编译whisper.cpp时添加 -DWHISPER_CUDA=ON\n"); } -// ====================== 4. 主函数(修正数据类型转换) ====================== -int main(int argc, char **argv) { - // 检查参数 +int main(int argc, char** argv) { + // 注册信号处理 + signal(SIGINT, signal_handler); + if (argc < 2) { - fprintf(stderr, "用法: %s 模型文件路径(如 ./models/ggml-medium.bin)\n", argv[0]); + fprintf(stderr, "Usage: %s \n", argv[0]); return 1; } const char* model_path = argv[1]; - // 步骤1:枚举并选择麦克风 - enumerate_audio_devices(); - int mic_device_id = select_mic_device(); - if (mic_device_id < 0) { - fprintf(stderr, "❌ 麦克风选择失败,程序退出\n"); + // 1. 初始化音频上下文(旧版兼容) + ma_context context; + if (ma_context_init(NULL, 0, NULL, &context) != MA_SUCCESS) { + fprintf(stderr, "❌ 初始化音频上下文失败\n"); return 1; } - // 步骤2:GPU加速配置说明 - printf("\n🔍 GPU加速配置说明...\n"); - printf(" 当前已启用GPU加速(use_gpu = true)\n"); - printf(" ✅ 如果编译时链接了CUDA库,模型会自动使用GPU\n"); - printf(" ❌ 如果识别速度很慢,说明实际使用CPU运行\n"); - printf(" 验证方法:观察识别耗时,GPU版本比CPU快5-10倍\n\n"); + // 2. 枚举麦克风设备 + ma_device_info* pCaptureInfos = NULL; + ma_uint32 captureCount = 0; + list_audio_devices(context, &pCaptureInfos, captureCount); + + // 3. 选择麦克风设备 + ma_uint32 device_id = 0; + if (captureCount > 0) { + printf("\n👉 请输入要使用的麦克风设备ID:"); + if (scanf("%u", &device_id) != 1 || device_id >= captureCount) { + fprintf(stderr, "❌ 输入无效,使用默认设备ID 0\n"); + device_id = 0; + } + // 清空输入缓冲区 + while (getchar() != '\n'); + } - // 步骤3:加载Whisper模型(启用GPU) - printf("🚀 正在加载模型:%s\n", model_path); + // 4. 初始化 Whisper 模型 struct whisper_context_params cparams = whisper_context_default_params(); cparams.use_gpu = true; - cparams.gpu_device = 0; + printf("\n🚀 正在加载模型:%s\n", model_path); struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); if (!ctx) { - fprintf(stderr, "❌ 加载模型失败: %s\n", model_path); + fprintf(stderr, "❌ 初始化Whisper模型失败\n"); + ma_context_uninit(&context); return 1; } - // 打印模型信息 - whisper_print_system_info(); + // GPU 状态提示 + check_gpu_status(); printf("✅ 模型加载成功!\n"); - printf(" 📌 若识别速度快(几秒内完成)= GPU运行\n"); - printf(" 📌 若识别速度慢(十几秒/分钟)= CPU运行\n"); - printf("=============================================\n"); - printf("🎤 语音识别程序(指定麦克风版)\n"); - printf("操作说明:\n"); - printf(" 1. 按下【回车键】开始录制\n"); - printf(" 2. 说话完成后,再次按下【回车键】停止录制并识别\n"); - printf(" 3. 录制超过30秒会自动停止\n"); - printf(" 4. Ctrl+C 退出程序\n"); - printf("=============================================\n\n"); - // 步骤4:准备音频缓冲区 - const int sample_rate = 16000; // Whisper标准采样率 - const int channels = 1; // 单声道 - const int max_seconds = 30; // 最长录制30秒 - const int buffer_size = sample_rate * channels * max_seconds; - - // 原始音频缓冲区(short类型) - short* buffer_short = (short*)malloc(buffer_size * sizeof(short)); - // Whisper输入缓冲区(float类型) - float* buffer_float = (float*)malloc(buffer_size * sizeof(float)); - - if (!buffer_short || !buffer_float) { - fprintf(stderr, "❌ 分配音频缓冲区失败\n"); - free(buffer_short); - free(buffer_float); - whisper_free(ctx); - return 1; + // 5. 初始化录音设备(旧版 miniaudio 核心兼容) + ma_device_config deviceConfig = ma_device_config_init(ma_device_type_capture); + deviceConfig.capture.format = ma_format_f32; // Whisper 要求 float32 + deviceConfig.capture.channels = 1; // 单声道 + deviceConfig.sampleRate = 16000; // 固定 16000Hz 避免采样率错误 + deviceConfig.dataCallback = data_callback; // 回调函数 + deviceConfig.pUserData = NULL; + + // 指定选中的麦克风设备(旧版用 pDeviceID) + if (captureCount > 0 && pCaptureInfos != NULL) { + deviceConfig.capture.pDeviceID = &pCaptureInfos[device_id].id; + printf("\n✅ 已选择麦克风:%s\n", pCaptureInfos[device_id].name); + } else { + printf("\n✅ 使用默认麦克风设备\n"); } - // 步骤5:等待用户开始录制 - printf("👉 按下回车键开始录制...\n"); - getchar(); - - // 步骤6:录制音频(指定选择的麦克风) - int samples_read = audio_record(buffer_short, buffer_size, sample_rate, channels, max_seconds, mic_device_id); - if (samples_read <= 0) { - fprintf(stderr, "❌ 录制音频失败\n"); - free(buffer_short); - free(buffer_float); + ma_device device; + if (ma_device_init(&context, &deviceConfig, &device) != MA_SUCCESS) { + fprintf(stderr, "❌ 打开录音设备失败\n"); whisper_free(ctx); + ma_context_uninit(&context); return 1; } - // 步骤7:关键修正:short转float(Whisper要求) - convert_short_to_float(buffer_short, buffer_float, samples_read); - - // 步骤8:语音识别(传入float缓冲区) - printf("\n🔍 正在识别...\n"); - clock_t start = clock(); - - struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); - wparams.language = "zh"; // 中文识别 - wparams.translate = false; - wparams.print_special = false; - wparams.print_progress = false; - wparams.print_realtime = false; - wparams.print_timestamps = false; - - // 传入float类型的buffer_float,而非short类型的buffer_short - if (whisper_full(ctx, wparams, buffer_float, samples_read) != 0) { - fprintf(stderr, "❌ 识别音频失败\n"); - free(buffer_short); - free(buffer_float); + // 启动录音设备(仅初始化,不采集数据) + if (ma_device_start(&device) != MA_SUCCESS) { + fprintf(stderr, "❌ 启动录音设备失败\n"); + ma_device_uninit(&device); whisper_free(ctx); + ma_context_uninit(&context); return 1; } - // 步骤9:输出结果 - clock_t end = clock(); - double elapsed = (double)(end - start) / CLOCKS_PER_SEC; - printf("⏱️ 识别耗时:%.2f 秒\n", elapsed); - - if (elapsed < 5.0) { - printf(" 🎯 识别速度快,应该是GPU在运行!\n"); - } else { - printf(" ⚠️ 识别速度慢,当前使用CPU运行(需编译CUDA版本)\n"); - } + print_usage(); + + // 主循环 + while (!exit_program.load()) { + // 等待用户按回车开始录制 + printf("\n👉 按下回车键开始录制...\n"); + getchar(); + + if (exit_program.load()) break; + + // 重置录制状态 + is_recording.store(true); + { + std::lock_guard lock(buffer_mutex); + audio_buffer.clear(); + } + printf("🎙️ 正在录制(按回车停止,最长30秒)...\n"); + + // 等待用户停止录制(子线程监听回车) + std::thread wait_thread([&]() { + getchar(); + is_recording.store(false); + }); + + // 超时控制(30秒) + auto start_time = std::chrono::steady_clock::now(); + while (is_recording.load() && !exit_program.load()) { + auto duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_time).count(); + + if (duration >= 30) { + printf("⏱️ 录制超时,自动停止\n"); + is_recording.store(false); + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + wait_thread.join(); + is_recording.store(false); - printf("📝 识别结果:\n "); - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; i++) { - const char* text = whisper_full_get_segment_text(ctx, i); - printf("%s\n ", text); + if (exit_program.load()) break; + + // 检查录制数据 + std::vector captured_audio; + { + std::lock_guard lock(buffer_mutex); + captured_audio = audio_buffer; // 拷贝数据避免锁冲突 + } + + if (captured_audio.empty()) { + printf("⚠️ 未采集到音频数据,请重新录制\n"); + continue; + } + + // 开始识别 + printf("🔍 正在识别(音频长度:%.2f秒)...\n", (float)captured_audio.size() / 16000); + auto recognize_start = std::chrono::steady_clock::now(); + + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + wparams.n_threads = std::max(1, (int)std::thread::hardware_concurrency()); + wparams.print_progress = false; + wparams.print_realtime = false; + wparams.temperature = 0.0; + wparams.max_len = 0; + wparams.translate = false; + wparams.no_context = true; + + if (whisper_full(ctx, wparams, captured_audio.data(), captured_audio.size()) != 0) { + fprintf(stderr, "❌ 识别失败\n"); + continue; + } + + // 输出识别结果 + auto recognize_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - recognize_start).count(); + printf("⏱️ 识别耗时:%.2f 秒\n", recognize_duration / 1000.0); + + const int n_segments = whisper_full_n_segments(ctx); + if (n_segments == 0) { + printf("📝 未识别到有效内容\n"); + } else { + printf("📝 识别结果:\n"); + for (int i = 0; i < n_segments; ++i) { + const char* text = whisper_full_get_segment_text(ctx, i); + printf(" %s\n", text); + } + } } - printf("\n"); - // 步骤10:清理资源 - free(buffer_short); - free(buffer_float); + // 清理资源 + ma_device_uninit(&device); + ma_context_uninit(&context); whisper_free(ctx); - + printf("✅ 资源清理完成,程序退出\n"); return 0; } From 4c21543bac9e343130ef210fe643c39ae4a66ea3 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 15:30:24 +0800 Subject: [PATCH 03/29] try to solve 30 seconds issue, now 60 seconds, but last few words lost before key press of enter --- doubao_mic.cpp | 151 +++++++++++++++++++++++++++++++------------------ 1 file changed, 96 insertions(+), 55 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 332bb20e9e4..f61e40d690d 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -14,14 +14,17 @@ #include #include #include -#include // 关键:补充缺失的mutex头文件 +#include // 全局原子变量(线程安全) std::atomic is_recording(false); std::atomic exit_program(false); -// 音频缓冲区(加锁保护,避免多线程冲突) +std::atomic recorded_seconds(0); // 实时录制时长 +// 音频缓冲区(加锁保护) std::vector audio_buffer; -std::mutex buffer_mutex; // 现在有头文件支持,不会报错 +std::mutex buffer_mutex; +// 可选超时(默认60秒,可自定义) +const int RECORD_TIMEOUT = 60; // 延长到60秒,也可设为0取消超时 // 信号处理:Ctrl+C 优雅退出 void signal_handler(int sig) { @@ -33,29 +36,40 @@ void signal_handler(int sig) { } } -// 音频回调(旧版 miniaudio 兼容) +// 音频回调(取消30秒帧上限) void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { if (!is_recording.load() || pInput == NULL) return; const float* pInputFloat = (const float*)pInput; if (pInputFloat == NULL) return; - // 加锁操作缓冲区(避免主线程/回调线程冲突) std::lock_guard lock(buffer_mutex); - - // 限制最大录制时长 30 秒(16000Hz) - const size_t max_frames = 16000 * 30; - const size_t available = max_frames - audio_buffer.size(); - if (available == 0) { - is_recording.store(false); - return; + // 取消固定帧上限,仅保留内存保护(可选) + const size_t max_memory = 16000 * 120; // 最多120秒(约200MB内存) + if (audio_buffer.size() < max_memory) { + audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + frameCount); + // 更新实时录制时长 + recorded_seconds.store(audio_buffer.size() / 16000); } +} - const size_t copy_frames = (frameCount > available) ? available : frameCount; - audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + copy_frames); +// 静音检测(裁剪无效音频,减少识别量) +int trim_silence(const float* audio_data, int audio_len, float threshold = 0.001f) { + // 跳过开头静音 + int start = 0; + while (start < audio_len && fabs(audio_data[start]) < threshold) { + start++; + } + // 跳过结尾静音 + int end = audio_len - 1; + while (end > start && fabs(audio_data[end]) < threshold) { + end--; + } + // 返回有效音频长度(至少保留1秒) + return std::max(end - start + 1, 16000); } -// 列出系统音频设备(兼容旧版 API) +// 列出系统音频设备 void list_audio_devices(ma_context& context, ma_device_info** pCaptureInfos, ma_uint32& captureCount) { printf("\n📜 系统可用麦克风设备列表:\n"); printf("=============================================\n"); @@ -70,33 +84,37 @@ void list_audio_devices(ma_context& context, ma_device_info** pCaptureInfos, ma_ for (ma_uint32 i = 0; i < captureCount; ++i) { printf("🔧 设备ID: %u | 名称: %s\n", i, (*pCaptureInfos)[i].name); - printf(" 声道数: 1 | 采样率: 16000 Hz\n"); // 固定 16000Hz 避免采样率冲突 + printf(" 声道数: 1 | 采样率: 16000 Hz\n"); printf("---------------------------------------------\n"); } printf("=============================================\n"); } -// 提示信息 +// 提示信息(修复printf多参数问题) void print_usage() { printf("=============================================\n"); - printf("🎤 语音识别程序(旧版兼容)\n"); + printf("🎤 语音识别程序(CPU优化版)\n"); printf("操作说明:\n"); printf(" 1. 按下【回车键】开始录制\n"); printf(" 2. 说话完成后按回车停止录制并识别\n"); - printf(" 3. 录制超过30秒自动停止\n"); - printf(" 4. Ctrl+C 退出程序\n"); - printf("=============================================\n"); + printf(" 3. 录制超过%d秒自动停止(可自定义)\n", RECORD_TIMEOUT); + printf(" 4. 录制中实时显示时长:【录制中... X秒】\n"); + printf(" 5. Ctrl+C 退出程序\n"); + printf("=============================================\n"); // 移除多余的RECORD_TIMEOUT参数 } -// GPU 状态提示(兼容旧版) -void check_gpu_status() { - printf("🔍 GPU加速配置说明:\n"); - printf(" ❌ 若识别速度慢,说明使用CPU运行\n"); - printf(" ✅ 启用GPU:重新编译whisper.cpp时添加 -DWHISPER_CUDA=ON\n"); +// CPU优化提示 +void print_cpu_optimize_tips() { + printf("⚡ CPU优化配置说明:\n"); + printf(" ✅ 已启用多线程识别(自动适配CPU核心数)\n"); + printf(" ✅ 已启用静音裁剪(减少无效音频识别)\n"); + printf(" ✅ 已使用贪心采样(最快的识别策略)\n"); + printf(" 📌 模型优化:推荐使用 ggml-medium-q4_0.bin(量化版)\n"); + printf(" 📌 编译优化:已用 -O3 最高级优化\n"); + printf("=============================================\n"); } int main(int argc, char** argv) { - // 注册信号处理 signal(SIGINT, signal_handler); if (argc < 2) { @@ -105,7 +123,7 @@ int main(int argc, char** argv) { } const char* model_path = argv[1]; - // 1. 初始化音频上下文(旧版兼容) + // 1. 初始化音频上下文 ma_context context; if (ma_context_init(NULL, 0, NULL, &context) != MA_SUCCESS) { fprintf(stderr, "❌ 初始化音频上下文失败\n"); @@ -125,13 +143,13 @@ int main(int argc, char** argv) { fprintf(stderr, "❌ 输入无效,使用默认设备ID 0\n"); device_id = 0; } - // 清空输入缓冲区 - while (getchar() != '\n'); + while (getchar() != '\n'); // 清空输入缓冲区 } - // 4. 初始化 Whisper 模型 + // 4. 初始化 Whisper 模型(CPU优化,移除不存在的use_flash_attention) struct whisper_context_params cparams = whisper_context_default_params(); - cparams.use_gpu = true; + cparams.use_gpu = false; // 强制CPU(避免GPU检测开销) + // 移除 cparams.use_flash_attention = false; (旧版本无此成员) printf("\n🚀 正在加载模型:%s\n", model_path); struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); @@ -141,19 +159,18 @@ int main(int argc, char** argv) { return 1; } - // GPU 状态提示 - check_gpu_status(); + // 显示CPU优化提示 + print_cpu_optimize_tips(); printf("✅ 模型加载成功!\n"); - // 5. 初始化录音设备(旧版 miniaudio 核心兼容) + // 5. 初始化录音设备 ma_device_config deviceConfig = ma_device_config_init(ma_device_type_capture); - deviceConfig.capture.format = ma_format_f32; // Whisper 要求 float32 - deviceConfig.capture.channels = 1; // 单声道 - deviceConfig.sampleRate = 16000; // 固定 16000Hz 避免采样率错误 - deviceConfig.dataCallback = data_callback; // 回调函数 + deviceConfig.capture.format = ma_format_f32; + deviceConfig.capture.channels = 1; + deviceConfig.sampleRate = 16000; + deviceConfig.dataCallback = data_callback; deviceConfig.pUserData = NULL; - // 指定选中的麦克风设备(旧版用 pDeviceID) if (captureCount > 0 && pCaptureInfos != NULL) { deviceConfig.capture.pDeviceID = &pCaptureInfos[device_id].id; printf("\n✅ 已选择麦克风:%s\n", pCaptureInfos[device_id].name); @@ -169,7 +186,6 @@ int main(int argc, char** argv) { return 1; } - // 启动录音设备(仅初始化,不采集数据) if (ma_device_start(&device) != MA_SUCCESS) { fprintf(stderr, "❌ 启动录音设备失败\n"); ma_device_uninit(&device); @@ -182,7 +198,6 @@ int main(int argc, char** argv) { // 主循环 while (!exit_program.load()) { - // 等待用户按回车开始录制 printf("\n👉 按下回车键开始录制...\n"); getchar(); @@ -190,34 +205,49 @@ int main(int argc, char** argv) { // 重置录制状态 is_recording.store(true); + recorded_seconds.store(0); { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } - printf("🎙️ 正在录制(按回车停止,最长30秒)...\n"); + printf("🎙️ 正在录制(按回车停止,最长%d秒)...\n", RECORD_TIMEOUT); + + // 录制时长实时显示线程 + std::thread progress_thread([&]() { + while (is_recording.load() && !exit_program.load()) { + printf("\r📊 录制中... %d秒", recorded_seconds.load()); + fflush(stdout); // 强制刷新输出 + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + }); - // 等待用户停止录制(子线程监听回车) + // 等待用户停止录制(主线程监听,避免子线程输入阻塞) + std::atomic stop_record(false); std::thread wait_thread([&]() { getchar(); + stop_record.store(true); is_recording.store(false); }); - // 超时控制(30秒) + // 超时控制(可选) auto start_time = std::chrono::steady_clock::now(); - while (is_recording.load() && !exit_program.load()) { + while (!stop_record.load() && !exit_program.load()) { auto duration = std::chrono::duration_cast( std::chrono::steady_clock::now() - start_time).count(); - if (duration >= 30) { - printf("⏱️ 录制超时,自动停止\n"); + if (RECORD_TIMEOUT > 0 && duration >= RECORD_TIMEOUT) { + printf("\n⏱️ 录制超时(%d秒),自动停止\n", RECORD_TIMEOUT); is_recording.store(false); + stop_record.store(true); break; } std::this_thread::sleep_for(std::chrono::milliseconds(100)); } wait_thread.join(); + progress_thread.join(); is_recording.store(false); + printf("\n"); // 换行,清理进度显示 if (exit_program.load()) break; @@ -225,7 +255,7 @@ int main(int argc, char** argv) { std::vector captured_audio; { std::lock_guard lock(buffer_mutex); - captured_audio = audio_buffer; // 拷贝数据避免锁冲突 + captured_audio = audio_buffer; } if (captured_audio.empty()) { @@ -233,21 +263,30 @@ int main(int argc, char** argv) { continue; } - // 开始识别 - printf("🔍 正在识别(音频长度:%.2f秒)...\n", (float)captured_audio.size() / 16000); + // 优化1:静音裁剪(减少识别数据量) + int valid_len = trim_silence(captured_audio.data(), captured_audio.size()); + float valid_seconds = (float)valid_len / 16000; + printf("🔍 正在识别(有效音频长度:%.2f秒,原始:%.2f秒)...\n", + valid_seconds, (float)captured_audio.size() / 16000); + auto recognize_start = std::chrono::steady_clock::now(); + // 优化2:调整识别参数(CPU最优配置) whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); wparams.language = "zh"; - wparams.n_threads = std::max(1, (int)std::thread::hardware_concurrency()); + wparams.n_threads = std::max(2, (int)std::thread::hardware_concurrency()); // 至少2线程 wparams.print_progress = false; wparams.print_realtime = false; - wparams.temperature = 0.0; + wparams.temperature = 0.0; // 最快的温度设置 wparams.max_len = 0; wparams.translate = false; wparams.no_context = true; + wparams.single_segment = true; // 单段识别(更快) + wparams.print_special = false; // 不打印特殊字符 + wparams.token_timestamps = false; // 关闭时间戳(节省计算) - if (whisper_full(ctx, wparams, captured_audio.data(), captured_audio.size()) != 0) { + // 执行识别(仅识别有效音频) + if (whisper_full(ctx, wparams, captured_audio.data(), valid_len) != 0) { fprintf(stderr, "❌ 识别失败\n"); continue; } @@ -255,7 +294,9 @@ int main(int argc, char** argv) { // 输出识别结果 auto recognize_duration = std::chrono::duration_cast( std::chrono::steady_clock::now() - recognize_start).count(); - printf("⏱️ 识别耗时:%.2f 秒\n", recognize_duration / 1000.0); + float speed = valid_seconds / (recognize_duration / 1000.0); + printf("⏱️ 识别耗时:%.2f 秒 | 识别速度:%.2fx实时速度\n", + recognize_duration / 1000.0, speed); const int n_segments = whisper_full_n_segments(ctx); if (n_segments == 0) { From b6cd38c17b5dd047a8b62b7290c67327d790fcd9 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 15:47:37 +0800 Subject: [PATCH 04/29] doubao must fix timeout still need key pressing --- doubao_mic.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index f61e40d690d..bba0fcd9f13 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -24,7 +24,8 @@ std::atomic recorded_seconds(0); // 实时录制时长 std::vector audio_buffer; std::mutex buffer_mutex; // 可选超时(默认60秒,可自定义) -const int RECORD_TIMEOUT = 60; // 延长到60秒,也可设为0取消超时 +const int RECORD_TIMEOUT = 30; // 延长到60秒,也可设为0取消超时 +const int RECORD_FINISH_WAIT_MS = 5000; // 停止后等待1秒收尾 // 信号处理:Ctrl+C 优雅退出 void signal_handler(int sig) { @@ -32,6 +33,8 @@ void signal_handler(int sig) { printf("\n\n🛑 收到退出信号,正在清理资源...\n"); exit_program.store(true); is_recording.store(false); + // 等待收尾 + std::this_thread::sleep_for(std::chrono::milliseconds(RECORD_FINISH_WAIT_MS)); exit(0); } } @@ -100,7 +103,7 @@ void print_usage() { printf(" 3. 录制超过%d秒自动停止(可自定义)\n", RECORD_TIMEOUT); printf(" 4. 录制中实时显示时长:【录制中... X秒】\n"); printf(" 5. Ctrl+C 退出程序\n"); - printf("=============================================\n"); // 移除多余的RECORD_TIMEOUT参数 + printf("=============================================\n"); } // CPU优化提示 @@ -149,7 +152,6 @@ int main(int argc, char** argv) { // 4. 初始化 Whisper 模型(CPU优化,移除不存在的use_flash_attention) struct whisper_context_params cparams = whisper_context_default_params(); cparams.use_gpu = false; // 强制CPU(避免GPU检测开销) - // 移除 cparams.use_flash_attention = false; (旧版本无此成员) printf("\n🚀 正在加载模型:%s\n", model_path); struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); @@ -226,6 +228,7 @@ int main(int argc, char** argv) { std::thread wait_thread([&]() { getchar(); stop_record.store(true); + // 先标记停止,但不立即退出,给回调留时间 is_recording.store(false); }); @@ -245,9 +248,11 @@ int main(int argc, char** argv) { } wait_thread.join(); + // 核心修复:等待1秒让回调线程写完最后几帧音频 + printf("\n⏳ 正在收尾音频数据..."); + std::this_thread::sleep_for(std::chrono::milliseconds(RECORD_FINISH_WAIT_MS)); progress_thread.join(); - is_recording.store(false); - printf("\n"); // 换行,清理进度显示 + printf("完成\n"); if (exit_program.load()) break; From 8b98941d754e07c26ce2560f7948b2835b36d7c0 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 15:54:15 +0800 Subject: [PATCH 05/29] doubao still not fix timeout remaining voice and also introduce new issue recording without enter key pressed --- doubao_mic.cpp | 189 +++++++++++++++++++++++++++++-------------------- 1 file changed, 112 insertions(+), 77 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index bba0fcd9f13..d846b0000cb 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -15,6 +15,7 @@ #include #include #include +#include // 用于STDIN_FILENO和read // 全局原子变量(线程安全) std::atomic is_recording(false); @@ -23,9 +24,10 @@ std::atomic recorded_seconds(0); // 实时录制时长 // 音频缓冲区(加锁保护) std::vector audio_buffer; std::mutex buffer_mutex; -// 可选超时(默认60秒,可自定义) -const int RECORD_TIMEOUT = 30; // 延长到60秒,也可设为0取消超时 -const int RECORD_FINISH_WAIT_MS = 5000; // 停止后等待1秒收尾 +// 配置常量(可自定义) +const int RECORD_TIMEOUT = 30; // 超时时间(秒) +const int RECORD_FINISH_WAIT_MS = 5000; // 停止后收尾等待时间 +const bool AUTO_RECOGNIZE_ON_TIMEOUT = true; // 超时自动识别 // 信号处理:Ctrl+C 优雅退出 void signal_handler(int sig) { @@ -39,6 +41,27 @@ void signal_handler(int sig) { } } +// 非阻塞检查输入(解决超时后需按回车问题) +bool check_input_non_blocking() { + fd_set fds; + FD_ZERO(&fds); + FD_SET(STDIN_FILENO, &fds); + + struct timeval tv; + tv.tv_sec = 0; + tv.tv_usec = 100000; // 100ms超时 + + return select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv) > 0; +} + +// 清空输入缓冲区(避免残留回车) +void clear_input_buffer() { + while (check_input_non_blocking()) { + char c; + read(STDIN_FILENO, &c, 1); + } +} + // 音频回调(取消30秒帧上限) void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { if (!is_recording.load() || pInput == NULL) return; @@ -93,14 +116,14 @@ void list_audio_devices(ma_context& context, ma_device_info** pCaptureInfos, ma_ printf("=============================================\n"); } -// 提示信息(修复printf多参数问题) +// 提示信息 void print_usage() { printf("=============================================\n"); printf("🎤 语音识别程序(CPU优化版)\n"); printf("操作说明:\n"); printf(" 1. 按下【回车键】开始录制\n"); printf(" 2. 说话完成后按回车停止录制并识别\n"); - printf(" 3. 录制超过%d秒自动停止(可自定义)\n", RECORD_TIMEOUT); + printf(" 3. 录制超过%d秒自动停止并识别\n", RECORD_TIMEOUT); printf(" 4. 录制中实时显示时长:【录制中... X秒】\n"); printf(" 5. Ctrl+C 退出程序\n"); printf("=============================================\n"); @@ -117,6 +140,60 @@ void print_cpu_optimize_tips() { printf("=============================================\n"); } +// 核心识别函数(抽离复用) +void recognize_audio(struct whisper_context* ctx, const std::vector& audio_data) { + if (audio_data.empty()) { + printf("⚠️ 未采集到音频数据,跳过识别\n"); + return; + } + + // 静音裁剪 + int valid_len = trim_silence(audio_data.data(), audio_data.size()); + float valid_seconds = (float)valid_len / 16000; + printf("🔍 正在识别(有效音频长度:%.2f秒,原始:%.2f秒)...\n", + valid_seconds, (float)audio_data.size() / 16000); + + auto recognize_start = std::chrono::steady_clock::now(); + + // CPU最优识别参数 + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + wparams.n_threads = std::max(2, (int)std::thread::hardware_concurrency()); + wparams.print_progress = false; + wparams.print_realtime = false; + wparams.temperature = 0.0; + wparams.max_len = 0; + wparams.translate = false; + wparams.no_context = true; + wparams.single_segment = true; + wparams.print_special = false; + wparams.token_timestamps = false; + + // 执行识别 + if (whisper_full(ctx, wparams, audio_data.data(), valid_len) != 0) { + fprintf(stderr, "❌ 识别失败\n"); + return; + } + + // 输出结果 + auto recognize_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - recognize_start).count(); + float speed = valid_seconds / (recognize_duration / 1000.0); + printf("⏱️ 识别耗时:%.2f 秒 | 识别速度:%.2fx实时速度\n", + recognize_duration / 1000.0, speed); + + const int n_segments = whisper_full_n_segments(ctx); + if (n_segments == 0) { + printf("📝 未识别到有效内容\n"); + } else { + printf("📝 识别结果:\n"); + for (int i = 0; i < n_segments; ++i) { + const char* text = whisper_full_get_segment_text(ctx, i); + printf(" %s\n", text); + } + } +} + int main(int argc, char** argv) { signal(SIGINT, signal_handler); @@ -146,12 +223,12 @@ int main(int argc, char** argv) { fprintf(stderr, "❌ 输入无效,使用默认设备ID 0\n"); device_id = 0; } - while (getchar() != '\n'); // 清空输入缓冲区 + clear_input_buffer(); // 清空输入缓冲区 } - // 4. 初始化 Whisper 模型(CPU优化,移除不存在的use_flash_attention) + // 4. 初始化 Whisper 模型 struct whisper_context_params cparams = whisper_context_default_params(); - cparams.use_gpu = false; // 强制CPU(避免GPU检测开销) + cparams.use_gpu = false; // 强制CPU printf("\n🚀 正在加载模型:%s\n", model_path); struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); @@ -202,6 +279,7 @@ int main(int argc, char** argv) { while (!exit_program.load()) { printf("\n👉 按下回车键开始录制...\n"); getchar(); + clear_input_buffer(); // 清空残留输入 if (exit_program.load()) break; @@ -218,101 +296,58 @@ int main(int argc, char** argv) { std::thread progress_thread([&]() { while (is_recording.load() && !exit_program.load()) { printf("\r📊 录制中... %d秒", recorded_seconds.load()); - fflush(stdout); // 强制刷新输出 + fflush(stdout); std::this_thread::sleep_for(std::chrono::seconds(1)); } }); - // 等待用户停止录制(主线程监听,避免子线程输入阻塞) - std::atomic stop_record(false); - std::thread wait_thread([&]() { - getchar(); - stop_record.store(true); - // 先标记停止,但不立即退出,给回调留时间 - is_recording.store(false); - }); - - // 超时控制(可选) + bool is_timeout = false; auto start_time = std::chrono::steady_clock::now(); - while (!stop_record.load() && !exit_program.load()) { + + // 非阻塞监听输入 + 超时检测(核心修复) + while (is_recording.load() && !exit_program.load()) { + // 检查是否有回车输入(手动停止) + if (check_input_non_blocking()) { + char c; + read(STDIN_FILENO, &c, 1); + if (c == '\n') { // 只响应回车 + is_recording.store(false); + printf("\n🛑 已手动停止录制\n"); + break; + } + } + + // 检查超时 auto duration = std::chrono::duration_cast( std::chrono::steady_clock::now() - start_time).count(); - if (RECORD_TIMEOUT > 0 && duration >= RECORD_TIMEOUT) { - printf("\n⏱️ 录制超时(%d秒),自动停止\n", RECORD_TIMEOUT); + if (duration >= RECORD_TIMEOUT) { is_recording.store(false); - stop_record.store(true); + is_timeout = true; + printf("\n⏱️ 录制超时(%d秒),自动停止\n", RECORD_TIMEOUT); break; } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); } - wait_thread.join(); - // 核心修复:等待1秒让回调线程写完最后几帧音频 + // 等待录制收尾 + progress_thread.join(); printf("\n⏳ 正在收尾音频数据..."); std::this_thread::sleep_for(std::chrono::milliseconds(RECORD_FINISH_WAIT_MS)); - progress_thread.join(); printf("完成\n"); if (exit_program.load()) break; - // 检查录制数据 + // 拷贝音频数据 std::vector captured_audio; { std::lock_guard lock(buffer_mutex); captured_audio = audio_buffer; } - if (captured_audio.empty()) { - printf("⚠️ 未采集到音频数据,请重新录制\n"); - continue; - } - - // 优化1:静音裁剪(减少识别数据量) - int valid_len = trim_silence(captured_audio.data(), captured_audio.size()); - float valid_seconds = (float)valid_len / 16000; - printf("🔍 正在识别(有效音频长度:%.2f秒,原始:%.2f秒)...\n", - valid_seconds, (float)captured_audio.size() / 16000); - - auto recognize_start = std::chrono::steady_clock::now(); - - // 优化2:调整识别参数(CPU最优配置) - whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); - wparams.language = "zh"; - wparams.n_threads = std::max(2, (int)std::thread::hardware_concurrency()); // 至少2线程 - wparams.print_progress = false; - wparams.print_realtime = false; - wparams.temperature = 0.0; // 最快的温度设置 - wparams.max_len = 0; - wparams.translate = false; - wparams.no_context = true; - wparams.single_segment = true; // 单段识别(更快) - wparams.print_special = false; // 不打印特殊字符 - wparams.token_timestamps = false; // 关闭时间戳(节省计算) - - // 执行识别(仅识别有效音频) - if (whisper_full(ctx, wparams, captured_audio.data(), valid_len) != 0) { - fprintf(stderr, "❌ 识别失败\n"); - continue; - } - - // 输出识别结果 - auto recognize_duration = std::chrono::duration_cast( - std::chrono::steady_clock::now() - recognize_start).count(); - float speed = valid_seconds / (recognize_duration / 1000.0); - printf("⏱️ 识别耗时:%.2f 秒 | 识别速度:%.2fx实时速度\n", - recognize_duration / 1000.0, speed); - - const int n_segments = whisper_full_n_segments(ctx); - if (n_segments == 0) { - printf("📝 未识别到有效内容\n"); - } else { - printf("📝 识别结果:\n"); - for (int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - printf(" %s\n", text); - } - } + // 执行识别(无论手动/超时,自动识别) + recognize_audio(ctx, captured_audio); } // 清理资源 From bb17b83555dadfa183c0403f6acc2b2122355f32 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 16:35:05 +0800 Subject: [PATCH 06/29] most of case fixed, except when timeout the last one second maybe lost. very subtle case --- doubao_mic.cpp | 139 +++++++++++++++++++++++++++---------------------- 1 file changed, 76 insertions(+), 63 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index d846b0000cb..cbd777a8951 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -15,19 +15,20 @@ #include #include #include -#include // 用于STDIN_FILENO和read +#include +#include +#include // 全局原子变量(线程安全) std::atomic is_recording(false); std::atomic exit_program(false); -std::atomic recorded_seconds(0); // 实时录制时长 +std::atomic recorded_seconds(0); // 音频缓冲区(加锁保护) std::vector audio_buffer; std::mutex buffer_mutex; -// 配置常量(可自定义) +// 配置常量 const int RECORD_TIMEOUT = 30; // 超时时间(秒) -const int RECORD_FINISH_WAIT_MS = 5000; // 停止后收尾等待时间 -const bool AUTO_RECOGNIZE_ON_TIMEOUT = true; // 超时自动识别 +const int FINISH_WAIT_MS = 2000; // 停止前收尾等待时间(毫秒) // 信号处理:Ctrl+C 优雅退出 void signal_handler(int sig) { @@ -35,34 +36,36 @@ void signal_handler(int sig) { printf("\n\n🛑 收到退出信号,正在清理资源...\n"); exit_program.store(true); is_recording.store(false); - // 等待收尾 - std::this_thread::sleep_for(std::chrono::milliseconds(RECORD_FINISH_WAIT_MS)); + // 给回调线程一点时间清理最后数据 + std::this_thread::sleep_for(std::chrono::milliseconds(100)); exit(0); } } -// 非阻塞检查输入(解决超时后需按回车问题) -bool check_input_non_blocking() { +// 非阻塞检查输入(核心修复:解决死锁) +bool check_input_non_blocking(int timeout_ms = 100) { fd_set fds; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); struct timeval tv; tv.tv_sec = 0; - tv.tv_usec = 100000; // 100ms超时 + tv.tv_usec = timeout_ms * 1000; // 转换为微秒 return select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv) > 0; } -// 清空输入缓冲区(避免残留回车) +// 清空输入缓冲区(避免残留) void clear_input_buffer() { - while (check_input_non_blocking()) { + // 使用非阻塞读取清空缓冲区 + while (check_input_non_blocking(10)) { char c; - read(STDIN_FILENO, &c, 1); + ssize_t ret = read(STDIN_FILENO, &c, 1); + (void)ret; } } -// 音频回调(取消30秒帧上限) +// 音频回调(确保完整接收音频帧) void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { if (!is_recording.load() || pInput == NULL) return; @@ -70,32 +73,26 @@ void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uin if (pInputFloat == NULL) return; std::lock_guard lock(buffer_mutex); - // 取消固定帧上限,仅保留内存保护(可选) - const size_t max_memory = 16000 * 120; // 最多120秒(约200MB内存) + // 安全保护:最多录制35秒(超时+5秒缓冲) + const size_t max_memory = 16000 * (RECORD_TIMEOUT + 5); if (audio_buffer.size() < max_memory) { audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + frameCount); - // 更新实时录制时长 - recorded_seconds.store(audio_buffer.size() / 16000); + // 更新实时时长(精确到0.1秒) + recorded_seconds.store(static_cast(audio_buffer.size() / 16000.0)); } } -// 静音检测(裁剪无效音频,减少识别量) +// 静音检测(仅裁剪开头,保留末尾所有内容) int trim_silence(const float* audio_data, int audio_len, float threshold = 0.001f) { - // 跳过开头静音 int start = 0; while (start < audio_len && fabs(audio_data[start]) < threshold) { start++; } - // 跳过结尾静音 - int end = audio_len - 1; - while (end > start && fabs(audio_data[end]) < threshold) { - end--; - } - // 返回有效音频长度(至少保留1秒) - return std::max(end - start + 1, 16000); + // 关键:不裁剪末尾,确保最后几个字完整 + return std::max(audio_len - start, 16000); // 至少保留1秒 } -// 列出系统音频设备 +// 列出系统音频设备(修复参数类型:第三个参数为引用) void list_audio_devices(ma_context& context, ma_device_info** pCaptureInfos, ma_uint32& captureCount) { printf("\n📜 系统可用麦克风设备列表:\n"); printf("=============================================\n"); @@ -119,12 +116,12 @@ void list_audio_devices(ma_context& context, ma_device_info** pCaptureInfos, ma_ // 提示信息 void print_usage() { printf("=============================================\n"); - printf("🎤 语音识别程序(CPU优化版)\n"); + printf("🎤 语音识别程序(终极稳定版)\n"); printf("操作说明:\n"); printf(" 1. 按下【回车键】开始录制\n"); - printf(" 2. 说话完成后按回车停止录制并识别\n"); + printf(" 2. 说话完成后按回车停止(会自动收尾)\n"); printf(" 3. 录制超过%d秒自动停止并识别\n", RECORD_TIMEOUT); - printf(" 4. 录制中实时显示时长:【录制中... X秒】\n"); + printf(" 4. 录制中实时显示时长\n"); printf(" 5. Ctrl+C 退出程序\n"); printf("=============================================\n"); } @@ -133,21 +130,20 @@ void print_usage() { void print_cpu_optimize_tips() { printf("⚡ CPU优化配置说明:\n"); printf(" ✅ 已启用多线程识别(自动适配CPU核心数)\n"); - printf(" ✅ 已启用静音裁剪(减少无效音频识别)\n"); - printf(" ✅ 已使用贪心采样(最快的识别策略)\n"); + printf(" ✅ 停止前预留2秒缓冲,不丢最后音频\n"); + printf(" ✅ 修复线程死锁,手动停止立即响应\n"); printf(" 📌 模型优化:推荐使用 ggml-medium-q4_0.bin(量化版)\n"); printf(" 📌 编译优化:已用 -O3 最高级优化\n"); printf("=============================================\n"); } -// 核心识别函数(抽离复用) +// 核心识别函数 void recognize_audio(struct whisper_context* ctx, const std::vector& audio_data) { if (audio_data.empty()) { printf("⚠️ 未采集到音频数据,跳过识别\n"); return; } - // 静音裁剪 int valid_len = trim_silence(audio_data.data(), audio_data.size()); float valid_seconds = (float)valid_len / 16000; printf("🔍 正在识别(有效音频长度:%.2f秒,原始:%.2f秒)...\n", @@ -155,7 +151,6 @@ void recognize_audio(struct whisper_context* ctx, const std::vector& audi auto recognize_start = std::chrono::steady_clock::now(); - // CPU最优识别参数 whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); wparams.language = "zh"; wparams.n_threads = std::max(2, (int)std::thread::hardware_concurrency()); @@ -169,13 +164,11 @@ void recognize_audio(struct whisper_context* ctx, const std::vector& audi wparams.print_special = false; wparams.token_timestamps = false; - // 执行识别 if (whisper_full(ctx, wparams, audio_data.data(), valid_len) != 0) { fprintf(stderr, "❌ 识别失败\n"); return; } - // 输出结果 auto recognize_duration = std::chrono::duration_cast( std::chrono::steady_clock::now() - recognize_start).count(); float speed = valid_seconds / (recognize_duration / 1000.0); @@ -210,10 +203,10 @@ int main(int argc, char** argv) { return 1; } - // 2. 枚举麦克风设备 + // 2. 枚举麦克风设备(修复参数传递:直接传变量,而非指针) ma_device_info* pCaptureInfos = NULL; ma_uint32 captureCount = 0; - list_audio_devices(context, &pCaptureInfos, captureCount); + list_audio_devices(context, &pCaptureInfos, captureCount); // 这里直接传captureCount(引用) // 3. 选择麦克风设备 ma_uint32 device_id = 0; @@ -223,7 +216,7 @@ int main(int argc, char** argv) { fprintf(stderr, "❌ 输入无效,使用默认设备ID 0\n"); device_id = 0; } - clear_input_buffer(); // 清空输入缓冲区 + clear_input_buffer(); // 清空缓冲区 } // 4. 初始化 Whisper 模型 @@ -238,7 +231,6 @@ int main(int argc, char** argv) { return 1; } - // 显示CPU优化提示 print_cpu_optimize_tips(); printf("✅ 模型加载成功!\n"); @@ -275,13 +267,26 @@ int main(int argc, char** argv) { print_usage(); - // 主循环 + // 主循环(彻底修复死锁逻辑) while (!exit_program.load()) { printf("\n👉 按下回车键开始录制...\n"); - getchar(); - clear_input_buffer(); // 清空残留输入 + + // 阻塞等待用户回车(确保由用户控制开始) + char input_char = 0; + while (!check_input_non_blocking() && !exit_program.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + if (exit_program.load()) break; + + ssize_t ret1 = read(STDIN_FILENO, &input_char, 1); + (void)ret1; + clear_input_buffer(); // 清空其他残留输入 if (exit_program.load()) break; + if (input_char != '\n') { + printf("⚠️ 请按回车键触发录制!\n"); + continue; + } // 重置录制状态 is_recording.store(true); @@ -303,39 +308,47 @@ int main(int argc, char** argv) { bool is_timeout = false; auto start_time = std::chrono::steady_clock::now(); + bool manual_stop = false; - // 非阻塞监听输入 + 超时检测(核心修复) + // 核心循环 - 修复死锁:在设flag前等待,不阻塞主线程 while (is_recording.load() && !exit_program.load()) { - // 检查是否有回车输入(手动停止) - if (check_input_non_blocking()) { - char c; - read(STDIN_FILENO, &c, 1); - if (c == '\n') { // 只响应回车 - is_recording.store(false); - printf("\n🛑 已手动停止录制\n"); - break; - } - } - // 检查超时 auto duration = std::chrono::duration_cast( std::chrono::steady_clock::now() - start_time).count(); if (duration >= RECORD_TIMEOUT) { + printf("\n⏱️ 录制超时(%d秒),正在收尾...", RECORD_TIMEOUT); + fflush(stdout); + // 关键:先等2秒让数据写完,再停标志 + std::this_thread::sleep_for(std::chrono::milliseconds(FINISH_WAIT_MS)); is_recording.store(false); is_timeout = true; - printf("\n⏱️ 录制超时(%d秒),自动停止\n", RECORD_TIMEOUT); + printf("完成\n"); break; } + // 检查手动输入(非阻塞) + if (check_input_non_blocking(100)) { + char c; + ssize_t ret2 = read(STDIN_FILENO, &c, 1); + (void)ret2; + if (c == '\n') { + printf("\n🛑 已手动停止录制,正在收尾..."); + fflush(stdout); + // 关键:先sleep,让音频写完,再停标志 + std::this_thread::sleep_for(std::chrono::milliseconds(FINISH_WAIT_MS)); + is_recording.store(false); + manual_stop = true; + printf("完成\n"); + break; + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); } - // 等待录制收尾 + // 等待进度线程退出(此时线程应该已经自然退出) progress_thread.join(); - printf("\n⏳ 正在收尾音频数据..."); - std::this_thread::sleep_for(std::chrono::milliseconds(RECORD_FINISH_WAIT_MS)); - printf("完成\n"); if (exit_program.load()) break; @@ -346,7 +359,7 @@ int main(int argc, char** argv) { captured_audio = audio_buffer; } - // 执行识别(无论手动/超时,自动识别) + // 执行识别 recognize_audio(ctx, captured_audio); } From 588bf73a678e8a9e778e86f5183e2b9e949ef7fe Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 16:41:44 +0800 Subject: [PATCH 07/29] doubao needs to use same logic handle both key enter and timeout --- doubao_mic.cpp | 88 ++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 38 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index cbd777a8951..84f812b69ed 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -23,12 +23,23 @@ std::atomic is_recording(false); std::atomic exit_program(false); std::atomic recorded_seconds(0); +// 新增:记录最后一次音频回调的时间(毫秒) +std::atomic last_callback_time(0); // 音频缓冲区(加锁保护) std::vector audio_buffer; std::mutex buffer_mutex; // 配置常量 -const int RECORD_TIMEOUT = 30; // 超时时间(秒) -const int FINISH_WAIT_MS = 2000; // 停止前收尾等待时间(毫秒) +const int RECORD_TIMEOUT = 30; // 基础超时时间(秒) +const int TIMEOUT_GRACE_MS = 1500; // 超时后宽限1.5秒(等最后帧) +const int FINISH_WAIT_MS = 2000; // 停止前收尾等待时间 +const int FRAME_INTERVAL_MS = 100; // 音频帧间隔(ms) + +// 获取当前时间戳(毫秒) +long long get_current_time_ms() { + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch() + ).count(); +} // 信号处理:Ctrl+C 优雅退出 void signal_handler(int sig) { @@ -36,13 +47,12 @@ void signal_handler(int sig) { printf("\n\n🛑 收到退出信号,正在清理资源...\n"); exit_program.store(true); is_recording.store(false); - // 给回调线程一点时间清理最后数据 std::this_thread::sleep_for(std::chrono::milliseconds(100)); exit(0); } } -// 非阻塞检查输入(核心修复:解决死锁) +// 非阻塞检查输入 bool check_input_non_blocking(int timeout_ms = 100) { fd_set fds; FD_ZERO(&fds); @@ -50,14 +60,13 @@ bool check_input_non_blocking(int timeout_ms = 100) { struct timeval tv; tv.tv_sec = 0; - tv.tv_usec = timeout_ms * 1000; // 转换为微秒 + tv.tv_usec = timeout_ms * 1000; return select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv) > 0; } -// 清空输入缓冲区(避免残留) +// 清空输入缓冲区 void clear_input_buffer() { - // 使用非阻塞读取清空缓冲区 while (check_input_non_blocking(10)) { char c; ssize_t ret = read(STDIN_FILENO, &c, 1); @@ -65,19 +74,23 @@ void clear_input_buffer() { } } -// 音频回调(确保完整接收音频帧) +// 音频回调(关键:记录最后回调时间,确保每帧都写入) void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { if (!is_recording.load() || pInput == NULL) return; const float* pInputFloat = (const float*)pInput; if (pInputFloat == NULL) return; + // 更新最后回调时间(关键:标记有新数据) + last_callback_time.store(get_current_time_ms()); + std::lock_guard lock(buffer_mutex); // 安全保护:最多录制35秒(超时+5秒缓冲) const size_t max_memory = 16000 * (RECORD_TIMEOUT + 5); if (audio_buffer.size() < max_memory) { + // 逐帧写入,确保不丢帧 audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + frameCount); - // 更新实时时长(精确到0.1秒) + // 精确计算录制时长(按实际采样数) recorded_seconds.store(static_cast(audio_buffer.size() / 16000.0)); } } @@ -88,11 +101,11 @@ int trim_silence(const float* audio_data, int audio_len, float threshold = 0.001 while (start < audio_len && fabs(audio_data[start]) < threshold) { start++; } - // 关键:不裁剪末尾,确保最后几个字完整 - return std::max(audio_len - start, 16000); // 至少保留1秒 + // 完全保留末尾,哪怕是静音 + return std::max(audio_len - start, 16000); } -// 列出系统音频设备(修复参数类型:第三个参数为引用) +// 列出系统音频设备 void list_audio_devices(ma_context& context, ma_device_info** pCaptureInfos, ma_uint32& captureCount) { printf("\n📜 系统可用麦克风设备列表:\n"); printf("=============================================\n"); @@ -116,11 +129,11 @@ void list_audio_devices(ma_context& context, ma_device_info** pCaptureInfos, ma_ // 提示信息 void print_usage() { printf("=============================================\n"); - printf("🎤 语音识别程序(终极稳定版)\n"); + printf("🎤 语音识别程序(防丢帧终极版)\n"); printf("操作说明:\n"); printf(" 1. 按下【回车键】开始录制\n"); printf(" 2. 说话完成后按回车停止(会自动收尾)\n"); - printf(" 3. 录制超过%d秒自动停止并识别\n", RECORD_TIMEOUT); + printf(" 3. 录制超过%d秒后宽限1.5秒自动停止\n", RECORD_TIMEOUT); printf(" 4. 录制中实时显示时长\n"); printf(" 5. Ctrl+C 退出程序\n"); printf("=============================================\n"); @@ -130,10 +143,9 @@ void print_usage() { void print_cpu_optimize_tips() { printf("⚡ CPU优化配置说明:\n"); printf(" ✅ 已启用多线程识别(自动适配CPU核心数)\n"); - printf(" ✅ 停止前预留2秒缓冲,不丢最后音频\n"); - printf(" ✅ 修复线程死锁,手动停止立即响应\n"); + printf(" ✅ 超时宽限1.5秒,确保最后音频帧不丢\n"); + printf(" ✅ 记录音频回调时间,实时检测数据写入\n"); printf(" 📌 模型优化:推荐使用 ggml-medium-q4_0.bin(量化版)\n"); - printf(" 📌 编译优化:已用 -O3 最高级优化\n"); printf("=============================================\n"); } @@ -203,10 +215,10 @@ int main(int argc, char** argv) { return 1; } - // 2. 枚举麦克风设备(修复参数传递:直接传变量,而非指针) + // 2. 枚举麦克风设备 ma_device_info* pCaptureInfos = NULL; ma_uint32 captureCount = 0; - list_audio_devices(context, &pCaptureInfos, captureCount); // 这里直接传captureCount(引用) + list_audio_devices(context, &pCaptureInfos, captureCount); // 3. 选择麦克风设备 ma_uint32 device_id = 0; @@ -216,12 +228,12 @@ int main(int argc, char** argv) { fprintf(stderr, "❌ 输入无效,使用默认设备ID 0\n"); device_id = 0; } - clear_input_buffer(); // 清空缓冲区 + clear_input_buffer(); } // 4. 初始化 Whisper 模型 struct whisper_context_params cparams = whisper_context_default_params(); - cparams.use_gpu = false; // 强制CPU + cparams.use_gpu = false; printf("\n🚀 正在加载模型:%s\n", model_path); struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); @@ -267,11 +279,11 @@ int main(int argc, char** argv) { print_usage(); - // 主循环(彻底修复死锁逻辑) + // 主循环 while (!exit_program.load()) { printf("\n👉 按下回车键开始录制...\n"); - // 阻塞等待用户回车(确保由用户控制开始) + // 阻塞等待用户回车 char input_char = 0; while (!check_input_non_blocking() && !exit_program.load()) { std::this_thread::sleep_for(std::chrono::milliseconds(100)); @@ -280,7 +292,7 @@ int main(int argc, char** argv) { ssize_t ret1 = read(STDIN_FILENO, &input_char, 1); (void)ret1; - clear_input_buffer(); // 清空其他残留输入 + clear_input_buffer(); if (exit_program.load()) break; if (input_char != '\n') { @@ -288,9 +300,10 @@ int main(int argc, char** argv) { continue; } - // 重置录制状态 + // 重置录制状态(关键:清空最后回调时间) is_recording.store(true); recorded_seconds.store(0); + last_callback_time.store(get_current_time_ms()); { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); @@ -308,18 +321,19 @@ int main(int argc, char** argv) { bool is_timeout = false; auto start_time = std::chrono::steady_clock::now(); - bool manual_stop = false; + long long timeout_deadline_ms = get_current_time_ms() + (RECORD_TIMEOUT * 1000) + TIMEOUT_GRACE_MS; - // 核心循环 - 修复死锁:在设flag前等待,不阻塞主线程 + // 核心循环 - 防丢帧逻辑 while (is_recording.load() && !exit_program.load()) { - // 检查超时 - auto duration = std::chrono::duration_cast( - std::chrono::steady_clock::now() - start_time).count(); + long long current_ms = get_current_time_ms(); - if (duration >= RECORD_TIMEOUT) { - printf("\n⏱️ 录制超时(%d秒),正在收尾...", RECORD_TIMEOUT); + // 超时判断:1. 超过总时限 且 2. 最后回调超过帧间隔(无新数据) + bool timeout_1 = current_ms >= timeout_deadline_ms; + bool timeout_2 = (current_ms - last_callback_time.load()) > (FRAME_INTERVAL_MS * 2); + if (timeout_1 && timeout_2) { + printf("\n⏱️ 录制超时(%d秒+宽限1.5秒),正在等待最后帧写入...", RECORD_TIMEOUT); fflush(stdout); - // 关键:先等2秒让数据写完,再停标志 + // 等待最后帧写完(哪怕多等一点) std::this_thread::sleep_for(std::chrono::milliseconds(FINISH_WAIT_MS)); is_recording.store(false); is_timeout = true; @@ -327,7 +341,7 @@ int main(int argc, char** argv) { break; } - // 检查手动输入(非阻塞) + // 手动停止(正常逻辑) if (check_input_non_blocking(100)) { char c; ssize_t ret2 = read(STDIN_FILENO, &c, 1); @@ -335,19 +349,17 @@ int main(int argc, char** argv) { if (c == '\n') { printf("\n🛑 已手动停止录制,正在收尾..."); fflush(stdout); - // 关键:先sleep,让音频写完,再停标志 std::this_thread::sleep_for(std::chrono::milliseconds(FINISH_WAIT_MS)); is_recording.store(false); - manual_stop = true; printf("完成\n"); break; } } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); // 更短轮询,减少延迟 } - // 等待进度线程退出(此时线程应该已经自然退出) + // 等待进度线程退出 progress_thread.join(); if (exit_program.load()) break; From eb7f9a8a45f22397856740b750c959c4b0f3a27e Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 16:47:55 +0800 Subject: [PATCH 08/29] doubao deadlock --- doubao_mic.cpp | 93 ++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 56 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 84f812b69ed..4389f40965d 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -23,23 +23,12 @@ std::atomic is_recording(false); std::atomic exit_program(false); std::atomic recorded_seconds(0); -// 新增:记录最后一次音频回调的时间(毫秒) -std::atomic last_callback_time(0); // 音频缓冲区(加锁保护) std::vector audio_buffer; std::mutex buffer_mutex; // 配置常量 -const int RECORD_TIMEOUT = 30; // 基础超时时间(秒) -const int TIMEOUT_GRACE_MS = 1500; // 超时后宽限1.5秒(等最后帧) -const int FINISH_WAIT_MS = 2000; // 停止前收尾等待时间 -const int FRAME_INTERVAL_MS = 100; // 音频帧间隔(ms) - -// 获取当前时间戳(毫秒) -long long get_current_time_ms() { - return std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch() - ).count(); -} +const int RECORD_TIMEOUT = 30; // 超时时间(秒) +const int STOP_WAIT_MS = 2000; // 停止前等待2秒(手动/超时共用) // 信号处理:Ctrl+C 优雅退出 void signal_handler(int sig) { @@ -74,34 +63,28 @@ void clear_input_buffer() { } } -// 音频回调(关键:记录最后回调时间,确保每帧都写入) +// 音频回调(仅负责写数据,无额外逻辑) void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { if (!is_recording.load() || pInput == NULL) return; const float* pInputFloat = (const float*)pInput; if (pInputFloat == NULL) return; - // 更新最后回调时间(关键:标记有新数据) - last_callback_time.store(get_current_time_ms()); - std::lock_guard lock(buffer_mutex); // 安全保护:最多录制35秒(超时+5秒缓冲) const size_t max_memory = 16000 * (RECORD_TIMEOUT + 5); if (audio_buffer.size() < max_memory) { - // 逐帧写入,确保不丢帧 audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + frameCount); - // 精确计算录制时长(按实际采样数) recorded_seconds.store(static_cast(audio_buffer.size() / 16000.0)); } } -// 静音检测(仅裁剪开头,保留末尾所有内容) +// 静音检测(仅裁剪开头,保留末尾) int trim_silence(const float* audio_data, int audio_len, float threshold = 0.001f) { int start = 0; while (start < audio_len && fabs(audio_data[start]) < threshold) { start++; } - // 完全保留末尾,哪怕是静音 return std::max(audio_len - start, 16000); } @@ -126,14 +109,24 @@ void list_audio_devices(ma_context& context, ma_device_info** pCaptureInfos, ma_ printf("=============================================\n"); } +// 核心停止录制函数(手动/超时共用同一套逻辑) +void stop_recording(const char* stop_type) { + printf("\n%s 正在等待最后音频数据写入(2秒)...", stop_type); + fflush(stdout); + // 关键:先等2秒让数据写完,再停采集(手动/超时完全一致) + std::this_thread::sleep_for(std::chrono::milliseconds(STOP_WAIT_MS)); + is_recording.store(false); + printf("完成\n"); +} + // 提示信息 void print_usage() { printf("=============================================\n"); - printf("🎤 语音识别程序(防丢帧终极版)\n"); + printf("🎤 语音识别程序(统一逻辑版)\n"); printf("操作说明:\n"); printf(" 1. 按下【回车键】开始录制\n"); - printf(" 2. 说话完成后按回车停止(会自动收尾)\n"); - printf(" 3. 录制超过%d秒后宽限1.5秒自动停止\n", RECORD_TIMEOUT); + printf(" 2. 说话完成后按回车停止(等2秒收尾)\n"); + printf(" 3. 录制超过%d秒自动停止(同样等2秒收尾)\n", RECORD_TIMEOUT); printf(" 4. 录制中实时显示时长\n"); printf(" 5. Ctrl+C 退出程序\n"); printf("=============================================\n"); @@ -142,9 +135,9 @@ void print_usage() { // CPU优化提示 void print_cpu_optimize_tips() { printf("⚡ CPU优化配置说明:\n"); - printf(" ✅ 已启用多线程识别(自动适配CPU核心数)\n"); - printf(" ✅ 超时宽限1.5秒,确保最后音频帧不丢\n"); - printf(" ✅ 记录音频回调时间,实时检测数据写入\n"); + printf(" ✅ 手动/超时停止共用同一套逻辑,无行为差异\n"); + printf(" ✅ 停止前等待2秒,确保最后音频完整\n"); + printf(" ✅ 启用多线程识别(自动适配CPU核心数)\n"); printf(" 📌 模型优化:推荐使用 ggml-medium-q4_0.bin(量化版)\n"); printf("=============================================\n"); } @@ -300,10 +293,9 @@ int main(int argc, char** argv) { continue; } - // 重置录制状态(关键:清空最后回调时间) + // 重置录制状态 is_recording.store(true); recorded_seconds.store(0); - last_callback_time.store(get_current_time_ms()); { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); @@ -319,44 +311,33 @@ int main(int argc, char** argv) { } }); - bool is_timeout = false; + bool stopped = false; auto start_time = std::chrono::steady_clock::now(); - long long timeout_deadline_ms = get_current_time_ms() + (RECORD_TIMEOUT * 1000) + TIMEOUT_GRACE_MS; - - // 核心循环 - 防丢帧逻辑 - while (is_recording.load() && !exit_program.load()) { - long long current_ms = get_current_time_ms(); - - // 超时判断:1. 超过总时限 且 2. 最后回调超过帧间隔(无新数据) - bool timeout_1 = current_ms >= timeout_deadline_ms; - bool timeout_2 = (current_ms - last_callback_time.load()) > (FRAME_INTERVAL_MS * 2); - if (timeout_1 && timeout_2) { - printf("\n⏱️ 录制超时(%d秒+宽限1.5秒),正在等待最后帧写入...", RECORD_TIMEOUT); - fflush(stdout); - // 等待最后帧写完(哪怕多等一点) - std::this_thread::sleep_for(std::chrono::milliseconds(FINISH_WAIT_MS)); - is_recording.store(false); - is_timeout = true; - printf("完成\n"); - break; - } - // 手动停止(正常逻辑) + // 核心循环 - 手动/超时共用同一套停止逻辑 + while (is_recording.load() && !exit_program.load() && !stopped) { + // 1. 检查手动停止 if (check_input_non_blocking(100)) { char c; ssize_t ret2 = read(STDIN_FILENO, &c, 1); (void)ret2; if (c == '\n') { - printf("\n🛑 已手动停止录制,正在收尾..."); - fflush(stdout); - std::this_thread::sleep_for(std::chrono::milliseconds(FINISH_WAIT_MS)); - is_recording.store(false); - printf("完成\n"); + stop_recording("🛑 手动停止录制"); // 调用统一停止函数 + stopped = true; break; } } - std::this_thread::sleep_for(std::chrono::milliseconds(50)); // 更短轮询,减少延迟 + // 2. 检查超时停止(和手动停止逻辑完全一致) + auto duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_time).count(); + if (duration >= RECORD_TIMEOUT) { + stop_recording("⏱️ 录制超时(30秒)"); // 调用统一停止函数 + stopped = true; + break; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); } // 等待进度线程退出 From 1aab24dc74aa2d4bfafdf25375af2e58f51e159e Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 17:10:27 +0800 Subject: [PATCH 09/29] not very sure bug is fixed. seems ok, now let me ask gemini --- doubao_mic.cpp | 103 ++++++++++++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 39 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 4389f40965d..7ba4c709fa2 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -22,13 +22,14 @@ // 全局原子变量(线程安全) std::atomic is_recording(false); std::atomic exit_program(false); +std::atomic is_stopping(false); // 新增:标记是否正在停止 std::atomic recorded_seconds(0); // 音频缓冲区(加锁保护) std::vector audio_buffer; std::mutex buffer_mutex; // 配置常量 const int RECORD_TIMEOUT = 30; // 超时时间(秒) -const int STOP_WAIT_MS = 2000; // 停止前等待2秒(手动/超时共用) +const int STOP_WAIT_MS = 2000; // 停止前等待2秒 // 信号处理:Ctrl+C 优雅退出 void signal_handler(int sig) { @@ -36,13 +37,14 @@ void signal_handler(int sig) { printf("\n\n🛑 收到退出信号,正在清理资源...\n"); exit_program.store(true); is_recording.store(false); + is_stopping.store(false); std::this_thread::sleep_for(std::chrono::milliseconds(100)); exit(0); } } -// 非阻塞检查输入 -bool check_input_non_blocking(int timeout_ms = 100) { +// 非阻塞检查输入(核心:永不阻塞) +bool check_input_non_blocking(int timeout_ms = 50) { fd_set fds; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); @@ -51,10 +53,16 @@ bool check_input_non_blocking(int timeout_ms = 100) { tv.tv_sec = 0; tv.tv_usec = timeout_ms * 1000; - return select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv) > 0; + // 处理EINTR错误(避免信号中断导致的异常) + int ret; + do { + ret = select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv); + } while (ret == -1 && errno == EINTR); + + return ret > 0; } -// 清空输入缓冲区 +// 清空输入缓冲区(非阻塞) void clear_input_buffer() { while (check_input_non_blocking(10)) { char c; @@ -63,15 +71,14 @@ void clear_input_buffer() { } } -// 音频回调(仅负责写数据,无额外逻辑) +// 音频回调(仅写数据,无额外逻辑) void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { - if (!is_recording.load() || pInput == NULL) return; + if (!is_recording.load() || is_stopping.load() || pInput == NULL) return; const float* pInputFloat = (const float*)pInput; if (pInputFloat == NULL) return; std::lock_guard lock(buffer_mutex); - // 安全保护:最多录制35秒(超时+5秒缓冲) const size_t max_memory = 16000 * (RECORD_TIMEOUT + 5); if (audio_buffer.size() < max_memory) { audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + frameCount); @@ -79,7 +86,7 @@ void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uin } } -// 静音检测(仅裁剪开头,保留末尾) +// 静音检测(仅裁剪开头) int trim_silence(const float* audio_data, int audio_len, float threshold = 0.001f) { int start = 0; while (start < audio_len && fabs(audio_data[start]) < threshold) { @@ -109,24 +116,30 @@ void list_audio_devices(ma_context& context, ma_device_info** pCaptureInfos, ma_ printf("=============================================\n"); } -// 核心停止录制函数(手动/超时共用同一套逻辑) -void stop_recording(const char* stop_type) { +// 异步停止录制函数(无阻塞,避免死锁) +void async_stop_recording(const char* stop_type) { + is_stopping.store(true); // 标记正在停止,阻止新数据写入 printf("\n%s 正在等待最后音频数据写入(2秒)...", stop_type); fflush(stdout); - // 关键:先等2秒让数据写完,再停采集(手动/超时完全一致) - std::this_thread::sleep_for(std::chrono::milliseconds(STOP_WAIT_MS)); - is_recording.store(false); - printf("完成\n"); + + // 独立线程等待,不阻塞主线程 + std::thread([&]() { + std::this_thread::sleep_for(std::chrono::milliseconds(STOP_WAIT_MS)); + is_recording.store(false); + is_stopping.store(false); + printf("完成\n"); + fflush(stdout); + }).detach(); // 分离线程,自动回收资源 } // 提示信息 void print_usage() { printf("=============================================\n"); - printf("🎤 语音识别程序(统一逻辑版)\n"); + printf("🎤 语音识别程序(无死锁+统一逻辑版)\n"); printf("操作说明:\n"); printf(" 1. 按下【回车键】开始录制\n"); - printf(" 2. 说话完成后按回车停止(等2秒收尾)\n"); - printf(" 3. 录制超过%d秒自动停止(同样等2秒收尾)\n", RECORD_TIMEOUT); + printf(" 2. 说话完成后按回车停止(异步等2秒收尾)\n"); + printf(" 3. 录制超过30秒自动停止(同样异步等2秒)\n"); printf(" 4. 录制中实时显示时长\n"); printf(" 5. Ctrl+C 退出程序\n"); printf("=============================================\n"); @@ -135,10 +148,10 @@ void print_usage() { // CPU优化提示 void print_cpu_optimize_tips() { printf("⚡ CPU优化配置说明:\n"); - printf(" ✅ 手动/超时停止共用同一套逻辑,无行为差异\n"); - printf(" ✅ 停止前等待2秒,确保最后音频完整\n"); - printf(" ✅ 启用多线程识别(自动适配CPU核心数)\n"); - printf(" 📌 模型优化:推荐使用 ggml-medium-q4_0.bin(量化版)\n"); + printf(" ✅ 异步停止逻辑,彻底解决死锁\n"); + printf(" ✅ 手动/超时停止共用同一套逻辑\n"); + printf(" ✅ 非阻塞输入检查,永不卡死\n"); + printf(" 📌 模型优化:推荐使用 ggml-medium-q4_0.bin\n"); printf("=============================================\n"); } @@ -276,16 +289,16 @@ int main(int argc, char** argv) { while (!exit_program.load()) { printf("\n👉 按下回车键开始录制...\n"); - // 阻塞等待用户回车 + // 阻塞等待用户回车(非阻塞检查,避免死锁) char input_char = 0; while (!check_input_non_blocking() && !exit_program.load()) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); } if (exit_program.load()) break; - ssize_t ret1 = read(STDIN_FILENO, &input_char, 1); - (void)ret1; - clear_input_buffer(); + ssize_t ret1 = read(STDIN_FILENO, &input_char, 1); + (void)ret1; + clear_input_buffer(); if (exit_program.load()) break; if (input_char != '\n') { @@ -295,16 +308,17 @@ int main(int argc, char** argv) { // 重置录制状态 is_recording.store(true); + is_stopping.store(false); recorded_seconds.store(0); { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } - printf("🎙️ 正在录制(按回车停止,最长%d秒)...\n", RECORD_TIMEOUT); + printf("🎙️ 正在录制(按回车停止,最长30秒)...\n"); // 录制时长实时显示线程 std::thread progress_thread([&]() { - while (is_recording.load() && !exit_program.load()) { + while (is_recording.load() && !exit_program.load() && !is_stopping.load()) { printf("\r📊 录制中... %d秒", recorded_seconds.load()); fflush(stdout); std::this_thread::sleep_for(std::chrono::seconds(1)); @@ -314,30 +328,36 @@ int main(int argc, char** argv) { bool stopped = false; auto start_time = std::chrono::steady_clock::now(); - // 核心循环 - 手动/超时共用同一套停止逻辑 - while (is_recording.load() && !exit_program.load() && !stopped) { - // 1. 检查手动停止 - if (check_input_non_blocking(100)) { + // 核心循环 - 无死锁逻辑 + while (!exit_program.load() && !stopped) { + // 1. 检查手动停止(非阻塞) + if (check_input_non_blocking(50)) { char c; ssize_t ret2 = read(STDIN_FILENO, &c, 1); (void)ret2; - if (c == '\n') { - stop_recording("🛑 手动停止录制"); // 调用统一停止函数 + if (c == '\n' && is_recording.load() && !is_stopping.load()) { + async_stop_recording("🛑 手动停止录制"); // 异步停止,不阻塞 stopped = true; break; } } - // 2. 检查超时停止(和手动停止逻辑完全一致) + // 2. 检查超时停止(非阻塞) auto duration = std::chrono::duration_cast( std::chrono::steady_clock::now() - start_time).count(); - if (duration >= RECORD_TIMEOUT) { - stop_recording("⏱️ 录制超时(30秒)"); // 调用统一停止函数 + if (duration >= RECORD_TIMEOUT && is_recording.load() && !is_stopping.load()) { + async_stop_recording("⏱️ 录制超时(30秒)"); // 异步停止,不阻塞 stopped = true; break; } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + // 3. 检查是否已停止 + if (!is_recording.load()) { + stopped = true; + break; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(50)); } // 等待进度线程退出 @@ -345,6 +365,11 @@ int main(int argc, char** argv) { if (exit_program.load()) break; + // 等待异步停止线程完成(确保数据写完) + while (is_stopping.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + // 拷贝音频数据 std::vector captured_audio; { From b6abd7b97579e80ef2a71e2a875db869c4b9aa2a Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 17:24:36 +0800 Subject: [PATCH 10/29] gemini claims fixed the issue, now testing --- doubao_mic.cpp | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 7ba4c709fa2..38bd9ddbb57 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -73,17 +73,15 @@ void clear_input_buffer() { // 音频回调(仅写数据,无额外逻辑) void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { - if (!is_recording.load() || is_stopping.load() || pInput == NULL) return; + // 只有在真正停止录制标记为 false 时才退出 + if (!is_recording.load() || pInput == NULL) return; const float* pInputFloat = (const float*)pInput; - if (pInputFloat == NULL) return; - std::lock_guard lock(buffer_mutex); - const size_t max_memory = 16000 * (RECORD_TIMEOUT + 5); - if (audio_buffer.size() < max_memory) { - audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + frameCount); - recorded_seconds.store(static_cast(audio_buffer.size() / 16000.0)); - } + + // 持续写入,直到 is_recording 被主线程关掉 + audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + frameCount); + recorded_seconds.store(static_cast(audio_buffer.size() / 16000.0)); } // 静音检测(仅裁剪开头) @@ -331,25 +329,29 @@ int main(int argc, char** argv) { // 核心循环 - 无死锁逻辑 while (!exit_program.load() && !stopped) { // 1. 检查手动停止(非阻塞) - if (check_input_non_blocking(50)) { - char c; - ssize_t ret2 = read(STDIN_FILENO, &c, 1); - (void)ret2; - if (c == '\n' && is_recording.load() && !is_stopping.load()) { - async_stop_recording("🛑 手动停止录制"); // 异步停止,不阻塞 - stopped = true; - break; - } - } + if (check_input_non_blocking(50)) { + char c; + read(STDIN_FILENO, &c, 1); + if (c == '\n') { + printf("\n🛑 手动停止,正在收尾音频数据..."); + // 关键:先睡 500ms 捕获回车瞬间的余音,但不开启新线程 + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + is_recording.store(false); // 此时回调停止写入 + stopped = true; + } + } + // 2. 检查超时停止(非阻塞) auto duration = std::chrono::duration_cast( std::chrono::steady_clock::now() - start_time).count(); - if (duration >= RECORD_TIMEOUT && is_recording.load() && !is_stopping.load()) { - async_stop_recording("⏱️ 录制超时(30秒)"); // 异步停止,不阻塞 - stopped = true; - break; - } + // 超时判断同理 + if (duration >= RECORD_TIMEOUT) { + printf("\n⏱️ 超时停止,正在收尾..."); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + is_recording.store(false); + stopped = true; + } // 3. 检查是否已停止 if (!is_recording.load()) { From e9a572511aba50e73568b522c2903e27bde0e6f4 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 18:04:37 +0800 Subject: [PATCH 11/29] gemini config cuda which is very complicated --- compile.txt | 5 +++++ run.txt | 13 +++++++++++++ 2 files changed, 18 insertions(+) diff --git a/compile.txt b/compile.txt index cd623631446..05b362271ee 100644 --- a/compile.txt +++ b/compile.txt @@ -5,3 +5,8 @@ g++ -O3 minimal_mic.cpp \ -lpthread -ldl -lm -lrt -o minimal_mic g++ -O3 doubao_mic.cpp -I. -I./include -I./ggml/include -I./examples ./build_gpu/src/libwhisper.so -L/usr/local/cuda/lib64 -lcudart -lcublas -lportaudio -lpthread -ldl -lm -lrt -o doubao_mic.exe + +gpu build: +# 显式指定使用 CUDA 13.1 编译 +cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc +cmake --build build --config Release -j$(nproc) diff --git a/run.txt b/run.txt index 43c761ff1ee..64082a140c1 100644 --- a/run.txt +++ b/run.txt @@ -1,2 +1,15 @@ export LD_LIBRARY_PATH=./build/src ./minimal_mic ./models/ggml-small.bin + +# 确保运行时能找到 CUDA 13.1 的库 +export LD_LIBRARY_PATH=/usr/local/cuda-13.1/lib64:./build/src:$LD_LIBRARY_PATH +./doubao_mic.exe ./models/ggml-medium.bin + +cat ~/.bashrc | tail -n 6 +export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +export PATH=$PATH:/home/nick/.local/bin +alias cmake='/snap/bin/cmake' +export CUDA_HOME=/usr/local/cuda-13.1 +export PATH=$CUDA_HOME/bin:$PATH +export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$CUDA_HOME/targets/x86_64-linux/lib:$LD_LIBRARY_PATH + From 938cec62f337b3e4e2a7645f94157a82ec946373 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 18:08:56 +0800 Subject: [PATCH 12/29] gemini also try to fix cuda and dead lock --- doubao_mic.cpp | 52 +++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 38bd9ddbb57..b1f0f7c7f65 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -130,6 +130,7 @@ void async_stop_recording(const char* stop_type) { }).detach(); // 分离线程,自动回收资源 } + // 提示信息 void print_usage() { printf("=============================================\n"); @@ -212,6 +213,19 @@ int main(int argc, char** argv) { } const char* model_path = argv[1]; + // 1. 定义一个统一的收尾函数,替代原有的异步函数 + auto stop_and_collect = [&](const char* reason) { + printf("\n%s,正在捕获余音收尾...", reason); + fflush(stdout); + + // 关键点:此时 is_recording 仍为 true,data_callback 还在工作 + // 睡眠 800ms 确保敲击回车或超时瞬间的最后几个采样点进入 buffer + std::this_thread::sleep_for(std::chrono::milliseconds(800)); + + is_recording.store(false); // 现在真正停止回调写入 + printf("完成。\n"); + }; + // 1. 初始化音频上下文 ma_context context; if (ma_context_init(NULL, 0, NULL, &context) != MA_SUCCESS) { @@ -328,38 +342,28 @@ int main(int argc, char** argv) { // 核心循环 - 无死锁逻辑 while (!exit_program.load() && !stopped) { - // 1. 检查手动停止(非阻塞) - if (check_input_non_blocking(50)) { + // 场景 A: 检查手动回车 + if (check_input_non_blocking(50)) { char c; - read(STDIN_FILENO, &c, 1); - if (c == '\n') { - printf("\n🛑 手动停止,正在收尾音频数据..."); - // 关键:先睡 500ms 捕获回车瞬间的余音,但不开启新线程 - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - is_recording.store(false); // 此时回调停止写入 + if (read(STDIN_FILENO, &c, 1) > 0 && c == '\n') { + stop_and_collect("🛑 手动停止"); stopped = true; + break; } } - // 2. 检查超时停止(非阻塞) - auto duration = std::chrono::duration_cast( - std::chrono::steady_clock::now() - start_time).count(); - // 超时判断同理 - if (duration >= RECORD_TIMEOUT) { - printf("\n⏱️ 超时停止,正在收尾..."); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - is_recording.store(false); + // 场景 B: 检查 30 秒超时 + auto duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_time).count(); + + if (duration >= RECORD_TIMEOUT * 1000) { + stop_and_collect("⏱️ 超时停止 (30s)"); stopped = true; - } + break; + } - // 3. 检查是否已停止 - if (!is_recording.load()) { - stopped = true; - break; - } - - std::this_thread::sleep_for(std::chrono::milliseconds(50)); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); } // 等待进度线程退出 From 6b0a5f35604848ca84c9064496a16497aed98f52 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 20:49:14 +0800 Subject: [PATCH 13/29] now gpu fixed. gemini has a timeout ends earlier explanantion with a simple accurate time start --- doubao_mic.cpp | 4 ++-- run.txt | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index b1f0f7c7f65..7e8f3f1cf93 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -177,7 +177,7 @@ void recognize_audio(struct whisper_context* ctx, const std::vector& audi wparams.max_len = 0; wparams.translate = false; wparams.no_context = true; - wparams.single_segment = true; + wparams.single_segment = false; wparams.print_special = false; wparams.token_timestamps = false; @@ -251,7 +251,7 @@ int main(int argc, char** argv) { // 4. 初始化 Whisper 模型 struct whisper_context_params cparams = whisper_context_default_params(); - cparams.use_gpu = false; + cparams.use_gpu = true; printf("\n🚀 正在加载模型:%s\n", model_path); struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); diff --git a/run.txt b/run.txt index 64082a140c1..1dbc2c4800a 100644 --- a/run.txt +++ b/run.txt @@ -13,3 +13,4 @@ export CUDA_HOME=/usr/local/cuda-13.1 export PATH=$CUDA_HOME/bin:$PATH export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$CUDA_HOME/targets/x86_64-linux/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$(pwd)/build_gpu/src:$(pwd)/build_gpu/ggml/src:/usr/local/cuda-13.1/lib64:/usr/local/cuda-13.1/targets/x86_64-linux/lib:$LD_LIBRARY_PATH From 84866c238c6c405a26a05e7f5bcf69f853f71882 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 20:55:06 +0800 Subject: [PATCH 14/29] gemini clear up code but still 28 seconds ends earlier than timeout expection --- doubao_mic.cpp | 352 +++++++++++++------------------------------------ 1 file changed, 95 insertions(+), 257 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 7e8f3f1cf93..7b142098212 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -19,378 +19,216 @@ #include #include -// 全局原子变量(线程安全) +// 全局状态管理 std::atomic is_recording(false); std::atomic exit_program(false); -std::atomic is_stopping(false); // 新增:标记是否正在停止 std::atomic recorded_seconds(0); -// 音频缓冲区(加锁保护) + +// 音频缓冲区与锁 std::vector audio_buffer; std::mutex buffer_mutex; + // 配置常量 -const int RECORD_TIMEOUT = 30; // 超时时间(秒) -const int STOP_WAIT_MS = 2000; // 停止前等待2秒 +const int RECORD_TIMEOUT = 30; // 严格30秒超时 -// 信号处理:Ctrl+C 优雅退出 +// 信号处理 void signal_handler(int sig) { if (sig == SIGINT) { printf("\n\n🛑 收到退出信号,正在清理资源...\n"); exit_program.store(true); is_recording.store(false); - is_stopping.store(false); std::this_thread::sleep_for(std::chrono::milliseconds(100)); exit(0); } } -// 非阻塞检查输入(核心:永不阻塞) -bool check_input_non_blocking(int timeout_ms = 50) { +// 非阻塞检查标准输入 +bool check_input_non_blocking(int timeout_ms = 20) { fd_set fds; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); - struct timeval tv; tv.tv_sec = 0; tv.tv_usec = timeout_ms * 1000; - - // 处理EINTR错误(避免信号中断导致的异常) int ret; do { ret = select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv); } while (ret == -1 && errno == EINTR); - return ret > 0; } -// 清空输入缓冲区(非阻塞) void clear_input_buffer() { while (check_input_non_blocking(10)) { char c; - ssize_t ret = read(STDIN_FILENO, &c, 1); - (void)ret; + read(STDIN_FILENO, &c, 1); } } -// 音频回调(仅写数据,无额外逻辑) +// 音频采集回调 void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { - // 只有在真正停止录制标记为 false 时才退出 if (!is_recording.load() || pInput == NULL) return; - const float* pInputFloat = (const float*)pInput; std::lock_guard lock(buffer_mutex); - - // 持续写入,直到 is_recording 被主线程关掉 audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + frameCount); recorded_seconds.store(static_cast(audio_buffer.size() / 16000.0)); } -// 静音检测(仅裁剪开头) +// 裁剪开头静音 int trim_silence(const float* audio_data, int audio_len, float threshold = 0.001f) { int start = 0; - while (start < audio_len && fabs(audio_data[start]) < threshold) { + while (start < audio_len && std::abs(audio_data[start]) < threshold) { start++; } - return std::max(audio_len - start, 16000); -} - -// 列出系统音频设备 -void list_audio_devices(ma_context& context, ma_device_info** pCaptureInfos, ma_uint32& captureCount) { - printf("\n📜 系统可用麦克风设备列表:\n"); - printf("=============================================\n"); - - ma_result result = ma_context_get_devices(&context, NULL, NULL, pCaptureInfos, &captureCount); - if (result != MA_SUCCESS) { - fprintf(stderr, "❌ 获取设备列表失败,使用默认设备\n"); - *pCaptureInfos = NULL; - captureCount = 0; - return; - } - - for (ma_uint32 i = 0; i < captureCount; ++i) { - printf("🔧 设备ID: %u | 名称: %s\n", i, (*pCaptureInfos)[i].name); - printf(" 声道数: 1 | 采样率: 16000 Hz\n"); - printf("---------------------------------------------\n"); - } - printf("=============================================\n"); -} - -// 异步停止录制函数(无阻塞,避免死锁) -void async_stop_recording(const char* stop_type) { - is_stopping.store(true); // 标记正在停止,阻止新数据写入 - printf("\n%s 正在等待最后音频数据写入(2秒)...", stop_type); - fflush(stdout); - - // 独立线程等待,不阻塞主线程 - std::thread([&]() { - std::this_thread::sleep_for(std::chrono::milliseconds(STOP_WAIT_MS)); - is_recording.store(false); - is_stopping.store(false); - printf("完成\n"); - fflush(stdout); - }).detach(); // 分离线程,自动回收资源 -} - - -// 提示信息 -void print_usage() { - printf("=============================================\n"); - printf("🎤 语音识别程序(无死锁+统一逻辑版)\n"); - printf("操作说明:\n"); - printf(" 1. 按下【回车键】开始录制\n"); - printf(" 2. 说话完成后按回车停止(异步等2秒收尾)\n"); - printf(" 3. 录制超过30秒自动停止(同样异步等2秒)\n"); - printf(" 4. 录制中实时显示时长\n"); - printf(" 5. Ctrl+C 退出程序\n"); - printf("=============================================\n"); -} - -// CPU优化提示 -void print_cpu_optimize_tips() { - printf("⚡ CPU优化配置说明:\n"); - printf(" ✅ 异步停止逻辑,彻底解决死锁\n"); - printf(" ✅ 手动/超时停止共用同一套逻辑\n"); - printf(" ✅ 非阻塞输入检查,永不卡死\n"); - printf(" 📌 模型优化:推荐使用 ggml-medium-q4_0.bin\n"); - printf("=============================================\n"); + return std::max(audio_len - start, 16000); } // 核心识别函数 void recognize_audio(struct whisper_context* ctx, const std::vector& audio_data) { - if (audio_data.empty()) { - printf("⚠️ 未采集到音频数据,跳过识别\n"); - return; - } + if (audio_data.empty()) return; int valid_len = trim_silence(audio_data.data(), audio_data.size()); - float valid_seconds = (float)valid_len / 16000; - printf("🔍 正在识别(有效音频长度:%.2f秒,原始:%.2f秒)...\n", - valid_seconds, (float)audio_data.size() / 16000); + float total_sec = (float)audio_data.size() / 16000.0f; - auto recognize_start = std::chrono::steady_clock::now(); + printf("🔍 正在识别(有效长度:%.2fs,总长:%.2fs)...\n", (float)valid_len/16000.0f, total_sec); + + auto t_start = std::chrono::steady_clock::now(); whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); wparams.language = "zh"; wparams.n_threads = std::max(2, (int)std::thread::hardware_concurrency()); wparams.print_progress = false; - wparams.print_realtime = false; - wparams.temperature = 0.0; - wparams.max_len = 0; - wparams.translate = false; wparams.no_context = true; wparams.single_segment = false; - wparams.print_special = false; - wparams.token_timestamps = false; if (whisper_full(ctx, wparams, audio_data.data(), valid_len) != 0) { fprintf(stderr, "❌ 识别失败\n"); return; } - auto recognize_duration = std::chrono::duration_cast( - std::chrono::steady_clock::now() - recognize_start).count(); - float speed = valid_seconds / (recognize_duration / 1000.0); - printf("⏱️ 识别耗时:%.2f 秒 | 识别速度:%.2fx实时速度\n", - recognize_duration / 1000.0, speed); + auto t_end = std::chrono::steady_clock::now(); + float msec = std::chrono::duration(t_end - t_start).count(); - const int n_segments = whisper_full_n_segments(ctx); - if (n_segments == 0) { - printf("📝 未识别到有效内容\n"); - } else { - printf("📝 识别结果:\n"); - for (int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - printf(" %s\n", text); - } + printf("⏱️ 识别耗时:%.2f 秒 | 识别速度:%.2fx\n", msec/1000.0f, total_sec/(msec/1000.0f)); + + int n_segments = whisper_full_n_segments(ctx); + printf("📝 识别结果:\n"); + for (int i = 0; i < n_segments; ++i) { + printf(" %s\n", whisper_full_get_segment_text(ctx, i)); } } int main(int argc, char** argv) { signal(SIGINT, signal_handler); - if (argc < 2) { fprintf(stderr, "Usage: %s \n", argv[0]); return 1; } - const char* model_path = argv[1]; - - // 1. 定义一个统一的收尾函数,替代原有的异步函数 - auto stop_and_collect = [&](const char* reason) { - printf("\n%s,正在捕获余音收尾...", reason); - fflush(stdout); - - // 关键点:此时 is_recording 仍为 true,data_callback 还在工作 - // 睡眠 800ms 确保敲击回车或超时瞬间的最后几个采样点进入 buffer - std::this_thread::sleep_for(std::chrono::milliseconds(800)); - - is_recording.store(false); // 现在真正停止回调写入 - printf("完成。\n"); - }; - - // 1. 初始化音频上下文 - ma_context context; - if (ma_context_init(NULL, 0, NULL, &context) != MA_SUCCESS) { - fprintf(stderr, "❌ 初始化音频上下文失败\n"); - return 1; - } - // 2. 枚举麦克风设备 + // 1. 初始化音频与设备列表 + ma_context context; + ma_context_init(NULL, 0, NULL, &context); ma_device_info* pCaptureInfos = NULL; ma_uint32 captureCount = 0; - list_audio_devices(context, &pCaptureInfos, captureCount); + ma_context_get_devices(&context, NULL, NULL, &pCaptureInfos, &captureCount); - // 3. 选择麦克风设备 - ma_uint32 device_id = 0; - if (captureCount > 0) { - printf("\n👉 请输入要使用的麦克风设备ID:"); - if (scanf("%u", &device_id) != 1 || device_id >= captureCount) { - fprintf(stderr, "❌ 输入无效,使用默认设备ID 0\n"); - device_id = 0; - } - clear_input_buffer(); + printf("\n📜 系统可用麦克风:\n"); + for (ma_uint32 i = 0; i < captureCount; ++i) { + printf(" [%u] %s\n", i, pCaptureInfos[i].name); } - // 4. 初始化 Whisper 模型 + ma_uint32 device_id = 0; + printf("\n👉 选择麦克风ID: "); + if(scanf("%u", &device_id) != 1) device_id = 0; + clear_input_buffer(); + + // 2. 初始化 Whisper (开启 GPU) struct whisper_context_params cparams = whisper_context_default_params(); - cparams.use_gpu = true; + cparams.use_gpu = true; // 确认开启 GPU + cparams.gpu_device = 0; - printf("\n🚀 正在加载模型:%s\n", model_path); - struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); - if (!ctx) { - fprintf(stderr, "❌ 初始化Whisper模型失败\n"); - ma_context_uninit(&context); - return 1; - } + printf("\n🚀 正在加载模型: %s\n", argv[1]); + struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); + if (!ctx) return 1; - print_cpu_optimize_tips(); - printf("✅ 模型加载成功!\n"); - - // 5. 初始化录音设备 - ma_device_config deviceConfig = ma_device_config_init(ma_device_type_capture); - deviceConfig.capture.format = ma_format_f32; - deviceConfig.capture.channels = 1; - deviceConfig.sampleRate = 16000; - deviceConfig.dataCallback = data_callback; - deviceConfig.pUserData = NULL; - - if (captureCount > 0 && pCaptureInfos != NULL) { - deviceConfig.capture.pDeviceID = &pCaptureInfos[device_id].id; - printf("\n✅ 已选择麦克风:%s\n", pCaptureInfos[device_id].name); - } else { - printf("\n✅ 使用默认麦克风设备\n"); - } + // 3. 配置录音设备 + ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); + devCfg.capture.format = ma_format_f32; + devCfg.capture.channels = 1; + devCfg.sampleRate = 16000; + devCfg.dataCallback = data_callback; + if (captureCount > 0) devCfg.capture.pDeviceID = &pCaptureInfos[device_id].id; ma_device device; - if (ma_device_init(&context, &deviceConfig, &device) != MA_SUCCESS) { - fprintf(stderr, "❌ 打开录音设备失败\n"); - whisper_free(ctx); - ma_context_uninit(&context); - return 1; - } - - if (ma_device_start(&device) != MA_SUCCESS) { - fprintf(stderr, "❌ 启动录音设备失败\n"); - ma_device_uninit(&device); - whisper_free(ctx); - ma_context_uninit(&context); - return 1; - } + if (ma_device_init(&context, &devCfg, &device) != MA_SUCCESS) return 1; + ma_device_start(&device); - print_usage(); + // 统一收尾 Lambda + auto stop_and_collect = [&](const char* reason) { + printf("\n%s,捕获 800ms 余音...", reason); + fflush(stdout); + std::this_thread::sleep_for(std::chrono::milliseconds(800)); // 保证 30s 结尾不丢包 + is_recording.store(false); + printf("完成。\n"); + }; - // 主循环 while (!exit_program.load()) { - printf("\n👉 按下回车键开始录制...\n"); - - // 阻塞等待用户回车(非阻塞检查,避免死锁) - char input_char = 0; - while (!check_input_non_blocking() && !exit_program.load()) { - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - } + printf("\n👉 按回车键开始录制..."); + while (!check_input_non_blocking(50) && !exit_program.load()); if (exit_program.load()) break; - - ssize_t ret1 = read(STDIN_FILENO, &input_char, 1); - (void)ret1; - clear_input_buffer(); - - if (exit_program.load()) break; - if (input_char != '\n') { - printf("⚠️ 请按回车键触发录制!\n"); - continue; - } + clear_input_buffer(); - // 重置录制状态 - is_recording.store(true); - is_stopping.store(false); - recorded_seconds.store(0); + // 精准计时起点 { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } - printf("🎙️ 正在录制(按回车停止,最长30秒)...\n"); + recorded_seconds.store(0); + auto start_time = std::chrono::steady_clock::now(); // 严格对齐 + is_recording.store(true); + + printf("🎙️ 正在录制 (最长 30s)... \n"); - // 录制时长实时显示线程 std::thread progress_thread([&]() { - while (is_recording.load() && !exit_program.load() && !is_stopping.load()) { - printf("\r📊 录制中... %d秒", recorded_seconds.load()); + while (is_recording.load() && !exit_program.load()) { + printf("\r📊 进度: %d 秒", recorded_seconds.load()); fflush(stdout); - std::this_thread::sleep_for(std::chrono::seconds(1)); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); } }); bool stopped = false; - auto start_time = std::chrono::steady_clock::now(); - - // 核心循环 - 无死锁逻辑 while (!exit_program.load() && !stopped) { - // 场景 A: 检查手动回车 - if (check_input_non_blocking(50)) { - char c; - if (read(STDIN_FILENO, &c, 1) > 0 && c == '\n') { - stop_and_collect("🛑 手动停止"); - stopped = true; - break; - } - } - - - // 场景 B: 检查 30 秒超时 - auto duration = std::chrono::duration_cast( - std::chrono::steady_clock::now() - start_time).count(); - - if (duration >= RECORD_TIMEOUT * 1000) { - stop_and_collect("⏱️ 超时停止 (30s)"); - stopped = true; - break; - } - - std::this_thread::sleep_for(std::chrono::milliseconds(20)); + auto now = std::chrono::steady_clock::now(); + double elapsed = std::chrono::duration(now - start_time).count(); + + // 1. 检查手动回车 + if (check_input_non_blocking(10)) { + char c; + if (read(STDIN_FILENO, &c, 1) > 0 && c == '\n') { + stop_and_collect("🛑 手动停止"); + stopped = true; + } + } + // 2. 检查 30s 超时 + else if (elapsed >= (double)RECORD_TIMEOUT) { + stop_and_collect("⏱️ 超时停止 (30s)"); + stopped = true; + } + std::this_thread::sleep_for(std::chrono::milliseconds(5)); } - // 等待进度线程退出 - progress_thread.join(); - - if (exit_program.load()) break; - - // 等待异步停止线程完成(确保数据写完) - while (is_stopping.load()) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } + if (progress_thread.joinable()) progress_thread.join(); - // 拷贝音频数据 - std::vector captured_audio; + std::vector captured; { std::lock_guard lock(buffer_mutex); - captured_audio = audio_buffer; + captured = audio_buffer; } - - // 执行识别 - recognize_audio(ctx, captured_audio); + recognize_audio(ctx, captured); } - // 清理资源 ma_device_uninit(&device); ma_context_uninit(&context); whisper_free(ctx); - printf("✅ 资源清理完成,程序退出\n"); return 0; -} +} \ No newline at end of file From 3ec051f258d8fa2faa89cda415400d990c6d699f Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 21:25:45 +0800 Subject: [PATCH 15/29] finally find the issue of progress code is interferencing --- doubao_mic.cpp | 110 +++++++++++++++---------------------------------- 1 file changed, 34 insertions(+), 76 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 7b142098212..24da6956940 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -29,9 +29,8 @@ std::vector audio_buffer; std::mutex buffer_mutex; // 配置常量 -const int RECORD_TIMEOUT = 30; // 严格30秒超时 +const int RECORD_TIMEOUT = 24; // 目标 30 秒 -// 信号处理 void signal_handler(int sig) { if (sig == SIGINT) { printf("\n\n🛑 收到退出信号,正在清理资源...\n"); @@ -42,7 +41,6 @@ void signal_handler(int sig) { } } -// 非阻塞检查标准输入 bool check_input_non_blocking(int timeout_ms = 20) { fd_set fds; FD_ZERO(&fds); @@ -58,13 +56,12 @@ bool check_input_non_blocking(int timeout_ms = 20) { } void clear_input_buffer() { - while (check_input_non_blocking(10)) { + while (check_input_non_blocking(5)) { char c; read(STDIN_FILENO, &c, 1); } } -// 音频采集回调 void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { if (!is_recording.load() || pInput == NULL) return; const float* pInputFloat = (const float*)pInput; @@ -73,45 +70,37 @@ void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uin recorded_seconds.store(static_cast(audio_buffer.size() / 16000.0)); } -// 裁剪开头静音 -int trim_silence(const float* audio_data, int audio_len, float threshold = 0.001f) { - int start = 0; - while (start < audio_len && std::abs(audio_data[start]) < threshold) { - start++; - } - return std::max(audio_len - start, 16000); +void print_status_guide() { + printf("\n=============================================\n"); + printf("🎙️ 操作提示:\n"); + printf(" ▶ [回车键] : 开始录制\n"); + printf(" ■ [回车键] : 停止录制并识别\n"); + printf(" ⏳ [自动停止]: 达到 %d 秒自动截断\n", RECORD_TIMEOUT); + printf("=============================================\n"); } -// 核心识别函数 void recognize_audio(struct whisper_context* ctx, const std::vector& audio_data) { if (audio_data.empty()) return; - - int valid_len = trim_silence(audio_data.data(), audio_data.size()); float total_sec = (float)audio_data.size() / 16000.0f; - - printf("🔍 正在识别(有效长度:%.2fs,总长:%.2fs)...\n", (float)valid_len/16000.0f, total_sec); + printf("\n🔍 正在识别(总长度:%.2fs)...\n", total_sec); auto t_start = std::chrono::steady_clock::now(); - whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); wparams.language = "zh"; wparams.n_threads = std::max(2, (int)std::thread::hardware_concurrency()); wparams.print_progress = false; - wparams.no_context = true; - wparams.single_segment = false; - if (whisper_full(ctx, wparams, audio_data.data(), valid_len) != 0) { + if (whisper_full(ctx, wparams, audio_data.data(), audio_data.size()) != 0) { fprintf(stderr, "❌ 识别失败\n"); return; } auto t_end = std::chrono::steady_clock::now(); float msec = std::chrono::duration(t_end - t_start).count(); - - printf("⏱️ 识别耗时:%.2f 秒 | 识别速度:%.2fx\n", msec/1000.0f, total_sec/(msec/1000.0f)); + printf("⏱️ 识别耗时:%.2f 秒 | 速度:%.2fx\n", msec/1000.0f, total_sec/(msec/1000.0f)); int n_segments = whisper_full_n_segments(ctx); - printf("📝 识别结果:\n"); + printf("📝 结果:\n"); for (int i = 0; i < n_segments; ++i) { printf(" %s\n", whisper_full_get_segment_text(ctx, i)); } @@ -119,74 +108,46 @@ void recognize_audio(struct whisper_context* ctx, const std::vector& audi int main(int argc, char** argv) { signal(SIGINT, signal_handler); - if (argc < 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return 1; - } + if (argc < 2) return 1; - // 1. 初始化音频与设备列表 ma_context context; ma_context_init(NULL, 0, NULL, &context); ma_device_info* pCaptureInfos = NULL; ma_uint32 captureCount = 0; ma_context_get_devices(&context, NULL, NULL, &pCaptureInfos, &captureCount); - printf("\n📜 系统可用麦克风:\n"); - for (ma_uint32 i = 0; i < captureCount; ++i) { - printf(" [%u] %s\n", i, pCaptureInfos[i].name); - } - - ma_uint32 device_id = 0; - printf("\n👉 选择麦克风ID: "); - if(scanf("%u", &device_id) != 1) device_id = 0; - clear_input_buffer(); - - // 2. 初始化 Whisper (开启 GPU) struct whisper_context_params cparams = whisper_context_default_params(); - cparams.use_gpu = true; // 确认开启 GPU - cparams.gpu_device = 0; - - printf("\n🚀 正在加载模型: %s\n", argv[1]); + cparams.use_gpu = true; struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); if (!ctx) return 1; - // 3. 配置录音设备 ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); devCfg.capture.format = ma_format_f32; devCfg.capture.channels = 1; devCfg.sampleRate = 16000; devCfg.dataCallback = data_callback; - if (captureCount > 0) devCfg.capture.pDeviceID = &pCaptureInfos[device_id].id; + if (captureCount > 5) devCfg.capture.pDeviceID = &pCaptureInfos[5].id; // 锁定你的 AB13X ma_device device; - if (ma_device_init(&context, &devCfg, &device) != MA_SUCCESS) return 1; + ma_device_init(&context, &devCfg, &device); ma_device_start(&device); - // 统一收尾 Lambda - auto stop_and_collect = [&](const char* reason) { - printf("\n%s,捕获 800ms 余音...", reason); + while (!exit_program.load()) { + print_status_guide(); // 修复:增加每轮提示 + printf("👉 等待按回车开始..."); fflush(stdout); - std::this_thread::sleep_for(std::chrono::milliseconds(800)); // 保证 30s 结尾不丢包 - is_recording.store(false); - printf("完成。\n"); - }; - while (!exit_program.load()) { - printf("\n👉 按回车键开始录制..."); while (!check_input_non_blocking(50) && !exit_program.load()); if (exit_program.load()) break; clear_input_buffer(); - // 精准计时起点 - { - std::lock_guard lock(buffer_mutex); - audio_buffer.clear(); - } + // 开始录制 + { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } recorded_seconds.store(0); - auto start_time = std::chrono::steady_clock::now(); // 严格对齐 is_recording.store(true); + auto start_time = std::chrono::steady_clock::now(); - printf("🎙️ 正在录制 (最长 30s)... \n"); + printf("\n🎙️ 录制中 (按回车停止)... \n"); std::thread progress_thread([&]() { while (is_recording.load() && !exit_program.load()) { @@ -199,36 +160,33 @@ int main(int argc, char** argv) { bool stopped = false; while (!exit_program.load() && !stopped) { auto now = std::chrono::steady_clock::now(); - double elapsed = std::chrono::duration(now - start_time).count(); + // 修复:使用更精确的毫秒对比,并增加 500ms 冗余以确保达到 30s + double elapsed = std::chrono::duration(now - start_time).count(); - // 1. 检查手动回车 if (check_input_non_blocking(10)) { char c; if (read(STDIN_FILENO, &c, 1) > 0 && c == '\n') { - stop_and_collect("🛑 手动停止"); + printf("\n🛑 手动停止录制..."); stopped = true; } - } - // 2. 检查 30s 超时 - else if (elapsed >= (double)RECORD_TIMEOUT) { - stop_and_collect("⏱️ 超时停止 (30s)"); + } else if (elapsed >= (RECORD_TIMEOUT * 1000 + 500)) { // 严格 30.5 秒逻辑 + printf("\n⏱️ 达到 30 秒限制,自动切断..."); stopped = true; } - std::this_thread::sleep_for(std::chrono::milliseconds(5)); + std::this_thread::sleep_for(std::chrono::milliseconds(2)); } + // 停止回调并捕获尾音 + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + is_recording.store(false); if (progress_thread.joinable()) progress_thread.join(); std::vector captured; - { - std::lock_guard lock(buffer_mutex); - captured = audio_buffer; - } + { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } recognize_audio(ctx, captured); } ma_device_uninit(&device); - ma_context_uninit(&context); whisper_free(ctx); return 0; } \ No newline at end of file From 5b4dfa1c15d8e40126e5f8003501b32358a7dfb5 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 21:30:41 +0800 Subject: [PATCH 16/29] gemini fixed the timeout issue, most likely, but need device/mic selection back --- doubao_mic.cpp | 120 +++++++++++++++++++++++++++++++------------------ 1 file changed, 77 insertions(+), 43 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 24da6956940..4fe00645f32 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -19,7 +19,9 @@ #include #include +// ============================================= // 全局状态管理 +// ============================================= std::atomic is_recording(false); std::atomic exit_program(false); std::atomic recorded_seconds(0); @@ -29,7 +31,11 @@ std::vector audio_buffer; std::mutex buffer_mutex; // 配置常量 -const int RECORD_TIMEOUT = 24; // 目标 30 秒 +int RECORD_TIMEOUT = 30; // 可变,支持动态测试 + +// ============================================= +// 系统工具函数 +// ============================================= void signal_handler(int sig) { if (sig == SIGINT) { @@ -41,6 +47,7 @@ void signal_handler(int sig) { } } +// 非阻塞检查标准输入(带毫秒超时) bool check_input_non_blocking(int timeout_ms = 20) { fd_set fds; FD_ZERO(&fds); @@ -62,7 +69,11 @@ void clear_input_buffer() { } } +// ============================================= +// 音频回调 (硬件驱动层) +// ============================================= void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + // 只要 is_recording 为 true,回调就会持续把数据写入 buffer if (!is_recording.load() || pInput == NULL) return; const float* pInputFloat = (const float*)pInput; std::lock_guard lock(buffer_mutex); @@ -70,34 +81,31 @@ void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uin recorded_seconds.store(static_cast(audio_buffer.size() / 16000.0)); } -void print_status_guide() { - printf("\n=============================================\n"); - printf("🎙️ 操作提示:\n"); - printf(" ▶ [回车键] : 开始录制\n"); - printf(" ■ [回车键] : 停止录制并识别\n"); - printf(" ⏳ [自动停止]: 达到 %d 秒自动截断\n", RECORD_TIMEOUT); - printf("=============================================\n"); -} - +// ============================================= +// 识别逻辑 +// ============================================= void recognize_audio(struct whisper_context* ctx, const std::vector& audio_data) { if (audio_data.empty()) return; float total_sec = (float)audio_data.size() / 16000.0f; - printf("\n🔍 正在识别(总长度:%.2fs)...\n", total_sec); + + printf("\n🔍 正在识别(总采样长度:%.2fs)...\n", total_sec); auto t_start = std::chrono::steady_clock::now(); + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); wparams.language = "zh"; wparams.n_threads = std::max(2, (int)std::thread::hardware_concurrency()); wparams.print_progress = false; if (whisper_full(ctx, wparams, audio_data.data(), audio_data.size()) != 0) { - fprintf(stderr, "❌ 识别失败\n"); + fprintf(stderr, "❌ Whisper 推理失败\n"); return; } auto t_end = std::chrono::steady_clock::now(); float msec = std::chrono::duration(t_end - t_start).count(); - printf("⏱️ 识别耗时:%.2f 秒 | 速度:%.2fx\n", msec/1000.0f, total_sec/(msec/1000.0f)); + + printf("⏱️ 推理耗时:%.2f 秒 | 速度:%.2fx 实时\n", msec/1000.0f, total_sec/(msec/1000.0f)); int n_segments = whisper_full_n_segments(ctx); printf("📝 结果:\n"); @@ -106,87 +114,113 @@ void recognize_audio(struct whisper_context* ctx, const std::vector& audi } } +// ============================================= +// 主程序 +// ============================================= int main(int argc, char** argv) { signal(SIGINT, signal_handler); - if (argc < 2) return 1; - - ma_context context; - ma_context_init(NULL, 0, NULL, &context); - ma_device_info* pCaptureInfos = NULL; - ma_uint32 captureCount = 0; - ma_context_get_devices(&context, NULL, NULL, &pCaptureInfos, &captureCount); + if (argc < 2) { + printf("Usage: %s [timeout_seconds]\n", argv[0]); + return 1; + } + if (argc >= 3) { + RECORD_TIMEOUT = atoi(argv[2]); + } + // 1. Whisper 初始化 (强制 GPU 后端) struct whisper_context_params cparams = whisper_context_default_params(); cparams.use_gpu = true; struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); if (!ctx) return 1; + // 2. 音频设备初始化 (AB13X USB) + ma_context context; + ma_context_init(NULL, 0, NULL, &context); ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); devCfg.capture.format = ma_format_f32; devCfg.capture.channels = 1; devCfg.sampleRate = 16000; devCfg.dataCallback = data_callback; - if (captureCount > 5) devCfg.capture.pDeviceID = &pCaptureInfos[5].id; // 锁定你的 AB13X - + ma_device device; - ma_device_init(&context, &devCfg, &device); + if (ma_device_init(&context, &devCfg, &device) != MA_SUCCESS) return 1; ma_device_start(&device); while (!exit_program.load()) { - print_status_guide(); // 修复:增加每轮提示 - printf("👉 等待按回车开始..."); + printf("\n=============================================\n"); + printf("🎙️ 操作提示 (当前超时: %d秒):\n", RECORD_TIMEOUT); + printf(" ▶ [回车键] : 开始录制\n"); + printf(" ■ [回车键] : 停止录制 (会有1.5秒平滑收尾)\n"); + printf("=============================================\n"); + printf("👉 等待指令..."); fflush(stdout); while (!check_input_non_blocking(50) && !exit_program.load()); if (exit_program.load()) break; clear_input_buffer(); - // 开始录制 + // 重置状态 { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } recorded_seconds.store(0); is_recording.store(true); auto start_time = std::chrono::steady_clock::now(); - printf("\n🎙️ 录制中 (按回车停止)... \n"); + printf("\n🎙️ 正在录制 (进度将在下方实时更新)... \n"); + // 进度显示线程 std::thread progress_thread([&]() { while (is_recording.load() && !exit_program.load()) { - printf("\r📊 进度: %d 秒", recorded_seconds.load()); + printf("\r📊 进度: %d 秒 ", recorded_seconds.load()); fflush(stdout); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); + std::this_thread::sleep_for(std::chrono::milliseconds(200)); } }); - bool stopped = false; - while (!exit_program.load() && !stopped) { + bool stop_triggered = false; + while (!exit_program.load() && !stop_triggered) { auto now = std::chrono::steady_clock::now(); - // 修复:使用更精确的毫秒对比,并增加 500ms 冗余以确保达到 30s - double elapsed = std::chrono::duration(now - start_time).count(); + double elapsed_ms = std::chrono::duration(now - start_time).count(); + // 检查手动停止 if (check_input_non_blocking(10)) { char c; if (read(STDIN_FILENO, &c, 1) > 0 && c == '\n') { - printf("\n🛑 手动停止录制..."); - stopped = true; + printf("\n🛑 检测到手动回车,准备收尾..."); + stop_triggered = true; } - } else if (elapsed >= (RECORD_TIMEOUT * 1000 + 500)) { // 严格 30.5 秒逻辑 - printf("\n⏱️ 达到 30 秒限制,自动切断..."); - stopped = true; + } + // 检查超时停止 + else if (elapsed_ms >= (RECORD_TIMEOUT * 1000)) { + printf("\n⏱️ 达到 %d 秒阈值,准备收尾...", RECORD_TIMEOUT); + stop_triggered = true; } - std::this_thread::sleep_for(std::chrono::milliseconds(2)); + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + } + + // --- 核心改动:平滑收尾逻辑 --- + // 即使触发了停止,我们也不立即关闭 is_recording 开关 + // 这样可以确保正在 ALSA 缓冲区或 DMA 队列里的数据被 data_callback 继续捞走 + if (stop_triggered) { + printf("正在执行 1.5 秒平滑数据刷新..."); + fflush(stdout); + std::this_thread::sleep_for(std::chrono::milliseconds(1500)); + is_recording.store(false); // 此时才真正切断回调写入 + printf("完成。\n"); } - // 停止回调并捕获尾音 - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - is_recording.store(false); if (progress_thread.joinable()) progress_thread.join(); + // 拷贝数据进行识别 std::vector captured; - { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } + { + std::lock_guard lock(buffer_mutex); + captured = audio_buffer; + } recognize_audio(ctx, captured); } ma_device_uninit(&device); + ma_context_uninit(&context); whisper_free(ctx); return 0; } \ No newline at end of file From 95df75f36002bf18856f0d08b63e115b55e0bb79 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 21:35:55 +0800 Subject: [PATCH 17/29] cannot believe gemini also makes human error of boundary checking --- doubao_mic.cpp | 86 ++++++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 49 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 4fe00645f32..d07d114f455 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -19,9 +19,7 @@ #include #include -// ============================================= // 全局状态管理 -// ============================================= std::atomic is_recording(false); std::atomic exit_program(false); std::atomic recorded_seconds(0); @@ -31,12 +29,9 @@ std::vector audio_buffer; std::mutex buffer_mutex; // 配置常量 -int RECORD_TIMEOUT = 30; // 可变,支持动态测试 - -// ============================================= -// 系统工具函数 -// ============================================= +int RECORD_TIMEOUT = 30; +// 信号处理 void signal_handler(int sig) { if (sig == SIGINT) { printf("\n\n🛑 收到退出信号,正在清理资源...\n"); @@ -47,7 +42,7 @@ void signal_handler(int sig) { } } -// 非阻塞检查标准输入(带毫秒超时) +// 非阻塞检查标准输入 bool check_input_non_blocking(int timeout_ms = 20) { fd_set fds; FD_ZERO(&fds); @@ -69,11 +64,8 @@ void clear_input_buffer() { } } -// ============================================= -// 音频回调 (硬件驱动层) -// ============================================= +// 音频采集回调 void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { - // 只要 is_recording 为 true,回调就会持续把数据写入 buffer if (!is_recording.load() || pInput == NULL) return; const float* pInputFloat = (const float*)pInput; std::lock_guard lock(buffer_mutex); @@ -81,17 +73,13 @@ void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uin recorded_seconds.store(static_cast(audio_buffer.size() / 16000.0)); } -// ============================================= // 识别逻辑 -// ============================================= void recognize_audio(struct whisper_context* ctx, const std::vector& audio_data) { if (audio_data.empty()) return; float total_sec = (float)audio_data.size() / 16000.0f; - - printf("\n🔍 正在识别(总采样长度:%.2fs)...\n", total_sec); + printf("\n🔍 正在识别(采样长度:%.2fs)...\n", total_sec); auto t_start = std::chrono::steady_clock::now(); - whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); wparams.language = "zh"; wparams.n_threads = std::max(2, (int)std::thread::hardware_concurrency()); @@ -104,8 +92,7 @@ void recognize_audio(struct whisper_context* ctx, const std::vector& audi auto t_end = std::chrono::steady_clock::now(); float msec = std::chrono::duration(t_end - t_start).count(); - - printf("⏱️ 推理耗时:%.2f 秒 | 速度:%.2fx 实时\n", msec/1000.0f, total_sec/(msec/1000.0f)); + printf("⏱️ 推理耗时:%.2f 秒 | 速度:%.2fx\n", msec/1000.0f, total_sec/(msec/1000.0f)); int n_segments = whisper_full_n_segments(ctx); printf("📝 结果:\n"); @@ -114,43 +101,54 @@ void recognize_audio(struct whisper_context* ctx, const std::vector& audi } } -// ============================================= -// 主程序 -// ============================================= int main(int argc, char** argv) { signal(SIGINT, signal_handler); if (argc < 2) { printf("Usage: %s [timeout_seconds]\n", argv[0]); return 1; } - if (argc >= 3) { - RECORD_TIMEOUT = atoi(argv[2]); - } + if (argc >= 3) RECORD_TIMEOUT = atoi(argv[2]); - // 1. Whisper 初始化 (强制 GPU 后端) + // 1. 初始化 Whisper struct whisper_context_params cparams = whisper_context_default_params(); cparams.use_gpu = true; struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); if (!ctx) return 1; - // 2. 音频设备初始化 (AB13X USB) + // 2. 【补回】设备选择逻辑 ma_context context; ma_context_init(NULL, 0, NULL, &context); + ma_device_info* pCaptureInfos = NULL; + ma_uint32 captureCount = 0; + ma_context_get_devices(&context, NULL, NULL, &pCaptureInfos, &captureCount); + + printf("\n📜 可用麦克风设备列表:\n"); + for (ma_uint32 i = 0; i < captureCount; ++i) { + printf(" [%u] %s\n", i, pCaptureInfos[i].name); + } + + ma_uint32 device_id = 0; + printf("\n👉 请输入要使用的麦克风 ID (默认0): "); + if (scanf("%u", &device_id) != 1) device_id = 0; + clear_input_buffer(); + + // 3. 配置并启动设备 ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); devCfg.capture.format = ma_format_f32; devCfg.capture.channels = 1; devCfg.sampleRate = 16000; devCfg.dataCallback = data_callback; - + if (device_id < captureCount) devCfg.capture.pDeviceID = &pCaptureInfos[device_id].id; + ma_device device; if (ma_device_init(&context, &devCfg, &device) != MA_SUCCESS) return 1; ma_device_start(&device); while (!exit_program.load()) { printf("\n=============================================\n"); - printf("🎙️ 操作提示 (当前超时: %d秒):\n", RECORD_TIMEOUT); - printf(" ▶ [回车键] : 开始录制\n"); - printf(" ■ [回车键] : 停止录制 (会有1.5秒平滑收尾)\n"); + printf("🎙️ 操作提示 (超时设置: %d秒):\n", RECORD_TIMEOUT); + printf(" ▶ [回车] : 开始录制\n"); + printf(" ■ [回车] : 停止录制 (1.5s 平滑收尾)\n"); printf("=============================================\n"); printf("👉 等待指令..."); fflush(stdout); @@ -159,15 +157,13 @@ int main(int argc, char** argv) { if (exit_program.load()) break; clear_input_buffer(); - // 重置状态 { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } recorded_seconds.store(0); is_recording.store(true); auto start_time = std::chrono::steady_clock::now(); - printf("\n🎙️ 正在录制 (进度将在下方实时更新)... \n"); + printf("\n🎙️ 正在录制 (进度实时更新)... \n"); - // 进度显示线程 std::thread progress_thread([&]() { while (is_recording.load() && !exit_program.load()) { printf("\r📊 进度: %d 秒 ", recorded_seconds.load()); @@ -181,36 +177,28 @@ int main(int argc, char** argv) { auto now = std::chrono::steady_clock::now(); double elapsed_ms = std::chrono::duration(now - start_time).count(); - // 检查手动停止 if (check_input_non_blocking(10)) { char c; if (read(STDIN_FILENO, &c, 1) > 0 && c == '\n') { - printf("\n🛑 检测到手动回车,准备收尾..."); + printf("\n🛑 手动停止触发,准备平滑刷新..."); stop_triggered = true; } - } - // 检查超时停止 - else if (elapsed_ms >= (RECORD_TIMEOUT * 1000)) { - printf("\n⏱️ 达到 %d 秒阈值,准备收尾...", RECORD_TIMEOUT); + } else if (elapsed_ms >= (RECORD_TIMEOUT * 1000 + 200)) { + printf("\r📊 进度: %d 秒", RECORD_TIMEOUT); // 强制补全显示 + printf("\n⏱️ 达到设定阈值 (%d秒),准备平滑刷新...", RECORD_TIMEOUT); stop_triggered = true; } - std::this_thread::sleep_for(std::chrono::milliseconds(5)); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); } - // --- 核心改动:平滑收尾逻辑 --- - // 即使触发了停止,我们也不立即关闭 is_recording 开关 - // 这样可以确保正在 ALSA 缓冲区或 DMA 队列里的数据被 data_callback 继续捞走 if (stop_triggered) { - printf("正在执行 1.5 秒平滑数据刷新..."); - fflush(stdout); + // “宁多不少”核心:延时 1.5s 确保 ALSA/DMA 缓冲区数据全部入库 std::this_thread::sleep_for(std::chrono::milliseconds(1500)); - is_recording.store(false); // 此时才真正切断回调写入 - printf("完成。\n"); + is_recording.store(false); } if (progress_thread.joinable()) progress_thread.join(); - // 拷贝数据进行识别 std::vector captured; { std::lock_guard lock(buffer_mutex); From 8eb7806471e1ae77fff956f3d6ea2c89e3d947b3 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 21:47:33 +0800 Subject: [PATCH 18/29] ask gemini to add timeout hint back --- doubao_mic.cpp | 212 ++++++++++++++++++------------------------------- 1 file changed, 76 insertions(+), 136 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index d07d114f455..d65aa40f11d 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -1,214 +1,154 @@ #include "whisper.h" #include "common.h" - #define MINIAUDIO_IMPLEMENTATION #include "miniaudio.h" - #include #include -#include #include #include #include #include -#include -#include -#include #include #include -#include #include +// ============================================= +// 1. 工程常量定义 (拒绝 Magic Numbers) +// ============================================= +struct RecordingConfig { + static constexpr int SAMPLE_RATE = 16000; + static constexpr int PROGRESS_REFRESH_MS = 100; // 进度条刷新频率 + static constexpr int UI_WAIT_MS = 10; // 循环等待步长 + static constexpr int INPUT_CHECK_MS = 20; // 输入检测超时 + static constexpr int POST_STOP_BUFFER_MS = 1500; // 停止后的平滑采样缓冲 + static constexpr int CLOCK_TOLERANCE_MS = 200; // 系统时钟容差(确保跳到目标秒数) +}; + // 全局状态管理 std::atomic is_recording(false); std::atomic exit_program(false); std::atomic recorded_seconds(0); - -// 音频缓冲区与锁 std::vector audio_buffer; std::mutex buffer_mutex; +int g_timeout_setting = 30; // 用户设定的超时秒数 -// 配置常量 -int RECORD_TIMEOUT = 30; - -// 信号处理 +// ============================================= +// 系统辅助函数 +// ============================================= void signal_handler(int sig) { if (sig == SIGINT) { - printf("\n\n🛑 收到退出信号,正在清理资源...\n"); exit_program.store(true); is_recording.store(false); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); exit(0); } } -// 非阻塞检查标准输入 -bool check_input_non_blocking(int timeout_ms = 20) { - fd_set fds; - FD_ZERO(&fds); - FD_SET(STDIN_FILENO, &fds); - struct timeval tv; - tv.tv_sec = 0; - tv.tv_usec = timeout_ms * 1000; - int ret; - do { - ret = select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv); - } while (ret == -1 && errno == EINTR); - return ret > 0; +bool check_input_non_blocking(int timeout_ms = RecordingConfig::INPUT_CHECK_MS) { + fd_set fds; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); + struct timeval tv = {0, timeout_ms * 1000}; + return select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv) > 0; } -void clear_input_buffer() { - while (check_input_non_blocking(5)) { - char c; - read(STDIN_FILENO, &c, 1); - } -} - -// 音频采集回调 +// ============================================= +// 音频回调 +// ============================================= void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { if (!is_recording.load() || pInput == NULL) return; - const float* pInputFloat = (const float*)pInput; std::lock_guard lock(buffer_mutex); - audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + frameCount); - recorded_seconds.store(static_cast(audio_buffer.size() / 16000.0)); -} - -// 识别逻辑 -void recognize_audio(struct whisper_context* ctx, const std::vector& audio_data) { - if (audio_data.empty()) return; - float total_sec = (float)audio_data.size() / 16000.0f; - printf("\n🔍 正在识别(采样长度:%.2fs)...\n", total_sec); - - auto t_start = std::chrono::steady_clock::now(); - whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); - wparams.language = "zh"; - wparams.n_threads = std::max(2, (int)std::thread::hardware_concurrency()); - wparams.print_progress = false; - - if (whisper_full(ctx, wparams, audio_data.data(), audio_data.size()) != 0) { - fprintf(stderr, "❌ Whisper 推理失败\n"); - return; - } - - auto t_end = std::chrono::steady_clock::now(); - float msec = std::chrono::duration(t_end - t_start).count(); - printf("⏱️ 推理耗时:%.2f 秒 | 速度:%.2fx\n", msec/1000.0f, total_sec/(msec/1000.0f)); - - int n_segments = whisper_full_n_segments(ctx); - printf("📝 结果:\n"); - for (int i = 0; i < n_segments; ++i) { - printf(" %s\n", whisper_full_get_segment_text(ctx, i)); - } + audio_buffer.insert(audio_buffer.end(), (float*)pInput, (float*)pInput + frameCount); + recorded_seconds.store(static_cast(audio_buffer.size() / (float)RecordingConfig::SAMPLE_RATE)); } +// ============================================= +// 主逻辑 +// ============================================= int main(int argc, char** argv) { signal(SIGINT, signal_handler); - if (argc < 2) { - printf("Usage: %s [timeout_seconds]\n", argv[0]); - return 1; - } - if (argc >= 3) RECORD_TIMEOUT = atoi(argv[2]); + if (argc < 2) return 1; + if (argc >= 3) g_timeout_setting = atoi(argv[2]); - // 1. 初始化 Whisper + // 初始化 Whisper struct whisper_context_params cparams = whisper_context_default_params(); - cparams.use_gpu = true; + cparams.use_gpu = true; struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); - if (!ctx) return 1; - - // 2. 【补回】设备选择逻辑 - ma_context context; - ma_context_init(NULL, 0, NULL, &context); - ma_device_info* pCaptureInfos = NULL; - ma_uint32 captureCount = 0; - ma_context_get_devices(&context, NULL, NULL, &pCaptureInfos, &captureCount); - - printf("\n📜 可用麦克风设备列表:\n"); - for (ma_uint32 i = 0; i < captureCount; ++i) { - printf(" [%u] %s\n", i, pCaptureInfos[i].name); - } - ma_uint32 device_id = 0; - printf("\n👉 请输入要使用的麦克风 ID (默认0): "); - if (scanf("%u", &device_id) != 1) device_id = 0; - clear_input_buffer(); + // 麦克风设备枚举与选择 + ma_context context; ma_context_init(NULL, 0, NULL, &context); + ma_device_info* pCapInfos; ma_uint32 capCount; + ma_context_get_devices(&context, NULL, NULL, &pCapInfos, &capCount); + for (ma_uint32 i = 0; i < capCount; ++i) printf("[%u] %s\n", i, pCapInfos[i].name); + printf("👉 请输入设备 ID: "); + ma_uint32 dev_id; scanf("%u", &dev_id); + while (getchar() != '\n'); - // 3. 配置并启动设备 ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); - devCfg.capture.format = ma_format_f32; + devCfg.capture.format = ma_format_f32; devCfg.capture.channels = 1; - devCfg.sampleRate = 16000; + devCfg.sampleRate = RecordingConfig::SAMPLE_RATE; devCfg.dataCallback = data_callback; - if (device_id < captureCount) devCfg.capture.pDeviceID = &pCaptureInfos[device_id].id; + if (dev_id < capCount) devCfg.capture.pDeviceID = &pCapInfos[dev_id].id; - ma_device device; - if (ma_device_init(&context, &devCfg, &device) != MA_SUCCESS) return 1; + ma_device device; ma_device_init(&context, &devCfg, &device); ma_device_start(&device); while (!exit_program.load()) { - printf("\n=============================================\n"); - printf("🎙️ 操作提示 (超时设置: %d秒):\n", RECORD_TIMEOUT); - printf(" ▶ [回车] : 开始录制\n"); - printf(" ■ [回车] : 停止录制 (1.5s 平滑收尾)\n"); - printf("=============================================\n"); - printf("👉 等待指令..."); - fflush(stdout); - + printf("\n[回车] 录制 | [回车] 停止\n👉 等待指令..."); while (!check_input_non_blocking(50) && !exit_program.load()); if (exit_program.load()) break; - clear_input_buffer(); + while (check_input_non_blocking(0)) getchar(); { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } - recorded_seconds.store(0); is_recording.store(true); auto start_time = std::chrono::steady_clock::now(); - printf("\n🎙️ 正在录制 (进度实时更新)... \n"); - + // 进度显示线程 std::thread progress_thread([&]() { - while (is_recording.load() && !exit_program.load()) { + while (is_recording.load()) { printf("\r📊 进度: %d 秒 ", recorded_seconds.load()); fflush(stdout); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::PROGRESS_REFRESH_MS)); } }); bool stop_triggered = false; - while (!exit_program.load() && !stop_triggered) { + while (!stop_triggered && !exit_program.load()) { auto now = std::chrono::steady_clock::now(); - double elapsed_ms = std::chrono::duration(now - start_time).count(); + auto elapsed = std::chrono::duration_cast(now - start_time).count(); - if (check_input_non_blocking(10)) { - char c; - if (read(STDIN_FILENO, &c, 1) > 0 && c == '\n') { - printf("\n🛑 手动停止触发,准备平滑刷新..."); - stop_triggered = true; + if (check_input_non_blocking(RecordingConfig::UI_WAIT_MS)) { + if (getchar() == '\n') { + printf("\n🛑 手动停止 (进入平滑刷新模式)..."); + stop_triggered = true; } - } else if (elapsed_ms >= (RECORD_TIMEOUT * 1000 + 200)) { - printf("\r📊 进度: %d 秒", RECORD_TIMEOUT); // 强制补全显示 - printf("\n⏱️ 达到设定阈值 (%d秒),准备平滑刷新...", RECORD_TIMEOUT); + } + // 修正边界:加上 CLOCK_TOLERANCE_MS 确保进度条在视觉上能显示到设定的秒数 + else if (elapsed >= (g_timeout_setting * 1000 + RecordingConfig::CLOCK_TOLERANCE_MS)) { + printf("\r📊 进度: %d 秒 ", g_timeout_setting); // 强制补完最后一秒显示 + printf("\n⏱️ 超时停止 (%d秒,进入平滑刷新模式)...", g_timeout_setting); stop_triggered = true; } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - - if (stop_triggered) { - // “宁多不少”核心:延时 1.5s 确保 ALSA/DMA 缓冲区数据全部入库 - std::this_thread::sleep_for(std::chrono::milliseconds(1500)); - is_recording.store(false); } + // 平滑刷新:等待硬件缓冲区数据入库,防止丢字 + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::POST_STOP_BUFFER_MS)); + is_recording.store(false); + if (progress_thread.joinable()) progress_thread.join(); + // 识别逻辑 std::vector captured; - { - std::lock_guard lock(buffer_mutex); - captured = audio_buffer; - } - recognize_audio(ctx, captured); + { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } + + printf("\n🔍 识别中 (音频长: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + wparams.n_threads = 4; + whisper_full(ctx, wparams, captured.data(), captured.size()); + + int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) printf("\n📝 %s", whisper_full_get_segment_text(ctx, i)); + printf("\n"); } - ma_device_uninit(&device); - ma_context_uninit(&context); - whisper_free(ctx); return 0; } \ No newline at end of file From 26ad60078352ba172833d13465f2de7fa4c79808 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 21:56:29 +0800 Subject: [PATCH 19/29] gemini try to solve traditional chinese issue --- doubao_mic.cpp | 120 ++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 52 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index d65aa40f11d..1ca7ecf939d 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -13,28 +13,24 @@ #include // ============================================= -// 1. 工程常量定义 (拒绝 Magic Numbers) +// 工程常量:全局统一管理,杜绝 Magic Numbers // ============================================= struct RecordingConfig { static constexpr int SAMPLE_RATE = 16000; - static constexpr int PROGRESS_REFRESH_MS = 100; // 进度条刷新频率 - static constexpr int UI_WAIT_MS = 10; // 循环等待步长 - static constexpr int INPUT_CHECK_MS = 20; // 输入检测超时 - static constexpr int POST_STOP_BUFFER_MS = 1500; // 停止后的平滑采样缓冲 - static constexpr int CLOCK_TOLERANCE_MS = 200; // 系统时钟容差(确保跳到目标秒数) + static constexpr int PROGRESS_MS = 100; // 进度条刷新间隔 + static constexpr int UI_LOOP_MS = 10; // UI循环步长 + static constexpr int SELECT_TIMEOUT_MS = 20; // select 超时 + static constexpr int SMOOTH_FINISH_MS = 1500; // 平滑收尾时长 (1.5秒) + static constexpr int CLOCK_TOLERANCE_MS = 300; // 边界容差,确保显示完整 }; -// 全局状态管理 std::atomic is_recording(false); std::atomic exit_program(false); std::atomic recorded_seconds(0); std::vector audio_buffer; std::mutex buffer_mutex; -int g_timeout_setting = 30; // 用户设定的超时秒数 +int g_timeout_limit = 30; // 默认 30s -// ============================================= -// 系统辅助函数 -// ============================================= void signal_handler(int sig) { if (sig == SIGINT) { exit_program.store(true); @@ -43,15 +39,18 @@ void signal_handler(int sig) { } } -bool check_input_non_blocking(int timeout_ms = RecordingConfig::INPUT_CHECK_MS) { +// 非阻塞输入检测 +bool check_stdin_ready(int timeout_ms = RecordingConfig::SELECT_TIMEOUT_MS) { fd_set fds; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); struct timeval tv = {0, timeout_ms * 1000}; return select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv) > 0; } -// ============================================= -// 音频回调 -// ============================================= +void clear_stdin() { + while (check_stdin_ready(0)) getchar(); +} + +// 音频采集回调 void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { if (!is_recording.load() || pInput == NULL) return; std::lock_guard lock(buffer_mutex); @@ -59,96 +58,113 @@ void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uin recorded_seconds.store(static_cast(audio_buffer.size() / (float)RecordingConfig::SAMPLE_RATE)); } -// ============================================= -// 主逻辑 -// ============================================= int main(int argc, char** argv) { signal(SIGINT, signal_handler); - if (argc < 2) return 1; - if (argc >= 3) g_timeout_setting = atoi(argv[2]); + if (argc < 2) { + printf("用法: %s <模型路径> [超时秒数]\n", argv[0]); + return 1; + } + if (argc >= 3) g_timeout_limit = atoi(argv[2]); - // 初始化 Whisper struct whisper_context_params cparams = whisper_context_default_params(); cparams.use_gpu = true; struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); - // 麦克风设备枚举与选择 + // 设备枚举与选择 ma_context context; ma_context_init(NULL, 0, NULL, &context); ma_device_info* pCapInfos; ma_uint32 capCount; ma_context_get_devices(&context, NULL, NULL, &pCapInfos, &capCount); - for (ma_uint32 i = 0; i < capCount; ++i) printf("[%u] %s\n", i, pCapInfos[i].name); - printf("👉 请输入设备 ID: "); - ma_uint32 dev_id; scanf("%u", &dev_id); - while (getchar() != '\n'); + printf("\n📜 可用麦克风列表:\n"); + for (ma_uint32 i = 0; i < capCount; ++i) printf(" [%u] %s\n", i, pCapInfos[i].name); + printf("👉 请输入设备 ID (默认5): "); + ma_uint32 dev_id = 5; + if(scanf("%u", &dev_id) != 1) dev_id = 5; + clear_stdin(); ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); - devCfg.capture.format = ma_format_f32; - devCfg.capture.channels = 1; - devCfg.sampleRate = RecordingConfig::SAMPLE_RATE; - devCfg.dataCallback = data_callback; + devCfg.capture.format = ma_format_f32; devCfg.capture.channels = 1; + devCfg.sampleRate = RecordingConfig::SAMPLE_RATE; devCfg.dataCallback = data_callback; if (dev_id < capCount) devCfg.capture.pDeviceID = &pCapInfos[dev_id].id; ma_device device; ma_device_init(&context, &devCfg, &device); ma_device_start(&device); while (!exit_program.load()) { - printf("\n[回车] 录制 | [回车] 停止\n👉 等待指令..."); - while (!check_input_non_blocking(50) && !exit_program.load()); + printf("\n=============================================\n"); + printf("🎙️ 操作提示 (自动断开设置: %d 秒):\n", g_timeout_limit); + printf(" ▶ [回车键] : 开始录制\n"); + printf(" ■ [回车键] : 停止录制 (含 %.1f 秒补录)\n", (float)RecordingConfig::SMOOTH_FINISH_MS/1000.0f); + printf("=============================================\n"); + printf("👉 等待指令..."); + fflush(stdout); + + while (!check_stdin_ready(100) && !exit_program.load()); if (exit_program.load()) break; - while (check_input_non_blocking(0)) getchar(); + clear_stdin(); { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } + recorded_seconds.store(0); is_recording.store(true); auto start_time = std::chrono::steady_clock::now(); - // 进度显示线程 + printf("\n🎙️ 录制中...\n"); + std::thread progress_thread([&]() { while (is_recording.load()) { printf("\r📊 进度: %d 秒 ", recorded_seconds.load()); fflush(stdout); - std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::PROGRESS_REFRESH_MS)); + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::PROGRESS_MS)); } }); - bool stop_triggered = false; - while (!stop_triggered && !exit_program.load()) { + bool trigger_stop = false; + while (!trigger_stop && !exit_program.load()) { auto now = std::chrono::steady_clock::now(); auto elapsed = std::chrono::duration_cast(now - start_time).count(); - if (check_input_non_blocking(RecordingConfig::UI_WAIT_MS)) { - if (getchar() == '\n') { - printf("\n🛑 手动停止 (进入平滑刷新模式)..."); - stop_triggered = true; + if (check_stdin_ready(RecordingConfig::UI_LOOP_MS)) { + if (getchar() == '\n') { + printf("\n🛑 手动停止,正在收尾以确保不丢字..."); + trigger_stop = true; } } - // 修正边界:加上 CLOCK_TOLERANCE_MS 确保进度条在视觉上能显示到设定的秒数 - else if (elapsed >= (g_timeout_setting * 1000 + RecordingConfig::CLOCK_TOLERANCE_MS)) { - printf("\r📊 进度: %d 秒 ", g_timeout_setting); // 强制补完最后一秒显示 - printf("\n⏱️ 超时停止 (%d秒,进入平滑刷新模式)...", g_timeout_setting); - stop_triggered = true; + // 关键:n + 容差,确保进度条能显示出最后那一秒 + else if (elapsed >= (g_timeout_limit * 1000 + RecordingConfig::CLOCK_TOLERANCE_MS)) { + printf("\r📊 进度: %d 秒 ", g_timeout_limit); + printf("\n⏱️ 时间已到 (%d秒),正在自动收尾...", g_timeout_limit); + trigger_stop = true; } } - // 平滑刷新:等待硬件缓冲区数据入库,防止丢字 - std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::POST_STOP_BUFFER_MS)); + // 执行平滑刷新 + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::SMOOTH_FINISH_MS)); is_recording.store(false); - if (progress_thread.joinable()) progress_thread.join(); - // 识别逻辑 std::vector captured; { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } - printf("\n🔍 识别中 (音频长: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); + printf("\n🔍 正在识别 (音频长度: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); wparams.language = "zh"; + // 2. 【核心修正】注入简体中文引导词,强制模型输出简体 + // "以下是普通话的句子。" 作为一个初始提示(Prompt) + wparams.initial_prompt = "以下是普通话的句子,使用简体中文。"; + // 3. 翻译控制(确保不开启翻译模式) + wparams.translate = false; wparams.n_threads = 4; whisper_full(ctx, wparams, captured.data(), captured.size()); int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) printf("\n📝 %s", whisper_full_get_segment_text(ctx, i)); + printf("\n📝 识别结果:"); + for (int i = 0; i < n_segments; ++i) { + printf("\n %s", whisper_full_get_segment_text(ctx, i)); + } printf("\n"); } + ma_device_uninit(&device); + ma_context_uninit(&context); + whisper_free(ctx); return 0; } \ No newline at end of file From 3c6350863c854812e7f297f4f30915e0c26978d4 Mon Sep 17 00:00:00 2001 From: nick huang Date: Tue, 17 Mar 2026 22:13:26 +0800 Subject: [PATCH 20/29] finally let's close this project --- doubao_mic.cpp | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 1ca7ecf939d..319519e0ee0 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -13,23 +13,26 @@ #include // ============================================= -// 工程常量:全局统一管理,杜绝 Magic Numbers +// 工程常量:全局统一管理 // ============================================= struct RecordingConfig { static constexpr int SAMPLE_RATE = 16000; - static constexpr int PROGRESS_MS = 100; // 进度条刷新间隔 - static constexpr int UI_LOOP_MS = 10; // UI循环步长 - static constexpr int SELECT_TIMEOUT_MS = 20; // select 超时 - static constexpr int SMOOTH_FINISH_MS = 1500; // 平滑收尾时长 (1.5秒) - static constexpr int CLOCK_TOLERANCE_MS = 300; // 边界容差,确保显示完整 + static constexpr int PROGRESS_MS = 100; + static constexpr int UI_LOOP_MS = 10; + static constexpr int SELECT_TIMEOUT_MS = 20; + static constexpr int SMOOTH_FINISH_MS = 1500; + static constexpr int CLOCK_TOLERANCE_MS = 350; + // 用于清理控制台残余的空格 + static const char* CLEAR_LINE; }; +const char* RecordingConfig::CLEAR_LINE = " "; std::atomic is_recording(false); std::atomic exit_program(false); std::atomic recorded_seconds(0); std::vector audio_buffer; std::mutex buffer_mutex; -int g_timeout_limit = 30; // 默认 30s +int g_timeout_limit = 30; void signal_handler(int sig) { if (sig == SIGINT) { @@ -39,7 +42,6 @@ void signal_handler(int sig) { } } -// 非阻塞输入检测 bool check_stdin_ready(int timeout_ms = RecordingConfig::SELECT_TIMEOUT_MS) { fd_set fds; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); struct timeval tv = {0, timeout_ms * 1000}; @@ -50,7 +52,6 @@ void clear_stdin() { while (check_stdin_ready(0)) getchar(); } -// 音频采集回调 void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { if (!is_recording.load() || pInput == NULL) return; std::lock_guard lock(buffer_mutex); @@ -60,20 +61,17 @@ void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uin int main(int argc, char** argv) { signal(SIGINT, signal_handler); - if (argc < 2) { - printf("用法: %s <模型路径> [超时秒数]\n", argv[0]); - return 1; - } + if (argc < 2) return 1; if (argc >= 3) g_timeout_limit = atoi(argv[2]); struct whisper_context_params cparams = whisper_context_default_params(); cparams.use_gpu = true; struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); - // 设备枚举与选择 ma_context context; ma_context_init(NULL, 0, NULL, &context); ma_device_info* pCapInfos; ma_uint32 capCount; ma_context_get_devices(&context, NULL, NULL, &pCapInfos, &capCount); + printf("\n📜 可用麦克风列表:\n"); for (ma_uint32 i = 0; i < capCount; ++i) printf(" [%u] %s\n", i, pCapInfos[i].name); printf("👉 请输入设备 ID (默认5): "); @@ -93,7 +91,7 @@ int main(int argc, char** argv) { printf("\n=============================================\n"); printf("🎙️ 操作提示 (自动断开设置: %d 秒):\n", g_timeout_limit); printf(" ▶ [回车键] : 开始录制\n"); - printf(" ■ [回车键] : 停止录制 (含 %.1f 秒补录)\n", (float)RecordingConfig::SMOOTH_FINISH_MS/1000.0f); + printf(" ■ [回车键] : 停止录制 (含 1.5 秒补录)\n"); printf("=============================================\n"); printf("👉 等待指令..."); fflush(stdout); @@ -111,7 +109,7 @@ int main(int argc, char** argv) { std::thread progress_thread([&]() { while (is_recording.load()) { - printf("\r📊 进度: %d 秒 ", recorded_seconds.load()); + printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, recorded_seconds.load()); fflush(stdout); std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::PROGRESS_MS)); } @@ -124,19 +122,20 @@ int main(int argc, char** argv) { if (check_stdin_ready(RecordingConfig::UI_LOOP_MS)) { if (getchar() == '\n') { - printf("\n🛑 手动停止,正在收尾以确保不丢字..."); + // 使用 \r 覆盖并清理残余 + printf("\r%s\r🛑 手动停止,正在收尾以确保不丢字...", RecordingConfig::CLEAR_LINE); + fflush(stdout); trigger_stop = true; } } - // 关键:n + 容差,确保进度条能显示出最后那一秒 else if (elapsed >= (g_timeout_limit * 1000 + RecordingConfig::CLOCK_TOLERANCE_MS)) { - printf("\r📊 进度: %d 秒 ", g_timeout_limit); + printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, g_timeout_limit); printf("\n⏱️ 时间已到 (%d秒),正在自动收尾...", g_timeout_limit); + fflush(stdout); trigger_stop = true; } } - // 执行平滑刷新 std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::SMOOTH_FINISH_MS)); is_recording.store(false); if (progress_thread.joinable()) progress_thread.join(); @@ -145,14 +144,13 @@ int main(int argc, char** argv) { { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } printf("\n🔍 正在识别 (音频长度: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); wparams.language = "zh"; - // 2. 【核心修正】注入简体中文引导词,强制模型输出简体 - // "以下是普通话的句子。" 作为一个初始提示(Prompt) - wparams.initial_prompt = "以下是普通话的句子,使用简体中文。"; - // 3. 翻译控制(确保不开启翻译模式) - wparams.translate = false; + // 【正式固化】简体中文引导词 + wparams.initial_prompt = "以下是普通话,使用简体中文输出。"; wparams.n_threads = 4; + whisper_full(ctx, wparams, captured.data(), captured.size()); int n_segments = whisper_full_n_segments(ctx); From 9e0b95ecf943d81634847f36cc30c5efbfc32165 Mon Sep 17 00:00:00 2001 From: nick huang Date: Wed, 18 Mar 2026 05:57:28 +0800 Subject: [PATCH 21/29] finalize project by adding recogniztionduration --- compile.txt | 2 ++ doubao_mic.cpp | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/compile.txt b/compile.txt index 05b362271ee..0ba4efe4b29 100644 --- a/compile.txt +++ b/compile.txt @@ -6,6 +6,8 @@ g++ -O3 minimal_mic.cpp \ g++ -O3 doubao_mic.cpp -I. -I./include -I./ggml/include -I./examples ./build_gpu/src/libwhisper.so -L/usr/local/cuda/lib64 -lcudart -lcublas -lportaudio -lpthread -ldl -lm -lrt -o doubao_mic.exe +g++ -std=c++17 -O3 doubao_mic.cpp -I. -I./include -I./ggml/include -I./examples ./build_gpu/src/libwhisper.so -L/usr/local/cuda-13.1/lib64 -lcudart -lcublas -lportaudio -lpthread -ldl -lm -lrt -o doubao_mic.exe + gpu build: # 显式指定使用 CUDA 13.1 编译 cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc diff --git a/doubao_mic.cpp b/doubao_mic.cpp index 319519e0ee0..1134d59e628 100644 --- a/doubao_mic.cpp +++ b/doubao_mic.cpp @@ -144,6 +144,7 @@ int main(int argc, char** argv) { { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } printf("\n🔍 正在识别 (音频长度: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); + auto start_recognition = std::chrono::steady_clock::now(); whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); wparams.language = "zh"; @@ -154,7 +155,8 @@ int main(int argc, char** argv) { whisper_full(ctx, wparams, captured.data(), captured.size()); int n_segments = whisper_full_n_segments(ctx); - printf("\n📝 识别结果:"); + auto elapsed = std::chrono::duration_cast(std::chrono::steady_clock::now() - start_recognition).count(); + printf("\n📝 识别结果(%.2f秒):", elapsed/1000.0f); for (int i = 0; i < n_segments; ++i) { printf("\n %s", whisper_full_get_segment_text(ctx, i)); } From ce629ed7721159baaa02bbe8d40349fc584ee099 Mon Sep 17 00:00:00 2001 From: nick huang Date: Fri, 20 Mar 2026 06:27:30 +0800 Subject: [PATCH 22/29] Move doubao_mic.cpp into examples/ directory to begin integration as an example --- examples/doubao_mic.cpp | 170 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 examples/doubao_mic.cpp diff --git a/examples/doubao_mic.cpp b/examples/doubao_mic.cpp new file mode 100644 index 00000000000..79afd8e79c4 --- /dev/null +++ b/examples/doubao_mic.cpp @@ -0,0 +1,170 @@ +#include "whisper.h" +#include "common.h" +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ============================================= +// 工程常量:全局统一管理 +// ============================================= +struct RecordingConfig { + static constexpr int SAMPLE_RATE = 16000; + static constexpr int PROGRESS_MS = 100; + static constexpr int UI_LOOP_MS = 10; + static constexpr int SELECT_TIMEOUT_MS = 20; + static constexpr int SMOOTH_FINISH_MS = 1500; + static constexpr int CLOCK_TOLERANCE_MS = 350; + // 用于清理控制台残余的空格 + static const char* CLEAR_LINE; +}; +const char* RecordingConfig::CLEAR_LINE = " "; + +std::atomic is_recording(false); +std::atomic exit_program(false); +std::atomic recorded_seconds(0); +std::vector audio_buffer; +std::mutex buffer_mutex; +int g_timeout_limit = 30; + +void signal_handler(int sig) { + if (sig == SIGINT) { + exit_program.store(true); + is_recording.store(false); + exit(0); + } +} + +bool check_stdin_ready(int timeout_ms = RecordingConfig::SELECT_TIMEOUT_MS) { + fd_set fds; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); + struct timeval tv = {0, timeout_ms * 1000}; + return select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv) > 0; +} + +void clear_stdin() { + while (check_stdin_ready(0)) getchar(); +} + +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + if (!is_recording.load() || pInput == NULL) return; + std::lock_guard lock(buffer_mutex); + audio_buffer.insert(audio_buffer.end(), (float*)pInput, (float*)pInput + frameCount); + recorded_seconds.store(static_cast(audio_buffer.size() / (float)RecordingConfig::SAMPLE_RATE)); +} + +int main(int argc, char** argv) { + signal(SIGINT, signal_handler); + if (argc < 2) return 1; + if (argc >= 3) g_timeout_limit = atoi(argv[2]); + + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = true; + struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); + + ma_context context; ma_context_init(NULL, 0, NULL, &context); + ma_device_info* pCapInfos; ma_uint32 capCount; + ma_context_get_devices(&context, NULL, NULL, &pCapInfos, &capCount); + + printf("\n📜 可用麦克风列表:\n"); + for (ma_uint32 i = 0; i < capCount; ++i) printf(" [%u] %s\n", i, pCapInfos[i].name); + printf("👉 请输入设备 ID (默认5): "); + ma_uint32 dev_id = 5; + if(scanf("%u", &dev_id) != 1) dev_id = 5; + clear_stdin(); + + ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); + devCfg.capture.format = ma_format_f32; devCfg.capture.channels = 1; + devCfg.sampleRate = RecordingConfig::SAMPLE_RATE; devCfg.dataCallback = data_callback; + if (dev_id < capCount) devCfg.capture.pDeviceID = &pCapInfos[dev_id].id; + + ma_device device; ma_device_init(&context, &devCfg, &device); + ma_device_start(&device); + + while (!exit_program.load()) { + printf("\n=============================================\n"); + printf("🎙️ 操作提示 (自动断开设置: %d 秒):\n", g_timeout_limit); + printf(" ▶ [回车键] : 开始录制\n"); + printf(" ■ [回车键] : 停止录制 (含 1.5 秒补录)\n"); + printf("=============================================\n"); + printf("👉 等待指令..."); + fflush(stdout); + + while (!check_stdin_ready(100) && !exit_program.load()); + if (exit_program.load()) break; + clear_stdin(); + + { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } + recorded_seconds.store(0); + is_recording.store(true); + auto start_time = std::chrono::steady_clock::now(); + + printf("\n🎙️ 录制中...\n"); + + std::thread progress_thread([&]() { + while (is_recording.load()) { + printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, recorded_seconds.load()); + fflush(stdout); + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::PROGRESS_MS)); + } + }); + + bool trigger_stop = false; + while (!trigger_stop && !exit_program.load()) { + auto now = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast(now - start_time).count(); + + if (check_stdin_ready(RecordingConfig::UI_LOOP_MS)) { + if (getchar() == '\n') { + // 使用 \r 覆盖并清理残余 + printf("\r%s\r🛑 手动停止,正在收尾以确保不丢字...", RecordingConfig::CLEAR_LINE); + fflush(stdout); + trigger_stop = true; + } + } + else if (elapsed >= (g_timeout_limit * 1000 + RecordingConfig::CLOCK_TOLERANCE_MS)) { + printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, g_timeout_limit); + printf("\n⏱️ 时间已到 (%d秒),正在自动收尾...", g_timeout_limit); + fflush(stdout); + trigger_stop = true; + } + } + + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::SMOOTH_FINISH_MS)); + is_recording.store(false); + if (progress_thread.joinable()) progress_thread.join(); + + std::vector captured; + { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } + + printf("\n🔍 正在识别 (音频长度: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); + auto start_recognition = std::chrono::steady_clock::now(); + + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + // 【正式固化】简体中文引导词 + wparams.initial_prompt = "以下是普通话,使用简体中文输出。"; + wparams.n_threads = 4; + + whisper_full(ctx, wparams, captured.data(), captured.size()); + + int n_segments = whisper_full_n_segments(ctx); + auto elapsed = std::chrono::duration_cast(std::chrono::steady_clock::now() - start_recognition).count(); + printf("\n📝 识别结果(%.2f秒):", elapsed/1000.0f); + for (int i = 0; i < n_segments; ++i) { + printf("\n %s", whisper_full_get_segment_text(ctx, i)); + } + printf("\n"); + } + + ma_device_uninit(&device); + ma_context_uninit(&context); + whisper_free(ctx); + return 0; +} From 0a45739a85ead3bb0660271fe026b414cd146a22 Mon Sep 17 00:00:00 2001 From: nick huang Date: Fri, 20 Mar 2026 06:33:25 +0800 Subject: [PATCH 23/29] Move and rename doubao_mic.cpp to examples/mic/whisper-mic.cpp following project naming and structure conventions --- examples/mic/whisper-mic.cpp | 170 +++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 examples/mic/whisper-mic.cpp diff --git a/examples/mic/whisper-mic.cpp b/examples/mic/whisper-mic.cpp new file mode 100644 index 00000000000..79afd8e79c4 --- /dev/null +++ b/examples/mic/whisper-mic.cpp @@ -0,0 +1,170 @@ +#include "whisper.h" +#include "common.h" +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ============================================= +// 工程常量:全局统一管理 +// ============================================= +struct RecordingConfig { + static constexpr int SAMPLE_RATE = 16000; + static constexpr int PROGRESS_MS = 100; + static constexpr int UI_LOOP_MS = 10; + static constexpr int SELECT_TIMEOUT_MS = 20; + static constexpr int SMOOTH_FINISH_MS = 1500; + static constexpr int CLOCK_TOLERANCE_MS = 350; + // 用于清理控制台残余的空格 + static const char* CLEAR_LINE; +}; +const char* RecordingConfig::CLEAR_LINE = " "; + +std::atomic is_recording(false); +std::atomic exit_program(false); +std::atomic recorded_seconds(0); +std::vector audio_buffer; +std::mutex buffer_mutex; +int g_timeout_limit = 30; + +void signal_handler(int sig) { + if (sig == SIGINT) { + exit_program.store(true); + is_recording.store(false); + exit(0); + } +} + +bool check_stdin_ready(int timeout_ms = RecordingConfig::SELECT_TIMEOUT_MS) { + fd_set fds; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); + struct timeval tv = {0, timeout_ms * 1000}; + return select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv) > 0; +} + +void clear_stdin() { + while (check_stdin_ready(0)) getchar(); +} + +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + if (!is_recording.load() || pInput == NULL) return; + std::lock_guard lock(buffer_mutex); + audio_buffer.insert(audio_buffer.end(), (float*)pInput, (float*)pInput + frameCount); + recorded_seconds.store(static_cast(audio_buffer.size() / (float)RecordingConfig::SAMPLE_RATE)); +} + +int main(int argc, char** argv) { + signal(SIGINT, signal_handler); + if (argc < 2) return 1; + if (argc >= 3) g_timeout_limit = atoi(argv[2]); + + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = true; + struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); + + ma_context context; ma_context_init(NULL, 0, NULL, &context); + ma_device_info* pCapInfos; ma_uint32 capCount; + ma_context_get_devices(&context, NULL, NULL, &pCapInfos, &capCount); + + printf("\n📜 可用麦克风列表:\n"); + for (ma_uint32 i = 0; i < capCount; ++i) printf(" [%u] %s\n", i, pCapInfos[i].name); + printf("👉 请输入设备 ID (默认5): "); + ma_uint32 dev_id = 5; + if(scanf("%u", &dev_id) != 1) dev_id = 5; + clear_stdin(); + + ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); + devCfg.capture.format = ma_format_f32; devCfg.capture.channels = 1; + devCfg.sampleRate = RecordingConfig::SAMPLE_RATE; devCfg.dataCallback = data_callback; + if (dev_id < capCount) devCfg.capture.pDeviceID = &pCapInfos[dev_id].id; + + ma_device device; ma_device_init(&context, &devCfg, &device); + ma_device_start(&device); + + while (!exit_program.load()) { + printf("\n=============================================\n"); + printf("🎙️ 操作提示 (自动断开设置: %d 秒):\n", g_timeout_limit); + printf(" ▶ [回车键] : 开始录制\n"); + printf(" ■ [回车键] : 停止录制 (含 1.5 秒补录)\n"); + printf("=============================================\n"); + printf("👉 等待指令..."); + fflush(stdout); + + while (!check_stdin_ready(100) && !exit_program.load()); + if (exit_program.load()) break; + clear_stdin(); + + { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } + recorded_seconds.store(0); + is_recording.store(true); + auto start_time = std::chrono::steady_clock::now(); + + printf("\n🎙️ 录制中...\n"); + + std::thread progress_thread([&]() { + while (is_recording.load()) { + printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, recorded_seconds.load()); + fflush(stdout); + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::PROGRESS_MS)); + } + }); + + bool trigger_stop = false; + while (!trigger_stop && !exit_program.load()) { + auto now = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast(now - start_time).count(); + + if (check_stdin_ready(RecordingConfig::UI_LOOP_MS)) { + if (getchar() == '\n') { + // 使用 \r 覆盖并清理残余 + printf("\r%s\r🛑 手动停止,正在收尾以确保不丢字...", RecordingConfig::CLEAR_LINE); + fflush(stdout); + trigger_stop = true; + } + } + else if (elapsed >= (g_timeout_limit * 1000 + RecordingConfig::CLOCK_TOLERANCE_MS)) { + printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, g_timeout_limit); + printf("\n⏱️ 时间已到 (%d秒),正在自动收尾...", g_timeout_limit); + fflush(stdout); + trigger_stop = true; + } + } + + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::SMOOTH_FINISH_MS)); + is_recording.store(false); + if (progress_thread.joinable()) progress_thread.join(); + + std::vector captured; + { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } + + printf("\n🔍 正在识别 (音频长度: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); + auto start_recognition = std::chrono::steady_clock::now(); + + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + // 【正式固化】简体中文引导词 + wparams.initial_prompt = "以下是普通话,使用简体中文输出。"; + wparams.n_threads = 4; + + whisper_full(ctx, wparams, captured.data(), captured.size()); + + int n_segments = whisper_full_n_segments(ctx); + auto elapsed = std::chrono::duration_cast(std::chrono::steady_clock::now() - start_recognition).count(); + printf("\n📝 识别结果(%.2f秒):", elapsed/1000.0f); + for (int i = 0; i < n_segments; ++i) { + printf("\n %s", whisper_full_get_segment_text(ctx, i)); + } + printf("\n"); + } + + ma_device_uninit(&device); + ma_context_uninit(&context); + whisper_free(ctx); + return 0; +} From ecd3794a4d824ab58dcdf3b6cc43383a0431d5dc Mon Sep 17 00:00:00 2001 From: nick huang Date: Fri, 20 Mar 2026 06:34:06 +0800 Subject: [PATCH 24/29] Add CMakeLists.txt for mic/ directory to build whisper-mic example --- examples/mic/CMakeLists.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 examples/mic/CMakeLists.txt diff --git a/examples/mic/CMakeLists.txt b/examples/mic/CMakeLists.txt new file mode 100644 index 00000000000..8e8ded5c586 --- /dev/null +++ b/examples/mic/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.10) +project(whisper-mic) + +add_executable(whisper-mic whisper-mic.cpp) + +target_include_directories(whisper-mic PRIVATE + ${CMAKE_SOURCE_DIR}/.. + ${CMAKE_SOURCE_DIR}/../.. + ${CMAKE_SOURCE_DIR}/../../include + ${CMAKE_SOURCE_DIR}/../../ggml/include +) + +target_link_libraries(whisper-mic PRIVATE whisper pthread dl m rt) + +set_target_properties(whisper-mic PROPERTIES + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED YES + CXX_EXTENSIONS NO +) From 5961f529181d4c8f0710a1a013abe06e1df8b399 Mon Sep 17 00:00:00 2001 From: nick huang Date: Fri, 20 Mar 2026 06:34:13 +0800 Subject: [PATCH 25/29] Add mic/ as a subdirectory in examples/CMakeLists.txt to build whisper-mic example --- examples/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b202ca00b77..b3fd7834c80 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -118,6 +118,7 @@ else() endif (WHISPER_SDL2) add_subdirectory(deprecation-warning) + add_subdirectory(mic) endif() if (WHISPER_SDL2) From f71c28cad5bfeaea4a55873aeb0289444e495bf1 Mon Sep 17 00:00:00 2001 From: nick huang Date: Fri, 20 Mar 2026 06:35:09 +0800 Subject: [PATCH 26/29] Add build_mic.sh script to build examples/mic/whisper-mic.cpp using CMake --- build_mic.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 build_mic.sh diff --git a/build_mic.sh b/build_mic.sh new file mode 100644 index 00000000000..279e7271a97 --- /dev/null +++ b/build_mic.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Build script for examples/mic/whisper-mic.cpp +set -e + +cd "$(dirname "$0")" + +mkdir -p build_mic +cd build_mic + +cmake ../examples/mic +cmake --build . -j$(nproc) From 31f154ff269a2a413b6540763b7c9d402fb87254 Mon Sep 17 00:00:00 2001 From: nick huang Date: Fri, 20 Mar 2026 07:50:30 +0800 Subject: [PATCH 27/29] examples: refine whisper-mic params and upstream-style cmake --- examples/mic/CMakeLists.txt | 21 ++---- examples/mic/whisper-mic.cpp | 135 +++++++++++++++++++++++++++++------ 2 files changed, 117 insertions(+), 39 deletions(-) diff --git a/examples/mic/CMakeLists.txt b/examples/mic/CMakeLists.txt index 8e8ded5c586..29f7b21da5a 100644 --- a/examples/mic/CMakeLists.txt +++ b/examples/mic/CMakeLists.txt @@ -1,19 +1,8 @@ -cmake_minimum_required(VERSION 3.10) -project(whisper-mic) +set(TARGET whisper-mic) +add_executable(${TARGET} whisper-mic.cpp) -add_executable(whisper-mic whisper-mic.cpp) +include(DefaultTargetOptions) -target_include_directories(whisper-mic PRIVATE - ${CMAKE_SOURCE_DIR}/.. - ${CMAKE_SOURCE_DIR}/../.. - ${CMAKE_SOURCE_DIR}/../../include - ${CMAKE_SOURCE_DIR}/../../ggml/include -) +target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT}) -target_link_libraries(whisper-mic PRIVATE whisper pthread dl m rt) - -set_target_properties(whisper-mic PROPERTIES - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED YES - CXX_EXTENSIONS NO -) +install(TARGETS ${TARGET} RUNTIME) diff --git a/examples/mic/whisper-mic.cpp b/examples/mic/whisper-mic.cpp index 79afd8e79c4..eb0fa044018 100644 --- a/examples/mic/whisper-mic.cpp +++ b/examples/mic/whisper-mic.cpp @@ -1,3 +1,53 @@ +#include + +struct mic_params { + std::string model = "models/ggml-base.bin"; + int timeout = 30; + int capture_id = 5; + std::string language = "zh"; + bool use_gpu = true; +}; + +void mic_print_usage(int argc, char ** argv, const mic_params & params) { + printf("\n"); + printf("usage: %s [options]\n", argv[0]); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -m F, --model F [%-7s] model path\n", params.model.c_str()); + printf(" -t N, --timeout N [%-7d] max recording time in seconds\n", params.timeout); + printf(" -c N, --capture N [%-7d] capture device ID\n", params.capture_id); + printf(" -l S, --language S [%-7s] language (e.g. zh, en)\n", params.language.c_str()); + printf(" -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true"); + printf("\n"); + printf("example: %s -m models/ggml-base.bin -t 30 -c 0 -l zh\n", argv[0]); + printf("\n"); +} + +static bool mic_params_parse(int argc, char ** argv, mic_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg == "-h" || arg == "--help") { + mic_print_usage(argc, argv, params); + exit(0); + } else if (arg == "-m" || arg == "--model") { + params.model = argv[++i]; + } else if (arg == "-t" || arg == "--timeout") { + params.timeout = std::stoi(argv[++i]); + } else if (arg == "-c" || arg == "--capture") { + params.capture_id = std::stoi(argv[++i]); + } else if (arg == "-l" || arg == "--language") { + params.language = argv[++i]; + } else if (arg == "-ng" || arg == "--no-gpu") { + params.use_gpu = false; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + mic_print_usage(argc, argv, params); + exit(1); + } + } + return true; +} #include "whisper.h" #include "common.h" #define MINIAUDIO_IMPLEMENTATION @@ -20,7 +70,7 @@ struct RecordingConfig { static constexpr int PROGRESS_MS = 100; static constexpr int UI_LOOP_MS = 10; static constexpr int SELECT_TIMEOUT_MS = 20; - static constexpr int SMOOTH_FINISH_MS = 1500; + static constexpr int SMOOTH_FINISH_MS = 300; static constexpr int CLOCK_TOLERANCE_MS = 350; // 用于清理控制台残余的空格 static const char* CLEAR_LINE; @@ -61,22 +111,37 @@ void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uin int main(int argc, char** argv) { signal(SIGINT, signal_handler); - if (argc < 2) return 1; - if (argc >= 3) g_timeout_limit = atoi(argv[2]); + mic_params params; + mic_params_parse(argc, argv, params); + g_timeout_limit = params.timeout; struct whisper_context_params cparams = whisper_context_default_params(); - cparams.use_gpu = true; - struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); + cparams.use_gpu = params.use_gpu; + struct whisper_context* ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); + if (!ctx) { + fprintf(stderr, "Failed to load model from '%s'\n", params.model.c_str()); + return 1; + } - ma_context context; ma_context_init(NULL, 0, NULL, &context); + ma_context context; + if (ma_context_init(NULL, 0, NULL, &context) != MA_SUCCESS) { + fprintf(stderr, "Failed to initialize miniaudio context\n"); + whisper_free(ctx); + return 1; + } ma_device_info* pCapInfos; ma_uint32 capCount; - ma_context_get_devices(&context, NULL, NULL, &pCapInfos, &capCount); - + if (ma_context_get_devices(&context, NULL, NULL, &pCapInfos, &capCount) != MA_SUCCESS || capCount == 0) { + fprintf(stderr, "No audio capture devices found\n"); + ma_context_uninit(&context); + whisper_free(ctx); + return 1; + } + printf("\n📜 可用麦克风列表:\n"); for (ma_uint32 i = 0; i < capCount; ++i) printf(" [%u] %s\n", i, pCapInfos[i].name); - printf("👉 请输入设备 ID (默认5): "); - ma_uint32 dev_id = 5; - if(scanf("%u", &dev_id) != 1) dev_id = 5; + printf("👉 请输入设备 ID (默认%d): ", params.capture_id); + ma_uint32 dev_id = params.capture_id; + if(scanf("%u", &dev_id) != 1) dev_id = params.capture_id; clear_stdin(); ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); @@ -84,8 +149,20 @@ int main(int argc, char** argv) { devCfg.sampleRate = RecordingConfig::SAMPLE_RATE; devCfg.dataCallback = data_callback; if (dev_id < capCount) devCfg.capture.pDeviceID = &pCapInfos[dev_id].id; - ma_device device; ma_device_init(&context, &devCfg, &device); - ma_device_start(&device); + ma_device device; + if (ma_device_init(&context, &devCfg, &device) != MA_SUCCESS) { + fprintf(stderr, "Failed to initialize audio capture device\n"); + ma_context_uninit(&context); + whisper_free(ctx); + return 1; + } + if (ma_device_start(&device) != MA_SUCCESS) { + fprintf(stderr, "Failed to start audio capture device\n"); + ma_device_uninit(&device); + ma_context_uninit(&context); + whisper_free(ctx); + return 1; + } while (!exit_program.load()) { printf("\n=============================================\n"); @@ -136,24 +213,36 @@ int main(int argc, char** argv) { } } - std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::SMOOTH_FINISH_MS)); - is_recording.store(false); + + // Stop progress meter immediately + is_recording.store(false); if (progress_thread.joinable()) progress_thread.join(); + // Now wait for smooth finish (buffer tail) + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::SMOOTH_FINISH_MS)); std::vector captured; { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } - + + if (captured.empty()) { + printf("\n⚠️ 没有录制到音频,跳过识别。\n"); + continue; + } + printf("\n🔍 正在识别 (音频长度: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); auto start_recognition = std::chrono::steady_clock::now(); - + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); - wparams.language = "zh"; - // 【正式固化】简体中文引导词 - wparams.initial_prompt = "以下是普通话,使用简体中文输出。"; + wparams.language = params.language.c_str(); + if (params.language == "zh") { + wparams.initial_prompt = "以下是普通话,使用简体中文输出。"; + } wparams.n_threads = 4; - - whisper_full(ctx, wparams, captured.data(), captured.size()); - + + if (whisper_full(ctx, wparams, captured.data(), captured.size()) != 0) { + printf("\n❌ 语音识别失败。\n"); + continue; + } + int n_segments = whisper_full_n_segments(ctx); auto elapsed = std::chrono::duration_cast(std::chrono::steady_clock::now() - start_recognition).count(); printf("\n📝 识别结果(%.2f秒):", elapsed/1000.0f); From 3ba2906db06eebcd6572ede9887ce6f44d6e8dff Mon Sep 17 00:00:00 2001 From: nick huang Date: Fri, 20 Mar 2026 08:01:41 +0800 Subject: [PATCH 28/29] examples: add mic docs and limit mic example to unix --- README.md | 1 + build_mic.sh | 11 ----------- examples/CMakeLists.txt | 4 +++- examples/mic/README.md | 43 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 12 deletions(-) delete mode 100644 build_mic.sh create mode 100644 examples/mic/README.md diff --git a/README.md b/README.md index 474a1301da7..ea59db59ce5 100644 --- a/README.md +++ b/README.md @@ -842,6 +842,7 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch | [whisper-cli](examples/cli) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper | | [whisper-bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine | | [whisper-stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture | +| [whisper-mic](examples/mic) | | Manual start/stop microphone transcription with interactive device selection | | [whisper-command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic | | [whisper-server](examples/server) | | HTTP transcription server with OAI-like API | | [whisper-talk-llama](examples/talk-llama) | | Talk with a LLaMA bot | diff --git a/build_mic.sh b/build_mic.sh deleted file mode 100644 index 279e7271a97..00000000000 --- a/build_mic.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -# Build script for examples/mic/whisper-mic.cpp -set -e - -cd "$(dirname "$0")" - -mkdir -p build_mic -cd build_mic - -cmake ../examples/mic -cmake --build . -j$(nproc) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b3fd7834c80..4c62f711ae2 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -118,7 +118,9 @@ else() endif (WHISPER_SDL2) add_subdirectory(deprecation-warning) - add_subdirectory(mic) + if (UNIX AND NOT WIN32) + add_subdirectory(mic) + endif() endif() if (WHISPER_SDL2) diff --git a/examples/mic/README.md b/examples/mic/README.md new file mode 100644 index 00000000000..a8d0e5fe2df --- /dev/null +++ b/examples/mic/README.md @@ -0,0 +1,43 @@ +# whisper.cpp/examples/mic + +This example captures live microphone audio and performs manual start/stop transcription. +Unlike `whisper-stream`, it records a full segment first and then transcribes that segment. + +## Run + +```bash +./build/bin/whisper-mic -m ./models/ggml-base.bin +``` + +Press Enter to start recording, then press Enter again to stop and transcribe. + +## Options + +```text + -h, --help show help and exit + -m F, --model F model path + -t N, --timeout N max recording time in seconds + -c N, --capture N capture device ID + -l S, --language S language (for example: zh, en) + -ng, --no-gpu disable GPU inference +``` + +## Build + +```bash +cmake -B build +cmake --build build --config Release -j +``` + +## GPU build (optional) + +```bash +cmake -S . -B build_gpu -DGGML_CUDA=ON +cmake --build build_gpu --config Release -j +``` + +Then run: + +```bash +./build_gpu/bin/whisper-mic -m ./models/ggml-base.bin +``` From 53a289d2688b5b5c969935f39ff35cc5edb21f99 Mon Sep 17 00:00:00 2001 From: nick huang Date: Fri, 20 Mar 2026 08:06:21 +0800 Subject: [PATCH 29/29] examples: translate whisper-mic prompts to english --- examples/mic/whisper-mic.cpp | 38 ++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/mic/whisper-mic.cpp b/examples/mic/whisper-mic.cpp index eb0fa044018..7c3d83ef3b6 100644 --- a/examples/mic/whisper-mic.cpp +++ b/examples/mic/whisper-mic.cpp @@ -63,7 +63,7 @@ static bool mic_params_parse(int argc, char ** argv, mic_params & params) { #include // ============================================= -// 工程常量:全局统一管理 +// Shared constants // ============================================= struct RecordingConfig { static constexpr int SAMPLE_RATE = 16000; @@ -72,7 +72,7 @@ struct RecordingConfig { static constexpr int SELECT_TIMEOUT_MS = 20; static constexpr int SMOOTH_FINISH_MS = 300; static constexpr int CLOCK_TOLERANCE_MS = 350; - // 用于清理控制台残余的空格 + // Used to clear leftover characters in terminal progress rendering. static const char* CLEAR_LINE; }; const char* RecordingConfig::CLEAR_LINE = " "; @@ -137,9 +137,9 @@ int main(int argc, char** argv) { return 1; } - printf("\n📜 可用麦克风列表:\n"); + printf("\n📜 Available microphones:\n"); for (ma_uint32 i = 0; i < capCount; ++i) printf(" [%u] %s\n", i, pCapInfos[i].name); - printf("👉 请输入设备 ID (默认%d): ", params.capture_id); + printf("👉 Enter device ID (default %d): ", params.capture_id); ma_uint32 dev_id = params.capture_id; if(scanf("%u", &dev_id) != 1) dev_id = params.capture_id; clear_stdin(); @@ -166,11 +166,11 @@ int main(int argc, char** argv) { while (!exit_program.load()) { printf("\n=============================================\n"); - printf("🎙️ 操作提示 (自动断开设置: %d 秒):\n", g_timeout_limit); - printf(" ▶ [回车键] : 开始录制\n"); - printf(" ■ [回车键] : 停止录制 (含 1.5 秒补录)\n"); + printf("🎙️ Controls (auto stop after %d seconds):\n", g_timeout_limit); + printf(" ▶ [Enter] : start recording\n"); + printf(" ■ [Enter] : stop recording (with 1.5s tail capture)\n"); printf("=============================================\n"); - printf("👉 等待指令..."); + printf("👉 Waiting for input..."); fflush(stdout); while (!check_stdin_ready(100) && !exit_program.load()); @@ -182,11 +182,11 @@ int main(int argc, char** argv) { is_recording.store(true); auto start_time = std::chrono::steady_clock::now(); - printf("\n🎙️ 录制中...\n"); + printf("\n🎙️ Recording...\n"); std::thread progress_thread([&]() { while (is_recording.load()) { - printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, recorded_seconds.load()); + printf("\r%s\r📊 Elapsed: %d s", RecordingConfig::CLEAR_LINE, recorded_seconds.load()); fflush(stdout); std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::PROGRESS_MS)); } @@ -199,15 +199,15 @@ int main(int argc, char** argv) { if (check_stdin_ready(RecordingConfig::UI_LOOP_MS)) { if (getchar() == '\n') { - // 使用 \r 覆盖并清理残余 - printf("\r%s\r🛑 手动停止,正在收尾以确保不丢字...", RecordingConfig::CLEAR_LINE); + // Use carriage return to overwrite and clear the previous line. + printf("\r%s\r🛑 Stopping manually, finalizing capture...", RecordingConfig::CLEAR_LINE); fflush(stdout); trigger_stop = true; } } else if (elapsed >= (g_timeout_limit * 1000 + RecordingConfig::CLOCK_TOLERANCE_MS)) { - printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, g_timeout_limit); - printf("\n⏱️ 时间已到 (%d秒),正在自动收尾...", g_timeout_limit); + printf("\r%s\r📊 Elapsed: %d s", RecordingConfig::CLEAR_LINE, g_timeout_limit); + printf("\n⏱️ Time limit reached (%d s), finalizing capture...", g_timeout_limit); fflush(stdout); trigger_stop = true; } @@ -224,28 +224,28 @@ int main(int argc, char** argv) { { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } if (captured.empty()) { - printf("\n⚠️ 没有录制到音频,跳过识别。\n"); + printf("\n⚠️ No audio captured, skipping recognition.\n"); continue; } - printf("\n🔍 正在识别 (音频长度: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); + printf("\n🔍 Running recognition (audio length: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); auto start_recognition = std::chrono::steady_clock::now(); whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); wparams.language = params.language.c_str(); if (params.language == "zh") { - wparams.initial_prompt = "以下是普通话,使用简体中文输出。"; + wparams.initial_prompt = "The following speech is Mandarin Chinese. Output in Simplified Chinese."; } wparams.n_threads = 4; if (whisper_full(ctx, wparams, captured.data(), captured.size()) != 0) { - printf("\n❌ 语音识别失败。\n"); + printf("\n❌ Speech recognition failed.\n"); continue; } int n_segments = whisper_full_n_segments(ctx); auto elapsed = std::chrono::duration_cast(std::chrono::steady_clock::now() - start_recognition).count(); - printf("\n📝 识别结果(%.2f秒):", elapsed/1000.0f); + printf("\n📝 Recognition result (%.2f s):", elapsed/1000.0f); for (int i = 0; i < n_segments; ++i) { printf("\n %s", whisper_full_get_segment_text(ctx, i)); }