diff --git a/README.md b/README.md index 474a1301da7..ea59db59ce5 100644 --- a/README.md +++ b/README.md @@ -842,6 +842,7 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch | [whisper-cli](examples/cli) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper | | [whisper-bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine | | [whisper-stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture | +| [whisper-mic](examples/mic) | | Manual start/stop microphone transcription with interactive device selection | | [whisper-command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic | | [whisper-server](examples/server) | | HTTP transcription server with OAI-like API | | [whisper-talk-llama](examples/talk-llama) | | Talk with a LLaMA bot | diff --git a/compile.txt b/compile.txt new file mode 100644 index 00000000000..0ba4efe4b29 --- /dev/null +++ b/compile.txt @@ -0,0 +1,14 @@ +g++ -O3 minimal_mic.cpp \ + -I. -I./include -I./ggml/include -I./examples \ + ./build/src/libwhisper.so \ + -L/usr/local/cuda/lib64 -lcudart -lcublas \ + -lpthread -ldl -lm -lrt -o minimal_mic + +g++ -O3 doubao_mic.cpp -I. -I./include -I./ggml/include -I./examples ./build_gpu/src/libwhisper.so -L/usr/local/cuda/lib64 -lcudart -lcublas -lportaudio -lpthread -ldl -lm -lrt -o doubao_mic.exe + +g++ -std=c++17 -O3 doubao_mic.cpp -I. -I./include -I./ggml/include -I./examples ./build_gpu/src/libwhisper.so -L/usr/local/cuda-13.1/lib64 -lcudart -lcublas -lportaudio -lpthread -ldl -lm -lrt -o doubao_mic.exe + +gpu build: +# 显式指定使用 CUDA 13.1 编译 +cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc +cmake --build build --config Release -j$(nproc) diff --git a/doubao.cpp b/doubao.cpp new file mode 100644 index 00000000000..49217508a05 --- /dev/null +++ b/doubao.cpp @@ -0,0 +1,157 @@ +#include "whisper.h" +#include "common.h" + +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" + +#include +#include +#include +#include +#include +#include + +// 全局原子变量控制录制状态(线程安全) +std::atomic is_recording(false); +// 音频缓冲区 +std::vector audio_buffer; + +// 音频回调:仅在录制状态时才采集数据 +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + if (!is_recording.load()) return; // 非录制状态直接返回,不采集数据 + const float* pInputFloat = (const float*)pInput; + if (pInputFloat == NULL) return; + + // 采集数据到缓冲区(限制最大录制时长为30秒,防止溢出) + const size_t max_frames = 16000 * 30; // 30秒 @ 16kHz + const size_t available = max_frames - audio_buffer.size(); + if (available == 0) return; // 缓冲区已满,停止采集 + + const size_t copy_frames = (frameCount > available) ? available : frameCount; + audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + copy_frames); +} + +// 提示信息函数 +void print_usage() { + printf("=============================================\n"); + printf("🎤 语音识别程序(精准录制版)\n"); + printf("操作说明:\n"); + printf(" 1. 按下【回车键】开始录制\n"); + printf(" 2. 说话完成后,再次按下【回车键】停止录制并识别\n"); + printf(" 3. 录制超过30秒会自动停止\n"); + printf(" 4. Ctrl+C 退出程序\n"); + printf("=============================================\n"); +} + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const char* model_path = argv[1]; + + // 1. 初始化 Whisper + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = true; // 4050 显卡 + struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); + if (!ctx) { + fprintf(stderr, "❌ 初始化Whisper模型失败\n"); + return 1; + } + + // 2. 初始化 Miniaudio(仅初始化设备,不立即采集) + ma_device_config deviceConfig = ma_device_config_init(ma_device_type_capture); + deviceConfig.capture.format = ma_format_f32; // Whisper 需要 float32 + deviceConfig.capture.channels = 1; // 单声道 + deviceConfig.sampleRate = 16000; // Whisper 硬指标 16kHz + deviceConfig.dataCallback = data_callback; + deviceConfig.pUserData = nullptr; // 不再传buffer,用全局变量 + + ma_device device; + if (ma_device_init(NULL, &deviceConfig, &device) != MA_SUCCESS) { + fprintf(stderr, "❌ 打开录音设备失败\n"); + whisper_free(ctx); + return -2; + } + + // 启动设备(但此时is_recording=false,不会采集数据) + if (ma_device_start(&device) != MA_SUCCESS) { + fprintf(stderr, "❌ 启动录音设备失败\n"); + ma_device_uninit(&device); + whisper_free(ctx); + return -3; + } + + print_usage(); + + while (true) { + // 第一步:等待用户按回车开始录制 + printf("\n👉 按下回车键开始录制...\n"); + getchar(); // 等待回车 + + // 开始录制 + is_recording.store(true); + audio_buffer.clear(); // 清空旧数据 + printf("🎙️ 正在录制(说话完成后按回车键停止,最长录制30秒)...\n"); + + // 等待用户停止录制(按回车)或超时30秒 + std::thread wait_thread([&]() { + getchar(); // 等待用户按回车停止 + is_recording.store(false); + }); + + // 超时控制(30秒) + auto start_time = std::chrono::steady_clock::now(); + while (is_recording.load()) { + auto now = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast(now - start_time).count(); + if (duration >= 30) { + printf("⏱️ 录制超时(30秒),自动停止\n"); + is_recording.store(false); + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // 避免CPU空转 + } + + wait_thread.join(); // 等待停止线程结束 + is_recording.store(false); // 确保录制停止 + + // 检查录制的数据量 + if (audio_buffer.empty()) { + printf("⚠️ 未采集到任何音频数据,请重新录制\n"); + continue; + } + + // 第二步:开始识别 + printf("🔍 正在识别...\n"); + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + wparams.n_threads = 12; + wparams.print_progress = false; + wparams.print_realtime = false; + + if (whisper_full(ctx, wparams, audio_buffer.data(), audio_buffer.size()) != 0) { + fprintf(stderr, "❌ 识别失败\n"); + continue; + } + + // 输出识别结果 + const int n_segments = whisper_full_n_segments(ctx); + if (n_segments == 0) { + printf("📝: 未识别到有效内容\n"); + } else { + printf("📝 识别结果:\n"); + for (int i = 0; i < n_segments; ++i) { + const char* text = whisper_full_get_segment_text(ctx, i); + printf(" %s\n", text); + } + } + } + + // 清理资源(实际中Ctrl+C会中断,这里是兜底) + ma_device_uninit(&device); + whisper_free(ctx); + return 0; +} + diff --git a/doubao_gpu.cpp b/doubao_gpu.cpp new file mode 100644 index 00000000000..d045ad4993c --- /dev/null +++ b/doubao_gpu.cpp @@ -0,0 +1,195 @@ +#include "whisper.h" +#include "common.h" + +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" + +#include +#include +#include +#include +#include +#include + +// 全局原子变量控制录制状态(线程安全) +std::atomic is_recording(false); +// 音频缓冲区 +std::vector audio_buffer; + +// 音频回调:仅在录制状态时才采集数据 +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + if (!is_recording.load()) return; // 非录制状态直接返回,不采集数据 + const float* pInputFloat = (const float*)pInput; + if (pInputFloat == NULL) return; + + // 采集数据到缓冲区(限制最大录制时长为30秒,防止溢出) + const size_t max_frames = 16000 * 30; // 30秒 @ 16kHz + const size_t available = max_frames - audio_buffer.size(); + if (available == 0) return; // 缓冲区已满,停止采集 + + const size_t copy_frames = (frameCount > available) ? available : frameCount; + audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + copy_frames); +} + +// 提示信息函数 +void print_usage() { + printf("=============================================\n"); + printf("🎤 语音识别程序(精准录制版)\n"); + printf("操作说明:\n"); + printf(" 1. 按下【回车键】开始录制\n"); + printf(" 2. 说话完成后,再次按下【回车键】停止录制并识别\n"); + printf(" 3. 录制超过30秒会自动停止\n"); + printf(" 4. Ctrl+C 退出程序\n"); + printf("=============================================\n"); +} + +// 适配旧版本的GPU状态提示(不依赖新函数) +void check_gpu_status() { + printf("🔍 GPU加速配置说明...\n"); + printf(" 当前已启用GPU加速(use_gpu = true)\n"); + printf(" ✅ 如果编译时链接了CUDA库,模型会自动使用GPU\n"); + printf(" ❌ 如果识别速度很慢,说明实际使用CPU运行\n"); + printf(" 验证方法:观察识别耗时,GPU版本比CPU快5-10倍\n"); +} + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const char* model_path = argv[1]; + + // GPU状态提示(适配旧版本) + check_gpu_status(); + + // 1. 初始化 Whisper(仅保留旧版本支持的参数) + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = true; // 启用GPU(旧版本核心参数) + // 移除use_gpu_fp16和gpu_device(旧版本没有这些字段) + + printf("\n🚀 正在加载模型:%s\n", model_path); + struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); + if (!ctx) { + fprintf(stderr, "❌ 初始化Whisper模型失败\n"); + return 1; + } + + // 旧版本没有whisper_is_using_gpu,改用间接提示 + printf("✅ 模型加载成功!\n"); + printf(" 📌 若识别速度快(几秒内完成)= GPU运行\n"); + printf(" 📌 若识别速度慢(十几秒/分钟)= CPU运行\n"); + + // 2. 初始化 Miniaudio(仅初始化设备,不立即采集) + ma_device_config deviceConfig = ma_device_config_init(ma_device_type_capture); + deviceConfig.capture.format = ma_format_f32; // Whisper 需要 float32 + deviceConfig.capture.channels = 1; // 单声道 + deviceConfig.sampleRate = 16000; // Whisper 硬指标 16kHz + deviceConfig.dataCallback = data_callback; + deviceConfig.pUserData = nullptr; + + ma_device device; + if (ma_device_init(NULL, &deviceConfig, &device) != MA_SUCCESS) { + fprintf(stderr, "❌ 打开录音设备失败\n"); + whisper_free(ctx); + return -2; + } + + // 启动设备(但此时is_recording=false,不会采集数据) + if (ma_device_start(&device) != MA_SUCCESS) { + fprintf(stderr, "❌ 启动录音设备失败\n"); + ma_device_uninit(&device); + whisper_free(ctx); + return -3; + } + + print_usage(); + + while (true) { + // 第一步:等待用户按回车开始录制 + printf("\n👉 按下回车键开始录制...\n"); + getchar(); // 等待回车 + + // 开始录制 + is_recording.store(true); + audio_buffer.clear(); // 清空旧数据 + printf("🎙️ 正在录制(说话完成后按回车键停止,最长录制30秒)...\n"); + + // 等待用户停止录制(按回车)或超时30秒 + std::thread wait_thread([&]() { + getchar(); // 等待用户按回车停止 + is_recording.store(false); + }); + + // 超时控制(30秒) + auto start_time = std::chrono::steady_clock::now(); + while (is_recording.load()) { + auto now = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast(now - start_time).count(); + if (duration >= 30) { + printf("⏱️ 录制超时(30秒),自动停止\n"); + is_recording.store(false); + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // 避免CPU空转 + } + + wait_thread.join(); // 等待停止线程结束 + is_recording.store(false); // 确保录制停止 + + // 检查录制的数据量 + if (audio_buffer.empty()) { + printf("⚠️ 未采集到任何音频数据,请重新录制\n"); + continue; + } + + // 第二步:开始识别(优化识别参数提升精度) + printf("🔍 正在识别...\n"); + // 记录识别开始时间(用于判断GPU/CPU) + auto recognize_start = std::chrono::steady_clock::now(); + + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + wparams.n_threads = 12; // 根据CPU核心数调整 + wparams.print_progress = false; + wparams.print_realtime = false; + + // 精度优化参数(旧版本也支持) + wparams.temperature = 0.0; // 降低随机性,提升稳定性 + wparams.max_len = 0; // 不限制输出长度 + wparams.translate = false; // 不翻译,直接识别 + wparams.no_context = true; // 不使用上下文,避免干扰 + + if (whisper_full(ctx, wparams, audio_buffer.data(), audio_buffer.size()) != 0) { + fprintf(stderr, "❌ 识别失败\n"); + continue; + } + + // 计算识别耗时(判断GPU/CPU) + auto recognize_end = std::chrono::steady_clock::now(); + auto recognize_duration = std::chrono::duration_cast(recognize_end - recognize_start).count(); + printf("⏱️ 识别耗时:%.2f 秒\n", recognize_duration / 1000.0); + if (recognize_duration < 5000) { + printf(" 🎯 识别速度快,应该是GPU在运行!\n"); + } else { + printf(" ⚠️ 识别速度慢,可能是CPU在运行!\n"); + } + + // 输出识别结果 + const int n_segments = whisper_full_n_segments(ctx); + if (n_segments == 0) { + printf("📝: 未识别到有效内容\n"); + } else { + printf("📝 识别结果:\n"); + for (int i = 0; i < n_segments; ++i) { + const char* text = whisper_full_get_segment_text(ctx, i); + printf(" %s\n", text); + } + } + } + + // 清理资源 + ma_device_uninit(&device); + whisper_free(ctx); + return 0; +} diff --git a/doubao_mic.cpp b/doubao_mic.cpp new file mode 100644 index 00000000000..1134d59e628 --- /dev/null +++ b/doubao_mic.cpp @@ -0,0 +1,170 @@ +#include "whisper.h" +#include "common.h" +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ============================================= +// 工程常量:全局统一管理 +// ============================================= +struct RecordingConfig { + static constexpr int SAMPLE_RATE = 16000; + static constexpr int PROGRESS_MS = 100; + static constexpr int UI_LOOP_MS = 10; + static constexpr int SELECT_TIMEOUT_MS = 20; + static constexpr int SMOOTH_FINISH_MS = 1500; + static constexpr int CLOCK_TOLERANCE_MS = 350; + // 用于清理控制台残余的空格 + static const char* CLEAR_LINE; +}; +const char* RecordingConfig::CLEAR_LINE = " "; + +std::atomic is_recording(false); +std::atomic exit_program(false); +std::atomic recorded_seconds(0); +std::vector audio_buffer; +std::mutex buffer_mutex; +int g_timeout_limit = 30; + +void signal_handler(int sig) { + if (sig == SIGINT) { + exit_program.store(true); + is_recording.store(false); + exit(0); + } +} + +bool check_stdin_ready(int timeout_ms = RecordingConfig::SELECT_TIMEOUT_MS) { + fd_set fds; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); + struct timeval tv = {0, timeout_ms * 1000}; + return select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv) > 0; +} + +void clear_stdin() { + while (check_stdin_ready(0)) getchar(); +} + +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + if (!is_recording.load() || pInput == NULL) return; + std::lock_guard lock(buffer_mutex); + audio_buffer.insert(audio_buffer.end(), (float*)pInput, (float*)pInput + frameCount); + recorded_seconds.store(static_cast(audio_buffer.size() / (float)RecordingConfig::SAMPLE_RATE)); +} + +int main(int argc, char** argv) { + signal(SIGINT, signal_handler); + if (argc < 2) return 1; + if (argc >= 3) g_timeout_limit = atoi(argv[2]); + + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = true; + struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); + + ma_context context; ma_context_init(NULL, 0, NULL, &context); + ma_device_info* pCapInfos; ma_uint32 capCount; + ma_context_get_devices(&context, NULL, NULL, &pCapInfos, &capCount); + + printf("\n📜 可用麦克风列表:\n"); + for (ma_uint32 i = 0; i < capCount; ++i) printf(" [%u] %s\n", i, pCapInfos[i].name); + printf("👉 请输入设备 ID (默认5): "); + ma_uint32 dev_id = 5; + if(scanf("%u", &dev_id) != 1) dev_id = 5; + clear_stdin(); + + ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); + devCfg.capture.format = ma_format_f32; devCfg.capture.channels = 1; + devCfg.sampleRate = RecordingConfig::SAMPLE_RATE; devCfg.dataCallback = data_callback; + if (dev_id < capCount) devCfg.capture.pDeviceID = &pCapInfos[dev_id].id; + + ma_device device; ma_device_init(&context, &devCfg, &device); + ma_device_start(&device); + + while (!exit_program.load()) { + printf("\n=============================================\n"); + printf("🎙️ 操作提示 (自动断开设置: %d 秒):\n", g_timeout_limit); + printf(" ▶ [回车键] : 开始录制\n"); + printf(" ■ [回车键] : 停止录制 (含 1.5 秒补录)\n"); + printf("=============================================\n"); + printf("👉 等待指令..."); + fflush(stdout); + + while (!check_stdin_ready(100) && !exit_program.load()); + if (exit_program.load()) break; + clear_stdin(); + + { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } + recorded_seconds.store(0); + is_recording.store(true); + auto start_time = std::chrono::steady_clock::now(); + + printf("\n🎙️ 录制中...\n"); + + std::thread progress_thread([&]() { + while (is_recording.load()) { + printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, recorded_seconds.load()); + fflush(stdout); + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::PROGRESS_MS)); + } + }); + + bool trigger_stop = false; + while (!trigger_stop && !exit_program.load()) { + auto now = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast(now - start_time).count(); + + if (check_stdin_ready(RecordingConfig::UI_LOOP_MS)) { + if (getchar() == '\n') { + // 使用 \r 覆盖并清理残余 + printf("\r%s\r🛑 手动停止,正在收尾以确保不丢字...", RecordingConfig::CLEAR_LINE); + fflush(stdout); + trigger_stop = true; + } + } + else if (elapsed >= (g_timeout_limit * 1000 + RecordingConfig::CLOCK_TOLERANCE_MS)) { + printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, g_timeout_limit); + printf("\n⏱️ 时间已到 (%d秒),正在自动收尾...", g_timeout_limit); + fflush(stdout); + trigger_stop = true; + } + } + + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::SMOOTH_FINISH_MS)); + is_recording.store(false); + if (progress_thread.joinable()) progress_thread.join(); + + std::vector captured; + { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } + + printf("\n🔍 正在识别 (音频长度: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); + auto start_recognition = std::chrono::steady_clock::now(); + + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + // 【正式固化】简体中文引导词 + wparams.initial_prompt = "以下是普通话,使用简体中文输出。"; + wparams.n_threads = 4; + + whisper_full(ctx, wparams, captured.data(), captured.size()); + + int n_segments = whisper_full_n_segments(ctx); + auto elapsed = std::chrono::duration_cast(std::chrono::steady_clock::now() - start_recognition).count(); + printf("\n📝 识别结果(%.2f秒):", elapsed/1000.0f); + for (int i = 0; i < n_segments; ++i) { + printf("\n %s", whisper_full_get_segment_text(ctx, i)); + } + printf("\n"); + } + + ma_device_uninit(&device); + ma_context_uninit(&context); + whisper_free(ctx); + return 0; +} \ No newline at end of file diff --git a/download.txt b/download.txt new file mode 100644 index 00000000000..38cd46030e2 --- /dev/null +++ b/download.txt @@ -0,0 +1,2 @@ +export HF_ENDPOINT=https://hf-mirror.com +hf download ggerganov/whisper.cpp ggml-medium.bin --local-dir ./models diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b202ca00b77..4c62f711ae2 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -118,6 +118,9 @@ else() endif (WHISPER_SDL2) add_subdirectory(deprecation-warning) + if (UNIX AND NOT WIN32) + add_subdirectory(mic) + endif() endif() if (WHISPER_SDL2) diff --git a/examples/doubao_mic.cpp b/examples/doubao_mic.cpp new file mode 100644 index 00000000000..79afd8e79c4 --- /dev/null +++ b/examples/doubao_mic.cpp @@ -0,0 +1,170 @@ +#include "whisper.h" +#include "common.h" +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ============================================= +// 工程常量:全局统一管理 +// ============================================= +struct RecordingConfig { + static constexpr int SAMPLE_RATE = 16000; + static constexpr int PROGRESS_MS = 100; + static constexpr int UI_LOOP_MS = 10; + static constexpr int SELECT_TIMEOUT_MS = 20; + static constexpr int SMOOTH_FINISH_MS = 1500; + static constexpr int CLOCK_TOLERANCE_MS = 350; + // 用于清理控制台残余的空格 + static const char* CLEAR_LINE; +}; +const char* RecordingConfig::CLEAR_LINE = " "; + +std::atomic is_recording(false); +std::atomic exit_program(false); +std::atomic recorded_seconds(0); +std::vector audio_buffer; +std::mutex buffer_mutex; +int g_timeout_limit = 30; + +void signal_handler(int sig) { + if (sig == SIGINT) { + exit_program.store(true); + is_recording.store(false); + exit(0); + } +} + +bool check_stdin_ready(int timeout_ms = RecordingConfig::SELECT_TIMEOUT_MS) { + fd_set fds; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); + struct timeval tv = {0, timeout_ms * 1000}; + return select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv) > 0; +} + +void clear_stdin() { + while (check_stdin_ready(0)) getchar(); +} + +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + if (!is_recording.load() || pInput == NULL) return; + std::lock_guard lock(buffer_mutex); + audio_buffer.insert(audio_buffer.end(), (float*)pInput, (float*)pInput + frameCount); + recorded_seconds.store(static_cast(audio_buffer.size() / (float)RecordingConfig::SAMPLE_RATE)); +} + +int main(int argc, char** argv) { + signal(SIGINT, signal_handler); + if (argc < 2) return 1; + if (argc >= 3) g_timeout_limit = atoi(argv[2]); + + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = true; + struct whisper_context* ctx = whisper_init_from_file_with_params(argv[1], cparams); + + ma_context context; ma_context_init(NULL, 0, NULL, &context); + ma_device_info* pCapInfos; ma_uint32 capCount; + ma_context_get_devices(&context, NULL, NULL, &pCapInfos, &capCount); + + printf("\n📜 可用麦克风列表:\n"); + for (ma_uint32 i = 0; i < capCount; ++i) printf(" [%u] %s\n", i, pCapInfos[i].name); + printf("👉 请输入设备 ID (默认5): "); + ma_uint32 dev_id = 5; + if(scanf("%u", &dev_id) != 1) dev_id = 5; + clear_stdin(); + + ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); + devCfg.capture.format = ma_format_f32; devCfg.capture.channels = 1; + devCfg.sampleRate = RecordingConfig::SAMPLE_RATE; devCfg.dataCallback = data_callback; + if (dev_id < capCount) devCfg.capture.pDeviceID = &pCapInfos[dev_id].id; + + ma_device device; ma_device_init(&context, &devCfg, &device); + ma_device_start(&device); + + while (!exit_program.load()) { + printf("\n=============================================\n"); + printf("🎙️ 操作提示 (自动断开设置: %d 秒):\n", g_timeout_limit); + printf(" ▶ [回车键] : 开始录制\n"); + printf(" ■ [回车键] : 停止录制 (含 1.5 秒补录)\n"); + printf("=============================================\n"); + printf("👉 等待指令..."); + fflush(stdout); + + while (!check_stdin_ready(100) && !exit_program.load()); + if (exit_program.load()) break; + clear_stdin(); + + { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } + recorded_seconds.store(0); + is_recording.store(true); + auto start_time = std::chrono::steady_clock::now(); + + printf("\n🎙️ 录制中...\n"); + + std::thread progress_thread([&]() { + while (is_recording.load()) { + printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, recorded_seconds.load()); + fflush(stdout); + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::PROGRESS_MS)); + } + }); + + bool trigger_stop = false; + while (!trigger_stop && !exit_program.load()) { + auto now = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast(now - start_time).count(); + + if (check_stdin_ready(RecordingConfig::UI_LOOP_MS)) { + if (getchar() == '\n') { + // 使用 \r 覆盖并清理残余 + printf("\r%s\r🛑 手动停止,正在收尾以确保不丢字...", RecordingConfig::CLEAR_LINE); + fflush(stdout); + trigger_stop = true; + } + } + else if (elapsed >= (g_timeout_limit * 1000 + RecordingConfig::CLOCK_TOLERANCE_MS)) { + printf("\r%s\r📊 进度: %d 秒", RecordingConfig::CLEAR_LINE, g_timeout_limit); + printf("\n⏱️ 时间已到 (%d秒),正在自动收尾...", g_timeout_limit); + fflush(stdout); + trigger_stop = true; + } + } + + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::SMOOTH_FINISH_MS)); + is_recording.store(false); + if (progress_thread.joinable()) progress_thread.join(); + + std::vector captured; + { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } + + printf("\n🔍 正在识别 (音频长度: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); + auto start_recognition = std::chrono::steady_clock::now(); + + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + // 【正式固化】简体中文引导词 + wparams.initial_prompt = "以下是普通话,使用简体中文输出。"; + wparams.n_threads = 4; + + whisper_full(ctx, wparams, captured.data(), captured.size()); + + int n_segments = whisper_full_n_segments(ctx); + auto elapsed = std::chrono::duration_cast(std::chrono::steady_clock::now() - start_recognition).count(); + printf("\n📝 识别结果(%.2f秒):", elapsed/1000.0f); + for (int i = 0; i < n_segments; ++i) { + printf("\n %s", whisper_full_get_segment_text(ctx, i)); + } + printf("\n"); + } + + ma_device_uninit(&device); + ma_context_uninit(&context); + whisper_free(ctx); + return 0; +} diff --git a/examples/mic/CMakeLists.txt b/examples/mic/CMakeLists.txt new file mode 100644 index 00000000000..29f7b21da5a --- /dev/null +++ b/examples/mic/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET whisper-mic) +add_executable(${TARGET} whisper-mic.cpp) + +include(DefaultTargetOptions) + +target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT}) + +install(TARGETS ${TARGET} RUNTIME) diff --git a/examples/mic/README.md b/examples/mic/README.md new file mode 100644 index 00000000000..a8d0e5fe2df --- /dev/null +++ b/examples/mic/README.md @@ -0,0 +1,43 @@ +# whisper.cpp/examples/mic + +This example captures live microphone audio and performs manual start/stop transcription. +Unlike `whisper-stream`, it records a full segment first and then transcribes that segment. + +## Run + +```bash +./build/bin/whisper-mic -m ./models/ggml-base.bin +``` + +Press Enter to start recording, then press Enter again to stop and transcribe. + +## Options + +```text + -h, --help show help and exit + -m F, --model F model path + -t N, --timeout N max recording time in seconds + -c N, --capture N capture device ID + -l S, --language S language (for example: zh, en) + -ng, --no-gpu disable GPU inference +``` + +## Build + +```bash +cmake -B build +cmake --build build --config Release -j +``` + +## GPU build (optional) + +```bash +cmake -S . -B build_gpu -DGGML_CUDA=ON +cmake --build build_gpu --config Release -j +``` + +Then run: + +```bash +./build_gpu/bin/whisper-mic -m ./models/ggml-base.bin +``` diff --git a/examples/mic/whisper-mic.cpp b/examples/mic/whisper-mic.cpp new file mode 100644 index 00000000000..7c3d83ef3b6 --- /dev/null +++ b/examples/mic/whisper-mic.cpp @@ -0,0 +1,259 @@ +#include + +struct mic_params { + std::string model = "models/ggml-base.bin"; + int timeout = 30; + int capture_id = 5; + std::string language = "zh"; + bool use_gpu = true; +}; + +void mic_print_usage(int argc, char ** argv, const mic_params & params) { + printf("\n"); + printf("usage: %s [options]\n", argv[0]); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -m F, --model F [%-7s] model path\n", params.model.c_str()); + printf(" -t N, --timeout N [%-7d] max recording time in seconds\n", params.timeout); + printf(" -c N, --capture N [%-7d] capture device ID\n", params.capture_id); + printf(" -l S, --language S [%-7s] language (e.g. zh, en)\n", params.language.c_str()); + printf(" -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true"); + printf("\n"); + printf("example: %s -m models/ggml-base.bin -t 30 -c 0 -l zh\n", argv[0]); + printf("\n"); +} + +static bool mic_params_parse(int argc, char ** argv, mic_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg == "-h" || arg == "--help") { + mic_print_usage(argc, argv, params); + exit(0); + } else if (arg == "-m" || arg == "--model") { + params.model = argv[++i]; + } else if (arg == "-t" || arg == "--timeout") { + params.timeout = std::stoi(argv[++i]); + } else if (arg == "-c" || arg == "--capture") { + params.capture_id = std::stoi(argv[++i]); + } else if (arg == "-l" || arg == "--language") { + params.language = argv[++i]; + } else if (arg == "-ng" || arg == "--no-gpu") { + params.use_gpu = false; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + mic_print_usage(argc, argv, params); + exit(1); + } + } + return true; +} +#include "whisper.h" +#include "common.h" +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ============================================= +// Shared constants +// ============================================= +struct RecordingConfig { + static constexpr int SAMPLE_RATE = 16000; + static constexpr int PROGRESS_MS = 100; + static constexpr int UI_LOOP_MS = 10; + static constexpr int SELECT_TIMEOUT_MS = 20; + static constexpr int SMOOTH_FINISH_MS = 300; + static constexpr int CLOCK_TOLERANCE_MS = 350; + // Used to clear leftover characters in terminal progress rendering. + static const char* CLEAR_LINE; +}; +const char* RecordingConfig::CLEAR_LINE = " "; + +std::atomic is_recording(false); +std::atomic exit_program(false); +std::atomic recorded_seconds(0); +std::vector audio_buffer; +std::mutex buffer_mutex; +int g_timeout_limit = 30; + +void signal_handler(int sig) { + if (sig == SIGINT) { + exit_program.store(true); + is_recording.store(false); + exit(0); + } +} + +bool check_stdin_ready(int timeout_ms = RecordingConfig::SELECT_TIMEOUT_MS) { + fd_set fds; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); + struct timeval tv = {0, timeout_ms * 1000}; + return select(STDIN_FILENO + 1, &fds, NULL, NULL, &tv) > 0; +} + +void clear_stdin() { + while (check_stdin_ready(0)) getchar(); +} + +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + if (!is_recording.load() || pInput == NULL) return; + std::lock_guard lock(buffer_mutex); + audio_buffer.insert(audio_buffer.end(), (float*)pInput, (float*)pInput + frameCount); + recorded_seconds.store(static_cast(audio_buffer.size() / (float)RecordingConfig::SAMPLE_RATE)); +} + +int main(int argc, char** argv) { + signal(SIGINT, signal_handler); + mic_params params; + mic_params_parse(argc, argv, params); + g_timeout_limit = params.timeout; + + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = params.use_gpu; + struct whisper_context* ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); + if (!ctx) { + fprintf(stderr, "Failed to load model from '%s'\n", params.model.c_str()); + return 1; + } + + ma_context context; + if (ma_context_init(NULL, 0, NULL, &context) != MA_SUCCESS) { + fprintf(stderr, "Failed to initialize miniaudio context\n"); + whisper_free(ctx); + return 1; + } + ma_device_info* pCapInfos; ma_uint32 capCount; + if (ma_context_get_devices(&context, NULL, NULL, &pCapInfos, &capCount) != MA_SUCCESS || capCount == 0) { + fprintf(stderr, "No audio capture devices found\n"); + ma_context_uninit(&context); + whisper_free(ctx); + return 1; + } + + printf("\n📜 Available microphones:\n"); + for (ma_uint32 i = 0; i < capCount; ++i) printf(" [%u] %s\n", i, pCapInfos[i].name); + printf("👉 Enter device ID (default %d): ", params.capture_id); + ma_uint32 dev_id = params.capture_id; + if(scanf("%u", &dev_id) != 1) dev_id = params.capture_id; + clear_stdin(); + + ma_device_config devCfg = ma_device_config_init(ma_device_type_capture); + devCfg.capture.format = ma_format_f32; devCfg.capture.channels = 1; + devCfg.sampleRate = RecordingConfig::SAMPLE_RATE; devCfg.dataCallback = data_callback; + if (dev_id < capCount) devCfg.capture.pDeviceID = &pCapInfos[dev_id].id; + + ma_device device; + if (ma_device_init(&context, &devCfg, &device) != MA_SUCCESS) { + fprintf(stderr, "Failed to initialize audio capture device\n"); + ma_context_uninit(&context); + whisper_free(ctx); + return 1; + } + if (ma_device_start(&device) != MA_SUCCESS) { + fprintf(stderr, "Failed to start audio capture device\n"); + ma_device_uninit(&device); + ma_context_uninit(&context); + whisper_free(ctx); + return 1; + } + + while (!exit_program.load()) { + printf("\n=============================================\n"); + printf("🎙️ Controls (auto stop after %d seconds):\n", g_timeout_limit); + printf(" ▶ [Enter] : start recording\n"); + printf(" ■ [Enter] : stop recording (with 1.5s tail capture)\n"); + printf("=============================================\n"); + printf("👉 Waiting for input..."); + fflush(stdout); + + while (!check_stdin_ready(100) && !exit_program.load()); + if (exit_program.load()) break; + clear_stdin(); + + { std::lock_guard lock(buffer_mutex); audio_buffer.clear(); } + recorded_seconds.store(0); + is_recording.store(true); + auto start_time = std::chrono::steady_clock::now(); + + printf("\n🎙️ Recording...\n"); + + std::thread progress_thread([&]() { + while (is_recording.load()) { + printf("\r%s\r📊 Elapsed: %d s", RecordingConfig::CLEAR_LINE, recorded_seconds.load()); + fflush(stdout); + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::PROGRESS_MS)); + } + }); + + bool trigger_stop = false; + while (!trigger_stop && !exit_program.load()) { + auto now = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast(now - start_time).count(); + + if (check_stdin_ready(RecordingConfig::UI_LOOP_MS)) { + if (getchar() == '\n') { + // Use carriage return to overwrite and clear the previous line. + printf("\r%s\r🛑 Stopping manually, finalizing capture...", RecordingConfig::CLEAR_LINE); + fflush(stdout); + trigger_stop = true; + } + } + else if (elapsed >= (g_timeout_limit * 1000 + RecordingConfig::CLOCK_TOLERANCE_MS)) { + printf("\r%s\r📊 Elapsed: %d s", RecordingConfig::CLEAR_LINE, g_timeout_limit); + printf("\n⏱️ Time limit reached (%d s), finalizing capture...", g_timeout_limit); + fflush(stdout); + trigger_stop = true; + } + } + + + // Stop progress meter immediately + is_recording.store(false); + if (progress_thread.joinable()) progress_thread.join(); + // Now wait for smooth finish (buffer tail) + std::this_thread::sleep_for(std::chrono::milliseconds(RecordingConfig::SMOOTH_FINISH_MS)); + + std::vector captured; + { std::lock_guard lock(buffer_mutex); captured = audio_buffer; } + + if (captured.empty()) { + printf("\n⚠️ No audio captured, skipping recognition.\n"); + continue; + } + + printf("\n🔍 Running recognition (audio length: %.2fs)...", (float)captured.size()/RecordingConfig::SAMPLE_RATE); + auto start_recognition = std::chrono::steady_clock::now(); + + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = params.language.c_str(); + if (params.language == "zh") { + wparams.initial_prompt = "The following speech is Mandarin Chinese. Output in Simplified Chinese."; + } + wparams.n_threads = 4; + + if (whisper_full(ctx, wparams, captured.data(), captured.size()) != 0) { + printf("\n❌ Speech recognition failed.\n"); + continue; + } + + int n_segments = whisper_full_n_segments(ctx); + auto elapsed = std::chrono::duration_cast(std::chrono::steady_clock::now() - start_recognition).count(); + printf("\n📝 Recognition result (%.2f s):", elapsed/1000.0f); + for (int i = 0; i < n_segments; ++i) { + printf("\n %s", whisper_full_get_segment_text(ctx, i)); + } + printf("\n"); + } + + ma_device_uninit(&device); + ma_context_uninit(&context); + whisper_free(ctx); + return 0; +} diff --git a/minimal_mic.cpp b/minimal_mic.cpp new file mode 100644 index 00000000000..de03ee67fb0 --- /dev/null +++ b/minimal_mic.cpp @@ -0,0 +1,82 @@ +#include "whisper.h" +#include "common.h" + +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" + +#include +#include +#include + +// 音频回调:将采集到的数据存入 buffer +void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) { + std::vector* pBuffer = (std::vector*)pDevice->pUserData; + const float* pInputFloat = (const float*)pInput; + if (pInputFloat == NULL) return; + + pBuffer->insert(pBuffer->end(), pInputFloat, pInputFloat + frameCount); + // 保持 buffer 在最近 10 秒以内,防止内存溢出 + if (pBuffer->size() > 16000 * 10) { + pBuffer->erase(pBuffer->begin(), pBuffer->begin() + (pBuffer->size() - 16000 * 10)); + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const char* model_path = argv[1]; + + // 1. 初始化 Whisper + struct whisper_context_params cparams = whisper_context_default_params(); + cparams.use_gpu = true; // 你的 4050 显卡 + struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams); + if (!ctx) return 1; + + // 2. 初始化 Miniaudio + std::vector audio_buffer; + ma_device_config deviceConfig = ma_device_config_init(ma_device_type_capture); + deviceConfig.capture.format = ma_format_f32; // Whisper 需要 float32 + deviceConfig.capture.channels = 1; // 单声道 + deviceConfig.sampleRate = 16000; // Whisper 硬指标 16kHz + deviceConfig.dataCallback = data_callback; + deviceConfig.pUserData = &audio_buffer; + + ma_device device; + if (ma_device_init(NULL, &deviceConfig, &device) != MA_SUCCESS) { + fprintf(stderr, "Failed to open capture device.\n"); + return -2; + } + + ma_device_start(&device); + printf("🎤 录音中... 请说话 (按回车键进行单次识别,Ctrl+C 退出)\n"); + + while (true) { + getchar(); // 等待用户敲回车触发识别 + + printf("正在识别...\n"); + + whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.language = "zh"; + wparams.n_threads = 12; + wparams.print_progress = false; + + if (whisper_full(ctx, wparams, audio_buffer.data(), audio_buffer.size()) != 0) { + fprintf(stderr, "识别失败\n"); + continue; + } + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char* text = whisper_full_get_segment_text(ctx, i); + printf("📝: %s\n", text); + } + audio_buffer.clear(); // 清空,准备下一轮 + } + + ma_device_uninit(&device); + whisper_free(ctx); + return 0; +} diff --git a/run.txt b/run.txt new file mode 100644 index 00000000000..1dbc2c4800a --- /dev/null +++ b/run.txt @@ -0,0 +1,16 @@ +export LD_LIBRARY_PATH=./build/src +./minimal_mic ./models/ggml-small.bin + +# 确保运行时能找到 CUDA 13.1 的库 +export LD_LIBRARY_PATH=/usr/local/cuda-13.1/lib64:./build/src:$LD_LIBRARY_PATH +./doubao_mic.exe ./models/ggml-medium.bin + +cat ~/.bashrc | tail -n 6 +export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +export PATH=$PATH:/home/nick/.local/bin +alias cmake='/snap/bin/cmake' +export CUDA_HOME=/usr/local/cuda-13.1 +export PATH=$CUDA_HOME/bin:$PATH +export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$CUDA_HOME/targets/x86_64-linux/lib:$LD_LIBRARY_PATH + +export LD_LIBRARY_PATH=$(pwd)/build_gpu/src:$(pwd)/build_gpu/ggml/src:/usr/local/cuda-13.1/lib64:/usr/local/cuda-13.1/targets/x86_64-linux/lib:$LD_LIBRARY_PATH