Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
8146d4f
compile and download run with microphone, not solve usb apply headset…
nickhuang99 Mar 17, 2026
91958f1
very good result with one issue of 30seconds not reached and trunk
nickhuang99 Mar 17, 2026
4c21543
try to solve 30 seconds issue, now 60 seconds, but last few words los…
nickhuang99 Mar 17, 2026
b6cd38c
doubao must fix timeout still need key pressing
nickhuang99 Mar 17, 2026
8b98941
doubao still not fix timeout remaining voice and also introduce new i…
nickhuang99 Mar 17, 2026
bb17b83
most of case fixed, except when timeout the last one second maybe los…
nickhuang99 Mar 17, 2026
588bf73
doubao needs to use same logic handle both key enter and timeout
nickhuang99 Mar 17, 2026
eb7f9a8
doubao deadlock
nickhuang99 Mar 17, 2026
1aab24d
not very sure bug is fixed. seems ok, now let me ask gemini
nickhuang99 Mar 17, 2026
b6abd7b
gemini claims fixed the issue, now testing
nickhuang99 Mar 17, 2026
e9a5725
gemini config cuda which is very complicated
nickhuang99 Mar 17, 2026
938cec6
gemini also try to fix cuda and dead lock
nickhuang99 Mar 17, 2026
6b0a5f3
now gpu fixed. gemini has a timeout ends earlier explanantion with a …
nickhuang99 Mar 17, 2026
84866c2
gemini clear up code but still 28 seconds ends earlier than timeout e…
nickhuang99 Mar 17, 2026
3ec051f
finally find the issue of progress code is interferencing
nickhuang99 Mar 17, 2026
5b4dfa1
gemini fixed the timeout issue, most likely, but need device/mic sele…
nickhuang99 Mar 17, 2026
95df75f
cannot believe gemini also makes human error of boundary checking
nickhuang99 Mar 17, 2026
8eb7806
ask gemini to add timeout hint back
nickhuang99 Mar 17, 2026
26ad600
gemini try to solve traditional chinese issue
nickhuang99 Mar 17, 2026
3c63508
finally let's close this project
nickhuang99 Mar 17, 2026
9e0b95e
finalize project by adding recogniztionduration
nickhuang99 Mar 17, 2026
ce629ed
Move doubao_mic.cpp into examples/ directory to begin integration as …
nickhuang99 Mar 19, 2026
0a45739
Move and rename doubao_mic.cpp to examples/mic/whisper-mic.cpp follow…
nickhuang99 Mar 19, 2026
ecd3794
Add CMakeLists.txt for mic/ directory to build whisper-mic example
nickhuang99 Mar 19, 2026
5961f52
Add mic/ as a subdirectory in examples/CMakeLists.txt to build whispe…
nickhuang99 Mar 19, 2026
f71c28c
Add build_mic.sh script to build examples/mic/whisper-mic.cpp using C…
nickhuang99 Mar 19, 2026
31f154f
examples: refine whisper-mic params and upstream-style cmake
nickhuang99 Mar 19, 2026
3ba2906
examples: add mic docs and limit mic example to unix
nickhuang99 Mar 20, 2026
53a289d
examples: translate whisper-mic prompts to english
nickhuang99 Mar 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,7 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
| [whisper-cli](examples/cli) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
| [whisper-bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine |
| [whisper-stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
| [whisper-mic](examples/mic) | | Manual start/stop microphone transcription with interactive device selection |
| [whisper-command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
| [whisper-server](examples/server) | | HTTP transcription server with OAI-like API |
| [whisper-talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
Expand Down
14 changes: 14 additions & 0 deletions compile.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
g++ -O3 minimal_mic.cpp \
-I. -I./include -I./ggml/include -I./examples \
./build/src/libwhisper.so \
-L/usr/local/cuda/lib64 -lcudart -lcublas \
-lpthread -ldl -lm -lrt -o minimal_mic

g++ -O3 doubao_mic.cpp -I. -I./include -I./ggml/include -I./examples ./build_gpu/src/libwhisper.so -L/usr/local/cuda/lib64 -lcudart -lcublas -lportaudio -lpthread -ldl -lm -lrt -o doubao_mic.exe

g++ -std=c++17 -O3 doubao_mic.cpp -I. -I./include -I./ggml/include -I./examples ./build_gpu/src/libwhisper.so -L/usr/local/cuda-13.1/lib64 -lcudart -lcublas -lportaudio -lpthread -ldl -lm -lrt -o doubao_mic.exe

gpu build:
# 显式指定使用 CUDA 13.1 编译
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc
cmake --build build --config Release -j$(nproc)
157 changes: 157 additions & 0 deletions doubao.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#include "whisper.h"
#include "common.h"

#define MINIAUDIO_IMPLEMENTATION
#include "miniaudio.h"

#include <vector>
#include <cstdio>
#include <string>
#include <atomic>
#include <chrono>
#include <thread>

// 全局原子变量控制录制状态(线程安全)
std::atomic<bool> is_recording(false);
// 音频缓冲区
std::vector<float> audio_buffer;

// 音频回调:仅在录制状态时才采集数据
void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) {
if (!is_recording.load()) return; // 非录制状态直接返回,不采集数据
const float* pInputFloat = (const float*)pInput;
if (pInputFloat == NULL) return;

// 采集数据到缓冲区(限制最大录制时长为30秒,防止溢出)
const size_t max_frames = 16000 * 30; // 30秒 @ 16kHz
const size_t available = max_frames - audio_buffer.size();
if (available == 0) return; // 缓冲区已满,停止采集

const size_t copy_frames = (frameCount > available) ? available : frameCount;
audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + copy_frames);
}

// 提示信息函数
void print_usage() {
printf("=============================================\n");
printf("🎤 语音识别程序(精准录制版)\n");
printf("操作说明:\n");
printf(" 1. 按下【回车键】开始录制\n");
printf(" 2. 说话完成后,再次按下【回车键】停止录制并识别\n");
printf(" 3. 录制超过30秒会自动停止\n");
printf(" 4. Ctrl+C 退出程序\n");
printf("=============================================\n");
}

int main(int argc, char** argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <model_path>\n", argv[0]);
return 1;
}

const char* model_path = argv[1];

// 1. 初始化 Whisper
struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = true; // 4050 显卡
struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams);
if (!ctx) {
fprintf(stderr, "❌ 初始化Whisper模型失败\n");
return 1;
}

// 2. 初始化 Miniaudio(仅初始化设备,不立即采集)
ma_device_config deviceConfig = ma_device_config_init(ma_device_type_capture);
deviceConfig.capture.format = ma_format_f32; // Whisper 需要 float32
deviceConfig.capture.channels = 1; // 单声道
deviceConfig.sampleRate = 16000; // Whisper 硬指标 16kHz
deviceConfig.dataCallback = data_callback;
deviceConfig.pUserData = nullptr; // 不再传buffer,用全局变量

ma_device device;
if (ma_device_init(NULL, &deviceConfig, &device) != MA_SUCCESS) {
fprintf(stderr, "❌ 打开录音设备失败\n");
whisper_free(ctx);
return -2;
}

// 启动设备(但此时is_recording=false,不会采集数据)
if (ma_device_start(&device) != MA_SUCCESS) {
fprintf(stderr, "❌ 启动录音设备失败\n");
ma_device_uninit(&device);
whisper_free(ctx);
return -3;
}

print_usage();

while (true) {
// 第一步:等待用户按回车开始录制
printf("\n👉 按下回车键开始录制...\n");
getchar(); // 等待回车

// 开始录制
is_recording.store(true);
audio_buffer.clear(); // 清空旧数据
printf("🎙️ 正在录制(说话完成后按回车键停止,最长录制30秒)...\n");

// 等待用户停止录制(按回车)或超时30秒
std::thread wait_thread([&]() {
getchar(); // 等待用户按回车停止
is_recording.store(false);
});

// 超时控制(30秒)
auto start_time = std::chrono::steady_clock::now();
while (is_recording.load()) {
auto now = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::seconds>(now - start_time).count();
if (duration >= 30) {
printf("⏱️ 录制超时(30秒),自动停止\n");
is_recording.store(false);
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // 避免CPU空转
}

wait_thread.join(); // 等待停止线程结束
is_recording.store(false); // 确保录制停止

// 检查录制的数据量
if (audio_buffer.empty()) {
printf("⚠️ 未采集到任何音频数据,请重新录制\n");
continue;
}

// 第二步:开始识别
printf("🔍 正在识别...\n");
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.language = "zh";
wparams.n_threads = 12;
wparams.print_progress = false;
wparams.print_realtime = false;

if (whisper_full(ctx, wparams, audio_buffer.data(), audio_buffer.size()) != 0) {
fprintf(stderr, "❌ 识别失败\n");
continue;
}

// 输出识别结果
const int n_segments = whisper_full_n_segments(ctx);
if (n_segments == 0) {
printf("📝: 未识别到有效内容\n");
} else {
printf("📝 识别结果:\n");
for (int i = 0; i < n_segments; ++i) {
const char* text = whisper_full_get_segment_text(ctx, i);
printf(" %s\n", text);
}
}
}

// 清理资源(实际中Ctrl+C会中断,这里是兜底)
ma_device_uninit(&device);
whisper_free(ctx);
return 0;
}

195 changes: 195 additions & 0 deletions doubao_gpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#include "whisper.h"
#include "common.h"

#define MINIAUDIO_IMPLEMENTATION
#include "miniaudio.h"

#include <vector>
#include <cstdio>
#include <string>
#include <atomic>
#include <chrono>
#include <thread>

// 全局原子变量控制录制状态(线程安全)
std::atomic<bool> is_recording(false);
// 音频缓冲区
std::vector<float> audio_buffer;

// 音频回调:仅在录制状态时才采集数据
void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) {
if (!is_recording.load()) return; // 非录制状态直接返回,不采集数据
const float* pInputFloat = (const float*)pInput;
if (pInputFloat == NULL) return;

// 采集数据到缓冲区(限制最大录制时长为30秒,防止溢出)
const size_t max_frames = 16000 * 30; // 30秒 @ 16kHz
const size_t available = max_frames - audio_buffer.size();
if (available == 0) return; // 缓冲区已满,停止采集

const size_t copy_frames = (frameCount > available) ? available : frameCount;
audio_buffer.insert(audio_buffer.end(), pInputFloat, pInputFloat + copy_frames);
}

// 提示信息函数
void print_usage() {
printf("=============================================\n");
printf("🎤 语音识别程序(精准录制版)\n");
printf("操作说明:\n");
printf(" 1. 按下【回车键】开始录制\n");
printf(" 2. 说话完成后,再次按下【回车键】停止录制并识别\n");
printf(" 3. 录制超过30秒会自动停止\n");
printf(" 4. Ctrl+C 退出程序\n");
printf("=============================================\n");
}

// 适配旧版本的GPU状态提示(不依赖新函数)
void check_gpu_status() {
printf("🔍 GPU加速配置说明...\n");
printf(" 当前已启用GPU加速(use_gpu = true)\n");
printf(" ✅ 如果编译时链接了CUDA库,模型会自动使用GPU\n");
printf(" ❌ 如果识别速度很慢,说明实际使用CPU运行\n");
printf(" 验证方法:观察识别耗时,GPU版本比CPU快5-10倍\n");
}

int main(int argc, char** argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <model_path>\n", argv[0]);
return 1;
}

const char* model_path = argv[1];

// GPU状态提示(适配旧版本)
check_gpu_status();

// 1. 初始化 Whisper(仅保留旧版本支持的参数)
struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = true; // 启用GPU(旧版本核心参数)
// 移除use_gpu_fp16和gpu_device(旧版本没有这些字段)

printf("\n🚀 正在加载模型:%s\n", model_path);
struct whisper_context* ctx = whisper_init_from_file_with_params(model_path, cparams);
if (!ctx) {
fprintf(stderr, "❌ 初始化Whisper模型失败\n");
return 1;
}

// 旧版本没有whisper_is_using_gpu,改用间接提示
printf("✅ 模型加载成功!\n");
printf(" 📌 若识别速度快(几秒内完成)= GPU运行\n");
printf(" 📌 若识别速度慢(十几秒/分钟)= CPU运行\n");

// 2. 初始化 Miniaudio(仅初始化设备,不立即采集)
ma_device_config deviceConfig = ma_device_config_init(ma_device_type_capture);
deviceConfig.capture.format = ma_format_f32; // Whisper 需要 float32
deviceConfig.capture.channels = 1; // 单声道
deviceConfig.sampleRate = 16000; // Whisper 硬指标 16kHz
deviceConfig.dataCallback = data_callback;
deviceConfig.pUserData = nullptr;

ma_device device;
if (ma_device_init(NULL, &deviceConfig, &device) != MA_SUCCESS) {
fprintf(stderr, "❌ 打开录音设备失败\n");
whisper_free(ctx);
return -2;
}

// 启动设备(但此时is_recording=false,不会采集数据)
if (ma_device_start(&device) != MA_SUCCESS) {
fprintf(stderr, "❌ 启动录音设备失败\n");
ma_device_uninit(&device);
whisper_free(ctx);
return -3;
}

print_usage();

while (true) {
// 第一步:等待用户按回车开始录制
printf("\n👉 按下回车键开始录制...\n");
getchar(); // 等待回车

// 开始录制
is_recording.store(true);
audio_buffer.clear(); // 清空旧数据
printf("🎙️ 正在录制(说话完成后按回车键停止,最长录制30秒)...\n");

// 等待用户停止录制(按回车)或超时30秒
std::thread wait_thread([&]() {
getchar(); // 等待用户按回车停止
is_recording.store(false);
});

// 超时控制(30秒)
auto start_time = std::chrono::steady_clock::now();
while (is_recording.load()) {
auto now = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::seconds>(now - start_time).count();
if (duration >= 30) {
printf("⏱️ 录制超时(30秒),自动停止\n");
is_recording.store(false);
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // 避免CPU空转
}

wait_thread.join(); // 等待停止线程结束
is_recording.store(false); // 确保录制停止

// 检查录制的数据量
if (audio_buffer.empty()) {
printf("⚠️ 未采集到任何音频数据,请重新录制\n");
continue;
}

// 第二步:开始识别(优化识别参数提升精度)
printf("🔍 正在识别...\n");
// 记录识别开始时间(用于判断GPU/CPU)
auto recognize_start = std::chrono::steady_clock::now();

whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.language = "zh";
wparams.n_threads = 12; // 根据CPU核心数调整
wparams.print_progress = false;
wparams.print_realtime = false;

// 精度优化参数(旧版本也支持)
wparams.temperature = 0.0; // 降低随机性,提升稳定性
wparams.max_len = 0; // 不限制输出长度
wparams.translate = false; // 不翻译,直接识别
wparams.no_context = true; // 不使用上下文,避免干扰

if (whisper_full(ctx, wparams, audio_buffer.data(), audio_buffer.size()) != 0) {
fprintf(stderr, "❌ 识别失败\n");
continue;
}

// 计算识别耗时(判断GPU/CPU)
auto recognize_end = std::chrono::steady_clock::now();
auto recognize_duration = std::chrono::duration_cast<std::chrono::milliseconds>(recognize_end - recognize_start).count();
printf("⏱️ 识别耗时:%.2f 秒\n", recognize_duration / 1000.0);
if (recognize_duration < 5000) {
printf(" 🎯 识别速度快,应该是GPU在运行!\n");
} else {
printf(" ⚠️ 识别速度慢,可能是CPU在运行!\n");
}

// 输出识别结果
const int n_segments = whisper_full_n_segments(ctx);
if (n_segments == 0) {
printf("📝: 未识别到有效内容\n");
} else {
printf("📝 识别结果:\n");
for (int i = 0; i < n_segments; ++i) {
const char* text = whisper_full_get_segment_text(ctx, i);
printf(" %s\n", text);
}
}
}

// 清理资源
ma_device_uninit(&device);
whisper_free(ctx);
return 0;
}
Loading