-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.cpp
More file actions
233 lines (195 loc) · 8.78 KB
/
main.cpp
File metadata and controls
233 lines (195 loc) · 8.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#include <windows.h> // For SetConsoleOutputCP
#include "audio/mic_stream.h"
#include "audio/vad.h"
#include "persona/persona_state.h"
#include "llm/llama_stream.h"
#include "asr/whisper_stream.h"
#include "tts/tts_stream.h"
#include "controller/dialogue_controller.h"
#include "utils/perf_monitor.h"
#include <queue>
#include <mutex>
#include <condition_variable>
#include <chrono>
#include <thread>
#include <vector>
#include <string>
#include <iostream>
#include <atomic>
// Thread-safe queue for audio chunks.
// Producer: the microphone callback registered in main(); consumer: processing_thread().
std::queue<std::vector<int16_t>> audio_queue;
std::mutex queue_mutex;            // guards audio_queue
std::condition_variable queue_cv;  // signalled when audio_queue gains an item or on shutdown
std::atomic<bool> running(true);   // cleared by main() to request worker-thread shutdown
// Global flag for response thread management
std::atomic<bool> response_in_progress(false);  // set while a detached ASR/response thread is working
std::atomic<bool> test_mode_active(false);      // true while CLI test injection runs; mic audio is discarded
// Audio-processing worker loop (runs on its own thread, started by main()).
// Pulls audio chunks from the shared audio_queue (512 samples @ 16 kHz = 32 ms
// per chunk, per the MicrophoneStream setup in main()) and drives a small
// state machine:
//   1. Barge-in: if the agent is speaking and sustained user speech is
//      detected, interrupt the agent and start capturing a new utterance.
//   2. Capture: while the VAD reports speech, samples accumulate in
//      audio_buffer; long utterances trigger a TTS backchannel.
//   3. End-of-utterance: after ~500 ms of silence the buffer is handed to a
//      detached thread that runs Whisper ASR and forwards the transcript to
//      the dialogue controller.
// Exits when `running` is cleared and the queue has been drained.
void processing_thread(VAD* vad, WhisperASR* asr, DialogueController* controller) {
    std::vector<int16_t> audio_buffer;  // samples of the in-progress utterance
    bool is_speaking = false;           // true while the user is mid-utterance
    int silence_frames = 0;             // consecutive silent chunks since last speech
    auto& monitor = PerfMonitor::getInstance();
    // Interruption state
    int interrupt_frames = 0;           // consecutive speech chunks heard while the agent talks
    // Backchanneling state
    auto last_backchannel_time = std::chrono::steady_clock::now();
    int speech_chunk_count = 0;         // speech chunks seen in the current utterance
    while (running) {
        std::vector<int16_t> chunk;
        {
            // Block until audio arrives or shutdown is requested; the
            // predicate is re-checked on every wakeup.
            std::unique_lock<std::mutex> lock(queue_mutex);
            queue_cv.wait(lock, [] { return !audio_queue.empty() || !running; });
            if (!running && audio_queue.empty()) break;
            chunk = std::move(audio_queue.front());
            audio_queue.pop();
        }
        if (test_mode_active) {
            // Ignore microphone while automation is running: discard all
            // queued chunks and idle briefly.
            {
                std::lock_guard<std::mutex> lock(queue_mutex);
                while(!audio_queue.empty()) audio_queue.pop();
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            continue;
        }
        bool vad_active = vad->isSpeech(chunk.data(), (int)chunk.size());
        // FULL DUPLEX 1: Interruption (with debounce to avoid echo trigger)
        if (controller->agentSpeaking) {
            if (vad_active) {
                interrupt_frames++;
                if (interrupt_frames > 8) { // ~250ms of sustained sound
                    std::cout << "\n[INTERRUPT] User speech detected while agent speaking!" << std::endl;
                    controller->handleInterrupt();
                    // Begin a fresh utterance from this chunk onward.
                    is_speaking = true;
                    audio_buffer.clear();
                    audio_buffer.insert(audio_buffer.end(), chunk.begin(), chunk.end());
                    silence_frames = 0;
                    interrupt_frames = 0;
                    continue;
                }
            } else {
                interrupt_frames = 0;
            }
        } else {
            interrupt_frames = 0;
        }
        if (vad_active) {
            if (!is_speaking) {
                // Transition: silence -> speech. Start a new utterance.
                std::cout << "\n[User detected]: " << std::flush;
                is_speaking = true;
                audio_buffer.clear();
                speech_chunk_count = 0;
                last_backchannel_time = std::chrono::steady_clock::now();
            }
            audio_buffer.insert(audio_buffer.end(), chunk.begin(), chunk.end());
            silence_frames = 0;
            std::cout << "." << std::flush;
            // FULL DUPLEX 2: Backchanneling
            // If user speaks for > 4 seconds, throw in an "uh-huh" or "yeah"
            speech_chunk_count++;
            auto now = std::chrono::steady_clock::now();
            auto duration = std::chrono::duration_cast<std::chrono::seconds>(now - last_backchannel_time).count();
            if (speech_chunk_count > 120 && duration > 4) { // 120 chunks * 32ms ~= 3.8s
                std::cout << "\n[Backchanneling...]" << std::flush;
                controller->tts->playBackchannel("generic");
                last_backchannel_time = std::chrono::steady_clock::now();
            }
        } else {
            if (is_speaking) {
                silence_frames++;
                // 17 frames ~= 500ms
                if (silence_frames > 17) {
                    // End of utterance: transcribe on a detached thread so
                    // this loop keeps consuming mic audio in the meantime.
                    std::cout << " [Processing...]" << std::endl;
                    monitor.startTimer("E2E");
                    monitor.startTimer("ASR");
                    std::vector<int16_t> buffer_copy = audio_buffer;
                    // NOTE(review): this detached thread captures raw pointers
                    // to asr/controller, which are stack objects in main();
                    // if it outlives them at shutdown that is use-after-free —
                    // confirm lifetime or join these threads before exit.
                    std::thread([asr, controller, buffer_copy, &monitor](){
                        // if (response_in_progress) return; // PARALLEL FLOW: Allow processing
                        response_in_progress = true;
                        asr->transcribe(buffer_copy, [&](const std::string& text){
                            double asr_ms = monitor.stopTimer("ASR");
                            if (text.empty()) return;
                            // Drop Whisper non-speech artifacts.
                            if (text == "[BLANK_AUDIO]" || text == "[Silence]" || text.find("(Video Ad)") != std::string::npos) return;
                            std::cout << "User: " << text << " (ASR: " << asr_ms << "ms)" << std::endl;
                            controller->onUserSpeech(text, controller->agentSpeaking, asr_ms);
                        });
                        {
                            // Discard audio queued during transcription —
                            // presumably agent self-echo; confirm intent.
                            std::lock_guard<std::mutex> lock(queue_mutex);
                            while(!audio_queue.empty()) audio_queue.pop();
                        }
                        response_in_progress = false;
                    }).detach();
                    is_speaking = false;
                    audio_buffer.clear();
                    silence_frames = 0;
                }
            }
        }
    }
}
// Log sink installed via llama_log_set(): discards all llama.cpp/GGML output
// so the console stays clean for dialogue text. Parameters are intentionally
// unnamed since none are used.
void llama_log_callback(ggml_log_level /*level*/, const char* /*text*/, void* /*user_data*/) {
    // Deliberately empty — suppress all library logging.
}
// Entry point: wires up audio capture, VAD, ASR, dual LLMs, TTS and the
// dialogue controller, starts the processing worker, then serves a small CLI
// (text/file injection) until "quit" or EOF, and finally shuts the worker
// down cleanly.
// Returns 0 on normal exit.
int main(int argc, char** argv) {
    (void)argc; (void)argv; // unused
    // Set Console to UTF-8 to handle Emojis
    SetConsoleOutputCP(CP_UTF8);
    // Disable Llama/GGML verbose logging
    llama_log_set(llama_log_callback, nullptr);
    std::cout << "Starting Voice Agent (Press Ctrl+C to stop)..." << std::endl;
    VAD vad(L"models/silero_vad.onnx");
    MicrophoneStream mic(16000, 512); // 512-sample chunks @ 16 kHz (~32 ms)
    PersonaState persona;
    std::string modelPath = "models/qwen2.5-3b-instruct-q4_k_m.gguf";
    LLMStream llm(modelPath);
    std::cout << "[Init] Loading secondary Monitor LLM for parallel processing..." << std::endl;
    LLMStream monitorLLM(modelPath); // Second instance for Full Duplex Listening
    WhisperASR asr("models/ggml-medium.en-q5_0.bin");
    TTSEngine tts;
    DialogueController controller(&llm, &monitorLLM, &persona, &tts);
    std::thread worker(processing_thread, &vad, &asr, &controller);
    // Microphone callback: hand each chunk to the worker via the shared queue.
    mic.start_stream([&](const std::vector<int16_t>& audio_chunk){
        {
            std::lock_guard<std::mutex> lock(queue_mutex);
            audio_queue.push(audio_chunk);
        }
        queue_cv.notify_one();
    });
    std::cout << "\n[System] Microphone is LIVE. You can speak now." << std::endl;
    std::cout << "[System] Or use the CLI for testing:" << std::endl;
    std::cout << "  - Type 'text: hello' to simulate speech input." << std::endl;
    std::cout << "  - Type 'file: test_audio/file.wav' to run ASR on a file." << std::endl;
    std::cout << "  - Type 'quit' to exit.\n" << std::endl;
    std::string input;
    while (running && std::getline(std::cin, input)) {
        if (input == "quit") break;
        // rfind(prefix, 0) == 0 is a prefix check without building a temporary.
        if (input.rfind("text:", 0) == 0) {
            test_mode_active = true; // mute mic while the injected turn runs
            std::string text = input.substr(5);
            // Trim surrounding spaces (consistent with the "file:" branch).
            text.erase(0, text.find_first_not_of(" "));
            text.erase(text.find_last_not_of(" ") + 1);
            std::cout << "[Test] Injected text: " << text << std::endl;
            auto& monitor = PerfMonitor::getInstance();
            monitor.startTimer("E2E"); // Start E2E for text injection
            controller.onUserSpeech(text, false, 0.0);
            // BUGFIX: previously test_mode_active was never cleared, so the
            // microphone stayed permanently muted after the first CLI command.
            // NOTE(review): assumes onUserSpeech has dispatched the turn by the
            // time it returns — confirm against DialogueController.
            test_mode_active = false;
        }
        else if (input.rfind("file:", 0) == 0) {
            test_mode_active = true; // mute mic while the file is processed
            std::string path = input.substr(5);
            path.erase(0, path.find_first_not_of(" "));
            path.erase(path.find_last_not_of(" ") + 1);
            auto& monitor = PerfMonitor::getInstance();
            monitor.startTimer("E2E"); // E2E starts when we begin processing the 'audio'
            monitor.startTimer("ASR"); // Track file transcription time
            std::cout << "[Test] Processing WAV: " << path << std::endl;
            asr.transcribe_wav(path, [&](const std::string& text){
                double asr_ms = monitor.stopTimer("ASR");
                std::cout << "[Test ASR] File Output: " << text << " (took " << asr_ms << "ms)" << std::endl;
                controller.onUserSpeech(text, false, asr_ms);
            });
            // BUGFIX: re-enable the microphone (see "text:" branch).
            test_mode_active = false;
        }
    }
    // Orderly shutdown: stop the worker, wake it if it is waiting, join it.
    running = false;
    queue_cv.notify_all();
    if (worker.joinable()) worker.join();
    return 0;
}