Skip to content

Commit e5a04e3

Browse files
Brandon Salzberg and claude committed
Add voice chat with local Whisper transcription
- `rhombus voice` starts voice-powered MIND chat session
- Press Enter to record, Enter again to stop
- Local speech-to-text via whisper.cpp (Metal GPU accelerated)
- Auto-downloads Whisper model on first use (~/.rhombus/models/)
- --model flag: tiny, base, small (default), medium, large
- macOS text-to-speech for responses via `say`
- Falls back to text input for typed messages
- Requires sox and whisper-cpp (brew install sox whisper-cpp)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent eff2a79 commit e5a04e3

1 file changed

Lines changed: 326 additions & 0 deletions

File tree

cmd/voice.go

Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
package cmd
2+
3+
import (
4+
"bufio"
5+
"fmt"
6+
"io"
7+
"net/http"
8+
"os"
9+
"os/exec"
10+
"path/filepath"
11+
"runtime"
12+
"strings"
13+
"time"
14+
15+
"github.com/RhombusSystems/rhombus-cli/internal/config"
16+
"github.com/spf13/cobra"
17+
)
18+
19+
const (
	// defaultWhisperModel is the --model value used when the flag is not given.
	defaultWhisperModel = "small"
	// whisperModelBaseURL is the URL prefix a model file name is appended to
	// (separated by "/") to form its download URL.
	whisperModelBaseURL = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main"
)

// whisperModels maps each accepted --model value to the ggml file name that
// is downloaded from whisperModelBaseURL and cached on disk.
var whisperModels = map[string]string{
	"tiny":   "ggml-tiny.bin",
	"base":   "ggml-base.bin",
	"small":  "ggml-small.bin",
	"medium": "ggml-medium.bin",
	// whisper.cpp publishes the large model only under versioned file names
	// (large-v1, large-v2, large-v3); an unversioned "ggml-large.bin" is not
	// part of the published model list, so "large" resolves to the v3 file.
	"large": "ggml-large-v3.bin",
}
31+
32+
func init() {
33+
voiceCmd := &cobra.Command{
34+
Use: "voice",
35+
Short: "Voice-powered chat with Rhombus MIND",
36+
Long: "Start a voice chat session. Press Enter to start recording, Enter again to stop. Your speech is transcribed locally and sent to Rhombus MIND.",
37+
RunE: runVoice,
38+
}
39+
voiceCmd.Flags().String("model", defaultWhisperModel, "Whisper model: tiny, base, small, medium, large")
40+
rootCmd.AddCommand(voiceCmd)
41+
}
42+
43+
func runVoice(cmd *cobra.Command, args []string) error {
44+
cfg := config.LoadFromCmd(cmd)
45+
modelName, _ := cmd.Flags().GetString("model")
46+
chatProfile = cfg.Profile
47+
48+
// Verify dependencies
49+
if err := checkVoiceDeps(); err != nil {
50+
return err
51+
}
52+
53+
// Ensure model is downloaded
54+
modelPath, err := ensureWhisperModel(modelName)
55+
if err != nil {
56+
return err
57+
}
58+
59+
// Ensure whisper-cpp is available
60+
whisperBin, err := findWhisperBinary()
61+
if err != nil {
62+
return err
63+
}
64+
65+
fmt.Println("Rhombus MIND Voice Chat")
66+
fmt.Println("Press Enter to start recording, Enter again to stop.")
67+
fmt.Println("Type 'exit' to quit.")
68+
fmt.Println()
69+
70+
contextID := fmt.Sprintf("cli-voice-%d", time.Now().UnixMilli())
71+
72+
// Send tool definitions for the session
73+
fmt.Print("\033[2mInitializing...\033[0m")
74+
if err := sendToolDefinitions(cfg, contextID); err != nil {
75+
fmt.Printf("\r\033[K")
76+
fmt.Fprintf(os.Stderr, "Warning: failed to register tools: %v\n", err)
77+
} else {
78+
fmt.Printf("\r\033[K")
79+
}
80+
81+
reader := bufio.NewReader(os.Stdin)
82+
83+
for {
84+
fmt.Print("\033[1;35m[Press Enter to speak, or type a message]\033[0m ")
85+
input, _ := reader.ReadString('\n')
86+
input = strings.TrimSpace(input)
87+
88+
if input == "exit" || input == "quit" {
89+
break
90+
}
91+
92+
var query string
93+
94+
if input == "" {
95+
// Record and transcribe
96+
audioFile, err := recordAudio()
97+
if err != nil {
98+
fmt.Fprintf(os.Stderr, "Recording error: %v\n\n", err)
99+
continue
100+
}
101+
defer os.Remove(audioFile)
102+
103+
fmt.Print("\033[2mTranscribing...\033[0m")
104+
transcript, err := transcribeAudio(whisperBin, modelPath, audioFile)
105+
fmt.Print("\r\033[K")
106+
107+
if err != nil {
108+
fmt.Fprintf(os.Stderr, "Transcription error: %v\n\n", err)
109+
continue
110+
}
111+
112+
transcript = strings.TrimSpace(transcript)
113+
if transcript == "" {
114+
fmt.Println("(no speech detected)")
115+
continue
116+
}
117+
118+
fmt.Printf("\033[1;34myou>\033[0m %s\n", transcript)
119+
query = transcript
120+
} else {
121+
// Text input fallback
122+
query = input
123+
}
124+
125+
response, err := submitAndWait(cfg, contextID, query)
126+
if err != nil {
127+
fmt.Fprintf(os.Stderr, "\033[1;32mmind>\033[0m Error: %v\n\n", err)
128+
continue
129+
}
130+
131+
fmt.Printf("\033[1;32mmind>\033[0m %s\n\n", cleanResponse(response))
132+
133+
// Optional: speak the response on macOS
134+
if runtime.GOOS == "darwin" {
135+
go speakText(cleanResponse(response))
136+
}
137+
}
138+
139+
return nil
140+
}
141+
142+
// checkVoiceDeps reports whether an audio recorder is installed. It succeeds
// when either "rec" or "sox" can be found on PATH and otherwise returns an
// error that includes the install command.
func checkVoiceDeps() error {
	for _, tool := range []string{"rec", "sox"} {
		if _, err := exec.LookPath(tool); err == nil {
			return nil
		}
	}
	return fmt.Errorf("sox is required for audio recording. Install with: brew install sox")
}
151+
152+
func findWhisperBinary() (string, error) {
153+
// Check common binary names
154+
for _, name := range []string{"whisper-cli", "whisper-cpp", "whisper"} {
155+
if path, err := exec.LookPath(name); err == nil {
156+
return path, nil
157+
}
158+
}
159+
// Check our local bin
160+
for _, name := range []string{"whisper-cli", "whisper-cpp"} {
161+
localBin := filepath.Join(rhombusDir(), "bin", name)
162+
if _, err := os.Stat(localBin); err == nil {
163+
return localBin, nil
164+
}
165+
}
166+
167+
return "", fmt.Errorf("whisper-cpp not found. Install with: brew install whisper-cpp")
168+
}
169+
170+
func ensureWhisperModel(modelName string) (string, error) {
171+
filename, ok := whisperModels[modelName]
172+
if !ok {
173+
return "", fmt.Errorf("unknown model: %s (available: tiny, base, small, medium, large)", modelName)
174+
}
175+
176+
modelsDir := filepath.Join(rhombusDir(), "models")
177+
modelPath := filepath.Join(modelsDir, filename)
178+
179+
if _, err := os.Stat(modelPath); err == nil {
180+
return modelPath, nil
181+
}
182+
183+
// Download the model
184+
if err := os.MkdirAll(modelsDir, 0755); err != nil {
185+
return "", fmt.Errorf("creating models dir: %w", err)
186+
}
187+
188+
url := whisperModelBaseURL + "/" + filename
189+
fmt.Printf("Downloading whisper %s model (%s)...\n", modelName, filename)
190+
191+
resp, err := http.Get(url)
192+
if err != nil {
193+
return "", fmt.Errorf("downloading model: %w", err)
194+
}
195+
defer resp.Body.Close()
196+
197+
if resp.StatusCode != 200 {
198+
return "", fmt.Errorf("download failed: HTTP %d", resp.StatusCode)
199+
}
200+
201+
tmpPath := modelPath + ".tmp"
202+
f, err := os.Create(tmpPath)
203+
if err != nil {
204+
return "", fmt.Errorf("creating model file: %w", err)
205+
}
206+
207+
size := resp.ContentLength
208+
written := int64(0)
209+
buf := make([]byte, 32*1024)
210+
lastPrint := time.Now()
211+
212+
for {
213+
n, readErr := resp.Body.Read(buf)
214+
if n > 0 {
215+
if _, writeErr := f.Write(buf[:n]); writeErr != nil {
216+
f.Close()
217+
os.Remove(tmpPath)
218+
return "", fmt.Errorf("writing model: %w", writeErr)
219+
}
220+
written += int64(n)
221+
222+
if time.Since(lastPrint) > 500*time.Millisecond {
223+
if size > 0 {
224+
pct := float64(written) / float64(size) * 100
225+
fmt.Printf("\r %.0f%% (%d / %d MB)", pct, written/1024/1024, size/1024/1024)
226+
}
227+
lastPrint = time.Now()
228+
}
229+
}
230+
if readErr == io.EOF {
231+
break
232+
}
233+
if readErr != nil {
234+
f.Close()
235+
os.Remove(tmpPath)
236+
return "", fmt.Errorf("downloading model: %w", readErr)
237+
}
238+
}
239+
f.Close()
240+
fmt.Println("\r Download complete. ")
241+
242+
if err := os.Rename(tmpPath, modelPath); err != nil {
243+
return "", fmt.Errorf("finalizing model: %w", err)
244+
}
245+
246+
return modelPath, nil
247+
}
248+
249+
// recordAudio records from the default input device into a temporary WAV file
// (16 kHz, mono, 16-bit) until a line is read from stdin, then stops the
// recorder and returns the file's path. The caller is responsible for
// deleting the returned file.
//
// The recorder is sox's "rec" when it is on PATH and "sox -d" otherwise, so
// an installation that exposes only the "sox" binary still works. An error is
// returned when the recorder cannot be started or when the resulting file is
// missing or smaller than 1000 bytes; in the latter case the file is removed.
//
// NOTE(review): this reads stdin through its own bufio.Reader. If a caller
// also holds a buffered reader on stdin, input that reader has already
// buffered is not visible here — fine for an interactive terminal, worth
// confirming for piped input.
func recordAudio() (string, error) {
	tmpFile := filepath.Join(os.TempDir(), fmt.Sprintf("rhombus-voice-%d.wav", time.Now().UnixMilli()))

	// Output format options followed by the output file.
	outputArgs := []string{
		"-r", "16000", // 16kHz sample rate
		"-c", "1", // mono
		"-b", "16", // 16-bit
		tmpFile,
	}

	// "rec" is sox invoked with the default audio device as its input, which
	// is what "sox -d" spells out explicitly.
	recorder, recorderArgs := "rec", outputArgs
	if _, err := exec.LookPath("rec"); err != nil {
		recorder = "sox"
		recorderArgs = append([]string{"-d"}, outputArgs...)
	}

	cmd := exec.Command(recorder, recorderArgs...)
	cmd.Stderr = os.Stderr

	fmt.Println("\033[1;31m● Recording...\033[0m Press Enter to stop.")

	if err := cmd.Start(); err != nil {
		return "", fmt.Errorf("starting recording: %w", err)
	}

	// Wait for Enter key. The read error is deliberately ignored: a closed
	// stdin should stop the recording just like Enter does.
	bufio.NewReader(os.Stdin).ReadString('\n')

	// Stop recording. Interrupt lets the recorder finish writing the file;
	// if the signal cannot be delivered, kill the process so Wait cannot
	// block on a recorder that is still running.
	if err := cmd.Process.Signal(os.Interrupt); err != nil {
		cmd.Process.Kill()
	}
	// The exit status of an interrupted recorder is not meaningful here; the
	// size check below decides whether the recording is usable.
	cmd.Wait()

	// Verify the file exists and has content
	info, err := os.Stat(tmpFile)
	if err != nil || info.Size() < 1000 {
		os.Remove(tmpFile)
		return "", fmt.Errorf("recording too short or failed")
	}

	return tmpFile, nil
}
286+
287+
// transcribeAudio runs the whisper binary on audioFile with the model at
// modelPath and returns the whitespace-trimmed transcript.
//
// The binary is invoked with --no-timestamps, --language en and --output-txt.
// If "<audioFile>.txt" exists afterwards, its contents are returned and the
// file is deleted. Otherwise the process's standard output is returned.
// Standard error is captured separately so diagnostic output can never end up
// in the transcript; it is included in the error when the process fails.
func transcribeAudio(whisperBin, modelPath, audioFile string) (string, error) {
	cmd := exec.Command(whisperBin,
		"-m", modelPath,
		"-f", audioFile,
		"--no-timestamps",
		"--language", "en",
		"--output-txt",
	)

	var stdout, stderr strings.Builder
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("whisper error: %w\n%s%s", err, stderr.String(), stdout.String())
	}

	// whisper-cpp outputs to stdout or to a .txt file
	// Try reading the .txt file first
	txtFile := audioFile + ".txt"
	if data, err := os.ReadFile(txtFile); err == nil {
		os.Remove(txtFile)
		return strings.TrimSpace(string(data)), nil
	}

	// Fallback: stdout only
	return strings.TrimSpace(stdout.String()), nil
}
312+
313+
// speakText reads text aloud with the macOS "say" command at a rate of 200
// and blocks until the command finishes. Text longer than 500 bytes is
// shortened to at most 500 bytes and suffixed with "..."; the cut is always
// placed at the start of a character so a multi-byte UTF-8 sequence is never
// split. Failures of the command are ignored: speech is best-effort.
func speakText(text string) {
	// Limit length to avoid speaking very long responses
	const maxSpokenBytes = 500

	if len(text) > maxSpokenBytes {
		// Ranging over a string yields the byte offset at which each
		// character starts, so cut ends up as the last such offset that does
		// not exceed the limit.
		cut := 0
		for i := range text {
			if i > maxSpokenBytes {
				break
			}
			cut = i
		}
		text = text[:cut] + "..."
	}

	cmd := exec.Command("say", "-r", "200", text)
	_ = cmd.Run() // best-effort; a missing or failing "say" is not an error
}
322+
323+
// rhombusDir returns the path of the ".rhombus" directory inside the current
// user's home directory. The home-directory lookup error is discarded, so the
// result is built from whatever value that lookup returned.
func rhombusDir() string {
	const dirName = ".rhombus"

	homeDir, _ := os.UserHomeDir()
	return filepath.Join(homeDir, dirName)
}

0 commit comments

Comments
 (0)