From 66e03310b405205ac788bf9e061f8909c3987a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=BCmer=20Cip?= Date: Sat, 14 Mar 2026 02:54:48 +0300 Subject: [PATCH 1/4] add mistral, inference ms for mistral and dynamic languages based on model --- log/log.go | 8 +++- main.go | 7 ++++ transcriber/batch_session.go | 1 + transcriber/deepgram.go | 11 +++++- transcriber/fake.go | 3 +- transcriber/groq.go | 15 +++++-- transcriber/openai.go | 13 +++++- transcriber/session.go | 1 + transcriber/transcriber.go | 56 +++++++++++++++++++++++++- tray/tray.go | 49 ++++------------------- tray/tray_darwin.go | 76 +++++++++++++++++++++++++++--------- tray/tray_other.go | 1 + 12 files changed, 171 insertions(+), 70 deletions(-) diff --git a/log/log.go b/log/log.go index 9c1d433..b6d165c 100644 --- a/log/log.go +++ b/log/log.go @@ -34,6 +34,7 @@ type Metrics struct { TotalTimeMs float64 MemoryAllocMB float64 MemoryPeakMB float64 + InferenceMs float64 } func ResolveDir(flagPath string) (string, error) { @@ -197,8 +198,11 @@ func TranscriptionMetrics(m Metrics, mode, format, provider string, connReused b Float64("ttfb_ms", m.TTFBMs). Float64("total_ms", m.TotalTimeMs). Float64("mem_mb", m.MemoryAllocMB). - Float64("peak_mb", m.MemoryPeakMB). 
- Msg("transcription") + Float64("peak_mb", m.MemoryPeakMB) + if m.InferenceMs > 0 { + ev = ev.Float64("inference_ms", m.InferenceMs) + } + ev.Msg("transcription") } func TranscriptionText(text string) { diff --git a/main.go b/main.go index f620299..89c7e3f 100644 --- a/main.go +++ b/main.go @@ -361,6 +361,7 @@ func run() { groqKey := os.Getenv("GROQ_API_KEY") openaiKey := os.Getenv("OPENAI_API_KEY") dgKey := os.Getenv("DEEPGRAM_API_KEY") + mistralKey := os.Getenv("MISTRAL_API_KEY") type providerDef struct { name, label, key string @@ -371,6 +372,7 @@ func run() { {"groq", "Groq", groqKey, transcriber.GroqModels, func() transcriber.Transcriber { return transcriber.NewGroq(groqKey) }}, {"openai", "OpenAI", openaiKey, transcriber.OpenAIModels, func() transcriber.Transcriber { return transcriber.NewOpenAI(openaiKey) }}, {"deepgram", "Deepgram", dgKey, transcriber.DeepgramModels, func() transcriber.Transcriber { return transcriber.NewDeepgram(dgKey) }}, + {"mistral", "Mistral", mistralKey, transcriber.MistralModels, func() transcriber.Transcriber { return transcriber.NewMistral(mistralKey) }}, } var trayModels []tray.Model @@ -389,6 +391,8 @@ func run() { } } + tray.SetLanguages(transcriber.AllLanguages()) + tray.SetModels(trayModels, func(provider, model string) { configMu.Lock() defer configMu.Unlock() @@ -413,6 +417,8 @@ func run() { if !streamEnabled { activeFormat = *formatFlag } + + tray.SetLanguages(newTr.SupportedLanguages()) }) tray.SetLanguage(*langFlag, func(code string) { @@ -711,6 +717,7 @@ func finishTranscription(sess transcriber.Session, clipCh chan string, updatesDo TotalTimeMs: bs.TotalTimeMs, MemoryAllocMB: result.MemoryAllocMB, MemoryPeakMB: result.MemoryPeakMB, + InferenceMs: bs.InferenceMs, } transcriptionsMu.Lock() transcriptionCount++ diff --git a/transcriber/batch_session.go b/transcriber/batch_session.go index 991dc50..0eca898 100644 --- a/transcriber/batch_session.go +++ b/transcriber/batch_session.go @@ -127,6 +127,7 @@ func (bs 
*batchSession) Close() (SessionResult, error) { ConnReused: netMetrics.ConnReused, TLSProtocol: netMetrics.TLSProtocol, Confidence: result.Confidence, + InferenceMs: result.InferenceMs, }, Metrics: bs.formatMetrics(rawSize, encodedSize, compressionPct, audioDuration, result), } diff --git a/transcriber/deepgram.go b/transcriber/deepgram.go index 203dc9a..b2c003b 100644 --- a/transcriber/deepgram.go +++ b/transcriber/deepgram.go @@ -26,7 +26,16 @@ func NewDeepgram(apiKey string) *Deepgram { } } -func (d *Deepgram) Name() string { return "deepgram" } +var deepgramLangs = langsFromCodes([]string{ + "bg", "ca", "zh", "cs", "da", "nl", "en", "et", "fi", "fr", + "de", "el", "hi", "hu", "id", "it", "ja", "ko", "lv", "lt", + "ms", "no", "pl", "pt", "ro", "ru", "sk", "es", "sv", "th", + "tr", "uk", "vi", +}) + +func (d *Deepgram) SupportedLanguages() []Language { return deepgramLangs } +func (d *Deepgram) Name() string { return "deepgram" } + var DeepgramModels = []ModelInfo{ {ID: "nova-3", Label: "Nova-3 (stream)", Stream: true}, } diff --git a/transcriber/fake.go b/transcriber/fake.go index 00e3bb5..6b72363 100644 --- a/transcriber/fake.go +++ b/transcriber/fake.go @@ -16,7 +16,8 @@ func NewFake(text string, err error) *FakeTranscriber { return &FakeTranscriber{text: text, err: err} } -func (f *FakeTranscriber) Name() string { return "fake" } +func (f *FakeTranscriber) Name() string { return "fake" } +func (f *FakeTranscriber) SupportedLanguages() []Language { return nil } func (f *FakeTranscriber) SetLanguage(lang string) { f.lang = lang } func (f *FakeTranscriber) GetLanguage() string { return f.lang } func (f *FakeTranscriber) Models() []ModelInfo { return nil } diff --git a/transcriber/groq.go b/transcriber/groq.go index bfcf896..5d6ab73 100644 --- a/transcriber/groq.go +++ b/transcriber/groq.go @@ -36,9 +36,18 @@ func NewGroq(apiKey string) *Groq { } } -func (g *Groq) Models() []ModelInfo { return GroqModels } - -func (g *Groq) Name() string { return "groq" } +var 
groqLangs = langsFromCodes([]string{ + "af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", + "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el", + "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko", + "lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl", + "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl", + "ta", "th", "tr", "uk", "ur", "vi", "cy", +}) + +func (g *Groq) SupportedLanguages() []Language { return groqLangs } +func (g *Groq) Models() []ModelInfo { return GroqModels } +func (g *Groq) Name() string { return "groq" } func (g *Groq) NewSession(_ context.Context, cfg SessionConfig) (Session, error) { go g.client.Warm() diff --git a/transcriber/openai.go b/transcriber/openai.go index 60d4e97..c1c5736 100644 --- a/transcriber/openai.go +++ b/transcriber/openai.go @@ -26,7 +26,18 @@ func NewOpenAI(apiKey string) *OpenAI { } } -func (o *OpenAI) Name() string { return "openai" } +var openaiLangs = langsFromCodes([]string{ + "af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", + "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el", + "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko", + "lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl", + "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl", + "ta", "th", "tr", "uk", "ur", "vi", "cy", +}) + +func (o *OpenAI) SupportedLanguages() []Language { return openaiLangs } +func (o *OpenAI) Name() string { return "openai" } + var OpenAIModels = []ModelInfo{ {ID: "gpt-4o-transcribe", Label: "GPT-4o Transcribe", Stream: false}, } diff --git a/transcriber/session.go b/transcriber/session.go index 75dc5e5..30202e1 100644 --- a/transcriber/session.go +++ b/transcriber/session.go @@ -28,6 +28,7 @@ type BatchStats struct { ConnReused bool TLSProtocol string Confidence float64 + InferenceMs float64 } type StreamStats struct { diff --git a/transcriber/transcriber.go b/transcriber/transcriber.go index 75f6c99..386dc50 100644 --- a/transcriber/transcriber.go 
+++ b/transcriber/transcriber.go @@ -5,6 +5,7 @@ import ( "fmt" "net/http" "os" + "sort" "sync" "time" ) @@ -54,6 +55,7 @@ type Result struct { NoSpeechProb float64 AvgLogProb float64 Duration float64 + InferenceMs float64 Segments []Segment } @@ -63,8 +65,14 @@ type ModelInfo struct { Stream bool } +type Language struct { + Code string // ISO-639-1 ("" = auto-detect) + Label string +} + type Transcriber interface { Name() string + SupportedLanguages() []Language SetLanguage(lang string) GetLanguage() string Models() []ModelInfo @@ -73,6 +81,38 @@ type Transcriber interface { NewSession(ctx context.Context, cfg SessionConfig) (Session, error) } +// langLabels maps ISO-639-1 codes to display names. +var langLabels = map[string]string{ + "af": "Afrikaans", "ar": "Arabic", "hy": "Armenian", "az": "Azerbaijani", + "be": "Belarusian", "bs": "Bosnian", "bg": "Bulgarian", "ca": "Catalan", + "zh": "Chinese", "hr": "Croatian", "cs": "Czech", "da": "Danish", + "nl": "Dutch", "en": "English", "et": "Estonian", "fi": "Finnish", + "fr": "French", "gl": "Galician", "de": "German", "el": "Greek", + "he": "Hebrew", "hi": "Hindi", "hu": "Hungarian", "is": "Icelandic", + "id": "Indonesian", "it": "Italian", "ja": "Japanese", "kn": "Kannada", + "kk": "Kazakh", "ko": "Korean", "lv": "Latvian", "lt": "Lithuanian", + "mk": "Macedonian", "ms": "Malay", "mr": "Marathi", "mi": "Maori", + "ne": "Nepali", "no": "Norwegian", "fa": "Persian", "pl": "Polish", + "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "sr": "Serbian", + "sk": "Slovak", "sl": "Slovenian", "es": "Spanish", "sw": "Swahili", + "sv": "Swedish", "tl": "Tagalog", "ta": "Tamil", "th": "Thai", + "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", "vi": "Vietnamese", + "cy": "Welsh", +} + +func langsFromCodes(codes []string) []Language { + langs := make([]Language, 0, len(codes)+1) + langs = append(langs, Language{"", "Auto-detect"}) + for _, c := range codes { + label := langLabels[c] + if label == "" { + label = c + } + 
langs = append(langs, Language{c, label}) + } + return langs +} + type baseTranscriber struct { client *TracedClient apiURL string @@ -93,6 +133,16 @@ func (b *baseTranscriber) GetLanguage() string { return b.lang } +// AllLanguages returns Auto-detect followed by every language in langLabels, ordered by display label (not ISO code) so menus read alphabetically. +func AllLanguages() []Language { + codes := make([]string, 0, len(langLabels)) + for c := range langLabels { + codes = append(codes, c) + } + sort.Slice(codes, func(i, j int) bool { return langLabels[codes[i]] < langLabels[codes[j]] }) + return langsFromCodes(codes) +} + func (b *baseTranscriber) Models() []ModelInfo { return nil } func (b *baseTranscriber) SetModel(m string) { b.model = m } func (b *baseTranscriber) GetModel() string { return b.model } @@ -109,6 +159,7 @@ func New() (Transcriber, error) { dgKey := os.Getenv("DEEPGRAM_API_KEY") openaiKey := os.Getenv("OPENAI_API_KEY") groqKey := os.Getenv("GROQ_API_KEY") + mistralKey := os.Getenv("MISTRAL_API_KEY") if dgKey != "" { return NewDeepgram(dgKey), nil @@ -119,6 +170,9 @@ func New() (Transcriber, error) { if groqKey != "" { return NewGroq(groqKey), nil } + if mistralKey != "" { + return NewMistral(mistralKey), nil + } - return nil, fmt.Errorf("set DEEPGRAM_API_KEY, OPENAI_API_KEY, or GROQ_API_KEY environment variable") + return nil, fmt.Errorf("set DEEPGRAM_API_KEY, OPENAI_API_KEY, GROQ_API_KEY, or MISTRAL_API_KEY environment variable") } diff --git a/tray/tray.go b/tray/tray.go index b30fe32..e4c9a2a 100644 --- a/tray/tray.go +++ b/tray/tray.go @@ -4,6 +4,7 @@ import ( "fmt" "sync" "time" + "zee/transcriber" ) type Model struct { @@ -47,48 +48,7 @@ var ( langCb func(string) ) -type Language struct { - Code string // ISO-639-1 - Label string -} - -// Languages supported by both Groq (Whisper) and Deepgram Nova-2. 
-var Languages = []Language{ - {"", "Auto-detect"}, - {"bg", "Bulgarian"}, - {"ca", "Catalan"}, - {"zh", "Chinese"}, - {"cs", "Czech"}, - {"da", "Danish"}, - {"nl", "Dutch"}, - {"en", "English"}, - {"et", "Estonian"}, - {"fi", "Finnish"}, - {"fr", "French"}, - {"de", "German"}, - {"el", "Greek"}, - {"hi", "Hindi"}, - {"hu", "Hungarian"}, - {"id", "Indonesian"}, - {"it", "Italian"}, - {"ja", "Japanese"}, - {"ko", "Korean"}, - {"lv", "Latvian"}, - {"lt", "Lithuanian"}, - {"ms", "Malay"}, - {"no", "Norwegian"}, - {"pl", "Polish"}, - {"pt", "Portuguese"}, - {"ro", "Romanian"}, - {"ru", "Russian"}, - {"sk", "Slovak"}, - {"es", "Spanish"}, - {"sv", "Swedish"}, - {"th", "Thai"}, - {"tr", "Turkish"}, - {"uk", "Ukrainian"}, - {"vi", "Vietnamese"}, -} +var languages []transcriber.Language // set via SetLanguages func OnCopyLast(fn func()) { copyLastFn = fn } func OnRecord(start, stop func()) { recordFn = start; stopFn = stop } @@ -160,6 +120,11 @@ func SetLanguage(code string, onSwitch func(string)) { langCb = onSwitch } +func SetLanguages(langs []transcriber.Language) { + languages = langs + refreshLanguageMenu() +} + func SetBTCheck(fn func(string) bool) { isBTFn = fn } diff --git a/tray/tray_darwin.go b/tray/tray_darwin.go index 3851e40..8a5cb7a 100644 --- a/tray/tray_darwin.go +++ b/tray/tray_darwin.go @@ -21,8 +21,11 @@ var ( mAutoPaste *systray.MenuItem mLogin *systray.MenuItem mBackend *systray.MenuItem - mLanguage *systray.MenuItem - langItems []*systray.MenuItem + mLanguage *systray.MenuItem + langEntries []struct { + item *systray.MenuItem + code string + } mUpdate *systray.MenuItem modelItems []*systray.MenuItem @@ -279,23 +282,8 @@ func onReady() { modelMu.Unlock() mLanguage = mSettings.AddSubMenuItem("Language", "Select transcription language") - langItems = make([]*systray.MenuItem, 0, len(Languages)) - for i, lang := range Languages { - idx := i - item := mLanguage.AddSubMenuItemCheckbox(lang.Label, lang.Label, lang.Code == langCode) - item.Click(func() { - 
for j, it := range langItems { - if j == idx { - it.Check() - } else { - it.Uncheck() - } - } - if langCb != nil { - langCb(Languages[idx].Code) - } - }) - langItems = append(langItems, item) + for _, lang := range languages { + addLangEntry(lang.Code, lang.Label) } systray.AddSeparator() @@ -329,6 +317,56 @@ func addUpdateMenuItem(version string) { }) } +func addLangEntry(code, label string) { + item := mLanguage.AddSubMenuItemCheckbox(label, label, code == langCode) + item.Click(func() { + for _, e := range langEntries { + e.item.Uncheck() + } + item.Check() + langCode = code // record the user's choice; refreshLanguageMenu decides what to re-check or reset from langCode + if langCb != nil { + langCb(code) + } + }) + langEntries = append(langEntries, struct { + item *systray.MenuItem + code string + }{item, code}) +} + +func refreshLanguageMenu() { + if mLanguage == nil { + return + } + want := make(map[string]bool, len(languages)) + for _, l := range languages { + want[l.Code] = true + } + langValid := false + for _, e := range langEntries { + if e.code == "" || want[e.code] { + e.item.Show() + if e.code == langCode { + langValid = true + e.item.Check() + } + } else { + e.item.Hide() + e.item.Uncheck() + } + } + if !langValid { + langCode = "" + if len(langEntries) > 0 { + langEntries[0].item.Check() + } + if langCb != nil { + langCb("") // NOTE(review): SetLanguages is called from the model-switch callback while it holds its mutex; confirm langCb does not lock the same mutex + } + } +} + func disableBackend() { if mBackend != nil { mBackend.Disable() diff --git a/tray/tray_other.go b/tray/tray_other.go index 618055d..a3add92 100644 --- a/tray/tray_other.go +++ b/tray/tray_other.go @@ -4,6 +4,7 @@ package tray func Init() <-chan struct{} { return make(chan struct{}) } func RefreshDevices(names []string, selected string) {} +func refreshLanguageMenu() {} func updateRecordingIcon(bool) {} func updateWarningIcon(bool) {} func updateTooltip(string) {} From a2a8e93d56e58af47e102fc4da3d96c80737b242 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=BCmer=20Cip?= Date: Mon, 16 Mar 2026 11:59:15 +0300 Subject: [PATCH 2/4] add mistral.go --- transcriber/mistral.go | 
106 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 transcriber/mistral.go diff --git a/transcriber/mistral.go b/transcriber/mistral.go new file mode 100644 index 0000000..63407f4 --- /dev/null +++ b/transcriber/mistral.go @@ -0,0 +1,106 @@ +package transcriber + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "mime/multipart" + "net/http" + "strconv" +) + +var MistralModels = []ModelInfo{ + {ID: "voxtral-mini-latest", Label: "Voxtral Mini", Stream: false}, +} + +type Mistral struct { + baseTranscriber + apiKey string +} + +func NewMistral(apiKey string) *Mistral { + apiURL := "https://api.mistral.ai/v1/audio/transcriptions" + return &Mistral{ + baseTranscriber: baseTranscriber{ + client: NewTracedClient(apiURL), + apiURL: apiURL, + model: "voxtral-mini-latest", + }, + apiKey: apiKey, + } +} + +var mistralLangs = langsFromCodes([]string{ + "ar", "zh", "nl", "en", "fr", "de", "hi", "it", "ja", "ko", + "pt", "ru", "es", +}) + +func (m *Mistral) SupportedLanguages() []Language { return mistralLangs } +func (m *Mistral) Name() string { return "mistral" } +func (m *Mistral) Models() []ModelInfo { return MistralModels } + +func (m *Mistral) NewSession(_ context.Context, cfg SessionConfig) (Session, error) { + go m.client.Warm() + if cfg.Stream { + return nil, fmt.Errorf("mistral does not support streaming transcription") + } + return newBatchSession(cfg, m.transcribe) +} + +func (m *Mistral) transcribe(audioData []byte, format, lang string) (*Result, error) { + var body bytes.Buffer + writer := multipart.NewWriter(&body) + + part, err := writer.CreateFormFile("file", "audio."+format) + if err != nil { + return nil, err + } + if _, err := part.Write(audioData); err != nil { + return nil, err + } + + writer.WriteField("model", m.GetModel()) + if lang != "" { + writer.WriteField("language", lang) + } + writer.Close() + + req, err := http.NewRequest("POST", m.apiURL, &body) + if err != nil { + return nil, err + } + + 
req.Header.Set("Authorization", "Bearer "+m.apiKey) + req.Header.Set("Content-Type", writer.FormDataContentType()) + + resp, err := m.client.Do(req) + if err != nil { + return nil, err + } + + if resp.StatusCode != 200 { + return nil, fmt.Errorf("mistral API error %d: %s", resp.StatusCode, string(resp.Body)) + } + + var mResp struct { + Text string `json:"text"` + Language string `json:"language"` + Duration float64 `json:"duration"` + } + if err := json.Unmarshal(resp.Body, &mResp); err != nil { + return nil, fmt.Errorf("mistral response parse error: %w", err) + } + + remaining := firstNonEmpty(resp.Header, "x-ratelimit-remaining-req-minute") + limit := firstNonEmpty(resp.Header, "x-ratelimit-limit-req-minute") + inferenceMs, _ := strconv.ParseFloat(firstNonEmpty(resp.Header, "x-envoy-upstream-service-time"), 64) + + return &Result{ + Text: mResp.Text, + Metrics: resp.Metrics, + RateLimit: remaining + "/" + limit, + Duration: mResp.Duration, + InferenceMs: inferenceMs, + }, nil +} From 983a194a8f1a191ee61167b36893d55b157290b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=BCmer=20Cip?= Date: Mon, 16 Mar 2026 12:54:05 +0300 Subject: [PATCH 3/4] add launchctl setenv to readme --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 46d741c..ca599fd 100644 --- a/README.md +++ b/README.md @@ -84,14 +84,23 @@ make app # macOS DMG ## Usage +Set at least one API key, then run zee: + ```bash export GROQ_API_KEY=your_key # batch mode (Groq Whisper) export OPENAI_API_KEY=your_key # batch mode (OpenAI Whisper) export DEEPGRAM_API_KEY=your_key # streaming mode (Deepgram) +export MISTRAL_API_KEY=your_key # batch mode (Mistral Voxtral) zee # starts in menu bar, hold Ctrl+Shift+Space to record zee -stream # words appear as you speak ``` +> **Note:** `export` only works in the current terminal session. 
To make API keys available to `Zee.app` when launched from Spotlight or Applications, use `launchctl`: +> ```bash +> launchctl setenv GROQ_API_KEY your_key +> ``` +> Add this to your `~/.zshrc` so it is re-applied whenever you open a terminal; `launchctl setenv` values do not survive a reboot, so open a terminal once before launching `Zee.app`. + zee runs as a system tray app in the menu bar. Hold `Ctrl+Shift+Space` to record, release to transcribe. Result auto-pastes into the focused window. Use the tray menu to switch microphones, providers, and languages — or use `-setup` for initial device selection. From fc210893204947756f346664437fcb5c91bf7d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=BCmer=20Cip?= Date: Mon, 16 Mar 2026 13:18:56 +0300 Subject: [PATCH 4/4] refactor langs to be per-model --- transcriber/deepgram.go | 10 +++++----- transcriber/groq.go | 24 ++++++++++++------------ transcriber/mistral.go | 14 +++++++------- transcriber/openai.go | 10 +++++----- transcriber/transcriber.go | 18 ++++++++++++++---- 5 files changed, 43 insertions(+), 33 deletions(-) diff --git a/transcriber/deepgram.go b/transcriber/deepgram.go index b2c003b..c8b76ef 100644 --- a/transcriber/deepgram.go +++ b/transcriber/deepgram.go @@ -26,20 +26,20 @@ func NewDeepgram(apiKey string) *Deepgram { } } -var deepgramLangs = langsFromCodes([]string{ +var nova3Langs = langsFromCodes([]string{ "bg", "ca", "zh", "cs", "da", "nl", "en", "et", "fi", "fr", "de", "el", "hi", "hu", "id", "it", "ja", "ko", "lv", "lt", "ms", "no", "pl", "pt", "ro", "ru", "sk", "es", "sv", "th", "tr", "uk", "vi", }) -func (d *Deepgram) SupportedLanguages() []Language { return deepgramLangs } -func (d *Deepgram) Name() string { return "deepgram" } - var DeepgramModels = []ModelInfo{ - {ID: "nova-3", Label: "Nova-3 (stream)", Stream: true}, + {ID: "nova-3", Label: "Nova-3 (stream)", Stream: true, Languages: nova3Langs}, } +func (d *Deepgram) SupportedLanguages() []Language { return modelLanguages(DeepgramModels, d.GetModel()) } +func (d *Deepgram) Name() string { return "deepgram" } + func (d *Deepgram) Models() []ModelInfo { return 
DeepgramModels } func (d *Deepgram) NewSession(ctx context.Context, cfg SessionConfig) (Session, error) { diff --git a/transcriber/groq.go b/transcriber/groq.go index 5d6ab73..25bec4c 100644 --- a/transcriber/groq.go +++ b/transcriber/groq.go @@ -14,9 +14,18 @@ const ( ModelWhisperV3 = "whisper-large-v3" ) +var whisperLangs = langsFromCodes([]string{ + "af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", + "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el", + "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko", + "lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl", + "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl", + "ta", "th", "tr", "uk", "ur", "vi", "cy", +}) + var GroqModels = []ModelInfo{ - {ID: ModelWhisperV3Turbo, Label: "Whisper V3 Turbo", Stream: false}, - {ID: ModelWhisperV3, Label: "Whisper V3", Stream: false}, + {ID: ModelWhisperV3Turbo, Label: "Whisper V3 Turbo", Stream: false, Languages: whisperLangs}, + {ID: ModelWhisperV3, Label: "Whisper V3", Stream: false, Languages: whisperLangs}, } type Groq struct { @@ -36,16 +45,7 @@ func NewGroq(apiKey string) *Groq { } } -var groqLangs = langsFromCodes([]string{ - "af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", - "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el", - "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko", - "lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl", - "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl", - "ta", "th", "tr", "uk", "ur", "vi", "cy", -}) - -func (g *Groq) SupportedLanguages() []Language { return groqLangs } +func (g *Groq) SupportedLanguages() []Language { return modelLanguages(GroqModels, g.GetModel()) } func (g *Groq) Models() []ModelInfo { return GroqModels } func (g *Groq) Name() string { return "groq" } diff --git a/transcriber/mistral.go b/transcriber/mistral.go index 63407f4..d54ad08 100644 --- a/transcriber/mistral.go +++ b/transcriber/mistral.go @@ -10,8 +10,13 @@ import ( "strconv" ) 
+var voxtralLangs = langsFromCodes([]string{ + "ar", "zh", "nl", "en", "fr", "de", "hi", "it", "ja", "ko", + "pt", "ru", "es", +}) + var MistralModels = []ModelInfo{ - {ID: "voxtral-mini-latest", Label: "Voxtral Mini", Stream: false}, + {ID: "voxtral-mini-latest", Label: "Voxtral Mini", Stream: false, Languages: voxtralLangs}, } type Mistral struct { @@ -31,12 +36,7 @@ func NewMistral(apiKey string) *Mistral { } } -var mistralLangs = langsFromCodes([]string{ - "ar", "zh", "nl", "en", "fr", "de", "hi", "it", "ja", "ko", - "pt", "ru", "es", -}) - -func (m *Mistral) SupportedLanguages() []Language { return mistralLangs } +func (m *Mistral) SupportedLanguages() []Language { return modelLanguages(MistralModels, m.GetModel()) } func (m *Mistral) Name() string { return "mistral" } func (m *Mistral) Models() []ModelInfo { return MistralModels } diff --git a/transcriber/openai.go b/transcriber/openai.go index c1c5736..639559c 100644 --- a/transcriber/openai.go +++ b/transcriber/openai.go @@ -26,7 +26,10 @@ func NewOpenAI(apiKey string) *OpenAI { } } -var openaiLangs = langsFromCodes([]string{ +func (o *OpenAI) SupportedLanguages() []Language { return modelLanguages(OpenAIModels, o.GetModel()) } +func (o *OpenAI) Name() string { return "openai" } + +var gpt4oTranscribeLangs = langsFromCodes([]string{ "af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el", "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko", @@ -35,11 +38,8 @@ var openaiLangs = langsFromCodes([]string{ "ta", "th", "tr", "uk", "ur", "vi", "cy", }) -func (o *OpenAI) SupportedLanguages() []Language { return openaiLangs } -func (o *OpenAI) Name() string { return "openai" } - var OpenAIModels = []ModelInfo{ - {ID: "gpt-4o-transcribe", Label: "GPT-4o Transcribe", Stream: false}, + {ID: "gpt-4o-transcribe", Label: "GPT-4o Transcribe", Stream: false, Languages: gpt4oTranscribeLangs}, } func (o *OpenAI) Models() []ModelInfo { return OpenAIModels } 
diff --git a/transcriber/transcriber.go b/transcriber/transcriber.go index 386dc50..0221579 100644 --- a/transcriber/transcriber.go +++ b/transcriber/transcriber.go @@ -60,9 +60,10 @@ type Result struct { } type ModelInfo struct { - ID string - Label string - Stream bool + ID string + Label string + Stream bool + Languages []Language } type Language struct { @@ -143,10 +144,19 @@ func AllLanguages() []Language { return langsFromCodes(codes) } -func (b *baseTranscriber) Models() []ModelInfo { return nil } +func (b *baseTranscriber) Models() []ModelInfo { return nil } func (b *baseTranscriber) SetModel(m string) { b.model = m } func (b *baseTranscriber) GetModel() string { return b.model } +func modelLanguages(models []ModelInfo, current string) []Language { + for _, m := range models { + if m.ID == current { + return m.Languages + } + } + return nil +} + func New() (Transcriber, error) { if fakeText, ok := os.LookupEnv("ZEE_FAKE_TEXT"); ok { var fakeErr error