From e46b661ad105639591e114e01583f82d726d029a Mon Sep 17 00:00:00 2001 From: master5d Date: Mon, 15 Jun 2026 06:56:32 -0500 Subject: [PATCH 1/5] =?UTF-8?q?feat(tutor):=20language=20tutor=20v0=20?= =?UTF-8?q?=E2=80=94=20pronunciation/fluency=20scoring=20(backend)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Word-level alignment with Levenshtein tolerance + Cyrillic-aware normalization. Recovered from a Serena-MCP cross-write during a parallel Gemini delegation run; 5/5 unit tests green. Co-Authored-By: Claude Opus 4.8 --- src-tauri/src/commands/mod.rs | 1 + src-tauri/src/commands/tutor.rs | 5 + src-tauri/src/lib.rs | 2 + src-tauri/src/settings.rs | 7 ++ src-tauri/src/tutor/mod.rs | 158 ++++++++++++++++++++++++++++++++ 5 files changed, 173 insertions(+) create mode 100644 src-tauri/src/commands/tutor.rs create mode 100644 src-tauri/src/tutor/mod.rs diff --git a/src-tauri/src/commands/mod.rs b/src-tauri/src/commands/mod.rs index fa6ed90..7849f42 100644 --- a/src-tauri/src/commands/mod.rs +++ b/src-tauri/src/commands/mod.rs @@ -6,6 +6,7 @@ pub mod models; pub mod transcribe; pub mod transcription; pub mod tts; +pub mod tutor; use crate::settings::{get_settings, write_settings, AppSettings, LogLevel}; use crate::utils::cancel_current_operation; diff --git a/src-tauri/src/commands/tutor.rs b/src-tauri/src/commands/tutor.rs new file mode 100644 index 0000000..039c0a8 --- /dev/null +++ b/src-tauri/src/commands/tutor.rs @@ -0,0 +1,5 @@ +#[tauri::command] +#[specta::specta] +pub fn tutor_score(reference: String, spoken: String) -> crate::tutor::ScoreReport { + crate::tutor::score_pronunciation(&reference, &spoken) +} diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 100fd18..49ffeda 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -28,6 +28,7 @@ mod transcript_format; mod transcription_coordinator; mod translate; mod tts; +mod tutor; mod utils; mod voice_commands; @@ -541,6 +542,7 @@ pub fn 
run(cli_args: CliArgs) { commands::tts::tts_list_voices, commands::tts::tts_speak, commands::tts::tts_stop, + commands::tutor::tutor_score, commands::agent_bridge::agent_bridge_answer, commands::agent_bridge::agent_bridge_dismiss, commands::agent_bridge::agent_bridge_answers, diff --git a/src-tauri/src/settings.rs b/src-tauri/src/settings.rs index 6a43d3d..9a83c0c 100644 --- a/src-tauri/src/settings.rs +++ b/src-tauri/src/settings.rs @@ -489,6 +489,12 @@ pub struct AppSettings { pub spoken_lists_enabled: bool, #[serde(default)] pub dev_dictionary_enabled: bool, + #[serde(default = "default_tutor_enabled")] + pub tutor_enabled: bool, +} + +pub fn default_tutor_enabled() -> bool { + false } pub fn default_spoken_lists_enabled() -> bool { @@ -984,6 +990,7 @@ pub fn get_default_settings() -> AppSettings { self_correction_enabled: false, spoken_lists_enabled: default_spoken_lists_enabled(), dev_dictionary_enabled: false, + tutor_enabled: default_tutor_enabled(), } } diff --git a/src-tauri/src/tutor/mod.rs b/src-tauri/src/tutor/mod.rs new file mode 100644 index 0000000..2116d2c --- /dev/null +++ b/src-tauri/src/tutor/mod.rs @@ -0,0 +1,158 @@ +use serde::{Deserialize, Serialize}; +use specta::Type; +use strsim::levenshtein; +use regex::Regex; +use once_cell::sync::Lazy; + +static PUNCTUATION: Lazy = Lazy::new(|| Regex::new(r"[^\p{L}\s]").unwrap()); +static WHITESPACE: Lazy = Lazy::new(|| Regex::new(r"\s+").unwrap()); + +#[derive(Serialize, Deserialize, Debug, Clone, Type)] +pub struct WordScore { + pub reference: String, + pub spoken: Option, + pub matched: bool, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Type)] +pub struct ScoreReport { + pub overall: u8, // 0..=100 + pub words: Vec, // per reference word, aligned + pub reference_word_count: usize, + pub matched_word_count: usize, + pub note: String, // short human feedback, e.g. "Great!" 
/ "Watch: " +} + +fn normalize(text: &str) -> Vec<String> { + let text = text.to_lowercase(); + let text = PUNCTUATION.replace_all(&text, ""); + let text = WHITESPACE.replace_all(&text, " "); + text.trim() + .split_whitespace() + .map(|s| s.to_string()) + .collect() +} + +pub fn score_pronunciation(reference: &str, spoken: &str) -> ScoreReport { + let ref_words = normalize(reference); + let spoken_words = normalize(spoken); + + if ref_words.is_empty() { + return ScoreReport { + overall: 0, + words: Vec::new(), + reference_word_count: 0, + matched_word_count: 0, + note: "Empty reference phrase.".to_string(), + }; + } + + let mut word_scores = Vec::with_capacity(ref_words.len()); + let mut matched_count = 0; + let mut spoken_idx = 0; + + for ref_word in &ref_words { + let mut best_match: Option<(usize, bool)> = None; + + // Greedy search: take the first spoken word (from spoken_idx on) that matches this reference word + // We look ahead a bit to allow for some misrecognitions or skipped words + let lookahead = 3; + let end = (spoken_idx + lookahead).min(spoken_words.len()); + + for i in spoken_idx..end { + let spoken_word = &spoken_words[i]; + let is_match = if ref_word == spoken_word { + true + } else { + let dist = levenshtein(ref_word, spoken_word); + dist <= 1.max(ref_word.chars().count() / 4) // tolerance in chars, the unit levenshtein counts; len() is bytes and would double it for Cyrillic + }; + + if is_match { + best_match = Some((i, true)); + break; + } + } + + if let Some((idx, is_match)) = best_match { + word_scores.push(WordScore { + reference: ref_word.clone(), + spoken: Some(spoken_words[idx].clone()), + matched: is_match, + }); + matched_count += 1; + spoken_idx = idx + 1; + } else { + word_scores.push(WordScore { + reference: ref_word.clone(), + spoken: None, + matched: false, + }); + } + } + + let overall = (100 * matched_count / ref_words.len()) as u8; + + let unmatched_words: Vec<String> = word_scores + .iter() + .filter(|w| !w.matched) + .map(|w| w.reference.clone()) + .take(3) + .collect(); + + let note = if overall >= 90 { + "Great pronunciation!".to_string() + } else if overall >= 70 { +
format!("Good — review: {}", unmatched_words.join(", ")) + } else { + format!("Keep practicing: {}", unmatched_words.join(", ")) + }; + + ScoreReport { + overall, + words: word_scores, + reference_word_count: ref_words.len(), + matched_word_count: matched_count, + note, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_exact_match() { + let report = score_pronunciation("Hello world", "hello world"); + assert_eq!(report.overall, 100); + assert_eq!(report.matched_word_count, 2); + } + + #[test] + fn test_one_wrong() { + let report = score_pronunciation("one two three four", "one two skip four"); + assert_eq!(report.overall, 75); + assert_eq!(report.matched_word_count, 3); + } + + #[test] + fn test_empty_reference() { + let report = score_pronunciation("", "something"); + assert_eq!(report.overall, 0); + } + + #[test] + fn test_russian() { + let report = score_pronunciation("привет как дела", "привет как дела"); + assert_eq!(report.overall, 100); + } + + #[test] + fn test_levenshtein_tolerance() { + // "pronunciation" len 13, 13/4 = 3. 
+ // "pronunshation" dist 2 + let report = score_pronunciation("pronunciation", "pronunshation"); + assert_eq!(report.overall, 100); + assert!(report.words[0].matched); + } +} From 3682ba4d7d302235a6cffdeb4d4634f9a1479426 Mon Sep 17 00:00:00 2001 From: master5d Date: Mon, 15 Jun 2026 08:00:31 -0500 Subject: [PATCH 2/5] =?UTF-8?q?feat(tts):=20TTS=20v1=20=E2=80=94=20rate=20?= =?UTF-8?q?control,=20language-aware=20voice,=20Settings=20UI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - speak/synthesize gain a `rate` param (WinRT SpeakingRate, clamped 0.5–6.0) - pick_voice_for_text: Cyrillic≥30% → ru voice, else en (auto when no explicit voice); 3 unit tests - AppSettings: tts_enabled / tts_voice_id / tts_rate; agent-bridge sink now respects them (only speaks when enabled, uses chosen voice + rate) - new Voice settings card (toggle, voice picker, rate slider, test button), Sidebar entry, en/ru i18n - tts_speak command + bindings gain optional rate Verified: cargo check + cargo test --lib tts (3/3 voice tests) + tsc + eslint, all green. 
Co-Authored-By: Claude Opus 4.8 --- src-tauri/src/commands/tts.rs | 3 +- src-tauri/src/lib.rs | 15 +++- src-tauri/src/settings.rs | 17 ++++ src-tauri/src/tts/mod.rs | 71 ++++++++++++++- src-tauri/src/tts/windows.rs | 19 +++- src/bindings.ts | 6 +- src/components/Sidebar.tsx | 8 ++ src/components/settings/index.ts | 1 + src/components/settings/tts/TtsSettings.tsx | 96 +++++++++++++++++++++ src/i18n/locales/en/translation.json | 12 ++- src/i18n/locales/ru/translation.json | 12 ++- 11 files changed, 244 insertions(+), 16 deletions(-) create mode 100644 src/components/settings/tts/TtsSettings.tsx diff --git a/src-tauri/src/commands/tts.rs b/src-tauri/src/commands/tts.rs index 58392cd..d6cac4e 100644 --- a/src-tauri/src/commands/tts.rs +++ b/src-tauri/src/commands/tts.rs @@ -14,8 +14,9 @@ pub fn tts_speak( tts_manager: State>, text: String, voice_id: Option, + rate: Option, ) -> Result<(), String> { - tts_manager.speak(text, voice_id) + tts_manager.speak(text, voice_id, rate.unwrap_or(1.0)) } #[tauri::command] diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 100fd18..1ab901f 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -191,10 +191,17 @@ fn initialize_core_logic(app_handle: &AppHandle) { let sink: crate::agent_bridge::server::AskSink = Arc::new(move |ev| { use tauri::Emitter; if ev.speak { - if let Some(tts) = - evt_handle.try_state::>() - { - let _ = tts.speak(ev.question.clone(), None); + let s = crate::settings::get_settings(&evt_handle); + if s.tts_enabled { + if let Some(tts) = + evt_handle.try_state::>() + { + let _ = tts.speak( + ev.question.clone(), + s.tts_voice_id.clone(), + s.tts_rate, + ); + } } } crate::agent_bridge::window::show_panel(&evt_handle); diff --git a/src-tauri/src/settings.rs b/src-tauri/src/settings.rs index 6a43d3d..3fe1344 100644 --- a/src-tauri/src/settings.rs +++ b/src-tauri/src/settings.rs @@ -392,6 +392,12 @@ pub struct AppSettings { pub agent_bridge_enabled: bool, #[serde(default = 
"default_agent_bridge_port")] pub agent_bridge_port: u16, + #[serde(default = "default_tts_enabled")] + pub tts_enabled: bool, + #[serde(default)] + pub tts_voice_id: Option, + #[serde(default = "default_tts_rate")] + pub tts_rate: f32, #[serde(default)] pub custom_words: Vec, #[serde(default)] @@ -591,6 +597,14 @@ fn default_agent_bridge_port() -> u16 { 4123 } +fn default_tts_enabled() -> bool { + true +} + +fn default_tts_rate() -> f32 { + 1.0 +} + fn default_word_correction_threshold() -> f64 { 0.18 } @@ -933,6 +947,9 @@ pub fn get_default_settings() -> AppSettings { overlay_position: default_overlay_position(), agent_bridge_enabled: default_agent_bridge_enabled(), agent_bridge_port: default_agent_bridge_port(), + tts_enabled: default_tts_enabled(), + tts_voice_id: None, + tts_rate: default_tts_rate(), debug_mode: false, log_level: default_log_level(), custom_words: Vec::new(), diff --git a/src-tauri/src/tts/mod.rs b/src-tauri/src/tts/mod.rs index 2a6b7f6..ed88baa 100644 --- a/src-tauri/src/tts/mod.rs +++ b/src-tauri/src/tts/mod.rs @@ -16,7 +16,29 @@ pub struct VoiceInfo { pub trait TtsEngine: Send + Sync { fn list_voices(&self) -> Result, String>; - fn synthesize(&self, text: &str, voice_id: Option<&str>) -> Result, String>; + fn synthesize(&self, text: &str, voice_id: Option<&str>, rate: f32) -> Result, String>; +} + +/// Pick a voice whose language matches the text's script. If >=30% of the +/// alphabetic characters are Cyrillic, prefer a `ru*` voice; otherwise an +/// `en*` voice. Falls back to the first available voice. 
+pub fn pick_voice_for_text<'a>(text: &str, voices: &'a [VoiceInfo]) -> Option<&'a VoiceInfo> { + let alpha: Vec = text.chars().filter(|c| c.is_alphabetic()).collect(); + if alpha.is_empty() { + return voices.first(); + } + + let cyrillic = alpha + .iter() + .filter(|&&c| ('\u{0400}'..='\u{04FF}').contains(&c)) + .count(); + let is_russian = (cyrillic as f32 / alpha.len() as f32) >= 0.3; + + let prefix = if is_russian { "ru" } else { "en" }; + voices + .iter() + .find(|v| v.language.to_lowercase().starts_with(prefix)) + .or_else(|| voices.first()) } pub struct TtsManager { @@ -49,7 +71,7 @@ impl TtsManager { .list_voices() } - pub fn speak(&self, text: String, voice_id: Option) -> Result<(), String> { + pub fn speak(&self, text: String, voice_id: Option, rate: f32) -> Result<(), String> { let engine = self .engine .as_ref() @@ -58,7 +80,7 @@ impl TtsManager { // Stop current playback self.stop()?; - let wav_bytes = engine.synthesize(&text, voice_id.as_deref())?; + let wav_bytes = engine.synthesize(&text, voice_id.as_deref(), rate)?; let current_sink = self.current_sink.clone(); @@ -104,6 +126,47 @@ impl TtsManager { } } +#[cfg(test)] +mod voice_pick_tests { + use super::*; + + fn voices() -> Vec { + vec![ + VoiceInfo { + id: "en-1".into(), + display_name: "David".into(), + language: "en-US".into(), + }, + VoiceInfo { + id: "ru-1".into(), + display_name: "Irina".into(), + language: "ru-RU".into(), + }, + ] + } + + #[test] + fn russian_text_picks_ru_voice() { + let v = voices(); + let picked = pick_voice_for_text("Привет, как дела?", &v).unwrap(); + assert_eq!(picked.id, "ru-1"); + } + + #[test] + fn english_text_picks_en_voice() { + let v = voices(); + let picked = pick_voice_for_text("Hello, how are you?", &v).unwrap(); + assert_eq!(picked.id, "en-1"); + } + + #[test] + fn no_alpha_falls_back_to_first() { + let v = voices(); + let picked = pick_voice_for_text("12345 !!!", &v).unwrap(); + assert_eq!(picked.id, "en-1"); + } +} + #[cfg(all(test, windows))] mod tests { 
use super::*; @@ -123,7 +186,7 @@ mod tests { ); let wav = engine - .synthesize("Echo speech engine online. Эхо на связи.", None) + .synthesize("Echo speech engine online. Эхо на связи.", None, 1.0) .expect("synthesize failed"); assert!(wav.len() > 44, "WAV too small: {} bytes", wav.len()); assert_eq!(&wav[0..4], b"RIFF", "not a WAV container"); diff --git a/src-tauri/src/tts/windows.rs b/src-tauri/src/tts/windows.rs index 3fdf0ce..1d18a71 100644 --- a/src-tauri/src/tts/windows.rs +++ b/src-tauri/src/tts/windows.rs @@ -29,10 +29,25 @@ impl TtsEngine for WindowsTts { Ok(out) } - fn synthesize(&self, text: &str, voice_id: Option<&str>) -> Result, String> { + fn synthesize(&self, text: &str, voice_id: Option<&str>, rate: f32) -> Result, String> { let synth = SpeechSynthesizer::new().map_err(|e| e.to_string())?; - if let Some(id) = voice_id { + // WinRT SpeakingRate is a multiplier; valid range is 0.5..=6.0. + let rate = rate.clamp(0.5, 6.0) as f64; + synth + .Options() + .map_err(|e| e.to_string())? + .SetSpeakingRate(rate) + .map_err(|e| e.to_string())?; + + // Resolve the target voice id: explicit when given, otherwise + // auto-selected by the text's script (Cyrillic -> ru, else en). 
+ let target_id: Option = match voice_id { + Some(id) => Some(id.to_string()), + None => super::pick_voice_for_text(text, &self.list_voices()?).map(|v| v.id.clone()), + }; + + if let Some(id) = target_id { let voices = SpeechSynthesizer::AllVoices().map_err(|e| e.to_string())?; let target = voices .into_iter() diff --git a/src/bindings.ts b/src/bindings.ts index 9badac5..3c0e021 100644 --- a/src/bindings.ts +++ b/src/bindings.ts @@ -971,9 +971,9 @@ async ttsListVoices() : Promise> { else return { status: "error", error: e as any }; } }, -async ttsSpeak(text: string, voiceId: string | null) : Promise> { +async ttsSpeak(text: string, voiceId: string | null, rate: number | null) : Promise> { try { - return { status: "ok", data: await TAURI_INVOKE("tts_speak", { text, voiceId }) }; + return { status: "ok", data: await TAURI_INVOKE("tts_speak", { text, voiceId, rate }) }; } catch (e) { if(e instanceof Error) throw e; else return { status: "error", error: e as any }; @@ -1063,7 +1063,7 @@ historyUpdatePayload: "history-update-payload" /** user-defined types **/ -export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; push_to_talk: boolean; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: boolean; update_checks_enabled?: boolean; selected_model?: string; always_on_microphone?: boolean; selected_microphone?: string | null; clamshell_microphone?: string | null; selected_output_device?: string | null; translate_to_english?: boolean; selected_language?: string; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; agent_bridge_enabled?: boolean; agent_bridge_port?: number; custom_words?: string[]; model_unload_timeout?: ModelUnloadTimeout; word_correction_threshold?: number; history_limit?: number; recording_retention_period?: RecordingRetentionPeriod; paste_method?: PasteMethod; clipboard_handling?: ClipboardHandling; auto_submit?: boolean; 
auto_submit_key?: AutoSubmitKey; post_process_enabled?: boolean; post_process_provider_id?: string; post_process_providers?: PostProcessProvider[]; post_process_api_keys?: SecretMap; post_process_models?: Partial<{ [key in string]: string }>; post_process_prompts?: LLMPrompt[]; post_process_selected_prompt_id?: string | null; translate_enabled?: boolean; translate_target?: Lang; translate_model?: string; translate_base_url?: string; mute_while_recording?: boolean; append_trailing_space?: boolean; app_language?: string; experimental_enabled?: boolean; lazy_stream_close?: boolean; keyboard_implementation?: KeyboardImplementation; show_tray_icon?: boolean; paste_delay_ms?: number; typing_tool?: TypingTool; external_script_path: string | null; capture_folder?: string; capture_trigger_phrases?: string; custom_filler_words?: string[] | null; whisper_accelerator?: WhisperAcceleratorSetting; ort_accelerator?: OrtAcceleratorSetting; whisper_gpu_device?: number; extra_recording_buffer_ms?: number; auto_punctuate?: boolean; auto_capitalize?: boolean; subtitle_overlay?: boolean; subtitle_font_size?: SubtitleFontSize; subtitle_max_chars?: number; subtitle_refresh_ms?: number; command_mode_enabled?: boolean; coach_toast_enabled?: boolean; snippets?: Snippet[]; self_correction_enabled?: boolean; spoken_lists_enabled?: boolean; dev_dictionary_enabled?: boolean } +export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; push_to_talk: boolean; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: boolean; update_checks_enabled?: boolean; selected_model?: string; always_on_microphone?: boolean; selected_microphone?: string | null; clamshell_microphone?: string | null; selected_output_device?: string | null; translate_to_english?: boolean; selected_language?: string; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; agent_bridge_enabled?: boolean; 
agent_bridge_port?: number; tts_enabled?: boolean; tts_voice_id?: string | null; tts_rate?: number; custom_words?: string[]; model_unload_timeout?: ModelUnloadTimeout; word_correction_threshold?: number; history_limit?: number; recording_retention_period?: RecordingRetentionPeriod; paste_method?: PasteMethod; clipboard_handling?: ClipboardHandling; auto_submit?: boolean; auto_submit_key?: AutoSubmitKey; post_process_enabled?: boolean; post_process_provider_id?: string; post_process_providers?: PostProcessProvider[]; post_process_api_keys?: SecretMap; post_process_models?: Partial<{ [key in string]: string }>; post_process_prompts?: LLMPrompt[]; post_process_selected_prompt_id?: string | null; translate_enabled?: boolean; translate_target?: Lang; translate_model?: string; translate_base_url?: string; mute_while_recording?: boolean; append_trailing_space?: boolean; app_language?: string; experimental_enabled?: boolean; lazy_stream_close?: boolean; keyboard_implementation?: KeyboardImplementation; show_tray_icon?: boolean; paste_delay_ms?: number; typing_tool?: TypingTool; external_script_path: string | null; capture_folder?: string; capture_trigger_phrases?: string; custom_filler_words?: string[] | null; whisper_accelerator?: WhisperAcceleratorSetting; ort_accelerator?: OrtAcceleratorSetting; whisper_gpu_device?: number; extra_recording_buffer_ms?: number; auto_punctuate?: boolean; auto_capitalize?: boolean; subtitle_overlay?: boolean; subtitle_font_size?: SubtitleFontSize; subtitle_max_chars?: number; subtitle_refresh_ms?: number; command_mode_enabled?: boolean; coach_toast_enabled?: boolean; snippets?: Snippet[]; self_correction_enabled?: boolean; spoken_lists_enabled?: boolean; dev_dictionary_enabled?: boolean } export type AudioDevice = { index: string; name: string; is_default: boolean } export type AutoSubmitKey = "enter" | "ctrl_enter" | "cmd_enter" export type AvailableAccelerators = { whisper: string[]; ort: string[]; gpu_devices: GpuDeviceOption[] } diff 
--git a/src/components/Sidebar.tsx b/src/components/Sidebar.tsx index 45f5550..70c49a4 100644 --- a/src/components/Sidebar.tsx +++ b/src/components/Sidebar.tsx @@ -9,6 +9,7 @@ import { Cpu, LineChart, FileAudio, + Volume2, } from "lucide-react"; import EchoTextLogo from "./icons/EchoTextLogo"; import EchoHand from "./icons/EchoHand"; @@ -18,6 +19,7 @@ import { AdvancedSettings, HistorySettings, CoachSettings, + TtsSettings, DebugSettings, AboutSettings, PostProcessingSettings, @@ -79,6 +81,12 @@ export const SECTIONS_CONFIG = { component: CoachSettings, enabled: () => true, }, + tts: { + labelKey: "sidebar.tts", + icon: Volume2, + component: TtsSettings, + enabled: () => true, + }, postprocessing: { labelKey: "sidebar.postProcessing", icon: Sparkles, diff --git a/src/components/settings/index.ts b/src/components/settings/index.ts index def5a95..740aa58 100644 --- a/src/components/settings/index.ts +++ b/src/components/settings/index.ts @@ -4,6 +4,7 @@ export { AdvancedSettings } from "./advanced/AdvancedSettings"; export { DebugSettings } from "./debug/DebugSettings"; export { HistorySettings } from "./history/HistorySettings"; export { CoachSettings } from "./coach/CoachSettings"; +export { TtsSettings } from "./tts/TtsSettings"; export { AboutSettings } from "./about/AboutSettings"; export { PostProcessingSettings } from "./post-processing/PostProcessingSettings"; export { ModelsSettings } from "./models/ModelsSettings"; diff --git a/src/components/settings/tts/TtsSettings.tsx b/src/components/settings/tts/TtsSettings.tsx new file mode 100644 index 0000000..cac8edf --- /dev/null +++ b/src/components/settings/tts/TtsSettings.tsx @@ -0,0 +1,96 @@ +import { type FC, useEffect, useState } from "react"; +import { useTranslation } from "react-i18next"; +import { commands, type VoiceInfo } from "@/bindings"; +import { useSettings } from "@/hooks/useSettings"; + +export const TtsSettings: FC = () => { + const { t } = useTranslation(); + const { settings, updateSetting } 
= useSettings(); + const [voices, setVoices] = useState([]); + + useEffect(() => { + commands + .ttsListVoices() + .then((res) => setVoices(res.status === "ok" ? res.data : [])) + .catch(() => setVoices([])); + }, []); + + if (!settings) return null; + + const enabled = settings.tts_enabled; + const voiceId = settings.tts_voice_id ?? ""; + const rate = settings.tts_rate ?? 1.0; + + const test = () => { + void commands.ttsSpeak( + "Echo speech engine online. Эхо на связи.", + voiceId || null, + rate, + ); + }; + + return ( +
+
+

{t("settings.tts.title")}

+

{t("settings.tts.description")}

+
+ + {/* Enable */} + + + {/* Voice */} +
+ + +
+ + {/* Rate */} +
+ + + void updateSetting("tts_rate", parseFloat(e.target.value)) + } + className="w-full" + /> +
+ + +
+ ); +}; diff --git a/src/i18n/locales/en/translation.json b/src/i18n/locales/en/translation.json index aa71e84..59f323c 100644 --- a/src/i18n/locales/en/translation.json +++ b/src/i18n/locales/en/translation.json @@ -17,7 +17,8 @@ "debug": "Debug", "about": "About", "coach": "Coach", - "transcribe": "Transcribe File" + "transcribe": "Transcribe File", + "tts": "Voice" }, "onboarding": { "subtitle": "To get started, choose a transcription model", @@ -634,6 +635,15 @@ "belowAvg": "below your avg", "aboveAvg": "above your avg" }, + "tts": { + "title": "Voice (Text-to-Speech)", + "description": "Echo can speak agent questions and replies aloud.", + "speakAnswers": "Speak agent questions aloud", + "voice": "Voice", + "voiceAuto": "Auto (by language)", + "rate": "Speaking rate", + "testVoice": "Test voice" + }, "transcribe": { "tab": "Transcribe", "title": "Transcribe File", diff --git a/src/i18n/locales/ru/translation.json b/src/i18n/locales/ru/translation.json index a0532ed..ec6a545 100644 --- a/src/i18n/locales/ru/translation.json +++ b/src/i18n/locales/ru/translation.json @@ -17,7 +17,8 @@ "debug": "Отладка", "about": "О программе", "coach": "Coach", - "transcribe": "Транскрипция файла" + "transcribe": "Транскрипция файла", + "tts": "Голос" }, "onboarding": { "subtitle": "Для начала выберите модель транскрипции", @@ -624,6 +625,15 @@ "belowAvg": "below your avg", "aboveAvg": "above your avg" }, + "tts": { + "title": "Голос (синтез речи)", + "description": "Echo может озвучивать вопросы и ответы агента вслух.", + "speakAnswers": "Озвучивать вопросы агента", + "voice": "Голос", + "voiceAuto": "Авто (по языку)", + "rate": "Скорость речи", + "testVoice": "Проверить голос" + }, "transcribe": { "tab": "Транскрипция", "title": "Транскрипция файла", From fd2b708f19ab0431874d434d9b254757d1d36711 Mon Sep 17 00:00:00 2001 From: master5d Date: Mon, 15 Jun 2026 08:22:09 -0500 Subject: [PATCH 3/5] =?UTF-8?q?feat(assistant):=20assistant=20mode=20v0=20?= 
=?UTF-8?q?=E2=80=94=20LLM=20round-trip=20via=20post-process=20provider?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reuses the configured post-processing OpenAI-compatible provider/model/key (llm_client::send_chat_completion_with_schema) with an Echo voice-assistant system prompt. New assistant module + assistant_ask command, settings (assistant_enabled / assistant_system_prompt), and an Assistant settings card (toggle, system-prompt editor, try-it box with speak-reply via TTS). en/ru i18n. Stacked on feat/tts-v1. Verified: cargo check + cargo test --lib assistant + tsc + eslint green. Built by Gemini CLI (Serena disabled) + integration fixes. Co-Authored-By: Claude Opus 4.8 --- src-tauri/src/assistant/mod.rs | 58 ++++++++ src-tauri/src/commands/assistant.rs | 5 + src-tauri/src/commands/mod.rs | 1 + src-tauri/src/lib.rs | 2 + src-tauri/src/settings.rs | 10 ++ src/bindings.ts | 10 +- src/components/Sidebar.tsx | 8 ++ .../settings/assistant/AssistantSettings.tsx | 124 ++++++++++++++++++ src/components/settings/index.ts | 1 + src/i18n/locales/en/translation.json | 17 +++ src/i18n/locales/ru/translation.json | 19 ++- 11 files changed, 253 insertions(+), 2 deletions(-) create mode 100644 src-tauri/src/assistant/mod.rs create mode 100644 src-tauri/src/commands/assistant.rs create mode 100644 src/components/settings/assistant/AssistantSettings.tsx diff --git a/src-tauri/src/assistant/mod.rs b/src-tauri/src/assistant/mod.rs new file mode 100644 index 0000000..d90b5c8 --- /dev/null +++ b/src-tauri/src/assistant/mod.rs @@ -0,0 +1,58 @@ +use tauri::AppHandle; + +pub const DEFAULT_ASSISTANT_SYSTEM_PROMPT: &str = "You are Echo, a concise, friendly voice assistant. Answer in the same language the user used (Russian or English). 
Keep replies short and speakable — no markdown, no code fences, no bullet lists unless asked."; + +pub async fn ask_assistant(app: &AppHandle, user_text: String) -> Result { + let settings = crate::settings::get_settings(app); + + let provider = settings + .active_post_process_provider() + .ok_or_else(|| "no LLM provider configured".to_string())? + .clone(); + + let api_key = settings + .post_process_api_keys + .get(&provider.id) + .cloned() + .unwrap_or_default(); + let model = settings + .post_process_models + .get(&provider.id) + .cloned() + .unwrap_or_default(); + + if model.is_empty() { + return Err("no LLM model configured for the active provider".to_string()); + } + + let system = if settings.assistant_system_prompt.is_empty() { + DEFAULT_ASSISTANT_SYSTEM_PROMPT.to_string() + } else { + settings.assistant_system_prompt.clone() + }; + + let reply = crate::llm_client::send_chat_completion_with_schema( + &provider, + api_key, + &model, + user_text, + Some(system), + None, + None, + None, + ) + .await?; + + reply.ok_or_else(|| "empty reply from assistant".to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_prompt() { + assert!(!DEFAULT_ASSISTANT_SYSTEM_PROMPT.is_empty()); + assert!(DEFAULT_ASSISTANT_SYSTEM_PROMPT.contains("Echo")); + } +} diff --git a/src-tauri/src/commands/assistant.rs b/src-tauri/src/commands/assistant.rs new file mode 100644 index 0000000..7c027af --- /dev/null +++ b/src-tauri/src/commands/assistant.rs @@ -0,0 +1,5 @@ +#[tauri::command] +#[specta::specta] +pub async fn assistant_ask(app: tauri::AppHandle, text: String) -> Result { + crate::assistant::ask_assistant(&app, text).await +} diff --git a/src-tauri/src/commands/mod.rs b/src-tauri/src/commands/mod.rs index fa6ed90..8564aa7 100644 --- a/src-tauri/src/commands/mod.rs +++ b/src-tauri/src/commands/mod.rs @@ -1,4 +1,5 @@ pub mod agent_bridge; +pub mod assistant; pub mod audio; pub mod coach; pub mod history; diff --git a/src-tauri/src/lib.rs 
b/src-tauri/src/lib.rs index 1ab901f..1fcb04d 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -2,6 +2,7 @@ mod actions; mod agent_bridge; #[cfg(all(target_os = "macos", target_arch = "aarch64"))] mod apple_intelligence; +mod assistant; pub mod audio_toolkit; mod capture; pub mod cli; @@ -548,6 +549,7 @@ pub fn run(cli_args: CliArgs) { commands::tts::tts_list_voices, commands::tts::tts_speak, commands::tts::tts_stop, + commands::assistant::assistant_ask, commands::agent_bridge::agent_bridge_answer, commands::agent_bridge::agent_bridge_dismiss, commands::agent_bridge::agent_bridge_answers, diff --git a/src-tauri/src/settings.rs b/src-tauri/src/settings.rs index 3fe1344..2d47be3 100644 --- a/src-tauri/src/settings.rs +++ b/src-tauri/src/settings.rs @@ -495,12 +495,20 @@ pub struct AppSettings { pub spoken_lists_enabled: bool, #[serde(default)] pub dev_dictionary_enabled: bool, + #[serde(default = "default_assistant_enabled")] + pub assistant_enabled: bool, + #[serde(default)] + pub assistant_system_prompt: String, } pub fn default_spoken_lists_enabled() -> bool { true } +pub fn default_assistant_enabled() -> bool { + false +} + pub fn default_auto_punctuate() -> bool { true } @@ -1001,6 +1009,8 @@ pub fn get_default_settings() -> AppSettings { self_correction_enabled: false, spoken_lists_enabled: default_spoken_lists_enabled(), dev_dictionary_enabled: false, + assistant_enabled: default_assistant_enabled(), + assistant_system_prompt: String::new(), } } diff --git a/src/bindings.ts b/src/bindings.ts index 3c0e021..7d557c6 100644 --- a/src/bindings.ts +++ b/src/bindings.ts @@ -987,6 +987,14 @@ async ttsStop() : Promise> { else return { status: "error", error: e as any }; } }, +async assistantAsk(text: string) : Promise> { + try { + return { status: "ok", data: await TAURI_INVOKE("assistant_ask", { text }) }; +} catch (e) { + if(e instanceof Error) throw e; + else return { status: "error", error: e as any }; +} +}, async agentBridgeAnswer(id: number, 
answer: string) : Promise> { try { return { status: "ok", data: await TAURI_INVOKE("agent_bridge_answer", { id, answer }) }; @@ -1063,7 +1071,7 @@ historyUpdatePayload: "history-update-payload" /** user-defined types **/ -export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; push_to_talk: boolean; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: boolean; update_checks_enabled?: boolean; selected_model?: string; always_on_microphone?: boolean; selected_microphone?: string | null; clamshell_microphone?: string | null; selected_output_device?: string | null; translate_to_english?: boolean; selected_language?: string; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; agent_bridge_enabled?: boolean; agent_bridge_port?: number; tts_enabled?: boolean; tts_voice_id?: string | null; tts_rate?: number; custom_words?: string[]; model_unload_timeout?: ModelUnloadTimeout; word_correction_threshold?: number; history_limit?: number; recording_retention_period?: RecordingRetentionPeriod; paste_method?: PasteMethod; clipboard_handling?: ClipboardHandling; auto_submit?: boolean; auto_submit_key?: AutoSubmitKey; post_process_enabled?: boolean; post_process_provider_id?: string; post_process_providers?: PostProcessProvider[]; post_process_api_keys?: SecretMap; post_process_models?: Partial<{ [key in string]: string }>; post_process_prompts?: LLMPrompt[]; post_process_selected_prompt_id?: string | null; translate_enabled?: boolean; translate_target?: Lang; translate_model?: string; translate_base_url?: string; mute_while_recording?: boolean; append_trailing_space?: boolean; app_language?: string; experimental_enabled?: boolean; lazy_stream_close?: boolean; keyboard_implementation?: KeyboardImplementation; show_tray_icon?: boolean; paste_delay_ms?: number; typing_tool?: TypingTool; external_script_path: string | null; capture_folder?: string; 
capture_trigger_phrases?: string; custom_filler_words?: string[] | null; whisper_accelerator?: WhisperAcceleratorSetting; ort_accelerator?: OrtAcceleratorSetting; whisper_gpu_device?: number; extra_recording_buffer_ms?: number; auto_punctuate?: boolean; auto_capitalize?: boolean; subtitle_overlay?: boolean; subtitle_font_size?: SubtitleFontSize; subtitle_max_chars?: number; subtitle_refresh_ms?: number; command_mode_enabled?: boolean; coach_toast_enabled?: boolean; snippets?: Snippet[]; self_correction_enabled?: boolean; spoken_lists_enabled?: boolean; dev_dictionary_enabled?: boolean } +export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; push_to_talk: boolean; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: boolean; update_checks_enabled?: boolean; selected_model?: string; always_on_microphone?: boolean; selected_microphone?: string | null; clamshell_microphone?: string | null; selected_output_device?: string | null; translate_to_english?: boolean; selected_language?: string; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; agent_bridge_enabled?: boolean; agent_bridge_port?: number; tts_enabled?: boolean; tts_voice_id?: string | null; tts_rate?: number; assistant_enabled?: boolean; assistant_system_prompt?: string; custom_words?: string[]; model_unload_timeout?: ModelUnloadTimeout; word_correction_threshold?: number; history_limit?: number; recording_retention_period?: RecordingRetentionPeriod; paste_method?: PasteMethod; clipboard_handling?: ClipboardHandling; auto_submit?: boolean; auto_submit_key?: AutoSubmitKey; post_process_enabled?: boolean; post_process_provider_id?: string; post_process_providers?: PostProcessProvider[]; post_process_api_keys?: SecretMap; post_process_models?: Partial<{ [key in string]: string }>; post_process_prompts?: LLMPrompt[]; post_process_selected_prompt_id?: string | null; 
translate_enabled?: boolean; translate_target?: Lang; translate_model?: string; translate_base_url?: string; mute_while_recording?: boolean; append_trailing_space?: boolean; app_language?: string; experimental_enabled?: boolean; lazy_stream_close?: boolean; keyboard_implementation?: KeyboardImplementation; show_tray_icon?: boolean; paste_delay_ms?: number; typing_tool?: TypingTool; external_script_path: string | null; capture_folder?: string; capture_trigger_phrases?: string; custom_filler_words?: string[] | null; whisper_accelerator?: WhisperAcceleratorSetting; ort_accelerator?: OrtAcceleratorSetting; whisper_gpu_device?: number; extra_recording_buffer_ms?: number; auto_punctuate?: boolean; auto_capitalize?: boolean; subtitle_overlay?: boolean; subtitle_font_size?: SubtitleFontSize; subtitle_max_chars?: number; subtitle_refresh_ms?: number; command_mode_enabled?: boolean; coach_toast_enabled?: boolean; snippets?: Snippet[]; self_correction_enabled?: boolean; spoken_lists_enabled?: boolean; dev_dictionary_enabled?: boolean } export type AudioDevice = { index: string; name: string; is_default: boolean } export type AutoSubmitKey = "enter" | "ctrl_enter" | "cmd_enter" export type AvailableAccelerators = { whisper: string[]; ort: string[]; gpu_devices: GpuDeviceOption[] } diff --git a/src/components/Sidebar.tsx b/src/components/Sidebar.tsx index 70c49a4..9a40366 100644 --- a/src/components/Sidebar.tsx +++ b/src/components/Sidebar.tsx @@ -8,6 +8,7 @@ import { Sparkles, Cpu, LineChart, + MessageCircle, FileAudio, Volume2, } from "lucide-react"; @@ -19,6 +20,7 @@ import { AdvancedSettings, HistorySettings, CoachSettings, + AssistantSettings, TtsSettings, DebugSettings, AboutSettings, @@ -81,6 +83,12 @@ export const SECTIONS_CONFIG = { component: CoachSettings, enabled: () => true, }, + assistant: { + labelKey: "sidebar.assistant", + icon: MessageCircle, + component: AssistantSettings, + enabled: () => true, + }, tts: { labelKey: "sidebar.tts", icon: Volume2, diff --git 
a/src/components/settings/assistant/AssistantSettings.tsx b/src/components/settings/assistant/AssistantSettings.tsx new file mode 100644 index 0000000..1027645 --- /dev/null +++ b/src/components/settings/assistant/AssistantSettings.tsx @@ -0,0 +1,124 @@ +import React, { useState } from "react"; +import { useTranslation } from "react-i18next"; +import { commands } from "@/bindings"; +import { useSettings } from "@/hooks/useSettings"; +import { + SettingContainer, + SettingsGroup, + ToggleSwitch, + Textarea, +} from "@/components/ui"; +import { Input } from "@/components/ui/Input"; +import { Button } from "@/components/ui/Button"; + +export const AssistantSettings: React.FC = () => { + const { t } = useTranslation(); + const { getSetting, updateSetting } = useSettings(); + + const [testInput, setTestInput] = useState(""); + const [testReply, setTestReply] = useState(""); + const [isLoading, setIsLoading] = useState(false); + const [error, setError] = useState(""); + + const assistantEnabled = getSetting("assistant_enabled") ?? false; + const assistantSystemPrompt = getSetting("assistant_system_prompt") ?? ""; + + const handleAsk = async () => { + if (!testInput.trim()) return; + + setIsLoading(true); + setError(""); + setTestReply(""); + + try { + const result = await commands.assistantAsk(testInput); + if (result.status === "ok") { + setTestReply(result.data); + } else { + setError(result.error); + } + } catch (err) { + setError(String(err)); + } finally { + setIsLoading(false); + } + }; + + const handleSpeak = async () => { + if (!testReply) return; + try { + await commands.ttsSpeak(testReply, null, 1.0); + } catch (err) { + console.error("Failed to speak reply:", err); + } + }; + + return ( +
+ + updateSetting("assistant_enabled", checked)} + label={t("settings.assistant.enable.label")} + description={t("settings.assistant.enable.description")} + descriptionMode="inline" + grouped={true} + /> + + +