diff --git a/README.md b/README.md index 177187d25..c30f58141 100644 --- a/README.md +++ b/README.md @@ -321,6 +321,7 @@ codewhale --provider openrouter --model minimax/minimax-m3 # Xiaomi MiMo codewhale auth set --provider xiaomi-mimo --api-key "YOUR_XIAOMI_KEY" codewhale --provider xiaomi-mimo --model mimo-v2.5-pro +codewhale --provider xiaomi-mimo speech "Hello from MiMo" --model tts -o hello.wav # Novita codewhale auth set --provider novita --api-key "YOUR_NOVITA_API_KEY" diff --git a/README.zh-CN.md b/README.zh-CN.md index d1adce362..012b94c18 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -268,6 +268,7 @@ codewhale --provider openrouter --model qwen/qwen3.7-max # Xiaomi MiMo codewhale auth set --provider xiaomi-mimo --api-key "YOUR_XIAOMI_MIMO_API_KEY" codewhale --provider xiaomi-mimo --model mimo-v2.5-pro +codewhale --provider xiaomi-mimo speech "你好，MiMo" --model tts -o hello.wav # Novita codewhale auth set --provider novita --api-key "YOUR_NOVITA_API_KEY" diff --git a/config.example.toml b/config.example.toml index b4d21c158..55ed4419d 100644 --- a/config.example.toml +++ b/config.example.toml @@ -45,6 +45,9 @@ base_url = "https://api.deepseek.com/beta" # deepseek-ai/deepseek-v4-flash — default AtlasCloud model ID # deepseek-reasoner — default Wanjie Ark model ID # mimo-v2.5-pro — default Xiaomi MiMo model ID +# mimo-v2.5-tts — Xiaomi MiMo speech/TTS model ID +# mimo-v2.5-tts-voicedesign — Xiaomi MiMo voice-design TTS model ID +# mimo-v2.5-tts-voiceclone — Xiaomi MiMo voice-clone TTS model ID # accounts/fireworks/models/deepseek-v4-pro — Fireworks AI Pro model ID # deepseek-ai/DeepSeek-V4-Pro — SiliconFlow hosted Pro model ID # deepseek-ai/DeepSeek-V4-Flash — SiliconFlow hosted Flash model ID @@ -120,6 +123,11 @@ memory_path = "~/.codewhale/memory.md" # Parsed but currently unused (reserved for future versions): # tools_file = "./tools.json" +# Xiaomi MiMo speech/TTS defaults. 
Also configurable with +# XIAOMI_MIMO_SPEECH_OUTPUT_DIR / MIMO_SPEECH_OUTPUT_DIR. +[speech] +# output_dir = "./speech" + # Native tool catalog controls (#2076). By default only the core tool surface # is loaded into the model context; less common native tools are discoverable # through ToolSearch and loaded on first use. @@ -286,7 +294,9 @@ max_subagents = 10 # optional (1-20) [providers.xiaomi_mimo] # api_key = "YOUR_XIAOMI_KEY" # base_url = "https://api.xiaomimimo.com/v1" -# model = "mimo-v2.5-pro" +# model = "mimo-v2.5-pro" # chat/reasoning +# TTS aliases are also accepted by `codewhale speech`: tts, voice-design, voice-clone +# TTS model IDs: mimo-v2.5-tts, mimo-v2.5-tts-voicedesign, mimo-v2.5-tts-voiceclone, mimo-v2-tts # Novita AI-hosted inference (https://novita.ai) [providers.novita] diff --git a/crates/agent/src/lib.rs b/crates/agent/src/lib.rs index 7c3bbdf75..236bc07b1 100644 --- a/crates/agent/src/lib.rs +++ b/crates/agent/src/lib.rs @@ -307,6 +307,46 @@ impl Default for ModelRegistry { supports_tools: true, supports_reasoning: true, }, + ModelInfo { + id: "mimo-v2.5-tts".to_string(), + provider: ProviderKind::XiaomiMimo, + aliases: vec![ + "tts".to_string(), + "speech".to_string(), + "mimo-tts".to_string(), + ], + supports_tools: false, + supports_reasoning: false, + }, + ModelInfo { + id: "mimo-v2.5-tts-voicedesign".to_string(), + provider: ProviderKind::XiaomiMimo, + aliases: vec![ + "voicedesign".to_string(), + "voice-design".to_string(), + "mimo-voice-design".to_string(), + ], + supports_tools: false, + supports_reasoning: false, + }, + ModelInfo { + id: "mimo-v2.5-tts-voiceclone".to_string(), + provider: ProviderKind::XiaomiMimo, + aliases: vec![ + "voiceclone".to_string(), + "voice-clone".to_string(), + "mimo-voice-clone".to_string(), + ], + supports_tools: false, + supports_reasoning: false, + }, + ModelInfo { + id: "mimo-v2-tts".to_string(), + provider: ProviderKind::XiaomiMimo, + aliases: vec!["mimo-v2-speech".to_string()], + supports_tools: 
false, + supports_reasoning: false, + }, ModelInfo { id: "deepseek/deepseek-v4-pro".to_string(), provider: ProviderKind::Novita, @@ -649,6 +689,22 @@ mod tests { assert!(resolved.resolved.supports_reasoning); } + #[test] + fn xiaomi_mimo_tts_aliases_resolve_when_provider_hinted() { + let registry = ModelRegistry::default(); + let resolved = registry.resolve(Some("tts"), Some(ProviderKind::XiaomiMimo)); + assert_eq!(resolved.resolved.provider, ProviderKind::XiaomiMimo); + assert_eq!(resolved.resolved.id, "mimo-v2.5-tts"); + assert!(!resolved.resolved.supports_tools); + assert!(!resolved.resolved.supports_reasoning); + + let resolved = registry.resolve(Some("voice-design"), Some(ProviderKind::XiaomiMimo)); + assert_eq!(resolved.resolved.id, "mimo-v2.5-tts-voicedesign"); + + let resolved = registry.resolve(Some("voiceclone"), Some(ProviderKind::XiaomiMimo)); + assert_eq!(resolved.resolved.id, "mimo-v2.5-tts-voiceclone"); + } + #[test] fn wanjie_ark_default_uses_reasoner_model_id() { let registry = ModelRegistry::default(); diff --git a/crates/cli/src/lib.rs b/crates/cli/src/lib.rs index 7bc1bd051..c4ed38bf4 100644 --- a/crates/cli/src/lib.rs +++ b/crates/cli/src/lib.rs @@ -133,6 +133,9 @@ enum Commands { Doctor(TuiPassthroughArgs), /// List live DeepSeek API models via the TUI binary. Models(TuiPassthroughArgs), + /// Generate speech audio with Xiaomi MiMo TTS models via the TUI binary. + #[command(visible_alias = "tts")] + Speech(TuiPassthroughArgs), /// List saved TUI sessions. Sessions(TuiPassthroughArgs), /// Resume a saved TUI session. 
@@ -510,6 +513,10 @@ fn run() -> Result<()> { let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides); delegate_to_tui(&cli, &resolved_runtime, tui_args("models", args)) } + Some(Commands::Speech(args)) => { + let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides); + delegate_to_tui(&cli, &resolved_runtime, tui_args("speech", args)) + } Some(Commands::Sessions(args)) => { let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides); delegate_to_tui(&cli, &resolved_runtime, tui_args("sessions", args)) diff --git a/crates/config/src/lib.rs b/crates/config/src/lib.rs index a09569a39..4a8044061 100644 --- a/crates/config/src/lib.rs +++ b/crates/config/src/lib.rs @@ -42,6 +42,10 @@ const OPENROUTER_TENCENT_HY3_PREVIEW_MODEL: &str = "tencent/hy3-preview"; const OPENROUTER_XIAOMI_MIMO_V2_5_PRO_MODEL: &str = "xiaomi/mimo-v2.5-pro"; const OPENROUTER_XIAOMI_MIMO_V2_5_MODEL: &str = "xiaomi/mimo-v2.5"; const DEFAULT_XIAOMI_MIMO_MODEL: &str = "mimo-v2.5-pro"; +const XIAOMI_MIMO_TTS_MODEL: &str = "mimo-v2.5-tts"; +const XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL: &str = "mimo-v2.5-tts-voicedesign"; +const XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL: &str = "mimo-v2.5-tts-voiceclone"; +const XIAOMI_MIMO_V2_TTS_MODEL: &str = "mimo-v2-tts"; const DEFAULT_NOVITA_MODEL: &str = "deepseek/deepseek-v4-pro"; const DEFAULT_NOVITA_FLASH_MODEL: &str = "deepseek/deepseek-v4-flash"; const DEFAULT_FIREWORKS_MODEL: &str = "accounts/fireworks/models/deepseek-v4-pro"; @@ -1426,6 +1430,12 @@ pub fn load_project_config(workspace: &Path) -> Option { } fn normalize_model_for_provider(provider: ProviderKind, model: &str) -> String { + if matches!(provider, ProviderKind::XiaomiMimo) + && let Some(canonical) = canonical_xiaomi_mimo_model_id(model) + { + return canonical.to_string(); + } + if matches!( provider, ProviderKind::Atlascloud @@ -1500,6 +1510,38 @@ fn normalize_model_for_provider(provider: ProviderKind, model: &str) -> String { } } 
+fn canonical_xiaomi_mimo_model_id(model: &str) -> Option<&'static str> { + let normalized = model.trim().to_ascii_lowercase(); + let normalized = normalized.replace(['_', ' '], "-"); + match normalized.as_str() { + "mimo" + | DEFAULT_XIAOMI_MIMO_MODEL + | "mimo-v2-5-pro" + | "xiaomi-mimo-v2.5-pro" + | "xiaomi-mimo-v2-5-pro" => Some(DEFAULT_XIAOMI_MIMO_MODEL), + "mimo-v2.5" | "mimo-v25" | "mimo-v2-5" | "xiaomi-mimo-v2.5" | "xiaomi-mimo-v2-5" => { + Some("mimo-v2.5") + } + "mimo-tts" | "mimo-v25-tts" | "mimo-v2.5-tts" | "tts" | "speech" => { + Some(XIAOMI_MIMO_TTS_MODEL) + } + "mimo-tts-voicedesign" + | "mimo-voice-design" + | "mimo-v25-tts-voicedesign" + | "mimo-v2.5-tts-voicedesign" + | "voicedesign" + | "voice-design" => Some(XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL), + "mimo-tts-voiceclone" + | "mimo-voice-clone" + | "mimo-v25-tts-voiceclone" + | "mimo-v2.5-tts-voiceclone" + | "voiceclone" + | "voice-clone" => Some(XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL), + "mimo-v2-tts" => Some(XIAOMI_MIMO_V2_TTS_MODEL), + _ => None, + } +} + fn canonical_openrouter_recent_model_id(model: &str) -> Option<&'static str> { let normalized = model.trim().to_ascii_lowercase(); let normalized = normalized.replace(['_', ' '], "-"); @@ -3263,6 +3305,26 @@ unix_socket_path = "/tmp/cw-hooks.sock" assert_eq!(resolved.model, DEFAULT_XIAOMI_MIMO_MODEL); } + #[test] + fn xiaomi_mimo_tts_aliases_resolve_to_canonical_models() { + assert_eq!( + normalize_model_for_provider(ProviderKind::XiaomiMimo, "tts"), + "mimo-v2.5-tts" + ); + assert_eq!( + normalize_model_for_provider(ProviderKind::XiaomiMimo, "voice-design"), + "mimo-v2.5-tts-voicedesign" + ); + assert_eq!( + normalize_model_for_provider(ProviderKind::XiaomiMimo, "voiceclone"), + "mimo-v2.5-tts-voiceclone" + ); + assert_eq!( + normalize_model_for_provider(ProviderKind::XiaomiMimo, "custom-mimo-model"), + "custom-mimo-model" + ); + } + #[test] fn novita_provider_defaults_to_canonical_endpoint_and_model() { let _lock = env_lock(); diff --git 
a/crates/tui/src/client.rs b/crates/tui/src/client.rs index 87db2f816..7215f6d88 100644 --- a/crates/tui/src/client.rs +++ b/crates/tui/src/client.rs @@ -8,6 +8,7 @@ use std::sync::{Arc, Mutex as StdMutex, OnceLock}; use std::time::{Duration, Instant}; use anyhow::{Context, Result}; +use base64::{Engine as _, engine::general_purpose}; use reqwest::header::{AUTHORIZATION, CONTENT_TYPE, HeaderMap, HeaderName, HeaderValue}; use serde::{Deserialize, Serialize}; use serde_json::{Value, json}; @@ -119,6 +120,31 @@ pub struct AvailableModel { pub created: Option, } +/// Request payload for Xiaomi MiMo speech synthesis models. +/// +/// MiMo-V2.5-TTS / MiMo-V2-TTS use the OpenAI-compatible +/// `/v1/chat/completions` endpoint: the optional style/voice instruction is +/// sent as a `user` message, while the text to synthesize is sent as an +/// `assistant` message. +#[derive(Debug, Clone)] +pub struct SpeechSynthesisRequest { + pub model: String, + pub text: String, + pub instruction: Option, + pub audio_format: String, + pub voice: Option, +} + +/// Decoded speech synthesis result. +#[derive(Debug, Clone)] +pub struct SpeechSynthesisResponse { + pub model: String, + pub audio_format: String, + pub audio_bytes: Vec, + pub transcript: Option, + pub voice: Option, +} + /// Client for DeepSeek's OpenAI-compatible APIs. 
#[must_use] pub struct DeepSeekClient { @@ -407,6 +433,74 @@ pub(super) fn api_url(base_url: &str, path: &str) -> String { format!("{}/{}", versioned.trim_end_matches('/'), path) } +fn normalize_audio_format(format: &str) -> String { + let normalized = format.trim().to_ascii_lowercase(); + if normalized.is_empty() { + "wav".to_string() + } else { + normalized + } +} + +fn parse_speech_audio_response(payload: &Value) -> Result<(Vec, Option)> { + let audio = payload + .get("choices") + .and_then(Value::as_array) + .and_then(|choices| choices.first()) + .and_then(|choice| { + choice + .get("message") + .and_then(|message| message.get("audio")) + .or_else(|| choice.get("delta").and_then(|delta| delta.get("audio"))) + }) + .or_else(|| payload.get("audio")) + .context("Speech synthesis response did not include choices[0].message.audio")?; + + let data = audio + .get("data") + .and_then(Value::as_str) + .context("Speech synthesis response did not include audio.data")? + .trim(); + let data = data + .split_once(',') + .map(|(_, base64)| base64.trim()) + .unwrap_or(data); + let audio_bytes = general_purpose::STANDARD + .decode(data) + .context("Failed to decode speech audio base64 data")?; + let transcript = audio + .get("transcript") + .and_then(Value::as_str) + .map(str::to_string); + + Ok((audio_bytes, transcript)) +} + +fn build_speech_synthesis_body( + model: &str, + text: &str, + instruction: Option<&str>, + audio: Value, +) -> Value { + let mut messages = Vec::new(); + if let Some(instruction) = instruction.map(str::trim).filter(|value| !value.is_empty()) { + messages.push(json!({ + "role": "user", + "content": instruction, + })); + } + messages.push(json!({ + "role": "assistant", + "content": text, + })); + + json!({ + "model": model, + "messages": messages, + "audio": audio, + }) +} + // === DeepSeekClient === /// Returns true when DEEPSEEK_FORCE_HTTP1 is set to a truthy value @@ -645,6 +739,91 @@ impl DeepSeekClient { parse_models_response(&response_text) } + /// 
Generate speech with Xiaomi MiMo TTS models. + /// + /// The spoken text is placed in an `assistant` message because Xiaomi + /// MiMo's TTS chat-completions surface expects that shape. The optional + /// `instruction` is a `user` message that controls style, voice design, or + /// voice-clone performance and is not spoken verbatim. + pub async fn synthesize_speech( + &self, + request: SpeechSynthesisRequest, + ) -> Result { + if self.api_provider != crate::config::ApiProvider::XiaomiMimo { + anyhow::bail!( + "speech synthesis requires provider 'xiaomi-mimo' (current: {})", + self.api_provider.as_str() + ); + } + + let model = request.model.trim().to_string(); + if model.is_empty() { + anyhow::bail!("Speech model cannot be empty"); + } + let text = request.text.trim().to_string(); + if text.is_empty() { + anyhow::bail!("Speech text cannot be empty"); + } + + let audio_format = normalize_audio_format(&request.audio_format); + let model = wire_model_for_provider(self.api_provider, &model); + let model_lower = model.to_ascii_lowercase(); + let instruction = request + .instruction + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()); + let voice = request + .voice + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(str::to_string); + + if model_lower.contains("voicedesign") && instruction.is_none() { + anyhow::bail!( + "Model '{model}' requires a voice design prompt. Pass --voice-prompt or --instruction." + ); + } + if model_lower.contains("voiceclone") && voice.is_none() { + anyhow::bail!( + "Model '{model}' requires cloned voice data. Pass --clone-voice or --voice ." 
+ ); + } + + let mut audio = json!({ + "format": audio_format.clone(), + }); + if let Some(voice) = voice.as_deref() { + audio["voice"] = json!(voice); + } + + let body = build_speech_synthesis_body(&model, &text, instruction, audio); + + let url = api_url(&self.base_url, "chat/completions"); + let response = self + .send_with_retry(|| self.http_client.post(&url).json(&body)) + .await?; + let status = response.status(); + if !status.is_success() { + let error_text = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await; + anyhow::bail!("Speech synthesis failed: HTTP {status}: {error_text}"); + } + + let response_text = response.text().await.unwrap_or_default(); + let payload: Value = serde_json::from_str(&response_text) + .context("Failed to parse speech synthesis response JSON")?; + let (audio_bytes, transcript) = parse_speech_audio_response(&payload)?; + + Ok(SpeechSynthesisResponse { + model, + audio_format, + audio_bytes, + transcript, + voice, + }) + } + async fn wait_for_rate_limit(&self) { let maybe_delay = { let mut limiter = self.rate_limiter.lock().await; @@ -1166,6 +1345,86 @@ mod tests { } } + #[test] + fn parse_speech_audio_response_accepts_message_audio() { + let encoded = general_purpose::STANDARD.encode(b"hi"); + let payload = json!({ + "choices": [{ + "message": { + "audio": { + "data": encoded, + "transcript": "hi" + } + } + }] + }); + + let (audio, transcript) = parse_speech_audio_response(&payload).unwrap(); + assert_eq!(audio, b"hi"); + assert_eq!(transcript.as_deref(), Some("hi")); + } + + #[test] + fn parse_speech_audio_response_accepts_data_uri() { + let encoded = general_purpose::STANDARD.encode(b"wav"); + let payload = json!({ + "audio": { + "data": format!("data:audio/wav;base64,{encoded}") + } + }); + + let (audio, transcript) = parse_speech_audio_response(&payload).unwrap(); + assert_eq!(audio, b"wav"); + assert_eq!(transcript, None); + } + + #[test] + fn speech_synthesis_body_omits_user_message_without_instruction() { + let body = + 
build_speech_synthesis_body("mimo-v2.5-tts", "hello", None, json!({"format": "wav"})); + let messages = body["messages"].as_array().expect("messages array"); + + assert_eq!(messages.len(), 1); + assert_eq!(messages[0]["role"], "assistant"); + assert_eq!(messages[0]["content"], "hello"); + assert!( + messages + .iter() + .all(|message| message["content"].as_str() != Some("")) + ); + } + + #[test] + fn speech_synthesis_body_ignores_blank_instruction() { + let body = build_speech_synthesis_body( + "mimo-v2.5-tts", + "hello", + Some(" \t\n "), + json!({"format": "wav"}), + ); + let messages = body["messages"].as_array().expect("messages array"); + + assert_eq!(messages.len(), 1); + assert_eq!(messages[0]["role"], "assistant"); + } + + #[test] + fn speech_synthesis_body_includes_non_empty_instruction_first() { + let body = build_speech_synthesis_body( + "mimo-v2.5-tts-voicedesign", + "hello", + Some("warm and calm"), + json!({"format": "wav"}), + ); + let messages = body["messages"].as_array().expect("messages array"); + + assert_eq!(messages.len(), 2); + assert_eq!(messages[0]["role"], "user"); + assert_eq!(messages[0]["content"], "warm and calm"); + assert_eq!(messages[1]["role"], "assistant"); + assert_eq!(messages[1]["content"], "hello"); + } + #[test] fn tool_name_roundtrip_dot() { let original = "multi_tool_use.parallel"; diff --git a/crates/tui/src/commands/provider.rs b/crates/tui/src/commands/provider.rs index e64904498..72cf1bd84 100644 --- a/crates/tui/src/commands/provider.rs +++ b/crates/tui/src/commands/provider.rs @@ -36,9 +36,13 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult { let model = match model_arg { None => None, + Some(raw) if matches!(target, ApiProvider::XiaomiMimo) => { + let expanded = expand_model_alias_for_provider(target, raw); + Some(normalize_model_name_for_provider(target, &expanded).unwrap_or(expanded)) + } Some(raw) if provider_passes_model_through(target) => Some(raw.trim().to_string()), Some(raw) => { - let 
expanded = expand_model_alias(raw); + let expanded = expand_model_alias_for_provider(target, raw); let normalized = if matches!(target, ApiProvider::Deepseek | ApiProvider::DeepseekCN) { normalize_model_name_for_provider(target, &expanded) } else { @@ -48,7 +52,7 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult { Some(normalized) => Some(normalized), None => { return CommandResult::error(format!( - "Invalid model '{raw}'. Try: flash, pro, deepseek-v4-flash, deepseek-v4-pro." + "Invalid model '{raw}'. Try: flash, pro, deepseek-v4-flash, deepseek-v4-pro, or xiaomi-mimo tts." )); } } @@ -65,8 +69,24 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult { }) } -fn expand_model_alias(name: &str) -> String { - match name.trim().to_ascii_lowercase().as_str() { +fn expand_model_alias_for_provider(provider: ApiProvider, name: &str) -> String { + let lower = name.trim().to_ascii_lowercase(); + if matches!(provider, ApiProvider::XiaomiMimo) { + return match lower.as_str() { + "pro" | "mimo" => "mimo-v2.5-pro".to_string(), + "text" => "mimo-v2.5".to_string(), + "tts" | "speech" | "mimo-tts" => "mimo-v2.5-tts".to_string(), + "voicedesign" | "voice-design" | "mimo-voice-design" => { + "mimo-v2.5-tts-voicedesign".to_string() + } + "voiceclone" | "voice-clone" | "mimo-voice-clone" => { + "mimo-v2.5-tts-voiceclone".to_string() + } + other => other.to_string(), + }; + } + + match lower.as_str() { "pro" | "v4-pro" => "deepseek-v4-pro".to_string(), "flash" | "v4-flash" => "deepseek-v4-flash".to_string(), other => other.to_string(), @@ -154,6 +174,28 @@ mod tests { } } + #[test] + fn switch_to_xiaomi_mimo_accepts_tts_shorthands() { + let mut app = create_test_app(); + let result = provider(&mut app, Some("xiaomi-mimo tts")); + match result.action { + Some(AppAction::SwitchProvider { provider, model }) => { + assert_eq!(provider, ApiProvider::XiaomiMimo); + assert_eq!(model.as_deref(), Some("mimo-v2.5-tts")); + } + other => panic!("expected 
SwitchProvider, got {other:?}"), + } + + let result = provider(&mut app, Some("xiaomi-mimo voiceclone")); + match result.action { + Some(AppAction::SwitchProvider { provider, model }) => { + assert_eq!(provider, ApiProvider::XiaomiMimo); + assert_eq!(model.as_deref(), Some("mimo-v2.5-tts-voiceclone")); + } + other => panic!("expected SwitchProvider, got {other:?}"), + } + } + #[test] fn switch_to_atlascloud_emits_action() { let mut app = create_test_app(); diff --git a/crates/tui/src/config.rs b/crates/tui/src/config.rs index 10dd8493b..c98af488b 100644 --- a/crates/tui/src/config.rs +++ b/crates/tui/src/config.rs @@ -78,6 +78,10 @@ pub const RECENT_OPENROUTER_LARGE_MODELS: &[&str] = &[ pub const DEFAULT_OPENROUTER_BASE_URL: &str = "https://openrouter.ai/api/v1"; pub const DEFAULT_XIAOMI_MIMO_MODEL: &str = "mimo-v2.5-pro"; pub const DEFAULT_XIAOMI_MIMO_BASE_URL: &str = "https://api.xiaomimimo.com/v1"; +pub const XIAOMI_MIMO_TTS_MODEL: &str = "mimo-v2.5-tts"; +pub const XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL: &str = "mimo-v2.5-tts-voicedesign"; +pub const XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL: &str = "mimo-v2.5-tts-voiceclone"; +pub const XIAOMI_MIMO_V2_TTS_MODEL: &str = "mimo-v2-tts"; pub const DEFAULT_NOVITA_MODEL: &str = "deepseek/deepseek-v4-pro"; pub const DEFAULT_NOVITA_FLASH_MODEL: &str = "deepseek/deepseek-v4-flash"; pub const DEFAULT_NOVITA_BASE_URL: &str = "https://api.novita.ai/v1"; @@ -538,6 +542,38 @@ fn canonical_openrouter_recent_model_id(model: &str) -> Option<&'static str> { } } +fn canonical_xiaomi_mimo_model_id(model: &str) -> Option<&'static str> { + let normalized = model.trim().to_ascii_lowercase(); + let normalized = normalized.replace(['_', ' '], "-"); + match normalized.as_str() { + "mimo" + | DEFAULT_XIAOMI_MIMO_MODEL + | "mimo-v2-5-pro" + | "xiaomi-mimo-v2.5-pro" + | "xiaomi-mimo-v2-5-pro" => Some(DEFAULT_XIAOMI_MIMO_MODEL), + "mimo-v2.5" | "mimo-v25" | "mimo-v2-5" | "xiaomi-mimo-v2.5" | "xiaomi-mimo-v2-5" => { + Some("mimo-v2.5") + } + "mimo-tts" 
| "mimo-v25-tts" | "mimo-v2.5-tts" | "tts" | "speech" => { + Some(XIAOMI_MIMO_TTS_MODEL) + } + "mimo-tts-voicedesign" + | "mimo-voice-design" + | "mimo-v25-tts-voicedesign" + | "mimo-v2.5-tts-voicedesign" + | "voicedesign" + | "voice-design" => Some(XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL), + "mimo-tts-voiceclone" + | "mimo-voice-clone" + | "mimo-v25-tts-voiceclone" + | "mimo-v2.5-tts-voiceclone" + | "voiceclone" + | "voice-clone" => Some(XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL), + "mimo-v2-tts" => Some(XIAOMI_MIMO_V2_TTS_MODEL), + _ => None, + } +} + /// Normalize a model selected through the TUI for the active provider. /// /// Official DeepSeek endpoints require bare model IDs. Provider-prefixed @@ -556,6 +592,12 @@ pub fn normalize_model_name_for_provider(provider: ApiProvider, model: &str) -> return Some(canonical.to_string()); } + if matches!(provider, ApiProvider::XiaomiMimo) + && let Some(canonical) = canonical_xiaomi_mimo_model_id(model) + { + return Some(canonical.to_string()); + } + let normalized = normalize_model_name(model)?; if matches!(provider, ApiProvider::Deepseek | ApiProvider::DeepseekCN) && let Some(canonical) = canonical_official_deepseek_model_id(&normalized) @@ -585,7 +627,14 @@ pub fn normalize_model_name_for_provider(provider: ApiProvider, model: &str) -> #[must_use] pub fn wire_model_for_provider(provider: ApiProvider, model: &str) -> String { let trimmed = model.trim(); - if trimmed.is_empty() || provider_passes_model_through(provider) { + if trimmed.is_empty() { + return trimmed.to_string(); + } + if matches!(provider, ApiProvider::XiaomiMimo) { + return normalize_model_name_for_provider(provider, trimmed) + .unwrap_or_else(|| trimmed.to_string()); + } + if provider_passes_model_through(provider) { return trimmed.to_string(); } normalize_model_name_for_provider(provider, trimmed).unwrap_or_else(|| trimmed.to_string()) @@ -601,7 +650,14 @@ pub fn model_completion_names_for_provider(provider: ApiProvider) -> Vec<&'stati 
models.extend_from_slice(RECENT_OPENROUTER_LARGE_MODELS); models } - ApiProvider::XiaomiMimo => vec![DEFAULT_XIAOMI_MIMO_MODEL, "mimo-v2.5"], + ApiProvider::XiaomiMimo => vec![ + DEFAULT_XIAOMI_MIMO_MODEL, + "mimo-v2.5", + XIAOMI_MIMO_TTS_MODEL, + XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL, + XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL, + XIAOMI_MIMO_V2_TTS_MODEL, + ], ApiProvider::Novita => vec![DEFAULT_NOVITA_MODEL, DEFAULT_NOVITA_FLASH_MODEL], ApiProvider::Fireworks => vec![DEFAULT_FIREWORKS_MODEL], ApiProvider::Siliconflow => { @@ -822,6 +878,15 @@ pub struct MemoryConfig { pub enabled: Option, } +/// Xiaomi MiMo speech/TTS output configuration. +#[derive(Debug, Clone, Default, Deserialize)] +pub struct SpeechConfig { + /// Default directory for generated speech/TTS files when no explicit + /// output path is provided. + #[serde(default)] + pub output_dir: Option, +} + impl SnapshotsConfig { #[must_use] pub fn max_age(&self) -> std::time::Duration { @@ -1429,6 +1494,10 @@ pub struct Config { #[serde(default)] pub memory: Option, + /// Xiaomi MiMo speech/TTS defaults. + #[serde(default)] + pub speech: Option, + /// Tunables for `--model auto` (#1207). When absent, the auto router /// keeps its existing balanced behaviour. #[serde(default)] @@ -2353,6 +2422,26 @@ impl Config { .unwrap_or_else(|| PathBuf::from("./memory.md")) } + /// Resolve the default speech/TTS output directory, if configured. 
+ #[must_use] + pub fn speech_output_dir(&self) -> Option { + std::env::var("XIAOMI_MIMO_SPEECH_OUTPUT_DIR") + .or_else(|_| std::env::var("MIMO_SPEECH_OUTPUT_DIR")) + .or_else(|_| std::env::var("XIAOMIMIMO_SPEECH_OUTPUT_DIR")) + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + .map(|value| expand_path(&value)) + .or_else(|| { + self.speech + .as_ref() + .and_then(|speech| speech.output_dir.as_deref()) + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(expand_path) + }) + } + /// Resolve the configured `instructions = [...]` array (#454) /// to absolute paths, in declared order. Empty when unset or /// when every entry is empty after trimming. Each entry runs @@ -3540,6 +3629,11 @@ fn normalize_model_config(config: &mut Config) { } fn normalize_model_for_provider(provider: ApiProvider, model: &str) -> Option { + if matches!(provider, ApiProvider::XiaomiMimo) + && let Some(canonical) = canonical_xiaomi_mimo_model_id(model) + { + return Some(canonical.to_string()); + } if provider_passes_model_through(provider) { return None; } @@ -3788,6 +3882,7 @@ fn merge_config(base: Config, override_cfg: Config) -> Config { snapshots: override_cfg.snapshots.or(base.snapshots), search: override_cfg.search.or(base.search), memory: override_cfg.memory.or(base.memory), + speech: override_cfg.speech.or(base.speech), auto: override_cfg.auto.or(base.auto), update: override_cfg.update.or(base.update), lsp: override_cfg.lsp.or(base.lsp), @@ -6510,6 +6605,37 @@ api_key = "old-openrouter-key" } } + #[test] + fn normalize_xiaomi_mimo_tts_aliases_for_provider() { + assert_eq!( + normalize_model_name_for_provider(ApiProvider::XiaomiMimo, "tts").as_deref(), + Some("mimo-v2.5-tts") + ); + assert_eq!( + normalize_model_name_for_provider(ApiProvider::XiaomiMimo, "voice-design").as_deref(), + Some("mimo-v2.5-tts-voicedesign") + ); + assert_eq!( + wire_model_for_provider(ApiProvider::XiaomiMimo, "voiceclone"), + "mimo-v2.5-tts-voiceclone" + ); + } + + 
#[test] + fn model_completion_names_for_xiaomi_mimo_include_tts_models() { + let models = model_completion_names_for_provider(ApiProvider::XiaomiMimo); + for expected in [ + "mimo-v2.5-pro", + "mimo-v2.5", + "mimo-v2.5-tts", + "mimo-v2.5-tts-voicedesign", + "mimo-v2.5-tts-voiceclone", + "mimo-v2-tts", + ] { + assert!(models.contains(&expected), "missing {expected}"); + } + } + #[test] fn model_completion_names_for_deepseek_api_are_deduplicated_bare_ids() { assert_eq!( diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index 5813b5381..d5ae8ae81 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -161,6 +161,8 @@ pub struct EngineConfig { /// Path to the user memory file (#489). Always populated; only /// consulted when `memory_enabled` is `true`. pub memory_path: PathBuf, + /// Default directory for Xiaomi MiMo speech/TTS tool outputs. + pub speech_output_dir: Option, pub vision_config: Option, pub goal_objective: Option, /// Tool restriction from custom slash command frontmatter. 
@@ -233,6 +235,7 @@ impl Default for EngineConfig { subagent_model_overrides: HashMap::new(), memory_enabled: false, memory_path: PathBuf::from("./memory.md"), + speech_output_dir: None, vision_config: None, strict_tool_mode: false, goal_objective: None, @@ -725,6 +728,7 @@ impl Engine { ) .with_max_spawn_depth(self.config.max_spawn_depth) .with_step_api_timeout(self.config.subagent_api_timeout) + .with_speech_output_dir(self.config.speech_output_dir.clone()) .with_mcp_pool(mcp_pool) .background_runtime(); let route = resolve_subagent_assignment_route( @@ -1219,6 +1223,7 @@ impl Engine { ) .with_max_spawn_depth(self.config.max_spawn_depth) .with_step_api_timeout(self.config.subagent_api_timeout) + .with_speech_output_dir(self.config.speech_output_dir.clone()) .with_mcp_pool(mcp_pool.clone()) .with_parent_completion_tx(self.tx_subagent_completion.clone()); if let Some(context) = fork_context_for_runtime.clone() { diff --git a/crates/tui/src/core/engine/tool_setup.rs b/crates/tui/src/core/engine/tool_setup.rs index b31e9ce0a..63bb75f54 100644 --- a/crates/tui/src/core/engine/tool_setup.rs +++ b/crates/tui/src/core/engine/tool_setup.rs @@ -78,7 +78,11 @@ impl Engine { if mode != AppMode::Plan { builder = builder .with_rlm_tool(self.deepseek_client.clone(), self.session.model.clone()) - .with_fim_tool(self.deepseek_client.clone(), self.session.model.clone()); + .with_fim_tool(self.deepseek_client.clone(), self.session.model.clone()) + .with_speech_tools( + self.deepseek_client.clone(), + self.config.speech_output_dir.clone(), + ); } if self.config.features.enabled(Feature::ApplyPatch) && mode != AppMode::Plan { diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs index 9feaaac46..ba657bfb7 100644 --- a/crates/tui/src/main.rs +++ b/crates/tui/src/main.rs @@ -225,6 +225,9 @@ enum Commands { Logout, /// List available models from the configured API endpoint Models(ModelsArgs), + /// Generate speech audio with Xiaomi MiMo TTS models + #[command(visible_alias = 
"tts")] + Speech(SpeechArgs), /// Run a non-interactive prompt. Use --auto for tool-backed agent mode. Exec(ExecArgs), /// Generate SWE-bench prediction rows from CodeWhale runs @@ -531,6 +534,50 @@ struct ModelsArgs { json: bool, } +#[derive(Args, Debug, Clone)] +struct SpeechArgs { + /// Text to synthesize. This is sent as the assistant message content. + #[arg(value_name = "TEXT")] + text: String, + + /// Output audio path. Defaults to speech. in --output-dir, + /// [speech].output_dir, or the current directory. + #[arg(short, long, value_name = "FILE")] + output: Option, + + /// Directory for the default speech. output file when -o/--output is omitted. + #[arg(long = "output-dir", value_name = "DIR")] + output_dir: Option, + + /// TTS model. Defaults to built-in voices, or is inferred from --voice-prompt/--clone-voice. + #[arg(long)] + model: Option, + + /// Built-in voice ID, or a data:audio/...;base64,... URI for voice clone. + #[arg(long)] + voice: Option, + + /// Natural language style instruction; not spoken verbatim. + #[arg(long)] + instruction: Option, + + /// Voice design prompt. Implies mimo-v2.5-tts-voicedesign when --model is omitted. + #[arg(long = "voice-prompt")] + voice_prompt: Option, + + /// MP3/WAV sample used for voice cloning. Implies mimo-v2.5-tts-voiceclone when --model is omitted. + #[arg(long = "clone-voice", value_name = "FILE")] + clone_voice: Option, + + /// Output audio format requested from the API + #[arg(long, default_value = "wav")] + format: String, + + /// Emit machine-readable JSON output + #[arg(long, default_value_t = false)] + json: bool, +} + #[derive(Args, Debug, Default, Clone)] struct FeatureToggles { /// Enable a feature (repeatable). Equivalent to `features.=true`. 
@@ -896,6 +943,10 @@ async fn main() -> Result<()> { let config = load_config_from_cli(&cli)?; run_models(&config, args).await } + Commands::Speech(args) => { + let config = load_config_from_cli(&cli)?; + run_speech(&config, args).await + } Commands::Exec(args) => { let config = load_config_from_cli(&cli)?; let model = resolve_exec_model(&config, args.model.as_deref()); @@ -3512,6 +3563,203 @@ async fn run_models(config: &Config, args: ModelsArgs) -> Result<()> { Ok(()) } +async fn run_speech(config: &Config, args: SpeechArgs) -> Result<()> { + use crate::client::{DeepSeekClient, SpeechSynthesisRequest}; + use crate::config::ApiProvider; + use crate::tools::speech::{ + DEFAULT_VOICE, SPEECH_MODEL_EXAMPLES, combine_speech_instructions, + default_speech_output_name, describe_speech_voice, encode_voice_clone_sample_data_uri, + infer_speech_model, normalize_speech_format, + }; + + let SpeechArgs { + text, + output, + output_dir, + model, + voice, + instruction, + voice_prompt, + clone_voice, + format, + json: json_output, + } = args; + + if config.api_provider() != ApiProvider::XiaomiMimo { + bail!( + "`speech` requires provider = \"xiaomi-mimo\" (current: {}). 
Run with `--provider xiaomi-mimo` or set it in config.", + config.api_provider().as_str() + ); + } + + if text.trim().is_empty() { + bail!("Speech text cannot be empty"); + } + let voice_is_data_uri = voice + .as_deref() + .map(str::trim) + .is_some_and(|value| value.starts_with("data:audio/")); + if clone_voice.is_some() && voice.is_some() { + bail!("Use either --clone-voice or --voice for cloned voice data, not both"); + } + let model = infer_speech_model( + model.as_deref(), + clone_voice.is_some() || voice_is_data_uri, + voice_prompt.is_some(), + ); + let model_lower = model.to_ascii_lowercase(); + if !model_lower.contains("tts") { + bail!( + "speech requires a TTS model (examples: {}); got {model}", + SPEECH_MODEL_EXAMPLES.join(", ") + ); + } + let is_voice_design = model_lower.contains("voicedesign"); + let is_voice_clone = model_lower.contains("voiceclone"); + + let instruction = combine_speech_instructions(instruction, voice_prompt); + if is_voice_design + && instruction + .as_deref() + .is_none_or(|value| value.trim().is_empty()) + { + bail!( + "mimo-v2.5-tts-voicedesign requires --voice-prompt or --instruction to describe the voice" + ); + } + + let voice = if let Some(clone_path) = clone_voice { + Some(encode_voice_clone_sample_data_uri(&clone_path)?) 
+ } else if is_voice_design { + None + } else if let Some(value) = voice.filter(|value| !value.trim().is_empty()) { + Some(value) + } else if is_voice_clone { + bail!("mimo-v2.5-tts-voiceclone requires --clone-voice or --voice "); + } else { + Some(DEFAULT_VOICE.to_string()) + }; + let format = normalize_speech_format(&format).with_context(|| { + format!("Unsupported speech format '{format}' (allowed: wav, mp3, pcm16)") + })?; + let output = output.unwrap_or_else(|| { + output_dir + .or_else(|| config.speech_output_dir()) + .unwrap_or_default() + .join(default_speech_output_name(&format)) + }); + + let client = DeepSeekClient::new(config)?; + let response = client + .synthesize_speech(SpeechSynthesisRequest { + model: model.clone(), + text, + instruction, + audio_format: format.clone(), + voice, + }) + .await?; + + if let Some(parent) = output.parent().filter(|path| !path.as_os_str().is_empty()) { + std::fs::create_dir_all(parent) + .with_context(|| format!("Failed to create output directory {}", parent.display()))?; + } + std::fs::write(&output, &response.audio_bytes) + .with_context(|| format!("Failed to write audio file {}", output.display()))?; + + if json_output { + println!( + "{}", + serde_json::to_string_pretty(&serde_json::json!({ + "mode": "speech", + "success": true, + "model": response.model, + "format": response.audio_format, + "output": output.display().to_string(), + "bytes": response.audio_bytes.len(), + "voice": response.voice.as_deref().map(describe_speech_voice), + "transcript": response.transcript, + }))? 
+ ); + } else { + println!( + "Generated speech: {} ({} bytes, model: {}, format: {})", + output.display(), + response.audio_bytes.len(), + response.model, + response.audio_format + ); + } + + Ok(()) +} + +#[cfg(test)] +mod speech_cli_tests { + use super::*; + use crate::tools::speech::{ + default_speech_output_name, infer_speech_model, normalize_speech_format, + }; + + #[test] + fn normalizes_documented_speech_formats() { + assert_eq!(normalize_speech_format("WAV").as_deref(), Some("wav")); + assert_eq!(normalize_speech_format("pcm16").as_deref(), Some("pcm16")); + assert_eq!(normalize_speech_format("pcm").as_deref(), Some("pcm16")); + assert_eq!(normalize_speech_format("flac"), None); + } + + #[test] + fn default_speech_output_tracks_requested_format() { + assert_eq!( + PathBuf::from(default_speech_output_name("mp3")), + PathBuf::from("speech.mp3") + ); + assert_eq!( + PathBuf::from("audio").join(default_speech_output_name("pcm")), + PathBuf::from("audio").join("speech.pcm16") + ); + assert_eq!( + Some(PathBuf::from("custom.wav")) + .unwrap_or_else(|| PathBuf::from(default_speech_output_name("mp3"))), + PathBuf::from("custom.wav") + ); + } + + #[test] + fn speech_command_parses_cli_passthrough_smoke() { + let cli = Cli::try_parse_from([ + "codewhale-tui", + "speech", + "hello", + "--model", + "tts", + "--format", + "pcm", + "--output-dir", + "audio", + "--voice", + "Mia", + ]) + .expect("speech command parses"); + + let Some(Commands::Speech(args)) = cli.command else { + panic!("expected speech command"); + }; + assert_eq!(args.text, "hello"); + assert_eq!( + infer_speech_model(args.model.as_deref(), false, false), + "mimo-v2.5-tts" + ); + assert_eq!( + normalize_speech_format(&args.format).as_deref(), + Some("pcm16") + ); + assert_eq!(args.output_dir, Some(PathBuf::from("audio"))); + assert_eq!(args.voice.as_deref(), Some("Mia")); + } +} + /// Test API connectivity by making a minimal request async fn test_api_connectivity(config: &Config) -> Result<()> { use 
crate::client::DeepSeekClient; @@ -5375,6 +5623,7 @@ async fn run_exec_agent( prefer_bwrap: config.prefer_bwrap.unwrap_or(false), memory_enabled: config.memory_enabled(), memory_path: config.memory_path(), + speech_output_dir: config.speech_output_dir(), vision_config: config.vision_model_config(), strict_tool_mode: config.strict_tool_mode.unwrap_or(false), goal_objective: None, diff --git a/crates/tui/src/runtime_threads.rs b/crates/tui/src/runtime_threads.rs index 51f79922c..9f9724c2e 100644 --- a/crates/tui/src/runtime_threads.rs +++ b/crates/tui/src/runtime_threads.rs @@ -2016,6 +2016,7 @@ impl RuntimeThreadManager { prefer_bwrap: self.config.prefer_bwrap.unwrap_or(false), memory_enabled: self.config.memory_enabled(), memory_path: self.config.memory_path(), + speech_output_dir: self.config.speech_output_dir(), vision_config: self.config.vision_model_config(), strict_tool_mode: self.config.strict_tool_mode.unwrap_or(false), goal_objective: None, diff --git a/crates/tui/src/tools/mod.rs b/crates/tui/src/tools/mod.rs index db1e0f707..61ea3abba 100644 --- a/crates/tui/src/tools/mod.rs +++ b/crates/tui/src/tools/mod.rs @@ -47,6 +47,7 @@ pub mod shell; mod shell_output; pub mod skill; pub mod spec; +pub mod speech; pub mod subagent; pub mod tasks; pub mod test_runner; diff --git a/crates/tui/src/tools/registry.rs b/crates/tui/src/tools/registry.rs index b33c79c5e..5e11b7c48 100644 --- a/crates/tui/src/tools/registry.rs +++ b/crates/tui/src/tools/registry.rs @@ -9,7 +9,7 @@ use std::collections::HashMap; use std::sync::{Arc, OnceLock}; -use std::path::Path; +use std::path::{Path, PathBuf}; use serde_json::Value; @@ -772,6 +772,22 @@ impl ToolRegistryBuilder { self.with_tool(Arc::new(RevertTurnTool)) } + /// Include Xiaomi MiMo speech/TTS tools (`speech`, `tts`). 
+ #[must_use] + pub fn with_speech_tools( + self, + client: Option, + output_dir: Option, + ) -> Self { + use super::speech::SpeechTool; + self.with_tool(Arc::new(SpeechTool::new( + "speech", + client.clone(), + output_dir.clone(), + ))) + .with_tool(Arc::new(SpeechTool::new("tts", client, output_dir))) + } + /// Include persistent RLM session tools. #[must_use] pub fn with_rlm_tool(self, client: Option, _root_model: String) -> Self { @@ -954,11 +970,14 @@ impl ToolRegistryBuilder { todo_list: super::todo::SharedTodoList, plan_state: super::plan::SharedPlanState, ) -> Self { + let speech_client = client.clone(); + let speech_output_dir = runtime.speech_output_dir.clone(); self.with_agent_tools(allow_shell) .with_todo_tool(todo_list) .with_plan_tool(plan_state) .with_review_tool(client.clone(), model.clone()) .with_rlm_tool(client, model) + .with_speech_tools(speech_client, speech_output_dir) .with_recall_archive_tool() .with_subagent_tools(manager, runtime) } @@ -1214,6 +1233,18 @@ mod tests { assert!(registry.contains("list_dir")); } + #[test] + fn builder_registers_speech_alias_tools() { + let tmp = tempdir().expect("tempdir"); + let ctx = ToolContext::new(tmp.path().to_path_buf()); + let registry = ToolRegistryBuilder::new() + .with_speech_tools(None, None) + .build(ctx); + + assert!(registry.contains("speech")); + assert!(registry.contains("tts")); + } + #[test] fn test_registry_names() { let tmp = tempdir().expect("tempdir"); diff --git a/crates/tui/src/tools/speech.rs b/crates/tui/src/tools/speech.rs new file mode 100644 index 000000000..9c690512a --- /dev/null +++ b/crates/tui/src/tools/speech.rs @@ -0,0 +1,567 @@ +//! Model-visible Xiaomi MiMo speech/TTS generation tool. +//! +//! This mirrors the CLI `speech` / `tts` command as a first-class API tool so +//! the TUI model can generate narrated audio without shelling out to a nested +//! CodeWhale process. 
+ +use std::path::{Path, PathBuf}; + +use anyhow::Context as _; +use async_trait::async_trait; +use base64::{Engine as _, engine::general_purpose}; +use serde_json::{Value, json}; + +use crate::client::{DeepSeekClient, SpeechSynthesisRequest}; +use crate::config::{ApiProvider, normalize_model_name_for_provider}; +use crate::network_policy::{Decision, host_from_url}; + +use super::spec::{ + ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec, + optional_bool, optional_str, required_str, +}; + +pub(crate) const DEFAULT_FORMAT: &str = "wav"; +pub(crate) const DEFAULT_VOICE: &str = "mimo_default"; +const VOICE_CLONE_BASE64_MAX_BYTES: usize = 10 * 1024 * 1024; +pub(crate) const SUPPORTED_SPEECH_FORMATS: &[&str] = &["wav", "mp3", "pcm16"]; + +pub const SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS: &[&str] = &[ + "mimo-v2.5-tts-voiceclone", + "mimo-v2.5-tts-voicedesign", + "mimo-v2.5-tts", + "mimo-v2-tts", +]; + +pub(crate) const SPEECH_MODEL_EXAMPLES: &[&str] = &[ + "mimo-v2.5-tts", + "mimo-v2.5-tts-voicedesign", + "mimo-v2.5-tts-voiceclone", + "mimo-v2-tts", +]; + +pub struct SpeechTool { + name: &'static str, + client: Option, + output_dir: Option, +} + +impl SpeechTool { + #[must_use] + pub fn new( + name: &'static str, + client: Option, + output_dir: Option, + ) -> Self { + Self { + name, + client, + output_dir, + } + } +} + +#[async_trait] +impl ToolSpec for SpeechTool { + fn name(&self) -> &str { + self.name + } + + fn description(&self) -> &str { + "Generate speech/audio directly through the configured Xiaomi MiMo OpenAI-compatible API. Use this when the user asks for speech, TTS, narration, read-aloud, voice design, or voice cloning." + } + + fn input_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "Text to synthesize. This is sent as the assistant message and is the spoken content; MiMo TTS style/audio tags may be included here." 
+ }, + "output": { + "type": "string", + "description": "Audio file path to write, relative to the workspace unless absolute. Default: speech. in output_dir, configured [speech].output_dir, or the workspace." + }, + "output_dir": { + "type": "string", + "description": "Directory for the default speech. output file when output is omitted. Relative paths stay inside the workspace." + }, + "model": { + "type": "string", + "description": "TTS model. Defaults to mimo-v2.5-tts, or infers voice-design/voice-clone models from voice_prompt/clone_voice.", + "enum": SPEECH_MODEL_EXAMPLES + }, + "voice": { + "type": "string", + "description": "Built-in voice ID (for example mimo_default, 冰糖, 茉莉, 苏打, 白桦, Mia, Chloe, Milo, Dean) or a data:audio/...;base64,... URI for voice clone." + }, + "instruction": { + "type": "string", + "description": "Natural-language style, emotion, speed, scene, or performance instruction. It is not spoken verbatim." + }, + "voice_prompt": { + "type": "string", + "description": "Voice design prompt. When model is omitted this uses mimo-v2.5-tts-voicedesign." + }, + "clone_voice": { + "type": "string", + "description": "Path to a .mp3 or .wav voice sample for cloning. When model is omitted this uses mimo-v2.5-tts-voiceclone." + }, + "format": { + "type": "string", + "description": "Requested audio format. Default: wav. MiMo-V2.5-TTS documentation examples use wav and pcm16; mp3 is accepted when the API returns it.", + "enum": SUPPORTED_SPEECH_FORMATS + }, + "stream": { + "type": "boolean", + "description": "Low-latency streaming request. The direct tool currently writes complete audio files only, so leave this false." + } + }, + "required": ["text"] + }) + } + + fn capabilities(&self) -> Vec { + vec![ + ToolCapability::WritesFiles, + ToolCapability::Network, + ToolCapability::Sandboxable, + ] + } + + fn approval_requirement(&self) -> ApprovalRequirement { + // Speech generation is an explicit user-facing generation action. 
+ // Path resolution still enforces workspace/trusted-root boundaries. + ApprovalRequirement::Auto + } + + async fn execute(&self, input: Value, context: &ToolContext) -> Result { + let text = required_str(&input, "text")?.trim().to_string(); + if text.is_empty() { + return Err(ToolError::invalid_input("speech text cannot be empty")); + } + + let client = self.client.clone().ok_or_else(|| { + ToolError::not_available( + "speech tool requires an active Xiaomi MiMo API client; configure provider = \"xiaomi-mimo\" and an API key first", + ) + })?; + + let requested_format_raw = optional_str(&input, "format") + .map(str::trim) + .filter(|value| !value.is_empty()) + .unwrap_or(DEFAULT_FORMAT); + let requested_format = normalize_speech_format(requested_format_raw).ok_or_else(|| { + ToolError::invalid_input(format!( + "unsupported speech format '{requested_format_raw}' (allowed: {})", + SUPPORTED_SPEECH_FORMATS.join(", ") + )) + })?; + if optional_bool(&input, "stream", false) { + return Err(ToolError::invalid_input( + "stream=true low-latency speech output is not implemented in the direct tool yet; use stream=false to generate a complete audio file", + )); + } + let output_raw = optional_str(&input, "output") + .map(str::trim) + .filter(|value| !value.is_empty()); + let output_path = resolve_speech_output_path( + &input, + context, + output_raw, + &requested_format, + self.output_dir.as_ref(), + )?; + let output_label = output_raw + .map(str::to_string) + .unwrap_or_else(|| output_path.display().to_string()); + + let raw_voice = optional_str(&input, "voice") + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(str::to_string); + let raw_instruction = optional_str(&input, "instruction") + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(str::to_string); + let voice_prompt = optional_str(&input, "voice_prompt") + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(str::to_string); + let clone_voice = optional_str(&input, "clone_voice") + 
.map(str::trim) + .filter(|value| !value.is_empty()) + .map(str::to_string); + + let voice_is_data_uri = raw_voice + .as_deref() + .is_some_and(|value| value.starts_with("data:audio/")); + if clone_voice.is_some() && raw_voice.is_some() { + return Err(ToolError::invalid_input( + "use either clone_voice or voice for cloned voice data, not both", + )); + } + let model = infer_speech_model( + optional_str(&input, "model"), + clone_voice.is_some() || voice_is_data_uri, + voice_prompt.is_some(), + ); + let model_lower = model.to_ascii_lowercase(); + if !model_lower.contains("tts") { + return Err(ToolError::invalid_input(format!( + "speech tool requires a TTS model (examples: {}), got '{model}'", + SPEECH_MODEL_EXAMPLES.join(", ") + ))); + } + + let is_voice_design = model_lower.contains("voicedesign"); + let is_voice_clone = model_lower.contains("voiceclone"); + let instruction = combine_speech_instructions(raw_instruction, voice_prompt); + if is_voice_design + && instruction + .as_deref() + .is_none_or(|value| value.trim().is_empty()) + { + return Err(ToolError::invalid_input( + "mimo-v2.5-tts-voicedesign requires voice_prompt or instruction", + )); + } + + let voice = if let Some(clone_path) = clone_voice { + let clone_path = context.resolve_path(&clone_path)?; + Some(encode_voice_clone_data_uri(&clone_path).await?) 
+ } else if is_voice_design { + None + } else if let Some(value) = raw_voice { + Some(value) + } else if is_voice_clone { + return Err(ToolError::invalid_input( + "mimo-v2.5-tts-voiceclone requires clone_voice or voice ", + )); + } else { + Some(DEFAULT_VOICE.to_string()) + }; + + check_network_policy(context, client.base_url())?; + + let response = client + .synthesize_speech(SpeechSynthesisRequest { + model: model.clone(), + text, + instruction, + audio_format: requested_format, + voice, + }) + .await + .map_err(|err| { + ToolError::execution_failed(format!("speech synthesis failed: {err}")) + })?; + + if let Some(parent) = output_path + .parent() + .filter(|path| !path.as_os_str().is_empty()) + { + tokio::fs::create_dir_all(parent).await.map_err(|err| { + ToolError::execution_failed(format!( + "failed to create output directory {}: {err}", + parent.display() + )) + })?; + } + tokio::fs::write(&output_path, &response.audio_bytes) + .await + .map_err(|err| { + ToolError::execution_failed(format!( + "failed to write audio file {}: {err}", + output_path.display() + )) + })?; + + let result = json!({ + "mode": "speech", + "success": true, + "api": "Xiaomi MiMo OpenAI-compatible chat/completions speech synthesis", + "base_url": openai_compatible_base_url(client.base_url()), + "model": response.model, + "format": response.audio_format, + "stream": false, + "output": output_label, + "absolute_output": output_path.display().to_string(), + "bytes": response.audio_bytes.len(), + "voice": response.voice.as_deref().map(describe_speech_voice), + "transcript": response.transcript, + "supported_formats": SUPPORTED_SPEECH_FORMATS, + "supported_xiaomi_mimo_models": SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS, + }); + ToolResult::json(&result).map_err(|err| { + ToolError::execution_failed(format!("failed to serialize result: {err}")) + }) + } +} + +pub(crate) fn infer_speech_model( + model: Option<&str>, + has_clone_voice: bool, + has_voice_prompt: bool, +) -> String { + match 
model.map(str::trim).filter(|value| !value.is_empty()) { + Some(value) => normalize_model_name_for_provider(ApiProvider::XiaomiMimo, value) + .unwrap_or_else(|| value.into()), + None if has_clone_voice => "mimo-v2.5-tts-voiceclone".to_string(), + None if has_voice_prompt => "mimo-v2.5-tts-voicedesign".to_string(), + None => "mimo-v2.5-tts".to_string(), + } +} + +pub(crate) fn combine_speech_instructions( + instruction: Option, + voice_prompt: Option, +) -> Option { + match (instruction, voice_prompt) { + (Some(instruction), Some(voice_prompt)) => { + let instruction = instruction.trim(); + let voice_prompt = voice_prompt.trim(); + if instruction.is_empty() { + Some(voice_prompt.to_string()).filter(|value| !value.is_empty()) + } else if voice_prompt.is_empty() { + Some(instruction.to_string()).filter(|value| !value.is_empty()) + } else { + Some(format!("{voice_prompt}\n\n{instruction}")) + } + } + (Some(value), None) | (None, Some(value)) => { + let value = value.trim().to_string(); + if value.is_empty() { None } else { Some(value) } + } + (None, None) => None, + } +} + +pub(crate) fn normalize_speech_format(format: &str) -> Option { + let normalized = format.trim().to_ascii_lowercase(); + match normalized.as_str() { + "wav" | "mp3" | "pcm16" => Some(normalized), + "pcm" => Some("pcm16".to_string()), + _ => None, + } +} + +pub(crate) fn default_speech_output_name(format: &str) -> String { + format!( + "speech.{}", + normalize_speech_format(format) + .as_deref() + .unwrap_or(DEFAULT_FORMAT) + ) +} + +fn resolve_speech_output_path( + input: &Value, + context: &ToolContext, + output_raw: Option<&str>, + format: &str, + configured_output_dir: Option<&PathBuf>, +) -> Result { + if let Some(output) = output_raw { + return context.resolve_path(output); + } + + let filename = default_speech_output_name(format); + if let Some(output_dir) = optional_str(input, "output_dir") + .map(str::trim) + .filter(|value| !value.is_empty()) + { + return 
Ok(context.resolve_path(output_dir)?.join(filename)); + } + + if let Some(output_dir) = configured_output_dir { + return Ok(output_dir.join(filename)); + } + + Ok(context.workspace.join(filename)) +} + +async fn encode_voice_clone_data_uri(path: &Path) -> Result { + let bytes = tokio::fs::read(path).await.map_err(|err| { + ToolError::execution_failed(format!( + "failed to read voice clone sample {}: {err}", + path.display() + )) + })?; + + voice_clone_data_uri_from_bytes(path, &bytes) + .map_err(|err| ToolError::invalid_input(err.to_string())) +} + +pub(crate) fn encode_voice_clone_sample_data_uri(path: &Path) -> anyhow::Result { + let bytes = std::fs::read(path) + .with_context(|| format!("Failed to read voice clone sample {}", path.display()))?; + + voice_clone_data_uri_from_bytes(path, &bytes) +} + +fn voice_clone_data_uri_from_bytes(path: &Path, bytes: &[u8]) -> anyhow::Result { + let base64_audio = general_purpose::STANDARD.encode(bytes); + if base64_audio.len() > VOICE_CLONE_BASE64_MAX_BYTES { + anyhow::bail!( + "voice clone sample is too large after base64 encoding ({} bytes > 10 MB)", + base64_audio.len() + ); + } + + let extension = path + .extension() + .and_then(|value| value.to_str()) + .unwrap_or_default() + .to_ascii_lowercase(); + let mime = match extension.as_str() { + "mp3" => "audio/mpeg", + "wav" => "audio/wav", + other => { + anyhow::bail!("unsupported voice clone sample extension '{other}'. 
Use .mp3 or .wav."); + } + }; + + Ok(format!("data:{mime};base64,{base64_audio}")) +} + +pub(crate) fn describe_speech_voice(voice: &str) -> String { + if voice.starts_with("data:") { + "embedded voice clone sample".to_string() + } else { + voice.to_string() + } +} + +fn openai_compatible_base_url(base_url: &str) -> String { + let trimmed = base_url.trim_end_matches('/'); + if trimmed.ends_with("/v1") || trimmed.ends_with("/beta") { + trimmed.to_string() + } else { + format!("{trimmed}/v1") + } +} + +fn check_network_policy(context: &ToolContext, base_url: &str) -> Result<(), ToolError> { + let Some(decider) = context.network_policy.as_ref() else { + return Ok(()); + }; + let display_url = openai_compatible_base_url(base_url); + let Some(host) = host_from_url(&display_url) else { + return Ok(()); + }; + match decider.evaluate(&host, "speech") { + Decision::Allow => Ok(()), + Decision::Deny => Err(ToolError::permission_denied(format!( + "speech network call to '{host}' blocked by network policy" + ))), + Decision::Prompt => Err(ToolError::permission_denied(format!( + "speech network call to '{host}' requires approval; re-run after `/network allow {host}` or set network.default = \"allow\" in config" + ))), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn infers_speech_model_from_requested_mode() { + assert_eq!(infer_speech_model(None, false, false), "mimo-v2.5-tts"); + assert_eq!( + infer_speech_model(None, false, true), + "mimo-v2.5-tts-voicedesign" + ); + assert_eq!( + infer_speech_model(None, true, false), + "mimo-v2.5-tts-voiceclone" + ); + assert_eq!( + infer_speech_model(Some("mimo-tts"), false, false), + "mimo-v2.5-tts" + ); + assert_eq!( + infer_speech_model(Some("mimo-v2-tts"), false, false), + "mimo-v2-tts" + ); + } + + #[test] + fn combines_voice_prompt_before_instruction() { + assert_eq!( + combine_speech_instructions( + Some("Speak warmly.".to_string()), + Some("Young Chinese female voice".to_string()) + ) + .as_deref(), + 
Some("Young Chinese female voice\n\nSpeak warmly.") + ); + assert_eq!( + combine_speech_instructions(Some(" calm ".to_string()), None).as_deref(), + Some("calm") + ); + } + + #[test] + fn normalizes_documented_speech_formats() { + assert_eq!(normalize_speech_format("WAV").as_deref(), Some("wav")); + assert_eq!(normalize_speech_format("pcm16").as_deref(), Some("pcm16")); + assert_eq!(normalize_speech_format("pcm").as_deref(), Some("pcm16")); + assert_eq!(normalize_speech_format("flac"), None); + } + + #[test] + fn supported_xiaomi_mimo_speech_models_are_tts_only() { + assert!( + SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS + .iter() + .all(|model| model.to_ascii_lowercase().contains("tts")), + "model-visible speech list must not include chat-only MiMo models" + ); + assert!(SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS.contains(&"mimo-v2.5-tts")); + assert!(!SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS.contains(&"mimo-v2.5-pro")); + assert!(!SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS.contains(&"mimo-v2.5")); + } + + #[test] + fn configured_output_dir_is_used_for_default_tool_output() { + let tmp = tempfile::tempdir().expect("tempdir"); + let context = ToolContext::new(tmp.path().to_path_buf()); + let configured = tmp.path().join("speech-artifacts"); + + let output = resolve_speech_output_path( + &json!({"text": "hello"}), + &context, + None, + "pcm", + Some(&configured), + ) + .expect("output path"); + + assert_eq!(output, configured.join("speech.pcm16")); + } + + #[test] + fn displays_openai_compatible_base_url() { + assert_eq!( + openai_compatible_base_url("https://api.xiaomimimo.com"), + "https://api.xiaomimimo.com/v1" + ); + assert_eq!( + openai_compatible_base_url("https://api.xiaomimimo.com/v1"), + "https://api.xiaomimimo.com/v1" + ); + } + + #[test] + fn speech_tool_is_auto_approved_but_not_read_only() { + let tool = SpeechTool::new("speech", None, None); + assert_eq!(tool.name(), "speech"); + assert_eq!(tool.approval_requirement(), ApprovalRequirement::Auto); + assert!(!tool.is_read_only()); 
+ let schema = tool.input_schema(); + assert!(schema.to_string().contains("mimo-v2.5-tts-voiceclone")); + assert!(schema.to_string().contains("pcm16")); + assert!(schema.to_string().contains("stream")); + } +} diff --git a/crates/tui/src/tools/subagent/mod.rs b/crates/tui/src/tools/subagent/mod.rs index 67d3cd17f..701efed34 100644 --- a/crates/tui/src/tools/subagent/mod.rs +++ b/crates/tui/src/tools/subagent/mod.rs @@ -794,6 +794,10 @@ pub struct SubAgentRuntime { /// false-timeout the child mid-thinking. `child_runtime()` and /// `background_runtime()` preserve the parent's value (#1806, #1808). pub step_api_timeout: Duration, + /// Default directory for Xiaomi MiMo speech/TTS tool outputs inherited by + /// child registries. Keeps parent and sub-agent `speech` / `tts` tools on + /// the same `[speech].output_dir` / env override. + pub speech_output_dir: Option, } impl SubAgentRuntime { @@ -829,6 +833,7 @@ impl SubAgentRuntime { fork_context: None, mcp_pool: None, step_api_timeout: DEFAULT_STEP_API_TIMEOUT, + speech_output_dir: None, } } @@ -852,6 +857,13 @@ impl SubAgentRuntime { self } + /// Preserve the configured speech output directory for sub-agent tools. + #[must_use] + pub fn with_speech_output_dir(mut self, output_dir: Option) -> Self { + self.speech_output_dir = output_dir; + self + } + /// Attach the wakeup channel so the engine's parent turn loop can resume /// when this runtime's direct children finish (issue #756). 
The channel /// is propagated to descendants via clone, but only `spawn_depth == 1` @@ -974,6 +986,7 @@ impl SubAgentRuntime { fork_context: self.fork_context.clone(), mcp_pool: self.mcp_pool.clone(), step_api_timeout: self.step_api_timeout, + speech_output_dir: self.speech_output_dir.clone(), } } diff --git a/crates/tui/src/tools/subagent/tests.rs b/crates/tui/src/tools/subagent/tests.rs index 9c53604ed..59bf03d0c 100644 --- a/crates/tui/src/tools/subagent/tests.rs +++ b/crates/tui/src/tools/subagent/tests.rs @@ -1738,6 +1738,7 @@ fn stub_runtime() -> SubAgentRuntime { fork_context: None, mcp_pool: None, step_api_timeout: DEFAULT_STEP_API_TIMEOUT, + speech_output_dir: None, } } @@ -1969,6 +1970,16 @@ fn emit_parent_completion_fires_for_direct_child() { assert!(rx.try_recv().is_err(), "should be exactly one message"); } +#[test] +fn child_runtime_inherits_speech_output_dir() { + let output_dir = PathBuf::from("configured-speech-output"); + let runtime = stub_runtime().with_speech_output_dir(Some(output_dir.clone())); + + let child = runtime.child_runtime(); + + assert_eq!(child.speech_output_dir, Some(output_dir)); +} + #[test] fn emit_parent_completion_skips_grandchildren() { let (tx, mut rx) = mpsc::unbounded_channel::(); diff --git a/crates/tui/src/tui/model_picker.rs b/crates/tui/src/tui/model_picker.rs index a6cc22f98..339d9ad5b 100644 --- a/crates/tui/src/tui/model_picker.rs +++ b/crates/tui/src/tui/model_picker.rs @@ -332,6 +332,9 @@ fn picker_model_hint(id: &str) -> &'static str { } "arcee-ai/trinity-large-thinking" => "large thinking", "xiaomi/mimo-v2.5-pro" | "mimo-v2.5-pro" => "long context", + "mimo-v2.5-tts" | "mimo-v2-tts" => "speech / TTS", + "mimo-v2.5-tts-voicedesign" => "voice design", + "mimo-v2.5-tts-voiceclone" => "voice clone", "minimax/minimax-m3" => "1M multimodal", _ => "provider model", } diff --git a/crates/tui/src/tui/ui.rs b/crates/tui/src/tui/ui.rs index e92a2a056..14cf6d7dc 100644 --- a/crates/tui/src/tui/ui.rs +++ 
b/crates/tui/src/tui/ui.rs @@ -772,6 +772,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig { prefer_bwrap: config.prefer_bwrap.unwrap_or(false), memory_enabled: config.memory_enabled(), memory_path: config.memory_path(), + speech_output_dir: config.speech_output_dir(), vision_config: config.vision_model_config(), strict_tool_mode: config.strict_tool_mode.unwrap_or(false), goal_objective: app.hunt.quarry.clone(), diff --git a/docs/PROVIDERS.md b/docs/PROVIDERS.md index 840474156..927e8b755 100644 --- a/docs/PROVIDERS.md +++ b/docs/PROVIDERS.md @@ -118,7 +118,7 @@ endpoint. | `wanjie-ark` | `[providers.wanjie_ark]` | `WANJIE_ARK_API_KEY`, `WANJIE_API_KEY`, `WANJIE_MAAS_API_KEY` | `WANJIE_ARK_BASE_URL`, `WANJIE_BASE_URL`, `WANJIE_MAAS_BASE_URL`; default `https://maas-openapi.wanjiedata.com/api/v1` | `deepseek-reasoner` | OpenAI-compatible hosted route. `WANJIE_ARK_MODEL`, `WANJIE_MODEL`, and `WANJIE_MAAS_MODEL` are accepted. | | `volcengine` | `[providers.volcengine]` | `VOLCENGINE_API_KEY`, `VOLCENGINE_ARK_API_KEY`, `ARK_API_KEY` | `VOLCENGINE_BASE_URL`, `VOLCENGINE_ARK_BASE_URL`, `ARK_BASE_URL`; default `https://ark.cn-beijing.volces.com/api/coding/v3` | `DeepSeek-V4-Pro`, `DeepSeek-V4-Flash` | Volcengine/Volcano Engine Ark OpenAI-compatible coding endpoint. `VOLCENGINE_MODEL` and `VOLCENGINE_ARK_MODEL` are accepted. | | `openrouter` | `[providers.openrouter]` | `OPENROUTER_API_KEY` | `OPENROUTER_BASE_URL`; default `https://openrouter.ai/api/v1` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash`; recent large IDs include `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `xiaomi/mimo-v2.5-pro`, `qwen/qwen3.6-35b-a3b`, `google/gemma-4-31b-it`, `z-ai/glm-5.1`, `moonshotai/kimi-k2.6` | Additive open-model routing layer. It does not replace DeepSeek; it lets users route supported model IDs through OpenRouter when they choose it. 
| -| `xiaomi-mimo` | `[providers.xiaomi_mimo]` | `XIAOMI_MIMO_API_KEY`, `XIAOMI_API_KEY`, `MIMO_API_KEY` | `XIAOMI_MIMO_BASE_URL`, `MIMO_BASE_URL`; default `https://api.xiaomimimo.com/v1` | `mimo-v2.5-pro`, `mimo-v2.5` | Xiaomi MiMo OpenAI-compatible chat completions route. It sends `max_completion_tokens` and uses MiMo's `thinking` field for reasoning control. | +| `xiaomi-mimo` | `[providers.xiaomi_mimo]` | `XIAOMI_MIMO_API_KEY`, `XIAOMI_API_KEY`, `MIMO_API_KEY` | `XIAOMI_MIMO_BASE_URL`, `MIMO_BASE_URL`; default `https://api.xiaomimimo.com/v1` | `mimo-v2.5-pro`, `mimo-v2.5`, `mimo-v2.5-tts`, `mimo-v2.5-tts-voicedesign`, `mimo-v2.5-tts-voiceclone`, `mimo-v2-tts` | Xiaomi MiMo OpenAI-compatible chat completions route. It sends `max_completion_tokens` and uses MiMo's `thinking` field for reasoning control. `codewhale speech` / `tts` uses the TTS models. | | `novita` | `[providers.novita]` | `NOVITA_API_KEY` | `NOVITA_BASE_URL`; default `https://api.novita.ai/v1` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash` | OpenAI-compatible hosted route for DeepSeek model IDs. Use config or `CODEWHALE_MODEL` / `DEEPSEEK_MODEL` for model overrides. | | `fireworks` | `[providers.fireworks]` | `FIREWORKS_API_KEY` | `FIREWORKS_BASE_URL`; default `https://api.fireworks.ai/inference/v1` | `accounts/fireworks/models/deepseek-v4-pro` | OpenAI-compatible hosted route. Use config or `CODEWHALE_MODEL` / `DEEPSEEK_MODEL` for model overrides. | | `siliconflow` | `[providers.siliconflow]` | `SILICONFLOW_API_KEY` | `SILICONFLOW_BASE_URL`; default `https://api.siliconflow.com/v1` | `deepseek-ai/DeepSeek-V4-Pro`, `deepseek-ai/DeepSeek-V4-Flash` | OpenAI-compatible hosted route. Official docs use the `.com` endpoint; users who need the regional endpoint can set `https://api.siliconflow.cn/v1` explicitly. `SILICONFLOW_MODEL` is accepted. Reasoning aliases `deepseek-reasoner` and `deepseek-r1` map to Pro; `deepseek-chat` and `deepseek-v3` map to Flash. | @@ -130,7 +130,11 @@ endpoint. 
### Xiaomi MiMo Notes `xiaomi-mimo` defaults to `mimo-v2.5-pro` for long-context reasoning and coding -work, while the static registry also exposes `mimo-v2.5`. Xiaomi's current +work, while the static registry also exposes `mimo-v2.5`. Xiaomi MiMo TTS is +available through `codewhale --provider xiaomi-mimo speech "text" --model tts` +(or the `tts` alias) plus model-visible `speech` / `tts` tools in Agent/YOLO mode. +Voice-design and voice-clone shorthands map to `mimo-v2.5-tts-voicedesign` and +`mimo-v2.5-tts-voiceclone`. Xiaomi's current [image-understanding guide](https://platform.xiaomimimo.com/docs/en-US/usage-guide/multimodal-understanding/image-understanding) includes `mimo-v2.5` for image input. CodeWhale exposes image analysis through the separate `[vision_model]` / `image_analyze` path; set that model to @@ -164,7 +168,7 @@ endpoint when the endpoint supports model listing. | `wanjie-ark` | `deepseek-reasoner` | yes | yes | | `volcengine` | `DeepSeek-V4-Pro`, `DeepSeek-V4-Flash` | yes | yes | | `openrouter` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash`, `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `xiaomi/mimo-v2.5-pro`, `xiaomi/mimo-v2.5`, `qwen/qwen3.6-35b-a3b`, `qwen/qwen3.6-27b`, `moonshotai/kimi-k2.6`, `z-ai/glm-5.1`, `tencent/hy3-preview`, `google/gemma-4-31b-it`, `google/gemma-4-26b-a4b-it`, `nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free` | yes | yes | -| `xiaomi-mimo` | `mimo-v2.5-pro`, `mimo-v2.5` | yes | yes | +| `xiaomi-mimo` | `mimo-v2.5-pro`, `mimo-v2.5`, `mimo-v2.5-tts`, `mimo-v2.5-tts-voicedesign`, `mimo-v2.5-tts-voiceclone`, `mimo-v2-tts` | yes | yes for chat models; no for TTS models | | `novita` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash` | yes | yes | | `fireworks` | `accounts/fireworks/models/deepseek-v4-pro` | yes | yes | | `siliconflow` | `deepseek-ai/DeepSeek-V4-Pro`, `deepseek-ai/DeepSeek-V4-Flash` | yes | yes |